imw 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +23 -0
- data/Gemfile.lock +47 -0
- data/LICENSE +20 -674
- data/README.rdoc +3 -4
- data/VERSION +1 -1
- data/lib/imw.rb +64 -35
- data/lib/imw/dataset.rb +12 -2
- data/lib/imw/formats.rb +4 -2
- data/lib/imw/formats/delimited.rb +96 -36
- data/lib/imw/formats/excel.rb +69 -101
- data/lib/imw/formats/json.rb +3 -5
- data/lib/imw/formats/pdf.rb +71 -0
- data/lib/imw/formats/yaml.rb +3 -5
- data/lib/imw/metadata.rb +66 -0
- data/lib/imw/metadata/contains_metadata.rb +44 -0
- data/lib/imw/metadata/dsl.rb +111 -0
- data/lib/imw/metadata/field.rb +65 -0
- data/lib/imw/metadata/schema.rb +227 -0
- data/lib/imw/metadata/schematized.rb +27 -0
- data/lib/imw/parsers.rb +1 -0
- data/lib/imw/parsers/flat.rb +44 -0
- data/lib/imw/resource.rb +36 -224
- data/lib/imw/schemes.rb +3 -1
- data/lib/imw/schemes/hdfs.rb +12 -1
- data/lib/imw/schemes/http.rb +1 -2
- data/lib/imw/schemes/local.rb +139 -16
- data/lib/imw/schemes/remote.rb +14 -9
- data/lib/imw/schemes/s3.rb +12 -0
- data/lib/imw/schemes/sql.rb +117 -0
- data/lib/imw/tools.rb +5 -3
- data/lib/imw/tools/downloader.rb +63 -0
- data/lib/imw/tools/summarizer.rb +21 -10
- data/lib/imw/utils.rb +10 -0
- data/lib/imw/utils/dynamically_extendable.rb +137 -0
- data/lib/imw/utils/error.rb +3 -0
- data/lib/imw/utils/extensions.rb +0 -4
- data/lib/imw/utils/extensions/array.rb +6 -7
- data/lib/imw/utils/extensions/hash.rb +3 -5
- data/lib/imw/utils/extensions/string.rb +3 -3
- data/lib/imw/utils/has_uri.rb +114 -0
- data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
- data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +1 -0
- data/spec/data/formats/none/sample +650 -0
- data/spec/data/formats/sgml/sample.xml +617 -0
- data/spec/data/formats/text/sample.txt +650 -0
- data/spec/data/formats/yaml/sample.yaml +410 -0
- data/spec/data/schema-tabular.yaml +11 -0
- data/spec/imw/formats/delimited_spec.rb +34 -2
- data/spec/imw/formats/excel_spec.rb +55 -0
- data/spec/imw/formats/json_spec.rb +3 -3
- data/spec/imw/formats/sgml_spec.rb +4 -4
- data/spec/imw/formats/yaml_spec.rb +3 -3
- data/spec/imw/metadata/field_spec.rb +26 -0
- data/spec/imw/metadata/schema_spec.rb +27 -0
- data/spec/imw/metadata_spec.rb +39 -0
- data/spec/imw/parsers/line_parser_spec.rb +1 -1
- data/spec/imw/resource_spec.rb +0 -100
- data/spec/imw/schemes/hdfs_spec.rb +19 -13
- data/spec/imw/schemes/local_spec.rb +59 -3
- data/spec/imw/schemes/s3_spec.rb +4 -0
- data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
- data/spec/imw/utils/has_uri_spec.rb +55 -0
- data/spec/spec_helper.rb +1 -2
- data/spec/support/random.rb +4 -4
- metadata +58 -17
- data/CHANGELOG +0 -0
- data/TODO +0 -18
- data/spec/data/sample.json +0 -782
- data/spec/data/sample.txt +0 -131
- data/spec/data/sample.xml +0 -653
- data/spec/data/sample.yaml +0 -651
- data/spec/spec.opts +0 -4
- data/spec/support/extensions.rb +0 -18
data/README.rdoc
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
= What is the Infinite Monkeywrench?
|
|
3
2
|
|
|
4
3
|
The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
|
|
@@ -58,18 +57,18 @@ IMW is centered around processing resources. A resource can be
|
|
|
58
57
|
_anything_ with a URI and you create one using IMW.open.
|
|
59
58
|
|
|
60
59
|
csv = IMW.open('/path/to/my_data.csv')
|
|
61
|
-
html = IMW.open('http://www.
|
|
60
|
+
html = IMW.open('http://www.example.com/history/march_2007')
|
|
62
61
|
|
|
63
62
|
IMW dynamically extends a resource with modules appropriate to it when
|
|
64
63
|
you open it. In the above case, +csv+ would be automatically extended
|
|
65
64
|
by the IMW::Resources::Formats::Csv module, among others:
|
|
66
65
|
|
|
67
|
-
csv.
|
|
66
|
+
csv.modules
|
|
68
67
|
=> [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
|
|
69
68
|
|
|
70
69
|
while +html+ will use a different set
|
|
71
70
|
|
|
72
|
-
html.
|
|
71
|
+
html.modules
|
|
73
72
|
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
|
74
73
|
|
|
75
74
|
Consult the documentation for the modules a resource uses to learn
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.2.
|
|
1
|
+
0.2.8
|
data/lib/imw.rb
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
require 'rubygems'
|
|
2
|
+
require 'bundler'
|
|
3
|
+
Bundler.setup
|
|
2
4
|
require 'imw/boot'
|
|
3
5
|
require 'imw/utils'
|
|
4
6
|
|
|
@@ -8,15 +10,18 @@ require 'imw/utils'
|
|
|
8
10
|
# transformations of data as a network of dependencies (a la Make or
|
|
9
11
|
# Rake).
|
|
10
12
|
#
|
|
11
|
-
# IMW has a few central concepts: resources,
|
|
12
|
-
# repositories.
|
|
13
|
+
# IMW has a few central concepts: resources, metadata, datasets,
|
|
14
|
+
# workflows, and repositories.
|
|
13
15
|
#
|
|
14
16
|
# Resources represent individual data resources like local files,
|
|
15
|
-
# websites, databases, &c.
|
|
16
|
-
# IMW.open, with IMW doing the work of figuring out what to return
|
|
17
|
+
# websites, databases, &c. An IMW::Resource is typically instantiated
|
|
18
|
+
# via IMW.open, with IMW doing the work of figuring out what to return
|
|
17
19
|
# based on the URI passed in.
|
|
18
20
|
#
|
|
19
|
-
#
|
|
21
|
+
# A Resource can have a schema which describes the fields in its data.
|
|
22
|
+
# IMW::Metadata consists of classes which describe fields.
|
|
23
|
+
#
|
|
24
|
+
# Datasets represent collections of related data resources. An
|
|
20
25
|
# IMW::Dataset comes with a pre-defined (but customizable) workflow
|
|
21
26
|
# that takes data resources through several steps: rip, parse, munge,
|
|
22
27
|
# and package. The workflow leverages Rake and so the various tasks
|
|
@@ -35,6 +40,7 @@ module IMW
|
|
|
35
40
|
autoload :Parsers, 'imw/parsers'
|
|
36
41
|
autoload :Dataset, 'imw/dataset'
|
|
37
42
|
autoload :Repository, 'imw/repository'
|
|
43
|
+
autoload :Metadata, 'imw/metadata'
|
|
38
44
|
|
|
39
45
|
# Open a resource at the given +uri+. The resource will
|
|
40
46
|
# automatically be extended by modules which make sense given the
|
|
@@ -47,14 +53,23 @@ module IMW
|
|
|
47
53
|
#
|
|
48
54
|
# @param [String, Addressable::URI, IMW::Resource] obj the URI to open
|
|
49
55
|
# @param [Hash] options
|
|
50
|
-
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.
|
|
51
|
-
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.
|
|
56
|
+
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
|
|
57
|
+
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
|
|
52
58
|
# @return [IMW::Resource] the resulting resource, properly extended for the given URI
|
|
53
|
-
def self.open obj, options={}
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
59
|
+
def self.open obj, options={}, &block
|
|
60
|
+
if obj.is_a?(IMW::Resource)
|
|
61
|
+
resource = obj
|
|
62
|
+
else
|
|
63
|
+
options[:use_modules] ||= (options[:as] || [])
|
|
64
|
+
options[:skip_modules] ||= (options[:without] || [])
|
|
65
|
+
resource = IMW::Resource.new(obj, options)
|
|
66
|
+
end
|
|
67
|
+
if block_given?
|
|
68
|
+
yield resource
|
|
69
|
+
resource.close
|
|
70
|
+
else
|
|
71
|
+
resource
|
|
72
|
+
end
|
|
58
73
|
end
|
|
59
74
|
|
|
60
75
|
# Works the same way as IMW.open except opens the resource for
|
|
@@ -62,8 +77,8 @@ module IMW
|
|
|
62
77
|
#
|
|
63
78
|
# @param [String, Addressable::URI] uri the URI to open
|
|
64
79
|
# @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
|
|
65
|
-
def self.open! uri, options={}
|
|
66
|
-
|
|
80
|
+
def self.open! uri, options={}, &block
|
|
81
|
+
open(uri, options.merge(:mode => 'w'), &block)
|
|
67
82
|
end
|
|
68
83
|
|
|
69
84
|
# The default repository in which to place datasets. See the
|
|
@@ -75,32 +90,41 @@ module IMW
|
|
|
75
90
|
@@repository ||= IMW::Repository.new
|
|
76
91
|
end
|
|
77
92
|
|
|
78
|
-
# Create a dataset and put it in the default IMW repository.
|
|
79
|
-
# yields the dataset so you can define its workflow
|
|
93
|
+
# Create a dataset and put it in the default IMW repository.
|
|
80
94
|
#
|
|
81
|
-
#
|
|
82
|
-
#
|
|
83
|
-
#
|
|
84
|
-
# add_path :raw_data, :ripd, 'raw_data.csv'
|
|
85
|
-
# add_path :fixd_data, :fixd, 'fixed_data.csv'
|
|
95
|
+
# Evaluates the given block in the context of the new dataset. This
|
|
96
|
+
# allows you to define tasks, add paths, and use defined metadata in
|
|
97
|
+
# an elegant way.
|
|
86
98
|
#
|
|
87
|
-
#
|
|
88
|
-
#
|
|
89
|
-
#
|
|
90
|
-
#
|
|
99
|
+
# IMW.dataset :my_dataset do
|
|
100
|
+
#
|
|
101
|
+
# # Define some paths we're going to use
|
|
102
|
+
# add_path :original, :rawd, 'original.csv'
|
|
103
|
+
# add_path :filtered, :fixd, 'filtered.csv'
|
|
104
|
+
# add_path :package, :pkgd, 'filtered.tar.bz2'
|
|
91
105
|
#
|
|
92
|
-
#
|
|
93
|
-
#
|
|
94
|
-
#
|
|
95
|
-
#
|
|
96
|
-
# end.compact.dump(path_to(:fixd_data))
|
|
97
|
-
# end
|
|
106
|
+
# # Copy a CSV file from a website to this machine.
|
|
107
|
+
# rip do
|
|
108
|
+
# open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
|
|
109
|
+
# end
|
|
98
110
|
#
|
|
99
|
-
#
|
|
100
|
-
#
|
|
101
|
-
#
|
|
111
|
+
# # Filter the original CSV data by the
|
|
112
|
+
# # <tt>meets_some_condition?</tt> method we define elsewhere...
|
|
113
|
+
# munge do
|
|
114
|
+
# open!(path_to(:filtered)) do |filtered|
|
|
115
|
+
# open(path_to(:original)).each do |row|
|
|
116
|
+
# filtered << row if meets_some_condition?(row)
|
|
117
|
+
# end
|
|
118
|
+
# end
|
|
119
|
+
#
|
|
120
|
+
# # Compress the filtered data to an archive.
|
|
121
|
+
# package do
|
|
122
|
+
# open(path_to(:filtered)).compress.mv(path_to(:package))
|
|
123
|
+
# end
|
|
102
124
|
# end
|
|
103
|
-
#
|
|
125
|
+
#
|
|
126
|
+
# See the <tt>/examples</tt> directory of the IMW distribution for
|
|
127
|
+
# more examples.
|
|
104
128
|
#
|
|
105
129
|
# @param [Symbol, String] handle the handle to identify this dataset with
|
|
106
130
|
# @param [Hash] options a hash of options (see IMW::Dataset)
|
|
@@ -112,3 +136,8 @@ module IMW
|
|
|
112
136
|
end
|
|
113
137
|
|
|
114
138
|
end
|
|
139
|
+
|
|
140
|
+
# Works just like IMW.dataset but defined at a top-level scope.
|
|
141
|
+
def dataset handle, options={}, &block
|
|
142
|
+
IMW.dataset(handle, options, &block)
|
|
143
|
+
end
|
data/lib/imw/dataset.rb
CHANGED
|
@@ -96,9 +96,12 @@ module IMW
|
|
|
96
96
|
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
|
97
97
|
class Dataset
|
|
98
98
|
|
|
99
|
-
|
|
99
|
+
# The handle this dataset goes by. Used for identifying it within
|
|
100
|
+
# a repository.
|
|
101
|
+
attr_accessor :handle
|
|
100
102
|
|
|
101
|
-
|
|
103
|
+
# Options for this dataset.
|
|
104
|
+
attr_accessor :options
|
|
102
105
|
|
|
103
106
|
def initialize handle, options = {}
|
|
104
107
|
@options = options
|
|
@@ -111,5 +114,12 @@ module IMW
|
|
|
111
114
|
end
|
|
112
115
|
end
|
|
113
116
|
|
|
117
|
+
# Provides this dataset with a workflow of tasks managed by Rake.
|
|
118
|
+
include IMW::Workflow
|
|
119
|
+
|
|
120
|
+
# Provides this dataset with DSL like methods to construct a
|
|
121
|
+
# schema in an IMW file.
|
|
122
|
+
include IMW::Metadata::DSL
|
|
123
|
+
|
|
114
124
|
end
|
|
115
125
|
end
|
data/lib/imw/formats.rb
CHANGED
|
@@ -10,20 +10,22 @@ module IMW
|
|
|
10
10
|
autoload :Xhtml, 'imw/formats/sgml'
|
|
11
11
|
autoload :Rdf, 'imw/formats/sgml'
|
|
12
12
|
autoload :Yaml, 'imw/formats/yaml'
|
|
13
|
+
autoload :Pdf, 'imw/formats/pdf'
|
|
13
14
|
|
|
14
15
|
# Handlers which augment a resource with data format specific
|
|
15
16
|
# methods.
|
|
16
17
|
HANDLERS = [
|
|
17
18
|
[ "Formats::Csv", /\.csv$/i ],
|
|
18
19
|
[ "Formats::Tsv", /\.tsv$/i ],
|
|
19
|
-
[ "Formats::Excel", /\.
|
|
20
|
+
[ "Formats::Excel", /\.xlsx?$/i ],
|
|
20
21
|
[ "Formats::Json", /\.json$/i ],
|
|
21
22
|
[ "Formats::Xml", /\.xml$/i ],
|
|
22
23
|
[ "Formats::Xsl", /\.xsl$/i ],
|
|
23
24
|
[ "Formats::Html", /\.html?$/i ],
|
|
24
25
|
[ "Formats::Xhtml", /\.xhtml?$/i ],
|
|
25
26
|
[ "Formats::Rdf", /\.rdf?$/i ],
|
|
26
|
-
[ "Formats::Yaml", /\.ya?ml$/i ]
|
|
27
|
+
[ "Formats::Yaml", /\.ya?ml$/i ],
|
|
28
|
+
[ "Formats::Pdf", /\.pdf$/i ]
|
|
27
29
|
]
|
|
28
30
|
end
|
|
29
31
|
end
|
|
@@ -11,9 +11,22 @@ module IMW
|
|
|
11
11
|
# @abstract
|
|
12
12
|
module Delimited
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
# Ensure that this delimited resource is described by an
|
|
15
|
+
# ordered collection of flat fields.
|
|
16
|
+
def validate_schema!
|
|
17
|
+
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
|
18
|
+
end
|
|
15
19
|
|
|
16
|
-
|
|
20
|
+
# Default options to be passed to
|
|
21
|
+
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
|
22
|
+
# documentation for more information.
|
|
23
|
+
#
|
|
24
|
+
# @return [Hash]
|
|
25
|
+
def delimited_options
|
|
26
|
+
@delimited_options ||= {
|
|
27
|
+
:headers => schema && schema.map { |field| field['name'] }
|
|
28
|
+
}.merge(resource_options_compatible_with_faster_csv)
|
|
29
|
+
end
|
|
17
30
|
|
|
18
31
|
# Return the data in this delimited resource as an array of
|
|
19
32
|
# arrays.
|
|
@@ -27,24 +40,70 @@ module IMW
|
|
|
27
40
|
FasterCSV.parse(read, delimited_options, &block)
|
|
28
41
|
end
|
|
29
42
|
|
|
43
|
+
# Gives us goodies! Needs +each+ below.
|
|
44
|
+
include Enumerable
|
|
45
|
+
|
|
30
46
|
# Call +block+ with each row in this delimited resource.
|
|
31
47
|
def each &block
|
|
32
|
-
|
|
48
|
+
require 'fastercsv'
|
|
49
|
+
FasterCSV.new(io, delimited_options).each(&block)
|
|
33
50
|
end
|
|
34
51
|
|
|
35
|
-
#
|
|
52
|
+
# Emit a single array or an array of arrays into this resource.
|
|
36
53
|
#
|
|
37
|
-
# @param [Array] data array of arrays to
|
|
54
|
+
# @param [Array<Array>, Array] data array or array of arrays to emit
|
|
38
55
|
# @param [Hash] options
|
|
39
|
-
# @option options [true, false] :persist Keep this resource's IO object open after
|
|
40
|
-
def
|
|
56
|
+
# @option options [true, false] :persist Keep this resource's IO object open after emitting
|
|
57
|
+
def emit data, options={}
|
|
41
58
|
require 'fastercsv'
|
|
59
|
+
data = [data] unless data.first.is_a?(Array)
|
|
42
60
|
data.each do |row|
|
|
43
61
|
write(FasterCSV.generate_line(row, delimited_options))
|
|
44
62
|
end
|
|
45
|
-
io.close unless options[:persist]
|
|
46
63
|
self
|
|
47
64
|
end
|
|
65
|
+
alias_method :<<, :emit
|
|
66
|
+
|
|
67
|
+
# Do a heuristic check to determine whether or not the first row
|
|
68
|
+
# of this delimited data is a row of headers.
|
|
69
|
+
#
|
|
70
|
+
# @return [true, false]
|
|
71
|
+
def headers_in_first_line?
|
|
72
|
+
# grab the header and up to 10 body rows
|
|
73
|
+
require 'fastercsv'
|
|
74
|
+
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
|
75
|
+
header = (copy.shift || []) rescue []
|
|
76
|
+
body = 10.times.map { (copy.shift || []) rescue []}.flatten
|
|
77
|
+
|
|
78
|
+
# guess how many elements in a row
|
|
79
|
+
#size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
|
|
80
|
+
|
|
81
|
+
# calculate the fraction of bytes that are [-A-Za-z_] (letters +
|
|
82
|
+
# underscore + hyphen) for header and body and compute a
|
|
83
|
+
# threshold determinant
|
|
84
|
+
header_chars = header.map(&:to_s).join
|
|
85
|
+
header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
|
86
|
+
body_chars = body.map(&:to_s).join
|
|
87
|
+
body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
|
88
|
+
header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
|
|
89
|
+
body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
|
|
90
|
+
determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
|
|
91
|
+
|
|
92
|
+
# decide, setting the threshold at 0.05 based on some guesswork...
|
|
93
|
+
determinant && determinant >= 0.05
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# If it seems like there are headers in the first line of this
|
|
97
|
+
# data then go ahead and use them to define a schema.
|
|
98
|
+
#
|
|
99
|
+
# Will overwrite a schema already present for this resource.
|
|
100
|
+
def guess_schema!
|
|
101
|
+
return unless headers_in_first_line?
|
|
102
|
+
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
|
103
|
+
names = (copy.shift || []) rescue []
|
|
104
|
+
self.schema = IMW::Metadata::Schema.new(names)
|
|
105
|
+
delimited_options[:headers] = names
|
|
106
|
+
end
|
|
48
107
|
|
|
49
108
|
# Return a 10-line sample of this file.
|
|
50
109
|
#
|
|
@@ -53,52 +112,53 @@ module IMW
|
|
|
53
112
|
require 'fastercsv'
|
|
54
113
|
returning([]) do |rows|
|
|
55
114
|
row_num = 1
|
|
56
|
-
|
|
115
|
+
each do |row|
|
|
57
116
|
break if row_num > 10
|
|
58
|
-
rows << row
|
|
117
|
+
rows << row.size.times.map { |index| row[index] }
|
|
59
118
|
row_num += 1
|
|
60
119
|
end
|
|
61
120
|
end
|
|
62
121
|
end
|
|
63
|
-
end
|
|
64
122
|
|
|
65
|
-
|
|
66
|
-
|
|
123
|
+
protected
|
|
124
|
+
# An array of option names used by FasterCSV.
|
|
125
|
+
FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
|
|
67
126
|
|
|
68
|
-
#
|
|
69
|
-
# FasterCSV
|
|
70
|
-
#
|
|
127
|
+
# Return the subset of options this resource was initialized
|
|
128
|
+
# with that are compatible with FasterCSV (it complains when you
|
|
129
|
+
# give it keywords it doesn't know).
|
|
71
130
|
#
|
|
72
131
|
# @return [Hash]
|
|
132
|
+
def resource_options_compatible_with_faster_csv
|
|
133
|
+
@compatible_options ||= returning({}) do |compatible_options|
|
|
134
|
+
FASTER_CSV_OPTION_NAMES.each do |option_name|
|
|
135
|
+
compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# A module for working with CSV (comma-separated value) formatted
|
|
142
|
+
# data.
|
|
143
|
+
#
|
|
144
|
+
# @see IMW::Formats::Delimited
|
|
145
|
+
module Csv
|
|
146
|
+
include Delimited
|
|
73
147
|
def delimited_options
|
|
74
|
-
@delimited_options ||= {
|
|
75
|
-
:col_sep => ',',
|
|
76
|
-
:headers => false,
|
|
77
|
-
:return_headers => false,
|
|
78
|
-
:write_headers => true,
|
|
79
|
-
:skip_blanks => false,
|
|
80
|
-
:force_quotes => false
|
|
81
|
-
}
|
|
148
|
+
@delimited_options ||= {:col_sep => ","}.merge(super())
|
|
82
149
|
end
|
|
83
150
|
end
|
|
84
151
|
|
|
152
|
+
# A module for working with TSV (tab-separated value) formatted
|
|
153
|
+
# data.
|
|
154
|
+
#
|
|
155
|
+
# @see IMW::Formats::Delimited
|
|
85
156
|
module Tsv
|
|
86
157
|
include Delimited
|
|
87
|
-
|
|
88
|
-
# Default options to be passed to
|
|
89
|
-
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
|
90
|
-
# documentation for more information.
|
|
91
|
-
#
|
|
92
|
-
# @return [Hash]
|
|
93
158
|
def delimited_options
|
|
94
159
|
@delimited_options ||= {
|
|
95
160
|
:col_sep => "\t",
|
|
96
|
-
|
|
97
|
-
:return_headers => false,
|
|
98
|
-
:write_headers => true,
|
|
99
|
-
:skip_blanks => false,
|
|
100
|
-
:force_quotes => false
|
|
101
|
-
}
|
|
161
|
+
}.merge(super())
|
|
102
162
|
end
|
|
103
163
|
end
|
|
104
164
|
end
|
data/lib/imw/formats/excel.rb
CHANGED
|
@@ -4,120 +4,88 @@ module IMW
|
|
|
4
4
|
# Defines methods for reading and writing Microsoft Excel data.
|
|
5
5
|
module Excel
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def
|
|
10
|
-
if
|
|
11
|
-
@book = Spreadsheet.open path
|
|
12
|
-
@sheet = book.worksheet(0)
|
|
13
|
-
|
|
14
|
-
end
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def book
|
|
19
|
-
return @book if @book
|
|
20
|
-
if exists?
|
|
21
|
-
@book = Spreadsheet.open(path)
|
|
22
|
-
else
|
|
23
|
-
@book = Spreadsheet::Workbook.new
|
|
24
|
-
end
|
|
7
|
+
# Ensure that this Excel resource is described by an ordered
|
|
8
|
+
# collection of flat fields.
|
|
9
|
+
def validate_schema!
|
|
10
|
+
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
|
25
11
|
end
|
|
26
12
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
#
|
|
33
|
-
#it is opened and can be read out with a subsequent call to
|
|
34
|
-
#load(). Otherwise, a new workbook is created and can be written
|
|
35
|
-
#to with the dump() method.
|
|
36
|
-
def initialize uri, mode='r', options={}
|
|
37
|
-
self.uri = uri
|
|
38
|
-
@max_lines = options[:max_lines] || 65000
|
|
39
|
-
@idx = 0
|
|
40
|
-
@book_idx = 0
|
|
41
|
-
@sht_idx = 0
|
|
42
|
-
unless self.exist?
|
|
43
|
-
make_new_book
|
|
44
|
-
make_new_sheet
|
|
45
|
-
else
|
|
46
|
-
get_existing_book
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
#Returns the data in an existing workbook as an
|
|
51
|
-
#array of arrays. Only capable of reading a single sheet.
|
|
13
|
+
# Return the data in this Excel document as an array of arrays.
|
|
14
|
+
#
|
|
15
|
+
# Data from consecutive worksheets will be concatenated into a
|
|
16
|
+
# single outer array.
|
|
17
|
+
#
|
|
18
|
+
# @return [Array<Array>]
|
|
52
19
|
def load
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
#Raises a 'too many lines' error if the number of lines
|
|
60
|
-
#of data exceeds max_lines.
|
|
61
|
-
def dump data
|
|
62
|
-
data.each do |line|
|
|
63
|
-
raise "too many lines" if too_many?
|
|
64
|
-
self << line
|
|
20
|
+
require 'spreadsheet'
|
|
21
|
+
data = []
|
|
22
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
23
|
+
data += worksheet.map do |row|
|
|
24
|
+
row.to_a
|
|
25
|
+
end
|
|
65
26
|
end
|
|
66
|
-
|
|
27
|
+
data
|
|
67
28
|
end
|
|
68
29
|
|
|
69
|
-
#
|
|
70
|
-
|
|
71
|
-
def << line
|
|
72
|
-
@sheet.row(@sht_row).concat( line )
|
|
73
|
-
@sht_row += 1
|
|
74
|
-
@idx += 1
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
#Instantiates a new Excel workbook in memory. You shouldn't
|
|
78
|
-
#need to call this directly.
|
|
79
|
-
def make_new_book
|
|
80
|
-
@book = Spreadsheet::Workbook.new
|
|
81
|
-
@book_idx += 1
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
#Makes a new worksheet for a pre-existing Excel workbook.
|
|
85
|
-
#This should be called after recovering from the
|
|
86
|
-
#'too many lines' error.
|
|
87
|
-
def make_new_sheet
|
|
88
|
-
@sheet = @book.create_worksheet
|
|
89
|
-
@sht_idx += 1
|
|
90
|
-
@sht_row = 0 #always start at row 0 in a new sheet
|
|
91
|
-
end
|
|
30
|
+
# Gives us goodies! Needs +each+ below.
|
|
31
|
+
include Enumerable
|
|
92
32
|
|
|
93
|
-
#
|
|
94
|
-
#
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
33
|
+
# Yield each row of this Excel document.
|
|
34
|
+
#
|
|
35
|
+
# Will loop from one worksheet to the next.
|
|
36
|
+
#
|
|
37
|
+
# @yield [Spreadsheet::Excel::Row]
|
|
38
|
+
def each &block
|
|
39
|
+
require 'spreadsheet'
|
|
40
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
41
|
+
worksheet.each(&block)
|
|
42
|
+
end
|
|
98
43
|
end
|
|
99
44
|
|
|
100
|
-
#
|
|
101
|
-
#
|
|
102
|
-
|
|
103
|
-
|
|
45
|
+
# Return the number of lines in this Excel document.
|
|
46
|
+
#
|
|
47
|
+
# Measured across worksheets.
|
|
48
|
+
#
|
|
49
|
+
# @return [Integer]
|
|
50
|
+
def num_lines
|
|
51
|
+
require 'spreadsheet'
|
|
52
|
+
Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
|
|
53
|
+
sum += worksheet.row_count
|
|
54
|
+
end
|
|
104
55
|
end
|
|
105
56
|
|
|
106
|
-
#
|
|
107
|
-
#
|
|
108
|
-
def
|
|
109
|
-
|
|
110
|
-
end
|
|
57
|
+
# TODO
|
|
58
|
+
#
|
|
59
|
+
# def emit
|
|
60
|
+
# end
|
|
111
61
|
|
|
112
|
-
#
|
|
113
|
-
#
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
62
|
+
# TODO
|
|
63
|
+
#
|
|
64
|
+
# Extract the following methods from delimited into a module and
|
|
65
|
+
# let both Excel and Delimited use them.
|
|
66
|
+
#
|
|
67
|
+
# Or let Excel include Delimited and let it override
|
|
68
|
+
# appropriately.
|
|
69
|
+
#
|
|
70
|
+
# headers_in_first_line?
|
|
71
|
+
# guess_schema!
|
|
72
|
+
#
|
|
73
|
+
#
|
|
117
74
|
|
|
118
|
-
#
|
|
119
|
-
def
|
|
120
|
-
|
|
75
|
+
#
|
|
76
|
+
def snippet
|
|
77
|
+
require 'spreadsheet'
|
|
78
|
+
returning([]) do |snip|
|
|
79
|
+
row_num = 1
|
|
80
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
|
81
|
+
worksheet.each do |row|
|
|
82
|
+
break if row_num > 10
|
|
83
|
+
snip << row.to_a
|
|
84
|
+
row_num += 1
|
|
85
|
+
end
|
|
86
|
+
break if row_num > 10
|
|
87
|
+
end
|
|
88
|
+
end
|
|
121
89
|
end
|
|
122
90
|
end
|
|
123
91
|
end
|