imw 0.2.7 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +23 -0
- data/Gemfile.lock +47 -0
- data/LICENSE +20 -674
- data/README.rdoc +3 -4
- data/VERSION +1 -1
- data/lib/imw.rb +64 -35
- data/lib/imw/dataset.rb +12 -2
- data/lib/imw/formats.rb +4 -2
- data/lib/imw/formats/delimited.rb +96 -36
- data/lib/imw/formats/excel.rb +69 -101
- data/lib/imw/formats/json.rb +3 -5
- data/lib/imw/formats/pdf.rb +71 -0
- data/lib/imw/formats/yaml.rb +3 -5
- data/lib/imw/metadata.rb +66 -0
- data/lib/imw/metadata/contains_metadata.rb +44 -0
- data/lib/imw/metadata/dsl.rb +111 -0
- data/lib/imw/metadata/field.rb +65 -0
- data/lib/imw/metadata/schema.rb +227 -0
- data/lib/imw/metadata/schematized.rb +27 -0
- data/lib/imw/parsers.rb +1 -0
- data/lib/imw/parsers/flat.rb +44 -0
- data/lib/imw/resource.rb +36 -224
- data/lib/imw/schemes.rb +3 -1
- data/lib/imw/schemes/hdfs.rb +12 -1
- data/lib/imw/schemes/http.rb +1 -2
- data/lib/imw/schemes/local.rb +139 -16
- data/lib/imw/schemes/remote.rb +14 -9
- data/lib/imw/schemes/s3.rb +12 -0
- data/lib/imw/schemes/sql.rb +117 -0
- data/lib/imw/tools.rb +5 -3
- data/lib/imw/tools/downloader.rb +63 -0
- data/lib/imw/tools/summarizer.rb +21 -10
- data/lib/imw/utils.rb +10 -0
- data/lib/imw/utils/dynamically_extendable.rb +137 -0
- data/lib/imw/utils/error.rb +3 -0
- data/lib/imw/utils/extensions.rb +0 -4
- data/lib/imw/utils/extensions/array.rb +6 -7
- data/lib/imw/utils/extensions/hash.rb +3 -5
- data/lib/imw/utils/extensions/string.rb +3 -3
- data/lib/imw/utils/has_uri.rb +114 -0
- data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
- data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +1 -0
- data/spec/data/formats/none/sample +650 -0
- data/spec/data/formats/sgml/sample.xml +617 -0
- data/spec/data/formats/text/sample.txt +650 -0
- data/spec/data/formats/yaml/sample.yaml +410 -0
- data/spec/data/schema-tabular.yaml +11 -0
- data/spec/imw/formats/delimited_spec.rb +34 -2
- data/spec/imw/formats/excel_spec.rb +55 -0
- data/spec/imw/formats/json_spec.rb +3 -3
- data/spec/imw/formats/sgml_spec.rb +4 -4
- data/spec/imw/formats/yaml_spec.rb +3 -3
- data/spec/imw/metadata/field_spec.rb +26 -0
- data/spec/imw/metadata/schema_spec.rb +27 -0
- data/spec/imw/metadata_spec.rb +39 -0
- data/spec/imw/parsers/line_parser_spec.rb +1 -1
- data/spec/imw/resource_spec.rb +0 -100
- data/spec/imw/schemes/hdfs_spec.rb +19 -13
- data/spec/imw/schemes/local_spec.rb +59 -3
- data/spec/imw/schemes/s3_spec.rb +4 -0
- data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
- data/spec/imw/utils/has_uri_spec.rb +55 -0
- data/spec/spec_helper.rb +1 -2
- data/spec/support/random.rb +4 -4
- metadata +58 -17
- data/CHANGELOG +0 -0
- data/TODO +0 -18
- data/spec/data/sample.json +0 -782
- data/spec/data/sample.txt +0 -131
- data/spec/data/sample.xml +0 -653
- data/spec/data/sample.yaml +0 -651
- data/spec/spec.opts +0 -4
- data/spec/support/extensions.rb +0 -18
data/README.rdoc
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
= What is the Infinite Monkeywrench?
|
3
2
|
|
4
3
|
The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
|
@@ -58,18 +57,18 @@ IMW is centered around processing resources. A resource can be
|
|
58
57
|
_anything_ with a URI and you create one using IMW.open.
|
59
58
|
|
60
59
|
csv = IMW.open('/path/to/my_data.csv')
|
61
|
-
html = IMW.open('http://www.
|
60
|
+
html = IMW.open('http://www.example.com/history/march_2007')
|
62
61
|
|
63
62
|
IMW dynamically extends a resource with modules appropriate to it when
|
64
63
|
you open it. In the above case, +csv+ would be automatically extended
|
65
64
|
by the IMW::Resources::Formats::Csv module, among others:
|
66
65
|
|
67
|
-
csv.
|
66
|
+
csv.modules
|
68
67
|
=> [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
|
69
68
|
|
70
69
|
while +html+ will use a different set
|
71
70
|
|
72
|
-
html.
|
71
|
+
html.modules
|
73
72
|
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
74
73
|
|
75
74
|
Consult the documentation for the modules a resource uses to learn
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.8
|
data/lib/imw.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
Bundler.setup
|
2
4
|
require 'imw/boot'
|
3
5
|
require 'imw/utils'
|
4
6
|
|
@@ -8,15 +10,18 @@ require 'imw/utils'
|
|
8
10
|
# transformations of data as a network of dependencies (a la Make or
|
9
11
|
# Rake).
|
10
12
|
#
|
11
|
-
# IMW has a few central concepts: resources,
|
12
|
-
# repositories.
|
13
|
+
# IMW has a few central concepts: resources, metadata, datasets,
|
14
|
+
# workflows, and repositories.
|
13
15
|
#
|
14
16
|
# Resources represent individual data resources like local files,
|
15
|
-
# websites, databases, &c.
|
16
|
-
# IMW.open, with IMW doing the work of figuring out what to return
|
17
|
+
# websites, databases, &c. An IMW::Resource is typically instantiated
|
18
|
+
# via IMW.open, with IMW doing the work of figuring out what to return
|
17
19
|
# based on the URI passed in.
|
18
20
|
#
|
19
|
-
#
|
21
|
+
# A Resource can have a schema which describes the fields in its data.
|
22
|
+
# IMW::Metadata consists of classes which describe fields.
|
23
|
+
#
|
24
|
+
# Datasets represent collections of related data resources. An
|
20
25
|
# IMW::Dataset comes with a pre-defined (but customizable) workflow
|
21
26
|
# that takes data resources through several steps: rip, parse, munge,
|
22
27
|
# and package. The workflow leverages Rake and so the various tasks
|
@@ -35,6 +40,7 @@ module IMW
|
|
35
40
|
autoload :Parsers, 'imw/parsers'
|
36
41
|
autoload :Dataset, 'imw/dataset'
|
37
42
|
autoload :Repository, 'imw/repository'
|
43
|
+
autoload :Metadata, 'imw/metadata'
|
38
44
|
|
39
45
|
# Open a resource at the given +uri+. The resource will
|
40
46
|
# automatically be extended by modules which make sense given the
|
@@ -47,14 +53,23 @@ module IMW
|
|
47
53
|
#
|
48
54
|
# @param [String, Addressable::URI, IMW::Resource] obj the URI to open
|
49
55
|
# @param [Hash] options
|
50
|
-
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.
|
51
|
-
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.
|
56
|
+
# @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
|
57
|
+
# @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
|
52
58
|
# @return [IMW::Resource] the resulting resource, properly extended for the given URI
|
53
|
-
def self.open obj, options={}
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
59
|
+
def self.open obj, options={}, &block
|
60
|
+
if obj.is_a?(IMW::Resource)
|
61
|
+
resource = obj
|
62
|
+
else
|
63
|
+
options[:use_modules] ||= (options[:as] || [])
|
64
|
+
options[:skip_modules] ||= (options[:without] || [])
|
65
|
+
resource = IMW::Resource.new(obj, options)
|
66
|
+
end
|
67
|
+
if block_given?
|
68
|
+
yield resource
|
69
|
+
resource.close
|
70
|
+
else
|
71
|
+
resource
|
72
|
+
end
|
58
73
|
end
|
59
74
|
|
60
75
|
# Works the same way as IMW.open except opens the resource for
|
@@ -62,8 +77,8 @@ module IMW
|
|
62
77
|
#
|
63
78
|
# @param [String, Addressable::URI] uri the URI to open
|
64
79
|
# @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
|
65
|
-
def self.open! uri, options={}
|
66
|
-
|
80
|
+
def self.open! uri, options={}, &block
|
81
|
+
open(uri, options.merge(:mode => 'w'), &block)
|
67
82
|
end
|
68
83
|
|
69
84
|
# The default repository in which to place datasets. See the
|
@@ -75,32 +90,41 @@ module IMW
|
|
75
90
|
@@repository ||= IMW::Repository.new
|
76
91
|
end
|
77
92
|
|
78
|
-
# Create a dataset and put it in the default IMW repository.
|
79
|
-
# yields the dataset so you can define its workflow
|
93
|
+
# Create a dataset and put it in the default IMW repository.
|
80
94
|
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
# add_path :raw_data, :ripd, 'raw_data.csv'
|
85
|
-
# add_path :fixd_data, :fixd, 'fixed_data.csv'
|
95
|
+
# Evaluates the given block in the context of the new dataset. This
|
96
|
+
# allows you to define tasks, add paths, and use defined metadata in
|
97
|
+
# an elegant way.
|
86
98
|
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
99
|
+
# IMW.dataset :my_dataset do
|
100
|
+
#
|
101
|
+
# # Define some paths we're going to use
|
102
|
+
# add_path :original, :rawd, 'original.csv'
|
103
|
+
# add_path :filtered, :fixd, 'filtered.csv'
|
104
|
+
# add_path :package, :pkgd, 'filtered.tar.bz2'
|
91
105
|
#
|
92
|
-
#
|
93
|
-
#
|
94
|
-
#
|
95
|
-
#
|
96
|
-
# end.compact.dump(path_to(:fixd_data))
|
97
|
-
# end
|
106
|
+
# # Copy a CSV file from a website to this machine.
|
107
|
+
# rip do
|
108
|
+
# open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
|
109
|
+
# end
|
98
110
|
#
|
99
|
-
#
|
100
|
-
#
|
101
|
-
#
|
111
|
+
# # Filter the original CSV data by the
|
112
|
+
# # <tt>meets_some_condition?</tt> method we define elsewhere...
|
113
|
+
# munge do
|
114
|
+
# open!(path_to(:filtered)) do |filtered|
|
115
|
+
# open(path_to(:original)).each do |row|
|
116
|
+
# filtered << row if meets_some_condition?(row)
|
117
|
+
# end
|
118
|
+
# end
|
119
|
+
#
|
120
|
+
# # Compress the filtered data to an archive.
|
121
|
+
# package do
|
122
|
+
# open(path_to(:filtered)).compress.mv(path_to(:package))
|
123
|
+
# end
|
102
124
|
# end
|
103
|
-
#
|
125
|
+
#
|
126
|
+
# See the <tt>/examples</tt> directory of the IMW distribution for
|
127
|
+
# more examples.
|
104
128
|
#
|
105
129
|
# @param [Symbol, String] handle the handle to identify this dataset with
|
106
130
|
# @param [Hash] options a hash of options (see IMW::Dataset)
|
@@ -112,3 +136,8 @@ module IMW
|
|
112
136
|
end
|
113
137
|
|
114
138
|
end
|
139
|
+
|
140
|
+
# Works just like IMW.dataset but defined at a top-level scope.
|
141
|
+
def dataset handle, options={}, &block
|
142
|
+
IMW.dataset(handle, options, &block)
|
143
|
+
end
|
data/lib/imw/dataset.rb
CHANGED
@@ -96,9 +96,12 @@ module IMW
|
|
96
96
|
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
|
97
97
|
class Dataset
|
98
98
|
|
99
|
-
|
99
|
+
# The handle this dataset goes by. Used for identifying it within
|
100
|
+
# a repository.
|
101
|
+
attr_accessor :handle
|
100
102
|
|
101
|
-
|
103
|
+
# Options for this dataset.
|
104
|
+
attr_accessor :options
|
102
105
|
|
103
106
|
def initialize handle, options = {}
|
104
107
|
@options = options
|
@@ -111,5 +114,12 @@ module IMW
|
|
111
114
|
end
|
112
115
|
end
|
113
116
|
|
117
|
+
# Provides this dataset with a workflow of tasks managed by Rake.
|
118
|
+
include IMW::Workflow
|
119
|
+
|
120
|
+
# Provides this dataset with DSL like methods to construct a
|
121
|
+
# schema in an IMW file.
|
122
|
+
include IMW::Metadata::DSL
|
123
|
+
|
114
124
|
end
|
115
125
|
end
|
data/lib/imw/formats.rb
CHANGED
@@ -10,20 +10,22 @@ module IMW
|
|
10
10
|
autoload :Xhtml, 'imw/formats/sgml'
|
11
11
|
autoload :Rdf, 'imw/formats/sgml'
|
12
12
|
autoload :Yaml, 'imw/formats/yaml'
|
13
|
+
autoload :Pdf, 'imw/formats/pdf'
|
13
14
|
|
14
15
|
# Handlers which augment a resource with data format specific
|
15
16
|
# methods.
|
16
17
|
HANDLERS = [
|
17
18
|
[ "Formats::Csv", /\.csv$/i ],
|
18
19
|
[ "Formats::Tsv", /\.tsv$/i ],
|
19
|
-
[ "Formats::Excel", /\.
|
20
|
+
[ "Formats::Excel", /\.xlsx?$/i ],
|
20
21
|
[ "Formats::Json", /\.json$/i ],
|
21
22
|
[ "Formats::Xml", /\.xml$/i ],
|
22
23
|
[ "Formats::Xsl", /\.xsl$/i ],
|
23
24
|
[ "Formats::Html", /\.html?$/i ],
|
24
25
|
[ "Formats::Xhtml", /\.xhtml?$/i ],
|
25
26
|
[ "Formats::Rdf", /\.rdf?$/i ],
|
26
|
-
[ "Formats::Yaml", /\.ya?ml$/i ]
|
27
|
+
[ "Formats::Yaml", /\.ya?ml$/i ],
|
28
|
+
[ "Formats::Pdf", /\.pdf$/i ]
|
27
29
|
]
|
28
30
|
end
|
29
31
|
end
|
@@ -11,9 +11,22 @@ module IMW
|
|
11
11
|
# @abstract
|
12
12
|
module Delimited
|
13
13
|
|
14
|
-
|
14
|
+
# Ensure that this delimited resource is described by an
|
15
|
+
# ordered collection of flat fields.
|
16
|
+
def validate_schema!
|
17
|
+
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
18
|
+
end
|
15
19
|
|
16
|
-
|
20
|
+
# Default options to be passed to
|
21
|
+
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
22
|
+
# documentation for more information.
|
23
|
+
#
|
24
|
+
# @return [Hash]
|
25
|
+
def delimited_options
|
26
|
+
@delimited_options ||= {
|
27
|
+
:headers => schema && schema.map { |field| field['name'] }
|
28
|
+
}.merge(resource_options_compatible_with_faster_csv)
|
29
|
+
end
|
17
30
|
|
18
31
|
# Return the data in this delimited resource as an array of
|
19
32
|
# arrays.
|
@@ -27,24 +40,70 @@ module IMW
|
|
27
40
|
FasterCSV.parse(read, delimited_options, &block)
|
28
41
|
end
|
29
42
|
|
43
|
+
# Gives us goodies! Needs +each+ below.
|
44
|
+
include Enumerable
|
45
|
+
|
30
46
|
# Call +block+ with each row in this delimited resource.
|
31
47
|
def each &block
|
32
|
-
|
48
|
+
require 'fastercsv'
|
49
|
+
FasterCSV.new(io, delimited_options).each(&block)
|
33
50
|
end
|
34
51
|
|
35
|
-
#
|
52
|
+
# Emit a single array or an array of arrays into this resource.
|
36
53
|
#
|
37
|
-
# @param [Array] data array of arrays to
|
54
|
+
# @param [Array<Array>, Array] data array or array of arrays to emit
|
38
55
|
# @param [Hash] options
|
39
|
-
# @option options [true, false] :persist Keep this resource's IO object open after
|
40
|
-
def
|
56
|
+
# @option options [true, false] :persist Keep this resource's IO object open after emitting
|
57
|
+
def emit data, options={}
|
41
58
|
require 'fastercsv'
|
59
|
+
data = [data] unless data.first.is_a?(Array)
|
42
60
|
data.each do |row|
|
43
61
|
write(FasterCSV.generate_line(row, delimited_options))
|
44
62
|
end
|
45
|
-
io.close unless options[:persist]
|
46
63
|
self
|
47
64
|
end
|
65
|
+
alias_method :<<, :emit
|
66
|
+
|
67
|
+
# Do a heuristic check to determine whether or not the first row
|
68
|
+
# of this delimited data is a row of headers.
|
69
|
+
#
|
70
|
+
# @return [true, false]
|
71
|
+
def headers_in_first_line?
|
72
|
+
# grab the header and up to 10 body rows
|
73
|
+
require 'fastercsv'
|
74
|
+
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
75
|
+
header = (copy.shift || []) rescue []
|
76
|
+
body = 10.times.map { (copy.shift || []) rescue []}.flatten
|
77
|
+
|
78
|
+
# guess how many elements in a row
|
79
|
+
#size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
|
80
|
+
|
81
|
+
# calculate the fraction of bytes that are [-A-Za-z_] (letters +
|
82
|
+
# underscore + hyphen) for header and body and compute a
|
83
|
+
# threshold determinant
|
84
|
+
header_chars = header.map(&:to_s).join
|
85
|
+
header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
86
|
+
body_chars = body.map(&:to_s).join
|
87
|
+
body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
|
88
|
+
header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
|
89
|
+
body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
|
90
|
+
determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
|
91
|
+
|
92
|
+
# decide, setting the threshold at 0.05 based on some guesswork...
|
93
|
+
determinant && determinant >= 0.05
|
94
|
+
end
|
95
|
+
|
96
|
+
# If it seems like there are headers in the first line of this
|
97
|
+
# data then go ahead and use them to define a schema.
|
98
|
+
#
|
99
|
+
# Will overwrite a schema already present for this resource.
|
100
|
+
def guess_schema!
|
101
|
+
return unless headers_in_first_line?
|
102
|
+
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
103
|
+
names = (copy.shift || []) rescue []
|
104
|
+
self.schema = IMW::Metadata::Schema.new(names)
|
105
|
+
delimited_options[:headers] = names
|
106
|
+
end
|
48
107
|
|
49
108
|
# Return a 10-line sample of this file.
|
50
109
|
#
|
@@ -53,52 +112,53 @@ module IMW
|
|
53
112
|
require 'fastercsv'
|
54
113
|
returning([]) do |rows|
|
55
114
|
row_num = 1
|
56
|
-
|
115
|
+
each do |row|
|
57
116
|
break if row_num > 10
|
58
|
-
rows << row
|
117
|
+
rows << row.size.times.map { |index| row[index] }
|
59
118
|
row_num += 1
|
60
119
|
end
|
61
120
|
end
|
62
121
|
end
|
63
|
-
end
|
64
122
|
|
65
|
-
|
66
|
-
|
123
|
+
protected
|
124
|
+
# An array of option names used by FasterCSV.
|
125
|
+
FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
|
67
126
|
|
68
|
-
#
|
69
|
-
# FasterCSV
|
70
|
-
#
|
127
|
+
# Return the subset of options this resource was initialized
|
128
|
+
# with that are compatible with FasterCSV (it complains when you
|
129
|
+
# give it keywords it doesn't know).
|
71
130
|
#
|
72
131
|
# @return [Hash]
|
132
|
+
def resource_options_compatible_with_faster_csv
|
133
|
+
@compatible_options ||= returning({}) do |compatible_options|
|
134
|
+
FASTER_CSV_OPTION_NAMES.each do |option_name|
|
135
|
+
compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|
141
|
+
# A module for working with CSV (comma-separated value) formatted
|
142
|
+
# data.
|
143
|
+
#
|
144
|
+
# @see IMW::Formats::Delimited
|
145
|
+
module Csv
|
146
|
+
include Delimited
|
73
147
|
def delimited_options
|
74
|
-
@delimited_options ||= {
|
75
|
-
:col_sep => ',',
|
76
|
-
:headers => false,
|
77
|
-
:return_headers => false,
|
78
|
-
:write_headers => true,
|
79
|
-
:skip_blanks => false,
|
80
|
-
:force_quotes => false
|
81
|
-
}
|
148
|
+
@delimited_options ||= {:col_sep => ","}.merge(super())
|
82
149
|
end
|
83
150
|
end
|
84
151
|
|
152
|
+
# A module for working with TSV (tab-separated value) formatted
|
153
|
+
# data.
|
154
|
+
#
|
155
|
+
# @see IMW::Formats::Delimited
|
85
156
|
module Tsv
|
86
157
|
include Delimited
|
87
|
-
|
88
|
-
# Default options to be passed to
|
89
|
-
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
90
|
-
# documentation for more information.
|
91
|
-
#
|
92
|
-
# @return [Hash]
|
93
158
|
def delimited_options
|
94
159
|
@delimited_options ||= {
|
95
160
|
:col_sep => "\t",
|
96
|
-
|
97
|
-
:return_headers => false,
|
98
|
-
:write_headers => true,
|
99
|
-
:skip_blanks => false,
|
100
|
-
:force_quotes => false
|
101
|
-
}
|
161
|
+
}.merge(super())
|
102
162
|
end
|
103
163
|
end
|
104
164
|
end
|
data/lib/imw/formats/excel.rb
CHANGED
@@ -4,120 +4,88 @@ module IMW
|
|
4
4
|
# Defines methods for reading and writing Microsoft Excel data.
|
5
5
|
module Excel
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
def
|
10
|
-
if
|
11
|
-
@book = Spreadsheet.open path
|
12
|
-
@sheet = book.worksheet(0)
|
13
|
-
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def book
|
19
|
-
return @book if @book
|
20
|
-
if exists?
|
21
|
-
@book = Spreadsheet.open(path)
|
22
|
-
else
|
23
|
-
@book = Spreadsheet::Workbook.new
|
24
|
-
end
|
7
|
+
# Ensure that this Excel resource is described by an ordered
|
8
|
+
# collection of flat fields.
|
9
|
+
def validate_schema!
|
10
|
+
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
25
11
|
end
|
26
12
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
#
|
33
|
-
#it is opened and can be read out with a subsequent call to
|
34
|
-
#load(). Otherwise, a new workbook is created and can be written
|
35
|
-
#to with the dump() method.
|
36
|
-
def initialize uri, mode='r', options={}
|
37
|
-
self.uri = uri
|
38
|
-
@max_lines = options[:max_lines] || 65000
|
39
|
-
@idx = 0
|
40
|
-
@book_idx = 0
|
41
|
-
@sht_idx = 0
|
42
|
-
unless self.exist?
|
43
|
-
make_new_book
|
44
|
-
make_new_sheet
|
45
|
-
else
|
46
|
-
get_existing_book
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
#Returns the data in an existing workbook as an
|
51
|
-
#array of arrays. Only capable of reading a single sheet.
|
13
|
+
# Return the data in this Excel document as an array of arrays.
|
14
|
+
#
|
15
|
+
# Data from consecutive worksheets will be concatenated into a
|
16
|
+
# single outer array.
|
17
|
+
#
|
18
|
+
# @return [Array<Array>]
|
52
19
|
def load
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
#Raises a 'too many lines' error if the number of lines
|
60
|
-
#of data exceeds max_lines.
|
61
|
-
def dump data
|
62
|
-
data.each do |line|
|
63
|
-
raise "too many lines" if too_many?
|
64
|
-
self << line
|
20
|
+
require 'spreadsheet'
|
21
|
+
data = []
|
22
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
23
|
+
data += worksheet.map do |row|
|
24
|
+
row.to_a
|
25
|
+
end
|
65
26
|
end
|
66
|
-
|
27
|
+
data
|
67
28
|
end
|
68
29
|
|
69
|
-
#
|
70
|
-
|
71
|
-
def << line
|
72
|
-
@sheet.row(@sht_row).concat( line )
|
73
|
-
@sht_row += 1
|
74
|
-
@idx += 1
|
75
|
-
end
|
76
|
-
|
77
|
-
#Instantiates a new Excel workbook in memory. You shouldn't
|
78
|
-
#need to call this directly.
|
79
|
-
def make_new_book
|
80
|
-
@book = Spreadsheet::Workbook.new
|
81
|
-
@book_idx += 1
|
82
|
-
end
|
83
|
-
|
84
|
-
#Makes a new worksheet for a pre-existing Excel workbook.
|
85
|
-
#This should be called after recovering from the
|
86
|
-
#'too many lines' error.
|
87
|
-
def make_new_sheet
|
88
|
-
@sheet = @book.create_worksheet
|
89
|
-
@sht_idx += 1
|
90
|
-
@sht_row = 0 #always start at row 0 in a new sheet
|
91
|
-
end
|
30
|
+
# Gives us goodies! Needs +each+ below.
|
31
|
+
include Enumerable
|
92
32
|
|
93
|
-
#
|
94
|
-
#
|
95
|
-
|
96
|
-
|
97
|
-
|
33
|
+
# Yield each row of this Excel document.
|
34
|
+
#
|
35
|
+
# Will loop from one worksheet to the next.
|
36
|
+
#
|
37
|
+
# @yield [Spreadsheet::Excel::Row]
|
38
|
+
def each &block
|
39
|
+
require 'spreadsheet'
|
40
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
41
|
+
worksheet.each(&block)
|
42
|
+
end
|
98
43
|
end
|
99
44
|
|
100
|
-
#
|
101
|
-
#
|
102
|
-
|
103
|
-
|
45
|
+
# Return the number of lines in this Excel document.
|
46
|
+
#
|
47
|
+
# Measured across worksheets.
|
48
|
+
#
|
49
|
+
# @return [Integer]
|
50
|
+
def num_lines
|
51
|
+
require 'spreadsheet'
|
52
|
+
Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
|
53
|
+
sum += worksheet.row_count
|
54
|
+
end
|
104
55
|
end
|
105
56
|
|
106
|
-
#
|
107
|
-
#
|
108
|
-
def
|
109
|
-
|
110
|
-
end
|
57
|
+
# TODO
|
58
|
+
#
|
59
|
+
# def emit
|
60
|
+
# end
|
111
61
|
|
112
|
-
#
|
113
|
-
#
|
114
|
-
|
115
|
-
|
116
|
-
|
62
|
+
# TODO
|
63
|
+
#
|
64
|
+
# Extract the following methods from delimited into a module and
|
65
|
+
# let both Excel and Delimited use them.
|
66
|
+
#
|
67
|
+
# Or let Excel include Delimited and let it override
|
68
|
+
# appropriately.
|
69
|
+
#
|
70
|
+
# headers_in_first_line?
|
71
|
+
# guess_schema!
|
72
|
+
#
|
73
|
+
#
|
117
74
|
|
118
|
-
#
|
119
|
-
def
|
120
|
-
|
75
|
+
#
|
76
|
+
def snippet
|
77
|
+
require 'spreadsheet'
|
78
|
+
returning([]) do |snip|
|
79
|
+
row_num = 1
|
80
|
+
Spreadsheet.open(path).worksheets.each do |worksheet|
|
81
|
+
worksheet.each do |row|
|
82
|
+
break if row_num > 10
|
83
|
+
snip << row.to_a
|
84
|
+
row_num += 1
|
85
|
+
end
|
86
|
+
break if row_num > 10
|
87
|
+
end
|
88
|
+
end
|
121
89
|
end
|
122
90
|
end
|
123
91
|
end
|