imw 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
data/README.rdoc CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  = What is the Infinite Monkeywrench?
3
2
 
4
3
  The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
@@ -58,18 +57,18 @@ IMW is centered around processing resources. A resource can be
58
57
  _anything_ with a URI and you create one using IMW.open.
59
58
 
60
59
  csv = IMW.open('/path/to/my_data.csv')
61
- html = IMW.open('http://www.infochimps.com')
60
+ html = IMW.open('http://www.example.com/history/march_2007')
62
61
 
63
62
  IMW dynamically extends a resource with modules appropriate to it when
64
63
  you open it. In the above case, +csv+ would be automatically extended
65
64
  by the IMW::Resources::Formats::Csv module, among others:
66
65
 
67
- csv.resource_modules
66
+ csv.modules
68
67
  => [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
69
68
 
70
69
  while +html+ will use a different set
71
70
 
72
- html.resource_modules
71
+ html.modules
73
72
  => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
74
73
 
75
74
  Consult the documentation for the modules a resource uses to learn
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.7
1
+ 0.2.8
data/lib/imw.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.setup
2
4
  require 'imw/boot'
3
5
  require 'imw/utils'
4
6
 
@@ -8,15 +10,18 @@ require 'imw/utils'
8
10
  # transformations of data as a network of dependencies (a la Make or
9
11
  # Rake).
10
12
  #
11
- # IMW has a few central concepts: resources, datasets, workflows, and
12
- # repositories.
13
+ # IMW has a few central concepts: resources, metadata, datasets,
14
+ # workflows, and repositories.
13
15
  #
14
16
  # Resources represent individual data resources like local files,
15
- # websites, databases, &c. Resources are typically instantiated via
16
- # IMW.open, with IMW doing the work of figuring out what to return
17
+ # websites, databases, &c. An IMW::Resource is typically instantiated
18
+ # via IMW.open, with IMW doing the work of figuring out what to return
17
19
  # based on the URI passed in.
18
20
  #
19
- # Datasets represent collections of related data resources. An
21
+ # A Resource can have a schema which describes the fields in its data.
22
+ # IMW::Metadata consists of classes which describe fields.
23
+ #
24
+ # Datasets represent collections of related data resources. An
20
25
  # IMW::Dataset comes with a pre-defined (but customizable) workflow
21
26
  # that takes data resources through several steps: rip, parse, munge,
22
27
  # and package. The workflow leverages Rake and so the various tasks
@@ -35,6 +40,7 @@ module IMW
35
40
  autoload :Parsers, 'imw/parsers'
36
41
  autoload :Dataset, 'imw/dataset'
37
42
  autoload :Repository, 'imw/repository'
43
+ autoload :Metadata, 'imw/metadata'
38
44
 
39
45
  # Open a resource at the given +uri+. The resource will
40
46
  # automatically be extended by modules which make sense given the
@@ -47,14 +53,23 @@ module IMW
47
53
  #
48
54
  # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
49
55
  # @param [Hash] options
50
- # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_resource!
51
- # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_resource!
56
+ # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
57
+ # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
52
58
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI
53
- def self.open obj, options={}
54
- return obj if obj.is_a?(IMW::Resource)
55
- options[:use_modules] ||= (options[:as] || [])
56
- options[:skip_modules] ||= (options[:without] || [])
57
- IMW::Resource.new(obj, options)
59
+ def self.open obj, options={}, &block
60
+ if obj.is_a?(IMW::Resource)
61
+ resource = obj
62
+ else
63
+ options[:use_modules] ||= (options[:as] || [])
64
+ options[:skip_modules] ||= (options[:without] || [])
65
+ resource = IMW::Resource.new(obj, options)
66
+ end
67
+ if block_given?
68
+ yield resource
69
+ resource.close
70
+ else
71
+ resource
72
+ end
58
73
  end
59
74
 
60
75
  # Works the same way as IMW.open except opens the resource for
@@ -62,8 +77,8 @@ module IMW
62
77
  #
63
78
  # @param [String, Addressable::URI] uri the URI to open
64
79
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
65
- def self.open! uri, options={}
66
- IMW::Resource.new(uri, options.merge(:mode => 'w'))
80
+ def self.open! uri, options={}, &block
81
+ open(uri, options.merge(:mode => 'w'), &block)
67
82
  end
68
83
 
69
84
  # The default repository in which to place datasets. See the
@@ -75,32 +90,41 @@ module IMW
75
90
  @@repository ||= IMW::Repository.new
76
91
  end
77
92
 
78
- # Create a dataset and put it in the default IMW repository. Also
79
- # yields the dataset so you can define its workflow
93
+ # Create a dataset and put it in the default IMW repository.
80
94
  #
81
- # IMW.dataset :my_dataset do
82
- #
83
- # # Define some paths we're going to use
84
- # add_path :raw_data, :ripd, 'raw_data.csv'
85
- # add_path :fixd_data, :fixd, 'fixed_data.csv'
95
+ # Evaluates the given block in the context of the new dataset. This
96
+ # allows you to define tasks, add paths, and use defined metadata in
97
+ # an elegant way.
86
98
  #
87
- # # Copy a file from a website to this dataset's +ripd+ directory.
88
- # rip do
89
- # IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
90
- # end
99
+ # IMW.dataset :my_dataset do
100
+ #
101
+ # # Define some paths we're going to use
102
+ # add_path :original, :rawd, 'original.csv'
103
+ # add_path :filtered, :fixd, 'filtered.csv'
104
+ # add_path :package, :pkgd, 'filtered.tar.bz2'
91
105
  #
92
- # # Filter the raw data to those values which match some criterion defined by <tt>accept?</tt>
93
- # munge do
94
- # IMW.open(path_to(:raw_data)).map do |row|
95
- # row if accept?(row)
96
- # end.compact.dump(path_to(:fixd_data))
97
- # end
106
+ # # Copy a CSV file from a website to this machine.
107
+ # rip do
108
+ # open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
109
+ # end
98
110
  #
99
- # # Compress this new data
100
- # package do
101
- # IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
111
+ # # Filter the original CSV data by the
112
+ # # <tt>meets_some_condition?</tt> method we define elsewhere...
113
+ # munge do
114
+ # open!(path_to(:filtered)) do |filtered|
115
+ # open(path_to(:original)).each do |row|
116
+ # filtered << row if meets_some_condition?(row)
117
+ # end
118
+ # end
119
+ #
120
+ # # Compress the filtered data to an archive.
121
+ # package do
122
+ # open(path_to(:filtered)).compress.mv(path_to(:package))
123
+ # end
102
124
  # end
103
- # end
125
+ #
126
+ # See the <tt>/examples</tt> directory of the IMW distribution for
127
+ # more examples.
104
128
  #
105
129
  # @param [Symbol, String] handle the handle to identify this dataset with
106
130
  # @param [Hash] options a hash of options (see IMW::Dataset)
@@ -112,3 +136,8 @@ module IMW
112
136
  end
113
137
 
114
138
  end
139
+
140
+ # Works just like IMW.dataset but defined at a top-level scope.
141
+ def dataset handle, options={}, &block
142
+ IMW.dataset(handle, options, &block)
143
+ end
data/lib/imw/dataset.rb CHANGED
@@ -96,9 +96,12 @@ module IMW
96
96
  # dataset = IMW::Dataset.new :my_dataset, :repository => repo
97
97
  class Dataset
98
98
 
99
- include IMW::Workflow
99
+ # The handle this dataset goes by. Used for identifying it within
100
+ # a repository.
101
+ attr_accessor :handle
100
102
 
101
- attr_accessor :handle, :options
103
+ # Options for this dataset.
104
+ attr_accessor :options
102
105
 
103
106
  def initialize handle, options = {}
104
107
  @options = options
@@ -111,5 +114,12 @@ module IMW
111
114
  end
112
115
  end
113
116
 
117
+ # Provides this dataset with a workflow of tasks managed by Rake.
118
+ include IMW::Workflow
119
+
120
+ # Provides this dataset with DSL like methods to construct a
121
+ # schema in an IMW file.
122
+ include IMW::Metadata::DSL
123
+
114
124
  end
115
125
  end
data/lib/imw/formats.rb CHANGED
@@ -10,20 +10,22 @@ module IMW
10
10
  autoload :Xhtml, 'imw/formats/sgml'
11
11
  autoload :Rdf, 'imw/formats/sgml'
12
12
  autoload :Yaml, 'imw/formats/yaml'
13
+ autoload :Pdf, 'imw/formats/pdf'
13
14
 
14
15
  # Handlers which augment a resource with data format specific
15
16
  # methods.
16
17
  HANDLERS = [
17
18
  [ "Formats::Csv", /\.csv$/i ],
18
19
  [ "Formats::Tsv", /\.tsv$/i ],
19
- [ "Formats::Excel", /\.xslx?$/i ],
20
+ [ "Formats::Excel", /\.xlsx?$/i ],
20
21
  [ "Formats::Json", /\.json$/i ],
21
22
  [ "Formats::Xml", /\.xml$/i ],
22
23
  [ "Formats::Xsl", /\.xsl$/i ],
23
24
  [ "Formats::Html", /\.html?$/i ],
24
25
  [ "Formats::Xhtml", /\.xhtml?$/i ],
25
26
  [ "Formats::Rdf", /\.rdf?$/i ],
26
- [ "Formats::Yaml", /\.ya?ml$/i ]
27
+ [ "Formats::Yaml", /\.ya?ml$/i ],
28
+ [ "Formats::Pdf", /\.pdf$/i ]
27
29
  ]
28
30
  end
29
31
  end
@@ -11,9 +11,22 @@ module IMW
11
11
  # @abstract
12
12
  module Delimited
13
13
 
14
- include Enumerable
14
+ # Ensure that this delimited resource is described by an
15
+ # ordered collection of flat fields.
16
+ def validate_schema!
17
+ raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
18
+ end
15
19
 
16
- attr_accessor :delimited_settings
20
+ # Default options to be passed to
21
+ # FasterCSV[http://fastercsv.rubyforge.org/]; see its
22
+ # documentation for more information.
23
+ #
24
+ # @return [Hash]
25
+ def delimited_options
26
+ @delimited_options ||= {
27
+ :headers => schema && schema.map { |field| field['name'] }
28
+ }.merge(resource_options_compatible_with_faster_csv)
29
+ end
17
30
 
18
31
  # Return the data in this delimited resource as an array of
19
32
  # arrays.
@@ -27,24 +40,70 @@ module IMW
27
40
  FasterCSV.parse(read, delimited_options, &block)
28
41
  end
29
42
 
43
+ # Gives us goodies! Needs +each+ below.
44
+ include Enumerable
45
+
30
46
  # Call +block+ with each row in this delimited resource.
31
47
  def each &block
32
- load(&block)
48
+ require 'fastercsv'
49
+ FasterCSV.new(io, delimited_options).each(&block)
33
50
  end
34
51
 
35
- # Dump an array of arrays into this resource.
52
+ # Emit a single array or an array of arrays into this resource.
36
53
  #
37
- # @param [Array] data array of arrays to dump
54
+ # @param [Array<Array>, Array] data array or array of arrays to emit
38
55
  # @param [Hash] options
39
- # @option options [true, false] :persist Keep this resource's IO object open after dumping
40
- def dump data, options={}
56
+ # @option options [true, false] :persist Keep this resource's IO object open after emitting
57
+ def emit data, options={}
41
58
  require 'fastercsv'
59
+ data = [data] unless data.first.is_a?(Array)
42
60
  data.each do |row|
43
61
  write(FasterCSV.generate_line(row, delimited_options))
44
62
  end
45
- io.close unless options[:persist]
46
63
  self
47
64
  end
65
+ alias_method :<<, :emit
66
+
67
+ # Do a heuristic check to determine whether or not the first row
68
+ # of this delimited data is a row of headers.
69
+ #
70
+ # @return [true, false]
71
+ def headers_in_first_line?
72
+ # grab the header and up to 10 body rows
73
+ require 'fastercsv'
74
+ copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
75
+ header = (copy.shift || []) rescue []
76
+ body = 10.times.map { (copy.shift || []) rescue []}.flatten
77
+
78
+ # guess how many elements in a row
79
+ #size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
80
+
81
+ # calculate the fraction of bytes that are [-A-Za-z_] (letters +
82
+ # underscore + hyphen) for header and body and compute a
83
+ # threshold determinant
84
+ header_chars = header.map(&:to_s).join
85
+ header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
86
+ body_chars = body.map(&:to_s).join
87
+ body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
88
+ header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
89
+ body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
90
+ determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
91
+
92
+ # decide, setting the threshold at 0.05 based on some guesswork...
93
+ determinant && determinant >= 0.05
94
+ end
95
+
96
+ # If it seems like there are headers in the first line of this
97
+ # data then go ahead and use them to define a schema.
98
+ #
99
+ # Will overwrite a schema already present for this resource.
100
+ def guess_schema!
101
+ return unless headers_in_first_line?
102
+ copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
103
+ names = (copy.shift || []) rescue []
104
+ self.schema = IMW::Metadata::Schema.new(names)
105
+ delimited_options[:headers] = names
106
+ end
48
107
 
49
108
  # Return a 10-line sample of this file.
50
109
  #
@@ -53,52 +112,53 @@ module IMW
53
112
  require 'fastercsv'
54
113
  returning([]) do |rows|
55
114
  row_num = 1
56
- FasterCSV.new(io, delimited_options).each do |row|
115
+ each do |row|
57
116
  break if row_num > 10
58
- rows << row
117
+ rows << row.size.times.map { |index| row[index] }
59
118
  row_num += 1
60
119
  end
61
120
  end
62
121
  end
63
- end
64
122
 
65
- module Csv
66
- include Delimited
123
+ protected
124
+ # An array of option names used by FasterCSV.
125
+ FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
67
126
 
68
- # Default options to be passed to
69
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
70
- # documentation for more information.
127
+ # Return the subset of options this resource was initialized
128
+ # with that are compatible with FasterCSV (it complains when you
129
+ # give it keywords it doesn't know).
71
130
  #
72
131
  # @return [Hash]
132
+ def resource_options_compatible_with_faster_csv
133
+ @compatible_options ||= returning({}) do |compatible_options|
134
+ FASTER_CSV_OPTION_NAMES.each do |option_name|
135
+ compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
136
+ end
137
+ end
138
+ end
139
+ end
140
+
141
+ # A module for working with CSV (comma-separated value) formatted
142
+ # data.
143
+ #
144
+ # @see IMW::Formats::Delimited
145
+ module Csv
146
+ include Delimited
73
147
  def delimited_options
74
- @delimited_options ||= {
75
- :col_sep => ',',
76
- :headers => false,
77
- :return_headers => false,
78
- :write_headers => true,
79
- :skip_blanks => false,
80
- :force_quotes => false
81
- }
148
+ @delimited_options ||= {:col_sep => ","}.merge(super())
82
149
  end
83
150
  end
84
151
 
152
+ # A module for working with TSV (tab-separated value) formatted
153
+ # data.
154
+ #
155
+ # @see IMW::Formats::Delimited
85
156
  module Tsv
86
157
  include Delimited
87
-
88
- # Default options to be passed to
89
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
90
- # documentation for more information.
91
- #
92
- # @return [Hash]
93
158
  def delimited_options
94
159
  @delimited_options ||= {
95
160
  :col_sep => "\t",
96
- :headers => false,
97
- :return_headers => false,
98
- :write_headers => true,
99
- :skip_blanks => false,
100
- :force_quotes => false
101
- }
161
+ }.merge(super())
102
162
  end
103
163
  end
104
164
  end
@@ -4,120 +4,88 @@ module IMW
4
4
  # Defines methods for reading and writing Microsoft Excel data.
5
5
  module Excel
6
6
 
7
- attr_accessor :book, :sheet
8
-
9
- def self.extended obj
10
- if obj.exist?
11
- @book = Spreadsheet.open path
12
- @sheet = book.worksheet(0)
13
-
14
- end
15
- end
16
-
17
-
18
- def book
19
- return @book if @book
20
- if exists?
21
- @book = Spreadsheet.open(path)
22
- else
23
- @book = Spreadsheet::Workbook.new
24
- end
7
+ # Ensure that this Excel resource is described by an ordered
8
+ # collection of flat fields.
9
+ def validate_schema!
10
+ raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
25
11
  end
26
12
 
27
- def sheet
28
- @sheet = @book.create_worksheet
29
- @sheet
30
- end
31
-
32
- #If an Excel file exists at the location specified by uri then
33
- #it is opened and can be read out with a subsequent call to
34
- #load(). Otherwise, a new workbook is created and can be written
35
- #to with the dump() method.
36
- def initialize uri, mode='r', options={}
37
- self.uri = uri
38
- @max_lines = options[:max_lines] || 65000
39
- @idx = 0
40
- @book_idx = 0
41
- @sht_idx = 0
42
- unless self.exist?
43
- make_new_book
44
- make_new_sheet
45
- else
46
- get_existing_book
47
- end
48
- end
49
-
50
- #Returns the data in an existing workbook as an
51
- #array of arrays. Only capable of reading a single sheet.
13
+ # Return the data in this Excel document as an array of arrays.
14
+ #
15
+ # Data from consecutive worksheets will be concatenated into a
16
+ # single outer array.
17
+ #
18
+ # @return [Array<Array>]
52
19
  def load
53
- @sheet.map{|row| row.to_a}
54
- end
55
-
56
- #Dumps data, which is assumed to be an array of arrays, to a
57
- #newly created Excel workbook. Attempting to dump to a book
58
- #that already exists will typically result in file corruption.
59
- #Raises a 'too many lines' error if the number of lines
60
- #of data exceeds max_lines.
61
- def dump data
62
- data.each do |line|
63
- raise "too many lines" if too_many?
64
- self << line
20
+ require 'spreadsheet'
21
+ data = []
22
+ Spreadsheet.open(path).worksheets.each do |worksheet|
23
+ data += worksheet.map do |row|
24
+ row.to_a
25
+ end
65
26
  end
66
- save unless no_data?
27
+ data
67
28
  end
68
29
 
69
- #Processes a single line of data and updates internal variables.
70
- #You shouldn't need to call this directly.
71
- def << line
72
- @sheet.row(@sht_row).concat( line )
73
- @sht_row += 1
74
- @idx += 1
75
- end
76
-
77
- #Instantiates a new Excel workbook in memory. You shouldn't
78
- #need to call this directly.
79
- def make_new_book
80
- @book = Spreadsheet::Workbook.new
81
- @book_idx += 1
82
- end
83
-
84
- #Makes a new worksheet for a pre-existing Excel workbook.
85
- #This should be called after recovering from the
86
- #'too many lines' error.
87
- def make_new_sheet
88
- @sheet = @book.create_worksheet
89
- @sht_idx += 1
90
- @sht_row = 0 #always start at row 0 in a new sheet
91
- end
30
+ # Gives us goodies! Needs +each+ below.
31
+ include Enumerable
92
32
 
93
- #Opens an existing Excel workbook. You shoudn't need to
94
- #call this directly.
95
- def get_existing_book
96
- @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
97
- @sht_idx += 1
33
+ # Yield each row of this Excel document.
34
+ #
35
+ # Will loop from one worksheet to the next.
36
+ #
37
+ # @yield [Spreadsheet::Excel::Row]
38
+ def each &block
39
+ require 'spreadsheet'
40
+ Spreadsheet.open(path).worksheets.each do |worksheet|
41
+ worksheet.each(&block)
42
+ end
98
43
  end
99
44
 
100
- #Increments the current sheet to the next one in
101
- #an open book. Not necessary at the moment.
102
- def incr_sheet
103
- @sheet = book.worksheet @sht_idx
45
+ # Return the number of lines in this Excel document.
46
+ #
47
+ # Measured across worksheets.
48
+ #
49
+ # @return [Integer]
50
+ def num_lines
51
+ require 'spreadsheet'
52
+ Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
53
+ sum += worksheet.row_count
54
+ end
104
55
  end
105
56
 
106
- #There are too many lines if the number of rows attempting
107
- #to be written exceeds max_lines.
108
- def too_many?
109
- @sht_row >= @max_lines
110
- end
57
+ # TODO
58
+ #
59
+ # def emit
60
+ # end
111
61
 
112
- #There is no data if the number of rows attempting to be written
113
- #is zero.
114
- def no_data?
115
- @sht_row == 0
116
- end
62
+ # TODO
63
+ #
64
+ # Extract the following methods from delimited into a module and
65
+ # let both Excel and Delimited use them.
66
+ #
67
+ # Or let Excel include Delimited and let it override
68
+ # appropriately.
69
+ #
70
+ # headers_in_first_line?
71
+ # guess_schema!
72
+ #
73
+ #
117
74
 
118
- #Saves the workbook.
119
- def save
120
- @book.write path
75
+ #
76
+ def snippet
77
+ require 'spreadsheet'
78
+ returning([]) do |snip|
79
+ row_num = 1
80
+ Spreadsheet.open(path).worksheets.each do |worksheet|
81
+ worksheet.each do |row|
82
+ break if row_num > 10
83
+ snip << row.to_a
84
+ row_num += 1
85
+ end
86
+ break if row_num > 10
87
+ end
88
+ end
121
89
  end
122
90
  end
123
91
  end