imw 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
data/README.rdoc CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  = What is the Infinite Monkeywrench?
3
2
 
4
3
  The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
@@ -58,18 +57,18 @@ IMW is centered around processing resources. A resource can be
58
57
  _anything_ with a URI and you create one using IMW.open.
59
58
 
60
59
  csv = IMW.open('/path/to/my_data.csv')
61
- html = IMW.open('http://www.infochimps.com')
60
+ html = IMW.open('http://www.example.com/history/march_2007')
62
61
 
63
62
  IMW dynamically extends a resource with modules appropriate to it when
64
63
  you open it. In the above case, +csv+ would be automatically extended
65
64
  by the IMW::Formats::Csv module, among others:
66
65
 
67
- csv.resource_modules
66
+ csv.modules
68
67
  => [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
69
68
 
70
69
  while +html+ will use a different set
71
70
 
72
- html.resource_modules
71
+ html.modules
73
72
  => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
74
73
 
75
74
  Consult the documentation for the modules a resource uses to learn
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.7
1
+ 0.2.8
data/lib/imw.rb CHANGED
@@ -1,4 +1,6 @@
1
1
  require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.setup
2
4
  require 'imw/boot'
3
5
  require 'imw/utils'
4
6
 
@@ -8,15 +10,18 @@ require 'imw/utils'
8
10
  # transformations of data as a network of dependencies (a la Make or
9
11
  # Rake).
10
12
  #
11
- # IMW has a few central concepts: resources, datasets, workflows, and
12
- # repositories.
13
+ # IMW has a few central concepts: resources, metadata, datasets,
14
+ # workflows, and repositories.
13
15
  #
14
16
  # Resources represent individual data resources like local files,
15
- # websites, databases, &c. Resources are typically instantiated via
16
- # IMW.open, with IMW doing the work of figuring out what to return
17
+ # websites, databases, &c. An IMW::Resource is typically instantiated
18
+ # via IMW.open, with IMW doing the work of figuring out what to return
17
19
  # based on the URI passed in.
18
20
  #
19
- # Datasets represent collections of related data resources. An
21
+ # A Resource can have a schema which describes the fields in its data.
22
+ # IMW::Metadata consists of classes which describe fields.
23
+ #
24
+ # Datasets represent collections of related data resources. An
20
25
  # IMW::Dataset comes with a pre-defined (but customizable) workflow
21
26
  # that takes data resources through several steps: rip, parse, munge,
22
27
  # and package. The workflow leverages Rake and so the various tasks
@@ -35,6 +40,7 @@ module IMW
35
40
  autoload :Parsers, 'imw/parsers'
36
41
  autoload :Dataset, 'imw/dataset'
37
42
  autoload :Repository, 'imw/repository'
43
+ autoload :Metadata, 'imw/metadata'
38
44
 
39
45
  # Open a resource at the given +uri+. The resource will
40
46
  # automatically be extended by modules which make sense given the
@@ -47,14 +53,23 @@ module IMW
47
53
  #
48
54
  # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
49
55
  # @param [Hash] options
50
- # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_resource!
51
- # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_resource!
56
+ # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
57
+ # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
52
58
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI
53
- def self.open obj, options={}
54
- return obj if obj.is_a?(IMW::Resource)
55
- options[:use_modules] ||= (options[:as] || [])
56
- options[:skip_modules] ||= (options[:without] || [])
57
- IMW::Resource.new(obj, options)
59
+ def self.open obj, options={}, &block
60
+ if obj.is_a?(IMW::Resource)
61
+ resource = obj
62
+ else
63
+ options[:use_modules] ||= (options[:as] || [])
64
+ options[:skip_modules] ||= (options[:without] || [])
65
+ resource = IMW::Resource.new(obj, options)
66
+ end
67
+ if block_given?
68
+ yield resource
69
+ resource.close
70
+ else
71
+ resource
72
+ end
58
73
  end
59
74
 
60
75
  # Works the same way as IMW.open except opens the resource for
@@ -62,8 +77,8 @@ module IMW
62
77
  #
63
78
  # @param [String, Addressable::URI] uri the URI to open
64
79
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
65
- def self.open! uri, options={}
66
- IMW::Resource.new(uri, options.merge(:mode => 'w'))
80
+ def self.open! uri, options={}, &block
81
+ open(uri, options.merge(:mode => 'w'), &block)
67
82
  end
68
83
 
69
84
  # The default repository in which to place datasets. See the
@@ -75,32 +90,41 @@ module IMW
75
90
  @@repository ||= IMW::Repository.new
76
91
  end
77
92
 
78
- # Create a dataset and put it in the default IMW repository. Also
79
- # yields the dataset so you can define its workflow
93
+ # Create a dataset and put it in the default IMW repository.
80
94
  #
81
- # IMW.dataset :my_dataset do
82
- #
83
- # # Define some paths we're going to use
84
- # add_path :raw_data, :ripd, 'raw_data.csv'
85
- # add_path :fixd_data, :fixd, 'fixed_data.csv'
95
+ # Evaluates the given block in the context of the new dataset. This
96
+ # allows you to define tasks, add paths, and use defined metadata in
97
+ # an elegant way.
86
98
  #
87
- # # Copy a file from a website to this dataset's +ripd+ directory.
88
- # rip do
89
- # IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
90
- # end
99
+ # IMW.dataset :my_dataset do
100
+ #
101
+ # # Define some paths we're going to use
102
+ # add_path :original, :rawd, 'original.csv'
103
+ # add_path :filtered, :fixd, 'filtered.csv'
104
+ # add_path :package, :pkgd, 'filtered.tar.bz2'
91
105
  #
92
- # # Filter the raw data to those values which match some criterion defined by <tt>accept?</tt>
93
- # munge do
94
- # IMW.open(path_to(:raw_data)).map do |row|
95
- # row if accept?(row)
96
- # end.compact.dump(path_to(:fixd_data))
97
- # end
106
+ # # Copy a CSV file from a website to this machine.
107
+ # rip do
108
+ # open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
109
+ # end
98
110
  #
99
- # # Compress this new data
100
- # package do
101
- # IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
111
+ # # Filter the original CSV data by the
112
+ # # <tt>meets_some_condition?</tt> method we define elsewhere...
113
+ # munge do
114
+ # open!(path_to(:filtered)) do |filtered|
115
+ # open(path_to(:original)).each do |row|
116
+ # filtered << row if meets_some_condition?(row)
117
+ # end
118
+ # end
119
+ #
120
+ # # Compress the filtered data to an archive.
121
+ # package do
122
+ # open(path_to(:filtered)).compress.mv(path_to(:package))
123
+ # end
102
124
  # end
103
- # end
125
+ #
126
+ # See the <tt>/examples</tt> directory of the IMW distribution for
127
+ # more examples.
104
128
  #
105
129
  # @param [Symbol, String] handle the handle to identify this dataset with
106
130
  # @param [Hash] options a hash of options (see IMW::Dataset)
@@ -112,3 +136,8 @@ module IMW
112
136
  end
113
137
 
114
138
  end
139
+
140
+ # Works just like IMW.dataset but defined at a top-level scope.
141
+ def dataset handle, options={}, &block
142
+ IMW.dataset(handle, options, &block)
143
+ end
data/lib/imw/dataset.rb CHANGED
@@ -96,9 +96,12 @@ module IMW
96
96
  # dataset = IMW::Dataset.new :my_dataset, :repository => repo
97
97
  class Dataset
98
98
 
99
- include IMW::Workflow
99
+ # The handle this dataset goes by. Used for identifying it within
100
+ # a repository.
101
+ attr_accessor :handle
100
102
 
101
- attr_accessor :handle, :options
103
+ # Options for this dataset.
104
+ attr_accessor :options
102
105
 
103
106
  def initialize handle, options = {}
104
107
  @options = options
@@ -111,5 +114,12 @@ module IMW
111
114
  end
112
115
  end
113
116
 
117
+ # Provides this dataset with a workflow of tasks managed by Rake.
118
+ include IMW::Workflow
119
+
120
+ # Provides this dataset with DSL like methods to construct a
121
+ # schema in an IMW file.
122
+ include IMW::Metadata::DSL
123
+
114
124
  end
115
125
  end
data/lib/imw/formats.rb CHANGED
@@ -10,20 +10,22 @@ module IMW
10
10
  autoload :Xhtml, 'imw/formats/sgml'
11
11
  autoload :Rdf, 'imw/formats/sgml'
12
12
  autoload :Yaml, 'imw/formats/yaml'
13
+ autoload :Pdf, 'imw/formats/pdf'
13
14
 
14
15
  # Handlers which augment a resource with data format specific
15
16
  # methods.
16
17
  HANDLERS = [
17
18
  [ "Formats::Csv", /\.csv$/i ],
18
19
  [ "Formats::Tsv", /\.tsv$/i ],
19
- [ "Formats::Excel", /\.xslx?$/i ],
20
+ [ "Formats::Excel", /\.xlsx?$/i ],
20
21
  [ "Formats::Json", /\.json$/i ],
21
22
  [ "Formats::Xml", /\.xml$/i ],
22
23
  [ "Formats::Xsl", /\.xsl$/i ],
23
24
  [ "Formats::Html", /\.html?$/i ],
24
25
  [ "Formats::Xhtml", /\.xhtml?$/i ],
25
26
  [ "Formats::Rdf", /\.rdf?$/i ],
26
- [ "Formats::Yaml", /\.ya?ml$/i ]
27
+ [ "Formats::Yaml", /\.ya?ml$/i ],
28
+ [ "Formats::Pdf", /\.pdf$/i ]
27
29
  ]
28
30
  end
29
31
  end
@@ -11,9 +11,22 @@ module IMW
11
11
  # @abstract
12
12
  module Delimited
13
13
 
14
- include Enumerable
14
+ # Ensure that this delimited resource is described by an
15
+ # ordered collection of flat fields.
16
+ def validate_schema!
17
+ raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
18
+ end
15
19
 
16
- attr_accessor :delimited_settings
20
+ # Default options to be passed to
21
+ # FasterCSV[http://fastercsv.rubyforge.org/]; see its
22
+ # documentation for more information.
23
+ #
24
+ # @return [Hash]
25
+ def delimited_options
26
+ @delimited_options ||= {
27
+ :headers => schema && schema.map { |field| field['name'] }
28
+ }.merge(resource_options_compatible_with_faster_csv)
29
+ end
17
30
 
18
31
  # Return the data in this delimited resource as an array of
19
32
  # arrays.
@@ -27,24 +40,70 @@ module IMW
27
40
  FasterCSV.parse(read, delimited_options, &block)
28
41
  end
29
42
 
43
+ # Gives us goodies! Needs +each+ below.
44
+ include Enumerable
45
+
30
46
  # Call +block+ with each row in this delimited resource.
31
47
  def each &block
32
- load(&block)
48
+ require 'fastercsv'
49
+ FasterCSV.new(io, delimited_options).each(&block)
33
50
  end
34
51
 
35
- # Dump an array of arrays into this resource.
52
+ # Emit a single array or an array of arrays into this resource.
36
53
  #
37
- # @param [Array] data array of arrays to dump
54
+ # @param [Array<Array>, Array] data array or array of arrays to emit
38
55
  # @param [Hash] options
39
- # @option options [true, false] :persist Keep this resource's IO object open after dumping
40
- def dump data, options={}
56
+ # @option options [true, false] :persist Keep this resource's IO object open after emitting
57
+ def emit data, options={}
41
58
  require 'fastercsv'
59
+ data = [data] unless data.first.is_a?(Array)
42
60
  data.each do |row|
43
61
  write(FasterCSV.generate_line(row, delimited_options))
44
62
  end
45
- io.close unless options[:persist]
46
63
  self
47
64
  end
65
+ alias_method :<<, :emit
66
+
67
+ # Do a heuristic check to determine whether or not the first row
68
+ # of this delimited data is a row of headers.
69
+ #
70
+ # @return [true, false]
71
+ def headers_in_first_line?
72
+ # grab the header and up to 10 body rows
73
+ require 'fastercsv'
74
+ copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
75
+ header = (copy.shift || []) rescue []
76
+ body = 10.times.map { (copy.shift || []) rescue []}.flatten
77
+
78
+ # guess how many elements in a row
79
+ #size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
80
+
81
+ # calculate the fraction of bytes that are [-A-Za-z_] (letters +
82
+ # underscore + hyphen) for header and body and compute a
83
+ # threshold determinant
84
+ header_chars = header.map(&:to_s).join
85
+ header_schema_bytes = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
86
+ body_chars = body.map(&:to_s).join
87
+ body_schema_bytes = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
88
+ header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f rescue nil
89
+ body_schema_fraction = body_schema_bytes.size.to_f / body_chars.size.to_f rescue nil
90
+ determinant = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
91
+
92
+ # decide, setting the threshold at 0.05 based on some guesswork...
93
+ determinant && determinant >= 0.05
94
+ end
95
+
96
+ # If it seems like there are headers in the first line of this
97
+ # data then go ahead and use them to define a schema.
98
+ #
99
+ # Will overwrite a schema already present for this resource.
100
+ def guess_schema!
101
+ return unless headers_in_first_line?
102
+ copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
103
+ names = (copy.shift || []) rescue []
104
+ self.schema = IMW::Metadata::Schema.new(names)
105
+ delimited_options[:headers] = names
106
+ end
48
107
 
49
108
  # Return a 10-line sample of this file.
50
109
  #
@@ -53,52 +112,53 @@ module IMW
53
112
  require 'fastercsv'
54
113
  returning([]) do |rows|
55
114
  row_num = 1
56
- FasterCSV.new(io, delimited_options).each do |row|
115
+ each do |row|
57
116
  break if row_num > 10
58
- rows << row
117
+ rows << row.size.times.map { |index| row[index] }
59
118
  row_num += 1
60
119
  end
61
120
  end
62
121
  end
63
- end
64
122
 
65
- module Csv
66
- include Delimited
123
+ protected
124
+ # An array of option names used by FasterCSV.
125
+ FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
67
126
 
68
- # Default options to be passed to
69
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
70
- # documentation for more information.
127
+ # Return the subset of options this resource was initialized
128
+ # with that are compatible with FasterCSV (it complains when you
129
+ # give it keywords it doesn't know).
71
130
  #
72
131
  # @return [Hash]
132
+ def resource_options_compatible_with_faster_csv
133
+ @compatible_options ||= returning({}) do |compatible_options|
134
+ FASTER_CSV_OPTION_NAMES.each do |option_name|
135
+ compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
136
+ end
137
+ end
138
+ end
139
+ end
140
+
141
+ # A module for working with CSV (comma-separated value) formatted
142
+ # data.
143
+ #
144
+ # @see IMW::Formats::Delimited
145
+ module Csv
146
+ include Delimited
73
147
  def delimited_options
74
- @delimited_options ||= {
75
- :col_sep => ',',
76
- :headers => false,
77
- :return_headers => false,
78
- :write_headers => true,
79
- :skip_blanks => false,
80
- :force_quotes => false
81
- }
148
+ @delimited_options ||= {:col_sep => ","}.merge(super())
82
149
  end
83
150
  end
84
151
 
152
+ # A module for working with TSV (tab-separated value) formatted
153
+ # data.
154
+ #
155
+ # @see IMW::Formats::Delimited
85
156
  module Tsv
86
157
  include Delimited
87
-
88
- # Default options to be passed to
89
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
90
- # documentation for more information.
91
- #
92
- # @return [Hash]
93
158
  def delimited_options
94
159
  @delimited_options ||= {
95
160
  :col_sep => "\t",
96
- :headers => false,
97
- :return_headers => false,
98
- :write_headers => true,
99
- :skip_blanks => false,
100
- :force_quotes => false
101
- }
161
+ }.merge(super())
102
162
  end
103
163
  end
104
164
  end
@@ -4,120 +4,88 @@ module IMW
4
4
  # Defines methods for reading and writing Microsoft Excel data.
5
5
  module Excel
6
6
 
7
- attr_accessor :book, :sheet
8
-
9
- def self.extended obj
10
- if obj.exist?
11
- @book = Spreadsheet.open path
12
- @sheet = book.worksheet(0)
13
-
14
- end
15
- end
16
-
17
-
18
- def book
19
- return @book if @book
20
- if exists?
21
- @book = Spreadsheet.open(path)
22
- else
23
- @book = Spreadsheet::Workbook.new
24
- end
7
+ # Ensure that this Excel resource is described by an ordered
8
+ # collection of flat fields.
9
+ def validate_schema!
10
+ raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
25
11
  end
26
12
 
27
- def sheet
28
- @sheet = @book.create_worksheet
29
- @sheet
30
- end
31
-
32
- #If an Excel file exists at the location specified by uri then
33
- #it is opened and can be read out with a subsequent call to
34
- #load(). Otherwise, a new workbook is created and can be written
35
- #to with the dump() method.
36
- def initialize uri, mode='r', options={}
37
- self.uri = uri
38
- @max_lines = options[:max_lines] || 65000
39
- @idx = 0
40
- @book_idx = 0
41
- @sht_idx = 0
42
- unless self.exist?
43
- make_new_book
44
- make_new_sheet
45
- else
46
- get_existing_book
47
- end
48
- end
49
-
50
- #Returns the data in an existing workbook as an
51
- #array of arrays. Only capable of reading a single sheet.
13
+ # Return the data in this Excel document as an array of arrays.
14
+ #
15
+ # Data from consecutive worksheets will be concatenated into a
16
+ # single outer array.
17
+ #
18
+ # @return [Array<Array>]
52
19
  def load
53
- @sheet.map{|row| row.to_a}
54
- end
55
-
56
- #Dumps data, which is assumed to be an array of arrays, to a
57
- #newly created Excel workbook. Attempting to dump to a book
58
- #that already exists will typically result in file corruption.
59
- #Raises a 'too many lines' error if the number of lines
60
- #of data exceeds max_lines.
61
- def dump data
62
- data.each do |line|
63
- raise "too many lines" if too_many?
64
- self << line
20
+ require 'spreadsheet'
21
+ data = []
22
+ Spreadsheet.open(path).worksheets.each do |worksheet|
23
+ data += worksheet.map do |row|
24
+ row.to_a
25
+ end
65
26
  end
66
- save unless no_data?
27
+ data
67
28
  end
68
29
 
69
- #Processes a single line of data and updates internal variables.
70
- #You shouldn't need to call this directly.
71
- def << line
72
- @sheet.row(@sht_row).concat( line )
73
- @sht_row += 1
74
- @idx += 1
75
- end
76
-
77
- #Instantiates a new Excel workbook in memory. You shouldn't
78
- #need to call this directly.
79
- def make_new_book
80
- @book = Spreadsheet::Workbook.new
81
- @book_idx += 1
82
- end
83
-
84
- #Makes a new worksheet for a pre-existing Excel workbook.
85
- #This should be called after recovering from the
86
- #'too many lines' error.
87
- def make_new_sheet
88
- @sheet = @book.create_worksheet
89
- @sht_idx += 1
90
- @sht_row = 0 #always start at row 0 in a new sheet
91
- end
30
+ # Gives us goodies! Needs +each+ below.
31
+ include Enumerable
92
32
 
93
- #Opens an existing Excel workbook. You shoudn't need to
94
- #call this directly.
95
- def get_existing_book
96
- @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
97
- @sht_idx += 1
33
+ # Yield each row of this Excel document.
34
+ #
35
+ # Will loop from one worksheet to the next.
36
+ #
37
+ # @yield [Spreadsheet::Excel::Row]
38
+ def each &block
39
+ require 'spreadsheet'
40
+ Spreadsheet.open(path).worksheets.each do |worksheet|
41
+ worksheet.each(&block)
42
+ end
98
43
  end
99
44
 
100
- #Increments the current sheet to the next one in
101
- #an open book. Not necessary at the moment.
102
- def incr_sheet
103
- @sheet = book.worksheet @sht_idx
45
+ # Return the number of lines in this Excel document.
46
+ #
47
+ # Measured across worksheets.
48
+ #
49
+ # @return [Integer]
50
+ def num_lines
51
+ require 'spreadsheet'
52
+ Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
53
+ sum += worksheet.row_count
54
+ end
104
55
  end
105
56
 
106
- #There are too many lines if the number of rows attempting
107
- #to be written exceeds max_lines.
108
- def too_many?
109
- @sht_row >= @max_lines
110
- end
57
+ # TODO
58
+ #
59
+ # def emit
60
+ # end
111
61
 
112
- #There is no data if the number of rows attempting to be written
113
- #is zero.
114
- def no_data?
115
- @sht_row == 0
116
- end
62
+ # TODO
63
+ #
64
+ # Extract the following methods from delimited into a module and
65
+ # let both Excel and Delimited use them.
66
+ #
67
+ # Or let Excel include Delimited and let it override
68
+ # appropriately.
69
+ #
70
+ # headers_in_first_line?
71
+ # guess_schema!
72
+ #
73
+ #
117
74
 
118
- #Saves the workbook.
119
- def save
120
- @book.write path
75
+ # Return up to the first 10 rows of this Excel document as an array of arrays.
76
+ def snippet
77
+ require 'spreadsheet'
78
+ returning([]) do |snip|
79
+ row_num = 1
80
+ Spreadsheet.open(path).worksheets.each do |worksheet|
81
+ worksheet.each do |row|
82
+ break if row_num > 10
83
+ snip << row.to_a
84
+ row_num += 1
85
+ end
86
+ break if row_num > 10
87
+ end
88
+ end
121
89
  end
122
90
  end
123
91
  end