imw 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -28,7 +28,6 @@ module IMW
28
28
  IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
29
29
  end
30
30
 
31
-
32
31
  # Return the basename of the URI or <tt>_index</tt> if it's
33
32
  # blank, as in the case of <tt>http://www.google.com</tt>.
34
33
  #
@@ -79,7 +78,7 @@ module IMW
79
78
  # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
80
79
  def post payload, headers={}, &block
81
80
  make_restclient_request do
82
- RestClient.post(uri.to_s, payload, headers, &block)
81
+ RestClient.post(uri.to_s, payload, &block)
83
82
  end
84
83
  end
85
84
 
@@ -65,7 +65,7 @@ module IMW
65
65
  def dir
66
66
  IMW.open(dirname)
67
67
  end
68
-
68
+
69
69
  end
70
70
 
71
71
  # Defines methods appropriate for a local file.
@@ -93,6 +93,14 @@ module IMW
93
93
  @io ||= open(path, mode)
94
94
  end
95
95
 
96
+ # Close this resource's file handle if it exists.
97
+ def close
98
+ # explicitly check the @io instance variable b/c self.io
99
+ # will open up a new handle by default
100
+ io.close if @io
101
+ super()
102
+ end
103
+
96
104
  # Read from this file.
97
105
  #
98
106
  # @param [Fixnum] length bytes to read
@@ -101,6 +109,13 @@ module IMW
101
109
  io.read(length)
102
110
  end
103
111
 
112
+ # Read a line from this file.
113
+ #
114
+ # @return [String]
115
+ def readline
116
+ io.readline
117
+ end
118
+
104
119
  # Write to this file
105
120
  #
106
121
  # @param [String, #to_s] text text to write
@@ -109,6 +124,14 @@ module IMW
109
124
  io.write text
110
125
  end
111
126
 
127
+ # Write the text with a trailing newline to this resource.
128
+ #
129
+ # @param [String, #to_s] text
130
+ def puts text
131
+ io.write text.to_s + "\n"
132
+ end
133
+ alias_method :<<, :puts
134
+
112
135
  # Return the lines in this file.
113
136
  #
114
137
  # If passed a block, yield each line of the file to the block.
@@ -132,15 +155,50 @@ module IMW
132
155
  io.map(&block)
133
156
  end
134
157
 
135
- # Dump +data+ into this file.
158
+ # Emit +data+ into this file.
136
159
  #
137
- # @param [String, Array, #each] data object to dump
138
- # @option options [true, false] :persist (false) Don't close the file after writing
139
- def dump data, options={}
160
+ # @param [String, Array, #each] data object to emit
161
+ def emit data, options={}
140
162
  data.each do |element| # works if data is an Array or a String
141
163
  io.puts(element.to_s)
142
164
  end
143
- io.close unless options[:persist]
165
+ end
166
+
167
+ # Return a snippet of text from this resource.
168
+ #
169
+ # Will read the first 1024 bytes and strip non-ASCII
170
+ # characters from them. For more control, redefine this
171
+ # method in another module.
172
+ #
173
+ # @return [String]
174
def snippet
  # IO#read(length) returns nil once the handle is at end-of-file
  # (e.g. an empty file), so fall back to an empty string instead
  # of calling a method on nil.
  raw = io.read(1024) || ''
  printable = []
  raw.each_byte do |byte|
    # keep CR (13), LF (10), and the printable range SPACE (32) .. ~ (126)
    printable << byte.chr if byte == 13 || byte == 10 || (byte >= 32 && byte <= 126)
  end
  printable.join
end
182
+
183
+ # Return the number of lines in this file.
184
+ #
185
+ # @return [Integer]
186
+ def num_lines
187
+ wc[0]
188
+ end
189
+
190
+ # Return the number of words in this file.
191
+ #
192
+ # @return [Integer]
193
+ def num_words
194
+ wc[1]
195
+ end
196
+
197
+ # Return the number of characters in this file.
198
+ #
199
+ # @return [Integer]
200
+ def num_chars
201
+ wc[2]
144
202
  end
145
203
 
146
204
  # Return a summary of properties of this local file.
@@ -154,20 +212,40 @@ module IMW
154
212
  data = {
155
213
  :basename => basename,
156
214
  :size => size,
157
- :extension => extension
215
+ :extension => extension,
216
+ :num_lines => num_lines
158
217
  }
159
- if respond_to?(:snippet)
160
- data[:snippet] = snippet
161
- end
218
+ data[:snippet] = snippet if respond_to?(:snippet)
219
+ data[:schema] = schema if respond_to?(:schema)
162
220
  data
163
221
  end
164
222
 
223
+ protected
224
+
225
+ # Return a triple of line, word, and character counts for this
226
+ # resource.
227
+ #
228
+ # Relies on the Unix utility +wc+.
229
+ #
230
+ # @return [Array<Integer>]
231
def wc
  @wc ||= begin
    require 'shellwords'
    # Escape the path so that spaces or shell metacharacters in a
    # file name cannot split or alter the command line.
    counts = `wc #{Shellwords.escape(path.to_s)}`.split.first(3).map { |count| count.to_i }
    # When wc fails (e.g. the file is missing) it prints nothing on
    # stdout; report zeros rather than a short array whose entries
    # would come back as nil.
    counts.length == 3 ? counts : [0, 0, 0]
  rescue
    [0, 0, 0] # FIXME: the wc command itself could not be run
  end
end
238
+
165
239
  end
166
240
 
167
241
  # Defines methods for manipulating the contents of a local
168
242
  # directory.
169
243
  module LocalDirectory
170
244
 
245
+ # Lets local directories contain a special metadata file which
246
+ # describes their contents.
247
+ include IMW::Metadata::ContainsMetadata
248
+
171
249
  # Is this resource a directory?
172
250
  #
173
251
  # @return [true, false]
@@ -207,11 +285,11 @@ module IMW
207
285
  # @param [String, IMW::Resource] obj
208
286
  # @return [true, false]
209
287
  def contains? obj
210
- require 'find'
211
- obj_path = obj.is_a?(String) ? obj : obj.path
212
- Find.find(path) do |sub_path|
213
- return true if sub_path.ends_with?(obj_path)
214
- end
288
+ obj = IMW.open(obj)
289
+ return false unless obj.is_local?
290
+ return true if obj.path == path
291
+ return false unless obj.path.starts_with?(path)
292
+ return true if self[obj.path[path.length..-1]].size > 0
215
293
  false
216
294
  end
217
295
 
@@ -277,6 +355,31 @@ module IMW
277
355
  self
278
356
  end
279
357
 
358
+ # Return the resource at the base path of this resource joined
359
+ # to +path+.
360
+ #
361
+ # IMW.open('/path/to/dir').join('subdir')
362
+ # #=> IMW::Resource at '/path/to/dir/subdir'
363
+ #
364
+ # @param [Array<String>] paths
365
+ # @return [IMW::Resource]
366
+ def join *paths
367
+ IMW.open(File.join(stripped_uri.to_s, *paths))
368
+ end
369
+
370
+ # Recursively walk down this directory
371
def walk(options={}, &block)
  require 'find'
  # Without a block, hand back an enumerator over the same entries
  # instead of failing with LocalJumpError at the first +yield+.
  return enum_for(:walk, options) unless block_given?
  only = options[:only]
  # +entry+ (rather than +path+) so the block parameter does not
  # shadow this resource's own +path+ method.
  Find.find(path) do |entry|
    next if only == :files       && !File.file?(entry)
    next if only == :directories && !File.directory?(entry)
    next if only == :symlinks    && !File.symlink?(entry)
    yield entry
  end
end
382
+
280
383
  # Return a hash summarizing this directory with a key
281
384
  # <tt>:contents</tt> containing an array of hashes summarizing
282
385
  # this directories contents.
@@ -293,10 +396,30 @@ module IMW
293
396
  :basename => basename,
294
397
  :size => size,
295
398
  :num_files => contents.length,
296
- :contents => resources.map { |resource| resource.summary }
399
+ :contents => resources.map do |resource|
400
+ resource.guess_schema! if guess_schema? && resource.respond_to?(:guess_schema!)
401
+ resource_summary = resource.summary
402
+ resource_summary[:schema] = metadata[resource] if metadata && metadata.describe?(resource) # this should be handled by 'resources' method above
403
+ resource_summary
404
+ end
297
405
  }
298
406
  end
299
407
 
408
+ # Whether or not to have this directory's resources guess
409
+ # their schemas when none is provided.
410
+ #
411
+ # @return [true, false]
412
+ def guess_schema?
413
+ (!! @guess_schema)
414
+ end
415
+
416
+ # Force this directory's resources to guess at their schema.
417
+ #
418
+ # @return [true]
419
+ def guess_schema!
420
+ @guess_schema = true
421
+ end
422
+
300
423
  end
301
424
  end
302
425
  end
@@ -39,14 +39,6 @@ module IMW
39
39
  @query_string ||= uri.query
40
40
  end
41
41
 
42
- # Return the fragment part of this resource's URI. Will likely be
43
- # +nil+ for local resources.
44
- #
45
- # @return [String]
46
- def fragment
47
- @fragment ||= uri.fragment
48
- end
49
-
50
42
  # Return the path part of this resource's URI. Will _not_
51
43
  # include the +query_string+ or +fragment+.
52
44
  #
@@ -103,8 +95,21 @@ module IMW
103
95
 
104
96
  module RemoteDirectory
105
97
 
98
+ # Return the resource at the base path of this resource joined
99
+ # to +path+.
100
+ #
101
+ # IMW.open('http://example.com/path/to/dir').join('subdir')
102
+ # #=> IMW::Resource at 'http://example.com/path/to/dir/subdir'
103
+ #
104
+ # @param [Array<String>] paths
105
+ # @return [IMW::Resource]
106
+ def join *paths
107
+ IMW.open(File.join(stripped_uri.to_s, *paths))
108
+ end
109
+
106
110
  #
107
- # TODO -- bloody everything
111
+ # TODO -- bloody everything. what's the best way to tell if
112
+ # the remote URL is a directory?
108
113
  #
109
114
 
110
115
 
@@ -116,6 +116,18 @@ module IMW
116
116
  destination
117
117
  end
118
118
 
119
+ # Return the resource at the base path of this resource joined
120
+ # to +path+.
121
+ #
122
+ # IMW.open('s3://bucket/path/to/dir').join('subdir')
123
+ # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
124
+ #
125
+ # @param [Array<String>] paths
126
+ # @return [IMW::Resource]
127
+ def join *paths
128
+ IMW.open(File.join(stripped_uri.to_s, *paths))
129
+ end
130
+
119
131
  protected
120
132
  # Make an S3 connection.
121
133
  #
@@ -0,0 +1,117 @@
1
+ require 'dbi'
2
+
3
module IMW
  module Schemes

    # Encapsulates a connection to a relational database.
    #
    # Calling
    #
    #   IMW.open('sql://host:port/database_name')
    #
    # should create a connection to a database at the given +port+ on
    # the given +host+ using the given +database_name+.
    module SQL

      # A base implementation of a connection to a relational
      # database.
      #
      # The Base.extended hook examines the +scheme+ of an object
      # extended with this module and extends it with a more specific
      # database adaptor module as well.
      module Base

        # When an object is extended with this module, use its
        # +scheme+ to choose which other module inside
        # IMW::Schemes::SQL to extend it with.
        #
        # @param [#scheme] obj the object being extended
        # @raise [IMW::ArgumentError] if the scheme is neither 'mysql' nor 'postgresql'
        def self.extended obj
          case obj.scheme
          when 'mysql'      then obj.extend(IMW::Schemes::SQL::MySQL)
          when 'postgresql' then obj.extend(IMW::Schemes::SQL::PostgreSQL)
          else raise IMW::ArgumentError.new("Unknown database type: #{obj.scheme}")
          end
        end

        # For an SQL connection the database is the path with every
        # slash removed.
        #
        # @return [String]
        def database
          @database ||= path.tr('/','')
        end

        # Redefine each method inappropriate for databases so that it
        # returns +nil+.
        [:dirname, :basename, :extname, :extension, :name].each do |method|
          define_method(method) do
            nil
          end
        end

        # The (cached) database connection for this resource.
        #
        # Built from +dbi_module+ (supplied by the adaptor module),
        # +database+, +host+, +user+ and +password+.
        #
        # @return [DBI::DatabaseHandle]
        def connection
          @connection ||= DBI.connect("#{dbi_module}:#{database}:#{host}", user, password)
        end

        # Return the password associated with user's account on the
        # given database, as found under the <tt>:password</tt> key of
        # +resource_options+.
        #
        # @return [String]
        def password
          @password ||= resource_options[:password]
        end

        # Return an array of the table names in the current database.
        #
        # NOTE(review): "SHOW TABLES" looks MySQL-specific -- confirm
        # it behaves as intended through the PostgreSQL adaptor.
        #
        # @return [Array<String>]
        def tables
          table_names = []
          execute("SHOW TABLES") { |row| table_names << row.first }
          table_names
        end

        # Execute the (joined) +query_string_parts+ using this
        # resource's cached connection.
        #
        # If passed a block, yield each row of the result set to the
        # block and then finish the statement handle, even when the
        # block raises.  Without a block the open statement handle is
        # returned and the caller is responsible for finishing it.
        #
        # @param [Array<String>] query_string_parts
        # @yield [DBI::Row]
        # @return [DBI::StatementHandle] when no block is given; otherwise whatever the statement's +fetch+ returns
        def execute *query_string_parts, &block
          query = query_string_parts.join(' ')
          IMW.announce_if_verbose "Querying #{self}: #{query}"
          statement = connection.execute(query)
          return statement unless block_given?
          begin
            statement.fetch(&block)
          ensure
            # release the statement's resources once the rows are consumed
            statement.finish
          end
        end
      end

      # Module for MySQL databases.
      module MySQL

        # Return the name of the DBI module used to connect to MySQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Mysql"
        end
      end

      # Module for PostgreSQL databases.
      module PostgreSQL

        # Return the name of the DBI module used to connect to PostgreSQL.
        #
        # @return [String]
        def dbi_module
          "DBI:Pg"
        end
      end

    end
  end
end
117
+
data/lib/imw/tools.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  module IMW
2
2
  module Tools
3
- autoload :Archiver, 'imw/tools/archiver'
4
- autoload :Transferer, 'imw/tools/transferer'
5
- autoload :Summarizer, 'imw/tools/summarizer'
3
+ autoload :Archiver, 'imw/tools/archiver'
4
+ autoload :Transferer, 'imw/tools/transferer'
5
+ autoload :Summarizer, 'imw/tools/summarizer'
6
+ autoload :ExtensionAnalyzer, 'imw/tools/extension_analyzer'
7
+ autoload :Downloader, 'imw/tools/downloader'
6
8
  end
7
9
  end
8
10
 
@@ -0,0 +1,63 @@
1
module IMW
  module Tools

    # A class to download a collection of resources to a shared
    # directory.
    class Downloader

      # The directory resource that inputs are downloaded into.
      attr_reader :dir

      # @param [String, IMW::Resource] dir the directory to download into
      # @param [Array] inputs the resources (or URIs) to download; may be nested arrays
      def initialize dir, *inputs
        self.dir    = dir
        self.inputs = inputs
      end

      # Set the download directory.
      #
      # This is an instance-level writer: +initialize+ calls
      # <tt>self.dir =</tt> on the instance, which a class-level
      # <tt>def self.dir=</tt> would not satisfy.
      #
      # @param [String, IMW::Resource] new_dir
      # @raise [IMW::PathError] unless +new_dir+ opens as a local directory
      # @return [IMW::Resource]
      def dir= new_dir
        candidate = IMW.open(new_dir)
        # validate before assigning so a rejected value never replaces @dir
        raise IMW::PathError.new("#{candidate} must be a local directory") unless candidate.is_local? && candidate.is_directory?
        @dir = candidate
      end

      # Set the inputs to download.  Nested arrays are flattened and
      # +nil+ entries dropped; each remaining entry is passed to
      # IMW.open.
      #
      # @param [Array, Object] new_inputs
      # @return [Array<IMW::Resource>]
      def inputs= new_inputs
        @inputs = [new_inputs].flatten.compact.map { |raw_input| IMW.open(raw_input) }
      end

      # The inputs to download; an empty array when none were given,
      # so the iterating methods below never see +nil+.
      #
      # @return [Array<IMW::Resource>]
      def inputs
        @inputs ||= []
      end

      # Where +input+ ends up inside +dir+: +dir+ joined with the
      # input's +effective_basename+ when it responds to that, else
      # its +basename+.
      def downloaded_path_for input
        dir.join(input.respond_to?(:effective_basename) ? input.effective_basename : input.basename)
      end

      # Copy every input into +dir+, calling +before_download+ first
      # and +after_download+ last.
      def download!
        before_download
        inputs.each do |input|
          downloaded_path = downloaded_path_for(input)
          IMW.log_if_verbose "Downloading #{input} to #{downloaded_path}"
          input.cp(downloaded_path)
        end
        after_download
      end

      # Whether every downloaded resource exists.
      #
      # @return [true, false]
      def downloaded?
        downloaded_resources.all? { |resource| resource.exist? }
      end

      # The resources at each input's download path.
      #
      # @return [Array<IMW::Resource>]
      def downloaded_resources
        inputs.map do |input|
          IMW.open(downloaded_path_for(input))
        end
      end

      # Delete the download directory.
      def clean!
        IMW.log_if_verbose("Deleting downloader directory #{dir}")
        dir.rm_rf!
      end

      # Hook called at the start of +download!+; does nothing by default.
      def before_download
      end

      # Hook called at the end of +download!+; does nothing by default.
      def after_download
      end

    end
  end
end
63
+