imw 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -28,7 +28,6 @@ module IMW
28
28
  IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
29
29
  end
30
30
 
31
-
32
31
  # Return the basename of the URI or <tt>_index</tt> if it's
33
32
  # blank, as in the case of <tt>http://www.google.com</tt>.
34
33
  #
@@ -79,7 +78,7 @@ module IMW
79
78
  # @raise [RestClient::NotModified, RestClient::Unauthorized, RestClient::ResourceNotFound, RestClient::RequestFailed] error from RestClient on non-2xx response codes
80
79
  def post payload, headers={}, &block
81
80
  make_restclient_request do
82
- RestClient.post(uri.to_s, payload, headers, &block)
81
+ RestClient.post(uri.to_s, payload, &block)
83
82
  end
84
83
  end
85
84
 
@@ -65,7 +65,7 @@ module IMW
65
65
  def dir
66
66
  IMW.open(dirname)
67
67
  end
68
-
68
+
69
69
  end
70
70
 
71
71
  # Defines methods appropriate for a local file.
@@ -93,6 +93,14 @@ module IMW
93
93
  @io ||= open(path, mode)
94
94
  end
95
95
 
96
+ # Close this resource's file handle if it exists.
97
+ def close
98
+ # explicitly check the @io instance variable b/c self.io
99
+ # will open up a new handle by default
100
+ io.close if @io
101
+ super()
102
+ end
103
+
96
104
  # Read from this file.
97
105
  #
98
106
  # @param [Fixnum] length bytes to read
@@ -101,6 +109,13 @@ module IMW
101
109
  io.read(length)
102
110
  end
103
111
 
112
+ # Read a line from this file.
113
+ #
114
+ # @return [String]
115
+ def readline
116
+ io.readline
117
+ end
118
+
104
119
  # Write to this file
105
120
  #
106
121
  # @param [String, #to_s] text text to write
@@ -109,6 +124,14 @@ module IMW
109
124
  io.write text
110
125
  end
111
126
 
127
+ # Write the text with a trailing newline to this resource.
128
+ #
129
+ # @param [String, #to_s] text
130
+ def puts text
131
+ io.write text.to_s + "\n"
132
+ end
133
+ alias_method :<<, :puts
134
+
112
135
  # Return the lines in this file.
113
136
  #
114
137
  # If passed a block, yield each line of the file to the block.
@@ -132,15 +155,50 @@ module IMW
132
155
  io.map(&block)
133
156
  end
134
157
 
135
- # Dump +data+ into this file.
158
+ # Emit +data+ into this file.
136
159
  #
137
- # @param [String, Array, #each] data object to dump
138
- # @option options [true, false] :persist (false) Don't close the file after writing
139
- def dump data, options={}
160
+ # @param [String, Array, #each] data object to emit
161
+ def emit data, options={}
140
162
  data.each do |element| # works if data is an Array or a String
141
163
  io.puts(element.to_s)
142
164
  end
143
- io.close unless options[:persist]
165
+ end
166
+
167
+ # Return a snippet of text from this resource.
168
+ #
169
+ # Will read the first 1024 bytes and keep only CR, LF, and printable
170
+ # ASCII characters from them. For more control, redefine this
171
+ # method in another module.
172
+ #
173
+ # @return [String]
174
+ def snippet
175
+ returning([]) do |snip|
176
+ io.read(1024).bytes.each do |byte|
177
+ # CR LF SPACE ~
178
+ snip << byte.chr if byte == 13 || byte == 10 || byte >= 32 && byte <= 126
179
+ end
180
+ end.join
181
+ end
182
+
183
+ # Return the number of lines in this file.
184
+ #
185
+ # @return [Integer]
186
+ def num_lines
187
+ wc[0]
188
+ end
189
+
190
+ # Return the number of words in this file.
191
+ #
192
+ # @return [Integer]
193
+ def num_words
194
+ wc[1]
195
+ end
196
+
197
+ # Return the number of characters in this file.
198
+ #
199
+ # @return [Integer]
200
+ def num_chars
201
+ wc[2]
144
202
  end
145
203
 
146
204
  # Return a summary of properties of this local file.
@@ -154,20 +212,40 @@ module IMW
154
212
  data = {
155
213
  :basename => basename,
156
214
  :size => size,
157
- :extension => extension
215
+ :extension => extension,
216
+ :num_lines => num_lines
158
217
  }
159
- if respond_to?(:snippet)
160
- data[:snippet] = snippet
161
- end
218
+ data[:snippet] = snippet if respond_to?(:snippet)
219
+ data[:schema] = schema if respond_to?(:schema)
162
220
  data
163
221
  end
164
222
 
223
+ protected
224
+
225
+ # Return a triple of line, word, and character counts for this
226
+ # resource.
227
+ #
228
+ # Relies on the Unix utility +wc+.
229
+ #
230
+ # @return [Array<Integer>]
231
+ def wc
232
+ @wc ||= begin
233
+ `wc #{path}`.chomp.strip.split.map(&:to_i)
234
+ rescue
235
+ [0,0,0] # FIXME
236
+ end
237
+ end
238
+
165
239
  end
166
240
 
167
241
  # Defines methods for manipulating the contents of a local
168
242
  # directory.
169
243
  module LocalDirectory
170
244
 
245
+ # Lets local directories contain a special metadata file which
246
+ # describes their contents.
247
+ include IMW::Metadata::ContainsMetadata
248
+
171
249
  # Is this resource a directory?
172
250
  #
173
251
  # @return [true, false]
@@ -207,11 +285,11 @@ module IMW
207
285
  # @param [String, IMW::Resource] obj
208
286
  # @return [true, false]
209
287
  def contains? obj
210
- require 'find'
211
- obj_path = obj.is_a?(String) ? obj : obj.path
212
- Find.find(path) do |sub_path|
213
- return true if sub_path.ends_with?(obj_path)
214
- end
288
+ obj = IMW.open(obj)
289
+ return false unless obj.is_local?
290
+ return true if obj.path == path
291
+ return false unless obj.path.starts_with?(path)
292
+ return true if self[obj.path[path.length..-1]].size > 0
215
293
  false
216
294
  end
217
295
 
@@ -277,6 +355,31 @@ module IMW
277
355
  self
278
356
  end
279
357
 
358
+ # Return the resource at the base path of this resource joined
359
+ # to +paths+.
360
+ #
361
+ # IMW.open('/path/to/dir').join('subdir')
362
+ # #=> IMW::Resource at '/path/to/dir/subdir'
363
+ #
364
+ # @param [Array<String>] paths
365
+ # @return [IMW::Resource]
366
+ def join *paths
367
+ IMW.open(File.join(stripped_uri.to_s, *paths))
368
+ end
369
+
370
+ # Recursively walk down this directory
371
+ def walk(options={}, &block)
372
+ require 'find'
373
+ Find.find(path) do |path|
374
+ if options[:only]
375
+ next if options[:only] == :files && !File.file?(path)
376
+ next if options[:only] == :directories && !File.directory?(path)
377
+ next if options[:only] == :symlinks && !File.symlink?(path)
378
+ end
379
+ yield path
380
+ end
381
+ end
382
+
280
383
  # Return a hash summarizing this directory with a key
281
384
  # <tt>:contents</tt> containing an array of hashes summarizing
282
385
  # this directory's contents.
@@ -293,10 +396,30 @@ module IMW
293
396
  :basename => basename,
294
397
  :size => size,
295
398
  :num_files => contents.length,
296
- :contents => resources.map { |resource| resource.summary }
399
+ :contents => resources.map do |resource|
400
+ resource.guess_schema! if guess_schema? && resource.respond_to?(:guess_schema!)
401
+ resource_summary = resource.summary
402
+ resource_summary[:schema] = metadata[resource] if metadata && metadata.describe?(resource) # this should be handled by 'resources' method above
403
+ resource_summary
404
+ end
297
405
  }
298
406
  end
299
407
 
408
+ # Whether or not to have this directory's resources guess
409
+ # their schemas when none is provided.
410
+ #
411
+ # @return [true, false]
412
+ def guess_schema?
413
+ (!! @guess_schema)
414
+ end
415
+
416
+ # Force this directory's resources to guess at their schema.
417
+ #
418
+ # @return [true]
419
+ def guess_schema!
420
+ @guess_schema = true
421
+ end
422
+
300
423
  end
301
424
  end
302
425
  end
@@ -39,14 +39,6 @@ module IMW
39
39
  @query_string ||= uri.query
40
40
  end
41
41
 
42
- # Return the fragment part of this resource's URI. Will likely be
43
- # +nil+ for local resources.
44
- #
45
- # @return [String]
46
- def fragment
47
- @fragment ||= uri.fragment
48
- end
49
-
50
42
  # Return the path part of this resource's URI. Will _not_
51
43
  # include the +query_string+ or +fragment+.
52
44
  #
@@ -103,8 +95,21 @@ module IMW
103
95
 
104
96
  module RemoteDirectory
105
97
 
98
+ # Return the resource at the base path of this resource joined
99
+ # to +paths+.
100
+ #
101
+ # IMW.open('http://example.com/path/to/dir').join('subdir')
102
+ # #=> IMW::Resource at 'http://example.com/path/to/dir/subdir'
103
+ #
104
+ # @param [Array<String>] paths
105
+ # @return [IMW::Resource]
106
+ def join *paths
107
+ IMW.open(File.join(stripped_uri.to_s, *paths))
108
+ end
109
+
106
110
  #
107
- # TODO -- bloody everything
111
+ # TODO -- bloody everything. what's the best way to tell if
112
+ # the remote URL is a directory?
108
113
  #
109
114
 
110
115
 
@@ -116,6 +116,18 @@ module IMW
116
116
  destination
117
117
  end
118
118
 
119
+ # Return the resource at the base path of this resource joined
120
+ # to +paths+.
121
+ #
122
+ # IMW.open('s3://bucket/path/to/dir').join('subdir')
123
+ # #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
124
+ #
125
+ # @param [Array<String>] paths
126
+ # @return [IMW::Resource]
127
+ def join *paths
128
+ IMW.open(File.join(stripped_uri.to_s, *paths))
129
+ end
130
+
119
131
  protected
120
132
  # Make an S3 connection.
121
133
  #
@@ -0,0 +1,117 @@
1
+ require 'dbi'
2
+
3
+ module IMW
4
+ module Schemes
5
+
6
+ # Encapsulates a connection to a relational database.
7
+ #
8
+ # Calling
9
+ #
10
+ # IMW.open('sql://host:port/database_name')
11
+ #
12
+ # should create a connection to a database at the given +port+ on
13
+ # the given +host+ using the given +database_name+.
14
+ module SQL
15
+
16
+ # A base implementation of a connection to a relational
17
+ # database.
18
+ #
19
+ # The Base#extended method will examine the +scheme+ of an
20
+ # object extended with this module and choose a more specific
21
+ # database adaptor module to extend with as well.
22
+ module Base
23
+
24
+ # When an IMW::Resource is extended use URI's scheme to choose
25
+ # which other module inside IMW::Schemes::SQL to extend with.
26
+ def self.extended obj
27
+ case obj.scheme
28
+ when 'mysql' then obj.extend(IMW::Schemes::SQL::MySQL)
29
+ when 'postgresql' then obj.extend(IMW::Schemes::SQL::PostgreSQL)
30
+ else raise IMW::ArgumentError.new("Unknown database type: #{obj.scheme}")
31
+ end
32
+ end
33
+
34
+ # For an SQL connection the database will be the same as the
35
+ # path.
36
+ #
37
+ # @return [String]
38
+ def database
39
+ @database ||= path.tr('/','')
40
+ end
41
+
42
+ # Redefine each method inappropriate for databases.
43
+ [:dirname, :basename, :extname, :extension, :name].each do |method|
44
+ define_method(method) do
45
+ nil
46
+ end
47
+ end
48
+
49
+ # The (cached) database connection for this resource.
50
+ #
51
+ # @return [DBI::DatabaseHandle]
52
+ def connection
53
+ @connection ||= DBI.connect("#{dbi_module}:#{database}:#{host}", user, password)
54
+ end
55
+
56
+ # Return the password associated with user's account on the
57
+ # given database.
58
+ #
59
+ # @return [String]
60
+ def password
61
+ @password ||= resource_options[:password]
62
+ end
63
+
64
+ # Return an array of the table names in the current database.
65
+ #
66
+ # @return [Array<String>]
67
+ def tables
68
+ returning([]) do |table_names|
69
+ execute("SHOW TABLES") do |row|
70
+ table_names << row.first
71
+ end
72
+ end
73
+ end
74
+
75
+ # Execute the (joined) +query_string_parts+ using this
76
+ # resource's cached connection.
77
+ #
78
+ # If passed a block, yield each row of the result set to the
79
+ # block.
80
+ #
81
+ # @param [Array<String>] query_string_parts
82
+ # @yield [DBI::Row]
83
+ # @return [DBI::StatementHandle]
84
+ def execute *query_string_parts, &block
85
+ query = query_string_parts.join(' ')
86
+ IMW.announce_if_verbose "Querying #{self}: #{query}"
87
+ statement = connection.execute(query)
88
+ block_given? ? statement.fetch(&block) : statement
89
+ end
90
+ end
91
+
92
+ # Module for MySQL databases.
93
+ module MySQL
94
+
95
+ # Return the name of the DBI module used to connect to MySQL.
96
+ #
97
+ # @return [String]
98
+ def dbi_module
99
+ "DBI:Mysql"
100
+ end
101
+ end
102
+
103
+ # Module for PostgreSQL databases.
104
+ module PostgreSQL
105
+
106
+ # Return the name of the DBI module used to connect to PostgreSQL.
107
+ #
108
+ # @return [String]
109
+ def dbi_module
110
+ "DBI:Pg"
111
+ end
112
+ end
113
+
114
+ end
115
+ end
116
+ end
117
+
data/lib/imw/tools.rb CHANGED
@@ -1,8 +1,10 @@
1
1
  module IMW
2
2
  module Tools
3
- autoload :Archiver, 'imw/tools/archiver'
4
- autoload :Transferer, 'imw/tools/transferer'
5
- autoload :Summarizer, 'imw/tools/summarizer'
3
+ autoload :Archiver, 'imw/tools/archiver'
4
+ autoload :Transferer, 'imw/tools/transferer'
5
+ autoload :Summarizer, 'imw/tools/summarizer'
6
+ autoload :ExtensionAnalyzer, 'imw/tools/extension_analyzer'
7
+ autoload :Downloader, 'imw/tools/downloader'
6
8
  end
7
9
  end
8
10
 
@@ -0,0 +1,63 @@
1
+ module IMW
2
+ module Tools
3
+
4
+ # A class to download a collection of resources to a shared
5
+ # directory.
6
+ class Downloader
7
+
8
+ def initialize dir, *inputs
9
+ self.dir = dir
10
+ self.inputs = inputs unless inputs.blank?
11
+ end
12
+
13
+ def self.dir= new_dir
14
+ @dir = IMW.open(new_dir)
15
+ raise IMW::PathError.new("#{@dir} must be a local directory") unless @dir.is_local? && @dir.is_directory?
16
+ @dir
17
+ end
18
+ attr_reader :dir
19
+
20
+ def inputs= new_inputs
21
+ @inputs = new_inputs.flatten.compact.map { |raw_input| IMW.open(raw_input) }
22
+ end
23
+ attr_reader :inputs
24
+
25
+ def downloaded_path_for input
26
+ dir.join(input.respond_to?(:effective_basename) ? input.effective_basename : input.basename)
27
+ end
28
+
29
+ def download!
30
+ before_download
31
+ inputs.each do |input|
32
+ downloaded_path = downloaded_path_for(input)
33
+ IMW.log_if_verbose "Downloading #{input} to #{downloaded_path}"
34
+ input.cp(downloaded_path)
35
+ end
36
+ after_download
37
+ end
38
+
39
+ def downloaded?
40
+ downloaded_resources.all? { |resource| resource.exist? }
41
+ end
42
+
43
+ def downloaded_resources
44
+ inputs.map do |input|
45
+ IMW.open(downloaded_path_for(input))
46
+ end
47
+ end
48
+
49
+ def clean!
50
+ IMW.log_if_verbose("Deleting downloader directory #{dir}")
51
+ dir.rm_rf!
52
+ end
53
+
54
+ def before_download
55
+ end
56
+
57
+ def after_download
58
+ end
59
+
60
+ end
61
+ end
62
+ end
63
+