imw 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -11,24 +11,30 @@ module IMW
11
11
  # files. Few large files will not cause a problem.
12
12
  class Summarizer
13
13
 
14
+ # Options for this Summarizer.
15
+ attr_accessor :options
16
+
14
17
  # The inputs given to this Summarizer.
15
18
  attr_reader :inputs
16
19
 
17
- # The resources to this Summarizer, calculated recursively from
18
- # its +inputs+.
20
+ # The resources analyzed, calculated recursively from the
21
+ # +inputs+.
19
22
  attr_reader :resources
20
23
 
21
24
  include IMW::Tools::ExtensionAnalyzer
22
25
 
23
26
  # Initialize a new Summarizer with the given +inputs+.
24
27
  #
28
+ # A Hash of options can be given as the last parameter.
29
+ #
25
30
  # @param [Array<String, IMW::Resource>] inputs
26
31
  # @return [IMW::Tools::Summarizer]
27
32
  def initialize *inputs
28
- self.inputs = inputs.flatten
33
+ self.options = (inputs.last.is_a?(Hash) && inputs.pop) || {}
34
+ self.inputs = inputs.flatten
29
35
  end
30
36
 
31
- # Return the total size.
37
+ # Return the total size of all resources.
32
38
  #
33
39
  # @return [Integer]
34
40
  def total_size
@@ -43,14 +49,20 @@ module IMW
43
49
  # @return [Array<Hash>]
44
50
  def summary
45
51
  @summary ||= inputs.map do |input|
46
- if input.respond_to?(:summary)
47
- input.summary rescue {}
48
- else
49
- {}
50
- end
52
+ #input.guess_schema! if input.schema.nil? && input.respond_to?(:guess_schema!)
53
+ input.respond_to?(:summary) ? input.summary : {}
51
54
  end
52
55
  end
53
56
 
57
+ # The metadata employed by this Summarizer.
58
+ #
59
+ # It can be set by setting <tt>options[:metadata]</tt>.
60
+ #
61
+ # @return [IMW::Metadata, nil]
62
+ def metadata
63
+ @metadata ||= options[:metadata] && IMW::Metadata.load(options[:metadata])
64
+ end
65
+
54
66
  protected
55
67
  # Set new inputs for this summarizer.
56
68
  #
@@ -61,7 +73,6 @@ module IMW
61
73
  def inputs= new_inputs
62
74
  @inputs = new_inputs.map do |path_or_resource|
63
75
  input = IMW.open(path_or_resource)
64
- input.should_exist!("Cannot summarize.")
65
76
  end
66
77
  @resources = inputs.map do |input|
67
78
  input.is_local? && input.is_directory? ? input.all_resources : input
data/lib/imw/utils.rb CHANGED
@@ -1,8 +1,18 @@
1
1
  require 'rubygems'
2
2
  require 'fileutils'
3
+ require 'active_support'
3
4
  require 'imw/utils/error'
4
5
  require 'imw/utils/log'
5
6
  require 'imw/utils/paths'
6
7
  require 'imw/utils/misc'
7
8
  require 'imw/utils/extensions'
8
9
 
10
+ module IMW
11
+
12
+ # Utility modules.
13
+ module Utils
14
+ autoload :DynamicallyExtendable, 'imw/utils/dynamically_extendable'
15
+ autoload :HasURI, 'imw/utils/has_uri'
16
+ end
17
+ end
18
+
@@ -0,0 +1,137 @@
1
module IMW
  module Utils

    # Provides an including class with a class-level array of
    # "handlers" that it can use to dynamically extend its instances
    # with specific modules only if certain conditions are met.
    #
    # This allows different instances of a class to implement very
    # different behavior at runtime.
    #
    # An example use case might be a Database class which dynamically
    # extends its instances with an adaptor module appropriate to the
    # particular database the object refers to.
    module DynamicallyExtendable

      # Hook invoked when this module is included: extends the
      # including class with ClassMethods so it gains the handler
      # registry.
      #
      # @param [Class, Module] obj the class including this module
      def self.included obj
        obj.extend(ClassMethods)
      end

      # Return the modules this object has been extended by.
      #
      # Modules are listed in the order they were passed to +extend+;
      # a module passed more than once appears more than once.
      #
      # @return [Array]
      def modules
        @modules ||= []
      end

      # Works just like Object#extend except it keeps track of the
      # modules it has extended.
      #
      # Like Object#extend, any number of modules may be given.
      #
      # @param [Array<Module>] mods the modules to extend this object with
      # @return [Object] this object (the return value of Object#extend)
      # @see IMW::Utils::DynamicallyExtendable#modules
      def extend *mods
        # Recorded before delegating, so +modules+ is already up to
        # date while the underlying extend runs.
        modules.concat(mods)
        super(*mods)
      end

      # Iterate through this object's class's handlers and extend this
      # object with the module referred to by any matching handlers.
      #
      # @param [Hash] options passed through to ClassMethods#extend_instance!
      # @return [Object] this object
      def extend_appropriately! options={}
        self.class.extend_instance! self, options
      end

      # A collection of methods which provide a class including
      # IMW::Utils::DynamicallyExtendable with a class-level Array of
      # handlers that can be applied to instances by calling the
      # instance's +extend_appropriately!+ method.
      module ClassMethods

        # The handlers an including class has defined.
        #
        # Each entry is a two-element Array of <tt>[mod, handler]</tt>
        # as given to +register_handler+.
        #
        # @return [Array<Array>]
        def handlers
          @handlers ||= []
        end

        # Register a new handler for an including class.
        #
        # +handler+ must be one of
        #
        #   1. Regexp
        #   2. Proc
        #   3. +true+
        #
        # In case (1), if the regular expression matches the
        # instance's +to_s+ method then the module (+mod+) will be
        # used.
        #
        # In case (2), if the Proc returns a value other than +false+
        # or +nil+ after being passed an instance then the module will
        # be used.
        #
        # In case (3), the module will be used.
        #
        # @param [String, Module] mod
        # @param [Regexp, Proc, true] handler
        # @raise [IMW::ArgumentError] if +mod+ or +handler+ is of the wrong type
        def register_handler mod, handler
          unless mod.is_a?(Module) || mod.is_a?(String)
            raise IMW::ArgumentError.new("Module must be either a Module or String")
          end
          unless handler.is_a?(Regexp) || handler.is_a?(Proc) || handler == true
            raise IMW::ArgumentError.new("Handler must be either a Regexp, Proc, or true")
          end
          handlers << [mod, handler]
        end

        # Register a collection of handlers.
        #
        # @param [Array<Array>] pairs any number of <tt>[mod, handler]</tt> pairs
        # @see IMW::Utils::DynamicallyExtendable::ClassMethods#register_handler
        def register_handlers *pairs
          pairs.each { |pair| register_handler(*pair) }
        end

        # Iterate through this class's handlers and extend the given
        # object with modules whose handler conditions match the
        # instance.
        #
        # Passing in <tt>:use_modules</tt> or <tt>:skip_modules</tt>
        # allows overriding the default behavior of handlers.  The
        # +options+ Hash itself is only read, never modified.
        #
        # @param [Object] instance
        # @param [Hash] options
        # @option options [true, false] no_modules when set, no handler is consulted and +instance+ is returned untouched
        # @option options [Array<String,Module>] use_modules a list of modules used regardless of handlers
        # @option options [Array<String,Module>] skip_modules a list of modules not to be used regardless of handlers
        # @return [Object] the newly extended object
        # @raise [IMW::TypeError] if the handler list contains something other than a Regexp, Proc, or +true+
        def extend_instance! instance, options={}
          return instance if options[:no_modules]
          use_modules  = options[:use_modules]  || []
          skip_modules = options[:skip_modules] || []
          handlers.each do |mod_name, handler|
            matched = case handler
                      when Regexp    then handler =~ instance.to_s
                      when Proc      then handler.call(instance)
                      when TrueClass then true
                      else raise IMW::TypeError.new("A handler must be Regexp, Proc, or true")
                      end
            extend_instance_with_mod_or_string!(instance, mod_name, skip_modules) if matched
          end
          # Explicitly requested modules are applied after (and in
          # addition to) whatever the handlers selected.
          use_modules.each { |mod_name| extend_instance_with_mod_or_string!(instance, mod_name, skip_modules) }
          instance
        end

        # Extend +instance+ with +mod_or_string+.  Will work hard to
        # try and interpret +mod_or_string+ as a module if it's a
        # string.
        #
        # The skip list is checked twice for strings: once against the
        # string itself and once against the module it evaluates to.
        #
        # @param [Object] instance
        #
        # @param [Module, String] mod_or_string the module or string
        # representing a module to extend the instance with
        #
        # @param [Array<Module,String>] skip_modules modules to exclude
        def extend_instance_with_mod_or_string! instance, mod_or_string, skip_modules
          return if skip_modules.include?(mod_or_string)
          if mod_or_string.is_a?(Module)
            instance.extend(mod_or_string)
          else
            # NOTE(review): the string is evaluated as Ruby code in the
            # scope of IMW, so it must only ever come from trusted,
            # developer-supplied handler registrations or options.
            m = IMW.class_eval(mod_or_string)
            instance.extend(m) unless skip_modules.include?(m)
          end
        end
      end
    end
  end
end
@@ -25,6 +25,9 @@ module IMW
25
25
  # Error raised when a method is given an invalid argument.
26
26
  ArgumentError = Class.new(Error)
27
27
 
28
+ # Error in defining or matching a schema.
29
+ SchemaError = Class.new(Error)
30
+
28
31
  # An error meant to be used when a system call goes awry. It will
29
32
  # report exit status and the process id of the offending call.
30
33
  class SystemCallError < IMW::Error
@@ -4,10 +4,6 @@ require 'imw/utils/extensions/hash'
4
4
  require 'imw/utils/extensions/struct'
5
5
  require 'imw/utils/extensions/symbol'
6
6
 
7
- require 'active_support/core_ext/object/blank'
8
- require 'active_support/core_ext/object/misc'
9
-
10
-
11
7
  module IMW
12
8
  # A replacement for the standard system call which raises an
13
9
  # IMW::SystemCallError if the command fails which prints better
@@ -1,7 +1,6 @@
1
- require 'active_support/core_ext/array/extract_options'
2
- class Array #:nodoc:
3
- include ActiveSupport::CoreExtensions::Array::ExtractOptions
4
- end
1
+ # class Array #:nodoc:
2
+ # include ActiveSupport::CoreExtensions::Array::ExtractOptions
3
+ # end
5
4
 
6
5
  class Array
7
6
 
@@ -107,9 +106,9 @@ class Array
107
106
  terminals
108
107
  end
109
108
 
110
- # Dump the data in this array to the resource at the given +uri+.
111
- def dump uri
112
- IMW.open!(uri).dump(self)
109
+ # Emit the data in this array to the resource at the given +uri+.
110
+ def emit uri
111
+ IMW.open!(uri).emit(self)
113
112
  end
114
113
 
115
114
  end
@@ -1,5 +1,3 @@
1
- require 'active_support/core_ext/hash/reverse_merge'
2
-
3
1
  class Hash
4
2
  # Return the elements of this hash in a pretty-printed string,
5
3
  # inserting +final_string+ between the last two items.
@@ -201,8 +199,8 @@ class Hash
201
199
  terminals
202
200
  end
203
201
 
204
- # Dump the data from this Hash into the given +uri+.
205
- def dump uri
206
- IMW.open!(uri).dump(self)
202
+ # Emit the data from this Hash into the given +uri+.
203
+ def emit uri
204
+ IMW.open!(uri).emit(self)
207
205
  end
208
206
  end
@@ -31,9 +31,9 @@ class String
31
31
  self.downcase.underscore.to_sym
32
32
  end
33
33
 
34
- # Dump this string into the given +uri+.
35
- def dump uri
36
- IMW.open!(uri).dump(self)
34
+ # Emit this string into the given +uri+.
35
+ def emit uri
36
+ IMW.open!(uri).emit(self)
37
37
  end
38
38
 
39
39
  end
@@ -0,0 +1,114 @@
1
+ require 'addressable/uri'
2
+
3
module IMW
  module Utils

    # Endows an including class with a wrapper for Addressable::URI
    #
    # Several methods here (+dirname+, +basename+, +extname+ and,
    # through them, +extension+ and +name+) call a +path+ method that
    # this module does not define; the including class is expected to
    # provide it.
    #
    # Derived values are memoized in instance variables on first use
    # and are not cleared when the URI is assigned again.
    module HasURI

      # The URI of this object.
      attr_reader :uri

      # Set the URI of this resource by parsing the given +uri+ (if
      # necessary).
      #
      # @param [String, Addressable::URI] uri the uri to parse
      def uri= uri
        if uri.is_a?(Addressable::URI)
          @uri = uri
        else
          begin
            @uri = Addressable::URI.parse(uri.to_s)
          # NOTE(review): Addressable documents its own
          # Addressable::URI::InvalidURIError for parse failures, so
          # this clause (which names the standard library's error)
          # presumably never matches; URI.encode is also gone from
          # recent Rubies.  Confirm before relying on this fallback.
          rescue URI::InvalidURIError
            @uri = Addressable::URI.parse(URI.encode(uri.to_s))
            @encoded_uri = true
          end
        end
      end

      # The scheme of this resource.  Will be +nil+ for local resources.
      #
      # @return [String]
      def scheme
        @scheme ||= uri.scheme
      end

      # The directory name of this resource's path.
      #
      # @return [String]
      def dirname
        @dirname ||= File.dirname(path)
      end

      # The basename of this resource's path.
      #
      # @return [String]
      def basename
        @basename ||= File.basename(path)
      end

      # Returns the extension (INCLUDING the '.') of this resource's
      # path.  Redefine this in an including class for which this is
      # weird ('.tar.gz' I'm talking to you...)
      #
      # @return [String]
      def extname
        @extname ||= File.extname(path)
      end

      # Returns the extension (WITHOUT the '.') of this resource's path.
      #
      # Returns the empty string when there is no extension.
      #
      # @return [String]
      def extension
        @extension ||= extname[1..-1] || ''
      end

      # Returns the basename of the file with its extension (as
      # reported by +extname+) removed.
      #
      # With the default +extname+ only the last extension is dropped:
      #
      #   IMW.open('/path/to/some_file.tar.gz').name # => some_file.tar
      #
      # An including class that redefines +extname+ to return
      # '.tar.gz' would get 'some_file' instead.
      #
      # @return [String]
      def name
        @name ||= extname ? basename[0,basename.length - extname.length] : basename
      end

      # Returns the user associated with the host of this URI.
      #
      # @return [String]
      def user
        @user ||= uri.user
      end

      # Return the fragment part of this resource's URI.
      #
      # Will likely be +nil+ for local resources.
      #
      # @return [String]
      def fragment
        @fragment ||= uri.fragment
      end

      # Return the URI of this resource with any query strings and
      # fragments removed.
      #
      # Only the scheme, userinfo, host, port and path are carried
      # over.  Each component is taken from this object when it
      # responds to a method of that name and from the underlying URI
      # otherwise.
      #
      # @return [Addressable::URI] a new instance of the URI's own class
      def stripped_uri
        uri_args = {}
        %w[scheme userinfo host port path].each do |component|
          uri_args[component.to_sym] = respond_to?(component) ? send(component) : uri.send(component)
        end
        uri.class.new(uri_args)
      end

      # The string form of the underlying URI.
      #
      # @return [String]
      def to_s
        uri.to_s
      end
    end
  end
end
110
+
111
+
112
+
113
+
114
+
@@ -3,7 +3,7 @@ ID,Name,Genus,Species
3
3
  002,Panamanian Night Monkey,Aotus,zonalis
4
4
  003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
5
5
  004,Gray-handed Night Monkey,Aotus,griseimembra
6
- 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
6
+ 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
7
7
  006,Brumback's Night Monkey,Aotus,brumbacki
8
8
  007,Three-striped Night Monkey,Aotus,trivirgatus
9
9
  008,Spix's Night Monkey,Aotus,vociferans
@@ -0,0 +1,11 @@
1
+ Store Number Store Name Address City State Zip Phone Number Latitude Longitude
2
+ 10344 Ace Gambles of Hotchkiss 121 E. Bridge Street Hotchkiss CO 81419 (970) 872-3535 38.7988 -107.7205
3
+ 10347 Al's Ace Hardware 4010 E 29th St Tucson AZ 85711 (520) 747-8144 32.1996 -110.9059
4
+ 10349 Al's Ace Hardware 4722 E Broadway Blvd Tucson AZ 85711 (520) 318-0414 32.2216 -110.8916
5
+ 10351 Al's Ace Hardware 2884 N Campbell Ave Tucson AZ 85719 (520) 325-2432 32.259 -110.9428
6
+ 10353 Lewistown Ace Hardware 815 NE Main St Lewistown MT 59457 (406) 538-4000 47.0702 -109.409
7
+ 10356 Byram Revell Ace Hardware 5726 Terry Rd Jackson MS 39272 (601) 371-8429 32.1919 -90.2542
8
+ 10357 Mr C's Ace Hardware 1201 Precinct Line Rd Hurst TX 76053 (817) 282-3428 32.831 -97.186
9
+ 10358 Gus Bock Ace Hardware 3455 Ridge Rd Lansing IL 60438 (708) 474-5940 41.5643 -87.5321
10
+ 10359 Elberton Ace Hardware 886 Elbert St Elberton GA 30635 (706) 283-3054 34.1008 -82.8495
11
+ 10361 Phil's Hardware 13 SW H St Madras OR 97741 (541) 475-9392 44.627737 -121.13004