imw 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -11,24 +11,30 @@ module IMW
11
11
  # files. Few large files will not cause a problem.
12
12
  class Summarizer
13
13
 
14
+ # Options for this Summarizer.
15
+ attr_accessor :options
16
+
14
17
  # The inputs given to this Summarizer.
15
18
  attr_reader :inputs
16
19
 
17
- # The resources to this Summarizer, calculated recursively from
18
- # its +inputs+.
20
+ # The resources analyzed, calculated recursively from the
21
+ # +inputs+.
19
22
  attr_reader :resources
20
23
 
21
24
  include IMW::Tools::ExtensionAnalyzer
22
25
 
23
26
  # Initialize a new Summarizer with the given +inputs+.
24
27
  #
28
+ # A Hash of options can be given as the last parameter.
29
+ #
25
30
  # @param [Array<String, IMW::Resource>] inputs
26
31
  # @return [IMW::Tools::Summarizer]
27
32
  def initialize *inputs
28
- self.inputs = inputs.flatten
33
+ self.options = (inputs.last.is_a?(Hash) && inputs.pop) || {}
34
+ self.inputs = inputs.flatten
29
35
  end
30
36
 
31
- # Return the total size.
37
+ # Return the total size of all resources.
32
38
  #
33
39
  # @return [Integer]
34
40
  def total_size
@@ -43,14 +49,20 @@ module IMW
43
49
  # @return [Array<Hash>]
44
50
  def summary
45
51
  @summary ||= inputs.map do |input|
46
- if input.respond_to?(:summary)
47
- input.summary rescue {}
48
- else
49
- {}
50
- end
52
+ #input.guess_schema! if input.schema.nil? && input.respond_to?(:guess_schema!)
53
+ input.respond_to?(:summary) ? input.summary : {}
51
54
  end
52
55
  end
53
56
 
57
+ # The metadata employed by this Summarizer.
58
+ #
59
+ # It can be set by setting <tt>options[:metadata]</tt>.
60
+ #
61
+ # @return [IMW::Metadata, nil]
62
+ def metadata
63
+ @metadata ||= options[:metadata] && IMW::Metadata.load(options[:metadata])
64
+ end
65
+
54
66
  protected
55
67
  # Set new inputs for this summarizer.
56
68
  #
@@ -61,7 +73,6 @@ module IMW
61
73
  def inputs= new_inputs
62
74
  @inputs = new_inputs.map do |path_or_resource|
63
75
  input = IMW.open(path_or_resource)
64
- input.should_exist!("Cannot summarize.")
65
76
  end
66
77
  @resources = inputs.map do |input|
67
78
  input.is_local? && input.is_directory? ? input.all_resources : input
data/lib/imw/utils.rb CHANGED
@@ -1,8 +1,18 @@
1
1
  require 'rubygems'
2
2
  require 'fileutils'
3
+ require 'active_support'
3
4
  require 'imw/utils/error'
4
5
  require 'imw/utils/log'
5
6
  require 'imw/utils/paths'
6
7
  require 'imw/utils/misc'
7
8
  require 'imw/utils/extensions'
8
9
 
10
+ module IMW
11
+
12
+ # Utility modules.
13
+ module Utils
14
+ autoload :DynamicallyExtendable, 'imw/utils/dynamically_extendable'
15
+ autoload :HasURI, 'imw/utils/has_uri'
16
+ end
17
+ end
18
+
@@ -0,0 +1,137 @@
1
+ module IMW
2
+ module Utils
3
+
4
+ # Provides an including class with a class-level array of
5
+ # "handlers" that it can use to dynamically extend its instances
6
+ # with specific modules only if certain conditions are met.
7
+ #
8
+ # This allows different instances of a class to implement very
9
+ # different behavior at runtime.
10
+ #
11
+ # An example use case might be a Database class which dynamically
12
+ # extends its instances with an adaptor module appropriate to the
13
+ # particular database the object refers to.
14
+ module DynamicallyExtendable
15
+
16
+ def self.included obj
17
+ obj.extend(ClassMethods)
18
+ end
19
+
20
+ # Return the modules this object has been extended by.
21
+ #
22
+ # @return [Array]
23
+ def modules
24
+ @modules ||= []
25
+ end
26
+
27
+ # Works just like Object#extend except it keeps track of the
28
+ # modules it has extended.
29
+ #
30
+ # @see IMW::Utils::DynamicallyExtendable#modules
31
+ def extend mod
32
+ modules << mod
33
+ super mod
34
+ end
35
+
36
+ # Iterate through this object's class's handlers and extend this
37
+ # object with the module referred to by any matching handlers.
38
+ def extend_appropriately! options={}
39
+ self.class.extend_instance! self, options
40
+ end
41
+
42
+ # A collection of methods which provide a class including
43
+ # IMW::Utils::DynamicallyExtendable with a class-level Array of
44
+ # handlers that can be applied to instances by calling the
45
+ # instance's +extend_appropriately!+ method.
46
+ module ClassMethods
47
+
48
+ # The handlers an including class has defined.
49
+ #
50
+ # @return [Array<Array>]
51
+ def handlers
52
+ @handlers ||= []
53
+ end
54
+
55
+ # Register a new handler for an including class.
56
+ #
57
+ # +handler+ must be one of
58
+ #
59
+ # 1. Regexp
60
+ # 2. Proc
61
+ # 3. +true+
62
+ #
63
+ # In case (1), if the regular expression matches the
64
+ # instance's +to_s+ method then the module (+mod+) will be
65
+ # used.
66
+ #
67
+ # In case (2), if the Proc returns a value other than +false+
68
+ # or +nil+ after being passed an instance then the module will
69
+ # be used.
70
+ #
71
+ # In case (3), the module will be used.
72
+ #
73
+ # @param [String, Module] mod
74
+ # @param [Regexp, Proc, true] handler
75
+ def register_handler mod, handler
76
+ raise IMW::ArgumentError.new("Module must be either a Module or String") unless mod.is_a?(Module) || mod.is_a?(String)
77
+ raise IMW::ArgumentError.new("Handler must be either a Regexp, Proc, or true") unless handler.is_a?(Regexp) || handler.is_a?(Proc) || handler == true
78
+ handlers << [mod, handler]
79
+ end
80
+
81
+ # Register a collection of handlers.
82
+ #
83
+ # @see IMW::Utils::DynamicallyExtendable::ClassMethods#register_handler
84
+ def register_handlers *pairs
85
+ pairs.each { |pair| register_handler *pair }
86
+ end
87
+
88
+ # Iterate through this class's handlers and extend the given
89
+ # object with modules whose handler conditions match the
90
+ # instance.
91
+ #
92
+ # Passing in <tt>:use_modules</tt> or <tt>:skip_modules</tt>
93
+ # allows overriding the default behavior of handlers.
94
+ #
95
+ # @param [Object] instance
96
+ # @param [Hash] options
97
+ # @option options [Array<String,Module>] use_modules a list of modules used regardless of handlers
98
+ # @option options [Array<String,Module>] skip_modules a list of modules not to be used regardless of handlers
99
+ # @return [Object, nil] the newly extended object, or +nil+ if <tt>:no_modules</tt> was given
100
+ def extend_instance! instance, options={}
101
+ return if options[:no_modules]
102
+ options.reverse_merge!(:use_modules => [], :skip_modules => [])
103
+ handlers.each do |mod_name, handler|
104
+ case handler
105
+ when Regexp then extend_instance_with_mod_or_string!(instance, mod_name, options[:skip_modules]) if handler =~ instance.to_s
106
+ when Proc then extend_instance_with_mod_or_string!(instance, mod_name, options[:skip_modules]) if handler.call(instance)
107
+ when TrueClass then extend_instance_with_mod_or_string!(instance, mod_name, options[:skip_modules])
108
+ else raise IMW::TypeError("A handler must be Regexp, Proc, or true")
109
+ end
110
+ end
111
+ options[:use_modules].each { |mod_name| extend_instance_with_mod_or_string!(instance, mod_name, options[:skip_modules]) }
112
+ instance
113
+ end
114
+
115
+ # Extend +instance+ with +mod_or_string+. Will work hard to
116
+ # try and interpret +mod_or_string+ as a module if it's a
117
+ # string.
118
+ #
119
+ # @param [Object] instance
120
+ #
121
+ # @param [Module, String] mod_or_string the module or string
122
+ # representing a module to extend the instance with
123
+ #
124
+ # @param [Array<Module,String>] skip_modules modules to exclude
125
+ def extend_instance_with_mod_or_string! instance, mod_or_string, skip_modules
126
+ return if skip_modules.include?(mod_or_string)
127
+ if mod_or_string.is_a?(Module)
128
+ instance.extend(mod_or_string)
129
+ else
130
+ m = IMW.class_eval(mod_or_string)
131
+ instance.extend(m) unless skip_modules.include?(m)
132
+ end
133
+ end
134
+ end
135
+ end
136
+ end
137
+ end
@@ -25,6 +25,9 @@ module IMW
25
25
  # Error raised when a method is given an invalid argument.
26
26
  ArgumentError = Class.new(Error)
27
27
 
28
+ # Error in defining or matching a schema.
29
+ SchemaError = Class.new(Error)
30
+
28
31
  # An error meant to be used when a system call goes awry. It will
29
32
  # report exit status and the process id of the offending call.
30
33
  class SystemCallError < IMW::Error
@@ -4,10 +4,6 @@ require 'imw/utils/extensions/hash'
4
4
  require 'imw/utils/extensions/struct'
5
5
  require 'imw/utils/extensions/symbol'
6
6
 
7
- require 'active_support/core_ext/object/blank'
8
- require 'active_support/core_ext/object/misc'
9
-
10
-
11
7
  module IMW
12
8
  # A replacement for the standard system call which raises an
13
9
  # IMW::SystemCallError if the command fails which prints better
@@ -1,7 +1,6 @@
1
- require 'active_support/core_ext/array/extract_options'
2
- class Array #:nodoc:
3
- include ActiveSupport::CoreExtensions::Array::ExtractOptions
4
- end
1
+ # class Array #:nodoc:
2
+ # include ActiveSupport::CoreExtensions::Array::ExtractOptions
3
+ # end
5
4
 
6
5
  class Array
7
6
 
@@ -107,9 +106,9 @@ class Array
107
106
  terminals
108
107
  end
109
108
 
110
- # Dump the data in this array to the resource at the given +uri+.
111
- def dump uri
112
- IMW.open!(uri).dump(self)
109
+ # Emit the data in this array to the resource at the given +uri+.
110
+ def emit uri
111
+ IMW.open!(uri).emit(self)
113
112
  end
114
113
 
115
114
  end
@@ -1,5 +1,3 @@
1
- require 'active_support/core_ext/hash/reverse_merge'
2
-
3
1
  class Hash
4
2
  # Return the elements of this hash in a pretty-printed string,
5
3
  # inserting +final_string+ between the last two items.
@@ -201,8 +199,8 @@ class Hash
201
199
  terminals
202
200
  end
203
201
 
204
- # Dump the data from this Hash into the given +uri+.
205
- def dump uri
206
- IMW.open!(uri).dump(self)
202
+ # Emit the data from this Hash into the given +uri+.
203
+ def emit uri
204
+ IMW.open!(uri).emit(self)
207
205
  end
208
206
  end
@@ -31,9 +31,9 @@ class String
31
31
  self.downcase.underscore.to_sym
32
32
  end
33
33
 
34
- # Dump this string into the given +uri+.
35
- def dump uri
36
- IMW.open!(uri).dump(self)
34
+ # Emit this string into the given +uri+.
35
+ def emit uri
36
+ IMW.open!(uri).emit(self)
37
37
  end
38
38
 
39
39
  end
@@ -0,0 +1,114 @@
1
+ require 'addressable/uri'
2
+
3
+ module IMW
4
+ module Utils
5
+
6
+ # Endows an including class with a wrapper for Addressable::URI
7
+ module HasURI
8
+
9
+ # The URI of this object.
10
+ attr_reader :uri
11
+
12
+ # Set the URI of this resource by parsing the given +uri+ (if
13
+ # necessary).
14
+ #
15
+ # @param [String, Addressable::URI] uri the uri to parse
16
+ def uri= uri
17
+ if uri.is_a?(Addressable::URI)
18
+ @uri = uri
19
+ else
20
+ begin
21
+ @uri = Addressable::URI.parse(uri.to_s)
22
+ rescue URI::InvalidURIError
23
+ @uri = Addressable::URI.parse(URI.encode(uri.to_s))
24
+ @encoded_uri = true
25
+ end
26
+ end
27
+ end
28
+
29
+ # The scheme of this resource. Will be +nil+ for local resources.
30
+ #
31
+ # @return [String]
32
+ def scheme
33
+ @scheme ||= uri.scheme
34
+ end
35
+
36
+ # The directory name of this resource's path.
37
+ #
38
+ # @return [String]
39
+ def dirname
40
+ @dirname ||= File.dirname(path)
41
+ end
42
+
43
+ # The basename of this resource's path.
44
+ #
45
+ # @return [String]
46
+ def basename
47
+ @basename ||= File.basename(path)
48
+ end
49
+
50
+ # Returns the extension (INCLUDING the '.') of this resource's
51
+ # path. Redefine this in an including class for which this is
52
+ # weird ('.tar.gz' I'm talking to you...)
53
+ #
54
+ # @return [String]
55
+ def extname
56
+ @extname ||= File.extname(path)
57
+ end
58
+
59
+ # Returns the extension (WITHOUT the '.') of this resource's path.
60
+ #
61
+ # @return [String]
62
+ def extension
63
+ @extension ||= extname[1..-1] || ''
64
+ end
65
+
66
+ # Returns the basename of the file with its extension removed
67
+ #
68
+ # IMW.open('/path/to/some_file.tar.gz').name # => some_file
69
+ #
70
+ # @return [String]
71
+ def name
72
+ @name ||= extname ? basename[0,basename.length - extname.length] : basename
73
+ end
74
+
75
+ # Returns the user associated with the host of this URI.
76
+ #
77
+ # @return [String]
78
+ def user
79
+ @user ||= uri.user
80
+ end
81
+
82
+ # Return the fragment part of this resource's URI.
83
+ #
84
+ # Will likely be +nil+ for local resources.
85
+ #
86
+ # @return [String]
87
+ def fragment
88
+ @fragment ||= uri.fragment
89
+ end
90
+
91
+ # Return the URI of this resource with any query strings and
92
+ # fragments removed.
93
+ #
94
+ # @return [Addressable::URI]
95
+ def stripped_uri
96
+ uri_args = returning({}) do |args|
97
+ %w[scheme userinfo host port path].each do |method|
98
+ args[method.to_sym] = respond_to?(method) ? send(method) : uri.send(method)
99
+ end
100
+ end
101
+ uri.class.new(uri_args)
102
+ end
103
+
104
+ def to_s
105
+ uri.to_s
106
+ end
107
+ end
108
+ end
109
+ end
110
+
111
+
112
+
113
+
114
+
@@ -3,7 +3,7 @@ ID,Name,Genus,Species
3
3
  002,Panamanian Night Monkey,Aotus,zonalis
4
4
  003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
5
5
  004,Gray-handed Night Monkey,Aotus,griseimembra
6
- 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
6
+ 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
7
7
  006,Brumback's Night Monkey,Aotus,brumbacki
8
8
  007,Three-striped Night Monkey,Aotus,trivirgatus
9
9
  008,Spix's Night Monkey,Aotus,vociferans
@@ -0,0 +1,11 @@
1
+ Store Number Store Name Address City State Zip Phone Number Latitude Longitude
2
+ 10344 Ace Gambles of Hotchkiss 121 E. Bridge Street Hotchkiss CO 81419 (970) 872-3535 38.7988 -107.7205
3
+ 10347 Al's Ace Hardware 4010 E 29th St Tucson AZ 85711 (520) 747-8144 32.1996 -110.9059
4
+ 10349 Al's Ace Hardware 4722 E Broadway Blvd Tucson AZ 85711 (520) 318-0414 32.2216 -110.8916
5
+ 10351 Al's Ace Hardware 2884 N Campbell Ave Tucson AZ 85719 (520) 325-2432 32.259 -110.9428
6
+ 10353 Lewistown Ace Hardware 815 NE Main St Lewistown MT 59457 (406) 538-4000 47.0702 -109.409
7
+ 10356 Byram Revell Ace Hardware 5726 Terry Rd Jackson MS 39272 (601) 371-8429 32.1919 -90.2542
8
+ 10357 Mr C's Ace Hardware 1201 Precinct Line Rd Hurst TX 76053 (817) 282-3428 32.831 -97.186
9
+ 10358 Gus Bock Ace Hardware 3455 Ridge Rd Lansing IL 60438 (708) 474-5940 41.5643 -87.5321
10
+ 10359 Elberton Ace Hardware 886 Elbert St Elberton GA 30635 (706) 283-3054 34.1008 -82.8495
11
+ 10361 Phil's Hardware 13 SW H St Madras OR 97741 (541) 475-9392 44.627737 -121.13004