imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -0,0 +1,7 @@
module IMW
  # Namespace for the serializer classes.
  module Serializer

    # Register JsonSerializer for lazy loading. The constant name has to be
    # passed as a Symbol: a bare `JsonSerializer` is evaluated as a constant
    # reference right away and raises NameError before autoload can register it.
    autoload :JsonSerializer, 'imw/serializer/json_serializer'

  end
end
@@ -0,0 +1,17 @@
module IMW
  module Serializer
    # Writes data to a file that is opened when the serializer is created.
    # Whatever is handed to #write is written as-is; no encoding is applied here.
    class JsonSerializer

      # Open +file_url+ with the given +mode+ (passed straight to File.open).
      #
      # file_url - path of the file to open
      # mode     - File.open mode string, e.g. 'w'
      #
      # Raises whatever File.open raises when the file cannot be opened.
      def initialize file_url, mode
        @file_url = file_url
        @mode     = mode
        # Keep the handle so #write has something to write to and #close can
        # release it; discarding it would leak the descriptor.
        @file     = File.open(file_url, mode)
      end

      # Write +line+ to the open file and return the number of bytes written.
      # No line terminator is appended.
      def write line
        @file.write(line)
      end

      # Flush and release the underlying file. Calling it again is a no-op.
      def close
        @file.close unless @file.closed?
      end

    end
  end
end
@@ -0,0 +1,41 @@
module IMW

  # Classifies a URI string by storage scheme and by file format.
  #
  # Both values are looked up once, at construction time, by matching the
  # string against the pattern tables below, and are exposed as the name
  # strings stored in those tables (e.g. 'S3', 'Csv').
  class Uri

    attr_reader :scheme, :format

    # Scheme prefix => scheme name. Anchored with \A so only the very start
    # of the string is considered, not the start of any embedded line.
    @@schemes = {
      %r{\Ahdfs:} => 'Hdfs',
      %r{\As3:}   => 'S3',
    }

    # File extension => format name. The dot is escaped so that it matches a
    # literal '.', and \z anchors at the true end of the string.
    @@formats = {
      %r{\.csv\z}   => 'Csv',
      %r{\.tsv\z}   => 'Tsv',
      %r{\.json\z}  => 'Json',
      %r{\.ya?ml\z} => 'Yaml',
    }

    # uri - the URI or path string to classify
    #
    # Raises InvalidFormatError when the extension is not in the format table.
    def initialize uri
      @scheme = lookup_scheme(uri)
      @format = lookup_format(uri)
    end

    # Return the name of the first scheme whose pattern matches +uri+,
    # or 'Local' when none does.
    def lookup_scheme uri
      @@schemes.each_pair do |pattern, name|
        return name if uri =~ pattern
      end
      'Local'
    end

    # Return the name of the first format whose pattern matches +uri+.
    #
    # Raises InvalidFormatError when no pattern matches.
    def lookup_format uri
      @@formats.each_pair do |pattern, name|
        return name if uri =~ pattern
      end
      # NOTE(review): the uri spec expects IMW::Error::InvalidFormatError;
      # confirm this unqualified constant resolves to that class from here.
      raise InvalidFormatError.new("#{File.extname(uri)} is not currently supported")
    end

  end
end
@@ -0,0 +1,78 @@
require 'rubygems'
require 'rspec'
require 'imw' ; include IMW

# Specs for the class-level and instance-level interface of IMW::Resource.
describe "IMW::Resource" do

  context "The Resource class" do

    it "should respond to the method open" do
      IMW::Resource.should respond_to(:open)
    end

    context "Resource.open" do
      before :each do
        @uri = "test.csv"
      end

      it "should return an IMW::Resource object" do
        Resource.open(@uri).should be_instance_of(IMW::Resource)
      end

      it "should return the value of the block if given a block" do
        Resource.open(@uri) { |obj| nil }.should be_nil
      end

      it "should accept a block and yield an IMW::Resource object" do
        Resource.open(@uri) do |obj|
          obj.should be_instance_of(IMW::Resource)
        end
      end

    end

    it "should respond to the method exists?" do
      Resource.should respond_to(:exists?)
    end

    context "Resource.exists?" do
      before :each do
        @file = "test"
      end

      it "should return either true or false" do
        # Comparing the result with its own double negation checks that it is
        # a strict boolean. Both calls must use @file; an unset instance
        # variable would silently pass nil to the second call.
        Resource.exists?(@file).should == !!Resource.exists?(@file)
      end

    end
  end

  # TODO: placeholder example -- the body makes no assertion yet, so it
  # passes without exercising any read behaviour.
  it "should read a Resource and return a string" do
    Resource
  end

  context "A Resource instance" do
    before :each do
      @uri      = "test.csv"
      @resource = Resource.new(@uri)
    end

    it "should accept a Resource access mode when instantiated" do
      lambda { Resource.new(@uri, 'w') }.should_not raise_error(Exception)
    end

    it "should raise an error if given an invalid Resource mode" do
      lambda { Resource.new(@uri, 'f') }.should raise_error(IMW::Error::FileModeError)
    end

    it "should return the uri as a IMW::Uri object" do
      @resource.uri.should be_instance_of(IMW::Uri)
    end

    it "should respond to the method close" do
      @resource.should respond_to(:close)
    end

  end

end
@@ -0,0 +1,55 @@
require 'rubygems'
require 'rspec'
require 'imw' ; include IMW

# Specs for scheme and format detection in IMW::Uri.
describe "IMW::Uri" do
  before :each do
    @uri = Uri.new("test.csv")
  end

  it "should respond to scheme" do
    @uri.should respond_to(:scheme)
  end

  context "A Uri.scheme" do
    before :each do
      @local = "/path/to/file.csv"
      @s3    = "s3://s3-bucket/path/to/file.csv"
      @hdfs  = "hdfs://namenodehost/path/to/file.csv"
    end

    it "should understand the local file scheme" do
      Uri.new(@local).scheme.should == 'Local'
    end

    it "should understand the s3 file scheme" do
      Uri.new(@s3).scheme.should == 'S3'
    end

    it "should understand the hadoop file system" do
      Uri.new(@hdfs).scheme.should == 'Hdfs'
    end

  end

  it "should respond to format" do
    @uri.should respond_to(:format)
  end

  context "A Uri.format" do
    before :each do
      @csv     = "foo.csv"
      @tsv     = "foo.tsv"
      @json    = "foo.json"
      @yaml    = "foo.yml"
      @invalid = "foor.bar"
    end

    # One example per supported extension, so the fixtures above are
    # actually exercised.
    it "should understand the csv format" do
      Uri.new(@csv).format.should == 'Csv'
    end

    it "should understand the tsv format" do
      Uri.new(@tsv).format.should == 'Tsv'
    end

    it "should understand the json format" do
      Uri.new(@json).format.should == 'Json'
    end

    it "should understand the yaml format" do
      Uri.new(@yaml).format.should == 'Yaml'
    end

    it "should raise an error when given an invalid format" do
      lambda { Uri.new(@invalid) }.should raise_error(IMW::Error::InvalidFormatError)
    end

  end

end
metadata CHANGED
@@ -1,28 +1,27 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: imw
3
3
  version: !ruby/object:Gem::Version
4
- hash: 51
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- - 18
10
- version: 0.2.18
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
11
11
  platform: ruby
12
12
  authors:
13
+ - Travis Dempsey
13
14
  - Dhruv Bansal
14
- - Philip (flip) Kromer
15
+ - mrflip
15
16
  autorequire:
16
17
  bindir: bin
17
18
  cert_chain: []
18
19
 
19
- date: 2011-02-16 00:00:00 -06:00
20
+ date: 2011-07-15 00:00:00 -05:00
20
21
  default_executable:
21
22
  dependencies:
22
23
  - !ruby/object:Gem::Dependency
23
- prerelease: false
24
- name: activesupport
25
- version_requirements: &id001 !ruby/object:Gem::Requirement
24
+ requirement: &id001 !ruby/object:Gem::Requirement
26
25
  none: false
27
26
  requirements:
28
27
  - - ">="
@@ -31,12 +30,12 @@ dependencies:
31
30
  segments:
32
31
  - 0
33
32
  version: "0"
34
- requirement: *id001
35
33
  type: :runtime
36
- - !ruby/object:Gem::Dependency
34
+ name: nokogiri
37
35
  prerelease: false
38
- name: addressable
39
- version_requirements: &id002 !ruby/object:Gem::Requirement
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ requirement: &id002 !ruby/object:Gem::Requirement
40
39
  none: false
41
40
  requirements:
42
41
  - - ">="
@@ -45,26 +44,44 @@ dependencies:
45
44
  segments:
46
45
  - 0
47
46
  version: "0"
48
- requirement: *id002
49
- type: :runtime
50
- - !ruby/object:Gem::Dependency
47
+ type: :development
48
+ name: shoulda
51
49
  prerelease: false
52
- name: uuidtools
53
- version_requirements: &id003 !ruby/object:Gem::Requirement
50
+ version_requirements: *id002
51
+ - !ruby/object:Gem::Dependency
52
+ requirement: &id003 !ruby/object:Gem::Requirement
54
53
  none: false
55
54
  requirements:
56
- - - ">="
55
+ - - ~>
57
56
  - !ruby/object:Gem::Version
58
- hash: 3
57
+ hash: 23
59
58
  segments:
59
+ - 1
60
60
  - 0
61
- version: "0"
62
- requirement: *id003
63
- type: :runtime
61
+ - 0
62
+ version: 1.0.0
63
+ type: :development
64
+ name: bundler
65
+ prerelease: false
66
+ version_requirements: *id003
64
67
  - !ruby/object:Gem::Dependency
68
+ requirement: &id004 !ruby/object:Gem::Requirement
69
+ none: false
70
+ requirements:
71
+ - - ~>
72
+ - !ruby/object:Gem::Version
73
+ hash: 7
74
+ segments:
75
+ - 1
76
+ - 5
77
+ - 2
78
+ version: 1.5.2
79
+ type: :development
80
+ name: jeweler
65
81
  prerelease: false
66
- name: rake
67
- version_requirements: &id004 !ruby/object:Gem::Requirement
82
+ version_requirements: *id004
83
+ - !ruby/object:Gem::Dependency
84
+ requirement: &id005 !ruby/object:Gem::Requirement
68
85
  none: false
69
86
  requirements:
70
87
  - - ">="
@@ -73,175 +90,50 @@ dependencies:
73
90
  segments:
74
91
  - 0
75
92
  version: "0"
76
- requirement: *id004
77
- type: :runtime
78
- description: The Infinite Monkeywrench (IMW) is a Ruby frameworks to simplify the tasks of acquiring, extracting, transforming, loading, and packaging data. It minimizes programmer time by encapsulating common data workflows and patterns and creating interfaces to many other useful Ruby libraries.
79
- email: coders@infochimps.org
80
- executables:
81
- - imw
82
- - tsv_to_json.rb
93
+ type: :development
94
+ name: rcov
95
+ prerelease: false
96
+ version_requirements: *id005
97
+ description: Infinite Monkey Wrench - A framework to make collecting and parsing data fun again.
98
+ email: travis@infochimps.com
99
+ executables: []
100
+
83
101
  extensions: []
84
102
 
85
103
  extra_rdoc_files:
86
- - LICENSE
87
- - README.rdoc
104
+ - LICENSE.txt
105
+ - README.textile
88
106
  files:
89
107
  - Gemfile
90
108
  - Gemfile.lock
91
- - LICENSE
92
- - README.rdoc
109
+ - LICENSE.txt
110
+ - README.textile
93
111
  - Rakefile
94
112
  - VERSION
95
- - bin/imw
96
- - bin/tsv_to_json.rb
97
- - etc/imwrc.rb
98
- - examples/dataset.rb
99
- - examples/metadata.yml
113
+ - examples/foo.rb
114
+ - examples/html_selector.rb
115
+ - examples/nes_game_list.csv
116
+ - examples/nes_gamespot.csv
117
+ - examples/nes_nintendo.csv
118
+ - examples/nes_unlicensed.csv
119
+ - examples/nes_wikipedia.csv
120
+ - examples/nibbler_test.rb
121
+ - examples/script.rb
100
122
  - lib/imw.rb
101
- - lib/imw/archives.rb
102
- - lib/imw/archives/rar.rb
103
- - lib/imw/archives/tar.rb
104
- - lib/imw/archives/tarbz2.rb
105
- - lib/imw/archives/targz.rb
106
- - lib/imw/archives/zip.rb
107
- - lib/imw/boot.rb
108
- - lib/imw/compressed_files.rb
109
- - lib/imw/compressed_files/bz2.rb
110
- - lib/imw/compressed_files/compressible.rb
111
- - lib/imw/compressed_files/gz.rb
112
- - lib/imw/dataset.rb
113
- - lib/imw/dataset/paths.rb
114
- - lib/imw/dataset/workflow.rb
115
- - lib/imw/formats.rb
116
- - lib/imw/formats/delimited.rb
117
- - lib/imw/formats/excel.rb
118
- - lib/imw/formats/json.rb
119
- - lib/imw/formats/pdf.rb
120
- - lib/imw/formats/sgml.rb
121
- - lib/imw/formats/yaml.rb
122
- - lib/imw/metadata.rb
123
- - lib/imw/metadata/contains_metadata.rb
124
- - lib/imw/metadata/dsl.rb
125
- - lib/imw/metadata/field.rb
126
- - lib/imw/metadata/has_metadata.rb
127
- - lib/imw/metadata/has_summary.rb
128
- - lib/imw/metadata/schema.rb
129
- - lib/imw/parsers.rb
130
- - lib/imw/parsers/flat.rb
131
- - lib/imw/parsers/html_parser.rb
132
- - lib/imw/parsers/html_parser/matchers.rb
133
- - lib/imw/parsers/line_parser.rb
134
- - lib/imw/parsers/regexp_parser.rb
135
- - lib/imw/repository.rb
123
+ - lib/imw/error.rb
124
+ - lib/imw/recordizer.rb
125
+ - lib/imw/recordizer/html_selector_recordizer.rb
126
+ - lib/imw/recordizer/string_slice_recordizer.rb
136
127
  - lib/imw/resource.rb
137
- - lib/imw/runner.rb
138
- - lib/imw/schemes.rb
139
- - lib/imw/schemes/ftp.rb
140
- - lib/imw/schemes/hdfs.rb
141
- - lib/imw/schemes/http.rb
142
- - lib/imw/schemes/local.rb
143
- - lib/imw/schemes/remote.rb
144
- - lib/imw/schemes/s3.rb
145
- - lib/imw/schemes/sql.rb
146
- - lib/imw/tools.rb
147
- - lib/imw/tools/aggregator.rb
148
- - lib/imw/tools/archiver.rb
149
- - lib/imw/tools/downloader.rb
150
- - lib/imw/tools/extension_analyzer.rb
151
- - lib/imw/tools/summarizer.rb
152
- - lib/imw/tools/transferer.rb
153
- - lib/imw/utils.rb
154
- - lib/imw/utils/dynamically_extendable.rb
155
- - lib/imw/utils/error.rb
156
- - lib/imw/utils/extensions/hpricot.rb
157
- - lib/imw/utils/has_uri.rb
158
- - lib/imw/utils/log.rb
159
- - lib/imw/utils/misc.rb
160
- - lib/imw/utils/paths.rb
161
- - lib/imw/utils/uri.rb
162
- - lib/imw/utils/uuid.rb
163
- - lib/imw/utils/validate.rb
164
- - lib/imw/utils/version.rb
165
- - spec/data/formats/delimited/sample.csv
166
- - spec/data/formats/delimited/sample.tsv
167
- - spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv
168
- - spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv
169
- - spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv
170
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv
171
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv
172
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv
173
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv
174
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv
175
- - spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv
176
- - spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv
177
- - spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv
178
- - spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv
179
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv
180
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv
181
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv
182
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv
183
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv
184
- - spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv
185
- - spec/data/formats/excel/sample.xls
186
- - spec/data/formats/json/sample.json
187
- - spec/data/formats/none/sample
188
- - spec/data/formats/sgml/sample.xml
189
- - spec/data/formats/text/sample.txt
190
- - spec/data/formats/yaml/sample.yaml
191
- - spec/data/schema-tabular.yaml
192
- - spec/imw/archives/rar_spec.rb
193
- - spec/imw/archives/tar_spec.rb
194
- - spec/imw/archives/tarbz2_spec.rb
195
- - spec/imw/archives/targz_spec.rb
196
- - spec/imw/archives/zip_spec.rb
197
- - spec/imw/archives_spec.rb
198
- - spec/imw/compressed_files/bz2_spec.rb
199
- - spec/imw/compressed_files/compressible_spec.rb
200
- - spec/imw/compressed_files/gz_spec.rb
201
- - spec/imw/compressed_files_spec.rb
202
- - spec/imw/dataset/paths_spec.rb
203
- - spec/imw/dataset/workflow_spec.rb
204
- - spec/imw/formats/delimited_spec.rb
205
- - spec/imw/formats/excel_spec.rb
206
- - spec/imw/formats/json_spec.rb
207
- - spec/imw/formats/sgml_spec.rb
208
- - spec/imw/formats/yaml_spec.rb
209
- - spec/imw/metadata/contains_metadata_spec.rb
210
- - spec/imw/metadata/field_spec.rb
211
- - spec/imw/metadata/has_metadata_spec.rb
212
- - spec/imw/metadata/has_summary_spec.rb
213
- - spec/imw/metadata/schema_spec.rb
214
- - spec/imw/metadata_spec.rb
215
- - spec/imw/parsers/line_parser_spec.rb
216
- - spec/imw/parsers/regexp_parser_spec.rb
217
- - spec/imw/resource_spec.rb
218
- - spec/imw/schemes/hdfs_spec.rb
219
- - spec/imw/schemes/http_spec.rb
220
- - spec/imw/schemes/local_spec.rb
221
- - spec/imw/schemes/remote_spec.rb
222
- - spec/imw/schemes/s3_spec.rb
223
- - spec/imw/schemes/sql_spec.rb
224
- - spec/imw/tools/aggregator_spec.rb
225
- - spec/imw/tools/archiver_spec.rb
226
- - spec/imw/tools/extension_analyzer_spec.rb
227
- - spec/imw/tools/summarizer_spec.rb
228
- - spec/imw/tools/transferer_spec.rb
229
- - spec/imw/utils/dynamically_extendable_spec.rb
230
- - spec/imw/utils/has_uri_spec.rb
231
- - spec/imw/utils/paths_spec.rb
232
- - spec/imw/utils/shared_paths_spec.rb
233
- - spec/imw_spec.rb
234
- - spec/rcov.opts
235
- - spec/spec_helper.rb
236
- - spec/support/custom_matchers.rb
237
- - spec/support/file_contents_matcher.rb
238
- - spec/support/paths_matcher.rb
239
- - spec/support/random.rb
240
- - spec/support/without_regard_to_order_matcher.rb
128
+ - lib/imw/serializer.rb
129
+ - lib/imw/serializer/json_serializer.rb
130
+ - lib/imw/uri.rb
131
+ - spec/resource_spec.rb
132
+ - spec/uri_spec.rb
241
133
  has_rdoc: true
242
134
  homepage: http://github.com/infochimps/imw
243
- licenses: []
244
-
135
+ licenses:
136
+ - MIT
245
137
  post_install_message:
246
138
  rdoc_options: []
247
139
 
@@ -271,54 +163,11 @@ rubyforge_project:
271
163
  rubygems_version: 1.3.7
272
164
  signing_key:
273
165
  specification_version: 3
274
- summary: The Infinite Monkeywrench (IMW) makes acquiring, extracting, transforming, loading, and packaging data easy.
166
+ summary: Infinite Monkey Wrench - A framework to make collecting and parsing data fun again.
275
167
  test_files:
276
- - examples/dataset.rb
277
- - spec/imw/archives/rar_spec.rb
278
- - spec/imw/archives/tar_spec.rb
279
- - spec/imw/archives/tarbz2_spec.rb
280
- - spec/imw/archives/targz_spec.rb
281
- - spec/imw/archives/zip_spec.rb
282
- - spec/imw/archives_spec.rb
283
- - spec/imw/compressed_files/bz2_spec.rb
284
- - spec/imw/compressed_files/compressible_spec.rb
285
- - spec/imw/compressed_files/gz_spec.rb
286
- - spec/imw/compressed_files_spec.rb
287
- - spec/imw/dataset/paths_spec.rb
288
- - spec/imw/dataset/workflow_spec.rb
289
- - spec/imw/formats/delimited_spec.rb
290
- - spec/imw/formats/excel_spec.rb
291
- - spec/imw/formats/json_spec.rb
292
- - spec/imw/formats/sgml_spec.rb
293
- - spec/imw/formats/yaml_spec.rb
294
- - spec/imw/metadata/contains_metadata_spec.rb
295
- - spec/imw/metadata/field_spec.rb
296
- - spec/imw/metadata/has_metadata_spec.rb
297
- - spec/imw/metadata/has_summary_spec.rb
298
- - spec/imw/metadata/schema_spec.rb
299
- - spec/imw/metadata_spec.rb
300
- - spec/imw/parsers/line_parser_spec.rb
301
- - spec/imw/parsers/regexp_parser_spec.rb
302
- - spec/imw/resource_spec.rb
303
- - spec/imw/schemes/hdfs_spec.rb
304
- - spec/imw/schemes/http_spec.rb
305
- - spec/imw/schemes/local_spec.rb
306
- - spec/imw/schemes/remote_spec.rb
307
- - spec/imw/schemes/s3_spec.rb
308
- - spec/imw/schemes/sql_spec.rb
309
- - spec/imw/tools/aggregator_spec.rb
310
- - spec/imw/tools/archiver_spec.rb
311
- - spec/imw/tools/extension_analyzer_spec.rb
312
- - spec/imw/tools/summarizer_spec.rb
313
- - spec/imw/tools/transferer_spec.rb
314
- - spec/imw/utils/dynamically_extendable_spec.rb
315
- - spec/imw/utils/has_uri_spec.rb
316
- - spec/imw/utils/paths_spec.rb
317
- - spec/imw/utils/shared_paths_spec.rb
318
- - spec/imw_spec.rb
319
- - spec/spec_helper.rb
320
- - spec/support/custom_matchers.rb
321
- - spec/support/file_contents_matcher.rb
322
- - spec/support/paths_matcher.rb
323
- - spec/support/random.rb
324
- - spec/support/without_regard_to_order_matcher.rb
168
+ - examples/foo.rb
169
+ - examples/html_selector.rb
170
+ - examples/nibbler_test.rb
171
+ - examples/script.rb
172
+ - spec/resource_spec.rb
173
+ - spec/uri_spec.rb