imw 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -0,0 +1,410 @@
1
+ ---
2
+ Mandrillus:
3
+ - :species: sphinx
4
+ :name: Mandrill
5
+ :id: 113
6
+ - :species: leucophaeus
7
+ :name: Drill
8
+ :id: 114
9
+ Rungwecebus:
10
+ - :species: kipunji
11
+ :name: Kipunji
12
+ :id: 100
13
+ Miopithecus:
14
+ - :species: talapoin
15
+ :name: Angolan Talapoin
16
+ :id: 38
17
+ - :species: ogouensis
18
+ :name: Gabon Talapoin
19
+ :id: 39
20
+ Presbytis:
21
+ - :species: rubicunda
22
+ :name: Maroon Leaf Monkey
23
+ :id: 130
24
+ Lophocebus:
25
+ - :species: albigena
26
+ :name: Grey-cheeked Mangabey
27
+ :id: 94
28
+ - :species: aterrimus
29
+ :name: Black Crested Mangabey
30
+ :id: 95
31
+ - :species: opdenboschi
32
+ :name: "Opdenbosch's Mangabey"
33
+ :id: 96
34
+ - :species: ugandae
35
+ :name: Uganda Mangabey
36
+ :id: 97
37
+ - :species: johnstoni
38
+ :name: "Johnston's Mangabey"
39
+ :id: 98
40
+ - :species: osmani
41
+ :name: "Osman Hill's Mangabey"
42
+ :id: 99
43
+ Erythrocebus:
44
+ - :species: patas
45
+ :name: Patas Monkey
46
+ :id: 40
47
+ Piliocolobus:
48
+ - :species: badius
49
+ :name: Western Red Colobus
50
+ :id: 120
51
+ - :species: pennantii
52
+ :name: "Pennant's Colobus"
53
+ :id: 121
54
+ - :species: preussi
55
+ :name: "Preuss's Red Colobus"
56
+ :id: 122
57
+ - :species: tholloni
58
+ :name: "Thollon's Red Colobus"
59
+ :id: 123
60
+ - :species: foai
61
+ :name: Central African Red Colobus
62
+ :id: 124
63
+ - :species: tephrosceles
64
+ :name: Ugandan Red Colobus
65
+ :id: 125
66
+ - :species: gordonorum
67
+ :name: Uzungwa Red Colobus
68
+ :id: 126
69
+ - :species: kirkii
70
+ :name: Zanzibar Red Colobus
71
+ :id: 127
72
+ - :species: rufomitratus
73
+ :name: Tana River Red Colobus
74
+ :id: 128
75
+ Cercocebus:
76
+ - :species: atys
77
+ :name: Sooty Mangabey
78
+ :id: 107
79
+ - :species: torquatus
80
+ :name: Collared Mangabey
81
+ :id: 108
82
+ - :species: agilis
83
+ :name: Agile Mangabey
84
+ :id: 109
85
+ - :species: chrysogaster
86
+ :name: Golden-bellied Mangabey
87
+ :id: 110
88
+ - :species: galeritus
89
+ :name: Tana River Mangabey
90
+ :id: 111
91
+ - :species: sanjei
92
+ :name: Sanje Mangabey
93
+ :id: 112
94
+ Theropithecus:
95
+ - :species: gelada
96
+ :name: Gelada
97
+ :id: 106
98
+ Papio:
99
+ - :species: hamadryas
100
+ :name: Hamadryas Baboon
101
+ :id: 101
102
+ - :species: papio
103
+ :name: Guinea Baboon
104
+ :id: 102
105
+ - :species: anubis
106
+ :name: Olive Baboon
107
+ :id: 103
108
+ - :species: cynocephalus
109
+ :name: Yellow Baboon
110
+ :id: 104
111
+ - :species: ursinus
112
+ :name: Chacma Baboon
113
+ :id: 105
114
+ Chlorocebus:
115
+ - :species: sabaeus
116
+ :name: Green Monkey
117
+ :id: 41
118
+ - :species: aethiops
119
+ :name: Grivet
120
+ :id: 42
121
+ - :species: djamdjamensis
122
+ :name: Bale Mountains Vervet
123
+ :id: 43
124
+ - :species: tantalus
125
+ :name: Tantalus Monkey
126
+ :id: 44
127
+ - :species: pygerythrus
128
+ :name: Vervet Monkey
129
+ :id: 45
130
+ - :species: cynosuros
131
+ :name: Malbrouck
132
+ :id: 46
133
+ Allenopithecus:
134
+ - :species: nigroviridis
135
+ :name: "Allen's Swamp Monkey"
136
+ :id: 37
137
+ Procolobus:
138
+ - :species: verus
139
+ :name: Olive Colobus
140
+ :id: 129
141
+ Hylobates:
142
+ - :species: lar lar
143
+ :name: Malaysian Lar Gibbon
144
+ :id: 9
145
+ - :species: lar carpenteri
146
+ :name: "Carpenter's Lar Gibbon"
147
+ :id: 10
148
+ - :species: lar entelloides
149
+ :name: Central Lar Gibbon
150
+ :id: 11
151
+ - :species: lar vestitus
152
+ :name: Sumatran Lar Gibbon
153
+ :id: 12
154
+ - :species: lar yunnanensis
155
+ :name: Yunnan Lar Gibbon
156
+ :id: 13
157
+ - :species: agilis agilis
158
+ :name: Mountain Agile Gibbon
159
+ :id: 14
160
+ - :species: agilis albibarbis
161
+ :name: Bornean White-bearded Gibbon
162
+ :id: 15
163
+ - :species: agilis unko
164
+ :name: Lowland Agile Gibbon
165
+ :id: 16
166
+ - :species: muelleri muelleri
167
+ :name: "M\xC3\xBCller's Gray Gibbon"
168
+ :id: 17
169
+ - :species: muelleri abbotti
170
+ :name: "Abbott's Gray Gibbon"
171
+ :id: 18
172
+ - :species: muelleri funereus
173
+ :name: Northern Gray Gibbon
174
+ :id: 19
175
+ Aotus:
176
+ - :species: lemurinus
177
+ :name: Gray-bellied Night Monkey
178
+ :id: 1
179
+ - :species: zonalis
180
+ :name: Panamanian Night Monkey
181
+ :id: 2
182
+ - :species: jorgehernandezi
183
+ :name: "Hern\xC3\xA1ndez-Camacho's Night Monkey"
184
+ :id: 3
185
+ - :species: griseimembra
186
+ :name: Gray-handed Night Monkey
187
+ :id: 4
188
+ - :species: hershkovitzi
189
+ :name: "Hershkovitz's Night Monkey"
190
+ :id: 5
191
+ - :species: brumbacki
192
+ :name: "Brumback's Night Monkey"
193
+ :id: 6
194
+ - :species: trivirgatus
195
+ :name: Three-striped Night Monkey
196
+ :id: 7
197
+ - :species: vociferans
198
+ :name: "Spix's Night Monkey"
199
+ :id: 8
200
+ Macaca:
201
+ - :species: sylvanus
202
+ :name: Barbary Macaque
203
+ :id: 72
204
+ - :species: silenus
205
+ :name: Lion-tailed Macaque
206
+ :id: 73
207
+ - :species: nemestrina
208
+ :name: Southern Pig-tailed Macaque or Beruk
209
+ :id: 74
210
+ - :species: leonina
211
+ :name: Northern Pig-tailed Macaque
212
+ :id: 75
213
+ - :species: pagensis
214
+ :name: Pagai Island Macaque or Bokkoi
215
+ :id: 76
216
+ - :species: siberu
217
+ :name: Siberut Macaque
218
+ :id: 77
219
+ - :species: maura
220
+ :name: Moor Macaque
221
+ :id: 78
222
+ - :species: ochreata
223
+ :name: Booted Macaque
224
+ :id: 79
225
+ - :species: tonkeana
226
+ :name: Tonkean Macaque
227
+ :id: 80
228
+ - :species: hecki
229
+ :name: "Heck's Macaque"
230
+ :id: 81
231
+ - :species: nigrescens
232
+ :name: Gorontalo Macaque
233
+ :id: 82
234
+ - :species: nigra
235
+ :name: Celebes Crested Macaque or Black Ape
236
+ :id: 83
237
+ - :species: fascicularis
238
+ :name: Crab-eating Macaque or Long-tailed Macaque or Kera
239
+ :id: 84
240
+ - :species: arctoides
241
+ :name: Stump-tailed Macaque or Bear Macaque
242
+ :id: 85
243
+ - :species: mulatta
244
+ :name: Rhesus Macaque
245
+ :id: 86
246
+ - :species: cyclopis
247
+ :name: Formosan Rock Macaque
248
+ :id: 87
249
+ - :species: fuscata
250
+ :name: Japanese Macaque
251
+ :id: 88
252
+ - :species: sinica
253
+ :name: Toque Macaque
254
+ :id: 89
255
+ - :species: radiata
256
+ :name: Bonnet Macaque
257
+ :id: 90
258
+ - :species: assamensis
259
+ :name: Assam Macaque
260
+ :id: 91
261
+ - :species: thibetana
262
+ :name: "Tibetan Macaque or Milne-Edwards' Macaque"
263
+ :id: 92
264
+ - :species: munzala
265
+ :name: Arunachal Macaque or Munzala
266
+ :id: 93
267
+ Colobus:
268
+ - :species: satanas
269
+ :name: Black Colobus
270
+ :id: 115
271
+ - :species: angolensis
272
+ :name: Angola Colobus
273
+ :id: 116
274
+ - :species: polykomos
275
+ :name: King Colobus
276
+ :id: 117
277
+ - :species: vellerosus
278
+ :name: Ursine Colobus
279
+ :id: 118
280
+ - :species: guereza
281
+ :name: Mantled Guereza
282
+ :id: 119
283
+ Cercopithecus:
284
+ - :species: dryas
285
+ :name: Dryas Monkey or Salongo Monkey
286
+ :id: 47
287
+ - :species: diana
288
+ :name: Diana Monkey
289
+ :id: 48
290
+ - :species: roloway
291
+ :name: Roloway Monkey
292
+ :id: 49
293
+ - :species: nictitans
294
+ :name: Greater Spot-nosed Monkey
295
+ :id: 50
296
+ - :species: mitis
297
+ :name: Blue Monkey
298
+ :id: 51
299
+ - :species: doggetti
300
+ :name: Silver Monkey
301
+ :id: 52
302
+ - :species: kandti
303
+ :name: Golden Monkey
304
+ :id: 53
305
+ - :species: albogularis
306
+ :name: "Sykes's Monkey"
307
+ :id: 54
308
+ - :species: mona
309
+ :name: Mona Monkey
310
+ :id: 55
311
+ - :species: campbelli
312
+ :name: "Campbell's Mona Monkey"
313
+ :id: 56
314
+ - :species: lowei
315
+ :name: "Lowe's Mona Monkey"
316
+ :id: 57
317
+ - :species: pogonias
318
+ :name: Crested Mona Monkey
319
+ :id: 58
320
+ - :species: wolfi
321
+ :name: "Wolf's Mona Monkey"
322
+ :id: 59
323
+ - :species: denti
324
+ :name: "Dent's Mona Monkey"
325
+ :id: 60
326
+ - :species: petaurista
327
+ :name: Lesser Spot-nosed Monkey
328
+ :id: 61
329
+ - :species: erythrogaster
330
+ :name: White-throated Guenon
331
+ :id: 62
332
+ - :species: sclateri
333
+ :name: "Sclater's Guenon"
334
+ :id: 63
335
+ - :species: erythrotis
336
+ :name: Red-eared Guenon
337
+ :id: 64
338
+ - :species: cephus
339
+ :name: Moustached Guenon
340
+ :id: 65
341
+ - :species: ascanius
342
+ :name: Red-tailed Monkey
343
+ :id: 66
344
+ - :species: lhoesti
345
+ :name: L'Hoest's Monkey
346
+ :id: 67
347
+ - :species: preussi
348
+ :name: "Preuss's Monkey"
349
+ :id: 68
350
+ - :species: solatus
351
+ :name: Sun-tailed Monkey
352
+ :id: 69
353
+ - :species: hamlyni
354
+ :name: "Hamlyn's Monkey"
355
+ :id: 70
356
+ - :species: neglectus
357
+ :name: "De Brazza's Monkey"
358
+ :id: 71
359
+ Saguinas:
360
+ - :species: niger
361
+ :name: Black Tamarin
362
+ :id: 20
363
+ - :species: nigricollis
364
+ :name: Black-mantled Tamarin
365
+ :id: 21
366
+ - :species: fuscicollis
367
+ :name: Brown-mantled Tamarin
368
+ :id: 22
369
+ - :species: oedipus
370
+ :name: "Cottontop Tamarin or Pinch\xC3\xA9 Tamarin"
371
+ :id: 23
372
+ - :species: imperator
373
+ :name: Emperor Tamarin
374
+ :id: 24
375
+ - :species: geoffroyi
376
+ :name: "Geoffroy's Tamarin"
377
+ :id: 25
378
+ - :species: tripartitus
379
+ :name: Golden-mantled Tamarin
380
+ :id: 26
381
+ - :species: graellsi
382
+ :name: "Graells's Tamarin"
383
+ :id: 27
384
+ - :species: martinsi
385
+ :name: "Martins's Tamarin"
386
+ :id: 28
387
+ - :species: inustus
388
+ :name: Mottle-faced Tamarin
389
+ :id: 29
390
+ - :species: mystax
391
+ :name: Moustached Tamarin
392
+ :id: 30
393
+ - :species: bicolor
394
+ :name: Pied Tamarin
395
+ :id: 31
396
+ - :species: pileatus
397
+ :name: Red-capped Tamarin
398
+ :id: 32
399
+ - :species: midas
400
+ :name: Red-handed Tamarin
401
+ :id: 33
402
+ - :species: leucopus
403
+ :name: White-footed Tamarin
404
+ :id: 34
405
+ - :species: labiatus
406
+ :name: White-lipped Tamarin
407
+ :id: 35
408
+ - :species: melanoleucus
409
+ :name: White-mantled Tamarin
410
+ :id: 36
@@ -0,0 +1,11 @@
1
+ ---
2
+ - id
3
+ - :name: :name
4
+ :title: Common Name
5
+ :type: :string
6
+ - :name: genus
7
+ :type: :string
8
+ :title: Genus
9
+ - :name: :species
10
+ :type: :string
11
+ :title: Species
@@ -5,7 +5,7 @@ describe IMW::Formats::Csv do
5
5
  # effect only code within the FasterCSV library
6
6
 
7
7
  before do
8
- @sample = IMW.open(File.join(IMWTest::DATA_DIR, 'sample.csv'))
8
+ @sample = IMW.open(File.join(IMWTest::DATA_DIR, 'formats/delimited/sample.csv'))
9
9
  end
10
10
 
11
11
  it "should be able to parse the CSV" do
@@ -14,8 +14,40 @@ describe IMW::Formats::Csv do
14
14
 
15
15
  it "should be able to write CSV" do
16
16
  data = [['foobar', 1, 2], ['bazbooz', 3, 4]]
17
- IMW.open!('test.csv').dump(data)
17
+ IMW.open!('test.csv') { |f| f << data }
18
18
  IMW.open('test.csv').load[1].last.should == "4"
19
19
  end
20
20
 
21
+ it "should raise an error on an invalid schema" do
22
+ lambda { @sample.schema = [{:name => :foobar, :has_many => {:associations => [:foo, :bar]}}] }.should raise_error(IMW::SchemaError)
23
+ end
24
+
25
+ it "should accept a valid schema" do
26
+ @sample.schema = [:foo, :bar, :baz]
27
+ @sample.schema.should == [{:name => 'foo'}, {:name => 'bar'}, {:name => 'baz'}]
28
+ end
29
+
30
+ describe "guessing a schema" do
31
+
32
+ Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].each do |path|
33
+ it "should correctly guess that with_schema/#{File.basename(path)} has headers in its first row" do
34
+ IMW.open(path).headers_in_first_line?.should == true
35
+ end
36
+ end
37
+
38
+ Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/without_schema/*')].each do |path|
39
+ it "should correctly guess that without_schema/#{File.basename(path)} does not have headers in its first row" do
40
+ IMW.open(path).headers_in_first_line?.should == false
41
+ end
42
+ end
43
+
44
+ it "should automatically set the headers on a source with guessed headers" do
45
+ resource = IMW.open(Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].first)
46
+ resource.guess_schema!
47
+ resource.delimited_options[:headers].class.should == Array
48
+ resource.schema.should_not be_empty
49
+ end
50
+
51
+ end
52
+
21
53
  end