imw 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -0,0 +1,410 @@
1
+ ---
2
+ Mandrillus:
3
+ - :species: sphinx
4
+ :name: Mandrill
5
+ :id: 113
6
+ - :species: leucophaeus
7
+ :name: Drill
8
+ :id: 114
9
+ Rungwecebus:
10
+ - :species: kipunji
11
+ :name: Kipunji
12
+ :id: 100
13
+ Miopithecus:
14
+ - :species: talapoin
15
+ :name: Angolan Talapoin
16
+ :id: 38
17
+ - :species: ogouensis
18
+ :name: Gabon Talapoin
19
+ :id: 39
20
+ Presbytis:
21
+ - :species: rubicunda
22
+ :name: Maroon Leaf Monkey
23
+ :id: 130
24
+ Lophocebus:
25
+ - :species: albigena
26
+ :name: Grey-cheeked Mangabey
27
+ :id: 94
28
+ - :species: aterrimus
29
+ :name: Black Crested Mangabey
30
+ :id: 95
31
+ - :species: opdenboschi
32
+ :name: "Opdenbosch's Mangabey"
33
+ :id: 96
34
+ - :species: ugandae
35
+ :name: Uganda Mangabey
36
+ :id: 97
37
+ - :species: johnstoni
38
+ :name: "Johnston's Mangabey"
39
+ :id: 98
40
+ - :species: osmani
41
+ :name: "Osman Hill's Mangabey"
42
+ :id: 99
43
+ Erythrocebus:
44
+ - :species: patas
45
+ :name: Patas Monkey
46
+ :id: 40
47
+ Piliocolobus:
48
+ - :species: badius
49
+ :name: Western Red Colobus
50
+ :id: 120
51
+ - :species: pennantii
52
+ :name: "Pennant's Colobus"
53
+ :id: 121
54
+ - :species: preussi
55
+ :name: "Preuss's Red Colobus"
56
+ :id: 122
57
+ - :species: tholloni
58
+ :name: "Thollon's Red Colobus"
59
+ :id: 123
60
+ - :species: foai
61
+ :name: Central African Red Colobus
62
+ :id: 124
63
+ - :species: tephrosceles
64
+ :name: Ugandan Red Colobus
65
+ :id: 125
66
+ - :species: gordonorum
67
+ :name: Uzungwa Red Colobus
68
+ :id: 126
69
+ - :species: kirkii
70
+ :name: Zanzibar Red Colobus
71
+ :id: 127
72
+ - :species: rufomitratus
73
+ :name: Tana River Red Colobus
74
+ :id: 128
75
+ Cercocebus:
76
+ - :species: atys
77
+ :name: Sooty Mangabey
78
+ :id: 107
79
+ - :species: torquatus
80
+ :name: Collared Mangabey
81
+ :id: 108
82
+ - :species: agilis
83
+ :name: Agile Mangabey
84
+ :id: 109
85
+ - :species: chrysogaster
86
+ :name: Golden-bellied Mangabey
87
+ :id: 110
88
+ - :species: galeritus
89
+ :name: Tana River Mangabey
90
+ :id: 111
91
+ - :species: sanjei
92
+ :name: Sanje Mangabey
93
+ :id: 112
94
+ Theropithecus:
95
+ - :species: gelada
96
+ :name: Gelada
97
+ :id: 106
98
+ Papio:
99
+ - :species: hamadryas
100
+ :name: Hamadryas Baboon
101
+ :id: 101
102
+ - :species: papio
103
+ :name: Guinea Baboon
104
+ :id: 102
105
+ - :species: anubis
106
+ :name: Olive Baboon
107
+ :id: 103
108
+ - :species: cynocephalus
109
+ :name: Yellow Baboon
110
+ :id: 104
111
+ - :species: ursinus
112
+ :name: Chacma Baboon
113
+ :id: 105
114
+ Chlorocebus:
115
+ - :species: sabaeus
116
+ :name: Green Monkey
117
+ :id: 41
118
+ - :species: aethiops
119
+ :name: Grivet
120
+ :id: 42
121
+ - :species: djamdjamensis
122
+ :name: Bale Mountains Vervet
123
+ :id: 43
124
+ - :species: tantalus
125
+ :name: Tantalus Monkey
126
+ :id: 44
127
+ - :species: pygerythrus
128
+ :name: Vervet Monkey
129
+ :id: 45
130
+ - :species: cynosuros
131
+ :name: Malbrouck
132
+ :id: 46
133
+ Allenopithecus:
134
+ - :species: nigroviridis
135
+ :name: "Allen's Swamp Monkey"
136
+ :id: 37
137
+ Procolobus:
138
+ - :species: verus
139
+ :name: Olive Colobus
140
+ :id: 129
141
+ Hylobates:
142
+ - :species: lar lar
143
+ :name: Malaysian Lar Gibbon
144
+ :id: 9
145
+ - :species: lar carpenteri
146
+ :name: "Carpenter's Lar Gibbon"
147
+ :id: 10
148
+ - :species: lar entelloides
149
+ :name: Central Lar Gibbon
150
+ :id: 11
151
+ - :species: lar vestitus
152
+ :name: Sumatran Lar Gibbon
153
+ :id: 12
154
+ - :species: lar yunnanensis
155
+ :name: Yunnan Lar Gibbon
156
+ :id: 13
157
+ - :species: agilis agilis
158
+ :name: Mountain Agile Gibbon
159
+ :id: 14
160
+ - :species: agilis albibarbis
161
+ :name: Bornean White-bearded Gibbon
162
+ :id: 15
163
+ - :species: agilis unko
164
+ :name: Lowland Agile Gibbon
165
+ :id: 16
166
+ - :species: muelleri muelleri
167
+ :name: "M\xC3\xBCller's Gray Gibbon"
168
+ :id: 17
169
+ - :species: muelleri abbotti
170
+ :name: "Abbott's Gray Gibbon"
171
+ :id: 18
172
+ - :species: muelleri funereus
173
+ :name: Northern Gray Gibbon
174
+ :id: 19
175
+ Aotus:
176
+ - :species: lemurinus
177
+ :name: Gray-bellied Night Monkey
178
+ :id: 1
179
+ - :species: zonalis
180
+ :name: Panamanian Night Monkey
181
+ :id: 2
182
+ - :species: jorgehernandezi
183
+ :name: "Hern\xC3\xA1ndez-Camacho's Night Monkey"
184
+ :id: 3
185
+ - :species: griseimembra
186
+ :name: Gray-handed Night Monkey
187
+ :id: 4
188
+ - :species: hershkovitzi
189
+ :name: "Hershkovitz's Night Monkey"
190
+ :id: 5
191
+ - :species: brumbacki
192
+ :name: "Brumback's Night Monkey"
193
+ :id: 6
194
+ - :species: trivirgatus
195
+ :name: Three-striped Night Monkey
196
+ :id: 7
197
+ - :species: vociferans
198
+ :name: "Spix's Night Monkey"
199
+ :id: 8
200
+ Macaca:
201
+ - :species: sylvanus
202
+ :name: Barbary Macaque
203
+ :id: 72
204
+ - :species: silenus
205
+ :name: Lion-tailed Macaque
206
+ :id: 73
207
+ - :species: nemestrina
208
+ :name: Southern Pig-tailed Macaque or Beruk
209
+ :id: 74
210
+ - :species: leonina
211
+ :name: Northern Pig-tailed Macaque
212
+ :id: 75
213
+ - :species: pagensis
214
+ :name: Pagai Island Macaque or Bokkoi
215
+ :id: 76
216
+ - :species: siberu
217
+ :name: Siberut Macaque
218
+ :id: 77
219
+ - :species: maura
220
+ :name: Moor Macaque
221
+ :id: 78
222
+ - :species: ochreata
223
+ :name: Booted Macaque
224
+ :id: 79
225
+ - :species: tonkeana
226
+ :name: Tonkean Macaque
227
+ :id: 80
228
+ - :species: hecki
229
+ :name: "Heck's Macaque"
230
+ :id: 81
231
+ - :species: nigrescens
232
+ :name: Gorontalo Macaque
233
+ :id: 82
234
+ - :species: nigra
235
+ :name: Celebes Crested Macaque or Black Ape
236
+ :id: 83
237
+ - :species: fascicularis
238
+ :name: Crab-eating Macaque or Long-tailed Macaque or Kera
239
+ :id: 84
240
+ - :species: arctoides
241
+ :name: Stump-tailed Macaque or Bear Macaque
242
+ :id: 85
243
+ - :species: mulatta
244
+ :name: Rhesus Macaque
245
+ :id: 86
246
+ - :species: cyclopis
247
+ :name: Formosan Rock Macaque
248
+ :id: 87
249
+ - :species: fuscata
250
+ :name: Japanese Macaque
251
+ :id: 88
252
+ - :species: sinica
253
+ :name: Toque Macaque
254
+ :id: 89
255
+ - :species: radiata
256
+ :name: Bonnet Macaque
257
+ :id: 90
258
+ - :species: assamensis
259
+ :name: Assam Macaque
260
+ :id: 91
261
+ - :species: thibetana
262
+ :name: "Tibetan Macaque or Milne-Edwards' Macaque"
263
+ :id: 92
264
+ - :species: munzala
265
+ :name: Arunachal Macaque or Munzala
266
+ :id: 93
267
+ Colobus:
268
+ - :species: satanas
269
+ :name: Black Colobus
270
+ :id: 115
271
+ - :species: angolensis
272
+ :name: Angola Colobus
273
+ :id: 116
274
+ - :species: polykomos
275
+ :name: King Colobus
276
+ :id: 117
277
+ - :species: vellerosus
278
+ :name: Ursine Colobus
279
+ :id: 118
280
+ - :species: guereza
281
+ :name: Mantled Guereza
282
+ :id: 119
283
+ Cercopithecus:
284
+ - :species: dryas
285
+ :name: Dryas Monkey or Salongo Monkey
286
+ :id: 47
287
+ - :species: diana
288
+ :name: Diana Monkey
289
+ :id: 48
290
+ - :species: roloway
291
+ :name: Roloway Monkey
292
+ :id: 49
293
+ - :species: nictitans
294
+ :name: Greater Spot-nosed Monkey
295
+ :id: 50
296
+ - :species: mitis
297
+ :name: Blue Monkey
298
+ :id: 51
299
+ - :species: doggetti
300
+ :name: Silver Monkey
301
+ :id: 52
302
+ - :species: kandti
303
+ :name: Golden Monkey
304
+ :id: 53
305
+ - :species: albogularis
306
+ :name: "Sykes's Monkey"
307
+ :id: 54
308
+ - :species: mona
309
+ :name: Mona Monkey
310
+ :id: 55
311
+ - :species: campbelli
312
+ :name: "Campbell's Mona Monkey"
313
+ :id: 56
314
+ - :species: lowei
315
+ :name: "Lowe's Mona Monkey"
316
+ :id: 57
317
+ - :species: pogonias
318
+ :name: Crested Mona Monkey
319
+ :id: 58
320
+ - :species: wolfi
321
+ :name: "Wolf's Mona Monkey"
322
+ :id: 59
323
+ - :species: denti
324
+ :name: "Dent's Mona Monkey"
325
+ :id: 60
326
+ - :species: petaurista
327
+ :name: Lesser Spot-nosed Monkey
328
+ :id: 61
329
+ - :species: erythrogaster
330
+ :name: White-throated Guenon
331
+ :id: 62
332
+ - :species: sclateri
333
+ :name: "Sclater's Guenon"
334
+ :id: 63
335
+ - :species: erythrotis
336
+ :name: Red-eared Guenon
337
+ :id: 64
338
+ - :species: cephus
339
+ :name: Moustached Guenon
340
+ :id: 65
341
+ - :species: ascanius
342
+ :name: Red-tailed Monkey
343
+ :id: 66
344
+ - :species: lhoesti
345
+ :name: L'Hoest's Monkey
346
+ :id: 67
347
+ - :species: preussi
348
+ :name: "Preuss's Monkey"
349
+ :id: 68
350
+ - :species: solatus
351
+ :name: Sun-tailed Monkey
352
+ :id: 69
353
+ - :species: hamlyni
354
+ :name: "Hamlyn's Monkey"
355
+ :id: 70
356
+ - :species: neglectus
357
+ :name: "De Brazza's Monkey"
358
+ :id: 71
359
+ Saguinas:
360
+ - :species: niger
361
+ :name: Black Tamarin
362
+ :id: 20
363
+ - :species: nigricollis
364
+ :name: Black-mantled Tamarin
365
+ :id: 21
366
+ - :species: fuscicollis
367
+ :name: Brown-mantled Tamarin
368
+ :id: 22
369
+ - :species: oedipus
370
+ :name: "Cottontop Tamarin or Pinch\xC3\xA9 Tamarin"
371
+ :id: 23
372
+ - :species: imperator
373
+ :name: Emperor Tamarin
374
+ :id: 24
375
+ - :species: geoffroyi
376
+ :name: "Geoffroy's Tamarin"
377
+ :id: 25
378
+ - :species: tripartitus
379
+ :name: Golden-mantled Tamarin
380
+ :id: 26
381
+ - :species: graellsi
382
+ :name: "Graells's Tamarin"
383
+ :id: 27
384
+ - :species: martinsi
385
+ :name: "Martins's Tamarin"
386
+ :id: 28
387
+ - :species: inustus
388
+ :name: Mottle-faced Tamarin
389
+ :id: 29
390
+ - :species: mystax
391
+ :name: Moustached Tamarin
392
+ :id: 30
393
+ - :species: bicolor
394
+ :name: Pied Tamarin
395
+ :id: 31
396
+ - :species: pileatus
397
+ :name: Red-capped Tamarin
398
+ :id: 32
399
+ - :species: midas
400
+ :name: Red-handed Tamarin
401
+ :id: 33
402
+ - :species: leucopus
403
+ :name: White-footed Tamarin
404
+ :id: 34
405
+ - :species: labiatus
406
+ :name: White-lipped Tamarin
407
+ :id: 35
408
+ - :species: melanoleucus
409
+ :name: White-mantled Tamarin
410
+ :id: 36
@@ -0,0 +1,11 @@
1
+ ---
2
+ - id
3
+ - :name: :name
4
+ :title: Common Name
5
+ :type: :string
6
+ - :name: genus
7
+ :type: :string
8
+ :title: Genus
9
+ - :name: :species
10
+ :type: :string
11
+ :title: Species
@@ -5,7 +5,7 @@ describe IMW::Formats::Csv do
5
5
  # effect only code within the FasterCSV library
6
6
 
7
7
  before do
8
- @sample = IMW.open(File.join(IMWTest::DATA_DIR, 'sample.csv'))
8
+ @sample = IMW.open(File.join(IMWTest::DATA_DIR, 'formats/delimited/sample.csv'))
9
9
  end
10
10
 
11
11
  it "should be able to parse the CSV" do
@@ -14,8 +14,40 @@ describe IMW::Formats::Csv do
14
14
 
15
15
  it "should be able to write CSV" do
16
16
  data = [['foobar', 1, 2], ['bazbooz', 3, 4]]
17
- IMW.open!('test.csv').dump(data)
17
+ IMW.open!('test.csv') { |f| f << data }
18
18
  IMW.open('test.csv').load[1].last.should == "4"
19
19
  end
20
20
 
21
+ it "should raise an error on an invalid schema" do
22
+ lambda { @sample.schema = [{:name => :foobar, :has_many => {:associations => [:foo, :bar]}}] }.should raise_error(IMW::SchemaError)
23
+ end
24
+
25
+ it "should accept a valid schema" do
26
+ @sample.schema = [:foo, :bar, :baz]
27
+ @sample.schema.should == [{:name => 'foo'}, {:name => 'bar'}, {:name => 'baz'}]
28
+ end
29
+
30
+ describe "guessing a schema" do
31
+
32
+ Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].each do |path|
33
+ it "should correctly guess that with_schema/#{File.basename(path)} has headers in its first row" do
34
+ IMW.open(path).headers_in_first_line?.should == true
35
+ end
36
+ end
37
+
38
+ Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/without_schema/*')].each do |path|
39
+ it "should correctly guess that without_schema/#{File.basename(path)} does not have headers in its first row" do
40
+ IMW.open(path).headers_in_first_line?.should == false
41
+ end
42
+ end
43
+
44
+ it "should automatically set the headers on a source with guessed headers" do
45
+ resource = IMW.open(Dir[File.join(IMWTest::DATA_DIR, 'formats/delimited/with_schema/*')].first)
46
+ resource.guess_schema!
47
+ resource.delimited_options[:headers].class.should == Array
48
+ resource.schema.should_not be_empty
49
+ end
50
+
51
+ end
52
+
21
53
  end