traject 0.0.2 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -0,0 +1,490 @@
1
+ # Map Language Codes (in 008[35-37], 041) to User Friendly Term
2
+
3
+ # ???: null
4
+ aar: Afar
5
+ abk: Abkhaz
6
+ ace: Achinese
7
+ ach: Acoli
8
+ ada: Adangme
9
+ ady: Adygei
10
+ afa: Afroasiatic (Other)
11
+ afh: Afrihili (Artificial language)
12
+ afr: Afrikaans
13
+ ajm: Aljamia
14
+ aka: Akan
15
+ akk: Akkadian
16
+ alb: Albanian
17
+ ale: Aleut
18
+ alg: Algonquian (Other)
19
+ amh: Amharic
20
+ ang: English, Old (ca. 450-1100)
21
+ apa: Apache languages
22
+ ara: Arabic
23
+ arc: Aramaic
24
+ arg: Aragonese Spanish
25
+ arm: Armenian
26
+ arn: Mapuche
27
+ arp: Arapaho
28
+ art: Artificial (Other)
29
+ arw: Arawak
30
+ asm: Assamese
31
+ ast: Bable
32
+ ath: Athapascan (Other)
33
+ aus: Australian languages
34
+ ava: Avaric
35
+ ave: Avestan
36
+ awa: Awadhi
37
+ aym: Aymara
38
+ aze: Azerbaijani
39
+ bad: Banda
40
+ bai: Bamileke languages
41
+ bak: Bashkir
42
+ bal: Baluchi
43
+ bam: Bambara
44
+ ban: Balinese
45
+ baq: Basque
46
+ bas: Basa
47
+ bat: Baltic (Other)
48
+ bej: Beja
49
+ bel: Belarusian
50
+ bem: Bemba
51
+ ben: Bengali
52
+ ber: Berber (Other)
53
+ bho: Bhojpuri
54
+ bih: Bihari
55
+ bik: Bikol
56
+ bin: Edo
57
+ bis: Bislama
58
+ bla: Siksika
59
+ bnt: Bantu (Other)
60
+ bos: Bosnian
61
+ bra: Braj
62
+ bre: Breton
63
+ btk: Batak
64
+ bua: Buriat
65
+ bug: Bugis
66
+ bul: Bulgarian
67
+ bur: Burmese
68
+ cad: Caddo
69
+ cai: Central American Indian (Other)
70
+ cam: Khmer
71
+ car: Carib
72
+ cat: Catalan
73
+ cau: Caucasian (Other)
74
+ ceb: Cebuano
75
+ cel: Celtic (Other)
76
+ cha: Chamorro
77
+ chb: Chibcha
78
+ che: Chechen
79
+ chg: Chagatai
80
+ chi: Chinese
81
+ chk: Truk
82
+ chm: Mari
83
+ chn: Chinook jargon
84
+ cho: Choctaw
85
+ chp: Chipewyan
86
+ chr: Cherokee
87
+ chu: Church Slavic
88
+ chv: Chuvash
89
+ chy: Cheyenne
90
+ cmc: Chamic languages
91
+ cop: Coptic
92
+ cor: Cornish
93
+ cos: Corsican
94
+ cpe: Creoles and Pidgins, English-based (Other)
95
+ cpf: Creoles and Pidgins, French-based (Other)
96
+ cpp: Creoles and Pidgins, Portuguese-based (Other)
97
+ cre: Cree
98
+ crh: Crimean Tatar
99
+ crp: Creoles and Pidgins (Other)
100
+ cus: Cushitic (Other)
101
+ cze: Czech
102
+ dak: Dakota
103
+ dan: Danish
104
+ dar: Dargwa
105
+ day: Dayak
106
+ del: Delaware
107
+ den: Slave
108
+ dgr: Dogrib
109
+ din: Dinka
110
+ div: Divehi
111
+ doi: Dogri
112
+ dra: Dravidian (Other)
113
+ dua: Duala
114
+ dum: Dutch, Middle (ca. 1050-1350)
115
+ dut: Dutch
116
+ dyu: Dyula
117
+ dzo: Dzongkha
118
+ efi: Efik
119
+ egy: Egyptian
120
+ eka: Ekajuk
121
+ elx: Elamite
122
+ eng: English
123
+ enm: English, Middle (1100-1500)
124
+ epo: Esperanto
125
+ esk: Eskimo languages
126
+ esp: Esperanto
127
+ est: Estonian
128
+ eth: Ethiopic
129
+ ewe: Ewe
130
+ ewo: Ewondo
131
+ fan: Fang
132
+ fao: Faroese
133
+ far: Faroese
134
+ fat: Fanti
135
+ fij: Fijian
136
+ fin: Finnish
137
+ fiu: Finno-Ugrian (Other)
138
+ fon: Fon
139
+ fre: French
140
+ fri: Frisian
141
+ frm: French, Middle (ca. 1400-1600)
142
+ fro: French, Old (ca. 842-1400)
143
+ fry: Frisian
144
+ ful: Fula
145
+ fur: Friulian
146
+ gaa: Ga
147
+ gae: Scottish Gaelic
148
+ gag: Galician
149
+ gal: Oromo
150
+ gay: Gayo
151
+ gba: Gbaya
152
+ gem: Germanic (Other)
153
+ geo: Georgian
154
+ ger: German
155
+ gez: Ethiopic
156
+ gil: Gilbertese
157
+ gla: Scottish Gaelic
158
+ gle: Irish
159
+ glg: Galician
160
+ glv: Manx
161
+ gmh: German, Middle High (ca. 1050-1500)
162
+ goh: German, Old High (ca. 750-1050)
163
+ gon: Gondi
164
+ gor: Gorontalo
165
+ got: Gothic
166
+ grb: Grebo
167
+ grc: Greek, Ancient (to 1453)
168
+ gre: Greek, Modern (1453- )
169
+ grn: Guarani
170
+ gua: Guarani
171
+ guj: Gujarati
172
+ gwi: Gwich'in
173
+ hai: Haida
174
+ hat: Haitian French Creole
175
+ hau: Hausa
176
+ haw: Hawaiian
177
+ heb: Hebrew
178
+ her: Herero
179
+ hil: Hiligaynon
180
+ him: Himachali
181
+ hin: Hindi
182
+ hit: Hittite
183
+ hmn: Hmong
184
+ hmo: Hiri Motu
185
+ hun: Hungarian
186
+ hup: Hupa
187
+ iba: Iban
188
+ ibo: Igbo
189
+ ice: Icelandic
190
+ ido: Ido
191
+ iii: Sichuan Yi
192
+ ijo: Ijo
193
+ iku: Inuktitut
194
+ ile: Interlingue
195
+ ilo: Iloko
196
+ ina: Interlingua (International Auxiliary Language Association)
197
+ inc: Indic (Other)
198
+ ind: Indonesian
199
+ ine: Indo-European (Other)
200
+ inh: Ingush
201
+ int: Interlingua (International Auxiliary Language Association)
202
+ ipk: Inupiaq
203
+ ira: Iranian (Other)
204
+ iri: Irish
205
+ iro: Iroquoian (Other)
206
+ ita: Italian
207
+ jav: Javanese
208
+ jpn: Japanese
209
+ jpr: Judeo-Persian
210
+ jrb: Judeo-Arabic
211
+ kaa: Kara-Kalpak
212
+ kab: Kabyle
213
+ kac: Kachin
214
+ kal: Kalatdlisut
215
+ kam: Kamba
216
+ kan: Kannada
217
+ kar: Karen
218
+ kas: Kashmiri
219
+ kau: Kanuri
220
+ kaw: Kawi
221
+ kaz: Kazakh
222
+ kbd: Kabardian
223
+ kha: Khasi
224
+ khi: Khoisan (Other)
225
+ khm: Khmer
226
+ kho: Khotanese
227
+ kik: Kikuyu
228
+ kin: Kinyarwanda
229
+ kir: Kyrgyz
230
+ kmb: Kimbundu
231
+ kok: Konkani
232
+ kom: Komi
233
+ kon: Kongo
234
+ kor: Korean
235
+ kos: Kusaie
236
+ kpe: Kpelle
237
+ kro: Kru
238
+ kru: Kurukh
239
+ kua: Kuanyama
240
+ kum: Kumyk
241
+ kur: Kurdish
242
+ kus: Kusaie
243
+ kut: Kutenai
244
+ lad: Ladino
245
+ lah: Lahnda
246
+ lam: Lamba
247
+ lan: Occitan (post-1500)
248
+ lao: Lao
249
+ lap: Sami
250
+ lat: Latin
251
+ lav: Latvian
252
+ lez: Lezgian
253
+ lim: Limburgish
254
+ lin: Lingala
255
+ lit: Lithuanian
256
+ lol: Mongo-Nkundu
257
+ loz: Lozi
258
+ ltz: Letzeburgesch
259
+ lua: Luba-Lulua
260
+ lub: Luba-Katanga
261
+ lug: Ganda
262
+ lui: Luiseno
263
+ lun: Lunda
264
+ luo: Luo (Kenya and Tanzania)
265
+ lus: Lushai
266
+ mac: Macedonian
267
+ mad: Madurese
268
+ mag: Magahi
269
+ mah: Marshallese
270
+ mai: Maithili
271
+ mak: Makasar
272
+ mal: Malayalam
273
+ man: Mandingo
274
+ mao: Maori
275
+ map: Austronesian (Other)
276
+ mar: Marathi
277
+ mas: Masai
278
+ max: Manx
279
+ may: Malay
280
+ mdr: Mandar
281
+ men: Mende
282
+ mga: Irish, Middle (ca. 1100-1550)
283
+ mic: Micmac
284
+ min: Minangkabau
285
+ mis: Miscellaneous languages
286
+ mkh: Mon-Khmer (Other)
287
+ mla: Malagasy
288
+ mlg: Malagasy
289
+ mlt: Maltese
290
+ mnc: Manchu
291
+ mni: Manipuri
292
+ mno: Manobo languages
293
+ moh: Mohawk
294
+ mol: Moldavian
295
+ mon: Mongolian
296
+ mos: Moore
297
+ mul: Multiple languages
298
+ mun: Munda (Other)
299
+ mus: Creek
300
+ mwr: Marwari
301
+ myn: Mayan languages
302
+ nah: Nahuatl
303
+ nai: North American Indian (Other)
304
+ nap: Neapolitan Italian
305
+ nau: Nauru
306
+ nav: Navajo
307
+ nbl: Ndebele (South Africa)
308
+ nde: Ndebele (Zimbabwe)
309
+ ndo: Ndonga
310
+ nds: Low German
311
+ nep: Nepali
312
+ new: Newari
313
+ nia: Nias
314
+ nic: Niger-Kordofanian (Other)
315
+ niu: Niuean
316
+ nno: Norwegian (Nynorsk)
317
+ nob: Norwegian (Bokmal)
318
+ nog: Nogai
319
+ non: Old Norse
320
+ nor: Norwegian
321
+ nso: Northern Sotho
322
+ nub: Nubian languages
323
+ nya: Nyanja
324
+ nym: Nyamwezi
325
+ nyn: Nyankole
326
+ nyo: Nyoro
327
+ nzi: Nzima
328
+ oci: Occitan (post-1500)
329
+ oji: Ojibwa
330
+ ori: Oriya
331
+ orm: Oromo
332
+ osa: Osage
333
+ oss: Ossetic
334
+ ota: Turkish, Ottoman
335
+ oto: Otomian languages
336
+ paa: Papuan (Other)
337
+ pag: Pangasinan
338
+ pal: Pahlavi
339
+ pam: Pampanga
340
+ pan: Panjabi
341
+ pap: Papiamento
342
+ pau: Palauan
343
+ peo: Old Persian (ca. 600-400 B.C.)
344
+ per: Persian
345
+ phi: Philippine (Other)
346
+ phn: Phoenician
347
+ pli: Pali
348
+ pol: Polish
349
+ pon: Ponape
350
+ por: Portuguese
351
+ pra: Prakrit languages
352
+ pro: Provencal (to 1500)
353
+ pus: Pushto
354
+ que: Quechua
355
+ raj: Rajasthani
356
+ rap: Rapanui
357
+ rar: Rarotongan
358
+ roa: Romance (Other)
359
+ roh: Raeto-Romance
360
+ rom: Romani
361
+ rum: Romanian
362
+ run: Rundi
363
+ rus: Russian
364
+ sad: Sandawe
365
+ sag: Sango (Ubangi Creole)
366
+ sah: Yakut
367
+ sai: South American Indian (Other)
368
+ sal: Salishan languages
369
+ sam: Samaritan Aramaic
370
+ san: Sanskrit
371
+ sao: Samoan
372
+ sas: Sasak
373
+ sat: Santali
374
+ scc: Serbian
375
+ sco: Scots
376
+ scr: Croatian
377
+ sel: Selkup
378
+ sem: Semitic (Other)
379
+ sga: Irish, Old (to 1100)
380
+ sgn: Sign languages
381
+ shn: Shan
382
+ sho: Shona
383
+ sid: Sidamo
384
+ sin: Sinhalese
385
+ sio: Siouan (Other)
386
+ sit: Sino-Tibetan (Other)
387
+ sla: Slavic (Other)
388
+ slo: Slovak
389
+ slv: Slovenian
390
+ sma: Southern Sami
391
+ sme: Northern Sami
392
+ smi: Sami
393
+ smj: Lule Sami
394
+ smn: Inari Sami
395
+ smo: Samoan
396
+ sms: Skolt Sami
397
+ sna: Shona
398
+ snd: Sindhi
399
+ snh: Sinhalese
400
+ snk: Soninke
401
+ sog: Sogdian
402
+ som: Somali
403
+ son: Songhai
404
+ sot: Sotho
405
+ spa: Spanish
406
+ srd: Sardinian
407
+ srr: Serer
408
+ ssa: Nilo-Saharan (Other)
409
+ sso: Sotho
410
+ ssw: Swazi
411
+ suk: Sukuma
412
+ sun: Sundanese
413
+ sus: Susu
414
+ sux: Sumerian
415
+ swa: Swahili
416
+ swe: Swedish
417
+ swz: Swazi
418
+ syr: Syriac
419
+ tag: Tagalog
420
+ tah: Tahitian
421
+ tai: Tai (Other)
422
+ taj: Tajik
423
+ tam: Tamil
424
+ tar: Tatar
425
+ tat: Tatar
426
+ tel: Telugu
427
+ tem: Temne
428
+ ter: Terena
429
+ tet: Tetum
430
+ tgk: Tajik
431
+ tgl: Tagalog
432
+ tha: Thai
433
+ tib: Tibetan
434
+ tig: Tigre
435
+ tir: Tigrinya
436
+ tiv: Tiv
437
+ tkl: Tokelauan
438
+ tli: Tlingit
439
+ tmh: Tamashek
440
+ tog: Tonga (Nyasa)
441
+ ton: Tongan
442
+ tpi: Tok Pisin
443
+ tru: Truk
444
+ tsi: Tsimshian
445
+ tsn: Tswana
446
+ tso: Tsonga
447
+ tsw: Tswana
448
+ tuk: Turkmen
449
+ tum: Tumbuka
450
+ tup: Tupi languages
451
+ tur: Turkish
452
+ tut: Altaic (Other)
453
+ tvl: Tuvaluan
454
+ twi: Twi
455
+ tyv: Tuvinian
456
+ udm: Udmurt
457
+ uga: Ugaritic
458
+ uig: Uighur
459
+ ukr: Ukrainian
460
+ umb: Umbundu
461
+ # und: Undetermined
462
+ urd: Urdu
463
+ uzb: Uzbek
464
+ vai: Vai
465
+ ven: Venda
466
+ vie: Vietnamese
467
+ vol: Volapuk
468
+ vot: Votic
469
+ wak: Wakashan languages
470
+ wal: Walamo
471
+ war: Waray
472
+ was: Washo
473
+ wel: Welsh
474
+ wen: Sorbian languages
475
+ wln: Walloon
476
+ wol: Wolof
477
+ xal: Kalmyk
478
+ xho: Xhosa
479
+ yao: Yao (Africa)
480
+ yap: Yapese
481
+ yid: Yiddish
482
+ yor: Yoruba
483
+ ypk: Yupik languages
484
+ zap: Zapotec
485
+ zen: Zenaga
486
+ zha: Zhuang
487
+ znd: Zande
488
+ zul: Zulu
489
+ zun: Zuni
490
+ # zxx: null
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#each_record" do
4
+ before do
5
+ @indexer = Traject::Indexer.new
6
+ end
7
+
8
+ describe "checks arguments" do
9
+ it "rejects no-arg block" do
10
+ assert_raises(ArgumentError) do
11
+ @indexer.each_record do
12
+ end
13
+ end
14
+ end
15
+ it "rejects three-arg block" do
16
+ assert_raises(ArgumentError) do
17
+ @indexer.each_record do |one, two, three|
18
+ end
19
+ end
20
+ end
21
+ it "accepts one-arg block" do
22
+ @indexer.each_record do |record|
23
+ end
24
+ end
25
+ it "accepts two-arg block" do
26
+ @indexer.each_record do |record, context|
27
+ end
28
+ end
29
+ it "accepts variable arity block" do
30
+ @indexer.each_record do |*variable|
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,206 @@
1
+ require 'test_helper'
2
+
3
+ require 'traject/indexer'
4
+ require 'traject/macros/marc21_semantics'
5
+
6
+ require 'json'
7
+ require 'marc/record'
8
+
9
+ # See also marc_extractor_test.rb for more detailed tests of MARC extraction;
10
+ # these are just basic tests to make sure our macros work when passing through
11
+ # to it, and with their other options.
12
+ describe "Traject::Macros::Marc21Semantics" do
13
+ Marc21Semantics = Traject::Macros::Marc21Semantics # shortcut
14
+
15
+ before do
16
+ @indexer = Traject::Indexer.new
17
+ @indexer.extend Marc21Semantics
18
+
19
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
20
+ end
21
+
22
+ it "oclcnum" do
23
+ @indexer.instance_eval do
24
+ to_field "oclcnum", oclcnum
25
+ end
26
+ output = @indexer.map_record(@record)
27
+
28
+ assert_equal %w{2710183 47971712}, output["oclcnum"]
29
+ end
30
+
31
+ describe "marc_sortable_author" do
32
+ # These probably should be taking only certain subfields, but we're copying
33
+ # from SolrMarc, which didn't do so either and nobody noticed, so not bothering for now.
34
+ before do
35
+ @indexer.instance_eval do
36
+ to_field "author_sort", marc_sortable_author
37
+ end
38
+ end
39
+ it "collates author and title" do
40
+ output = @indexer.map_record(@record)
41
+
42
+ assert_equal ["Herman, Edward S.Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], output["author_sort"]
43
+ end
44
+ it "respects non-filing" do
45
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
46
+
47
+ output = @indexer.map_record(@record)
48
+
49
+ assert_equal ["Business renaissance quarterly [electronic resource]."], output["author_sort"]
50
+ end
51
+ end
52
+
53
+ describe "marc_sortable_title" do
54
+ before do
55
+ @indexer.instance_eval { to_field "title_sort", marc_sortable_title }
56
+ end
57
+ it "works" do
58
+ output = @indexer.map_record(@record)
59
+ assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title_sort"]
60
+ end
61
+ it "respects non-filing" do
62
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
63
+ output = @indexer.map_record(@record)
64
+
65
+ assert_equal ["Business renaissance quarterly"], output["title_sort"]
66
+ end
67
+ it "works with a record with no 245$ab" do
68
+ @record = MARC::Reader.new(support_file_path "245_no_ab.marc").to_a.first
69
+ output = @indexer.map_record(@record)
70
+ assert_equal ["Papers"], output["title_sort"]
71
+ end
72
+ end
73
+
74
+ describe "marc_languages" do
75
+ before do
76
+ @indexer.instance_eval {to_field "languages", marc_languages() }
77
+ end
78
+
79
+ it "unpacks packed 041a and translates" do
80
+ @record = MARC::Reader.new(support_file_path "packed_041a_lang.marc").to_a.first
81
+ output = @indexer.map_record(@record)
82
+
83
+ assert_equal ["English", "French", "German", "Italian", "Spanish", "Russian"], output["languages"]
84
+ end
85
+ end
86
+
87
+ describe "marc_instrumentation_humanized" do
88
+ before do
89
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
90
+ @indexer.instance_eval {to_field "instrumentation", marc_instrumentation_humanized }
91
+ end
92
+
93
+ it "translates, de-duping" do
94
+ output = @indexer.map_record(@record)
95
+
96
+ assert_equal ["Larger ensemble, Unspecified", "Piano", "Soprano voice", "Tenor voice", "Violin", "Larger ensemble, Ethnic", "Guitar", "Voices, Unspecified"], output["instrumentation"]
97
+ end
98
+ end
99
+
100
+ describe "marc_instrument_codes_normalized" do
101
+ before do
102
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
103
+ @indexer.instance_eval {to_field "instrument_codes", marc_instrument_codes_normalized }
104
+ end
105
+ it "normalizes, de-duping" do
106
+ output = @indexer.map_record(@record)
107
+
108
+ assert_equal ["on", "ka01", "ka", "va01", "va", "vd01", "vd", "sa01", "sa", "oy", "tb01", "tb", "vn12", "vn"],
109
+ output["instrument_codes"]
110
+ end
111
+ it "codes soloist 048$b" do
112
+ @record = MARC::Reader.new(support_file_path "louis_armstrong.marc").to_a.first
113
+ output = @indexer.map_record(@record)
114
+
115
+ assert_equal ["bb01", "bb01.s", "bb", "bb.s", "oe"],
116
+ output["instrument_codes"]
117
+ end
118
+ end
119
+
120
+ describe "publication_date" do
121
+ # There are far too many edge cases for us to test them all, but we'll test some of them.
122
+ it "pulls out 008 date_type s" do
123
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
124
+ assert_equal 2002, Marc21Semantics.publication_date(@record)
125
+ end
126
+ it "uses start date for date_type c continuing resource" do
127
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
128
+ assert_equal 2006, Marc21Semantics.publication_date(@record)
129
+ end
130
+ it "returns nil when the records really got nothing" do
131
+ @record = MARC::Reader.new(support_file_path "emptyish_record.marc").to_a.first
132
+ assert_equal nil, Marc21Semantics.publication_date(@record)
133
+ end
134
+ it "estimates with a single 'u'" do
135
+ @record = MARC::Reader.new(support_file_path "date_with_u.marc").to_a.first
136
+ # The record has 184u as date1 on a continuing resource. For continuing resources,
137
+ # we take the first date, and we need to deal with the 'u'.
138
+ assert_equal 1845, Marc21Semantics.publication_date(@record)
139
+ end
140
+ it "resorts to 260c" do
141
+ @record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
142
+ assert_equal 1980, Marc21Semantics.publication_date(@record)
143
+ end
144
+ it "works with date type r missing date2" do
145
+ @record = MARC::Reader.new(support_file_path "date_type_r_missing_date2.marc").to_a.first
146
+ assert_equal 1957, Marc21Semantics.publication_date(@record)
147
+ end
148
+ end
149
+
150
+ describe "marc_lcc_to_broad_category" do
151
+ before do
152
+ @indexer.instance_eval {to_field "discipline_facet", marc_lcc_to_broad_category }
153
+ end
154
+ it "maps a simple example" do
155
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
156
+ output = @indexer.map_record(@record)
157
+
158
+ assert_equal ["Language & Literature"], output["discipline_facet"]
159
+ end
160
+ it "maps to default" do
161
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
162
+ output = @indexer.map_record(@record)
163
+ assert_equal ["Unknown"], output["discipline_facet"]
164
+ end
165
+ it "maps to nothing if none and no default" do
166
+ @indexer.instance_eval {to_field "discipline_no_default", marc_lcc_to_broad_category(:default => nil)}
167
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
168
+ output = @indexer.map_record(@record)
169
+
170
+ assert_nil output["discipline_no_default"]
171
+ end
172
+ end
173
+
174
+ describe "marc_geo_facet" do
175
+ before do
176
+ @indexer.instance_eval {to_field "geo_facet", marc_geo_facet }
177
+ end
178
+ it "maps a complicated record" do
179
+ @record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
180
+ output = @indexer.map_record(@record)
181
+
182
+ assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
183
+ output["geo_facet"]
184
+ end
185
+ it "maps nothing on a record with no geo" do
186
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
187
+ output = @indexer.map_record(@record)
188
+ assert_nil output["geo_facet"]
189
+ end
190
+ end
191
+
192
+ describe "marc_era_facet" do
193
+ before do
194
+ @indexer.instance_eval {to_field "era_facet", marc_era_facet}
195
+ end
196
+ it "maps a complicated record" do
197
+ @record = MARC::Reader.new(support_file_path "multi_era.marc").to_a.first
198
+ output = @indexer.map_record(@record)
199
+
200
+ assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
201
+ output["era_facet"]
202
+ end
203
+
204
+ end
205
+
206
+ end