traject 0.0.2 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/Gemfile +4 -0
  2. data/README.md +85 -61
  3. data/Rakefile +5 -0
  4. data/bin/traject +31 -3
  5. data/doc/settings.md +74 -13
  6. data/lib/tasks/load_maps.rake +48 -0
  7. data/lib/traject/indexer/settings.rb +75 -0
  8. data/lib/traject/indexer.rb +255 -45
  9. data/lib/traject/json_writer.rb +4 -2
  10. data/lib/traject/macros/marc21.rb +18 -6
  11. data/lib/traject/macros/marc21_semantics.rb +405 -0
  12. data/lib/traject/macros/marc_format_classifier.rb +180 -0
  13. data/lib/traject/marc4j_reader.rb +160 -0
  14. data/lib/traject/marc_extractor.rb +33 -17
  15. data/lib/traject/marc_reader.rb +14 -11
  16. data/lib/traject/solrj_writer.rb +247 -9
  17. data/lib/traject/thread_pool.rb +154 -0
  18. data/lib/traject/translation_map.rb +46 -4
  19. data/lib/traject/util.rb +30 -0
  20. data/lib/traject/version.rb +1 -1
  21. data/lib/translation_maps/lcc_top_level.yaml +26 -0
  22. data/lib/translation_maps/marc_genre_007.yaml +9 -0
  23. data/lib/translation_maps/marc_genre_leader.yaml +22 -0
  24. data/lib/translation_maps/marc_geographic.yaml +589 -0
  25. data/lib/translation_maps/marc_instruments.yaml +102 -0
  26. data/lib/translation_maps/marc_languages.yaml +490 -0
  27. data/test/indexer/each_record_test.rb +34 -0
  28. data/test/indexer/macros_marc21_semantics_test.rb +206 -0
  29. data/test/indexer/macros_marc21_test.rb +10 -1
  30. data/test/indexer/map_record_test.rb +78 -8
  31. data/test/indexer/read_write_test.rb +43 -10
  32. data/test/indexer/settings_test.rb +60 -4
  33. data/test/indexer/to_field_test.rb +39 -0
  34. data/test/marc4j_reader_test.rb +75 -0
  35. data/test/marc_extractor_test.rb +62 -0
  36. data/test/marc_format_classifier_test.rb +91 -0
  37. data/test/marc_reader_test.rb +12 -0
  38. data/test/solrj_writer_test.rb +146 -43
  39. data/test/test_helper.rb +50 -0
  40. data/test/test_support/245_no_ab.marc +1 -0
  41. data/test/test_support/880_with_no_6.utf8.marc +1 -0
  42. data/test/test_support/bad_subfield_code.marc +1 -0
  43. data/test/test_support/date_resort_to_260.marc +1 -0
  44. data/test/test_support/date_type_r_missing_date2.marc +1 -0
  45. data/test/test_support/date_with_u.marc +1 -0
  46. data/test/test_support/demo_config.rb +153 -0
  47. data/test/test_support/emptyish_record.marc +1 -0
  48. data/test/test_support/louis_armstrong.marc +1 -0
  49. data/test/test_support/manuscript_online_thesis.marc +1 -0
  50. data/test/test_support/microform_online_conference.marc +1 -0
  51. data/test/test_support/multi_era.marc +1 -0
  52. data/test/test_support/multi_geo.marc +1 -0
  53. data/test/test_support/musical_cage.marc +1 -0
  54. data/test/test_support/one-marc8.mrc +1 -0
  55. data/test/test_support/online_only.marc +1 -0
  56. data/test/test_support/packed_041a_lang.marc +1 -0
  57. data/test/test_support/the_business_ren.marc +1 -0
  58. data/test/translation_map_test.rb +8 -0
  59. data/test/translation_maps/properties_map.properties +5 -0
  60. data/traject.gemspec +1 -1
  61. data/vendor/marc4j/README.md +17 -0
  62. data/vendor/marc4j/lib/marc4j-2.5.1-beta.jar +0 -0
  63. metadata +81 -2
@@ -0,0 +1,490 @@
1
+ # Map Language Codes (in 008[35-37], 041) to User Friendly Term
2
+
3
+ # ???: null
4
+ aar: Afar
5
+ abk: Abkhaz
6
+ ace: Achinese
7
+ ach: Acoli
8
+ ada: Adangme
9
+ ady: Adygei
10
+ afa: Afroasiatic (Other)
11
+ afh: Afrihili (Artificial language)
12
+ afr: Afrikaans
13
+ ajm: Aljamia
14
+ aka: Akan
15
+ akk: Akkadian
16
+ alb: Albanian
17
+ ale: Aleut
18
+ alg: Algonquian (Other)
19
+ amh: Amharic
20
+ ang: English, Old (ca. 450-1100)
21
+ apa: Apache languages
22
+ ara: Arabic
23
+ arc: Aramaic
24
+ arg: Aragonese Spanish
25
+ arm: Armenian
26
+ arn: Mapuche
27
+ arp: Arapaho
28
+ art: Artificial (Other)
29
+ arw: Arawak
30
+ asm: Assamese
31
+ ast: Bable
32
+ ath: Athapascan (Other)
33
+ aus: Australian languages
34
+ ava: Avaric
35
+ ave: Avestan
36
+ awa: Awadhi
37
+ aym: Aymara
38
+ aze: Azerbaijani
39
+ bad: Banda
40
+ bai: Bamileke languages
41
+ bak: Bashkir
42
+ bal: Baluchi
43
+ bam: Bambara
44
+ ban: Balinese
45
+ baq: Basque
46
+ bas: Basa
47
+ bat: Baltic (Other)
48
+ bej: Beja
49
+ bel: Belarusian
50
+ bem: Bemba
51
+ ben: Bengali
52
+ ber: Berber (Other)
53
+ bho: Bhojpuri
54
+ bih: Bihari
55
+ bik: Bikol
56
+ bin: Edo
57
+ bis: Bislama
58
+ bla: Siksika
59
+ bnt: Bantu (Other)
60
+ bos: Bosnian
61
+ bra: Braj
62
+ bre: Breton
63
+ btk: Batak
64
+ bua: Buriat
65
+ bug: Bugis
66
+ bul: Bulgarian
67
+ bur: Burmese
68
+ cad: Caddo
69
+ cai: Central American Indian (Other)
70
+ cam: Khmer
71
+ car: Carib
72
+ cat: Catalan
73
+ cau: Caucasian (Other)
74
+ ceb: Cebuano
75
+ cel: Celtic (Other)
76
+ cha: Chamorro
77
+ chb: Chibcha
78
+ che: Chechen
79
+ chg: Chagatai
80
+ chi: Chinese
81
+ chk: Truk
82
+ chm: Mari
83
+ chn: Chinook jargon
84
+ cho: Choctaw
85
+ chp: Chipewyan
86
+ chr: Cherokee
87
+ chu: Church Slavic
88
+ chv: Chuvash
89
+ chy: Cheyenne
90
+ cmc: Chamic languages
91
+ cop: Coptic
92
+ cor: Cornish
93
+ cos: Corsican
94
+ cpe: Creoles and Pidgins, English-based (Other)
95
+ cpf: Creoles and Pidgins, French-based (Other)
96
+ cpp: Creoles and Pidgins, Portuguese-based (Other)
97
+ cre: Cree
98
+ crh: Crimean Tatar
99
+ crp: Creoles and Pidgins (Other)
100
+ cus: Cushitic (Other)
101
+ cze: Czech
102
+ dak: Dakota
103
+ dan: Danish
104
+ dar: Dargwa
105
+ day: Dayak
106
+ del: Delaware
107
+ den: Slave
108
+ dgr: Dogrib
109
+ din: Dinka
110
+ div: Divehi
111
+ doi: Dogri
112
+ dra: Dravidian (Other)
113
+ dua: Duala
114
+ dum: Dutch, Middle (ca. 1050-1350)
115
+ dut: Dutch
116
+ dyu: Dyula
117
+ dzo: Dzongkha
118
+ efi: Efik
119
+ egy: Egyptian
120
+ eka: Ekajuk
121
+ elx: Elamite
122
+ eng: English
123
+ enm: English, Middle (1100-1500)
124
+ epo: Esperanto
125
+ esk: Eskimo languages
126
+ esp: Esperanto
127
+ est: Estonian
128
+ eth: Ethiopic
129
+ ewe: Ewe
130
+ ewo: Ewondo
131
+ fan: Fang
132
+ fao: Faroese
133
+ far: Faroese
134
+ fat: Fanti
135
+ fij: Fijian
136
+ fin: Finnish
137
+ fiu: Finno-Ugrian (Other)
138
+ fon: Fon
139
+ fre: French
140
+ fri: Frisian
141
+ frm: French, Middle (ca. 1400-1600)
142
+ fro: French, Old (ca. 842-1400)
143
+ fry: Frisian
144
+ ful: Fula
145
+ fur: Friulian
146
+ gaa: Ga
147
+ gae: Scottish Gaelic
148
+ gag: Galician
149
+ gal: Oromo
150
+ gay: Gayo
151
+ gba: Gbaya
152
+ gem: Germanic (Other)
153
+ geo: Georgian
154
+ ger: German
155
+ gez: Ethiopic
156
+ gil: Gilbertese
157
+ gla: Scottish Gaelic
158
+ gle: Irish
159
+ glg: Galician
160
+ glv: Manx
161
+ gmh: German, Middle High (ca. 1050-1500)
162
+ goh: German, Old High (ca. 750-1050)
163
+ gon: Gondi
164
+ gor: Gorontalo
165
+ got: Gothic
166
+ grb: Grebo
167
+ grc: Greek, Ancient (to 1453)
168
+ gre: Greek, Modern (1453- )
169
+ grn: Guarani
170
+ gua: Guarani
171
+ guj: Gujarati
172
+ gwi: Gwich'in
173
+ hai: Haida
174
+ hat: Haitian French Creole
175
+ hau: Hausa
176
+ haw: Hawaiian
177
+ heb: Hebrew
178
+ her: Herero
179
+ hil: Hiligaynon
180
+ him: Himachali
181
+ hin: Hindi
182
+ hit: Hittite
183
+ hmn: Hmong
184
+ hmo: Hiri Motu
185
+ hun: Hungarian
186
+ hup: Hupa
187
+ iba: Iban
188
+ ibo: Igbo
189
+ ice: Icelandic
190
+ ido: Ido
191
+ iii: Sichuan Yi
192
+ ijo: Ijo
193
+ iku: Inuktitut
194
+ ile: Interlingue
195
+ ilo: Iloko
196
+ ina: Interlingua (International Auxiliary Language Association)
197
+ inc: Indic (Other)
198
+ ind: Indonesian
199
+ ine: Indo-European (Other)
200
+ inh: Ingush
201
+ int: Interlingua (International Auxiliary Language Association)
202
+ ipk: Inupiaq
203
+ ira: Iranian (Other)
204
+ iri: Irish
205
+ iro: Iroquoian (Other)
206
+ ita: Italian
207
+ jav: Javanese
208
+ jpn: Japanese
209
+ jpr: Judeo-Persian
210
+ jrb: Judeo-Arabic
211
+ kaa: Kara-Kalpak
212
+ kab: Kabyle
213
+ kac: Kachin
214
+ kal: Kalatdlisut
215
+ kam: Kamba
216
+ kan: Kannada
217
+ kar: Karen
218
+ kas: Kashmiri
219
+ kau: Kanuri
220
+ kaw: Kawi
221
+ kaz: Kazakh
222
+ kbd: Kabardian
223
+ kha: Khasi
224
+ khi: Khoisan (Other)
225
+ khm: Khmer
226
+ kho: Khotanese
227
+ kik: Kikuyu
228
+ kin: Kinyarwanda
229
+ kir: Kyrgyz
230
+ kmb: Kimbundu
231
+ kok: Konkani
232
+ kom: Komi
233
+ kon: Kongo
234
+ kor: Korean
235
+ kos: Kusaie
236
+ kpe: Kpelle
237
+ kro: Kru
238
+ kru: Kurukh
239
+ kua: Kuanyama
240
+ kum: Kumyk
241
+ kur: Kurdish
242
+ kus: Kusaie
243
+ kut: Kutenai
244
+ lad: Ladino
245
+ lah: Lahnda
246
+ lam: Lamba
247
+ lan: Occitan (post-1500)
248
+ lao: Lao
249
+ lap: Sami
250
+ lat: Latin
251
+ lav: Latvian
252
+ lez: Lezgian
253
+ lim: Limburgish
254
+ lin: Lingala
255
+ lit: Lithuanian
256
+ lol: Mongo-Nkundu
257
+ loz: Lozi
258
+ ltz: Letzeburgesch
259
+ lua: Luba-Lulua
260
+ lub: Luba-Katanga
261
+ lug: Ganda
262
+ lui: Luiseno
263
+ lun: Lunda
264
+ luo: Luo (Kenya and Tanzania)
265
+ lus: Lushai
266
+ mac: Macedonian
267
+ mad: Madurese
268
+ mag: Magahi
269
+ mah: Marshallese
270
+ mai: Maithili
271
+ mak: Makasar
272
+ mal: Malayalam
273
+ man: Mandingo
274
+ mao: Maori
275
+ map: Austronesian (Other)
276
+ mar: Marathi
277
+ mas: Masai
278
+ max: Manx
279
+ may: Malay
280
+ mdr: Mandar
281
+ men: Mende
282
+ mga: Irish, Middle (ca. 1100-1550)
283
+ mic: Micmac
284
+ min: Minangkabau
285
+ mis: Miscellaneous languages
286
+ mkh: Mon-Khmer (Other)
287
+ mla: Malagasy
288
+ mlg: Malagasy
289
+ mlt: Maltese
290
+ mnc: Manchu
291
+ mni: Manipuri
292
+ mno: Manobo languages
293
+ moh: Mohawk
294
+ mol: Moldavian
295
+ mon: Mongolian
296
+ mos: Moore
297
+ mul: Multiple languages
298
+ mun: Munda (Other)
299
+ mus: Creek
300
+ mwr: Marwari
301
+ myn: Mayan languages
302
+ nah: Nahuatl
303
+ nai: North American Indian (Other)
304
+ nap: Neapolitan Italian
305
+ nau: Nauru
306
+ nav: Navajo
307
+ nbl: Ndebele (South Africa)
308
+ nde: Ndebele (Zimbabwe)
309
+ ndo: Ndonga
310
+ nds: Low German
311
+ nep: Nepali
312
+ new: Newari
313
+ nia: Nias
314
+ nic: Niger-Kordofanian (Other)
315
+ niu: Niuean
316
+ nno: Norwegian (Nynorsk)
317
+ nob: Norwegian (Bokmal)
318
+ nog: Nogai
319
+ non: Old Norse
320
+ nor: Norwegian
321
+ nso: Northern Sotho
322
+ nub: Nubian languages
323
+ nya: Nyanja
324
+ nym: Nyamwezi
325
+ nyn: Nyankole
326
+ nyo: Nyoro
327
+ nzi: Nzima
328
+ oci: Occitan (post-1500)
329
+ oji: Ojibwa
330
+ ori: Oriya
331
+ orm: Oromo
332
+ osa: Osage
333
+ oss: Ossetic
334
+ ota: Turkish, Ottoman
335
+ oto: Otomian languages
336
+ paa: Papuan (Other)
337
+ pag: Pangasinan
338
+ pal: Pahlavi
339
+ pam: Pampanga
340
+ pan: Panjabi
341
+ pap: Papiamento
342
+ pau: Palauan
343
+ peo: Old Persian (ca. 600-400 B.C.)
344
+ per: Persian
345
+ phi: Philippine (Other)
346
+ phn: Phoenician
347
+ pli: Pali
348
+ pol: Polish
349
+ pon: Ponape
350
+ por: Portuguese
351
+ pra: Prakrit languages
352
+ pro: Provencal (to 1500)
353
+ pus: Pushto
354
+ que: Quechua
355
+ raj: Rajasthani
356
+ rap: Rapanui
357
+ rar: Rarotongan
358
+ roa: Romance (Other)
359
+ roh: Raeto-Romance
360
+ rom: Romani
361
+ rum: Romanian
362
+ run: Rundi
363
+ rus: Russian
364
+ sad: Sandawe
365
+ sag: Sango (Ubangi Creole)
366
+ sah: Yakut
367
+ sai: South American Indian (Other)
368
+ sal: Salishan languages
369
+ sam: Samaritan Aramaic
370
+ san: Sanskrit
371
+ sao: Samoan
372
+ sas: Sasak
373
+ sat: Santali
374
+ scc: Serbian
375
+ sco: Scots
376
+ scr: Croatian
377
+ sel: Selkup
378
+ sem: Semitic (Other)
379
+ sga: Irish, Old (to 1100)
380
+ sgn: Sign languages
381
+ shn: Shan
382
+ sho: Shona
383
+ sid: Sidamo
384
+ sin: Sinhalese
385
+ sio: Siouan (Other)
386
+ sit: Sino-Tibetan (Other)
387
+ sla: Slavic (Other)
388
+ slo: Slovak
389
+ slv: Slovenian
390
+ sma: Southern Sami
391
+ sme: Northern Sami
392
+ smi: Sami
393
+ smj: Lule Sami
394
+ smn: Inari Sami
395
+ smo: Samoan
396
+ sms: Skolt Sami
397
+ sna: Shona
398
+ snd: Sindhi
399
+ snh: Sinhalese
400
+ snk: Soninke
401
+ sog: Sogdian
402
+ som: Somali
403
+ son: Songhai
404
+ sot: Sotho
405
+ spa: Spanish
406
+ srd: Sardinian
407
+ srr: Serer
408
+ ssa: Nilo-Saharan (Other)
409
+ sso: Sotho
410
+ ssw: Swazi
411
+ suk: Sukuma
412
+ sun: Sundanese
413
+ sus: Susu
414
+ sux: Sumerian
415
+ swa: Swahili
416
+ swe: Swedish
417
+ swz: Swazi
418
+ syr: Syriac
419
+ tag: Tagalog
420
+ tah: Tahitian
421
+ tai: Tai (Other)
422
+ taj: Tajik
423
+ tam: Tamil
424
+ tar: Tatar
425
+ tat: Tatar
426
+ tel: Telugu
427
+ tem: Temne
428
+ ter: Terena
429
+ tet: Tetum
430
+ tgk: Tajik
431
+ tgl: Tagalog
432
+ tha: Thai
433
+ tib: Tibetan
434
+ tig: Tigre
435
+ tir: Tigrinya
436
+ tiv: Tiv
437
+ tkl: Tokelauan
438
+ tli: Tlingit
439
+ tmh: Tamashek
440
+ tog: Tonga (Nyasa)
441
+ ton: Tongan
442
+ tpi: Tok Pisin
443
+ tru: Truk
444
+ tsi: Tsimshian
445
+ tsn: Tswana
446
+ tso: Tsonga
447
+ tsw: Tswana
448
+ tuk: Turkmen
449
+ tum: Tumbuka
450
+ tup: Tupi languages
451
+ tur: Turkish
452
+ tut: Altaic (Other)
453
+ tvl: Tuvaluan
454
+ twi: Twi
455
+ tyv: Tuvinian
456
+ udm: Udmurt
457
+ uga: Ugaritic
458
+ uig: Uighur
459
+ ukr: Ukrainian
460
+ umb: Umbundu
461
+ # und: Undetermined
462
+ urd: Urdu
463
+ uzb: Uzbek
464
+ vai: Vai
465
+ ven: Venda
466
+ vie: Vietnamese
467
+ vol: Volapuk
468
+ vot: Votic
469
+ wak: Wakashan languages
470
+ wal: Walamo
471
+ war: Waray
472
+ was: Washo
473
+ wel: Welsh
474
+ wen: Sorbian languages
475
+ wln: Walloon
476
+ wol: Wolof
477
+ xal: Kalmyk
478
+ xho: Xhosa
479
+ yao: Yao (Africa)
480
+ yap: Yapese
481
+ yid: Yiddish
482
+ yor: Yoruba
483
+ ypk: Yupik languages
484
+ zap: Zapotec
485
+ zen: Zenaga
486
+ zha: Zhuang
487
+ znd: Zande
488
+ zul: Zulu
489
+ zun: Zuni
490
+ # zxx: null
@@ -0,0 +1,34 @@
1
+ require 'test_helper'
2
+
3
+ describe "Traject::Indexer#each_record" do
4
+ before do
5
+ @indexer = Traject::Indexer.new
6
+ end
7
+
8
+ describe "checks arguments" do
9
+ it "rejects no-arg block" do
10
+ assert_raises(ArgumentError) do
11
+ @indexer.each_record do
12
+ end
13
+ end
14
+ end
15
+ it "rejects three-arg block" do
16
+ assert_raises(ArgumentError) do
17
+ @indexer.each_record do |one, two, three|
18
+ end
19
+ end
20
+ end
21
+ it "accepts one-arg block" do
22
+ @indexer.each_record do |record|
23
+ end
24
+ end
25
+ it "accepts two-arg block" do
26
+ @indexer.each_record do |record, context|
27
+ end
28
+ end
29
+ it "accepts variable arity block" do
30
+ @indexer.each_record do |*variable|
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,206 @@
1
+ require 'test_helper'
2
+
3
+ require 'traject/indexer'
4
+ require 'traject/macros/marc21_semantics'
5
+
6
+ require 'json'
7
+ require 'marc/record'
8
+
9
+ # See also marc_extractor_test.rb for more detailed tests of MARC extraction;
10
+ # these are just basic tests to make sure our macros work when passing through to it,
11
+ # along with their other options.
12
+ describe "Traject::Macros::Marc21Semantics" do
13
+ Marc21Semantics = Traject::Macros::Marc21Semantics # shortcut
14
+
15
+ before do
16
+ @indexer = Traject::Indexer.new
17
+ @indexer.extend Marc21Semantics
18
+
19
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
20
+ end
21
+
22
+ it "oclcnum" do
23
+ @indexer.instance_eval do
24
+ to_field "oclcnum", oclcnum
25
+ end
26
+ output = @indexer.map_record(@record)
27
+
28
+ assert_equal %w{2710183 47971712}, output["oclcnum"]
29
+ end
30
+
31
+ describe "marc_sortable_author" do
32
+ # These should probably be taking only certain subfields, but we are copying
33
+ # SolrMarc, which didn't restrict them either and nobody noticed, so we're not bothering for now.
34
+ before do
35
+ @indexer.instance_eval do
36
+ to_field "author_sort", marc_sortable_author
37
+ end
38
+ end
39
+ it "collates author and title" do
40
+ output = @indexer.map_record(@record)
41
+
42
+ assert_equal ["Herman, Edward S.Manufacturing consent : the political economy of the mass media / Edward S. Herman and Noam Chomsky ; with a new introduction by the authors."], output["author_sort"]
43
+ end
44
+ it "respects non-filing" do
45
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
46
+
47
+ output = @indexer.map_record(@record)
48
+
49
+ assert_equal ["Business renaissance quarterly [electronic resource]."], output["author_sort"]
50
+ end
51
+ end
52
+
53
+ describe "marc_sortable_title" do
54
+ before do
55
+ @indexer.instance_eval { to_field "title_sort", marc_sortable_title }
56
+ end
57
+ it "works" do
58
+ output = @indexer.map_record(@record)
59
+ assert_equal ["Manufacturing consent : the political economy of the mass media"], output["title_sort"]
60
+ end
61
+ it "respects non-filing" do
62
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
63
+ output = @indexer.map_record(@record)
64
+
65
+ assert_equal ["Business renaissance quarterly"], output["title_sort"]
66
+ end
67
+ it "works with a record with no 245$ab" do
68
+ @record = MARC::Reader.new(support_file_path "245_no_ab.marc").to_a.first
69
+ output = @indexer.map_record(@record)
70
+ assert_equal ["Papers"], output["title_sort"]
71
+ end
72
+ end
73
+
74
+ describe "marc_languages" do
75
+ before do
76
+ @indexer.instance_eval {to_field "languages", marc_languages() }
77
+ end
78
+
79
+ it "unpacks packed 041a and translates" do
80
+ @record = MARC::Reader.new(support_file_path "packed_041a_lang.marc").to_a.first
81
+ output = @indexer.map_record(@record)
82
+
83
+ assert_equal ["English", "French", "German", "Italian", "Spanish", "Russian"], output["languages"]
84
+ end
85
+ end
86
+
87
+ describe "marc_instrumentation_humanized" do
88
+ before do
89
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
90
+ @indexer.instance_eval {to_field "instrumentation", marc_instrumentation_humanized }
91
+ end
92
+
93
+ it "translates, de-duping" do
94
+ output = @indexer.map_record(@record)
95
+
96
+ assert_equal ["Larger ensemble, Unspecified", "Piano", "Soprano voice", "Tenor voice", "Violin", "Larger ensemble, Ethnic", "Guitar", "Voices, Unspecified"], output["instrumentation"]
97
+ end
98
+ end
99
+
100
+ describe "marc_instrument_codes_normalized" do
101
+ before do
102
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
103
+ @indexer.instance_eval {to_field "instrument_codes", marc_instrument_codes_normalized }
104
+ end
105
+ it "normalizes, de-duping" do
106
+ output = @indexer.map_record(@record)
107
+
108
+ assert_equal ["on", "ka01", "ka", "va01", "va", "vd01", "vd", "sa01", "sa", "oy", "tb01", "tb", "vn12", "vn"],
109
+ output["instrument_codes"]
110
+ end
111
+ it "codes soloist 048$b" do
112
+ @record = MARC::Reader.new(support_file_path "louis_armstrong.marc").to_a.first
113
+ output = @indexer.map_record(@record)
114
+
115
+ assert_equal ["bb01", "bb01.s", "bb", "bb.s", "oe"],
116
+ output["instrument_codes"]
117
+ end
118
+ end
119
+
120
+ describe "publication_date" do
121
+ # there are way too many edge cases for us to test em all, but we'll test some of em.
122
+ it "pulls out 008 date_type s" do
123
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
124
+ assert_equal 2002, Marc21Semantics.publication_date(@record)
125
+ end
126
+ it "uses start date for date_type c continuing resource" do
127
+ @record = MARC::Reader.new(support_file_path "the_business_ren.marc").to_a.first
128
+ assert_equal 2006, Marc21Semantics.publication_date(@record)
129
+ end
130
+ it "returns nil when the records really got nothing" do
131
+ @record = MARC::Reader.new(support_file_path "emptyish_record.marc").to_a.first
132
+ assert_equal nil, Marc21Semantics.publication_date(@record)
133
+ end
134
+ it "estimates with a single 'u'" do
135
+ @record = MARC::Reader.new(support_file_path "date_with_u.marc").to_a.first
136
+ # was 184u as date1 on a continuing resource. For continuing resources,
137
+ # we take the first date. And need to deal with the u.
138
+ assert_equal 1845, Marc21Semantics.publication_date(@record)
139
+ end
140
+ it "resorts to 260c" do
141
+ @record = MARC::Reader.new(support_file_path "date_resort_to_260.marc").to_a.first
142
+ assert_equal 1980, Marc21Semantics.publication_date(@record)
143
+ end
144
+ it "works with date type r missing date2" do
145
+ @record = MARC::Reader.new(support_file_path "date_type_r_missing_date2.marc").to_a.first
146
+ assert_equal 1957, Marc21Semantics.publication_date(@record)
147
+ end
148
+ end
149
+
150
+ describe "marc_lcc_to_broad_category" do
151
+ before do
152
+ @indexer.instance_eval {to_field "discipline_facet", marc_lcc_to_broad_category }
153
+ end
154
+ it "maps a simple example" do
155
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
156
+ output = @indexer.map_record(@record)
157
+
158
+ assert_equal ["Language & Literature"], output["discipline_facet"]
159
+ end
160
+ it "maps to default" do
161
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
162
+ output = @indexer.map_record(@record)
163
+ assert_equal ["Unknown"], output["discipline_facet"]
164
+ end
165
+ it "maps to nothing if none and no default" do
166
+ @indexer.instance_eval {to_field "discipline_no_default", marc_lcc_to_broad_category(:default => nil)}
167
+ @record = MARC::Reader.new(support_file_path "musical_cage.marc").to_a.first
168
+ output = @indexer.map_record(@record)
169
+
170
+ assert_nil output["discipline_no_default"]
171
+ end
172
+ end
173
+
174
+ describe "marc_geo_facet" do
175
+ before do
176
+ @indexer.instance_eval {to_field "geo_facet", marc_geo_facet }
177
+ end
178
+ it "maps a complicated record" do
179
+ @record = MARC::Reader.new(support_file_path "multi_geo.marc").to_a.first
180
+ output = @indexer.map_record(@record)
181
+
182
+ assert_equal ["Europe", "Middle East", "Africa, North", "Agora (Athens, Greece)", "Rome (Italy)", "Italy"],
183
+ output["geo_facet"]
184
+ end
185
+ it "maps nothing on a record with no geo" do
186
+ @record = MARC::Reader.new(support_file_path "manufacturing_consent.marc").to_a.first
187
+ output = @indexer.map_record(@record)
188
+ assert_nil output["geo_facet"]
189
+ end
190
+ end
191
+
192
+ describe "marc_era_facet" do
193
+ before do
194
+ @indexer.instance_eval {to_field "era_facet", marc_era_facet}
195
+ end
196
+ it "maps a complicated record" do
197
+ @record = MARC::Reader.new(support_file_path "multi_era.marc").to_a.first
198
+ output = @indexer.map_record(@record)
199
+
200
+ assert_equal ["Early modern, 1500-1700", "17th century", "Great Britain: Puritan Revolution, 1642-1660", "Great Britain: Civil War, 1642-1649", "1642-1660"],
201
+ output["era_facet"]
202
+ end
203
+
204
+ end
205
+
206
+ end