traject 3.3.0 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1700077d5c2d3c667fc9520b659c3ca986b8ab34aee233f62bd7f73fdef91977
4
- data.tar.gz: 736b217f209ed08faba9c1d20c006b29586aa3ebdf088a89e37f5f3b7400de06
3
+ metadata.gz: c30572335810dc620f9a169df6f8f374512d3c472ea34bc03068106959fd1463
4
+ data.tar.gz: 3181c37e41e80416487d730e1983bc647daf480a3a308db30e294c7587adc644
5
5
  SHA512:
6
- metadata.gz: 21877d6cd5b03f7ffbbac316a6d58a3bc65b534cb7457e57d39ba470ad49d99c8677e5e6ede25c650bba5ac3f0b22f9b348ebabb36ac4047433eb8a76379ef1d
7
- data.tar.gz: 4ec1938d2d7b60a61ebde4e9c4e763e511c2896788b56b38ad6f22615dffb57449e29c9ef40e261952e125b90f5fe491fa447b077c7dcb1c55f57d6ef603fd5b
6
+ metadata.gz: 83b73a10113e75106a0fb7af9bec79802d2e3f5c8f3e07742f33a52642a9441c20769072f8ea5bd532011b7d172db6ca007121d6874f705008e1a5a511ca1ff8
7
+ data.tar.gz: d9c53588e8adbd76764c20012baf702591276d84c2cd64ed0bb0d5b742699607a2287d30a2fb4f70c1bf6f8a7338d716d989b208c8983c2a87eeddbd6d96dd3d
@@ -7,6 +7,7 @@ rvm:
7
7
  - 2.4.4
8
8
  - 2.5.1
9
9
  - 2.6.1
10
+ - 2.7.0
10
11
  # avoid having travis install jdk on MRI builds where we don't need it.
11
12
  matrix:
12
13
  include:
data/CHANGES.md CHANGED
@@ -6,7 +6,9 @@
6
6
 
7
7
  *
8
8
 
9
- *
9
+ ## 3.4.0
10
+
11
+ * XML-mode `extract_xpath` now supports extracting attribute values with xpath @attr syntax.
10
12
 
11
13
  ## 3.3.0
12
14
 
data/doc/xml.md CHANGED
@@ -72,6 +72,16 @@ You can use all the standard transforation macros in Traject::Macros::Transforma
72
72
  to_field "something", extract_xpath("//value"), first_only, translation_map("some_map"), default("no value")
73
73
  ```
74
74
 
75
+ ### selecting attribute values
76
+
77
+ Just works, using xpath syntax for selecting an attribute:
78
+
79
+
80
+ ```ruby
81
+ # gets status value in: <oai:header status="something">
82
+ to_field "status", extract_xpath("//oai:record/oai:header/@status")
83
+ ```
84
+
75
85
 
76
86
  ### selecting non-text nodes
77
87
 
@@ -42,11 +42,11 @@ module Traject::Macros
42
42
  #
43
43
  # * :translation_map => String: translate with named translation map looked up in load
44
44
  # path, uses Traject::TranslationMap.new(translation_map_arg).
45
- # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)
45
+ # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
46
46
  #
47
47
  # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
48
48
  # have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
49
- # `extract_marc(whatever), trim_punctuation
49
+ # `extract_marc(whatever), trim_punctuation`
50
50
  #
51
51
  # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
52
52
  #
@@ -26,9 +26,15 @@ module Traject
26
26
  # Make sure to avoid text content that was all blank, which is "between the children"
27
27
  # whitespace.
28
28
  result = result.collect do |n|
29
- n.xpath('.//text()').collect(&:text).tap do |arr|
30
- arr.reject! { |s| s =~ (/\A\s+\z/) }
31
- end.join(" ")
29
+ if n.kind_of?(Nokogiri::XML::Attr)
30
+ # attribute value
31
+ n.value
32
+ else
33
+ # text from node
34
+ n.xpath('.//text()').collect(&:text).tap do |arr|
35
+ arr.reject! { |s| s =~ (/\A\s+\z/) }
36
+ end.join(" ")
37
+ end
32
38
  end
33
39
  else
34
40
  # just put all matches in accumulator as Nokogiri::XML::Node's
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.3.0"
2
+ VERSION = "3.4.0"
3
3
  end
@@ -10,18 +10,21 @@ ady: Adygei
10
10
  afa: Afroasiatic (Other)
11
11
  afh: Afrihili (Artificial language)
12
12
  afr: Afrikaans
13
- ajm: Aljamia
13
+ ain: Ainu
14
+ ajm: Aljamía
14
15
  aka: Akan
15
16
  akk: Akkadian
16
17
  alb: Albanian
17
18
  ale: Aleut
18
19
  alg: Algonquian (Other)
20
+ alt: Altai
19
21
  amh: Amharic
20
- ang: English, Old (ca. 450-1100)
22
+ ang: "English, Old (ca. 450-1100)"
23
+ anp: Angika
21
24
  apa: Apache languages
22
25
  ara: Arabic
23
26
  arc: Aramaic
24
- arg: Aragonese Spanish
27
+ arg: Aragonese
25
28
  arm: Armenian
26
29
  arn: Mapuche
27
30
  arp: Arapaho
@@ -36,7 +39,7 @@ ave: Avestan
36
39
  awa: Awadhi
37
40
  aym: Aymara
38
41
  aze: Azerbaijani
39
- bad: Banda
42
+ bad: Banda languages
40
43
  bai: Bamileke languages
41
44
  bak: Bashkir
42
45
  bal: Baluchi
@@ -51,7 +54,7 @@ bem: Bemba
51
54
  ben: Bengali
52
55
  ber: Berber (Other)
53
56
  bho: Bhojpuri
54
- bih: Bihari
57
+ bih: Bihari (Other)
55
58
  bik: Bikol
56
59
  bin: Edo
57
60
  bis: Bislama
@@ -65,6 +68,7 @@ bua: Buriat
65
68
  bug: Bugis
66
69
  bul: Bulgarian
67
70
  bur: Burmese
71
+ byn: Bilin
68
72
  cad: Caddo
69
73
  cai: Central American Indian (Other)
70
74
  cam: Khmer
@@ -78,7 +82,7 @@ chb: Chibcha
78
82
  che: Chechen
79
83
  chg: Chagatai
80
84
  chi: Chinese
81
- chk: Truk
85
+ chk: Chuukese
82
86
  chm: Mari
83
87
  chn: Chinook jargon
84
88
  cho: Choctaw
@@ -88,15 +92,17 @@ chu: Church Slavic
88
92
  chv: Chuvash
89
93
  chy: Cheyenne
90
94
  cmc: Chamic languages
95
+ cnr: Montenegrin
91
96
  cop: Coptic
92
97
  cor: Cornish
93
98
  cos: Corsican
94
- cpe: Creoles and Pidgins, English-based (Other)
95
- cpf: Creoles and Pidgins, French-based (Other)
96
- cpp: Creoles and Pidgins, Portuguese-based (Other)
99
+ cpe: "Creoles and Pidgins, English-based (Other)"
100
+ cpf: "Creoles and Pidgins, French-based (Other)"
101
+ cpp: "Creoles and Pidgins, Portuguese-based (Other)"
97
102
  cre: Cree
98
103
  crh: Crimean Tatar
99
104
  crp: Creoles and Pidgins (Other)
105
+ csb: Kashubian
100
106
  cus: Cushitic (Other)
101
107
  cze: Czech
102
108
  dak: Dakota
@@ -104,14 +110,15 @@ dan: Danish
104
110
  dar: Dargwa
105
111
  day: Dayak
106
112
  del: Delaware
107
- den: Slave
113
+ den: Slavey
108
114
  dgr: Dogrib
109
115
  din: Dinka
110
116
  div: Divehi
111
117
  doi: Dogri
112
118
  dra: Dravidian (Other)
119
+ dsb: Lower Sorbian
113
120
  dua: Duala
114
- dum: Dutch, Middle (ca. 1050-1350)
121
+ dum: "Dutch, Middle (ca. 1050-1350)"
115
122
  dut: Dutch
116
123
  dyu: Dyula
117
124
  dzo: Dzongkha
@@ -120,7 +127,7 @@ egy: Egyptian
120
127
  eka: Ekajuk
121
128
  elx: Elamite
122
129
  eng: English
123
- enm: English, Middle (1100-1500)
130
+ enm: "English, Middle (1100-1500)"
124
131
  epo: Esperanto
125
132
  esk: Eskimo languages
126
133
  esp: Esperanto
@@ -133,18 +140,21 @@ fao: Faroese
133
140
  far: Faroese
134
141
  fat: Fanti
135
142
  fij: Fijian
143
+ fil: Filipino
136
144
  fin: Finnish
137
145
  fiu: Finno-Ugrian (Other)
138
146
  fon: Fon
139
147
  fre: French
140
148
  fri: Frisian
141
- frm: French, Middle (ca. 1400-1600)
142
- fro: French, Old (ca. 842-1400)
149
+ frm: "French, Middle (ca. 1300-1600)"
150
+ fro: "French, Old (ca. 842-1300)"
151
+ frr: North Frisian
152
+ frs: East Frisian
143
153
  fry: Frisian
144
154
  ful: Fula
145
155
  fur: Friulian
146
- gaa: Ga
147
- gae: Scottish Gaelic
156
+ gaa: Gã
157
+ gae: Scottish Gaelic
148
158
  gag: Galician
149
159
  gal: Oromo
150
160
  gay: Gayo
@@ -158,15 +168,16 @@ gla: Scottish Gaelic
158
168
  gle: Irish
159
169
  glg: Galician
160
170
  glv: Manx
161
- gmh: German, Middle High (ca. 1050-1500)
162
- goh: German, Old High (ca. 750-1050)
171
+ gmh: "German, Middle High (ca. 1050-1500)"
172
+ goh: "German, Old High (ca. 750-1050)"
163
173
  gon: Gondi
164
174
  gor: Gorontalo
165
175
  got: Gothic
166
176
  grb: Grebo
167
- grc: Greek, Ancient (to 1453)
168
- gre: Greek, Modern (1453- )
177
+ grc: "Greek, Ancient (to 1453)"
178
+ gre: "Greek, Modern (1453-)"
169
179
  grn: Guarani
180
+ gsw: Swiss German
170
181
  gua: Guarani
171
182
  guj: Gujarati
172
183
  gwi: Gwich'in
@@ -177,11 +188,13 @@ haw: Hawaiian
177
188
  heb: Hebrew
178
189
  her: Herero
179
190
  hil: Hiligaynon
180
- him: Himachali
191
+ him: Western Pahari languages
181
192
  hin: Hindi
182
193
  hit: Hittite
183
194
  hmn: Hmong
184
195
  hmo: Hiri Motu
196
+ hrv: Croatian
197
+ hsb: Upper Sorbian
185
198
  hun: Hungarian
186
199
  hup: Hupa
187
200
  iba: Iban
@@ -205,16 +218,17 @@ iri: Irish
205
218
  iro: Iroquoian (Other)
206
219
  ita: Italian
207
220
  jav: Javanese
221
+ jbo: Lojban (Artificial language)
208
222
  jpn: Japanese
209
223
  jpr: Judeo-Persian
210
224
  jrb: Judeo-Arabic
211
225
  kaa: Kara-Kalpak
212
226
  kab: Kabyle
213
227
  kac: Kachin
214
- kal: Kalatdlisut
228
+ kal: Kalâtdlisut
215
229
  kam: Kamba
216
230
  kan: Kannada
217
- kar: Karen
231
+ kar: Karen languages
218
232
  kas: Kashmiri
219
233
  kau: Kanuri
220
234
  kaw: Kawi
@@ -232,19 +246,21 @@ kok: Konkani
232
246
  kom: Komi
233
247
  kon: Kongo
234
248
  kor: Korean
235
- kos: Kusaie
249
+ kos: Kosraean
236
250
  kpe: Kpelle
237
- kro: Kru
251
+ krc: Karachay-Balkar
252
+ krl: Karelian
253
+ kro: Kru (Other)
238
254
  kru: Kurukh
239
255
  kua: Kuanyama
240
256
  kum: Kumyk
241
257
  kur: Kurdish
242
258
  kus: Kusaie
243
- kut: Kutenai
259
+ kut: Kootenai
244
260
  lad: Ladino
245
- lah: Lahnda
246
- lam: Lamba
247
- lan: Occitan (post-1500)
261
+ lah: Lahndā
262
+ lam: Lamba (Zambia and Congo)
263
+ lan: Occitan (post 1500)
248
264
  lao: Lao
249
265
  lap: Sami
250
266
  lat: Latin
@@ -255,11 +271,11 @@ lin: Lingala
255
271
  lit: Lithuanian
256
272
  lol: Mongo-Nkundu
257
273
  loz: Lozi
258
- ltz: Letzeburgesch
274
+ ltz: Luxembourgish
259
275
  lua: Luba-Lulua
260
276
  lub: Luba-Katanga
261
277
  lug: Ganda
262
- lui: Luiseno
278
+ lui: Luiseño
263
279
  lun: Lunda
264
280
  luo: Luo (Kenya and Tanzania)
265
281
  lus: Lushai
@@ -274,12 +290,13 @@ man: Mandingo
274
290
  mao: Maori
275
291
  map: Austronesian (Other)
276
292
  mar: Marathi
277
- mas: Masai
293
+ mas: Maasai
278
294
  max: Manx
279
295
  may: Malay
296
+ mdf: Moksha
280
297
  mdr: Mandar
281
298
  men: Mende
282
- mga: Irish, Middle (ca. 1100-1550)
299
+ mga: "Irish, Middle (ca. 1100-1550)"
283
300
  mic: Micmac
284
301
  min: Minangkabau
285
302
  mis: Miscellaneous languages
@@ -293,12 +310,14 @@ mno: Manobo languages
293
310
  moh: Mohawk
294
311
  mol: Moldavian
295
312
  mon: Mongolian
296
- mos: Moore
313
+ mos: Mooré
297
314
  mul: Multiple languages
298
315
  mun: Munda (Other)
299
316
  mus: Creek
317
+ mwl: Mirandese
300
318
  mwr: Marwari
301
319
  myn: Mayan languages
320
+ myv: Erzya
302
321
  nah: Nahuatl
303
322
  nai: North American Indian (Other)
304
323
  nap: Neapolitan Italian
@@ -314,12 +333,14 @@ nia: Nias
314
333
  nic: Niger-Kordofanian (Other)
315
334
  niu: Niuean
316
335
  nno: Norwegian (Nynorsk)
317
- nob: Norwegian (Bokmal)
336
+ nob: Norwegian (Bokmål)
318
337
  nog: Nogai
319
338
  non: Old Norse
320
339
  nor: Norwegian
340
+ nqo: N'Ko
321
341
  nso: Northern Sotho
322
342
  nub: Nubian languages
343
+ nwc: "Newari, Old"
323
344
  nya: Nyanja
324
345
  nym: Nyamwezi
325
346
  nyn: Nyankole
@@ -331,7 +352,7 @@ ori: Oriya
331
352
  orm: Oromo
332
353
  osa: Osage
333
354
  oss: Ossetic
334
- ota: Turkish, Ottoman
355
+ ota: "Turkish, Ottoman"
335
356
  oto: Otomian languages
336
357
  paa: Papuan (Other)
337
358
  pag: Pangasinan
@@ -346,10 +367,10 @@ phi: Philippine (Other)
346
367
  phn: Phoenician
347
368
  pli: Pali
348
369
  pol: Polish
349
- pon: Ponape
370
+ pon: Pohnpeian
350
371
  por: Portuguese
351
372
  pra: Prakrit languages
352
- pro: Provencal (to 1500)
373
+ pro: Provençal (to 1500)
353
374
  pus: Pushto
354
375
  que: Quechua
355
376
  raj: Rajasthani
@@ -360,6 +381,7 @@ roh: Raeto-Romance
360
381
  rom: Romani
361
382
  rum: Romanian
362
383
  run: Rundi
384
+ rup: Aromanian
363
385
  rus: Russian
364
386
  sad: Sandawe
365
387
  sag: Sango (Ubangi Creole)
@@ -372,11 +394,12 @@ sao: Samoan
372
394
  sas: Sasak
373
395
  sat: Santali
374
396
  scc: Serbian
397
+ scn: Sicilian Italian
375
398
  sco: Scots
376
399
  scr: Croatian
377
400
  sel: Selkup
378
401
  sem: Semitic (Other)
379
- sga: Irish, Old (to 1100)
402
+ sga: "Irish, Old (to 1100)"
380
403
  sgn: Sign languages
381
404
  shn: Shan
382
405
  sho: Shona
@@ -404,6 +427,8 @@ son: Songhai
404
427
  sot: Sotho
405
428
  spa: Spanish
406
429
  srd: Sardinian
430
+ srn: Sranan
431
+ srp: Serbian
407
432
  srr: Serer
408
433
  ssa: Nilo-Saharan (Other)
409
434
  sso: Sotho
@@ -415,7 +440,8 @@ sux: Sumerian
415
440
  swa: Swahili
416
441
  swe: Swedish
417
442
  swz: Swazi
418
- syr: Syriac
443
+ syc: Syriac
444
+ syr: "Syriac, Modern"
419
445
  tag: Tagalog
420
446
  tah: Tahitian
421
447
  tai: Tai (Other)
@@ -431,10 +457,11 @@ tgk: Tajik
431
457
  tgl: Tagalog
432
458
  tha: Thai
433
459
  tib: Tibetan
434
- tig: Tigre
460
+ tig: Tigré
435
461
  tir: Tigrinya
436
462
  tiv: Tiv
437
463
  tkl: Tokelauan
464
+ tlh: Klingon (Artificial language)
438
465
  tli: Tlingit
439
466
  tmh: Tamashek
440
467
  tog: Tonga (Nyasa)
@@ -464,17 +491,17 @@ uzb: Uzbek
464
491
  vai: Vai
465
492
  ven: Venda
466
493
  vie: Vietnamese
467
- vol: Volapuk
494
+ vol: Volapük
468
495
  vot: Votic
469
496
  wak: Wakashan languages
470
- wal: Walamo
497
+ wal: Wolayta
471
498
  war: Waray
472
- was: Washo
499
+ was: Washoe
473
500
  wel: Welsh
474
- wen: Sorbian languages
501
+ wen: Sorbian (Other)
475
502
  wln: Walloon
476
503
  wol: Wolof
477
- xal: Kalmyk
504
+ xal: Oirat
478
505
  xho: Xhosa
479
506
  yao: Yao (Africa)
480
507
  yap: Yapese
@@ -482,9 +509,11 @@ yid: Yiddish
482
509
  yor: Yoruba
483
510
  ypk: Yupik languages
484
511
  zap: Zapotec
512
+ zbl: Blissymbolics
485
513
  zen: Zenaga
486
514
  zha: Zhuang
487
- znd: Zande
515
+ znd: Zande languages
488
516
  zul: Zulu
489
517
  zun: Zuni
490
- # zxx: null
518
+ # zxx: No linguistic content
519
+ zza: Zaza
@@ -109,6 +109,41 @@ describe "Traject::NokogiriIndexer" do
109
109
  result["name"].name == "name"
110
110
  })
111
111
  end
112
+ end
112
113
 
114
+ describe "xpath to attribute" do
115
+ let(:indexer) do
116
+ namespaces = @namespaces
117
+ Traject::Indexer::NokogiriIndexer.new("nokogiri.namespaces" => namespaces,
118
+ "nokogiri.each_record_xpath" => "//oai:record") do
119
+ to_field "status", extract_xpath("//oai:record/oai:header/@status")
120
+ end
121
+ end
122
+
123
+ let(:records) { Traject::NokogiriReader.new(StringIO.new(
124
+ <<-XML
125
+ <?xml version="1.0" encoding="UTF-8"?>
126
+ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
127
+ <responseDate>2020-03-03T04:16:09Z</responseDate>
128
+ <request verb="ListRecords" metadataPrefix="marc21" set="blacklight" from="2020-03-02T20:47:11Z">https://na02.alma.exlibrisgroup.com/view/oai/01TULI_INST/request</request>
129
+ <ListRecords>
130
+ <record>
131
+ <header status="deleted">
132
+ <identifier>oai:alma.01TULI_INST:991025803889703811</identifier>
133
+ <datestamp>2020-03-03T03:54:35Z</datestamp>
134
+ <setSpec>blacklight</setSpec>
135
+ <setSpec>rapid_print_journals</setSpec>
136
+ <setSpec>blacklight_qa</setSpec>
137
+ </header>
138
+ </record>
139
+ </ListRecords>
140
+ </OAI-PMH>
141
+ XML
142
+ ), []).to_a }
143
+
144
+ it "extracts the correct attribute" do
145
+ statuses = indexer.map_record(records.first)["status"]
146
+ assert_equal ["deleted"], statuses
147
+ end
113
148
  end
114
149
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.0
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
8
8
  - Bill Dueber
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-12-02 00:00:00.000000000 Z
12
+ date: 2020-07-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -231,7 +231,7 @@ dependencies:
231
231
  - - "~>"
232
232
  - !ruby/object:Gem::Version
233
233
  version: '3.4'
234
- description:
234
+ description:
235
235
  email:
236
236
  - none@nowhere.org
237
237
  executables:
@@ -390,7 +390,7 @@ homepage: http://github.com/traject/traject
390
390
  licenses:
391
391
  - MIT
392
392
  metadata: {}
393
- post_install_message:
393
+ post_install_message:
394
394
  rdoc_options: []
395
395
  require_paths:
396
396
  - lib
@@ -405,9 +405,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
405
405
  - !ruby/object:Gem::Version
406
406
  version: '0'
407
407
  requirements: []
408
- rubyforge_project:
409
- rubygems_version: 2.7.6
410
- signing_key:
408
+ rubygems_version: 3.0.3
409
+ signing_key:
411
410
  specification_version: 4
412
411
  summary: An easy to use, high-performance, flexible and extensible metadata transformation
413
412
  system, focused on library-archives-museums input, and indexing to Solr as output.