traject 3.3.0 → 3.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1700077d5c2d3c667fc9520b659c3ca986b8ab34aee233f62bd7f73fdef91977
4
- data.tar.gz: 736b217f209ed08faba9c1d20c006b29586aa3ebdf088a89e37f5f3b7400de06
3
+ metadata.gz: c30572335810dc620f9a169df6f8f374512d3c472ea34bc03068106959fd1463
4
+ data.tar.gz: 3181c37e41e80416487d730e1983bc647daf480a3a308db30e294c7587adc644
5
5
  SHA512:
6
- metadata.gz: 21877d6cd5b03f7ffbbac316a6d58a3bc65b534cb7457e57d39ba470ad49d99c8677e5e6ede25c650bba5ac3f0b22f9b348ebabb36ac4047433eb8a76379ef1d
7
- data.tar.gz: 4ec1938d2d7b60a61ebde4e9c4e763e511c2896788b56b38ad6f22615dffb57449e29c9ef40e261952e125b90f5fe491fa447b077c7dcb1c55f57d6ef603fd5b
6
+ metadata.gz: 83b73a10113e75106a0fb7af9bec79802d2e3f5c8f3e07742f33a52642a9441c20769072f8ea5bd532011b7d172db6ca007121d6874f705008e1a5a511ca1ff8
7
+ data.tar.gz: d9c53588e8adbd76764c20012baf702591276d84c2cd64ed0bb0d5b742699607a2287d30a2fb4f70c1bf6f8a7338d716d989b208c8983c2a87eeddbd6d96dd3d
@@ -7,6 +7,7 @@ rvm:
7
7
  - 2.4.4
8
8
  - 2.5.1
9
9
  - 2.6.1
10
+ - 2.7.0
10
11
  # avoid having travis install jdk on MRI builds where we don't need it.
11
12
  matrix:
12
13
  include:
data/CHANGES.md CHANGED
@@ -6,7 +6,9 @@
6
6
 
7
7
  *
8
8
 
9
- *
9
+ ## 3.4.0
10
+
11
+ * XML-mode `extract_xpath` now supports extracting attribute values with xpath @attr syntax.
10
12
 
11
13
  ## 3.3.0
12
14
 
data/doc/xml.md CHANGED
@@ -72,6 +72,16 @@ You can use all the standard transformation macros in Traject::Macros::Transforma
72
72
  to_field "something", extract_xpath("//value"), first_only, translation_map("some_map"), default("no value")
73
73
  ```
74
74
 
75
+ ### selecting attribute values
76
+
77
+ Attribute values can be selected directly, using standard xpath syntax for selecting an attribute:
78
+
79
+
80
+ ```ruby
81
+ # gets status value in: <oai:header status="something">
82
+ to_field "status", extract_xpath("//oai:record/oai:header/@status")
83
+ ```
84
+
75
85
 
76
86
  ### selecting non-text nodes
77
87
 
@@ -42,11 +42,11 @@ module Traject::Macros
42
42
  #
43
43
  # * :translation_map => String: translate with named translation map looked up in load
44
44
  # path, uses Traject::TranslationMap.new(translation_map_arg).
45
- # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)
45
+ # **Instead**, use `extract_marc(whatever), translation_map(translation_map_arg)`
46
46
  #
47
47
  # * :trim_punctuation => true; trims leading/trailing punctuation using standard algorithms that
48
48
  # have shown themselves useful with Marc, using Marc21.trim_punctuation. **Instead**, use
49
- # `extract_marc(whatever), trim_punctuation
49
+ # `extract_marc(whatever), trim_punctuation`
50
50
  #
51
51
  # * :default => String: if otherwise empty, add default value. **Instead**, use `extract_marc(whatever), default("default value")`
52
52
  #
@@ -26,9 +26,15 @@ module Traject
26
26
  # Make sure to avoid text content that was all blank, which is "between the children"
27
27
  # whitespace.
28
28
  result = result.collect do |n|
29
- n.xpath('.//text()').collect(&:text).tap do |arr|
30
- arr.reject! { |s| s =~ (/\A\s+\z/) }
31
- end.join(" ")
29
+ if n.kind_of?(Nokogiri::XML::Attr)
30
+ # attribute value
31
+ n.value
32
+ else
33
+ # text from node
34
+ n.xpath('.//text()').collect(&:text).tap do |arr|
35
+ arr.reject! { |s| s =~ (/\A\s+\z/) }
36
+ end.join(" ")
37
+ end
32
38
  end
33
39
  else
34
40
  # just put all matches in accumulator as Nokogiri::XML::Node's
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.3.0"
2
+ VERSION = "3.4.0"
3
3
  end
@@ -10,18 +10,21 @@ ady: Adygei
10
10
  afa: Afroasiatic (Other)
11
11
  afh: Afrihili (Artificial language)
12
12
  afr: Afrikaans
13
- ajm: Aljamia
13
+ ain: Ainu
14
+ ajm: Aljamía
14
15
  aka: Akan
15
16
  akk: Akkadian
16
17
  alb: Albanian
17
18
  ale: Aleut
18
19
  alg: Algonquian (Other)
20
+ alt: Altai
19
21
  amh: Amharic
20
- ang: English, Old (ca. 450-1100)
22
+ ang: "English, Old (ca. 450-1100)"
23
+ anp: Angika
21
24
  apa: Apache languages
22
25
  ara: Arabic
23
26
  arc: Aramaic
24
- arg: Aragonese Spanish
27
+ arg: Aragonese
25
28
  arm: Armenian
26
29
  arn: Mapuche
27
30
  arp: Arapaho
@@ -36,7 +39,7 @@ ave: Avestan
36
39
  awa: Awadhi
37
40
  aym: Aymara
38
41
  aze: Azerbaijani
39
- bad: Banda
42
+ bad: Banda languages
40
43
  bai: Bamileke languages
41
44
  bak: Bashkir
42
45
  bal: Baluchi
@@ -51,7 +54,7 @@ bem: Bemba
51
54
  ben: Bengali
52
55
  ber: Berber (Other)
53
56
  bho: Bhojpuri
54
- bih: Bihari
57
+ bih: Bihari (Other)
55
58
  bik: Bikol
56
59
  bin: Edo
57
60
  bis: Bislama
@@ -65,6 +68,7 @@ bua: Buriat
65
68
  bug: Bugis
66
69
  bul: Bulgarian
67
70
  bur: Burmese
71
+ byn: Bilin
68
72
  cad: Caddo
69
73
  cai: Central American Indian (Other)
70
74
  cam: Khmer
@@ -78,7 +82,7 @@ chb: Chibcha
78
82
  che: Chechen
79
83
  chg: Chagatai
80
84
  chi: Chinese
81
- chk: Truk
85
+ chk: Chuukese
82
86
  chm: Mari
83
87
  chn: Chinook jargon
84
88
  cho: Choctaw
@@ -88,15 +92,17 @@ chu: Church Slavic
88
92
  chv: Chuvash
89
93
  chy: Cheyenne
90
94
  cmc: Chamic languages
95
+ cnr: Montenegrin
91
96
  cop: Coptic
92
97
  cor: Cornish
93
98
  cos: Corsican
94
- cpe: Creoles and Pidgins, English-based (Other)
95
- cpf: Creoles and Pidgins, French-based (Other)
96
- cpp: Creoles and Pidgins, Portuguese-based (Other)
99
+ cpe: "Creoles and Pidgins, English-based (Other)"
100
+ cpf: "Creoles and Pidgins, French-based (Other)"
101
+ cpp: "Creoles and Pidgins, Portuguese-based (Other)"
97
102
  cre: Cree
98
103
  crh: Crimean Tatar
99
104
  crp: Creoles and Pidgins (Other)
105
+ csb: Kashubian
100
106
  cus: Cushitic (Other)
101
107
  cze: Czech
102
108
  dak: Dakota
@@ -104,14 +110,15 @@ dan: Danish
104
110
  dar: Dargwa
105
111
  day: Dayak
106
112
  del: Delaware
107
- den: Slave
113
+ den: Slavey
108
114
  dgr: Dogrib
109
115
  din: Dinka
110
116
  div: Divehi
111
117
  doi: Dogri
112
118
  dra: Dravidian (Other)
119
+ dsb: Lower Sorbian
113
120
  dua: Duala
114
- dum: Dutch, Middle (ca. 1050-1350)
121
+ dum: "Dutch, Middle (ca. 1050-1350)"
115
122
  dut: Dutch
116
123
  dyu: Dyula
117
124
  dzo: Dzongkha
@@ -120,7 +127,7 @@ egy: Egyptian
120
127
  eka: Ekajuk
121
128
  elx: Elamite
122
129
  eng: English
123
- enm: English, Middle (1100-1500)
130
+ enm: "English, Middle (1100-1500)"
124
131
  epo: Esperanto
125
132
  esk: Eskimo languages
126
133
  esp: Esperanto
@@ -133,18 +140,21 @@ fao: Faroese
133
140
  far: Faroese
134
141
  fat: Fanti
135
142
  fij: Fijian
143
+ fil: Filipino
136
144
  fin: Finnish
137
145
  fiu: Finno-Ugrian (Other)
138
146
  fon: Fon
139
147
  fre: French
140
148
  fri: Frisian
141
- frm: French, Middle (ca. 1400-1600)
142
- fro: French, Old (ca. 842-1400)
149
+ frm: "French, Middle (ca. 1300-1600)"
150
+ fro: "French, Old (ca. 842-1300)"
151
+ frr: North Frisian
152
+ frs: East Frisian
143
153
  fry: Frisian
144
154
  ful: Fula
145
155
  fur: Friulian
146
- gaa: Ga
147
- gae: Scottish Gaelic
156
+ gaa: Gã
157
+ gae: Scottish Gaelix
148
158
  gag: Galician
149
159
  gal: Oromo
150
160
  gay: Gayo
@@ -158,15 +168,16 @@ gla: Scottish Gaelic
158
168
  gle: Irish
159
169
  glg: Galician
160
170
  glv: Manx
161
- gmh: German, Middle High (ca. 1050-1500)
162
- goh: German, Old High (ca. 750-1050)
171
+ gmh: "German, Middle High (ca. 1050-1500)"
172
+ goh: "German, Old High (ca. 750-1050)"
163
173
  gon: Gondi
164
174
  gor: Gorontalo
165
175
  got: Gothic
166
176
  grb: Grebo
167
- grc: Greek, Ancient (to 1453)
168
- gre: Greek, Modern (1453- )
177
+ grc: "Greek, Ancient (to 1453)"
178
+ gre: "Greek, Modern (1453-)"
169
179
  grn: Guarani
180
+ gsw: Swiss German
170
181
  gua: Guarani
171
182
  guj: Gujarati
172
183
  gwi: Gwich'in
@@ -177,11 +188,13 @@ haw: Hawaiian
177
188
  heb: Hebrew
178
189
  her: Herero
179
190
  hil: Hiligaynon
180
- him: Himachali
191
+ him: Western Pahari languages
181
192
  hin: Hindi
182
193
  hit: Hittite
183
194
  hmn: Hmong
184
195
  hmo: Hiri Motu
196
+ hrv: Croatian
197
+ hsb: Upper Sorbian
185
198
  hun: Hungarian
186
199
  hup: Hupa
187
200
  iba: Iban
@@ -205,16 +218,17 @@ iri: Irish
205
218
  iro: Iroquoian (Other)
206
219
  ita: Italian
207
220
  jav: Javanese
221
+ jbo: Lojban (Artificial language)
208
222
  jpn: Japanese
209
223
  jpr: Judeo-Persian
210
224
  jrb: Judeo-Arabic
211
225
  kaa: Kara-Kalpak
212
226
  kab: Kabyle
213
227
  kac: Kachin
214
- kal: Kalatdlisut
228
+ kal: Kalâtdlisut
215
229
  kam: Kamba
216
230
  kan: Kannada
217
- kar: Karen
231
+ kar: Karen languages
218
232
  kas: Kashmiri
219
233
  kau: Kanuri
220
234
  kaw: Kawi
@@ -232,19 +246,21 @@ kok: Konkani
232
246
  kom: Komi
233
247
  kon: Kongo
234
248
  kor: Korean
235
- kos: Kusaie
249
+ kos: Kosraean
236
250
  kpe: Kpelle
237
- kro: Kru
251
+ krc: Karachay-Balkar
252
+ krl: Karelian
253
+ kro: Kru (Other)
238
254
  kru: Kurukh
239
255
  kua: Kuanyama
240
256
  kum: Kumyk
241
257
  kur: Kurdish
242
258
  kus: Kusaie
243
- kut: Kutenai
259
+ kut: Kootenai
244
260
  lad: Ladino
245
- lah: Lahnda
246
- lam: Lamba
247
- lan: Occitan (post-1500)
261
+ lah: Lahndā
262
+ lam: Lamba (Zambia and Congo)
263
+ lan: Occitan (post 1500)
248
264
  lao: Lao
249
265
  lap: Sami
250
266
  lat: Latin
@@ -255,11 +271,11 @@ lin: Lingala
255
271
  lit: Lithuanian
256
272
  lol: Mongo-Nkundu
257
273
  loz: Lozi
258
- ltz: Letzeburgesch
274
+ ltz: Luxembourgish
259
275
  lua: Luba-Lulua
260
276
  lub: Luba-Katanga
261
277
  lug: Ganda
262
- lui: Luiseno
278
+ lui: Luiseño
263
279
  lun: Lunda
264
280
  luo: Luo (Kenya and Tanzania)
265
281
  lus: Lushai
@@ -274,12 +290,13 @@ man: Mandingo
274
290
  mao: Maori
275
291
  map: Austronesian (Other)
276
292
  mar: Marathi
277
- mas: Masai
293
+ mas: Maasai
278
294
  max: Manx
279
295
  may: Malay
296
+ mdf: Moksha
280
297
  mdr: Mandar
281
298
  men: Mende
282
- mga: Irish, Middle (ca. 1100-1550)
299
+ mga: "Irish, Middle (ca. 1100-1550)"
283
300
  mic: Micmac
284
301
  min: Minangkabau
285
302
  mis: Miscellaneous languages
@@ -293,12 +310,14 @@ mno: Manobo languages
293
310
  moh: Mohawk
294
311
  mol: Moldavian
295
312
  mon: Mongolian
296
- mos: Moore
313
+ mos: Mooré
297
314
  mul: Multiple languages
298
315
  mun: Munda (Other)
299
316
  mus: Creek
317
+ mwl: Mirandese
300
318
  mwr: Marwari
301
319
  myn: Mayan languages
320
+ myv: Erzya
302
321
  nah: Nahuatl
303
322
  nai: North American Indian (Other)
304
323
  nap: Neapolitan Italian
@@ -314,12 +333,14 @@ nia: Nias
314
333
  nic: Niger-Kordofanian (Other)
315
334
  niu: Niuean
316
335
  nno: Norwegian (Nynorsk)
317
- nob: Norwegian (Bokmal)
336
+ nob: Norwegian (Bokmål)
318
337
  nog: Nogai
319
338
  non: Old Norse
320
339
  nor: Norwegian
340
+ nqo: N'Ko
321
341
  nso: Northern Sotho
322
342
  nub: Nubian languages
343
+ nwc: "Newari, Old"
323
344
  nya: Nyanja
324
345
  nym: Nyamwezi
325
346
  nyn: Nyankole
@@ -331,7 +352,7 @@ ori: Oriya
331
352
  orm: Oromo
332
353
  osa: Osage
333
354
  oss: Ossetic
334
- ota: Turkish, Ottoman
355
+ ota: "Turkish, Ottoman"
335
356
  oto: Otomian languages
336
357
  paa: Papuan (Other)
337
358
  pag: Pangasinan
@@ -346,10 +367,10 @@ phi: Philippine (Other)
346
367
  phn: Phoenician
347
368
  pli: Pali
348
369
  pol: Polish
349
- pon: Ponape
370
+ pon: Pohnpeian
350
371
  por: Portuguese
351
372
  pra: Prakrit languages
352
- pro: Provencal (to 1500)
373
+ pro: Provençal (to 1500)
353
374
  pus: Pushto
354
375
  que: Quechua
355
376
  raj: Rajasthani
@@ -360,6 +381,7 @@ roh: Raeto-Romance
360
381
  rom: Romani
361
382
  rum: Romanian
362
383
  run: Rundi
384
+ rup: Aromanian
363
385
  rus: Russian
364
386
  sad: Sandawe
365
387
  sag: Sango (Ubangi Creole)
@@ -372,11 +394,12 @@ sao: Samoan
372
394
  sas: Sasak
373
395
  sat: Santali
374
396
  scc: Serbian
397
+ scn: Sicilian Italian
375
398
  sco: Scots
376
399
  scr: Croatian
377
400
  sel: Selkup
378
401
  sem: Semitic (Other)
379
- sga: Irish, Old (to 1100)
402
+ sga: "Irish, Old (to 1100)"
380
403
  sgn: Sign languages
381
404
  shn: Shan
382
405
  sho: Shona
@@ -404,6 +427,8 @@ son: Songhai
404
427
  sot: Sotho
405
428
  spa: Spanish
406
429
  srd: Sardinian
430
+ srn: Sranan
431
+ srp: Serbian
407
432
  srr: Serer
408
433
  ssa: Nilo-Saharan (Other)
409
434
  sso: Sotho
@@ -415,7 +440,8 @@ sux: Sumerian
415
440
  swa: Swahili
416
441
  swe: Swedish
417
442
  swz: Swazi
418
- syr: Syriac
443
+ syc: Syriac
444
+ syr: "Syriac, Modern"
419
445
  tag: Tagalog
420
446
  tah: Tahitian
421
447
  tai: Tai (Other)
@@ -431,10 +457,11 @@ tgk: Tajik
431
457
  tgl: Tagalog
432
458
  tha: Thai
433
459
  tib: Tibetan
434
- tig: Tigre
460
+ tig: Tigré
435
461
  tir: Tigrinya
436
462
  tiv: Tiv
437
463
  tkl: Tokelauan
464
+ tlh: Klingon (Artificial language)
438
465
  tli: Tlingit
439
466
  tmh: Tamashek
440
467
  tog: Tonga (Nyasa)
@@ -464,17 +491,17 @@ uzb: Uzbek
464
491
  vai: Vai
465
492
  ven: Venda
466
493
  vie: Vietnamese
467
- vol: Volapuk
494
+ vol: Volapük
468
495
  vot: Votic
469
496
  wak: Wakashan languages
470
- wal: Walamo
497
+ wal: Wolayta
471
498
  war: Waray
472
- was: Washo
499
+ was: Washoe
473
500
  wel: Welsh
474
- wen: Sorbian languages
501
+ wen: Sorbian (Other)
475
502
  wln: Walloon
476
503
  wol: Wolof
477
- xal: Kalmyk
504
+ xal: Oirat
478
505
  xho: Xhosa
479
506
  yao: Yao (Africa)
480
507
  yap: Yapese
@@ -482,9 +509,11 @@ yid: Yiddish
482
509
  yor: Yoruba
483
510
  ypk: Yupik languages
484
511
  zap: Zapotec
512
+ zbl: Blissymbolics
485
513
  zen: Zenaga
486
514
  zha: Zhuang
487
- znd: Zande
515
+ znd: Zande languages
488
516
  zul: Zulu
489
517
  zun: Zuni
490
- # zxx: null
518
+ # zxx: No linguistic content
519
+ zza: Zaza
@@ -109,6 +109,41 @@ describe "Traject::NokogiriIndexer" do
109
109
  result["name"].name == "name"
110
110
  })
111
111
  end
112
+ end
112
113
 
114
+ describe "xpath to attribute" do
115
+ let(:indexer) do
116
+ namespaces = @namespaces
117
+ Traject::Indexer::NokogiriIndexer.new("nokogiri.namespaces" => namespaces,
118
+ "nokogiri.each_record_xpath" => "//oai:record") do
119
+ to_field "status", extract_xpath("//oai:record/oai:header/@status")
120
+ end
121
+ end
122
+
123
+ let(:records) { Traject::NokogiriReader.new(StringIO.new(
124
+ <<-XML
125
+ <?xml version="1.0" encoding="UTF-8"?>
126
+ <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
127
+ <responseDate>2020-03-03T04:16:09Z</responseDate>
128
+ <request verb="ListRecords" metadataPrefix="marc21" set="blacklight" from="2020-03-02T20:47:11Z">https://na02.alma.exlibrisgroup.com/view/oai/01TULI_INST/request</request>
129
+ <ListRecords>
130
+ <record>
131
+ <header status="deleted">
132
+ <identifier>oai:alma.01TULI_INST:991025803889703811</identifier>
133
+ <datestamp>2020-03-03T03:54:35Z</datestamp>
134
+ <setSpec>blacklight</setSpec>
135
+ <setSpec>rapid_print_journals</setSpec>
136
+ <setSpec>blacklight_qa</setSpec>
137
+ </header>
138
+ </record>
139
+ </ListRecords>
140
+ </OAI-PMH>
141
+ XML
142
+ ), []).to_a }
143
+
144
+ it "extracts the correct attribute" do
145
+ statuses = indexer.map_record(records.first)["status"]
146
+ assert_equal ["deleted"], statuses
147
+ end
113
148
  end
114
149
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.3.0
4
+ version: 3.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
8
8
  - Bill Dueber
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-12-02 00:00:00.000000000 Z
12
+ date: 2020-07-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: concurrent-ruby
@@ -231,7 +231,7 @@ dependencies:
231
231
  - - "~>"
232
232
  - !ruby/object:Gem::Version
233
233
  version: '3.4'
234
- description:
234
+ description:
235
235
  email:
236
236
  - none@nowhere.org
237
237
  executables:
@@ -390,7 +390,7 @@ homepage: http://github.com/traject/traject
390
390
  licenses:
391
391
  - MIT
392
392
  metadata: {}
393
- post_install_message:
393
+ post_install_message:
394
394
  rdoc_options: []
395
395
  require_paths:
396
396
  - lib
@@ -405,9 +405,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
405
405
  - !ruby/object:Gem::Version
406
406
  version: '0'
407
407
  requirements: []
408
- rubyforge_project:
409
- rubygems_version: 2.7.6
410
- signing_key:
408
+ rubygems_version: 3.0.3
409
+ signing_key:
411
410
  specification_version: 4
412
411
  summary: An easy to use, high-performance, flexible and extensible metadata transformation
413
412
  system, focused on library-archives-museums input, and indexing to Solr as output.