traject 3.1.0 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,9 +26,15 @@ module Traject
26
26
  # Make sure to avoid text content that was all blank, which is "between the children"
27
27
  # whitespace.
28
28
  result = result.collect do |n|
29
- n.xpath('.//text()').collect(&:text).tap do |arr|
30
- arr.reject! { |s| s =~ (/\A\s+\z/) }
31
- end.join(" ")
29
+ if n.kind_of?(Nokogiri::XML::Attr)
30
+ # attribute value
31
+ n.value
32
+ else
33
+ # text from node
34
+ n.xpath('.//text()').collect(&:text).tap do |arr|
35
+ arr.reject! { |s| s =~ (/\A\s+\z/) }
36
+ end.join(" ")
37
+ end
32
38
  end
33
39
  else
34
40
  # just put all matches in accumulator as Nokogiri::XML::Node's
@@ -2,9 +2,9 @@ require 'traject/marc_extractor_spec'
2
2
 
3
3
  module Traject
4
4
  # MarcExtractor is a class for extracting lists of strings from a MARC::Record,
5
- # according to specifications. See #parse_string_spec for description of string
6
- # string arguments used to specify extraction. See #initialize for options
7
- # that can be set controlling extraction.
5
+ # according to specifications. See Traject::MarcExtractor::Spec for description
6
+ # of string arguments used to specify extraction. See #initialize for
7
+ # options that can be set controlling extraction.
8
8
  #
9
9
  # Examples:
10
10
  #
@@ -1,3 +1,5 @@
1
+ require 'nokogiri'
2
+
1
3
  module Traject
2
4
  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
3
5
  # objects as source records in the traject pipeline.
@@ -21,6 +23,9 @@ module Traject
21
23
  # If you need to use namespaces here, you need to have them registered with
22
24
  # `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
23
25
  # to use them in your each_record_xpath.
26
+ # * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
27
+ # mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
28
+ # of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
24
29
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
25
30
  #
26
31
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
@@ -87,7 +92,11 @@ module Traject
87
92
  end
88
93
 
89
94
  def each
90
- whole_input_doc = Nokogiri::XML.parse(input_stream)
95
+ config_proc = if settings["nokogiri.strict_mode"]
96
+ proc { |config| config.strict }
97
+ end
98
+
99
+ whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
91
100
 
92
101
  if each_record_xpath
93
102
  whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
@@ -115,9 +115,15 @@ module Traject
115
115
  # @return [HTTP::Client] from http.rb gem
116
116
  def http_client
117
117
  @http_client ||= begin
118
- # timeout setting on http.rb seems to be a mess.
119
- # https://github.com/httprb/http/issues/488
120
- client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
118
+ client = nil
119
+
120
+ if HTTP::VERSION.split(".").first.to_i > 3
121
+ client = HTTP.timeout(timeout)
122
+ else
123
+ # timeout settings on http.rb 3.x are a bit of a mess.
124
+ # https://github.com/httprb/http/issues/488
125
+ client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
126
+ end
121
127
 
122
128
  if settings["oai_pmh.try_gzip"]
123
129
  client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
@@ -41,10 +41,12 @@ require 'concurrent' # for atomic_fixnum
41
41
  #
42
42
  # ## Relevant settings
43
43
  #
44
- # * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
44
+ # * solr.url (optional if solr.update_url is set) The URL to the solr core to index into.
45
+ # (Can include embedded HTTP basic auth as eg `http://user:pass@host/solr`)
45
46
  #
46
47
  # * solr.update_url: The actual update url. If unset, we'll first see if
47
- # "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
48
+ # "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update". (Can include
49
+ # embedded HTTP basic auth as eg `http://user:pass@host/solr`)
48
50
  #
49
51
  # * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
50
52
  # My tests indicate that this setting doesn't change overall index speed by a ton.
@@ -101,12 +103,17 @@ class Traject::SolrJsonWriter
101
103
  def initialize(argSettings)
102
104
  @settings = Traject::Indexer::Settings.new(argSettings)
103
105
 
106
+
104
107
  # Set max errors
105
108
  @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
106
109
  if @max_skipped < 0
107
110
  @max_skipped = nil
108
111
  end
109
112
 
113
+
114
+ # Figure out where to send updates, and if with basic auth
115
+ @solr_update_url, basic_auth_user, basic_auth_password = self.determine_solr_update_url
116
+
110
117
  @http_client = if @settings["solr_json_writer.http_client"]
111
118
  @settings["solr_json_writer.http_client"]
112
119
  else
@@ -114,6 +121,11 @@ class Traject::SolrJsonWriter
114
121
  if @settings["solr_writer.http_timeout"]
115
122
  client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
116
123
  end
124
+
125
+ if basic_auth_user || basic_auth_password
126
+ client.set_auth(@solr_update_url, basic_auth_user, basic_auth_password)
127
+ end
128
+
117
129
  client
118
130
  end
119
131
 
@@ -137,13 +149,11 @@ class Traject::SolrJsonWriter
137
149
  # this the new default writer.
138
150
  @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
139
151
 
140
- # Figure out where to send updates
141
- @solr_update_url = self.determine_solr_update_url
142
152
 
143
153
  @solr_update_args = settings["solr_writer.solr_update_args"]
144
154
  @commit_solr_update_args = settings["solr_writer.commit_solr_update_args"]
145
155
 
146
- logger.info(" #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
156
+ logger.info(" #{self.class.name} writing to '#{@solr_update_url}' #{"(with HTTP basic auth)" if basic_auth_user || basic_auth_password}in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
147
157
  end
148
158
 
149
159
 
@@ -270,6 +280,13 @@ class Traject::SolrJsonWriter
270
280
  end
271
281
  end
272
282
 
283
+ # Send a delete all query.
284
+ #
285
+ # This method takes no params and will not automatically commit the deletes.
286
+ # @example @writer.delete_all!
287
+ def delete_all!
288
+ delete(query: "*:*")
289
+ end
273
290
 
274
291
  # Get the logger from the settings, or default to an effectively null logger
275
292
  def logger
@@ -355,13 +372,27 @@ class Traject::SolrJsonWriter
355
372
  end
356
373
 
357
374
 
358
- # Relatively complex logic to determine if we have a valid URL and what it is
375
+ # Relatively complex logic to determine if we have a valid URL and what it is,
376
+ # and if we have basic_auth info
377
+ #
378
+ # Empties out user and password embedded in URI returned, to help avoid logging it.
379
+ #
380
+ # @return [Array] `[update_url, basic_auth_user, basic_auth_password]`
359
381
  def determine_solr_update_url
360
- if settings['solr.update_url']
382
+ url = if settings['solr.update_url']
361
383
  check_solr_update_url(settings['solr.update_url'])
362
384
  else
363
385
  derive_solr_update_url_from_solr_url(settings['solr.url'])
364
386
  end
387
+
388
+ parsed_uri = URI.parse(url)
389
+ user_from_uri, password_from_uri = parsed_uri.user, parsed_uri.password
390
+ parsed_uri.user, parsed_uri.password = nil, nil
391
+
392
+ basic_auth_user = @settings["solr_writer.basic_auth_user"] || user_from_uri
393
+ basic_auth_password = @settings["solr_writer.basic_auth_password"] || password_from_uri
394
+
395
+ return [parsed_uri.to_s, basic_auth_user, basic_auth_password]
365
396
  end
366
397
 
367
398
 
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.1.0"
2
+ VERSION = "3.6.0"
3
3
  end
@@ -10,18 +10,21 @@ ady: Adygei
10
10
  afa: Afroasiatic (Other)
11
11
  afh: Afrihili (Artificial language)
12
12
  afr: Afrikaans
13
- ajm: Aljamia
13
+ ain: Ainu
14
+ ajm: Aljamía
14
15
  aka: Akan
15
16
  akk: Akkadian
16
17
  alb: Albanian
17
18
  ale: Aleut
18
19
  alg: Algonquian (Other)
20
+ alt: Altai
19
21
  amh: Amharic
20
- ang: English, Old (ca. 450-1100)
22
+ ang: "English, Old (ca. 450-1100)"
23
+ anp: Angika
21
24
  apa: Apache languages
22
25
  ara: Arabic
23
26
  arc: Aramaic
24
- arg: Aragonese Spanish
27
+ arg: Aragonese
25
28
  arm: Armenian
26
29
  arn: Mapuche
27
30
  arp: Arapaho
@@ -36,7 +39,7 @@ ave: Avestan
36
39
  awa: Awadhi
37
40
  aym: Aymara
38
41
  aze: Azerbaijani
39
- bad: Banda
42
+ bad: Banda languages
40
43
  bai: Bamileke languages
41
44
  bak: Bashkir
42
45
  bal: Baluchi
@@ -51,7 +54,7 @@ bem: Bemba
51
54
  ben: Bengali
52
55
  ber: Berber (Other)
53
56
  bho: Bhojpuri
54
- bih: Bihari
57
+ bih: Bihari (Other)
55
58
  bik: Bikol
56
59
  bin: Edo
57
60
  bis: Bislama
@@ -65,6 +68,7 @@ bua: Buriat
65
68
  bug: Bugis
66
69
  bul: Bulgarian
67
70
  bur: Burmese
71
+ byn: Bilin
68
72
  cad: Caddo
69
73
  cai: Central American Indian (Other)
70
74
  cam: Khmer
@@ -78,7 +82,7 @@ chb: Chibcha
78
82
  che: Chechen
79
83
  chg: Chagatai
80
84
  chi: Chinese
81
- chk: Truk
85
+ chk: Chuukese
82
86
  chm: Mari
83
87
  chn: Chinook jargon
84
88
  cho: Choctaw
@@ -88,15 +92,17 @@ chu: Church Slavic
88
92
  chv: Chuvash
89
93
  chy: Cheyenne
90
94
  cmc: Chamic languages
95
+ cnr: Montenegrin
91
96
  cop: Coptic
92
97
  cor: Cornish
93
98
  cos: Corsican
94
- cpe: Creoles and Pidgins, English-based (Other)
95
- cpf: Creoles and Pidgins, French-based (Other)
96
- cpp: Creoles and Pidgins, Portuguese-based (Other)
99
+ cpe: "Creoles and Pidgins, English-based (Other)"
100
+ cpf: "Creoles and Pidgins, French-based (Other)"
101
+ cpp: "Creoles and Pidgins, Portuguese-based (Other)"
97
102
  cre: Cree
98
103
  crh: Crimean Tatar
99
104
  crp: Creoles and Pidgins (Other)
105
+ csb: Kashubian
100
106
  cus: Cushitic (Other)
101
107
  cze: Czech
102
108
  dak: Dakota
@@ -104,14 +110,15 @@ dan: Danish
104
110
  dar: Dargwa
105
111
  day: Dayak
106
112
  del: Delaware
107
- den: Slave
113
+ den: Slavey
108
114
  dgr: Dogrib
109
115
  din: Dinka
110
116
  div: Divehi
111
117
  doi: Dogri
112
118
  dra: Dravidian (Other)
119
+ dsb: Lower Sorbian
113
120
  dua: Duala
114
- dum: Dutch, Middle (ca. 1050-1350)
121
+ dum: "Dutch, Middle (ca. 1050-1350)"
115
122
  dut: Dutch
116
123
  dyu: Dyula
117
124
  dzo: Dzongkha
@@ -120,7 +127,7 @@ egy: Egyptian
120
127
  eka: Ekajuk
121
128
  elx: Elamite
122
129
  eng: English
123
- enm: English, Middle (1100-1500)
130
+ enm: "English, Middle (1100-1500)"
124
131
  epo: Esperanto
125
132
  esk: Eskimo languages
126
133
  esp: Esperanto
@@ -133,18 +140,21 @@ fao: Faroese
133
140
  far: Faroese
134
141
  fat: Fanti
135
142
  fij: Fijian
143
+ fil: Filipino
136
144
  fin: Finnish
137
145
  fiu: Finno-Ugrian (Other)
138
146
  fon: Fon
139
147
  fre: French
140
148
  fri: Frisian
141
- frm: French, Middle (ca. 1400-1600)
142
- fro: French, Old (ca. 842-1400)
149
+ frm: "French, Middle (ca. 1300-1600)"
150
+ fro: "French, Old (ca. 842-1300)"
151
+ frr: North Frisian
152
+ frs: East Frisian
143
153
  fry: Frisian
144
154
  ful: Fula
145
155
  fur: Friulian
146
- gaa: Ga
147
- gae: Scottish Gaelic
156
+ gaa: Gã
157
+ gae: Scottish Gaelic
148
158
  gag: Galician
149
159
  gal: Oromo
150
160
  gay: Gayo
@@ -158,15 +168,16 @@ gla: Scottish Gaelic
158
168
  gle: Irish
159
169
  glg: Galician
160
170
  glv: Manx
161
- gmh: German, Middle High (ca. 1050-1500)
162
- goh: German, Old High (ca. 750-1050)
171
+ gmh: "German, Middle High (ca. 1050-1500)"
172
+ goh: "German, Old High (ca. 750-1050)"
163
173
  gon: Gondi
164
174
  gor: Gorontalo
165
175
  got: Gothic
166
176
  grb: Grebo
167
- grc: Greek, Ancient (to 1453)
168
- gre: Greek, Modern (1453- )
177
+ grc: "Greek, Ancient (to 1453)"
178
+ gre: "Greek, Modern (1453-)"
169
179
  grn: Guarani
180
+ gsw: Swiss German
170
181
  gua: Guarani
171
182
  guj: Gujarati
172
183
  gwi: Gwich'in
@@ -177,11 +188,13 @@ haw: Hawaiian
177
188
  heb: Hebrew
178
189
  her: Herero
179
190
  hil: Hiligaynon
180
- him: Himachali
191
+ him: Western Pahari languages
181
192
  hin: Hindi
182
193
  hit: Hittite
183
194
  hmn: Hmong
184
195
  hmo: Hiri Motu
196
+ hrv: Croatian
197
+ hsb: Upper Sorbian
185
198
  hun: Hungarian
186
199
  hup: Hupa
187
200
  iba: Iban
@@ -205,16 +218,17 @@ iri: Irish
205
218
  iro: Iroquoian (Other)
206
219
  ita: Italian
207
220
  jav: Javanese
221
+ jbo: Lojban (Artificial language)
208
222
  jpn: Japanese
209
223
  jpr: Judeo-Persian
210
224
  jrb: Judeo-Arabic
211
225
  kaa: Kara-Kalpak
212
226
  kab: Kabyle
213
227
  kac: Kachin
214
- kal: Kalatdlisut
228
+ kal: Kalâtdlisut
215
229
  kam: Kamba
216
230
  kan: Kannada
217
- kar: Karen
231
+ kar: Karen languages
218
232
  kas: Kashmiri
219
233
  kau: Kanuri
220
234
  kaw: Kawi
@@ -232,19 +246,21 @@ kok: Konkani
232
246
  kom: Komi
233
247
  kon: Kongo
234
248
  kor: Korean
235
- kos: Kusaie
249
+ kos: Kosraean
236
250
  kpe: Kpelle
237
- kro: Kru
251
+ krc: Karachay-Balkar
252
+ krl: Karelian
253
+ kro: Kru (Other)
238
254
  kru: Kurukh
239
255
  kua: Kuanyama
240
256
  kum: Kumyk
241
257
  kur: Kurdish
242
258
  kus: Kusaie
243
- kut: Kutenai
259
+ kut: Kootenai
244
260
  lad: Ladino
245
- lah: Lahnda
246
- lam: Lamba
247
- lan: Occitan (post-1500)
261
+ lah: Lahndā
262
+ lam: Lamba (Zambia and Congo)
263
+ lan: Occitan (post 1500)
248
264
  lao: Lao
249
265
  lap: Sami
250
266
  lat: Latin
@@ -255,11 +271,11 @@ lin: Lingala
255
271
  lit: Lithuanian
256
272
  lol: Mongo-Nkundu
257
273
  loz: Lozi
258
- ltz: Letzeburgesch
274
+ ltz: Luxembourgish
259
275
  lua: Luba-Lulua
260
276
  lub: Luba-Katanga
261
277
  lug: Ganda
262
- lui: Luiseno
278
+ lui: Luiseño
263
279
  lun: Lunda
264
280
  luo: Luo (Kenya and Tanzania)
265
281
  lus: Lushai
@@ -274,12 +290,13 @@ man: Mandingo
274
290
  mao: Maori
275
291
  map: Austronesian (Other)
276
292
  mar: Marathi
277
- mas: Masai
293
+ mas: Maasai
278
294
  max: Manx
279
295
  may: Malay
296
+ mdf: Moksha
280
297
  mdr: Mandar
281
298
  men: Mende
282
- mga: Irish, Middle (ca. 1100-1550)
299
+ mga: "Irish, Middle (ca. 1100-1550)"
283
300
  mic: Micmac
284
301
  min: Minangkabau
285
302
  mis: Miscellaneous languages
@@ -293,12 +310,14 @@ mno: Manobo languages
293
310
  moh: Mohawk
294
311
  mol: Moldavian
295
312
  mon: Mongolian
296
- mos: Moore
313
+ mos: Mooré
297
314
  mul: Multiple languages
298
315
  mun: Munda (Other)
299
316
  mus: Creek
317
+ mwl: Mirandese
300
318
  mwr: Marwari
301
319
  myn: Mayan languages
320
+ myv: Erzya
302
321
  nah: Nahuatl
303
322
  nai: North American Indian (Other)
304
323
  nap: Neapolitan Italian
@@ -314,12 +333,14 @@ nia: Nias
314
333
  nic: Niger-Kordofanian (Other)
315
334
  niu: Niuean
316
335
  nno: Norwegian (Nynorsk)
317
- nob: Norwegian (Bokmal)
336
+ nob: Norwegian (Bokmål)
318
337
  nog: Nogai
319
338
  non: Old Norse
320
339
  nor: Norwegian
340
+ nqo: N'Ko
321
341
  nso: Northern Sotho
322
342
  nub: Nubian languages
343
+ nwc: "Newari, Old"
323
344
  nya: Nyanja
324
345
  nym: Nyamwezi
325
346
  nyn: Nyankole
@@ -331,7 +352,7 @@ ori: Oriya
331
352
  orm: Oromo
332
353
  osa: Osage
333
354
  oss: Ossetic
334
- ota: Turkish, Ottoman
355
+ ota: "Turkish, Ottoman"
335
356
  oto: Otomian languages
336
357
  paa: Papuan (Other)
337
358
  pag: Pangasinan
@@ -346,10 +367,10 @@ phi: Philippine (Other)
346
367
  phn: Phoenician
347
368
  pli: Pali
348
369
  pol: Polish
349
- pon: Ponape
370
+ pon: Pohnpeian
350
371
  por: Portuguese
351
372
  pra: Prakrit languages
352
- pro: Provencal (to 1500)
373
+ pro: Provençal (to 1500)
353
374
  pus: Pushto
354
375
  que: Quechua
355
376
  raj: Rajasthani
@@ -360,6 +381,7 @@ roh: Raeto-Romance
360
381
  rom: Romani
361
382
  rum: Romanian
362
383
  run: Rundi
384
+ rup: Aromanian
363
385
  rus: Russian
364
386
  sad: Sandawe
365
387
  sag: Sango (Ubangi Creole)
@@ -372,11 +394,12 @@ sao: Samoan
372
394
  sas: Sasak
373
395
  sat: Santali
374
396
  scc: Serbian
397
+ scn: Sicilian Italian
375
398
  sco: Scots
376
399
  scr: Croatian
377
400
  sel: Selkup
378
401
  sem: Semitic (Other)
379
- sga: Irish, Old (to 1100)
402
+ sga: "Irish, Old (to 1100)"
380
403
  sgn: Sign languages
381
404
  shn: Shan
382
405
  sho: Shona
@@ -404,6 +427,8 @@ son: Songhai
404
427
  sot: Sotho
405
428
  spa: Spanish
406
429
  srd: Sardinian
430
+ srn: Sranan
431
+ srp: Serbian
407
432
  srr: Serer
408
433
  ssa: Nilo-Saharan (Other)
409
434
  sso: Sotho
@@ -415,7 +440,8 @@ sux: Sumerian
415
440
  swa: Swahili
416
441
  swe: Swedish
417
442
  swz: Swazi
418
- syr: Syriac
443
+ syc: Syriac
444
+ syr: "Syriac, Modern"
419
445
  tag: Tagalog
420
446
  tah: Tahitian
421
447
  tai: Tai (Other)
@@ -431,10 +457,11 @@ tgk: Tajik
431
457
  tgl: Tagalog
432
458
  tha: Thai
433
459
  tib: Tibetan
434
- tig: Tigre
460
+ tig: Tigré
435
461
  tir: Tigrinya
436
462
  tiv: Tiv
437
463
  tkl: Tokelauan
464
+ tlh: Klingon (Artificial language)
438
465
  tli: Tlingit
439
466
  tmh: Tamashek
440
467
  tog: Tonga (Nyasa)
@@ -464,17 +491,17 @@ uzb: Uzbek
464
491
  vai: Vai
465
492
  ven: Venda
466
493
  vie: Vietnamese
467
- vol: Volapuk
494
+ vol: Volapük
468
495
  vot: Votic
469
496
  wak: Wakashan languages
470
- wal: Walamo
497
+ wal: Wolayta
471
498
  war: Waray
472
- was: Washo
499
+ was: Washoe
473
500
  wel: Welsh
474
- wen: Sorbian languages
501
+ wen: Sorbian (Other)
475
502
  wln: Walloon
476
503
  wol: Wolof
477
- xal: Kalmyk
504
+ xal: Oirat
478
505
  xho: Xhosa
479
506
  yao: Yao (Africa)
480
507
  yap: Yapese
@@ -482,9 +509,11 @@ yid: Yiddish
482
509
  yor: Yoruba
483
510
  ypk: Yupik languages
484
511
  zap: Zapotec
512
+ zbl: Blissymbolics
485
513
  zen: Zenaga
486
514
  zha: Zhuang
487
- znd: Zande
515
+ znd: Zande languages
488
516
  zul: Zulu
489
517
  zun: Zuni
490
- # zxx: null
518
+ # zxx: No linguistic content
519
+ zza: Zaza