traject 3.1.0 → 3.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,9 +26,15 @@ module Traject
26
26
  # Make sure to avoid text content that was all blank, which is "between the children"
27
27
  # whitespace.
28
28
  result = result.collect do |n|
29
- n.xpath('.//text()').collect(&:text).tap do |arr|
30
- arr.reject! { |s| s =~ (/\A\s+\z/) }
31
- end.join(" ")
29
+ if n.kind_of?(Nokogiri::XML::Attr)
30
+ # attribute value
31
+ n.value
32
+ else
33
+ # text from node
34
+ n.xpath('.//text()').collect(&:text).tap do |arr|
35
+ arr.reject! { |s| s =~ (/\A\s+\z/) }
36
+ end.join(" ")
37
+ end
32
38
  end
33
39
  else
34
40
  # just put all matches in accumulator as Nokogiri::XML::Node's
@@ -2,9 +2,9 @@ require 'traject/marc_extractor_spec'
2
2
 
3
3
  module Traject
4
4
  # MarcExtractor is a class for extracting lists of strings from a MARC::Record,
5
- # according to specifications. See #parse_string_spec for description of string
6
- # string arguments used to specify extraction. See #initialize for options
7
- # that can be set controlling extraction.
5
+ # according to specifications. See Traject::MarcExtractor::Spec for description
6
+ # of string arguments used to specify extraction. See #initialize for
7
+ # options that can be set controlling extraction.
8
8
  #
9
9
  # Examples:
10
10
  #
@@ -1,3 +1,5 @@
1
+ require 'nokogiri'
2
+
1
3
  module Traject
2
4
  # A Traject reader which reads XML, and yields zero to many Nokogiri::XML::Document
3
5
  # objects as source records in the traject pipeline.
@@ -21,6 +23,9 @@ module Traject
21
23
  # If you need to use namespaces here, you need to have them registered with
22
24
  # `nokogiri.default_namespaces`. If your source docs use namespaces, you DO need
23
25
  # to use them in your each_record_xpath.
26
+ # * nokogiri.strict_mode: if set to `true` or `"true"`, ask Nokogiri to parse in 'strict'
27
+ # mode, it will raise a `Nokogiri::XML::SyntaxError` if the XML is not well-formed, instead
28
+ # of trying to take its best-guess correction. https://nokogiri.org/tutorials/ensuring_well_formed_markup.html
24
29
  # * nokogiri_reader.extra_xpath_hooks: Experimental in progress, see below.
25
30
  #
26
31
  # ## nokogiri_reader.extra_xpath_hooks: For handling nodes outside of your each_record_xpath
@@ -87,7 +92,11 @@ module Traject
87
92
  end
88
93
 
89
94
  def each
90
- whole_input_doc = Nokogiri::XML.parse(input_stream)
95
+ config_proc = if settings["nokogiri.strict_mode"]
96
+ proc { |config| config.strict }
97
+ end
98
+
99
+ whole_input_doc = Nokogiri::XML.parse(input_stream, &config_proc)
91
100
 
92
101
  if each_record_xpath
93
102
  whole_input_doc.xpath(each_record_xpath, default_namespaces).each do |matching_node|
@@ -115,9 +115,15 @@ module Traject
115
115
  # @return [HTTP::Client] from http.rb gem
116
116
  def http_client
117
117
  @http_client ||= begin
118
- # timeout setting on http.rb seems to be a mess.
119
- # https://github.com/httprb/http/issues/488
120
- client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
118
+ client = nil
119
+
120
+ if HTTP::VERSION.split(".").first.to_i > 3
121
+ client = HTTP.timeout(timeout)
122
+ else
123
+ # timeout settings on http.rb 3.x are a bit of a mess.
124
+ # https://github.com/httprb/http/issues/488
125
+ client = HTTP.timeout(:global, write: timeout / 3, connect: timeout / 3, read: timeout / 3)
126
+ end
121
127
 
122
128
  if settings["oai_pmh.try_gzip"]
123
129
  client = client.use(:auto_inflate).headers("accept-encoding" => "gzip;q=1.0, identity;q=0.5")
@@ -41,10 +41,12 @@ require 'concurrent' # for atomic_fixnum
41
41
  #
42
42
  # ## Relevant settings
43
43
  #
44
- # * solr.url (optional if solr.update_url is set) The URL to the solr core to index into
44
+ # * solr.url (optional if solr.update_url is set) The URL to the solr core to index into.
45
+ # (Can include embedded HTTP basic auth as eg `http://user:pass@host/solr`)
45
46
  #
46
47
  # * solr.update_url: The actual update url. If unset, we'll first see if
47
- # "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update"
48
+ # "#{solr.url}/update/json" exists, and if not use "#{solr.url}/update". (Can include
49
+ # embedded HTTP basic auth as eg `http://user:pass@host/solr`)
48
50
  #
49
51
  # * solr_writer.batch_size: How big a batch to send to solr. Default is 100.
50
52
  # My tests indicate that this setting doesn't change overall index speed by a ton.
@@ -101,12 +103,17 @@ class Traject::SolrJsonWriter
101
103
  def initialize(argSettings)
102
104
  @settings = Traject::Indexer::Settings.new(argSettings)
103
105
 
106
+
104
107
  # Set max errors
105
108
  @max_skipped = (@settings['solr_writer.max_skipped'] || DEFAULT_MAX_SKIPPED).to_i
106
109
  if @max_skipped < 0
107
110
  @max_skipped = nil
108
111
  end
109
112
 
113
+
114
+ # Figure out where to send updates, and if with basic auth
115
+ @solr_update_url, basic_auth_user, basic_auth_password = self.determine_solr_update_url
116
+
110
117
  @http_client = if @settings["solr_json_writer.http_client"]
111
118
  @settings["solr_json_writer.http_client"]
112
119
  else
@@ -114,6 +121,11 @@ class Traject::SolrJsonWriter
114
121
  if @settings["solr_writer.http_timeout"]
115
122
  client.connect_timeout = client.receive_timeout = client.send_timeout = @settings["solr_writer.http_timeout"]
116
123
  end
124
+
125
+ if basic_auth_user || basic_auth_password
126
+ client.set_auth(@solr_update_url, basic_auth_user, basic_auth_password)
127
+ end
128
+
117
129
  client
118
130
  end
119
131
 
@@ -137,13 +149,11 @@ class Traject::SolrJsonWriter
137
149
  # this the new default writer.
138
150
  @commit_on_close = (settings["solr_writer.commit_on_close"] || settings["solrj_writer.commit_on_close"]).to_s == "true"
139
151
 
140
- # Figure out where to send updates
141
- @solr_update_url = self.determine_solr_update_url
142
152
 
143
153
  @solr_update_args = settings["solr_writer.solr_update_args"]
144
154
  @commit_solr_update_args = settings["solr_writer.commit_solr_update_args"]
145
155
 
146
- logger.info(" #{self.class.name} writing to '#{@solr_update_url}' in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
156
+ logger.info(" #{self.class.name} writing to '#{@solr_update_url}' #{"(with HTTP basic auth)" if basic_auth_user || basic_auth_password}in batches of #{@batch_size} with #{@thread_pool_size} bg threads")
147
157
  end
148
158
 
149
159
 
@@ -270,6 +280,13 @@ class Traject::SolrJsonWriter
270
280
  end
271
281
  end
272
282
 
283
+ # Send a delete all query.
284
+ #
285
+ # This method takes no params and will not automatically commit the deletes.
286
+ # @example @writer.delete_all!
287
+ def delete_all!
288
+ delete(query: "*:*")
289
+ end
273
290
 
274
291
  # Get the logger from the settings, or default to an effectively null logger
275
292
  def logger
@@ -355,13 +372,27 @@ class Traject::SolrJsonWriter
355
372
  end
356
373
 
357
374
 
358
- # Relatively complex logic to determine if we have a valid URL and what it is
375
+ # Relatively complex logic to determine if we have a valid URL and what it is,
376
+ # and if we have basic_auth info
377
+ #
378
+ # Empties out user and password embedded in URI returned, to help avoid logging it.
379
+ #
380
+ # @return [Array] [update_url, basic_auth_user, basic_auth_password]
359
381
  def determine_solr_update_url
360
- if settings['solr.update_url']
382
+ url = if settings['solr.update_url']
361
383
  check_solr_update_url(settings['solr.update_url'])
362
384
  else
363
385
  derive_solr_update_url_from_solr_url(settings['solr.url'])
364
386
  end
387
+
388
+ parsed_uri = URI.parse(url)
389
+ user_from_uri, password_from_uri = parsed_uri.user, parsed_uri.password
390
+ parsed_uri.user, parsed_uri.password = nil, nil
391
+
392
+ basic_auth_user = @settings["solr_writer.basic_auth_user"] || user_from_uri
393
+ basic_auth_password = @settings["solr_writer.basic_auth_password"] || password_from_uri
394
+
395
+ return [parsed_uri.to_s, basic_auth_user, basic_auth_password]
365
396
  end
366
397
 
367
398
 
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "3.1.0"
2
+ VERSION = "3.6.0"
3
3
  end
@@ -10,18 +10,21 @@ ady: Adygei
10
10
  afa: Afroasiatic (Other)
11
11
  afh: Afrihili (Artificial language)
12
12
  afr: Afrikaans
13
- ajm: Aljamia
13
+ ain: Ainu
14
+ ajm: Aljamía
14
15
  aka: Akan
15
16
  akk: Akkadian
16
17
  alb: Albanian
17
18
  ale: Aleut
18
19
  alg: Algonquian (Other)
20
+ alt: Altai
19
21
  amh: Amharic
20
- ang: English, Old (ca. 450-1100)
22
+ ang: "English, Old (ca. 450-1100)"
23
+ anp: Angika
21
24
  apa: Apache languages
22
25
  ara: Arabic
23
26
  arc: Aramaic
24
- arg: Aragonese Spanish
27
+ arg: Aragonese
25
28
  arm: Armenian
26
29
  arn: Mapuche
27
30
  arp: Arapaho
@@ -36,7 +39,7 @@ ave: Avestan
36
39
  awa: Awadhi
37
40
  aym: Aymara
38
41
  aze: Azerbaijani
39
- bad: Banda
42
+ bad: Banda languages
40
43
  bai: Bamileke languages
41
44
  bak: Bashkir
42
45
  bal: Baluchi
@@ -51,7 +54,7 @@ bem: Bemba
51
54
  ben: Bengali
52
55
  ber: Berber (Other)
53
56
  bho: Bhojpuri
54
- bih: Bihari
57
+ bih: Bihari (Other)
55
58
  bik: Bikol
56
59
  bin: Edo
57
60
  bis: Bislama
@@ -65,6 +68,7 @@ bua: Buriat
65
68
  bug: Bugis
66
69
  bul: Bulgarian
67
70
  bur: Burmese
71
+ byn: Bilin
68
72
  cad: Caddo
69
73
  cai: Central American Indian (Other)
70
74
  cam: Khmer
@@ -78,7 +82,7 @@ chb: Chibcha
78
82
  che: Chechen
79
83
  chg: Chagatai
80
84
  chi: Chinese
81
- chk: Truk
85
+ chk: Chuukese
82
86
  chm: Mari
83
87
  chn: Chinook jargon
84
88
  cho: Choctaw
@@ -88,15 +92,17 @@ chu: Church Slavic
88
92
  chv: Chuvash
89
93
  chy: Cheyenne
90
94
  cmc: Chamic languages
95
+ cnr: Montenegrin
91
96
  cop: Coptic
92
97
  cor: Cornish
93
98
  cos: Corsican
94
- cpe: Creoles and Pidgins, English-based (Other)
95
- cpf: Creoles and Pidgins, French-based (Other)
96
- cpp: Creoles and Pidgins, Portuguese-based (Other)
99
+ cpe: "Creoles and Pidgins, English-based (Other)"
100
+ cpf: "Creoles and Pidgins, French-based (Other)"
101
+ cpp: "Creoles and Pidgins, Portuguese-based (Other)"
97
102
  cre: Cree
98
103
  crh: Crimean Tatar
99
104
  crp: Creoles and Pidgins (Other)
105
+ csb: Kashubian
100
106
  cus: Cushitic (Other)
101
107
  cze: Czech
102
108
  dak: Dakota
@@ -104,14 +110,15 @@ dan: Danish
104
110
  dar: Dargwa
105
111
  day: Dayak
106
112
  del: Delaware
107
- den: Slave
113
+ den: Slavey
108
114
  dgr: Dogrib
109
115
  din: Dinka
110
116
  div: Divehi
111
117
  doi: Dogri
112
118
  dra: Dravidian (Other)
119
+ dsb: Lower Sorbian
113
120
  dua: Duala
114
- dum: Dutch, Middle (ca. 1050-1350)
121
+ dum: "Dutch, Middle (ca. 1050-1350)"
115
122
  dut: Dutch
116
123
  dyu: Dyula
117
124
  dzo: Dzongkha
@@ -120,7 +127,7 @@ egy: Egyptian
120
127
  eka: Ekajuk
121
128
  elx: Elamite
122
129
  eng: English
123
- enm: English, Middle (1100-1500)
130
+ enm: "English, Middle (1100-1500)"
124
131
  epo: Esperanto
125
132
  esk: Eskimo languages
126
133
  esp: Esperanto
@@ -133,18 +140,21 @@ fao: Faroese
133
140
  far: Faroese
134
141
  fat: Fanti
135
142
  fij: Fijian
143
+ fil: Filipino
136
144
  fin: Finnish
137
145
  fiu: Finno-Ugrian (Other)
138
146
  fon: Fon
139
147
  fre: French
140
148
  fri: Frisian
141
- frm: French, Middle (ca. 1400-1600)
142
- fro: French, Old (ca. 842-1400)
149
+ frm: "French, Middle (ca. 1300-1600)"
150
+ fro: "French, Old (ca. 842-1300)"
151
+ frr: North Frisian
152
+ frs: East Frisian
143
153
  fry: Frisian
144
154
  ful: Fula
145
155
  fur: Friulian
146
- gaa: Ga
147
- gae: Scottish Gaelic
156
+ gaa:
157
+ gae: Scottish Gaelix
148
158
  gag: Galician
149
159
  gal: Oromo
150
160
  gay: Gayo
@@ -158,15 +168,16 @@ gla: Scottish Gaelic
158
168
  gle: Irish
159
169
  glg: Galician
160
170
  glv: Manx
161
- gmh: German, Middle High (ca. 1050-1500)
162
- goh: German, Old High (ca. 750-1050)
171
+ gmh: "German, Middle High (ca. 1050-1500)"
172
+ goh: "German, Old High (ca. 750-1050)"
163
173
  gon: Gondi
164
174
  gor: Gorontalo
165
175
  got: Gothic
166
176
  grb: Grebo
167
- grc: Greek, Ancient (to 1453)
168
- gre: Greek, Modern (1453- )
177
+ grc: "Greek, Ancient (to 1453)"
178
+ gre: "Greek, Modern (1453-)"
169
179
  grn: Guarani
180
+ gsw: Swiss German
170
181
  gua: Guarani
171
182
  guj: Gujarati
172
183
  gwi: Gwich'in
@@ -177,11 +188,13 @@ haw: Hawaiian
177
188
  heb: Hebrew
178
189
  her: Herero
179
190
  hil: Hiligaynon
180
- him: Himachali
191
+ him: Western Pahari languages
181
192
  hin: Hindi
182
193
  hit: Hittite
183
194
  hmn: Hmong
184
195
  hmo: Hiri Motu
196
+ hrv: Croatian
197
+ hsb: Upper Sorbian
185
198
  hun: Hungarian
186
199
  hup: Hupa
187
200
  iba: Iban
@@ -205,16 +218,17 @@ iri: Irish
205
218
  iro: Iroquoian (Other)
206
219
  ita: Italian
207
220
  jav: Javanese
221
+ jbo: Lojban (Artificial language)
208
222
  jpn: Japanese
209
223
  jpr: Judeo-Persian
210
224
  jrb: Judeo-Arabic
211
225
  kaa: Kara-Kalpak
212
226
  kab: Kabyle
213
227
  kac: Kachin
214
- kal: Kalatdlisut
228
+ kal: Kalâtdlisut
215
229
  kam: Kamba
216
230
  kan: Kannada
217
- kar: Karen
231
+ kar: Karen languages
218
232
  kas: Kashmiri
219
233
  kau: Kanuri
220
234
  kaw: Kawi
@@ -232,19 +246,21 @@ kok: Konkani
232
246
  kom: Komi
233
247
  kon: Kongo
234
248
  kor: Korean
235
- kos: Kusaie
249
+ kos: Kosraean
236
250
  kpe: Kpelle
237
- kro: Kru
251
+ krc: Karachay-Balkar
252
+ krl: Karelian
253
+ kro: Kru (Other)
238
254
  kru: Kurukh
239
255
  kua: Kuanyama
240
256
  kum: Kumyk
241
257
  kur: Kurdish
242
258
  kus: Kusaie
243
- kut: Kutenai
259
+ kut: Kootenai
244
260
  lad: Ladino
245
- lah: Lahnda
246
- lam: Lamba
247
- lan: Occitan (post-1500)
261
+ lah: Lahndā
262
+ lam: Lamba (Zambia and Congo)
263
+ lan: Occitan (post 1500)
248
264
  lao: Lao
249
265
  lap: Sami
250
266
  lat: Latin
@@ -255,11 +271,11 @@ lin: Lingala
255
271
  lit: Lithuanian
256
272
  lol: Mongo-Nkundu
257
273
  loz: Lozi
258
- ltz: Letzeburgesch
274
+ ltz: Luxembourgish
259
275
  lua: Luba-Lulua
260
276
  lub: Luba-Katanga
261
277
  lug: Ganda
262
- lui: Luiseno
278
+ lui: Luiseño
263
279
  lun: Lunda
264
280
  luo: Luo (Kenya and Tanzania)
265
281
  lus: Lushai
@@ -274,12 +290,13 @@ man: Mandingo
274
290
  mao: Maori
275
291
  map: Austronesian (Other)
276
292
  mar: Marathi
277
- mas: Masai
293
+ mas: Maasai
278
294
  max: Manx
279
295
  may: Malay
296
+ mdf: Moksha
280
297
  mdr: Mandar
281
298
  men: Mende
282
- mga: Irish, Middle (ca. 1100-1550)
299
+ mga: "Irish, Middle (ca. 1100-1550)"
283
300
  mic: Micmac
284
301
  min: Minangkabau
285
302
  mis: Miscellaneous languages
@@ -293,12 +310,14 @@ mno: Manobo languages
293
310
  moh: Mohawk
294
311
  mol: Moldavian
295
312
  mon: Mongolian
296
- mos: Moore
313
+ mos: Mooré
297
314
  mul: Multiple languages
298
315
  mun: Munda (Other)
299
316
  mus: Creek
317
+ mwl: Mirandese
300
318
  mwr: Marwari
301
319
  myn: Mayan languages
320
+ myv: Erzya
302
321
  nah: Nahuatl
303
322
  nai: North American Indian (Other)
304
323
  nap: Neapolitan Italian
@@ -314,12 +333,14 @@ nia: Nias
314
333
  nic: Niger-Kordofanian (Other)
315
334
  niu: Niuean
316
335
  nno: Norwegian (Nynorsk)
317
- nob: Norwegian (Bokmal)
336
+ nob: Norwegian (Bokmål)
318
337
  nog: Nogai
319
338
  non: Old Norse
320
339
  nor: Norwegian
340
+ nqo: N'Ko
321
341
  nso: Northern Sotho
322
342
  nub: Nubian languages
343
+ nwc: "Newari, Old"
323
344
  nya: Nyanja
324
345
  nym: Nyamwezi
325
346
  nyn: Nyankole
@@ -331,7 +352,7 @@ ori: Oriya
331
352
  orm: Oromo
332
353
  osa: Osage
333
354
  oss: Ossetic
334
- ota: Turkish, Ottoman
355
+ ota: "Turkish, Ottoman"
335
356
  oto: Otomian languages
336
357
  paa: Papuan (Other)
337
358
  pag: Pangasinan
@@ -346,10 +367,10 @@ phi: Philippine (Other)
346
367
  phn: Phoenician
347
368
  pli: Pali
348
369
  pol: Polish
349
- pon: Ponape
370
+ pon: Pohnpeian
350
371
  por: Portuguese
351
372
  pra: Prakrit languages
352
- pro: Provencal (to 1500)
373
+ pro: Provençal (to 1500)
353
374
  pus: Pushto
354
375
  que: Quechua
355
376
  raj: Rajasthani
@@ -360,6 +381,7 @@ roh: Raeto-Romance
360
381
  rom: Romani
361
382
  rum: Romanian
362
383
  run: Rundi
384
+ rup: Aromanian
363
385
  rus: Russian
364
386
  sad: Sandawe
365
387
  sag: Sango (Ubangi Creole)
@@ -372,11 +394,12 @@ sao: Samoan
372
394
  sas: Sasak
373
395
  sat: Santali
374
396
  scc: Serbian
397
+ scn: Sicilian Italian
375
398
  sco: Scots
376
399
  scr: Croatian
377
400
  sel: Selkup
378
401
  sem: Semitic (Other)
379
- sga: Irish, Old (to 1100)
402
+ sga: "Irish, Old (to 1100)"
380
403
  sgn: Sign languages
381
404
  shn: Shan
382
405
  sho: Shona
@@ -404,6 +427,8 @@ son: Songhai
404
427
  sot: Sotho
405
428
  spa: Spanish
406
429
  srd: Sardinian
430
+ srn: Sranan
431
+ srp: Serbian
407
432
  srr: Serer
408
433
  ssa: Nilo-Saharan (Other)
409
434
  sso: Sotho
@@ -415,7 +440,8 @@ sux: Sumerian
415
440
  swa: Swahili
416
441
  swe: Swedish
417
442
  swz: Swazi
418
- syr: Syriac
443
+ syc: Syriac
444
+ syr: "Syriac, Modern"
419
445
  tag: Tagalog
420
446
  tah: Tahitian
421
447
  tai: Tai (Other)
@@ -431,10 +457,11 @@ tgk: Tajik
431
457
  tgl: Tagalog
432
458
  tha: Thai
433
459
  tib: Tibetan
434
- tig: Tigre
460
+ tig: Tigré
435
461
  tir: Tigrinya
436
462
  tiv: Tiv
437
463
  tkl: Tokelauan
464
+ tlh: Klingon (Artificial language)
438
465
  tli: Tlingit
439
466
  tmh: Tamashek
440
467
  tog: Tonga (Nyasa)
@@ -464,17 +491,17 @@ uzb: Uzbek
464
491
  vai: Vai
465
492
  ven: Venda
466
493
  vie: Vietnamese
467
- vol: Volapuk
494
+ vol: Volapük
468
495
  vot: Votic
469
496
  wak: Wakashan languages
470
- wal: Walamo
497
+ wal: Wolayta
471
498
  war: Waray
472
- was: Washo
499
+ was: Washoe
473
500
  wel: Welsh
474
- wen: Sorbian languages
501
+ wen: Sorbian (Other)
475
502
  wln: Walloon
476
503
  wol: Wolof
477
- xal: Kalmyk
504
+ xal: Oirat
478
505
  xho: Xhosa
479
506
  yao: Yao (Africa)
480
507
  yap: Yapese
@@ -482,9 +509,11 @@ yid: Yiddish
482
509
  yor: Yoruba
483
510
  ypk: Yupik languages
484
511
  zap: Zapotec
512
+ zbl: Blissymbolics
485
513
  zen: Zenaga
486
514
  zha: Zhuang
487
- znd: Zande
515
+ znd: Zande languages
488
516
  zul: Zulu
489
517
  zun: Zuni
490
- # zxx: null
518
+ # zxx: No linguistic content
519
+ zza: Zaza