oai_schedules 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d2b2bae058157785c775ebd1b8c0ea76e8685adb8588280f6c6427644b9da8b5
4
- data.tar.gz: 928102f14abd5bf59bb7b55bb7e100074c5b20bec3276e1dd129a04e74c6da60
3
+ metadata.gz: 93b5d732552fa18fc37680009dd21bf9509555c5cfeec8c26b2b20a37967f606
4
+ data.tar.gz: cb282dfcd7e8866faa698e5f310ba07223e93d965395734a26665e531cca3ce0
5
5
  SHA512:
6
- metadata.gz: 35bb16497f382eb1accaecd95e1e2b022f3addeeef7a4ae79ab5bc4b9f6b023f3fbee70535815ee0edcf5e48b62fc4bdb697cba378cdbac16f114420394cf85e
7
- data.tar.gz: 3811fd28be558414ebfabcc83e1978080578ff4b33147f0da06cf569ab6b42af498489cf7bef3dd5f14b6e0dbb8598dc615d15220c7496c710f93fcea0149a3e
6
+ metadata.gz: 54a2044fe8446bf0951415302b68d06dccb3c36fc2628be888681418d1a4cecfcfd2723adbce8dd61656a3081087516e40c28e40acaa9037fdbeda88797f9d28
7
+ data.tar.gz: 553f101400c60d6f1134f8ea97c075183a595059737605ab49ede6ef8620a5e0b15d47b2f82ff4253036c485489ddebc0059074f3d87bc3c2fc523d98e067600
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## [0.7.0] - 2025-07-09
2
+
3
+ - Partial harvest policy changed: resumption tokens are no longer used
4
+ (their expiration time adds complication); instead, the "from" query param
5
+ is set to the datestamp of the most recent record fetched plus eps (one step of the repository granularity)
6
+ - Bugfix: "from" and "to" query params now have correct format based on granularity
7
+ - Better exception handling
8
+ - Some cleanup and miscellaneous bugfixes
9
+ - Added more sample schedules
10
+ - Added more info in the schedule state
11
+
1
12
  ## [0.4.0] - 2025-03-28
2
13
 
3
14
  - Added logger to digestion function
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://opac.amsab.be/OAI/Server",
5
+ "repository_name": "Amsab Online Catalog",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "help@amsab.be"
9
+ ],
10
+ "earliest_datestamp": "2000-01-01T00:00:00+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "marc21"
16
+ ],
17
+ "set": [
18
+ "collection"
19
+ ],
20
+ "id": "490cac7921d1801e6d398aa09bca3ef8f51d07e7dae7cc25676d1ea94126483f"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_dc",
35
+ "set": "collection",
36
+ "from": "2000-01-01T00:00:00+00:00",
37
+ "id": "495607cf-f773-463f-8ee1-77d0f53e0c29"
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://heron.libis.be/ca_veb_q/admin/service.php/OAI/cw_organisaties/request",
5
+ "repository_name": "VEB",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "collectiveaccess@vlaamse-erfgoedbibliotheken.be"
9
+ ],
10
+ "earliest_datestamp": "2020-08-17T12:55:02+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "oai_veb"
16
+ ],
17
+ "set": [
18
+ "1"
19
+ ],
20
+ "id": "4886e3793003d2b1dffbbdb41ca13024c7c809892314ab301ded0d7ecfd7b469"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_dc",
35
+ "set": "1",
36
+ "from": "2025-06-10T12:39:00+00:00",
37
+ "id": "5e344861-806b-4361-98f7-a0be6a5984de"
38
+ }
@@ -0,0 +1,41 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "http://lag.hosting.deventit.net/atlantispubliek/oai.axd",
5
+ "repository_name": "Liberas",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "support@deventit.nl"
9
+ ],
10
+ "earliest_datestamp": "1900-01-01T01:01:01+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "APEX",
15
+ "EAC",
16
+ "EAD"
17
+ ],
18
+ "set": [
19
+ "APEX",
20
+ "APEXAO",
21
+ "EAC",
22
+ "EAD"
23
+ ],
24
+ "id": "93b6d240ac9b782664f18823b761a128fc19116506dd228763b52ba8fb64e1b9"
25
+ },
26
+ "active": true,
27
+ "transformer": {
28
+ "transformer_name": [
29
+ "dummy_transformer"
30
+ ],
31
+ "type": {
32
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
33
+ "value": "dummy_type"
34
+ },
35
+ "uri": "http://dummy-uri.org/",
36
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
37
+ },
38
+ "format": "EAD",
39
+ "set": "EAD",
40
+ "id": "712f46ab-f87f-4db8-b69e-7101d9e2ae61"
41
+ }
@@ -0,0 +1,62 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://eudml.org/oai/OAIHandler",
5
+ "repository_name": "REPOX Repository",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "mailto:gilberto.pedrosa@ist.utl.pt"
9
+ ],
10
+ "earliest_datestamp": "1970-01-01T00:00:00+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DD",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "ese",
16
+ "eudml-article2",
17
+ "eudml-book2"
18
+ ],
19
+ "set": [
20
+ "BDIM",
21
+ "BulDML",
22
+ "CEDRAM",
23
+ "DMLE",
24
+ "DML_CZ_Monograph",
25
+ "DML_CZ_Proceeding",
26
+ "DML_CZ_Serial",
27
+ "EDPS",
28
+ "ELibM",
29
+ "GALLICA",
30
+ "GDZ_Band",
31
+ "GDZ_Mathematica",
32
+ "GDZ_Monographs",
33
+ "GDZ_RusDML",
34
+ "HDML_Books",
35
+ "HDML_Conferences",
36
+ "HDML_Journals",
37
+ "MISANU",
38
+ "NUMDAM",
39
+ "NUMDAM_book",
40
+ "PLDML",
41
+ "PLDML_book",
42
+ "PMath"
43
+ ],
44
+ "id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
45
+ },
46
+ "active": true,
47
+ "transformer": {
48
+ "transformer_name": [
49
+ "dummy_transformer"
50
+ ],
51
+ "type": {
52
+ "id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
53
+ "value": "dummy_type"
54
+ },
55
+ "uri": "http://dummy-uri.org/",
56
+ "id": "7889ce03-28d9-479d-bb9a-b239f179453a"
57
+ },
58
+ "format": "oai_dc",
59
+ "set": "CEDRAM",
60
+ "from": "1970-01-01T00:00:00+00:00",
61
+ "id": "99922363-5b37-4438-a274-5a4a5167f811"
62
+ }
@@ -1,5 +1,5 @@
1
1
  {
2
- "interval": "P1W",
2
+ "interval": "PT5S",
3
3
  "repository": {
4
4
  "uri": "https://eudml.org/oai/OAIHandler",
5
5
  "repository_name": "REPOX Repository",
@@ -58,6 +58,5 @@
58
58
  "format": "oai_dc",
59
59
  "set": "CEDRAM",
60
60
  "from": "1970-01-01T00:00:00+00:00",
61
- "until": "1999-01-12T09:43:02+00:00",
62
61
  "id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
63
62
  }
@@ -1,10 +1,11 @@
1
1
  {
2
- "interval": "PT2S",
2
+ "interval": "PT5S",
3
3
  "active": true,
4
4
  "repository": {
5
- "uri": "https://eu.alma.exlibrisgroup.com/view/oai/32KUL_KUL/request"
5
+ "uri": "https://eu.alma.exlibrisgroup.com/view/oai/32KUL_KUL/request",
6
+ "granularity": "YYYY-MM-DDThh:mm:ssZ"
6
7
  },
7
8
  "format": "marc21",
8
9
  "set": "KUL_Rapid_Journals_Print_LendableInternational",
9
- "from": "2025-03-23T00:00:00Z"
10
+ "from": "2024-03-01T00:00:00Z"
10
11
  }
@@ -9,8 +9,9 @@ f_show = lambda do |name, content, records, done, error, state, logger|
9
9
  # ... do your stuff with records ...
10
10
  else
11
11
  puts error.message
12
+ puts error.backtrace
12
13
  end
13
- puts state
14
+ puts JSON.pretty_generate(state)
14
15
  if done
15
16
  puts "done full harvesting"
16
17
  end
@@ -9,8 +9,9 @@ f_show = lambda do |name, content, records, done, error, state, logger|
9
9
  # ... do your stuff with records ...
10
10
  else
11
11
  puts error.message
12
+ puts error.backtrace
12
13
  end
13
- puts state
14
+ puts JSON.pretty_generate(state)
14
15
  if done
15
16
  puts "done full harvesting"
16
17
  end
@@ -7,7 +7,7 @@ require 'concurrent-ruby'
7
7
  require 'logger'
8
8
  require 'data_collector'
9
9
  require 'iso8601'
10
-
10
+ require 'date'
11
11
 
12
12
 
13
13
 
@@ -229,19 +229,22 @@ module OAISchedules
229
229
 
230
230
  def init_schedule_state(state)
231
231
  state["resumption_token"] = nil
232
+ state["expiration_date_resumption_token"] = nil
233
+ state["datetime_now"] = get_datetime_now
232
234
  state["count_success"] = 0
233
235
  state["count_fails"] = 0
234
236
  state["done"] = false
235
237
  state["count_harvested_records"] = 0
236
- state["latest_harvested_records_datestamp"] = ""
238
+ state["latest_harvested_records_datestamp"] = nil
237
239
  end
238
240
 
239
241
 
240
242
  def handle_schedule_state_at_schedule_change(name, content)
241
- # invalidate resumption token if either schedule format or set changes
242
243
  state = @schedules[name][:state]
243
244
  if (content["format"] != @schedules[name][:content]["format"]) \
244
- || (content["set"] != @schedules[name][:content]["set"])
245
+ || (content["set"] != @schedules[name][:content]["set"]) \
246
+ || (content["from"] != @schedules[name][:content]["from"]) \
247
+ || (content["until"] != @schedules[name][:content]["until"])
245
248
  init_schedule_state(state)
246
249
  end
247
250
  end
@@ -347,40 +350,30 @@ module OAISchedules
347
350
  )
348
351
  state["identify"] = data
349
352
  state_machine.add_event(EventHarvesting::DONE_IDENTIFY)
350
- break
351
353
  when StateHarvesting::IDLE
352
354
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
353
355
  when StateHarvesting::HARVESTING
354
- unless state.has_key?("count_success")
355
- state["count_success"] = 0
356
- end
357
- unless state.has_key?("count_fails")
358
- state["count_fails"] = 0
359
- end
360
- unless state.has_key?("done")
361
- state["done"] = false
362
- end
363
- unless state.has_key?("count_harvested_records")
364
- state["count_harvested_records"] = 0
365
- end
366
- unless state.has_key?("latest_harvested_records_datestamp")
367
- state["latest_harvested_records_datestamp"] = ""
368
- end
369
- if !state["resumption_token"].nil?
370
- format = ""
371
- from = ""
372
- to = ""
373
- set = ""
374
- resumption_token = state["resumption_token"]
356
+ use_resumption_token = false # don't change
357
+ format = content["format"] || ""
358
+ fmt_dt = content.dig("repository", "granularity")
359
+ from = convert_datetime(content["from"], fmt_dt) || ""
360
+ to = convert_datetime(content["until"], fmt_dt) || ""
361
+ set = content["set"] || ""
362
+ resumption_token = ""
363
+ if use_resumption_token
364
+ if !state["resumption_token"].nil?
365
+ format = ""
366
+ from = ""
367
+ to = ""
368
+ set = ""
369
+ resumption_token = state["resumption_token"]
370
+ end
375
371
  else
376
- format = content["format"] || ""
377
- from = content["from"] || ""
378
- to = content["until"] || ""
379
- set = content["set"] || ""
380
- resumption_token = ""
372
+ from = add_eps_to_datetime(state["latest_harvested_records_datestamp"], fmt_dt) || from
381
373
  end
382
374
  data = nil
383
375
  error = nil
376
+ to_pause = true
384
377
  done = false
385
378
  if state["done"]
386
379
  state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
@@ -396,27 +389,43 @@ module OAISchedules
396
389
  set,
397
390
  resumption_token
398
391
  )
399
- state["resumption_token"] = data["resumptionToken"]
400
- state["count_success"] += 1
392
+ state["resumption_token"] = nil
393
+ state["expiration_date_resumption_token"] = nil
394
+ if use_resumption_token
395
+ if data["resumptionToken"].is_a?(String)
396
+ state["resumption_token"] = data["resumptionToken"]
397
+ elsif data["resumptionToken"].is_a?(Hash)
398
+ data_token = data["resumptionToken"]
399
+ state["resumption_token"] = data_token["$text"]
400
+ state["expiration_date_resumption_token"] = data_token["_expirationDate"]
401
+ end
402
+ end
403
+ state["datetime_now"] = get_datetime_now
401
404
  n_records = data["record"].size
402
405
  state["count_harvested_records"] += n_records
403
406
  if n_records > 0
404
407
  timestamps = data["record"].map do |record|
405
408
  record["header"]["datestamp"]
406
409
  end.sort
407
- state["latest_harvested_records_datestamp"] = timestamps[-1]
410
+ timestamp_latest = timestamps[-1].strftime('%FT%TZ')
411
+ state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
408
412
  end
413
+ state["count_success"] += 1
409
414
  rescue StandardError => e
410
415
  state["count_fails"] += 1
411
416
  error = e
412
417
  end
413
418
  if error.nil?
414
- if !data["resumptionToken"].nil?
415
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
416
- done = false
419
+ if use_resumption_token
420
+ if state["resumption_token"].nil?
421
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
422
+ to_pause = true
423
+ else
424
+ to_pause = false
425
+ end
417
426
  else
418
- state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
419
- done = true
427
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
428
+ to_pause = true
420
429
  end
421
430
  end
422
431
  state["done"] = done
@@ -425,7 +434,7 @@ module OAISchedules
425
434
  @logger.info("#{name}: writing to state file #{path_file_state}")
426
435
  write_state_file(path_file_state, state)
427
436
  @f_digest&.call(name, content, data, done, error, state, @logger)
428
- break
437
+ break if to_pause
429
438
  when StateHarvesting::COMPLETE
430
439
  @logger.warn("#{name}: full harvesting complete")
431
440
  content["active"] = false
@@ -443,11 +452,19 @@ module OAISchedules
443
452
  verb = "Identify"
444
453
  url_query = "#{url_base}?verb=#{verb}"
445
454
  @logger.info("#{name}: fetching from #{url_query}")
446
- data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
455
+ input = DataCollector::Input.new.from_uri(url_query)
456
+ if input.nil?
457
+ raise StandardError, "#{url_query}, service not found"
458
+ end
459
+ data = DataCollector::Core.filter(input, "$..#{verb}")
447
460
  if data.empty?
448
- raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}"
461
+ error = DataCollector::Core.filter(input, "$..error")
462
+ error = error[0]
463
+ name_error = error['_code']
464
+ text_error = error['$text']
465
+ raise StandardError, "#{name}: #{url_query}: #{name_error}, #{text_error}"
449
466
  end
450
- data = data[0]
467
+ data[0]
451
468
  rescue DataCollector::InputError => e
452
469
  raise RuntimeError, "#{name}: #{url_query} not found, or server error"
453
470
  rescue StandardError => e
@@ -474,11 +491,22 @@ module OAISchedules
474
491
  url_query += "&resumptionToken=#{resumption_token}"
475
492
  end
476
493
  @logger.info("#{name}: fetching from #{url_query}")
477
- data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
494
+ input = DataCollector::Input.new.from_uri(url_query)
495
+ if input.nil?
496
+ raise StandardError, "#{url_query}, service not found"
497
+ end
498
+ data = DataCollector::Core.filter(input, "$..#{verb}")
478
499
  if data.empty?
479
- raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}, or missing query parameters (e.g. set)"
500
+ error = DataCollector::Core.filter(input, "$..error")
501
+ error = error[0]
502
+ name_error = error['_code']
503
+ text_error = error['$text']
504
+ raise StandardError, "#{name}: #{url_query}: #{name_error}, #{text_error}"
505
+ end
506
+ if data.compact.empty?
507
+ raise StandardError, "#{name}: #{url_query}: no records found"
480
508
  end
481
- data = data[0]
509
+ data[0]
482
510
  rescue DataCollector::InputError => e
483
511
  raise RuntimeError, "#{name}: #{url_query} not found, or server error"
484
512
  rescue StandardError => e
@@ -511,6 +539,37 @@ module OAISchedules
511
539
  modify_schedule(name_schedule, content_schedule)
512
540
  end
513
541
 
542
# Renders a datetime string in the textual form of an OAI-PMH granularity.
#
# The parsed value is shifted to UTC before formatting: the seconds-level
# output ends in a literal "Z" (the UTC designator), so formatting a value
# that still carries a non-zero offset would mislabel it as UTC.
#
# @param str_dt [String, nil] datetime text accepted by DateTime.parse
# @param fmt_dt [String, nil] granularity, 'YYYY-MM-DD' or 'YYYY-MM-DDThh:mm:ssZ'
# @return [String, nil] nil when str_dt is nil; str_dt unchanged when fmt_dt
#   is nil or not one of the two known granularities; otherwise the UTC value
#   formatted for that granularity
# @raise [ArgumentError] (Date::Error on recent Rubies) when fmt_dt is given
#   and str_dt cannot be parsed
def convert_datetime(str_dt, fmt_dt)
  return nil unless str_dt
  return str_dt unless fmt_dt

  # new_offset(0) keeps the same instant but expresses it in UTC.
  dt = DateTime.parse(str_dt).new_offset(0)
  case fmt_dt
  when 'YYYY-MM-DD'
    dt.strftime('%F')
  when 'YYYY-MM-DDThh:mm:ssZ'
    dt.strftime('%FT%TZ')
  else
    # Unknown granularity: pass the caller's text through untouched.
    str_dt
  end
end
555
+
556
# Advances a datetime by the smallest step of the given OAI-PMH granularity
# (one day for 'YYYY-MM-DD', one second for 'YYYY-MM-DDThh:mm:ssZ') and
# renders it in that granularity's textual form.
#
# The value is shifted to UTC first because the seconds-level output ends in
# a literal "Z". The one-second step is an exact Rational fraction of a day
# rather than a Float, so no rounding can leak into the formatted second.
#
# @param str_dt [String, nil] datetime text accepted by DateTime.parse
# @param fmt_dt [String, nil] granularity, 'YYYY-MM-DD' or 'YYYY-MM-DDThh:mm:ssZ'
# @return [String, nil] the advanced, formatted UTC value; nil when str_dt is
#   nil or fmt_dt is not one of the two known granularities
# @raise [ArgumentError] (Date::Error on recent Rubies) when str_dt cannot
#   be parsed
def add_eps_to_datetime(str_dt, fmt_dt)
  return nil unless str_dt

  dt = DateTime.parse(str_dt).new_offset(0)
  case fmt_dt
  when 'YYYY-MM-DD'
    (dt + 1).strftime('%F')
  when 'YYYY-MM-DDThh:mm:ssZ'
    (dt + Rational(1, 86_400)).strftime('%FT%TZ')
  end
end
568
+
569
# Returns the current instant as a UTC timestamp, e.g. "2025-07-09T08:15:30Z".
#
# The clock is read in UTC explicitly: the trailing "Z" is a literal in the
# format string, so formatting a local-time value would stamp local wall
# clock time as if it were UTC.
#
# @return [String] current UTC time formatted as YYYY-MM-DDThh:mm:ssZ
def get_datetime_now
  Time.now.utc.strftime('%FT%TZ')
end
572
+
514
573
  end
515
574
 
516
575
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.6.0"
4
+ VERSION = "0.7.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-04-29 00:00:00.000000000 Z
10
+ date: 2025-07-09 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby
@@ -78,6 +78,10 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
+ - examples/dir_schedules/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
82
+ - examples/dir_schedules/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
83
+ - examples/dir_schedules/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
84
+ - examples/dir_schedules/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
81
85
  - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
82
86
  - examples/dir_schedules/schedule_sample.json
83
87
  - examples/dir_state/.gitkeep