oai_schedules 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3f254ace930dec7e151a5af485130dd23623548d6f9d0860a764b90ce217c4bb
4
- data.tar.gz: bfe93c7f19a894976c53f3d2e3d720e048c77c116b92283034ba7b17d4efe386
3
+ metadata.gz: 93b5d732552fa18fc37680009dd21bf9509555c5cfeec8c26b2b20a37967f606
4
+ data.tar.gz: cb282dfcd7e8866faa698e5f310ba07223e93d965395734a26665e531cca3ce0
5
5
  SHA512:
6
- metadata.gz: bbd9df7881c619f2660dd6b622adf2fd7ec6fa35cde400ff3490b1001b3635c7d89605882e53c9b242cc992de6d0153a7ec5d25d5b41e0672853b997924900a0
7
- data.tar.gz: 918414787e68f89b8dab0e692096e708864ca7b32a81d1341c0cc09e72fbb70d2cd886a137f024e5946b6176b7d4b1b20d606ec12d33c81894c65588ca9c712e
6
+ metadata.gz: 54a2044fe8446bf0951415302b68d06dccb3c36fc2628be888681418d1a4cecfcfd2723adbce8dd61656a3081087516e40c28e40acaa9037fdbeda88797f9d28
7
+ data.tar.gz: 553f101400c60d6f1134f8ea97c075183a595059737605ab49ede6ef8620a5e0b15d47b2f82ff4253036c485489ddebc0059074f3d87bc3c2fc523d98e067600
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## [0.7.0] - 2025-07-09
2
+
3
+ - Partial harvest policy changed: resumption tokens are no longer used
4
+ (their expiration time adds complication); instead, the "from" query param
5
+ is set to the datestamp of the most recent record fetched + eps (one unit of the repository granularity)
6
+ - Bugfix: "from" and "to" query params now have correct format based on granularity
7
+ - Better exception handling
8
+ - Some cleanup and miscellaneous bug fixes
9
+ - Added more sample schedules
10
+ - Added more info in the schedule state
11
+
1
12
  ## [0.4.0] - 2025-03-28
2
13
 
3
14
  - Added logger to digestion function
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://opac.amsab.be/OAI/Server",
5
+ "repository_name": "Amsab Online Catalog",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "help@amsab.be"
9
+ ],
10
+ "earliest_datestamp": "2000-01-01T00:00:00+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "marc21"
16
+ ],
17
+ "set": [
18
+ "collection"
19
+ ],
20
+ "id": "490cac7921d1801e6d398aa09bca3ef8f51d07e7dae7cc25676d1ea94126483f"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_dc",
35
+ "set": "collection",
36
+ "from": "2000-01-01T00:00:00+00:00",
37
+ "id": "495607cf-f773-463f-8ee1-77d0f53e0c29"
38
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://heron.libis.be/ca_veb_q/admin/service.php/OAI/cw_organisaties/request",
5
+ "repository_name": "VEB",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "collectiveaccess@vlaamse-erfgoedbibliotheken.be"
9
+ ],
10
+ "earliest_datestamp": "2020-08-17T12:55:02+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "oai_veb"
16
+ ],
17
+ "set": [
18
+ "1"
19
+ ],
20
+ "id": "4886e3793003d2b1dffbbdb41ca13024c7c809892314ab301ded0d7ecfd7b469"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_dc",
35
+ "set": "1",
36
+ "from": "2025-06-10T12:39:00+00:00",
37
+ "id": "5e344861-806b-4361-98f7-a0be6a5984de"
38
+ }
@@ -0,0 +1,41 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "http://lag.hosting.deventit.net/atlantispubliek/oai.axd",
5
+ "repository_name": "Liberas",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "support@deventit.nl"
9
+ ],
10
+ "earliest_datestamp": "1900-01-01T01:01:01+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "APEX",
15
+ "EAC",
16
+ "EAD"
17
+ ],
18
+ "set": [
19
+ "APEX",
20
+ "APEXAO",
21
+ "EAC",
22
+ "EAD"
23
+ ],
24
+ "id": "93b6d240ac9b782664f18823b761a128fc19116506dd228763b52ba8fb64e1b9"
25
+ },
26
+ "active": true,
27
+ "transformer": {
28
+ "transformer_name": [
29
+ "dummy_transformer"
30
+ ],
31
+ "type": {
32
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
33
+ "value": "dummy_type"
34
+ },
35
+ "uri": "http://dummy-uri.org/",
36
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
37
+ },
38
+ "format": "EAD",
39
+ "set": "EAD",
40
+ "id": "712f46ab-f87f-4db8-b69e-7101d9e2ae61"
41
+ }
@@ -0,0 +1,62 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://eudml.org/oai/OAIHandler",
5
+ "repository_name": "REPOX Repository",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "mailto:gilberto.pedrosa@ist.utl.pt"
9
+ ],
10
+ "earliest_datestamp": "1970-01-01T00:00:00+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DD",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "ese",
16
+ "eudml-article2",
17
+ "eudml-book2"
18
+ ],
19
+ "set": [
20
+ "BDIM",
21
+ "BulDML",
22
+ "CEDRAM",
23
+ "DMLE",
24
+ "DML_CZ_Monograph",
25
+ "DML_CZ_Proceeding",
26
+ "DML_CZ_Serial",
27
+ "EDPS",
28
+ "ELibM",
29
+ "GALLICA",
30
+ "GDZ_Band",
31
+ "GDZ_Mathematica",
32
+ "GDZ_Monographs",
33
+ "GDZ_RusDML",
34
+ "HDML_Books",
35
+ "HDML_Conferences",
36
+ "HDML_Journals",
37
+ "MISANU",
38
+ "NUMDAM",
39
+ "NUMDAM_book",
40
+ "PLDML",
41
+ "PLDML_book",
42
+ "PMath"
43
+ ],
44
+ "id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
45
+ },
46
+ "active": true,
47
+ "transformer": {
48
+ "transformer_name": [
49
+ "dummy_transformer"
50
+ ],
51
+ "type": {
52
+ "id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
53
+ "value": "dummy_type"
54
+ },
55
+ "uri": "http://dummy-uri.org/",
56
+ "id": "7889ce03-28d9-479d-bb9a-b239f179453a"
57
+ },
58
+ "format": "oai_dc",
59
+ "set": "CEDRAM",
60
+ "from": "1970-01-01T00:00:00+00:00",
61
+ "id": "99922363-5b37-4438-a274-5a4a5167f811"
62
+ }
@@ -1,5 +1,5 @@
1
1
  {
2
- "interval": "P1W",
2
+ "interval": "PT5S",
3
3
  "repository": {
4
4
  "uri": "https://eudml.org/oai/OAIHandler",
5
5
  "repository_name": "REPOX Repository",
@@ -58,6 +58,5 @@
58
58
  "format": "oai_dc",
59
59
  "set": "CEDRAM",
60
60
  "from": "1970-01-01T00:00:00+00:00",
61
- "until": "1999-01-12T09:43:02+00:00",
62
61
  "id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
63
62
  }
@@ -1,10 +1,11 @@
1
1
  {
2
- "interval": "PT2S",
2
+ "interval": "PT5S",
3
3
  "active": true,
4
4
  "repository": {
5
- "uri": "https://eu.alma.exlibrisgroup.com/view/oai/32KUL_KUL/request"
5
+ "uri": "https://eu.alma.exlibrisgroup.com/view/oai/32KUL_KUL/request",
6
+ "granularity": "YYYY-MM-DDThh:mm:ssZ"
6
7
  },
7
8
  "format": "marc21",
8
9
  "set": "KUL_Rapid_Journals_Print_LendableInternational",
9
- "from": "2025-03-23T00:00:00Z"
10
+ "from": "2024-03-01T00:00:00Z"
10
11
  }
@@ -9,8 +9,9 @@ f_show = lambda do |name, content, records, done, error, state, logger|
9
9
  # ... do your stuff with records ...
10
10
  else
11
11
  puts error.message
12
+ puts error.backtrace
12
13
  end
13
- puts state
14
+ puts JSON.pretty_generate(state)
14
15
  if done
15
16
  puts "done full harvesting"
16
17
  end
@@ -9,8 +9,9 @@ f_show = lambda do |name, content, records, done, error, state, logger|
9
9
  # ... do your stuff with records ...
10
10
  else
11
11
  puts error.message
12
+ puts error.backtrace
12
13
  end
13
- puts state
14
+ puts JSON.pretty_generate(state)
14
15
  if done
15
16
  puts "done full harvesting"
16
17
  end
@@ -7,7 +7,7 @@ require 'concurrent-ruby'
7
7
  require 'logger'
8
8
  require 'data_collector'
9
9
  require 'iso8601'
10
-
10
+ require 'date'
11
11
 
12
12
 
13
13
 
@@ -181,6 +181,9 @@ module OAISchedules
181
181
  state = {}
182
182
  if File.file?(path_file_state)
183
183
  state = read_state_file(path_file_state)
184
+ else
185
+ init_schedule_state(state)
186
+ write_state_file(path_file_state, state)
184
187
  end
185
188
  # create task
186
189
  task = Concurrent::TimerTask.new(run_now: false) {
@@ -224,14 +227,25 @@ module OAISchedules
224
227
  private
225
228
 
226
229
 
230
# Resets the harvesting bookkeeping of a schedule to its initial values.
#
# The given hash is mutated in place; keys that are not listed below are
# left untouched.
#
# @param state [Hash] schedule state, keyed by String
# @return [nil]
def init_schedule_state(state)
  initial_values = {
    "resumption_token" => nil,
    "expiration_date_resumption_token" => nil,
    "datetime_now" => get_datetime_now,
    "count_success" => 0,
    "count_fails" => 0,
    "done" => false,
    "count_harvested_records" => 0,
    "latest_harvested_records_datestamp" => nil
  }
  initial_values.each { |key, value| state[key] = value }
  nil
end
240
+
241
+
227
242
  def handle_schedule_state_at_schedule_change(name, content)
228
- # invalidate resumption token if either schedule format or set changes
229
243
  state = @schedules[name][:state]
230
244
  if (content["format"] != @schedules[name][:content]["format"]) \
231
- || (content["set"] != @schedules[name][:content]["set"])
232
- state["resumption_token"] = nil
233
- state["count_success"] = 0
234
- state["count_fails"] = 0
245
+ || (content["set"] != @schedules[name][:content]["set"]) \
246
+ || (content["from"] != @schedules[name][:content]["from"]) \
247
+ || (content["until"] != @schedules[name][:content]["until"])
248
+ init_schedule_state(state)
235
249
  end
236
250
  end
237
251
 
@@ -336,62 +350,91 @@ module OAISchedules
336
350
  )
337
351
  state["identify"] = data
338
352
  state_machine.add_event(EventHarvesting::DONE_IDENTIFY)
339
- break
340
353
  when StateHarvesting::IDLE
341
354
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
342
355
  when StateHarvesting::HARVESTING
343
- unless state.has_key?("count_success")
344
- state["count_success"] = 0
345
- end
346
- unless state.has_key?("count_fails")
347
- state["count_fails"] = 0
348
- end
349
- if !state["resumption_token"].nil?
350
- format = ""
351
- from = ""
352
- to = ""
353
- set = ""
354
- resumption_token = state["resumption_token"]
356
+ use_resumption_token = false # don't change
357
+ format = content["format"] || ""
358
+ fmt_dt = content.dig("repository", "granularity")
359
+ from = convert_datetime(content["from"], fmt_dt) || ""
360
+ to = convert_datetime(content["until"], fmt_dt) || ""
361
+ set = content["set"] || ""
362
+ resumption_token = ""
363
+ if use_resumption_token
364
+ if !state["resumption_token"].nil?
365
+ format = ""
366
+ from = ""
367
+ to = ""
368
+ set = ""
369
+ resumption_token = state["resumption_token"]
370
+ end
355
371
  else
356
- format = content["format"] || ""
357
- from = content["from"] || ""
358
- to = content["until"] || ""
359
- set = content["set"] || ""
360
- resumption_token = ""
372
+ from = add_eps_to_datetime(state["latest_harvested_records_datestamp"], fmt_dt) || from
361
373
  end
362
374
  data = nil
363
375
  error = nil
364
- begin
365
- data = oai_get_records(
366
- name,
367
- content["repository"]["uri"],
368
- format,
369
- from,
370
- to,
371
- set,
372
- resumption_token
373
- )
374
- state["resumption_token"] = data["resumptionToken"]
375
- state["count_success"] += 1
376
- rescue StandardError => e
377
- state["count_fails"] += 1
378
- error = e
376
+ to_pause = true
377
+ done = false
378
+ if state["done"]
379
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
380
+ done = true
381
+ else
382
+ begin
383
+ data = oai_get_records(
384
+ name,
385
+ content["repository"]["uri"],
386
+ format,
387
+ from,
388
+ to,
389
+ set,
390
+ resumption_token
391
+ )
392
+ state["resumption_token"] = nil
393
+ state["expiration_date_resumption_token"] = nil
394
+ if use_resumption_token
395
+ if data["resumptionToken"].is_a?(String)
396
+ state["resumption_token"] = data["resumptionToken"]
397
+ elsif data["resumptionToken"].is_a?(Hash)
398
+ data_token = data["resumptionToken"]
399
+ state["resumption_token"] = data_token["$text"]
400
+ state["expiration_date_resumption_token"] = data_token["_expirationDate"]
401
+ end
402
+ end
403
+ state["datetime_now"] = get_datetime_now
404
+ n_records = data["record"].size
405
+ state["count_harvested_records"] += n_records
406
+ if n_records > 0
407
+ timestamps = data["record"].map do |record|
408
+ record["header"]["datestamp"]
409
+ end.sort
410
+ timestamp_latest = timestamps[-1].strftime('%FT%TZ')
411
+ state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
412
+ end
413
+ state["count_success"] += 1
414
+ rescue StandardError => e
415
+ state["count_fails"] += 1
416
+ error = e
417
+ end
418
+ if error.nil?
419
+ if use_resumption_token
420
+ if state["resumption_token"].nil?
421
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
422
+ to_pause = true
423
+ else
424
+ to_pause = false
425
+ end
426
+ else
427
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
428
+ to_pause = true
429
+ end
430
+ end
431
+ state["done"] = done
379
432
  end
380
433
  path_file_state = get_path_state_file_from_schedule_name(name)
381
434
  @logger.info("#{name}: writing to state file #{path_file_state}")
382
435
  write_state_file(path_file_state, state)
383
- done = false
384
- if error.nil?
385
- if !data["resumptionToken"].nil?
386
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
387
- done = false
388
- else
389
- state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
390
- done = true
391
- end
392
- end
393
436
  @f_digest&.call(name, content, data, done, error, state, @logger)
394
- break
437
+ break if to_pause
395
438
  when StateHarvesting::COMPLETE
396
439
  @logger.warn("#{name}: full harvesting complete")
397
440
  content["active"] = false
@@ -409,11 +452,19 @@ module OAISchedules
409
452
  verb = "Identify"
410
453
  url_query = "#{url_base}?verb=#{verb}"
411
454
  @logger.info("#{name}: fetching from #{url_query}")
412
- data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
455
+ input = DataCollector::Input.new.from_uri(url_query)
456
+ if input.nil?
457
+ raise StandardError, "#{url_query}, service not found"
458
+ end
459
+ data = DataCollector::Core.filter(input, "$..#{verb}")
413
460
  if data.empty?
414
- raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}"
461
+ error = DataCollector::Core.filter(input, "$..error")
462
+ error = error[0]
463
+ name_error = error['_code']
464
+ text_error = error['$text']
465
+ raise StandardError, "#{name}: #{url_query}: #{name_error}, #{text_error}"
415
466
  end
416
- data = data[0]
467
+ data[0]
417
468
  rescue DataCollector::InputError => e
418
469
  raise RuntimeError, "#{name}: #{url_query} not found, or server error"
419
470
  rescue StandardError => e
@@ -440,11 +491,22 @@ module OAISchedules
440
491
  url_query += "&resumptionToken=#{resumption_token}"
441
492
  end
442
493
  @logger.info("#{name}: fetching from #{url_query}")
443
- data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
494
+ input = DataCollector::Input.new.from_uri(url_query)
495
+ if input.nil?
496
+ raise StandardError, "#{url_query}, service not found"
497
+ end
498
+ data = DataCollector::Core.filter(input, "$..#{verb}")
444
499
  if data.empty?
445
- raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}, or missing query parameters (e.g. set)"
500
+ error = DataCollector::Core.filter(input, "$..error")
501
+ error = error[0]
502
+ name_error = error['_code']
503
+ text_error = error['$text']
504
+ raise StandardError, "#{name}: #{url_query}: #{name_error}, #{text_error}"
446
505
  end
447
- data = data[0]
506
+ if data.compact.empty?
507
+ raise StandardError, "#{name}: #{url_query}: no records found"
508
+ end
509
+ data[0]
448
510
  rescue DataCollector::InputError => e
449
511
  raise RuntimeError, "#{name}: #{url_query} not found, or server error"
450
512
  rescue StandardError => e
@@ -477,6 +539,37 @@ module OAISchedules
477
539
  modify_schedule(name_schedule, content_schedule)
478
540
  end
479
541
 
542
# Formats a datetime string according to an OAI-PMH granularity pattern.
#
# The parsed value is normalised to UTC before formatting, because the
# seconds-granularity output carries a literal "Z" suffix: without the
# normalisation an input such as "...T12:39:00+02:00" would be emitted as
# "...T12:39:00Z", i.e. shifted by two hours.
#
# @param str_dt [String, nil] datetime in any format DateTime.parse accepts
# @param fmt_dt [String, nil] 'YYYY-MM-DD' or 'YYYY-MM-DDThh:mm:ssZ'
# @return [String, nil] nil when str_dt is nil; str_dt unchanged when fmt_dt
#   is nil, when str_dt is blank, or when fmt_dt is not a known pattern;
#   otherwise the formatted UTC value
# @raise [Date::Error] when a non-blank str_dt cannot be parsed
def convert_datetime(str_dt, fmt_dt)
  return nil unless str_dt
  # A blank value means "no bound": hand it back instead of letting
  # DateTime.parse raise on it.
  return str_dt if fmt_dt.nil? || str_dt.to_s.strip.empty?

  dt = DateTime.parse(str_dt).new_offset(0)
  case fmt_dt
  when 'YYYY-MM-DD' then dt.strftime('%F')
  when 'YYYY-MM-DDThh:mm:ssZ' then dt.strftime('%FT%TZ')
  else str_dt
  end
end
555
+
556
# Returns the given datetime advanced by one unit of the granularity
# (one day or one second), formatted for that granularity.
#
# The value is normalised to UTC first so that the literal "Z" suffix of the
# seconds-granularity output is truthful, and the one-second step is an
# exact Rational day fraction rather than a Float.
#
# @param str_dt [String, nil] datetime in any format DateTime.parse accepts
# @param fmt_dt [String, nil] 'YYYY-MM-DD' or 'YYYY-MM-DDThh:mm:ssZ'
# @return [String, nil] the advanced datetime; nil when str_dt is nil or
#   blank, or when fmt_dt is not one of the two known patterns
# @raise [Date::Error] when a non-blank str_dt cannot be parsed
def add_eps_to_datetime(str_dt, fmt_dt)
  return nil unless str_dt
  return nil if str_dt.to_s.strip.empty?

  dt = DateTime.parse(str_dt).new_offset(0)
  case fmt_dt
  when 'YYYY-MM-DD'
    (dt + 1).strftime('%F')
  when 'YYYY-MM-DDThh:mm:ssZ'
    (dt + Rational(1, 24 * 60 * 60)).strftime('%FT%TZ')
  end
end
568
+
569
# Returns the current instant as an ISO 8601 string with second precision.
#
# The clock is read in UTC because the format ends with a literal "Z";
# formatting local time with that suffix would mislabel it as UTC.
#
# @return [String] e.g. "2025-07-09T08:15:30Z"
def get_datetime_now
  Time.now.utc.strftime('%FT%TZ')
end
572
+
480
573
  end
481
574
 
482
575
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.5.0"
4
+ VERSION = "0.7.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-04-29 00:00:00.000000000 Z
10
+ date: 2025-07-09 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby
@@ -78,6 +78,10 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
+ - examples/dir_schedules/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
82
+ - examples/dir_schedules/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
83
+ - examples/dir_schedules/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
84
+ - examples/dir_schedules/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
81
85
  - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
82
86
  - examples/dir_schedules/schedule_sample.json
83
87
  - examples/dir_state/.gitkeep