oai_schedules 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 86d2572fbf72c1d75ab9b83efcbb739726962127953458f438cf5ac20278997d
4
- data.tar.gz: b4fe62d6a8280e9bd55c2337d10615768808d0d5ccbeef469a8ad4a7a55b0b05
3
+ metadata.gz: dac0dec1b24093321fea8bb4e8fe05a315255d4a4418f0ff72803f6af902196f
4
+ data.tar.gz: 590c1f29d049ee82a4352ec3d5a79fbd070013be36f4f08a166cf34323c0afd1
5
5
  SHA512:
6
- metadata.gz: 1294b5a37cf3d9ef098c5d1484d1853e5c5990148348b735c7a22809cbab963c1d22e1fae8b3ab525107a21368483ac8eb600f954293dde41a6039db4fed6097
7
- data.tar.gz: bc8a60e47cd6f1778483556e644362e976b97456b5c30ae6c320b526b55958e2852011c002c3268afd15c003f835a03b847d0a9173f4590e418ccfece198f447
6
+ metadata.gz: e7fa2c85188069c4997fe4fb91e39f05b6f9ec250184b60868729701a1fefd16fa3fa830cf993913454e5acb8ea4649d2b16a4bad6fcee499053dbe929b5b105
7
+ data.tar.gz: 50b0ed8e1955e52fe4ea25bd66a38abd397be64d2986ee608c042502091b44e58835110fc18d5406c1c28c431c6e741455091a824ce592c7e8bce4fead953c34
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## [0.9.0] - 2025-07-15
2
+
3
+ - Restored resumption token logic (the only correct one)
4
+ - Fixed Zulu time issues
5
+ - Added more info in the schedule state
6
+
1
7
  ## [0.8.0] - 2025-07-10
2
8
 
3
9
  - Complete harvesting is now within a single thread tick
File without changes
@@ -0,0 +1,41 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "http://lag.hosting.deventit.net/atlantispubliek/oai.axd",
5
+ "repository_name": "Liberas",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "support@deventit.nl"
9
+ ],
10
+ "earliest_datestamp": "1900-01-01T01:01:01+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "APEX",
15
+ "EAC",
16
+ "EAD"
17
+ ],
18
+ "set": [
19
+ "APEX",
20
+ "APEXAO",
21
+ "EAC",
22
+ "EAD"
23
+ ],
24
+ "id": "93b6d240ac9b782664f18823b761a128fc19116506dd228763b52ba8fb64e1b9"
25
+ },
26
+ "active": true,
27
+ "transformer": {
28
+ "transformer_name": [
29
+ "dummy_transformer"
30
+ ],
31
+ "type": {
32
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
33
+ "value": "dummy_type"
34
+ },
35
+ "uri": "http://dummy-uri.org/",
36
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
37
+ },
38
+ "format": "EAD",
39
+ "set": "EAD",
40
+ "id": "deventit"
41
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://heron.libis.be/ca_veb_q/admin/service.php/OAI/cw_organisaties/request",
5
+ "repository_name": "VEB",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "collectiveaccess@vlaamse-erfgoedbibliotheken.be"
9
+ ],
10
+ "earliest_datestamp": "2020-08-17T12:55:02+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "oai_veb"
16
+ ],
17
+ "set": [
18
+ "1"
19
+ ],
20
+ "id": "4886e3793003d2b1dffbbdb41ca13024c7c809892314ab301ded0d7ecfd7b469"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_veb",
35
+ "set": "1",
36
+ "from": "1900-06-10T12:39:00+00:00",
37
+ "id": "heron"
38
+ }
@@ -8,6 +8,7 @@ require 'logger'
8
8
  require 'data_collector'
9
9
  require 'iso8601'
10
10
  require 'date'
11
+ require 'active_support/core_ext/time'
11
12
 
12
13
 
13
14
 
@@ -49,6 +50,7 @@ module OAISchedules
49
50
  REQUEST_HARVEST = 2
50
51
  DONE_HARVEST = 3
51
52
  DONE_FULL_HARVEST = 4
53
+ RESTART = 5
52
54
  end
53
55
 
54
56
  class StateMachineHarvesting
@@ -103,6 +105,15 @@ module OAISchedules
103
105
  @state
104
106
  end
105
107
 
108
+ when StateHarvesting::COMPLETE
109
+
110
+ case event
111
+ when EventHarvesting::RESTART
112
+ @state = StateHarvesting::NOT_IDENTIFIED
113
+ else
114
+ @state
115
+ end
116
+
106
117
  else
107
118
  @state
108
119
  end
@@ -228,6 +239,7 @@ module OAISchedules
228
239
 
229
240
 
230
241
  def init_schedule_state(state)
242
+ state["use_resumption_token"] = nil
231
243
  state["resumption_token"] = nil
232
244
  state["expiration_date_resumption_token"] = nil
233
245
  state["datetime_now"] = get_datetime_now
@@ -239,6 +251,7 @@ module OAISchedules
239
251
  state["latest_harvested_records_datestamp"] = nil
240
252
  state["harvesting"] = false
241
253
  state["error"] = nil
254
+ state["status"] = nil
242
255
  end
243
256
 
244
257
 
@@ -357,8 +370,9 @@ module OAISchedules
357
370
  when StateHarvesting::IDLE
358
371
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
359
372
  when StateHarvesting::HARVESTING
360
- use_resumption_token = false # don't change
361
- state["harvesting"] = true
373
+ # use_resumption_token = false
374
+ use_resumption_token = true
375
+ state["use_resumption_token"] = use_resumption_token
362
376
  format = content["format"] || ""
363
377
  fmt_dt = content.dig("repository", "granularity")
364
378
  from = convert_datetime(content["from"], fmt_dt) || ""
@@ -367,91 +381,101 @@ module OAISchedules
367
381
  resumption_token = ""
368
382
  if use_resumption_token
369
383
  if !state["resumption_token"].nil?
370
- format = ""
371
- from = ""
372
- to = ""
373
- set = ""
374
- resumption_token = state["resumption_token"]
384
+ # if state["expiration_date_resumption_token"].nil? || (!state["expiration_date_resumption_token"].nil? && (DateTime.parse(state["expiration_date_resumption_token"]) - DateTime.now) > 0)
385
+ format = ""
386
+ from = ""
387
+ to = ""
388
+ set = ""
389
+ resumption_token = state["resumption_token"]
390
+ # end
375
391
  end
376
392
  else
377
393
  from = add_eps_to_datetime(state["latest_harvested_records_datestamp"], fmt_dt) || from
378
394
  end
379
395
  data = nil
380
396
  error = nil
381
- to_pause = true
397
+ err_info = nil
398
+ harvesting = true
382
399
  done = false
383
- if state["done"]
384
- state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
385
- done = true
386
- else
387
- begin
388
- data = oai_get_records(
389
- name,
390
- content["repository"]["uri"],
391
- format,
392
- from,
393
- to,
394
- set,
395
- resumption_token
396
- )
397
- state["resumption_token"] = nil
398
- state["expiration_date_resumption_token"] = nil
399
- if use_resumption_token
400
- if data["resumptionToken"].is_a?(String)
401
- state["resumption_token"] = data["resumptionToken"]
402
- elsif data["resumptionToken"].is_a?(Hash)
403
- data_token = data["resumptionToken"]
404
- state["resumption_token"] = data_token["$text"]
405
- state["expiration_date_resumption_token"] = data_token["_expirationDate"]
406
- end
407
- end
408
- n_records = data["record"].size
409
- state["count_harvested_records"] += n_records
410
- if n_records > 0
411
- timestamps = data["record"].map do |record|
412
- record["header"]["datestamp"]
413
- end.sort
414
- timestamp_latest = timestamps[-1].strftime('%FT%TZ')
415
- state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
400
+ status = nil
401
+ begin
402
+ data = oai_get_records(
403
+ name,
404
+ content["repository"]["uri"],
405
+ format,
406
+ from,
407
+ to,
408
+ set,
409
+ resumption_token
410
+ )
411
+ state["resumption_token"] = nil
412
+ state["expiration_date_resumption_token"] = nil
413
+ if use_resumption_token
414
+ if data["resumptionToken"].is_a?(String)
415
+ state["resumption_token"] = data["resumptionToken"]
416
+ elsif data["resumptionToken"].is_a?(Hash)
417
+ data_token = data["resumptionToken"]
418
+ state["resumption_token"] = data_token["$text"]
419
+ state["expiration_date_resumption_token"] = data_token["_expirationDate"]
416
420
  end
417
- state["count_success"] += 1
418
- rescue StandardError => e
419
- state["count_fails"] += 1
420
- error = e
421
421
  end
422
- if error.nil?
423
- state["error"] = nil
424
- if use_resumption_token
425
- if state["resumption_token"].nil?
426
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
427
- to_pause = true
428
- else
429
- to_pause = false
430
- end
422
+ n_records = data["record"].size
423
+ state["count_harvested_records"] += n_records
424
+ if n_records > 0
425
+ timestamps = data["record"].map do |record|
426
+ record["header"]["datestamp"]
427
+ end.sort
428
+ timestamp_latest = timestamps[-1].strftime('%FT%TZ')
429
+ state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
430
+ end
431
+ state["count_success"] += 1
432
+ rescue StandardError => e
433
+ state["count_fails"] += 1
434
+ error = e
435
+ end
436
+ if error.nil?
437
+ err_info = nil
438
+ if use_resumption_token
439
+ if state["resumption_token"].nil?
440
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
441
+ harvesting = false
442
+ status = "next harvesting: #{state["datetime_next_harvesting"]}"
431
443
  else
432
- to_pause = false
444
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
445
+ harvesting = true
446
+ status = "harvesting"
433
447
  end
434
448
  else
435
- state["error"] = {
436
- "message" => error.message,
437
- "backtrace" => error.backtrace
438
- }
439
449
  state_machine.add_event(EventHarvesting::DONE_HARVEST)
440
- to_pause = true
450
+ harvesting = true
451
+ status = "harvesting"
441
452
  end
442
- state["done"] = done
443
- state["harvesting"] = !to_pause
444
- state["datetime_now"] = get_datetime_now
453
+ else
454
+ err_info = {
455
+ "message" => error.message,
456
+ "backtrace" => error.backtrace
457
+ }
458
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
459
+ harvesting = false
460
+ status = "error (see logs), next harvesting: #{state["datetime_next_harvesting"]}"
445
461
  end
462
+ state["done"] = done
463
+ state["harvesting"] = harvesting
464
+ state["datetime_now"] = get_datetime_now
465
+ state["status"] = status
466
+ state["error"] = err_info
446
467
  path_file_state = get_path_state_file_from_schedule_name(name)
447
468
  @logger.info("#{name}: writing to state file #{path_file_state}")
448
469
  write_state_file(path_file_state, state)
449
470
  @f_digest&.call(name, content, data, done, error, state, @logger)
450
- break if to_pause
451
471
  when StateHarvesting::COMPLETE
452
472
  @logger.warn("#{name}: full harvesting complete")
453
- content["active"] = false
454
- handle_schedule_task(name)
473
+ state_machine.add_event(EventHarvesting::RESTART)
474
+ auto_deactivate_schedule = false
475
+ if auto_deactivate_schedule
476
+ content["active"] = false
477
+ handle_schedule_task(name)
478
+ end
455
479
  break
456
480
  else
457
481
  @logger.warn("#{name}: state #{state_machine.state} not known")
@@ -580,7 +604,7 @@ module OAISchedules
580
604
  end
581
605
 
582
606
  def get_datetime_now
583
- DateTime.now.strftime('%FT%TZ')
607
+ Time.current.in_time_zone('Zulu').strftime('%FT%TZ')
584
608
  end
585
609
 
586
610
  def interval_iso8601_to_seconds(str_interval)
@@ -589,8 +613,8 @@ module OAISchedules
589
613
 
590
614
  def get_datetime_next_schedule_tick_from_now(str_interval)
591
615
  interval_s = interval_iso8601_to_seconds(str_interval)
592
- dt = DateTime.now
593
- dt += 1.0*interval_s/(24*60*60)
616
+ dt = Time.current.in_time_zone('Zulu')
617
+ dt += interval_s
594
618
  dt.strftime('%FT%TZ')
595
619
  end
596
620
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.8.0"
4
+ VERSION = "0.9.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,28 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-07-10 00:00:00.000000000 Z
10
+ date: 2025-07-15 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: activesupport
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: 8.0.2
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: 8.0.2
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: concurrent-ruby
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -78,12 +92,15 @@ files:
78
92
  - LICENSE.txt
79
93
  - README.md
80
94
  - Rakefile
81
- - examples/dir_schedules/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
82
- - examples/dir_schedules/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
83
- - examples/dir_schedules/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
84
- - examples/dir_schedules/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
85
- - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
86
- - examples/dir_schedules/schedule_sample.json
95
+ - examples/dir_schedules/.gitkeep
96
+ - examples/dir_schedules_all/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
97
+ - examples/dir_schedules_all/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
98
+ - examples/dir_schedules_all/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
99
+ - examples/dir_schedules_all/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
100
+ - examples/dir_schedules_all/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
101
+ - examples/dir_schedules_all/schedule_deventit.json
102
+ - examples/dir_schedules_all/schedule_heron.json
103
+ - examples/dir_schedules_all/schedule_sample.json
87
104
  - examples/dir_state/.gitkeep
88
105
  - examples/example_01.rb
89
106
  - examples/example_02.rb