oai_schedules 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/examples/dir_schedules/.gitkeep +0 -0
- data/examples/dir_schedules_all/schedule_deventit.json +41 -0
- data/examples/dir_schedules_all/schedule_heron.json +38 -0
- data/lib/oai_schedules/manager.rb +109 -61
- data/lib/oai_schedules/version.rb +1 -1
- metadata +25 -8
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json +0 -0
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json +0 -0
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json +0 -0
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_99922363-5b37-4438-a274-5a4a5167f811.json +0 -0
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json +0 -0
- /data/examples/{dir_schedules → dir_schedules_all}/schedule_sample.json +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dac0dec1b24093321fea8bb4e8fe05a315255d4a4418f0ff72803f6af902196f
|
4
|
+
data.tar.gz: 590c1f29d049ee82a4352ec3d5a79fbd070013be36f4f08a166cf34323c0afd1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e7fa2c85188069c4997fe4fb91e39f05b6f9ec250184b60868729701a1fefd16fa3fa830cf993913454e5acb8ea4649d2b16a4bad6fcee499053dbe929b5b105
|
7
|
+
data.tar.gz: 50b0ed8e1955e52fe4ea25bd66a38abd397be64d2986ee608c042502091b44e58835110fc18d5406c1c28c431c6e741455091a824ce592c7e8bce4fead953c34
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## [0.9.0] - 2025-07-15
|
2
|
+
|
3
|
+
- Restored resumption token logic (the only one correct)
|
4
|
+
- Fixed Zulu time issues
|
5
|
+
- Added more info in the schedule state
|
6
|
+
|
7
|
+
## [0.8.0] - 2025-07-10
|
8
|
+
|
9
|
+
- Complete harvesting is now within a single thread tick
|
10
|
+
- Added more info in the schedule state
|
11
|
+
|
1
12
|
## [0.7.0] - 2025-07-09
|
2
13
|
|
3
14
|
- Partial harvest policy changed: avoid using resumption tokens
|
File without changes
|
@@ -0,0 +1,41 @@
|
|
1
|
+
{
|
2
|
+
"interval": "PT5S",
|
3
|
+
"repository": {
|
4
|
+
"uri": "http://lag.hosting.deventit.net/atlantispubliek/oai.axd",
|
5
|
+
"repository_name": "Liberas",
|
6
|
+
"protocol_version": "2.0",
|
7
|
+
"admin_email": [
|
8
|
+
"support@deventit.nl"
|
9
|
+
],
|
10
|
+
"earliest_datestamp": "1900-01-01T01:01:01+00:00",
|
11
|
+
"deleted_records": "persistent",
|
12
|
+
"granularity": "YYYY-MM-DDThh:mm:ssZ",
|
13
|
+
"metadata_format": [
|
14
|
+
"APEX",
|
15
|
+
"EAC",
|
16
|
+
"EAD"
|
17
|
+
],
|
18
|
+
"set": [
|
19
|
+
"APEX",
|
20
|
+
"APEXAO",
|
21
|
+
"EAC",
|
22
|
+
"EAD"
|
23
|
+
],
|
24
|
+
"id": "93b6d240ac9b782664f18823b761a128fc19116506dd228763b52ba8fb64e1b9"
|
25
|
+
},
|
26
|
+
"active": true,
|
27
|
+
"transformer": {
|
28
|
+
"transformer_name": [
|
29
|
+
"dummy_transformer"
|
30
|
+
],
|
31
|
+
"type": {
|
32
|
+
"id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
|
33
|
+
"value": "dummy_type"
|
34
|
+
},
|
35
|
+
"uri": "http://dummy-uri.org/",
|
36
|
+
"id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
|
37
|
+
},
|
38
|
+
"format": "EAD",
|
39
|
+
"set": "EAD",
|
40
|
+
"id": "deventit"
|
41
|
+
}
|
@@ -0,0 +1,38 @@
|
|
1
|
+
{
|
2
|
+
"interval": "PT5S",
|
3
|
+
"repository": {
|
4
|
+
"uri": "https://heron.libis.be/ca_veb_q/admin/service.php/OAI/cw_organisaties/request",
|
5
|
+
"repository_name": "VEB",
|
6
|
+
"protocol_version": "2.0",
|
7
|
+
"admin_email": [
|
8
|
+
"collectiveaccess@vlaamse-erfgoedbibliotheken.be"
|
9
|
+
],
|
10
|
+
"earliest_datestamp": "2020-08-17T12:55:02+00:00",
|
11
|
+
"deleted_records": "transient",
|
12
|
+
"granularity": "YYYY-MM-DDThh:mm:ssZ",
|
13
|
+
"metadata_format": [
|
14
|
+
"oai_dc",
|
15
|
+
"oai_veb"
|
16
|
+
],
|
17
|
+
"set": [
|
18
|
+
"1"
|
19
|
+
],
|
20
|
+
"id": "4886e3793003d2b1dffbbdb41ca13024c7c809892314ab301ded0d7ecfd7b469"
|
21
|
+
},
|
22
|
+
"active": true,
|
23
|
+
"transformer": {
|
24
|
+
"transformer_name": [
|
25
|
+
"dummy_transformer"
|
26
|
+
],
|
27
|
+
"type": {
|
28
|
+
"id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
|
29
|
+
"value": "dummy_type"
|
30
|
+
},
|
31
|
+
"uri": "http://dummy-uri.org/",
|
32
|
+
"id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
|
33
|
+
},
|
34
|
+
"format": "oai_veb",
|
35
|
+
"set": "1",
|
36
|
+
"from": "1900-06-10T12:39:00+00:00",
|
37
|
+
"id": "heron"
|
38
|
+
}
|
@@ -8,6 +8,7 @@ require 'logger'
|
|
8
8
|
require 'data_collector'
|
9
9
|
require 'iso8601'
|
10
10
|
require 'date'
|
11
|
+
require 'active_support/core_ext/time'
|
11
12
|
|
12
13
|
|
13
14
|
|
@@ -49,6 +50,7 @@ module OAISchedules
|
|
49
50
|
REQUEST_HARVEST = 2
|
50
51
|
DONE_HARVEST = 3
|
51
52
|
DONE_FULL_HARVEST = 4
|
53
|
+
RESTART = 5
|
52
54
|
end
|
53
55
|
|
54
56
|
class StateMachineHarvesting
|
@@ -103,6 +105,15 @@ module OAISchedules
|
|
103
105
|
@state
|
104
106
|
end
|
105
107
|
|
108
|
+
when StateHarvesting::COMPLETE
|
109
|
+
|
110
|
+
case event
|
111
|
+
when EventHarvesting::RESTART
|
112
|
+
@state = StateHarvesting::NOT_IDENTIFIED
|
113
|
+
else
|
114
|
+
@state
|
115
|
+
end
|
116
|
+
|
106
117
|
else
|
107
118
|
@state
|
108
119
|
end
|
@@ -228,14 +239,19 @@ module OAISchedules
|
|
228
239
|
|
229
240
|
|
230
241
|
def init_schedule_state(state)
|
242
|
+
state["use_resumption_token"] = nil
|
231
243
|
state["resumption_token"] = nil
|
232
244
|
state["expiration_date_resumption_token"] = nil
|
233
245
|
state["datetime_now"] = get_datetime_now
|
246
|
+
state["datetime_next_harvesting"] = nil
|
234
247
|
state["count_success"] = 0
|
235
248
|
state["count_fails"] = 0
|
236
249
|
state["done"] = false
|
237
250
|
state["count_harvested_records"] = 0
|
238
251
|
state["latest_harvested_records_datestamp"] = nil
|
252
|
+
state["harvesting"] = false
|
253
|
+
state["error"] = nil
|
254
|
+
state["status"] = nil
|
239
255
|
end
|
240
256
|
|
241
257
|
|
@@ -259,8 +275,7 @@ module OAISchedules
|
|
259
275
|
task = @schedules[name][:task]
|
260
276
|
interval_s_safe = 60
|
261
277
|
begin
|
262
|
-
|
263
|
-
interval_s = duration.to_seconds
|
278
|
+
interval_s = interval_iso8601_to_seconds(@schedules[name][:content]["interval"])
|
264
279
|
@logger.info("#{name}: task interval (s): #{interval_s}")
|
265
280
|
th_interval_s = 0.1 # protects from negative, 0 or small time intervals
|
266
281
|
if interval_s < th_interval_s
|
@@ -338,7 +353,9 @@ module OAISchedules
|
|
338
353
|
|
339
354
|
|
340
355
|
def logic(name, content, state_machine, state)
|
356
|
+
state["datetime_next_harvesting"] = get_datetime_next_schedule_tick_from_now(content["interval"])
|
341
357
|
loop do
|
358
|
+
# sleep(3)
|
342
359
|
@logger.info("#{name}: handling state: #{state_machine.state}")
|
343
360
|
case state_machine.state
|
344
361
|
when StateHarvesting::NOT_IDENTIFIED
|
@@ -353,7 +370,9 @@ module OAISchedules
|
|
353
370
|
when StateHarvesting::IDLE
|
354
371
|
state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
|
355
372
|
when StateHarvesting::HARVESTING
|
356
|
-
use_resumption_token = false
|
373
|
+
# use_resumption_token = false
|
374
|
+
use_resumption_token = true
|
375
|
+
state["use_resumption_token"] = use_resumption_token
|
357
376
|
format = content["format"] || ""
|
358
377
|
fmt_dt = content.dig("repository", "granularity")
|
359
378
|
from = convert_datetime(content["from"], fmt_dt) || ""
|
@@ -362,83 +381,101 @@ module OAISchedules
|
|
362
381
|
resumption_token = ""
|
363
382
|
if use_resumption_token
|
364
383
|
if !state["resumption_token"].nil?
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
384
|
+
# if state["expiration_date_resumption_token"].nil? || (!state["expiration_date_resumption_token"].nil? && (DateTime.parse(state["expiration_date_resumption_token"]) - DateTime.now) > 0)
|
385
|
+
format = ""
|
386
|
+
from = ""
|
387
|
+
to = ""
|
388
|
+
set = ""
|
389
|
+
resumption_token = state["resumption_token"]
|
390
|
+
# end
|
370
391
|
end
|
371
392
|
else
|
372
393
|
from = add_eps_to_datetime(state["latest_harvested_records_datestamp"], fmt_dt) || from
|
373
394
|
end
|
374
395
|
data = nil
|
375
396
|
error = nil
|
376
|
-
|
397
|
+
err_info = nil
|
398
|
+
harvesting = true
|
377
399
|
done = false
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
data_token = data["resumptionToken"]
|
399
|
-
state["resumption_token"] = data_token["$text"]
|
400
|
-
state["expiration_date_resumption_token"] = data_token["_expirationDate"]
|
401
|
-
end
|
402
|
-
end
|
403
|
-
state["datetime_now"] = get_datetime_now
|
404
|
-
n_records = data["record"].size
|
405
|
-
state["count_harvested_records"] += n_records
|
406
|
-
if n_records > 0
|
407
|
-
timestamps = data["record"].map do |record|
|
408
|
-
record["header"]["datestamp"]
|
409
|
-
end.sort
|
410
|
-
timestamp_latest = timestamps[-1].strftime('%FT%TZ')
|
411
|
-
state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
|
400
|
+
status = nil
|
401
|
+
begin
|
402
|
+
data = oai_get_records(
|
403
|
+
name,
|
404
|
+
content["repository"]["uri"],
|
405
|
+
format,
|
406
|
+
from,
|
407
|
+
to,
|
408
|
+
set,
|
409
|
+
resumption_token
|
410
|
+
)
|
411
|
+
state["resumption_token"] = nil
|
412
|
+
state["expiration_date_resumption_token"] = nil
|
413
|
+
if use_resumption_token
|
414
|
+
if data["resumptionToken"].is_a?(String)
|
415
|
+
state["resumption_token"] = data["resumptionToken"]
|
416
|
+
elsif data["resumptionToken"].is_a?(Hash)
|
417
|
+
data_token = data["resumptionToken"]
|
418
|
+
state["resumption_token"] = data_token["$text"]
|
419
|
+
state["expiration_date_resumption_token"] = data_token["_expirationDate"]
|
412
420
|
end
|
413
|
-
state["count_success"] += 1
|
414
|
-
rescue StandardError => e
|
415
|
-
state["count_fails"] += 1
|
416
|
-
error = e
|
417
421
|
end
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
422
|
+
n_records = data["record"].size
|
423
|
+
state["count_harvested_records"] += n_records
|
424
|
+
if n_records > 0
|
425
|
+
timestamps = data["record"].map do |record|
|
426
|
+
record["header"]["datestamp"]
|
427
|
+
end.sort
|
428
|
+
timestamp_latest = timestamps[-1].strftime('%FT%TZ')
|
429
|
+
state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
|
430
|
+
end
|
431
|
+
state["count_success"] += 1
|
432
|
+
rescue StandardError => e
|
433
|
+
state["count_fails"] += 1
|
434
|
+
error = e
|
435
|
+
end
|
436
|
+
if error.nil?
|
437
|
+
err_info = nil
|
438
|
+
if use_resumption_token
|
439
|
+
if state["resumption_token"].nil?
|
440
|
+
state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
|
441
|
+
harvesting = false
|
442
|
+
status = "next harvesting: #{state["datetime_next_harvesting"]}"
|
426
443
|
else
|
427
444
|
state_machine.add_event(EventHarvesting::DONE_HARVEST)
|
428
|
-
|
445
|
+
harvesting = true
|
446
|
+
status = "harvesting"
|
429
447
|
end
|
448
|
+
else
|
449
|
+
state_machine.add_event(EventHarvesting::DONE_HARVEST)
|
450
|
+
harvesting = true
|
451
|
+
status = "harvesting"
|
430
452
|
end
|
431
|
-
|
453
|
+
else
|
454
|
+
err_info = {
|
455
|
+
"message" => error.message,
|
456
|
+
"backtrace" => error.backtrace
|
457
|
+
}
|
458
|
+
state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
|
459
|
+
harvesting = false
|
460
|
+
status = "error (see logs), next harvesting: #{state["datetime_next_harvesting"]}"
|
432
461
|
end
|
462
|
+
state["done"] = done
|
463
|
+
state["harvesting"] = harvesting
|
464
|
+
state["datetime_now"] = get_datetime_now
|
465
|
+
state["status"] = status
|
466
|
+
state["error"] = err_info
|
433
467
|
path_file_state = get_path_state_file_from_schedule_name(name)
|
434
468
|
@logger.info("#{name}: writing to state file #{path_file_state}")
|
435
469
|
write_state_file(path_file_state, state)
|
436
470
|
@f_digest&.call(name, content, data, done, error, state, @logger)
|
437
|
-
break if to_pause
|
438
471
|
when StateHarvesting::COMPLETE
|
439
472
|
@logger.warn("#{name}: full harvesting complete")
|
440
|
-
|
441
|
-
|
473
|
+
state_machine.add_event(EventHarvesting::RESTART)
|
474
|
+
auto_deactivate_schedule = false
|
475
|
+
if auto_deactivate_schedule
|
476
|
+
content["active"] = false
|
477
|
+
handle_schedule_task(name)
|
478
|
+
end
|
442
479
|
break
|
443
480
|
else
|
444
481
|
@logger.warn("#{name}: state #{state_machine.state} not known")
|
@@ -567,7 +604,18 @@ module OAISchedules
|
|
567
604
|
end
|
568
605
|
|
569
606
|
def get_datetime_now
|
570
|
-
|
607
|
+
Time.current.in_time_zone('Zulu').strftime('%FT%TZ')
|
608
|
+
end
|
609
|
+
|
610
|
+
def interval_iso8601_to_seconds(str_interval)
|
611
|
+
ISO8601::Duration.new(str_interval).to_seconds
|
612
|
+
end
|
613
|
+
|
614
|
+
def get_datetime_next_schedule_tick_from_now(str_interval)
|
615
|
+
interval_s = interval_iso8601_to_seconds(str_interval)
|
616
|
+
dt = Time.current.in_time_zone('Zulu')
|
617
|
+
dt += interval_s
|
618
|
+
dt.strftime('%FT%TZ')
|
571
619
|
end
|
572
620
|
|
573
621
|
end
|
metadata
CHANGED
@@ -1,14 +1,28 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oai_schedules
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Davide Monari
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-07-
|
10
|
+
date: 2025-07-15 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: activesupport
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - "~>"
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: 8.0.2
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - "~>"
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 8.0.2
|
12
26
|
- !ruby/object:Gem::Dependency
|
13
27
|
name: concurrent-ruby
|
14
28
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,12 +92,15 @@ files:
|
|
78
92
|
- LICENSE.txt
|
79
93
|
- README.md
|
80
94
|
- Rakefile
|
81
|
-
- examples/dir_schedules/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
|
82
|
-
- examples/dir_schedules/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
|
83
|
-
- examples/dir_schedules/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
|
84
|
-
- examples/dir_schedules/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
|
85
|
-
- examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
|
86
|
-
- examples/dir_schedules/schedule_sample.json
|
95
|
+
- examples/dir_schedules/.gitkeep
|
96
|
+
- examples/dir_schedules_all/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
|
97
|
+
- examples/dir_schedules_all/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
|
98
|
+
- examples/dir_schedules_all/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
|
99
|
+
- examples/dir_schedules_all/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
|
100
|
+
- examples/dir_schedules_all/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
|
101
|
+
- examples/dir_schedules_all/schedule_deventit.json
|
102
|
+
- examples/dir_schedules_all/schedule_heron.json
|
103
|
+
- examples/dir_schedules_all/schedule_sample.json
|
87
104
|
- examples/dir_state/.gitkeep
|
88
105
|
- examples/example_01.rb
|
89
106
|
- examples/example_02.rb
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|