oai_schedules 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 93b5d732552fa18fc37680009dd21bf9509555c5cfeec8c26b2b20a37967f606
4
- data.tar.gz: cb282dfcd7e8866faa698e5f310ba07223e93d965395734a26665e531cca3ce0
3
+ metadata.gz: dac0dec1b24093321fea8bb4e8fe05a315255d4a4418f0ff72803f6af902196f
4
+ data.tar.gz: 590c1f29d049ee82a4352ec3d5a79fbd070013be36f4f08a166cf34323c0afd1
5
5
  SHA512:
6
- metadata.gz: 54a2044fe8446bf0951415302b68d06dccb3c36fc2628be888681418d1a4cecfcfd2723adbce8dd61656a3081087516e40c28e40acaa9037fdbeda88797f9d28
7
- data.tar.gz: 553f101400c60d6f1134f8ea97c075183a595059737605ab49ede6ef8620a5e0b15d47b2f82ff4253036c485489ddebc0059074f3d87bc3c2fc523d98e067600
6
+ metadata.gz: e7fa2c85188069c4997fe4fb91e39f05b6f9ec250184b60868729701a1fefd16fa3fa830cf993913454e5acb8ea4649d2b16a4bad6fcee499053dbe929b5b105
7
+ data.tar.gz: 50b0ed8e1955e52fe4ea25bd66a38abd397be64d2986ee608c042502091b44e58835110fc18d5406c1c28c431c6e741455091a824ce592c7e8bce4fead953c34
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## [0.9.0] - 2025-07-15
2
+
3
+ - Restored resumption token logic (the only correct approach)
4
+ - Fixed Zulu time issues
5
+ - Added more info to the schedule state
6
+
7
+ ## [0.8.0] - 2025-07-10
8
+
9
+ - Complete harvesting now runs within a single thread tick
10
+ - Added more info to the schedule state
11
+
1
12
  ## [0.7.0] - 2025-07-09
2
13
 
3
14
  - Partial harvest policy changed: avoid using resumption tokens
File without changes
@@ -0,0 +1,41 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "http://lag.hosting.deventit.net/atlantispubliek/oai.axd",
5
+ "repository_name": "Liberas",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "support@deventit.nl"
9
+ ],
10
+ "earliest_datestamp": "1900-01-01T01:01:01+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "APEX",
15
+ "EAC",
16
+ "EAD"
17
+ ],
18
+ "set": [
19
+ "APEX",
20
+ "APEXAO",
21
+ "EAC",
22
+ "EAD"
23
+ ],
24
+ "id": "93b6d240ac9b782664f18823b761a128fc19116506dd228763b52ba8fb64e1b9"
25
+ },
26
+ "active": true,
27
+ "transformer": {
28
+ "transformer_name": [
29
+ "dummy_transformer"
30
+ ],
31
+ "type": {
32
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
33
+ "value": "dummy_type"
34
+ },
35
+ "uri": "http://dummy-uri.org/",
36
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
37
+ },
38
+ "format": "EAD",
39
+ "set": "EAD",
40
+ "id": "deventit"
41
+ }
@@ -0,0 +1,38 @@
1
+ {
2
+ "interval": "PT5S",
3
+ "repository": {
4
+ "uri": "https://heron.libis.be/ca_veb_q/admin/service.php/OAI/cw_organisaties/request",
5
+ "repository_name": "VEB",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "collectiveaccess@vlaamse-erfgoedbibliotheken.be"
9
+ ],
10
+ "earliest_datestamp": "2020-08-17T12:55:02+00:00",
11
+ "deleted_records": "transient",
12
+ "granularity": "YYYY-MM-DDThh:mm:ssZ",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "oai_veb"
16
+ ],
17
+ "set": [
18
+ "1"
19
+ ],
20
+ "id": "4886e3793003d2b1dffbbdb41ca13024c7c809892314ab301ded0d7ecfd7b469"
21
+ },
22
+ "active": true,
23
+ "transformer": {
24
+ "transformer_name": [
25
+ "dummy_transformer"
26
+ ],
27
+ "type": {
28
+ "id": "bb8bfd5d-d914-41ae-b9f1-b5065299d9b9",
29
+ "value": "dummy_type"
30
+ },
31
+ "uri": "http://dummy-uri.org/",
32
+ "id": "dd377ec0-c890-4635-b6d8-553ddf609c01"
33
+ },
34
+ "format": "oai_veb",
35
+ "set": "1",
36
+ "from": "1900-06-10T12:39:00+00:00",
37
+ "id": "heron"
38
+ }
@@ -8,6 +8,7 @@ require 'logger'
8
8
  require 'data_collector'
9
9
  require 'iso8601'
10
10
  require 'date'
11
+ require 'active_support/core_ext/time'
11
12
 
12
13
 
13
14
 
@@ -49,6 +50,7 @@ module OAISchedules
49
50
  REQUEST_HARVEST = 2
50
51
  DONE_HARVEST = 3
51
52
  DONE_FULL_HARVEST = 4
53
+ RESTART = 5
52
54
  end
53
55
 
54
56
  class StateMachineHarvesting
@@ -103,6 +105,15 @@ module OAISchedules
103
105
  @state
104
106
  end
105
107
 
108
+ when StateHarvesting::COMPLETE
109
+
110
+ case event
111
+ when EventHarvesting::RESTART
112
+ @state = StateHarvesting::NOT_IDENTIFIED
113
+ else
114
+ @state
115
+ end
116
+
106
117
  else
107
118
  @state
108
119
  end
@@ -228,14 +239,19 @@ module OAISchedules
228
239
 
229
240
 
230
241
  def init_schedule_state(state)
242
+ state["use_resumption_token"] = nil
231
243
  state["resumption_token"] = nil
232
244
  state["expiration_date_resumption_token"] = nil
233
245
  state["datetime_now"] = get_datetime_now
246
+ state["datetime_next_harvesting"] = nil
234
247
  state["count_success"] = 0
235
248
  state["count_fails"] = 0
236
249
  state["done"] = false
237
250
  state["count_harvested_records"] = 0
238
251
  state["latest_harvested_records_datestamp"] = nil
252
+ state["harvesting"] = false
253
+ state["error"] = nil
254
+ state["status"] = nil
239
255
  end
240
256
 
241
257
 
@@ -259,8 +275,7 @@ module OAISchedules
259
275
  task = @schedules[name][:task]
260
276
  interval_s_safe = 60
261
277
  begin
262
- duration = ISO8601::Duration.new(@schedules[name][:content]["interval"])
263
- interval_s = duration.to_seconds
278
+ interval_s = interval_iso8601_to_seconds(@schedules[name][:content]["interval"])
264
279
  @logger.info("#{name}: task interval (s): #{interval_s}")
265
280
  th_interval_s = 0.1 # protects from negative, 0 or small time intervals
266
281
  if interval_s < th_interval_s
@@ -338,7 +353,9 @@ module OAISchedules
338
353
 
339
354
 
340
355
  def logic(name, content, state_machine, state)
356
+ state["datetime_next_harvesting"] = get_datetime_next_schedule_tick_from_now(content["interval"])
341
357
  loop do
358
+ # sleep(3)
342
359
  @logger.info("#{name}: handling state: #{state_machine.state}")
343
360
  case state_machine.state
344
361
  when StateHarvesting::NOT_IDENTIFIED
@@ -353,7 +370,9 @@ module OAISchedules
353
370
  when StateHarvesting::IDLE
354
371
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
355
372
  when StateHarvesting::HARVESTING
356
- use_resumption_token = false # don't change
373
+ # use_resumption_token = false
374
+ use_resumption_token = true
375
+ state["use_resumption_token"] = use_resumption_token
357
376
  format = content["format"] || ""
358
377
  fmt_dt = content.dig("repository", "granularity")
359
378
  from = convert_datetime(content["from"], fmt_dt) || ""
@@ -362,83 +381,101 @@ module OAISchedules
362
381
  resumption_token = ""
363
382
  if use_resumption_token
364
383
  if !state["resumption_token"].nil?
365
- format = ""
366
- from = ""
367
- to = ""
368
- set = ""
369
- resumption_token = state["resumption_token"]
384
+ # if state["expiration_date_resumption_token"].nil? || (!state["expiration_date_resumption_token"].nil? && (DateTime.parse(state["expiration_date_resumption_token"]) - DateTime.now) > 0)
385
+ format = ""
386
+ from = ""
387
+ to = ""
388
+ set = ""
389
+ resumption_token = state["resumption_token"]
390
+ # end
370
391
  end
371
392
  else
372
393
  from = add_eps_to_datetime(state["latest_harvested_records_datestamp"], fmt_dt) || from
373
394
  end
374
395
  data = nil
375
396
  error = nil
376
- to_pause = true
397
+ err_info = nil
398
+ harvesting = true
377
399
  done = false
378
- if state["done"]
379
- state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
380
- done = true
381
- else
382
- begin
383
- data = oai_get_records(
384
- name,
385
- content["repository"]["uri"],
386
- format,
387
- from,
388
- to,
389
- set,
390
- resumption_token
391
- )
392
- state["resumption_token"] = nil
393
- state["expiration_date_resumption_token"] = nil
394
- if use_resumption_token
395
- if data["resumptionToken"].is_a?(String)
396
- state["resumption_token"] = data["resumptionToken"]
397
- elsif data["resumptionToken"].is_a?(Hash)
398
- data_token = data["resumptionToken"]
399
- state["resumption_token"] = data_token["$text"]
400
- state["expiration_date_resumption_token"] = data_token["_expirationDate"]
401
- end
402
- end
403
- state["datetime_now"] = get_datetime_now
404
- n_records = data["record"].size
405
- state["count_harvested_records"] += n_records
406
- if n_records > 0
407
- timestamps = data["record"].map do |record|
408
- record["header"]["datestamp"]
409
- end.sort
410
- timestamp_latest = timestamps[-1].strftime('%FT%TZ')
411
- state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
400
+ status = nil
401
+ begin
402
+ data = oai_get_records(
403
+ name,
404
+ content["repository"]["uri"],
405
+ format,
406
+ from,
407
+ to,
408
+ set,
409
+ resumption_token
410
+ )
411
+ state["resumption_token"] = nil
412
+ state["expiration_date_resumption_token"] = nil
413
+ if use_resumption_token
414
+ if data["resumptionToken"].is_a?(String)
415
+ state["resumption_token"] = data["resumptionToken"]
416
+ elsif data["resumptionToken"].is_a?(Hash)
417
+ data_token = data["resumptionToken"]
418
+ state["resumption_token"] = data_token["$text"]
419
+ state["expiration_date_resumption_token"] = data_token["_expirationDate"]
412
420
  end
413
- state["count_success"] += 1
414
- rescue StandardError => e
415
- state["count_fails"] += 1
416
- error = e
417
421
  end
418
- if error.nil?
419
- if use_resumption_token
420
- if state["resumption_token"].nil?
421
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
422
- to_pause = true
423
- else
424
- to_pause = false
425
- end
422
+ n_records = data["record"].size
423
+ state["count_harvested_records"] += n_records
424
+ if n_records > 0
425
+ timestamps = data["record"].map do |record|
426
+ record["header"]["datestamp"]
427
+ end.sort
428
+ timestamp_latest = timestamps[-1].strftime('%FT%TZ')
429
+ state["latest_harvested_records_datestamp"] = convert_datetime(timestamp_latest, fmt_dt)
430
+ end
431
+ state["count_success"] += 1
432
+ rescue StandardError => e
433
+ state["count_fails"] += 1
434
+ error = e
435
+ end
436
+ if error.nil?
437
+ err_info = nil
438
+ if use_resumption_token
439
+ if state["resumption_token"].nil?
440
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
441
+ harvesting = false
442
+ status = "next harvesting: #{state["datetime_next_harvesting"]}"
426
443
  else
427
444
  state_machine.add_event(EventHarvesting::DONE_HARVEST)
428
- to_pause = true
445
+ harvesting = true
446
+ status = "harvesting"
429
447
  end
448
+ else
449
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
450
+ harvesting = true
451
+ status = "harvesting"
430
452
  end
431
- state["done"] = done
453
+ else
454
+ err_info = {
455
+ "message" => error.message,
456
+ "backtrace" => error.backtrace
457
+ }
458
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
459
+ harvesting = false
460
+ status = "error (see logs), next harvesting: #{state["datetime_next_harvesting"]}"
432
461
  end
462
+ state["done"] = done
463
+ state["harvesting"] = harvesting
464
+ state["datetime_now"] = get_datetime_now
465
+ state["status"] = status
466
+ state["error"] = err_info
433
467
  path_file_state = get_path_state_file_from_schedule_name(name)
434
468
  @logger.info("#{name}: writing to state file #{path_file_state}")
435
469
  write_state_file(path_file_state, state)
436
470
  @f_digest&.call(name, content, data, done, error, state, @logger)
437
- break if to_pause
438
471
  when StateHarvesting::COMPLETE
439
472
  @logger.warn("#{name}: full harvesting complete")
440
- content["active"] = false
441
- handle_schedule_task(name)
473
+ state_machine.add_event(EventHarvesting::RESTART)
474
+ auto_deactivate_schedule = false
475
+ if auto_deactivate_schedule
476
+ content["active"] = false
477
+ handle_schedule_task(name)
478
+ end
442
479
  break
443
480
  else
444
481
  @logger.warn("#{name}: state #{state_machine.state} not known")
@@ -567,7 +604,18 @@ module OAISchedules
567
604
  end
568
605
 
569
606
  def get_datetime_now
570
- DateTime.now.strftime('%FT%TZ')
607
+ Time.current.in_time_zone('Zulu').strftime('%FT%TZ')
608
+ end
609
+
610
+ def interval_iso8601_to_seconds(str_interval)
611
+ ISO8601::Duration.new(str_interval).to_seconds
612
+ end
613
+
614
+ def get_datetime_next_schedule_tick_from_now(str_interval)
615
+ interval_s = interval_iso8601_to_seconds(str_interval)
616
+ dt = Time.current.in_time_zone('Zulu')
617
+ dt += interval_s
618
+ dt.strftime('%FT%TZ')
571
619
  end
572
620
 
573
621
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.7.0"
4
+ VERSION = "0.9.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,28 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-07-09 00:00:00.000000000 Z
10
+ date: 2025-07-15 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: activesupport
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: 8.0.2
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: 8.0.2
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: concurrent-ruby
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -78,12 +92,15 @@ files:
78
92
  - LICENSE.txt
79
93
  - README.md
80
94
  - Rakefile
81
- - examples/dir_schedules/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
82
- - examples/dir_schedules/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
83
- - examples/dir_schedules/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
84
- - examples/dir_schedules/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
85
- - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
86
- - examples/dir_schedules/schedule_sample.json
95
+ - examples/dir_schedules/.gitkeep
96
+ - examples/dir_schedules_all/schedule_495607cf-f773-463f-8ee1-77d0f53e0c29.json
97
+ - examples/dir_schedules_all/schedule_5e344861-806b-4361-98f7-a0be6a5984de.json
98
+ - examples/dir_schedules_all/schedule_712f46ab-f87f-4db8-b69e-7101d9e2ae61.json
99
+ - examples/dir_schedules_all/schedule_99922363-5b37-4438-a274-5a4a5167f811.json
100
+ - examples/dir_schedules_all/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
101
+ - examples/dir_schedules_all/schedule_deventit.json
102
+ - examples/dir_schedules_all/schedule_heron.json
103
+ - examples/dir_schedules_all/schedule_sample.json
87
104
  - examples/dir_state/.gitkeep
88
105
  - examples/example_01.rb
89
106
  - examples/example_02.rb