oai_schedules 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7d574b21314744d8ad56d1a1394d3fc778e860ba636d43569fedee7642ff3af2
4
- data.tar.gz: b06822ea6110eedf0663f40f4c32120c321c0acd9a508deabe3204cb6f094962
3
+ metadata.gz: d2b2bae058157785c775ebd1b8c0ea76e8685adb8588280f6c6427644b9da8b5
4
+ data.tar.gz: 928102f14abd5bf59bb7b55bb7e100074c5b20bec3276e1dd129a04e74c6da60
5
5
  SHA512:
6
- metadata.gz: 288bea5932749f50a8629e6fe7ebc041c68f19d39ca44a7513cbdfe1f24682f079f8d8434eabea6ce1cd1d1d3f40f71923ea4bcd701a5926589c544880de2b47
7
- data.tar.gz: e2c309f3b419a605ad1eac62023bfdbb4102cd1ae7be8809659a42909851abcb90cfc22bccc790d21f5c43554f744f7bffaa40e006d7f67e91db77b20f50c0e6
6
+ metadata.gz: 35bb16497f382eb1accaecd95e1e2b022f3addeeef7a4ae79ab5bc4b9f6b023f3fbee70535815ee0edcf5e48b62fc4bdb697cba378cdbac16f114420394cf85e
7
+ data.tar.gz: 3811fd28be558414ebfabcc83e1978080578ff4b33147f0da06cf569ab6b42af498489cf7bef3dd5f14b6e0dbb8598dc615d15220c7496c710f93fcea0149a3e
data/README.md CHANGED
@@ -22,8 +22,13 @@ gem install oai_schedules
22
22
  ```ruby
23
23
  require 'oai_schedules/manager'
24
24
 
25
- f_show = lambda do |name, content, records, done, logger|
26
- # ... do your stuff with records ...
25
+ f_show = lambda do |name, content, records, done, error, state, logger|
26
+ if error.nil?
27
+ # ... do your stuff with records ...
28
+ else
29
+ puts error.message
30
+ end
31
+ puts state
27
32
  if done
28
33
  puts "done full harvesting"
29
34
  end
@@ -59,7 +64,8 @@ The code will do this by querying `https://eudml.org/oai/OAIHandler?verb=ListRec
59
64
  query parameters.
60
65
  The custom function provided as `f_digest` will then be called at each iteration.
61
66
  This function will be passed the schedule `name` and `content`, the partial list of `records` as a hash,
62
- a `done` flag (full harvesting complete), and the same logger used internally by the schedules manager.
67
+ a `done` flag (full harvesting complete), an `error` exception if one occurred, the harvesting `state`,
68
+ and the same logger used internally by the schedules manager.
63
69
  and it will write the new one to the state file, until no token is provided (end of the harvesting).
64
70
  As soon as the schedule is added, it is executed.
65
71
  It is possible to add all schedules in advance, then call `sleep` for an infinite event loop.
@@ -0,0 +1,63 @@
1
+ {
2
+ "interval": "P1W",
3
+ "repository": {
4
+ "uri": "https://eudml.org/oai/OAIHandler",
5
+ "repository_name": "REPOX Repository",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "mailto:gilberto.pedrosa@ist.utl.pt"
9
+ ],
10
+ "earliest_datestamp": "1970-01-01T00:00:00+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DD",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "ese",
16
+ "eudml-article2",
17
+ "eudml-book2"
18
+ ],
19
+ "set": [
20
+ "BDIM",
21
+ "BulDML",
22
+ "CEDRAM",
23
+ "DMLE",
24
+ "DML_CZ_Monograph",
25
+ "DML_CZ_Proceeding",
26
+ "DML_CZ_Serial",
27
+ "EDPS",
28
+ "ELibM",
29
+ "GALLICA",
30
+ "GDZ_Band",
31
+ "GDZ_Mathematica",
32
+ "GDZ_Monographs",
33
+ "GDZ_RusDML",
34
+ "HDML_Books",
35
+ "HDML_Conferences",
36
+ "HDML_Journals",
37
+ "MISANU",
38
+ "NUMDAM",
39
+ "NUMDAM_book",
40
+ "PLDML",
41
+ "PLDML_book",
42
+ "PMath"
43
+ ],
44
+ "id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
45
+ },
46
+ "active": true,
47
+ "transformer": {
48
+ "transformer_name": [
49
+ "dummy_transformer"
50
+ ],
51
+ "type": {
52
+ "id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
53
+ "value": "dummy_type"
54
+ },
55
+ "uri": "http://dummy-uri.org/",
56
+ "id": "7889ce03-28d9-479d-bb9a-b239f179453a"
57
+ },
58
+ "format": "oai_dc",
59
+ "set": "CEDRAM",
60
+ "from": "1970-01-01T00:00:00+00:00",
61
+ "until": "1999-01-12T09:43:02+00:00",
62
+ "id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
63
+ }
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
4
4
 
5
5
  # usage with folder listener
6
6
 
7
- f_show = lambda do |name, content, records, done, logger|
8
- # ... do your stuff with records ...
7
+ f_show = lambda do |name, content, records, done, error, state, logger|
8
+ if error.nil?
9
+ # ... do your stuff with records ...
10
+ else
11
+ puts error.message
12
+ end
13
+ puts state
9
14
  if done
10
15
  puts "done full harvesting"
11
16
  end
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
4
4
 
5
5
  # usage with programmatic schedules addition / modify / remove
6
6
 
7
- f_show = lambda do |name, content, records, done, logger|
8
- # ... do your stuff with records ...
7
+ f_show = lambda do |name, content, records, done, error, state, logger|
8
+ if error.nil?
9
+ # ... do your stuff with records ...
10
+ else
11
+ puts error.message
12
+ end
13
+ puts state
9
14
  if done
10
15
  puts "done full harvesting"
11
16
  end
@@ -181,6 +181,9 @@ module OAISchedules
181
181
  state = {}
182
182
  if File.file?(path_file_state)
183
183
  state = read_state_file(path_file_state)
184
+ else
185
+ init_schedule_state(state)
186
+ write_state_file(path_file_state, state)
184
187
  end
185
188
  # create task
186
189
  task = Concurrent::TimerTask.new(run_now: false) {
@@ -224,13 +227,22 @@ module OAISchedules
224
227
  private
225
228
 
226
229
 
230
+ def init_schedule_state(state)
231
+ state["resumption_token"] = nil
232
+ state["count_success"] = 0
233
+ state["count_fails"] = 0
234
+ state["done"] = false
235
+ state["count_harvested_records"] = 0
236
+ state["latest_harvested_records_datestamp"] = ""
237
+ end
238
+
239
+
227
240
  def handle_schedule_state_at_schedule_change(name, content)
228
241
  # invalidate resumption token if either schedule format or set changes
229
242
  state = @schedules[name][:state]
230
243
  if (content["format"] != @schedules[name][:content]["format"]) \
231
244
  || (content["set"] != @schedules[name][:content]["set"])
232
- state["resumption_token"] = nil
233
- state["count_partial_harversting"] = 0
245
+ init_schedule_state(state)
234
246
  end
235
247
  end
236
248
 
@@ -339,8 +351,20 @@ module OAISchedules
339
351
  when StateHarvesting::IDLE
340
352
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
341
353
  when StateHarvesting::HARVESTING
342
- unless state.has_key?("count_partial_harversting")
343
- state["count_partial_harversting"] = 0
354
+ unless state.has_key?("count_success")
355
+ state["count_success"] = 0
356
+ end
357
+ unless state.has_key?("count_fails")
358
+ state["count_fails"] = 0
359
+ end
360
+ unless state.has_key?("done")
361
+ state["done"] = false
362
+ end
363
+ unless state.has_key?("count_harvested_records")
364
+ state["count_harvested_records"] = 0
365
+ end
366
+ unless state.has_key?("latest_harvested_records_datestamp")
367
+ state["latest_harvested_records_datestamp"] = ""
344
368
  end
345
369
  if !state["resumption_token"].nil?
346
370
  format = ""
@@ -354,30 +378,53 @@ module OAISchedules
354
378
  to = content["until"] || ""
355
379
  set = content["set"] || ""
356
380
  resumption_token = ""
357
- state["count_partial_harversting"] = 0
358
381
  end
359
- data = oai_get_records(
360
- name,
361
- content["repository"]["uri"],
362
- format,
363
- from,
364
- to,
365
- set,
366
- resumption_token
367
- )
368
- state["resumption_token"] = data["resumptionToken"]
369
- state["count_partial_harversting"] += 1
370
- path_file_state = get_path_state_file_from_schedule_name(name)
371
- @logger.info("#{name}: writing to state file #{path_file_state}")
372
- write_state_file(path_file_state, state)
373
- if !data["resumptionToken"].nil?
374
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
375
- done = false
376
- else
382
+ data = nil
383
+ error = nil
384
+ done = false
385
+ if state["done"]
377
386
  state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
378
387
  done = true
388
+ else
389
+ begin
390
+ data = oai_get_records(
391
+ name,
392
+ content["repository"]["uri"],
393
+ format,
394
+ from,
395
+ to,
396
+ set,
397
+ resumption_token
398
+ )
399
+ state["resumption_token"] = data["resumptionToken"]
400
+ state["count_success"] += 1
401
+ n_records = data["record"].size
402
+ state["count_harvested_records"] += n_records
403
+ if n_records > 0
404
+ timestamps = data["record"].map do |record|
405
+ record["header"]["datestamp"]
406
+ end.sort
407
+ state["latest_harvested_records_datestamp"] = timestamps[-1]
408
+ end
409
+ rescue StandardError => e
410
+ state["count_fails"] += 1
411
+ error = e
412
+ end
413
+ if error.nil?
414
+ if !data["resumptionToken"].nil?
415
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
416
+ done = false
417
+ else
418
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
419
+ done = true
420
+ end
421
+ end
422
+ state["done"] = done
379
423
  end
380
- @f_digest&.call(name, content, data, done, @logger)
424
+ path_file_state = get_path_state_file_from_schedule_name(name)
425
+ @logger.info("#{name}: writing to state file #{path_file_state}")
426
+ write_state_file(path_file_state, state)
427
+ @f_digest&.call(name, content, data, done, error, state, @logger)
381
428
  break
382
429
  when StateHarvesting::COMPLETE
383
430
  @logger.warn("#{name}: full harvesting complete")
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.4.0"
4
+ VERSION = "0.6.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-03-28 00:00:00.000000000 Z
10
+ date: 2025-04-29 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby
@@ -78,6 +78,7 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
+ - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
81
82
  - examples/dir_schedules/schedule_sample.json
82
83
  - examples/dir_state/.gitkeep
83
84
  - examples/example_01.rb