oai_schedules 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3e87f0fab9a90cc32bb6c3892a9ad72e6dae77c8b7efbc4ab3198c39b7d2a660
4
- data.tar.gz: b59ddf59dd77df9dc1365a279dbe7ac3a9fb7b8d5f1bb7a33a8951f3d4ce1f48
3
+ metadata.gz: 3f254ace930dec7e151a5af485130dd23623548d6f9d0860a764b90ce217c4bb
4
+ data.tar.gz: bfe93c7f19a894976c53f3d2e3d720e048c77c116b92283034ba7b17d4efe386
5
5
  SHA512:
6
- metadata.gz: d384485639a44d848f35eb3bc0a25a44fe2357cd84aeb37530a683a153aaba6f4b9bb24b7a316caf424e8f0c57b77b775b45f168c06339b3a6b6e35b1b2ee8d9
7
- data.tar.gz: 84117dbd3b7f6757290fdee7a9f3388623330d8d293961946b6ee2058b2be8a7898b60822598bd8e8ac6bf2340d3cbabcba05124d1aa087e876b2a83192ea948
6
+ metadata.gz: bbd9df7881c619f2660dd6b622adf2fd7ec6fa35cde400ff3490b1001b3635c7d89605882e53c9b242cc992de6d0153a7ec5d25d5b41e0672853b997924900a0
7
+ data.tar.gz: 918414787e68f89b8dab0e692096e708864ca7b32a81d1341c0cc09e72fbb70d2cd886a137f024e5946b6176b7d4b1b20d606ec12d33c81894c65588ca9c712e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## [0.4.0] - 2025-03-28
2
+
3
+ - Added logger to digestion function
4
+ - Invalidated resumption token when either the schedule set or format changes
5
+ - Extended error message on OAI URL fetch failure
6
+
1
7
  ## [0.3.0] - 2025-03-26
2
8
 
3
9
  - Added schedule name and content to digestion function
data/README.md CHANGED
@@ -22,8 +22,13 @@ gem install oai_schedules
22
22
  ```ruby
23
23
  require 'oai_schedules/manager'
24
24
 
25
- f_show = lambda do |name, content, records, done|
26
- # ... do your stuff with records ...
25
+ f_show = lambda do |name, content, records, done, error, state, logger|
26
+ if error.nil?
27
+ # ... do your stuff with records ...
28
+ else
29
+ puts error.message
30
+ end
31
+ puts state
27
32
  if done
28
33
  puts "done full harvesting"
29
34
  end
@@ -58,8 +63,9 @@ At every iteration, every 2 seconds, it will get the *partial* records by using
58
63
  The code will do this by querying `https://eudml.org/oai/OAIHandler?verb=ListRecords&...` and adding the necessary
59
64
  query parameters.
60
65
  The custom function provided as `f_digest` will then be called at each iteration.
61
- This will be provided schedule `name` and `content`, the partial list of `records` as a hash,
62
- and a `done` flag (full harvesting complete).
66
+ This will be provided schedule `name` and `content`, the partial list of `records` as a hash,
67
+ a `done` flag (full harvesting complete), an `error` exception if one occurred, the harvesting `state`,
68
+ and the same logger used internally by the schedules manager.
63
69
  and it will write the new one to the state file, until no token is provided (end of the harvesting).
64
70
  As soon as the schedule is added, it is executed.
65
71
  It is possible to add all schedules in advance, then call `sleep` for infinite event loop.
@@ -0,0 +1,63 @@
1
+ {
2
+ "interval": "P1W",
3
+ "repository": {
4
+ "uri": "https://eudml.org/oai/OAIHandler",
5
+ "repository_name": "REPOX Repository",
6
+ "protocol_version": "2.0",
7
+ "admin_email": [
8
+ "mailto:gilberto.pedrosa@ist.utl.pt"
9
+ ],
10
+ "earliest_datestamp": "1970-01-01T00:00:00+00:00",
11
+ "deleted_records": "persistent",
12
+ "granularity": "YYYY-MM-DD",
13
+ "metadata_format": [
14
+ "oai_dc",
15
+ "ese",
16
+ "eudml-article2",
17
+ "eudml-book2"
18
+ ],
19
+ "set": [
20
+ "BDIM",
21
+ "BulDML",
22
+ "CEDRAM",
23
+ "DMLE",
24
+ "DML_CZ_Monograph",
25
+ "DML_CZ_Proceeding",
26
+ "DML_CZ_Serial",
27
+ "EDPS",
28
+ "ELibM",
29
+ "GALLICA",
30
+ "GDZ_Band",
31
+ "GDZ_Mathematica",
32
+ "GDZ_Monographs",
33
+ "GDZ_RusDML",
34
+ "HDML_Books",
35
+ "HDML_Conferences",
36
+ "HDML_Journals",
37
+ "MISANU",
38
+ "NUMDAM",
39
+ "NUMDAM_book",
40
+ "PLDML",
41
+ "PLDML_book",
42
+ "PMath"
43
+ ],
44
+ "id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
45
+ },
46
+ "active": true,
47
+ "transformer": {
48
+ "transformer_name": [
49
+ "dummy_transformer"
50
+ ],
51
+ "type": {
52
+ "id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
53
+ "value": "dummy_type"
54
+ },
55
+ "uri": "http://dummy-uri.org/",
56
+ "id": "7889ce03-28d9-479d-bb9a-b239f179453a"
57
+ },
58
+ "format": "oai_dc",
59
+ "set": "CEDRAM",
60
+ "from": "1970-01-01T00:00:00+00:00",
61
+ "until": "1999-01-12T09:43:02+00:00",
62
+ "id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
63
+ }
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
4
4
 
5
5
  # usage with folder listener
6
6
 
7
- f_show = lambda do |name, content, records, done|
8
- # ... do your stuff with records ...
7
+ f_show = lambda do |name, content, records, done, error, state, logger|
8
+ if error.nil?
9
+ # ... do your stuff with records ...
10
+ else
11
+ puts error.message
12
+ end
13
+ puts state
9
14
  if done
10
15
  puts "done full harvesting"
11
16
  end
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
4
4
 
5
5
  # usage with programmatic schedules addition / modify / remove
6
6
 
7
- f_show = lambda do |name, content, records, done|
8
- # ... do your stuff with records ...
7
+ f_show = lambda do |name, content, records, done, error, state, logger|
8
+ if error.nil?
9
+ # ... do your stuff with records ...
10
+ else
11
+ puts error.message
12
+ end
13
+ puts state
9
14
  if done
10
15
  puts "done full harvesting"
11
16
  end
@@ -213,6 +213,7 @@ module OAISchedules
213
213
 
214
214
 
215
215
  def modify_schedule(name, content)
216
+ handle_schedule_state_at_schedule_change(name, content)
216
217
  @schedules[name][:content] = deep_copy(content)
217
218
  handle_schedule_task(name)
218
219
  end
@@ -222,6 +223,19 @@ module OAISchedules
222
223
 
223
224
  private
224
225
 
226
+
227
+ def handle_schedule_state_at_schedule_change(name, content)
228
+ # invalidate resumption token if either schedule format or set changes
229
+ state = @schedules[name][:state]
230
+ if (content["format"] != @schedules[name][:content]["format"]) \
231
+ || (content["set"] != @schedules[name][:content]["set"])
232
+ state["resumption_token"] = nil
233
+ state["count_success"] = 0
234
+ state["count_fails"] = 0
235
+ end
236
+ end
237
+
238
+
225
239
  def deep_copy(o)
226
240
  Marshal.load(Marshal.dump(o))
227
241
  end
@@ -326,8 +340,11 @@ module OAISchedules
326
340
  when StateHarvesting::IDLE
327
341
  state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
328
342
  when StateHarvesting::HARVESTING
329
- unless state.has_key?("count_partial_harversting")
330
- state["count_partial_harversting"] = 0
343
+ unless state.has_key?("count_success")
344
+ state["count_success"] = 0
345
+ end
346
+ unless state.has_key?("count_fails")
347
+ state["count_fails"] = 0
331
348
  end
332
349
  if !state["resumption_token"].nil?
333
350
  format = ""
@@ -341,30 +358,39 @@ module OAISchedules
341
358
  to = content["until"] || ""
342
359
  set = content["set"] || ""
343
360
  resumption_token = ""
344
- state["count_partial_harversting"] = 0
345
361
  end
346
- data = oai_get_records(
347
- name,
348
- content["repository"]["uri"],
349
- format,
350
- from,
351
- to,
352
- set,
353
- resumption_token
354
- )
355
- state["resumption_token"] = data["resumptionToken"]
356
- state["count_partial_harversting"] += 1
362
+ data = nil
363
+ error = nil
364
+ begin
365
+ data = oai_get_records(
366
+ name,
367
+ content["repository"]["uri"],
368
+ format,
369
+ from,
370
+ to,
371
+ set,
372
+ resumption_token
373
+ )
374
+ state["resumption_token"] = data["resumptionToken"]
375
+ state["count_success"] += 1
376
+ rescue StandardError => e
377
+ state["count_fails"] += 1
378
+ error = e
379
+ end
357
380
  path_file_state = get_path_state_file_from_schedule_name(name)
358
381
  @logger.info("#{name}: writing to state file #{path_file_state}")
359
382
  write_state_file(path_file_state, state)
360
- if !data["resumptionToken"].nil?
361
- state_machine.add_event(EventHarvesting::DONE_HARVEST)
362
- done = false
363
- else
364
- state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
365
- done = true
383
+ done = false
384
+ if error.nil?
385
+ if !data["resumptionToken"].nil?
386
+ state_machine.add_event(EventHarvesting::DONE_HARVEST)
387
+ done = false
388
+ else
389
+ state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
390
+ done = true
391
+ end
366
392
  end
367
- @f_digest&.call(name, content, data, done)
393
+ @f_digest&.call(name, content, data, done, error, state, @logger)
368
394
  break
369
395
  when StateHarvesting::COMPLETE
370
396
  @logger.warn("#{name}: full harvesting complete")
@@ -416,7 +442,7 @@ module OAISchedules
416
442
  @logger.info("#{name}: fetching from #{url_query}")
417
443
  data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
418
444
  if data.empty?
419
- raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}"
445
+ raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}, or missing query parameters (e.g. set)"
420
446
  end
421
447
  data = data[0]
422
448
  rescue DataCollector::InputError => e
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module OaiSchedules
4
- VERSION = "0.3.0"
4
+ VERSION = "0.5.0"
5
5
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oai_schedules
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Davide Monari
8
8
  bindir: exe
9
9
  cert_chain: []
10
- date: 2025-03-26 00:00:00.000000000 Z
10
+ date: 2025-04-29 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: concurrent-ruby
@@ -78,6 +78,7 @@ files:
78
78
  - LICENSE.txt
79
79
  - README.md
80
80
  - Rakefile
81
+ - examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
81
82
  - examples/dir_schedules/schedule_sample.json
82
83
  - examples/dir_state/.gitkeep
83
84
  - examples/example_01.rb