oai_schedules 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +10 -4
- data/examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json +63 -0
- data/examples/example_01.rb +7 -2
- data/examples/example_02.rb +7 -2
- data/lib/oai_schedules/manager.rb +48 -22
- data/lib/oai_schedules/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3f254ace930dec7e151a5af485130dd23623548d6f9d0860a764b90ce217c4bb
|
4
|
+
data.tar.gz: bfe93c7f19a894976c53f3d2e3d720e048c77c116b92283034ba7b17d4efe386
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bbd9df7881c619f2660dd6b622adf2fd7ec6fa35cde400ff3490b1001b3635c7d89605882e53c9b242cc992de6d0153a7ec5d25d5b41e0672853b997924900a0
|
7
|
+
data.tar.gz: 918414787e68f89b8dab0e692096e708864ca7b32a81d1341c0cc09e72fbb70d2cd886a137f024e5946b6176b7d4b1b20d606ec12d33c81894c65588ca9c712e
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
## [0.4.0] - 2025-03-28
|
2
|
+
|
3
|
+
- Added logger to digestion function
|
4
|
+
- Invalidated resumption token when either schedule set or format changes
|
5
|
+
- Extended error message on OAI URL fetch fail
|
6
|
+
|
1
7
|
## [0.3.0] - 2025-03-26
|
2
8
|
|
3
9
|
- Added schedule name and content to digestion function
|
data/README.md
CHANGED
@@ -22,8 +22,13 @@ gem install oai_schedules
|
|
22
22
|
```ruby
|
23
23
|
require 'oai_schedules/manager'
|
24
24
|
|
25
|
-
f_show = lambda do |name, content, records, done|
|
26
|
-
|
25
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
26
|
+
if error.nil?
|
27
|
+
# ... do your stuff with records ...
|
28
|
+
else
|
29
|
+
puts error.message
|
30
|
+
end
|
31
|
+
puts state
|
27
32
|
if done
|
28
33
|
puts "done full harvesting"
|
29
34
|
end
|
@@ -58,8 +63,9 @@ At every iteration, every 2 seconds, it will get the *partial* records by using
|
|
58
63
|
The code will do this by querying `https://eudml.org/oai/OAIHandler?verb=ListRecords&...` and adding the necessary
|
59
64
|
query parameters.
|
60
65
|
The custom function provided as `f_digest` will then be called at each iteration.
|
61
|
-
This will be provided schedule `name` and `content`, the partial list of `records` as a hash,
|
62
|
-
|
66
|
+
This will be provided schedule `name` and `content`, the partial list of `records` as a hash,
|
67
|
+
a `done` flag (full harvesting complete), an `error` exception if one occurred, the harvesting `state`,
|
68
|
+
and the same logger used internally by the schedules manager.
|
63
69
|
and it will write the new one to the state file, until no token is provided (end of the harvesting).
|
64
70
|
As soon as the schedule is added, it is executed.
|
65
71
|
It is possible to add all schedules in advance, then call `sleep` for infinite event loop.
|
@@ -0,0 +1,63 @@
|
|
1
|
+
{
|
2
|
+
"interval": "P1W",
|
3
|
+
"repository": {
|
4
|
+
"uri": "https://eudml.org/oai/OAIHandler",
|
5
|
+
"repository_name": "REPOX Repository",
|
6
|
+
"protocol_version": "2.0",
|
7
|
+
"admin_email": [
|
8
|
+
"mailto:gilberto.pedrosa@ist.utl.pt"
|
9
|
+
],
|
10
|
+
"earliest_datestamp": "1970-01-01T00:00:00+00:00",
|
11
|
+
"deleted_records": "persistent",
|
12
|
+
"granularity": "YYYY-MM-DD",
|
13
|
+
"metadata_format": [
|
14
|
+
"oai_dc",
|
15
|
+
"ese",
|
16
|
+
"eudml-article2",
|
17
|
+
"eudml-book2"
|
18
|
+
],
|
19
|
+
"set": [
|
20
|
+
"BDIM",
|
21
|
+
"BulDML",
|
22
|
+
"CEDRAM",
|
23
|
+
"DMLE",
|
24
|
+
"DML_CZ_Monograph",
|
25
|
+
"DML_CZ_Proceeding",
|
26
|
+
"DML_CZ_Serial",
|
27
|
+
"EDPS",
|
28
|
+
"ELibM",
|
29
|
+
"GALLICA",
|
30
|
+
"GDZ_Band",
|
31
|
+
"GDZ_Mathematica",
|
32
|
+
"GDZ_Monographs",
|
33
|
+
"GDZ_RusDML",
|
34
|
+
"HDML_Books",
|
35
|
+
"HDML_Conferences",
|
36
|
+
"HDML_Journals",
|
37
|
+
"MISANU",
|
38
|
+
"NUMDAM",
|
39
|
+
"NUMDAM_book",
|
40
|
+
"PLDML",
|
41
|
+
"PLDML_book",
|
42
|
+
"PMath"
|
43
|
+
],
|
44
|
+
"id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
|
45
|
+
},
|
46
|
+
"active": true,
|
47
|
+
"transformer": {
|
48
|
+
"transformer_name": [
|
49
|
+
"dummy_transformer"
|
50
|
+
],
|
51
|
+
"type": {
|
52
|
+
"id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
|
53
|
+
"value": "dummy_type"
|
54
|
+
},
|
55
|
+
"uri": "http://dummy-uri.org/",
|
56
|
+
"id": "7889ce03-28d9-479d-bb9a-b239f179453a"
|
57
|
+
},
|
58
|
+
"format": "oai_dc",
|
59
|
+
"set": "CEDRAM",
|
60
|
+
"from": "1970-01-01T00:00:00+00:00",
|
61
|
+
"until": "1999-01-12T09:43:02+00:00",
|
62
|
+
"id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
|
63
|
+
}
|
data/examples/example_01.rb
CHANGED
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
|
|
4
4
|
|
5
5
|
# usage with folder listener
|
6
6
|
|
7
|
-
f_show = lambda do |name, content, records, done|
|
8
|
-
|
7
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
8
|
+
if error.nil?
|
9
|
+
# ... do your stuff with records ...
|
10
|
+
else
|
11
|
+
puts error.message
|
12
|
+
end
|
13
|
+
puts state
|
9
14
|
if done
|
10
15
|
puts "done full harvesting"
|
11
16
|
end
|
data/examples/example_02.rb
CHANGED
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
|
|
4
4
|
|
5
5
|
# usage with programmatic schedules addition / modify / remove
|
6
6
|
|
7
|
-
f_show = lambda do |name, content, records, done|
|
8
|
-
|
7
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
8
|
+
if error.nil?
|
9
|
+
# ... do your stuff with records ...
|
10
|
+
else
|
11
|
+
puts error.message
|
12
|
+
end
|
13
|
+
puts state
|
9
14
|
if done
|
10
15
|
puts "done full harvesting"
|
11
16
|
end
|
@@ -213,6 +213,7 @@ module OAISchedules
|
|
213
213
|
|
214
214
|
|
215
215
|
def modify_schedule(name, content)
|
216
|
+
handle_schedule_state_at_schedule_change(name, content)
|
216
217
|
@schedules[name][:content] = deep_copy(content)
|
217
218
|
handle_schedule_task(name)
|
218
219
|
end
|
@@ -222,6 +223,19 @@ module OAISchedules
|
|
222
223
|
|
223
224
|
private
|
224
225
|
|
226
|
+
|
227
|
+
def handle_schedule_state_at_schedule_change(name, content)
|
228
|
+
# invalidate resumption token if either schedule format or set changes
|
229
|
+
state = @schedules[name][:state]
|
230
|
+
if (content["format"] != @schedules[name][:content]["format"]) \
|
231
|
+
|| (content["set"] != @schedules[name][:content]["set"])
|
232
|
+
state["resumption_token"] = nil
|
233
|
+
state["count_success"] = 0
|
234
|
+
state["count_fails"] = 0
|
235
|
+
end
|
236
|
+
end
|
237
|
+
|
238
|
+
|
225
239
|
def deep_copy(o)
|
226
240
|
Marshal.load(Marshal.dump(o))
|
227
241
|
end
|
@@ -326,8 +340,11 @@ module OAISchedules
|
|
326
340
|
when StateHarvesting::IDLE
|
327
341
|
state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
|
328
342
|
when StateHarvesting::HARVESTING
|
329
|
-
unless state.has_key?("count_partial_harversting")
|
330
|
-
state["count_partial_harversting"] = 0
|
343
|
+
unless state.has_key?("count_success")
|
344
|
+
state["count_success"] = 0
|
345
|
+
end
|
346
|
+
unless state.has_key?("count_fails")
|
347
|
+
state["count_fails"] = 0
|
331
348
|
end
|
332
349
|
if !state["resumption_token"].nil?
|
333
350
|
format = ""
|
@@ -341,30 +358,39 @@ module OAISchedules
|
|
341
358
|
to = content["until"] || ""
|
342
359
|
set = content["set"] || ""
|
343
360
|
resumption_token = ""
|
344
|
-
state["count_partial_harversting"] = 0
|
345
361
|
end
|
346
|
-
data =
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
362
|
+
data = nil
|
363
|
+
error = nil
|
364
|
+
begin
|
365
|
+
data = oai_get_records(
|
366
|
+
name,
|
367
|
+
content["repository"]["uri"],
|
368
|
+
format,
|
369
|
+
from,
|
370
|
+
to,
|
371
|
+
set,
|
372
|
+
resumption_token
|
373
|
+
)
|
374
|
+
state["resumption_token"] = data["resumptionToken"]
|
375
|
+
state["count_success"] += 1
|
376
|
+
rescue StandardError => e
|
377
|
+
state["count_fails"] += 1
|
378
|
+
error = e
|
379
|
+
end
|
357
380
|
path_file_state = get_path_state_file_from_schedule_name(name)
|
358
381
|
@logger.info("#{name}: writing to state file #{path_file_state}")
|
359
382
|
write_state_file(path_file_state, state)
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
383
|
+
done = false
|
384
|
+
if error.nil?
|
385
|
+
if !data["resumptionToken"].nil?
|
386
|
+
state_machine.add_event(EventHarvesting::DONE_HARVEST)
|
387
|
+
done = false
|
388
|
+
else
|
389
|
+
state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
|
390
|
+
done = true
|
391
|
+
end
|
366
392
|
end
|
367
|
-
@f_digest&.call(name, content, data, done)
|
393
|
+
@f_digest&.call(name, content, data, done, error, state, @logger)
|
368
394
|
break
|
369
395
|
when StateHarvesting::COMPLETE
|
370
396
|
@logger.warn("#{name}: full harvesting complete")
|
@@ -416,7 +442,7 @@ module OAISchedules
|
|
416
442
|
@logger.info("#{name}: fetching from #{url_query}")
|
417
443
|
data = DataCollector::Core.filter(DataCollector::Input.new.from_uri(url_query), "$..#{verb}")
|
418
444
|
if data.empty?
|
419
|
-
raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}"
|
445
|
+
raise StandardError, "#{name}: GET #{url_query}: URL not available, or response has no element #{verb}, or missing query parameters (e.g. set)"
|
420
446
|
end
|
421
447
|
data = data[0]
|
422
448
|
rescue DataCollector::InputError => e
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oai_schedules
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Davide Monari
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-
|
10
|
+
date: 2025-04-29 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: concurrent-ruby
|
@@ -78,6 +78,7 @@ files:
|
|
78
78
|
- LICENSE.txt
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
|
+
- examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
|
81
82
|
- examples/dir_schedules/schedule_sample.json
|
82
83
|
- examples/dir_state/.gitkeep
|
83
84
|
- examples/example_01.rb
|