oai_schedules 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d2b2bae058157785c775ebd1b8c0ea76e8685adb8588280f6c6427644b9da8b5
|
4
|
+
data.tar.gz: 928102f14abd5bf59bb7b55bb7e100074c5b20bec3276e1dd129a04e74c6da60
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 35bb16497f382eb1accaecd95e1e2b022f3addeeef7a4ae79ab5bc4b9f6b023f3fbee70535815ee0edcf5e48b62fc4bdb697cba378cdbac16f114420394cf85e
|
7
|
+
data.tar.gz: 3811fd28be558414ebfabcc83e1978080578ff4b33147f0da06cf569ab6b42af498489cf7bef3dd5f14b6e0dbb8598dc615d15220c7496c710f93fcea0149a3e
|
data/README.md
CHANGED
@@ -22,8 +22,13 @@ gem install oai_schedules
|
|
22
22
|
```ruby
|
23
23
|
require 'oai_schedules/manager'
|
24
24
|
|
25
|
-
f_show = lambda do |name, content, records, done, logger|
|
26
|
-
|
25
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
26
|
+
if error.nil?
|
27
|
+
# ... do your stuff with records ...
|
28
|
+
else
|
29
|
+
puts error.message
|
30
|
+
end
|
31
|
+
puts state
|
27
32
|
if done
|
28
33
|
puts "done full harvesting"
|
29
34
|
end
|
@@ -59,7 +64,8 @@ The code will do this by querying `https://eudml.org/oai/OAIHandler?verb=ListRec
|
|
59
64
|
query parameters.
|
60
65
|
The custom function provided as `f_digest` will then be called at each iteration.
|
61
66
|
This will be provided schedule `name` and `content`, the partial list of `records` as a hash,
|
62
|
-
a `done` flag (full harvesting complete),
|
67
|
+
a `done` flag (full harvesting complete), an `error` exception if one occurred, the harvesting `state`,
|
68
|
+
and the same logger used internally by the schedules manager.
|
63
69
|
and it will write the new one to the state file, until no token is provided (end of the harvesting).
|
64
70
|
As soon as the schedule is added, it is executed.
|
65
71
|
It is possible to add all schedules in advance, then call `sleep` for infinite event loop.
|
@@ -0,0 +1,63 @@
|
|
1
|
+
{
|
2
|
+
"interval": "P1W",
|
3
|
+
"repository": {
|
4
|
+
"uri": "https://eudml.org/oai/OAIHandler",
|
5
|
+
"repository_name": "REPOX Repository",
|
6
|
+
"protocol_version": "2.0",
|
7
|
+
"admin_email": [
|
8
|
+
"mailto:gilberto.pedrosa@ist.utl.pt"
|
9
|
+
],
|
10
|
+
"earliest_datestamp": "1970-01-01T00:00:00+00:00",
|
11
|
+
"deleted_records": "persistent",
|
12
|
+
"granularity": "YYYY-MM-DD",
|
13
|
+
"metadata_format": [
|
14
|
+
"oai_dc",
|
15
|
+
"ese",
|
16
|
+
"eudml-article2",
|
17
|
+
"eudml-book2"
|
18
|
+
],
|
19
|
+
"set": [
|
20
|
+
"BDIM",
|
21
|
+
"BulDML",
|
22
|
+
"CEDRAM",
|
23
|
+
"DMLE",
|
24
|
+
"DML_CZ_Monograph",
|
25
|
+
"DML_CZ_Proceeding",
|
26
|
+
"DML_CZ_Serial",
|
27
|
+
"EDPS",
|
28
|
+
"ELibM",
|
29
|
+
"GALLICA",
|
30
|
+
"GDZ_Band",
|
31
|
+
"GDZ_Mathematica",
|
32
|
+
"GDZ_Monographs",
|
33
|
+
"GDZ_RusDML",
|
34
|
+
"HDML_Books",
|
35
|
+
"HDML_Conferences",
|
36
|
+
"HDML_Journals",
|
37
|
+
"MISANU",
|
38
|
+
"NUMDAM",
|
39
|
+
"NUMDAM_book",
|
40
|
+
"PLDML",
|
41
|
+
"PLDML_book",
|
42
|
+
"PMath"
|
43
|
+
],
|
44
|
+
"id": "02df523af427deb93b7cb4600ca347f9297d0e31d51c2783c634459dac457bd0"
|
45
|
+
},
|
46
|
+
"active": true,
|
47
|
+
"transformer": {
|
48
|
+
"transformer_name": [
|
49
|
+
"dummy_transformer"
|
50
|
+
],
|
51
|
+
"type": {
|
52
|
+
"id": "b1671aad-e825-4b5a-b50b-d8591b425e2a",
|
53
|
+
"value": "dummy_type"
|
54
|
+
},
|
55
|
+
"uri": "http://dummy-uri.org/",
|
56
|
+
"id": "7889ce03-28d9-479d-bb9a-b239f179453a"
|
57
|
+
},
|
58
|
+
"format": "oai_dc",
|
59
|
+
"set": "CEDRAM",
|
60
|
+
"from": "1970-01-01T00:00:00+00:00",
|
61
|
+
"until": "1999-01-12T09:43:02+00:00",
|
62
|
+
"id": "dc34623d-2ae2-4e90-91d8-26f4ba29a056"
|
63
|
+
}
|
data/examples/example_01.rb
CHANGED
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
|
|
4
4
|
|
5
5
|
# usage with folder listener
|
6
6
|
|
7
|
-
f_show = lambda do |name, content, records, done, logger|
|
8
|
-
|
7
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
8
|
+
if error.nil?
|
9
|
+
# ... do your stuff with records ...
|
10
|
+
else
|
11
|
+
puts error.message
|
12
|
+
end
|
13
|
+
puts state
|
9
14
|
if done
|
10
15
|
puts "done full harvesting"
|
11
16
|
end
|
data/examples/example_02.rb
CHANGED
@@ -4,8 +4,13 @@ require 'oai_schedules/manager'
|
|
4
4
|
|
5
5
|
# usage with programmatic schedules addition / modify / remove
|
6
6
|
|
7
|
-
f_show = lambda do |name, content, records, done, logger|
|
8
|
-
|
7
|
+
f_show = lambda do |name, content, records, done, error, state, logger|
|
8
|
+
if error.nil?
|
9
|
+
# ... do your stuff with records ...
|
10
|
+
else
|
11
|
+
puts error.message
|
12
|
+
end
|
13
|
+
puts state
|
9
14
|
if done
|
10
15
|
puts "done full harvesting"
|
11
16
|
end
|
@@ -181,6 +181,9 @@ module OAISchedules
|
|
181
181
|
state = {}
|
182
182
|
if File.file?(path_file_state)
|
183
183
|
state = read_state_file(path_file_state)
|
184
|
+
else
|
185
|
+
init_schedule_state(state)
|
186
|
+
write_state_file(path_file_state, state)
|
184
187
|
end
|
185
188
|
# create task
|
186
189
|
task = Concurrent::TimerTask.new(run_now: false) {
|
@@ -224,13 +227,22 @@ module OAISchedules
|
|
224
227
|
private
|
225
228
|
|
226
229
|
|
230
|
+
def init_schedule_state(state)
|
231
|
+
state["resumption_token"] = nil
|
232
|
+
state["count_success"] = 0
|
233
|
+
state["count_fails"] = 0
|
234
|
+
state["done"] = false
|
235
|
+
state["count_harvested_records"] = 0
|
236
|
+
state["latest_harvested_records_datestamp"] = ""
|
237
|
+
end
|
238
|
+
|
239
|
+
|
227
240
|
def handle_schedule_state_at_schedule_change(name, content)
|
228
241
|
# invalidate resumption token if either schedule format or set changes
|
229
242
|
state = @schedules[name][:state]
|
230
243
|
if (content["format"] != @schedules[name][:content]["format"]) \
|
231
244
|
|| (content["set"] != @schedules[name][:content]["set"])
|
232
|
-
state
|
233
|
-
state["count_partial_harversting"] = 0
|
245
|
+
init_schedule_state(state)
|
234
246
|
end
|
235
247
|
end
|
236
248
|
|
@@ -339,8 +351,20 @@ module OAISchedules
|
|
339
351
|
when StateHarvesting::IDLE
|
340
352
|
state_machine.add_event(EventHarvesting::REQUEST_HARVEST)
|
341
353
|
when StateHarvesting::HARVESTING
|
342
|
-
unless state.has_key?("count_partial_harversting")
|
343
|
-
state["count_partial_harversting"] = 0
|
354
|
+
unless state.has_key?("count_success")
|
355
|
+
state["count_success"] = 0
|
356
|
+
end
|
357
|
+
unless state.has_key?("count_fails")
|
358
|
+
state["count_fails"] = 0
|
359
|
+
end
|
360
|
+
unless state.has_key?("done")
|
361
|
+
state["done"] = false
|
362
|
+
end
|
363
|
+
unless state.has_key?("count_harvested_records")
|
364
|
+
state["count_harvested_records"] = 0
|
365
|
+
end
|
366
|
+
unless state.has_key?("latest_harvested_records_datestamp")
|
367
|
+
state["latest_harvested_records_datestamp"] = ""
|
344
368
|
end
|
345
369
|
if !state["resumption_token"].nil?
|
346
370
|
format = ""
|
@@ -354,30 +378,53 @@ module OAISchedules
|
|
354
378
|
to = content["until"] || ""
|
355
379
|
set = content["set"] || ""
|
356
380
|
resumption_token = ""
|
357
|
-
state["count_partial_harversting"] = 0
|
358
381
|
end
|
359
|
-
data =
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
from,
|
364
|
-
to,
|
365
|
-
set,
|
366
|
-
resumption_token
|
367
|
-
)
|
368
|
-
state["resumption_token"] = data["resumptionToken"]
|
369
|
-
state["count_partial_harversting"] += 1
|
370
|
-
path_file_state = get_path_state_file_from_schedule_name(name)
|
371
|
-
@logger.info("#{name}: writing to state file #{path_file_state}")
|
372
|
-
write_state_file(path_file_state, state)
|
373
|
-
if !data["resumptionToken"].nil?
|
374
|
-
state_machine.add_event(EventHarvesting::DONE_HARVEST)
|
375
|
-
done = false
|
376
|
-
else
|
382
|
+
data = nil
|
383
|
+
error = nil
|
384
|
+
done = false
|
385
|
+
if state["done"]
|
377
386
|
state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
|
378
387
|
done = true
|
388
|
+
else
|
389
|
+
begin
|
390
|
+
data = oai_get_records(
|
391
|
+
name,
|
392
|
+
content["repository"]["uri"],
|
393
|
+
format,
|
394
|
+
from,
|
395
|
+
to,
|
396
|
+
set,
|
397
|
+
resumption_token
|
398
|
+
)
|
399
|
+
state["resumption_token"] = data["resumptionToken"]
|
400
|
+
state["count_success"] += 1
|
401
|
+
n_records = data["record"].size
|
402
|
+
state["count_harvested_records"] += n_records
|
403
|
+
if n_records > 0
|
404
|
+
timestamps = data["record"].map do |record|
|
405
|
+
record["header"]["datestamp"]
|
406
|
+
end.sort
|
407
|
+
state["latest_harvested_records_datestamp"] = timestamps[-1]
|
408
|
+
end
|
409
|
+
rescue StandardError => e
|
410
|
+
state["count_fails"] += 1
|
411
|
+
error = e
|
412
|
+
end
|
413
|
+
if error.nil?
|
414
|
+
if !data["resumptionToken"].nil?
|
415
|
+
state_machine.add_event(EventHarvesting::DONE_HARVEST)
|
416
|
+
done = false
|
417
|
+
else
|
418
|
+
state_machine.add_event(EventHarvesting::DONE_FULL_HARVEST)
|
419
|
+
done = true
|
420
|
+
end
|
421
|
+
end
|
422
|
+
state["done"] = done
|
379
423
|
end
|
380
|
-
|
424
|
+
path_file_state = get_path_state_file_from_schedule_name(name)
|
425
|
+
@logger.info("#{name}: writing to state file #{path_file_state}")
|
426
|
+
write_state_file(path_file_state, state)
|
427
|
+
@f_digest&.call(name, content, data, done, error, state, @logger)
|
381
428
|
break
|
382
429
|
when StateHarvesting::COMPLETE
|
383
430
|
@logger.warn("#{name}: full harvesting complete")
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oai_schedules
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Davide Monari
|
8
8
|
bindir: exe
|
9
9
|
cert_chain: []
|
10
|
-
date: 2025-
|
10
|
+
date: 2025-04-29 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
12
|
- !ruby/object:Gem::Dependency
|
13
13
|
name: concurrent-ruby
|
@@ -78,6 +78,7 @@ files:
|
|
78
78
|
- LICENSE.txt
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
|
+
- examples/dir_schedules/schedule_dc34623d-2ae2-4e90-91d8-26f4ba29a056.json
|
81
82
|
- examples/dir_schedules/schedule_sample.json
|
82
83
|
- examples/dir_state/.gitkeep
|
83
84
|
- examples/example_01.rb
|