oai 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +19 -4
- data/Rakefile +7 -0
- data/bin/oai +0 -2
- data/examples/models/file_model.rb +2 -2
- data/lib/oai/client/response.rb +8 -8
- data/lib/oai/client.rb +34 -10
- data/lib/oai/exception.rb +46 -38
- data/lib/oai/harvester/config.rb +1 -1
- data/lib/oai/harvester/harvest.rb +37 -25
- data/lib/oai/harvester/logging.rb +3 -5
- data/lib/oai/harvester.rb +4 -1
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +5 -8
- data/lib/oai/provider/model/activerecord_wrapper.rb +38 -22
- data/lib/oai/provider/model.rb +1 -1
- data/lib/oai/provider/response/list_records.rb +12 -0
- data/lib/oai/provider/response.rb +7 -4
- data/lib/oai/provider/resumption_token.rb +18 -5
- data/test/activerecord_provider/database/0001_oaipmh_tables.rb +7 -1
- data/test/activerecord_provider/helpers/providers.rb +3 -1
- data/test/activerecord_provider/helpers/transactional_test_case.rb +2 -1
- data/test/activerecord_provider/models/dc_field.rb +8 -0
- data/test/activerecord_provider/models/dc_lang.rb +3 -0
- data/test/activerecord_provider/models/exclusive_set_dc_field.rb +6 -0
- data/test/activerecord_provider/tc_activerecord_wrapper.rb +63 -0
- data/test/activerecord_provider/tc_ar_provider.rb +54 -26
- data/test/activerecord_provider/tc_ar_sets_provider.rb +10 -9
- data/test/activerecord_provider/tc_caching_paging_provider.rb +9 -7
- data/test/activerecord_provider/tc_simple_paging_provider.rb +9 -7
- data/test/client/tc_exception.rb +1 -1
- data/test/client/tc_get_record.rb +1 -1
- data/test/client/tc_http_client.rb +2 -2
- data/test/client/tc_libxml.rb +1 -1
- data/test/client/tc_utf8_escaping.rb +8 -1
- data/test/harvester/tc_harvest.rb +42 -0
- data/test/harvester/test_helper_harvester.rb +6 -0
- data/test/provider/models.rb +3 -3
- data/test/provider/tc_functional_tokens.rb +17 -11
- data/test/provider/tc_provider.rb +26 -0
- metadata +27 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0a80559d64aa25add07953e978e93f9abfbc89e03c67bf306e79f0d7b18b0f7
|
4
|
+
data.tar.gz: 7989ff1c6dec95c3965cb3adc1d0bc156d250b6129002be50193443994d03caf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db8b7e74c65625da8b47b35ef23a7e620777dfefb0a6a8d63a6a7314534dbfe46606bf863aef4a839087547280700519f35e3b10bdf10bf70ba93959ae3125e2
|
7
|
+
data.tar.gz: ad14343ae9a61b6516f7e079db4afc0b7eddbb5e70d8d962221842a5fe4f8b49da63d37dc8eeb7cfe9e163617d4d2a3cae93e37e8943b784df7d0dc1eb1a7cce
|
data/README.md
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
ruby-oai
|
2
2
|
========
|
3
|
+
[![Build Status](https://github.com/code4lib/ruby-oai/workflows/CI/badge.svg)](https://github.com/code4lib/ruby-oai/actions)
|
3
4
|
|
4
|
-
[![
|
5
|
-
|
6
|
-
[![Gem Version](https://badge.fury.io/rb/kithe.svg)](https://badge.fury.io/rb/oai)
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/oai.svg)](https://badge.fury.io/rb/oai)
|
7
6
|
|
8
7
|
ruby-oai is a Open Archives Protocol for Metadata Harvesting (OAI-PMH)
|
9
8
|
library for Ruby. [OAI-PMH](http://openarchives.org) is a somewhat
|
@@ -47,6 +46,22 @@ For example to initiate a ListRecords request to pubmed you can:
|
|
47
46
|
end
|
48
47
|
```
|
49
48
|
|
49
|
+
### Retry-After
|
50
|
+
This library depends on faraday, but allows a wide range of versions. Depending on the client application's installed version of faraday, there may be different middleware libraries required to support automatically retrying requests that are rate limited/denied with a `Retry-After` header. The OAI client can, however, accept an externally configured faraday http client for handling this. For example, to retry on `429 Too Many Requests`:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
require 'oai'
|
54
|
+
require 'faraday_middleware' # if using faraday version < 2
|
55
|
+
http_client = Faraday.new do |conn|
|
56
|
+
conn.request(:retry, max: 5, retry_statuses: 429)
|
57
|
+
conn.response(:follow_redirects, limit: 5)
|
58
|
+
conn.adapter :net_http
|
59
|
+
end
|
60
|
+
client = OAI::Client.new(base_url, http: http_client)
|
61
|
+
opts = {from:'2012-03-01', until:'2012-04-01', metadata_prefix:'oai_dc'}
|
62
|
+
puts client.list_records(opts).full.count
|
63
|
+
```
|
64
|
+
|
50
65
|
See {OAI::Client} for more details
|
51
66
|
|
52
67
|
Server
|
@@ -97,7 +112,7 @@ There are also convenience tasks to run subsets of tests.
|
|
97
112
|
We use [appraisal](https://github.com/thoughtbot/appraisal) to test ActiveRecord-related functionality under multiple versions of ActiveRecord. While the above commands will test with latest ActiveRecord (allowed in our .gemspec development dependency), you can test under a particular version defined in the [Appraisals](./Appraisals) file like so:
|
98
113
|
|
99
114
|
$ bundle exec appraisal rails-52 rake test
|
100
|
-
$ bundle exec appraisal rails-
|
115
|
+
$ bundle exec appraisal rails-70 rake test
|
101
116
|
|
102
117
|
If you run into trouble with appraisal's gemfiles getting out of date and bundler complaining,
|
103
118
|
try:
|
data/Rakefile
CHANGED
@@ -34,6 +34,13 @@ namespace :test do
|
|
34
34
|
t.warning = false
|
35
35
|
end
|
36
36
|
|
37
|
+
Rake::TestTask.new('harvester') do |t|
|
38
|
+
t.libs << ['lib', 'test/harvester']
|
39
|
+
t.pattern = 'test/harvester/tc_*.rb'
|
40
|
+
#t.verbose = true
|
41
|
+
t.warning = false
|
42
|
+
end
|
43
|
+
|
37
44
|
Rake::TestTask.new('provider') do |t|
|
38
45
|
t.libs << ['lib', 'test/provider']
|
39
46
|
t.pattern = 'test/provider/tc_*.rb'
|
data/bin/oai
CHANGED
data/examples/models/file_model.rb
CHANGED
@@ -45,8 +45,8 @@ class FileModel < OAI::Provider::Model
|
|
45
45
|
case selector
|
46
46
|
when :all
|
47
47
|
records = Dir["#{@directory}/*.xml"].sort.collect do |file|
|
48
|
-
File.new(file) unless File.stat(file).mtime.utc < opts[:from] or
|
49
|
-
File.stat(file).mtime.utc > opts[:until]
|
48
|
+
File.new(file) unless File.stat(file).mtime.utc < opts[:from].to_time or
|
49
|
+
File.stat(file).mtime.utc > opts[:until].to_time
|
50
50
|
end
|
51
51
|
records
|
52
52
|
else
|
data/lib/oai/client/response.rb
CHANGED
@@ -37,16 +37,16 @@ module OAI
|
|
37
37
|
message = error.content
|
38
38
|
code = ""
|
39
39
|
if defined?(error.property) == nil
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
40
|
+
code = error.attributes['code']
|
41
|
+
else
|
42
|
+
begin
|
43
|
+
code = error["code"]
|
44
|
+
rescue
|
45
|
+
code = error.property('code')
|
47
46
|
end
|
47
|
+
end
|
48
48
|
end
|
49
|
-
raise OAI::Exception.
|
49
|
+
raise OAI::Exception.for(message: message, code: code)
|
50
50
|
end
|
51
51
|
|
52
52
|
end
|
data/lib/oai/client.rb
CHANGED
@@ -54,7 +54,7 @@ module OAI
|
|
54
54
|
# <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
|
55
55
|
|
56
56
|
class Client
|
57
|
-
|
57
|
+
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#\d+);)/
|
58
58
|
# The constructor which must be passed a valid base url for an oai
|
59
59
|
# service:
|
60
60
|
#
|
@@ -95,7 +95,8 @@ module OAI
|
|
95
95
|
follow_redirects = 5 if follow_redirects == true
|
96
96
|
|
97
97
|
if follow_redirects
|
98
|
-
require '
|
98
|
+
require 'faraday/follow_redirects'
|
99
|
+
builder.use Faraday::FollowRedirects::Middleware
|
99
100
|
builder.response :follow_redirects, :limit => follow_redirects.to_i
|
100
101
|
end
|
101
102
|
builder.adapter :net_http
|
@@ -197,12 +198,9 @@ module OAI
|
|
197
198
|
do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
|
198
199
|
end
|
199
200
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
# fire off the request and return appropriate DOM object
|
204
|
-
uri = build_uri(verb, opts)
|
205
|
-
xml = strip_invalid_utf_8_chars(get(uri))
|
201
|
+
def sanitize_xml(xml)
|
202
|
+
xml = strip_invalid_utf_8_chars(xml)
|
203
|
+
xml = strip_invalid_xml_chars(xml)
|
206
204
|
if @parser == 'libxml'
|
207
205
|
# remove default namespace for oai-pmh since libxml
|
208
206
|
# isn't able to use our xpaths to get at them
|
@@ -210,7 +208,15 @@ module OAI
|
|
210
208
|
xml = xml.gsub(
|
211
209
|
/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
|
212
210
|
end
|
213
|
-
|
211
|
+
xml
|
212
|
+
end
|
213
|
+
|
214
|
+
private
|
215
|
+
|
216
|
+
def do_request(verb, opts = nil)
|
217
|
+
# fire off the request and return appropriate DOM object
|
218
|
+
uri = build_uri(verb, opts)
|
219
|
+
return load_document(get(uri))
|
214
220
|
end
|
215
221
|
|
216
222
|
def do_resumable(responseClass, verb, opts)
|
@@ -240,6 +246,7 @@ module OAI
|
|
240
246
|
end
|
241
247
|
|
242
248
|
def load_document(xml)
|
249
|
+
xml = sanitize_xml(xml)
|
243
250
|
case @parser
|
244
251
|
when 'libxml'
|
245
252
|
begin
|
@@ -330,7 +337,16 @@ module OAI
|
|
330
337
|
# Regex is from WebCollab:
|
331
338
|
# http://webcollab.sourceforge.net/unicode.html
|
332
339
|
def strip_invalid_utf_8_chars(xml)
|
333
|
-
|
340
|
+
return nil unless xml
|
341
|
+
|
342
|
+
# If it's in a specific encoding other than BINARY, it may trigger
|
343
|
+
# an exception to try to gsub these illegal bytes. Temporarily
|
344
|
+
# put it in BINARY. NOTE: We're not totally sure what's going on
|
345
|
+
# with encodings in this gem in general, it might not be totally reasonable.
|
346
|
+
orig_encoding = xml.encoding
|
347
|
+
xml.force_encoding("BINARY")
|
348
|
+
|
349
|
+
xml = xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
|
334
350
|
| [\x00-\x7F][\x80-\xBF]+
|
335
351
|
| ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
|
336
352
|
| [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
|
@@ -338,7 +354,15 @@ module OAI
|
|
338
354
|
| (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')\
|
339
355
|
.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
|
340
356
|
| \xED[\xA0-\xBF][\x80-\xBF]/,'?')
|
357
|
+
|
358
|
+
xml.force_encoding(orig_encoding)
|
359
|
+
|
360
|
+
xml
|
341
361
|
end
|
342
362
|
|
363
|
+
def strip_invalid_xml_chars(xml)
|
364
|
+
return xml unless xml =~ UNESCAPED_AMPERSAND
|
365
|
+
xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
|
366
|
+
end
|
343
367
|
end
|
344
368
|
end
|
data/lib/oai/exception.rb
CHANGED
@@ -4,72 +4,80 @@ module OAI
|
|
4
4
|
# messages will be wrapped in an XML response to the client.
|
5
5
|
|
6
6
|
class Exception < RuntimeError
|
7
|
+
CODE = nil
|
8
|
+
MESSAGE = nil
|
9
|
+
|
7
10
|
attr_reader :code
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
+
@@codes = {}
|
13
|
+
|
14
|
+
def self.register_exception_code(code, exception_class)
|
15
|
+
@@codes[code] = exception_class if exception_class.superclass == OAI::Exception
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.for(message: nil, code: nil)
|
19
|
+
@@codes.fetch(code, Exception).new(message)
|
20
|
+
end
|
21
|
+
|
22
|
+
def initialize(message = nil, code = nil)
|
23
|
+
super(message || self.class::MESSAGE)
|
24
|
+
@code = code || self.class::CODE
|
12
25
|
end
|
13
26
|
end
|
14
27
|
|
15
28
|
class ArgumentException < Exception
|
16
|
-
|
17
|
-
|
29
|
+
CODE = 'badArgument'
|
30
|
+
MESSAGE = 'The request includes ' \
|
18
31
|
'illegal arguments, is missing required arguments, includes a ' \
|
19
|
-
'repeated argument, or values for arguments have an illegal syntax.'
|
20
|
-
|
21
|
-
end
|
32
|
+
'repeated argument, or values for arguments have an illegal syntax.'
|
33
|
+
register_exception_code(CODE, self)
|
22
34
|
end
|
23
35
|
|
24
36
|
class VerbException < Exception
|
25
|
-
|
26
|
-
|
27
|
-
'verb, the verb argument is missing, or the verb argument is repeated.'
|
28
|
-
|
29
|
-
end
|
37
|
+
CODE = 'badVerb'
|
38
|
+
MESSAGE = 'Value of the verb argument is not a legal OAI-PMH '\
|
39
|
+
'verb, the verb argument is missing, or the verb argument is repeated.'
|
40
|
+
register_exception_code(CODE, self)
|
30
41
|
end
|
31
42
|
|
32
43
|
class FormatException < Exception
|
33
|
-
|
34
|
-
|
44
|
+
CODE = 'cannotDisseminateFormat'
|
45
|
+
MESSAGE = 'The metadata format identified by '\
|
35
46
|
'the value given for the metadataPrefix argument is not supported '\
|
36
|
-
'by the item or by the repository.'
|
37
|
-
|
47
|
+
'by the item or by the repository.'
|
48
|
+
register_exception_code(CODE, self)
|
38
49
|
end
|
39
50
|
|
40
51
|
class IdException < Exception
|
41
|
-
|
42
|
-
|
43
|
-
'unknown or illegal in this repository.'
|
44
|
-
|
52
|
+
CODE = 'idDoesNotExist'
|
53
|
+
MESSAGE = 'The value of the identifier argument is '\
|
54
|
+
'unknown or illegal in this repository.'
|
55
|
+
register_exception_code(CODE, self)
|
45
56
|
end
|
46
57
|
|
47
58
|
class NoMatchException < Exception
|
48
|
-
|
49
|
-
|
50
|
-
'until, set and metadataPrefix arguments results in an empty list.'
|
51
|
-
|
52
|
-
end
|
59
|
+
CODE = 'noRecordsMatch'
|
60
|
+
MESSAGE = 'The combination of the values of the from, '\
|
61
|
+
'until, set and metadataPrefix arguments results in an empty list.'
|
62
|
+
register_exception_code(CODE, self)
|
53
63
|
end
|
54
64
|
|
55
65
|
class MetadataFormatException < Exception
|
56
|
-
|
57
|
-
|
58
|
-
'for the specified item.'
|
59
|
-
|
66
|
+
CODE = 'noMetadataFormats'
|
67
|
+
MESSAGE = 'There are no metadata formats available '\
|
68
|
+
'for the specified item.'
|
69
|
+
register_exception_code(CODE, self)
|
60
70
|
end
|
61
71
|
|
62
72
|
class SetException < Exception
|
63
|
-
|
64
|
-
|
65
|
-
|
73
|
+
CODE = 'noSetHierarchy'
|
74
|
+
MESSAGE = 'This repository does not support sets.'
|
75
|
+
register_exception_code(CODE, self)
|
66
76
|
end
|
67
77
|
|
68
78
|
class ResumptionTokenException < Exception
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
end
|
79
|
+
CODE = 'badResumptionToken'
|
80
|
+
MESSAGE = 'The value of the resumptionToken argument is invalid or expired.'
|
81
|
+
register_exception_code(CODE, self)
|
73
82
|
end
|
74
|
-
|
75
83
|
end
|
data/lib/oai/harvester/config.rb
CHANGED
data/lib/oai/harvester/harvest.rb
CHANGED
@@ -3,14 +3,16 @@
|
|
3
3
|
|
4
4
|
module OAI
|
5
5
|
module Harvester
|
6
|
-
|
7
6
|
class Harvest
|
7
|
+
DIRECTORY_LAYOUT = "%Y/%m".freeze
|
8
8
|
|
9
|
-
def initialize(config = nil, directory = nil, date = nil)
|
9
|
+
def initialize(config = nil, directory = nil, date = nil, to = nil)
|
10
10
|
@config = config || Config.load
|
11
11
|
@directory = directory || @config.storage
|
12
12
|
@from = date
|
13
13
|
@from.freeze
|
14
|
+
@until = to
|
15
|
+
@until.freeze
|
14
16
|
@parser = defined?(XML::Document) ? 'libxml' : 'rexml'
|
15
17
|
end
|
16
18
|
|
@@ -30,9 +32,13 @@ module OAI
|
|
30
32
|
|
31
33
|
def harvest(site)
|
32
34
|
opts = build_options_hash(@config.sites[site])
|
33
|
-
|
35
|
+
if @until
|
36
|
+
harvest_time = @until.to_time.utc
|
37
|
+
else
|
38
|
+
harvest_time = Time.now.utc
|
39
|
+
end
|
34
40
|
|
35
|
-
if
|
41
|
+
if OAI::Const::Granularity::LOW == granularity(opts[:url])
|
36
42
|
opts[:until] = harvest_time.strftime("%Y-%m-%d")
|
37
43
|
opts[:from] = @from.strftime("%Y-%m-%d") if @from
|
38
44
|
else
|
@@ -43,22 +49,27 @@ module OAI
|
|
43
49
|
# Allow a from date to be passed in
|
44
50
|
opts[:from] = earliest(opts[:url]) unless opts[:from]
|
45
51
|
opts.delete(:set) if 'all' == opts[:set]
|
46
|
-
|
47
52
|
begin
|
48
53
|
# Connect, and download
|
49
54
|
file, records = call(opts.delete(:url), opts)
|
50
55
|
|
51
|
-
# Move document to storage directory
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
56
|
+
# Move document to storage directory if configured
|
57
|
+
if @directory
|
58
|
+
directory_layout = @config.layouts[site] if @config.layouts
|
59
|
+
dir = File.join(@directory, date_based_directory(harvest_time, directory_layout))
|
60
|
+
FileUtils.mkdir_p dir
|
61
|
+
FileUtils.mv(file.path,
|
62
|
+
File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
|
63
|
+
harvest_time)}.xml.gz"))
|
64
|
+
else
|
65
|
+
puts "no configured destination for temp file" if @interactive
|
66
|
+
end
|
57
67
|
@config.sites[site]['last'] = harvest_time
|
58
|
-
rescue
|
59
|
-
|
60
|
-
|
61
|
-
|
68
|
+
rescue OAI::NoMatchException
|
69
|
+
puts "No new records available" if @interactive
|
70
|
+
rescue OAI::Exception => ex
|
71
|
+
raise ex if not @interactive
|
72
|
+
puts ex.message
|
62
73
|
end
|
63
74
|
end
|
64
75
|
|
@@ -69,15 +80,15 @@ module OAI
|
|
69
80
|
records = 0;
|
70
81
|
client = OAI::Client.new(url, :parser => @parser)
|
71
82
|
provider_config = client.identify
|
72
|
-
|
83
|
+
|
73
84
|
file = Tempfile.new('oai_data')
|
74
85
|
gz = Zlib::GzipWriter.new(file)
|
75
86
|
gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
|
76
87
|
gz << "<records>"
|
77
88
|
begin
|
78
89
|
response = client.list_records(options)
|
79
|
-
|
80
|
-
gz << rec
|
90
|
+
response.each do |rec|
|
91
|
+
gz << rec._source
|
81
92
|
records += 1
|
82
93
|
end
|
83
94
|
puts "#{records} records retrieved" if @interactive
|
@@ -89,8 +100,8 @@ module OAI
|
|
89
100
|
puts "\nresumption token recieved, continuing" if @interactive
|
90
101
|
response = client.list_records(:resumption_token =>
|
91
102
|
response.resumption_token)
|
92
|
-
|
93
|
-
gz << rec
|
103
|
+
response.each do |rec|
|
104
|
+
gz << rec._source
|
94
105
|
records += 1
|
95
106
|
end
|
96
107
|
puts "#{records} records retrieved" if @interactive
|
@@ -118,8 +129,9 @@ module OAI
|
|
118
129
|
options
|
119
130
|
end
|
120
131
|
|
121
|
-
def date_based_directory(time)
|
122
|
-
|
132
|
+
def date_based_directory(time, directory_layout = nil)
|
133
|
+
directory_layout ||= Harvest::DIRECTORY_LAYOUT
|
134
|
+
"#{time.strftime(directory_layout)}"
|
123
135
|
end
|
124
136
|
|
125
137
|
def filename(from_time, until_time)
|
@@ -127,7 +139,7 @@ module OAI
|
|
127
139
|
"#{from_time.strftime(format)}_til_#{until_time.strftime(format)}"\
|
128
140
|
"_at_#{until_time.strftime('%H-%M-%S')}"
|
129
141
|
end
|
130
|
-
|
142
|
+
|
131
143
|
def granularity(url)
|
132
144
|
client = OAI::Client.new url
|
133
145
|
client.identify.granularity
|
@@ -137,7 +149,7 @@ module OAI
|
|
137
149
|
def earliest(url)
|
138
150
|
client = OAI::Client.new url
|
139
151
|
identify = client.identify
|
140
|
-
if
|
152
|
+
if OAI::Const::Granularity::LOW == identify.granularity
|
141
153
|
Time.parse(identify.earliest_datestamp).strftime("%Y-%m-%d")
|
142
154
|
else
|
143
155
|
Time.parse(identify.earliest_datestamp).xmlschema
|
@@ -147,4 +159,4 @@ module OAI
|
|
147
159
|
end
|
148
160
|
|
149
161
|
end
|
150
|
-
end
|
162
|
+
end
|
data/lib/oai/harvester/logging.rb
CHANGED
@@ -11,10 +11,9 @@ module OAI
|
|
11
11
|
def initialize(*args)
|
12
12
|
orig_init(*args)
|
13
13
|
@summary = []
|
14
|
-
@logger = Logger.new(File.join(@config.logfile, "harvester.log"),
|
15
|
-
shift_age = 'weekly') if @config.logfile
|
14
|
+
@logger = @config.logfile ? Logger.new(File.join(@config.logfile, "harvester.log"), 'weekly') : Logger.new(STDOUT)
|
16
15
|
@logger.datetime_format = "%Y-%m-%d %H:%M"
|
17
|
-
|
16
|
+
|
18
17
|
# Turn off logging if no logging directory is specified.
|
19
18
|
@logger.level = Logger::FATAL unless @config.logfile
|
20
19
|
end
|
@@ -24,8 +23,7 @@ module OAI
|
|
24
23
|
@logger.info { "Starting regular harvest" }
|
25
24
|
orig_start(sites)
|
26
25
|
begin
|
27
|
-
OAI::Harvester::
|
28
|
-
Mailer.send(@config.mail_server, @config.email, @summary)
|
26
|
+
OAI::Harvester::Mailer.send(@config.mail_server, @config.email, @summary) if @config.email
|
29
27
|
rescue
|
30
28
|
@logger.error { "Error sending out summary email: #{$!}"}
|
31
29
|
end
|
data/lib/oai/harvester.rb
CHANGED
@@ -7,9 +7,12 @@ require 'logger'
|
|
7
7
|
require 'fileutils'
|
8
8
|
require 'ostruct'
|
9
9
|
require 'readline'
|
10
|
-
require 'chronic'
|
11
10
|
require 'socket'
|
12
11
|
|
12
|
+
if not defined?(OAI::Const::VERBS)
|
13
|
+
require 'oai/constants'
|
14
|
+
end
|
15
|
+
|
13
16
|
require 'oai/client'
|
14
17
|
require 'oai/harvester/config'
|
15
18
|
require 'oai/harvester/harvest'
|
data/lib/oai/provider/model/activerecord_caching_wrapper.rb
CHANGED
@@ -78,13 +78,7 @@ module OAI::Provider
|
|
78
78
|
raise ResumptionTokenException.new unless @limit
|
79
79
|
|
80
80
|
token = ResumptionToken.parse(token_string)
|
81
|
-
|
82
|
-
|
83
|
-
if token.last * @limit + @limit < total
|
84
|
-
select_partial(token)
|
85
|
-
else
|
86
|
-
select_partial(token).records
|
87
|
-
end
|
81
|
+
select_partial(token)
|
88
82
|
end
|
89
83
|
|
90
84
|
# select a subset of the result set, and return it with a
|
@@ -102,10 +96,13 @@ module OAI::Provider
|
|
102
96
|
|
103
97
|
raise ResumptionTokenException.new unless oaitoken
|
104
98
|
|
99
|
+
total = model.where(token_conditions(token)).count
|
100
|
+
# token offset should be nil if this is the last set
|
101
|
+
offset = (token.last * @limit + @limit >= total) ? nil : token.last + 1
|
105
102
|
PartialResult.new(
|
106
103
|
hydrate_records(
|
107
104
|
oaitoken.entries.limit(@limit).offset(token.last * @limit)),
|
108
|
-
token.next(
|
105
|
+
token.next(offset)
|
109
106
|
)
|
110
107
|
end
|
111
108
|
|
data/lib/oai/provider/model/activerecord_wrapper.rb
CHANGED
@@ -32,12 +32,12 @@ module OAI::Provider
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def earliest
|
35
|
-
earliest_obj = model.order("#{timestamp_field} asc").first
|
35
|
+
earliest_obj = model.order("#{model.base_class.table_name}.#{timestamp_field} asc").first
|
36
36
|
earliest_obj.nil? ? Time.at(0) : earliest_obj.send(timestamp_field)
|
37
37
|
end
|
38
38
|
|
39
39
|
def latest
|
40
|
-
latest_obj = model.order("#{timestamp_field} desc").first
|
40
|
+
latest_obj = model.order("#{model.base_class.table_name}.#{timestamp_field} desc").first
|
41
41
|
latest_obj.nil? ? Time.now : latest_obj.send(timestamp_field)
|
42
42
|
end
|
43
43
|
# A model class is expected to provide a method Model.sets that
|
@@ -61,7 +61,7 @@ module OAI::Provider
|
|
61
61
|
find_scope.where(conditions)
|
62
62
|
end
|
63
63
|
else
|
64
|
-
find_scope.where(conditions).
|
64
|
+
find_scope.where(conditions).where(identifier_field => selector).first
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
@@ -129,15 +129,7 @@ module OAI::Provider
|
|
129
129
|
raise OAI::ResumptionTokenException.new unless @limit
|
130
130
|
|
131
131
|
token = ResumptionToken.parse(token_string)
|
132
|
-
|
133
|
-
|
134
|
-
if @limit < total
|
135
|
-
select_partial(find_scope, token)
|
136
|
-
else # end of result set
|
137
|
-
find_scope.where(token_conditions(token))
|
138
|
-
.limit(@limit)
|
139
|
-
.order("#{identifier_field} asc")
|
140
|
-
end
|
132
|
+
select_partial(find_scope, token)
|
141
133
|
end
|
142
134
|
|
143
135
|
# select a subset of the result set, and return it with a
|
@@ -145,10 +137,12 @@ module OAI::Provider
|
|
145
137
|
def select_partial(find_scope, token)
|
146
138
|
records = find_scope.where(token_conditions(token))
|
147
139
|
.limit(@limit)
|
148
|
-
.order("#{identifier_field} asc")
|
140
|
+
.order("#{model.base_class.table_name}.#{identifier_field} asc")
|
149
141
|
raise OAI::ResumptionTokenException.new unless records
|
150
|
-
offset = records.last.send(identifier_field)
|
151
142
|
|
143
|
+
total = find_scope.where(token_conditions(token)).count
|
144
|
+
# token offset should be nil if this is the last set
|
145
|
+
offset = (@limit >= total) ? nil : records.last.send(identifier_field)
|
152
146
|
PartialResult.new(records, token.next(offset))
|
153
147
|
end
|
154
148
|
|
@@ -164,7 +158,7 @@ module OAI::Provider
|
|
164
158
|
|
165
159
|
return sql if "0" == last_id
|
166
160
|
# Now add last id constraint
|
167
|
-
sql.first << " AND #{identifier_field} > :id"
|
161
|
+
sql.first << " AND #{model.base_class.table_name}.#{identifier_field} > :id"
|
168
162
|
sql.last[:id] = last_id
|
169
163
|
|
170
164
|
return sql
|
@@ -175,27 +169,49 @@ module OAI::Provider
|
|
175
169
|
sql = []
|
176
170
|
esc_values = {}
|
177
171
|
if opts.has_key?(:from)
|
178
|
-
sql << "#{timestamp_field} >= :from"
|
172
|
+
sql << "#{model.base_class.table_name}.#{timestamp_field} >= :from"
|
179
173
|
esc_values[:from] = parse_to_local(opts[:from])
|
180
174
|
end
|
181
175
|
if opts.has_key?(:until)
|
182
176
|
# Handle databases which store fractions of a second by rounding up
|
183
|
-
sql << "#{timestamp_field} < :until"
|
177
|
+
sql << "#{model.base_class.table_name}.#{timestamp_field} < :until"
|
184
178
|
esc_values[:until] = parse_to_local(opts[:until]) { |t| t + 1 }
|
185
179
|
end
|
180
|
+
|
186
181
|
return [sql.join(" AND "), esc_values]
|
187
182
|
end
|
188
183
|
|
189
184
|
private
|
190
185
|
|
191
186
|
def parse_to_local(time)
|
192
|
-
|
187
|
+
if time.respond_to?(:strftime)
|
188
|
+
time_obj = time
|
189
|
+
else
|
190
|
+
begin
|
191
|
+
if time[-1] == "Z"
|
192
|
+
time_obj = Time.strptime(time, "%Y-%m-%dT%H:%M:%S%Z")
|
193
|
+
else
|
194
|
+
time_obj = Date.strptime(time, "%Y-%m-%d")
|
195
|
+
end
|
196
|
+
rescue
|
197
|
+
raise OAI::ArgumentException.new, "unparsable date: '#{time}'"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
193
201
|
time_obj = yield(time_obj) if block_given?
|
194
|
-
|
195
|
-
|
196
|
-
|
202
|
+
|
203
|
+
if time_obj.kind_of?(Date)
|
204
|
+
time_obj.strftime("%Y-%m-%d")
|
205
|
+
else
|
206
|
+
# Convert to same as DB - :local => :getlocal, :utc => :getutc
|
207
|
+
if ActiveRecord::VERSION::MAJOR >= 7
|
208
|
+
tzconv = "get#{ActiveRecord.default_timezone.to_s}".to_sym
|
209
|
+
else
|
210
|
+
tzconv = "get#{model.default_timezone.to_s}".to_sym
|
211
|
+
end
|
212
|
+
time_obj.send(tzconv).strftime("%Y-%m-%d %H:%M:%S")
|
213
|
+
end
|
197
214
|
end
|
198
215
|
|
199
216
|
end
|
200
217
|
end
|
201
|
-
|