oai 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +19 -4
- data/Rakefile +7 -0
- data/bin/oai +0 -2
- data/examples/models/file_model.rb +2 -2
- data/lib/oai/client/response.rb +8 -8
- data/lib/oai/client.rb +34 -10
- data/lib/oai/exception.rb +46 -38
- data/lib/oai/harvester/config.rb +1 -1
- data/lib/oai/harvester/harvest.rb +37 -25
- data/lib/oai/harvester/logging.rb +3 -5
- data/lib/oai/harvester.rb +4 -1
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +5 -8
- data/lib/oai/provider/model/activerecord_wrapper.rb +38 -22
- data/lib/oai/provider/model.rb +1 -1
- data/lib/oai/provider/response/list_records.rb +12 -0
- data/lib/oai/provider/response.rb +7 -4
- data/lib/oai/provider/resumption_token.rb +18 -5
- data/test/activerecord_provider/database/0001_oaipmh_tables.rb +7 -1
- data/test/activerecord_provider/helpers/providers.rb +3 -1
- data/test/activerecord_provider/helpers/transactional_test_case.rb +2 -1
- data/test/activerecord_provider/models/dc_field.rb +8 -0
- data/test/activerecord_provider/models/dc_lang.rb +3 -0
- data/test/activerecord_provider/models/exclusive_set_dc_field.rb +6 -0
- data/test/activerecord_provider/tc_activerecord_wrapper.rb +63 -0
- data/test/activerecord_provider/tc_ar_provider.rb +54 -26
- data/test/activerecord_provider/tc_ar_sets_provider.rb +10 -9
- data/test/activerecord_provider/tc_caching_paging_provider.rb +9 -7
- data/test/activerecord_provider/tc_simple_paging_provider.rb +9 -7
- data/test/client/tc_exception.rb +1 -1
- data/test/client/tc_get_record.rb +1 -1
- data/test/client/tc_http_client.rb +2 -2
- data/test/client/tc_libxml.rb +1 -1
- data/test/client/tc_utf8_escaping.rb +8 -1
- data/test/harvester/tc_harvest.rb +42 -0
- data/test/harvester/test_helper_harvester.rb +6 -0
- data/test/provider/models.rb +3 -3
- data/test/provider/tc_functional_tokens.rb +17 -11
- data/test/provider/tc_provider.rb +26 -0
- metadata +27 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c0a80559d64aa25add07953e978e93f9abfbc89e03c67bf306e79f0d7b18b0f7
|
4
|
+
data.tar.gz: 7989ff1c6dec95c3965cb3adc1d0bc156d250b6129002be50193443994d03caf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db8b7e74c65625da8b47b35ef23a7e620777dfefb0a6a8d63a6a7314534dbfe46606bf863aef4a839087547280700519f35e3b10bdf10bf70ba93959ae3125e2
|
7
|
+
data.tar.gz: ad14343ae9a61b6516f7e079db4afc0b7eddbb5e70d8d962221842a5fe4f8b49da63d37dc8eeb7cfe9e163617d4d2a3cae93e37e8943b784df7d0dc1eb1a7cce
|
data/README.md
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
ruby-oai
|
2
2
|
========
|
3
|
+
[Build Status](https://github.com/code4lib/ruby-oai/actions)
|
3
4
|
|
4
|
-
[Gem Version](https://badge.fury.io/rb/oai)
|
5
|
+
[Gem Version](https://badge.fury.io/rb/oai)
|
7
6
|
|
8
7
|
ruby-oai is an Open Archives Protocol for Metadata Harvesting (OAI-PMH)
|
9
8
|
library for Ruby. [OAI-PMH](http://openarchives.org) is a somewhat
|
@@ -47,6 +46,22 @@ For example to initiate a ListRecords request to pubmed you can:
|
|
47
46
|
end
|
48
47
|
```
|
49
48
|
|
49
|
+
### Retry-After
|
50
|
+
This library depends on faraday, but allows a wide range of versions. Depending on the client application's installed version of faraday, there may be different middleware libraries required to support automatically retrying requests that are rate limited/denied with a `Retry-After` header. The OAI client can, however, accept an externally configured faraday http client for handling this. For example, to retry on `429 Too Many Requests`:
|
51
|
+
|
52
|
+
```ruby
|
53
|
+
require 'oai'
|
54
|
+
require 'faraday_middleware' # if using faraday version < 2
|
55
|
+
http_client = Faraday.new do |conn|
|
56
|
+
conn.request(:retry, max: 5, retry_statuses: 429)
|
57
|
+
conn.response(:follow_redirects, limit: 5)
|
58
|
+
conn.adapter :net_http
|
59
|
+
end
|
60
|
+
client = OAI::Client.new(base_url, http: http_client)
|
61
|
+
opts = {from:'2012-03-01', until:'2012-04-01', metadata_prefix:'oai_dc'}
|
62
|
+
puts client.list_records(opts).full.count
|
63
|
+
```
|
64
|
+
|
50
65
|
See {OAI::Client} for more details
|
51
66
|
|
52
67
|
Server
|
@@ -97,7 +112,7 @@ There are also convenience tasks to run subsets of tests.
|
|
97
112
|
We use [appraisal](https://github.com/thoughtbot/appraisal) to test ActiveRecord-related functionality under multiple versions of ActiveRecord. While the above commands will test with latest ActiveRecord (allowed in our .gemspec development dependency), you can test under a particular version defined in the [Appraisals](./Appraisals) file like so:
|
98
113
|
|
99
114
|
$ bundle exec appraisal rails-52 rake test
|
100
|
-
$ bundle exec appraisal rails-
|
115
|
+
$ bundle exec appraisal rails-70 rake test
|
101
116
|
|
102
117
|
If you run into trouble with appraisal's gemfiles getting out of date and bundler complaining,
|
103
118
|
try:
|
data/Rakefile
CHANGED
@@ -34,6 +34,13 @@ namespace :test do
|
|
34
34
|
t.warning = false
|
35
35
|
end
|
36
36
|
|
37
|
+
Rake::TestTask.new('harvester') do |t|
|
38
|
+
t.libs << ['lib', 'test/harvester']
|
39
|
+
t.pattern = 'test/harvester/tc_*.rb'
|
40
|
+
#t.verbose = true
|
41
|
+
t.warning = false
|
42
|
+
end
|
43
|
+
|
37
44
|
Rake::TestTask.new('provider') do |t|
|
38
45
|
t.libs << ['lib', 'test/provider']
|
39
46
|
t.pattern = 'test/provider/tc_*.rb'
|
data/bin/oai
CHANGED
@@ -45,8 +45,8 @@ class FileModel < OAI::Provider::Model
|
|
45
45
|
case selector
|
46
46
|
when :all
|
47
47
|
records = Dir["#{@directory}/*.xml"].sort.collect do |file|
|
48
|
-
File.new(file) unless File.stat(file).mtime.utc < opts[:from] or
|
49
|
-
File.stat(file).mtime.utc > opts[:until]
|
48
|
+
File.new(file) unless File.stat(file).mtime.utc < opts[:from].to_time or
|
49
|
+
File.stat(file).mtime.utc > opts[:until].to_time
|
50
50
|
end
|
51
51
|
records
|
52
52
|
else
|
data/lib/oai/client/response.rb
CHANGED
@@ -37,16 +37,16 @@ module OAI
|
|
37
37
|
message = error.content
|
38
38
|
code = ""
|
39
39
|
if defined?(error.property) == nil
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
end
|
40
|
+
code = error.attributes['code']
|
41
|
+
else
|
42
|
+
begin
|
43
|
+
code = error["code"]
|
44
|
+
rescue
|
45
|
+
code = error.property('code')
|
47
46
|
end
|
47
|
+
end
|
48
48
|
end
|
49
|
-
raise OAI::Exception.
|
49
|
+
raise OAI::Exception.for(message: message, code: code)
|
50
50
|
end
|
51
51
|
|
52
52
|
end
|
data/lib/oai/client.rb
CHANGED
@@ -54,7 +54,7 @@ module OAI
|
|
54
54
|
# <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
|
55
55
|
|
56
56
|
class Client
|
57
|
-
|
57
|
+
UNESCAPED_AMPERSAND = /&(?!(?:amp|lt|gt|quot|apos|\#\d+);)/
|
58
58
|
# The constructor which must be passed a valid base url for an oai
|
59
59
|
# service:
|
60
60
|
#
|
@@ -95,7 +95,8 @@ module OAI
|
|
95
95
|
follow_redirects = 5 if follow_redirects == true
|
96
96
|
|
97
97
|
if follow_redirects
|
98
|
-
require '
|
98
|
+
require 'faraday/follow_redirects'
|
99
|
+
builder.use Faraday::FollowRedirects::Middleware
|
99
100
|
builder.response :follow_redirects, :limit => follow_redirects.to_i
|
100
101
|
end
|
101
102
|
builder.adapter :net_http
|
@@ -197,12 +198,9 @@ module OAI
|
|
197
198
|
do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
|
198
199
|
end
|
199
200
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
# fire off the request and return appropriate DOM object
|
204
|
-
uri = build_uri(verb, opts)
|
205
|
-
xml = strip_invalid_utf_8_chars(get(uri))
|
201
|
+
def sanitize_xml(xml)
|
202
|
+
xml = strip_invalid_utf_8_chars(xml)
|
203
|
+
xml = strip_invalid_xml_chars(xml)
|
206
204
|
if @parser == 'libxml'
|
207
205
|
# remove default namespace for oai-pmh since libxml
|
208
206
|
# isn't able to use our xpaths to get at them
|
@@ -210,7 +208,15 @@ module OAI
|
|
210
208
|
xml = xml.gsub(
|
211
209
|
/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
|
212
210
|
end
|
213
|
-
|
211
|
+
xml
|
212
|
+
end
|
213
|
+
|
214
|
+
private
|
215
|
+
|
216
|
+
def do_request(verb, opts = nil)
|
217
|
+
# fire off the request and return appropriate DOM object
|
218
|
+
uri = build_uri(verb, opts)
|
219
|
+
return load_document(get(uri))
|
214
220
|
end
|
215
221
|
|
216
222
|
def do_resumable(responseClass, verb, opts)
|
@@ -240,6 +246,7 @@ module OAI
|
|
240
246
|
end
|
241
247
|
|
242
248
|
def load_document(xml)
|
249
|
+
xml = sanitize_xml(xml)
|
243
250
|
case @parser
|
244
251
|
when 'libxml'
|
245
252
|
begin
|
@@ -330,7 +337,16 @@ module OAI
|
|
330
337
|
# Regex is from WebCollab:
|
331
338
|
# http://webcollab.sourceforge.net/unicode.html
|
332
339
|
def strip_invalid_utf_8_chars(xml)
|
333
|
-
|
340
|
+
return nil unless xml
|
341
|
+
|
342
|
+
# If it's in a specific encoding other than BINARY, it may trigger
|
343
|
+
# an exception to try to gsub these illegal bytes. Temporarily
|
344
|
+
# put it in BINARY. NOTE: We're not totally sure what's going on
|
345
|
+
# with encodings in this gem in general, it might not be totally reasonable.
|
346
|
+
orig_encoding = xml.encoding
|
347
|
+
xml.force_encoding("BINARY")
|
348
|
+
|
349
|
+
xml = xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
|
334
350
|
| [\x00-\x7F][\x80-\xBF]+
|
335
351
|
| ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
|
336
352
|
| [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
|
@@ -338,7 +354,15 @@ module OAI
|
|
338
354
|
| (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')\
|
339
355
|
.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
|
340
356
|
| \xED[\xA0-\xBF][\x80-\xBF]/,'?')
|
357
|
+
|
358
|
+
xml.force_encoding(orig_encoding)
|
359
|
+
|
360
|
+
xml
|
341
361
|
end
|
342
362
|
|
363
|
+
def strip_invalid_xml_chars(xml)
|
364
|
+
return xml unless xml =~ UNESCAPED_AMPERSAND
|
365
|
+
xml.gsub(UNESCAPED_AMPERSAND, '&amp;')
|
366
|
+
end
|
343
367
|
end
|
344
368
|
end
|
data/lib/oai/exception.rb
CHANGED
@@ -4,72 +4,80 @@ module OAI
|
|
4
4
|
# messages will be wrapped in an XML response to the client.
|
5
5
|
|
6
6
|
class Exception < RuntimeError
|
7
|
+
CODE = nil
|
8
|
+
MESSAGE = nil
|
9
|
+
|
7
10
|
attr_reader :code
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
+
@@codes = {}
|
13
|
+
|
14
|
+
def self.register_exception_code(code, exception_class)
|
15
|
+
@@codes[code] = exception_class if exception_class.superclass == OAI::Exception
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.for(message: nil, code: nil)
|
19
|
+
@@codes.fetch(code, Exception).new(message)
|
20
|
+
end
|
21
|
+
|
22
|
+
def initialize(message = nil, code = nil)
|
23
|
+
super(message || self.class::MESSAGE)
|
24
|
+
@code = code || self.class::CODE
|
12
25
|
end
|
13
26
|
end
|
14
27
|
|
15
28
|
class ArgumentException < Exception
|
16
|
-
|
17
|
-
|
29
|
+
CODE = 'badArgument'
|
30
|
+
MESSAGE = 'The request includes ' \
|
18
31
|
'illegal arguments, is missing required arguments, includes a ' \
|
19
|
-
'repeated argument, or values for arguments have an illegal syntax.'
|
20
|
-
|
21
|
-
end
|
32
|
+
'repeated argument, or values for arguments have an illegal syntax.'
|
33
|
+
register_exception_code(CODE, self)
|
22
34
|
end
|
23
35
|
|
24
36
|
class VerbException < Exception
|
25
|
-
|
26
|
-
|
27
|
-
'verb, the verb argument is missing, or the verb argument is repeated.'
|
28
|
-
|
29
|
-
end
|
37
|
+
CODE = 'badVerb'
|
38
|
+
MESSAGE = 'Value of the verb argument is not a legal OAI-PMH '\
|
39
|
+
'verb, the verb argument is missing, or the verb argument is repeated.'
|
40
|
+
register_exception_code(CODE, self)
|
30
41
|
end
|
31
42
|
|
32
43
|
class FormatException < Exception
|
33
|
-
|
34
|
-
|
44
|
+
CODE = 'cannotDisseminateFormat'
|
45
|
+
MESSAGE = 'The metadata format identified by '\
|
35
46
|
'the value given for the metadataPrefix argument is not supported '\
|
36
|
-
'by the item or by the repository.'
|
37
|
-
|
47
|
+
'by the item or by the repository.'
|
48
|
+
register_exception_code(CODE, self)
|
38
49
|
end
|
39
50
|
|
40
51
|
class IdException < Exception
|
41
|
-
|
42
|
-
|
43
|
-
'unknown or illegal in this repository.'
|
44
|
-
|
52
|
+
CODE = 'idDoesNotExist'
|
53
|
+
MESSAGE = 'The value of the identifier argument is '\
|
54
|
+
'unknown or illegal in this repository.'
|
55
|
+
register_exception_code(CODE, self)
|
45
56
|
end
|
46
57
|
|
47
58
|
class NoMatchException < Exception
|
48
|
-
|
49
|
-
|
50
|
-
'until, set and metadataPrefix arguments results in an empty list.'
|
51
|
-
|
52
|
-
end
|
59
|
+
CODE = 'noRecordsMatch'
|
60
|
+
MESSAGE = 'The combination of the values of the from, '\
|
61
|
+
'until, set and metadataPrefix arguments results in an empty list.'
|
62
|
+
register_exception_code(CODE, self)
|
53
63
|
end
|
54
64
|
|
55
65
|
class MetadataFormatException < Exception
|
56
|
-
|
57
|
-
|
58
|
-
'for the specified item.'
|
59
|
-
|
66
|
+
CODE = 'noMetadataFormats'
|
67
|
+
MESSAGE = 'There are no metadata formats available '\
|
68
|
+
'for the specified item.'
|
69
|
+
register_exception_code(CODE, self)
|
60
70
|
end
|
61
71
|
|
62
72
|
class SetException < Exception
|
63
|
-
|
64
|
-
|
65
|
-
|
73
|
+
CODE = 'noSetHierarchy'
|
74
|
+
MESSAGE = 'This repository does not support sets.'
|
75
|
+
register_exception_code(CODE, self)
|
66
76
|
end
|
67
77
|
|
68
78
|
class ResumptionTokenException < Exception
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
end
|
79
|
+
CODE = 'badResumptionToken'
|
80
|
+
MESSAGE = 'The value of the resumptionToken argument is invalid or expired.'
|
81
|
+
register_exception_code(CODE, self)
|
73
82
|
end
|
74
|
-
|
75
83
|
end
|
data/lib/oai/harvester/config.rb
CHANGED
@@ -3,14 +3,16 @@
|
|
3
3
|
|
4
4
|
module OAI
|
5
5
|
module Harvester
|
6
|
-
|
7
6
|
class Harvest
|
7
|
+
DIRECTORY_LAYOUT = "%Y/%m".freeze
|
8
8
|
|
9
|
-
def initialize(config = nil, directory = nil, date = nil)
|
9
|
+
def initialize(config = nil, directory = nil, date = nil, to = nil)
|
10
10
|
@config = config || Config.load
|
11
11
|
@directory = directory || @config.storage
|
12
12
|
@from = date
|
13
13
|
@from.freeze
|
14
|
+
@until = to
|
15
|
+
@until.freeze
|
14
16
|
@parser = defined?(XML::Document) ? 'libxml' : 'rexml'
|
15
17
|
end
|
16
18
|
|
@@ -30,9 +32,13 @@ module OAI
|
|
30
32
|
|
31
33
|
def harvest(site)
|
32
34
|
opts = build_options_hash(@config.sites[site])
|
33
|
-
|
35
|
+
if @until
|
36
|
+
harvest_time = @until.to_time.utc
|
37
|
+
else
|
38
|
+
harvest_time = Time.now.utc
|
39
|
+
end
|
34
40
|
|
35
|
-
if
|
41
|
+
if OAI::Const::Granularity::LOW == granularity(opts[:url])
|
36
42
|
opts[:until] = harvest_time.strftime("%Y-%m-%d")
|
37
43
|
opts[:from] = @from.strftime("%Y-%m-%d") if @from
|
38
44
|
else
|
@@ -43,22 +49,27 @@ module OAI
|
|
43
49
|
# Allow a from date to be passed in
|
44
50
|
opts[:from] = earliest(opts[:url]) unless opts[:from]
|
45
51
|
opts.delete(:set) if 'all' == opts[:set]
|
46
|
-
|
47
52
|
begin
|
48
53
|
# Connect, and download
|
49
54
|
file, records = call(opts.delete(:url), opts)
|
50
55
|
|
51
|
-
# Move document to storage directory
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
56
|
+
# Move document to storage directory if configured
|
57
|
+
if @directory
|
58
|
+
directory_layout = @config.layouts[site] if @config.layouts
|
59
|
+
dir = File.join(@directory, date_based_directory(harvest_time, directory_layout))
|
60
|
+
FileUtils.mkdir_p dir
|
61
|
+
FileUtils.mv(file.path,
|
62
|
+
File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
|
63
|
+
harvest_time)}.xml.gz"))
|
64
|
+
else
|
65
|
+
puts "no configured destination for temp file" if @interactive
|
66
|
+
end
|
57
67
|
@config.sites[site]['last'] = harvest_time
|
58
|
-
rescue
|
59
|
-
|
60
|
-
|
61
|
-
|
68
|
+
rescue OAI::NoMatchException
|
69
|
+
puts "No new records available" if @interactive
|
70
|
+
rescue OAI::Exception => ex
|
71
|
+
raise ex if not @interactive
|
72
|
+
puts ex.message
|
62
73
|
end
|
63
74
|
end
|
64
75
|
|
@@ -69,15 +80,15 @@ module OAI
|
|
69
80
|
records = 0;
|
70
81
|
client = OAI::Client.new(url, :parser => @parser)
|
71
82
|
provider_config = client.identify
|
72
|
-
|
83
|
+
|
73
84
|
file = Tempfile.new('oai_data')
|
74
85
|
gz = Zlib::GzipWriter.new(file)
|
75
86
|
gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
|
76
87
|
gz << "<records>"
|
77
88
|
begin
|
78
89
|
response = client.list_records(options)
|
79
|
-
|
80
|
-
gz << rec
|
90
|
+
response.each do |rec|
|
91
|
+
gz << rec._source
|
81
92
|
records += 1
|
82
93
|
end
|
83
94
|
puts "#{records} records retrieved" if @interactive
|
@@ -89,8 +100,8 @@ module OAI
|
|
89
100
|
puts "\nresumption token recieved, continuing" if @interactive
|
90
101
|
response = client.list_records(:resumption_token =>
|
91
102
|
response.resumption_token)
|
92
|
-
|
93
|
-
gz << rec
|
103
|
+
response.each do |rec|
|
104
|
+
gz << rec._source
|
94
105
|
records += 1
|
95
106
|
end
|
96
107
|
puts "#{records} records retrieved" if @interactive
|
@@ -118,8 +129,9 @@ module OAI
|
|
118
129
|
options
|
119
130
|
end
|
120
131
|
|
121
|
-
def date_based_directory(time)
|
122
|
-
|
132
|
+
def date_based_directory(time, directory_layout = nil)
|
133
|
+
directory_layout ||= Harvest::DIRECTORY_LAYOUT
|
134
|
+
"#{time.strftime(directory_layout)}"
|
123
135
|
end
|
124
136
|
|
125
137
|
def filename(from_time, until_time)
|
@@ -127,7 +139,7 @@ module OAI
|
|
127
139
|
"#{from_time.strftime(format)}_til_#{until_time.strftime(format)}"\
|
128
140
|
"_at_#{until_time.strftime('%H-%M-%S')}"
|
129
141
|
end
|
130
|
-
|
142
|
+
|
131
143
|
def granularity(url)
|
132
144
|
client = OAI::Client.new url
|
133
145
|
client.identify.granularity
|
@@ -137,7 +149,7 @@ module OAI
|
|
137
149
|
def earliest(url)
|
138
150
|
client = OAI::Client.new url
|
139
151
|
identify = client.identify
|
140
|
-
if
|
152
|
+
if OAI::Const::Granularity::LOW == identify.granularity
|
141
153
|
Time.parse(identify.earliest_datestamp).strftime("%Y-%m-%d")
|
142
154
|
else
|
143
155
|
Time.parse(identify.earliest_datestamp).xmlschema
|
@@ -147,4 +159,4 @@ module OAI
|
|
147
159
|
end
|
148
160
|
|
149
161
|
end
|
150
|
-
end
|
162
|
+
end
|
@@ -11,10 +11,9 @@ module OAI
|
|
11
11
|
def initialize(*args)
|
12
12
|
orig_init(*args)
|
13
13
|
@summary = []
|
14
|
-
@logger = Logger.new(File.join(@config.logfile, "harvester.log"),
|
15
|
-
shift_age = 'weekly') if @config.logfile
|
14
|
+
@logger = @config.logfile ? Logger.new(File.join(@config.logfile, "harvester.log"), 'weekly') : Logger.new(STDOUT)
|
16
15
|
@logger.datetime_format = "%Y-%m-%d %H:%M"
|
17
|
-
|
16
|
+
|
18
17
|
# Turn off logging if no logging directory is specified.
|
19
18
|
@logger.level = Logger::FATAL unless @config.logfile
|
20
19
|
end
|
@@ -24,8 +23,7 @@ module OAI
|
|
24
23
|
@logger.info { "Starting regular harvest" }
|
25
24
|
orig_start(sites)
|
26
25
|
begin
|
27
|
-
OAI::Harvester::
|
28
|
-
Mailer.send(@config.mail_server, @config.email, @summary)
|
26
|
+
OAI::Harvester::Mailer.send(@config.mail_server, @config.email, @summary) if @config.email
|
29
27
|
rescue
|
30
28
|
@logger.error { "Error sending out summary email: #{$!}"}
|
31
29
|
end
|
data/lib/oai/harvester.rb
CHANGED
@@ -7,9 +7,12 @@ require 'logger'
|
|
7
7
|
require 'fileutils'
|
8
8
|
require 'ostruct'
|
9
9
|
require 'readline'
|
10
|
-
require 'chronic'
|
11
10
|
require 'socket'
|
12
11
|
|
12
|
+
if not defined?(OAI::Const::VERBS)
|
13
|
+
require 'oai/constants'
|
14
|
+
end
|
15
|
+
|
13
16
|
require 'oai/client'
|
14
17
|
require 'oai/harvester/config'
|
15
18
|
require 'oai/harvester/harvest'
|
@@ -78,13 +78,7 @@ module OAI::Provider
|
|
78
78
|
raise ResumptionTokenException.new unless @limit
|
79
79
|
|
80
80
|
token = ResumptionToken.parse(token_string)
|
81
|
-
|
82
|
-
|
83
|
-
if token.last * @limit + @limit < total
|
84
|
-
select_partial(token)
|
85
|
-
else
|
86
|
-
select_partial(token).records
|
87
|
-
end
|
81
|
+
select_partial(token)
|
88
82
|
end
|
89
83
|
|
90
84
|
# select a subset of the result set, and return it with a
|
@@ -102,10 +96,13 @@ module OAI::Provider
|
|
102
96
|
|
103
97
|
raise ResumptionTokenException.new unless oaitoken
|
104
98
|
|
99
|
+
total = model.where(token_conditions(token)).count
|
100
|
+
# token offset should be nil if this is the last set
|
101
|
+
offset = (token.last * @limit + @limit >= total) ? nil : token.last + 1
|
105
102
|
PartialResult.new(
|
106
103
|
hydrate_records(
|
107
104
|
oaitoken.entries.limit(@limit).offset(token.last * @limit)),
|
108
|
-
token.next(
|
105
|
+
token.next(offset)
|
109
106
|
)
|
110
107
|
end
|
111
108
|
|
@@ -32,12 +32,12 @@ module OAI::Provider
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def earliest
|
35
|
-
earliest_obj = model.order("#{timestamp_field} asc").first
|
35
|
+
earliest_obj = model.order("#{model.base_class.table_name}.#{timestamp_field} asc").first
|
36
36
|
earliest_obj.nil? ? Time.at(0) : earliest_obj.send(timestamp_field)
|
37
37
|
end
|
38
38
|
|
39
39
|
def latest
|
40
|
-
latest_obj = model.order("#{timestamp_field} desc").first
|
40
|
+
latest_obj = model.order("#{model.base_class.table_name}.#{timestamp_field} desc").first
|
41
41
|
latest_obj.nil? ? Time.now : latest_obj.send(timestamp_field)
|
42
42
|
end
|
43
43
|
# A model class is expected to provide a method Model.sets that
|
@@ -61,7 +61,7 @@ module OAI::Provider
|
|
61
61
|
find_scope.where(conditions)
|
62
62
|
end
|
63
63
|
else
|
64
|
-
find_scope.where(conditions).
|
64
|
+
find_scope.where(conditions).where(identifier_field => selector).first
|
65
65
|
end
|
66
66
|
end
|
67
67
|
|
@@ -129,15 +129,7 @@ module OAI::Provider
|
|
129
129
|
raise OAI::ResumptionTokenException.new unless @limit
|
130
130
|
|
131
131
|
token = ResumptionToken.parse(token_string)
|
132
|
-
|
133
|
-
|
134
|
-
if @limit < total
|
135
|
-
select_partial(find_scope, token)
|
136
|
-
else # end of result set
|
137
|
-
find_scope.where(token_conditions(token))
|
138
|
-
.limit(@limit)
|
139
|
-
.order("#{identifier_field} asc")
|
140
|
-
end
|
132
|
+
select_partial(find_scope, token)
|
141
133
|
end
|
142
134
|
|
143
135
|
# select a subset of the result set, and return it with a
|
@@ -145,10 +137,12 @@ module OAI::Provider
|
|
145
137
|
def select_partial(find_scope, token)
|
146
138
|
records = find_scope.where(token_conditions(token))
|
147
139
|
.limit(@limit)
|
148
|
-
.order("#{identifier_field} asc")
|
140
|
+
.order("#{model.base_class.table_name}.#{identifier_field} asc")
|
149
141
|
raise OAI::ResumptionTokenException.new unless records
|
150
|
-
offset = records.last.send(identifier_field)
|
151
142
|
|
143
|
+
total = find_scope.where(token_conditions(token)).count
|
144
|
+
# token offset should be nil if this is the last set
|
145
|
+
offset = (@limit >= total) ? nil : records.last.send(identifier_field)
|
152
146
|
PartialResult.new(records, token.next(offset))
|
153
147
|
end
|
154
148
|
|
@@ -164,7 +158,7 @@ module OAI::Provider
|
|
164
158
|
|
165
159
|
return sql if "0" == last_id
|
166
160
|
# Now add last id constraint
|
167
|
-
sql.first << " AND #{identifier_field} > :id"
|
161
|
+
sql.first << " AND #{model.base_class.table_name}.#{identifier_field} > :id"
|
168
162
|
sql.last[:id] = last_id
|
169
163
|
|
170
164
|
return sql
|
@@ -175,27 +169,49 @@ module OAI::Provider
|
|
175
169
|
sql = []
|
176
170
|
esc_values = {}
|
177
171
|
if opts.has_key?(:from)
|
178
|
-
sql << "#{timestamp_field} >= :from"
|
172
|
+
sql << "#{model.base_class.table_name}.#{timestamp_field} >= :from"
|
179
173
|
esc_values[:from] = parse_to_local(opts[:from])
|
180
174
|
end
|
181
175
|
if opts.has_key?(:until)
|
182
176
|
# Handle databases which store fractions of a second by rounding up
|
183
|
-
sql << "#{timestamp_field} < :until"
|
177
|
+
sql << "#{model.base_class.table_name}.#{timestamp_field} < :until"
|
184
178
|
esc_values[:until] = parse_to_local(opts[:until]) { |t| t + 1 }
|
185
179
|
end
|
180
|
+
|
186
181
|
return [sql.join(" AND "), esc_values]
|
187
182
|
end
|
188
183
|
|
189
184
|
private
|
190
185
|
|
191
186
|
def parse_to_local(time)
|
192
|
-
|
187
|
+
if time.respond_to?(:strftime)
|
188
|
+
time_obj = time
|
189
|
+
else
|
190
|
+
begin
|
191
|
+
if time[-1] == "Z"
|
192
|
+
time_obj = Time.strptime(time, "%Y-%m-%dT%H:%M:%S%Z")
|
193
|
+
else
|
194
|
+
time_obj = Date.strptime(time, "%Y-%m-%d")
|
195
|
+
end
|
196
|
+
rescue
|
197
|
+
raise OAI::ArgumentException.new, "unparsable date: '#{time}'"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
193
201
|
time_obj = yield(time_obj) if block_given?
|
194
|
-
|
195
|
-
|
196
|
-
|
202
|
+
|
203
|
+
if time_obj.kind_of?(Date)
|
204
|
+
time_obj.strftime("%Y-%m-%d")
|
205
|
+
else
|
206
|
+
# Convert to same as DB - :local => :getlocal, :utc => :getutc
|
207
|
+
if ActiveRecord::VERSION::MAJOR >= 7
|
208
|
+
tzconv = "get#{ActiveRecord.default_timezone.to_s}".to_sym
|
209
|
+
else
|
210
|
+
tzconv = "get#{model.default_timezone.to_s}".to_sym
|
211
|
+
end
|
212
|
+
time_obj.send(tzconv).strftime("%Y-%m-%d %H:%M:%S")
|
213
|
+
end
|
197
214
|
end
|
198
215
|
|
199
216
|
end
|
200
217
|
end
|
201
|
-
|