oai_talia 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +81 -0
- data/Rakefile +127 -0
- data/bin/oai +68 -0
- data/examples/models/file_model.rb +63 -0
- data/examples/providers/dublin_core.rb +474 -0
- data/lib/oai/client/get_record.rb +15 -0
- data/lib/oai/client/header.rb +18 -0
- data/lib/oai/client/identify.rb +30 -0
- data/lib/oai/client/list_identifiers.rb +12 -0
- data/lib/oai/client/list_metadata_formats.rb +12 -0
- data/lib/oai/client/list_records.rb +21 -0
- data/lib/oai/client/list_sets.rb +19 -0
- data/lib/oai/client/metadata_format.rb +12 -0
- data/lib/oai/client/record.rb +26 -0
- data/lib/oai/client/response.rb +35 -0
- data/lib/oai/client.rb +301 -0
- data/lib/oai/constants.rb +34 -0
- data/lib/oai/exception.rb +75 -0
- data/lib/oai/harvester/config.rb +41 -0
- data/lib/oai/harvester/harvest.rb +150 -0
- data/lib/oai/harvester/logging.rb +70 -0
- data/lib/oai/harvester/mailer.rb +17 -0
- data/lib/oai/harvester/shell.rb +338 -0
- data/lib/oai/harvester.rb +39 -0
- data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
- data/lib/oai/provider/metadata_format/oai_europeana.rb +38 -0
- data/lib/oai/provider/metadata_format.rb +143 -0
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +134 -0
- data/lib/oai/provider/model/activerecord_wrapper.rb +139 -0
- data/lib/oai/provider/model.rb +74 -0
- data/lib/oai/provider/partial_result.rb +18 -0
- data/lib/oai/provider/response/error.rb +16 -0
- data/lib/oai/provider/response/get_record.rb +26 -0
- data/lib/oai/provider/response/identify.rb +25 -0
- data/lib/oai/provider/response/list_identifiers.rb +35 -0
- data/lib/oai/provider/response/list_metadata_formats.rb +34 -0
- data/lib/oai/provider/response/list_records.rb +34 -0
- data/lib/oai/provider/response/list_sets.rb +23 -0
- data/lib/oai/provider/response/record_response.rb +70 -0
- data/lib/oai/provider/response.rb +161 -0
- data/lib/oai/provider/resumption_token.rb +106 -0
- data/lib/oai/provider.rb +304 -0
- data/lib/oai/set.rb +29 -0
- data/lib/oai/xpath.rb +75 -0
- data/lib/oai.rb +8 -0
- data/lib/test.rb +25 -0
- data/test/activerecord_provider/config/connection.rb +5 -0
- data/test/activerecord_provider/config/database.yml +6 -0
- data/test/activerecord_provider/database/ar_migration.rb +59 -0
- data/test/activerecord_provider/database/oaipmhtest +0 -0
- data/test/activerecord_provider/fixtures/dc.yml +1501 -0
- data/test/activerecord_provider/helpers/providers.rb +44 -0
- data/test/activerecord_provider/helpers/set_provider.rb +36 -0
- data/test/activerecord_provider/models/dc_field.rb +7 -0
- data/test/activerecord_provider/models/dc_set.rb +6 -0
- data/test/activerecord_provider/models/oai_token.rb +3 -0
- data/test/activerecord_provider/tc_ar_provider.rb +113 -0
- data/test/activerecord_provider/tc_ar_sets_provider.rb +72 -0
- data/test/activerecord_provider/tc_caching_paging_provider.rb +55 -0
- data/test/activerecord_provider/tc_simple_paging_provider.rb +57 -0
- data/test/activerecord_provider/test_helper.rb +4 -0
- data/test/client/helpers/provider.rb +68 -0
- data/test/client/helpers/test_wrapper.rb +11 -0
- data/test/client/tc_exception.rb +36 -0
- data/test/client/tc_get_record.rb +37 -0
- data/test/client/tc_identify.rb +13 -0
- data/test/client/tc_libxml.rb +61 -0
- data/test/client/tc_list_identifiers.rb +52 -0
- data/test/client/tc_list_metadata_formats.rb +18 -0
- data/test/client/tc_list_records.rb +13 -0
- data/test/client/tc_list_sets.rb +19 -0
- data/test/client/tc_low_resolution_dates.rb +14 -0
- data/test/client/tc_utf8_escaping.rb +11 -0
- data/test/client/tc_xpath.rb +26 -0
- data/test/client/test_helper.rb +5 -0
- data/test/provider/models.rb +234 -0
- data/test/provider/tc_exceptions.rb +96 -0
- data/test/provider/tc_functional_tokens.rb +43 -0
- data/test/provider/tc_provider.rb +71 -0
- data/test/provider/tc_resumption_tokens.rb +46 -0
- data/test/provider/tc_simple_provider.rb +92 -0
- data/test/provider/test_helper.rb +36 -0
- data/test/test.xml +22 -0
- metadata +181 -0
data/lib/oai/client.rb
ADDED
@@ -0,0 +1,301 @@
|
|
1
|
+
# External dependencies
|
2
|
+
require 'uri'
|
3
|
+
require 'net/http'
|
4
|
+
require 'cgi'
|
5
|
+
require 'iconv'
|
6
|
+
|
7
|
+
if not defined?(OAI::Const::VERBS)
|
8
|
+
# Shared stuff
|
9
|
+
require 'oai/exception'
|
10
|
+
require 'oai/constants'
|
11
|
+
require 'oai/xpath'
|
12
|
+
require 'oai/set'
|
13
|
+
end
|
14
|
+
|
15
|
+
# Localize requires so user can select a subset of functionality
|
16
|
+
require 'oai/client/metadata_format'
|
17
|
+
require 'oai/client/response'
|
18
|
+
require 'oai/client/header'
|
19
|
+
require 'oai/client/record'
|
20
|
+
require 'oai/client/identify'
|
21
|
+
require 'oai/client/get_record'
|
22
|
+
require 'oai/client/list_identifiers'
|
23
|
+
require 'oai/client/list_metadata_formats'
|
24
|
+
require 'oai/client/list_records'
|
25
|
+
require 'oai/client/list_sets'
|
26
|
+
|
27
|
+
module OAI
|
28
|
+
|
29
|
+
# A OAI::Client provides a client api for issuing OAI-PMH verbs against
|
30
|
+
# a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
|
31
|
+
# can call on a OAI::Client object. Verb arguments are passed as a hash:
|
32
|
+
#
|
33
|
+
# client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
|
34
|
+
# record = client.get_record :identifier => 'oai:pubmedcentral.gov:13901'
|
35
|
+
# for identifier in client.list_identifiers
|
36
|
+
# puts identifier
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# It is worth noting that the api uses methods and parameter names with
|
40
|
+
# underscores in them rather than studly caps. So above list_identifiers
|
41
|
+
# and metadata_prefix are used instead of the listIdentifiers and
|
42
|
+
# metadataPrefix used in the OAI-PMH specification.
|
43
|
+
#
|
44
|
+
# Also, the from and until arguments which specify dates should be passed
|
45
|
+
# in as Date or DateTime objects depending on the granularity supported
|
46
|
+
# by the server.
|
47
|
+
#
|
48
|
+
# For detailed information on the arguments that can be used please consult
|
49
|
+
# the OAI-PMH docs at:
|
50
|
+
#
|
51
|
+
# http://www.openarchives.org/OAI/openarchivesprotocol.html
|
52
|
+
|
53
|
+
class Client
|
54
|
+
|
55
|
+
# The constructor which must be passed a valid base url for an oai
|
56
|
+
# service:
|
57
|
+
#
|
58
|
+
# client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
|
59
|
+
#
|
60
|
+
# If you want to see debugging messages on STDERR use:
|
61
|
+
#
|
62
|
+
# client = OAI::Client.new 'http://example.com', :debug => true
|
63
|
+
#
|
64
|
+
# By default OAI verbs called on the client will return REXML::Element
|
65
|
+
# objects for metadata records, however if you wish you can use the
|
66
|
+
# :parser option to indicate you want to use 'libxml' instead, and get
|
67
|
+
# back XML::Node objects
|
68
|
+
#
|
69
|
+
# client = OAI::Client.new 'http://example.com', :parser => 'libxml'
|
70
|
+
#
|
71
|
+
# === HIGH PERFORMANCE
|
72
|
+
#
|
73
|
+
# If you want to supercharge this api install libxml-ruby >= 0.3.8 and
|
74
|
+
# use the :parser option when you construct your OAI::Client.
|
75
|
+
#
|
76
|
+
def initialize(base_url, options={})
  # Parse eagerly so a malformed base URL fails at construction time.
  @base = URI.parse base_url
  @debug = options.fetch(:debug, false)
  @parser = options.fetch(:parser, 'rexml')
  # Note the option key is :redirects (default true).
  @follow_redirects = options.fetch(:redirects, true)

  # Load only the XML backend that was asked for.
  case @parser
  when 'libxml'
    begin
      require 'rubygems'
      require 'xml/libxml'
    rescue LoadError, StandardError
      # A missing library raises LoadError, which is a ScriptError and is
      # therefore NOT caught by a bare `rescue`; it has to be named.
      raise OAI::Exception.new("xml/libxml not available")
    end
  when 'rexml'
    require 'rexml/document'
    require 'rexml/xpath'
  else
    raise OAI::Exception.new("unknown parser: #{@parser}")
  end
end
|
98
|
+
|
99
|
+
# Equivalent to a Identify request. You'll get back a OAI::IdentifyResponse
|
100
|
+
# object which is essentially just a wrapper around a REXML::Document
|
101
|
+
# for the response. If you created your client using the libxml
|
102
|
+
# parser then you will get an XML::Node object instead.
|
103
|
+
|
104
|
+
def identify
  # Issue the Identify verb (no arguments) and wrap the parsed document.
  document = do_request('Identify')
  OAI::IdentifyResponse.new(document)
end
|
107
|
+
|
108
|
+
# Equivalent to a ListMetadataFormats request. A ListMetadataFormatsResponse
|
109
|
+
# object is returned to you.
|
110
|
+
|
111
|
+
def list_metadata_formats(opts={})
  # Issue ListMetadataFormats with the given options and wrap the parsed document.
  document = do_request('ListMetadataFormats', opts)
  OAI::ListMetadataFormatsResponse.new(document)
end
|
114
|
+
|
115
|
+
# Equivalent to a ListIdentifiers request. Pass in :from, :until arguments
|
116
|
+
# as Date or DateTime objects as appropriate depending on the granularity
|
117
|
+
# supported by the server.
|
118
|
+
|
119
|
+
def list_identifiers(opts={})
  # Issue ListIdentifiers with the given options and wrap the parsed document.
  document = do_request('ListIdentifiers', opts)
  OAI::ListIdentifiersResponse.new(document)
end
|
122
|
+
|
123
|
+
# Equivalent to a GetRecord request. You must supply an identifier
|
124
|
+
# argument. You should get back a OAI::GetRecordResponse object
|
125
|
+
# which you can extract a OAI::Record object from.
|
126
|
+
|
127
|
+
def get_record(opts={})
  # Issue GetRecord with the given options and wrap the parsed document.
  document = do_request('GetRecord', opts)
  OAI::GetRecordResponse.new(document)
end
|
130
|
+
|
131
|
+
# Equivalent to the ListRecords request. A ListRecordsResponse
|
132
|
+
# will be returned which you can use to iterate through records
|
133
|
+
#
|
134
|
+
# for record in client.list_records
|
135
|
+
# puts record.metadata
|
136
|
+
# end
|
137
|
+
|
138
|
+
def list_records(opts={})
  # Issue ListRecords with the given options and wrap the parsed document.
  document = do_request('ListRecords', opts)
  OAI::ListRecordsResponse.new(document)
end
|
141
|
+
|
142
|
+
# Equivalent to the ListSets request. A ListSetsResponse object
|
143
|
+
# will be returned which you can use for iterating through the
|
144
|
+
# OAI::Set objects
|
145
|
+
#
|
146
|
+
# for set in client.list_sets
|
147
|
+
# puts set
|
148
|
+
# end
|
149
|
+
|
150
|
+
def list_sets(opts={})
  # Issue ListSets with the given options and wrap the parsed document.
  document = do_request('ListSets', opts)
  OAI::ListSetsResponse.new(document)
end
|
153
|
+
|
154
|
+
private
|
155
|
+
|
156
|
+
# Fire off the request for +verb+ and return the parsed DOM object.
#
# verb - OAI-PMH verb name, e.g. 'ListRecords'.
# opts - Hash of verb arguments, or nil for none.
#
# Returns whatever load_document produces for the configured parser.
def do_request(verb, opts = nil)
  uri = build_uri(verb, opts)
  xml = strip_invalid_utf_8_chars(get(uri))
  if @parser == 'libxml'
    # Remove the default namespace declaration for OAI-PMH, since libxml
    # isn't able to use our xpaths to get at the elements otherwise.
    # The dots in the host name are escaped so only the real namespace URI
    # matches; `.\..` still matches any one-digit major.minor version.
    xml = xml.gsub(%r{xmlns="http://www\.openarchives\.org/OAI/.\../"}, '')
  end
  load_document(xml)
end
|
169
|
+
|
170
|
+
# Build the request URI for +verb+: a copy of the base URL whose query string
# is "verb=<verb>" followed by the validated options, with each key
# externalized and each value encoded.
#
# Raises whatever validate_options raises for an unknown verb or bad options.
def build_uri(verb, opts)
  params = validate_options(verb, opts)
  pairs = params.map { |key, value| "#{externalize(key)}=#{encode(value)}" }

  # Assemble the whole query first and assign it once. Appending to the
  # string returned by uri.query mutates the URI's internal state behind
  # its back and breaks when string literals are frozen.
  uri = @base.dup
  uri.query = (["verb=#{verb}"] + pairs).join('&')
  uri
end
|
177
|
+
|
178
|
+
# Turn an option value into its query-string form.
#
# * Anything without #strftime is stringified and CGI-escaped.
# * DateTime and Time values become UTC timestamps (YYYY-MM-DDThh:mm:ssZ).
# * Any other date-like value becomes YYYY-MM-DD.
def encode(value)
  return CGI.escape(value.to_s) unless value.respond_to?(:strftime)

  if defined?(DateTime) && value.kind_of?(DateTime)
    # new_offset(0) honours the DateTime's own UTC offset; going through
    # asctime discarded it and reinterpreted the clock time as local time.
    value.new_offset(0).strftime('%Y-%m-%dT%H:%M:%SZ')
  elsif value.kind_of?(Time)
    # getutc returns a converted copy; Time#utc would switch the caller's
    # object to UTC in place.
    value.getutc.strftime('%Y-%m-%dT%H:%M:%SZ')
  else # Assume something date like
    value.strftime('%Y-%m-%d')
  end
end
|
188
|
+
|
189
|
+
# Parse +xml+ with the parser selected at construction time.
#
# Returns the parsed document (libxml: result of XML::Parser#parse,
# rexml: a REXML::Document).
# Raises OAI::Exception when the response is not well formed XML.
def load_document(xml)
  case @parser
  when 'libxml'
    begin
      parser = XML::Parser.new
      parser.string = xml
      parser.parse
    rescue XML::Parser::ParseError => e
      # Use e.message: adding the exception object itself to a String
      # raises TypeError and hides the real parse error.
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  when 'rexml'
    begin
      REXML::Document.new(xml)
    rescue REXML::ParseException => e
      raise OAI::Exception, "response not well formed XML: #{e.message}", caller
    end
  end
end
|
207
|
+
|
208
|
+
# Do the actual HTTP get, following any temporary redirects
|
209
|
+
# Perform the HTTP GET and return the response body, following redirects.
#
# uri            - URI to fetch.
# redirect_limit - how many further redirects may be followed (default 10),
#                  so a redirect loop cannot recurse forever.
#
# Temporary redirects (302/307) are always followed; permanent ones (301)
# only when the client was created with redirects enabled.
# Raises ArgumentError for a refused permanent redirect, too many redirects,
# or any other non-success response.
def get(uri, redirect_limit = 10)
  raise ArgumentError, "Too many redirects; last location [#{uri}]" if redirect_limit < 0

  response = Net::HTTP.get_response(uri)
  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPMovedPermanently
    unless @follow_redirects
      raise ArgumentError, "Permanently Redirected to [#{response['location']}]"
    end
    # merge resolves a relative Location header against the current URI.
    get(uri.merge(response['location']), redirect_limit - 1)
  when Net::HTTPTemporaryRedirect, Net::HTTPFound
    get(uri.merge(response['location']), redirect_limit - 1)
  else
    raise ArgumentError, "#{response.code_type} [#{response.code}]"
  end
end
|
226
|
+
|
227
|
+
def debug(msg)
  # Diagnostics go to STDERR, and only when the :debug option was set.
  return unless @debug

  $stderr.print("#{msg}\n")
end
|
230
|
+
|
231
|
+
# Massage the standard OAI options to make them a bit more palatable.
|
232
|
+
# Massage the standard OAI options to make them a bit more palatable.
#
# verb - OAI-PMH verb name; must be a key of OAI::Const::VERBS.
# opts - Hash of options (String or Symbol keys, camelCase or underscored),
#        or nil.
#
# Returns a NEW hash keyed by underscored symbols; the caller's hash is left
# untouched. A default :metadata_prefix of 'oai_dc' is added for verbs that
# accept one.
# Raises OAI::VerbException for an unknown verb and OAI::ArgumentException
# for options the verb does not accept.
def validate_options(verb, opts = {})
  raise OAI::VerbException.new unless OAI::Const::VERBS.key?(verb)

  return {} if opts.nil?

  raise OAI::ArgumentException.new unless opts.respond_to?(:keys)

  # Internalize the keys ('metadataPrefix' => :metadata_prefix). Values are
  # read, not deleted, so the caller's options survive the call.
  realopts = {}
  opts.keys.each do |key|
    realopts[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = opts[key]
  end

  # A resumption token must travel alone; nothing else to validate then.
  return realopts if is_resumption?(realopts)

  allowed = OAI::Const::VERBS[verb]

  # add in a default metadataPrefix if none exists
  realopts[:metadata_prefix] ||= 'oai_dc' if allowed.include?(:metadata_prefix)

  # check for any bad options
  raise OAI::ArgumentException.new unless (realopts.keys - allowed).empty?

  realopts
end
|
262
|
+
|
263
|
+
def is_resumption?(opts)
  # Without a token this is an ordinary request (falls through with nil).
  return unless opts.key?(:resumption_token)

  # A resumption token is only legal when it is the sole option.
  raise OAI::ArgumentException.new unless opts.size == 1

  true
end
|
269
|
+
|
270
|
+
# Convert our internal representations back into standard OAI options
|
271
|
+
def externalize(value)
  # :metadata_prefix => "metadataPrefix": drop each underscore and upcase
  # the lowercase letter that followed it.
  value.to_s.gsub(/_([a-z])/) { $1.upcase }
end
|
274
|
+
|
275
|
+
# Coerce +value+ into a date-like object.
#
# Values that already respond to #strftime are returned unchanged; anything
# else is parsed as a date string and returned as a UTC Time.
# Raises OAI::ArgumentException when the value cannot be parsed.
def parse_date(value)
  return value if value.respond_to?(:strftime)

  Date.parse(value) # This will raise an exception for badly formatted dates
  Time.parse(value).utc # Sadly, this will not
rescue StandardError
  # The class is ArgumentException; OAI::ArgumentError does not exist and
  # referencing it raised NameError instead of the intended error.
  raise OAI::ArgumentException.new
end
|
283
|
+
|
284
|
+
# Strip out invalid UTF-8 characters. Regex from the W3C, inverted.
|
285
|
+
# http://www.w3.org/International/questions/qa-forms-utf-8.en.php
|
286
|
+
#
|
287
|
+
# Regex is from WebCollab:
|
288
|
+
# http://webcollab.sourceforge.net/unicode.html
|
289
|
+
# Replace byte sequences that are not acceptable UTF-8 with '?'.
#
# The patterns match on raw bytes, so the work is done on a binary copy of
# +xml+ and the result is tagged with the input's original encoding again.
# The input string itself is not modified.
def strip_invalid_utf_8_chars(xml)
  original_encoding = xml.respond_to?(:encoding) ? xml.encoding : nil
  bytes = xml.dup
  bytes.force_encoding('BINARY') if bytes.respond_to?(:force_encoding)

  # /n makes the \xNN escapes plain bytes (without it the literal is rejected
  # as an invalid multibyte escape on Ruby >= 1.9). The redundant \x10 of the
  # original class is dropped; it is already covered by \x0E-\x19.
  simple_bytes = bytes.gsub(/[\x00-\x08\x0B\x0C\x0E-\x19\x7F]
    | [\x00-\x7F][\x80-\xBF]+
    | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
    | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
    | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
    | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/nx, '?')

  # Overlong three-byte forms and UTF-16 surrogates. This pattern is spread
  # over two lines, so it needs /x as well; without it the line break and
  # the space were matched literally and the pattern almost never applied.
  cleaned = simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
    | \xED[\xA0-\xBF][\x80-\xBF]/nx, '?')

  cleaned.force_encoding(original_encoding) if original_encoding
  cleaned
end
|
299
|
+
|
300
|
+
end
|
301
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module OAI

  module Const
    # The six OAI-PMH verbs, each mapped to the option keys (underscored
    # symbols) accepted for it.
    VERBS = {
      'Identify'            => [],
      'ListMetadataFormats' => [:identifier],
      'ListSets'            => [:resumption_token], # unused currently
      'GetRecord'           => [:identifier, :from, :until, :set, :metadata_prefix],
      'ListIdentifiers'     => [:from, :until, :set, :metadata_prefix, :resumption_token],
      'ListRecords'         => [:from, :until, :set, :metadata_prefix, :resumption_token]
    }.freeze

    RESERVED_WORDS = ['type', 'id']

    # Two granularities are supported in OAI-PMH, daily or seconds.
    module Granularity
      LOW  = 'YYYY-MM-DD'
      HIGH = 'YYYY-MM-DDThh:mm:ssZ'
    end

    # Repositories can support three different schemes for dealing with deletions.
    # * NO - No deletions allowed
    # * TRANSIENT - Deletions are supported but may not be permanently maintained.
    # * PERSISTENT - Deletions are supported and are permanently maintained.
    module Delete
      NO         = :no
      TRANSIENT  = :transient
      PERSISTENT = :persistent
    end

  end

end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module OAI

  # Standard error responses for problems serving OAI content. These
  # messages will be wrapped in an XML response to the client.

  # Base class: a message plus an optional OAI-PMH error code string.
  class Exception < RuntimeError
    attr_reader :code

    def initialize(message, code = nil)
      super(message)
      @code = code
    end
  end

  # code: badArgument
  class ArgumentException < Exception
    def initialize
      text = 'The request includes illegal arguments, is missing required ' +
             'arguments, includes a repeated argument, or values for ' +
             'arguments have an illegal syntax.'
      super(text, 'badArgument')
    end
  end

  # code: badVerb
  class VerbException < Exception
    def initialize
      text = 'Value of the verb argument is not a legal OAI-PMH verb, ' +
             'the verb argument is missing, or the verb argument is repeated.'
      super(text, 'badVerb')
    end
  end

  # code: cannotDisseminateFormat
  class FormatException < Exception
    def initialize
      text = 'The metadata format identified by the value given for the ' +
             'metadataPrefix argument is not supported by the item or by ' +
             'the repository.'
      super(text, 'cannotDisseminateFormat')
    end
  end

  # code: idDoesNotExist
  class IdException < Exception
    def initialize
      text = 'The value of the identifier argument is unknown or illegal ' +
             'in this repository.'
      super(text, 'idDoesNotExist')
    end
  end

  # code: noRecordsMatch
  class NoMatchException < Exception
    def initialize
      text = 'The combination of the values of the from, until, set and ' +
             'metadataPrefix arguments results in an empty list.'
      super(text, 'noRecordsMatch')
    end
  end

  # code: noMetadataFormats
  class MetadataFormatException < Exception
    def initialize
      text = 'There are no metadata formats available for the specified item.'
      super(text, 'noMetadataFormats')
    end
  end

  # code: noSetHierarchy
  class SetException < Exception
    def initialize
      text = 'This repository does not support sets.'
      super(text, 'noSetHierarchy')
    end
  end

  # code: badResumptionToken
  class ResumptionTokenException < Exception
    def initialize
      text = 'The value of the resumptionToken argument is invalid or expired.'
      super(text, 'badResumptionToken')
    end
  end

end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# Created by William Groppe on 2006-11-05.
|
3
|
+
# Copyright (c) 2006. All rights reserved.
|
4
|
+
|
5
|
+
module OAI
  module Harvester

    LOW_RESOLUTION = "YYYY-MM-DD"

    # Harvester settings, held as OpenStruct attributes and persisted as YAML
    # at the path chosen by Config.find_config.
    class Config < OpenStruct

      PERIODS = %w(daily weekly monthly)
      GLOBAL = "/etc/oai/harvester.yml"

      # Load the saved settings, or return an empty Config when no settings
      # file exists yet or no location for one can be determined.
      def self.load
        config = find_config
        return new unless config && File.exist?(config)

        new(read_yaml(config))
      end

      # Write the current attribute table to the settings file as YAML.
      def save
        config = Config.find_config
        # File.open rather than Kernel#open: the latter treats a leading "|"
        # in the path as a command to run.
        File.open(config, 'w') do |out|
          YAML.dump(@table, out)
        end
      end

      # Shamelessly lifted from Camping
      #
      # Returns the settings path: the global file when it exists and is
      # writable, otherwise a per-user file (whose directory is created if
      # needed), or nil when neither HOME nor APPDATA is set.
      #
      # This stays public: #save calls it with an explicit receiver, and the
      # `private` marker that used to precede it never applied to a
      # `def self.` method anyway.
      def self.find_config
        if home = ENV['HOME'] # POSIX
          return GLOBAL if File.exist?(GLOBAL) && File.writable?(GLOBAL)
          FileUtils.mkdir_p File.join(home, '.oai')
          File.join(home, '.oai/harvester.yml')
        elsif home = ENV['APPDATA'] # MSWIN
          FileUtils.mkdir_p File.join(home, 'oai')
          File.join(home, 'oai/harvester.yml')
        end
      end

      # Read the user's own settings file. The saved table holds Symbol keys
      # and Time values ('last' harvest times), which the restricted loader
      # of newer YAML versions refuses; the file is local and written by
      # #save, so the unrestricted loader is used where it exists.
      def self.read_yaml(path)
        if YAML.respond_to?(:unsafe_load_file)
          YAML.unsafe_load_file(path)
        else
          YAML.load_file(path)
        end
      end
      private_class_method :read_yaml

    end
  end
end
|
@@ -0,0 +1,150 @@
|
|
1
|
+
#
|
2
|
+
# Created by William Groppe on 2006-11-03.
|
3
|
+
|
4
|
+
module OAI
|
5
|
+
module Harvester
|
6
|
+
|
7
|
+
class Harvest
|
8
|
+
|
9
|
+
def initialize(config = nil, directory = nil, date = nil)
  # Fall back to the saved configuration and its storage directory.
  @config = config || Config.load
  @directory = directory || @config.storage
  # freeze returns its receiver, so this stores the very object passed in
  # (now frozen), or nil.
  @from = date.freeze
  # Use libxml when it has already been loaded, REXML otherwise.
  @parser = if defined?(XML::Document)
    'libxml'
  else
    'rexml'
  end
end
|
16
|
+
|
17
|
+
def start(sites = nil, interactive = false)
  @interactive = interactive

  # No explicit list: harvest every configured site, or nothing at all if
  # the configured sites cannot be enumerated.
  unless sites
    sites = begin
      @config.sites.keys
    rescue StandardError
      {}
    end
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    # Persist the configuration even when a harvest raised.
    @config.save
  end
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# Harvest one configured site and move the downloaded archive into the
# date-based storage directory.
#
# site - key into @config.sites.
#
# On success the site's 'last' entry is set to the harvest time. A
# noRecordsMatch error is reported with a message in interactive mode; every
# other error, and noRecordsMatch in non-interactive mode, is re-raised.
def harvest(site)
  opts = build_options_hash(@config.sites[site])
  harvest_time = Time.now.utc

  if "YYYY-MM-DD" == granularity(opts[:url])
    opts[:until] = harvest_time.strftime("%Y-%m-%d")
    if @from
      opts[:from] = @from.strftime("%Y-%m-%d")
    elsif opts[:from]
      # The stored 'last' time comes back from build_options_hash with
      # seconds resolution; trim it to the day granularity in use here.
      opts[:from] = Time.parse(opts[:from]).utc.strftime("%Y-%m-%d")
    end
  else
    opts[:until] = harvest_time.xmlschema
    opts[:from] = @from.xmlschema if @from
  end

  # Allow a from date to be passed in
  opts[:from] = earliest(opts[:url]) unless opts[:from]
  opts.delete(:set) if 'all' == opts[:set]

  begin
    # Connect, and download
    file, records = call(opts.delete(:url), opts)

    # Move document to storage directory
    dir = File.join(@directory, date_based_directory(harvest_time))
    FileUtils.mkdir_p dir
    archive = "#{site}-#{filename(Time.parse(opts[:from]), harvest_time)}.xml.gz"
    FileUtils.mv(file.path, File.join(dir, archive))
    @config.sites[site]['last'] = harvest_time
  rescue
    error = $!
    raise unless error.respond_to?(:code)
    # `not a || b` parses as `not (a || b)`, which swallowed every error
    # except a non-interactive noRecordsMatch. Only an interactive
    # noRecordsMatch is meant to be reported and ignored.
    raise if !@interactive || "noRecordsMatch" != error.code
    puts "No new records available"
  end
end
|
64
|
+
|
65
|
+
# Download every record for the given options into a gzipped temp file.
#
# url  - base URL of the repository.
# opts - ListRecords options; a copy is used, so the hash is not modified.
#
# Returns [file, records]: the closed Tempfile and the number of records
# written to it.
def call(url, opts)
  # Preserve original options
  options = opts.dup

  records = 0
  client = OAI::Client.new(url, :parser => @parser)
  # The Identify result is not used; the request itself is kept so the
  # sequence of requests sent to the repository is unchanged.
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  # "<?xml" must not contain a space, or the file is not well-formed XML.
  gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
  gz << "<records>"
  begin
    response = client.list_records(options)

    # Get a full response by iterating with the resumption tokens.
    loop do
      get_records(response.doc).each do |rec|
        gz << rec
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      token = response.resumption_token
      break unless token && !token.empty?

      puts "\nresumption token recieved, continuing" if @interactive
      response = client.list_records(:resumption_token => token)
    end

    gz << "</records>"
  ensure
    gz.close
    file.close
  end

  [file, records]
end
|
108
|
+
|
109
|
+
def get_records(doc)
  # Collect every ListRecords <record> match into a plain Array.
  matches = doc.find("/OAI-PMH/ListRecords/record")
  matches.to_a
end
|
112
|
+
|
113
|
+
def build_options_hash(site)
  # Translate a site's configuration entries into request options;
  # optional entries are only included when they are set.
  set, last, prefix = site['set'], site['last'], site['prefix']

  options = { :url => site['url'] }
  options[:set] = set if set
  options[:from] = last.utc.xmlschema if last
  options[:metadata_prefix] = prefix if prefix
  options
end
|
120
|
+
|
121
|
+
def date_based_directory(time)
  # Relative storage directory for +time+, following DIRECTORY_LAYOUT.
  time.strftime(DIRECTORY_LAYOUT)
end
|
124
|
+
|
125
|
+
def filename(from_time, until_time)
  # "<from day>_til_<until day>_at_<until time of day>"
  day = "%Y-%m-%d"
  parts = [
    from_time.strftime(day),
    '_til_',
    until_time.strftime(day),
    '_at_',
    until_time.strftime('%H-%M-%S')
  ]
  parts.join
end
|
130
|
+
|
131
|
+
def granularity(url)
  # Ask the repository at +url+ to identify itself and report the
  # granularity from that response.
  identify = OAI::Client.new(url).identify
  identify.granularity
end
|
135
|
+
|
136
|
+
# Get earliest timestamp from repository
|
137
|
+
# Get earliest timestamp from repository, formatted for the repository's
# own granularity: YYYY-MM-DD for day granularity, otherwise a UTC
# timestamp (YYYY-MM-DDThh:mm:ssZ).
def earliest(url)
  identify = OAI::Client.new(url).identify
  stamp = Time.parse(identify.earliest_datestamp)

  if "YYYY-MM-DD" == identify.granularity
    stamp.strftime("%Y-%m-%d")
  else
    # A datestamp without a zone is parsed as local time; convert to UTC so
    # the result never carries a "+hh:mm" offset.
    stamp.utc.xmlschema
  end
end
|
146
|
+
|
147
|
+
end
|
148
|
+
|
149
|
+
end
|
150
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Reopen Harvest and add logging
|
2
|
+
module OAI
|
3
|
+
module Harvester
|
4
|
+
|
5
|
+
class Harvest
|
6
|
+
alias_method :orig_start, :start
|
7
|
+
alias_method :orig_harvest, :harvest
|
8
|
+
alias_method :orig_call, :call
|
9
|
+
alias_method :orig_init, :initialize
|
10
|
+
|
11
|
+
# Run the original constructor, then set up the run summary and a logger.
#
# With a configured log directory, entries go to "harvester.log" in that
# directory, rotated weekly. Without one, a logger with no log device is
# used, so @logger is always usable; previously it was nil in that case and
# the next line raised NoMethodError.
def initialize(*args)
  orig_init(*args)
  @summary = []

  if @config.logfile
    @logger = Logger.new(File.join(@config.logfile, "harvester.log"), 'weekly')
  else
    # Turn off logging if no logging directory is specified.
    @logger = Logger.new(nil)
    @logger.level = Logger::FATAL
  end
  @logger.datetime_format = "%Y-%m-%d %H:%M"
end
|
21
|
+
|
22
|
+
def start(sites = nil, interactive = false)
  if interactive
    @logger.info { "Starting interactive harvest"}
    orig_start(sites, true)
  else
    @logger.info { "Starting regular harvest" }
    orig_start(sites)
    # A failure to mail the summary is logged, not raised.
    begin
      OAI::Harvester::Mailer.send(@config.mail_server, @config.email, @summary)
    rescue StandardError => e
      @logger.error { "Error sending out summary email: #{e}"}
    end
  end
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
# Wrap the original harvest with logging and summary lines. Errors are
# recorded in the log and the summary instead of being raised.
def harvest(site)
  @logger.info { "Harvest of '#{site}' starting" }
  @summary << "Harvest of '#{site}' attempted"
  orig_harvest(site)
rescue OAI::Exception => e
  if "noRecordsMatch" == e.code
    @logger.info "No new records available"
    @summary << "'#{site}' had no new records."
  else
    @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
    @summary << "'#{site}' had an OAI Error! #{e}"
  end
rescue StandardError => e
  @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
  # Double quotes: '\n' in single quotes is a backslash followed by "n",
  # which put the whole backtrace on one line.
  @logger.error { "#{e.backtrace.join("\n")}" }
  @summary << "'#{site}' had an Error! #{e}"
end
|
59
|
+
|
60
|
+
def call(url, options)
  # Log the request, delegate to the original call, then record the count.
  @logger.info { "fetching: #{url} with options #{options.inspect}" }
  outcome = orig_call(url, options)
  file, records = outcome
  @logger.info { "retrieved #{records} records" }
  @summary << "Retrieved #{records} records."
  [file, records]
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module OAI
  module Harvester

    # Sends the harvest summary by SMTP.
    class Mailer

      # Mail +message+ to every address in +email+ through +server+.
      #
      # server  - SMTP server host name.
      # email   - a recipient address or an Array of them.
      # message - a summary line or an Array of them.
      #
      # The sender is "harvester@<local host name>".
      def self.send(server = nil, email = nil, message = nil)
        # Array() accepts nil, a single value or an array alike.
        recipients = Array(email)
        lines = Array(message)

        msg = %{Subject: Harvester Summary\n\n#{lines.join("\n")}}

        Net::SMTP.start(server) do |smtp|
          # One envelope recipient per address. Quoting the addresses and
          # joining them with commas produced a single invalid recipient.
          smtp.send_message msg, "harvester@#{Socket.gethostname}", *recipients
        end
      end

    end

  end
end
|