oai_talia 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (84) hide show
  1. data/README +81 -0
  2. data/Rakefile +127 -0
  3. data/bin/oai +68 -0
  4. data/examples/models/file_model.rb +63 -0
  5. data/examples/providers/dublin_core.rb +474 -0
  6. data/lib/oai/client/get_record.rb +15 -0
  7. data/lib/oai/client/header.rb +18 -0
  8. data/lib/oai/client/identify.rb +30 -0
  9. data/lib/oai/client/list_identifiers.rb +12 -0
  10. data/lib/oai/client/list_metadata_formats.rb +12 -0
  11. data/lib/oai/client/list_records.rb +21 -0
  12. data/lib/oai/client/list_sets.rb +19 -0
  13. data/lib/oai/client/metadata_format.rb +12 -0
  14. data/lib/oai/client/record.rb +26 -0
  15. data/lib/oai/client/response.rb +35 -0
  16. data/lib/oai/client.rb +301 -0
  17. data/lib/oai/constants.rb +34 -0
  18. data/lib/oai/exception.rb +75 -0
  19. data/lib/oai/harvester/config.rb +41 -0
  20. data/lib/oai/harvester/harvest.rb +150 -0
  21. data/lib/oai/harvester/logging.rb +70 -0
  22. data/lib/oai/harvester/mailer.rb +17 -0
  23. data/lib/oai/harvester/shell.rb +338 -0
  24. data/lib/oai/harvester.rb +39 -0
  25. data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
  26. data/lib/oai/provider/metadata_format/oai_europeana.rb +38 -0
  27. data/lib/oai/provider/metadata_format.rb +143 -0
  28. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +134 -0
  29. data/lib/oai/provider/model/activerecord_wrapper.rb +139 -0
  30. data/lib/oai/provider/model.rb +74 -0
  31. data/lib/oai/provider/partial_result.rb +18 -0
  32. data/lib/oai/provider/response/error.rb +16 -0
  33. data/lib/oai/provider/response/get_record.rb +26 -0
  34. data/lib/oai/provider/response/identify.rb +25 -0
  35. data/lib/oai/provider/response/list_identifiers.rb +35 -0
  36. data/lib/oai/provider/response/list_metadata_formats.rb +34 -0
  37. data/lib/oai/provider/response/list_records.rb +34 -0
  38. data/lib/oai/provider/response/list_sets.rb +23 -0
  39. data/lib/oai/provider/response/record_response.rb +70 -0
  40. data/lib/oai/provider/response.rb +161 -0
  41. data/lib/oai/provider/resumption_token.rb +106 -0
  42. data/lib/oai/provider.rb +304 -0
  43. data/lib/oai/set.rb +29 -0
  44. data/lib/oai/xpath.rb +75 -0
  45. data/lib/oai.rb +8 -0
  46. data/lib/test.rb +25 -0
  47. data/test/activerecord_provider/config/connection.rb +5 -0
  48. data/test/activerecord_provider/config/database.yml +6 -0
  49. data/test/activerecord_provider/database/ar_migration.rb +59 -0
  50. data/test/activerecord_provider/database/oaipmhtest +0 -0
  51. data/test/activerecord_provider/fixtures/dc.yml +1501 -0
  52. data/test/activerecord_provider/helpers/providers.rb +44 -0
  53. data/test/activerecord_provider/helpers/set_provider.rb +36 -0
  54. data/test/activerecord_provider/models/dc_field.rb +7 -0
  55. data/test/activerecord_provider/models/dc_set.rb +6 -0
  56. data/test/activerecord_provider/models/oai_token.rb +3 -0
  57. data/test/activerecord_provider/tc_ar_provider.rb +113 -0
  58. data/test/activerecord_provider/tc_ar_sets_provider.rb +72 -0
  59. data/test/activerecord_provider/tc_caching_paging_provider.rb +55 -0
  60. data/test/activerecord_provider/tc_simple_paging_provider.rb +57 -0
  61. data/test/activerecord_provider/test_helper.rb +4 -0
  62. data/test/client/helpers/provider.rb +68 -0
  63. data/test/client/helpers/test_wrapper.rb +11 -0
  64. data/test/client/tc_exception.rb +36 -0
  65. data/test/client/tc_get_record.rb +37 -0
  66. data/test/client/tc_identify.rb +13 -0
  67. data/test/client/tc_libxml.rb +61 -0
  68. data/test/client/tc_list_identifiers.rb +52 -0
  69. data/test/client/tc_list_metadata_formats.rb +18 -0
  70. data/test/client/tc_list_records.rb +13 -0
  71. data/test/client/tc_list_sets.rb +19 -0
  72. data/test/client/tc_low_resolution_dates.rb +14 -0
  73. data/test/client/tc_utf8_escaping.rb +11 -0
  74. data/test/client/tc_xpath.rb +26 -0
  75. data/test/client/test_helper.rb +5 -0
  76. data/test/provider/models.rb +234 -0
  77. data/test/provider/tc_exceptions.rb +96 -0
  78. data/test/provider/tc_functional_tokens.rb +43 -0
  79. data/test/provider/tc_provider.rb +71 -0
  80. data/test/provider/tc_resumption_tokens.rb +46 -0
  81. data/test/provider/tc_simple_provider.rb +92 -0
  82. data/test/provider/test_helper.rb +36 -0
  83. data/test/test.xml +22 -0
  84. metadata +181 -0
data/lib/oai/client.rb ADDED
@@ -0,0 +1,301 @@
1
+ # External dependencies
2
+ require 'uri'
3
+ require 'net/http'
4
+ require 'cgi'
5
+ require 'iconv'
6
+
7
+ if not defined?(OAI::Const::VERBS)
8
+ # Shared stuff
9
+ require 'oai/exception'
10
+ require 'oai/constants'
11
+ require 'oai/xpath'
12
+ require 'oai/set'
13
+ end
14
+
15
+ # Localize requires so user can select a subset of functionality
16
+ require 'oai/client/metadata_format'
17
+ require 'oai/client/response'
18
+ require 'oai/client/header'
19
+ require 'oai/client/record'
20
+ require 'oai/client/identify'
21
+ require 'oai/client/get_record'
22
+ require 'oai/client/list_identifiers'
23
+ require 'oai/client/list_metadata_formats'
24
+ require 'oai/client/list_records'
25
+ require 'oai/client/list_sets'
26
+
27
+ module OAI
28
+
29
+ # A OAI::Client provides a client api for issuing OAI-PMH verbs against
30
+ # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
+ # can call on a OAI::Client object. Verb arguments are passed as a hash:
32
+ #
33
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
34
+ # record = client.get_record :identifier => 'oai:pubmedcentral.gov:13901'
35
+ # for identifier in client.list_identifiers
36
+ # puts identifier
37
+ # end
38
+ #
39
+ # It is worth noting that the api uses methods and parameter names with
40
+ # underscores in them rather than studly caps. So above list_identifiers
41
+ # and metadata_prefix are used instead of the listIdentifiers and
42
+ # metadataPrefix used in the OAI-PMH specification.
43
+ #
44
+ # Also, the from and until arguments which specify dates should be passed
45
+ # in as Date or DateTime objects depending on the granularity supported
46
+ # by the server.
47
+ #
48
+ # For detailed information on the arguments that can be used please consult
49
+ # the OAI-PMH docs at:
50
+ #
51
+ # http://www.openarchives.org/OAI/openarchivesprotocol.html
52
+
53
+ class Client
54
+
55
+ # The constructor which must be passed a valid base url for an oai
56
+ # service:
57
+ #
58
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
+ #
60
+ # If you want to see debugging messages on STDERR use:
61
+ #
62
+ # client = OAI::Client.new 'http://example.com', :debug => true
63
+ #
64
+ # By default OAI verbs called on the client will return REXML::Element
65
+ # objects for metadata records, however if you wish you can use the
66
+ # :parser option to indicate you want to use 'libxml' instead, and get
67
+ # back XML::Node objects
68
+ #
69
+ # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
70
+ #
71
+ # === HIGH PERFORMANCE
72
+ #
73
+ # If you want to supercharge this api install libxml-ruby >= 0.3.8 and
74
+ # use the :parser option when you construct your OAI::Client.
75
+ #
76
# Builds a client for the OAI-PMH service at +base_url+.
#
# Recognised keys in +options+:
# * :debug     - stored in @debug (default false)
# * :parser    - 'rexml' (default) or 'libxml'
# * :redirects - stored in @follow_redirects (default true)
#
# Raises OAI::Exception when the libxml bindings cannot be loaded or when
# an unknown parser name is supplied.
def initialize(base_url, options={})
  @base = URI.parse base_url
  @debug = options.fetch(:debug, false)
  @parser = options.fetch(:parser, 'rexml')
  @follow_redirects = options.fetch(:redirects, true)

  # load appropriate parser
  case @parser
  when 'libxml'
    begin
      require 'rubygems'
      require 'xml/libxml'
    rescue LoadError, StandardError
      # A failed require raises LoadError, which is a ScriptError and is
      # therefore NOT caught by a bare rescue; it has to be named.
      raise OAI::Exception.new("xml/libxml not available")
    end
  when 'rexml'
    require 'rexml/document'
    require 'rexml/xpath'
  else
    raise OAI::Exception.new("unknown parser: #{@parser}")
  end
end
98
+
99
+ # Equivalent to a Identify request. You'll get back a OAI::IdentifyResponse
100
+ # object which is essentially just a wrapper around a REXML::Document
101
+ # for the response. If you created your client using the libxml
102
+ # parser then you will get an XML::Node object instead.
103
+
104
# Sends an Identify request and wraps the parsed response document in an
# OAI::IdentifyResponse.
def identify
  document = do_request('Identify')
  OAI::IdentifyResponse.new(document)
end
107
+
108
+ # Equivalent to a ListMetadataFormats request. A ListMetadataFormatsResponse
109
+ # object is returned to you.
110
+
111
# Sends a ListMetadataFormats request with +opts+ and wraps the parsed
# response document in an OAI::ListMetadataFormatsResponse.
def list_metadata_formats(opts={})
  document = do_request('ListMetadataFormats', opts)
  OAI::ListMetadataFormatsResponse.new(document)
end
114
+
115
+ # Equivalent to a ListIdentifiers request. Pass in :from, :until arguments
116
+ # as Date or DateTime objects as appropriate depending on the granularity
117
+ # supported by the server.
118
+
119
# Sends a ListIdentifiers request with +opts+ and wraps the parsed
# response document in an OAI::ListIdentifiersResponse.
def list_identifiers(opts={})
  document = do_request('ListIdentifiers', opts)
  OAI::ListIdentifiersResponse.new(document)
end
122
+
123
+ # Equivalent to a GetRecord request. You must supply an identifier
124
+ # argument. You should get back a OAI::GetRecordResponse object
125
+ # which you can extract a OAI::Record object from.
126
+
127
# Sends a GetRecord request with +opts+ and wraps the parsed response
# document in an OAI::GetRecordResponse.
def get_record(opts={})
  document = do_request('GetRecord', opts)
  OAI::GetRecordResponse.new(document)
end
130
+
131
+ # Equivalent to the ListRecords request. A ListRecordsResponse
132
+ # will be returned which you can use to iterate through records
133
+ #
134
+ # for record in client.list_records
135
+ # puts record.metadata
136
+ # end
137
+
138
# Sends a ListRecords request with +opts+ and wraps the parsed response
# document in an OAI::ListRecordsResponse.
def list_records(opts={})
  document = do_request('ListRecords', opts)
  OAI::ListRecordsResponse.new(document)
end
141
+
142
+ # Equivalent to the ListSets request. A ListSetsResponse object
143
+ # will be returned which you can use for iterating through the
144
+ # OAI::Set objects
145
+ #
146
+ # for set in client.list_sets
147
+ # puts set
148
+ # end
149
+
150
# Sends a ListSets request with +opts+ and wraps the parsed response
# document in an OAI::ListSetsResponse.
def list_sets(opts={})
  document = do_request('ListSets', opts)
  OAI::ListSetsResponse.new(document)
end
153
+
154
+ private
155
+
156
# Performs the request for +verb+ and returns the parsed document:
# builds the URI, fetches it, replaces invalid UTF-8 bytes and hands the
# text to load_document.
def do_request(verb, opts = nil)
  xml = strip_invalid_utf_8_chars(get(build_uri(verb, opts)))
  return load_document(xml) unless @parser == 'libxml'

  # The default OAI-PMH namespace declaration is removed for libxml
  # because the xpaths used with that parser do not match namespaced
  # elements.
  load_document(
    xml.gsub(/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, ''))
end
169
+
170
# Returns a copy of the base URI whose query string holds the verb
# followed by the validated options (externalized names, encoded values).
#
# The query is assembled as one string and assigned through the setter
# once, instead of appending to whatever object URI#query returns, so the
# result does not depend on URI exposing its internal buffer.
def build_uri(verb, opts)
  opts = validate_options(verb, opts)
  params = opts.map { |key, value| "#{externalize(key)}=#{encode(value)}" }
  uri = @base.clone
  uri.query = (["verb=#{verb}"] + params).join('&')
  uri
end
177
+
178
# Turns an option value into its query-string form.
#
# * values without #strftime are stringified and CGI-escaped
# * DateTime and Time values become UTC xmlschema timestamps
# * any other date-like value is formatted as YYYY-MM-DD
#
# DateTime values are converted with #to_time so their UTC offset is
# honoured (going through #asctime drops the offset), and #getutc is used
# so the caller's Time object is not switched to UTC in place.
def encode(value)
  return CGI.escape(value.to_s) unless value.respond_to?(:strftime)
  if defined?(DateTime) && value.kind_of?(DateTime)
    value.to_time.getutc.xmlschema
  elsif value.kind_of?(Time)
    value.getutc.xmlschema
  else # Assume something date like
    value.strftime('%Y-%m-%d')
  end
end
188
+
189
# Parses +xml+ with the parser selected in @parser and returns the
# resulting document object. Returns nil when @parser is neither
# 'libxml' nor 'rexml'.
#
# Raises OAI::Exception when the parser reports malformed XML.
def load_document(xml)
  case @parser
  when 'libxml'
    begin
      parser = XML::Parser.new()
      parser.string = xml
      return parser.parse
    rescue XML::Parser::ParseError => e
      # e.message, not e: String#+ cannot take an exception object and
      # would raise TypeError instead of the intended OAI::Exception.
      raise OAI::Exception, 'response not well formed XML: '+e.message, caller
    end
  when 'rexml'
    begin
      return REXML::Document.new(xml)
    rescue REXML::ParseException => e
      raise OAI::Exception, 'response not well formed XML: '+e.message, caller
    end
  end
end
207
+
208
+ # Do the actual HTTP get, following any temporary redirects
209
# Performs the HTTP GET for +uri+ and returns the response body.
#
# Temporary redirects (302/307) are always followed; permanent redirects
# (301) are followed only when @follow_redirects is set. Location headers
# are resolved against the current URI so relative targets work, and
# +redirect_limit+ bounds the number of hops so a redirect loop ends in
# an error rather than unbounded recursion.
#
# Raises ArgumentError for a refused permanent redirect, for too many
# redirects, and for any other non-success response.
def get(uri, redirect_limit = 10)
  raise ArgumentError, "Too many HTTP redirects" if redirect_limit <= 0

  response = Net::HTTP.get_response(uri)
  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPMovedPermanently
    unless @follow_redirects
      raise ArgumentError, "Permanently Redirected to [#{response['location']}]"
    end
    get(uri.merge(response['location']), redirect_limit - 1)
  when Net::HTTPTemporaryRedirect, Net::HTTPFound
    get(uri.merge(response['location']), redirect_limit - 1)
  else
    raise ArgumentError, "#{response.code_type} [#{response.code}]"
  end
end
226
+
227
# Writes +msg+ followed by a newline to $stderr, but only when the client
# was created with debugging enabled.
def debug(msg)
  return unless @debug
  $stderr.print("#{msg}\n")
end
230
+
231
+ # Massage the standard OAI options to make them a bit more palatable.
232
# Massage the standard OAI options to make them a bit more palatable.
#
# Returns a new hash whose keys are underscored symbols (metadataPrefix
# becomes :metadata_prefix). A lone resumption token is returned as is;
# otherwise 'oai_dc' is filled in as the metadata prefix for verbs that
# accept one and the keys are checked against the verb's allowed options.
#
# The caller's hash is read, never modified: removing the keys from it
# would silently empty the options the caller passed in.
#
# Raises OAI::VerbException for an unknown verb and OAI::ArgumentException
# for a non-hash argument or an option the verb does not accept.
def validate_options(verb, opts = {})
  raise OAI::VerbException.new unless Const::VERBS.key?(verb)

  return {} if opts.nil?

  raise OAI::ArgumentException.new unless opts.respond_to?(:keys)

  # Internalize the hash
  realopts = {}
  opts.keys.each do |key|
    realopts[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = opts[key]
  end

  return realopts if is_resumption?(realopts)

  allowed = Const::VERBS[verb]

  # add in a default metadataPrefix if none exists
  realopts[:metadata_prefix] ||= 'oai_dc' if allowed.include?(:metadata_prefix)

  # check for any bad options
  raise OAI::ArgumentException.new unless (realopts.keys - allowed).empty?

  realopts
end
262
+
263
# Returns true when +opts+ holds a resumption token and nothing else, and
# nil when it holds no resumption token at all. A token combined with any
# other option raises OAI::ArgumentException.
def is_resumption?(opts)
  return unless opts.keys.include?(:resumption_token)
  return true if opts.keys.size == 1
  raise OAI::ArgumentException.new
end
269
+
270
+ # Convert our internal representations back into standard OAI options
271
# Converts an underscored option name back to the camelCase form used on
# the wire, e.g. :metadata_prefix becomes "metadataPrefix".
def externalize(value)
  value.to_s.gsub(/_([a-z])/) { Regexp.last_match(1).upcase }
end
274
+
275
# Returns +value+ unchanged when it already responds to #strftime;
# otherwise parses it as a date string and returns the UTC Time.
#
# Raises OAI::ArgumentException for text that cannot be parsed. (The
# OAI namespace defines no ArgumentError, so referring to that name
# surfaced as a NameError instead of the intended OAI error.)
def parse_date(value)
  return value if value.respond_to?(:strftime)

  Date.parse(value) # This will raise an exception for badly formatted dates
  Time.parse(value).utc # Sadly, this will not
rescue
  raise OAI::ArgumentException.new
end
283
+
284
+ # Strip out invalid UTF-8 characters. Regex from the W3C, inverted.
285
+ # http://www.w3.org/International/questions/qa-forms-utf-8.en.php
286
+ #
287
+ # Regex is from WebCollab:
288
+ # http://webcollab.sourceforge.net/unicode.html
289
# Replaces byte sequences matched by the two expressions below with '?'.
# The first covers some control characters and malformed multi-byte
# sequences, the second overlong 0xE0 forms and UTF-16 surrogates
# encoded as UTF-8.
#
# Both expressions are written across several lines, so both need the
# extended (x) flag; without it the second one contained a literal
# newline and could never match. They are byte-oriented (n flag), so on
# Rubies with string encodings the text is matched through a binary copy
# and the result is tagged with the input's encoding again.
def strip_invalid_utf_8_chars(xml)
  encoded = xml.respond_to?(:force_encoding)
  bytes = encoded ? xml.dup.force_encoding('BINARY') : xml

  simple_bytes = bytes.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
                          | [\x00-\x7F][\x80-\xBF]+
                          | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
                          | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
                          | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
                          | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/nx, '?')
  cleaned = simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
                            | \xED[\xA0-\xBF][\x80-\xBF]/nx, '?')

  encoded ? cleaned.force_encoding(xml.encoding) : cleaned
end
299
+
300
+ end
301
+ end
@@ -0,0 +1,34 @@
1
module OAI

  module Const
    # The six OAI-PMH verbs, each mapped to the option names accepted
    # with it.
    VERBS = {
      'Identify'            => [],
      'ListMetadataFormats' => [:identifier],
      'ListSets'            => [:resumption_token], # unused currently
      'GetRecord'           => [:identifier, :from, :until, :set, :metadata_prefix],
      'ListIdentifiers'     => [:from, :until, :set, :metadata_prefix, :resumption_token],
      'ListRecords'         => [:from, :until, :set, :metadata_prefix, :resumption_token]
    }.freeze

    RESERVED_WORDS = ['type', 'id']

    # OAI-PMH knows two datestamp granularities: whole days, or seconds.
    module Granularity
      LOW  = "YYYY-MM-DD"
      HIGH = "YYYY-MM-DDThh:mm:ssZ"
    end

    # The three ways a repository can treat deleted records:
    # * NO - No deletions allowed
    # * TRANSIENT - Deletions are supported but may not be permanently maintained.
    # * PERSISTENT - Deletions are supported and are permanently maintained.
    module Delete
      NO         = :no
      TRANSIENT  = :transient
      PERSISTENT = :persistent
    end

  end

end
@@ -0,0 +1,75 @@
1
module OAI

  # Error types for problems serving OAI content. Every error carries a
  # human readable message plus an OAI-PMH error code (nil on the base
  # class unless one is passed in).

  class Exception < RuntimeError
    attr_reader :code

    def initialize(message, code = nil)
      super(message)
      @code = code
    end
  end

  # badArgument
  class ArgumentException < Exception
    def initialize
      super('The request includes illegal arguments, is missing required arguments, includes a repeated argument, or values for arguments have an illegal syntax.',
        'badArgument')
    end
  end

  # badVerb
  class VerbException < Exception
    def initialize
      super('Value of the verb argument is not a legal OAI-PMH verb, the verb argument is missing, or the verb argument is repeated.',
        'badVerb')
    end
  end

  # cannotDisseminateFormat
  class FormatException < Exception
    def initialize
      super('The metadata format identified by the value given for the metadataPrefix argument is not supported by the item or by the repository.',
        'cannotDisseminateFormat')
    end
  end

  # idDoesNotExist
  class IdException < Exception
    def initialize
      super('The value of the identifier argument is unknown or illegal in this repository.',
        'idDoesNotExist')
    end
  end

  # noRecordsMatch
  class NoMatchException < Exception
    def initialize
      super('The combination of the values of the from, until, set and metadataPrefix arguments results in an empty list.',
        'noRecordsMatch')
    end
  end

  # noMetadataFormats
  class MetadataFormatException < Exception
    def initialize
      super('There are no metadata formats available for the specified item.',
        'noMetadataFormats')
    end
  end

  # noSetHierarchy
  class SetException < Exception
    def initialize
      super('This repository does not support sets.', 'noSetHierarchy')
    end
  end

  # badResumptionToken
  class ResumptionTokenException < Exception
    def initialize
      super('The value of the resumptionToken argument is invalid or expired.',
        'badResumptionToken')
    end
  end

end
@@ -0,0 +1,41 @@
1
+ #
2
+ # Created by William Groppe on 2006-11-05.
3
+ # Copyright (c) 2006. All rights reserved.
4
+
5
+ module OAI
6
+ module Harvester
7
+
8
+ LOW_RESOLUTION = "YYYY-MM-DD"
9
+
10
# Harvester settings, held in an OpenStruct and persisted as YAML.
class Config < OpenStruct

  PERIODS = %w(daily weekly monthly)
  GLOBAL = "/etc/oai/harvester.yml"

  # Returns a Config populated from the YAML file located by find_config,
  # or an empty Config when no such file exists (or no location could be
  # determined).
  def self.load
    config = find_config
    config && File.exist?(config) ? new(YAML.load_file(config)) : new
  end

  # Writes the current settings as YAML to the file located by
  # find_config.
  def save
    config = Config.find_config
    # File.open rather than Kernel#open: the latter treats a leading '|'
    # in its argument as a command to run.
    File.open(config, 'w') do |out|
      YAML.dump(@table, out)
    end
  end

  # Shamelessly lifted from Camping
  #
  # Picks the configuration file: the writable global file if there is
  # one, otherwise ~/.oai/harvester.yml (creating ~/.oai), otherwise a
  # file under %APPDATA%. Returns nil when neither HOME nor APPDATA is
  # set.
  #
  # This stays public: #save calls it with an explicit receiver, and a
  # bare `private` has no effect on `def self.` methods anyway.
  def self.find_config
    if home = ENV['HOME'] # POSIX
      return GLOBAL if File.exist?(GLOBAL) && File.writable?(GLOBAL)
      FileUtils.mkdir_p File.join(home, '.oai')
      File.join(home, '.oai/harvester.yml')
    elsif home = ENV['APPDATA'] # MSWIN
      File.join(home, 'oai/harvester.yml')
    end
  end

end
40
+ end
41
+ end
@@ -0,0 +1,150 @@
1
+ #
2
+ # Created by William Groppe on 2006-11-03.
3
+
4
+ module OAI
5
+ module Harvester
6
+
7
+ class Harvest
8
+
9
# Sets up a harvest run.
#
# config    - settings object; loaded via Config.load when omitted
# directory - storage directory; falls back to the config's storage value
# date      - optional start date, kept frozen in @from
#
# The parser is 'libxml' when XML::Document is already defined and
# 'rexml' otherwise.
def initialize(config = nil, directory = nil, date = nil)
  @config = config ? config : Config.load
  @directory = directory ? directory : @config.storage
  @from = date.freeze
  @parser = if defined?(XML::Document)
    'libxml'
  else
    'rexml'
  end
end
16
+
17
# Harvests every site in +sites+ (defaulting to all configured site
# names, or nothing when those cannot be read) and saves the
# configuration afterwards, even when a harvest raises.
def start(sites = nil, interactive = false)
  @interactive = interactive
  sites ||= begin
    @config.sites.keys
  rescue StandardError
    {}
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end
28
+
29
+ private
30
+
31
# Harvests one configured site: builds the request options, downloads
# the records through #call, moves the resulting file into a date based
# directory below @directory and records the harvest time as the site's
# 'last' value.
#
# Error handling: an error is swallowed (with a notice printed) only in
# interactive mode and only when its OAI code is "noRecordsMatch". Every
# other error propagates. The condition was previously written as
# `not a || b`, which parses as `not (a || b)` and so hid real failures
# behind the "No new records available" notice.
def harvest(site)
  opts = build_options_hash(@config.sites[site])
  harvest_time = Time.now.utc

  if "YYYY-MM-DD" == granularity(opts[:url])
    opts[:until] = harvest_time.strftime("%Y-%m-%d")
    opts[:from] = @from.strftime("%Y-%m-%d") if @from
  else
    opts[:until] = harvest_time.xmlschema
    opts[:from] = @from.xmlschema if @from
  end

  # Allow a from date to be passed in
  opts[:from] = earliest(opts[:url]) unless opts[:from]
  opts.delete(:set) if 'all' == opts[:set]

  begin
    # Connect, and download
    file, records = call(opts.delete(:url), opts)

    # Move document to storage directory
    dir = File.join(@directory, date_based_directory(harvest_time))
    FileUtils.mkdir_p dir
    FileUtils.mv(file.path,
      File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
      harvest_time)}.xml.gz"))
    @config.sites[site]['last'] = harvest_time
  rescue => e
    # Errors that carry no OAI code are never ours to interpret.
    raise unless e.respond_to?(:code)
    raise unless @interactive && "noRecordsMatch" == e.code
    puts "No new records available"
  end
end
64
+
65
# Downloads all records for +url+ into a gzip-compressed temporary file,
# following resumption tokens until the repository stops sending one.
#
# Returns [tempfile, number_of_records]. The file holds an XML
# declaration followed by a <records> element wrapping every record.
#
# The declaration is written as "<?xml ...?>": the earlier "<? xml"
# form, with a space after "<?", is not a well-formed XML declaration.
def call(url, opts)
  # Preserve original options
  options = opts.dup

  records = 0
  client = OAI::Client.new(url, :parser => @parser)
  # The result is not used here; the request itself is kept as is.
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  gz << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
  gz << "<records>"
  begin
    response = client.list_records(options)
    loop do
      get_records(response.doc).each do |rec|
        gz << rec
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      # Get a full response by iterating with the resumption tokens.
      token = response.resumption_token
      break unless token && !token.empty?

      puts "\nresumption token recieved, continuing" if @interactive
      response = client.list_records(:resumption_token => token)
    end

    gz << "</records>"

  ensure
    gz.close
    file.close
  end

  [file, records]
end
108
+
109
# Returns, as an array, whatever +doc+ finds for the ListRecords record
# path.
def get_records(doc)
  matches = doc.find("/OAI-PMH/ListRecords/record")
  matches.to_a
end
112
+
113
# Translates one site's configuration entry into request options: the
# url is always present, while set, from (taken from the site's 'last'
# time, in UTC xmlschema form) and metadata_prefix are added only when
# the entry provides them.
def build_options_hash(site)
  set, last, prefix = site['set'], site['last'], site['prefix']

  options = { :url => site['url'] }
  options[:set] = set if set
  options[:from] = last.utc.xmlschema if last
  options[:metadata_prefix] = prefix if prefix
  options
end
120
+
121
# Formats +time+ with the DIRECTORY_LAYOUT pattern and returns the result
# as a string.
def date_based_directory(time)
  time.strftime(DIRECTORY_LAYOUT).to_s
end
124
+
125
# Builds the base name for a harvest file:
# "<from date>_til_<until date>_at_<until time>".
def filename(from_time, until_time)
  day = "%Y-%m-%d"
  [
    from_time.strftime(day), 'til',
    until_time.strftime(day), 'at',
    until_time.strftime('%H-%M-%S')
  ].join('_')
end
130
+
131
# Returns the granularity reported by the Identify response of the
# repository at +url+.
def granularity(url)
  OAI::Client.new(url).identify.granularity
end
135
+
136
+ # Get earliest timestamp from repository
137
# Returns the repository's earliest datestamp, formatted to match the
# granularity it reports: a plain date for "YYYY-MM-DD", an xmlschema
# timestamp otherwise.
def earliest(url)
  response = OAI::Client.new(url).identify
  day_only = ("YYYY-MM-DD" == response.granularity)
  stamp = Time.parse(response.earliest_datestamp)
  day_only ? stamp.strftime("%Y-%m-%d") : stamp.xmlschema
end
146
+
147
+ end
148
+
149
+ end
150
+ end
@@ -0,0 +1,70 @@
1
+ # Reopen Harvest and add logging
2
+ module OAI
3
+ module Harvester
4
+
5
+ class Harvest
6
+ alias_method :orig_start, :start
7
+ alias_method :orig_harvest, :harvest
8
+ alias_method :orig_call, :call
9
+ alias_method :orig_init, :initialize
10
+
11
# Runs the original constructor, then prepares the summary list and the
# logger.
#
# With a log directory configured, a weekly-rotating harvester.log is
# opened there. Without one, a logger with no output device is used at
# FATAL level. @logger was previously left nil in that case, so the very
# next call on it raised NoMethodError.
def initialize(*args)
  orig_init(*args)
  @summary = []

  if @config.logfile
    @logger = Logger.new(File.join(@config.logfile, "harvester.log"), 'weekly')
  else
    # Turn off logging if no logging directory is specified.
    @logger = Logger.new(nil)
    @logger.level = Logger::FATAL
  end
  @logger.datetime_format = "%Y-%m-%d %H:%M"
end
21
+
22
# Logging wrapper around the original start.
#
# Interactive runs are logged and passed straight through. Regular runs
# are logged, harvested, and followed by a summary email; a failure to
# send that email is logged rather than raised.
def start(sites = nil, interactive = false)
  if interactive
    @logger.info { "Starting interactive harvest"}
    return orig_start(sites, true)
  end

  @logger.info { "Starting regular harvest" }
  orig_start(sites)
  begin
    OAI::Harvester::Mailer.send(@config.mail_server, @config.email, @summary)
  rescue
    @logger.error { "Error sending out summary email: #{$!}"}
  end
end
37
+
38
+ private
39
+
40
# Logging wrapper around the original harvest. Records the attempt, then
# turns any failure into log entries and summary lines instead of
# letting it escape:
#
# * OAI error with code "noRecordsMatch" - noted as "no new records"
# * any other OAI error                  - logged as an OAI error
# * anything else                        - logged together with its backtrace
#
# The backtrace is joined with a real newline; the single-quoted '\n'
# used before is a backslash followed by "n", which put the whole trace
# on one line.
def harvest(site)
  begin
    @logger.info { "Harvest of '#{site}' starting" }
    @summary << "Harvest of '#{site}' attempted"
    orig_harvest(site)
  rescue OAI::Exception
    if "noRecordsMatch" == $!.code
      @logger.info "No new records available"
      @summary << "'#{site}' had no new records."
    else
      @logger.error { "Harvesting of '#{site}' failed, message: #{$!}" }
      @summary << "'#{site}' had an OAI Error! #{$!}"
    end
  rescue
    @logger.error { "Harvesting of '#{site}' failed, message: #{$!}" }
    @logger.error { "#{$!.backtrace.join("\n")}" }
    @summary << "'#{site}' had an Error! #{$!}"
  end
end
59
+
60
# Logging wrapper around the original call: notes the request, delegates,
# then records the number of records retrieved in both the log and the
# summary. Returns the same [file, records] pair.
def call(url, options)
  @logger.info { "fetching: #{url} with options #{options.inspect}" }
  file, records = orig_call(url, options)
  @logger.info { "retrieved #{records} records" }
  @summary.push("Retrieved #{records} records.")
  [file, records]
end
67
+ end
68
+
69
+ end
70
+ end
@@ -0,0 +1,17 @@
1
+ module OAI
2
+ module Harvester
3
+
4
# Sends the harvest summary by email.
class Mailer

  # Delivers +message+ (a list of summary lines) through the SMTP server
  # +server+ to every address in +email+, which may be a single address
  # or a list of them.
  #
  # Each recipient is handed to send_message as its own address. Wrapping
  # the addresses in quotes and joining them into one string made the
  # whole list a single, invalid recipient.
  def self.send(server = nil, email = nil, message = nil)
    msg = %{Subject: Harvester Summary\n\n#{Array(message).join("\n")}}
    recipients = Array(email)
    Net::SMTP.start(server) do |smtp|
      smtp.send_message msg, "harvester@#{Socket.gethostname}", recipients
    end
  end

end
15
+
16
+ end
17
+ end