oai_talia 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. data/README +81 -0
  2. data/Rakefile +127 -0
  3. data/bin/oai +68 -0
  4. data/examples/models/file_model.rb +63 -0
  5. data/examples/providers/dublin_core.rb +474 -0
  6. data/lib/oai/client/get_record.rb +15 -0
  7. data/lib/oai/client/header.rb +18 -0
  8. data/lib/oai/client/identify.rb +30 -0
  9. data/lib/oai/client/list_identifiers.rb +12 -0
  10. data/lib/oai/client/list_metadata_formats.rb +12 -0
  11. data/lib/oai/client/list_records.rb +21 -0
  12. data/lib/oai/client/list_sets.rb +19 -0
  13. data/lib/oai/client/metadata_format.rb +12 -0
  14. data/lib/oai/client/record.rb +26 -0
  15. data/lib/oai/client/response.rb +35 -0
  16. data/lib/oai/client.rb +301 -0
  17. data/lib/oai/constants.rb +34 -0
  18. data/lib/oai/exception.rb +75 -0
  19. data/lib/oai/harvester/config.rb +41 -0
  20. data/lib/oai/harvester/harvest.rb +150 -0
  21. data/lib/oai/harvester/logging.rb +70 -0
  22. data/lib/oai/harvester/mailer.rb +17 -0
  23. data/lib/oai/harvester/shell.rb +338 -0
  24. data/lib/oai/harvester.rb +39 -0
  25. data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
  26. data/lib/oai/provider/metadata_format/oai_europeana.rb +38 -0
  27. data/lib/oai/provider/metadata_format.rb +143 -0
  28. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +134 -0
  29. data/lib/oai/provider/model/activerecord_wrapper.rb +139 -0
  30. data/lib/oai/provider/model.rb +74 -0
  31. data/lib/oai/provider/partial_result.rb +18 -0
  32. data/lib/oai/provider/response/error.rb +16 -0
  33. data/lib/oai/provider/response/get_record.rb +26 -0
  34. data/lib/oai/provider/response/identify.rb +25 -0
  35. data/lib/oai/provider/response/list_identifiers.rb +35 -0
  36. data/lib/oai/provider/response/list_metadata_formats.rb +34 -0
  37. data/lib/oai/provider/response/list_records.rb +34 -0
  38. data/lib/oai/provider/response/list_sets.rb +23 -0
  39. data/lib/oai/provider/response/record_response.rb +70 -0
  40. data/lib/oai/provider/response.rb +161 -0
  41. data/lib/oai/provider/resumption_token.rb +106 -0
  42. data/lib/oai/provider.rb +304 -0
  43. data/lib/oai/set.rb +29 -0
  44. data/lib/oai/xpath.rb +75 -0
  45. data/lib/oai.rb +8 -0
  46. data/lib/test.rb +25 -0
  47. data/test/activerecord_provider/config/connection.rb +5 -0
  48. data/test/activerecord_provider/config/database.yml +6 -0
  49. data/test/activerecord_provider/database/ar_migration.rb +59 -0
  50. data/test/activerecord_provider/database/oaipmhtest +0 -0
  51. data/test/activerecord_provider/fixtures/dc.yml +1501 -0
  52. data/test/activerecord_provider/helpers/providers.rb +44 -0
  53. data/test/activerecord_provider/helpers/set_provider.rb +36 -0
  54. data/test/activerecord_provider/models/dc_field.rb +7 -0
  55. data/test/activerecord_provider/models/dc_set.rb +6 -0
  56. data/test/activerecord_provider/models/oai_token.rb +3 -0
  57. data/test/activerecord_provider/tc_ar_provider.rb +113 -0
  58. data/test/activerecord_provider/tc_ar_sets_provider.rb +72 -0
  59. data/test/activerecord_provider/tc_caching_paging_provider.rb +55 -0
  60. data/test/activerecord_provider/tc_simple_paging_provider.rb +57 -0
  61. data/test/activerecord_provider/test_helper.rb +4 -0
  62. data/test/client/helpers/provider.rb +68 -0
  63. data/test/client/helpers/test_wrapper.rb +11 -0
  64. data/test/client/tc_exception.rb +36 -0
  65. data/test/client/tc_get_record.rb +37 -0
  66. data/test/client/tc_identify.rb +13 -0
  67. data/test/client/tc_libxml.rb +61 -0
  68. data/test/client/tc_list_identifiers.rb +52 -0
  69. data/test/client/tc_list_metadata_formats.rb +18 -0
  70. data/test/client/tc_list_records.rb +13 -0
  71. data/test/client/tc_list_sets.rb +19 -0
  72. data/test/client/tc_low_resolution_dates.rb +14 -0
  73. data/test/client/tc_utf8_escaping.rb +11 -0
  74. data/test/client/tc_xpath.rb +26 -0
  75. data/test/client/test_helper.rb +5 -0
  76. data/test/provider/models.rb +234 -0
  77. data/test/provider/tc_exceptions.rb +96 -0
  78. data/test/provider/tc_functional_tokens.rb +43 -0
  79. data/test/provider/tc_provider.rb +71 -0
  80. data/test/provider/tc_resumption_tokens.rb +46 -0
  81. data/test/provider/tc_simple_provider.rb +92 -0
  82. data/test/provider/test_helper.rb +36 -0
  83. data/test/test.xml +22 -0
  84. metadata +181 -0
data/lib/oai/client.rb ADDED
@@ -0,0 +1,301 @@
1
+ # External dependencies
2
+ require 'uri'
3
+ require 'net/http'
4
+ require 'cgi'
5
+ require 'iconv'
6
+
7
+ if not defined?(OAI::Const::VERBS)
8
+ # Shared stuff
9
+ require 'oai/exception'
10
+ require 'oai/constants'
11
+ require 'oai/xpath'
12
+ require 'oai/set'
13
+ end
14
+
15
+ # Localize requires so user can select a subset of functionality
16
+ require 'oai/client/metadata_format'
17
+ require 'oai/client/response'
18
+ require 'oai/client/header'
19
+ require 'oai/client/record'
20
+ require 'oai/client/identify'
21
+ require 'oai/client/get_record'
22
+ require 'oai/client/list_identifiers'
23
+ require 'oai/client/list_metadata_formats'
24
+ require 'oai/client/list_records'
25
+ require 'oai/client/list_sets'
26
+
27
+ module OAI
28
+
29
+ # A OAI::Client provides a client api for issuing OAI-PMH verbs against
30
+ # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
+ # can call on a OAI::Client object. Verb arguments are passed as a hash:
32
+ #
33
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
34
+ # record = client.get_record :identifier => 'oai:pubmedcentral.gov:13901'
35
+ # for identifier in client.list_identifiers
36
+ # puts identifier
37
+ # end
38
+ #
39
+ # It is worth noting that the api uses methods and parameter names with
40
+ # underscores in them rather than studly caps. So above list_identifiers
41
+ # and metadata_prefix are used instead of the listIdentifiers and
42
+ # metadataPrefix used in the OAI-PMH specification.
43
+ #
44
+ # Also, the from and until arguments which specify dates should be passed
45
+ # in as Date or DateTime objects depending on the granularity supported
46
+ # by the server.
47
+ #
48
+ # For detailed information on the arguments that can be used please consult
49
+ # the OAI-PMH docs at:
50
+ #
51
+ # http://www.openarchives.org/OAI/openarchivesprotocol.html
52
+
53
+ class Client
54
+
55
+ # The constructor which must be passed a valid base url for an oai
56
+ # service:
57
+ #
58
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
+ #
60
+ # If you want to see debugging messages on STDERR use:
61
+ #
62
+ # client = OAI::Client.new 'http://example.com', :debug => true
63
+ #
64
+ # By default OAI verbs called on the client will return REXML::Element
65
+ # objects for metadata records, however if you wish you can use the
66
+ # :parser option to indicate you want to use 'libxml' instead, and get
67
+ # back XML::Node objects
68
+ #
69
+ # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
70
+ #
71
+ # === HIGH PERFORMANCE
72
+ #
73
+ # If you want to supercharge this api install libxml-ruby >= 0.3.8 and
74
+ # use the :parser option when you construct your OAI::Client.
75
+ #
76
# Builds a client for the OAI-PMH service at +base_url+.
#
# Options (each read with Hash#fetch):
#   :debug     - when true, #debug writes messages to $stderr (default false)
#   :parser    - 'rexml' (default) or 'libxml'
#   :redirects - whether #get follows permanent redirects (default true)
#
# Raises OAI::Exception when the parser name is unknown or when libxml
# was requested but cannot be loaded.
def initialize(base_url, options = {})
  @base = URI.parse base_url
  @debug = options.fetch(:debug, false)
  @parser = options.fetch(:parser, 'rexml')
  @follow_redirects = options.fetch(:redirects, true)

  # load appropriate parser
  case @parser
  when 'libxml'
    begin
      require 'rubygems'
      require 'xml/libxml'
    rescue LoadError, StandardError
      # A failed require raises LoadError, which is a ScriptError and is
      # therefore not caught by a bare rescue.
      raise OAI::Exception.new("xml/libxml not available")
    end
  when 'rexml'
    require 'rexml/document'
    require 'rexml/xpath'
  else
    raise OAI::Exception.new("unknown parser: #{@parser}")
  end
end
98
+
99
# Sends an Identify request. The parsed response document (a
# REXML::Document, or the libxml equivalent if you created your client
# with the libxml parser) comes back wrapped in an OAI::IdentifyResponse.
def identify
  document = do_request('Identify')
  OAI::IdentifyResponse.new(document)
end
107
+
108
# Sends a ListMetadataFormats request with the given options and hands
# the parsed document to a new OAI::ListMetadataFormatsResponse.
def list_metadata_formats(opts={})
  document = do_request('ListMetadataFormats', opts)
  OAI::ListMetadataFormatsResponse.new(document)
end
114
+
115
# Sends a ListIdentifiers request and returns an
# OAI::ListIdentifiersResponse. Supply :from / :until as Date or DateTime
# objects, whichever matches the granularity the server supports.
def list_identifiers(opts={})
  document = do_request('ListIdentifiers', opts)
  OAI::ListIdentifiersResponse.new(document)
end
122
+
123
# Sends a GetRecord request; an identifier argument must be supplied.
# The result is an OAI::GetRecordResponse from which the OAI::Record
# can be extracted.
def get_record(opts={})
  document = do_request('GetRecord', opts)
  OAI::GetRecordResponse.new(document)
end
130
+
131
# Sends a ListRecords request. The returned OAI::ListRecordsResponse can
# be iterated to visit each record:
#
#   for record in client.list_records
#     puts record.metadata
#   end
def list_records(opts={})
  document = do_request('ListRecords', opts)
  OAI::ListRecordsResponse.new(document)
end
141
+
142
# Sends a ListSets request. The returned OAI::ListSetsResponse can be
# iterated to visit each OAI::Set:
#
#   for set in client.list_sets
#     puts set
#   end
def list_sets(opts={})
  document = do_request('ListSets', opts)
  OAI::ListSetsResponse.new(document)
end
153
+
154
+ private
155
+
156
# Runs one OAI-PMH request for +verb+ and returns the parsed document:
# build the URI, fetch it, scrub the body, then hand it to the parser.
def do_request(verb, opts = nil)
  target = build_uri(verb, opts)
  body = get(target)
  xml = strip_invalid_utf_8_chars(body)

  # libxml isn't able to use our xpaths against the default oai-pmh
  # namespace, so the declaration is removed before parsing.
  # If you know a way around this please let me know.
  if 'libxml' == @parser
    xml = xml.gsub(/xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
  end

  load_document(xml)
end
169
+
170
# Returns a copy of the base URI whose query string carries +verb+ and
# the validated options for that verb.
#
# The query is assembled as a plain string first and assigned once,
# rather than appending to the string returned by URI#query.
def build_uri(verb, opts)
  opts = validate_options(verb, opts)
  pairs = ["verb=#{verb}"]
  opts.each_pair { |k, v| pairs << "#{externalize(k)}=#{encode(v)}" }

  uri = @base.clone
  uri.query = pairs.join('&')
  uri
end
177
+
178
# Turns an option value into the text placed in the query string.
#
# * Values that do not respond to #strftime are converted with #to_s and
#   CGI-escaped.
# * DateTime and Time values become UTC timestamps (YYYY-MM-DDThh:mm:ssZ).
# * Anything else date-like becomes YYYY-MM-DD.
#
# The argument is never modified: Time values are converted with #getutc
# (Time#utc would switch the caller's object to UTC in place), and a
# DateTime keeps its own offset instead of being re-parsed as local time.
def encode(value)
  return CGI.escape(value.to_s) unless value.respond_to?(:strftime)

  if defined?(DateTime) && value.kind_of?(DateTime)
    value.new_offset(0).strftime('%Y-%m-%dT%H:%M:%SZ')
  elsif value.kind_of?(Time)
    value.getutc.strftime('%Y-%m-%dT%H:%M:%SZ')
  else # Assume something date like
    value.strftime('%Y-%m-%d')
  end
end
188
+
189
# Parses +xml+ with the parser chosen at construction time and returns
# the resulting document object.
#
# Raises OAI::Exception when the parser reports that the response is not
# well formed. Returns nil if @parser is neither 'libxml' nor 'rexml'.
def load_document(xml)
  case @parser
  when 'libxml'
    begin
      parser = XML::Parser.new()
      parser.string = xml
      return parser.parse
    rescue XML::Parser::ParseError => e
      # e.message, not e: String#+ cannot take an exception object.
      raise OAI::Exception, 'response not well formed XML: ' + e.message, caller
    end
  when 'rexml'
    begin
      return REXML::Document.new(xml)
    rescue REXML::ParseException => e
      raise OAI::Exception, 'response not well formed XML: ' + e.message, caller
    end
  end
end
207
+
208
# Do the actual HTTP get and return the response body.
#
# Temporary redirects (302/307) are always followed; permanent redirects
# (301) are followed only when the client was built with redirects
# enabled. Location headers are resolved against the current URI so
# relative redirects work, and +limit+ caps the number of hops so a
# redirect loop cannot recurse forever.
#
# Raises ArgumentError for a refused permanent redirect, for too many
# redirects, and for any other non-success response.
def get(uri, limit = 10)
  raise ArgumentError, "Too many redirects fetching [#{uri}]" if limit <= 0

  response = Net::HTTP.get_response(uri)
  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPMovedPermanently
    unless @follow_redirects
      raise ArgumentError, "Permanently Redirected to [#{response['location']}]"
    end
    get(uri.merge(response['location']), limit - 1)
  when Net::HTTPTemporaryRedirect, Net::HTTPFound
    get(uri.merge(response['location']), limit - 1)
  else
    raise ArgumentError, "#{response.code_type} [#{response.code}]"
  end
end
226
+
227
# Prints +msg+ followed by a newline on $stderr, but only when the
# client was created with :debug => true.
def debug(msg)
  return unless @debug
  $stderr.print("#{msg}\n")
end
230
+
231
# Massage the standard OAI options to make them a bit more palatable.
#
# Checks +verb+ against OAI::Const::VERBS, converts every option key to
# an underscored symbol (so 'metadataPrefix' becomes :metadata_prefix),
# supplies 'oai_dc' as the default metadata prefix for verbs that accept
# one, and rejects options the verb does not allow. A lone resumption
# token is returned as-is. Date strings are passed through unchanged.
#
# Returns a new hash; the hash passed in by the caller is left untouched.
#
# Raises OAI::VerbException for an unknown verb and
# OAI::ArgumentException for non-hash-like or disallowed options.
def validate_options(verb, opts = {})
  raise OAI::VerbException.new unless OAI::Const::VERBS.key?(verb)

  return {} if opts.nil?

  raise OAI::ArgumentException.new unless opts.respond_to?(:keys)

  # Internalize the hash (read each value; never delete from the input)
  realopts = {}
  opts.keys.each do |key|
    realopts[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = opts[key]
  end

  return realopts if is_resumption?(realopts)

  allowed = OAI::Const::VERBS[verb]

  # add in a default metadataPrefix if none exists
  realopts[:metadata_prefix] ||= 'oai_dc' if allowed.include?(:metadata_prefix)

  # check for any bad options
  raise OAI::ArgumentException.new unless (realopts.keys - allowed).empty?

  realopts
end
262
+
263
# Returns true when :resumption_token is the only option given and nil
# when no resumption token is present. A resumption token combined with
# any other option raises OAI::ArgumentException.
def is_resumption?(opts)
  option_names = opts.keys
  return nil unless option_names.include?(:resumption_token)
  raise OAI::ArgumentException.new unless 1 == option_names.size
  true
end
269
+
270
# Maps an internal underscored option name back to the OAI-PMH spelling:
# each underscore followed by a lowercase letter is replaced by that
# letter in upper case (:metadata_prefix => "metadataPrefix").
def externalize(value)
  value.to_s.gsub(/_[a-z]/) { |match| match[1, 1].upcase }
end
274
+
275
# Converts a date string into a UTC Time. Values that already respond to
# #strftime are returned unchanged.
#
# Raises OAI::ArgumentException when the value cannot be parsed.
def parse_date(value)
  return value if value.respond_to?(:strftime)

  Date.parse(value) # This will raise an exception for badly formatted dates
  Time.parse(value).utc # Sadly, this will not
rescue
  # OAI defines ArgumentException; there is no OAI::ArgumentError.
  raise OAI::ArgumentException.new
end
283
+
284
# Strip out invalid UTF-8 characters. Regex from the W3C, inverted.
# http://www.w3.org/International/questions/qa-forms-utf-8.en.php
#
# Regex is from WebCollab:
# http://webcollab.sourceforge.net/unicode.html
#
# Every offending byte sequence is replaced by '?'. Both patterns are
# byte-oriented, so they carry the /n flag and are applied to a
# binary-encoded copy of the input; the original encoding is restored on
# the result. Both use /x because they are laid out over several lines.
#
# Replaced: control characters other than tab, LF and CR; stray
# continuation bytes; lead bytes C0, C1 and F0-FF (so 4-byte sequences
# are replaced too); truncated or over-long 2- and 3-byte sequences;
# and, in the second pass, E0 80-9F xx and ED A0-BF xx.
def strip_invalid_utf_8_chars(xml)
  encoding = xml.respond_to?(:encoding) ? xml.encoding : nil
  bytes = encoding ? xml.dup.force_encoding('BINARY') : xml

  simple_bytes = bytes.gsub(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]
                         | [\x00-\x7F][\x80-\xBF]+
                         | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
                         | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
                         | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
                         | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/nx, '?')
  cleaned = simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
                            | \xED[\xA0-\xBF][\x80-\xBF]/nx, '?')

  encoding ? cleaned.force_encoding(encoding) : cleaned
end
299
+
300
+ end
301
+ end
@@ -0,0 +1,34 @@
1
module OAI

  module Const
    # OAI defines six verbs with various allowable options. Each verb maps
    # to the list of option names (as underscored symbols) it accepts.
    VERBS = {
      'Identify' => [],
      'ListMetadataFormats' => [:identifier],
      'ListSets' => [:resumption_token], # unused currently
      'GetRecord' => [:identifier, :from, :until, :set, :metadata_prefix],
      'ListIdentifiers' => [:from, :until, :set, :metadata_prefix, :resumption_token],
      'ListRecords' => [:from, :until, :set, :metadata_prefix, :resumption_token]
    }.freeze

    # Frozen like VERBS so the shared list cannot be altered at runtime.
    RESERVED_WORDS = %w{type id}.freeze

    # Two granularities are supported in OAI-PMH, daily or seconds.
    module Granularity
      LOW = 'YYYY-MM-DD'
      HIGH = 'YYYY-MM-DDThh:mm:ssZ'
    end

    # Repositories can support three different schemes for dealing with deletions.
    # * NO - No deletions allowed
    # * TRANSIENT - Deletions are supported but may not be permanently maintained.
    # * PERSISTENT - Deletions are supported and are permanently maintained.
    module Delete
      NO = :no
      TRANSIENT = :transient
      PERSISTENT = :persistent
    end

  end

end
@@ -0,0 +1,75 @@
1
module OAI

  # Standard error responses for problems serving OAI content. These
  # messages will be wrapped in an XML response to the client.
  #
  # Every error carries a human readable message and, through #code, the
  # OAI-PMH error code that goes with it.
  class Exception < RuntimeError
    attr_reader :code

    def initialize(message, code = nil)
      super(message)
      @code = code
    end
  end

  # badArgument
  class ArgumentException < Exception
    def initialize()
      text = 'The request includes illegal arguments, is missing required ' \
             'arguments, includes a repeated argument, or values for ' \
             'arguments have an illegal syntax.'
      super(text, 'badArgument')
    end
  end

  # badVerb
  class VerbException < Exception
    def initialize()
      text = 'Value of the verb argument is not a legal OAI-PMH verb, ' \
             'the verb argument is missing, or the verb argument is repeated.'
      super(text, 'badVerb')
    end
  end

  # cannotDisseminateFormat
  class FormatException < Exception
    def initialize()
      text = 'The metadata format identified by the value given for the ' \
             'metadataPrefix argument is not supported by the item or by ' \
             'the repository.'
      super(text, 'cannotDisseminateFormat')
    end
  end

  # idDoesNotExist
  class IdException < Exception
    def initialize()
      text = 'The value of the identifier argument is unknown or illegal ' \
             'in this repository.'
      super(text, 'idDoesNotExist')
    end
  end

  # noRecordsMatch
  class NoMatchException < Exception
    def initialize()
      text = 'The combination of the values of the from, until, set and ' \
             'metadataPrefix arguments results in an empty list.'
      super(text, 'noRecordsMatch')
    end
  end

  # noMetadataFormats
  class MetadataFormatException < Exception
    def initialize()
      text = 'There are no metadata formats available for the specified item.'
      super(text, 'noMetadataFormats')
    end
  end

  # noSetHierarchy
  class SetException < Exception
    def initialize()
      text = 'This repository does not support sets.'
      super(text, 'noSetHierarchy')
    end
  end

  # badResumptionToken
  class ResumptionTokenException < Exception
    def initialize()
      text = 'The value of the resumptionToken argument is invalid or expired.'
      super(text, 'badResumptionToken')
    end
  end

end
@@ -0,0 +1,41 @@
1
+ #
2
+ # Created by William Groppe on 2006-11-05.
3
+ # Copyright (c) 2006. All rights reserved.
4
+
5
module OAI
  module Harvester

    LOW_RESOLUTION = "YYYY-MM-DD"

    # Harvester settings held in an OpenStruct and persisted as YAML.
    class Config < OpenStruct

      PERIODS = %w(daily weekly monthly)
      GLOBAL = "/etc/oai/harvester.yml"

      # Returns a Config populated from the YAML file at the location
      # chosen by find_config, or an empty Config when no location could
      # be determined or the file does not exist yet.
      def self.load
        config = find_config
        # File.exist? (not the removed File.exists?), and tolerate a nil path.
        config && File.exist?(config) ? new(YAML.load_file(config)) : new
      end

      # Writes the current settings as YAML to the location chosen by
      # find_config.
      def save
        config = Config.find_config
        # File.open treats the argument purely as a path, unlike Kernel#open.
        File.open(config, 'w') do |out|
          YAML.dump(@table, out)
        end
      end

      private
      # Shamelessly lifted from Camping
      #
      # Picks the configuration file path: the writable global file if it
      # exists, otherwise ~/.oai/harvester.yml (creating ~/.oai), otherwise
      # a path under %APPDATA%. Returns nil when neither HOME nor APPDATA
      # is set.
      #
      # NOTE: the `private` keyword above does not apply to methods defined
      # with `def self.`, so this remains callable as Config.find_config
      # (which #save relies on).
      def self.find_config
        if home = ENV['HOME'] # POSIX
          return GLOBAL if File.exist?(GLOBAL) && File.writable?(GLOBAL)
          FileUtils.mkdir_p File.join(home, '.oai')
          File.join(home, '.oai/harvester.yml')
        elsif home = ENV['APPDATA'] # MSWIN
          File.join(home, 'oai/harvester.yml')
        end
      end

    end
  end
end
@@ -0,0 +1,150 @@
1
+ #
2
+ # Created by William Groppe on 2006-11-03.
3
+
4
+ module OAI
5
+ module Harvester
6
+
7
+ class Harvest
8
+
9
# Prepares a harvest run.
#
# config    - settings object; loaded with Config.load when omitted
# directory - storage directory; defaults to the config's storage value
# date      - optional start date, stored frozen in @from
#
# The parser is 'libxml' when XML::Document is already defined and
# 'rexml' otherwise.
def initialize(config = nil, directory = nil, date = nil)
  @config = config || Config.load
  @directory = directory || @config.storage
  @from = date.freeze
  @parser = if defined?(XML::Document)
    'libxml'
  else
    'rexml'
  end
end
16
+
17
# Harvests every site in +sites+, or every site named in the
# configuration when none are given (an empty collection if the
# configured sites cannot be read). The configuration is saved afterwards
# whether or not a harvest raised.
def start(sites = nil, interactive = false)
  @interactive = interactive

  unless sites
    sites = begin
      @config.sites.keys
    rescue
      {}
    end
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end
28
+
29
+ private
30
+
31
# Harvests a single configured site.
#
# Builds the request options from the site's settings, formats the
# from/until dates to match the repository's granularity, downloads the
# records through #call, moves the resulting file into a date based
# directory under @directory, and records the harvest time as the site's
# 'last' value.
#
# Error handling: only an error that exposes #code, has the code
# "noRecordsMatch", and occurs during an interactive run is reported with
# a message. Everything else is re-raised for the caller.
def harvest(site)
  opts = build_options_hash(@config.sites[site])
  harvest_time = Time.now.utc

  if "YYYY-MM-DD" == granularity(opts[:url])
    opts[:until] = harvest_time.strftime("%Y-%m-%d")
    opts[:from] = @from.strftime("%Y-%m-%d") if @from
  else
    opts[:until] = harvest_time.xmlschema
    opts[:from] = @from.xmlschema if @from
  end

  # No from date passed in or remembered: start at the repository's
  # earliest datestamp.
  opts[:from] = earliest(opts[:url]) unless opts[:from]
  opts.delete(:set) if 'all' == opts[:set]

  begin
    # Connect, and download
    file, records = call(opts.delete(:url), opts)

    # Move document to storage directory
    dir = File.join(@directory, date_based_directory(harvest_time))
    FileUtils.mkdir_p dir
    FileUtils.mv(file.path,
      File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]),
      harvest_time)}.xml.gz"))
    @config.sites[site]['last'] = harvest_time
  rescue => e
    raise unless e.respond_to?(:code)
    # `not a || b` parses as `not (a || b)`; spell the condition out so
    # any error other than an interactive noRecordsMatch is re-raised.
    raise if !@interactive || "noRecordsMatch" != e.code
    puts "No new records available"
  end
end
64
+
65
# Downloads every record matching +opts+ from the repository at +url+
# into a gzipped temporary file, following resumption tokens until the
# repository stops returning one.
#
# Returns [tempfile, record_count]. The tempfile is closed on return.
# The file content is an XML declaration followed by the harvested
# records wrapped in a single <records> element.
def call(url, opts)
  # Preserve original options
  options = opts.dup

  records = 0
  client = OAI::Client.new(url, :parser => @parser)
  # The response is not used; the request is retained from the original
  # implementation.
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  # "<?xml" must be contiguous for this to be an XML declaration.
  gz << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
  gz << "<records>"
  begin
    response = client.list_records(options)
    loop do
      get_records(response.doc).each do |rec|
        gz << rec
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      # Get a full response by iterating with the resumption tokens.
      # Not very Ruby like. Should fix OAI::Client to handle resumption
      # tokens internally.
      token = response.resumption_token
      break if !token || token.empty?

      puts "\nresumption token recieved, continuing" if @interactive
      response = client.list_records(:resumption_token => token)
    end

    gz << "</records>"

  ensure
    gz.close
    file.close
  end

  [file, records]
end
108
+
109
# Returns, as an Array, whatever +doc+ finds at the record path of a
# ListRecords response.
def get_records(doc)
  matches = doc.find("/OAI-PMH/ListRecords/record")
  matches.to_a
end
112
+
113
# Translates one site's stored settings ('url', 'set', 'last', 'prefix')
# into request options. :url is always present; :set, :from (the last
# harvest time as a UTC xmlschema string) and :metadata_prefix are added
# only when the corresponding setting is present.
def build_options_hash(site)
  set_spec = site['set']
  last_harvest = site['last']
  prefix = site['prefix']

  options = { :url => site['url'] }
  options[:set] = set_spec if set_spec
  options[:from] = last_harvest.utc.xmlschema if last_harvest
  options[:metadata_prefix] = prefix if prefix
  options
end
120
+
121
# Formats +time+ with the DIRECTORY_LAYOUT strftime pattern to produce
# the relative directory used for that harvest.
def date_based_directory(time)
  layout = DIRECTORY_LAYOUT
  time.strftime(layout).to_s
end
124
+
125
# Builds the base name of a harvest file:
# "<from date>_til_<until date>_at_<until time>", with dates written as
# YYYY-MM-DD and the time as HH-MM-SS.
def filename(from_time, until_time)
  day = "%Y-%m-%d"
  [
    from_time.strftime(day),
    "_til_",
    until_time.strftime(day),
    "_at_",
    until_time.strftime('%H-%M-%S')
  ].join
end
130
+
131
# Asks the repository at +url+ to identify itself and returns the
# granularity reported in that response.
def granularity(url)
  OAI::Client.new(url).identify.granularity
end
135
+
136
# Get earliest timestamp from repository, formatted to match the
# repository's granularity: YYYY-MM-DD for day granularity, an xmlschema
# timestamp otherwise.
def earliest(url)
  identify = OAI::Client.new(url).identify
  day_granularity = ("YYYY-MM-DD" == identify.granularity)
  stamp = Time.parse(identify.earliest_datestamp)
  day_granularity ? stamp.strftime("%Y-%m-%d") : stamp.xmlschema
end
146
+
147
+ end
148
+
149
+ end
150
+ end
@@ -0,0 +1,70 @@
1
+ # Reopen Harvest and add logging
2
+ module OAI
3
+ module Harvester
4
+
5
+ class Harvest
6
+ alias_method :orig_start, :start
7
+ alias_method :orig_harvest, :harvest
8
+ alias_method :orig_call, :call
9
+ alias_method :orig_init, :initialize
10
+
11
# Runs the original constructor, then sets up the summary list and the
# logger.
#
# With a logfile directory configured, log entries go to
# <logfile>/harvester.log with weekly rotation. Without one, a logger
# with no output device is created and its level raised to FATAL, so the
# rest of the class can always call @logger.
def initialize(*args)
  orig_init(*args)
  @summary = []

  log_dir = @config.logfile
  log_target = log_dir ? File.join(log_dir, "harvester.log") : nil
  @logger = Logger.new(log_target, 'weekly')
  @logger.datetime_format = "%Y-%m-%d %H:%M"

  # Turn off logging if no logging directory is specified.
  @logger.level = Logger::FATAL unless log_dir
end
21
+
22
# Logging wrapper around the original #start.
#
# Interactive runs are logged and delegated. Regular runs are logged,
# delegated, and followed by an attempt to mail the collected summary;
# a failure while mailing is logged rather than raised.
def start(sites = nil, interactive = false)
  if interactive
    @logger.info { "Starting interactive harvest"}
    return orig_start(sites, true)
  end

  @logger.info { "Starting regular harvest" }
  orig_start(sites)
  begin
    OAI::Harvester::Mailer.send(@config.mail_server, @config.email, @summary)
  rescue
    @logger.error { "Error sending out summary email: #{$!}"}
  end
end
37
+
38
+ private
39
+
40
# Logging wrapper around the original #harvest.
#
# Records the attempt in the log and the summary, then delegates. No
# error escapes: an OAI::Exception with code "noRecordsMatch" is noted as
# "no new records", other OAI errors and all remaining errors are logged
# and added to the summary. For non-OAI errors the backtrace is logged
# too, one frame per line.
def harvest(site)
  @logger.info { "Harvest of '#{site}' starting" }
  @summary << "Harvest of '#{site}' attempted"
  orig_harvest(site)
rescue OAI::Exception => e
  if "noRecordsMatch" == e.code
    @logger.info "No new records available"
    @summary << "'#{site}' had no new records."
  else
    @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
    @summary << "'#{site}' had an OAI Error! #{e}"
  end
rescue => e
  @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
  # Double quotes: '\n' would join the frames with a literal backslash-n.
  @logger.error { "#{e.backtrace.join("\n")}" }
  @summary << "'#{site}' had an Error! #{e}"
end
59
+
60
# Logging wrapper around the original #call: logs the request, delegates,
# then logs and summarises the number of records retrieved. Returns the
# same [file, records] pair as the original.
def call(url, options)
  @logger.info { "fetching: #{url} with options #{options.inspect}" }

  fetched = orig_call(url, options)
  file, records = fetched

  @logger.info { "retrieved #{records} records" }
  @summary << "Retrieved #{records} records."
  [file, records]
end
67
+ end
68
+
69
+ end
70
+ end
@@ -0,0 +1,17 @@
1
module OAI
  module Harvester

    # Sends the harvest summary by e-mail over SMTP.
    class Mailer

      # Mails +message+ (one line, or a list of lines) to every address in
      # +email+ (one address, or a list) through the SMTP server +server+.
      # The sender is "harvester@<local hostname>".
      #
      # Each recipient is handed to Net::SMTP#send_message as its own
      # address; joining them into one quoted string would make the whole
      # list a single, invalid recipient.
      def self.send(server = nil, email = nil, message = nil)
        msg = %{Subject: Harvester Summary\n\n#{Array(message).join("\n")}}
        recipients = Array(email)
        Net::SMTP.start(server) do |smtp|
          smtp.send_message msg, "harvester@#{Socket.gethostname}", *recipients
        end
      end

    end

  end
end