oai 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. data/README.md +28 -23
  2. data/Rakefile +14 -40
  3. data/examples/providers/dublin_core.rb +63 -63
  4. data/lib/oai/client.rb +131 -97
  5. data/lib/oai/client/list_identifiers.rb +1 -0
  6. data/lib/oai/client/list_records.rb +6 -5
  7. data/lib/oai/client/list_sets.rb +6 -5
  8. data/lib/oai/client/record.rb +6 -7
  9. data/lib/oai/client/response.rb +7 -4
  10. data/lib/oai/client/resumable.rb +42 -0
  11. data/lib/oai/harvester/shell.rb +40 -41
  12. data/lib/oai/provider.rb +85 -67
  13. data/lib/oai/provider/metadata_format/oai_dc.rb +5 -6
  14. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +23 -25
  15. data/lib/oai/provider/model/activerecord_wrapper.rb +99 -51
  16. data/lib/oai/provider/response.rb +33 -31
  17. data/lib/oai/provider/response/get_record.rb +7 -7
  18. data/lib/oai/provider/response/list_records.rb +5 -4
  19. data/lib/oai/provider/response/record_response.rb +14 -14
  20. data/test/activerecord_provider/config/connection.rb +8 -4
  21. data/test/activerecord_provider/database/{ar_migration.rb → 0001_oaipmh_tables.rb} +17 -12
  22. data/test/activerecord_provider/helpers/providers.rb +2 -3
  23. data/test/activerecord_provider/helpers/set_provider.rb +10 -22
  24. data/test/activerecord_provider/helpers/transactional_test_case.rb +34 -0
  25. data/test/activerecord_provider/models/dc_field.rb +4 -4
  26. data/test/activerecord_provider/models/dc_set.rb +3 -2
  27. data/test/activerecord_provider/models/exclusive_set_dc_field.rb +11 -0
  28. data/test/activerecord_provider/tc_ar_provider.rb +67 -28
  29. data/test/activerecord_provider/tc_ar_sets_provider.rb +104 -18
  30. data/test/activerecord_provider/tc_caching_paging_provider.rb +6 -10
  31. data/test/activerecord_provider/tc_simple_paging_provider.rb +7 -11
  32. data/test/activerecord_provider/test_helper.rb +10 -0
  33. data/test/client/helpers/provider.rb +44 -47
  34. data/test/client/helpers/test_wrapper.rb +4 -16
  35. data/test/client/tc_http_client.rb +90 -2
  36. data/test/client/tc_list_identifiers.rb +22 -3
  37. data/test/client/tc_list_records.rb +17 -4
  38. data/test/client/tc_list_sets.rb +17 -2
  39. data/test/provider/models.rb +32 -30
  40. data/test/provider/tc_exceptions.rb +30 -20
  41. data/test/provider/tc_functional_tokens.rb +11 -6
  42. data/test/provider/tc_provider.rb +58 -24
  43. data/test/provider/tc_resumption_tokens.rb +6 -6
  44. data/test/provider/tc_simple_provider.rb +51 -26
  45. data/test/provider/test_helper.rb +7 -0
  46. metadata +67 -128
  47. data/test/activerecord_provider/config/database.yml +0 -6
  48. data/test/activerecord_provider/database/oaipmhtest +0 -0
@@ -2,7 +2,6 @@
2
2
  require 'uri'
3
3
  require 'faraday'
4
4
  require 'cgi'
5
- require 'iconv'
6
5
 
7
6
  if not defined?(OAI::Const::VERBS)
8
7
  # Shared stuff
@@ -16,9 +15,10 @@ end
16
15
  require 'oai/client/metadata_format'
17
16
  require 'oai/client/response'
18
17
  require 'oai/client/header'
19
- require 'oai/client/record'
18
+ require 'oai/client/record'
20
19
  require 'oai/client/identify'
21
20
  require 'oai/client/get_record'
21
+ require 'oai/client/resumable'
22
22
  require 'oai/client/list_identifiers'
23
23
  require 'oai/client/list_metadata_formats'
24
24
  require 'oai/client/list_records'
@@ -26,75 +26,79 @@ require 'oai/client/list_sets'
26
26
 
27
27
  module OAI
28
28
 
29
- # A OAI::Client provides a client api for issuing OAI-PMH verbs against
30
- # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
- # can call on a OAI::Client object. Verb arguments are passed as a hash:
29
+ # A `OAI::Client` provides a client api for issuing OAI-PMH verbs against
30
+ # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
+ # can call on a `OAI::Client` object. Verb arguments are passed as a hash:
32
32
  #
33
+ # ```ruby
33
34
  # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
34
35
  # record = client.get_record :identifier => 'oai:pubmedcentral.gov:13901'
35
36
  # for identifier in client.list_identifiers
36
37
  # puts identifier
37
38
  # end
39
+ # ```
38
40
  #
39
- # It is worth noting that the api uses methods and parameter names with
40
- # underscores in them rather than studly caps. So above list_identifiers
41
- # and metadata_prefix are used instead of the listIdentifiers and
42
- # metadataPrefix used in the OAI-PMH specification.
41
+ # It is worth noting that the API uses methods and parameter names with
42
+ # underscores in them rather than studly caps. So above `list_identifiers`
43
+ # and `metadata_prefix` are used instead of the `listIdentifiers` and
44
+ # `metadataPrefix` used in the OAI-PMH specification.
43
45
  #
44
46
  # Also, the from and until arguments which specify dates should be passed
45
- # in as Date or DateTime objects depending on the granularity supported
47
+ # in as `Date` or `DateTime` objects depending on the granularity supported
46
48
  # by the server.
47
49
  #
48
50
  # For detailed information on the arguments that can be used please consult
49
- # the OAI-PMH docs at:
50
- #
51
- # http://www.openarchives.org/OAI/openarchivesprotocol.html
52
-
51
+ # the OAI-PMH docs at
52
+ # <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
53
+
53
54
  class Client
54
55
 
55
- # The constructor which must be passed a valid base url for an oai
56
+ # The constructor which must be passed a valid base url for an oai
56
57
  # service:
57
58
  #
58
- # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
60
  #
60
- # If you want to see debugging messages on STDERR use:
61
+ # If you want to see debugging messages on `STDERR` use:
61
62
  #
62
- # client = OAI::Client.new 'http://example.com', :debug => true
63
+ # client = OAI::Client.new 'http://example.com', :debug => true
63
64
  #
64
- # By default OAI verbs called on the client will return REXML::Element
65
+ # By default OAI verbs called on the client will return `REXML::Element`
65
66
  # objects for metadata records, however if you wish you can use the
66
- # :parser option to indicate you want to use 'libxml' instead, and get
67
- # back XML::Node objects
67
+ # `:parser` option to indicate you want to use `libxml` instead, and get
68
+ # back `XML::Node` objects
68
69
  #
69
- # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
70
- #
71
- # You can configure the Faraday HTTP client by providing an alternate
70
+ # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
71
+ #
72
+ # You can configure the Faraday HTTP client by providing an alternate
72
73
  # Faraday instance:
73
74
  #
74
- # client = OAI::Client.new 'http://example.com', :http => Faraday.new { |c| }
75
+ # ```ruby
76
+ # client = OAI::Client.new 'http://example.com', :http => Faraday.new {|c|}
77
+ # ```
75
78
  #
76
- # === HIGH PERFORMANCE
79
+ # ### HIGH PERFORMANCE
77
80
  #
78
- # If you want to supercharge this api install libxml-ruby >= 0.3.8 and
79
- # use the :parser option when you construct your OAI::Client.
81
+ # If you want to supercharge this api install `libxml-ruby >= 0.3.8` and
82
+ # use the `:parser` option when you construct your `OAI::Client`.
80
83
  #
81
84
  def initialize(base_url, options={})
82
85
  @base = URI.parse base_url
83
86
  @debug = options.fetch(:debug, false)
84
87
  @parser = options.fetch(:parser, 'rexml')
85
88
 
86
- @follow_redirects = options.fetch(:redirects, true)
87
- @http_client = options.fetch(:http, Faraday.new(@base))
88
-
89
- if !options.key?(:http) and @follow_redirects
89
+ @http_client = options.fetch(:http) do
90
+ Faraday.new(:url => @base) do |builder|
91
+ follow_redirects = options.fetch(:redirects, true)
92
+ if follow_redirects
93
+ count = follow_redirects.is_a?(Fixnum) ? follow_redirects : 5
90
94
 
91
- count = @folow_redirects if @folow_redirects.is_a? Fixnum
92
- count ||= 5
93
-
94
- require 'faraday_middleware'
95
- @http_client.use FaradayMiddleware::FollowRedirects, :limit => count
95
+ require 'faraday_middleware'
96
+ builder.response :follow_redirects, :limit => count
97
+ end
98
+ builder.adapter :net_http
99
+ end
96
100
  end
97
-
101
+
98
102
  # load appropriate parser
99
103
  case @parser
100
104
  when 'libxml'
@@ -112,77 +116,107 @@ module OAI
112
116
  end
113
117
  end
114
118
 
115
- # Equivalent to a Identify request. You'll get back a OAI::IdentifyResponse
116
- # object which is essentially just a wrapper around a REXML::Document
117
- # for the response. If you created your client using the libxml
118
- # parser then you will get an XML::Node object instead.
119
-
119
+ # Equivalent to an `Identify` request.
120
+ # You'll get back a `OAI::IdentifyResponse`
121
+ # object which is essentially just a wrapper around a `REXML::Document`
122
+ # for the response. If you created your client using the `libxml`
123
+ # parser then you will get an `XML::Node` object instead.
120
124
  def identify
121
- return OAI::IdentifyResponse.new(do_request('Identify'))
125
+ OAI::IdentifyResponse.new(do_request('Identify'))
122
126
  end
123
127
 
124
- # Equivalent to a ListMetadataFormats request. A ListMetadataFormatsResponse
125
- # object is returned to you.
126
-
128
+ # Equivalent to a `ListMetadataFormats` request.
129
+ # A `ListMetadataFormatsResponse` object is returned to you.
130
+
127
131
  def list_metadata_formats(opts={})
128
- return OAI::ListMetadataFormatsResponse.new(do_request('ListMetadataFormats', opts))
132
+ OAI::ListMetadataFormatsResponse.new(do_request('ListMetadataFormats', opts))
129
133
  end
130
134
 
131
- # Equivalent to a ListIdentifiers request. Pass in :from, :until arguments
132
- # as Date or DateTime objects as appropriate depending on the granularity
133
- # supported by the server.
134
-
135
+ # Equivalent to a `ListIdentifiers` request. Pass in `:from`,
136
+ # `:until` arguments as `Date` or `DateTime` objects as appropriate
137
+ # depending on the granularity supported by the server.
138
+ #
139
+ # You can use seamless resumption with this verb, which allows you to
140
+ # mitigate (to some extent) the lack of a `Count` verb:
141
+ #
142
+ # client.list_identifiers.full.count # Don't try this on PubMed though!
143
+ #
135
144
  def list_identifiers(opts={})
136
- return OAI::ListIdentifiersResponse.new(do_request('ListIdentifiers', opts))
145
+ do_resumable(OAI::ListIdentifiersResponse, 'ListIdentifiers', opts)
137
146
  end
138
147
 
139
- # Equivalent to a GetRecord request. You must supply an identifier
140
- # argument. You should get back a OAI::GetRecordResponse object
141
- # which you can extract a OAI::Record object from.
142
-
148
+ # Equivalent to a `GetRecord` request. You must supply an `:identifier`
149
+ # argument. You should get back a `OAI::GetRecordResponse` object
150
+ # which you can extract a `OAI::Record` object from.
143
151
  def get_record(opts={})
144
- return OAI::GetRecordResponse.new(do_request('GetRecord', opts))
152
+ OAI::GetRecordResponse.new(do_request('GetRecord', opts))
145
153
  end
146
154
 
147
- # Equivalent to the ListRecords request. A ListRecordsResponse
155
+ # Equivalent to the `ListRecords` request. A `ListRecordsResponse`
148
156
  # will be returned which you can use to iterate through records
149
157
  #
150
- # for record in client.list_records
151
- # puts record.metadata
152
- # end
153
-
158
+ # response = client.list_records
159
+ # response.each do |record|
160
+ # puts record.metadata
161
+ # end
162
+ #
163
+ # Alternately, you can use seamless resumption to avoid handling
164
+ # resumption tokens:
165
+ #
166
+ # client.list_records.full.each do |record|
167
+ # puts record.metadata
168
+ # end
169
+ #
170
+ # ### Memory Use
171
+ # `:full` will avoid storing more than one page of records in
172
+ # memory, but you can use it in ways that override that behaviour. Be careful
173
+ # to avoid using `client.list_records.full.entries` unless you really want
174
+ # to hold all the records in the feed in memory!
154
175
  def list_records(opts={})
155
- return OAI::ListRecordsResponse.new(do_request('ListRecords', opts))
176
+ do_resumable(OAI::ListRecordsResponse, 'ListRecords', opts)
156
177
  end
157
178
 
158
- # Equivalent to the ListSets request. A ListSetsResponse object
159
- # will be returned which you can use for iterating through the
160
- # OAI::Set objects
179
+ # Equivalent to the `ListSets` request. A `ListSetsResponse` object
180
+ # will be returned which you can use for iterating through the
181
+ # `OAI::Set` objects
182
+ #
183
+ # for set in client.list_sets
184
+ # puts set
185
+ # end
186
+ #
187
+ # A large number of sets is not unusual for some OAI-PMH feeds, so
188
+ # using seamless resumption may be preferable:
161
189
  #
162
- # for set in client.list_sets
163
- # puts set
164
- # end
165
-
190
+ # client.list_sets.full.each do |set|
191
+ # puts set
192
+ # end
166
193
  def list_sets(opts={})
167
- return OAI::ListSetsResponse.new(do_request('ListSets', opts))
194
+ do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
168
195
  end
169
196
 
170
- private
197
+ private
171
198
 
172
199
  def do_request(verb, opts = nil)
173
200
  # fire off the request and return appropriate DOM object
174
201
  uri = build_uri(verb, opts)
175
202
  xml = strip_invalid_utf_8_chars(get(uri))
176
- if @parser == 'libxml'
203
+ if @parser == 'libxml'
177
204
  # remove default namespace for oai-pmh since libxml
178
- # isn't able to use our xpaths to get at them
205
+ # isn't able to use our xpaths to get at them
179
206
  # if you know a way around this please let me know
180
207
  xml = xml.gsub(
181
- /xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
208
+ /xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
182
209
  end
183
210
  return load_document(xml)
184
211
  end
185
-
212
+
213
+ def do_resumable(responseClass, verb, opts)
214
+ responseClass.new(do_request(verb, opts)) do |response|
215
+ responseClass.new \
216
+ do_request(verb, :resumption_token => response.resumption_token)
217
+ end
218
+ end
219
+
186
220
  def build_uri(verb, opts)
187
221
  opts = validate_options(verb, opts)
188
222
  uri = @base.clone
@@ -190,7 +224,7 @@ module OAI
190
224
  opts.each_pair { |k,v| uri.query << '&' << externalize(k) << '=' << encode(v) }
191
225
  uri
192
226
  end
193
-
227
+
194
228
  def encode(value)
195
229
  return CGI.escape(value) unless value.respond_to?(:strftime)
196
230
  if value.kind_of?(DateTime)
@@ -206,9 +240,9 @@ module OAI
206
240
  case @parser
207
241
  when 'libxml'
208
242
  begin
209
- parser = XML::Parser.string()
243
+ parser = XML::Parser.string(xml)
210
244
  return parser.parse
211
- rescue XML::Parser::ParseError => e
245
+ rescue XML::Error => e
212
246
  raise OAI::Exception, 'response not well formed XML: '+e, caller
213
247
  end
214
248
  when 'rexml'
@@ -229,7 +263,7 @@ module OAI
229
263
  def debug(msg)
230
264
  $stderr.print("#{msg}\n") if @debug
231
265
  end
232
-
266
+
233
267
  # Massage the standard OAI options to make them a bit more palatable.
234
268
  def validate_options(verb, opts = {})
235
269
  raise OAI::VerbException.new unless Const::VERBS.keys.include?(verb)
@@ -237,20 +271,20 @@ module OAI
237
271
  return {} if opts.nil?
238
272
 
239
273
  raise OAI::ArgumentException.new unless opts.respond_to?(:keys)
240
-
274
+
241
275
  realopts = {}
242
276
  # Internalize the hash
243
277
  opts.keys.each do |key|
244
278
  realopts[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = opts.delete(key)
245
279
  end
246
-
280
+
247
281
  return realopts if is_resumption?(realopts)
248
-
282
+
249
283
  # add in a default metadataPrefix if none exists
250
284
  if(Const::VERBS[verb].include?(:metadata_prefix))
251
285
  realopts[:metadata_prefix] ||= 'oai_dc'
252
286
  end
253
-
287
+
254
288
  # Convert date-formatted strings into dates.
255
289
  #realopts[:from] = parse_date(realopts[:from]) if realopts[:from]
256
290
  #realopts[:until] = parse_date(realopts[:until]) if realopts[:until]
@@ -261,43 +295,43 @@ module OAI
261
295
  end
262
296
  realopts
263
297
  end
264
-
298
+
265
299
  def is_resumption?(opts)
266
- if opts.keys.include?(:resumption_token)
300
+ if opts.keys.include?(:resumption_token)
267
301
  return true if 1 == opts.keys.size
268
302
  raise OAI::ArgumentException.new
269
303
  end
270
304
  end
271
-
305
+
272
306
  # Convert our internal representations back into standard OAI options
273
307
  def externalize(value)
274
308
  value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
275
309
  end
276
-
310
+
277
311
  def parse_date(value)
278
312
  return value if value.respond_to?(:strftime)
279
-
313
+
280
314
  Date.parse(value) # This will raise an exception for badly formatted dates
281
315
  Time.parse(value).utc # Sadly, this will not
282
316
  rescue
283
- raise OAI::ArgumentError.new
317
+ raise OAI::ArgumentError.new
284
318
  end
285
-
319
+
286
320
  # Strip out invalid UTF-8 characters. Regex from the W3C, inverted.
287
321
  # http://www.w3.org/International/questions/qa-forms-utf-8.en.php
288
322
  #
289
- # Regex is from WebCollab:
323
+ # Regex is from WebCollab:
290
324
  # http://webcollab.sourceforge.net/unicode.html
291
325
  def strip_invalid_utf_8_chars(xml)
292
- simple_bytes = xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
326
+ xml && xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
293
327
  | [\x00-\x7F][\x80-\xBF]+
294
328
  | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
295
329
  | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
296
330
  | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
297
- | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')
298
- simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
331
+ | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')\
332
+ .gsub(/\xE0[\x80-\x9F][\x80-\xBF]
299
333
  | \xED[\xA0-\xBF][\x80-\xBF]/,'?')
300
334
  end
301
-
335
+
302
336
  end
303
337
  end
@@ -1,6 +1,7 @@
1
1
  module OAI
2
2
  class ListIdentifiersResponse < Response
3
3
  include Enumerable
4
+ include OAI::Resumable
4
5
  include OAI::XPath
5
6
 
6
7
  def each
@@ -2,15 +2,16 @@ module OAI
2
2
 
3
3
  # allows for iteration across a list of records
4
4
  #
5
- # for record in client.list_records :metadata_prefix => 'oai_dc':
6
- # puts record.metadata
7
- # end
5
+ # client.list_records(:metadata_prefix => 'oai_dc').each do |record|
6
+ # puts record.metadata
7
+ # end
8
8
  #
9
9
  # you'll need to handle resumption tokens
10
-
10
+
11
11
  class ListRecordsResponse < Response
12
- include OAI::XPath
13
12
  include Enumerable
13
+ include OAI::Resumable
14
+ include OAI::XPath
14
15
 
15
16
  def each
16
17
  for record_element in xpath_all(@doc, './/ListRecords/record')
@@ -2,13 +2,14 @@ module OAI
2
2
 
3
3
  # allows for iteration of the sets found in a oai-pmh server
4
4
  #
5
- # for set in client.list_sets
6
- # puts set
7
- # end
8
-
5
+ # for set in client.list_sets
6
+ # puts set
7
+ # end
8
+
9
9
  class ListSetsResponse < Response
10
- include OAI::XPath
11
10
  include Enumerable
11
+ include OAI::Resumable
12
+ include OAI::XPath
12
13
 
13
14
  def each
14
15
  for set_element in xpath_all(@doc, './/set')
@@ -1,13 +1,12 @@
1
1
  module OAI
2
2
 
3
- # A class for representing a Record as returned from a GetRecord
4
- # or ListRecords request. Each record will have a header and metadata
5
- # attribute. The header is a OAI::Header object and the metadata is
6
- # a REXML::Element object for that chunk of XML.
3
+ # A class for representing a Record as returned from a `GetRecord`
4
+ # or `ListRecords` request. Each record will have a header and metadata
5
+ # attribute. The header is a {OAI::Header} object and the metadata is
6
+ # a `REXML::Element` object for that chunk of XML.
7
7
  #
8
- # Note: if your OAI::Client was configured to use the 'libxml' parser
9
- # metadata will return a XML::Node object instead.
10
-
8
+ # Note: if your {OAI::Client} was configured to use the 'libxml' parser
9
+ # metadata will return a `XML::Node` object instead.
11
10
  class Record
12
11
  include OAI::XPath
13
12
  attr_accessor :header, :metadata, :about