oai 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. data/README.md +28 -23
  2. data/Rakefile +14 -40
  3. data/examples/providers/dublin_core.rb +63 -63
  4. data/lib/oai/client.rb +131 -97
  5. data/lib/oai/client/list_identifiers.rb +1 -0
  6. data/lib/oai/client/list_records.rb +6 -5
  7. data/lib/oai/client/list_sets.rb +6 -5
  8. data/lib/oai/client/record.rb +6 -7
  9. data/lib/oai/client/response.rb +7 -4
  10. data/lib/oai/client/resumable.rb +42 -0
  11. data/lib/oai/harvester/shell.rb +40 -41
  12. data/lib/oai/provider.rb +85 -67
  13. data/lib/oai/provider/metadata_format/oai_dc.rb +5 -6
  14. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +23 -25
  15. data/lib/oai/provider/model/activerecord_wrapper.rb +99 -51
  16. data/lib/oai/provider/response.rb +33 -31
  17. data/lib/oai/provider/response/get_record.rb +7 -7
  18. data/lib/oai/provider/response/list_records.rb +5 -4
  19. data/lib/oai/provider/response/record_response.rb +14 -14
  20. data/test/activerecord_provider/config/connection.rb +8 -4
  21. data/test/activerecord_provider/database/{ar_migration.rb → 0001_oaipmh_tables.rb} +17 -12
  22. data/test/activerecord_provider/helpers/providers.rb +2 -3
  23. data/test/activerecord_provider/helpers/set_provider.rb +10 -22
  24. data/test/activerecord_provider/helpers/transactional_test_case.rb +34 -0
  25. data/test/activerecord_provider/models/dc_field.rb +4 -4
  26. data/test/activerecord_provider/models/dc_set.rb +3 -2
  27. data/test/activerecord_provider/models/exclusive_set_dc_field.rb +11 -0
  28. data/test/activerecord_provider/tc_ar_provider.rb +67 -28
  29. data/test/activerecord_provider/tc_ar_sets_provider.rb +104 -18
  30. data/test/activerecord_provider/tc_caching_paging_provider.rb +6 -10
  31. data/test/activerecord_provider/tc_simple_paging_provider.rb +7 -11
  32. data/test/activerecord_provider/test_helper.rb +10 -0
  33. data/test/client/helpers/provider.rb +44 -47
  34. data/test/client/helpers/test_wrapper.rb +4 -16
  35. data/test/client/tc_http_client.rb +90 -2
  36. data/test/client/tc_list_identifiers.rb +22 -3
  37. data/test/client/tc_list_records.rb +17 -4
  38. data/test/client/tc_list_sets.rb +17 -2
  39. data/test/provider/models.rb +32 -30
  40. data/test/provider/tc_exceptions.rb +30 -20
  41. data/test/provider/tc_functional_tokens.rb +11 -6
  42. data/test/provider/tc_provider.rb +58 -24
  43. data/test/provider/tc_resumption_tokens.rb +6 -6
  44. data/test/provider/tc_simple_provider.rb +51 -26
  45. data/test/provider/test_helper.rb +7 -0
  46. metadata +67 -128
  47. data/test/activerecord_provider/config/database.yml +0 -6
  48. data/test/activerecord_provider/database/oaipmhtest +0 -0
@@ -2,7 +2,6 @@
2
2
  require 'uri'
3
3
  require 'faraday'
4
4
  require 'cgi'
5
- require 'iconv'
6
5
 
7
6
  if not defined?(OAI::Const::VERBS)
8
7
  # Shared stuff
@@ -16,9 +15,10 @@ end
16
15
  require 'oai/client/metadata_format'
17
16
  require 'oai/client/response'
18
17
  require 'oai/client/header'
19
- require 'oai/client/record'
18
+ require 'oai/client/record'
20
19
  require 'oai/client/identify'
21
20
  require 'oai/client/get_record'
21
+ require 'oai/client/resumable'
22
22
  require 'oai/client/list_identifiers'
23
23
  require 'oai/client/list_metadata_formats'
24
24
  require 'oai/client/list_records'
@@ -26,75 +26,79 @@ require 'oai/client/list_sets'
26
26
 
27
27
  module OAI
28
28
 
29
- # A OAI::Client provides a client api for issuing OAI-PMH verbs against
30
- # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
- # can call on a OAI::Client object. Verb arguments are passed as a hash:
29
+ # A `OAI::Client` provides a client api for issuing OAI-PMH verbs against
30
+ # a OAI-PMH server. The 6 OAI-PMH verbs translate directly to methods you
31
+ # can call on a `OAI::Client` object. Verb arguments are passed as a hash:
32
32
  #
33
+ # ```ruby
33
34
  # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
34
35
  # record = client.get_record :identifier => 'oai:pubmedcentral.gov:13901'
35
36
  # for identifier in client.list_identifiers
36
37
  # puts identifier
37
38
  # end
39
+ # ```
38
40
  #
39
- # It is worth noting that the api uses methods and parameter names with
40
- # underscores in them rather than studly caps. So above list_identifiers
41
- # and metadata_prefix are used instead of the listIdentifiers and
42
- # metadataPrefix used in the OAI-PMH specification.
41
+ # It is worth noting that the API uses methods and parameter names with
42
+ # underscores in them rather than studly caps. So above `list_identifiers`
43
+ # and `metadata_prefix` are used instead of the `listIdentifiers` and
44
+ # `metadataPrefix` used in the OAI-PMH specification.
43
45
  #
44
46
  # Also, the from and until arguments which specify dates should be passed
45
- # in as Date or DateTime objects depending on the granularity supported
47
+ # in as `Date` or `DateTime` objects depending on the granularity supported
46
48
  # by the server.
47
49
  #
48
50
  # For detailed information on the arguments that can be used please consult
49
- # the OAI-PMH docs at:
50
- #
51
- # http://www.openarchives.org/OAI/openarchivesprotocol.html
52
-
51
+ # the OAI-PMH docs at
52
+ # <http://www.openarchives.org/OAI/openarchivesprotocol.html>.
53
+
53
54
  class Client
54
55
 
55
- # The constructor which must be passed a valid base url for an oai
56
+ # The constructor which must be passed a valid base url for an oai
56
57
  # service:
57
58
  #
58
- # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
+ # client = OAI::Client.new 'http://www.pubmedcentral.gov/oai/oai.cgi'
59
60
  #
60
- # If you want to see debugging messages on STDERR use:
61
+ # If you want to see debugging messages on `STDERR` use:
61
62
  #
62
- # client = OAI::Client.new 'http://example.com', :debug => true
63
+ # client = OAI::Client.new 'http://example.com', :debug => true
63
64
  #
64
- # By default OAI verbs called on the client will return REXML::Element
65
+ # By default OAI verbs called on the client will return `REXML::Element`
65
66
  # objects for metadata records, however if you wish you can use the
66
- # :parser option to indicate you want to use 'libxml' instead, and get
67
- # back XML::Node objects
67
+ # `:parser` option to indicate you want to use `libxml` instead, and get
68
+ # back `XML::Node` objects
68
69
  #
69
- # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
70
- #
71
- # You can configure the Faraday HTTP client by providing an alternate
70
+ # client = OAI::Client.new 'http://example.com', :parser => 'libxml'
71
+ #
72
+ # You can configure the Faraday HTTP client by providing an alternate
72
73
  # Faraday instance:
73
74
  #
74
- # client = OAI::Client.new 'http://example.com', :http => Faraday.new { |c| }
75
+ # ```ruby
76
+ # client = OAI::Client.new 'http://example.com', :http => Faraday.new {|c|}
77
+ # ```
75
78
  #
76
- # === HIGH PERFORMANCE
79
+ # ### HIGH PERFORMANCE
77
80
  #
78
- # If you want to supercharge this api install libxml-ruby >= 0.3.8 and
79
- # use the :parser option when you construct your OAI::Client.
81
+ # If you want to supercharge this api install `libxml-ruby >= 0.3.8` and
82
+ # use the `:parser` option when you construct your `OAI::Client`.
80
83
  #
81
84
  def initialize(base_url, options={})
82
85
  @base = URI.parse base_url
83
86
  @debug = options.fetch(:debug, false)
84
87
  @parser = options.fetch(:parser, 'rexml')
85
88
 
86
- @follow_redirects = options.fetch(:redirects, true)
87
- @http_client = options.fetch(:http, Faraday.new(@base))
88
-
89
- if !options.key?(:http) and @follow_redirects
89
+ @http_client = options.fetch(:http) do
90
+ Faraday.new(:url => @base) do |builder|
91
+ follow_redirects = options.fetch(:redirects, true)
92
+ if follow_redirects
93
+ count = follow_redirects.is_a?(Fixnum) ? follow_redirects : 5
90
94
 
91
- count = @folow_redirects if @folow_redirects.is_a? Fixnum
92
- count ||= 5
93
-
94
- require 'faraday_middleware'
95
- @http_client.use FaradayMiddleware::FollowRedirects, :limit => count
95
+ require 'faraday_middleware'
96
+ builder.response :follow_redirects, :limit => count
97
+ end
98
+ builder.adapter :net_http
99
+ end
96
100
  end
97
-
101
+
98
102
  # load appropriate parser
99
103
  case @parser
100
104
  when 'libxml'
@@ -112,77 +116,107 @@ module OAI
112
116
  end
113
117
  end
114
118
 
115
- # Equivalent to a Identify request. You'll get back a OAI::IdentifyResponse
116
- # object which is essentially just a wrapper around a REXML::Document
117
- # for the response. If you created your client using the libxml
118
- # parser then you will get an XML::Node object instead.
119
-
119
+ # Equivalent to an `Identify` request.
120
+ # You'll get back a `OAI::IdentifyResponse`
121
+ # object which is essentially just a wrapper around a `REXML::Document`
122
+ # for the response. If you created your client using the `libxml`
123
+ # parser then you will get an `XML::Node` object instead.
120
124
  def identify
121
- return OAI::IdentifyResponse.new(do_request('Identify'))
125
+ OAI::IdentifyResponse.new(do_request('Identify'))
122
126
  end
123
127
 
124
- # Equivalent to a ListMetadataFormats request. A ListMetadataFormatsResponse
125
- # object is returned to you.
126
-
128
+ # Equivalent to a `ListMetadataFormats` request.
129
+ # A `ListMetadataFormatsResponse` object is returned to you.
130
+
127
131
  def list_metadata_formats(opts={})
128
- return OAI::ListMetadataFormatsResponse.new(do_request('ListMetadataFormats', opts))
132
+ OAI::ListMetadataFormatsResponse.new(do_request('ListMetadataFormats', opts))
129
133
  end
130
134
 
131
- # Equivalent to a ListIdentifiers request. Pass in :from, :until arguments
132
- # as Date or DateTime objects as appropriate depending on the granularity
133
- # supported by the server.
134
-
135
+ # Equivalent to a `ListIdentifiers` request. Pass in `:from`,
136
+ # `:until` arguments as `Date` or `DateTime` objects as appropriate
137
+ # depending on the granularity supported by the server.
138
+ #
139
+ # You can use seamless resumption with this verb, which allows you to
140
+ # mitigate (to some extent) the lack of a `Count` verb:
141
+ #
142
+ # client.list_identifiers.full.count # Don't try this on PubMed though!
143
+ #
135
144
  def list_identifiers(opts={})
136
- return OAI::ListIdentifiersResponse.new(do_request('ListIdentifiers', opts))
145
+ do_resumable(OAI::ListIdentifiersResponse, 'ListIdentifiers', opts)
137
146
  end
138
147
 
139
- # Equivalent to a GetRecord request. You must supply an identifier
140
- # argument. You should get back a OAI::GetRecordResponse object
141
- # which you can extract a OAI::Record object from.
142
-
148
+ # Equivalent to a `GetRecord` request. You must supply an `:identifier`
149
+ # argument. You should get back a `OAI::GetRecordResponse` object
150
+ # which you can extract a `OAI::Record` object from.
143
151
  def get_record(opts={})
144
- return OAI::GetRecordResponse.new(do_request('GetRecord', opts))
152
+ OAI::GetRecordResponse.new(do_request('GetRecord', opts))
145
153
  end
146
154
 
147
- # Equivalent to the ListRecords request. A ListRecordsResponse
155
+ # Equivalent to the `ListRecords` request. A `ListRecordsResponse`
148
156
  # will be returned which you can use to iterate through records
149
157
  #
150
- # for record in client.list_records
151
- # puts record.metadata
152
- # end
153
-
158
+ # response = client.list_records
159
+ # response.each do |record|
160
+ # puts record.metadata
161
+ # end
162
+ #
163
+ # Alternately, you can use seamless resumption to avoid handling
164
+ # resumption tokens:
165
+ #
166
+ # client.list_records.full.each do |record|
167
+ # puts record.metadata
168
+ # end
169
+ #
170
+ # ### Memory Use
171
+ # `:full` will avoid storing more than one page of records in
172
+ # memory, but you can use it in ways that override that behaviour. Be careful
173
+ # to avoid using `client.list_records.full.entries` unless you really want
174
+ # to hold all the records in the feed in memory!
154
175
  def list_records(opts={})
155
- return OAI::ListRecordsResponse.new(do_request('ListRecords', opts))
176
+ do_resumable(OAI::ListRecordsResponse, 'ListRecords', opts)
156
177
  end
157
178
 
158
- # Equivalent to the ListSets request. A ListSetsResponse object
159
- # will be returned which you can use for iterating through the
160
- # OAI::Set objects
179
+ # Equivalent to the `ListSets` request. A `ListSetsResponse` object
180
+ # will be returned which you can use for iterating through the
181
+ # `OAI::Set` objects
182
+ #
183
+ # for set in client.list_sets
184
+ # puts set
185
+ # end
186
+ #
187
+ # A large number of sets is not unusual for some OAI-PMH feeds, so
188
+ # using seamless resumption may be preferable:
161
189
  #
162
- # for set in client.list_sets
163
- # puts set
164
- # end
165
-
190
+ # client.list_sets.full.each do |set|
191
+ # puts set
192
+ # end
166
193
  def list_sets(opts={})
167
- return OAI::ListSetsResponse.new(do_request('ListSets', opts))
194
+ do_resumable(OAI::ListSetsResponse, 'ListSets', opts)
168
195
  end
169
196
 
170
- private
197
+ private
171
198
 
172
199
  def do_request(verb, opts = nil)
173
200
  # fire off the request and return appropriate DOM object
174
201
  uri = build_uri(verb, opts)
175
202
  xml = strip_invalid_utf_8_chars(get(uri))
176
- if @parser == 'libxml'
203
+ if @parser == 'libxml'
177
204
  # remove default namespace for oai-pmh since libxml
178
- # isn't able to use our xpaths to get at them
205
+ # isn't able to use our xpaths to get at them
179
206
  # if you know a way around this please let me know
180
207
  xml = xml.gsub(
181
- /xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
208
+ /xmlns=\"http:\/\/www.openarchives.org\/OAI\/.\..\/\"/, '')
182
209
  end
183
210
  return load_document(xml)
184
211
  end
185
-
212
+
213
+ def do_resumable(responseClass, verb, opts)
214
+ responseClass.new(do_request(verb, opts)) do |response|
215
+ responseClass.new \
216
+ do_request(verb, :resumption_token => response.resumption_token)
217
+ end
218
+ end
219
+
186
220
  def build_uri(verb, opts)
187
221
  opts = validate_options(verb, opts)
188
222
  uri = @base.clone
@@ -190,7 +224,7 @@ module OAI
190
224
  opts.each_pair { |k,v| uri.query << '&' << externalize(k) << '=' << encode(v) }
191
225
  uri
192
226
  end
193
-
227
+
194
228
  def encode(value)
195
229
  return CGI.escape(value) unless value.respond_to?(:strftime)
196
230
  if value.kind_of?(DateTime)
@@ -206,9 +240,9 @@ module OAI
206
240
  case @parser
207
241
  when 'libxml'
208
242
  begin
209
- parser = XML::Parser.string()
243
+ parser = XML::Parser.string(xml)
210
244
  return parser.parse
211
- rescue XML::Parser::ParseError => e
245
+ rescue XML::Error => e
212
246
  raise OAI::Exception, 'response not well formed XML: '+e, caller
213
247
  end
214
248
  when 'rexml'
@@ -229,7 +263,7 @@ module OAI
229
263
  def debug(msg)
230
264
  $stderr.print("#{msg}\n") if @debug
231
265
  end
232
-
266
+
233
267
  # Massage the standard OAI options to make them a bit more palatable.
234
268
  def validate_options(verb, opts = {})
235
269
  raise OAI::VerbException.new unless Const::VERBS.keys.include?(verb)
@@ -237,20 +271,20 @@ module OAI
237
271
  return {} if opts.nil?
238
272
 
239
273
  raise OAI::ArgumentException.new unless opts.respond_to?(:keys)
240
-
274
+
241
275
  realopts = {}
242
276
  # Internalize the hash
243
277
  opts.keys.each do |key|
244
278
  realopts[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = opts.delete(key)
245
279
  end
246
-
280
+
247
281
  return realopts if is_resumption?(realopts)
248
-
282
+
249
283
  # add in a default metadataPrefix if none exists
250
284
  if(Const::VERBS[verb].include?(:metadata_prefix))
251
285
  realopts[:metadata_prefix] ||= 'oai_dc'
252
286
  end
253
-
287
+
254
288
  # Convert date formatted strings into dates.
255
289
  #realopts[:from] = parse_date(realopts[:from]) if realopts[:from]
256
290
  #realopts[:until] = parse_date(realopts[:until]) if realopts[:until]
@@ -261,43 +295,43 @@ module OAI
261
295
  end
262
296
  realopts
263
297
  end
264
-
298
+
265
299
  def is_resumption?(opts)
266
- if opts.keys.include?(:resumption_token)
300
+ if opts.keys.include?(:resumption_token)
267
301
  return true if 1 == opts.keys.size
268
302
  raise OAI::ArgumentException.new
269
303
  end
270
304
  end
271
-
305
+
272
306
  # Convert our internal representations back into standard OAI options
273
307
  def externalize(value)
274
308
  value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
275
309
  end
276
-
310
+
277
311
  def parse_date(value)
278
312
  return value if value.respond_to?(:strftime)
279
-
313
+
280
314
  Date.parse(value) # This will raise an exception for badly formatted dates
281
315
  Time.parse(value).utc # Sadly, this will not
282
316
  rescue
283
- raise OAI::ArgumentError.new
317
+ raise OAI::ArgumentError.new
284
318
  end
285
-
319
+
286
320
  # Strip out invalid UTF-8 characters. Regex from the W3C, inverted.
287
321
  # http://www.w3.org/International/questions/qa-forms-utf-8.en.php
288
322
  #
289
- # Regex is from WebCollab:
323
+ # Regex is from WebCollab:
290
324
  # http://webcollab.sourceforge.net/unicode.html
291
325
  def strip_invalid_utf_8_chars(xml)
292
- simple_bytes = xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
326
+ xml && xml.gsub(/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]
293
327
  | [\x00-\x7F][\x80-\xBF]+
294
328
  | ([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*
295
329
  | [\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})
296
330
  | [\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))
297
- | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')
298
- simple_bytes.gsub(/\xE0[\x80-\x9F][\x80-\xBF]
331
+ | (?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/x, '?')\
332
+ .gsub(/\xE0[\x80-\x9F][\x80-\xBF]
299
333
  | \xED[\xA0-\xBF][\x80-\xBF]/,'?')
300
334
  end
301
-
335
+
302
336
  end
303
337
  end
@@ -1,6 +1,7 @@
1
1
  module OAI
2
2
  class ListIdentifiersResponse < Response
3
3
  include Enumerable
4
+ include OAI::Resumable
4
5
  include OAI::XPath
5
6
 
6
7
  def each
@@ -2,15 +2,16 @@ module OAI
2
2
 
3
3
  # allows for iteration across a list of records
4
4
  #
5
- # for record in client.list_records :metadata_prefix => 'oai_dc':
6
- # puts record.metadata
7
- # end
5
+ # client.list_records(:metadata_prefix => 'oai_dc').each do |record|
6
+ # puts record.metadata
7
+ # end
8
8
  #
9
9
  # you'll need to handle resumption tokens
10
-
10
+
11
11
  class ListRecordsResponse < Response
12
- include OAI::XPath
13
12
  include Enumerable
13
+ include OAI::Resumable
14
+ include OAI::XPath
14
15
 
15
16
  def each
16
17
  for record_element in xpath_all(@doc, './/ListRecords/record')
@@ -2,13 +2,14 @@ module OAI
2
2
 
3
3
  # allows for iteration of the sets found in a oai-pmh server
4
4
  #
5
- # for set in client.list_sets
6
- # puts set
7
- # end
8
-
5
+ # for set in client.list_sets
6
+ # puts set
7
+ # end
8
+
9
9
  class ListSetsResponse < Response
10
- include OAI::XPath
11
10
  include Enumerable
11
+ include OAI::Resumable
12
+ include OAI::XPath
12
13
 
13
14
  def each
14
15
  for set_element in xpath_all(@doc, './/set')
@@ -1,13 +1,12 @@
1
1
  module OAI
2
2
 
3
- # A class for representing a Record as returned from a GetRecord
4
- # or ListRecords request. Each record will have a header and metadata
5
- # attribute. The header is a OAI::Header object and the metadata is
6
- # a REXML::Element object for that chunk of XML.
3
+ # A class for representing a Record as returned from a `GetRecord`
4
+ # or `ListRecords` request. Each record will have a header and metadata
5
+ # attribute. The header is a {OAI::Header} object and the metadata is
6
+ # a `REXML::Element` object for that chunk of XML.
7
7
  #
8
- # Note: if your OAI::Client was configured to use the 'libxml' parser
9
- # metadata will return a XML::Node object instead.
10
-
8
+ # Note: if your {OAI::Client} was configured to use the 'libxml' parser
9
+ # metadata will return a `XML::Node` object instead.
11
10
  class Record
12
11
  include OAI::XPath
13
12
  attr_accessor :header, :metadata, :about