oai 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +80 -0
- data/Rakefile +113 -0
- data/bin/oai +68 -0
- data/examples/models/file_model.rb +63 -0
- data/examples/providers/dublin_core.rb +474 -0
- data/lib/oai.rb +7 -13
- data/lib/oai/client.rb +133 -83
- data/lib/oai/{get_record.rb → client/get_record.rb} +0 -0
- data/lib/oai/{header.rb → client/header.rb} +2 -2
- data/lib/oai/{identify.rb → client/identify.rb} +0 -0
- data/lib/oai/{list_identifiers.rb → client/list_identifiers.rb} +0 -0
- data/lib/oai/{list_metadata_formats.rb → client/list_metadata_formats.rb} +0 -0
- data/lib/oai/{list_records.rb → client/list_records.rb} +0 -0
- data/lib/oai/{list_sets.rb → client/list_sets.rb} +1 -1
- data/lib/oai/{metadata_format.rb → client/metadata_format.rb} +0 -0
- data/lib/oai/{record.rb → client/record.rb} +0 -0
- data/lib/oai/{response.rb → client/response.rb} +1 -1
- data/lib/oai/constants.rb +34 -0
- data/lib/oai/exception.rb +72 -1
- data/lib/oai/harvester.rb +38 -0
- data/lib/oai/harvester/config.rb +41 -0
- data/lib/oai/harvester/harvest.rb +144 -0
- data/lib/oai/harvester/logging.rb +70 -0
- data/lib/oai/harvester/mailer.rb +17 -0
- data/lib/oai/harvester/shell.rb +334 -0
- data/lib/oai/provider.rb +300 -0
- data/lib/oai/provider/metadata_format.rb +72 -0
- data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
- data/lib/oai/provider/model.rb +71 -0
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +135 -0
- data/lib/oai/provider/model/activerecord_wrapper.rb +136 -0
- data/lib/oai/provider/partial_result.rb +18 -0
- data/lib/oai/provider/response.rb +119 -0
- data/lib/oai/provider/response/error.rb +16 -0
- data/lib/oai/provider/response/get_record.rb +32 -0
- data/lib/oai/provider/response/identify.rb +24 -0
- data/lib/oai/provider/response/list_identifiers.rb +29 -0
- data/lib/oai/provider/response/list_metadata_formats.rb +21 -0
- data/lib/oai/provider/response/list_records.rb +32 -0
- data/lib/oai/provider/response/list_sets.rb +23 -0
- data/lib/oai/provider/response/record_response.rb +68 -0
- data/lib/oai/provider/resumption_token.rb +106 -0
- data/lib/oai/set.rb +14 -5
- data/test/activerecord_provider/config/connection.rb +5 -0
- data/test/activerecord_provider/config/database.yml +6 -0
- data/test/activerecord_provider/database/ar_migration.rb +59 -0
- data/test/activerecord_provider/database/oaipmhtest +0 -0
- data/test/activerecord_provider/fixtures/dc.yml +1501 -0
- data/test/activerecord_provider/helpers/providers.rb +44 -0
- data/test/activerecord_provider/helpers/set_provider.rb +36 -0
- data/test/activerecord_provider/models/dc_field.rb +7 -0
- data/test/activerecord_provider/models/dc_set.rb +6 -0
- data/test/activerecord_provider/models/oai_token.rb +3 -0
- data/test/activerecord_provider/tc_ar_provider.rb +93 -0
- data/test/activerecord_provider/tc_ar_sets_provider.rb +66 -0
- data/test/activerecord_provider/tc_caching_paging_provider.rb +53 -0
- data/test/activerecord_provider/tc_simple_paging_provider.rb +55 -0
- data/test/activerecord_provider/test_helper.rb +4 -0
- data/test/client/helpers/provider.rb +68 -0
- data/test/client/helpers/test_wrapper.rb +11 -0
- data/test/client/tc_exception.rb +36 -0
- data/test/{tc_get_record.rb → client/tc_get_record.rb} +11 -7
- data/test/client/tc_identify.rb +13 -0
- data/test/{tc_libxml.rb → client/tc_libxml.rb} +20 -10
- data/test/{tc_list_identifiers.rb → client/tc_list_identifiers.rb} +10 -8
- data/test/{tc_list_metadata_formats.rb → client/tc_list_metadata_formats.rb} +4 -1
- data/test/{tc_list_records.rb → client/tc_list_records.rb} +4 -1
- data/test/{tc_list_sets.rb → client/tc_list_sets.rb} +4 -2
- data/test/{tc_xpath.rb → client/tc_xpath.rb} +1 -1
- data/test/client/test_helper.rb +5 -0
- data/test/provider/models.rb +230 -0
- data/test/provider/tc_exceptions.rb +63 -0
- data/test/provider/tc_functional_tokens.rb +42 -0
- data/test/provider/tc_provider.rb +69 -0
- data/test/provider/tc_resumption_tokens.rb +46 -0
- data/test/provider/tc_simple_provider.rb +85 -0
- data/test/provider/test_helper.rb +36 -0
- metadata +123 -27
- data/test/tc_exception.rb +0 -38
- data/test/tc_identify.rb +0 -8
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
module OAI::Provider
|
4
|
+
|
5
|
+
# ActiveRecord model class in support of the caching wrapper.
|
6
|
+
class OaiToken < ActiveRecord::Base
  # A token owns its cached entries; they are read back ordered by
  # record_id and destroyed together with the token.
  has_many :entries,
    :class_name => 'OaiEntry',
    :order      => "record_id",
    :dependent  => :destroy

  validates_uniqueness_of :token

  class << self
    # Re-declare sanitize_sql as a public class method so the caching
    # wrapper can call it from outside the model. Arguments are passed
    # through to the inherited implementation unchanged.
    def sanitize_sql(*arg)
      super
    end
  end

  # Exposes the @new_record_before_save instance variable.
  # NOTE(review): nothing in this file assigns it; presumably ActiveRecord
  # sets it while saving -- confirm against the ActiveRecord version in use.
  def new_record_before_save?
    @new_record_before_save
  end

end
|
24
|
+
|
25
|
+
# ActiveRecord model class in support of the caching wrapper.
|
26
|
+
class OaiEntry < ActiveRecord::Base
  belongs_to :oai_token

  # A record may be cached only once per token. The scope has to name the
  # foreign key column (oai_token_id, the column the caching wrapper's bulk
  # insert writes to), not the association.
  validates_uniqueness_of :record_id, :scope => :oai_token_id
end
|
31
|
+
|
32
|
+
# = OAI::Provider::ActiveRecordCachingWrapper
|
33
|
+
#
|
34
|
+
# This class wraps an ActiveRecord model and delegates all of the record
|
35
|
+
# selection/retrieval to the AR model. It accepts options for specifying
|
36
|
+
# the update timestamp field, a timeout, and a limit. The limit option
|
37
|
+
# is used for doing pagination with resumption tokens. The timeout is
|
38
|
+
# used to expire old tokens from the cache. Default timeout is 12 hours.
|
39
|
+
#
|
40
|
+
# The difference between ActiveRecordWrapper and this class is how the
|
41
|
+
# pagination is accomplished. ActiveRecordWrapper encodes all the
|
42
|
+
# information in the token. That approach should work 99% of the time.
|
43
|
+
# If you have an extremely active repository you may want to consider
|
44
|
+
# the caching wrapper. The caching wrapper takes the entire result set
|
45
|
+
# from a request and caches it in another database table, well tables
|
46
|
+
# actually. So the result returned to the client will always be
|
47
|
+
# internally consistent.
|
48
|
+
#
|
49
|
+
class ActiveRecordCachingWrapper < ActiveRecordWrapper

  attr_reader :model, :timestamp_field, :expire

  # model   - the ActiveRecord model class to wrap.
  # options - :timeout is consumed here (defaults to 12 hours); everything
  #           else is handed to ActiveRecordWrapper#initialize.
  def initialize(model, options={})
    @expire = options.delete(:timeout) || 12.hours
    super(model, options)
  end

  # Expire stale cached tokens, then run the lookup defined by
  # ActiveRecordWrapper#find. That method calls next_set/select_partial,
  # which resolve to the caching versions defined below.
  def find(selector, options={})
    sweep_cache
    super
  end

  protected

  # Return the page identified by +token_string+.
  #
  # While more records remain after this page a PartialResult (records
  # plus the next token) is returned; for the final page only the
  # records are returned.
  #
  # Raises OAI::ResumptionTokenException when no limit is configured.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    if token.last * @limit + @limit < total
      select_partial(token)
    else
      select_partial(token).records
    end
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  #
  # For the first page (token.last == 0) the ids of all matching records
  # are copied into the entries table with a single INSERT ... SELECT,
  # but only when the OaiToken row reports new_record_before_save?.
  #
  # Raises OAI::ResumptionTokenException when no cached token row matches.
  def select_partial(token)
    if 0 == token.last
      oaitoken = OaiToken.find_or_create_by_token(token.to_s)
      if oaitoken.new_record_before_save?
        OaiToken.connection.execute("insert into " +
          "#{OaiEntry.table_name} (oai_token_id, record_id) " +
          "select #{oaitoken.id}, id from #{model.table_name} where " +
          "#{OaiToken.sanitize_sql(token_conditions(token))}")
      end
    end

    oaitoken = OaiToken.find_by_token(token.to_s)

    raise OAI::ResumptionTokenException.new unless oaitoken

    PartialResult.new(
      hydrate_records(oaitoken.entries.find(:all, :limit => @limit,
        :offset => token.last * @limit)), token.next(token.last + 1)
    )
  end

  # Destroy every cached token created more than +expire+ ago.
  def sweep_cache
    OaiToken.destroy_all(["created_at < ?", Time.now - expire])
  end

  # Turn cached entries back into model instances by looking up their
  # record ids.
  def hydrate_records(records)
    model.find(records.collect {|r| r.record_id })
  end

  # Unlike the parent class, no "last id" constraint is appended: paging
  # is done with limit/offset over the cached entries.
  def token_conditions(token)
    sql_conditions token.to_conditions_hash
  end

  private

  # Creation time truncated to whole seconds, converted to UTC, plus the
  # configured expiry.
  def expires_at(creation)
    created = Time.parse(creation.strftime("%Y-%m-%d %H:%M:%S"))
    created.utc + expire
  end

end
|
135
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
module OAI::Provider
|
4
|
+
# = OAI::Provider::ActiveRecordWrapper
|
5
|
+
#
|
6
|
+
# This class wraps an ActiveRecord model and delegates all of the record
|
7
|
+
# selection/retrieval to the AR model. It accepts options for specifying
|
8
|
+
# the update timestamp field, a timeout, and a limit. The limit option
|
9
|
+
# is used for doing pagination with resumption tokens. The
|
10
|
+
# expiration timeout is ignored, since all necessary information is
|
11
|
+
# encoded in the token.
|
12
|
+
#
|
13
|
+
class ActiveRecordWrapper < Model

  attr_reader :model, :timestamp_field

  # model   - the ActiveRecord model class to wrap.
  # options - :timestamp_field (defaults to 'updated_at') and :limit (page
  #           size; paging is disabled when absent).
  #
  # Raises OAI::ArgumentException when any other option key is supplied.
  def initialize(model, options={})
    @model = model
    @timestamp_field = options.delete(:timestamp_field) || 'updated_at'
    @limit = options.delete(:limit)

    unless options.empty?
      # options is a Hash, so list its keys; calling join on the Hash
      # itself would hide the intended error behind a NoMethodError.
      raise OAI::ArgumentException.new(
        "Unsupported options [#{options.keys.join(', ')}]"
      )
    end
  end

  # Timestamp of the first record when ordered ascending by the
  # timestamp field.
  # NOTE(review): if find(:first) returns nil (empty table) the send
  # call fails -- confirm whether callers guard against that.
  def earliest
    model.find(:first,
      :order => "#{timestamp_field} asc").send(timestamp_field)
  end

  # Timestamp of the first record when ordered descending by the
  # timestamp field.
  def latest
    model.find(:first,
      :order => "#{timestamp_field} desc").send(timestamp_field)
  end

  # A model class is expected to provide a method Model.sets that
  # returns all the sets the model supports.  See the
  # activerecord_provider tests for an example.
  # Returns nil when the model does not respond to sets.
  def sets
    model.sets if model.respond_to?(:sets)
  end

  # Look up records.
  #
  # selector - :all, or anything else model.find accepts as first argument.
  # options  - OAI options (:from, :until, optional :set) or a single
  #            :resumption_token to continue an earlier request.
  #
  # Returns a PartialResult when :all matches more than the configured
  # limit, otherwise whatever model.find returns.
  def find(selector, options={})
    return next_set(options[:resumption_token]) if options[:resumption_token]
    conditions = sql_conditions(options)

    if :all == selector
      total = model.count(:id, :conditions => conditions)
      if @limit && total > @limit
        select_partial(ResumptionToken.new(options.merge({:last => 0})))
      else
        model.find(:all, :conditions => conditions)
      end
    else
      model.find(selector, :conditions => conditions)
    end
  end

  # Returns record.deleted_at or record.deleted when the record responds
  # to one of them (deleted_at takes precedence), otherwise false.
  def deleted?(record)
    if record.respond_to?(:deleted_at)
      return record.deleted_at
    elsif record.respond_to?(:deleted)
      return record.deleted
    end
    false
  end

  protected

  # Request the next set in this sequence.
  #
  # Raises OAI::ResumptionTokenException when no limit is configured.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    if @limit < total
      select_partial(token)
    else # end of result set
      model.find(:all,
        :conditions => token_conditions(token),
        :limit => @limit, :order => "#{model.primary_key} asc")
    end
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  #
  # Raises OAI::ResumptionTokenException when the page is empty, since
  # there is then no last primary key to build the next token from.
  def select_partial(token)
    records = model.find(:all,
      :conditions => token_conditions(token),
      :limit => @limit,
      :order => "#{model.primary_key} asc")

    # find(:all) yields an empty array rather than nil when nothing
    # matches, so both cases have to be rejected here.
    raise OAI::ResumptionTokenException.new if records.nil? || records.empty?

    offset = records.last.send(model.primary_key.to_sym)

    PartialResult.new(records, token.next(offset))
  end

  # build a sql conditions statement from the content
  # of a resumption token.  It is very important not to
  # miss any changes as records may change scope as the
  # harvest is in progress.  To avoid losing any changes
  # the last 'id' of the previous set is used as the
  # filter to the next set.
  def token_conditions(token)
    last = token.last
    sql = sql_conditions token.to_conditions_hash

    return sql if 0 == last
    # Now add last id constraint
    sql[0] << " AND #{model.primary_key} > ?"
    sql << last

    return sql
  end

  # build a sql conditions statement from an OAI options hash
  #
  # Returns an array: the SQL fragment first, followed by its bind
  # values (:from and :until converted with localtime, then :set when
  # present).
  def sql_conditions(opts)
    sql = []
    sql << "#{timestamp_field} >= ?" << "#{timestamp_field} <= ?"
    sql << "set = ?" if opts[:set]

    esc_values = [sql.join(" AND ")]
    esc_values << opts[:from].localtime << opts[:until].localtime
    esc_values << opts[:set] if opts[:set]

    return esc_values
  end

end
|
136
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module OAI::Provider
|
2
|
+
# = OAI::Provider::PartialResult
|
3
|
+
#
|
4
|
+
# PartialResult is used for returning a set/page of results from a model
|
5
|
+
# that supports resumption tokens.  It should contain an array of
|
6
|
+
# records, and a resumption token for getting the next set/page.
|
7
|
+
#
|
8
|
+
class PartialResult
  # records - the records that make up this page.
  # token   - resumption token for the following page (nil if omitted).
  attr_reader :records, :token

  def initialize(records, token = nil)
    @records, @token = records, token
  end
end
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'builder' unless defined?(Builder)
|
2
|
+
|
3
|
+
module OAI
|
4
|
+
module Provider
|
5
|
+
module Response
|
6
|
+
|
7
|
+
# Shared behaviour for provider responses: normalises the incoming
# request options, validates them against the options declared on the
# response class, and emits the common OAI-PMH XML envelope.
class Base
  attr_reader :provider, :options

  class << self
    # Declarations are kept in instance variables of the class the macro
    # is called on; each reader returns nil until its macro has been used.
    attr_reader :valid_options, :default_options, :required_options

    # Declare option names (in internal, underscored symbol form) that
    # this response accepts.
    def valid_parameters(*args)
      @valid_options ||= []
      @valid_options = (@valid_options + args.dup).uniq
    end

    # Declare default values. A value responding to call is invoked
    # with the response instance when the default is applied.
    def default_parameters(options = {})
      @default_options ||= {}
      @default_options.merge! options.dup
    end

    # Declare option names that must be present; they are registered as
    # valid options as well.
    def required_parameters(*args)
      valid_parameters(*args)
      @required_options ||= []
      @required_options = (@required_options + args.dup).uniq
    end

  end

  # provider - object supplying url (used when rendering the response).
  # options  - request options; keys may be camelCase or underscored.
  #
  # Raises OAI::ArgumentException when the options fail validation.
  def initialize(provider, options = {})
    @provider = provider
    @options = internalize(options)
    raise OAI::ArgumentException.new unless valid?
  end

  # Build the OAI-PMH envelope (XML declaration, root element,
  # responseDate and request elements) and yield the builder so the
  # caller can add the verb specific content.
  def response
    @builder = Builder::XmlMarkup.new
    @builder.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"
    @builder.tag!('OAI-PMH', header) do
      @builder.responseDate Time.now.utc.xmlschema
      @builder.request(provider.url, options)
      yield @builder
    end
  end

  private

  # Namespace attributes for the OAI-PMH root element.
  def header
    {
      'xmlns' => "http://www.openarchives.org/OAI/2.0/",
      'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
      'xsi:schemaLocation' => %{http://www.openarchives.org/OAI/2.0/
        http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd}
    }
  end

  # Check the options against the class level declarations and fill in
  # defaults. A request carrying only a resumption token is always valid.
  def valid?
    return true if resumption?

    return true if self.class.valid_options.nil? && options.empty?

    # Either list is nil when the class never declared it; treat that
    # as "none" so supplied options are rejected instead of crashing.
    required = self.class.required_options || []
    allowed  = self.class.valid_options || []

    return false unless (required - @options.keys).empty?
    return false unless (@options.keys - allowed).empty?

    populate_defaults
    true
  end

  # Apply declared defaults to options that are missing or falsy.
  def populate_defaults
    (self.class.default_options || {}).each do |k,v|
      @options[k] = v.respond_to?(:call) ? v.call(self) : v if not @options[k]
    end
  end

  # True when the resumption token is the only option; nil when there
  # is no resumption token at all.
  #
  # Raises OAI::ArgumentException when a resumption token is combined
  # with any other option.
  def resumption?
    if @options.key?(:resumption_token)
      return true if 1 == @options.keys.size
      raise OAI::ArgumentException.new
    end
  end

  # Convert our internal representations back into standard OAI options
  # (e.g. :metadata_prefix becomes "metadataPrefix").
  def externalize(value)
    value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
  end

  # Convert +value+ into a UTC time. Values that already respond to
  # strftime are returned untouched.
  #
  # Raises OAI::ArgumentException when the value cannot be parsed.
  def parse_date(value)
    return value if value.respond_to?(:strftime)

    # Oddly Chronic doesn't parse an UTC encoded datetime.
    # Luckily Time does
    begin
      dt = Chronic.parse(value) || Time.parse(value)
    rescue ::ArgumentError
      # Time.parse signals unparsable input by raising, not by
      # returning nil.
      dt = nil
    end
    raise OAI::ArgumentException.new unless dt

    dt.utc
  end

  # Normalise option keys to underscored symbols (metadataPrefix becomes
  # :metadata_prefix), duplicating each value, and parse :from / :until
  # into times.
  def internalize(hash = {})
    internal = {}
    hash.keys.each do |key|
      internal[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = hash[key].dup
    end

    # Convert date formatted strings into internal time values
    internal[:from] = parse_date(internal[:from]) if internal[:from]
    internal[:until] = parse_date(internal[:until]) if internal[:until]

    internal
  end

end
|
116
|
+
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module OAI::Provider::Response
|
2
|
+
|
3
|
+
class GetRecord < RecordResponse
  required_parameters :identifier

  # Render the GetRecord response.
  #
  # The :identifier option is removed from the options, reduced to the
  # model's own id and looked up with the remaining options. The record
  # header is always written; the record data is skipped when the record
  # is deleted.
  #
  # Raises OAI::IdException when the model yields no record.
  def to_xml
    record_id = extract_identifier(options.delete(:identifier))
    record = provider.model.find(record_id, options)
    raise OAI::IdException.new unless record

    response do |xml|
      xml.GetRecord do
        xml.record do
          header_for(record)
          data_for(record) unless deleted?(record)
        end
      end
    end
  end

  private

  # Remove the first occurrence of "<provider prefix>/" from the
  # identifier.
  def extract_identifier(id)
    prefix_with_slash = "#{provider.prefix}/"
    id.sub(prefix_with_slash, '')
  end

end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
|