oai 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +80 -0
- data/Rakefile +113 -0
- data/bin/oai +68 -0
- data/examples/models/file_model.rb +63 -0
- data/examples/providers/dublin_core.rb +474 -0
- data/lib/oai.rb +7 -13
- data/lib/oai/client.rb +133 -83
- data/lib/oai/{get_record.rb → client/get_record.rb} +0 -0
- data/lib/oai/{header.rb → client/header.rb} +2 -2
- data/lib/oai/{identify.rb → client/identify.rb} +0 -0
- data/lib/oai/{list_identifiers.rb → client/list_identifiers.rb} +0 -0
- data/lib/oai/{list_metadata_formats.rb → client/list_metadata_formats.rb} +0 -0
- data/lib/oai/{list_records.rb → client/list_records.rb} +0 -0
- data/lib/oai/{list_sets.rb → client/list_sets.rb} +1 -1
- data/lib/oai/{metadata_format.rb → client/metadata_format.rb} +0 -0
- data/lib/oai/{record.rb → client/record.rb} +0 -0
- data/lib/oai/{response.rb → client/response.rb} +1 -1
- data/lib/oai/constants.rb +34 -0
- data/lib/oai/exception.rb +72 -1
- data/lib/oai/harvester.rb +38 -0
- data/lib/oai/harvester/config.rb +41 -0
- data/lib/oai/harvester/harvest.rb +144 -0
- data/lib/oai/harvester/logging.rb +70 -0
- data/lib/oai/harvester/mailer.rb +17 -0
- data/lib/oai/harvester/shell.rb +334 -0
- data/lib/oai/provider.rb +300 -0
- data/lib/oai/provider/metadata_format.rb +72 -0
- data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
- data/lib/oai/provider/model.rb +71 -0
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +135 -0
- data/lib/oai/provider/model/activerecord_wrapper.rb +136 -0
- data/lib/oai/provider/partial_result.rb +18 -0
- data/lib/oai/provider/response.rb +119 -0
- data/lib/oai/provider/response/error.rb +16 -0
- data/lib/oai/provider/response/get_record.rb +32 -0
- data/lib/oai/provider/response/identify.rb +24 -0
- data/lib/oai/provider/response/list_identifiers.rb +29 -0
- data/lib/oai/provider/response/list_metadata_formats.rb +21 -0
- data/lib/oai/provider/response/list_records.rb +32 -0
- data/lib/oai/provider/response/list_sets.rb +23 -0
- data/lib/oai/provider/response/record_response.rb +68 -0
- data/lib/oai/provider/resumption_token.rb +106 -0
- data/lib/oai/set.rb +14 -5
- data/test/activerecord_provider/config/connection.rb +5 -0
- data/test/activerecord_provider/config/database.yml +6 -0
- data/test/activerecord_provider/database/ar_migration.rb +59 -0
- data/test/activerecord_provider/database/oaipmhtest +0 -0
- data/test/activerecord_provider/fixtures/dc.yml +1501 -0
- data/test/activerecord_provider/helpers/providers.rb +44 -0
- data/test/activerecord_provider/helpers/set_provider.rb +36 -0
- data/test/activerecord_provider/models/dc_field.rb +7 -0
- data/test/activerecord_provider/models/dc_set.rb +6 -0
- data/test/activerecord_provider/models/oai_token.rb +3 -0
- data/test/activerecord_provider/tc_ar_provider.rb +93 -0
- data/test/activerecord_provider/tc_ar_sets_provider.rb +66 -0
- data/test/activerecord_provider/tc_caching_paging_provider.rb +53 -0
- data/test/activerecord_provider/tc_simple_paging_provider.rb +55 -0
- data/test/activerecord_provider/test_helper.rb +4 -0
- data/test/client/helpers/provider.rb +68 -0
- data/test/client/helpers/test_wrapper.rb +11 -0
- data/test/client/tc_exception.rb +36 -0
- data/test/{tc_get_record.rb → client/tc_get_record.rb} +11 -7
- data/test/client/tc_identify.rb +13 -0
- data/test/{tc_libxml.rb → client/tc_libxml.rb} +20 -10
- data/test/{tc_list_identifiers.rb → client/tc_list_identifiers.rb} +10 -8
- data/test/{tc_list_metadata_formats.rb → client/tc_list_metadata_formats.rb} +4 -1
- data/test/{tc_list_records.rb → client/tc_list_records.rb} +4 -1
- data/test/{tc_list_sets.rb → client/tc_list_sets.rb} +4 -2
- data/test/{tc_xpath.rb → client/tc_xpath.rb} +1 -1
- data/test/client/test_helper.rb +5 -0
- data/test/provider/models.rb +230 -0
- data/test/provider/tc_exceptions.rb +63 -0
- data/test/provider/tc_functional_tokens.rb +42 -0
- data/test/provider/tc_provider.rb +69 -0
- data/test/provider/tc_resumption_tokens.rb +46 -0
- data/test/provider/tc_simple_provider.rb +85 -0
- data/test/provider/test_helper.rb +36 -0
- metadata +123 -27
- data/test/tc_exception.rb +0 -38
- data/test/tc_identify.rb +0 -8
@@ -0,0 +1,135 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
module OAI::Provider
|
4
|
+
|
5
|
+
# ActiveRecord model class in support of the caching wrapper.  Each row
# holds one token string and owns the OaiEntry rows cached for it; the
# entries are read back ordered by record_id and destroyed with the token.
class OaiToken < ActiveRecord::Base
  has_many :entries, :class_name => 'OaiEntry',
    :order => "record_id", :dependent => :destroy

  validates_uniqueness_of :token

  class << self
    # Public pass-through to the inherited sanitize_sql so that code
    # outside this class can make use of it.
    def sanitize_sql(*arg)
      super
    end
  end

  # Returns the @new_record_before_save instance variable unchanged.
  # NOTE(review): presumably set by ActiveRecord while saving — confirm
  # against the ActiveRecord version in use.
  def new_record_before_save?
    @new_record_before_save
  end

end
|
24
|
+
|
25
|
+
# ActiveRecord model class in support of the caching wrapper.  Each row
# pairs one cached record id with the token it belongs to.
class OaiEntry < ActiveRecord::Base
  belongs_to :oai_token

  # :scope names columns; the column linking an entry to its token is
  # oai_token_id (the same column the caching wrapper's INSERT fills in),
  # not the association name.
  validates_uniqueness_of :record_id, :scope => :oai_token_id
end
|
31
|
+
|
32
|
+
# = OAI::Provider::ActiveRecordCachingWrapper
#
# This class wraps an ActiveRecord model and delegates all of the record
# selection/retrieval to the AR model.  It accepts options for specifying
# the update timestamp field, a timeout, and a limit.  The limit option
# is used for doing pagination with resumption tokens.  The timeout is
# used to expire old tokens from the cache.  Default timeout is 12 hours.
#
# The difference between ActiveRecordWrapper and this class is how the
# pagination is accomplished.  ActiveRecordWrapper encodes all the
# information in the token.  That approach should work 99% of the time.
# If you have an extremely active repository you may want to consider
# the caching wrapper.  The caching wrapper takes the entire result set
# from a request and caches it in other database tables (one for tokens,
# one for entries), so the result returned to the client will always be
# internally consistent.
#
class ActiveRecordCachingWrapper < ActiveRecordWrapper

  attr_reader :model, :timestamp_field, :expire

  # model   - the ActiveRecord class to wrap.
  # options - :timeout is consumed here (default 12.hours); everything
  #           else is passed on to ActiveRecordWrapper#initialize.
  def initialize(model, options={})
    @expire = options.delete(:timeout) || 12.hours
    super(model, options)
  end

  # Destroys expired cache tokens, then runs the lookup implemented by
  # ActiveRecordWrapper#find.  That method calls back into the next_set
  # and select_partial defined below, so paging is served from the cache.
  def find(selector, options={})
    sweep_cache
    super
  end

  protected

  # Serve the page identified by token_string.  Returns a PartialResult
  # while further pages remain, otherwise just the final page's records.
  #
  # Raises OAI::ResumptionTokenException when no limit is configured,
  # since resumption tokens are only handed out when paginating.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    # token.last is the zero-based page number (see the :offset below),
    # so this asks whether anything is left after the requested page.
    if token.last * @limit + @limit < total
      select_partial(token)
    else
      select_partial(token).records
    end
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  def select_partial(token)
    if 0 == token.last
      oaitoken = OaiToken.find_or_create_by_token(token.to_s)
      if oaitoken.new_record_before_save?
        # First request for this token: copy the id of every matching
        # model row into the entries table under the token's id.
        OaiToken.connection.execute("insert into " +
          "#{OaiEntry.table_name} (oai_token_id, record_id) " +
          "select #{oaitoken.id}, id from #{model.table_name} where " +
          "#{OaiToken.sanitize_sql(token_conditions(token))}")
      end
    end

    oaitoken = OaiToken.find_by_token(token.to_s)

    # No cached token row for this string (for example, already swept).
    raise OAI::ResumptionTokenException.new unless oaitoken

    PartialResult.new(
      hydrate_records(oaitoken.entries.find(:all, :limit => @limit,
        :offset => token.last * @limit)), token.next(token.last + 1)
    )
  end

  # Destroy every cached token created more than +expire+ ago; the
  # :dependent => :destroy association removes its entries as well.
  def sweep_cache
    OaiToken.destroy_all(["created_at < ?", Time.now - expire])
  end

  # Load the wrapped model's rows for the given cache entries.
  def hydrate_records(records)
    model.find(records.collect {|r| r.record_id })
  end

  # SQL conditions for the token's options only; unlike the parent class
  # no "last id" constraint is appended.
  def token_conditions(token)
    sql_conditions token.to_conditions_hash
  end

  private

  # Expiry time (UTC) for something created at +creation+.  The value is
  # round-tripped through a "%Y-%m-%d %H:%M:%S" string before +expire+ is
  # added.
  def expires_at(creation)
    created = Time.parse(creation.strftime("%Y-%m-%d %H:%M:%S"))
    created.utc + expire
  end

end
|
135
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'active_record'
|
2
|
+
|
3
|
+
module OAI::Provider
|
4
|
+
# = OAI::Provider::ActiveRecordWrapper
#
# This class wraps an ActiveRecord model and delegates all of the record
# selection/retrieval to the AR model.  It accepts options for specifying
# the update timestamp field (:timestamp_field, default 'updated_at') and
# a limit (:limit).  The limit option is used for doing pagination with
# resumption tokens.  No expiration timeout is needed, since all necessary
# information is encoded in the token; any other option is rejected.
#
class ActiveRecordWrapper < Model

  attr_reader :model, :timestamp_field

  # model   - the ActiveRecord class to wrap.
  # options - :timestamp_field and :limit, as described above.
  #
  # Raises ArgumentError if any other option key is supplied.
  def initialize(model, options={})
    @model = model
    @timestamp_field = options.delete(:timestamp_field) || 'updated_at'
    @limit = options.delete(:limit)

    unless options.empty?
      # Report the offending keys; a Hash has no #join of its own.
      raise ::ArgumentError,
        "Unsupported options [#{options.keys.join(', ')}]"
    end
  end

  # Timestamp of the row that sorts first by the timestamp field.
  def earliest
    model.find(:first,
      :order => "#{timestamp_field} asc").send(timestamp_field)
  end

  # Timestamp of the row that sorts last by the timestamp field.
  def latest
    model.find(:first,
      :order => "#{timestamp_field} desc").send(timestamp_field)
  end

  # A model class is expected to provide a method Model.sets that
  # returns all the sets the model supports.  See the
  # activerecord_provider tests for an example.  Returns nil when the
  # model has no such method.
  def sets
    model.sets if model.respond_to?(:sets)
  end

  # Look up records.  With options[:resumption_token] the request is a
  # continuation and goes to next_set.  Otherwise +selector+ is either
  # :all or something model.find accepts as its first argument; an :all
  # query with more rows than the limit returns a PartialResult.
  def find(selector, options={})
    return next_set(options[:resumption_token]) if options[:resumption_token]
    conditions = sql_conditions(options)

    if :all == selector
      total = model.count(:id, :conditions => conditions)
      if @limit && total > @limit
        select_partial(ResumptionToken.new(options.merge({:last => 0})))
      else
        model.find(:all, :conditions => conditions)
      end
    else
      model.find(selector, :conditions => conditions)
    end
  end

  # Returns record.deleted_at if the record has that method, else
  # record.deleted if it has that one, else false.
  def deleted?(record)
    if record.respond_to?(:deleted_at)
      return record.deleted_at
    elsif record.respond_to?(:deleted)
      return record.deleted
    end
    false
  end

  protected

  # Request the next set in this sequence.
  #
  # Raises OAI::ResumptionTokenException when no limit is configured.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    if @limit < total
      select_partial(token)
    else # end of result set
      model.find(:all,
        :conditions => token_conditions(token),
        :limit => @limit, :order => "#{model.primary_key} asc")
    end
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  #
  # Raises OAI::ResumptionTokenException when the token matches no rows.
  def select_partial(token)
    records = model.find(:all,
      :conditions => token_conditions(token),
      :limit => @limit,
      :order => "#{model.primary_key} asc")

    # An empty array is truthy, so test for emptiness explicitly; without
    # this the records.last call below would be sent to nil.
    raise OAI::ResumptionTokenException.new if records.nil? || records.empty?

    offset = records.last.send(model.primary_key.to_sym)

    PartialResult.new(records, token.next(offset))
  end

  # build a sql conditions statement from the content
  # of a resumption token.  It is very important not to
  # miss any changes as records may change scope as the
  # harvest is in progress.  To avoid losing any changes
  # the last 'id' of the previous set is used as the
  # filter to the next set.
  def token_conditions(token)
    last = token.last
    sql = sql_conditions token.to_conditions_hash

    return sql if 0 == last
    # Now add last id constraint
    sql[0] << " AND #{model.primary_key} > ?"
    sql << last

    return sql
  end

  # build a sql conditions statement from an OAI options hash
  #
  # Returns an array: the SQL fragment with ? placeholders first, then
  # the values to bind (from, until and, when given, set).
  def sql_conditions(opts)
    clauses = ["#{timestamp_field} >= ?", "#{timestamp_field} <= ?"]
    values = [opts[:from].localtime, opts[:until].localtime]

    if opts[:set]
      clauses << "set = ?"
      values << opts[:set]
    end

    [clauses.join(" AND ")] + values
  end

end
|
136
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module OAI::Provider
|
2
|
+
# = OAI::Provider::PartialResult
#
# PartialResult is used for returning a set/page of results from a model
# that supports resumption tokens.  It should contain an array of
# records, and a resumption token for getting the next set/page.
#
class PartialResult
  attr_reader :records
  attr_reader :token

  # records - the records making up this page.
  # token   - the resumption token for the following page (default nil).
  def initialize(records, token = nil)
    @records, @token = records, token
  end

end
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'builder' unless defined?(Builder)
|
2
|
+
|
3
|
+
module OAI
|
4
|
+
module Provider
|
5
|
+
module Response
|
6
|
+
|
7
|
+
# Shared behaviour for provider responses: request options are normalised
# and validated when the object is built, and #response wraps a
# verb-specific body in the common OAI-PMH XML envelope.
#
# Subclasses declare the request arguments they understand with the
# class-level macros valid_parameters, required_parameters and
# default_parameters.
class Base
  attr_reader :provider, :options

  class << self
    attr_reader :valid_options, :default_options, :required_options

    # Add +args+ to the option keys this response accepts.
    # Returns the updated, de-duplicated list.
    def valid_parameters(*args)
      @valid_options = ((@valid_options || []) + args).uniq
    end

    # Merge +options+ into the defaults applied to missing options.  A
    # value that responds to #call is invoked with the response instance
    # when the default is needed.
    def default_parameters(options = {})
      @default_options ||= {}
      @default_options.merge!(options)
    end

    # Add +args+ to the option keys that must be present; required keys
    # are registered as valid keys too.
    def required_parameters(*args)
      valid_parameters(*args)
      @required_options = ((@required_options || []) + args).uniq
    end

  end

  # provider - object the response is built for; #response reads its url.
  # options  - request arguments keyed OAI style (e.g. 'metadataPrefix')
  #            or already in internal form (:metadata_prefix).
  #
  # Raises OAI::ArgumentException when the options fail validation.
  def initialize(provider, options = {})
    @provider = provider
    @options = internalize(options)
    raise OAI::ArgumentException.new unless valid?
  end

  # Build the OAI-PMH envelope (XML declaration, root element,
  # responseDate and request elements) and yield the builder so the
  # caller can append the verb-specific content inside the root element.
  def response
    @builder = Builder::XmlMarkup.new
    @builder.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"
    @builder.tag!('OAI-PMH', header) do
      @builder.responseDate Time.now.utc.xmlschema
      @builder.request(provider.url, options)
      yield @builder
    end
  end

  private

  # Namespace and schema attributes for the OAI-PMH root element.
  def header
    {
      'xmlns' => "http://www.openarchives.org/OAI/2.0/",
      'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
      'xsi:schemaLocation' => "http://www.openarchives.org/OAI/2.0/\n" \
        "http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"
    }
  end

  # Check @options against the class-level declarations and fill in
  # defaults.  Returns true or false; may raise OAI::ArgumentException
  # via resumption?.
  def valid?
    # A lone resumption token is accepted as is, without defaults.
    return true if resumption?

    return true if self.class.valid_options.nil? && options.empty?

    required = self.class.required_options || []
    return false unless (required - @options.keys).empty?

    # A class that declares nothing accepts nothing.
    allowed = self.class.valid_options || []
    return false unless (@options.keys - allowed).empty?

    populate_defaults
    true
  end

  # Fill every missing (nil/false) option that has a declared default.
  def populate_defaults
    (self.class.default_options || {}).each do |key, default|
      next if @options[key]
      @options[key] = default.respond_to?(:call) ? default.call(self) : default
    end
  end

  # True when the only option is :resumption_token.
  #
  # Raises OAI::ArgumentException when a resumption token is combined
  # with any other option.
  def resumption?
    return false unless @options.key?(:resumption_token)
    return true if 1 == @options.size
    raise OAI::ArgumentException.new
  end

  # Convert our internal representations back into standard OAI options
  # (:metadata_prefix becomes "metadataPrefix").
  def externalize(value)
    value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
  end

  # Turn a date string into a UTC time.  Values that already respond to
  # strftime are returned untouched.
  #
  # Raises OAI::ArgumentException when the string cannot be parsed.
  def parse_date(value)
    return value if value.respond_to?(:strftime)

    # Oddly Chronic doesn't parse an UTC encoded datetime.
    # Luckily Time does
    dt = Chronic.parse(value) || Time.parse(value)
    raise OAI::ArgumentException.new unless dt

    dt.utc
  rescue ::ArgumentError
    # Time.parse reports unparsable input by raising rather than
    # returning nil; surface it as the exception used for bad arguments
    # everywhere else in this class.
    raise OAI::ArgumentException.new
  end

  # Copy +hash+ with keys converted from OAI style ('metadataPrefix') to
  # internal symbols (:metadata_prefix) and values duplicated.
  def internalize(hash = {})
    internal = {}
    hash.keys.each do |key|
      internal[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = hash[key].dup
    end

    # Convert date formatted strings into internal time values.
    internal[:from] = parse_date(internal[:from]) if internal[:from]
    internal[:until] = parse_date(internal[:until]) if internal[:until]

    internal
  end

end
|
116
|
+
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module OAI::Provider::Response
|
2
|
+
|
3
|
+
# Response for a single-record request: requires an :identifier option,
# looks the record up through the provider's model and renders it inside
# a GetRecord element.
class GetRecord < RecordResponse
  required_parameters :identifier

  # Render the response XML.  The :identifier option is removed from
  # options before the remaining options are handed to the model.
  #
  # Raises OAI::IdException when the model lookup returns nil or false.
  def to_xml
    record_id = extract_identifier(options.delete(:identifier))
    record = provider.model.find(record_id, options)
    raise OAI::IdException.new unless record

    response do |xml|
      xml.GetRecord do
        xml.record do
          header_for record
          # Deleted records get a header only.
          data_for record unless deleted?(record)
        end
      end
    end
  end

  private

  # Strip the first occurrence of "<provider prefix>/" from +id+.
  def extract_identifier(id)
    prefix_with_slash = "#{provider.prefix}/"
    id.sub(prefix_with_slash, '')
  end

end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
|