oai 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (80) hide show
  1. data/README +80 -0
  2. data/Rakefile +113 -0
  3. data/bin/oai +68 -0
  4. data/examples/models/file_model.rb +63 -0
  5. data/examples/providers/dublin_core.rb +474 -0
  6. data/lib/oai.rb +7 -13
  7. data/lib/oai/client.rb +133 -83
  8. data/lib/oai/{get_record.rb → client/get_record.rb} +0 -0
  9. data/lib/oai/{header.rb → client/header.rb} +2 -2
  10. data/lib/oai/{identify.rb → client/identify.rb} +0 -0
  11. data/lib/oai/{list_identifiers.rb → client/list_identifiers.rb} +0 -0
  12. data/lib/oai/{list_metadata_formats.rb → client/list_metadata_formats.rb} +0 -0
  13. data/lib/oai/{list_records.rb → client/list_records.rb} +0 -0
  14. data/lib/oai/{list_sets.rb → client/list_sets.rb} +1 -1
  15. data/lib/oai/{metadata_format.rb → client/metadata_format.rb} +0 -0
  16. data/lib/oai/{record.rb → client/record.rb} +0 -0
  17. data/lib/oai/{response.rb → client/response.rb} +1 -1
  18. data/lib/oai/constants.rb +34 -0
  19. data/lib/oai/exception.rb +72 -1
  20. data/lib/oai/harvester.rb +38 -0
  21. data/lib/oai/harvester/config.rb +41 -0
  22. data/lib/oai/harvester/harvest.rb +144 -0
  23. data/lib/oai/harvester/logging.rb +70 -0
  24. data/lib/oai/harvester/mailer.rb +17 -0
  25. data/lib/oai/harvester/shell.rb +334 -0
  26. data/lib/oai/provider.rb +300 -0
  27. data/lib/oai/provider/metadata_format.rb +72 -0
  28. data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
  29. data/lib/oai/provider/model.rb +71 -0
  30. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +135 -0
  31. data/lib/oai/provider/model/activerecord_wrapper.rb +136 -0
  32. data/lib/oai/provider/partial_result.rb +18 -0
  33. data/lib/oai/provider/response.rb +119 -0
  34. data/lib/oai/provider/response/error.rb +16 -0
  35. data/lib/oai/provider/response/get_record.rb +32 -0
  36. data/lib/oai/provider/response/identify.rb +24 -0
  37. data/lib/oai/provider/response/list_identifiers.rb +29 -0
  38. data/lib/oai/provider/response/list_metadata_formats.rb +21 -0
  39. data/lib/oai/provider/response/list_records.rb +32 -0
  40. data/lib/oai/provider/response/list_sets.rb +23 -0
  41. data/lib/oai/provider/response/record_response.rb +68 -0
  42. data/lib/oai/provider/resumption_token.rb +106 -0
  43. data/lib/oai/set.rb +14 -5
  44. data/test/activerecord_provider/config/connection.rb +5 -0
  45. data/test/activerecord_provider/config/database.yml +6 -0
  46. data/test/activerecord_provider/database/ar_migration.rb +59 -0
  47. data/test/activerecord_provider/database/oaipmhtest +0 -0
  48. data/test/activerecord_provider/fixtures/dc.yml +1501 -0
  49. data/test/activerecord_provider/helpers/providers.rb +44 -0
  50. data/test/activerecord_provider/helpers/set_provider.rb +36 -0
  51. data/test/activerecord_provider/models/dc_field.rb +7 -0
  52. data/test/activerecord_provider/models/dc_set.rb +6 -0
  53. data/test/activerecord_provider/models/oai_token.rb +3 -0
  54. data/test/activerecord_provider/tc_ar_provider.rb +93 -0
  55. data/test/activerecord_provider/tc_ar_sets_provider.rb +66 -0
  56. data/test/activerecord_provider/tc_caching_paging_provider.rb +53 -0
  57. data/test/activerecord_provider/tc_simple_paging_provider.rb +55 -0
  58. data/test/activerecord_provider/test_helper.rb +4 -0
  59. data/test/client/helpers/provider.rb +68 -0
  60. data/test/client/helpers/test_wrapper.rb +11 -0
  61. data/test/client/tc_exception.rb +36 -0
  62. data/test/{tc_get_record.rb → client/tc_get_record.rb} +11 -7
  63. data/test/client/tc_identify.rb +13 -0
  64. data/test/{tc_libxml.rb → client/tc_libxml.rb} +20 -10
  65. data/test/{tc_list_identifiers.rb → client/tc_list_identifiers.rb} +10 -8
  66. data/test/{tc_list_metadata_formats.rb → client/tc_list_metadata_formats.rb} +4 -1
  67. data/test/{tc_list_records.rb → client/tc_list_records.rb} +4 -1
  68. data/test/{tc_list_sets.rb → client/tc_list_sets.rb} +4 -2
  69. data/test/{tc_xpath.rb → client/tc_xpath.rb} +1 -1
  70. data/test/client/test_helper.rb +5 -0
  71. data/test/provider/models.rb +230 -0
  72. data/test/provider/tc_exceptions.rb +63 -0
  73. data/test/provider/tc_functional_tokens.rb +42 -0
  74. data/test/provider/tc_provider.rb +69 -0
  75. data/test/provider/tc_resumption_tokens.rb +46 -0
  76. data/test/provider/tc_simple_provider.rb +85 -0
  77. data/test/provider/test_helper.rb +36 -0
  78. metadata +123 -27
  79. data/test/tc_exception.rb +0 -38
  80. data/test/tc_identify.rb +0 -8
@@ -0,0 +1,135 @@
1
+ require 'active_record'
2
+
3
+ module OAI::Provider
4
+
5
# ActiveRecord model used by the caching wrapper. Each row stores one
# resumption token string (unique) and owns the OaiEntry rows that were
# cached for it; destroying the token destroys its entries.
class OaiToken < ActiveRecord::Base
  has_many :entries,
    :class_name => 'OaiEntry',
    :order      => "record_id",
    :dependent  => :destroy

  validates_uniqueness_of :token

  # Make sanitize_sql a public method so the caching wrapper can call it
  # while building its raw "insert ... select" statement.
  public

  def self.sanitize_sql(*args)
    super(*args)
  end

  # Returns the @new_record_before_save instance variable. It is not
  # assigned anywhere in this class.
  # NOTE(review): presumably set by ActiveRecord while saving - confirm
  # against the ActiveRecord version in use.
  def new_record_before_save?
    @new_record_before_save
  end

end
24
+
25
# ActiveRecord model used by the caching wrapper. Each row links one
# cached token (oai_token) to the id of a record in the wrapped model
# (record_id).
class OaiEntry < ActiveRecord::Base
  belongs_to :oai_token

  # NOTE(review): rows are written by the caching wrapper with a raw SQL
  # insert, so this validation is not exercised on that path.
  validates_uniqueness_of :record_id, :scope => :oai_token
end
31
+
32
# = OAI::Provider::ActiveRecordCachingWrapper
#
# This class wraps an ActiveRecord model and delegates all of the record
# selection/retrieval to the AR model. On top of the options understood
# by ActiveRecordWrapper it accepts a :timeout option, which is used to
# expire old tokens from the cache. Default timeout is 12 hours. The
# :limit option is used for doing pagination with resumption tokens.
#
# The difference between ActiveRecordWrapper and this class is how the
# pagination is accomplished. ActiveRecordWrapper encodes all the
# information in the token. That approach should work 99% of the time.
# If you have an extremely active repository you may want to consider
# the caching wrapper. The caching wrapper takes the ids of the entire
# result set of a request and caches them in other database tables
# (OaiToken and OaiEntry), so the pages returned to the client are
# always taken from the same internally consistent set of ids.
#
class ActiveRecordCachingWrapper < ActiveRecordWrapper

  attr_reader :model, :timestamp_field, :expire

  # model   - the ActiveRecord model class to wrap.
  # options - :timeout (cache lifetime, defaults to 12 hours) plus
  #           whatever ActiveRecordWrapper#initialize accepts.
  def initialize(model, options={})
    # Work on a copy so the caller's hash is not stripped of its keys.
    options = options.dup
    @expire = options.delete(:timeout) || 12.hours
    super(model, options)
  end

  # Expires stale cached tokens, then performs the same lookup as
  # ActiveRecordWrapper#find. The parent implementation dispatches to
  # next_set/select_partial, which are overridden below, so paging goes
  # through the cache tables.
  def find(selector, options={})
    sweep_cache
    super
  end

  protected

  # Returns the page identified by token_string: a PartialResult while
  # more pages follow, or the bare array of records for the last page.
  # Raises OAI::ResumptionTokenException when no limit is configured.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    page = select_partial(token)
    more_pages = token.last * @limit + @limit < total
    more_pages ? page : page.records
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  def select_partial(token)
    if 0 == token.last
      # First page: cache the ids of every matching record under this
      # token, unless the token row already existed.
      oaitoken = OaiToken.find_or_create_by_token(token.to_s)
      if oaitoken.new_record_before_save?
        OaiToken.connection.execute("insert into " +
          "#{OaiEntry.table_name} (oai_token_id, record_id) " +
          "select #{oaitoken.id}, id from #{model.table_name} where " +
          "#{OaiToken.sanitize_sql(token_conditions(token))}")
      end
    end

    oaitoken = OaiToken.find_by_token(token.to_s)

    # Unknown token (for example one removed by sweep_cache).
    raise OAI::ResumptionTokenException.new unless oaitoken

    # Here token.last is a page index, hence the multiplication.
    entries = oaitoken.entries.find(:all,
      :limit  => @limit,
      :offset => token.last * @limit)

    PartialResult.new(hydrate_records(entries), token.next(token.last + 1))
  end

  # Destroys every cached token created more than `expire` ago.
  def sweep_cache
    OaiToken.destroy_all(["created_at < ?", Time.now - expire])
  end

  # Loads the wrapped model's records for the given cache entries.
  def hydrate_records(records)
    model.find(records.collect { |entry| entry.record_id })
  end

  # Conditions for the request encoded in the token. Unlike the parent
  # class, no "last id" constraint is added.
  def token_conditions(token)
    sql_conditions token.to_conditions_hash
  end

  private

  # Returns the UTC time at which something created at `creation`
  # expires. Not called from within this class.
  def expires_at(creation)
    created = Time.parse(creation.strftime("%Y-%m-%d %H:%M:%S"))
    created.utc + expire
  end

end
135
+ end
@@ -0,0 +1,136 @@
1
+ require 'active_record'
2
+
3
+ module OAI::Provider
4
# = OAI::Provider::ActiveRecordWrapper
#
# This class wraps an ActiveRecord model and delegates all of the record
# selection/retrieval to the AR model. It accepts options for specifying
# the update timestamp field (:timestamp_field, default 'updated_at')
# and a limit (:limit). The limit option is used for doing pagination
# with resumption tokens. No expiration timeout is needed, since all
# necessary information is encoded in the token; any other option is
# rejected.
#
class ActiveRecordWrapper < Model

  attr_reader :model, :timestamp_field

  # model   - the ActiveRecord model class to wrap.
  # options - :timestamp_field and :limit, see above.
  #
  # Raises ArgumentError when an unsupported option is passed.
  def initialize(model, options={})
    # Work on a copy so the caller's hash is not stripped of its keys.
    options = options.dup
    @model = model
    @timestamp_field = options.delete(:timestamp_field) || 'updated_at'
    @limit = options.delete(:limit)

    unless options.empty?
      # Hash has no #join, so the message is built from the keys. This is
      # a constructor misuse rather than an OAI protocol error, hence the
      # standard ArgumentError.
      raise ArgumentError.new(
        "Unsupported options [#{options.keys.join(', ')}]"
      )
    end
  end

  # Timestamp of the least recently updated record.
  def earliest
    model.find(:first,
      :order => "#{timestamp_field} asc").send(timestamp_field)
  end

  # Timestamp of the most recently updated record.
  def latest
    model.find(:first,
      :order => "#{timestamp_field} desc").send(timestamp_field)
  end

  # A model class is expected to provide a method Model.sets that
  # returns all the sets the model supports. See the
  # activerecord_provider tests for an example. Returns nil when the
  # model does not respond to :sets.
  def sets
    model.sets if model.respond_to?(:sets)
  end

  # Finds records on the wrapped model.
  #
  # selector - :all, or anything else model.find accepts as its first
  #            argument.
  # options  - OAI options; :from and :until are required by
  #            sql_conditions, :set and :resumption_token are optional.
  #
  # For :all with more matches than the configured limit a PartialResult
  # is returned; otherwise whatever model.find returns.
  def find(selector, options={})
    return next_set(options[:resumption_token]) if options[:resumption_token]
    conditions = sql_conditions(options)

    if :all == selector
      total = model.count(:id, :conditions => conditions)
      if @limit && total > @limit
        select_partial(ResumptionToken.new(options.merge({:last => 0})))
      else
        model.find(:all, :conditions => conditions)
      end
    else
      model.find(selector, :conditions => conditions)
    end
  end

  # Returns record.deleted_at or record.deleted when the record responds
  # to one of them (checked in that order), false otherwise.
  def deleted?(record)
    if record.respond_to?(:deleted_at)
      return record.deleted_at
    elsif record.respond_to?(:deleted)
      return record.deleted
    end
    false
  end

  protected

  # Request the next set in this sequence.
  # Raises OAI::ResumptionTokenException when no limit is configured.
  def next_set(token_string)
    raise OAI::ResumptionTokenException.new unless @limit

    token = ResumptionToken.parse(token_string)
    total = model.count(:id, :conditions => token_conditions(token))

    if @limit < total
      select_partial(token)
    else # end of result set
      model.find(:all,
        :conditions => token_conditions(token),
        :limit => @limit, :order => "#{model.primary_key} asc")
    end
  end

  # select a subset of the result set, and return it with a
  # resumption token to get the next subset
  def select_partial(token)
    records = model.find(:all,
      :conditions => token_conditions(token),
      :limit => @limit,
      :order => "#{model.primary_key} asc")

    # find(:all) returns an array, so an exhausted or invalid token shows
    # up as an empty array rather than nil.
    raise OAI::ResumptionTokenException.new if records.nil? || records.empty?

    # The primary key of the last record becomes the lower bound of the
    # next page (see token_conditions).
    offset = records.last.send(model.primary_key.to_sym)

    PartialResult.new(records, token.next(offset))
  end

  # build a sql conditions statement from the content
  # of a resumption token. It is very important not to
  # miss any changes as records may change scope as the
  # harvest is in progress. To avoid losing any changes
  # the last 'id' of the previous set is used as the
  # filter to the next set.
  def token_conditions(token)
    last = token.last
    sql = sql_conditions token.to_conditions_hash

    return sql if 0 == last
    # Now add last id constraint
    sql[0] << " AND #{model.primary_key} > ?"
    sql << last

    return sql
  end

  # build a sql conditions statement from an OAI options hash.
  # Returns an array: the SQL fragment followed by its bind values.
  # opts[:from] and opts[:until] must respond to #localtime.
  def sql_conditions(opts)
    sql = []
    sql << "#{timestamp_field} >= ?" << "#{timestamp_field} <= ?"
    sql << "set = ?" if opts[:set]

    esc_values = [sql.join(" AND ")]
    esc_values << opts[:from].localtime << opts[:until].localtime
    esc_values << opts[:set] if opts[:set]

    return esc_values
  end

end
136
+ end
@@ -0,0 +1,18 @@
1
+ module OAI::Provider
2
# = OAI::Provider::PartialResult
#
# Value holder for one set/page of results returned by a model that
# supports resumption tokens. It carries the records of the page and,
# optionally, the resumption token for getting the next set/page.
#
class PartialResult
  attr_reader :records, :token

  # records - the records making up this page.
  # token   - resumption token for the following page (defaults to nil).
  def initialize(records, token = nil)
    @records, @token = records, token
  end

end
17
+
18
+ end
@@ -0,0 +1,119 @@
1
+ require 'builder' unless defined?(Builder)
2
+
3
+ module OAI
4
+ module Provider
5
+ module Response
6
+
7
# Common behaviour of all provider responses: normalises the request
# options, validates them against the parameters declared by the
# concrete response class, and renders the OAI-PMH envelope.
#
# Subclasses declare their parameters with the class-level macros
# valid_parameters, required_parameters and default_parameters.
class Base
  attr_reader :provider, :options

  class << self
    attr_reader :valid_options, :default_options, :required_options

    # Declares option names (internal, underscored symbols) that a
    # request may carry.
    def valid_parameters(*args)
      @valid_options ||= []
      @valid_options = (@valid_options + args.dup).uniq
    end

    # Declares default values for options. A default that responds to
    # #call is invoked with the response instance when it is applied.
    def default_parameters(options = {})
      @default_options ||= {}
      @default_options.merge! options.dup
    end

    # Declares option names that must be present; they are registered
    # as valid options as well.
    def required_parameters(*args)
      valid_parameters(*args)
      @required_options ||= []
      @required_options = (@required_options + args.dup).uniq
    end

  end

  # provider - the provider serving the request; #response reads its url.
  # options  - request options keyed by OAI names (e.g. metadataPrefix).
  #
  # Raises OAI::ArgumentException when the options are invalid.
  def initialize(provider, options = {})
    @provider = provider
    @options = internalize(options)
    raise OAI::ArgumentException.new unless valid?
  end

  # Builds the OAI-PMH envelope (XML declaration, root element,
  # responseDate and request) and yields the builder so the caller can
  # add the verb specific content.
  def response
    @builder = Builder::XmlMarkup.new
    @builder.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"
    @builder.tag!('OAI-PMH', header) do
      @builder.responseDate Time.now.utc.xmlschema
      @builder.request(provider.url, options)
      yield @builder
    end
  end

  private

  # Namespace attributes of the OAI-PMH root element.
  def header
    {
      'xmlns' => "http://www.openarchives.org/OAI/2.0/",
      'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
      'xsi:schemaLocation' => %{http://www.openarchives.org/OAI/2.0/
        http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd}
    }
  end

  # Checks @options against the declared parameters and fills in the
  # defaults. Raises OAI::ArgumentException (via resumption?) when a
  # resumption token is combined with other options.
  def valid?
    return true if resumption?

    return true if self.class.valid_options.nil? && options.empty?

    # A response class may declare none of these, in which case the
    # readers return nil; treat that as "nothing declared".
    allowed  = self.class.valid_options || []
    required = self.class.required_options || []

    return false unless (required - @options.keys).empty?
    return false unless (@options.keys - allowed).empty?

    populate_defaults
    true
  end

  # Applies the declared defaults to every option that is not set.
  def populate_defaults
    (self.class.default_options || {}).each do |key, default|
      next if @options[key]
      @options[key] = default.respond_to?(:call) ? default.call(self) : default
    end
  end

  # True when the request consists of a resumption token only. A token
  # accompanied by any other option is an error.
  def resumption?
    return false unless @options.key?(:resumption_token)
    return true if 1 == @options.size

    raise OAI::ArgumentException.new
  end

  # Convert our internal representations back into standard OAI options
  # (e.g. metadata_prefix => "metadataPrefix").
  def externalize(value)
    value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
  end

  # Returns value unchanged when it already responds to #strftime,
  # otherwise parses it and returns the result converted to UTC.
  #
  # Raises OAI::ArgumentException when the value cannot be parsed.
  def parse_date(value)
    return value if value.respond_to?(:strftime)

    # Oddly Chronic doesn't parse an UTC encoded datetime.
    # Luckily Time does
    parsed = begin
      Chronic.parse(value) || Time.parse(value)
    rescue ::ArgumentError
      # Time.parse raises instead of returning nil for garbage input.
      nil
    end
    raise OAI::ArgumentException.new unless parsed

    parsed.utc
  end

  # Returns a copy of hash with the OAI option names converted to
  # underscored symbols (metadataPrefix => :metadata_prefix) and the
  # values duplicated.
  def internalize(hash = {})
    internal = {}
    hash.keys.each do |key|
      internal[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = hash[key].dup
    end

    # Convert date formatted strings into internal time values
    internal[:from] = parse_date(internal[:from]) if internal[:from]
    internal[:until] = parse_date(internal[:until]) if internal[:until]

    internal
  end

end
116
+
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,16 @@
1
+ module OAI::Provider::Response
2
# Response that reports an error to the client inside the standard
# OAI-PMH envelope.
class Error < Base

  # provider - passed on to Base#initialize (without any options).
  # error    - the error to report; #to_xml calls #to_s and #code on it.
  def initialize(provider, error)
    super(provider)
    @error = error
  end

  # Renders the envelope with a single <error> element whose text is the
  # error's string form and whose code attribute is the error's code.
  def to_xml
    response { |xml| xml.error(@error.to_s, :code => @error.code) }
  end

end
16
+ end
@@ -0,0 +1,32 @@
1
+ module OAI::Provider::Response
2
+
3
# Response for the GetRecord verb: looks up a single record through the
# provider's model and renders it.
class GetRecord < RecordResponse
  required_parameters :identifier

  # Renders the record named by the :identifier option.
  #
  # The :identifier entry is removed from options before the remaining
  # options are handed to the model's find.
  #
  # Raises OAI::IdException when the model returns nothing for the id.
  def to_xml
    id = extract_identifier(options.delete(:identifier))
    record = provider.model.find(id, options)
    raise OAI::IdException.new unless record

    # header_for, data_for and deleted? are not defined in this class.
    # NOTE(review): presumably inherited from RecordResponse - confirm.
    response do |xml|
      xml.GetRecord do
        xml.record do
          header_for record
          data_for record unless deleted?(record)
        end
      end
    end
  end

  private

  # Strips the first occurrence of "<provider prefix>/" from the
  # identifier, leaving the id that is passed to the model.
  def extract_identifier(id)
    id.sub("#{provider.prefix}/", '')
  end

end
29
+
30
+ end
31
+
32
+