oai 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. data/README +80 -0
  2. data/Rakefile +113 -0
  3. data/bin/oai +68 -0
  4. data/examples/models/file_model.rb +63 -0
  5. data/examples/providers/dublin_core.rb +474 -0
  6. data/lib/oai.rb +7 -13
  7. data/lib/oai/client.rb +133 -83
  8. data/lib/oai/{get_record.rb → client/get_record.rb} +0 -0
  9. data/lib/oai/{header.rb → client/header.rb} +2 -2
  10. data/lib/oai/{identify.rb → client/identify.rb} +0 -0
  11. data/lib/oai/{list_identifiers.rb → client/list_identifiers.rb} +0 -0
  12. data/lib/oai/{list_metadata_formats.rb → client/list_metadata_formats.rb} +0 -0
  13. data/lib/oai/{list_records.rb → client/list_records.rb} +0 -0
  14. data/lib/oai/{list_sets.rb → client/list_sets.rb} +1 -1
  15. data/lib/oai/{metadata_format.rb → client/metadata_format.rb} +0 -0
  16. data/lib/oai/{record.rb → client/record.rb} +0 -0
  17. data/lib/oai/{response.rb → client/response.rb} +1 -1
  18. data/lib/oai/constants.rb +34 -0
  19. data/lib/oai/exception.rb +72 -1
  20. data/lib/oai/harvester.rb +38 -0
  21. data/lib/oai/harvester/config.rb +41 -0
  22. data/lib/oai/harvester/harvest.rb +144 -0
  23. data/lib/oai/harvester/logging.rb +70 -0
  24. data/lib/oai/harvester/mailer.rb +17 -0
  25. data/lib/oai/harvester/shell.rb +334 -0
  26. data/lib/oai/provider.rb +300 -0
  27. data/lib/oai/provider/metadata_format.rb +72 -0
  28. data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
  29. data/lib/oai/provider/model.rb +71 -0
  30. data/lib/oai/provider/model/activerecord_caching_wrapper.rb +135 -0
  31. data/lib/oai/provider/model/activerecord_wrapper.rb +136 -0
  32. data/lib/oai/provider/partial_result.rb +18 -0
  33. data/lib/oai/provider/response.rb +119 -0
  34. data/lib/oai/provider/response/error.rb +16 -0
  35. data/lib/oai/provider/response/get_record.rb +32 -0
  36. data/lib/oai/provider/response/identify.rb +24 -0
  37. data/lib/oai/provider/response/list_identifiers.rb +29 -0
  38. data/lib/oai/provider/response/list_metadata_formats.rb +21 -0
  39. data/lib/oai/provider/response/list_records.rb +32 -0
  40. data/lib/oai/provider/response/list_sets.rb +23 -0
  41. data/lib/oai/provider/response/record_response.rb +68 -0
  42. data/lib/oai/provider/resumption_token.rb +106 -0
  43. data/lib/oai/set.rb +14 -5
  44. data/test/activerecord_provider/config/connection.rb +5 -0
  45. data/test/activerecord_provider/config/database.yml +6 -0
  46. data/test/activerecord_provider/database/ar_migration.rb +59 -0
  47. data/test/activerecord_provider/database/oaipmhtest +0 -0
  48. data/test/activerecord_provider/fixtures/dc.yml +1501 -0
  49. data/test/activerecord_provider/helpers/providers.rb +44 -0
  50. data/test/activerecord_provider/helpers/set_provider.rb +36 -0
  51. data/test/activerecord_provider/models/dc_field.rb +7 -0
  52. data/test/activerecord_provider/models/dc_set.rb +6 -0
  53. data/test/activerecord_provider/models/oai_token.rb +3 -0
  54. data/test/activerecord_provider/tc_ar_provider.rb +93 -0
  55. data/test/activerecord_provider/tc_ar_sets_provider.rb +66 -0
  56. data/test/activerecord_provider/tc_caching_paging_provider.rb +53 -0
  57. data/test/activerecord_provider/tc_simple_paging_provider.rb +55 -0
  58. data/test/activerecord_provider/test_helper.rb +4 -0
  59. data/test/client/helpers/provider.rb +68 -0
  60. data/test/client/helpers/test_wrapper.rb +11 -0
  61. data/test/client/tc_exception.rb +36 -0
  62. data/test/{tc_get_record.rb → client/tc_get_record.rb} +11 -7
  63. data/test/client/tc_identify.rb +13 -0
  64. data/test/{tc_libxml.rb → client/tc_libxml.rb} +20 -10
  65. data/test/{tc_list_identifiers.rb → client/tc_list_identifiers.rb} +10 -8
  66. data/test/{tc_list_metadata_formats.rb → client/tc_list_metadata_formats.rb} +4 -1
  67. data/test/{tc_list_records.rb → client/tc_list_records.rb} +4 -1
  68. data/test/{tc_list_sets.rb → client/tc_list_sets.rb} +4 -2
  69. data/test/{tc_xpath.rb → client/tc_xpath.rb} +1 -1
  70. data/test/client/test_helper.rb +5 -0
  71. data/test/provider/models.rb +230 -0
  72. data/test/provider/tc_exceptions.rb +63 -0
  73. data/test/provider/tc_functional_tokens.rb +42 -0
  74. data/test/provider/tc_provider.rb +69 -0
  75. data/test/provider/tc_resumption_tokens.rb +46 -0
  76. data/test/provider/tc_simple_provider.rb +85 -0
  77. data/test/provider/test_helper.rb +36 -0
  78. metadata +123 -27
  79. data/test/tc_exception.rb +0 -38
  80. data/test/tc_identify.rb +0 -8
@@ -0,0 +1,135 @@
1
+ require 'active_record'
2
+
3
+ module OAI::Provider
4
+
5
+ # ActiveRecord model class in support of the caching wrapper.
6
# ActiveRecord model in support of the caching wrapper. Each row is one
# cached resumption token and owns the entries cached under it.
class OaiToken < ActiveRecord::Base
  has_many :entries,
           :class_name => 'OaiEntry',
           :order      => "record_id",
           :dependent  => :destroy

  validates_uniqueness_of :token

  public

  # Expose sanitize_sql publicly so the caching wrapper can use it when
  # assembling its own SQL statement.
  def self.sanitize_sql(*args)
    super(*args)
  end

  # Returns the value of the @new_record_before_save instance variable.
  def new_record_before_save?
    @new_record_before_save
  end

end
24
+
25
+ # ActiveRecord model class in support of the caching wrapper.
26
# ActiveRecord model in support of the caching wrapper. Each row ties one
# record id to the OaiToken whose cached result set it belongs to.
class OaiEntry < ActiveRecord::Base
  belongs_to :oai_token

  # The uniqueness scope must name a column, so use the foreign key rather
  # than the association name.
  validates_uniqueness_of :record_id, :scope => :oai_token_id
end
31
+
32
+ # = OAI::Provider::ActiveRecordCachingWrapper
33
+ #
34
+ # This class wraps an ActiveRecord model and delegates all of the record
35
+ # selection/retrieval to the AR model. It accepts options for specifying
36
+ # the update timestamp field, a timeout, and a limit. The limit option
37
+ # is used for doing pagination with resumption tokens. The timeout is
38
+ # used to expire old tokens from the cache. Default timeout is 12 hours.
39
+ #
40
+ # The difference between ActiveRecordWrapper and this class is how the
41
+ # pagination is accomplished. ActiveRecordWrapper encodes all the
42
+ # information in the token. That approach should work 99% of the time.
43
+ # If you have an extremely active repository you may want to consider
44
+ # the caching wrapper. The caching wrapper takes the entire result set
45
+ # from a request and caches it in another database table, well tables
46
+ # actually. So the result returned to the client will always be
47
+ # internally consistent.
48
+ #
49
+ class ActiveRecordCachingWrapper < ActiveRecordWrapper
50
+
51
+ attr_reader :model, :timestamp_field, :expire
52
+
53
# Build a caching wrapper around +model+.
#
# The :timeout option (default 12 hours) is removed from +options+ and kept
# as the cache expiry; the remaining options are passed to the parent
# constructor.
def initialize(model, options={})
  timeout = options.delete(:timeout)
  @expire = timeout || 12.hours
  super(model, options)
end
57
+
58
# Look up records through the wrapped model.
#
# The token cache is swept on every call. A :resumption_token option
# short-circuits to next_set. For the :all selector, a result set larger
# than @limit is returned as the first partial page; otherwise the lookup
# is delegated straight to the model.
def find(selector, options={})
  sweep_cache
  return next_set(options[:resumption_token]) if options[:resumption_token]

  conditions = sql_conditions(options)
  return model.find(selector, :conditions => conditions) unless :all == selector

  total = model.count(:id, :conditions => conditions)
  return model.find(:all, :conditions => conditions) unless @limit && total > @limit

  first_page_token = ResumptionToken.new(options.merge({:last => 0}))
  select_partial(first_page_token)
end
76
+
77
+ protected
78
+
79
# Return the page of cached results addressed by +token_string+.
#
# Raises OAI::ResumptionTokenException when no limit is configured.
# Here token.last is used as a page index: while
# (last * limit + limit) is below the total count the page is returned
# as a PartialResult (records plus next token); otherwise only the records
# of that page are returned.
def next_set(token_string)
  # Fully qualified: the bare constant is not reachable from the
  # OAI::Provider lexical scope, and the parent class qualifies it too.
  raise OAI::ResumptionTokenException.new unless @limit

  token = ResumptionToken.parse(token_string)
  total = model.count(:id, :conditions => token_conditions(token))

  if token.last * @limit + @limit < total
    select_partial(token)
  else
    select_partial(token).records
  end
end
91
+
92
+ # select a subset of the result set, and return it with a
93
+ # resumption token to get the next subset
94
# Select a subset of the cached result set and return it with a
# resumption token for the next subset.
#
# On the first page (token.last == 0) the token row is found or created,
# and when it was newly created the matching record ids are copied into
# the entries table with a single INSERT ... SELECT.
#
# Raises OAI::ResumptionTokenException when no cached token row matches.
def select_partial(token)
  if 0 == token.last
    oaitoken = OaiToken.find_or_create_by_token(token.to_s)
    if oaitoken.new_record_before_save?
      OaiToken.connection.execute("insert into " +
        "#{OaiEntry.table_name} (oai_token_id, record_id) " +
        "select #{oaitoken.id}, id from #{model.table_name} where " +
        "#{OaiToken.sanitize_sql(token_conditions(token))}")
    end
  end

  oaitoken = OaiToken.find_by_token(token.to_s)

  # Fully qualified: the bare constant is not reachable from the
  # OAI::Provider lexical scope.
  raise OAI::ResumptionTokenException.new unless oaitoken

  page = oaitoken.entries.find(:all, :limit => @limit,
                                     :offset => token.last * @limit)

  PartialResult.new(hydrate_records(page), token.next(token.last + 1))
end
114
+
115
# Destroy every cached token created before (now - expire).
def sweep_cache
  cutoff = Time.now - expire
  OaiToken.destroy_all(["created_at < ?", cutoff])
end
118
+
119
# Load the model records referenced by the given cache entries, using each
# entry's record_id.
def hydrate_records(records)
  ids = records.map { |entry| entry.record_id }
  model.find(ids)
end
122
+
123
# Build the SQL conditions for a token from its conditions hash.
def token_conditions(token)
  conditions_hash = token.to_conditions_hash
  sql_conditions(conditions_hash)
end
126
+
127
+ private
128
+
129
# Compute when a token created at +creation+ expires: the creation time is
# formatted to whole seconds, re-parsed, converted to UTC and advanced by
# the configured expire interval.
def expires_at(creation)
  stamp = creation.strftime("%Y-%m-%d %H:%M:%S")
  Time.parse(stamp).utc + expire
end
133
+
134
+ end
135
+ end
@@ -0,0 +1,136 @@
1
+ require 'active_record'
2
+
3
+ module OAI::Provider
4
+ # = OAI::Provider::ActiveRecordWrapper
5
+ #
6
+ # This class wraps an ActiveRecord model and delegates all of the record
7
+ # selection/retrieval to the AR model. It accepts options for specifying
8
+ # the update timestamp field, a timeout, and a limit. The limit option
9
+ # is used for doing pagination with resumption tokens. The
10
+ # expiration timeout is ignored, since all necessary information is
11
+ # encoded in the token.
12
+ #
13
+ class ActiveRecordWrapper < Model
14
+
15
+ attr_reader :model, :timestamp_field
16
+
17
# Wrap +model+ for use by the provider.
#
# Recognised options (removed from +options+ as they are read):
#   :timestamp_field - name of the update timestamp column
#                      (default 'updated_at')
#   :limit           - page size used with resumption tokens
#
# Raises ArgumentError when any other option is left over.
def initialize(model, options={})
  @model = model
  @timestamp_field = options.delete(:timestamp_field) || 'updated_at'
  @limit = options.delete(:limit)

  unless options.empty?
    # options is a Hash, which has no #join; list the offending keys.
    raise ArgumentError.new(
      "Unsupported options [#{options.keys.join(', ')}]"
    )
  end
end
28
+
29
# Timestamp value of the first record when ordered ascending by the
# timestamp field.
def earliest
  oldest = model.find(:first, :order => "#{timestamp_field} asc")
  oldest.send(timestamp_field)
end
33
+
34
# Timestamp value of the first record when ordered descending by the
# timestamp field.
def latest
  newest = model.find(:first, :order => "#{timestamp_field} desc")
  newest.send(timestamp_field)
end
38
+
39
+ # A model class is expected to provide a method Model.sets that
40
+ # returns all the sets the model supports. See the
41
+ # activerecord_provider tests for an example.
42
# Returns model.sets when the wrapped model responds to :sets,
# otherwise nil.
def sets
  return nil unless model.respond_to?(:sets)

  model.sets
end
45
+
46
# Look up records through the wrapped model.
#
# A :resumption_token option short-circuits to next_set. For the :all
# selector, a result set larger than @limit is returned as the first
# partial page; otherwise the lookup is delegated straight to the model.
def find(selector, options={})
  return next_set(options[:resumption_token]) if options[:resumption_token]

  conditions = sql_conditions(options)
  return model.find(selector, :conditions => conditions) unless :all == selector

  total = model.count(:id, :conditions => conditions)
  return model.find(:all, :conditions => conditions) unless @limit && total > @limit

  select_partial(ResumptionToken.new(options.merge({:last => 0})))
end
61
+
62
# Report the deletion state of +record+: its deleted_at value when it
# responds to that, else its deleted value when it responds to that,
# else false.
def deleted?(record)
  return record.deleted_at if record.respond_to?(:deleted_at)
  return record.deleted if record.respond_to?(:deleted)

  false
end
70
+
71
+ protected
72
+
73
+ # Request the next set in this sequence.
74
# Request the next set in this sequence.
#
# Raises OAI::ResumptionTokenException when no limit is configured. When
# more than @limit records still match the token a partial result is
# returned; otherwise the remaining records are returned directly,
# ordered by primary key.
def next_set(token_string)
  raise OAI::ResumptionTokenException.new unless @limit

  token = ResumptionToken.parse(token_string)
  remaining = model.count(:id, :conditions => token_conditions(token))
  return select_partial(token) if @limit < remaining

  # end of result set
  model.find(:all,
    :conditions => token_conditions(token),
    :limit => @limit,
    :order => "#{model.primary_key} asc")
end
88
+
89
+ # select a subset of the result set, and return it with a
90
+ # resumption token to get the next subset
91
# Select a subset of the result set, and return it with a
# resumption token to get the next subset.
#
# Records are fetched in primary-key order, at most @limit at a time; the
# primary key of the last record fetched becomes the offset stored in the
# next token.
#
# Raises OAI::ResumptionTokenException when no records match the token.
def select_partial(token)
  records = model.find(:all,
    :conditions => token_conditions(token),
    :limit => @limit,
    :order => "#{model.primary_key} asc")

  # An empty page has no last record to take the offset from, so treat it
  # the same as a missing result.
  raise OAI::ResumptionTokenException.new if records.nil? || records.empty?

  offset = records.last.send(model.primary_key.to_sym)

  PartialResult.new(records, token.next(offset))
end
103
+
104
+ # build a sql conditions statement from the content
105
+ # of a resumption token. It is very important not to
106
+ # miss any changes as records may change scope as the
107
+ # harvest is in progress. To avoid losing any changes
108
+ # the last 'id' of the previous set is used as the
109
+ # filter to the next set.
110
# Build the SQL conditions for a resumption token. When the token carries
# a non-zero last id, a "primary key > last" constraint is appended so the
# next set starts after the previous one.
def token_conditions(token)
  last_id = token.last
  conditions = sql_conditions(token.to_conditions_hash)

  unless 0 == last_id
    # Now add last id constraint
    conditions[0] << " AND #{model.primary_key} > ?"
    conditions << last_id
  end

  conditions
end
121
+
122
+ # build a sql conditions statement from an OAI options hash
123
# Build a sql conditions array from an OAI options hash: the first element
# is the condition string, followed by the values to bind (the :from and
# :until times converted with localtime, then the set when given).
def sql_conditions(opts)
  clauses = ["#{timestamp_field} >= ?", "#{timestamp_field} <= ?"]
  clauses << "set = ?" if opts[:set]

  conditions = [clauses.join(" AND ")]
  conditions << opts[:from].localtime
  conditions << opts[:until].localtime
  conditions << opts[:set] if opts[:set]

  conditions
end
134
+
135
+ end
136
+ end
@@ -0,0 +1,18 @@
1
+ module OAI::Provider
2
+ # = OAI::Provider::PartialResult
3
+ #
4
+ # PartialResult is used for returning a set/page of results from a model
5
+ # that supports resumption tokens. It should contain an array of
6
+ # records, and a resumption token for getting the next set/page.
7
+ #
8
class PartialResult
  # The records making up this page of results.
  attr_reader :records
  # The resumption token for the next page, or nil when none was given.
  attr_reader :token

  # Store the page of +records+ together with an optional +token+.
  def initialize(records, token = nil)
    @records, @token = records, token
  end

end
17
+
18
+ end
@@ -0,0 +1,119 @@
1
+ require 'builder' unless defined?(Builder)
2
+
3
+ module OAI
4
+ module Provider
5
+ module Response
6
+
7
# Base class for provider responses. Subclasses declare the request
# parameters they accept with the class-level macros below; the
# constructor normalises the incoming options and validates them against
# those declarations.
class Base
  attr_reader :provider, :options

  class << self
    attr_reader :valid_options, :default_options, :required_options

    # Declare parameters this response accepts.
    def valid_parameters(*args)
      @valid_options ||= []
      @valid_options = (@valid_options + args.dup).uniq
    end

    # Declare default values, keyed by parameter. A value that responds
    # to :call is invoked with the response instance when applied.
    def default_parameters(options = {})
      @default_options ||= {}
      @default_options.merge! options.dup
    end

    # Declare parameters that must be present; they are also valid.
    def required_parameters(*args)
      valid_parameters(*args)
      @required_options ||= []
      @required_options = (@required_options + args.dup).uniq
    end

  end

  # Raises OAI::ArgumentException when the options do not satisfy the
  # class's parameter declarations or contain an unparseable date.
  def initialize(provider, options = {})
    @provider = provider
    @options = internalize(options)
    raise OAI::ArgumentException.new unless valid?
  end

  # Build the OAI-PMH envelope (XML declaration, root element,
  # responseDate and request) and yield the builder so the caller can
  # emit the verb-specific content.
  def response
    @builder = Builder::XmlMarkup.new
    @builder.instruct! :xml, :version=>"1.0", :encoding=>"UTF-8"
    @builder.tag!('OAI-PMH', header) do
      @builder.responseDate Time.now.utc.xmlschema
      @builder.request(provider.url, options)
      yield @builder
    end
  end

  private

  # Attributes for the OAI-PMH root element.
  def header
    {
      'xmlns' => "http://www.openarchives.org/OAI/2.0/",
      'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance",
      'xsi:schemaLocation' => %{http://www.openarchives.org/OAI/2.0/
        http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd}
    }
  end

  # Check the options against the class declarations and, when they pass,
  # fill in defaults. Returns true or false; may raise
  # OAI::ArgumentException via resumption?.
  def valid?
    return true if resumption?

    allowed = self.class.valid_options
    return true if allowed.nil? && options.empty?

    required = self.class.required_options || []
    return false unless (required - @options.keys).empty?

    # A class that declares no parameters accepts none; treat nil as an
    # empty list so stray options are rejected instead of raising TypeError.
    return false unless (@options.keys - (allowed || [])).empty?

    populate_defaults
    true
  end

  # Apply each declared default to options that are not already set.
  def populate_defaults
    # Not every class declares defaults, so default_options may be nil.
    (self.class.default_options || {}).each do |k, v|
      next if @options[k]
      @options[k] = v.respond_to?(:call) ? v.call(self) : v
    end
  end

  # True when the only option is a resumption token. A resumption token
  # combined with any other option raises OAI::ArgumentException.
  def resumption?
    return false unless @options.key?(:resumption_token)
    return true if 1 == @options.size

    raise OAI::ArgumentException.new
  end

  # Convert our internal representations back into standard OAI options
  def externalize(value)
    value.to_s.gsub(/_[a-z]/) { |m| m.sub("_", '').capitalize }
  end

  # Convert +value+ to a UTC time. Values that already respond to
  # :strftime are returned untouched.
  #
  # Raises OAI::ArgumentException when the value cannot be parsed.
  def parse_date(value)
    return value if value.respond_to?(:strftime)

    # Oddly Chronic doesn't parse an UTC encoded datetime.
    # Luckily Time does
    dt = Chronic.parse(value) || Time.parse(value)
    raise OAI::ArgumentException.new unless dt

    dt.utc
  rescue ::ArgumentError
    # Time.parse raises rather than returning nil on unparseable input.
    raise OAI::ArgumentException.new
  end

  # Convert request options into the internal form: keys become
  # underscored symbols (metadataPrefix => :metadata_prefix), values are
  # duplicated, and :from / :until are parsed into time values.
  def internalize(hash = {})
    internal = {}
    hash.each do |key, value|
      internal[key.to_s.gsub(/([A-Z])/, '_\1').downcase.intern] = value.dup
    end

    # Convert date formatted strings into internal time values
    internal[:from] = parse_date(internal[:from]) if internal[:from]
    internal[:until] = parse_date(internal[:until]) if internal[:until]

    internal
  end

end
116
+
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,16 @@
1
+ module OAI::Provider::Response
2
# Response that reports an error back to the client.
class Error < Base

  # Build the response for +provider+, remembering the +error+ to report.
  def initialize(provider, error)
    super(provider)
    @error = error
  end

  # Render an error element whose text is the error's string form and
  # whose code attribute is the error's code.
  def to_xml
    response do |xml|
      xml.error(@error.to_s, :code => @error.code)
    end
  end

end
16
+ end
@@ -0,0 +1,32 @@
1
+ module OAI::Provider::Response
2
+
3
# Response for the GetRecord verb.
class GetRecord < RecordResponse
  required_parameters :identifier

  # Look up the requested record and render it. The :identifier option is
  # removed from the options before the lookup.
  #
  # Raises OAI::IdException when the model returns no record.
  def to_xml
    id = extract_identifier(options.delete(:identifier))
    record = provider.model.find(id, options)
    raise OAI::IdException.new unless record

    response do |r|
      r.GetRecord do
        r.record do
          header_for record
          data_for record unless deleted?(record)
        end
      end
    end
  end

  private

  # Remove the first occurrence of "<provider prefix>/" from +id+.
  def extract_identifier(id)
    id.sub("#{provider.prefix}/", '')
  end

end
29
+
30
+ end
31
+
32
+