oai 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +80 -0
- data/Rakefile +113 -0
- data/bin/oai +68 -0
- data/examples/models/file_model.rb +63 -0
- data/examples/providers/dublin_core.rb +474 -0
- data/lib/oai.rb +7 -13
- data/lib/oai/client.rb +133 -83
- data/lib/oai/{get_record.rb → client/get_record.rb} +0 -0
- data/lib/oai/{header.rb → client/header.rb} +2 -2
- data/lib/oai/{identify.rb → client/identify.rb} +0 -0
- data/lib/oai/{list_identifiers.rb → client/list_identifiers.rb} +0 -0
- data/lib/oai/{list_metadata_formats.rb → client/list_metadata_formats.rb} +0 -0
- data/lib/oai/{list_records.rb → client/list_records.rb} +0 -0
- data/lib/oai/{list_sets.rb → client/list_sets.rb} +1 -1
- data/lib/oai/{metadata_format.rb → client/metadata_format.rb} +0 -0
- data/lib/oai/{record.rb → client/record.rb} +0 -0
- data/lib/oai/{response.rb → client/response.rb} +1 -1
- data/lib/oai/constants.rb +34 -0
- data/lib/oai/exception.rb +72 -1
- data/lib/oai/harvester.rb +38 -0
- data/lib/oai/harvester/config.rb +41 -0
- data/lib/oai/harvester/harvest.rb +144 -0
- data/lib/oai/harvester/logging.rb +70 -0
- data/lib/oai/harvester/mailer.rb +17 -0
- data/lib/oai/harvester/shell.rb +334 -0
- data/lib/oai/provider.rb +300 -0
- data/lib/oai/provider/metadata_format.rb +72 -0
- data/lib/oai/provider/metadata_format/oai_dc.rb +29 -0
- data/lib/oai/provider/model.rb +71 -0
- data/lib/oai/provider/model/activerecord_caching_wrapper.rb +135 -0
- data/lib/oai/provider/model/activerecord_wrapper.rb +136 -0
- data/lib/oai/provider/partial_result.rb +18 -0
- data/lib/oai/provider/response.rb +119 -0
- data/lib/oai/provider/response/error.rb +16 -0
- data/lib/oai/provider/response/get_record.rb +32 -0
- data/lib/oai/provider/response/identify.rb +24 -0
- data/lib/oai/provider/response/list_identifiers.rb +29 -0
- data/lib/oai/provider/response/list_metadata_formats.rb +21 -0
- data/lib/oai/provider/response/list_records.rb +32 -0
- data/lib/oai/provider/response/list_sets.rb +23 -0
- data/lib/oai/provider/response/record_response.rb +68 -0
- data/lib/oai/provider/resumption_token.rb +106 -0
- data/lib/oai/set.rb +14 -5
- data/test/activerecord_provider/config/connection.rb +5 -0
- data/test/activerecord_provider/config/database.yml +6 -0
- data/test/activerecord_provider/database/ar_migration.rb +59 -0
- data/test/activerecord_provider/database/oaipmhtest +0 -0
- data/test/activerecord_provider/fixtures/dc.yml +1501 -0
- data/test/activerecord_provider/helpers/providers.rb +44 -0
- data/test/activerecord_provider/helpers/set_provider.rb +36 -0
- data/test/activerecord_provider/models/dc_field.rb +7 -0
- data/test/activerecord_provider/models/dc_set.rb +6 -0
- data/test/activerecord_provider/models/oai_token.rb +3 -0
- data/test/activerecord_provider/tc_ar_provider.rb +93 -0
- data/test/activerecord_provider/tc_ar_sets_provider.rb +66 -0
- data/test/activerecord_provider/tc_caching_paging_provider.rb +53 -0
- data/test/activerecord_provider/tc_simple_paging_provider.rb +55 -0
- data/test/activerecord_provider/test_helper.rb +4 -0
- data/test/client/helpers/provider.rb +68 -0
- data/test/client/helpers/test_wrapper.rb +11 -0
- data/test/client/tc_exception.rb +36 -0
- data/test/{tc_get_record.rb → client/tc_get_record.rb} +11 -7
- data/test/client/tc_identify.rb +13 -0
- data/test/{tc_libxml.rb → client/tc_libxml.rb} +20 -10
- data/test/{tc_list_identifiers.rb → client/tc_list_identifiers.rb} +10 -8
- data/test/{tc_list_metadata_formats.rb → client/tc_list_metadata_formats.rb} +4 -1
- data/test/{tc_list_records.rb → client/tc_list_records.rb} +4 -1
- data/test/{tc_list_sets.rb → client/tc_list_sets.rb} +4 -2
- data/test/{tc_xpath.rb → client/tc_xpath.rb} +1 -1
- data/test/client/test_helper.rb +5 -0
- data/test/provider/models.rb +230 -0
- data/test/provider/tc_exceptions.rb +63 -0
- data/test/provider/tc_functional_tokens.rb +42 -0
- data/test/provider/tc_provider.rb +69 -0
- data/test/provider/tc_resumption_tokens.rb +46 -0
- data/test/provider/tc_simple_provider.rb +85 -0
- data/test/provider/test_helper.rb +36 -0
- metadata +123 -27
- data/test/tc_exception.rb +0 -38
- data/test/tc_identify.rb +0 -8
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'zlib'
|
2
|
+
require 'net/smtp'
|
3
|
+
require 'yaml'
|
4
|
+
require 'tempfile'
|
5
|
+
require 'logger'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'ostruct'
|
8
|
+
require 'readline'
|
9
|
+
require 'chronic'
|
10
|
+
require 'socket'
|
11
|
+
|
12
|
+
require 'oai/harvester/config'
|
13
|
+
require 'oai/harvester/harvest'
|
14
|
+
require 'oai/harvester/logging'
|
15
|
+
require 'oai/harvester/mailer'
|
16
|
+
require 'oai/harvester/shell'
|
17
|
+
|
18
|
+
# Returns the nicknames of the configured sites that are due for a harvest.
#
# conf - configuration object responding to +sites+, which is either nil or a
#        mapping of nickname => settings; each settings entry is read via
#        the 'period' and 'last' keys.
#
# Returns an Array of nicknames (empty when no sites are configured).
def harvestable_sites(conf)
  configured = conf.sites
  return [] unless configured

  due = configured.select do |_nickname, settings|
    needs_updating(settings['period'], settings['last'])
  end
  due.map { |nickname, _settings| nickname }
end
|
25
|
+
|
26
|
+
# Tells whether a site should be harvested again.
#
# period - 'daily', 'weekly' or 'monthly'; any other value never triggers an
#          update once a previous harvest has been recorded.
# last   - Time of the previous harvest, or nil if the site was never harvested.
#
# The thresholds are in seconds and sit slightly below a full day (86,400),
# week (604,800) and 30 days (2,592,000).
# NOTE(review): presumably this slack lets a scheduled run that starts a few
# minutes early still qualify - confirm.
#
# Returns true or false.
def needs_updating(period, last)
  return true if last.nil?

  min_age = {
    'daily'   => 86000,
    'weekly'  => 604000,
    'monthly' => 2591000
  }[period]
  return false if min_age.nil?

  Time.now - last > min_age
end
|
38
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#
|
2
|
+
# Created by William Groppe on 2006-11-05.
|
3
|
+
# Copyright (c) 2006. All rights reserved.
|
4
|
+
|
5
|
+
module OAI
  module Harvester

    # Granularity string compared against a provider's Identify response to
    # detect day-level datestamps.
    LOW_RESOLUTION = "YYYY-MM-DD"

    # Harvester settings stored as an OpenStruct and persisted as YAML.
    #
    # The file lives at GLOBAL when that file exists and is writable,
    # otherwise under ~/.oai (POSIX) or %APPDATA%/oai (Windows).
    class Config < OpenStruct

      PERIODS = %w(daily weekly monthly)
      GLOBAL = "/etc/oai/harvester.yml"

      # Loads the configuration file, or returns an empty Config when no file
      # exists yet (or no location can be determined).
      def self.load
        path = find_config
        return new unless path && File.exist?(path)

        # Only the types the harvester itself writes are permitted: symbol
        # keys from the OpenStruct table and the Time stored as a site's
        # 'last' harvest.
        data = YAML.safe_load(File.read(path),
                              permitted_classes: [Symbol, Time],
                              aliases: true)
        data.is_a?(Hash) ? new(data) : new
      end

      # Writes the current settings to the configuration file.
      #
      # Raises RuntimeError when no configuration location is available.
      def save
        path = Config.find_config
        unless path
          raise "No configuration location available " \
                "(neither HOME nor APPDATA is set)"
        end
        File.open(path, 'w') do |out|
          YAML.dump(to_h, out)
        end
      end

      # Shamelessly lifted from Camping
      #
      # Returns the path of the configuration file, creating the per-user
      # directory when needed, or nil when neither HOME nor APPDATA is set.
      # This method is public: a bare `private` does not apply to `def self.`
      # methods, and #save calls it with an explicit receiver.
      def self.find_config
        if (home = ENV['HOME']) # POSIX
          return GLOBAL if File.exist?(GLOBAL) && File.writable?(GLOBAL)
          FileUtils.mkdir_p File.join(home, '.oai')
          File.join(home, '.oai/harvester.yml')
        elsif (home = ENV['APPDATA']) # MSWIN
          FileUtils.mkdir_p File.join(home, 'oai')
          File.join(home, 'oai/harvester.yml')
        end
      end

    end
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#
|
2
|
+
# Created by William Groppe on 2006-11-03.
|
3
|
+
|
4
|
+
module OAI
  module Harvester

    # Downloads records from configured OAI-PMH providers and stores each
    # harvest as a gzipped XML file under a date-based directory.
    class Harvest

      # config    - configuration object; loaded via Config.load when nil.
      # directory - storage root; defaults to the configuration's +storage+.
      # date      - optional 'from' datestamp string overriding every site's
      #             own starting point.
      def initialize(config = nil, directory = nil, date = nil)
        @config = config || Config.load
        @directory = directory || @config.storage
        @from = date
        @from.freeze
        @parser = defined?(XML::Document) ? 'libxml' : 'rexml'
      end

      # Harvests the given sites and always saves the configuration afterwards.
      #
      # sites       - a nickname, an Array of nicknames, or nil for every
      #               configured site.
      # interactive - when true, progress is printed to stdout.
      def start(sites = nil, interactive = false)
        @interactive = interactive
        sites ||= (@config.sites ? @config.sites.keys : [])
        begin
          # Array() lets callers pass a single nickname as a String.
          Array(sites).each do |site|
            harvest(site)
          end
        ensure
          @config.save
        end
      end

      private

      # Harvests one site and records the harvest time as its new 'last'.
      #
      # Errors are re-raised, except that in interactive mode a
      # "noRecordsMatch" error is reported on stdout instead.
      def harvest(site)
        harvest_time = Time.now.utc
        opts = build_options_hash(@config.sites[site])
        opts[:until] = harvest_time.xmlschema

        # Precedence: explicit date passed in, then the site's last harvest,
        # then the repository's earliest datestamp.
        opts[:from] = @from || opts[:from] || earliest(opts[:url])

        opts.delete(:set) if 'all' == opts[:set]

        begin
          # Connect, and download
          file, _records = call(opts.delete(:url), opts)

          # Move document to storage directory
          dir = File.join(@directory, date_based_directory(harvest_time))
          FileUtils.mkdir_p dir
          name = filename(Time.parse(opts[:from]), harvest_time)
          FileUtils.mv(file.path, File.join(dir, "#{site}-#{name}.xml.gz"))
          @config.sites[site]['last'] = harvest_time
        rescue StandardError => e
          raise unless e.respond_to?(:code)
          raise unless @interactive && "noRecordsMatch" == e.code
          puts "No new records available"
        end
      end

      # Fetches all records for +url+ into a gzipped temporary file, following
      # resumption tokens until the provider stops returning one.
      #
      # Returns [tempfile, record_count]. +opts+ is not modified.
      def call(url, opts)
        # Preserve original options
        options = opts.dup

        records = 0
        client = OAI::Client.new(url, :parser => @parser)
        provider_config = client.identify

        if Harvester::LOW_RESOLUTION == provider_config.granularity
          options[:from] = Time.parse(options[:from]).strftime("%Y-%m-%d")
          options[:until] = Time.parse(options[:until]).strftime("%Y-%m-%d")
        end

        file = Tempfile.new('oai_data')
        gz = Zlib::GzipWriter.new(file)
        gz << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        gz << "<records>"
        begin
          response = client.list_records(options)
          loop do
            get_records(response.doc).each do |rec|
              gz << rec
              records += 1
            end
            puts "#{records} records retrieved" if @interactive

            # Get a full response by iterating with the resumption tokens.
            # Not very Ruby like. Should fix OAI::Client to handle resumption
            # tokens internally.
            token = response.resumption_token
            break if token.nil? || token.empty?

            puts "\nresumption token recieved, continuing" if @interactive
            response = client.list_records(:resumption_token => token)
          end

          gz << "</records>"
        ensure
          gz.close
          file.close
        end

        [file, records]
      end

      # Returns the record nodes of a ListRecords response document.
      def get_records(doc)
        doc.find("/OAI-PMH/ListRecords/record").to_a
      end

      # Translates a site's stored settings into request options.
      def build_options_hash(site)
        options = {:url => site['url']}
        options[:set] = site['set'] if site['set']
        # getutc returns a converted copy, leaving the stored Time untouched.
        options[:from] = site['last'].getutc.xmlschema if site['last']
        options[:metadata_prefix] = site['prefix'] if site['prefix']
        options
      end

      # Formats +time+ with DIRECTORY_LAYOUT, a constant that is not defined
      # in this file.
      def date_based_directory(time)
        "#{time.strftime(DIRECTORY_LAYOUT)}"
      end

      # Builds the "<from>_til_<until>_at_<time>" part of a harvest file name.
      def filename(from_time, until_time)
        format = "%Y-%m-%d"
        "#{from_time.strftime(format)}_til_#{until_time.strftime(format)}"\
        "_at_#{until_time.strftime('%H-%M-%S')}"
      end

      # Get earliest timestamp from repository
      def earliest(url)
        client = OAI::Client.new url
        identify = client.identify
        Time.parse(identify.earliest_datestamp).utc.xmlschema
      end

    end

  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Reopen Harvest and add logging
module OAI
  module Harvester

    # Wraps the existing Harvest methods so that every run is logged and a
    # summary is collected; non-interactive runs mail that summary.
    class Harvest
      alias_method :orig_start, :start
      alias_method :orig_harvest, :harvest
      alias_method :orig_call, :call
      alias_method :orig_init, :initialize

      # Same arguments as the original initializer. Logs to
      # "<logfile>/harvester.log" (rotated weekly) when the configuration has
      # a +logfile+ directory; otherwise messages are discarded.
      def initialize(config = nil, directory = nil, date = nil)
        orig_init(config, directory, date)
        @summary = []
        if @config.logfile
          @logger = Logger.new(File.join(@config.logfile, "harvester.log"),
            'weekly')
        else
          # Turn off logging if no logging directory is specified.
          @logger = Logger.new(nil)
          @logger.level = Logger::FATAL
        end
        @logger.datetime_format = "%Y-%m-%d %H:%M"
      end

      # Runs the harvest. Non-interactive runs additionally send the collected
      # summary by mail; a mail failure is logged, not raised.
      def start(sites = nil, interactive = false)
        if interactive
          @logger.info { "Starting interactive harvest"}
          orig_start(sites, true)
        else
          @logger.info { "Starting regular harvest" }
          orig_start(sites)
          begin
            OAI::Harvester::Mailer.send(@config.mail_server, @config.email,
              @summary)
          rescue StandardError => e
            @logger.error { "Error sending out summary email: #{e}"}
          end
        end
      end

      private

      # Harvests one site, turning any failure into log and summary entries
      # so that the remaining sites are still processed.
      def harvest(site)
        begin
          @logger.info { "Harvest of '#{site}' starting" }
          @summary << "Harvest of '#{site}' attempted"
          orig_harvest(site)
        rescue OAI::Exception => e
          if "noRecordsMatch" == e.code
            @logger.info "No new records available"
            @summary << "'#{site}' had no new records."
          else
            @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
            @summary << "'#{site}' had an OAI Error! #{e}"
          end
        rescue StandardError => e
          @logger.error { "Harvesting of '#{site}' failed, message: #{e}" }
          # Double quotes so the frames are separated by real newlines.
          @logger.error { "#{e.backtrace.join("\n")}" }
          @summary << "'#{site}' had an Error! #{e}"
        end
      end

      # Logs the request, delegates to the original call, and records how many
      # records came back.
      def call(url, options)
        @logger.info { "fetching: #{url} with options #{options.inspect}" }
        file, records = orig_call(url, options)
        @logger.info { "retrieved #{records} records" }
        @summary << "Retrieved #{records} records."
        return file, records
      end
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module OAI
  module Harvester

    # Sends the harvest summary by SMTP.
    class Mailer

      # Mails +message+ to every address in +email+ through +server+.
      #
      # server  - SMTP server address handed to Net::SMTP.start.
      # email   - one address or an Array of addresses.
      # message - one line or an Array of summary lines, joined with newlines.
      #
      # The sender is "harvester@<local hostname>". Errors from Net::SMTP are
      # not rescued here.
      def self.send(server = nil, email = nil, message = nil)
        recipients = Array(email)
        msg = %{Subject: Harvester Summary\n\n#{Array(message).join("\n")}}
        Net::SMTP.start(server) do |smtp|
          # Each recipient is passed as its own address; a single quoted,
          # comma-joined string is not a valid recipient.
          smtp.send_message msg, "harvester@#{Socket.gethostname}", *recipients
        end
      end

    end

  end
end
|
@@ -0,0 +1,334 @@
|
|
1
|
+
module OAI
  module Harvester
    # = OAI::Harvester::Shell
    #
    # A OAI-PMH client shell allowing OAI Harvesting to be configured in
    # an interactive manner.  Typing 'oai' on the command line starts the
    # shell.  The first time the shell is run it will prompt for the following
    # configuration details:
    # 1. A storage directory for all harvested records.  Harvests will be
    #    stored under this directory in a directory structure based on the
    #    date of the harvest.
    # 2. A log file directory.
    # 3. Email address(es) for sending daily harvesting activity reports.
    # 4. Network address of the SMTP server for sending mail.
    #
    # After the initial configuration, new harvest sites can be added by using
    # the 'new' command.  Sites are identified via nickname assigned by the
    # user.  After choosing a nickname, provide the URL of a harvestable site,
    # and the shell will prompt you for the rest of the configuration
    # information.
    #
    # The shell automatically pulls down the list of sets in the repository, and
    # the supported metadata prefixes.  Making it very simple to setup harvests.
    #
    class Shell
      include Readline

      # The commands listed by 'help'. Only these are dispatched, so typed
      # input cannot reach arbitrary methods of the object.
      COMMANDS = %w(help harvest config list info new remove edit)

      # config - configuration object exposing sites, storage, email,
      #          mail_server, logfile and save.
      def initialize(config)
        @conf = config
        @conf.sites ||= {} # Initialize sites hash if there isn't one
      end

      # Runs first-time setup when no storage directory is configured, then
      # reads commands until the user types 'exit' or input ends (readline
      # returns nil, e.g. on Ctrl-D).
      def start
        unless @conf.storage
          banner "Entering first-time setup"
          config
          setup_cron
        end
        puts "type 'help' for help"
        loop do
          input = readline("oai> ", true)
          break if input.nil? || input.strip == 'exit'
          dispatch(input)
        end
      end

      private

      # Runs one typed command line. The first word selects the command; the
      # remaining words are passed on as a single space-joined string.
      def dispatch(input)
        command, *arguments = input.split
        raise ArgumentError, "unknown command" unless COMMANDS.include?(command)
        if arguments.empty?
          __send__(command)
        else
          __send__(command, arguments.join(" "))
        end
      rescue StandardError
        puts "Not a recognized command, or bad options.  Type 'help' for clues."
      end

      def help
        banner "Commands:"
        puts "\tharvest site [date]  - Harvest site(s) manually"
        puts "\tconfig               - Configure harvester"
        puts "\tlist <config>        - List known providers or configuration"
        puts "\tinfo [site[, site]]  - Show information about a provider."
        puts "\tnew                  - Add a new provider site to harvester"
        puts "\tremove [site]        - Remove a provider site from harvester"
        puts "\tedit [site]          - Change settings for a provider site"
        puts "\texit                 - Exit the harvester shell.\n\n"
      end

      # Harvests one site now. +options+ is "site [date]"; the optional date
      # is free text parsed by Chronic.
      def harvest(options)
        site, *date = options.split(/\s/)
        if @conf.sites.keys.include?(site)
          banner "Harvesting '#{site}'"
          if date.empty?
            date = nil
          else
            parsed = Chronic.parse(date.join(' '))
            unless parsed
              puts "Couldn't parse the date supplied"
              return
            end
            date = parsed.utc.xmlschema
          end
          harvester = Harvest.new(@conf, @conf.storage, date)
          harvester.start([site], true)
          puts "done"
        else
          puts "Unknown repository: '#{site}'"
        end
        puts # blank line
      end

      # Lists the configured repositories, or the configuration itself when
      # called with 'config'.
      def list(args = nil)
        if 'config' == args
          banner "Current Configuration"
          list_config
        else
          banner "Configured Repositories"
          @conf.sites.keys.each do |k|
            puts k
          end
        end
        puts # blank line
      end

      # Prints the settings of the named sites (comma and/or space separated).
      def info(args)
        banner "Provider Site Information"
        sites = args.split(/[,\s]+/).reject { |site| site.empty? }
        sites.each do |site|
          if @conf.sites.key?(site)
            print_site(site)
          else
            puts "Unknown repository: '#{site}'"
          end
        end
        puts
      end

      # Prompts for a new site and saves it; nothing is stored when the form
      # is aborted.
      def new
        banner "Define New Harvesting Site"
        name, site = form
        return unless name && site
        @conf.sites[name] = site
        @conf.save
      end

      # Prompts for new settings for +name+ and saves them; nothing is stored
      # when the form is aborted.
      def edit(name)
        banner "Edit Harvesting Site"
        name, site = form(name)
        return unless name && site
        @conf.sites[name] = site
        @conf.save
      end

      # Removes +site+ after the user confirms with Y.
      def remove(site)
        answer = readline("Remove #{site}? (Y/N): ")
        if answer && 'Y' == answer.strip.upcase
          @conf.sites.delete(site)
          @conf.save
          puts "#{site} removed"
        end
      end

      # http://oai.getty.edu:80/oaicat/OAIHandler
      #
      # Interactive form for a site's nickname, url, metadata prefix, set and
      # harvest period.
      #
      # Returns [name, site] on success, or nil when the form was aborted.
      # Existing settings are edited on a copy, so an aborted edit leaves the
      # configuration untouched.
      def form(name = nil)
        begin
          if not name
            name = prompt("nickname", nil)
            while name.nil? || @conf.sites.keys.include?(name)
              puts "Nickname already in use, choose another." if name
              name = prompt("nickname")
            end
          end
          site = (@conf.sites[name] || {}).dup

          # URL
          default_url = site['url']
          url = verify(prompt("url", default_url))
          until url
            puts "Trouble contacting provider, bad url?"
            url = verify(prompt("url", default_url))
          end
          site['url'] = url

          # Metadata formats
          formats = metadata(site['url'])
          report "Repository supports [#{formats.join(', ')}] metadata formats."
          prefix = prompt("prefix", site['prefix'])
          while(not formats.include?(prefix))
            prefix = prompt("prefix", site['prefix'])
          end
          site['prefix'] = prefix

          # Sets
          available_sets = ['all']
          begin
            available_sets.concat sets(site['url'])
            site['set'] = 'all' unless site['set'] # default to all sets
            report "Repository supports [#{available_sets.join(', ')}] metadata sets."
            set = prompt("set", site['set'])
            # Validate what was typed, not the stored default.
            until available_sets.include?(set)
              set = prompt("set", site['set'])
            end
            site['set'] = set
          rescue
            # Any failure while choosing a set falls back to harvesting all.
            site['set'] = 'all'
          end

          # Period
          period = expand_period(prompt("period", "daily"))
          while(not Config::PERIODS.include?(period))
            puts "Must be daily, weekly, or monthly"
            period = expand_period(prompt("period", "daily"))
          end

          site['period'] = period

          return [name, site]
        rescue
          puts "Problem adding/updating provider, aborting. (#{$!})"
          nil
        end
      end

      # Prompts for the global settings (storage directory, email addresses,
      # mail server, log directory) and saves them. Any error, including
      # end-of-input at a prompt, abandons the form without saving.
      def config
        begin
          directory = prompt("storage directory", @conf.storage)
          while not directory_acceptable(directory)
            directory = prompt("storage directory", @conf.storage)
          end

          email = @conf.email ? Array(@conf.email).join(', ') : nil
          @conf.email = parse_emails(prompt("email", email))

          @conf.mail_server = prompt("mail server", @conf.mail_server)

          logfile = prompt("log file(s) directory", @conf.logfile)
          while not directory_acceptable(logfile)
            logfile = prompt("log file(s) directory", @conf.logfile)
          end
          @conf.storage = directory
          @conf.logfile = logfile
          @conf.save
        rescue
          nil
        end
      end

      # Prints "key: value" with the key right-aligned in +split+ columns.
      def display(key, value, split = 40)
        print " " * (split - key.size) if key.size < split
        puts "#{key}: #{value}"
      end

      # Prints +str+ preceded by a blank line and underlined with dashes.
      def banner(str)
        puts "\n#{str}"
        puts "-" * str.size
      end

      def report(str)
        puts "\n#{str}\n"
      end

      def indent(number)
        print "\t" * number
      end

      # Asks for a value, showing +default+ in brackets.
      #
      # Returns the typed text, or +default+ when the answer is empty.
      # Raises RuntimeError when input ends, which aborts the calling form.
      def prompt(text, default = nil, split = 20)
        prompt_text = "#{text} [#{default}]: "
        print " " * (split - prompt_text.size) if prompt_text.size < split
        value = readline(prompt_text, true)
        raise RuntimeError.new("Exit loop") unless value
        return value.empty? ? default : value
      end

      # Checks that +url+ answers an Identify request, following a permanent
      # redirect reported in the error message.
      #
      # Returns the working url, or nil when the provider cannot be reached.
      def verify(url)
        begin
          client = OAI::Client.new(url, :redirects => false)
          identify = client.identify
          puts "Repository name \"#{identify.repository_name}\""
          return url
        rescue StandardError => e
          if e.to_s =~ /^Permanently Redirected to \[(.*)\?.*\]/
            target = $1
            report "Provider redirected to: #{target}"
            verify(target)
          else
            puts "Error selecting repository: #{e}"
          end
        end
      end

      # Returns the metadata prefixes the provider at +url+ supports.
      def metadata(url)
        client = OAI::Client.new url
        client.list_metadata_formats.to_a.map { |format| format.prefix }
      end

      # Returns the set specs the provider at +url+ offers.
      def sets(url)
        client = OAI::Client.new url
        client.list_sets.to_a.map { |set| set.spec }
      end

      # Returns true when +dir+ is an existing, writable directory; otherwise
      # prints a complaint and returns false.
      def directory_acceptable(dir)
        if not (dir && File.directory?(dir) && File.writable?(dir))
          puts "Directory doesn't exist, or isn't writtable."
          return false
        end
        true
      end

      # Expands an abbreviation such as "w" to the first period name that
      # starts with it; returns nil when nothing matches.
      def expand_period(str)
        typed = str.to_s
        Config::PERIODS.find { |period| period.start_with?(typed) }
      end

      # Splits a comma and/or whitespace separated address list into an Array;
      # returns nil when +emails+ is nil.
      def parse_emails(emails)
        return nil unless emails
        emails.split(/[,\s]+/).reject { |address| address.empty? }
      end

      def list_config
        display("storage directory", @conf.storage, 20)
        display("email", Array(@conf.email).join(', '), 20) if @conf.email
        display("mail server", @conf.mail_server, 20) if @conf.mail_server
        display("log location", @conf.logfile, 20) if @conf.logfile
      end

      def list_sites
        banner "Sites"
        @conf.sites.each_key { |site| print_site(site) }
      end

      def print_site(site)
        puts site
        @conf.sites[site].each { |k,v| display(k, v, 15)}
      end

      def setup_cron
        banner "Scheduling Automatic Harvesting"
        puts "To activate automatic harvesting you must add an entry to"
        puts "your scheduler.  Linux/Mac OS X users should add the following"
        puts "entry to their crontabs:\n\n"
        puts "0 0 * * * #{$0} -D\n\n"
        puts "Windows users should use WinAt to schedule"
        puts "#{$0} to run every night.\n\n\n"
      end

    end

  end
end
|
334
|
+
|