mrt-tind-harvester 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a8a12f69bf55200b7d6b02ee260281c3d1d7b050
4
+ data.tar.gz: 6ea06ac8dbff3b873ef69042512f3d81fb5b2e73
5
+ SHA512:
6
+ metadata.gz: 7fdd4a906e4b22079c5a15f2eab8062dd551e6cc8e3ecf7d8909554b0bdec73ac84f616c12a862f03f854d57e3146880f59f1f82f5d3f7d38c2abce92222d7ee
7
+ data.tar.gz: dddc4f22729901b865d383dcfb63c8a0a04d9f4921a5c5a137ef6c619eff5886842747a6a5ffdae3e168e31bec189681d3e096fbe22b1cce52b7443d79ea0f5e
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ Dir.chdir(__dir__) { require 'bundler/setup' }
4
+
5
+ require 'optparse'
6
+ require 'optparse/time'
7
+ require 'ostruct'
8
+ require 'merritt/tind'
9
+
10
# Description strings for each command-line option, passed to
# OptionParser#on when the switches are declared.
USAGE = OpenStruct.new(
  {
    CONFIG: 'path to configuration file (required)'.freeze,
    DRY_RUN: 'dry run (harvest, but do not submit or update last_harvest)'.freeze,
    FROM: 'start date/time (inclusive) for selective harvesting'.freeze,
    UNTIL: 'end date/time (inclusive) for selective harvesting'.freeze,
    HELP: 'print help and exit'
  }
).freeze
17
+
18
# Command-line front end for the TIND harvester: parses the command-line
# switches, builds a harvester from the configuration file, and processes
# the feed.
class TINDHarvesterApp
  # @return [OpenStruct] option values collected while parsing the command line
  attr_reader :options

  def initialize
    @options = OpenStruct.new(dry_run: false, help: false)
  end

  # Returns the parser for the supported switches. The parser is built once
  # and memoized, so the instance that parses the arguments is also the one
  # that renders the help text.
  #
  # @return [OptionParser]
  def option_parser
    @option_parser ||= OptionParser.new do |opts|
      opts.on('-c CONFIG', '--config CONFIG', USAGE.CONFIG) { |config_file| options.config_file = config_file }
      opts.on('-f', '--from DATETIME', Time, USAGE.FROM) { |from_time| options.from_time = to_utc_time(from_time) }
      opts.on('-u', '--until DATETIME', Time, USAGE.UNTIL) { |until_time| options.until_time = to_utc_time(until_time) }
      opts.on('-n', '--dry-run', USAGE.DRY_RUN) { options.dry_run = true }
      opts.on('-h', '--help', USAGE.HELP) { options.help = true }
    end
  end

  # Parses the command line and runs the harvest. Any StandardError (including
  # option parsing errors) is printed to stderr together with the help text,
  # and the process exits with status 1. `exit` raises SystemExit, which is
  # not a StandardError, so the exits below are not caught by the rescue.
  def do_harvest!
    option_parser.parse!

    print_help_and_exit(0) if options.help

    harvester = Merritt::TIND::Harvester.from_file(config_file, dry_run: options.dry_run)
    warn("Starting harvester; logging to #{harvester.log_path}")
    harvester.process_feed!(from_time: options.from_time, until_time: options.until_time)
  rescue StandardError => e
    warn(e)
    print_help_and_exit(1)
  end

  private

  # Converts a parsed --from/--until value to UTC without modifying the
  # Time object produced by the option parser.
  #
  # @param time [Time] the parsed option value
  # @return [Time] the same instant, expressed in UTC
  def to_utc_time(time)
    time.getutc
  end

  # @return [String] the path given with --config, if it names an existing file;
  #   otherwise prints an error plus the help text and exits with status 1
  def config_file
    config_file = options.config_file
    return config_file if config_file && File.exist?(config_file)

    exit_with_error('No configuration file specified') unless config_file
    exit_with_error('The specified configuration file does not exist: ' + config_file)
  end

  # @return [String] the usage summary generated by the option parser
  def help
    option_parser.to_s
  end

  # Prints +msg+ and the help text to stderr, then exits with status 1.
  def exit_with_error(msg)
    warn(msg)
    print_help_and_exit(1)
  end

  # Prints the help text to stderr and exits with the given status.
  def print_help_and_exit(status)
    warn(help)
    exit(status)
  end
end
72
+
73
# Script entry point: build the application object and run the harvest.
TINDHarvesterApp.new.do_harvest!
@@ -0,0 +1 @@
1
+ Dir.glob(File.expand_path('merritt/*.rb', __dir__)).sort.each(&method(:require))
@@ -0,0 +1 @@
1
+ Dir.glob(File.expand_path('tind/*.rb', __dir__)).sort.each(&method(:require))
@@ -0,0 +1,114 @@
1
+ require 'pathname'
2
+ require 'yaml'
3
+
4
module Merritt
  module TIND
    # Read-only view of the harvester settings for one environment. Wraps the
    # environment's section of the configuration hash and resolves relative
    # file names against the directory containing the configuration file.
    class Config

      attr_reader :config_h, :config_path

      # @param config_h [Hash, nil] settings for a single environment (defaults to an empty hash)
      # @param config_yml [String, Pathname, nil] the file the settings were read from, if any
      def initialize(config_h = nil, config_yml: nil)
        @config_h = config_h || {}
        @config_path = Pathname.new(config_yml).realpath if config_yml
      end

      # @return [Object, nil] the 'base_url' entry of the 'oai' section
      def oai_base_url
        oai_config_h['base_url']
      end

      # @return [Object, nil] the 'set' entry of the 'oai' section
      def oai_set
        oai_config_h['set']
      end

      # @return [Pathname, nil] the resolved 'stop_file' entry
      def stop_file_path
        @stop_file_path ||= resolve_relative_path(config_h['stop_file'])
      end

      # @return [Object, nil] the 'collection_ark' entry of the 'merritt' section
      def mrt_collection_ark
        merritt_config_h['collection_ark']
      end

      # @return [Object, nil] the 'ingest_url' entry of the 'merritt' section
      def mrt_ingest_url
        merritt_config_h['ingest_url']
      end

      # @return [Object, nil] the 'ingest_profile' entry of the 'merritt' section
      def mrt_ingest_profile
        merritt_config_h['ingest_profile']
      end

      # @return [Pathname, nil] the resolved 'database' entry of the 'merritt' section
      def db_config_path
        @db_config_path ||= resolve_relative_path(merritt_config_h['database'])
      end

      # @return [Object, nil] the 'level' entry of the 'log' section
      def log_level
        log_config_h['level']
      end

      # @return [Pathname, nil] the resolved 'file' entry of the 'log' section
      def log_path
        @log_path ||= resolve_relative_path(log_config_h['file'])
      end

      # @return [Pathname, nil] the resolved 'last_harvest' entry
      def last_harvest_path
        @last_harvest_path ||= resolve_relative_path(config_h['last_harvest'])
      end

      class << self

        # Loads the section of +config_yml+ matching the current environment.
        #
        # @param config_yml [String, Pathname] path to the YAML configuration file
        # @return [Config]
        # @raise [ArgumentError] if the path is nil, the file does not exist, or
        #   the file has no (non-empty) section for the current environment
        def from_file(config_yml)
          # A missing config.yml is not normal
          raise ArgumentError, "Can't read config from nil file" unless config_yml
          raise ArgumentError, "Specified config file #{config_yml} does not exist" unless File.exist?(config_yml)

          env = environment
          env_config = YAML.load_file(config_yml)[env]
          if env_config.nil? || env_config.empty?
            raise ArgumentError, "No configuration for environment '#{env}' found in #{config_yml}"
          end

          Config.new(env_config, config_yml: config_yml)
        end

        # @return [String] the value of the first of HARVESTER_ENV, RAILS_ENV,
        #   RACK_ENV that is set, or 'development' if none is
        def environment
          env_var = %w[HARVESTER_ENV RAILS_ENV RACK_ENV].find { |v| ENV[v] }
          env_var ? ENV[env_var] : 'development'
        end
      end

      private

      def oai_config_h
        config_h['oai'] || {}
      end

      def merritt_config_h
        config_h['merritt'] || {}
      end

      def log_config_h
        config_h['log'] || {}
      end

      # Absolute names are returned unchanged; relative names are resolved
      # against the configuration file's directory when that file is known.
      def resolve_relative_path(filename)
        return nil unless filename

        pathname = Pathname.new(filename)
        return pathname if pathname.absolute? || !config_path

        (config_path.parent + pathname).cleanpath
      end

    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'oai/client'
2
+
3
module Merritt
  module TIND
    # Enumerable wrapper around an OAI response that yields each entry
    # converted with Record.from_oai.
    class Feed
      include Enumerable

      # @param resp [#each] the response to wrap; if it reports a resumption
      #   token, its `full` form is used instead
      def initialize(resp)
        @resp = ensure_full_response(resp)
      end

      # Yields one Record per entry in the response, or returns an Enumerator
      # when called without a block.
      def each(&block)
        return enum_for(:each) unless block

        @resp.each { |oai_record| block.call(Record.from_oai(oai_record)) }
      end

      private

      # Responses that do not respond to resumption_token (already wrapped) or
      # that have no token (nothing to paginate) are used as they are.
      def ensure_full_response(resp)
        needs_paging = resp.respond_to?(:resumption_token) && resp.resumption_token
        needs_paging ? resp.full : resp
      end

    end
  end
end
@@ -0,0 +1,61 @@
1
module Merritt
  module TIND
    # Runs every record of a feed through a RecordProcessor, tracks the
    # outcomes in a LastHarvest-style object, and writes that object out
    # afterwards (unless this is a dry run).
    class FeedProcessor

      attr_reader :feed, :harvester, :server

      def initialize(feed:, server:, harvester:)
        @feed = feed
        @server = server
        @harvester = harvester
      end

      # Processes each record, logs the resulting harvest state at debug
      # level, then persists it via update_last_harvest!.
      def process_feed!
        feed.each { |r| process_record(r, server) }

        path = config.last_harvest_path
        indented_yaml = last_harvest_next.to_yaml.gsub(/^/, "\t")
        log.debug("Updating #{path}:\n#{indented_yaml}")
        update_last_harvest!
      end

      private

      def config
        harvester.config
      end

      def log
        harvester.log
      end

      def dry_run?
        harvester.dry_run?
      end

      # Starts from a clone of the harvester's current last-harvest state, or
      # from a fresh LastHarvest if the harvester has none.
      def last_harvest_next
        @last_harvest_next ||= begin
          current = harvester.last_harvest
          current ? current.clone : LastHarvest.new
        end
      end

      def update_last_harvest!
        return log.info("Dry run: #{config.last_harvest_path} not updated") if dry_run?

        last_harvest_next.write_to(config.last_harvest_path)
      end

      # Any StandardError raised while processing (or while recording the
      # success) is logged as a warning and the record is counted as a failure.
      def process_record(r, server)
        RecordProcessor.new(r, harvester, server).process_record!
        record_outcome(success: r)
      rescue StandardError => e
        # TODO: can we identify failures after submission?
        log.warn(e)
        record_outcome(failure: r)
      end

      def record_outcome(**outcome)
        @last_harvest_next = last_harvest_next.update(**outcome)
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'timeout'
2
+
3
module Merritt
  module TIND
    # Helpers for taking an exclusive lock on a file, optionally rotating any
    # existing content out of the way first.
    module Files
      DEFAULT_TIMEOUT_SECS = 5
      DEFAULT_SLEEP_INTERVAL_SECS = 0.1

      class << self

        # Opens +filename+ in append/read mode (creating it if needed), takes
        # an exclusive lock, and yields the open file. When the block
        # finishes, buffered writes are flushed *before* the lock is released,
        # and the handle is closed so descriptors are not leaked.
        #
        # @param filename [String] path of the file to lock
        # @yieldparam f [File] the locked file; only valid inside the block
        # @return the value of the block
        # @raise [Timeout::Error] if the lock cannot be acquired within
        #   DEFAULT_TIMEOUT_SECS seconds
        def with_lock(filename)
          f = acquire_lock(filename)
          yield f
        ensure
          release_lock(f) if f
        end

        # Like {with_lock}, but if +filename+ already has content it is first
        # renamed to a timestamped name, and the block receives a lock on a
        # fresh file at the original name.
        #
        # @param filename [String] path of the file to rotate and lock
        # @yieldparam f [File] the locked file to write to
        def rotate_and_lock(filename)
          with_lock(filename) do |f|
            if File.size?(filename)
              rotating(filename) { |f1| yield f1 }
            else
              yield f
            end
          end
        end

        private

        # Flushes, unlocks and closes +f+; a handle the block already closed
        # is left alone.
        def release_lock(f)
          return if f.closed?

          begin
            f.flush
            f.flock(File::LOCK_UN)
          ensure
            f.close
          end
        end

        # Renames the existing file, then locks a new file at the old name.
        def rotating(filename)
          rotate_to = rotated_name(filename)

          File.rename(filename, rotate_to)
          with_lock(filename) { |f| yield f }
        end

        # Finds an unused "<filename>-<UTC timestamp>" name, waiting briefly
        # and retrying if the candidate already exists.
        def rotated_name(filename)
          loop do
            renamed_file = filename + '-' + Time.now.utc.iso8601(3)
            return renamed_file unless File.exist?(renamed_file)

            sleep(DEFAULT_SLEEP_INTERVAL_SECS)
          end
        end

        # Repeatedly opens and locks +filename+ until the locked handle still
        # refers to the file currently at that path (it may have been renamed
        # or replaced while we waited for the lock). Handles that lose that
        # race, or that are open when the timeout fires, are closed.
        def acquire_lock(filename)
          Timeout.timeout(DEFAULT_TIMEOUT_SECS) do
            loop do
              f = File.open(filename, 'a+')
              acquired = false
              begin
                f.flock(File::LOCK_EX)
                if File.identical?(filename, f)
                  acquired = true
                  return f
                end
              ensure
                # closing the handle also drops any lock held through it
                f.close unless acquired
              end

              # we do cover this, but it's called in a subprocess
              # so SimpleCov can't tell we've called it
              # :nocov:
              sleep(DEFAULT_SLEEP_INTERVAL_SECS)
              # :nocov:
            end
          end
        end

      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ require 'faraday_middleware'
2
+ require 'oai/client'
3
+
4
module Merritt
  module TIND
    # Harvests an OAI-PMH feed according to a Config and hands the resulting
    # records to a FeedProcessor.
    class Harvester

      # @return [Config] the configuration this harvester was built from
      attr_reader :config

      # @param config [Config] harvester settings
      # @param dry_run [Boolean] when true, the feed is processed without an ingest server
      def initialize(config, dry_run: false)
        @config = config
        @dry_run = dry_run

        set_str = config.oai_set ? "'#{config.oai_set}'" : '<nil>'
        log.info("Initializing harvester for base URL #{oai_base_uri}, set #{set_str} => collection #{config.mrt_collection_ark}")
      end

      # Lists records from the OAI endpoint and processes them, unless the
      # configured stop file exists.
      #
      # @param from_time [Time, nil] start of the harvest range; when nil, it is
      #   derived from the last harvest (see #determine_from_time)
      # @param until_time [Time, nil] end of the harvest range
      def process_feed!(from_time: nil, until_time: nil)
        return if stop_file_present?

        opts = to_oai_opts(from_time, until_time)
        log.info("harvesting #{query_uri(opts)}")
        resp = oai_client.list_records(opts)
        feed = Feed.new(resp)
        return process_feed(feed, nil) if dry_run?

        with_server { |server| process_feed(feed, server) }
      end

      # Starts a one-time server, yields it, and joins it afterwards.
      def with_server
        server = Mrt::Ingest::OneTimeServer.new
        server.start_server
        yield server
      ensure
        # if the constructor itself failed there is nothing to join, and
        # calling join_server on nil would hide the original error
        server.join_server if server
      end

      def dry_run?
        @dry_run
      end

      # @return [Boolean, nil] truthy if a stop file is configured and exists
      def stop_file_present?
        stop_file_path = config.stop_file_path
        stop_file_present = stop_file_path && stop_file_path.exist?
        log.info("Stop file present: #{config.stop_file_path}") if stop_file_present
        stop_file_present
      end

      def last_harvest
        # read this from the file every time
        LastHarvest.from_file(config.last_harvest_path)
      end

      def oai_client
        @oai_client ||= Harvester.oai_client_for(oai_base_uri)
      end

      def oai_base_uri
        @oai_base_uri ||= URI.parse(config.oai_base_url)
      end

      def mrt_collection_ark
        config.mrt_collection_ark
      end

      def mrt_ingest_profile
        config.mrt_ingest_profile
      end

      def mrt_inv_db
        @mrt_inv_db ||= InventoryDB.from_file(config.db_config_path)
      end

      def mrt_ingest_client
        # TODO: secure way to get username and password?
        @mrt_ingest_client ||= Mrt::Ingest::Client.new(config.mrt_ingest_url)
      end

      def log
        @log ||= Logging.new_logger(log_path, config.log_level)
      end

      # Chooses the start of the harvest range: the explicit +from_time+ if
      # given, otherwise the datestamp of the oldest failed record from the
      # last harvest, otherwise the datestamp of its newest success (which
      # may be nil).
      def determine_from_time(from_time = nil)
        return from_time if from_time

        lh = last_harvest
        oldest_failed = lh.oldest_failed_datestamp
        return oldest_failed if oldest_failed

        lh.newest_success_datestamp
      end

      def log_path
        config.log_path
      end

      private

      def process_feed(feed, server)
        return unless feed

        feed_processor = FeedProcessor.new(feed: feed, server: server, harvester: self)
        feed_processor.process_feed!
      end

      # Builds a request URI for logging purposes: the base URI with a query
      # of `verb=ListRecords` followed by the given options. Values are
      # interpolated as-is (not URL-encoded).
      #
      # @param opts [Hash, nil] the options passed to the OAI client
      # @return [URI]
      def query_uri(opts)
        params = { verb: 'ListRecords' }.merge(opts || {})
        query = params.map { |k, v| "#{k}=#{v}" }.join('&')
        oai_base_uri.merge("?#{query}")
      end

      # @return [Hash] the :from, :until and :set options, omitting nil values
      def to_oai_opts(from_time, until_time)
        from_time = determine_from_time(from_time)
        from_iso8601, until_iso8601 = Times.iso8601_range(from_time, until_time)
        { from: from_iso8601, until: until_iso8601, set: config.oai_set }.compact
      end

      class << self

        # @param config_yml [String] path to the harvester configuration file
        # @param dry_run [Boolean] see #initialize
        # @return [Harvester]
        def from_file(config_yml, dry_run: false)
          config = Config.from_file(config_yml)
          Harvester.new(config, dry_run: dry_run)
        end

        # Builds an OAI client that uses a Faraday connection configured to
        # retry on 503 responses and to follow redirects.
        def oai_client_for(base_uri)
          # Workaround for https://github.com/code4lib/ruby-oai/issues/45
          http_client = Faraday.new(base_uri) do |conn|
            conn.request(:retry, max: 5, retry_statuses: 503)
            conn.response(:follow_redirects, limit: 5)
            conn.adapter(:net_http)
          end
          OAI::Client.new(base_uri.to_s, http: http_client)
        end
      end

    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'mysql2'
2
+
3
module Merritt
  module TIND
    # Thin wrapper around a MySQL connection to the inventory database, used
    # to look up objects by local ID and collection ARK.
    class InventoryDB

      # TODO: is this right or should we be using erc_where? or both?
      EXISTING_OBJECT_SQL = <<~SQL.freeze
        SELECT o.*
        FROM inv_objects AS o
        JOIN inv_collections_inv_objects AS co
        ON co.inv_object_id = o.id
        JOIN inv_collections AS c
        ON c.id = co.inv_collection_id
        JOIN inv_localids AS li
        ON li.inv_object_ark = o.ark
        WHERE li.local_id = ?
        AND c.ark = ?
        LIMIT 1
      SQL

      attr_reader :db_connection

      # @param db_config_h [Hash] connection options passed to Mysql2::Client.new
      def initialize(db_config_h)
        @db_connection = Mysql2::Client.new(db_config_h)
      end

      class << self
        # Connects using the section of the YAML file at +db_config_path+ that
        # matches Config.environment.
        #
        # @raise [RuntimeError] if +db_config_path+ is nil
        # @raise [ArgumentError] if the file does not exist
        def from_file(db_config_path)
          raise "Can't connect to nil database" unless db_config_path

          unless File.exist?(db_config_path)
            raise ArgumentError, "Specified database config #{db_config_path} does not exist"
          end

          all_envs = YAML.load_file(db_config_path)
          InventoryDB.new(all_envs[Config.environment])
        end
      end

      # @param local_id the local identifier bound to the first placeholder
      # @param collection_ark the collection ARK bound to the second placeholder
      # @return [OpenStruct, nil] the first matching row wrapped in an
      #   OpenStruct, or nil if the query returns no rows
      def find_existing_object(local_id, collection_ark)
        row = existing_object_stmt.execute(local_id, collection_ark).first
        row ? OpenStruct.new(row) : nil
      end

      private

      # The prepared statement is created on first use and then reused.
      def existing_object_stmt
        @existing_object_stmt ||= db_connection.prepare(EXISTING_OBJECT_SQL)
      end

    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'yaml'
2
+
3
module Merritt
  module TIND

    # Records the outcome of a harvest: the oldest record that failed and the
    # newest record that succeeded. Instances are updated functionally
    # ({#update} returns a new object) and can be persisted as YAML.
    class LastHarvest

      OLDEST_FAILED = 'oldest_failed'.freeze
      NEWEST_SUCCESS = 'newest_success'.freeze

      attr_reader :oldest_failed
      attr_reader :newest_success

      # @param oldest_failed [Record, nil] the oldest record that failed to submit
      # @param newest_success [Record, nil] the newest record successfully submitted
      def initialize(oldest_failed: nil, newest_success: nil)
        @oldest_failed = oldest_failed
        @newest_success = newest_success
      end

      # @return [Hash] both records as hashes (or nil), keyed by
      #   OLDEST_FAILED and NEWEST_SUCCESS
      def to_h
        {
          OLDEST_FAILED => (oldest_failed && oldest_failed.to_h),
          NEWEST_SUCCESS => (newest_success && newest_success.to_h)
        }
      end

      # @return [String] the YAML form of {#to_h}
      def to_yaml
        to_h.to_yaml
      end

      # Writes the YAML form to +last_harvest_yml+ via Files.rotate_and_lock.
      #
      # @param last_harvest_yml [String] path of the file to write
      def write_to(last_harvest_yml)
        Files.rotate_and_lock(last_harvest_yml) do |f|
          f.write(to_yaml)
        end
      end

      # @return the datestamp of the oldest failed record, or nil if there is none
      def oldest_failed_datestamp
        oldest_failed && oldest_failed.datestamp
      end

      # @return the datestamp of the newest successful record, or nil if there is none
      def newest_success_datestamp
        newest_success && newest_success.datestamp
      end

      # Returns a new LastHarvest that takes the given outcome into account.
      #
      # @param success [Record, nil] a record that was processed successfully
      # @param failure [Record, nil] a record that failed
      # @return [LastHarvest]
      def update(success: nil, failure: nil)
        LastHarvest.new(
          newest_success: Record.later(success, newest_success),
          oldest_failed: Record.earlier(failure, oldest_failed)
        )
      end

      private

      # Copies the wrapped records too, so the copy does not share them with the source.
      def initialize_dup(source)
        @newest_success = source.newest_success && source.newest_success.dup
        @oldest_failed = source.oldest_failed && source.oldest_failed.dup
      end

      def initialize_clone(source)
        @newest_success = source.newest_success && source.newest_success.clone
        @oldest_failed = source.oldest_failed && source.oldest_failed.clone
      end

      class << self
        # Reads the last-harvest state from a YAML file.
        #
        # @param last_harvest_yml [String, Pathname, nil] path to the state file
        # @return [LastHarvest] the stored state, or an empty LastHarvest if
        #   the path is nil, the file does not exist, or the file is empty
        def from_file(last_harvest_yml)
          # A missing last_harvest.yml is normal
          return LastHarvest.new unless last_harvest_yml && File.exist?(last_harvest_yml)

          from_hash(YAML.load_file(last_harvest_yml))
        end

        # @param h [Hash, nil, false] a hash as produced by {#to_h}; a falsy
        #   value (what YAML yields for an empty document) gives an empty LastHarvest
        # @return [LastHarvest]
        def from_hash(h)
          # An empty state file can exist if a previous run created the file
          # but never wrote to it; treat that like a missing file.
          return LastHarvest.new unless h

          LastHarvest.new(
            oldest_failed: Record.from_hash(h[OLDEST_FAILED]),
            newest_success: Record.from_hash(h[NEWEST_SUCCESS])
          )
        end

      end

    end
  end
end
@@ -0,0 +1,42 @@
1
require 'fileutils'
require 'logger'
require 'pathname'
require 'time'
3
+
4
module Merritt
  module TIND
    # Factory for the harvester's loggers.
    module Logging
      NUM_LOG_FILES = 10
      DEFAULT_LOG_LEVEL = Logger::DEBUG

      class << self
        # Formats one log line as tab-separated ISO 8601 timestamp, severity
        # and message. The third argument (the program name supplied by
        # Logger) is ignored.
        #
        # @return [String]
        def fmt_log(severity, datetime, _, msg)
          "#{datetime.iso8601}\t#{severity}\t#{msg}\n"
        end

        # Creates a logger writing to +log_dev+, creating the parent directory
        # of a log file first if it does not exist.
        #
        # @param log_dev [IO, String, Pathname, nil] the log device or log file
        #   path; defaults to STDERR
        # @param log_level [Object, nil] passed to Logger as its level;
        #   defaults to DEFAULT_LOG_LEVEL
        # @return [Logger]
        def new_logger(log_dev = nil, log_level = nil)
          log_dev ||= STDERR
          log_level ||= DEFAULT_LOG_LEVEL

          created_log_dir = ensure_log_dir(log_dev)
          logger = Logger.new(log_dev, NUM_LOG_FILES, level: log_level, formatter: Logging.method(:fmt_log))
          # report directory creation only once there is a logger to report to
          created_log_dir.each { |d| logger.info("Created log directory #{d}") } if created_log_dir
          logger
        end

        private

        def io_like?(log_dev)
          # This is how Ruby's Logger identifies an IO-like log device
          log_dev.respond_to?(:write) && log_dev.respond_to?(:close)
        end

        # @return [Array, nil] the result of FileUtils.mkdir_p if the log
        #   directory had to be created; nil for IO-like devices or when the
        #   directory already exists
        def ensure_log_dir(log_dev)
          return if io_like?(log_dev)

          # assume it's a string or a pathname
          log_dir = Pathname.new(log_dev).parent
          FileUtils.mkdir_p(log_dir) unless log_dir.exist?
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Merritt
  # Identifying information for the harvester gem.
  module TIND
    # Gem name
    NAME = 'mrt-tind-harvester'.freeze

    # Gem version
    VERSION = '0.0.1'.freeze

    # Copyright notice
    COPYRIGHT = 'Copyright (c) 2019 The Regents of the University of California'.freeze
  end
end
@@ -0,0 +1,109 @@
1
+ require 'oai/client'
2
+ require 'time'
3
+
4
module Merritt
  module TIND
    # A harvested record: the identifier and datestamp from the OAI header,
    # plus (optionally) the record's metadata element, from which Dublin
    # Core values are read.
    class Record
      IDENTIFIER = 'identifier'.freeze
      DATESTAMP = 'datestamp'.freeze

      attr_reader :identifier, :datestamp, :metadata

      # @param identifier the record identifier
      # @param datestamp the record datestamp
      # @param oai_metadata the metadata element, if available
      def initialize(identifier:, datestamp:, oai_metadata: nil)
        @identifier = identifier
        @datestamp = datestamp
        @metadata = oai_metadata
      end

      # @return [Hash] the ERC values for this record
      def erc
        # TODO: something smarter when we know the real requirements
        created = dc_dates.first || datestamp
        {
          'what' => identifier,
          'where' => local_id,
          'when' => created,
          'when/created' => created,
          'when/modified' => datestamp
        }
      end

      # @return [Array] text of each dc:identifier element in the metadata
      def dc_identifiers
        @dc_identifiers ||= dc_texts('identifier')
      end

      # @return [Array<Time>] each dc:date element in the metadata, parsed as a Time
      def dc_dates
        @dc_dates ||= dc_texts('date').map { |t| Time.parse(t) }
      end

      # @return [Array] text of each dc:title element in the metadata
      def dc_titles
        @dc_titles ||= dc_texts('title')
      end

      # @return [Array] text of each dc:creator element in the metadata
      def dc_creators
        @dc_creators ||= dc_texts('creator')
      end

      # @return [URI, nil] the first dc:identifier that starts with 'http' and
      #   ends with 'jpg', parsed as a URI; nil if there is none
      def content_uri
        @content_uri ||= begin
          # TODO: something smarter when we know the real requirements
          jpg_url = dc_identifiers.find { |dc_id| dc_id.start_with?('http') && dc_id.end_with?('jpg') }
          jpg_url ? URI.parse(jpg_url) : nil
        end
      end

      # @return the first dc:identifier, falling back to the record identifier
      def local_id
        # TODO: something smarter when we know the real requirements
        dc_identifiers.first || identifier
      end

      # @return [Hash] identifier and datestamp keyed by IDENTIFIER and DATESTAMP
      def to_h
        { IDENTIFIER => identifier, DATESTAMP => datestamp }
      end

      private

      # Text of every dc:<element_name> element found under the metadata.
      def dc_texts(element_name)
        REXML::XPath.match(metadata, ".//dc:#{element_name}").map(&:text)
      end

      class << self

        # @return whichever record has the later datestamp (+r2+ on a tie);
        #   if either is nil, the other one
        def later(r1, r2)
          return r1 if r2.nil?
          return r2 if r1.nil?

          (r1.datestamp <=> r2.datestamp).positive? ? r1 : r2
        end

        # @return whichever record has the earlier datestamp (+r2+ on a tie);
        #   if either is nil, the other one
        def earlier(r1, r2)
          return r1 if r2.nil?
          return r2 if r1.nil?

          (r1.datestamp <=> r2.datestamp).negative? ? r1 : r2
        end

        # @param h [Hash, nil] a hash as produced by {#to_h}
        # @return [Record, nil] nil if +h+ is nil or false
        def from_hash(h)
          h ? Record.new(identifier: h[IDENTIFIER], datestamp: h[DATESTAMP]) : nil
        end

        # Constructs a new {Record} wrapping the specified record.
        #
        # @param oai_record [OAI::Record] An OAI record as returned by `OAI::Client`
        # @raise [ArgumentError] if +oai_record+ is nil
        def from_oai(oai_record)
          raise ArgumentError, "can't parse nil record" unless oai_record

          header = oai_record.header
          raw_datestamp = header.datestamp
          Record.new(
            identifier: header.identifier,
            datestamp: raw_datestamp && Time.parse(raw_datestamp),
            oai_metadata: oai_record.metadata
          )
        end

      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'mrt/ingest'
2
+ require 'ostruct'
3
+
4
module Merritt
  module TIND
    # Decides whether a single record needs to be (re)submitted and, if so,
    # submits it to the ingest service.
    class RecordProcessor

      USER_AGENT = 'Merritt/TIND Harvester'.freeze

      attr_reader :record
      attr_reader :harvester
      attr_reader :server

      # @param record [Record] the record to process
      # @param harvester [Harvester] supplies the log, inventory DB, ingest client and settings
      # @param server the server handed to the ingest object (nil on a dry run)
      def initialize(record, harvester, server)
        @record = record
        @harvester = harvester
        @server = server
      end

      # Processes the record: does nothing if the inventory already holds an
      # object at least as new as the record; otherwise logs it and, unless
      # this is a dry run, submits it for ingest.
      #
      # @return [true]
      def process_record!
        return true if already_up_to_date?

        log.info("Processing record: #{local_id} (content: #{content_uri})")
        return true if harvester.dry_run?

        submit_to_ingest!
      end

      private

      def submit_to_ingest!
        ingest_object.add_component(content_uri)
        response = ingest_object.start_ingest(ingest_client, ingest_profile, USER_AGENT)
        log.info("Batch #{response.batch_id} queued at #{response.submission_date}")
        true # TODO: is there anything in the response that might cause us to return false?
      end

      # True if an existing object was found whose `modified` value is not
      # older than the record's datestamp. Memoized with `defined?` so that a
      # false result is also remembered.
      def already_up_to_date?
        return @already_up_to_date if defined?(@already_up_to_date)

        @already_up_to_date = existing_object && existing_object.modified >= record.datestamp
      end

      # The existing object, or false if none was found (false rather than
      # nil, so that the lookup is only performed once).
      def existing_object
        @existing_object = (find_existing_object || false) if @existing_object.nil?
        @existing_object
      end

      def find_existing_object
        inv_db.find_existing_object(local_id, collection_ark)
      end

      def inv_db
        harvester.mrt_inv_db
      end

      def local_id
        record.local_id
      end

      def content_uri
        record.content_uri
      end

      def collection_ark
        harvester.mrt_collection_ark
      end

      def ingest_client
        harvester.mrt_ingest_client
      end

      def ingest_profile
        harvester.mrt_ingest_profile
      end

      def log
        harvester.log
      end

      def ingest_object
        @ingest_object ||= Mrt::Ingest::IObject.new(
          erc: record.erc,
          server: server,
          local_identifier: record.local_id
        )
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'time'
2
+
3
module Merritt
  module TIND
    # Helpers for turning a pair of optional times into the ISO 8601 strings
    # used for selective harvesting.
    module Times
      class << self
        # Converts both ends of a time range to UTC ISO 8601 strings. The
        # arguments themselves are not modified.
        #
        # @param from_time [Time, nil] start of the range
        # @param until_time [Time, nil] end of the range
        # @return [Array<String, nil>] two elements: the formatted start and
        #   end, each nil if the corresponding argument was nil
        # @raise [RangeError] if both are given and +from_time+ is after +until_time+
        # @raise [ArgumentError] if a non-nil argument does not respond to `utc`
        def iso8601_range(from_time, until_time)
          from_time, until_time = valid_range(from_time, until_time)
          [
            from_time && from_time.iso8601,
            until_time && until_time.iso8601
          ]
        end

        private

        def valid_range(from_time, until_time)
          from_time, until_time = [from_time, until_time].map(&method(:utc_or_nil))
          if from_time && until_time && from_time > until_time
            raise RangeError, "from_time #{from_time} must be <= until_time #{until_time}"
          end

          [from_time, until_time]
        end

        # Returns +time+ in UTC, or nil for a nil/false argument.
        def utc_or_nil(time)
          return unless time
          raise ArgumentError, "time #{time} does not appear to be a Time" unless time.respond_to?(:utc)

          # Time#utc converts the receiver in place, which would silently
          # change the caller's object; getutc returns a new Time instead.
          time.respond_to?(:getutc) ? time.getutc : time.utc
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,298 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mrt-tind-harvester
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - David Moles
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-05-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mrt-ingest
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.5
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.5
27
+ - !ruby/object:Gem::Dependency
28
+ name: mysql2
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.10'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.10'
55
+ - !ruby/object:Gem::Dependency
56
+ name: oai
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.4'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rest-client
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: capistrano
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.4'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.4'
97
+ - !ruby/object:Gem::Dependency
98
+ name: capistrano-bundler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.1'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.1'
111
+ - !ruby/object:Gem::Dependency
112
+ name: database_cleaner
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.5'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.5'
125
+ - !ruby/object:Gem::Dependency
126
+ name: factory_bot
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '4.11'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '4.11'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rake
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '12.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '12.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rspec
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.8'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.8'
167
+ - !ruby/object:Gem::Dependency
168
+ name: rubocop
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '0.68'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '0.68'
181
+ - !ruby/object:Gem::Dependency
182
+ name: rubocop-rspec
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '1.33'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '1.33'
195
+ - !ruby/object:Gem::Dependency
196
+ name: simplecov
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - "~>"
200
+ - !ruby/object:Gem::Version
201
+ version: '0.16'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '0.16'
209
+ - !ruby/object:Gem::Dependency
210
+ name: simplecov-console
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '0.4'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '0.4'
223
+ - !ruby/object:Gem::Dependency
224
+ name: standalone_migrations
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '5.2'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - "~>"
235
+ - !ruby/object:Gem::Version
236
+ version: '5.2'
237
+ - !ruby/object:Gem::Dependency
238
+ name: webmock
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '3.5'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '3.5'
251
+ description: Harvests TIND OAI-PMH feed to identify files for ingest into Merritt
252
+ email:
253
+ - david.moles@ucop.edu
254
+ executables:
255
+ - mrt-tind-harvester
256
+ extensions: []
257
+ extra_rdoc_files: []
258
+ files:
259
+ - bin/mrt-tind-harvester
260
+ - lib/merritt.rb
261
+ - lib/merritt/tind.rb
262
+ - lib/merritt/tind/config.rb
263
+ - lib/merritt/tind/feed.rb
264
+ - lib/merritt/tind/feed_processor.rb
265
+ - lib/merritt/tind/files.rb
266
+ - lib/merritt/tind/harvester.rb
267
+ - lib/merritt/tind/inventory_db.rb
268
+ - lib/merritt/tind/last_harvest.rb
269
+ - lib/merritt/tind/logging.rb
270
+ - lib/merritt/tind/module_info.rb
271
+ - lib/merritt/tind/record.rb
272
+ - lib/merritt/tind/record_processor.rb
273
+ - lib/merritt/tind/times.rb
274
+ homepage: https://github.com/CDLUC3/mrt-tind-harvester
275
+ licenses:
276
+ - MIT
277
+ metadata: {}
278
+ post_install_message:
279
+ rdoc_options: []
280
+ require_paths:
281
+ - lib
282
+ required_ruby_version: !ruby/object:Gem::Requirement
283
+ requirements:
284
+ - - "~>"
285
+ - !ruby/object:Gem::Version
286
+ version: '2.4'
287
+ required_rubygems_version: !ruby/object:Gem::Requirement
288
+ requirements:
289
+ - - ">="
290
+ - !ruby/object:Gem::Version
291
+ version: '0'
292
+ requirements: []
293
+ rubyforge_project:
294
+ rubygems_version: 2.6.14.1
295
+ signing_key:
296
+ specification_version: 4
297
+ summary: TIND harvester for Merritt
298
+ test_files: []