mrt-tind-harvester 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a8a12f69bf55200b7d6b02ee260281c3d1d7b050
4
+ data.tar.gz: 6ea06ac8dbff3b873ef69042512f3d81fb5b2e73
5
+ SHA512:
6
+ metadata.gz: 7fdd4a906e4b22079c5a15f2eab8062dd551e6cc8e3ecf7d8909554b0bdec73ac84f616c12a862f03f854d57e3146880f59f1f82f5d3f7d38c2abce92222d7ee
7
+ data.tar.gz: dddc4f22729901b865d383dcfb63c8a0a04d9f4921a5c5a137ef6c619eff5886842747a6a5ffdae3e168e31bec189681d3e096fbe22b1cce52b7443d79ea0f5e
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ Dir.chdir(__dir__) { require 'bundler/setup' }
4
+
5
+ require 'optparse'
6
+ require 'optparse/time'
7
+ require 'ostruct'
8
+ require 'merritt/tind'
9
+
10
# Help text for each command-line option, keyed by option name.
# Every description is frozen, as is the container itself.
USAGE = OpenStruct.new(
  CONFIG: 'path to configuration file (required)'.freeze,
  DRY_RUN: 'dry run (harvest, but do not submit or update last_harvest)'.freeze,
  FROM: 'start date/time (inclusive) for selective harvesting'.freeze,
  UNTIL: 'end date/time (inclusive) for selective harvesting'.freeze,
  HELP: 'print help and exit'.freeze
).freeze
17
+
18
# Command-line front end for the TIND harvester: parses options, locates the
# configuration file, and runs a single harvest.
class TINDHarvesterApp
  # @return [OpenStruct] parsed options: config_file, from_time, until_time, dry_run, help
  attr_reader :options

  def initialize
    @options = OpenStruct.new(dry_run: false, help: false)
  end

  # @return [OptionParser] the parser for this app's options; built once and reused
  def option_parser
    @option_parser ||= OptionParser.new do |opts|
      opts.on('-c CONFIG', '--config CONFIG', USAGE.CONFIG) { |config_file| options.config_file = config_file }
      opts.on('-f', '--from DATETIME', Time, USAGE.FROM) { |from_time| options.from_time = to_utc_time(from_time) }
      opts.on('-u', '--until DATETIME', Time, USAGE.UNTIL) { |until_time| options.until_time = to_utc_time(until_time) }
      opts.on('-n', '--dry-run', USAGE.DRY_RUN) { options.dry_run = true }
      opts.on('-h', '--help', USAGE.HELP) { options.help = true }
    end
  end

  # Parses the command line (ARGV) and runs the harvest.
  #
  # Prints help and exits with status 0 when help was requested. On any
  # StandardError the error and the help text are written to stderr and the
  # process exits with status 1.
  def do_harvest!
    option_parser.parse!

    print_help_and_exit(0) if options.help

    harvester = Merritt::TIND::Harvester.from_file(config_file, dry_run: options.dry_run)
    warn("Starting harvester; logging to #{harvester.log_path}")
    harvester.process_feed!(from_time: options.from_time, until_time: options.until_time)
  rescue StandardError => e
    # `exit` raises SystemExit, which is not a StandardError, so the exits
    # above are not intercepted here
    warn(e)
    print_help_and_exit(1)
  end

  private

  # Converts a parsed option time to UTC without modifying the parsed value.
  #
  # @param time [Time, nil] the time parsed by OptionParser
  # @return [Time, nil] the same instant in UTC, or nil if no time was given
  def to_utc_time(time)
    time && time.getutc
  end

  # @return [String] the configuration file path from the command line;
  #   exits with an error if it is missing or does not exist
  def config_file
    path = options.config_file
    exit_with_error('No configuration file specified') unless path
    exit_with_error('The specified configuration file does not exist: ' + path) unless File.exist?(path)

    path
  end

  def help
    option_parser.to_s
  end

  def exit_with_error(msg)
    warn(msg)
    print_help_and_exit(1)
  end

  def print_help_and_exit(status)
    warn(help)
    exit(status)
  end
end
72
+
73
+ app = TINDHarvesterApp.new
74
+ app.do_harvest!
@@ -0,0 +1 @@
1
+ Dir.glob(File.expand_path('merritt/*.rb', __dir__)).sort.each(&method(:require))
@@ -0,0 +1 @@
1
+ Dir.glob(File.expand_path('tind/*.rb', __dir__)).sort.each(&method(:require))
@@ -0,0 +1,114 @@
1
+ require 'pathname'
2
+ require 'yaml'
3
+
4
+ module Merritt
5
+ module TIND
6
+ class Config
7
+
8
+ attr_reader :config_h
9
+ attr_reader :config_path
10
+
11
+ def initialize(config_h = nil, config_yml: nil)
12
+ @config_h = config_h || {}
13
+ @config_path = Pathname.new(config_yml).realpath if config_yml
14
+ end
15
+
16
+ def oai_base_url
17
+ oai_config_h['base_url']
18
+ end
19
+
20
+ def oai_set
21
+ oai_config_h['set']
22
+ end
23
+
24
+ def stop_file_path
25
+ @stop_file_path ||= begin
26
+ stop_file = config_h['stop_file']
27
+ resolve_relative_path(stop_file)
28
+ end
29
+ end
30
+
31
+ def mrt_collection_ark
32
+ merritt_config_h['collection_ark']
33
+ end
34
+
35
+ def mrt_ingest_url
36
+ merritt_config_h['ingest_url']
37
+ end
38
+
39
+ def mrt_ingest_profile
40
+ merritt_config_h['ingest_profile']
41
+ end
42
+
43
+ def db_config_path
44
+ @db_config_path ||= begin
45
+ db = merritt_config_h['database']
46
+ resolve_relative_path(db)
47
+ end
48
+ end
49
+
50
+ def log_level
51
+ log_config_h['level']
52
+ end
53
+
54
+ def log_path
55
+ @log_path ||= begin
56
+ lp = log_config_h['file']
57
+ resolve_relative_path(lp)
58
+ end
59
+ end
60
+
61
+ def last_harvest_path
62
+ @last_harvest_path ||= begin
63
+ lh = config_h['last_harvest']
64
+ resolve_relative_path(lh)
65
+ end
66
+ end
67
+
68
+ private
69
+
70
+ def oai_config_h
71
+ config_h['oai'] || {}
72
+ end
73
+
74
+ def merritt_config_h
75
+ config_h['merritt'] || {}
76
+ end
77
+
78
+ def log_config_h
79
+ config_h['log'] || {}
80
+ end
81
+
82
+ def resolve_relative_path(filename)
83
+ return nil unless filename
84
+
85
+ pathname = Pathname.new(filename)
86
+ return pathname if pathname.absolute?
87
+ return pathname unless config_path
88
+
89
+ (config_path.parent + pathname).cleanpath
90
+ end
91
+
92
+ class << self
93
+
94
+ def from_file(config_yml)
95
+ # A missing config.yml is not normal
96
+ raise ArgumentError, "Can't read config from nil file" unless config_yml
97
+ raise ArgumentError, "Specified config file #{config_yml} does not exist" unless File.exist?(config_yml)
98
+
99
+ config_h = YAML.load_file(config_yml)
100
+ env_config = config_h[environment]
101
+ raise ArgumentError, "No configuration for environment '#{environment}' found in #{config_yml}" if env_config.nil? || env_config.empty?
102
+
103
+ Config.new(env_config, config_yml: config_yml)
104
+ end
105
+
106
+ def environment
107
+ %w[HARVESTER_ENV RAILS_ENV RACK_ENV].each { |v| return ENV[v] if ENV[v] }
108
+ 'development'
109
+ end
110
+ end
111
+
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,29 @@
1
+ require 'oai/client'
2
+
3
module Merritt
  module TIND
    # Enumerable view over an OAI response that yields each OAI record
    # converted with {Record.from_oai}.
    class Feed
      include Enumerable

      # @param resp [#each] an OAI response, or any enumerable of OAI records.
      #   A response reporting a resumption token is replaced by its `full` form.
      def initialize(resp)
        @resp = ensure_full_response(resp)
      end

      # Yields one {Record} per OAI record in the response.
      #
      # @return [Enumerator] if no block is given
      def each(&block)
        return to_enum(:each) unless block

        @resp.each do |oai_record|
          block.call(Record.from_oai(oai_record))
        end
      end

      private

      # Returns `resp.full` only when `resp` exposes a non-nil resumption token;
      # anything else is returned untouched.
      def ensure_full_response(resp)
        paginated = resp.respond_to?(:resumption_token) && resp.resumption_token
        paginated ? resp.full : resp
      end

    end
  end
end
@@ -0,0 +1,61 @@
1
module Merritt
  module TIND
    # Runs every record of a feed through a {RecordProcessor}, tracking the newest
    # success and oldest failure, then records that state as the new last harvest.
    class FeedProcessor

      attr_reader :feed
      attr_reader :harvester
      attr_reader :server

      # @param feed [#each] the records to process
      # @param server [Object, nil] passed through to each {RecordProcessor}
      # @param harvester [Harvester] supplies configuration, logging and dry-run state
      def initialize(feed:, server:, harvester:)
        @feed = feed
        @server = server
        @harvester = harvester
      end

      # Processes each record in the feed, then updates the last-harvest file
      # (or only logs, on a dry run).
      def process_feed!
        feed.each do |record|
          process_record(record, server)
        end

        log.debug("Updating #{config.last_harvest_path}:\n#{last_harvest_next.to_yaml.gsub(/^/, "\t")}")
        update_last_harvest!
      end

      private

      def config
        harvester.config
      end

      def log
        harvester.log
      end

      def dry_run?
        harvester.dry_run?
      end

      # The last-harvest state being accumulated for this run, seeded from a copy
      # of the harvester's current last harvest.
      def last_harvest_next
        return @last_harvest_next if @last_harvest_next

        previous = harvester.last_harvest
        @last_harvest_next = previous ? previous.clone : LastHarvest.new
      end

      def update_last_harvest!
        return log.info("Dry run: #{config.last_harvest_path} not updated") if dry_run?

        last_harvest_next.write_to(config.last_harvest_path)
      end

      # Processes one record; any StandardError is logged as a warning and the
      # record is counted as a failure instead of aborting the feed.
      def process_record(record, server)
        RecordProcessor.new(record, harvester, server).process_record!
        @last_harvest_next = last_harvest_next.update(success: record)
      rescue StandardError => e
        # TODO: can we identify failures after submission?
        log.warn(e)
        @last_harvest_next = last_harvest_next.update(failure: record)
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'timeout'
2
+
3
module Merritt
  module TIND
    # File-locking helpers built on File#flock.
    module Files
      DEFAULT_TIMEOUT_SECS = 5
      DEFAULT_SLEEP_INTERVAL_SECS = 0.1

      # Timestamp suffix for rotated files: UTC, millisecond precision
      # (e.g. 2019-05-23T12:34:56.789Z). Uses core strftime so this module
      # does not depend on the 'time' library being loaded elsewhere.
      ROTATED_SUFFIX_FORMAT = '%Y-%m-%dT%H:%M:%S.%LZ'.freeze

      class << self

        # Opens `filename` in append/read mode (creating it if necessary), takes an
        # exclusive lock on it, and yields the open file. Afterwards the file is
        # flushed, unlocked and closed, so nothing written in the block is still
        # buffered when the lock is released.
        #
        # @param filename [String] path of the file to lock
        # @yieldparam f [File] the locked file; only valid inside the block
        # @return [Object] the value of the block
        # @raise [Timeout::Error] if the lock cannot be acquired within
        #   DEFAULT_TIMEOUT_SECS seconds
        def with_lock(filename)
          f = acquire_lock(filename)
          yield f
        ensure
          release(f)
        end

        # Like {with_lock}, but if `filename` already has content it is first
        # renamed with a timestamp suffix, and the block receives a lock on a
        # fresh file at `filename`.
        #
        # @param filename [String] path of the file to rotate and lock
        # @yieldparam f [File] the locked file at `filename`; only valid inside the block
        def rotate_and_lock(filename)
          with_lock(filename) do |f|
            if File.size?(filename)
              rotating(filename) { |f1| yield f1 }
            else
              yield f
            end
          end
        end

        private

        # Flushes, unlocks and closes a file obtained from acquire_lock.
        # Does nothing if the lock was never acquired or the block already
        # closed the file.
        def release(f)
          return unless f
          return if f.closed?

          f.flush # make writes visible before anyone else can take the lock
          f.flock(File::LOCK_UN)
          f.close
        end

        def rotating(filename)
          rotate_to = rotated_name(filename)

          File.rename(filename, rotate_to)
          with_lock(filename) { |f| yield f }
        end

        # Finds an unused "<filename>-<timestamp>" name, waiting briefly and
        # retrying while the candidate name is already taken.
        def rotated_name(filename)
          loop do
            renamed_file = filename + '-' + Time.now.utc.strftime(ROTATED_SUFFIX_FORMAT)
            return renamed_file unless File.exist?(renamed_file)

            sleep(DEFAULT_SLEEP_INTERVAL_SECS)
          end
        end

        # Opens and exclusively locks `filename`, retrying if the file at that
        # path is no longer the one that was locked. Every handle that is not
        # returned (lost the race, or the wait timed out) is closed.
        def acquire_lock(filename)
          Timeout.timeout(DEFAULT_TIMEOUT_SECS) do
            loop do
              f = File.open(filename, 'a+')
              acquired = false
              begin
                f.flock(File::LOCK_EX)
                acquired = File.identical?(filename, f)
                return f if acquired
              ensure
                # closing also drops any lock held through this handle
                f.close unless acquired
              end

              # we do cover this, but it's called in a subprocess
              # so SimpleCov can't tell we've called it
              # :nocov:
              sleep(DEFAULT_SLEEP_INTERVAL_SECS)
              # :nocov:
            end
          end
        end

      end
    end
  end
end
@@ -0,0 +1,138 @@
1
+ require 'faraday_middleware'
2
+ require 'oai/client'
3
+
4
module Merritt
  module TIND
    # Harvests records from an OAI-PMH feed and hands them to a {FeedProcessor}.
    class Harvester

      # @return [Config] the harvester configuration
      attr_reader :config

      # @param config [Config] the harvester configuration
      # @param dry_run [Boolean] when true, the feed is processed without
      #   starting an ingest server
      def initialize(config, dry_run: false)
        @config = config
        @dry_run = dry_run

        set_str = config.oai_set ? "'#{config.oai_set}'" : '<nil>'
        log.info("Initializing harvester for base URL #{oai_base_uri}, set #{set_str} => collection #{config.mrt_collection_ark}")
      end

      # Requests records from the OAI feed and processes them. Does nothing
      # if the configured stop file is present.
      #
      # @param from_time [Time, nil] start of the range; when nil it is
      #   derived from the last harvest (see {#determine_from_time})
      # @param until_time [Time, nil] end of the range
      def process_feed!(from_time: nil, until_time: nil)
        return if stop_file_present?

        opts = to_oai_opts(from_time, until_time)
        log.info("harvesting #{query_uri(opts)}")
        resp = oai_client.list_records(opts)
        feed = Feed.new(resp)
        return process_feed(feed, nil) if dry_run?

        with_server { |server| process_feed(feed, server) }
      end

      # Starts a one-time ingest server, yields it, and joins it afterwards.
      #
      # @yieldparam server [Mrt::Ingest::OneTimeServer] the started server
      def with_server
        server = Mrt::Ingest::OneTimeServer.new
        server.start_server
        yield server
      ensure
        # if the constructor itself raised there is nothing to join, and
        # calling a method on nil here would hide the original error
        server.join_server if server
      end

      def dry_run?
        @dry_run
      end

      # @return [Boolean, nil] truthy if a stop file is configured and exists
      def stop_file_present?
        stop_file_path = config.stop_file_path
        stop_file_present = stop_file_path && stop_file_path.exist?
        log.info("Stop file present: #{config.stop_file_path}") if stop_file_present
        stop_file_present
      end

      # @return [LastHarvest] the last-harvest state, re-read on every call
      def last_harvest
        # read this from the file every time
        LastHarvest.from_file(config.last_harvest_path)
      end

      def oai_client
        @oai_client ||= Harvester.oai_client_for(oai_base_uri)
      end

      def oai_base_uri
        @oai_base_uri ||= URI.parse(config.oai_base_url)
      end

      def mrt_collection_ark
        config.mrt_collection_ark
      end

      def mrt_ingest_profile
        config.mrt_ingest_profile
      end

      def mrt_inv_db
        @mrt_inv_db ||= InventoryDB.from_file(config.db_config_path)
      end

      def mrt_ingest_client
        # TODO: secure way to get username and password?
        @mrt_ingest_client ||= Mrt::Ingest::Client.new(config.mrt_ingest_url)
      end

      def log
        @log ||= Logging.new_logger(log_path, config.log_level)
      end

      # Chooses the start of the harvest range: the explicit time if given,
      # otherwise the datestamp of the oldest failed record, otherwise the
      # datestamp of the newest successful record.
      #
      # @param from_time [Time, nil] an explicit start time
      # @return [Time, nil] the start time, or nil if none can be determined
      def determine_from_time(from_time = nil)
        return from_time if from_time

        lh = last_harvest
        oldest_failed = lh.oldest_failed_datestamp
        return oldest_failed if oldest_failed

        lh.newest_success_datestamp
      end

      def log_path
        config.log_path
      end

      private

      def process_feed(feed, server)
        return unless feed

        feed_processor = FeedProcessor.new(feed: feed, server: server, harvester: self)
        feed_processor.process_feed!
      end

      # Builds the request URI for the given options, for logging only; the
      # request itself is made by the OAI client. The `verb` argument is
      # included so the logged URI is a valid OAI-PMH request.
      def query_uri(opts)
        params = { verb: 'ListRecords' }.merge(opts || {})
        query = params.map { |k, v| "#{k}=#{v}" }.join('&')
        oai_base_uri.merge("?#{query}")
      end

      # Builds the option hash for `list_records`, omitting nil values.
      def to_oai_opts(from_time, until_time)
        from_time = determine_from_time(from_time)
        from_iso8601, until_iso8601 = Times.iso8601_range(from_time, until_time)
        { from: from_iso8601, until: until_iso8601, set: config.oai_set }.compact
      end

      class << self

        # @param config_yml [String, Pathname] path to the configuration file
        # @param dry_run [Boolean] see {#initialize}
        # @return [Harvester] a harvester for the configuration in that file
        def from_file(config_yml, dry_run: false)
          config = Config.from_file(config_yml)
          Harvester.new(config, dry_run: dry_run)
        end

        # Builds an OAI client whose HTTP connection retries on 503 responses
        # and follows redirects.
        def oai_client_for(base_uri)
          # Workaround for https://github.com/code4lib/ruby-oai/issues/45
          http_client = Faraday.new(base_uri) do |conn|
            conn.request(:retry, max: 5, retry_statuses: 503)
            conn.response(:follow_redirects, limit: 5)
            conn.adapter(:net_http)
          end
          OAI::Client.new(base_uri.to_s, http: http_client)
        end
      end

    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'mysql2'
2
+
3
require 'ostruct'
require 'yaml'

module Merritt
  module TIND
    # Thin wrapper around a MySQL connection to the inventory database.
    class InventoryDB

      # @return [Mysql2::Client] the underlying connection
      attr_reader :db_connection

      # @param db_config_h [Hash] connection options passed to Mysql2::Client.new
      def initialize(db_config_h)
        @db_connection = Mysql2::Client.new(db_config_h)
      end

      class << self
        # Connects using the section of a YAML database configuration file
        # that matches the current {Config.environment}.
        #
        # @param db_config_path [String, Pathname] path to the database configuration
        # @return [InventoryDB] a connected instance
        # @raise [RuntimeError] if the path is nil
        # @raise [ArgumentError] if the file does not exist, or contains no
        #   configuration for the current environment
        def from_file(db_config_path)
          raise "Can't connect to nil database" unless db_config_path
          raise ArgumentError, "Specified database config #{db_config_path} does not exist" unless File.exist?(db_config_path)

          db_config = YAML.load_file(db_config_path)
          environment = Config.environment
          # an empty file loads as nil/false; treat it like a missing environment
          env_db_config = db_config.is_a?(Hash) ? db_config[environment] : nil
          raise ArgumentError, "No database configuration for environment '#{environment}' found in #{db_config_path}" unless env_db_config

          InventoryDB.new(env_db_config)
        end
      end

      # Looks up an object by local ID within a collection.
      #
      # @param local_id [String] the local identifier
      # @param collection_ark [String] the ARK of the collection
      # @return [OpenStruct, nil] the first matching inv_objects row, or nil if none
      def find_existing_object(local_id, collection_ark)
        result = existing_object_stmt.execute(local_id, collection_ark).first
        return nil unless result

        OpenStruct.new(result)
      end

      private

      # TODO: is this right or should we be using erc_where? or both?
      EXISTING_OBJECT_SQL = <<~SQL.freeze
        SELECT o.*
          FROM inv_objects AS o
               JOIN inv_collections_inv_objects AS co
               ON co.inv_object_id = o.id
               JOIN inv_collections AS c
               ON c.id = co.inv_collection_id
               JOIN inv_localids AS li
               ON li.inv_object_ark = o.ark
         WHERE li.local_id = ?
           AND c.ark = ?
         LIMIT 1
      SQL

      # Prepared once per connection and reused
      def existing_object_stmt
        @existing_object_stmt ||= db_connection.prepare(EXISTING_OBJECT_SQL)
      end

    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'yaml'
2
+
3
module Merritt
  module TIND

    # The state carried from one harvest to the next: the oldest record that
    # failed and the newest record that succeeded.
    class LastHarvest

      OLDEST_FAILED = 'oldest_failed'.freeze
      NEWEST_SUCCESS = 'newest_success'.freeze

      attr_reader :oldest_failed
      attr_reader :newest_success

      # @param oldest_failed [Record, nil] the oldest record that failed to submit
      # @param newest_success [Record, nil] the newest record successfully submitted
      def initialize(oldest_failed: nil, newest_success: nil)
        @oldest_failed = oldest_failed
        @newest_success = newest_success
      end

      # @return [Hash] both records as hashes (or nil), keyed by
      #   OLDEST_FAILED and NEWEST_SUCCESS
      def to_h
        {
          OLDEST_FAILED => (oldest_failed && oldest_failed.to_h),
          NEWEST_SUCCESS => (newest_success && newest_success.to_h)
        }
      end

      def to_yaml
        to_h.to_yaml
      end

      # Writes this state as YAML via {Files.rotate_and_lock}.
      #
      # @param last_harvest_yml [String, Pathname] the file to write
      def write_to(last_harvest_yml)
        Files.rotate_and_lock(last_harvest_yml) do |f|
          f.write(to_yaml)
        end
      end

      # @return [Object, nil] the datestamp of the oldest failed record, if any
      def oldest_failed_datestamp
        oldest_failed && oldest_failed.datestamp
      end

      # @return [Object, nil] the datestamp of the newest successful record, if any
      def newest_success_datestamp
        newest_success && newest_success.datestamp
      end

      # Returns a new LastHarvest that also accounts for the given records;
      # the receiver is not modified.
      #
      # @param success [Record, nil] a record that was processed successfully
      # @param failure [Record, nil] a record that failed
      # @return [LastHarvest] the updated state
      def update(success: nil, failure: nil)
        LastHarvest.new(
          newest_success: Record.later(success, newest_success),
          oldest_failed: Record.earlier(failure, oldest_failed)
        )
      end

      private

      # Copy the records too, so a dup does not share them with its source
      def initialize_dup(source)
        @newest_success = source.newest_success && source.newest_success.dup
        @oldest_failed = source.oldest_failed && source.oldest_failed.dup
      end

      # Copy the records too, so a clone does not share them with its source
      def initialize_clone(source)
        @newest_success = source.newest_success && source.newest_success.clone
        @oldest_failed = source.oldest_failed && source.oldest_failed.clone
      end

      class << self
        # @param last_harvest_yml [String, Pathname, nil] the last-harvest file
        # @return [LastHarvest] the state read from the file, or an empty
        #   state if the path is nil or the file is missing or empty
        def from_file(last_harvest_yml)
          return from_hash(YAML.load_file(last_harvest_yml)) if last_harvest_yml && File.exist?(last_harvest_yml)

          # A missing last_harvest.yml is normal
          LastHarvest.new
        end

        # @param h [Hash, nil, false] a hash as produced by {#to_h}. nil/false
        #   (what an empty YAML file loads as) yields an empty state.
        # @return [LastHarvest]
        def from_hash(h)
          # Locking opens the file in append mode, which can leave an empty
          # file behind; an empty file carries no history
          return LastHarvest.new unless h

          LastHarvest.new(
            oldest_failed: Record.from_hash(h[OLDEST_FAILED]),
            newest_success: Record.from_hash(h[NEWEST_SUCCESS])
          )
        end

      end

    end
  end
end
@@ -0,0 +1,42 @@
1
+ require 'logger'
2
+ require 'time'
3
+
4
require 'fileutils'
require 'pathname'

module Merritt
  module TIND
    # Factory for the harvester's loggers.
    module Logging
      NUM_LOG_FILES = 10
      DEFAULT_LOG_LEVEL = Logger::DEBUG

      class << self
        # Formats a log entry as tab-separated ISO 8601 time, severity and message.
        # The program name (third argument) is ignored.
        def fmt_log(severity, datetime, _, msg)
          "#{datetime.iso8601}\t#{severity}\t#{msg}\n"
        end

        # Creates a logger that formats entries with {fmt_log}.
        #
        # When `log_dev` is a path, its parent directory is created if needed
        # and NUM_LOG_FILES is passed to Logger as the number of files to keep.
        #
        # @param log_dev [IO, String, Pathname, nil] where to log; defaults to STDERR
        # @param log_level [Object, nil] a level accepted by Logger;
        #   defaults to DEFAULT_LOG_LEVEL
        # @return [Logger] the new logger
        def new_logger(log_dev = nil, log_level = nil)
          log_dev ||= STDERR
          log_level ||= DEFAULT_LOG_LEVEL

          created_log_dir = ensure_log_dir(log_dev)
          logger = Logger.new(log_dev, NUM_LOG_FILES, level: log_level, formatter: Logging.method(:fmt_log))
          # the directory had to exist before the logger could be created, so
          # its creation can only be reported now
          created_log_dir.each { |d| logger.info("Created log directory #{d}") } if created_log_dir
          logger
        end

        private

        def io_like?(log_dev)
          # This is how Ruby's Logger identifies an IO-like log device
          log_dev.respond_to?(:write) && log_dev.respond_to?(:close)
        end

        # Creates the parent directory of a log file path if it does not exist.
        #
        # @return [Array<String>, nil] the directories created, or nil if the
        #   device is IO-like or the directory already existed
        def ensure_log_dir(log_dev)
          return if io_like?(log_dev)

          # assume it's a string or a pathname
          log_dir = Pathname.new(log_dev).parent
          FileUtils.mkdir_p(log_dir) unless log_dir.exist?
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ module Merritt
2
+ module TIND
3
+ NAME = 'mrt-tind-harvester'.freeze
4
+ VERSION = '0.0.1'.freeze
5
+ COPYRIGHT = 'Copyright (c) 2019 The Regents of the University of California'.freeze
6
+ end
7
+ end
@@ -0,0 +1,109 @@
1
+ require 'oai/client'
2
+ require 'time'
3
+
4
module Merritt
  module TIND
    # A harvested record: its OAI identifier and datestamp, plus (when built
    # from an OAI response) its metadata.
    class Record
      IDENTIFIER = 'identifier'.freeze
      DATESTAMP = 'datestamp'.freeze

      attr_reader :identifier
      attr_reader :datestamp
      attr_reader :metadata

      # @param identifier [String] the OAI identifier
      # @param datestamp [Time, nil] the OAI datestamp
      # @param oai_metadata [Object, nil] the record's metadata element,
      #   as queried with REXML::XPath
      def initialize(identifier:, datestamp:, oai_metadata: nil)
        @identifier = identifier
        @datestamp = datestamp
        @metadata = oai_metadata
      end

      # @return [Hash] ERC fields (what/where/when) for this record
      def erc
        # TODO: something smarter when we know the real requirements
        {
          'what' => identifier,
          'where' => local_id,
          'when' => dc_dates.first || datestamp,
          'when/created' => dc_dates.first || datestamp,
          'when/modified' => datestamp
        }
      end

      # @return [Array<String>] text of each non-empty dc:identifier element
      def dc_identifiers
        @dc_identifiers ||= dc_texts('.//dc:identifier')
      end

      # @return [Array<Time>] each non-empty dc:date element, parsed with Time.parse
      def dc_dates
        @dc_dates ||= dc_texts('.//dc:date').map { |t| Time.parse(t) }
      end

      # @return [Array<String>] text of each non-empty dc:title element
      def dc_titles
        @dc_titles ||= dc_texts('.//dc:title')
      end

      # @return [Array<String>] text of each non-empty dc:creator element
      def dc_creators
        @dc_creators ||= dc_texts('.//dc:creator')
      end

      # @return [URI, nil] the first dc:identifier that starts with 'http'
      #   and ends with 'jpg', or nil if there is none
      def content_uri
        @content_uri ||= begin
          # TODO: something smarter when we know the real requirements
          content_url = dc_identifiers.find do |dc_id|
            dc_id.start_with?('http') && dc_id.end_with?('jpg')
          end
          content_url && URI.parse(content_url)
        end
      end

      # @return [String] the first dc:identifier, or the OAI identifier if there is none
      def local_id
        # TODO: something smarter when we know the real requirements
        dc_identifiers.first || identifier
      end

      # @return [Hash] identifier and datestamp, keyed by IDENTIFIER and DATESTAMP
      def to_h
        { IDENTIFIER => identifier, DATESTAMP => datestamp }
      end

      private

      # Text content of every element matching the XPath. Elements with no
      # text (e.g. `<dc:date/>`) have nil text and are skipped, so callers
      # never have to parse or inspect nil.
      def dc_texts(xpath)
        REXML::XPath.match(metadata, xpath).map(&:text).compact
      end

      class << self

        # @return [Record, nil] whichever record has the later datestamp
        #   (r2 on a tie); the other one if either is nil
        def later(r1, r2)
          return r1 if r2.nil?
          return r2 if r1.nil?
          return r1 if (r1.datestamp <=> r2.datestamp) > 0

          r2
        end

        # @return [Record, nil] whichever record has the earlier datestamp
        #   (r2 on a tie); the other one if either is nil
        def earlier(r1, r2)
          return r1 if r2.nil?
          return r2 if r1.nil?
          return r1 if (r1.datestamp <=> r2.datestamp) < 0

          r2
        end

        # @param h [Hash, nil] a hash as produced by {#to_h}
        # @return [Record, nil] the record, without metadata; nil if h is nil
        def from_hash(h)
          return unless h

          Record.new(identifier: h[IDENTIFIER], datestamp: h[DATESTAMP])
        end

        # Constructs a new {Record} wrapping the specified record.
        #
        # @param oai_record [OAI::Record] An OAI record as returned by `OAI::Client`
        # @raise [ArgumentError] if the record is nil
        def from_oai(oai_record)
          raise ArgumentError, "can't parse nil record" unless oai_record

          header = oai_record.header
          identifier = header.identifier
          datestamp = header.datestamp && Time.parse(header.datestamp)
          Record.new(identifier: identifier, datestamp: datestamp, oai_metadata: oai_record.metadata)
        end

      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'mrt/ingest'
2
+ require 'ostruct'
3
+
4
module Merritt
  module TIND
    # Submits a single {Record} for ingest, unless the inventory database
    # already holds an up-to-date copy.
    class RecordProcessor

      USER_AGENT = 'Merritt/TIND Harvester'.freeze

      attr_reader :record
      attr_reader :harvester
      attr_reader :server

      # @param record [Record] the record to process
      # @param harvester [Harvester] supplies the inventory database, ingest
      #   client and profile, collection ARK, logger and dry-run state
      # @param server [Object, nil] the server passed to the ingest object
      def initialize(record, harvester, server)
        @record = record
        @harvester = harvester
        @server = server
      end

      # Processes the record: skips it if already up to date, logs it, and
      # (unless this is a dry run) submits it for ingest.
      #
      # @return [true]
      def process_record!
        return true if already_up_to_date?

        log.info("Processing record: #{local_id} (content: #{content_uri})")
        return true if harvester.dry_run?

        submit_to_ingest!
      end

      private

      def submit_to_ingest!
        ingest_object.add_component(content_uri)
        response = ingest_object.start_ingest(ingest_client, ingest_profile, USER_AGENT)
        log.info("Batch #{response.batch_id} queued at #{response.submission_date}")
        true # TODO: is there anything in the response that might cause us to return false?
      end

      # Truthy when an object with this local ID exists in the collection and
      # was modified at or after the record's datestamp.
      def already_up_to_date?
        @already_up_to_date ||= existing_object && existing_object.modified >= record.datestamp
      end

      # Memoizes the lookup, storing false for "not found" so that a miss
      # is not looked up again.
      def existing_object
        @existing_object = (find_existing_object || false) if @existing_object.nil?
        @existing_object
      end

      def find_existing_object
        inv_db.find_existing_object(local_id, collection_ark)
      end

      def inv_db
        harvester.mrt_inv_db
      end

      def local_id
        record.local_id
      end

      def content_uri
        record.content_uri
      end

      def collection_ark
        harvester.mrt_collection_ark
      end

      def ingest_client
        harvester.mrt_ingest_client
      end

      def ingest_profile
        harvester.mrt_ingest_profile
      end

      def log
        harvester.log
      end

      def ingest_object
        @ingest_object ||= Mrt::Ingest::IObject.new(
          erc: record.erc,
          server: server,
          local_identifier: record.local_id
        )
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'time'
2
+
3
module Merritt
  module TIND
    # Helpers for building time ranges.
    module Times
      class << self
        # Converts a time range to a pair of ISO 8601 strings in UTC.
        # The times passed in are not modified.
        #
        # @param from_time [Time, nil] start of the range
        # @param until_time [Time, nil] end of the range
        # @return [Array<String, nil>] `[from, until]`, with nil for each nil input
        # @raise [ArgumentError] if a non-nil argument does not respond to `utc`
        # @raise [RangeError] if both are given and from_time is after until_time
        def iso8601_range(from_time, until_time)
          from_time, until_time = valid_range(from_time, until_time)
          [
            from_time && from_time.iso8601,
            until_time && until_time.iso8601
          ]
        end

        private

        def valid_range(from_time, until_time)
          from_time, until_time = [from_time, until_time].map(&method(:utc_or_nil))
          if from_time && until_time
            raise RangeError, "from_time #{from_time} must be <= until_time #{until_time}" if from_time > until_time
          end

          [from_time, until_time]
        end

        def utc_or_nil(time)
          # Time#utc converts the receiver in place, so convert a copy and
          # leave the caller's object (which may also be frozen) untouched
          return time.dup.utc if time.respond_to?(:utc)
          return unless time

          raise ArgumentError, "time #{time} does not appear to be a Time"
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,298 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mrt-tind-harvester
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - David Moles
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-05-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mrt-ingest
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.0.5
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.0.5
27
+ - !ruby/object:Gem::Dependency
28
+ name: mysql2
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.10'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.10'
55
+ - !ruby/object:Gem::Dependency
56
+ name: oai
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.4'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.4'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rest-client
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: capistrano
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.4'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.4'
97
+ - !ruby/object:Gem::Dependency
98
+ name: capistrano-bundler
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.1'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.1'
111
+ - !ruby/object:Gem::Dependency
112
+ name: database_cleaner
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.5'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.5'
125
+ - !ruby/object:Gem::Dependency
126
+ name: factory_bot
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '4.11'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '4.11'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rake
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '12.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '12.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rspec
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '3.8'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '3.8'
167
+ - !ruby/object:Gem::Dependency
168
+ name: rubocop
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '0.68'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '0.68'
181
+ - !ruby/object:Gem::Dependency
182
+ name: rubocop-rspec
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '1.33'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '1.33'
195
+ - !ruby/object:Gem::Dependency
196
+ name: simplecov
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - "~>"
200
+ - !ruby/object:Gem::Version
201
+ version: '0.16'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '0.16'
209
+ - !ruby/object:Gem::Dependency
210
+ name: simplecov-console
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '0.4'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '0.4'
223
+ - !ruby/object:Gem::Dependency
224
+ name: standalone_migrations
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '5.2'
230
+ type: :development
231
+ prerelease: false
232
+ version_requirements: !ruby/object:Gem::Requirement
233
+ requirements:
234
+ - - "~>"
235
+ - !ruby/object:Gem::Version
236
+ version: '5.2'
237
+ - !ruby/object:Gem::Dependency
238
+ name: webmock
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '3.5'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '3.5'
251
+ description: Harvests TIND OAI-PMH feed to identify files for ingest into Merritt
252
+ email:
253
+ - david.moles@ucop.edu
254
+ executables:
255
+ - mrt-tind-harvester
256
+ extensions: []
257
+ extra_rdoc_files: []
258
+ files:
259
+ - bin/mrt-tind-harvester
260
+ - lib/merritt.rb
261
+ - lib/merritt/tind.rb
262
+ - lib/merritt/tind/config.rb
263
+ - lib/merritt/tind/feed.rb
264
+ - lib/merritt/tind/feed_processor.rb
265
+ - lib/merritt/tind/files.rb
266
+ - lib/merritt/tind/harvester.rb
267
+ - lib/merritt/tind/inventory_db.rb
268
+ - lib/merritt/tind/last_harvest.rb
269
+ - lib/merritt/tind/logging.rb
270
+ - lib/merritt/tind/module_info.rb
271
+ - lib/merritt/tind/record.rb
272
+ - lib/merritt/tind/record_processor.rb
273
+ - lib/merritt/tind/times.rb
274
+ homepage: https://github.com/CDLUC3/mrt-tind-harvester
275
+ licenses:
276
+ - MIT
277
+ metadata: {}
278
+ post_install_message:
279
+ rdoc_options: []
280
+ require_paths:
281
+ - lib
282
+ required_ruby_version: !ruby/object:Gem::Requirement
283
+ requirements:
284
+ - - "~>"
285
+ - !ruby/object:Gem::Version
286
+ version: '2.4'
287
+ required_rubygems_version: !ruby/object:Gem::Requirement
288
+ requirements:
289
+ - - ">="
290
+ - !ruby/object:Gem::Version
291
+ version: '0'
292
+ requirements: []
293
+ rubyforge_project:
294
+ rubygems_version: 2.6.14.1
295
+ signing_key:
296
+ specification_version: 4
297
+ summary: TIND harvester for Merritt
298
+ test_files: []