pushmi_pullyu 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ require 'fileutils'
2
+
3
+ module PushmiPullyu::AIP
4
+ class NoidInvalid < StandardError; end
5
+
6
# Builds the AIP for +noid+ inside the configured work directory, yields the
# path of the finished tar file to the caller's block, then deletes both the
# work directory and the tar file.
#
# Cleanup runs in an +ensure+ clause so a failure while downloading, bagging,
# or inside the caller's block does not leave work files behind.
#
# Raises NoidInvalid when +noid+ is blank, contains a '/', or is '.' / '..'
# (each of those would make the work paths point outside "<workdir>/<noid>",
# and those paths are later passed to rm_rf).
def create(noid)
  raise NoidInvalid if noid.blank? || noid.include?('/') || ['.', '..'].include?(noid)

  aip_directory = "#{PushmiPullyu.options[:workdir]}/#{noid}"
  aip_filename = "#{aip_directory}.tar"

  PushmiPullyu::AIP::Downloader.new(noid, aip_directory).run
  PushmiPullyu::AIP::Creator.new(noid, aip_directory, aip_filename).run

  yield aip_filename
ensure
  # Both locals are still nil when validation failed, so guard before touching the disk.
  FileUtils.rm_rf(aip_filename) if aip_filename && File.exist?(aip_filename)
  FileUtils.rm_rf(aip_directory) if aip_directory && File.exist?(aip_directory)
end
20
+
21
+ module_function :create
22
+ end
@@ -0,0 +1,44 @@
1
+ require 'minitar'
2
+ require 'bagit'
3
+ require 'fileutils'
4
+
5
+ class PushmiPullyu::AIP::Creator
6
+
7
+ class BagInvalid < StandardError; end
8
+
9
+ # Assumption: the AIP has already been downloaded
10
+
11
# Remembers the object identifier, the directory holding the downloaded AIP,
# and the path the tar archive will be written to.
def initialize(noid, aip_directory, aip_filename)
  @noid          = noid
  @aip_directory = aip_directory
  @aip_filename  = aip_filename
end
16
+
17
# Turns the AIP directory into a bag, then packs it into the tar archive.
# Returns whatever the tar step returns.
def run
  bag_aip
  tar_bag
end
21
+
22
+ private
23
+
24
# Wraps the AIP directory in a BagIt bag, generates its manifests, and raises
# BagInvalid if the resulting bag does not validate.
def bag_aip
  aip_bag = BagIt::Bag.new(@aip_directory)
  aip_bag.manifest!
  return if aip_bag.valid?

  raise BagInvalid
end
29
+
30
# Packs the noid directory into the tar archive.
def tar_bag
  # The archive must contain only the noid directory, not the whole path down
  # from the work directory, so packing happens from inside the work directory
  # and refers to the noid directory by its relative name.

  # The archive path may be relative, so resolve it to an absolute path before
  # the working directory changes.
  archive_path = File.expand_path(@aip_filename)
  work_directory = PushmiPullyu.options[:workdir]

  Dir.chdir(work_directory) do
    archive_io = File.open(archive_path, 'wb')
    Minitar.pack(@noid, archive_io)
  end
end
43
+
44
+ end
@@ -0,0 +1,177 @@
1
+ require 'fileutils'
2
+ require 'ostruct'
3
+ require 'rdf'
4
+ require 'rdf/n3'
5
+
6
+ # Download all of the metadata/datastreams and associated data
7
+ # related to an object
8
+ class PushmiPullyu::AIP::Downloader
9
+
10
+ class NoContentFilename < StandardError; end
11
+
12
# Remembers the object identifier and the directory the AIP is downloaded into.
def initialize(noid, aip_directory)
  @noid          = noid
  @aip_directory = aip_directory
end
16
+
17
# Downloads everything that makes up the AIP: the fixed set of metadata and
# datastream files, then the content file (whose local name comes from the
# already-downloaded object metadata), then the permission objects.
def run
  make_directories

  PushmiPullyu.logger.info("#{@noid}: Retreiving data from Fedora ...")

  %i[main_object fixity content_datastream_metadata versions thumbnail
     characterization fedora3foxml fedora3foxml_metadata].each do |key|
    download_and_log(aip_paths[key], PushmiPullyu::AIP::FedoraFetcher.new(@noid))
  end

  # The content filename is read from the main object metadata fetched above
  content_spec = OpenStruct.new(remote: '/content', local: content_filename, optional: false)
  download_and_log(content_spec, PushmiPullyu::AIP::FedoraFetcher.new(@noid))

  download_permissions
end
38
+
39
+ private
40
+
41
# Downloads one item described by +path_spec+ (remote suffix, local path,
# optional flag) through +fedora_fetcher+, logging the attempt before and the
# outcome after.
#
# A local file ending in ".n3" is fetched as RDF, which makes the fetcher send
# its N3 Accept header. Other files are fetched as-is.
# The original test was negated, so it flagged exactly the non-.n3 files as RDF.
def download_and_log(path_spec, fedora_fetcher)
  output_file = path_spec.local

  log_fetching(fedora_fetcher.object_url(path_spec.remote), output_file)

  is_success = fedora_fetcher.download_object(output_file,
                                              url_extra: path_spec.remote,
                                              optional: path_spec.optional,
                                              is_rdf: output_file.end_with?('.n3'))
  log_saved(is_success, output_file)
end
54
+
55
# Asks Solr for the ids of the permission objects attached to this noid and
# downloads each of them, logging when none are found.
def download_permissions
  PushmiPullyu.logger.info("#{@noid}: looking up permissions from Solr ...")
  permission_ids = PushmiPullyu::AIP::SolrFetcher.new(@noid).fetch_permission_object_ids
  return PushmiPullyu.logger.info("#{@noid}: permissions not found") if permission_ids.empty?

  permission_ids.each do |permission_id|
    PushmiPullyu.logger.info("#{@noid}: permission object #{permission_id} found")
    download_permission(permission_id)
  end
end
67
+
68
# Downloads a single permission object (fetched by its own id, at its base
# URL) into the metadata directory as "permission_<id>.n3".
def download_permission(permission_id)
  target = "#{aip_dirs.metadata}/permission_#{permission_id}.n3"
  permission_spec = OpenStruct.new(remote: nil, local: target, optional: false)

  download_and_log(permission_spec, PushmiPullyu::AIP::FedoraFetcher.new(permission_id))
end
76
+
77
+ ### Logging
78
+
79
# Records in the AIP activity log that +output_file+ is about to be fetched from +url+.
def log_fetching(url, output_file)
  PushmiPullyu::Logging.log_aip_activity(
    @aip_directory,
    "#{@noid}: #{output_file} -- fetching from #{url} ..."
  )
end
83
+
84
# Records in the AIP activity log whether +output_file+ was saved or not found.
def log_saved(is_success, output_file)
  outcome = is_success ? 'saved' : 'not_found'
  PushmiPullyu::Logging.log_aip_activity(@aip_directory, "#{@noid}: #{output_file} -- #{outcome}")
end
88
+
89
+ ### Directories
90
+
91
# Memoized set of directories that make up the AIP layout under the AIP directory.
def aip_dirs
  @aip_dirs ||= begin
    data_root = "#{@aip_directory}/data"
    objects_root = "#{data_root}/objects"

    OpenStruct.new(
      objects: objects_root,
      metadata: "#{objects_root}/metadata",
      logs: "#{data_root}/logs",
      thumbnails: "#{data_root}/thumbnails"
    )
  end
end
99
+
100
# Starts from a clean slate, then creates every directory of the AIP layout.
def make_directories
  clean_directories
  PushmiPullyu.logger.debug("#{@noid}: Creating directories ...")
  aip_dirs.to_h.each_value { |directory| FileUtils.mkdir_p(directory) }
  PushmiPullyu.logger.debug("#{@noid}: Creating directories done")
end
108
+
109
# Removes a leftover AIP directory, if one exists.
def clean_directories
  if File.exist?(@aip_directory)
    PushmiPullyu.logger.debug("#{@noid}: Nuking directories ...")
    FileUtils.rm_rf(@aip_directory)
  end
end
114
+
115
+ ### Files
116
+
117
# Memoized, frozen table of the fixed downloads. Each entry gives the remote
# URL suffix (nil means the object's base URL), the local destination, and
# whether the item is optional.
def aip_paths
  @aip_paths ||= begin
    metadata_dir = aip_dirs.metadata
    logs_dir = aip_dirs.logs
    entry = lambda do |remote, local, optional|
      OpenStruct.new(remote: remote, local: local, optional: optional)
    end

    OpenStruct.new(
      # Required downloads
      main_object: entry.call(nil, "#{metadata_dir}/object_metadata.n3", false),
      fixity: entry.call('/content/fcr:fixity', "#{logs_dir}/content_fixity_report.n3", false),
      content_datastream_metadata: entry.call('/content/fcr:metadata',
                                              "#{metadata_dir}/content_fcr_metadata.n3", false),
      versions: entry.call('/content/fcr:versions', "#{metadata_dir}/content_versions.n3", false),

      # Optional downloads
      thumbnail: entry.call('/thumbnail', "#{aip_dirs.thumbnails}/thumbnail", true),
      characterization: entry.call('/characterization', "#{logs_dir}/content_characterization.n3", true),
      fedora3foxml: entry.call('/fedora3foxml', "#{metadata_dir}/fedora3foxml.xml", true),
      fedora3foxml_metadata: entry.call('/fedora3foxml/fcr:metadata', "#{metadata_dir}/fedora3foxml.n3", true)
    ).freeze
  end
end
163
+
164
# Extract the content filename from the downloaded main object metadata and
# return the local path it should be saved to inside the objects directory.
#
# Only the base name of the recorded filename is used: the value comes from
# repository metadata, and any directory components in it ("../", absolute
# paths) would otherwise place the download outside the objects directory.
#
# Raises NoContentFilename when the metadata has no downloadFilename statement.
def content_filename
  filename_predicate = RDF::URI('info:fedora/fedora-system:def/model#downloadFilename')

  graph = RDF::Graph.load(aip_paths.main_object.local)

  # The first matching statement wins; returning from inside the block ends the query.
  graph.query(predicate: filename_predicate) do |statement|
    return "#{aip_dirs.objects}/#{File.basename(statement.object.to_s)}"
  end

  raise NoContentFilename
end
176
+
177
+ end
@@ -0,0 +1,59 @@
1
+ require 'net/http'
2
+
3
+ class PushmiPullyu::AIP::FedoraFetcher
4
+
5
+ class FedoraFetchError < StandardError; end
6
+
7
+ RDF_FORMAT = 'text/rdf+n3'.freeze
8
+
9
# Remembers the identifier of the object this fetcher downloads.
def initialize(noid)
  @noid = noid
end
12
+
13
# Builds the object's URL from the configured Fedora URL, the base path and
# the object's pairtree path, with +url_extra+ appended when given.
def object_url(url_extra = nil)
  root = "#{PushmiPullyu.options[:fedora][:url]}#{base_path}/#{pairtree}"
  url_extra ? root + url_extra : root
end
18
+
19
# Downloads the object (or the sub-resource named by +url_extra+) and writes
# the response body to +download_path+.
#
# download_path - local file to write (binary mode).
# url_extra     - suffix appended to the object URL, or nil for the object itself.
# optional      - when true, a 404 returns false instead of raising.
# is_rdf        - when true, the request asks for RDF_FORMAT via the Accept header.
#
# Returns true on success, false for an optional item that was not found.
# Raises FedoraFetchError for any other response (or a 404 on a required item).
def download_object(download_path, url_extra: nil,
                    optional: false, is_rdf: false)
  uri = URI(object_url(url_extra))

  request = Net::HTTP::Get.new(uri)
  request.basic_auth(PushmiPullyu.options[:fedora][:user],
                     PushmiPullyu.options[:fedora][:password])

  request['Accept'] = RDF_FORMAT if is_rdf

  # Without use_ssl an https URL would be spoken to in plain HTTP and fail.
  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
    http.request(request)
  end

  case response
  when Net::HTTPSuccess
    # Block form closes the file even if the write raises.
    File.open(download_path, 'wb') { |file| file.write(response.body) }
    true
  when Net::HTTPNotFound
    return false if optional

    raise FedoraFetchError, "#{uri.path} not found (HTTP #{response.code})"
  else
    raise FedoraFetchError, "#{uri.path} could not be fetched (HTTP #{response.code} #{response.message})"
  end
end
48
+
49
+ private
50
+
51
# Pairtree path for the noid: its first four two-character pairs as nested
# segments, followed by the full noid.
def pairtree
  segments = [0, 2, 4, 6].map { |offset| @noid[offset, 2] }
  (segments << @noid).join('/')
end
54
+
55
# Base path configured for Fedora (options[:fedora][:base_path]).
def base_path
  fedora_options = PushmiPullyu.options[:fedora]
  fedora_options[:base_path]
end
58
+
59
+ end
@@ -0,0 +1,33 @@
1
+ require 'json'
2
+ require 'net/http'
3
+
4
+ class PushmiPullyu::AIP::SolrFetcher
5
+
6
+ class SolrFetchError < StandardError; end
7
+
8
# Remembers the identifier of the object whose permissions are looked up.
def initialize(noid)
  @noid = noid
end
11
+
12
# Runs the Solr query and returns the "id" of every document in the response
# (an empty array when there are none).
def fetch_permission_object_ids
  docs = JSON.parse(run_query_json)['response']['docs']
  docs.map { |doc| doc['id'] }
end
19
+
20
+ private
21
+
22
# Queries Solr for the ids of documents whose accessTo_ssim field matches the
# noid. Returns the raw JSON response body; raises SolrFetchError when the
# response is not a success.
def run_query_json
  query_uri = URI("#{PushmiPullyu.options[:solr][:url]}/select?q=accessTo_ssim:#{@noid}&fl=id&wt=json")
  solr_response = Net::HTTP.get_response(query_uri)
  raise SolrFetchError unless solr_response.is_a?(Net::HTTPSuccess)

  solr_response.body
end
32
+
33
+ end
@@ -0,0 +1,255 @@
1
+ require 'erb'
2
+ require 'fileutils'
3
+ require 'optparse'
4
+ require 'rollbar'
5
+ require 'singleton'
6
+ require 'yaml'
7
+
8
+ # CLI runner
9
+ class PushmiPullyu::CLI
10
+
11
+ include Singleton
12
+ include PushmiPullyu::Logging
13
+
14
+ COMMANDS = ['start', 'stop', 'restart', 'reload', 'run', 'zap', 'status'].freeze
15
+
16
# Puts the process-wide flags into their starting state.
def initialize
  # Flipped to false by the interrupt signal traps
  PushmiPullyu.server_running = true
  # Flipped to true by the SIGHUP trap
  PushmiPullyu.reset_logger = false
end
20
+
21
# Parses the command line, turns on daemon mode when the first remaining
# argument is one of COMMANDS, layers the options over the config file's
# values (command line wins), and stores the result as the global options.
def parse(argv = ARGV)
  cli_options = parse_options(argv)
  cli_options[:daemonize] = true if COMMANDS.include?(argv[0])

  config_file = cli_options[:config_file]
  merged = config_file ? parse_config(config_file).merge(cli_options) : cli_options

  PushmiPullyu.options = merged
end
28
+
29
# Configures Rollbar, then starts the server either as a daemon or in the
# foreground. Anything raised while starting or running is reported to
# Rollbar and re-raised.
def run
  configure_rollbar
  begin
    return start_server_as_daemon if options[:daemonize]

    # Foreground runs flush their output immediately.
    $stdout.sync = $stderr.sync = true
    start_server
  rescue Exception => e # rubocop:disable Lint/RescueException
    Rollbar.error(e)
    raise
  end
end
45
+
46
# Installs the signal traps, sets up logging, prints the banner and then
# enters the main processing loop.
def start_server
  setup_signal_traps
  setup_log
  print_banner
  run_tick_loop
end
54
+
55
+ private
56
+
57
# Hands the Rollbar token from the options to Rollbar, and disables Rollbar
# when no token is present.
def configure_rollbar
  Rollbar.configure do |rollbar_config|
    token_present = options[:rollbar_token].present?
    rollbar_config.enabled = false unless token_present
    rollbar_config.access_token = options[:rollbar_token]
  end
end
63
+
64
# Shortcut to the global option hash.
def options
  PushmiPullyu.options
end
67
+
68
# Loads the YAML config file (after running it through ERB) and returns its
# content with symbolized keys.
#
# Returns an empty hash when the file does not exist or holds no YAML
# document. The original called deep_symbolize_keys on the nil that an empty
# file produces, so its `|| opts` fallback was never reached.
#
# File.read is used rather than IO.read because IO.read treats a path that
# starts with "|" as a command to run.
def parse_config(config_file)
  return {} unless File.exist?(config_file)

  loaded = YAML.safe_load(ERB.new(File.read(config_file)).result)
  loaded ? loaded.deep_symbolize_keys : {}
end
76
+
77
# Parse the command line options.
#
# Option switches are removed from +argv+ in place; the remaining arguments
# are kept in @parsed_opts. When no config file was given, the first existing
# default config file is used. Returns the hash of parsed options.
def parse_options(argv)
  parsed = {}

  parser = OptionParser.new do |op|
    op.banner = 'Usage: pushmi_pullyu [options] [start|stop|restart|run]'
    op.separator ''
    op.separator 'Specific options:'

    op.on('-a', '--minimum-age AGE', Float,
          'Minimum amount of time an item must spend in the queue, in seconds.') do |age|
      parsed[:minimum_age] = age
    end

    op.on('-d', '--debug', 'Enable debug logging') { parsed[:debug] = true }

    op.on('-r', '--rollbar-token TOKEN', 'Enable error reporting to Rollbar') do |token|
      parsed[:rollbar_token] = token if token.present?
    end

    op.on('-C', '--config PATH', 'Path for YAML config file') do |path|
      parsed[:config_file] = path
    end

    op.on('-L', '--logdir PATH', 'Path for directory to store log files') do |path|
      parsed[:logdir] = path
    end

    op.on('-D', '--piddir PATH', 'Path for directory to store pid files') do |path|
      parsed[:piddir] = path
    end

    op.on('-W', '--workdir PATH', 'Path for directory where AIP creation work takes place in') do |path|
      parsed[:workdir] = path
    end

    op.on('-N', '--process_name NAME', 'Name of the application process') do |name|
      parsed[:process_name] = name
    end

    op.on('-m', '--monitor', 'Start monitor process for a deamon') { parsed[:monitor] = true }

    op.on('-q', '--queue NAME', 'Name of the queue to read from') do |name|
      parsed[:queue_name] = name
    end

    op.separator ''
    op.separator 'Common options:'

    op.on_tail('-v', '--version', 'Show version') do
      puts "PushmiPullyu version: #{PushmiPullyu::VERSION}"
      exit
    end

    op.on_tail('-h', '--help', 'Show this message') do
      puts op
      exit
    end
  end
  @parsed_opts = parser.parse!(argv)

  # Fall back to the first default config file that exists.
  default_config = ['config/pushmi_pullyu.yml', 'config/pushmi_pullyu.yml.erb'].find do |candidate|
    File.exist?(candidate)
  end
  parsed[:config_file] ||= default_config if default_config

  parsed
end
147
+
148
# Logs the application version and Ruby description, plus a hint on how to
# stop when running in the foreground.
def print_banner
  logger.info("Loading PushmiPullyu #{PushmiPullyu::VERSION}")
  logger.info("Running in #{RUBY_DESCRIPTION}")
  logger.info('Starting processing, hit Ctrl-C to stop') unless options[:daemonize]
end
153
+
154
# Reopens the log files, re-points the daemon's IO at the application log
# when daemonized, and clears the reset request.
def rotate_logs
  PushmiPullyu::Logging.reopen
  if options[:daemonize]
    Daemonize.redirect_io(PushmiPullyu.application_log_file)
  end
  PushmiPullyu.reset_logger = false
end
159
+
160
# Takes the next item off the queue, builds its AIP, deposits the tar file in
# Swift and logs the preservation event. Errors are reported to Rollbar and
# the logger, and are not re-raised.
def run_preservation_cycle
  noid = queue.wait_next_item

  # Attach the noid to any error Rollbar receives while this item is processed.
  Rollbar.scoped(noid: noid) do
    begin
      # The AIP is downloaded, bagged and tarred, and cleaned up once the block returns.
      PushmiPullyu::AIP.create(noid) do |tar_path|
        receipt = swift.deposit_file(tar_path, options[:swift][:container])
        PushmiPullyu::Logging.log_preservation_event(receipt)
      end
    rescue StandardError => error
      Rollbar.error(error)
      logger.error(error)
      # TODO: we could re-raise here and let the daemon die on any preservation error, or just log the issue and
      # move on to the next item.
    end
  end
end
181
+
182
# Main loop: processes one item per pass until the server is told to stop,
# rotating the logs after a pass whenever a reset was requested.
def run_tick_loop
  while PushmiPullyu.server_running?
    run_preservation_cycle
    next unless PushmiPullyu.reset_logger?

    rotate_logs
  end
end
188
+
189
# Daemons log to the application log file; foreground runs keep the current
# logger and switch it to the simple formatter. Debug level is applied when
# requested.
def setup_log
  if options[:daemonize]
    PushmiPullyu::Logging.initialize_logger(PushmiPullyu.application_log_file)
  else
    logger.formatter = PushmiPullyu::Logging::SimpleFormatter.new
  end

  return unless options[:debug]

  logger.level = ::Logger::DEBUG
end
197
+
198
# INT and TERM request a shutdown; HUP requests that the logger be reset.
def setup_signal_traps
  %w[INT TERM].each do |signal_name|
    Signal.trap(signal_name) { shutdown }
  end
  Signal.trap('HUP') { PushmiPullyu.reset_logger = true }
end
203
+
204
# Memoized preservation queue built from the redis, queue name and minimum
# age options.
def queue
  @queue ||= begin
    redis_options = options[:redis]
    connection = { host: redis_options[:host], port: redis_options[:port] }

    PushmiPullyu::PreservationQueue.new(connection: connection,
                                        queue_name: options[:queue_name],
                                        age_at_least: options[:minimum_age])
  end
end
212
+
213
# Memoized Swift depositer built from the swift section of the options.
def swift
  @swift ||= begin
    swift_options = options[:swift]

    PushmiPullyu::SwiftDepositer.new(username: swift_options[:username],
                                     password: swift_options[:password],
                                     tenant: swift_options[:tenant],
                                     endpoint: swift_options[:endpoint],
                                     auth_version: swift_options[:auth_version])
  end
end
220
+
221
# The first call asks the main run loop to stop, which lets the program exit
# on its own. A further call, made once the server is already marked as not
# running, forces the process to exit immediately with status 1.
def shutdown
  if PushmiPullyu.server_running?
    # Written to stderr rather than the logger: the logger uses a mutex
    # underneath, which is not allowed inside a trap context.
    $stderr.puts 'Exiting... Interrupt again to force quit.'
    PushmiPullyu.server_running = false
  else
    exit!(1)
  end
end
232
+
233
# Runs the server through Daemons.run_proc, passing it the arguments left
# over from option parsing together with the pid and log settings.
def start_server_as_daemon
  require 'daemons'

  # Daemonizing changes the working directory, so remember where we started.
  launch_directory = Dir.pwd

  daemon_options = {
    ARGV: @parsed_opts,
    dir: options[:piddir],
    dir_mode: :normal,
    monitor: options[:monitor],
    log_output: true,
    log_dir: File.expand_path(options[:logdir]),
    logfilename: File.basename(PushmiPullyu.application_log_file),
    output_logfilename: File.basename(PushmiPullyu.application_log_file)
  }

  Daemons.run_proc(options[:process_name], daemon_options) do |*_args|
    Dir.chdir(launch_directory)
    start_server
  end
end
254
+
255
+ end