pushmi_pullyu 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
+ require 'fileutils'
2
+
3
# Entry point for building an Archival Information Package (AIP) for one object.
module PushmiPullyu::AIP
  # Raised when the given noid is blank or contains a '/'.
  class NoidInvalid < StandardError; end

  # Downloads the object's data into a work directory, bags and tars it, then
  # yields the path of the resulting tar file to the caller's block.
  #
  # Both the tar file and the work directory are removed once the block has
  # finished. The cleanup runs from an +ensure+ clause so that a failure in the
  # download, the bagging/tarring or the caller's block does not leave
  # temporary files behind in the work directory.
  #
  # @param noid [String] identifier of the object; also used as the name of the
  #   directory (and tar file) created under options[:workdir]
  # @yieldparam aip_filename [String] path of the tar file that was created
  # @return [Object] whatever the block returns
  # @raise [NoidInvalid] if noid is blank or contains a '/'
  def create(noid)
    raise NoidInvalid if noid.blank? || noid.include?('/')

    aip_directory = "#{PushmiPullyu.options[:workdir]}/#{noid}"
    aip_filename = "#{aip_directory}.tar"

    begin
      PushmiPullyu::AIP::Downloader.new(noid, aip_directory).run
      PushmiPullyu::AIP::Creator.new(noid, aip_directory, aip_filename).run

      yield aip_filename
    ensure
      # rm_rf is a no-op for paths that do not exist, so no existence check is needed.
      FileUtils.rm_rf(aip_filename)
      FileUtils.rm_rf(aip_directory)
    end
  end

  module_function :create
end
@@ -0,0 +1,44 @@
1
+ require 'minitar'
2
+ require 'bagit'
3
+ require 'fileutils'
4
+
5
# Turns an AIP directory into a BagIt bag and packs that bag into a tar file.
# Assumption: the AIP has already been downloaded into the AIP directory.
class PushmiPullyu::AIP::Creator

  # Raised when the bag does not validate after its manifests were written.
  class BagInvalid < StandardError; end

  # @param noid [String] name of the AIP directory inside the work directory
  # @param aip_directory [String] path of the directory to bag
  # @param aip_filename [String] path of the tar file to write
  def initialize(noid, aip_directory, aip_filename)
    @noid, @aip_directory, @aip_filename = noid, aip_directory, aip_filename
  end

  # Bags the AIP directory, then tars it.
  def run
    bag_aip
    tar_bag
  end

  private

  # Writes the bag manifests and raises BagInvalid unless the bag validates.
  def bag_aip
    aip_bag = BagIt::Bag.new(@aip_directory)
    aip_bag.manifest!
    return if aip_bag.valid?

    raise BagInvalid
  end

  # Packs the noid directory into the tar file.
  #
  # The packing happens from inside the work directory so that the archive
  # holds just the noid directory rather than the whole path leading to it
  # (e.g. tmp/workdir/noid).
  def tar_bag
    # Resolve the tar path to an absolute one before changing directories,
    # otherwise a relative path would point somewhere else afterwards.
    absolute_tar_path = File.expand_path(@aip_filename)
    work_directory = PushmiPullyu.options[:workdir]

    Dir.chdir(work_directory) do
      tar_io = File.open(absolute_tar_path, 'wb')
      Minitar.pack(@noid, tar_io)
    end
  end

end
@@ -0,0 +1,177 @@
1
+ require 'fileutils'
2
+ require 'ostruct'
3
+ require 'rdf'
4
+ require 'rdf/n3'
5
+
6
+ # Download all of the metadata/datastreams and associated data
7
+ # related to an object
8
# Downloads the metadata, datastreams and permission objects related to one
# object into the AIP directory layout (see #aip_dirs).
class PushmiPullyu::AIP::Downloader

  # Raised when the main object metadata has no downloadFilename statement.
  class NoContentFilename < StandardError; end

  # @param noid [String] identifier of the object to download
  # @param aip_directory [String] directory the AIP is assembled in
  def initialize(noid, aip_directory)
    @noid = noid
    @aip_directory = aip_directory
  end

  # Recreates the AIP directories, downloads every item listed in #aip_paths,
  # then the content file itself and finally the permission objects.
  def run
    make_directories

    PushmiPullyu.logger.info("#{@noid}: Retreiving data from Fedora ...")

    # One fetcher serves every download for this noid; it only holds the noid.
    fetcher = PushmiPullyu::AIP::FedoraFetcher.new(@noid)

    [:main_object, :fixity, :content_datastream_metadata, :versions, :thumbnail,
     :characterization, :fedora3foxml, :fedora3foxml_metadata].each do |item|
      download_and_log(aip_paths[item], fetcher)
    end

    # The local name of the content file comes from the main object metadata,
    # so this has to happen after :main_object was downloaded.
    content_spec = OpenStruct.new(
      remote: '/content',
      local: content_filename, # lookup filename derived from metadata
      optional: false
    )
    download_and_log(content_spec, fetcher)

    download_permissions
  end

  private

  # Downloads one item described by path_spec (remote, local, optional) and
  # writes a "fetching" and a "saved"/"not_found" line to the AIP activity log.
  def download_and_log(path_spec, fedora_fetcher)
    output_file = path_spec.local

    log_fetching(fedora_fetcher.object_url(path_spec.remote), output_file)

    # Only the files stored as .n3 are RDF documents; everything else (content,
    # thumbnail, foxml) is fetched without the RDF Accept header.
    is_rdf = output_file.end_with?('.n3')

    is_success = fedora_fetcher.download_object(output_file,
                                                url_extra: path_spec.remote,
                                                optional: path_spec.optional,
                                                is_rdf: is_rdf)
    log_saved(is_success, output_file)
  end

  # Looks up the ids of the permission objects in Solr and downloads each one.
  def download_permissions
    PushmiPullyu.logger.info("#{@noid}: looking up permissions from Solr ...")
    results = PushmiPullyu::AIP::SolrFetcher.new(@noid).fetch_permission_object_ids
    if results.empty?
      PushmiPullyu.logger.info("#{@noid}: permissions not found")
    else
      results.each do |permission_id|
        PushmiPullyu.logger.info("#{@noid}: permission object #{permission_id} found")
        download_permission(permission_id)
      end
    end
  end

  # Downloads a single permission object into the metadata directory.
  def download_permission(permission_id)
    path_spec = OpenStruct.new(
      remote: nil,
      local: "#{aip_dirs.metadata}/permission_#{permission_id}.n3",
      optional: false
    )
    download_and_log(path_spec, PushmiPullyu::AIP::FedoraFetcher.new(permission_id))
  end

  ### Logging

  def log_fetching(url, output_file)
    message = "#{@noid}: #{output_file} -- fetching from #{url} ..."
    PushmiPullyu::Logging.log_aip_activity(@aip_directory, message)
  end

  def log_saved(is_success, output_file)
    message = "#{@noid}: #{output_file} -- #{is_success ? 'saved' : 'not_found'}"
    PushmiPullyu::Logging.log_aip_activity(@aip_directory, message)
  end

  ### Directories

  # Directories that make up the AIP, all located below the AIP directory.
  def aip_dirs
    @aip_dirs ||= OpenStruct.new(
      objects: "#{@aip_directory}/data/objects",
      metadata: "#{@aip_directory}/data/objects/metadata",
      logs: "#{@aip_directory}/data/logs",
      thumbnails: "#{@aip_directory}/data/thumbnails"
    )
  end

  # Removes any existing AIP directory, then creates every directory from #aip_dirs.
  def make_directories
    clean_directories
    PushmiPullyu.logger.debug("#{@noid}: Creating directories ...")
    aip_dirs.to_h.each_value do |path|
      FileUtils.mkdir_p(path)
    end
    PushmiPullyu.logger.debug("#{@noid}: Creating directories done")
  end

  # Deletes the AIP directory if it exists.
  def clean_directories
    return unless File.exist?(@aip_directory)
    PushmiPullyu.logger.debug("#{@noid}: Nuking directories ...")
    FileUtils.rm_rf(@aip_directory)
  end

  ### Files

  # Download specifications keyed by item name. Each one has the path appended
  # to the object URL (remote), the file it is saved to (local) and whether a
  # 404 response is tolerated (optional).
  def aip_paths
    @aip_paths ||= OpenStruct.new(
      main_object: OpenStruct.new(
        remote: nil, # Base path
        local: "#{aip_dirs.metadata}/object_metadata.n3",
        optional: false
      ),
      fixity: OpenStruct.new(
        remote: '/content/fcr:fixity',
        local: "#{aip_dirs.logs}/content_fixity_report.n3",
        optional: false
      ),
      content_datastream_metadata: OpenStruct.new(
        remote: '/content/fcr:metadata',
        local: "#{aip_dirs.metadata}/content_fcr_metadata.n3",
        optional: false
      ),
      versions: OpenStruct.new(
        remote: '/content/fcr:versions',
        local: "#{aip_dirs.metadata}/content_versions.n3",
        optional: false
      ),

      # Optional downloads
      thumbnail: OpenStruct.new(
        remote: '/thumbnail',
        local: "#{aip_dirs.thumbnails}/thumbnail",
        optional: true
      ),
      characterization: OpenStruct.new(
        remote: '/characterization',
        local: "#{aip_dirs.logs}/content_characterization.n3",
        optional: true
      ),
      fedora3foxml: OpenStruct.new(
        remote: '/fedora3foxml',
        local: "#{aip_dirs.metadata}/fedora3foxml.xml",
        optional: true
      ),
      fedora3foxml_metadata: OpenStruct.new(
        remote: '/fedora3foxml/fcr:metadata',
        local: "#{aip_dirs.metadata}/fedora3foxml.n3",
        optional: true
      )
    ).freeze
  end

  # Extracts the download filename from the main object metadata and returns
  # the local path the content file is saved to.
  #
  # @raise [NoContentFilename] if the metadata has no downloadFilename statement
  def content_filename
    filename_predicate = RDF::URI('info:fedora/fedora-system:def/model#downloadFilename')

    graph = RDF::Graph.load(aip_paths.main_object.local)

    graph.query(predicate: filename_predicate) do |results|
      # The name comes from repository metadata; keep only its last path
      # component so the file cannot be written outside the objects directory.
      return "#{aip_dirs.objects}/#{File.basename(results.object.to_s)}"
    end

    raise NoContentFilename
  end

end
@@ -0,0 +1,59 @@
1
+ require 'net/http'
2
+
3
# Fetches the resources of a single object from Fedora over HTTP.
class PushmiPullyu::AIP::FedoraFetcher

  # Raised when Fedora answers with anything but a success (or a tolerated 404).
  class FedoraFetchError < StandardError; end

  RDF_FORMAT = 'text/rdf+n3'.freeze

  # @param noid [String] identifier of the object to fetch
  def initialize(noid)
    @noid = noid
  end

  # Builds the URL of the object from the configured Fedora url and base path
  # plus the pairtree path of the noid.
  #
  # @param url_extra [String, nil] path appended to the object URL
  # @return [String]
  def object_url(url_extra = nil)
    url = "#{PushmiPullyu.options[:fedora][:url]}#{base_path}/#{pairtree}"
    url += url_extra if url_extra
    url
  end

  # Downloads the resource and writes the response body to download_path.
  #
  # @param download_path [String] file the response body is written to
  # @param url_extra [String, nil] path appended to the object URL
  # @param optional [Boolean] when true a 404 response returns false instead of raising
  # @param is_rdf [Boolean] when true the resource is requested as RDF_FORMAT
  # @return [Boolean] true when the resource was saved, false for a tolerated 404
  # @raise [FedoraFetchError] on a 404 that is not optional and on any other
  #   non-success response
  def download_object(download_path, url_extra: nil,
                      optional: false, is_rdf: false)

    uri = URI(object_url(url_extra))

    request = Net::HTTP::Get.new(uri)
    request.basic_auth(PushmiPullyu.options[:fedora][:user],
                       PushmiPullyu.options[:fedora][:password])

    request['Accept'] = RDF_FORMAT if is_rdf

    # Net::HTTP.start talks plain HTTP unless TLS is requested explicitly, so
    # it has to be switched on for https URLs.
    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
      http.request(request)
    end

    case response
    when Net::HTTPSuccess
      # Block form closes the file even when the write fails.
      File.open(download_path, 'wb') { |file| file.write(response.body) }
      true
    when Net::HTTPNotFound
      raise FedoraFetchError, "#{uri} not found (#{response.code})" unless optional
      false
    else
      raise FedoraFetchError, "#{uri} returned #{response.code}"
    end
  end

  private

  # Pairtree path of the noid: its first four character pairs as directories,
  # followed by the noid itself.
  def pairtree
    "#{@noid[0..1]}/#{@noid[2..3]}/#{@noid[4..5]}/#{@noid[6..7]}/#{@noid}"
  end

  def base_path
    PushmiPullyu.options[:fedora][:base_path]
  end

end
@@ -0,0 +1,33 @@
1
+ require 'json'
2
+ require 'net/http'
3
+
4
# Looks up, in Solr, the ids of the permission objects that grant access to an object.
class PushmiPullyu::AIP::SolrFetcher

  # Raised when the Solr query does not return a success response.
  class SolrFetchError < StandardError; end

  # @param noid [String] identifier of the object whose permissions are looked up
  def initialize(noid)
    @noid = noid
  end

  # @return [Array] the 'id' of every document Solr returned; empty when there
  #   were no hits
  # @raise [SolrFetchError] if the query fails
  def fetch_permission_object_ids
    hash = JSON.parse(run_query_json)

    hash['response']['docs'].map { |hit| hit['id'] }
  end

  private

  # Return fetched results, else raise an error
  def run_query_json
    # Escape the noid so characters that are special in a URL cannot change the query string.
    escaped_noid = URI.encode_www_form_component(@noid.to_s)

    response = Net::HTTP.get_response(
      URI("#{PushmiPullyu.options[:solr][:url]}/select?q=accessTo_ssim:#{escaped_noid}&fl=id&wt=json")
    )

    return response.body if response.is_a?(Net::HTTPSuccess)

    raise SolrFetchError, "Solr query for #{@noid} returned #{response.code}"
  end

end
@@ -0,0 +1,255 @@
1
+ require 'erb'
2
+ require 'fileutils'
3
+ require 'optparse'
4
+ require 'rollbar'
5
+ require 'singleton'
6
+ require 'yaml'
7
+
8
+ # CLI runner
9
# CLI runner: parses the command line and config file, then runs the
# preservation loop either in the foreground or as a daemon.
class PushmiPullyu::CLI

  include Singleton
  include PushmiPullyu::Logging

  COMMANDS = ['start', 'stop', 'restart', 'reload', 'run', 'zap', 'status'].freeze

  def initialize
    PushmiPullyu.server_running = true # set to false by interrupt signal trap
    PushmiPullyu.reset_logger = false # set to true by SIGHUP trap
  end

  # Parses the command line (and the config file, if there is one) and stores
  # the result in PushmiPullyu.options. Command line values take precedence
  # over values from the config file.
  #
  # @param argv [Array<String>] arguments to parse; switches are removed from it
  def parse(argv = ARGV)
    opts = parse_options(argv)
    opts[:daemonize] = true if COMMANDS.include? argv[0]
    opts = parse_config(opts[:config_file]).merge(opts) if opts[:config_file]

    PushmiPullyu.options = opts
  end

  # Starts the server, daemonized or in the foreground depending on the
  # options. Any exception is reported to Rollbar and then re-raised.
  def run
    configure_rollbar
    begin
      if options[:daemonize]
        start_server_as_daemon
      else
        # If we're running in the foreground sync the output.
        $stdout.sync = $stderr.sync = true
        start_server
      end
    # rubocop:disable Lint/RescueException
    rescue Exception => e
      Rollbar.error(e)
      raise
    end
    # rubocop:enable Lint/RescueException
  end

  # Installs the signal traps, sets up logging and enters the main loop.
  def start_server
    setup_signal_traps

    setup_log
    print_banner

    run_tick_loop
  end

  private

  # Rollbar is only enabled when a token was configured.
  def configure_rollbar
    Rollbar.configure do |config|
      config.enabled = false unless options[:rollbar_token].present?
      config.access_token = options[:rollbar_token]
    end
  end

  def options
    PushmiPullyu.options
  end

  # Reads the YAML config file (run through ERB first) and returns its content
  # with symbolized keys.
  #
  # @param config_file [String] path of the config file
  # @return [Hash] the configuration; empty when the file does not exist or
  #   contains no YAML document
  def parse_config(config_file)
    return {} unless File.exist?(config_file)

    loaded = YAML.safe_load(ERB.new(File.read(config_file)).result)

    # An empty file parses to a falsy value, so the fallback has to be applied
    # before the keys are symbolized.
    (loaded || {}).deep_symbolize_keys
  end

  # Parse the options.
  #
  # @param argv [Array<String>] arguments to parse; switches are removed from it
  # @return [Hash] the options given on the command line, plus :config_file
  #   when one of the default config files exists
  def parse_options(argv)
    opts = {}

    @parsed_opts = OptionParser.new do |o|
      o.banner = 'Usage: pushmi_pullyu [options] [start|stop|restart|run]'
      o.separator ''
      o.separator 'Specific options:'

      o.on('-a', '--minimum-age AGE',
           Float, 'Minimum amount of time an item must spend in the queue, in seconds.') do |minimum_age|
        opts[:minimum_age] = minimum_age
      end

      o.on('-d', '--debug', 'Enable debug logging') do
        opts[:debug] = true
      end

      o.on('-r', '--rollbar-token TOKEN', 'Enable error reporting to Rollbar') do |token|
        opts[:rollbar_token] = token if token.present?
      end

      o.on '-C', '--config PATH', 'Path for YAML config file' do |config_file|
        opts[:config_file] = config_file
      end

      o.on('-L', '--logdir PATH', 'Path for directory to store log files') do |logdir|
        opts[:logdir] = logdir
      end

      o.on('-D', '--piddir PATH', 'Path for directory to store pid files') do |piddir|
        opts[:piddir] = piddir
      end

      o.on('-W', '--workdir PATH', 'Path for directory where AIP creation work takes place in') do |workdir|
        opts[:workdir] = workdir
      end

      o.on('-N', '--process_name NAME', 'Name of the application process') do |process_name|
        opts[:process_name] = process_name
      end

      o.on('-m', '--monitor', 'Start monitor process for a deamon') do
        opts[:monitor] = true
      end

      o.on('-q', '--queue NAME', 'Name of the queue to read from') do |queue|
        opts[:queue_name] = queue
      end

      o.separator ''
      o.separator 'Common options:'

      o.on_tail('-v', '--version', 'Show version') do
        puts "PushmiPullyu version: #{PushmiPullyu::VERSION}"
        exit
      end

      o.on_tail('-h', '--help', 'Show this message') do
        puts o
        exit
      end
    end.parse!(argv)

    # Fall back to the first default config file that exists when none was given.
    ['config/pushmi_pullyu.yml', 'config/pushmi_pullyu.yml.erb'].each do |filename|
      opts[:config_file] ||= filename if File.exist?(filename)
    end

    opts
  end

  def print_banner
    logger.info "Loading PushmiPullyu #{PushmiPullyu::VERSION}"
    logger.info "Running in #{RUBY_DESCRIPTION}"
    logger.info 'Starting processing, hit Ctrl-C to stop' unless options[:daemonize]
  end

  # Reopens the log files and clears the flag that was set by the HUP trap.
  def rotate_logs
    PushmiPullyu::Logging.reopen
    Daemonize.redirect_io(PushmiPullyu.application_log_file) if options[:daemonize]
    PushmiPullyu.reset_logger = false
  end

  # Takes the next item from the queue, builds its AIP and deposits it in
  # Swift. Errors are reported and logged, but not re-raised.
  def run_preservation_cycle
    item = queue.wait_next_item

    # add additional information about the error context to errors that occur while processing this item.
    Rollbar.scoped(noid: item) do
      begin
        # Download AIP from Fedora, bag and tar AIP directory and cleanup after block code
        PushmiPullyu::AIP.create(item) do |aip_filename|
          # Push tarred AIP to swift API
          deposited_file = swift.deposit_file(aip_filename, options[:swift][:container])
          # Log successful preservation event to the log files
          PushmiPullyu::Logging.log_preservation_event(deposited_file)
        end
      rescue => e
        Rollbar.error(e)
        logger.error(e)
        # TODO: we could re-raise here and let the daemon die on any preservation error, or just log the issue and
        # move on to the next item.
      end
    end
  end

  # Main loop: runs until a signal trap clears the server_running flag.
  def run_tick_loop
    while PushmiPullyu.server_running?
      run_preservation_cycle
      rotate_logs if PushmiPullyu.reset_logger?
    end
  end

  # Logs to the application log file when daemonized; otherwise only switches
  # the formatter of the current logger.
  def setup_log
    if options[:daemonize]
      PushmiPullyu::Logging.initialize_logger(PushmiPullyu.application_log_file)
    else
      logger.formatter = PushmiPullyu::Logging::SimpleFormatter.new
    end
    logger.level = ::Logger::DEBUG if options[:debug]
  end

  def setup_signal_traps
    Signal.trap('INT') { shutdown }
    Signal.trap('TERM') { shutdown }
    Signal.trap('HUP') { PushmiPullyu.reset_logger = true }
  end

  # Preservation queue built from the redis and queue options; created on first use.
  def queue
    @queue ||= PushmiPullyu::PreservationQueue.new(connection: {
                                                     host: options[:redis][:host],
                                                     port: options[:redis][:port]
                                                   },
                                                   queue_name: options[:queue_name],
                                                   age_at_least: options[:minimum_age])
  end

  # Swift depositer built from the swift options; created on first use.
  def swift
    @swift ||= PushmiPullyu::SwiftDepositer.new(username: options[:swift][:username],
                                                password: options[:swift][:password],
                                                tenant: options[:swift][:tenant],
                                                endpoint: options[:swift][:endpoint],
                                                auth_version: options[:swift][:auth_version])
  end

  # On first call of shutdown, this will gracefully close the main run loop
  # which lets the program exit itself. Calling shutdown again will force shutdown the program
  def shutdown
    if !PushmiPullyu.server_running?
      exit!(1)
    else
      # using stderr instead of logger as it uses an underlying mutex which is not allowed inside trap contexts.
      $stderr.puts 'Exiting... Interrupt again to force quit.'
      PushmiPullyu.server_running = false
    end
  end

  # Runs the server through the daemons gem, using the pid and log directories
  # from the options.
  def start_server_as_daemon
    require 'daemons'

    pwd = Dir.pwd # Current directory is changed during daemonization, so store it

    opts = {
      ARGV: @parsed_opts,
      dir: options[:piddir],
      dir_mode: :normal,
      monitor: options[:monitor],
      log_output: true,
      log_dir: File.expand_path(options[:logdir]),
      logfilename: File.basename(PushmiPullyu.application_log_file),
      output_logfilename: File.basename(PushmiPullyu.application_log_file)
    }

    Daemons.run_proc(options[:process_name], opts) do |*_argv|
      Dir.chdir(pwd)
      start_server
    end
  end

end