assembly-utils 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rvmrc.example +1 -0
- data/Gemfile +4 -0
- data/README.rdoc +51 -0
- data/Rakefile +8 -0
- data/assembly-utils.gemspec +36 -0
- data/bin/console +9 -0
- data/bin/run_all_tests +3 -0
- data/config/boot.rb +13 -0
- data/config/connect_to_dor.rb +6 -0
- data/lib/assembly-utils/utils.rb +817 -0
- data/lib/assembly-utils/version.rb +8 -0
- data/lib/assembly-utils.rb +37 -0
- data/spec/spec_helper.rb +31 -0
- data/spec/test_data/druid_dd999dd9999.xml +98 -0
- data/spec/test_data/local_dev_revs.yaml +57 -0
- data/spec/test_data/test_log.yaml +15 -0
- data/spec/utils_spec.rb +118 -0
- metadata +238 -0
@@ -0,0 +1,817 @@
|
|
1
|
+
require 'net/ssh'
|
2
|
+
require 'csv'
|
3
|
+
require 'csv-mapper'
|
4
|
+
require 'druid-tools'
|
5
|
+
|
6
|
+
begin
|
7
|
+
require 'net/ssh/kerberos'
|
8
|
+
rescue LoadError
|
9
|
+
end
|
10
|
+
|
11
|
+
module Assembly
|
12
|
+
|
13
|
+
|
14
|
+
# The Utils class contains methods to help with accessioning and assembly
|
15
|
+
class Utils
|
16
|
+
|
17
|
+
WFS = Dor::WorkflowService
|
18
|
+
REPO = 'dor'
|
19
|
+
|
20
|
+
# Get the staging directory tree given a druid, and optionally prepend a basepath.
|
21
|
+
# Deprecated and should not be needed anymore.
|
22
|
+
#
|
23
|
+
# @param [String] pid druid pid (e.g. 'aa000aa0001')
|
24
|
+
# @param [String] base_path optional base path to prepend to druid path
|
25
|
+
#
|
26
|
+
# @return [string] path to material that is being staged, with optional prepended base path
|
27
|
+
#
|
28
|
+
# Example:
|
29
|
+
# puts Assembly::Utils.get_staging_path('aa000aa0001','tmp')
|
30
|
+
# > "tmp/aa/000/aa/0001"
|
31
|
+
def self.get_staging_path(pid, base_path = nil)
  # Build the druid object (optionally rooted at base_path) and return the
  # directory that contains its final path component.
  File.dirname(DruidTools::Druid.new(pid, base_path).path)
end
|
36
|
+
|
37
|
+
# Insert the specified workflow into the specified object.
|
38
|
+
#
|
39
|
+
# @param [String] pid druid pid (e.g. 'aa000aa0001')
|
40
|
+
# @param [String] workflow name (e.g. 'accessionWF')
|
41
|
+
# @param [String] repository name (e.g. 'dor') -- optional, defaults to dor
|
42
|
+
#
|
43
|
+
# @return [boolean] indicates success of web service call
|
44
|
+
#
|
45
|
+
# Example:
|
46
|
+
# puts Assembly::Utils.insert_workflow('druid:aa000aa0001','accessionWF')
|
47
|
+
# > true
|
48
|
+
def self.insert_workflow(pid, workflow, repo = 'dor')
  # NOTE: `repo` is accepted by the signature but is not used when building the URL.
  endpoint = "#{Dor::Config.dor.service_root}/objects/#{pid}/apo_workflows/#{workflow}"
  response = RestClient.post(endpoint, {})
  success_codes = [200, 201, 202, 204]
  # Evaluates to the response object (truthy) on a success code, false otherwise.
  success_codes.include?(response.code) && response
end
|
53
|
+
|
54
|
+
# Claim a specific druid as already used to be sure it won't get used again.
|
55
|
+
# Not needed for normal purposes, only if you manually register something in Fedora Admin outside of DOR services gem.
|
56
|
+
#
|
57
|
+
# @param [String] pid druid pid (e.g. 'aa000aa0001')
|
58
|
+
#
|
59
|
+
# @return [boolean] indicates success of web service call
|
60
|
+
#
|
61
|
+
# Example:
|
62
|
+
# puts Assembly::Utils.claim_druid('aa000aa0001')
|
63
|
+
# > true
|
64
|
+
def self.claim_druid(pid)
  suri = Dor::Config.suri
  namespace_url = "#{suri.url}/suri2/namespaces/#{suri.id_namespace}"
  resource = RestClient::Resource.new(namespace_url, :user => suri.user, :password => suri.pass)
  response = resource["identifiers/#{pid}"].put('')
  # The HTTP status is numeric; the previous comparison against the String
  # "204" could never be true. to_i keeps this working for either form.
  response.code.to_i == 204
end
|
71
|
+
|
72
|
+
# Force a full re-index of the supplied druid in solr and fedora.
|
73
|
+
#
|
74
|
+
# @param [String] druid druid (e.g. 'druid:aa000aa0001')
|
75
|
+
#
|
76
|
+
# Example:
|
77
|
+
# puts Assembly::Utils.reindex('druid:aa000aa0001')
|
78
|
+
def self.reindex(druid)
  obj = Dor.load_instance druid
  # Only build and push the solr document when an object was actually loaded;
  # previously to_solr was invoked before the nil check.
  unless obj.nil?
    Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
  end
  # The parameter is named `druid`; the former reference to `pid` was an
  # undefined local and raised NameError.
  Dor.find(druid).update_index
end
|
84
|
+
|
85
|
+
# Export one or more objects given a single or array of pids, with output to the specified directory as FOXML files
|
86
|
+
#
|
87
|
+
# @param [Array] pids - an array of pids to export (can also pass a single pid as a string)
|
88
|
+
# @param [String] output_dir - the full path to output the foxml files
|
89
|
+
#
|
90
|
+
# Example:
|
91
|
+
# Assembly::Utils.export_objects(['druid:aa000aa0001','druid:bb000bb0001'],'/tmp')
|
92
|
+
def self.export_objects(pids, output_dir)
  # Array() wraps a single pid String, leaves an Array untouched and turns
  # nil into an empty list, so no explicit class check is needed.
  Array(pids).each { |pid| ActiveFedora::FixtureExporter.export_to_path(pid, output_dir) }
end
|
96
|
+
|
97
|
+
# Import all of the FOXML files in the specified directory into Fedora
|
98
|
+
#
|
99
|
+
# @param [String] source_dir - the full path to import the foxml files
|
100
|
+
#
|
101
|
+
# Example:
|
102
|
+
# Assembly::Utils.import_objects('/tmp')
|
103
|
+
def self.import_objects(source_dir)
  # Resolve the directory once, before changing into it, so a relative
  # source_dir still yields valid file paths below.
  absolute_dir = File.expand_path(source_dir)
  # Block form of chdir restores the caller's working directory afterwards
  # instead of leaving the whole process inside source_dir.
  files = Dir.chdir(absolute_dir) { Dir.glob('*.foxml.xml') }
  files.each do |file|
    pid = ActiveFedora::FixtureLoader.import_to_fedora(File.join(absolute_dir, file))
    ActiveFedora::FixtureLoader.index(pid)
  end
end
|
111
|
+
|
112
|
+
# Get a list of druids that match the given array of source IDs.
|
113
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
114
|
+
#
|
115
|
+
# @param [Array<String>] source_ids array of source ids to look up
|
116
|
+
#
|
117
|
+
# @return [array] druids
|
118
|
+
# Example:
|
119
|
+
#
|
120
|
+
# puts Assembly::Utils.get_druids_by_sourceid(['revs-01','revs-02'])
|
121
|
+
# > ['druid:aa000aa0001','druid:aa000aa0002']
|
122
|
+
def self.get_druids_by_sourceid(source_ids)
  # One lookup per source id; the per-id results are merged into a single flat list.
  matches = source_ids.map { |source_id| Dor::SearchService.query_by_id(source_id) }
  matches.flatten
end
|
127
|
+
|
128
|
+
# Show the workflow status of specific steps in assembly and/or accession workflows for the provided druids.
|
129
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
130
|
+
#
|
131
|
+
# @param [Hash] params parameters specified as a hash, using symbols for options:
|
132
|
+
# * :druids => array of druids to get workflow status for
|
133
|
+
# * :workflows => an optional array of workflow names as symbols, options are :assembly and :accession; defaults to :assembly
|
134
|
+
# * :filename => optional filename if you want to send output to a CSV
|
135
|
+
#
|
136
|
+
# @return [string] comma delimited output or CSV file
|
137
|
+
#
|
138
|
+
# Example:
|
139
|
+
# Assembly::Utils.workflow_status(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:workflows=>[:assembly,:accession],:filename=>'output.csv')
|
140
|
+
def self.workflow_status(params={})
  druids    = params[:druids] || []
  workflows = params[:workflows] || [:assembly]
  filename  = params[:filename] || ''
  write_csv = filename != ''

  accession_steps = %w(content-metadata descriptive-metadata rights-metadata remediate-object shelve publish)
  assembly_steps  = %w(jp2-create checksum-compute exif-collect accessioning-initiate)

  # [workflow name, steps] pairs in report column order.
  selected = []
  selected << ['assemblyWF', assembly_steps] if workflows.include?(:assembly)
  selected << ['accessionWF', accession_steps] if workflows.include?(:accession)

  puts "Generating report"

  csv = CSV.open(filename, "w") if write_csv
  begin
    header = ["druid"] + selected.flat_map { |_workflow, steps| steps }
    csv << header if csv
    puts header.join(',')

    druids.each do |druid|
      row = [druid]
      selected.each do |workflow, steps|
        steps.each { |step| row << self.get_workflow_status(druid, workflow, step) }
      end
      csv << row if csv
      puts row.join(',')
    end
  ensure
    # Close (and flush) the report even when a status lookup raises, so the
    # file handle is not leaked.
    csv.close if csv
  end

  puts "Report generated in #{filename}" if write_csv
  nil
end
|
173
|
+
|
174
|
+
# Show the workflow status of a specific step in a specific workflow for the provided druid.
|
175
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
176
|
+
#
|
177
|
+
# @param [string] druid a druid string
|
178
|
+
# @param [string] workflow name of workflow
|
179
|
+
# @param [string] step name of step
|
180
|
+
#
|
181
|
+
# @return [string] workflow step status, returns nil if no workflow found
|
182
|
+
#
|
183
|
+
# Example:
|
184
|
+
# puts Assembly::Utils.get_workflow_status('druid:aa000aa0001','assemblyWF','jp2-create')
|
185
|
+
# > "completed"
|
186
|
+
def self.get_workflow_status(druid, workflow, step)
  # Delegates to the workflow service, always against the 'dor' repository.
  lookup = ['dor', druid, workflow, step]
  Dor::WorkflowService.get_workflow_status(*lookup)
end
|
189
|
+
|
190
|
+
# Cleanup a list of objects and associated files given a list of druids. WARNING: VERY DESTRUCTIVE.
|
191
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
192
|
+
#
|
193
|
+
# @param [Hash] params parameters specified as a hash, using symbols for options:
|
194
|
+
# * :druids => array of druids to cleanup
|
195
|
+
# * :steps => an array of steps, specified as symbols, indicating steps to be run, options are:
|
196
|
+
# :stacks=This will remove all files from the stacks that were shelved for the objects
|
197
|
+
# :dor=This will delete objects from Fedora
|
198
|
+
# :stage=This will delete the staged content in the assembly workspace
|
199
|
+
# :symlinks=This will remove the symlink from the dor workspace
|
200
|
+
# :workflows=This will remove the assemblyWF and accessionWF workflows for this object
|
201
|
+
# * :dry_run => do not actually clean up (defaults to false)
|
202
|
+
#
|
203
|
+
# Example:
|
204
|
+
# Assembly::Utils.cleanup(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:steps=>[:stacks,:dor,:stage,:symlinks,:workflows])
|
205
|
+
def self.cleanup(params={})
  druids  = params[:druids] || []
  steps   = params[:steps] || []
  dry_run = params[:dry_run] || false

  allowed_steps = {
    :stacks    => 'This will remove all files from the stacks that were shelved for the objects',
    :dor       => 'This will delete objects from Fedora',
    :stage     => "This will delete the staged content in #{Assembly::ASSEMBLY_WORKSPACE}",
    :symlinks  => "This will remove the symlink from #{Assembly::DOR_WORKSPACE}",
    :workflows => "This will remove the accessionWF and assemblyWF workflows"
  }

  # Validate the request before asking the operator anything, so an unusable
  # call fails immediately instead of after a series of confirmations.
  valid_steps = steps.select { |step| allowed_steps.key?(step) }
  raise "no valid steps specified for cleanup" if valid_steps.empty?
  raise "no druids provided" if druids.size == 0

  puts 'THIS IS A DRY RUN' if dry_run

  Assembly::Utils.confirm "Run on '#{ENV['ROBOT_ENVIRONMENT']}'? Any response other than 'y' or 'yes' will stop the cleanup now."
  Assembly::Utils.confirm "Are you really sure you want to run on production? CLEANUP IS NOT REVERSIBLE" if ENV['ROBOT_ENVIRONMENT'] == 'production'

  # One confirmation per recognised step; unrecognised steps are never prompted for.
  valid_steps.each do |step|
    Assembly::Utils.confirm "Run step '#{step}'? #{allowed_steps[step]}. Any response other than 'y' or 'yes' will stop the cleanup now."
  end

  # The step list is handed over unchanged, as before.
  druids.each { |pid| Assembly::Utils.cleanup_object(pid, steps, dry_run) }
end
|
237
|
+
|
238
|
+
# Cleanup a single object and associated files given a druid. WARNING: VERY DESTRUCTIVE.
|
239
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
240
|
+
#
|
241
|
+
# @param [string] pid a druid
|
242
|
+
# @param [array] steps an array of steps, options below
|
243
|
+
# :stacks=This will remove all files from the stacks that were shelved for the objects
|
244
|
+
# :dor=This will delete objects from Fedora
|
245
|
+
# :stage=This will delete the staged content in the assembly workspace
|
246
|
+
# :symlinks=This will remove the symlink from the dor workspace
|
247
|
+
# :workflows=This will remove the assemblyWF and accessionWF workflows for this object
|
248
|
+
# @param [boolean] dry_run do not actually clean up (defaults to false)
|
249
|
+
#
|
250
|
+
# Example:
|
251
|
+
# Assembly::Utils.cleanup_object('druid:aa000aa0001',[:stacks,:dor,:stage,:symlinks,:workflows])
|
252
|
+
def self.cleanup_object(pid,steps,dry_run=false)
  ssh_session = nil
  begin
    # Open the SSH session up front so that a connection failure aborts the
    # cleanup before anything has been deleted. The previous guard,
    # defined?(stacks_server), was always false, so no session was ever opened
    # and the :stacks step failed on the undefined name. A dry run executes no
    # remote command, so it does not need a connection.
    if steps.include?(:stacks)
      stacks_server = Dor::Config.stacks.host
      unless dry_run
        ssh_session = Net::SSH.start(stacks_server, Dor::Config.stacks.user,
                                     :auth_methods => %w(gssapi-with-mic publickey hostbased password keyboard-interactive))
      end
    end

    druid_tree = DruidTools::Druid.new(pid).tree
    puts "Cleaning up #{pid}"

    if steps.include?(:dor)
      puts "-- deleting #{pid} from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      Assembly::Utils.unregister(pid) unless dry_run
    end

    if steps.include?(:symlinks)
      path_to_symlinks = [
        File.join(Assembly::DOR_WORKSPACE, druid_tree),
        Assembly::Utils.get_staging_path(pid, Assembly::DOR_WORKSPACE)
      ]
      path_to_symlinks.each do |path|
        if File::directory?(path)
          puts "-- deleting folder #{path} (WARNING: should have been a symlink)"
          FileUtils::rm_rf path unless dry_run
        elsif File.symlink?(path)
          puts "-- deleting symlink #{path}"
          File.delete(path) unless dry_run
        else
          puts "-- Skipping #{path}: not a folder or symlink"
        end
      end
    end

    if steps.include?(:stage)
      path_to_content = Assembly::Utils.get_staging_path(pid, Assembly::ASSEMBLY_WORKSPACE)
      puts "-- deleting folder #{path_to_content}"
      # File.exist? replaces File.exists?, which newer Rubies no longer provide.
      FileUtils.rm_rf path_to_content if !dry_run && File.exist?(path_to_content)
    end

    if steps.include?(:stacks)
      path_to_content = Dor::DigitalStacksService.stacks_storage_dir(pid)
      puts "-- removing files from the stacks on #{stacks_server} at #{path_to_content}"
      ssh_session.exec!("rm -fr #{path_to_content}") unless dry_run
    end

    if steps.include?(:workflows)
      puts "-- deleting #{pid} accessionWF and assemblyWF workflows from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      unless dry_run
        Dor::WorkflowService.delete_workflow('dor', pid, 'accessionWF')
        Dor::WorkflowService.delete_workflow('dor', pid, 'assemblyWF')
      end
    end
  rescue StandardError => e
    # StandardError only: an operator's Ctrl-C or a process exit must not be
    # swallowed in the middle of a destructive run.
    puts "** cleaning up failed for #{pid} with #{e.message}"
  ensure
    ssh_session.close if ssh_session
  end
  nil
end
|
301
|
+
|
302
|
+
# Delete an object from DOR.
|
303
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
304
|
+
#
|
305
|
+
# @param [string] pid the druid
|
306
|
+
#
|
307
|
+
# Example:
|
308
|
+
# Assembly::Utils.delete_from_dor('druid:aa000aa0001')
|
309
|
+
def self.delete_from_dor(pid)
  # Remove the object through the Fedora client, then drop its solr record and commit.
  fedora_object = Dor::Config.fedora.client["objects/#{pid}"]
  fedora_object.delete
  Dor::SearchService.solr.delete_by_id(pid)
  Dor::SearchService.solr.commit
end
|
316
|
+
|
317
|
+
# Quickly update rights metadata for any existing list of objects using default rights metadata pulled from the supplied APO
|
318
|
+
#
|
319
|
+
# @param [array] druids - an array of druids
|
320
|
+
# @param [string] apo_druid - the druid of the APO to pull rights metadata from
|
321
|
+
# @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
|
322
|
+
#
|
323
|
+
# Example:
|
324
|
+
# druids=%w{druid:aa111aa1111 druid:bb222bb2222}
|
325
|
+
# apo_druid='druid:cc222cc2222'
|
326
|
+
# Assembly::Utils.update_rights_metadata(druids,apo_druid)
|
327
|
+
def self.update_rights_metadata(druids, apo_druid, publish = false)
  # Take the APO's defaultObjectRights datastream and write its content into
  # each object's rightsMetadata.
  default_rights = Dor::Item.find(apo_druid).datastreams['defaultObjectRights']
  self.replace_datastreams(druids, 'rightsMetadata', default_rights.content, publish)
end
|
332
|
+
|
333
|
+
# Replace a specific datastream for a series of objects in DOR with new content
|
334
|
+
#
|
335
|
+
# @param [array] druids - an array of druids
|
336
|
+
# @param [string] datastream_name - the name of the datastream to replace
|
337
|
+
# @param [string] new_content - the new content to replace the entire datastream with
|
338
|
+
# @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
|
339
|
+
#
|
340
|
+
# Example:
|
341
|
+
# druids=%w{druid:aa111aa1111 druid:bb222bb2222}
|
342
|
+
# new_content='<xml><more nodes>this should be the whole datastream</more nodes></xml>'
|
343
|
+
# datastream='rightsMetadata'
|
344
|
+
# Assembly::Utils.replace_datastreams(druids,datastream,new_content)
|
345
|
+
def self.replace_datastreams(druids, datastream_name, new_content, publish = false)
  druids.each do |druid|
    item = Dor::Item.find(druid)
    datastream = item.datastreams[datastream_name]

    # Nothing to replace when the object has no such datastream.
    unless datastream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    datastream.content = new_content
    datastream.save
    puts "replaced #{datastream_name} for #{druid}"

    next unless publish
    item.publish_metadata
    puts "--object re-published"
  end
end
|
362
|
+
|
363
|
+
# Republish a list of druids. Only works when run from a server with access rights to the stacks (e.g. lyberservices-prod)
|
364
|
+
#
|
365
|
+
# @param [array] druids - an array of druids
|
366
|
+
#
|
367
|
+
# Example:
|
368
|
+
# druids=%w{druid:aa111aa1111 druid:bb222bb2222}
|
369
|
+
# Assembly::Utils.republish(druids)
|
370
|
+
def self.republish(druids)
  # Look each object up and publish its metadata, reporting as we go.
  druids.each do |druid|
    Dor::Item.find(druid).publish_metadata
    puts "republished #{druid}"
  end
end
|
377
|
+
|
378
|
+
# Determines if the specified APO object contains a specified workflow defined in it
|
379
|
+
# DEPRECATED NOW THAT REIFIED WORKFLOWS ARE USED
|
380
|
+
# @param [string] druid - the druid of the APO to check
|
381
|
+
# @param [string] workflow - the name of the workflow to check
|
382
|
+
#
|
383
|
+
# @return [boolean] if workflow is defined in APO
|
384
|
+
#
|
385
|
+
# Example:
|
386
|
+
# Assembly::Utils.apo_workflow_defined?('druid:oo000oo0001','assembly')
|
387
|
+
# > true
|
388
|
+
def self.apo_workflow_defined?(druid, workflow)
  puts "************WARNING - THIS METHOD MAY NOT BE USEFUL ANYMORE SINCE WORKFLOWS ARE NO LONGER DEFINED IN THE APO**************"
  apo = Dor::Item.find(druid)
  object_type = apo.identityMetadata.objectType.first
  raise 'object not an APO' unless object_type == 'adminPolicy'

  admin_md = Nokogiri::XML(apo.administrativeMetadata.content)
  # Defined when exactly one element is named after the workflow...
  return true if admin_md.xpath("//#{workflow}").size == 1
  # ...or exactly one element carries the workflow name as its id attribute.
  admin_md.xpath("//*[@id='#{workflow}']").size == 1
end
|
395
|
+
|
396
|
+
# Determines if the specified object is an APO
|
397
|
+
# @param [string] druid - the druid of the APO to check
|
398
|
+
#
|
399
|
+
# @return [boolean] if object exist and is an APO
|
400
|
+
#
|
401
|
+
# Example:
|
402
|
+
# Assembly::Utils.is_apo?('druid:oo000oo0001')
|
403
|
+
# > true
|
404
|
+
def self.is_apo?(druid)
  # Any failure while loading or inspecting the object counts as "not an APO".
  Dor::Item.find(druid).identityMetadata.objectType.first == 'adminPolicy'
rescue
  false
end
|
412
|
+
|
413
|
+
# Update a specific datastream for a series of objects in DOR by searching and replacing content
|
414
|
+
#
|
415
|
+
# @param [array] druids - an array of druids
|
416
|
+
# @param [string] datastream_name - the name of the datastream to replace
|
417
|
+
# @param [string] find_content - the content to find
|
418
|
+
# @param [string] replace_content - the content to replace the found content with
|
419
|
+
#
|
420
|
+
# Example:
|
421
|
+
# druids=%w{druid:aa111aa1111 druid:bb222bb2222}
|
422
|
+
# find_content='FooBarBaz'
|
423
|
+
# replace_content='Stanford Rules'
|
424
|
+
# datastream='rightsMetadata'
|
425
|
+
# Assembly::Utils.update_datastreams(druids,datastream,find_content,replace_content)
|
426
|
+
def self.update_datastreams(druids, datastream_name, find_content, replace_content)
  druids.each do |druid|
    datastream = Dor::Item.find(druid).datastreams[datastream_name]

    # Skip objects that do not carry the requested datastream.
    unless datastream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    # Replace every occurrence of find_content and save the result back.
    datastream.content = datastream.content.gsub(find_content, replace_content)
    datastream.save
    puts "updated #{datastream_name} for #{druid}"
  end
end
|
440
|
+
|
441
|
+
# Unregister a DOR object, which includes deleting it and deleting all its workflows
|
442
|
+
#
|
443
|
+
# @param [string] pid of druid
|
444
|
+
#
|
445
|
+
# @return [boolean] if deletion succeed or not
|
446
|
+
def self.unregister(pid)
  # Workflows are removed first, then the object itself; any error along the
  # way is reported as false rather than raised.
  Assembly::Utils.delete_all_workflows pid
  Assembly::Utils.delete_from_dor pid
  true
rescue
  false
end
|
457
|
+
|
458
|
+
# Set the workflow step for the given PID to an error state
|
459
|
+
#
|
460
|
+
# @param [string] pid of druid
|
461
|
+
# @param [string] step to set to error
|
462
|
+
#
|
463
|
+
def self.set_workflow_step_to_error(pid, step)
  # Marks the step of the assembly workflow as errored with a fixed message;
  # anything other than a literal true from the service is treated as failure.
  updated = Dor::WorkflowService.update_workflow_error_status(
    'dor', pid, Assembly::ASSEMBLY_WF, step, 'Integration testing'
  )
  raise "update_workflow_error_status() returned false." unless updated == true
end
|
470
|
+
|
471
|
+
# Delete all workflows for the given PID. Destructive and should only be used when deleting an object from DOR.
|
472
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
473
|
+
#
|
474
|
+
# @param [string] pid of druid
|
475
|
+
# @param [String] repo repository dealing with the workflow. Default is 'dor'. Another option is 'sdr'
|
476
|
+
# e.g.
|
477
|
+
# Assembly::Utils.delete_all_workflows('druid:oo000oo0001')
|
478
|
+
def self.delete_all_workflows(pid, repo = 'dor')
  # Ask the workflow service which workflows exist for the pid, then delete each one.
  workflow_names = Dor::WorkflowService.get_workflows(pid)
  workflow_names.each do |workflow_name|
    Dor::WorkflowService.delete_workflow(repo, pid, workflow_name)
  end
end
|
481
|
+
|
482
|
+
# Reindex the supplied PID in solr.
|
483
|
+
#
|
484
|
+
# @param [string] pid of druid
|
485
|
+
# e.g.
|
486
|
+
# Assembly::Utils.reindex('druid:oo000oo0001')
|
487
|
+
def self.reindex(pid)
  obj = Dor.load_instance pid
  # Bail out before touching the object: to_solr used to be called ahead of
  # the nil check, so a missing object raised NoMethodError.
  return if obj.nil?
  Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
end
|
492
|
+
|
493
|
+
# Clear stray workflows - remove any workflow steps for orphaned objects.
|
494
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
495
|
+
def self.clear_stray_workflows
  service = Dor::WorkflowService
  # First element of every configured assembly step entry.
  step_names = Assembly::ASSEMBLY_WF_STEPS.map { |entry| entry[0] }
  first_step = step_names[0]

  step_names.each do |waiting_step|
    stray_druids = service.get_objects_for_workstep(first_step, waiting_step, 'dor', 'assemblyWF')
    stray_druids.each do |druid|
      args = ['dor', druid, 'assemblyWF', waiting_step, 'Integration testing']
      outcome = service.update_workflow_error_status(*args)
      puts "updated: resp=#{outcome} params=#{args.inspect}"
    end
  end
end
|
512
|
+
|
513
|
+
# Check if the object is fully accessioned and ingested.
|
514
|
+
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
|
515
|
+
#
|
516
|
+
# @param [string] pid the druid to operate on
|
517
|
+
#
|
518
|
+
# @return [boolean] if object is fully ingested
|
519
|
+
# Example:
|
520
|
+
# Assembly::Utils.is_ingested?('druid:oo000oo0001')
|
521
|
+
# > false
|
522
|
+
def self.is_ingested?(pid)
  # Coerce the 'accessioned' lifecycle lookup to a strict true/false.
  !!WFS.get_lifecycle(REPO, pid, 'accessioned')
end
|
525
|
+
|
526
|
+
# Check if the object is on ingest hold
|
527
|
+
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
|
528
|
+
#
|
529
|
+
# @param [string] pid the druid to operate on
|
530
|
+
#
|
531
|
+
# @return [boolean] if object is on ingest hold
|
532
|
+
# Example:
|
533
|
+
# Assembly::Utils.ingest_hold?('druid:oo000oo0001')
|
534
|
+
# > false
|
535
|
+
def self.ingest_hold?(pid)
  # On hold when the accessionWF 'sdr-ingest-transfer' step reports 'hold'.
  transfer_status = WFS.get_workflow_status(REPO, pid, 'accessionWF', 'sdr-ingest-transfer')
  transfer_status == 'hold'
end
|
538
|
+
|
539
|
+
# Check if the object is submitted
|
540
|
+
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
|
541
|
+
#
|
542
|
+
# @param [string] pid the druid to operate on
|
543
|
+
#
|
544
|
+
# @return [boolean] if object is submitted
|
545
|
+
# Example:
|
546
|
+
# Assembly::Utils.is_submitted?('druid:oo000oo0001')
|
547
|
+
# > false
|
548
|
+
def self.is_submitted?(pid)
  # True when the 'submitted' lifecycle lookup returns a value. The previous
  # `== nil` test was inverted: it answered true for objects that had NOT
  # been submitted, contradicting the method name, its documented return
  # value and the sibling is_ingested? check.
  !WFS.get_lifecycle(REPO, pid, 'submitted').nil?
end
|
551
|
+
|
552
|
+
# Reset the workflow states for a list of druids given a list of workflow names and steps.
|
553
|
+
# Provide a list of druids in an array, and a hash containing workflow names (e.g. 'assemblyWF' or 'accessionWF') as the keys, and arrays of steps
|
554
|
+
# as the corresponding values (e.g. ['checksum-compute','jp2-create']) and they will all be reset to "waiting".
|
555
|
+
# This method only works when this gem is used in a project that is configured to connect to DOR
|
556
|
+
#
|
557
|
+
# @param [Hash] params parameters specified as a hash, using symbols for options:
|
558
|
+
# * :druids => array of druids
|
559
|
+
# * :steps => a hash, containing workflow names as keys, and an array of steps
|
560
|
+
# * :state => a string for the name of the state to reset to, defaults to 'waiting' (could be 'completed' for example)
|
561
|
+
#
|
562
|
+
# Example:
|
563
|
+
# druids=['druid:aa111aa1111','druid:bb222bb2222']
|
564
|
+
# steps={'assemblyWF' => ['checksum-compute'],'accessionWF' => ['content-metadata','descriptive-metadata']}
|
565
|
+
# Assembly::Utils.reset_workflow_states(:druids=>druids,:steps=>steps)
|
566
|
+
def self.reset_workflow_states(params={})
  druids    = params[:druids] || []
  workflows = params[:steps] || {}
  state     = params[:state] || "waiting"

  druids.each do |druid|
    puts "** #{druid}"
    begin
      workflows.each do |workflow, steps|
        steps.each do |step|
          puts "Updating #{workflow}:#{step} to #{state}"
          Dor::WorkflowService.update_workflow_status 'dor', druid, workflow, step, state
        end
      end
    rescue StandardError => e
      # A failure for one druid is reported and the loop moves on to the next.
      # Only StandardError is caught so that Ctrl-C and process exit still
      # stop the run.
      puts "an error occurred trying to update workflows for #{druid} with message #{e.message}"
    end
  end
end
|
584
|
+
|
585
|
+
# Get a list of druids from a CSV file which has a heading of "druid" and put them into a Ruby array.
|
586
|
+
# Useful if you want to import a report from argo
|
587
|
+
#
|
588
|
+
# @param [string] filename of CSV that has a column called "druid"
|
589
|
+
#
|
590
|
+
# @return [array] array of druids
|
591
|
+
#
|
592
|
+
# Example:
|
593
|
+
# Assembly::Utils.read_druids_from_file('download.csv') # ['druid:xxxxx','druid:yyyyy']
|
594
|
+
def self.read_druids_from_file(csv_filename)
  records = CsvMapper.import(csv_filename) do read_attributes_from_file end
  # Prefix "druid:" onto any value that does not already contain it.
  records.map do |record|
    id = record.druid
    id.include?('druid:') ? id : "druid:#{id}"
  end
end
|
604
|
+
|
605
|
+
# Get a list of druids that have errored out in a particular workflow and step
|
606
|
+
#
|
607
|
+
# @param [string] workflow name
|
608
|
+
# @param [string] step name
|
609
|
+
# @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
|
610
|
+
#
|
611
|
+
# @return [hash] hash of results, with the druid as the key and the error message as the value
|
612
|
+
# e.g.
|
613
|
+
# result=Assembly::Utils.get_errored_objects_for_workstep('accessionWF','content-metadata','Project : Revs')
|
614
|
+
# => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580. See logger for details>"}
|
615
|
+
def self.get_errored_objects_for_workstep workflow, step, tag = ''
  errored = Dor::WorkflowService.get_errored_objects_for_workstep workflow, step, 'dor'
  return errored if tag == ''

  # Keep only the entries whose item carries the requested tag.
  errored.each_with_object({}) do |(druid, message), matches|
    begin
      matches[druid] = message if Dor::Item.find(druid).tags.include? tag
    rescue
      # Items that cannot be loaded or inspected are left out of the result.
    end
  end
end
|
631
|
+
|
632
|
+
# Reset any objects in a specific workflow step and state that have errored out back to waiting
|
633
|
+
#
|
634
|
+
# @param [string] workflow name
|
635
|
+
# @param [string] step name
|
636
|
+
# @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
|
637
|
+
#
|
638
|
+
# @return [hash] hash of results that have been reset, with the druid as the key and the error message as the value
|
639
|
+
# e.g.
|
640
|
+
# result=Assembly::Utils.reset_errored_objects_for_workstep('accessionWF','content-metadata')
|
641
|
+
# => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580. See logger for details>"}
|
642
|
+
def self.reset_errored_objects_for_workstep workflow, step, tag=''
  errored = self.get_errored_objects_for_workstep workflow, step, tag
  druids = errored.keys
  # Only call the reset when something actually errored out.
  unless druids.empty?
    self.reset_workflow_states(:druids => druids, :steps => {workflow => [step]})
  end
  errored
end
|
649
|
+
|
650
|
+
# Read in a list of druids from a pre-assembly progress log file and load into an array.
|
651
|
+
#
|
652
|
+
# @param [string] progress_log_file filename
|
653
|
+
# @param [boolean] completed if true, returns druids that have completed, if false, returns druids that failed (defaults to true)
|
654
|
+
#
|
655
|
+
# @return [array] list of druids
|
656
|
+
#
|
657
|
+
# Example:
|
658
|
+
# druids=Assembly::Utils.get_druids_from_log('/dor/preassembly/sohp_accession_log.yaml')
|
659
|
+
# puts druids
|
660
|
+
# > ['aa000aa0001','aa000aa0002']
|
661
|
+
def self.get_druids_from_log(progress_log_file, completed=true)
  # The progress log is read as a stream of YAML documents, one entry each.
  log_entries = YAML.load_stream(Assembly::Utils.read_file(progress_log_file))
  # Unwrap the parsed stream when it is an object responding to #documents
  # rather than a plain list of documents.
  log_entries = log_entries.documents if log_entries.respond_to?(:documents)

  # Keep the entries whose :pre_assem_finished flag equals the requested
  # state, then reduce each one to its :pid.
  matching = log_entries.select { |entry| entry[:pre_assem_finished] == completed }
  matching.map { |entry| entry[:pid] }
end
|
668
|
+
|
669
|
+
# Read in a YAML configuration file from disk and return a hash
|
670
|
+
#
|
671
|
+
# @param [string] filename of YAML config file to read
|
672
|
+
#
|
673
|
+
# @return [hash] configuration contents as a hash
|
674
|
+
#
|
675
|
+
# Example:
|
676
|
+
# config_filename='/thumpers/dpgthumper2-smpl/SC1017_SOHP/sohp_prod_accession.yaml'
|
677
|
+
# config=Assembly::Utils.load_config(config_filename)
|
678
|
+
# puts config['progress_log_file']
|
679
|
+
# > "/dor/preassembly/sohp_accession_log.yaml"
|
680
|
+
def self.load_config(filename)
  # read_file returns an empty string for an unreadable file, so in that case
  # the parser sees an empty document and the result is not a hash.
  # NOTE(review): depending on the YAML library version, YAML.load may
  # instantiate arbitrary objects -- only point this at trusted config files.
  raw_yaml = Assembly::Utils.read_file(filename)
  YAML.load(raw_yaml)
end
|
683
|
+
|
684
|
+
# Read in a file from disk
|
685
|
+
#
|
686
|
+
# @param [string] filename to read
|
687
|
+
#
|
688
|
+
# @return [string] file contents as a string
|
689
|
+
def self.read_file(filename)
  # A file that is not readable is reported as empty content rather than
  # raising an error.
  return '' unless File.readable?(filename)

  IO.read(filename)
end
|
692
|
+
|
693
|
+
# Used by the completion_report and project_tag_report in the pre-assembly project
|
694
|
+
#
|
695
|
+
# @param [solr_document] doc a solr document result
|
696
|
+
# @param [boolean] check_status_in_dor indicates if we should check for the workflow states in dor or trust SOLR is up to date (defaults to false)
|
697
|
+
#
|
698
|
+
# @return [array] the fields of one report row: druid, label, title, source id, accessioned, shelved, purl link, number of files, file type counts
|
699
|
+
def self.solr_doc_parser(doc, check_status_in_dor=false)
  # Builds one report row (an array) from a solr document.
  #
  # doc                 - solr document; read with [] and, for Solrizer 3.0+, fetch
  # check_status_in_dor - when true, the accessioned/shelved flags come from
  #                       get_workflow_status instead of the solr document
  #
  # Returns [druid, label, title, source_id, accessioned, shelved, purl_link,
  #          num_files, file_type_list]

  druid = doc[:id]

  # Field names depend on the Solrizer version. Compare as versions, not as
  # strings: a plain string comparison would sort '10.0' before '3.0'.
  if Gem::Version.new(Solrizer::VERSION) < Gem::Version.new('3.0')
    label     = doc[:objectLabel_t]
    title     = doc[:public_dc_title_t].nil? ? '' : doc[:public_dc_title_t].first
    wf_states = doc[:wf_wps_facet] || []
    source_id = doc[:source_id_t]
    files     = doc[:content_file_t]
  else
    label     = doc[Solrizer.solr_name('objectLabel', :displayable)]
    title     = doc.fetch(Solrizer.solr_name('public_dc_title', :displayable), []).first || ''
    wf_states = doc.fetch(Solrizer.solr_name('wf_wps', :symbol), [])
    source_id = doc[Solrizer.solr_name('source_id', :symbol)]
    files     = doc[Solrizer.solr_name('content_file', :symbol)]
  end

  if check_status_in_dor
    accessioned = self.get_workflow_status(druid, 'accessionWF', 'publish') == "completed"
    shelved     = self.get_workflow_status(druid, 'accessionWF', 'shelve') == "completed"
  else
    accessioned = wf_states.include?("accessionWF:publish:completed")
    shelved     = wf_states.include?("accessionWF:shelve:completed")
  end

  # Array() turns a missing field into [] and a single value into a one-element
  # list, so the counts below are always defined (an empty list yields 0 and "").
  files = Array(files)
  num_files = files.size

  # Count how many files there are of each extension.
  file_types = Hash.new(0)
  files.each { |file| file_types[File.extname(file)] += 1 }
  file_type_list = file_types.map { |ext, count| "#{ext}=#{count}" }.join(' | ')

  # The purl link uses the druid without any 'druid:' style prefix.
  purl_link = File.join(Assembly::PURL_BASE_URL, druid.split(/:/).last)

  return [druid, label, title, source_id, accessioned, shelved, purl_link, num_files, file_type_list]
end
|
752
|
+
|
753
|
+
# Takes a hash data structure and recursively converts all hash keys from strings to symbols.
|
754
|
+
#
|
755
|
+
# @param [hash] h hash
|
756
|
+
#
|
757
|
+
# @return [hash] a hash with all keys converted from strings to symbols
|
758
|
+
#
|
759
|
+
# Example:
|
760
|
+
# Assembly::Utils.symbolize_keys({'dude'=>'is cool','i'=>'am too'})
|
761
|
+
# > {:dude=>"is cool", :i=>"am too"}
|
762
|
+
def self.symbolize_keys(h)
  # Arrays are walked element by element; anything that is neither exactly a
  # Hash nor exactly an Array is returned untouched.
  return h.map { |element| symbolize_keys(element) } if h.instance_of?(Array)
  return h unless h.instance_of?(Hash)

  # Build a new hash: every key goes through to_sym, every value is converted
  # recursively.
  h.each_with_object({}) do |(key, value), converted|
    converted[key.to_sym] = symbolize_keys(value)
  end
end
|
771
|
+
|
772
|
+
# Takes a hash and converts its string values to symbols -- not recursively.
|
773
|
+
#
|
774
|
+
# @param [hash] h hash
|
775
|
+
#
|
776
|
+
# @return [hash] the same hash, with its top-level string values converted to symbols
|
777
|
+
#
|
778
|
+
# Example:
|
779
|
+
# Assembly::Utils.values_to_symbols!({'dude'=>'iscool','i'=>'amtoo'})
|
780
|
+
# > {"i"=>:amtoo, "dude"=>:iscool}
|
781
|
+
def self.values_to_symbols!(h)
  # Mutates h in place. Only top-level values whose class is exactly String
  # are converted; Hash#each returns its receiver, so the same hash is handed
  # back to the caller.
  h.each do |key, value|
    next unless value.class == String

    h[key] = value.to_sym
  end
end
|
784
|
+
|
785
|
+
# Removes any duplicate tags within each druid
|
786
|
+
#
|
787
|
+
# @param [array] druids - an array of druids
|
788
|
+
def self.remove_duplicate_tags(druids)
  # For every druid, look the item up and, for each tag that occurs more than
  # once, remove it and add it back, then save the item.
  druids.each do |druid|
    item = Dor::Item.find(druid)
    # Items that were not found, or that carry at most one tag, cannot have duplicates.
    next unless item && item.tags.size > 1

    item.tags.each do |tag|
      # The tag list is re-read on every pass, so a tag already cleaned up
      # earlier in this loop is not processed a second time.
      occurrences = item.tags.count { |existing| existing == tag }
      next unless occurrences > 1

      item.remove_tag(tag)
      item.add_tag(tag)
      puts "Saving #{druid} to remove duplicate tag='#{tag}'"
      item.save
    end
  end
end
|
803
|
+
|
804
|
+
private
|
805
|
+
# Used by the cleanup to ask user for confirmation of each step. Any response other than 'yes' results in the raising of an error
|
806
|
+
#
|
807
|
+
# @param [string] message the message to show to a user
|
808
|
+
#
|
809
|
+
def self.confirm(message)
  # Prints the message and waits for the user's answer on standard input.
  #
  # message - the prompt text to show
  #
  # Raises RuntimeError ("Exiting") unless the answer is 'y' or 'yes'
  # (case-insensitive, surrounding whitespace ignored).
  puts message
  # Read from $stdin explicitly: a bare Kernel#gets reads from the files named
  # in ARGV when the calling script was started with arguments. End of input
  # (nil) becomes an empty answer, which counts as "no".
  response = $stdin.gets.to_s.strip.downcase
  raise "Exiting" unless %w[y yes].include?(response)
end
|
814
|
+
|
815
|
+
end
|
816
|
+
|
817
|
+
end
|