assembly-utils 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,817 @@
1
+ require 'net/ssh'
2
+ require 'csv'
3
+ require 'csv-mapper'
4
+ require 'druid-tools'
5
+
6
+ begin
7
+ require 'net/ssh/kerberos'
8
+ rescue LoadError
9
+ end
10
+
11
+ module Assembly
12
+
13
+
14
+ # The Utils class contains methods to help with accessioning and assembly
15
+ class Utils
16
+
17
+ WFS = Dor::WorkflowService
18
+ REPO = 'dor'
19
+
20
+ # Get the staging directory tree given a druid, and optionally prepend a basepath.
21
+ # Deprecated and should not be needed anymore.
22
+ #
23
+ # @param [String] pid druid pid (e.g. 'aa000aa0001')
24
+ # @param [String] base_path optional base path to prepend to druid path
25
+ #
26
+ # @return [string] path to material that is being staged, with optional prepended base path
27
+ #
28
+ # Example:
29
+ # puts Assembly::Utils.get_staging_path('aa000aa0001','tmp')
30
+ # > "tmp/aa/000/aa/0001"
31
def self.get_staging_path(pid, base_path = nil)
  # The staging location is the parent folder of the druid's own path,
  # optionally rooted at +base_path+.
  File.dirname(DruidTools::Druid.new(pid, base_path).path)
end
36
+
37
+ # Insert the specified workflow into the specified object.
38
+ #
39
+ # @param [String] pid druid pid (e.g. 'aa000aa0001')
40
+ # @param [String] workflow name (e.g. 'accessionWF')
41
+ # @param [String] repository name (e.g. 'dor') -- optional, defaults to dor
42
+ #
43
+ # @return [boolean] indicates success of web service call
44
+ #
45
+ # Example:
46
+ # puts Assembly::Utils.insert_workflow('druid:aa000aa0001','accessionWF')
47
+ # > true
48
def self.insert_workflow(pid, workflow, repo = 'dor')
  # NOTE: +repo+ is accepted but is not used when building the request URL.
  endpoint = "#{Dor::Config.dor.service_root}/objects/#{pid}/apo_workflows/#{workflow}"
  response = RestClient.post(endpoint, {})
  accepted_codes = [200, 201, 202, 204]
  # On success the response object itself is handed back; otherwise false.
  accepted_codes.include?(response.code) && response
end
53
+
54
+ # Claim a specific druid as already used to be sure it won't get used again.
55
+ # Not needed for normal purposes, only if you manually register something in Fedora Admin outside of DOR services gem.
56
+ #
57
+ # @param [String] pid druid pid (e.g. 'aa000aa0001')
58
+ #
59
+ # @return [boolean] indicates success of web service call
60
+ #
61
+ # Example:
62
+ # puts Assembly::Utils.claim_druid('aa000aa0001')
63
+ # > true
64
def self.claim_druid(pid)
  # Register +pid+ with the SURI identifier service so it cannot be minted again.
  # Returns true only when the service answers 204 (No Content).
  sc = Dor::Config.suri
  url = "#{sc.url}/suri2/namespaces/#{sc.id_namespace}"
  rcr = RestClient::Resource.new(url, :user => sc.user, :password => sc.pass)
  resp = rcr["identifiers/#{pid}"].put('')
  # The status code is compared numerically: the previous comparison against the
  # String "204" could never match an Integer code, so success was always reported
  # as false. to_i keeps a String code working as well.
  resp.code.to_i == 204
end
71
+
72
+ # Force a full re-index of the supplied druid in solr and fedora.
73
+ #
74
+ # @param [String] druid druid (e.g. 'druid:aa000aa0001')
75
+ #
76
+ # Example:
77
+ # puts Assembly::Utils.reindex('druid:aa000aa0001')
78
def self.reindex(druid)
  # Re-add the object's solr document and ask the object to refresh its own index.
  # NOTE: a second `self.reindex` is defined later in this class; Ruby keeps the
  # last definition, so this one is only effective if that later one is removed.
  obj = Dor.load_instance druid
  # Guard before touching the object: previously to_solr was called on nil.
  return if obj.nil?
  Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
  # Previously referenced an undefined local `pid`, which raised NameError.
  Dor.find(druid).update_index
end
84
+
85
+ # Export one or more objects given a single or array of pids, with output to the specified directory as FOXML files
86
+ #
87
+ # @param [Array] pids - an array of pids to export (can also pass a single pid as a string)
88
+ # @param [String] output_dir - the full path to output the foxml files
89
+ #
90
+ # Example:
91
+ # Assembly::Utils.export_objects(['druid:aa000aa0001','druid:bb000bb0001'],'/tmp')
92
def self.export_objects(pids, output_dir)
  # A lone pid given as a String is wrapped so both call styles share one loop.
  pid_list = pids.class == String ? [pids] : pids
  pid_list.each do |pid|
    ActiveFedora::FixtureExporter.export_to_path(pid, output_dir)
  end
end
96
+
97
+ # Import all of the FOXML files in the specified directory into Fedora
98
+ #
99
+ # @param [String] source_dir - the full path to import the foxml files
100
+ #
101
+ # Example:
102
+ # Assembly::Utils.import_objects('/tmp')
103
def self.import_objects(source_dir)
  # Load every '*.foxml.xml' file found directly in +source_dir+ into Fedora and
  # index the pid returned for each one.
  #
  # The directory is resolved to an absolute path up front and the working
  # directory is only changed inside a block, so the caller's cwd is left alone
  # and a relative +source_dir+ still yields valid file paths.
  base_dir = File.expand_path(source_dir)
  files = Dir.chdir(base_dir) { Dir.glob('*.foxml.xml') }
  files.each do |file|
    pid = ActiveFedora::FixtureLoader.import_to_fedora(File.join(base_dir, file))
    ActiveFedora::FixtureLoader.index(pid)
  end
end
111
+
112
+ # Get a list of druids that match the given array of source IDs.
113
+ # This method only works when this gem is used in a project that is configured to connect to DOR
114
+ #
115
+ # @param [Array<String>] source_ids array of source ids to lookup
116
+ #
117
+ # @return [array] druids
118
+ # Example:
119
+ #
120
+ # puts Assembly::Utils.get_druids_by_sourceid(['revs-01','revs-02'])
121
+ # > ['druid:aa000aa0001','druid:aa000aa0002']
122
def self.get_druids_by_sourceid(source_ids)
  # One search per source id; the per-id result lists are merged into a flat list.
  source_ids.map { |source_id| Dor::SearchService.query_by_id(source_id) }.flatten
end
127
+
128
+ # Show the workflow status of specific steps in assembly and/or accession workflows for the provided druids.
129
+ # This method only works when this gem is used in a project that is configured to connect to DOR
130
+ #
131
+ # @param [Hash] params parameters specified as a hash, using symbols for options:
132
+ # * :druids => array of druids to get workflow status for
133
+ # * :workflows => an optional array of workflow names as symbols, options are :assembly and :accession; defaults to :assembly
134
+ # * :filename => optional filename if you want to send output to a CSV
135
+ #
136
+ # @return [string] comma delimited output or CSV file
137
+ #
138
+ # Example:
139
+ # Assembly::Utils.workflow_status(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:workflows=>[:assembly,:accession],:filename=>'output.csv')
140
def self.workflow_status(params={})
  # Print (and optionally write to a CSV file) the status of a fixed set of
  # assemblyWF / accessionWF steps for each druid.
  #
  # params[:druids]    - druids to report on (default [])
  # params[:workflows] - any of :assembly, :accession (default [:assembly])
  # params[:filename]  - CSV file to write in addition to stdout (default none)
  #
  # Returns nil.
  druids = params[:druids] || []
  workflows = params[:workflows] || [:assembly]
  filename = params[:filename] || ''

  accession_steps = %w(content-metadata descriptive-metadata rights-metadata remediate-object shelve publish)
  assembly_steps = %w(jp2-create checksum-compute exif-collect accessioning-initiate)

  # Workflow name paired with its steps, in report column order.
  reported = []
  reported << ['assemblyWF', assembly_steps] if workflows.include?(:assembly)
  reported << ['accessionWF', accession_steps] if workflows.include?(:accession)

  puts "Generating report"

  csv = filename == '' ? nil : CSV.open(filename, "w")
  begin
    header = ["druid"] + reported.map { |_name, steps| steps }.flatten
    csv << header if csv
    puts header.join(',')

    druids.each do |druid|
      output = [druid]
      reported.each do |workflow_name, steps|
        steps.each { |step| output << self.get_workflow_status(druid, workflow_name, step) }
      end
      csv << output if csv
      puts output.join(',')
    end
  ensure
    # Close (and thereby flush) the file even when a status lookup raises.
    csv.close if csv
  end

  puts "Report generated in #{filename}" if csv
  nil
end
173
+
174
+ # Show the workflow status of a specific step in a specific workflow for the provided druid.
175
+ # This method only works when this gem is used in a project that is configured to connect to DOR
176
+ #
177
+ # @param [string] druid a druid string
178
+ # @param [string] workflow name of workflow
179
+ # @param [string] step name of step
180
+ #
181
+ # @return [string] workflow step status, returns nil if no workflow found
182
+ #
183
+ # Example:
184
+ # puts Assembly::Utils.get_workflow_status('druid:aa000aa0001','assemblyWF','jp2-create')
185
+ # > "completed"
186
def self.get_workflow_status(druid, workflow, step)
  # Thin pass-through to the workflow service, always against the 'dor' repository.
  workflow_service = Dor::WorkflowService
  workflow_service.get_workflow_status('dor', druid, workflow, step)
end
189
+
190
+ # Cleanup a list of objects and associated files given a list of druids. WARNING: VERY DESTRUCTIVE.
191
+ # This method only works when this gem is used in a project that is configured to connect to DOR
192
+ #
193
+ # @param [Hash] params parameters specified as a hash, using symbols for options:
194
+ # * :druids => array of druids to cleanup
195
+ # * :steps => an array of steps, specified as symbols, indicating steps to be run, options are:
196
+ # :stacks=This will remove all files from the stacks that were shelved for the objects
197
+ # :dor=This will delete objects from Fedora
198
+ # :stage=This will delete the staged content in the assembly workspace
199
+ # :symlinks=This will remove the symlink from the dor workspace
200
+ # :workflows=This will remove the assemblyWF and accessionWF workflows for this object
201
+ # * :dry_run => do not actually clean up (defaults to false)
202
+ #
203
+ # Example:
204
+ # Assembly::Utils.cleanup(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:steps=>[:stacks,:dor,:stage,:symlinks,:workflows])
205
def self.cleanup(params={})
  # Interactive front end for cleanup_object: asks for confirmation of the
  # environment and of every recognised step, then processes each druid.
  druids = params[:druids] || []
  steps = params[:steps] || []
  dry_run = params[:dry_run] || false

  # Recognised steps and the explanation shown when asking for confirmation.
  allowed_steps = {
    :stacks => 'This will remove all files from the stacks that were shelved for the objects',
    :dor => 'This will delete objects from Fedora',
    :stage => "This will delete the staged content in #{Assembly::ASSEMBLY_WORKSPACE}",
    :symlinks => "This will remove the symlink from #{Assembly::DOR_WORKSPACE}",
    :workflows => "This will remove the accessionWF and assemblyWF workflows"
  }

  puts 'THIS IS A DRY RUN' if dry_run

  environment = ENV['ROBOT_ENVIRONMENT']
  Assembly::Utils.confirm "Run on '#{environment}'? Any response other than 'y' or 'yes' will stop the cleanup now."
  if environment == 'production'
    Assembly::Utils.confirm "Are you really sure you want to run on production? CLEANUP IS NOT REVERSIBLE"
  end

  # Only recognised steps are confirmed; unknown ones are ignored here.
  confirmed_steps = steps.select { |step| allowed_steps.key?(step) }
  confirmed_steps.each do |step|
    Assembly::Utils.confirm "Run step '#{step}'? #{allowed_steps[step]}. Any response other than 'y' or 'yes' will stop the cleanup now."
  end

  raise "no valid steps specified for cleanup" if confirmed_steps.empty?
  raise "no druids provided" if druids.size == 0

  druids.each { |pid| Assembly::Utils.cleanup_object(pid, steps, dry_run) }
end
237
+
238
+ # Cleanup a single object and associated files given a druid. WARNING: VERY DESTRUCTIVE.
239
+ # This method only works when this gem is used in a project that is configured to connect to DOR
240
+ #
241
+ # @param [string] pid a druid
242
+ # @param [array] steps an array of steps, options below
243
+ # :stacks=This will remove all files from the stacks that were shelved for the objects
244
+ # :dor=This will delete objects from Fedora
245
+ # :stage=This will delete the staged content in the assembly workspace
246
+ # :symlinks=This will remove the symlink from the dor workspace
247
+ # :workflows=This will remove the assemblyWF and accessionWF workflows for this object
248
+ # @param [boolean] dry_run do not actually clean up (defaults to false)
249
+ #
250
+ # Example:
251
+ # Assembly::Utils.cleanup_object('druid:aa000aa0001',[:stacks,:dor,:stage,:symlinks,:workflows])
252
def self.cleanup_object(pid,steps,dry_run=false)
  # Run the requested cleanup steps for one druid. Failures are reported on
  # stdout rather than raised, so a batch can carry on with the next druid.
  #
  # pid     - the druid
  # steps   - array containing any of :dor, :symlinks, :stage, :stacks, :workflows
  # dry_run - when true, only print what would be done
  #
  # Returns nil.
  ssh_session = nil
  begin
    if steps.include?(:stacks)
      # Previously guarded by `defined?(stacks_server)` on a name that was never
      # assigned, so no session was ever opened and the stacks step always failed.
      stacks_server = Dor::Config.stacks.host
      # A dry run never executes the remote command, so it needs no connection.
      unless dry_run
        ssh_session = Net::SSH.start(stacks_server, Dor::Config.stacks.user, :auth_methods => %w(gssapi-with-mic publickey hostbased password keyboard-interactive))
      end
    end

    druid_tree = DruidTools::Druid.new(pid).tree
    puts "Cleaning up #{pid}"
    if steps.include?(:dor)
      puts "-- deleting #{pid} from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      Assembly::Utils.unregister(pid) unless dry_run
    end
    if steps.include?(:symlinks)
      path_to_symlinks = [
        File.join(Assembly::DOR_WORKSPACE, druid_tree),
        Assembly::Utils.get_staging_path(pid, Assembly::DOR_WORKSPACE)
      ]
      path_to_symlinks.each do |path|
        # Test for a symlink first: File.directory? follows links, so a link that
        # points at a folder would otherwise be reported as a real folder.
        if File.symlink?(path)
          puts "-- deleting symlink #{path}"
          File.delete(path) unless dry_run
        elsif File.directory?(path)
          puts "-- deleting folder #{path} (WARNING: should have been a symlink)"
          FileUtils.rm_rf path unless dry_run
        else
          puts "-- Skipping #{path}: not a folder or symlink"
        end
      end
    end
    if steps.include?(:stage)
      path_to_content = Assembly::Utils.get_staging_path(pid, Assembly::ASSEMBLY_WORKSPACE)
      puts "-- deleting folder #{path_to_content}"
      # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
      FileUtils.rm_rf path_to_content if !dry_run && File.exist?(path_to_content)
    end
    if steps.include?(:stacks)
      path_to_content = Dor::DigitalStacksService.stacks_storage_dir(pid)
      puts "-- removing files from the stacks on #{stacks_server} at #{path_to_content}"
      # NOTE(review): the path is interpolated into a shell command unescaped.
      ssh_session.exec!("rm -fr #{path_to_content}") unless dry_run
    end
    if steps.include?(:workflows)
      puts "-- deleting #{pid} accessionWF and assemblyWF workflows from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      unless dry_run
        Dor::WorkflowService.delete_workflow('dor', pid, 'accessionWF')
        Dor::WorkflowService.delete_workflow('dor', pid, 'assemblyWF')
      end
    end
  rescue StandardError => e
    # StandardError only: an interrupt or exit request must still stop the run.
    puts "** cleaning up failed for #{pid} with #{e.message}"
  ensure
    ssh_session.close if ssh_session
  end
  nil
end
301
+
302
+ # Delete an object from DOR.
303
+ # This method only works when this gem is used in a project that is configured to connect to DOR
304
+ #
305
+ # @param [string] pid the druid
306
+ #
307
+ # Example:
308
+ # Assembly::Utils.delete_from_dor('druid:aa000aa0001')
309
def self.delete_from_dor(pid)
  # Remove the object from Fedora first, then drop it from solr and commit.
  fedora_object = Dor::Config.fedora.client["objects/#{pid}"]
  fedora_object.delete
  Dor::SearchService.solr.delete_by_id(pid)
  Dor::SearchService.solr.commit
end
316
+
317
+ # Quickly update rights metadata for any existing list of objects using default rights metadata pulled from the supplied APO
318
+ #
319
+ # @param [array] druids - an array of druids
320
+ # @param [string] apo_druid - the druid of the APO to pull rights metadata from
321
+ # @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
322
+ #
323
+ # Example:
324
+ # druids=%w{druid:aa111aa1111 druid:bb222bb2222}
325
+ # apo_druid='druid:cc222cc2222'
326
+ # Assembly::Utils.update_rights_metadata(druids,apo_druid)
327
def self.update_rights_metadata(druids, apo_druid, publish = false)
  # The APO's defaultObjectRights datastream content becomes the new
  # rightsMetadata of every listed druid.
  default_rights = Dor::Item.find(apo_druid).datastreams['defaultObjectRights']
  self.replace_datastreams(druids, 'rightsMetadata', default_rights.content, publish)
end
332
+
333
+ # Replace a specific datastream for a series of objects in DOR with new content
334
+ #
335
+ # @param [array] druids - an array of druids
336
+ # @param [string] datastream_name - the name of the datastream to replace
337
+ # @param [string] new_content - the new content to replace the entire datastream with
338
+ # @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
339
+ #
340
+ # Example:
341
+ # druids=%w{druid:aa111aa1111 druid:bb222bb2222}
342
+ # new_content='<xml><more nodes>this should be the whole datastream</more nodes></xml>'
343
+ # datastream='rightsMetadata'
344
+ # Assembly::Utils.replace_datastreams(druids,datastream,new_content)
345
def self.replace_datastreams(druids, datastream_name, new_content, publish = false)
  # For each druid: overwrite the named datastream with +new_content+ and save it,
  # optionally re-publishing the object. Objects lacking the datastream are only
  # reported.
  druids.each do |druid|
    item = Dor::Item.find(druid)
    stream = item.datastreams[datastream_name]

    unless stream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    stream.content = new_content
    stream.save
    puts "replaced #{datastream_name} for #{druid}"

    next unless publish
    item.publish_metadata
    puts "--object re-published"
  end
end
362
+
363
+ # Republish a list of druids. Only works when run from a server with access rights to the stacks (e.g. lyberservices-prod)
364
+ #
365
+ # @param [array] druids - an array of druids
366
+ #
367
+ # Example:
368
+ # druids=%w{druid:aa111aa1111 druid:bb222bb2222}
369
+ # Assembly::Utils.republish(druids)
370
def self.republish(druids)
  # Publish the metadata of each listed object again, reporting every druid done.
  druids.each do |druid|
    Dor::Item.find(druid).publish_metadata
    puts "republished #{druid}"
  end
end
377
+
378
+ # Determines if the specifed APO object contains a specified workflow defined in it
379
+ # DEPRECATED NOW THAT REIFIED WORKFLOWS ARE USED
380
+ # @param [string] druid - the druid of the APO to check
381
+ # @param [string] workflow - the name of the workflow to check
382
+ #
383
+ # @return [boolean] if workflow is defined in APO
384
+ #
385
+ # Example:
386
+ # Assembly::Utils.apo_workflow_defined?('druid:oo000oo0001','assembly')
387
+ # > true
388
def self.apo_workflow_defined?(druid, workflow)
  puts "************WARNING - THIS METHOD MAY NOT BE USEFUL ANYMORE SINCE WORKFLOWS ARE NO LONGER DEFINED IN THE APO**************"
  apo = Dor::Item.find(druid)
  object_type = apo.identityMetadata.objectType.first
  raise 'object not an APO' unless object_type == 'adminPolicy'

  admin_xml = Nokogiri::XML(apo.administrativeMetadata.content)
  # Defined when exactly one element is named after the workflow ...
  return true if admin_xml.xpath("//#{workflow}").size == 1
  # ... or exactly one element carries the workflow name as its id attribute.
  admin_xml.xpath("//*[@id='#{workflow}']").size == 1
end
395
+
396
+ # Determines if the specifed object is an APO
397
+ # @param [string] druid - the druid of the APO to check
398
+ #
399
+ # @return [boolean] if object exist and is an APO
400
+ #
401
+ # Example:
402
+ # Assembly::Utils.is_apo?('druid:oo000oo0001')
403
+ # > true
404
def self.is_apo?(druid)
  # Any failure while loading or inspecting the object counts as "not an APO".
  Dor::Item.find(druid).identityMetadata.objectType.first == 'adminPolicy'
rescue
  false
end
412
+
413
+ # Update a specific datastream for a series of objects in DOR by searching and replacing content
414
+ #
415
+ # @param [array] druids - an array of druids
416
+ # @param [string] datastream_name - the name of the datastream to replace
417
+ # @param [string] find_content - the content to find
418
+ # @param [string] replace_content - the content to replace the found content with
419
+ #
420
+ # Example:
421
+ # druids=%w{druid:aa111aa1111 druid:bb222bb2222}
422
+ # find_content='FooBarBaz'
423
+ # replace_content='Stanford Rules'
424
+ # datastream='rightsMetadata'
425
+ # Assembly::Utils.update_datastreams(druids,datastream,find_content,replace_content)
426
def self.update_datastreams(druids, datastream_name, find_content, replace_content)
  # For each druid: substitute every occurrence of +find_content+ in the named
  # datastream and save it. Objects lacking the datastream are only reported.
  druids.each do |druid|
    stream = Dor::Item.find(druid).datastreams[datastream_name]

    unless stream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    stream.content = stream.content.gsub(find_content, replace_content)
    stream.save
    puts "updated #{datastream_name} for #{druid}"
  end
end
440
+
441
+ # Unregister a DOR object, which includes deleting it and deleting all its workflows
442
+ #
443
+ # @param [string] pid of druid
444
+ #
445
+ # @return [boolean] if deletion succeed or not
446
def self.unregister(pid)
  # Workflows go first, then the object itself; any error turns into false.
  Assembly::Utils.delete_all_workflows pid
  Assembly::Utils.delete_from_dor pid
  true
rescue
  false
end
457
+
458
+ # Set the workflow step for the given PID to an error state
459
+ #
460
+ # @param [string] pid of druid
461
+ # @param [string] step to set to error
462
+ #
463
def self.set_workflow_step_to_error(pid, step)
  # Flags +step+ of the assembly workflow as errored, with a fixed message,
  # and insists that the service reports exactly true.
  workflow_name = Assembly::ASSEMBLY_WF
  error_message = 'Integration testing'
  outcome = Dor::WorkflowService.update_workflow_error_status('dor', pid, workflow_name, step, error_message)
  raise "update_workflow_error_status() returned false." unless outcome == true
end
470
+
471
+ # Delete all workflows for the given PID. Destructive and should only be used when deleting an object from DOR.
472
+ # This method only works when this gem is used in a project that is configured to connect to DOR
473
+ #
474
+ # @param [string] pid of druid
475
+ # @param [String] repo repository dealing with the workflow. Default is 'dor'. Another option is 'sdr'
476
+ # e.g.
477
+ # Assembly::Utils.delete_all_workflows('druid:oo000oo0001')
478
def self.delete_all_workflows(pid, repo = 'dor')
  # Every workflow the service lists for the pid is deleted from +repo+.
  workflow_names = Dor::WorkflowService.get_workflows(pid)
  workflow_names.each do |workflow|
    Dor::WorkflowService.delete_workflow(repo, pid, workflow)
  end
end
481
+
482
+ # Reindex the supplied PID in solr.
483
+ #
484
+ # @param [string] pid of druid
485
+ # e.g.
486
+ # Assembly::Utils.reindex('druid:oo000oo0001')
487
def self.reindex(pid)
  # Re-add the object's solr document. This is the second `self.reindex` in the
  # class and therefore the definition Ruby actually keeps.
  obj = Dor.load_instance pid
  # Guard before building the document: previously to_solr was called first, so
  # a nil object raised NoMethodError and the nil check never took effect.
  return if obj.nil?
  Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
end
492
+
493
+ # Clear stray workflows - remove any workflow steps for orphaned objects.
494
+ # This method only works when this gem is used in a project that is configured to connect to DOR
495
def self.clear_stray_workflows
  repository = 'dor'
  workflow = 'assemblyWF'
  message = 'Integration testing'
  service = Dor::WorkflowService
  # Step names are the first entry of each ASSEMBLY_WF_STEPS element.
  step_names = Assembly::ASSEMBLY_WF_STEPS.map { |entry| entry[0] }
  completed_step = step_names[0]

  # Every object found waiting on a step (with the first step completed) has that
  # step set to error.
  step_names.each do |waiting_step|
    stray_druids = service.get_objects_for_workstep(completed_step, waiting_step, repository, workflow)
    stray_druids.each do |stray|
      call_args = [repository, stray, workflow, waiting_step, message]
      resp = service.update_workflow_error_status(*call_args)
      puts "updated: resp=#{resp} params=#{call_args.inspect}"
    end
  end
end
512
+
513
+ # Check if the object is full accessioned and ingested.
514
+ # This method only works when this gem is used in a project that is configured to connect to the workflow service.
515
+ #
516
+ # @param [string] pid the druid to operate on
517
+ #
518
+ # @return [boolean] if object is fully ingested
519
+ # Example:
520
+ # Assembly::Utils.is_ingested?('druid:oo000oo0001')
521
+ # > false
522
def self.is_ingested?(pid)
  # Coerces whatever the lifecycle lookup returns into a strict boolean.
  !!WFS.get_lifecycle(REPO, pid, 'accessioned')
end
525
+
526
+ # Check if the object is on ingest hold
527
+ # This method only works when this gem is used in a project that is configured to connect to the workflow service.
528
+ #
529
+ # @param [string] pid the druid to operate on
530
+ #
531
+ # @return [boolean] if object is on ingest hold
532
+ # Example:
533
+ # Assembly::Utils.ingest_hold?('druid:oo000oo0001')
534
+ # > false
535
def self.ingest_hold?(pid)
  # On hold means the accessionWF 'sdr-ingest-transfer' step reports 'hold'.
  transfer_status = WFS.get_workflow_status(REPO, pid, 'accessionWF', 'sdr-ingest-transfer')
  transfer_status == 'hold'
end
538
+
539
+ # Check if the object is submitted
540
+ # This method only works when this gem is used in a project that is configured to connect to the workflow service.
541
+ #
542
+ # @param [string] pid the druid to operate on
543
+ #
544
+ # @return [boolean] if object is submitted
545
+ # Example:
546
+ # Assembly::Utils.is_submitted?('druid:oo000oo0001')
547
+ # > false
548
def self.is_submitted?(pid)
  # True when the workflow service has a 'submitted' lifecycle milestone for +pid+.
  #
  # NOTE(review): the previous implementation compared the lookup with nil and so
  # returned true for objects WITHOUT the milestone, the opposite of the
  # documented contract ("if object is submitted"). Check callers for any that
  # compensated for the inverted result before shipping this.
  !WFS.get_lifecycle(REPO, pid, 'submitted').nil?
end
551
+
552
+ # Reset the workflow states for a list of druids given a list of workflow names and steps.
553
+ # Provide a list of druids in an array, and a hash containing workflow names (e.g. 'assemblyWF' or 'accessionWF') as the keys, and arrays of steps
554
+ # as the corresponding values (e.g. ['checksum-compute','jp2-create']) and they will all be reset to "waiting".
555
+ # This method only works when this gem is used in a project that is configured to connect to DOR
556
+ #
557
+ # @param [Hash] params parameters specified as a hash, using symbols for options:
558
+ # * :druids => array of druids
559
+ # * :steps => a hash, containing workflow names as keys, and an array of steps
560
+ # * :state => a string for the name of the state to reset to, defaults to 'waiting' (could be 'completed' for example)
561
+ #
562
+ # Example:
563
+ # druids=['druid:aa111aa1111','druid:bb222bb2222']
564
+ # steps={'assemblyWF' => ['checksum-compute'],'accessionWF' => ['content-metadata','descriptive-metadata']}
565
+ # Assembly::Utils.reset_workflow_states(:druids=>druids,:steps=>steps)
566
def self.reset_workflow_states(params={})
  # Set the given workflow steps to a state for every druid.
  #
  # params[:druids] - array of druids (default [])
  # params[:steps]  - hash of workflow name => array of step names (default {})
  # params[:state]  - state to set (default "waiting")
  #
  # A failure for one druid is reported on stdout and the loop moves on to the
  # next druid.
  druids = params[:druids] || []
  workflows = params[:steps] || {}
  state = params[:state] || "waiting"
  druids.each do |druid|
    puts "** #{druid}"
    begin
      workflows.each do |workflow, steps|
        steps.each do |step|
          puts "Updating #{workflow}:#{step} to #{state}"
          Dor::WorkflowService.update_workflow_status 'dor', druid, workflow, step, state
        end
      end
    rescue StandardError => e
      # StandardError rather than Exception: an interrupt (Ctrl-C) or exit request
      # must abort the batch instead of being logged and ignored.
      puts "an error occurred trying to update workflows for #{druid} with message #{e.message}"
    end
  end
end
584
+
585
+ # Get a list of druids from a CSV file which has a heading of "druid" and put them into a Ruby array.
586
+ # Useful if you want to import a report from argo
587
+ #
588
+ # @param [string] filename of CSV that has a column called "druid"
589
+ #
590
+ # @return [array] array of druids
591
+ #
592
+ # Example:
593
+ # Assembly::Utils.read_druids_from_file('download.csv') # ['druid:xxxxx','druid:yyyyy']
594
def self.read_druids_from_file(csv_filename)
  rows = CsvMapper.import(csv_filename) do read_attributes_from_file end
  # Values lacking the 'druid:' text get it prepended; others pass through as-is.
  rows.map do |row|
    druid = row.druid
    druid.include?('druid:') ? druid : "druid:#{druid}"
  end
end
604
+
605
+ # Get a list of druids that have errored out in a particular workflow and step
606
+ #
607
+ # @param [string] workflow name
608
+ # @param [string] step name
609
+ # @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
610
+ #
611
+ # @return [hash] hash of results, with key has a druid, and value as the error message
612
+ # e.g.
613
+ # result=Assembly::Utils.get_errored_objects_for_workstep('accessionWF','content-metadata','Project : Revs')
614
+ # => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580. See logger for details>"}
615
def self.get_errored_objects_for_workstep workflow, step, tag = ''
  errored = Dor::WorkflowService.get_errored_objects_for_workstep(workflow, step, 'dor')
  # Without a tag the service result is returned untouched.
  return errored if tag == ''

  matching = {}
  errored.each do |druid, error|
    begin
      matching[druid] = error if Dor::Item.find(druid).tags.include?(tag)
    rescue
      # Objects that cannot be loaded or inspected are left out of the result.
    end
  end
  matching
end
631
+
632
+ # Reset any objects in a specific workflow step and state that have errored out back to waiting
633
+ #
634
+ # @param [string] workflow name
635
+ # @param [string] step name
636
+ # @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
637
+ #
638
+ # @return [hash] hash of results that have been reset, with key has a druid, and value as the error message
639
+ # e.g.
640
+ # result=Assembly::Utils.reset_errored_objects_for_workstep('accessionWF','content-metadata')
641
+ # => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580. See logger for details>"}
642
def self.reset_errored_objects_for_workstep workflow, step, tag=''
  result = self.get_errored_objects_for_workstep workflow, step, tag
  errored_druids = result.map { |druid, _error| druid }
  # Only call the reset when something actually errored.
  if errored_druids.size > 0
    self.reset_workflow_states(:druids => errored_druids, :steps => { workflow => [step] })
  end
  result
end
649
+
650
+ # Read in a list of druids from a pre-assembly progress load file and load into an array.
651
+ #
652
+ # @param [string] progress_log_file filename
653
+ # @param [boolean] completed if true, returns druids that have completed, if false, returns druids that failed (defaults to true)
654
+ #
655
+ # @return [array] list of druids
656
+ #
657
+ # Example:
658
+ # druids=Assembly::Utils.get_druids_from_log('/dor/preassembly/sohp_accession_log.yaml')
659
+ # puts druids
660
+ # > ['aa000aa0001','aa000aa0002']
661
def self.get_druids_from_log(progress_log_file, completed = true)
  entries = YAML.load_stream(Assembly::Utils.read_file(progress_log_file))
  # Some YAML engines wrap the parsed documents in a stream object.
  entries = entries.documents if entries.respond_to? :documents
  # Keep the pids whose finished flag equals the requested completion state.
  entries.select { |entry| entry[:pre_assem_finished] == completed }.map { |entry| entry[:pid] }
end
668
+
669
+ # Read in a YAML configuration file from disk and return a hash
670
+ #
671
+ # @param [string] filename of YAML config file to read
672
+ #
673
+ # @return [hash] configuration contents as a hash
674
+ #
675
+ # Example:
676
+ # config_filename='/thumpers/dpgthumper2-smpl/SC1017_SOHP/sohp_prod_accession.yaml'
677
+ # config=Assembly::Utils.load_config(config_filename)
678
+ # puts config['progress_log_file']
679
+ # > "/dor/preassembly/sohp_accession_log.yaml"
680
def self.load_config(filename)
  # File reading is delegated to read_file; the text is then parsed as YAML.
  raw_yaml = Assembly::Utils.read_file(filename)
  YAML.load(raw_yaml)
end
683
+
684
+ # Read in a file from disk
685
+ #
686
+ # @param [string] filename to read
687
+ #
688
+ # @return [string] file contents as a string
689
def self.read_file(filename)
  # Files that are missing or unreadable yield an empty string.
  return '' unless File.readable?(filename)
  IO.read(filename)
end
692
+
693
+ # Used by the completion_report and project_tag_report in the pre-assembly project
694
+ #
695
+ # @param [solr_document] doc a solr document result
696
+ # @param [boolean] check_status_in_dor indicates if we should check for the workflow states in dor or trust SOLR is up to date (defaults to false)
697
+ #
698
+ # @return [string] a comma delimited row for the report
699
def self.solr_doc_parser(doc,check_status_in_dor=false)
  # Turn one solr document into a report row.
  #
  # doc                 - the solr document (read with [] and fetch)
  # check_status_in_dor - when true, ask the workflow service for the publish and
  #                       shelve status instead of reading them from the document
  #
  # Returns [druid, label, title, source_id, accessioned, shelved, purl_link,
  # num_files, file_type_list].
  druid = doc[:id]

  # Compare versions numerically; a plain String comparison would sort '10.0'
  # before '3.0' and pick the legacy field names on new Solrizer releases.
  legacy_fields = Gem::Version.new(Solrizer::VERSION.to_s) < Gem::Version.new('3.0')

  if legacy_fields
    label = doc[:objectLabel_t]
    title = doc[:public_dc_title_t].nil? ? '' : doc[:public_dc_title_t].first
    workflow_facets = doc[:wf_wps_facet] || []
    source_id = doc[:source_id_t]
    files = doc[:content_file_t]
  else
    label = doc[Solrizer.solr_name('objectLabel', :displayable)]
    title = doc.fetch(Solrizer.solr_name('public_dc_title', :displayable), []).first || ''
    workflow_facets = doc.fetch(Solrizer.solr_name('wf_wps', :symbol), [])
    source_id = doc[Solrizer.solr_name('source_id', :symbol)]
    files = doc[Solrizer.solr_name('content_file', :symbol)]
  end

  if check_status_in_dor
    accessioned = self.get_workflow_status(druid, 'accessionWF', 'publish') == "completed"
    shelved = self.get_workflow_status(druid, 'accessionWF', 'shelve') == "completed"
  else
    accessioned = workflow_facets.include?("accessionWF:publish:completed")
    shelved = workflow_facets.include?("accessionWF:shelve:completed")
  end

  # Count files per extension. A missing or empty file list gives 0 and '';
  # previously an empty list left file_type_list as nil.
  files ||= []
  num_files = files.size
  file_types = Hash.new(0)
  files.each { |file| file_types[File.extname(file)] += 1 }
  file_type_list = file_types.map { |extension, count| "#{extension}=#{count}" }.join(' | ')

  # The PURL is the base URL plus the druid with any 'druid:' prefix removed.
  purl_link = File.join(Assembly::PURL_BASE_URL, druid.split(/:/).last)

  [druid, label, title, source_id, accessioned, shelved, purl_link, num_files, file_type_list]
end
752
+
753
+ # Takes a hash data structure and recursively converts all hash keys from strings to symbols.
754
+ #
755
+ # @param [hash] h hash
756
+ #
757
+ # @return [hash] a hash with all keys converted from strings to symbols
758
+ #
759
+ # Example:
760
+ # Assembly::Utils.symbolize_keys({'dude'=>'is cool','i'=>'am too'})
761
+ # > {:dude=>"is cool", :i=>"am too"}
762
def self.symbolize_keys(h)
  # Arrays are walked element by element; anything that is neither exactly an
  # Array nor exactly a Hash is returned as it came in.
  return h.map { |element| symbolize_keys(element) } if h.instance_of?(Array)
  return h unless h.instance_of?(Hash)

  h.each_with_object({}) do |(key, value), converted|
    converted[key.to_sym] = symbolize_keys(value)
  end
end
771
+
772
+ # Takes a hash and converts its string values to symbols -- not recursively.
773
+ #
774
+ # @param [hash] h hash
775
+ #
776
+ # @return [hash] the same hash, with all string values converted to symbols
777
+ #
778
+ # Example:
779
+ # Assembly::Utils.values_to_symbols!({'dude'=>'iscool','i'=>'amtoo'})
780
+ # > {"i"=>:amtoo, "dude"=>:iscool}
781
def self.values_to_symbols!(h)
  # Works in place on the hash passed in: only values whose class is exactly
  # String are converted. The same hash is returned.
  h.each do |key, value|
    next unless value.class == String
    h[key] = value.to_sym
  end
end
784
+
785
+ # Removes any duplicate tags within each druid
786
+ #
787
+ # @param [array] druids - an array of druids
788
def self.remove_duplicate_tags(druids)
  druids.each do |druid|
    item = Dor::Item.find(druid)
    # Nothing to do unless the object exists and carries more than one tag.
    next unless item && item.tags.size > 1

    item.tags.each do |tag|
      # Tags occurring once are left alone.
      next unless item.tags.count { |candidate| candidate == tag } > 1

      # Remove the tag, add it back, then save the object.
      item.remove_tag(tag)
      item.add_tag(tag)
      puts "Saving #{druid} to remove duplicate tag='#{tag}'"
      item.save
    end
  end
end
803
+
804
+ private
805
+ # Used by the cleanup to ask user for confirmation of each step. Any response other than 'y' or 'yes' results in the raising of an error
806
+ #
807
+ # @param [string] message the message to show to a user
808
+ #
809
def self.confirm(message)
  # Print +message+, read one line from standard input, and raise "Exiting"
  # unless the answer is 'y' or 'yes' (case-insensitive).
  #
  # NOTE: the bare `private` above this method does not apply to `def self.`
  # methods, so this stays publicly callable, which the
  # `Assembly::Utils.confirm` calls in cleanup rely on.
  puts message
  # Read from $stdin explicitly: bare `gets` reads from files named in ARGV when
  # any are present (e.g. rake task names). to_s turns end-of-input (nil) into a
  # refusal instead of a NoMethodError.
  response = $stdin.gets.to_s.strip.downcase
  raise "Exiting" if response != 'y' && response != 'yes'
end
814
+
815
+ end
816
+
817
+ end