assembly-utils 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,817 @@
1
+ require 'net/ssh'
2
+ require 'csv'
3
+ require 'csv-mapper'
4
+ require 'druid-tools'
5
+
6
+ begin
7
+ require 'net/ssh/kerberos'
8
+ rescue LoadError
9
+ end
10
+
11
+ module Assembly
12
+
13
+
14
+ # The Utils class contains methods to help with accessioning and assembly
15
+ class Utils
16
+
17
+ WFS = Dor::WorkflowService
18
+ REPO = 'dor'
19
+
20
# Build the staging directory tree for a druid, optionally rooted at a base path.
# Deprecated and should not be needed anymore.
#
# @param [String] pid druid pid (e.g. 'aa000aa0001')
# @param [String] base_path optional base path to prepend to the druid path
#
# @return [String] the parent directory of the druid path computed by DruidTools
#
# Example:
#   puts Assembly::Utils.get_staging_path('aa000aa0001','tmp')
#   > "tmp/aa/000/aa/0001"
def self.get_staging_path(pid, base_path = nil)
  File.dirname(DruidTools::Druid.new(pid, base_path).path)
end
36
+
37
# Insert the specified workflow into the specified object by POSTing to the
# DOR services "apo_workflows" endpoint.
#
# @param [String] pid druid pid (e.g. 'druid:aa000aa0001')
# @param [String] workflow name (e.g. 'accessionWF')
# @param [String] repo repository name -- optional, defaults to 'dor'; accepted but not used by this method
#
# @return [Object, false] the web service response when its code is 200, 201, 202 or 204, otherwise false
#
# Example:
#   puts Assembly::Utils.insert_workflow('druid:aa000aa0001','accessionWF')
def self.insert_workflow(pid, workflow, repo = 'dor')
  service_root = Dor::Config.dor.service_root
  response = RestClient.post("#{service_root}/objects/#{pid}/apo_workflows/#{workflow}", {})
  return false unless [200, 201, 202, 204].include?(response.code)
  response
end
53
+
54
# Claim a specific druid as already used to be sure it won't get used again.
# Not needed for normal purposes, only if you manually register something in Fedora Admin
# outside of the DOR services gem.
#
# @param [String] pid druid pid (e.g. 'aa000aa0001')
#
# @return [boolean] true when the SURI service answers the PUT with status 204
#
# Example:
#   puts Assembly::Utils.claim_druid('aa000aa0001')
#   > true
def self.claim_druid(pid)
  sc = Dor::Config.suri
  url = "#{sc.url}/suri2/namespaces/#{sc.id_namespace}"
  rcr = RestClient::Resource.new(url, :user => sc.user, :password => sc.pass)
  resp = rcr["identifiers/#{pid}"].put('')
  # RestClient exposes the status code as an Integer; normalising with to_i
  # accepts both 204 and "204" (the old string comparison could never match).
  resp.code.to_i == 204
end
71
+
72
# Force a full re-index of the supplied druid in solr and fedora.
#
# NOTE: this class defines `self.reindex` a second time further down; in Ruby the
# later definition replaces this one when the file is loaded.
#
# @param [String] druid druid (e.g. 'druid:aa000aa0001')
#
# @return [nil] when no object could be loaded for the druid; otherwise the
#   result of the object's update_index call
#
# Example:
#   puts Assembly::Utils.reindex('druid:aa000aa0001')
def self.reindex(druid)
  obj = Dor.load_instance druid
  # Guard before touching the object: to_solr on nil would raise NoMethodError.
  return if obj.nil?
  Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
  # The original referenced an undefined local `pid` here; the parameter is `druid`.
  Dor.find(druid).update_index
end
84
+
85
# Export one or more objects to the specified directory as FOXML files.
#
# @param [Array] pids - an array of pids to export (a single pid may also be passed as a String)
# @param [String] output_dir - the full path to output the foxml files
#
# Example:
#   Assembly::Utils.export_objects(['druid:aa000aa0001','druid:bb000bb0001'],'/tmp')
def self.export_objects(pids, output_dir)
  # A lone String is wrapped so that the loop below always sees a list.
  pid_list = pids.class == String ? [pids] : pids
  pid_list.each do |pid|
    ActiveFedora::FixtureExporter.export_to_path(pid, output_dir)
  end
end
96
+
97
# Import all of the FOXML files (names matching '*.foxml.xml') in the specified
# directory into Fedora, indexing each object after it is imported.
#
# The directory is listed without changing the process working directory, so a
# relative source_dir works and the caller's cwd is left untouched.
#
# @param [String] source_dir - the path to the directory holding the foxml files
#
# @return [Array<String>] the file names (sorted) that were imported
#
# Example:
#   Assembly::Utils.import_objects('/tmp')
def self.import_objects(source_dir)
  # fnmatch without FNM_DOTMATCH mirrors the glob: hidden files are not matched.
  foxml_names = Dir.entries(source_dir).select { |name| File.fnmatch?('*.foxml.xml', name) }.sort
  foxml_names.each do |name|
    pid = ActiveFedora::FixtureLoader.import_to_fedora(File.join(source_dir, name))
    ActiveFedora::FixtureLoader.index(pid)
  end
end
111
+
112
# Look up the druids that match the given source IDs.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# @param [Array] source_ids array of source ids to look up
#
# @return [Array] the flattened results of querying the search service once per source id
#
# Example:
#   puts Assembly::Utils.get_druids_by_sourceid(['revs-01','revs-02'])
#   > ['druid:aa000aa0001','druid:aa000aa0002']
def self.get_druids_by_sourceid(source_ids)
  source_ids.map { |source_id| Dor::SearchService.query_by_id(source_id) }.flatten
end
127
+
128
# Show the workflow status of specific steps in assembly and/or accession workflows for the provided druids.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# Each row is printed to stdout as comma-joined text; when :filename is given the
# same rows are also written to that file as CSV.
#
# @param [Hash] params parameters specified as a hash, using symbols for options:
#   * :druids => array of druids to get workflow status for
#   * :workflows => an optional array of workflow names as symbols, options are :assembly and :accession; defaults to [:assembly]
#   * :filename => optional filename if you want to send output to a CSV
#
# @return [nil]
#
# Example:
#   Assembly::Utils.workflow_status(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:workflows=>[:assembly,:accession],:filename=>'output.csv')
def self.workflow_status(params={})
  druids    = params[:druids] || []
  workflows = params[:workflows] || [:assembly]
  filename  = params[:filename] || ''
  write_csv = filename != ''

  accession_steps = %w(content-metadata descriptive-metadata rights-metadata remediate-object shelve publish)
  assembly_steps  = %w(jp2-create checksum-compute exif-collect accessioning-initiate)

  puts "Generating report"

  csv = CSV.open(filename, "w") if write_csv
  begin
    header = ["druid"]
    header.concat(assembly_steps) if workflows.include?(:assembly)
    header.concat(accession_steps) if workflows.include?(:accession)
    csv << header if write_csv
    puts header.join(',')

    druids.each do |druid|
      output = [druid]
      assembly_steps.each { |step| output << self.get_workflow_status(druid,'assemblyWF',step) } if workflows.include?(:assembly)
      accession_steps.each { |step| output << self.get_workflow_status(druid,'accessionWF',step) } if workflows.include?(:accession)
      csv << output if write_csv
      puts output.join(',')
    end
  ensure
    # Close (and thereby flush) the report even when a status lookup raises,
    # so rows already produced are not lost and the handle is not leaked.
    csv.close if csv
  end

  puts "Report generated in #{filename}" if write_csv
  nil
end
173
+
174
# Look up the status of one step of one workflow for the provided druid, always
# querying the 'dor' repository.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# @param [string] druid a druid string
# @param [string] workflow name of workflow
# @param [string] step name of step
#
# @return [string] whatever Dor::WorkflowService.get_workflow_status returns for the step
#   (documented upstream as the step status, nil if no workflow is found)
#
# Example:
#   puts Assembly::Utils.get_workflow_status('druid:aa000aa0001','assemblyWF','jp2-create')
#   > "completed"
def self.get_workflow_status(druid, workflow, step)
  lookup = ['dor', druid, workflow, step]
  Dor::WorkflowService.get_workflow_status(*lookup)
end
189
+
190
# Cleanup a list of objects and associated files given a list of druids.  WARNING: VERY DESTRUCTIVE.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# The user is asked to confirm the environment, then each recognised step; any
# answer rejected by Assembly::Utils.confirm raises and stops the cleanup.
# Unrecognised steps are not prompted for and do not count as valid steps.
#
# @param [Hash] params parameters specified as a hash, using symbols for options:
#   * :druids => array of druids to cleanup
#   * :steps => an array of steps, specified as symbols, indicating steps to be run, options are:
#       :stacks=This will remove all files from the stacks that were shelved for the objects
#       :dor=This will delete objects from Fedora
#       :stage=This will delete the staged content in the assembly workspace
#       :symlinks=This will remove the symlink from the dor workspace
#       :workflows=This will remove the assemblyWF and accessionWF workflows for this object
#   * :dry_run => do not actually clean up (defaults to false)
#
# Example:
#   Assembly::Utils.cleanup(:druids=>['druid:aa000aa0001','druid:aa000aa0002'],:steps=>[:stacks,:dor,:stage,:symlinks,:workflows])
def self.cleanup(params={})
  druids  = params[:druids] || []
  steps   = params[:steps] || []
  dry_run = params[:dry_run] || false

  step_descriptions = {
    :stacks    => 'This will remove all files from the stacks that were shelved for the objects',
    :dor       => 'This will delete objects from Fedora',
    :stage     => "This will delete the staged content in #{Assembly::ASSEMBLY_WORKSPACE}",
    :symlinks  => "This will remove the symlink from #{Assembly::DOR_WORKSPACE}",
    :workflows => "This will remove the accessionWF and assemblyWF workflows"
  }

  puts 'THIS IS A DRY RUN' if dry_run

  Assembly::Utils.confirm "Run on '#{ENV['ROBOT_ENVIRONMENT']}'? Any response other than 'y' or 'yes' will stop the cleanup now."
  if ENV['ROBOT_ENVIRONMENT'] == 'production'
    Assembly::Utils.confirm "Are you really sure you want to run on production? CLEANUP IS NOT REVERSIBLE"
  end

  # Count only the steps that are both recognised and agreed to.
  confirmed = 0
  steps.each do |step|
    next unless step_descriptions.key?(step)
    Assembly::Utils.confirm "Run step '#{step}'? #{step_descriptions[step]}. Any response other than 'y' or 'yes' will stop the cleanup now."
    confirmed += 1
  end

  raise "no valid steps specified for cleanup" if confirmed.zero?
  raise "no druids provided" if druids.size.zero?

  druids.each { |pid| Assembly::Utils.cleanup_object(pid, steps, dry_run) }
end
237
+
238
# Cleanup a single object and associated files given a druid.  WARNING: VERY DESTRUCTIVE.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# Failures (StandardError) are reported on stdout and swallowed so that a batch
# run can continue with the next object; interrupts and exit requests propagate.
#
# @param [string] pid a druid
# @param [array] steps an array of steps, options below
#   :stacks=This will remove all files from the stacks that were shelved for the objects
#   :dor=This will delete objects from Fedora
#   :stage=This will delete the staged content in the assembly workspace
#   :symlinks=This will remove the symlink from the dor workspace
#   :workflows=This will remove the assemblyWF and accessionWF workflows for this object
# @param [boolean] dry_run do not actually clean up (defaults to false)
#
# @return [nil]
#
# Example:
#   Assembly::Utils.cleanup_object('druid:aa000aa0001',[:stacks,:dor,:stage,:symlinks,:workflows])
def self.cleanup_object(pid,steps,dry_run=false)
  ssh_session = nil
  begin
    # The stacks host was previously read from an undefined `stacks_server`, which
    # meant the SSH session was never opened and the :stacks step always failed.
    stacks_server = Dor::Config.stacks.host if steps.include?(:stacks)
    # An SSH session is only needed when files will really be removed from the stacks.
    if steps.include?(:stacks) && !dry_run
      ssh_session = Net::SSH.start(stacks_server, Dor::Config.stacks.user, :auth_methods => %w(gssapi-with-mic publickey hostbased password keyboard-interactive))
    end

    druid_tree=DruidTools::Druid.new(pid).tree
    puts "Cleaning up #{pid}"
    if steps.include?(:dor)
      puts "-- deleting #{pid} from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      Assembly::Utils.unregister(pid) unless dry_run
    end
    if steps.include?(:symlinks)
      path_to_symlinks = [
        File.join(Assembly::DOR_WORKSPACE,druid_tree),
        Assembly::Utils.get_staging_path(pid,Assembly::DOR_WORKSPACE)
      ]
      path_to_symlinks.each do |path|
        # Test for a symlink first: File.directory? follows links, so a symlink
        # pointing at a directory would otherwise be reported as a real folder.
        if File.symlink?(path)
          puts "-- deleting symlink #{path}"
          File.delete(path) unless dry_run
        elsif File.directory?(path)
          puts "-- deleting folder #{path} (WARNING: should have been a symlink)"
          FileUtils.rm_rf path unless dry_run
        else
          puts "-- Skipping #{path}: not a folder or symlink"
        end
      end
    end
    if steps.include?(:stage)
      path_to_content=Assembly::Utils.get_staging_path(pid,Assembly::ASSEMBLY_WORKSPACE)
      puts "-- deleting folder #{path_to_content}"
      # File.exist? replaces File.exists?, which no longer exists in Ruby 3.2+.
      FileUtils.rm_rf path_to_content if !dry_run && File.exist?(path_to_content)
    end
    if steps.include?(:stacks)
      path_to_content= Dor::DigitalStacksService.stacks_storage_dir(pid)
      puts "-- removing files from the stacks on #{stacks_server} at #{path_to_content}"
      # NOTE(review): the path is interpolated into a shell command unescaped; it is
      # assumed to come from trusted DOR configuration -- confirm.
      ssh_session.exec!("rm -fr #{path_to_content}") unless dry_run
    end
    if steps.include?(:workflows)
      puts "-- deleting #{pid} accessionWF and assemblyWF workflows from Fedora #{ENV['ROBOT_ENVIRONMENT']}"
      unless dry_run
        Dor::WorkflowService.delete_workflow('dor',pid,'accessionWF')
        Dor::WorkflowService.delete_workflow('dor',pid,'assemblyWF')
      end
    end
  rescue StandardError => e
    # StandardError (not Exception) so that Ctrl-C / exit still stop a batch run.
    puts "** cleaning up failed for #{pid} with #{e.message}"
  ensure
    ssh_session.close if ssh_session
  end
  nil
end
301
+
302
# Delete an object from DOR: remove it through the configured Fedora client, then
# delete its solr document by id and commit the index.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# @param [string] pid the druid
#
# Example:
#   Assembly::Utils.delete_from_dor('druid:aa000aa0001')
def self.delete_from_dor(pid)
  object_resource = Dor::Config.fedora.client["objects/#{pid}"]
  object_resource.delete
  Dor::SearchService.solr.delete_by_id(pid)
  Dor::SearchService.solr.commit
end
316
+
317
# Quickly update rights metadata for a list of existing objects, using the content of the
# 'defaultObjectRights' datastream pulled from the supplied APO.
#
# @param [array] druids - an array of druids
# @param [string] apo_druid - the druid of the APO to pull rights metadata from
# @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
#
# Example:
#   druids=%w{druid:aa111aa1111 druid:bb222bb2222}
#   apo_druid='druid:cc222cc2222'
#   Assembly::Utils.update_rights_metadata(druids,apo_druid)
def self.update_rights_metadata(druids, apo_druid, publish = false)
  default_rights = Dor::Item.find(apo_druid).datastreams['defaultObjectRights']
  # The actual per-object work is delegated to replace_datastreams.
  replace_datastreams(druids, 'rightsMetadata', default_rights.content, publish)
end
332
+
333
# Replace a specific datastream for a series of objects in DOR with new content.
# Objects that do not have the named datastream are reported on stdout and skipped.
#
# @param [array] druids - an array of druids
# @param [string] datastream_name - the name of the datastream to replace
# @param [string] new_content - the new content to replace the entire datastream with
# @param [boolean] publish - defaults to false, if true, will publish each object after replacing datastreams (must be run on server with rights to do this)
#
# Example:
#   druids=%w{druid:aa111aa1111 druid:bb222bb2222}
#   new_content='<xml><more nodes>this should be the whole datastream</more nodes></xml>'
#   datastream='rightsMetadata'
#   Assembly::Utils.replace_datastreams(druids,datastream,new_content)
def self.replace_datastreams(druids, datastream_name, new_content, publish = false)
  druids.each do |druid|
    item = Dor::Item.find(druid)
    datastream = item.datastreams[datastream_name]

    unless datastream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    datastream.content = new_content
    datastream.save
    puts "replaced #{datastream_name} for #{druid}"

    next unless publish
    item.publish_metadata
    puts "--object re-published"
  end
end
362
+
363
# Republish a list of druids by calling publish_metadata on each object.
# Only works when run from a server with access rights to the stacks (e.g. lyberservices-prod)
#
# @param [array] druids - an array of druids
#
# Example:
#   druids=%w{druid:aa111aa1111 druid:bb222bb2222}
#   Assembly::Utils.republish(druids)
def self.republish(druids)
  druids.each do |druid|
    Dor::Item.find(druid).publish_metadata
    puts "republished #{druid}"
  end
end
377
+
378
# Determines if the specified APO object contains a specified workflow defined in it.
# DEPRECATED NOW THAT REIFIED WORKFLOWS ARE USED
#
# The workflow counts as defined when the APO's administrativeMetadata contains exactly one
# element named after the workflow, or exactly one element whose id attribute equals it.
#
# @param [string] druid - the druid of the APO to check
# @param [string] workflow - the name of the workflow to check
#
# @return [boolean] if workflow is defined in APO
#
# Example:
#   Assembly::Utils.apo_workflow_defined?('druid:oo000oo0001','assembly')
#   > true
def self.apo_workflow_defined?(druid, workflow)
  puts "************WARNING - THIS METHOD MAY NOT BE USEFUL ANYMORE SINCE WORKFLOWS ARE NO LONGER DEFINED IN THE APO**************"
  apo = Dor::Item.find(druid)
  unless apo.identityMetadata.objectType.first == 'adminPolicy'
    raise 'object not an APO'
  end

  admin_md = Nokogiri::XML(apo.administrativeMetadata.content)
  return true if admin_md.xpath("//#{workflow}").size == 1
  admin_md.xpath("//*[@id='#{workflow}']").size == 1
end
395
+
396
# Determines if the specified object is an APO.
# Any error raised while loading or inspecting the object is treated as "not an APO".
#
# @param [string] druid - the druid of the APO to check
#
# @return [boolean] if object exists and is an APO
#
# Example:
#   Assembly::Utils.is_apo?('druid:oo000oo0001')
#   > true
def self.is_apo?(druid)
  Dor::Item.find(druid).identityMetadata.objectType.first == 'adminPolicy'
rescue
  false
end
412
+
413
# Update a specific datastream for a series of objects in DOR by searching and replacing content.
# Objects that do not have the named datastream are reported on stdout and skipped.
#
# @param [array] druids - an array of druids
# @param [string] datastream_name - the name of the datastream to update
# @param [string] find_content - the content to find
# @param [string] replace_content - the content to replace the found content with
#
# Example:
#   druids=%w{druid:aa111aa1111 druid:bb222bb2222}
#   find_content='FooBarBaz'
#   replace_content='Stanford Rules'
#   datastream='rightsMetadata'
#   Assembly::Utils.update_datastreams(druids,datastream,find_content,replace_content)
def self.update_datastreams(druids, datastream_name, find_content, replace_content)
  druids.each do |druid|
    datastream = Dor::Item.find(druid).datastreams[datastream_name]

    unless datastream
      puts "#{datastream_name} does not exist for #{druid}"
      next
    end

    datastream.content = datastream.content.gsub(find_content, replace_content)
    datastream.save
    puts "updated #{datastream_name} for #{druid}"
  end
end
440
+
441
# Unregister a DOR object, which includes deleting all its workflows and then deleting
# the object itself. Errors from either step are swallowed and reported as false.
#
# @param [string] pid of druid
#
# @return [boolean] true if both deletions completed without raising, false otherwise
def self.unregister(pid)
  Assembly::Utils.delete_all_workflows pid
  Assembly::Utils.delete_from_dor pid
  true
rescue
  false
end
457
+
458
# Set the given step of the assembly workflow (Assembly::ASSEMBLY_WF) for the PID to an
# error state, using the fixed error message 'Integration testing'.
#
# @param [string] pid of druid
# @param [string] step to set to error
#
# @raise [RuntimeError] unless the workflow service call returns exactly true
def self.set_workflow_step_to_error(pid, step)
  error_args = ['dor', pid, Assembly::ASSEMBLY_WF, step, 'Integration testing']
  succeeded = Dor::WorkflowService.update_workflow_error_status(*error_args)
  return if succeeded == true
  raise "update_workflow_error_status() returned false."
end
470
+
471
# Delete all workflows for the given PID.  Destructive and should only be used when deleting an object from DOR.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# @param [string] pid of druid
# @param [String] repo repository dealing with the workflow.  Default is 'dor'.  Another option is 'sdr'
#
# Example:
#   Assembly::Utils.delete_all_workflows('druid:oo000oo0001')
def self.delete_all_workflows(pid, repo = 'dor')
  workflow_names = Dor::WorkflowService.get_workflows(pid)
  workflow_names.each do |workflow_name|
    Dor::WorkflowService.delete_workflow(repo, pid, workflow_name)
  end
end
481
+
482
# Reindex the supplied PID in solr.
#
# NOTE: `self.reindex` is also defined earlier in this class; this later
# definition is the one Ruby keeps once the file is loaded.
#
# @param [string] pid of druid
#
# @return [nil] when no object could be loaded for the pid; otherwise the
#   result of the solr add call
#
# Example:
#   Assembly::Utils.reindex('druid:oo000oo0001')
def self.reindex(pid)
  obj = Dor.load_instance pid
  # The nil check has to come before to_solr; previously to_solr was called
  # first, so a missing object raised NoMethodError despite the guard.
  return if obj.nil?
  Dor::SearchService.solr.add(obj.to_solr, :add_attributes => {:commitWithin => 1000})
end
492
+
493
# Clear stray workflows - remove any workflow steps for orphaned objects.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# For every assemblyWF step name (first element of each Assembly::ASSEMBLY_WF_STEPS entry),
# asks the workflow service for objects with the first step completed and that step waiting,
# marks that step as errored with the message 'Integration testing', and prints the outcome.
def self.clear_stray_workflows
  workflow_service = Dor::WorkflowService
  step_names = Assembly::ASSEMBLY_WF_STEPS.map { |entry| entry[0] }
  first_step = step_names[0]

  step_names.each do |waiting_step|
    stray_druids = workflow_service.get_objects_for_workstep(first_step, waiting_step, 'dor', 'assemblyWF')
    stray_druids.each do |stray_druid|
      error_args = ['dor', stray_druid, 'assemblyWF', waiting_step, 'Integration testing']
      outcome = workflow_service.update_workflow_error_status(*error_args)
      puts "updated: resp=#{outcome} params=#{error_args.inspect}"
    end
  end
end
512
+
513
# Check if the object is fully accessioned and ingested, i.e. whether the workflow
# service reports an 'accessioned' lifecycle milestone for it.
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
#
# @param [string] pid the druid to operate on
#
# @return [boolean] if object is fully ingested
#
# Example:
#   Assembly::Utils.is_ingested?('druid:oo000oo0001')
#   > false
def self.is_ingested?(pid)
  !!WFS.get_lifecycle(REPO, pid, 'accessioned')
end
525
+
526
# Check if the object is on ingest hold, i.e. whether the 'sdr-ingest-transfer' step of
# its accessionWF has the status 'hold'.
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
#
# @param [string] pid the druid to operate on
#
# @return [boolean] if object is on ingest hold
#
# Example:
#   Assembly::Utils.ingest_hold?('druid:oo000oo0001')
#   > false
def self.ingest_hold?(pid)
  transfer_status = WFS.get_workflow_status(REPO, pid, 'accessionWF', 'sdr-ingest-transfer')
  transfer_status == 'hold'
end
538
+
539
# Check if the object is submitted, i.e. whether the workflow service reports a
# 'submitted' lifecycle milestone for it.
# This method only works when this gem is used in a project that is configured to connect to the workflow service.
#
# NOTE: earlier code compared the lifecycle to nil and therefore answered true for
# objects that were NOT submitted, contradicting the method name and documentation.
#
# @param [string] pid the druid to operate on
#
# @return [boolean] if object is submitted
#
# Example:
#   Assembly::Utils.is_submitted?('druid:oo000oo0001')
#   > false
def self.is_submitted?(pid)
  !WFS.get_lifecycle(REPO, pid, 'submitted').nil?
end
551
+
552
# Reset the workflow states for a list of druids given a list of workflow names and steps.
# Provide a list of druids in an array, and a hash containing workflow names (e.g. 'assemblyWF' or 'accessionWF') as the keys, and arrays of steps
# as the corresponding values (e.g. ['checksum-compute','jp2-create']) and they will all be reset to the requested state.
# This method only works when this gem is used in a project that is configured to connect to DOR
#
# An error while updating one druid is reported on stdout and processing continues with
# the next druid. Only StandardError is handled this way, so Ctrl-C (Interrupt) and exit
# requests still abort the whole batch.
#
# @param [Hash] params parameters specified as a hash, using symbols for options:
#   * :druids => array of druids
#   * :steps => a hash, containing workflow names as keys, and an array of steps
#   * :state => a string for the name of the state to reset to, defaults to 'waiting' (could be 'completed' for example)
#
# Example:
#   druids=['druid:aa111aa1111','druid:bb222bb2222']
#   steps={'assemblyWF' => ['checksum-compute'],'accessionWF' => ['content-metadata','descriptive-metadata']}
#   Assembly::Utils.reset_workflow_states(:druids=>druids,:steps=>steps)
def self.reset_workflow_states(params={})
  druids    = params[:druids] || []
  workflows = params[:steps] || {}
  state     = params[:state] || "waiting"
  druids.each do |druid|
    puts "** #{druid}"
    begin
      workflows.each do |workflow,steps|
        steps.each do |step|
          puts "Updating #{workflow}:#{step} to #{state}"
          Dor::WorkflowService.update_workflow_status 'dor',druid,workflow, step, state
        end
      end
    rescue StandardError => e
      puts "an error occurred trying to update workflows for #{druid} with message #{e.message}"
    end
  end
end
584
+
585
# Get a list of druids from a CSV file which has a heading of "druid" and put them into a Ruby array.
# Useful if you want to import a report from argo.
# Values that do not already contain 'druid:' get that prefix added.
#
# @param [string] csv_filename filename of CSV that has a column called "druid"
#
# @return [array] array of druids
#
# Example:
#   Assembly::Utils.read_druids_from_file('download.csv') # ['druid:xxxxx','druid:yyyyy']
def self.read_druids_from_file(csv_filename)
  rows = CsvMapper.import(csv_filename) do read_attributes_from_file end
  rows.map do |row|
    value = row.druid
    value.include?('druid:') ? value : "druid:#{value}"
  end
end
604
+
605
# Get a list of druids that have errored out in a particular workflow and step (in the 'dor' repository).
#
# @param [string] workflow name
# @param [string] step name
# @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
#
# @return [hash] hash of results, with the druid as key and the error message as value
#
# Example:
#   result=Assembly::Utils.get_errored_objects_for_workstep('accessionWF','content-metadata','Project : Revs')
#   => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580.  See logger for details>"}
def self.get_errored_objects_for_workstep(workflow, step, tag = '')
  errored = Dor::WorkflowService.get_errored_objects_for_workstep(workflow, step, 'dor')
  return errored if tag == ''

  errored.each_with_object({}) do |(druid, error), matching|
    begin
      matching[druid] = error if Dor::Item.find(druid).tags.include?(tag)
    rescue
      # An object that cannot be loaded or inspected is simply left out of the result.
    end
  end
end
631
+
632
# Reset any objects in a specific workflow step that have errored out back to waiting.
#
# @param [string] workflow name
# @param [string] step name
# @param [string] tag -- optional, if supplied, results will be filtered by the exact tag supplied; note this will dramatically slow down the response if there are many results
#
# @return [hash] hash of results that have been reset, with the druid as key and the error message as value
#
# Example:
#   result=Assembly::Utils.reset_errored_objects_for_workstep('accessionWF','content-metadata')
#   => {"druid:qd556jq0580"=>"druid:qd556jq0580 - Item error; caused by #<Rubydora::FedoraInvalidRequest: Error modifying datastream contentMetadata for druid:qd556jq0580.  See logger for details>"}
def self.reset_errored_objects_for_workstep(workflow, step, tag='')
  errored = get_errored_objects_for_workstep(workflow, step, tag)
  errored_druids = errored.map { |druid, _message| druid }
  unless errored_druids.empty?
    reset_workflow_states(:druids => errored_druids, :steps => {workflow => [step]})
  end
  errored
end
649
+
650
# Read in a list of druids from a pre-assembly progress log file and load them into an array.
# The log is parsed as a stream of YAML documents; each document's :pid is kept when its
# :pre_assem_finished value equals the requested completion flag.
#
# @param [string] progress_log_file filename
# @param [boolean] completed if true, returns druids that have completed, if false, returns druids that failed (defaults to true)
#
# @return [array] list of druids
#
# Example:
#   druids=Assembly::Utils.get_druids_from_log('/dor/preassembly/sohp_accession_log.yaml')
#   puts druids
#   > ['aa000aa0001','aa000aa0002']
def self.get_druids_from_log(progress_log_file, completed=true)
  entries = YAML.load_stream(Assembly::Utils.read_file(progress_log_file))
  # Some YAML implementations wrap the stream in an object exposing #documents.
  entries = entries.documents if entries.respond_to?(:documents)
  matching = entries.select { |entry| entry[:pre_assem_finished] == completed }
  matching.map { |entry| entry[:pid] }
end
668
+
669
# Read in a YAML configuration file from disk and return its parsed contents.
# The file is read through Assembly::Utils.read_file, so an unreadable file is parsed
# as an empty string rather than raising.
#
# @param [string] filename of YAML config file to read
#
# @return [hash] configuration contents as a hash
#
# Example:
#   config_filename='/thumpers/dpgthumper2-smpl/SC1017_SOHP/sohp_prod_accession.yaml'
#   config=Assembly::Utils.load_config(config_filename)
#   puts config['progress_log_file']
#   > "/dor/preassembly/sohp_accession_log.yaml"
def self.load_config(filename)
  raw_yaml = Assembly::Utils.read_file(filename)
  YAML.load(raw_yaml)
end
683
+
684
# Read in a file from disk.
#
# @param [string] filename to read
#
# @return [string] file contents as a string, or an empty string when the file is not readable
def self.read_file(filename)
  return '' unless File.readable?(filename)
  IO.read(filename)
end
692
+
693
# Used by the completion_report and project_tag_report in the pre-assembly project
#
# Field names depend on the Solrizer version: versions before 3.0 use the fixed
# legacy field names, later versions derive them with Solrizer.solr_name.
#
# @param [solr_document] doc a solr document result
# @param [boolean] check_status_in_dor indicates if we should check for the workflow states in dor or trust SOLR is up to date (defaults to false)
#
# @return [array] one report row:
#   [druid, label, title, source_id, accessioned, shelved, purl_link, num_files, file_type_list]
#   where file_type_list is a string such as ".tif=2 | .jp2=1" ("" when there are no files)
def self.solr_doc_parser(doc,check_status_in_dor=false)

  druid = doc[:id]

  # Compare real version numbers; a plain string comparison would sort '10.0' before '3.0'.
  legacy_solrizer = Gem::Version.new(Solrizer::VERSION) < Gem::Version.new('3.0')

  if legacy_solrizer
    label = doc[:objectLabel_t]
    title = doc[:public_dc_title_t].nil? ? '' : doc[:public_dc_title_t].first
    workflow_facet = doc[:wf_wps_facet].nil? ? [] : doc[:wf_wps_facet]
    source_id = doc[:source_id_t]
    files = doc[:content_file_t]
  else
    label = doc[Solrizer.solr_name('objectLabel', :displayable)]
    title = doc.fetch(Solrizer.solr_name('public_dc_title', :displayable), []).first || ''
    workflow_facet = doc.fetch(Solrizer.solr_name('wf_wps', :symbol), [])
    source_id = doc[Solrizer.solr_name('source_id', :symbol)]
    files = doc[Solrizer.solr_name('content_file', :symbol)]
  end

  if check_status_in_dor
    accessioned = self.get_workflow_status(druid,'accessionWF','publish')=="completed"
    shelved = self.get_workflow_status(druid,'accessionWF','shelve')=="completed"
  else
    accessioned = workflow_facet.include?("accessionWF:publish:completed")
    shelved = workflow_facet.include?("accessionWF:shelve:completed")
  end

  # Count the files per extension. A missing or empty file list yields 0 files and
  # an empty summary string (an empty list used to leave the summary nil).
  num_files = files.nil? ? 0 : files.size
  file_types = Hash.new(0)
  files.each { |file| file_types[File.extname(file)] += 1 } unless num_files == 0
  file_type_list = file_types.map { |ext, count| "#{ext}=#{count}" }.join(' | ')

  purl_link = File.join(Assembly::PURL_BASE_URL, druid.split(/:/).last)

  return [druid, label, title, source_id, accessioned, shelved, purl_link, num_files,file_type_list]

end
752
+
753
# Takes a hash data structure and recursively converts all hash keys from strings to symbols.
# Arrays are walked element by element; any other value is returned unchanged.
#
# @param [hash] h hash
#
# @return [hash] a hash with all keys converted from strings to symbols
#
# Example:
#   Assembly::Utils.symbolize_keys({'dude'=>'is cool','i'=>'am too'})
#   > {:dude=>"is cool", :i=>"am too"}
def self.symbolize_keys(h)
  return h.map { |element| symbolize_keys(element) } if h.instance_of?(Array)
  return h unless h.instance_of?(Hash)

  h.each_with_object({}) do |(key, value), converted|
    converted[key.to_sym] = symbolize_keys(value)
  end
end
771
+
772
# Takes a hash and converts its string values to symbols -- not recursively.
# The hash is modified in place; non-String values are left untouched.
#
# @param [hash] h hash
#
# @return [hash] the same hash, with all String values converted to symbols
#
# Example:
#   Assembly::Utils.values_to_symbols!({'dude'=>'iscool','i'=>'amtoo'})
#   > {"i"=>:amtoo, "dude"=>:iscool}
def self.values_to_symbols!(h)
  h.each_pair do |key, value|
    next unless value.class == String
    h[key] = value.to_sym
  end
end
784
+
785
# Removes any duplicate tags within each druid.
# For every tag that occurs more than once on an object, the tag is removed, added back
# and the object saved.
#
# @param [array] druids - an array of druids
def self.remove_duplicate_tags(druids)
  druids.each do |druid|
    item = Dor::Item.find(druid)
    next unless item && item.tags.size > 1 # nothing to de-duplicate with fewer than two tags

    item.tags.each do |tag|
      occurrences = item.tags.select { |candidate| candidate == tag }
      next unless occurrences.size > 1 # tag is not duplicated

      item.remove_tag(tag)
      item.add_tag(tag)
      puts "Saving #{druid} to remove duplicate tag='#{tag}'"
      item.save
    end
  end
end
803
+
804
+ private
805
# Used by the cleanup to ask the user for confirmation of each step.  Any response other
# than 'y' or 'yes' (case-insensitive) results in the raising of an error.
#
# The answer is read from $stdin explicitly: a bare `gets` reads from the files named
# in ARGV when the script was started with arguments. End of input counts as "no".
#
# @param [string] message the message to show to a user
#
# @raise [RuntimeError] "Exiting" when the user does not confirm
def self.confirm(message)
  puts message
  # to_s turns the nil returned at end-of-input into an empty (non-confirming) answer.
  response = $stdin.gets.to_s.chomp.downcase
  raise "Exiting" unless %w(y yes).include?(response)
end
814
+
815
+ end
816
+
817
+ end