lyber-core 0.9.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,103 @@
1
+ require 'dor_service'
2
+ require "roxml_models/identity_metadata/identity_metadata"
3
+ require "roxml_models/identity_metadata/dublin_core"
4
+
5
module LyberCore
  module Robots
    # Represents a single object being processed as part of a workflow queue.
    #
    # A WorkItem keeps a reference to the WorkQueue that created it, carries
    # either a druid or a set of alternative identifiers, and reports its
    # outcome through #set_success or #set_error.
    class WorkItem

      # The queue that this workitem is a member of
      attr_reader :work_queue
      # The primary id for the object being processed
      attr_accessor :druid
      # Inject an IdentityMetadata object (currently used for unit testing only).
      # Only the writer is generated; the reader is written out below because it
      # builds the object on first use.
      attr_writer :identity_metadata
      # Timings for this workitem's processing
      attr_reader :start_time
      attr_reader :end_time
      attr_reader :elapsed_time

      # Create a new WorkItem object, save a pointer to the parent WorkQueue,
      # and start the timer.
      #
      # @param work_queue the queue this item belongs to; its success_count and
      #   error_count are incremented by #set_success / #set_error
      def initialize(work_queue)
        @work_queue = work_queue
        @start_time = Time.new
      end

      # Save the IdentityMetadata object to the identityMetadata datastream.
      # The datastream is added when DorService reports none for this druid,
      # and updated (as application/xml, not versionable) otherwise.
      def identity_metadata_save
        existing = DorService.get_datastream(@druid, 'identityMetadata')
        xml = identity_metadata.to_xml.to_xml
        if existing
          DorService.update_datastream(@druid, 'identityMetadata', xml, 'application/xml', false)
        else
          DorService.add_datastream(@druid, 'identityMetadata', 'identityMetadata', xml)
        end
      end

      # Return the IdentityMetadata object bound to the identityMetadata
      # datastream XML. It is built on first access: empty when no druid is
      # set, otherwise parsed from the datastream fetched through DorService.
      def identity_metadata
        return @identity_metadata unless @identity_metadata.nil?

        @identity_metadata =
          if @druid.nil?
            IdentityMetadata.new
          else
            IdentityMetadata.from_xml(DorService.get_datastream(@druid, 'identityMetadata'))
          end
      end

      # Return the identifier value for the specified identifier name
      def identifier(key)
        identity_metadata.get_identifier_value(key)
      end

      # Add a new name,value pair to the set of identifiers
      def identifier_add(key, value)
        identity_metadata.add_identifier(key, value)
      end

      # Return an array of strings where each entry consists of name:value
      def id_pairs
        identity_metadata.get_id_pairs
      end

      # Return the druid for the work item if it exists, else the first
      # identifier pair. Returns nil when neither is available.
      def item_id
        return @druid if @druid

        pairs = identity_metadata.get_id_pairs
        pairs[0] if pairs.size > 0
      end

      # Record the successful outcome of the workstep operation for this
      # workitem: bump the queue's success tally, stop the timer, log, and
      # (only when a druid is set) mark the workflow step 'completed'.
      def set_success
        @work_queue.success_count += 1
        stop_timer
        LyberCore::Log.info("#{item_id} completed in #{@elapsed_time} seconds")
        if @druid
          workflow = @work_queue.workflow
          Dor::WorkflowService.update_workflow_status(workflow.repository, @druid, workflow.workflow_id, @work_queue.workflow_step, 'completed', @elapsed_time)
        end
      end

      # Record the unsuccessful outcome of the workstep operation for this
      # workitem. The exception is logged and reported, not re-raised.
      #
      # @param e [Exception] the error that stopped processing of this item
      def set_error(e)
        @work_queue.error_count += 1
        stop_timer
        # An exception that was never raised has no backtrace, hence Array().
        # Joining with newlines logs one frame per line instead of the
        # interpolated array.
        trace = Array(e.backtrace).join("\n")
        LyberCore::Log.error("#{item_id} error - #{e.class}: #{e.message}\n#{trace}")
        if @druid
          workflow = @work_queue.workflow
          DorService.update_workflow_error_status(workflow.repository, @druid, workflow.workflow_id, @work_queue.workflow_step, e.message)
        end
        # We've caught and processed the error at this point, I don't think we want to raise it again. --bess
        # raise e
      end

      private

      # Stamp the end time and compute the elapsed time since construction.
      def stop_timer
        @end_time = Time.new
        @elapsed_time = @end_time - @start_time
      end

    end
  end
end
@@ -0,0 +1,154 @@
1
+ require 'dor_service'
2
+ require 'dlss_service'
3
+ require 'yaml'
4
+
5
module LyberCore
  module Robots
    # Represents a set of workitem objects to be processed by a given step of a workflow
    class WorkQueue

      # The workflow that this queue is a part of
      attr_reader :workflow
      # The step in the workflow that is being processed against this queue
      attr_reader :workflow_step
      # The workflow step that should have already been completed for the workitem object
      attr_reader :prerequisite
      # The maximum number of workitem objects to process in one run of a robot
      attr_reader :batch_limit
      # The maximum number of errors to allow before terminating the batch run
      attr_reader :error_limit
      # The array of primary identifiers for the workitem objects to be processed
      attr_reader :druids
      # An alternative identifier to be used when druids are not yet available (e.g at registration)
      attr_reader :identifier_name
      attr_reader :identifier_values
      # The tally of how many items have been processed
      attr_reader :item_count
      attr_accessor :success_count
      attr_accessor :error_count
      # The timings for the batch run
      attr_reader :start_time
      attr_reader :end_time
      attr_reader :elapsed_time

      # Full path of the process-config.yaml file read by #process_config_file
      attr_reader :config_file

      # Create a new WorkQueue object for the specified step,
      # save a pointer to the parent WorkFlow,
      # start the timer,
      # read in the configuration information for the work step.
      #
      # @param workflow the parent workflow; must respond to workflow_config_dir
      # @param workflow_step [String] name of the step this queue serves
      #
      # When either argument is nil (test mode only) no config file is read
      # and the limits default to batch_limit 2 / error_limit 1.
      def initialize(workflow=nil, workflow_step=nil)
        LyberCore::Log.debug("Initializing work queue with workflow #{workflow} and workflow_step #{workflow_step}")
        @start_time = Time.new
        LyberCore::Log.info("Starting #{workflow_step} at #{@start_time}")
        @workflow = workflow
        @workflow_step = workflow_step
        @item_count = 0
        @success_count = 0
        @error_count = 0
        # nil arguments should only be used if in test mode
        if workflow.nil? || workflow_step.nil?
          @batch_limit = 2
          @error_limit = 1
          return
        end

        process_config_file
      end

      # Read prerequisite, batch_limit and error_limit for this workflow step
      # from process-config.yaml in the workflow's config directory.
      #
      # @raise [RuntimeError] if the file is missing, or if it holds no
      #   settings for this workflow step
      def process_config_file
        LyberCore::Log.debug("Processing config file ... ")
        LyberCore::Log.debug("@workflow.workflow_config_dir = #{@workflow.workflow_config_dir}")

        @config_file = File.join(@workflow.workflow_config_dir, 'process-config.yaml')
        LyberCore::Log.debug("I'm opening the config file at #{@config_file}")

        # Does the file exist?
        raise "Can't open process-config file #{@config_file}" unless File.file? @config_file

        process_config = YAML.load_file(config_file)
        LyberCore::Log.debug("process_config: #{process_config.inspect}")

        # An empty file or a missing step entry would otherwise surface as a
        # NoMethodError on nil; name the step and the file instead.
        step_config = process_config.is_a?(Hash) ? process_config[@workflow_step] : nil
        unless step_config.is_a?(Hash)
          raise "No configuration for workflow step #{@workflow_step.inspect} found in #{@config_file}"
        end

        @prerequisite = step_config["prerequisite"]
        LyberCore::Log.debug("@prerequisite: #{@prerequisite}")

        @batch_limit = step_config['batch_limit']
        LyberCore::Log.debug("@batch_limit: #{@batch_limit}")

        @error_limit = step_config['error_limit']
        LyberCore::Log.debug("@error_limit: #{@error_limit}")
      end

      # Explicitly specify a set of druids to be processed by the workflow step
      def enqueue_druids(druid_array)
        LyberCore::Log.debug("\nEnqueing an array of druids...")
        @druids = druid_array
        LyberCore::Log.debug("\n@druids = #{@druids}")
      end

      # Obtain the set of druids to be processed using a database query
      # to obtain the repository objects that are awaiting this step.
      # Errors from the services propagate to the caller unchanged.
      def enqueue_workstep_waiting()
        LyberCore::Log.debug("\nEnqueing workstep waiting...")
        object_list_xml = DorService.get_objects_for_workstep(workflow.repository, workflow.workflow_id, @prerequisite, @workflow_step)
        LyberCore::Log.debug("\nobject_list_xml = #{object_list_xml}")
        @druids = DlssService.get_some_druids_from_object_list(object_list_xml, batch_limit)
        LyberCore::Log.debug("\n@druids = #{@druids}")
      end

      # Use an alternative set of identifiers as the basis of this queue
      # e.g. use array of barcodes as basis for google register-object robot
      def enqueue_identifiers(identifier_name, identifier_values)
        @identifier_name = identifier_name
        @identifier_values = identifier_values
      end

      # Get the next WorkItem to be processed by the robot for the workflow step.
      #
      # @return [LyberCore::Robots::WorkItem, nil] nil once the batch limit or
      #   error limit is reached, the queue is exhausted, or nothing was enqueued
      def next_item()
        if @item_count >= @batch_limit
          LyberCore::Log.info "Batch limit of #{@batch_limit} items reached"
          return nil
        end
        if @error_count >= @error_limit
          LyberCore::Log.info "Error limit of #{@error_limit} items reached"
          return nil
        end

        # Druids take precedence over alternative identifiers when both are set.
        if @druids
          return nil if @item_count >= @druids.length
          work_item = LyberCore::Robots::WorkItem.new(self)
          work_item.druid = @druids[@item_count]
        elsif @identifier_values
          return nil if @item_count >= @identifier_values.length
          work_item = LyberCore::Robots::WorkItem.new(self)
          work_item.identifier_add(@identifier_name, @identifier_values[@item_count])
        else
          return nil
        end
        @item_count += 1
        work_item
      end

      # Output the batch's timings and other statistics to the main log file
      def print_stats
        stop_timer
        LyberCore::Log.info "Total time: #{@elapsed_time}\n"
        LyberCore::Log.info "Completed objects: #{success_count}\n"
        LyberCore::Log.info "Errors: #{error_count}\n"
      end

      # Output the batch's timings for a run that found nothing to process
      def print_empty_stats
        stop_timer
        LyberCore::Log.info "Total time: #{@elapsed_time}\n"
        LyberCore::Log.info "Empty queue"
      end

      private

      # Stamp the end time and compute the elapsed time of the batch run.
      def stop_timer
        @end_time = Time.new
        @elapsed_time = @end_time - @start_time
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
module LyberCore
  module Robots
    # Describes a named workflow and locates its configuration files on disk.
    class Workflow

      attr_reader :workflow_name
      attr_reader :workflow_config_dir
      attr_reader :workflow_config_file
      attr_reader :workflow_config

      attr_reader :collection_name
      # nil unless a :collection_name option was given
      attr_reader :collection_config_dir

      # @param [String] workflow_name name of the workflow
      # @param [Hash] options a hash of optional arguments
      # @return [LyberCore::Robots::Workflow] a workflow object
      # @raise [RuntimeError] if ROBOT_ROOT is not defined or the workflow
      #   config file cannot be found
      # @example Create a new workflow object with a collection_name
      #   @wf = LyberCore::Robots::Workflow.new(workflow_name, {:collection_name => collection})
      def initialize(workflow_name, options = {})
        # ROBOT_ROOT must be set before invoking a robot
        raise "ROBOT_ROOT isn't set. Please set it to point to where your config files live." unless defined? ROBOT_ROOT

        @workflow_name = workflow_name
        @collection_name = options[:collection_name]
        load_workflow_config
      end

      # Work out the config directories for this workflow and load
      # workflow-config.yaml from the workflow's directory.
      #
      # @raise [RuntimeError] if workflow-config.yaml does not exist
      def load_workflow_config
        if ROBOT_ROOT
          config_home = File.join(ROBOT_ROOT, "config", "workflows")
        else
          # Only reached when ROBOT_ROOT is defined but nil/false: the default
          # location can then be overridden with the WORKFLOW_CONFIG_HOME
          # environment variable.
          config_home = ENV['WORKFLOW_CONFIG_HOME'] || File.join(File.dirname(__FILE__), "..", "..", "config")
        end

        @workflow_config_dir = File.join(config_home, @workflow_name)
        LyberCore::Log.debug("@workflow_config_dir = #{@workflow_config_dir}")
        @collection_config_dir = File.join(@workflow_config_dir, @collection_name) if @collection_name
        @workflow_config_file = File.join(@workflow_config_dir, 'workflow-config.yaml')

        unless File.exist?(@workflow_config_file)
          raise "Workflow config not found!\n" \
                "ROBOT_ROOT = #{ROBOT_ROOT}\n" \
                "expecting to find workflow_config_file in #{@workflow_config_file}\n"
        end
        @workflow_config = YAML.load_file(workflow_config_file)
      end

      # The workflow name and collection name joined by an underscore
      def workflow_collection
        return @workflow_name + "_" + @collection_name
      end

      def workflow_id
        @workflow_name
      end

      # Which repository are we operating against?
      # Should return either "dor" or "sdr"
      def repository
        @workflow_config['repository']
      end

      # Construct the fully qualified filename and see if
      # a file exists there. If it doesn't exist or isn't
      # a file, raise an error.
      def workflow_process_xml_filename
        file = File.join(@workflow_config_dir, @workflow_name + '.xml')
        raise "#{file} is not a file" unless File.file? file
        file
      end

      # Return the contents of the file at workflow_process_xml_filename
      def workflow_process_xml
        IO.read(workflow_process_xml_filename)
      end

      # Find the first file whose name ends in ObjectTemplate.xml, looking in
      # the collection config directory first and the workflow config
      # directory second.
      #
      # @return [String] full path of the template
      # @raise [RuntimeError] if neither directory holds a template
      def object_template_filepath
        # @collection_config_dir is nil when no :collection_name was given;
        # compact skips it so the workflow-level template is still found.
        [@collection_config_dir, @workflow_config_dir].compact.each do |dir|
          Dir.foreach(dir) do |file|
            return File.join(dir, file) if file =~ /ObjectTemplate\.xml$/
          end
        end
        raise "Object Template not found"
      end

      # Receives a workflow step and returns a WorkQueue for that step,
      # bound to this workflow.
      def queue(workflow_step)
        WorkQueue.new(self, workflow_step)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'fileutils'
2
module LyberCore
  module Robots
    # Locates (and creates on demand) the working directories used while
    # processing the objects of a workflow, optionally scoped to a collection.
    class Workspace

      attr_reader :workflow_name
      attr_reader :collection_name
      attr_reader :workspace_base

      # @param [String] workflow_name name of the workflow
      # @param [String, nil] collection_name optional collection sub-directory
      # @raise [NameError] if neither the WORKSPACE_HOME environment variable
      #   nor the WORKSPACE_HOME constant is available
      def initialize(workflow_name, collection_name=nil)
        @workflow_name = workflow_name
        @collection_name = collection_name
        @workspace_base = set_workspace_base
        ensure_workspace_exists(@workspace_base)
      end

      # Usually WORKSPACE_HOME is set in your environment config file,
      # but you can override the default location of workspace files
      # by setting a WORKSPACE_HOME environment variable.
      #
      # @return [String] the environment variable when set, else the constant
      # @raise [NameError] if the constant is needed but undefined
      def set_workspace_home
        # `||` always yields the chosen value. The former `if not (...)` form
        # evaluated to nil whenever the environment variable was set.
        ENV['WORKSPACE_HOME'] || WORKSPACE_HOME
      rescue NameError => e
        LyberCore::Log.fatal("WORKSPACE_HOME is undefined. Do you need to set it in your config file?")
        raise e
      end

      # Compute, store and return the base directory for this workspace:
      # <home>/<workflow_name>[/<collection_name>]
      def set_workspace_base
        parts = [set_workspace_home, @workflow_name]
        parts << @collection_name if @collection_name
        @workspace_base = File.join(*parts)
      end

      # Create the directory (and any missing parents) unless it already
      # exists. Failures are logged as fatal and re-raised.
      def ensure_workspace_exists(workspace)
        FileUtils.mkdir_p(workspace) unless File.directory?(workspace)
      rescue
        LyberCore::Log.fatal("Can't create workspace_base #{workspace}")
        raise
      end

      # Remove the first part of the druid
      # @param [String] druid
      # @return [String]
      def normalized_druid(druid)
        druid.sub(/druid:/, '')
      end

      # Return <workspace_base>/<dir_type>/<normalized druid>, creating the
      # directory if it does not exist yet.
      def object_dir(dir_type, druid)
        dir_name = File.join(@workspace_base, dir_type, normalized_druid(druid))
        ensure_workspace_exists(dir_name)
        dir_name
      end

      # The place where the original tar file from google is stored
      def original_dir(druid)
        object_dir('original', druid)
      end

      # The 'content' directory for the object
      def content_dir(druid)
        object_dir('content', druid)
      end

      # The 'metadata' directory for the object
      def metadata_dir(druid)
        object_dir('metadata', druid)
      end

    end
  end
end
@@ -0,0 +1,4 @@
1
+
2
+ require 'lyber_core/utils/file_utilities'
3
+ require 'lyber_core/utils/checksum_validate'
4
+ require 'lyber_core/utils/bagit_bag'
@@ -0,0 +1,100 @@
1
+ require 'find'
2
+ require 'fileutils'
3
+ require 'bagit' # http://github.com/flazz/bagit
4
+ require 'dor_service'
5
+
6
module LyberCore
  module Utils
    # Builds a BagIt bag in a directory: copies or links content files in,
    # adds metadata files, writes bag-info and manifests, and validates.
    class BagitBag

      # Unit labels indexed by how many times the size was divided by 1000
      SIZE_UNITS = %w[B KB MB GB TB].freeze

      # Start a fresh bag at bag_dir. Anything already present at that path is
      # removed recursively before the bag is created.
      #
      # @param [String] bag_dir directory that will hold the bag
      def initialize(bag_dir)
        @bag_dir = bag_dir
        FileUtils.rm_r(@bag_dir) if File.exist?(@bag_dir)
        @bag = BagIt::Bag.new @bag_dir
      end

      # Copy (or hard-link) the tree under source_dir into <bag_dir>/data/content
      #
      # @param [String] source_dir directory to take files from
      # @param use_links when truthy, files are hard-linked instead of copied
      def add_content_files(source_dir, use_links)
        copy_dir(source_dir, File.join(@bag_dir, 'data', 'content'), use_links)
      end

      # Recursively reproduce source_dir under target_dir. Directories are
      # created; plain files are hard-linked when use_links is truthy and
      # copied otherwise.
      def copy_dir(source_dir, target_dir, use_links)
        FileUtils.mkdir_p(target_dir)
        Dir.foreach(source_dir) do |file|
          next if file == '.' || file == '..'

          source_file = File.join(source_dir, file)
          target_file = File.join(target_dir, file)
          if File.directory?(source_file)
            copy_dir(source_file, target_file, use_links)
          elsif use_links
            File.link(source_file, target_file)
          else
            # File.copy only existed via the old 'ftools' extension, which this
            # file never requires; FileUtils is required and always available.
            FileUtils.cp(source_file, target_file)
          end
        end
      end

      # Write metadata_string into the bag as metadata/<file_name>.
      # Does nothing when metadata_string is nil.
      def add_metadata_file_from_string(metadata_string, file_name)
        return if metadata_string.nil?

        @bag.add_file("metadata/#{file_name}") do |io|
          io.puts metadata_string
        end
      end

      # Write the bag info file: the caller's md_hash plus computed Bag-Size
      # and Payload-Oxum entries (the computed entries win on a key clash),
      # then make sure the file is named bag-info.txt in the bag directory.
      def write_metadata_info(md_hash)
        payload = bag_payload()
        bag_info_hash = {
          'Bag-Size' => bag_size_human(payload[0]),
          'Payload-Oxum' => "#{payload[0]}.#{payload[1]}",
        }
        @bag.write_bag_info(md_hash.merge(bag_info_hash))
        File.rename(@bag.bag_info_txt_file, File.join(@bag.bag_dir, 'bag-info.txt'))
      end

      # Total up the non-directory entries under the bag's data directory.
      #
      # @return [Array(Integer, Integer)] [total bytes, number of files]
      def bag_payload()
        bytes = 0
        files = 0
        Find.find(@bag.data_dir) do |filepath|
          next if File.directory?(filepath)

          bytes += File.size(filepath)
          files += 1
        end
        [bytes, files]
      end

      # Format a byte count using decimal (1000-based) units, capped at TB.
      #
      # @param [Integer] bytes
      # @return [String] e.g. "999 B", "1.50 KB"
      def bag_size_human(bytes)
        count = 0
        size = bytes
        while size >= 1000 && count < 4
          size /= 1000.0
          count += 1
        end
        return sprintf("%d B", size) if count == 0

        sprintf("%.2f %s", size, SIZE_UNITS[count])
      end

      # Write the payload manifest and the tag manifest
      def write_manifests()
        @bag.manifest!
        @bag.tagmanifest!
      end

      # @raise [RuntimeError] if the bag does not validate
      def validate()
        raise "bag not valid: #{@bag_dir}" unless @bag.valid?
      end

    end

  end
end