lyber-core 0.9.6.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,103 @@
1
+ require 'dor_service'
2
+ require "roxml_models/identity_metadata/identity_metadata"
3
+ require "roxml_models/identity_metadata/dublin_core"
4
+
5
+ # Represents a single object being processed as part of a workflow queue
6
module LyberCore
  module Robots
    # Represents a single object being processed as part of a workflow queue.
    #
    # A WorkItem is created by its parent work queue, carries either a druid
    # or a set of alternative identifiers (held in its identity metadata), and
    # reports its outcome back to the queue's counters and to the workflow
    # service.
    class WorkItem

      # The queue that this workitem is a member of
      attr_reader :work_queue
      # The primary id for the object being processed
      attr_accessor :druid
      # Timings for this workitem's processing
      attr_reader :start_time
      attr_reader :end_time
      attr_reader :elapsed_time
      # Inject an IdentityMetadata object (currently used for unit testing only).
      # The matching reader is defined explicitly below because it loads lazily.
      attr_writer :identity_metadata

      # Create a new WorkItem object, save a pointer to the parent WorkQueue,
      # and start the timer.
      #
      # @param work_queue the queue this item belongs to; its success_count /
      #   error_count, workflow and workflow_step are used when recording outcomes
      def initialize(work_queue)
        @work_queue = work_queue
        @start_time = Time.now
      end

      # Save the IdentityMetadata object to the identityMetadata datastream.
      # Updates the datastream when DorService reports one already exists for
      # the druid, and adds a new one otherwise.
      def identity_metadata_save
        datastream_exists = DorService.get_datastream(@druid, 'identityMetadata')
        xml = identity_metadata.to_xml.to_xml
        if datastream_exists
          # Positional arguments: content type, then the "versionable" flag.
          DorService.update_datastream(@druid, 'identityMetadata', xml, 'application/xml', false)
        else
          DorService.add_datastream(@druid, 'identityMetadata', 'identityMetadata', xml)
        end
      end

      # Return the IdentityMetadata object bound to identityMetadata datastream XML.
      # Loaded on first use: a blank IdentityMetadata when there is no druid,
      # otherwise one parsed from the datastream fetched through DorService.
      def identity_metadata
        return @identity_metadata unless @identity_metadata.nil?

        @identity_metadata =
          if @druid.nil?
            IdentityMetadata.new
          else
            IdentityMetadata.from_xml(DorService.get_datastream(@druid, 'identityMetadata'))
          end
      end

      # Return the identifier value for the specified identifier name
      def identifier(key)
        identity_metadata.get_identifier_value(key)
      end

      # Add a new name,value pair to the set of identifiers
      def identifier_add(key, value)
        identity_metadata.add_identifier(key, value)
      end

      # Return an array of strings where each entry consists of name:value
      def id_pairs
        identity_metadata.get_id_pairs
      end

      # Return the druid for the work item if it exists, else the first
      # identifier pair, or nil when there are no identifiers either.
      def item_id
        return @druid if @druid

        identity_metadata.get_id_pairs.first
      end

      # Record the successful outcome of the workstep operation for this workitem
      def set_success
        @work_queue.success_count += 1
        stop_timer
        LyberCore::Log.info("#{item_id} completed in #{@elapsed_time} seconds")
        return unless @druid

        Dor::WorkflowService.update_workflow_status(@work_queue.workflow.repository, @druid, @work_queue.workflow.workflow_id, @work_queue.workflow_step, 'completed', @elapsed_time)
      end

      # Record the unsuccessful outcome of the workstep operation for this workitem.
      # The error is logged and reported, not re-raised.
      #
      # @param e [Exception] the error that stopped processing of this item
      def set_error(e)
        @work_queue.error_count += 1
        stop_timer
        # Log the message as well as the backtrace (one frame per line).
        # Array() covers exceptions that were never raised and so have no backtrace.
        backtrace = Array(e.backtrace).join("\n")
        LyberCore::Log.error("#{item_id} error - #{e.message}\n#{backtrace}")
        return unless @druid

        DorService.update_workflow_error_status(@work_queue.workflow.repository, @druid, @work_queue.workflow.workflow_id, @work_queue.workflow_step, e.message)
      end

      private

      # Stamp the end time and compute the elapsed processing time in seconds.
      def stop_timer
        @end_time = Time.now
        @elapsed_time = @end_time - @start_time
      end

    end
  end
end
@@ -0,0 +1,154 @@
1
+ require 'dor_service'
2
+ require 'dlss_service'
3
+ require 'yaml'
4
+
5
module LyberCore
  module Robots
    # Represents a set of workitem objects to be processed by a given step of a workflow
    class WorkQueue

      # The workflow that this queue is a part of
      attr_reader :workflow
      # The step in the workflow that is being processed against this queue
      attr_reader :workflow_step
      # The workflow step that should have already been completed for the workitem object
      attr_reader :prerequisite
      # The maximum number of workitem objects to process in one run of a robot
      attr_reader :batch_limit
      # The maximum number of errors to allow before terminating the batch run
      attr_reader :error_limit
      # The array of primary identifiers for the workitem objects to be processed
      attr_reader :druids
      # An alternative identifier to be used when druids are not yet available (e.g at registration)
      attr_reader :identifier_name
      attr_reader :identifier_values
      # The tally of how many items have been processed
      attr_reader :item_count
      attr_accessor :success_count
      attr_accessor :error_count
      # The timings for the batch run
      attr_reader :start_time
      attr_reader :end_time
      attr_reader :elapsed_time

      # Full path of the process-config.yaml file read by process_config_file
      attr_reader :config_file

      # Create a new WorkQueue object for the specified step,
      # save a pointer to the parent WorkFlow,
      # start the timer,
      # read in the configuration information for the work step.
      #
      # @param workflow the parent workflow (nil only in test mode)
      # @param workflow_step the name of the step to process (nil only in test mode)
      def initialize(workflow=nil, workflow_step=nil)
        LyberCore::Log.debug("Initializing work queue with workflow #{workflow} and workflow_step #{workflow_step}")
        @start_time = Time.now
        LyberCore::Log.info("Starting #{workflow_step} at #{@start_time}")
        @workflow = workflow
        @workflow_step = workflow_step
        @item_count = 0
        @success_count = 0
        @error_count = 0
        if workflow.nil? || workflow_step.nil?
          # nil arguments should only be used if in test mode
          @batch_limit = 2
          @error_limit = 1
        else
          process_config_file
        end
      end

      # Read prerequisite, batch_limit and error_limit for this workflow step
      # from process-config.yaml in the workflow's config directory.
      #
      # @raise [RuntimeError] if the file is missing, or has no entry for this step
      def process_config_file
        LyberCore::Log.debug("Processing config file ... ")
        LyberCore::Log.debug("@workflow.workflow_config_dir = #{@workflow.workflow_config_dir}")

        @config_file = File.join(@workflow.workflow_config_dir, 'process-config.yaml')
        LyberCore::Log.debug("I'm opening the config file at #{@config_file}")

        # Does the file exist?
        raise "Can't open process-config file #{@config_file}" unless File.file?(@config_file)

        process_config = YAML.load_file(@config_file)
        LyberCore::Log.debug("process_config: #{process_config.inspect}")

        # An empty file or a missing step would otherwise surface as an
        # unhelpful NoMethodError on nil a few lines further down.
        step_config = process_config.is_a?(Hash) ? process_config[@workflow_step] : nil
        unless step_config.is_a?(Hash)
          raise "No configuration for workflow step '#{@workflow_step}' in process-config file #{@config_file}"
        end

        @prerequisite = step_config['prerequisite']
        LyberCore::Log.debug("@prerequisite: #{@prerequisite}")

        @batch_limit = step_config['batch_limit']
        LyberCore::Log.debug("@batch_limit: #{@batch_limit}")

        @error_limit = step_config['error_limit']
        LyberCore::Log.debug("@error_limit: #{@error_limit}")
      end

      # Explicitly specify a set of druids to be processed by the workflow step
      def enqueue_druids(druid_array)
        LyberCore::Log.debug("\nEnqueing an array of druids...")
        @druids = druid_array
        LyberCore::Log.debug("\n@druids = #{@druids}")
      end

      # Obtain the set of druids to be processed using a database query
      # to obtain the repository objects that are awaiting this step.
      # Any error from the services propagates to the caller unchanged.
      def enqueue_workstep_waiting
        LyberCore::Log.debug("\nEnqueing workstep waiting...")
        object_list_xml = DorService.get_objects_for_workstep(workflow.repository, workflow.workflow_id, @prerequisite, @workflow_step)
        LyberCore::Log.debug("\nobject_list_xml = #{object_list_xml}")
        @druids = DlssService.get_some_druids_from_object_list(object_list_xml, batch_limit)
        LyberCore::Log.debug("\n@druids = #{@druids}")
      end

      # Use an alternative set of identifiers as the basis of this queue
      # e.g. use array of barcodes as basis for google register-object robot
      def enqueue_identifiers(identifier_name, identifier_values)
        @identifier_name = identifier_name
        @identifier_values = identifier_values
      end

      # Get the next WorkItem to be processed by the robot for the workflow step.
      #
      # @return [LyberCore::Robots::WorkItem, nil] nil once the batch limit or
      #   error limit is reached, the queue is exhausted, or nothing was enqueued
      def next_item
        if @item_count >= @batch_limit
          LyberCore::Log.info "Batch limit of #{@batch_limit} items reached"
          return nil
        end
        if @error_count >= @error_limit
          LyberCore::Log.info "Error limit of #{@error_limit} items reached"
          return nil
        end

        work_item = LyberCore::Robots::WorkItem.new(self)
        if @druids
          return nil if @item_count >= @druids.length
          work_item.druid = @druids[@item_count]
        elsif @identifier_values
          return nil if @item_count >= @identifier_values.length
          work_item.identifier_add(@identifier_name, @identifier_values[@item_count])
        else
          return nil
        end
        @item_count += 1
        work_item
      end

      # Output the batch's timings and other statistics to the main log file
      def print_stats
        stop_timer
        LyberCore::Log.info "Total time: " + @elapsed_time.to_s + "\n"
        LyberCore::Log.info "Completed objects: " + success_count.to_s + "\n"
        LyberCore::Log.info "Errors: " + error_count.to_s + "\n"
      end

      # Output the batch's timings for a run that found nothing to process
      def print_empty_stats
        stop_timer
        LyberCore::Log.info "Total time: " + @elapsed_time.to_s + "\n"
        LyberCore::Log.info "Empty queue"
      end

      private

      # Stamp the end time and compute the elapsed time of the batch in seconds.
      def stop_timer
        @end_time = Time.now
        @elapsed_time = @end_time - @start_time
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
# YAML.load_file is called below; require it here so this file does not
# depend on another file having loaded it first.
require 'yaml'

module LyberCore
  module Robots
    # Locates and loads the configuration for a named workflow (and, optionally,
    # one of its collections) and builds work queues for its steps.
    class Workflow

      attr_reader :workflow_name
      attr_reader :workflow_config_dir
      attr_reader :workflow_config_file
      attr_reader :workflow_config

      attr_reader :collection_name
      attr_reader :collection_config_dir

      # @param [String] workflow_name name of the workflow
      # @param [Hash] options a hash of optional arguments
      # @option options [String] :collection_name name of a collection within the workflow
      # @return [LyberCore::Robots::Workflow] a workflow object
      # @raise [RuntimeError] if ROBOT_ROOT is undefined or the workflow config file is missing
      # @example Create a new workflow object with a collection_name
      #   @wf = LyberCore::Robots::Workflow.new(workflow_name, {:collection_name => collection})
      def initialize(workflow_name, options = {})
        # ROBOT_ROOT must be set before invoking a robot
        raise "ROBOT_ROOT isn't set. Please set it to point to where your config files live." unless defined? ROBOT_ROOT

        @workflow_name = workflow_name
        @collection_name = options[:collection_name]
        load_workflow_config
      end

      # Work out the config directories for this workflow and load
      # workflow-config.yaml from the workflow's config directory.
      #
      # The config home is ROBOT_ROOT/config/workflows when ROBOT_ROOT is set.
      # Otherwise the WORKFLOW_CONFIG_HOME environment variable is used, falling
      # back to a config directory relative to this file.
      #
      # @raise [RuntimeError] if workflow-config.yaml does not exist
      def load_workflow_config
        # Guard the constant so calling this method directly without ROBOT_ROOT
        # takes the fallback path instead of raising NameError.
        robot_root = defined?(ROBOT_ROOT) ? ROBOT_ROOT : nil
        config_home =
          if robot_root
            File.join(robot_root, "config", "workflows")
          else
            ENV['WORKFLOW_CONFIG_HOME'] || File.join(File.dirname(__FILE__), "..", "..", "config")
          end

        @workflow_config_dir = File.join(config_home, @workflow_name)
        LyberCore::Log.debug("@workflow_config_dir = #{@workflow_config_dir}")
        @collection_config_dir = File.join(@workflow_config_dir, @collection_name) if @collection_name
        @workflow_config_file = File.join(@workflow_config_dir, 'workflow-config.yaml')

        unless File.exist?(@workflow_config_file)
          raise "Workflow config not found!\n" \
                "ROBOT_ROOT = #{robot_root}\n" \
                "expecting to find workflow_config_file in #{@workflow_config_file}\n"
        end
        @workflow_config = YAML.load_file(@workflow_config_file)
      end

      # The workflow name and collection name joined with an underscore.
      # Raises TypeError when no collection name was given (String + nil).
      def workflow_collection
        @workflow_name + "_" + @collection_name
      end

      # The identifier of this workflow, which is simply its name.
      def workflow_id
        @workflow_name
      end

      # Which repository are we operating against?
      # Should return either "dor" or "sdr"
      def repository
        @workflow_config['repository']
      end

      # Construct the fully qualified filename and see if
      # a file exists there. If it doesn't exist or isn't
      # a file, raise an error.
      def workflow_process_xml_filename
        file = File.join(@workflow_config_dir, @workflow_name + '.xml')
        raise "#{file} is not a file" unless File.file?(file)

        file
      end

      # Return the contents of the file at workflow_process_xml_filename
      def workflow_process_xml
        IO.read(workflow_process_xml_filename)
      end

      # Find the object template, a file whose name ends in "ObjectTemplate.xml".
      # The collection config directory is searched first, then the workflow
      # config directory. A directory that is unset (no collection name) or that
      # does not exist is skipped so that the fallback is actually reached.
      #
      # @return [String] full path of the first matching file
      # @raise [RuntimeError] if neither directory contains a template
      def object_template_filepath
        [@collection_config_dir, @workflow_config_dir].each do |dir|
          next unless dir && File.directory?(dir)

          Dir.foreach(dir) do |file|
            return File.join(dir, file) if file.end_with?('ObjectTemplate.xml')
          end
        end
        raise "Object Template not found"
      end

      # Receives a workflow step and returns a WorkQueue for that step,
      # with this workflow as its parent.
      def queue(workflow_step)
        WorkQueue.new(self, workflow_step)
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
+ require 'fileutils'
2
module LyberCore
  module Robots
    # Manages the on-disk working directories for a workflow (and optionally a
    # collection within it), creating them on demand.
    class Workspace

      attr_reader :workflow_name
      attr_reader :collection_name
      attr_reader :workspace_base

      # Compute the workspace base directory and make sure it exists.
      #
      # @param [String] workflow_name
      # @param [String, nil] collection_name optional sub-directory under the workflow
      def initialize(workflow_name, collection_name=nil)
        @workflow_name = workflow_name
        @collection_name = collection_name
        @workspace_base = set_workspace_base
        ensure_workspace_exists(@workspace_base)
      end

      # Usually WORKSPACE_HOME is set in your environment config file,
      # but you can override the default location of workspace files
      # by setting a WORKSPACE_HOME environment variable.
      #
      # @return [String] the environment variable when set, else the WORKSPACE_HOME constant
      # @raise [NameError] if neither the environment variable nor the constant is defined
      def set_workspace_home
        # Return the chosen value explicitly: an `if not (...)` whose condition
        # is false evaluates to nil, which made the environment override
        # return nil instead of the overriding path.
        ENV['WORKSPACE_HOME'] || WORKSPACE_HOME
      rescue NameError => e
        LyberCore::Log.fatal("WORKSPACE_HOME is undefined. Do you need to set it in your config file?")
        raise e
      end

      # Build (and remember) the base directory:
      # <workspace home>/<workflow name>[/<collection name>]
      #
      # @return [String]
      def set_workspace_base
        segments = [set_workspace_home, @workflow_name]
        segments << @collection_name if @collection_name
        @workspace_base = File.join(*segments)
      end

      # Create the directory (and any missing parents) unless it already exists.
      # Failures are logged as fatal and re-raised.
      def ensure_workspace_exists(workspace)
        FileUtils.mkdir_p(workspace) unless File.directory?(workspace)
      rescue
        LyberCore::Log.fatal("Can't create workspace_base #{workspace}")
        raise
      end

      # Remove the leading "druid:" prefix of the druid, if present
      # @param [String] druid
      # @return [String]
      def normalized_druid(druid)
        # Anchored so only a real prefix is stripped, never a later occurrence.
        druid.sub(/\Adruid:/, '')
      end

      # Directory for one object under the given type of sub-directory,
      # created if it does not exist yet.
      #
      # @param [String] dir_type e.g. 'original', 'content' or 'metadata'
      # @param [String] druid
      # @return [String] the directory path
      def object_dir(dir_type, druid)
        dir_name = File.join(@workspace_base, dir_type, normalized_druid(druid))
        ensure_workspace_exists(dir_name)
        dir_name
      end

      # The place where the original tar file from google is stored
      def original_dir(druid)
        object_dir('original', druid)
      end

      # The object's directory under 'content'
      def content_dir(druid)
        object_dir('content', druid)
      end

      # The object's directory under 'metadata'
      def metadata_dir(druid)
        object_dir('metadata', druid)
      end

    end
  end
end
@@ -0,0 +1,4 @@
1
+
2
+ require 'lyber_core/utils/file_utilities'
3
+ require 'lyber_core/utils/checksum_validate'
4
+ require 'lyber_core/utils/bagit_bag'
@@ -0,0 +1,100 @@
1
+ require 'find'
2
+ require 'fileutils'
3
+ require 'bagit' # http://github.com/flazz/bagit
4
+ require 'dor_service'
5
+
6
module LyberCore
  module Utils
    # Thin wrapper around a BagIt::Bag that builds a bag directory from
    # content files and metadata strings.
    class BagitBag

      # Start a fresh bag at bag_dir. Anything already present at that path
      # is removed first.
      #
      # @param [String] bag_dir path of the bag directory
      def initialize(bag_dir)
        @bag_dir = bag_dir
        FileUtils.rm_r(@bag_dir) if File.exist?(@bag_dir)
        @bag = BagIt::Bag.new @bag_dir
      end

      # Copy (or hard-link) the tree under source_dir into <bag_dir>/data/content.
      #
      # @param [String] source_dir directory holding the content files
      # @param [Boolean] use_links hard-link files instead of copying them
      def add_content_files(source_dir, use_links)
        data_content_dir = File.join(@bag_dir, 'data', 'content')
        copy_dir(source_dir, data_content_dir, use_links)
      end

      # Recursively replicate source_dir into target_dir, creating target
      # directories as needed.
      #
      # @param [String] source_dir
      # @param [String] target_dir
      # @param [Boolean] use_links hard-link files (File.link) instead of copying them
      def copy_dir(source_dir, target_dir, use_links)
        FileUtils.mkdir_p(target_dir)
        Dir.foreach(source_dir) do |file|
          next if file == '.' || file == '..'

          source_file = File.join(source_dir, file)
          target_file = File.join(target_dir, file)
          if File.directory?(source_file)
            copy_dir(source_file, target_file, use_links)
          elsif use_links
            File.link(source_file, target_file)
          else
            # File.copy came from the Ruby 1.8 'ftools' extension, which this
            # file never required; FileUtils.cp is the replacement.
            FileUtils.cp(source_file, target_file)
          end
        end
      end

      # Add a file named metadata/<file_name> to the bag whose content is
      # metadata_string. Does nothing when metadata_string is nil.
      def add_metadata_file_from_string(metadata_string, file_name)
        return if metadata_string.nil?

        @bag.add_file("metadata/#{file_name}") do |io|
          io.puts metadata_string
        end
      end

      # Write the bag info file: the supplied fields plus Bag-Size and
      # Payload-Oxum computed from the payload (the computed values win on
      # key clashes).
      #
      # @param [Hash] md_hash bag-info fields supplied by the caller
      def write_metadata_info(md_hash)
        byte_count, file_count = bag_payload
        bag_info_hash = {
          'Bag-Size' => bag_size_human(byte_count),
          'Payload-Oxum' => "#{byte_count}.#{file_count}",
        }
        @bag.write_bag_info(md_hash.merge(bag_info_hash))
        # Move whatever file the bag wrote to the canonical bag-info.txt name.
        File.rename(@bag.bag_info_txt_file, File.join(@bag.bag_dir, 'bag-info.txt'))
      end

      # Total size and number of the non-directory entries under the bag's data directory.
      #
      # @return [Array(Integer, Integer)] [bytes, files]
      def bag_payload
        bytes = 0
        files = 0
        Find.find(@bag.data_dir) do |filepath|
          next if File.directory?(filepath)

          bytes += File.size(filepath)
          files += 1
        end
        [bytes, files]
      end

      # Format a byte count using decimal (power of 1000) units, capped at TB.
      #
      # @param [Integer] bytes
      # @return [String] e.g. "999 B", "1.50 KB"
      def bag_size_human(bytes)
        units = %w[B KB MB GB TB]
        count = 0
        size = bytes
        while size >= 1000 && count < units.length - 1
          size /= 1000.0
          count += 1
        end
        return sprintf("%d B", size) if count == 0

        sprintf("%.2f %s", size, units[count])
      end

      # Generate the bag's payload manifest and tag manifest.
      def write_manifests
        @bag.manifest!
        @bag.tagmanifest!
      end

      # @raise [RuntimeError] if the underlying bag reports it is not valid
      def validate
        raise "bag not valid: #{@bag_dir}" unless @bag.valid?
      end

    end

  end
end