lyber-core 1.3.0 → 3.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,104 +0,0 @@
1
module LyberCore
  module Robots
    # Describes a single robot workflow: the directory holding its
    # configuration, the parsed contents of its workflow-config.yaml, and
    # (optionally) the collection it is being run for.
    class Workflow
      attr_reader :workflow_name
      attr_reader :workflow_config_dir
      attr_reader :workflow_config_file
      attr_reader :workflow_config

      attr_reader :collection_name
      attr_reader :collection_config_dir

      # @param [String] workflow_name name of the workflow
      # @param [Hash] options a hash of optional arguments
      # @option options [String] :collection_name name of the collection (optional)
      # @return [LyberCore::Robots::Workflow] a workflow object
      # @raise [RuntimeError] if ROBOT_ROOT is not defined, or if the
      #   workflow config file cannot be found
      # @example Create a new workflow object with a collection_name
      #   @wf = LyberCore::Robots::Workflow.new(workflow_name, {:collection_name => collection})
      def initialize(workflow_name, options = {})
        # ROBOT_ROOT must be set before invoking a robot
        raise "ROBOT_ROOT isn't set. Please set it to point to where your config files live." unless defined? ROBOT_ROOT

        @workflow_name = workflow_name
        @collection_name = options[:collection_name]
        load_workflow_config
      end

      # Works out the config directories for this workflow and loads
      # workflow-config.yaml from the workflow config directory.
      #
      # When ROBOT_ROOT is truthy the configs are expected under
      # ROBOT_ROOT/config/workflows. Otherwise the WORKFLOW_CONFIG_HOME
      # environment variable is used, falling back to a config directory
      # two levels above this file.
      #
      # @return [Object] the parsed YAML document
      # @raise [RuntimeError] if the workflow config file does not exist
      def load_workflow_config
        config_home =
          if ROBOT_ROOT
            File.join(ROBOT_ROOT, 'config', 'workflows')
          else
            ENV['WORKFLOW_CONFIG_HOME'] || File.join(File.dirname(__FILE__), '..', '..', 'config')
          end

        @workflow_config_dir = File.join(config_home, @workflow_name)
        LyberCore::Log.debug("@workflow_config_dir = #{@workflow_config_dir}")
        @collection_config_dir = File.join(@workflow_config_dir, @collection_name) if @collection_name
        @workflow_config_file = File.join(@workflow_config_dir, 'workflow-config.yaml')

        unless File.exist?(@workflow_config_file)
          raise "Workflow config not found!\n" \
                "ROBOT_ROOT = #{ROBOT_ROOT}\n" \
                "expecting to find workflow_config_file in #{@workflow_config_file}\n"
        end

        @workflow_config = YAML.load_file(@workflow_config_file)
      end

      # @return [String] the workflow name and collection name joined by "_"
      def workflow_collection
        @workflow_name + '_' + @collection_name
      end

      # @return [String] the workflow name
      def workflow_id
        @workflow_name
      end

      # Which repository are we operating against?
      # Should return either "dor" or "sdr"
      # @return [Object] the 'repository' entry of the workflow config
      def repository
        @workflow_config['repository']
      end

      # Construct the fully qualified filename and see if
      # a file exists there. If it doesn't exist or isn't
      # a file, raise an error.
      # @return [String] path of <workflow_config_dir>/<workflow_name>.xml
      # @raise [RuntimeError] if that path is not a regular file
      def workflow_process_xml_filename
        file = File.join(@workflow_config_dir, "#{@workflow_name}.xml")
        raise "#{file} is not a file" unless File.file?(file)

        file
      end

      # Return the contents of the file at workflow_process_xml_filename
      # @return [String]
      def workflow_process_xml
        IO.read(workflow_process_xml_filename)
      end

      # Locates a file whose name ends in "ObjectTemplate.xml". The
      # collection config directory is searched first (only when a
      # collection was supplied), then the workflow config directory.
      #
      # @return [String] full path of the first matching template
      # @raise [RuntimeError] if neither directory contains a template
      def object_template_filepath
        # compact drops the collection directory when no collection_name was
        # given, so we never hand nil to Dir.entries.
        [@collection_config_dir, @workflow_config_dir].compact.each do |dir|
          template = Dir.entries(dir).sort.find { |name| name =~ /ObjectTemplate\.xml\z/ }
          return File.join(dir, template) if template
        end
        raise "Object Template not found"
      end

      # Receives a workflow step and returns a WorkQueue for it.
      # @param workflow_step the step to build a queue for
      # @return [WorkQueue] a new queue bound to this workflow and step
      def queue(workflow_step)
        WorkQueue.new(self, workflow_step)
      end
    end
  end
end
@@ -1,77 +0,0 @@
1
- require 'fileutils'
2
module LyberCore
  module Robots
    # A directory tree on disk in which a workflow (optionally scoped to a
    # collection) keeps per-object working files. The base directory is
    # created when the Workspace is constructed.
    class Workspace
      attr_reader :workflow_name
      attr_reader :collection_name
      attr_reader :workspace_base

      # @param [String] workflow_name name of the workflow
      # @param [String, nil] collection_name optional collection name
      def initialize(workflow_name, collection_name = nil)
        @workflow_name = workflow_name
        @collection_name = collection_name
        @workspace_base = set_workspace_base
        ensure_workspace_exists(@workspace_base)
      end

      # Usually WORKSPACE_HOME is set in your environment config file,
      # but you can override the default location of workspace files
      # by setting a WORKSPACE_HOME environment variable
      #
      # @return [String] the WORKSPACE_HOME environment variable when set,
      #   otherwise Dor::Config.robots.workspace
      # @raise [NameError] if the fallback configuration cannot be resolved
      def set_workspace_home
        # `||` yields the environment value itself when it is present; the
        # previous `if not (x = ENV[...])` form evaluated to nil in that case.
        ENV['WORKSPACE_HOME'] || Dor::Config.robots.workspace
      rescue NameError
        LyberCore::Log.fatal("WORKSPACE_HOME is undefined. Do you need to set it in your config file?")
        raise
      end

      # Computes (and stores in @workspace_base) the directory for this
      # workflow: <home>/<workflow_name>[/<collection_name>].
      # @return [String] the workspace base path
      def set_workspace_base
        segments = [set_workspace_home, @workflow_name]
        segments << @collection_name if @collection_name
        @workspace_base = File.join(*segments)
      end

      # Creates the directory (and any missing parents) unless it is
      # already a directory. Failures are logged as fatal and re-raised.
      # @param [String] workspace directory path
      def ensure_workspace_exists(workspace)
        FileUtils.mkdir_p(workspace) unless File.directory?(workspace)
      rescue
        LyberCore::Log.fatal("Can't create workspace_base #{workspace}")
        raise
      end

      # Remove the first part of the druid
      # @param [String] druid
      # @return [String] the druid with its first "druid:" occurrence removed
      def normalized_druid(druid)
        druid.sub(/druid:/, '')
      end

      # Returns <workspace_base>/<dir_type>/<normalized druid>, creating the
      # directory if needed.
      # @param [String] dir_type subdirectory name, e.g. 'content'
      # @param [String] druid
      # @return [String] the directory path
      def object_dir(dir_type, druid)
        dir_name = File.join(@workspace_base, dir_type, normalized_druid(druid))
        ensure_workspace_exists(dir_name)
        dir_name
      end

      # The place where the original tar file from google is stored
      # @param [String] druid
      # @return [String] the 'original' directory for the druid
      def original_dir(druid)
        object_dir('original', druid)
      end

      # @param [String] druid
      # @return [String] the 'content' directory for the druid
      def content_dir(druid)
        object_dir('content', druid)
      end

      # @param [String] druid
      # @return [String] the 'metadata' directory for the druid
      def metadata_dir(druid)
        object_dir('metadata', druid)
      end
    end
  end
end
@@ -1,4 +0,0 @@
1
-
2
- require 'lyber_core/utils/file_utilities'
3
- require 'lyber_core/utils/checksum_validate'
4
- require 'lyber_core/utils/bagit_bag'
@@ -1,100 +0,0 @@
1
- require 'find'
2
- require 'fileutils'
3
- require 'bagit' # http://github.com/flazz/bagit
4
- require 'dor_service'
5
-
6
module LyberCore
  module Utils
    # Thin wrapper around BagIt::Bag for assembling a bag on disk: copying
    # (or hard-linking) content files, adding metadata files, writing
    # bag-info.txt and the manifests, and validating the result.
    class BagitBag
      # Starts a fresh bag at bag_dir. Anything already present at that
      # path is removed first.
      # @param [String] bag_dir directory in which the bag is built
      def initialize(bag_dir)
        @bag_dir = bag_dir
        FileUtils.rm_r(@bag_dir) if File.exist?(@bag_dir)
        @bag = BagIt::Bag.new @bag_dir
      end

      # Copies (or links) everything under source_dir into data/content
      # inside the bag directory.
      # @param [String] source_dir directory to read from
      # @param [Boolean] use_links hard-link files instead of copying them
      def add_content_files(source_dir, use_links)
        data_content_dir = File.join(@bag_dir, 'data', 'content')
        copy_dir(source_dir, data_content_dir, use_links)
      end

      # Recursively reproduces source_dir under target_dir. Directories are
      # created; each file is hard-linked when use_links is truthy and
      # copied otherwise.
      # @param [String] source_dir directory to read from
      # @param [String] target_dir directory to write to (created if missing)
      # @param [Boolean] use_links hard-link files instead of copying them
      def copy_dir(source_dir, target_dir, use_links)
        FileUtils.mkdir_p(target_dir)
        Dir.foreach(source_dir) do |file|
          next if file == '.' || file == '..'

          source_file = File.join(source_dir, file)
          target_file = File.join(target_dir, file)
          if File.directory?(source_file)
            copy_dir(source_file, target_file, use_links)
          elsif use_links
            File.link(source_file, target_file)
          else
            # File.copy is not a core Ruby method; FileUtils.cp is the
            # standard-library way to copy a file.
            FileUtils.cp(source_file, target_file)
          end
        end
      end

      # Adds metadata/<file_name> to the bag with the given text as its
      # content. Does nothing when metadata_string is nil.
      # @param [String, nil] metadata_string file content
      # @param [String] file_name name of the file under metadata/
      def add_metadata_file_from_string(metadata_string, file_name)
        return if metadata_string.nil?

        @bag.add_file("metadata/#{file_name}") do |io|
          io.puts metadata_string
        end
      end

      # Writes bag-info.txt, merging the supplied entries with computed
      # Bag-Size and Payload-Oxum values (the computed values win on key
      # clashes), then renames the file written by the bag to
      # <bag_dir>/bag-info.txt.
      # @param [Hash] md_hash bag-info entries supplied by the caller
      def write_metadata_info(md_hash)
        bytes, files = bag_payload
        bag_info_hash = {
          'Bag-Size' => bag_size_human(bytes),
          'Payload-Oxum' => "#{bytes}.#{files}"
        }
        @bag.write_bag_info(md_hash.merge(bag_info_hash))
        File.rename(@bag.bag_info_txt_file, File.join(@bag.bag_dir, 'bag-info.txt'))
      end

      # Totals the non-directory entries found under the bag's data directory.
      # @return [Array(Integer, Integer)] [total bytes, number of files]
      def bag_payload
        bytes = 0
        files = 0
        Find.find(@bag.data_dir) do |filepath|
          next if File.directory?(filepath)

          bytes += File.size(filepath)
          files += 1
        end
        [bytes, files]
      end

      # Formats a byte count using decimal (power-of-1000) units, capped at TB.
      # @param [Numeric] bytes
      # @return [String] e.g. "999 B", "1.50 KB", "2.50 MB"
      def bag_size_human(bytes)
        count = 0
        size = bytes
        while size >= 1000 && count < 4
          size /= 1000.0
          count += 1
        end
        return sprintf("%d B", size) if count.zero?

        sprintf("%.2f %s", size, %w[B KB MB GB TB][count])
      end

      # Writes the payload manifest and the tag manifest.
      def write_manifests
        @bag.manifest!
        @bag.tagmanifest!
      end

      # @raise [RuntimeError] if the bag does not report itself as valid
      def validate
        raise "bag not valid: #{@bag_dir}" unless @bag.valid?
      end
    end
  end
end
@@ -1,65 +0,0 @@
1
- require 'nokogiri'
2
- require 'active_support'
3
-
4
module LyberCore
  module Utils
    # Helpers that build { filename => lowercase md5 digest } hashes from
    # md5sum output, METS XML and contentMetadata XML, and compare them.
    class ChecksumValidate
      METS_NS = { 'mets' => 'http://www.loc.gov/METS/' }.freeze
      XLINK_NS = 'http://www.w3.org/1999/xlink'.freeze

      # @param [Hash] hash1
      # @param [Hash] hash2
      # @return [Boolean] true when the two hashes are equal
      def self.compare_hashes(hash1, hash2)
        hash1 == hash2
      end

      # Returns the entries on which the two hashes disagree: every pair of
      # hash1 whose value differs from hash2's value for that key, plus every
      # pair of hash2 whose key is absent from hash1.
      #
      # Implemented locally rather than through ActiveSupport's Hash#diff,
      # which is no longer available in current ActiveSupport releases.
      #
      # @param [Hash] hash1
      # @param [Hash] hash2
      # @return [Hash] the differing entries; empty when the hashes agree
      def self.get_hash_differences(hash1, hash2)
        changed = hash1.reject { |key, value| hash2[key] == value }
        added = hash2.reject { |key, _value| hash1.key?(key) }
        changed.merge(added)
      end

      # Parses md5sum-style output ("<digest>  <name>" or "<digest> *<name>").
      #
      # @param [String, IO, Array<String>] md5sum the text, an object that
      #   responds to each_line, or a collection of lines
      # @return [Hash{String => String}] filename => lowercase digest
      def self.md5_hash_from_md5sum(md5sum)
        lines = md5sum.respond_to?(:each_line) ? md5sum.each_line : md5sum.each
        lines.each_with_object({}) do |raw, checksum_hash|
          # Limit of 2 keeps filenames that themselves contain two spaces
          # intact; chomp (not chomp!) leaves the caller's strings untouched.
          digest, filename = raw.chomp.split(/[ *]{2}/, 2)
          next if digest.nil? || filename.nil? # blank or malformed line

          checksum_hash[filename] = digest.downcase
        end
      end

      # Collects checksums from the mets:file elements in a METS fileSec.
      # A file contributes only if it has a CHECKSUM attribute and a
      # mets:FLocat child carrying an xlink:href attribute.
      #
      # @param mets the METS XML, in any form accepted by Nokogiri::XML
      # @return [Hash{String => String}] xlink:href => lowercase CHECKSUM
      def self.md5_hash_from_mets(mets)
        doc = Nokogiri::XML(mets)
        file_nodes = doc.xpath('/mets:mets/mets:fileSec//mets:file', METS_NS)
        file_nodes.each_with_object({}) do |filenode, mets_checksum_hash|
          digest = filenode.attribute('CHECKSUM')
          next unless digest

          flocat = filenode.xpath('mets:FLocat', METS_NS).first
          next unless flocat

          filename = flocat.attribute_with_ns('href', XLINK_NS)
          mets_checksum_hash[filename.text] = digest.text.downcase if filename
        end
      end

      # Collects MD5 checksums from the file elements of page-type resources
      # in a contentMetadata document. A file contributes only if it has an
      # id attribute and a checksum child of type "MD5".
      #
      # @param content_md the contentMetadata XML, in any form accepted by Nokogiri::XML
      # @return [Hash{String => String}] file id => lowercase digest
      def self.md5_hash_from_content_metadata(content_md)
        doc = Nokogiri::XML(content_md)
        file_nodes = doc.xpath('/contentMetadata/resource[@type="page"]/file')
        file_nodes.each_with_object({}) do |filenode, content_md_checksum_hash|
          filename = filenode.attribute('id')
          next unless filename

          md5_element = filenode.xpath('checksum[@type="MD5"]').first
          next unless md5_element

          digest = md5_element.text
          content_md_checksum_hash[filename.text] = digest.downcase if digest
        end
      end
    end
  end
end
@@ -1,168 +0,0 @@
1
- require 'fileutils'
2
- require 'systemu'
3
-
4
- # File Utilities for use in transferring filesystem objects,
5
- # decrypting a file, unpacking a targz archive, and validating checksums
6
- # @author rnanders@stanford.edu
7
module LyberCore
  module Utils
    # Class-level helpers that shell out to external commands (rsync, gpg,
    # tar, md5sum) and derive pair-tree directory names from barcodes.
    class FileUtilities
      # Executes a system command in a subprocess.
      # The method will return stdout from the command if execution was successful.
      # The method will raise an exception if execution fails.
      # The exception's message will contain the explanation of the failure.
      # @param [String] command the command to be executed
      # @return [String] stdout from the command if execution was successful
      # @raise [RuntimeError] if the command exits non-zero or cannot be run
      def self.execute(command)
        status, stdout, stderr = systemu(command)
        raise stderr.to_s unless status.exitstatus.zero?

        stdout
      rescue StandardError => e
        # stderr is still nil when systemu itself raised; fall back to the
        # exception's own message so the real cause is not masked.
        detail = stderr.to_s.empty? ? e.message.to_s : stderr
        msg = "Command failed to execute: [#{command}] caused by <STDERR = #{detail.split($/).join('; ')}>"
        msg << " STDOUT = #{stdout.split($/).join('; ')}" if stdout && !stdout.empty?
        raise msg
      end

      # Generates a dirname for storing or retrieving a file in
      # "pair tree" hierarchical structure, where the path is derived
      # from segments of a barcode string
      #
      # = Input:
      # * barcode = barcode string
      #
      # = Return value:
      # * A string containing a slash-delimited dirname derived from the barcode:
      #   the library prefix, then the following six characters split into
      #   two-character segments, each followed by a slash
      def self.pair_tree_from_barcode(barcode)
        raise "Barcode must be a String" unless barcode.is_a?(String)

        # figure out if this is a SUL barcode or from coordinate library
        if barcode[0..4] == '36105'
          library_prefix = '36105'
          digits = barcode[5..10]
        else
          library_prefix = barcode[0..2]
          digits = barcode[3..8]
        end
        "#{library_prefix}/#{digits.gsub(/(..)/, '\1/')}"
      end

      # Transfers a filesystem object (file or directory)
      # from a source to a target location. Uses rsync in "archive" mode
      # over an ssh connection.
      #
      # = Inputs:
      # * filename = basename of the filesystem object to be transferred
      # * source_dir = dirname of the source location from which the object is read
      # * dest_dir = dirname of the target location to which the object is written
      # If one of the locations is on a remote server, then the dirname should be
      # prefixed with user@hostname:
      #
      # = Return value:
      # * The method will return true if the transfer is successful.
      # * The method will raise an exception if either the rsync command fails,
      #   or a test for the existence of the transferred object fails.
      #   The exception's message will contain the explanation of the failure
      #
      # Network transfers will only succeed if the appropriate public key
      # authentication has been previously set up.
      def self.transfer_object(filename, source_dir, dest_dir)
        source_path = File.join(source_dir, filename)
        rsync_cmd = "rsync -a -e ssh '" + source_path + "' " + dest_dir
        LyberCore::Log.debug("rsync command is: #{rsync_cmd}")
        execute(rsync_cmd)
        # File.exist? replaces File.exists?, which newer Rubies no longer provide.
        raise "#{filename} is not found in #{dest_dir}" unless File.exist?(File.join(dest_dir, filename))

        true
      end

      # Decrypts a GPG encrypted file using the "gpg" command
      #
      # = Inputs:
      # * workspace_dir = dirname containing the file
      # * targzgpg = the filename of the GPG encrypted file
      # * targz = the filename of the unencrypted file
      # * passphrase = the string used to decrypt the file
      #
      # = Return value:
      # * The method will return true if the decryption is successful.
      # * The method will raise an exception if either the decryption command fails,
      #   or a test for the existence of the decrypted file fails.
      #   The exception's message will contain the explanation of the failure
      def self.gpgdecrypt(workspace_dir, targzgpg, targz, passphrase)
        LyberCore::Log.debug("decrypting #{targzgpg}")
        # NOTE(review): the passphrase and paths are interpolated into the
        # command line unescaped; values containing quotes or spaces will
        # break the command.
        gpg_cmd = "/usr/bin/gpg --passphrase '#{passphrase}' " +
                  "--batch --no-mdc-warning --no-secmem-warning " +
                  " --output " + File.join(workspace_dir, targz) +
                  " --decrypt " + File.join(workspace_dir, targzgpg)
        execute(gpg_cmd)
        raise "#{targz} was not created in #{workspace_dir}" unless File.exist?(File.join(workspace_dir, targz))

        true
      end

      # Unpacks a TAR-ed, GZipped archive using a "tar -xzf" command
      #
      # = Inputs:
      # * original_dir = dirname containing the archive file
      # * targz = the filename of the archive file
      # * destination_dir = the target directory into which the contents are written
      #
      # = Return value:
      # * The method will return true if the unpacking is successful.
      # * The method will raise an exception if either the unpack command fails,
      #   or a test for the existence of files in the target directory fails.
      #   The exception's message will contain the explanation of the failure.
      def self.unpack(original_dir, targz, destination_dir)
        LyberCore::Log.debug("unpacking #{targz}")
        FileUtils.mkdir_p(destination_dir)
        unpack_cmd = "tar -xzf " + File.join(original_dir, targz)
        # Block form restores the previous working directory on every exit path.
        Dir.chdir(destination_dir) { execute(unpack_cmd) }
        # Dir.entries always lists "." and "..", so they must be discounted
        # for the emptiness test to mean anything.
        raise "#{destination_dir} is empty" if (Dir.entries(destination_dir) - %w[. ..]).empty?

        true
      end

      # Verifies MD5 checksums for the files in a directory
      # against the checksum values in the supplied file
      # (Uses md5sum command)
      #
      # = Inputs:
      # * directory = dirname containing the file to be checked
      # * checksum_file = the name of the file containing the expected checksums
      #
      # = Return value:
      # * The method will return true if the verification is successful.
      # * The method will raise an exception if either the md5sum command fails,
      #   or a test of the md5sum output indicates a checksum mismatch.
      #   The exception's message will contain the explanation of the failure.
      def self.verify_checksums(directory, checksum_file)
        LyberCore::Log.debug("verifying checksums in #{directory}")
        # Only lines ending in ": OK" are discarded, so a failing file whose
        # name happens to contain "OK" is still counted.
        checksum_cmd = 'md5sum -c ' + checksum_file + " | grep -v ': OK$' | wc -l"
        badcount = Dir.chdir(directory) { execute(checksum_cmd).to_i }
        raise "#{badcount} files had bad checksums" unless badcount.zero?

        true
      end
    end
  end
end