preservation 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
data/lib/preservation/version.rb
CHANGED
data/preservation.gemspec
CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Preservation::VERSION
|
9
9
|
spec.authors = ["Adrian Albin-Clark"]
|
10
10
|
spec.email = ["a.albin-clark@lancaster.ac.uk"]
|
11
|
-
spec.summary = %q{
|
12
|
-
spec.description = %q{
|
13
|
-
spec.homepage = "https://
|
11
|
+
spec.summary = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools.}
|
12
|
+
spec.description = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools. Includes transfer preparation, reporting and disk space management.}
|
13
|
+
spec.homepage = "https://aalbinclark.gitbooks.io/preservation"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preservation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adrian Albin-Clark
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-09-
|
11
|
+
date: 2016-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: free_disk_space
|
@@ -52,8 +52,8 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
|
-
description:
|
56
|
-
|
55
|
+
description: Extraction and Transformation for Loading by Archivematica's Automation
|
56
|
+
Tools. Includes transfer preparation, reporting and disk space management.
|
57
57
|
email:
|
58
58
|
- a.albin-clark@lancaster.ac.uk
|
59
59
|
executables: []
|
@@ -68,14 +68,18 @@ files:
|
|
68
68
|
- README.md
|
69
69
|
- Rakefile
|
70
70
|
- lib/preservation.rb
|
71
|
+
- lib/preservation/builder.rb
|
71
72
|
- lib/preservation/configuration.rb
|
73
|
+
- lib/preservation/conversion.rb
|
72
74
|
- lib/preservation/ingest.rb
|
73
|
-
- lib/preservation/ingest_report.rb
|
74
|
-
- lib/preservation/pure_ingest.rb
|
75
|
-
- lib/preservation/string_util.rb
|
75
|
+
- lib/preservation/report/database.rb
|
76
|
+
- lib/preservation/report/transfer.rb
|
77
|
+
- lib/preservation/storage.rb
|
78
|
+
- lib/preservation/temporal.rb
|
79
|
+
- lib/preservation/transfer/pure.rb
|
76
80
|
- lib/preservation/version.rb
|
77
81
|
- preservation.gemspec
|
78
|
-
homepage: https://
|
82
|
+
homepage: https://aalbinclark.gitbooks.io/preservation
|
79
83
|
licenses:
|
80
84
|
- MIT
|
81
85
|
metadata: {}
|
@@ -98,6 +102,6 @@ rubyforge_project:
|
|
98
102
|
rubygems_version: 2.2.2
|
99
103
|
signing_key:
|
100
104
|
specification_version: 4
|
101
|
-
summary:
|
105
|
+
summary: Extraction and Transformation for Loading by Archivematica's Automation Tools.
|
102
106
|
test_files: []
|
103
107
|
has_rdoc:
|
@@ -1,172 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest reporting
|
4
|
-
#
|
5
|
-
class IngestReport
|
6
|
-
|
7
|
-
def initialize
|
8
|
-
create_db_connection
|
9
|
-
end
|
10
|
-
|
11
|
-
# Transfers based on presence (or not) of a particular status
|
12
|
-
#
|
13
|
-
# @param status_to_find [String]
|
14
|
-
# @param status_presence [Boolean]
|
15
|
-
def transfer_status(status_to_find: nil, status_presence: true)
|
16
|
-
if status_presence === true
|
17
|
-
status_presence = '='
|
18
|
-
else
|
19
|
-
status_presence = '<>'
|
20
|
-
end
|
21
|
-
|
22
|
-
query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
|
23
|
-
|
24
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
25
|
-
# and use hex function in DB query
|
26
|
-
records = []
|
27
|
-
@db.results_as_hash = true
|
28
|
-
@db.execute( query, [ status_to_find ] ) do |row|
|
29
|
-
id = row['id']
|
30
|
-
uuid = row['uuid']
|
31
|
-
bin_path = StringUtil.hex_to_bin row['hex_path']
|
32
|
-
unit_type = row['unit_type']
|
33
|
-
status = row['status']
|
34
|
-
microservice = row['microservice']
|
35
|
-
current = row['current']
|
36
|
-
o = {}
|
37
|
-
o['path'] = bin_path if !bin_path.empty?
|
38
|
-
o['unit_type'] = unit_type if !unit_type.empty?
|
39
|
-
o['status'] = status if !status.empty?
|
40
|
-
o['microservice'] = microservice if !microservice.empty?
|
41
|
-
o['current'] = current if current
|
42
|
-
o['id'] = id if id
|
43
|
-
o['uuid'] = uuid if !uuid.empty?
|
44
|
-
|
45
|
-
records << o
|
46
|
-
end
|
47
|
-
|
48
|
-
records
|
49
|
-
end
|
50
|
-
|
51
|
-
# Current transfer
|
52
|
-
#
|
53
|
-
# @return [Hash]
|
54
|
-
def transfer_current
|
55
|
-
query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"
|
56
|
-
|
57
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
58
|
-
# and use hex function in DB query
|
59
|
-
o = {}
|
60
|
-
@db.results_as_hash = true
|
61
|
-
@db.execute( query ) do |row|
|
62
|
-
id = row['id']
|
63
|
-
uuid = row['uuid']
|
64
|
-
bin_path = hex_to_bin row['hex_path']
|
65
|
-
unit_type = row['unit_type']
|
66
|
-
status = row['status']
|
67
|
-
microservice = row['microservice']
|
68
|
-
current = row['current']
|
69
|
-
o['path'] = bin_path if !bin_path.empty?
|
70
|
-
o['unit_type'] = unit_type if !unit_type.empty?
|
71
|
-
o['status'] = status if !status.empty?
|
72
|
-
o['microservice'] = microservice if !microservice.empty?
|
73
|
-
o['current'] = current if current
|
74
|
-
o['id'] = id if id
|
75
|
-
o['uuid'] = uuid if !uuid.empty?
|
76
|
-
end
|
77
|
-
o
|
78
|
-
end
|
79
|
-
|
80
|
-
# Count of complete transfers
|
81
|
-
#
|
82
|
-
# @return [Integer]
|
83
|
-
def transfer_complete_count
|
84
|
-
query = 'SELECT count(*) FROM unit WHERE status = ?'
|
85
|
-
|
86
|
-
status_to_find = 'COMPLETE'
|
87
|
-
@db.results_as_hash = true
|
88
|
-
@db.get_first_value( query, [status_to_find] )
|
89
|
-
end
|
90
|
-
|
91
|
-
# Compilation of statistics and data, with focus on exceptions
|
92
|
-
#
|
93
|
-
# @return [Hash]
|
94
|
-
def transfer_exception
|
95
|
-
incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
|
96
|
-
failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
|
97
|
-
current = transfer_current
|
98
|
-
complete_count = transfer_complete_count
|
99
|
-
report = {}
|
100
|
-
report['current'] = current if !current.empty?
|
101
|
-
report['failed'] = {}
|
102
|
-
report['failed']['count'] = failed.count
|
103
|
-
report['failed']['data'] = failed if !failed.empty?
|
104
|
-
report['incomplete'] = {}
|
105
|
-
report['incomplete']['count'] = incomplete.count
|
106
|
-
report['incomplete']['data'] = incomplete if !incomplete.empty?
|
107
|
-
report['complete'] = {}
|
108
|
-
report['complete']['count'] = complete_count if complete_count
|
109
|
-
report
|
110
|
-
end
|
111
|
-
|
112
|
-
# Is it in database?
|
113
|
-
# @param path_to_find [String] directory name within ingest path
|
114
|
-
# @return [Boolean]
|
115
|
-
def in_db?(path_to_find)
|
116
|
-
in_db = false
|
117
|
-
|
118
|
-
# Get path out of DB as a hex string
|
119
|
-
query = 'SELECT hex(path) FROM unit'
|
120
|
-
|
121
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
122
|
-
# and use hex function in DB query
|
123
|
-
@db.execute( query ) do |row|
|
124
|
-
bin_path = StringUtil.hex_to_bin row[0]
|
125
|
-
if bin_path === path_to_find
|
126
|
-
in_db = true
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
in_db
|
131
|
-
end
|
132
|
-
|
133
|
-
# Has preservation been done?
|
134
|
-
# @param path_to_find [String] directory name within ingest path
|
135
|
-
# @return [Boolean]
|
136
|
-
def preserved?(path_to_find)
|
137
|
-
preserved = false
|
138
|
-
|
139
|
-
# 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
|
140
|
-
# indicates completed
|
141
|
-
unit_type_to_find = 'ingest'
|
142
|
-
status_to_find = 'COMPLETE'
|
143
|
-
|
144
|
-
# Get path out of DB as a hex string for completed ingests
|
145
|
-
query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'
|
146
|
-
|
147
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
148
|
-
# and use hex function in DB query
|
149
|
-
@db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
|
150
|
-
bin_path = StringUtil.hex_to_bin row[0]
|
151
|
-
if bin_path === path_to_find
|
152
|
-
preserved = true
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
preserved
|
157
|
-
end
|
158
|
-
|
159
|
-
|
160
|
-
private
|
161
|
-
|
162
|
-
def create_db_connection
|
163
|
-
if Preservation.db_path.nil?
|
164
|
-
puts 'Missing db_path'
|
165
|
-
exit
|
166
|
-
end
|
167
|
-
@db = SQLite3::Database.new Preservation.db_path
|
168
|
-
end
|
169
|
-
|
170
|
-
end
|
171
|
-
|
172
|
-
end
|
@@ -1,188 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest for Pure
|
4
|
-
#
|
5
|
-
class PureIngest < Ingest
|
6
|
-
|
7
|
-
def initialize
|
8
|
-
super
|
9
|
-
end
|
10
|
-
|
11
|
-
# For each uuid, if necessary, fetch the metadata,
|
12
|
-
# prepare a directory in the ingest path and populate it with the files and
|
13
|
-
# JSON description file.
|
14
|
-
#
|
15
|
-
# @param uuids [Array<String>] uuids to preserve
|
16
|
-
# @param dir_name_scheme [Symbol] method to make directory name
|
17
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
18
|
-
def prepare_dataset(uuids: [],
|
19
|
-
dir_name_scheme: :uuid,
|
20
|
-
delay: 0)
|
21
|
-
dir_base_path = Preservation.ingest_path
|
22
|
-
|
23
|
-
uuids.each do |uuid|
|
24
|
-
dataset = Puree::Dataset.new
|
25
|
-
dataset.find uuid: uuid
|
26
|
-
d = dataset.metadata
|
27
|
-
if d.empty?
|
28
|
-
@logger.info 'No metadata for ' + uuid
|
29
|
-
next
|
30
|
-
end
|
31
|
-
# configurable to become more human-readable
|
32
|
-
dir_name = build_directory_name(d, dir_name_scheme)
|
33
|
-
|
34
|
-
# continue only if dir_name is not empty (e.g. because there was no DOI)
|
35
|
-
# continue only if there is no DB entry
|
36
|
-
# continue only if the dataset has a DOI
|
37
|
-
# continue only if there are files for this resource
|
38
|
-
# continue only if it is time to preserve
|
39
|
-
if !dir_name.nil? &&
|
40
|
-
!dir_name.empty? &&
|
41
|
-
!@report.in_db?(dir_name) &&
|
42
|
-
!d['doi'].empty? &&
|
43
|
-
!d['file'].empty? &&
|
44
|
-
time_to_preserve?(d['modified'], delay)
|
45
|
-
|
46
|
-
dir_file_path = dir_base_path + '/' + dir_name
|
47
|
-
dir_metadata_path = dir_file_path + '/metadata/'
|
48
|
-
metadata_filename = dir_metadata_path + 'metadata.json'
|
49
|
-
|
50
|
-
# calculate total size of data files
|
51
|
-
download_storage_required = 0
|
52
|
-
d['file'].each { |i| download_storage_required += i['size'].to_i }
|
53
|
-
|
54
|
-
# do we have enough space in filesystem to fetch data files?
|
55
|
-
if enough_storage_for_download? download_storage_required
|
56
|
-
# @logger.info 'Sufficient disk space for ' + dir_file_path
|
57
|
-
else
|
58
|
-
@logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
|
59
|
-
next
|
60
|
-
end
|
61
|
-
|
62
|
-
# has metadata file been created? if so, files and metadata are in place
|
63
|
-
# continue only if files not present in ingest location
|
64
|
-
if !File.size? metadata_filename
|
65
|
-
|
66
|
-
@logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
|
67
|
-
|
68
|
-
data = []
|
69
|
-
d['file'].each do |f|
|
70
|
-
o = package_dataset_metadata d, f
|
71
|
-
data << o
|
72
|
-
wget_str = build_wget Puree.username,
|
73
|
-
Puree.password,
|
74
|
-
f['url']
|
75
|
-
|
76
|
-
Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
|
77
|
-
|
78
|
-
# fetch the file
|
79
|
-
Dir.chdir(dir_file_path) do
|
80
|
-
# puts 'Changing dir to ' + Dir.pwd
|
81
|
-
# puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
|
82
|
-
if File.size?(f['name'])
|
83
|
-
# puts 'Should be deleting ' + f['name']
|
84
|
-
File.delete(f['name'])
|
85
|
-
end
|
86
|
-
# puts f['name'] + ' missing or empty'
|
87
|
-
# puts wget_str
|
88
|
-
`#{wget_str}`
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
|
93
|
-
|
94
|
-
pretty = JSON.pretty_generate( data, :indent => ' ')
|
95
|
-
# puts pretty
|
96
|
-
File.write(metadata_filename,pretty)
|
97
|
-
@logger.info 'Created ' + metadata_filename
|
98
|
-
end
|
99
|
-
else
|
100
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
private
|
106
|
-
|
107
|
-
def package_dataset_metadata(d, f)
|
108
|
-
o = {}
|
109
|
-
o['filename'] = 'objects/' + f['name']
|
110
|
-
o['dc.title'] = d['title']
|
111
|
-
if !d['description'].empty?
|
112
|
-
o['dc.description'] = d['description']
|
113
|
-
end
|
114
|
-
o['dcterms.created'] = d['created']
|
115
|
-
if !d['available']['year'].empty?
|
116
|
-
o['dcterms.available'] = Puree::Date.iso(d['available'])
|
117
|
-
end
|
118
|
-
o['dc.publisher'] = d['publisher']
|
119
|
-
if !d['doi'].empty?
|
120
|
-
o['dc.identifier'] = d['doi']
|
121
|
-
end
|
122
|
-
if !d['spatial'].empty?
|
123
|
-
o['dcterms.spatial'] = d['spatial']
|
124
|
-
end
|
125
|
-
if !d['temporal']['start']['year'].empty?
|
126
|
-
temporal_range = ''
|
127
|
-
temporal_range << Puree::Date.iso(d['temporal']['start'])
|
128
|
-
if !d['temporal']['end']['year'].empty?
|
129
|
-
temporal_range << '/'
|
130
|
-
temporal_range << Puree::Date.iso(d['temporal']['end'])
|
131
|
-
end
|
132
|
-
o['dcterms.temporal'] = temporal_range
|
133
|
-
end
|
134
|
-
creators = []
|
135
|
-
contributors = []
|
136
|
-
person_types = %w(internal external other)
|
137
|
-
person_types.each do |person_type|
|
138
|
-
d['person'][person_type].each do |i|
|
139
|
-
if i['role'] == 'Creator'
|
140
|
-
creator = i['name']['last'] + ', ' + i['name']['first']
|
141
|
-
creators << creator
|
142
|
-
end
|
143
|
-
if i['role'] == 'Contributor'
|
144
|
-
contributor = i['name']['last'] + ', ' + i['name']['first']
|
145
|
-
contributors << contributor
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
o['dc.creator'] = creators
|
150
|
-
if !contributors.empty?
|
151
|
-
o['dc.contributor'] = contributors
|
152
|
-
end
|
153
|
-
keywords = []
|
154
|
-
d['keyword'].each { |i|
|
155
|
-
keywords << i
|
156
|
-
}
|
157
|
-
if !keywords.empty?
|
158
|
-
o['dc.subject'] = keywords
|
159
|
-
end
|
160
|
-
if !f['license']['name'].empty?
|
161
|
-
o['dcterms.license'] = f['license']['name']
|
162
|
-
end
|
163
|
-
# o['dc.format'] = f['mime']
|
164
|
-
|
165
|
-
related = []
|
166
|
-
publications = d['publication']
|
167
|
-
publications.each do |i|
|
168
|
-
o_related = {}
|
169
|
-
o_related['dc.title'] = i['title']
|
170
|
-
o_related['type'] = i['type']
|
171
|
-
pub = Puree::Publication.new
|
172
|
-
pub.find uuid: i['uuid']
|
173
|
-
doi = pub.doi
|
174
|
-
if doi
|
175
|
-
o_related['dc.identifier'] = doi
|
176
|
-
end
|
177
|
-
related << o_related
|
178
|
-
end
|
179
|
-
if !related.empty?
|
180
|
-
o['related'] = related
|
181
|
-
end
|
182
|
-
|
183
|
-
o
|
184
|
-
end
|
185
|
-
|
186
|
-
end
|
187
|
-
|
188
|
-
end
|