preservation 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
data/lib/preservation/version.rb
CHANGED
data/preservation.gemspec
CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Preservation::VERSION
|
9
9
|
spec.authors = ["Adrian Albin-Clark"]
|
10
10
|
spec.email = ["a.albin-clark@lancaster.ac.uk"]
|
11
|
-
spec.summary = %q{
|
12
|
-
spec.description = %q{
|
13
|
-
spec.homepage = "https://
|
11
|
+
spec.summary = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools.}
|
12
|
+
spec.description = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools. Includes transfer preparation, reporting and disk space management.}
|
13
|
+
spec.homepage = "https://aalbinclark.gitbooks.io/preservation"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preservation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adrian Albin-Clark
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-09-
|
11
|
+
date: 2016-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: free_disk_space
|
@@ -52,8 +52,8 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
|
-
description:
|
56
|
-
|
55
|
+
description: Extraction and Transformation for Loading by Archivematica's Automation
|
56
|
+
Tools. Includes transfer preparation, reporting and disk space management.
|
57
57
|
email:
|
58
58
|
- a.albin-clark@lancaster.ac.uk
|
59
59
|
executables: []
|
@@ -68,14 +68,18 @@ files:
|
|
68
68
|
- README.md
|
69
69
|
- Rakefile
|
70
70
|
- lib/preservation.rb
|
71
|
+
- lib/preservation/builder.rb
|
71
72
|
- lib/preservation/configuration.rb
|
73
|
+
- lib/preservation/conversion.rb
|
72
74
|
- lib/preservation/ingest.rb
|
73
|
-
- lib/preservation/
|
74
|
-
- lib/preservation/
|
75
|
-
- lib/preservation/
|
75
|
+
- lib/preservation/report/database.rb
|
76
|
+
- lib/preservation/report/transfer.rb
|
77
|
+
- lib/preservation/storage.rb
|
78
|
+
- lib/preservation/temporal.rb
|
79
|
+
- lib/preservation/transfer/pure.rb
|
76
80
|
- lib/preservation/version.rb
|
77
81
|
- preservation.gemspec
|
78
|
-
homepage: https://
|
82
|
+
homepage: https://aalbinclark.gitbooks.io/preservation
|
79
83
|
licenses:
|
80
84
|
- MIT
|
81
85
|
metadata: {}
|
@@ -98,6 +102,6 @@ rubyforge_project:
|
|
98
102
|
rubygems_version: 2.2.2
|
99
103
|
signing_key:
|
100
104
|
specification_version: 4
|
101
|
-
summary:
|
105
|
+
summary: Extraction and Transformation for Loading by Archivematica's Automation Tools.
|
102
106
|
test_files: []
|
103
107
|
has_rdoc:
|
@@ -1,172 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest reporting
|
4
|
-
#
|
5
|
-
class IngestReport
|
6
|
-
|
7
|
-
def initialize
|
8
|
-
create_db_connection
|
9
|
-
end
|
10
|
-
|
11
|
-
# Transfers based on presence (or not) of a particular status
|
12
|
-
#
|
13
|
-
# @param status_to_find [String]
|
14
|
-
# @param status_presence [Boolean]
|
15
|
-
def transfer_status(status_to_find: nil, status_presence: true)
|
16
|
-
if status_presence === true
|
17
|
-
status_presence = '='
|
18
|
-
else
|
19
|
-
status_presence = '<>'
|
20
|
-
end
|
21
|
-
|
22
|
-
query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
|
23
|
-
|
24
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
25
|
-
# and use hex function in DB query
|
26
|
-
records = []
|
27
|
-
@db.results_as_hash = true
|
28
|
-
@db.execute( query, [ status_to_find ] ) do |row|
|
29
|
-
id = row['id']
|
30
|
-
uuid = row['uuid']
|
31
|
-
bin_path = StringUtil.hex_to_bin row['hex_path']
|
32
|
-
unit_type = row['unit_type']
|
33
|
-
status = row['status']
|
34
|
-
microservice = row['microservice']
|
35
|
-
current = row['current']
|
36
|
-
o = {}
|
37
|
-
o['path'] = bin_path if !bin_path.empty?
|
38
|
-
o['unit_type'] = unit_type if !unit_type.empty?
|
39
|
-
o['status'] = status if !status.empty?
|
40
|
-
o['microservice'] = microservice if !microservice.empty?
|
41
|
-
o['current'] = current if current
|
42
|
-
o['id'] = id if id
|
43
|
-
o['uuid'] = uuid if !uuid.empty?
|
44
|
-
|
45
|
-
records << o
|
46
|
-
end
|
47
|
-
|
48
|
-
records
|
49
|
-
end
|
50
|
-
|
51
|
-
# Current transfer
|
52
|
-
#
|
53
|
-
# @return [Hash]
|
54
|
-
def transfer_current
|
55
|
-
query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"
|
56
|
-
|
57
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
58
|
-
# and use hex function in DB query
|
59
|
-
o = {}
|
60
|
-
@db.results_as_hash = true
|
61
|
-
@db.execute( query ) do |row|
|
62
|
-
id = row['id']
|
63
|
-
uuid = row['uuid']
|
64
|
-
bin_path = hex_to_bin row['hex_path']
|
65
|
-
unit_type = row['unit_type']
|
66
|
-
status = row['status']
|
67
|
-
microservice = row['microservice']
|
68
|
-
current = row['current']
|
69
|
-
o['path'] = bin_path if !bin_path.empty?
|
70
|
-
o['unit_type'] = unit_type if !unit_type.empty?
|
71
|
-
o['status'] = status if !status.empty?
|
72
|
-
o['microservice'] = microservice if !microservice.empty?
|
73
|
-
o['current'] = current if current
|
74
|
-
o['id'] = id if id
|
75
|
-
o['uuid'] = uuid if !uuid.empty?
|
76
|
-
end
|
77
|
-
o
|
78
|
-
end
|
79
|
-
|
80
|
-
# Count of complete transfers
|
81
|
-
#
|
82
|
-
# @return [Integer]
|
83
|
-
def transfer_complete_count
|
84
|
-
query = 'SELECT count(*) FROM unit WHERE status = ?'
|
85
|
-
|
86
|
-
status_to_find = 'COMPLETE'
|
87
|
-
@db.results_as_hash = true
|
88
|
-
@db.get_first_value( query, [status_to_find] )
|
89
|
-
end
|
90
|
-
|
91
|
-
# Compilation of statistics and data, with focus on exceptions
|
92
|
-
#
|
93
|
-
# @return [Hash]
|
94
|
-
def transfer_exception
|
95
|
-
incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
|
96
|
-
failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
|
97
|
-
current = transfer_current
|
98
|
-
complete_count = transfer_complete_count
|
99
|
-
report = {}
|
100
|
-
report['current'] = current if !current.empty?
|
101
|
-
report['failed'] = {}
|
102
|
-
report['failed']['count'] = failed.count
|
103
|
-
report['failed']['data'] = failed if !failed.empty?
|
104
|
-
report['incomplete'] = {}
|
105
|
-
report['incomplete']['count'] = incomplete.count
|
106
|
-
report['incomplete']['data'] = incomplete if !incomplete.empty?
|
107
|
-
report['complete'] = {}
|
108
|
-
report['complete']['count'] = complete_count if complete_count
|
109
|
-
report
|
110
|
-
end
|
111
|
-
|
112
|
-
# Is it in database?
|
113
|
-
# @param path_to_find [String] directory name within ingest path
|
114
|
-
# @return [Boolean]
|
115
|
-
def in_db?(path_to_find)
|
116
|
-
in_db = false
|
117
|
-
|
118
|
-
# Get path out of DB as a hex string
|
119
|
-
query = 'SELECT hex(path) FROM unit'
|
120
|
-
|
121
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
122
|
-
# and use hex function in DB query
|
123
|
-
@db.execute( query ) do |row|
|
124
|
-
bin_path = StringUtil.hex_to_bin row[0]
|
125
|
-
if bin_path === path_to_find
|
126
|
-
in_db = true
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
in_db
|
131
|
-
end
|
132
|
-
|
133
|
-
# Has preservation been done?
|
134
|
-
# @param path_to_find [String] directory name within ingest path
|
135
|
-
# @return [Boolean]
|
136
|
-
def preserved?(path_to_find)
|
137
|
-
preserved = false
|
138
|
-
|
139
|
-
# 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
|
140
|
-
# indicates completed
|
141
|
-
unit_type_to_find = 'ingest'
|
142
|
-
status_to_find = 'COMPLETE'
|
143
|
-
|
144
|
-
# Get path out of DB as a hex string for completed ingests
|
145
|
-
query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'
|
146
|
-
|
147
|
-
# Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
|
148
|
-
# and use hex function in DB query
|
149
|
-
@db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
|
150
|
-
bin_path = StringUtil.hex_to_bin row[0]
|
151
|
-
if bin_path === path_to_find
|
152
|
-
preserved = true
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
preserved
|
157
|
-
end
|
158
|
-
|
159
|
-
|
160
|
-
private
|
161
|
-
|
162
|
-
def create_db_connection
|
163
|
-
if Preservation.db_path.nil?
|
164
|
-
puts 'Missing db_path'
|
165
|
-
exit
|
166
|
-
end
|
167
|
-
@db = SQLite3::Database.new Preservation.db_path
|
168
|
-
end
|
169
|
-
|
170
|
-
end
|
171
|
-
|
172
|
-
end
|
@@ -1,188 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest for Pure
|
4
|
-
#
|
5
|
-
class PureIngest < Ingest
|
6
|
-
|
7
|
-
def initialize
|
8
|
-
super
|
9
|
-
end
|
10
|
-
|
11
|
-
# For each uuid, if necessary, fetch the metadata,
|
12
|
-
# prepare a directory in the ingest path and populate it with the files and
|
13
|
-
# JSON description file.
|
14
|
-
#
|
15
|
-
# @param uuids [Array<String>] uuids to preserve
|
16
|
-
# @param dir_name_scheme [Symbol] method to make directory name
|
17
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
18
|
-
def prepare_dataset(uuids: [],
|
19
|
-
dir_name_scheme: :uuid,
|
20
|
-
delay: 0)
|
21
|
-
dir_base_path = Preservation.ingest_path
|
22
|
-
|
23
|
-
uuids.each do |uuid|
|
24
|
-
dataset = Puree::Dataset.new
|
25
|
-
dataset.find uuid: uuid
|
26
|
-
d = dataset.metadata
|
27
|
-
if d.empty?
|
28
|
-
@logger.info 'No metadata for ' + uuid
|
29
|
-
next
|
30
|
-
end
|
31
|
-
# configurable to become more human-readable
|
32
|
-
dir_name = build_directory_name(d, dir_name_scheme)
|
33
|
-
|
34
|
-
# continue only if dir_name is not empty (e.g. because there was no DOI)
|
35
|
-
# continue only if there is no DB entry
|
36
|
-
# continue only if the dataset has a DOI
|
37
|
-
# continue only if there are files for this resource
|
38
|
-
# continue only if it is time to preserve
|
39
|
-
if !dir_name.nil? &&
|
40
|
-
!dir_name.empty? &&
|
41
|
-
!@report.in_db?(dir_name) &&
|
42
|
-
!d['doi'].empty? &&
|
43
|
-
!d['file'].empty? &&
|
44
|
-
time_to_preserve?(d['modified'], delay)
|
45
|
-
|
46
|
-
dir_file_path = dir_base_path + '/' + dir_name
|
47
|
-
dir_metadata_path = dir_file_path + '/metadata/'
|
48
|
-
metadata_filename = dir_metadata_path + 'metadata.json'
|
49
|
-
|
50
|
-
# calculate total size of data files
|
51
|
-
download_storage_required = 0
|
52
|
-
d['file'].each { |i| download_storage_required += i['size'].to_i }
|
53
|
-
|
54
|
-
# do we have enough space in filesystem to fetch data files?
|
55
|
-
if enough_storage_for_download? download_storage_required
|
56
|
-
# @logger.info 'Sufficient disk space for ' + dir_file_path
|
57
|
-
else
|
58
|
-
@logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
|
59
|
-
next
|
60
|
-
end
|
61
|
-
|
62
|
-
# has metadata file been created? if so, files and metadata are in place
|
63
|
-
# continue only if files not present in ingest location
|
64
|
-
if !File.size? metadata_filename
|
65
|
-
|
66
|
-
@logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
|
67
|
-
|
68
|
-
data = []
|
69
|
-
d['file'].each do |f|
|
70
|
-
o = package_dataset_metadata d, f
|
71
|
-
data << o
|
72
|
-
wget_str = build_wget Puree.username,
|
73
|
-
Puree.password,
|
74
|
-
f['url']
|
75
|
-
|
76
|
-
Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
|
77
|
-
|
78
|
-
# fetch the file
|
79
|
-
Dir.chdir(dir_file_path) do
|
80
|
-
# puts 'Changing dir to ' + Dir.pwd
|
81
|
-
# puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
|
82
|
-
if File.size?(f['name'])
|
83
|
-
# puts 'Should be deleting ' + f['name']
|
84
|
-
File.delete(f['name'])
|
85
|
-
end
|
86
|
-
# puts f['name'] + ' missing or empty'
|
87
|
-
# puts wget_str
|
88
|
-
`#{wget_str}`
|
89
|
-
end
|
90
|
-
end
|
91
|
-
|
92
|
-
Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
|
93
|
-
|
94
|
-
pretty = JSON.pretty_generate( data, :indent => ' ')
|
95
|
-
# puts pretty
|
96
|
-
File.write(metadata_filename,pretty)
|
97
|
-
@logger.info 'Created ' + metadata_filename
|
98
|
-
end
|
99
|
-
else
|
100
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
private
|
106
|
-
|
107
|
-
def package_dataset_metadata(d, f)
|
108
|
-
o = {}
|
109
|
-
o['filename'] = 'objects/' + f['name']
|
110
|
-
o['dc.title'] = d['title']
|
111
|
-
if !d['description'].empty?
|
112
|
-
o['dc.description'] = d['description']
|
113
|
-
end
|
114
|
-
o['dcterms.created'] = d['created']
|
115
|
-
if !d['available']['year'].empty?
|
116
|
-
o['dcterms.available'] = Puree::Date.iso(d['available'])
|
117
|
-
end
|
118
|
-
o['dc.publisher'] = d['publisher']
|
119
|
-
if !d['doi'].empty?
|
120
|
-
o['dc.identifier'] = d['doi']
|
121
|
-
end
|
122
|
-
if !d['spatial'].empty?
|
123
|
-
o['dcterms.spatial'] = d['spatial']
|
124
|
-
end
|
125
|
-
if !d['temporal']['start']['year'].empty?
|
126
|
-
temporal_range = ''
|
127
|
-
temporal_range << Puree::Date.iso(d['temporal']['start'])
|
128
|
-
if !d['temporal']['end']['year'].empty?
|
129
|
-
temporal_range << '/'
|
130
|
-
temporal_range << Puree::Date.iso(d['temporal']['end'])
|
131
|
-
end
|
132
|
-
o['dcterms.temporal'] = temporal_range
|
133
|
-
end
|
134
|
-
creators = []
|
135
|
-
contributors = []
|
136
|
-
person_types = %w(internal external other)
|
137
|
-
person_types.each do |person_type|
|
138
|
-
d['person'][person_type].each do |i|
|
139
|
-
if i['role'] == 'Creator'
|
140
|
-
creator = i['name']['last'] + ', ' + i['name']['first']
|
141
|
-
creators << creator
|
142
|
-
end
|
143
|
-
if i['role'] == 'Contributor'
|
144
|
-
contributor = i['name']['last'] + ', ' + i['name']['first']
|
145
|
-
contributors << contributor
|
146
|
-
end
|
147
|
-
end
|
148
|
-
end
|
149
|
-
o['dc.creator'] = creators
|
150
|
-
if !contributors.empty?
|
151
|
-
o['dc.contributor'] = contributors
|
152
|
-
end
|
153
|
-
keywords = []
|
154
|
-
d['keyword'].each { |i|
|
155
|
-
keywords << i
|
156
|
-
}
|
157
|
-
if !keywords.empty?
|
158
|
-
o['dc.subject'] = keywords
|
159
|
-
end
|
160
|
-
if !f['license']['name'].empty?
|
161
|
-
o['dcterms.license'] = f['license']['name']
|
162
|
-
end
|
163
|
-
# o['dc.format'] = f['mime']
|
164
|
-
|
165
|
-
related = []
|
166
|
-
publications = d['publication']
|
167
|
-
publications.each do |i|
|
168
|
-
o_related = {}
|
169
|
-
o_related['dc.title'] = i['title']
|
170
|
-
o_related['type'] = i['type']
|
171
|
-
pub = Puree::Publication.new
|
172
|
-
pub.find uuid: i['uuid']
|
173
|
-
doi = pub.doi
|
174
|
-
if doi
|
175
|
-
o_related['dc.identifier'] = doi
|
176
|
-
end
|
177
|
-
related << o_related
|
178
|
-
end
|
179
|
-
if !related.empty?
|
180
|
-
o['related'] = related
|
181
|
-
end
|
182
|
-
|
183
|
-
o
|
184
|
-
end
|
185
|
-
|
186
|
-
end
|
187
|
-
|
188
|
-
end
|