preservation 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
@@ -0,0 +1,26 @@
|
|
1
|
+
module Preservation

  # Reporting
  #
  module Report

    # Database
    #
    module Database

      # Database connection
      #
      # Connections are memoised per path, so repeated calls with the same
      # db_path return the same object, while a different db_path gets its own
      # connection instead of silently reusing the first one opened.
      #
      # When db_path is nil the process is terminated: the message goes to
      # standard error and the exit status is non-zero (SystemExit is raised,
      # as before), so calling scripts can detect the misconfiguration.
      #
      # @param db_path [String] path handed to SQLite3::Database.new
      # @return [SQLite3::Database]
      def self.db_connection(db_path)
        abort 'Missing db_path' if db_path.nil?

        @connections ||= {}
        @connections[db_path] ||= SQLite3::Database.new db_path
      end

    end

  end

end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module Preservation

  module Report

    # Transfer reporting
    #
    # Every query reads the unit table. The path column is selected through
    # SQLite's hex() function and converted back with
    # Preservation::Conversion.hex_to_bin (the original comments state that
    # Archivematica stores path as a BLOB).
    #
    module Transfer

      # Columns selected for a full transfer record.
      UNIT_COLUMNS = 'id, uuid, hex(path) as hex_path, unit_type, status, microservice, current'.freeze
      private_constant :UNIT_COLUMNS

      # Transfers based on presence (or not) of a particular status
      #
      # @param status_to_find [String]
      # @param status_presence [Boolean] true selects rows whose status equals
      #   status_to_find; any other value selects rows whose status differs
      # @return [Array<Hash>] one record per matching row
      def self.status(status_to_find: nil, status_presence: true)
        operator = status_presence == true ? '=' : '<>'
        query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE status #{operator} ?"

        records = []
        db.results_as_hash = true
        db.execute(query, [status_to_find]) { |row| records << record_from(row) }
        records
      end

      # Current transfer
      #
      # @return [Hash] record for the row flagged current = 1, or an empty
      #   hash when there is none. If several rows are flagged, the last one
      #   returned by the database is used.
      def self.current
        query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE current = 1"

        record = {}
        db.results_as_hash = true
        db.execute(query) { |row| record = record_from(row) }
        record
      end

      # Count of complete transfers
      #
      # @return [Integer]
      def self.complete_count
        db.get_first_value('SELECT count(*) FROM unit WHERE status = ?', ['COMPLETE'])
      end

      # Compilation of statistics and data, with focus on exceptions
      #
      # @return [Hash] keys 'failed', 'incomplete' and 'complete' are always
      #   present; 'current' only when there is a current transfer
      def self.exception
        incomplete = status(status_to_find: 'COMPLETE', status_presence: false)
        failed = status(status_to_find: 'FAILED', status_presence: true)
        # Query once each; the results are used for both the test and the value.
        current_transfer = current
        completed = complete_count

        report = {}
        report['current'] = current_transfer if !current_transfer.empty?
        report['failed'] = summary(failed)
        report['incomplete'] = summary(incomplete)
        report['complete'] = {}
        report['complete']['count'] = completed if completed
        report
      end

      # Is it in database?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.in_db?(path_to_find)
        path_recorded?('SELECT hex(path) as hex_path FROM unit', [], path_to_find)
      end

      # Has preservation been done?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.preserved?(path_to_find)
        # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
        # indicates completed
        query = 'SELECT hex(path) as hex_path FROM unit WHERE unit_type = ? AND status = ?'
        path_recorded?(query, ['ingest', 'COMPLETE'], path_to_find)
      end

      # Db
      #
      # @return [SQLite3::Database]
      def self.db
        Preservation::Report::Database.db_connection Preservation.db_path
      end

      # Build a record from a result row, leaving out NULL/empty columns.
      #
      # @param row [Hash] result row keyed by column name
      # @return [Hash]
      def self.record_from(row)
        record = {}
        path = Preservation::Conversion.hex_to_bin row['hex_path']
        record['path'] = path if filled?(path)
        %w(unit_type status microservice).each do |column|
          record[column] = row[column] if filled?(row[column])
        end
        record['current'] = row['current'] if row['current']
        record['id'] = row['id'] if row['id']
        record['uuid'] = row['uuid'] if filled?(row['uuid'])
        record
      end

      # True when a column value is neither nil (NULL) nor empty.
      def self.filled?(value)
        !value.nil? && !value.empty?
      end

      # Count plus (when non-empty) data section of the exception report.
      def self.summary(records)
        section = { 'count' => records.count }
        section['data'] = records if !records.empty?
        section
      end

      # Does any row returned by query have the given path?
      #
      # Rows are read by column name, so the result does not depend on
      # whether an earlier call switched the shared connection to hash
      # results. Paths are compared byte for byte, because the decoded value
      # and the caller's string may carry different encodings.
      def self.path_recorded?(query, bind_vars, path_to_find)
        return false if path_to_find.nil?

        wanted = path_to_find.to_s.b
        found = false
        db.results_as_hash = true
        db.execute(query, bind_vars) do |row|
          if Preservation::Conversion.hex_to_bin(row['hex_path']).to_s.b == wanted
            found = true
            break # no need to scan the remaining rows
          end
        end
        found
      end

      private_class_method :record_from, :filled?, :summary, :path_recorded?

    end

  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Preservation

  # Storage
  #
  module Storage

    # Free up disk space for completed transfers
    #
    # Removes every directory reported by get_preserved that passes the
    # group-ownership check below.
    def self.cleanup
      get_preserved.each do |dir|
        # NOTE(review): the original comment said "skip anything that has a
        # different owner to script", but grpowned? compares the directory's
        # group with the process's effective group, not the owning user.
        # Confirm which check is intended.
        next unless File.stat(dir).grpowned?

        FileUtils.remove_dir dir
        # @logger.info 'Deleted ' + dir
      end
    end

    # Enough storage for download?
    #
    # @param required_bytes [Integer] total size of the files to fetch
    # @param multiplier [Numeric] safety factor applied to required_bytes
    # @param path [String] location whose free space is measured
    # @return [Boolean] true when the scaled requirement is strictly less
    #   than the free space reported by FreeDiskSpace
    def self.enough_storage_for_download?(required_bytes, multiplier: 2, path: '/')
      required_bytes * multiplier < FreeDiskSpace.bytes(path)
    end

    # Collect all paths from DB where preservation has been done
    #
    # Records without a path are ignored, as are directories that no longer
    # exist below the ingest path.
    #
    # @return [Array<String>]
    def self.get_preserved
      ingest_complete = Preservation::Report::Transfer.status(status_to_find: 'COMPLETE',
                                                              status_presence: true)
      ingest_complete.each_with_object([]) do |record, preserved|
        path = record['path']
        # Transfer.status omits 'path' when the column is empty; joining nil
        # would raise, and an empty name would point at the ingest path itself.
        next if path.nil? || path.empty?

        dir_path = File.join(Preservation.ingest_path, path)
        preserved << dir_path if File.exist?(dir_path)
      end
    end

  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Preservation

  # Temporal
  #
  module Temporal

    # Has the waiting period elapsed?
    #
    # The start date is parsed with DateTime.parse and subtracted from the
    # current date-time; the difference is truncated to whole days before
    # being compared with delay.
    #
    # @param start_utc [String] date-time text accepted by DateTime.parse
    # @param delay [Integer] days to wait (after start date) before preserving
    # @return [Boolean] true once at least delay whole days have passed
    def self.time_to_preserve?(start_utc, delay)
      elapsed = DateTime.now - DateTime.parse(start_utc) # fractional days
      elapsed.to_i >= delay
    end

  end

end
|
@@ -0,0 +1,215 @@
|
|
1
|
+
module Preservation

  # Transfer preparation
  #
  module Transfer

    # Transfer preparation for Pure
    #
    class Pure < Ingest

      # @param base_url [String]
      # @param username [String]
      # @param password [String]
      # @param basic_auth [Boolean] username and password are only retained
      #   when this is exactly true
      def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
        super()
        @base_url = base_url
        @basic_auth = basic_auth
        if basic_auth == true
          @username = username
          @password = password
        end
      end

      # For given uuid, if necessary, fetch the metadata,
      # prepare a directory in the ingest path and populate it with the files and
      # JSON description file.
      #
      # The process exits (after logging an error) when uuid is missing or no
      # metadata is found. Otherwise the dataset is skipped, with a log entry,
      # when any precondition fails, when there is not enough disk space, or
      # when the metadata file already exists.
      #
      # @param uuid [String] uuid to preserve
      # @param dir_scheme [Symbol] how to make directory name
      # @param delay [Integer] days to wait (after modification date) before preserving
      def prepare_dataset(uuid: nil,
                          dir_scheme: :uuid,
                          delay: 0)
        if uuid.nil?
          # uuid is nil here, so it must not be concatenated into the message
          @logger.error 'Missing uuid'
          exit
        end

        dataset = Puree::Dataset.new base_url: @base_url,
                                     username: @username,
                                     password: @password,
                                     basic_auth: @basic_auth

        dataset.find uuid: uuid
        d = dataset.metadata
        if d.empty?
          @logger.error 'No metadata for ' + uuid
          exit
        end

        # configurable to become more human-readable
        dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)

        unless preservable?(d, dir_name, delay)
          # dir_name may be nil here, hence interpolation rather than +
          @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}"
          return
        end

        dir_file_path = File.join(Preservation.ingest_path, dir_name)
        dir_metadata_path = dir_file_path + '/metadata/'
        metadata_filename = dir_metadata_path + 'metadata.json'

        # do we have enough space in filesystem to fetch data files?
        download_storage_required = d['file'].inject(0) { |total, i| total + i['size'].to_i }
        unless Preservation::Storage.enough_storage_for_download? download_storage_required
          @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
          return # really skip: previously the download went ahead regardless
        end

        # has metadata file been created? if so, files and metadata are in place
        # continue only if files not present in ingest location
        if File.size? metadata_filename
          @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid'] +
                       ' because ' + metadata_filename + ' exists'
          return
        end

        @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']

        # Related publications depend only on the dataset, so look them up
        # once rather than once per file.
        related = related_publications(d)
        data = d['file'].map do |f|
          o = package_dataset_metadata d, f, related
          fetch_file f, dir_file_path
          o
        end

        Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)

        pretty = JSON.pretty_generate( data, :indent => ' ')
        File.write(metadata_filename, pretty)
        @logger.info 'Created ' + metadata_filename
      end

      private

      # Preconditions for preparing a dataset:
      # - the directory name is not empty (e.g. because there was no DOI)
      # - there is no DB entry
      # - the dataset has a DOI
      # - there are files for this resource
      # - it is time to preserve
      def preservable?(d, dir_name, delay)
        !dir_name.nil? &&
          !dir_name.empty? &&
          !Preservation::Report::Transfer.in_db?(dir_name) &&
          !d['doi'].empty? &&
          !d['file'].empty? &&
          Preservation::Temporal.time_to_preserve?(d['modified'], delay)
      end

      # Download one file into dir_file_path, creating the directory if
      # needed and deleting any non-empty previous copy first.
      def fetch_file(f, dir_file_path)
        wget_str = Preservation::Builder.build_wget @username,
                                                    @password,
                                                    f['url']

        Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

        Dir.chdir(dir_file_path) do
          File.delete(f['name']) if File.size?(f['name'])
          # NOTE(review): the command string is run through the shell; confirm
          # that Builder.build_wget escapes the credentials and URL.
          `#{wget_str}`
        end
      end

      # Build the description of one file (f) of a dataset (d).
      #
      # @param d [Hash] dataset metadata
      # @param f [Hash] file metadata
      # @param related [Array<Hash>, nil] result of related_publications(d);
      #   looked up when omitted
      # @return [Hash]
      def package_dataset_metadata(d, f, related = nil)
        related ||= related_publications(d)

        o = {}
        o['filename'] = 'objects/' + f['name']
        o['dc.title'] = d['title']
        o['dc.description'] = d['description'] if !d['description'].empty?
        o['dcterms.created'] = d['created']
        if !d['available']['year'].empty?
          o['dcterms.available'] = Puree::Date.iso(d['available'])
        end
        o['dc.publisher'] = d['publisher']
        o['dc.identifier'] = d['doi'] if !d['doi'].empty?
        o['dcterms.spatial'] = d['spatial'] if !d['spatial'].empty?

        temporal = d['temporal']
        if !temporal['start']['year'].empty?
          # start, or start/end when an end year is present
          temporal_range = ''
          temporal_range << Puree::Date.iso(temporal['start'])
          if !temporal['end']['year'].empty?
            temporal_range << '/'
            temporal_range << Puree::Date.iso(temporal['end'])
          end
          o['dcterms.temporal'] = temporal_range
        end

        people = %w(internal external other).flat_map { |person_type| d['person'][person_type] }
        contributors = names_with_role(people, 'Contributor')
        o['dc.creator'] = names_with_role(people, 'Creator')
        o['dc.contributor'] = contributors if !contributors.empty?

        keywords = d['keyword'].to_a
        o['dc.subject'] = keywords if !keywords.empty?

        o['dcterms.license'] = f['license']['name'] if !f['license']['name'].empty?
        # o['dc.format'] = f['mime']

        o['related'] = related if !related.empty?

        o
      end

      # "Last, First" names of the people holding the given role.
      def names_with_role(people, role)
        people.select { |i| i['role'] == role }
              .map { |i| i['name']['last'] + ', ' + i['name']['first'] }
      end

      # Title, type and (when available) DOI of each publication linked to
      # the dataset. Each publication is fetched from Pure to obtain its DOI.
      #
      # @param d [Hash] dataset metadata
      # @return [Array<Hash>]
      def related_publications(d)
        d['publication'].map do |i|
          o_related = {}
          o_related['dc.title'] = i['title']
          o_related['type'] = i['type']
          pub = Puree::Publication.new base_url: @base_url,
                                       username: @username,
                                       password: @password,
                                       basic_auth: @basic_auth
          pub.find uuid: i['uuid']
          doi = pub.doi
          o_related['dc.identifier'] = doi if doi
          o_related
        end
      end

    end

  end

end
|