preservation 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
@@ -0,0 +1,26 @@
|
|
1
|
+
module Preservation

  # Reporting
  #
  module Report

    # Database
    #
    module Database

      # Database connection
      #
      # Connections are memoised per path: asking for the same path again
      # returns the connection opened earlier, while a different path gets a
      # connection of its own (previously the first connection ever opened was
      # returned whatever path was requested).
      #
      # When db_path is nil a message is printed and the process exits.
      #
      # @param db_path [String] location of the SQLite database file
      # @return [SQLite3::Database]
      def self.db_connection(db_path)
        if db_path.nil?
          puts 'Missing db_path'
          exit
        end
        @connections ||= {}
        @connections[db_path] ||= SQLite3::Database.new db_path
      end

    end

  end

end
|
@@ -0,0 +1,166 @@
|
|
1
|
+
module Preservation

  module Report

    # Transfer reporting
    #
    module Transfer

      # Columns read for a transfer record.
      # Archivematica stores path as BLOB, so it is selected through the
      # database hex function and converted back with
      # Preservation::Conversion.hex_to_bin.
      RECORD_SELECT = 'SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit'.freeze

      # Transfers based on presence (or not) of a particular status
      #
      # @param status_to_find [String]
      # @param status_presence [Boolean] true selects rows having the status,
      #   anything else selects rows not having it
      # @return [Array<Hash>] one record per matching row
      def self.status(status_to_find: nil, status_presence: true)
        operator = status_presence == true ? '=' : '<>'
        query = "#{RECORD_SELECT} WHERE status #{operator} ?"

        records = []
        connection = db
        connection.results_as_hash = true
        connection.execute(query, [status_to_find]) do |row|
          records << build_record(row)
        end

        records
      end

      # Current transfer
      #
      # @return [Hash] empty when no row is flagged as current
      def self.current
        query = "#{RECORD_SELECT} WHERE current = 1"

        record = {}
        connection = db
        connection.results_as_hash = true
        # Should several rows be flagged, later rows overwrite the values of
        # earlier ones, field by field.
        connection.execute(query) do |row|
          record.merge!(build_record(row))
        end
        record
      end

      # Count of complete transfers
      #
      # @return [Integer]
      def self.complete_count
        connection = db
        connection.results_as_hash = true
        connection.get_first_value('SELECT count(*) FROM unit WHERE status = ?', ['COMPLETE'])
      end

      # Compilation of statistics and data, with focus on exceptions
      #
      # @return [Hash]
      def self.exception
        incomplete = status(status_to_find: 'COMPLETE', status_presence: false)
        failed = status(status_to_find: 'FAILED', status_presence: true)
        # Each of these runs a query, so ask once and reuse the answer.
        current_transfer = current
        completed = complete_count

        report = {}
        report['current'] = current_transfer unless current_transfer.empty?
        report['failed'] = { 'count' => failed.count }
        report['failed']['data'] = failed unless failed.empty?
        report['incomplete'] = { 'count' => incomplete.count }
        report['incomplete']['data'] = incomplete unless incomplete.empty?
        report['complete'] = {}
        report['complete']['count'] = completed if completed
        report
      end

      # Is it in database?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.in_db?(path_to_find)
        path_recorded?('SELECT hex(path) AS hex_path FROM unit', [], path_to_find)
      end

      # Has preservation been done?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.preserved?(path_to_find)
        # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
        # indicates completed
        query = 'SELECT hex(path) AS hex_path FROM unit WHERE unit_type = ? AND status = ?'
        path_recorded?(query, %w[ingest COMPLETE], path_to_find)
      end

      # Db
      #
      # @return [SQLite3::Database]
      def self.db
        Preservation::Report::Database.db_connection Preservation.db_path
      end

      # Turn a database row into a record holding only the fields that have
      # a value.
      #
      # @param row [Hash] row keyed by column name
      # @return [Hash]
      def self.build_record(row)
        record = {}
        path = Preservation::Conversion.hex_to_bin row['hex_path']
        record['path'] = path unless missing?(path)
        %w[unit_type status microservice].each do |column|
          record[column] = row[column] unless missing?(row[column])
        end
        record['current'] = row['current'] if row['current']
        record['id'] = row['id'] if row['id']
        record['uuid'] = row['uuid'] unless missing?(row['uuid'])
        record
      end

      # Is the value nil (NULL column) or empty?
      #
      # @return [Boolean]
      def self.missing?(value)
        value.nil? || value.empty?
      end

      # Does any row returned by the query hold the given path?
      # The query must expose the hex-encoded path as hex_path. Rows are read
      # by column name, the same way as the other queries in this module,
      # because they all share one connection whose results_as_hash setting
      # persists between calls.
      #
      # @param query [String]
      # @param bind_vars [Array]
      # @param path_to_find [String]
      # @return [Boolean]
      def self.path_recorded?(query, bind_vars, path_to_find)
        found = false
        connection = db
        connection.results_as_hash = true
        connection.execute(query, bind_vars) do |row|
          if Preservation::Conversion.hex_to_bin(row['hex_path']) == path_to_find
            found = true
            break # no need to look at the remaining rows
          end
        end
        found
      end

      private_class_method :build_record, :missing?, :path_recorded?

    end

  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Preservation

  # Storage
  #
  module Storage

    # Free up disk space for completed transfers
    #
    # Removes every directory reported by get_preserved, provided the
    # directory passes the File::Stat#grpowned? check.
    def self.cleanup
      preserved = get_preserved
      return if preserved.nil? || preserved.empty?

      preserved.each do |dir_path|
        # NOTE(review): the original remark here reads "skip anything that has
        # a different owner to script", yet grpowned? compares the group, not
        # the owner - confirm which of the two is intended.
        next unless File.stat(dir_path).grpowned?

        FileUtils.remove_dir dir_path
        # @logger.info 'Deleted ' + dir_path
      end
    end

    # Enough storage for download?
    #
    # @param required_bytes [Integer] total size of the files to download
    # @return [Boolean]
    def self.enough_storage_for_download?(required_bytes)
      # scale up the required space using a multiplier
      multiplier = 2
      available = FreeDiskSpace.bytes('/')
      required_bytes * multiplier < available
    end

    # Collect all paths from DB where preservation has been done
    #
    # Only directories that still exist in the ingest path are returned.
    # Records that carry no path are ignored.
    #
    # @return [Array<String>]
    def self.get_preserved
      ingest_complete = Preservation::Report::Transfer.status(status_to_find: 'COMPLETE',
                                                              status_presence: true)
      preserved = []
      ingest_complete.each do |record|
        path = record['path']
        # Without a path there is no directory to point at; building one
        # anyway would end up addressing the ingest path itself.
        next if path.nil? || path.empty?

        dir_path = "#{Preservation.ingest_path}/#{path}"
        preserved << dir_path if File.exist?(dir_path)
      end

      preserved
    end

  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Preservation

  # Temporal
  #
  module Temporal

    # time_to_preserve?
    #
    # Compares the number of whole days between the start date and now with
    # the requested delay. Fractions of a day are dropped before comparing.
    #
    # @param start_utc [String] date/time in a format DateTime.parse accepts
    # @param delay [Integer] days to wait (after start date) before preserving
    # @return [Boolean] true once at least delay whole days have passed
    def self.time_to_preserve?(start_utc, delay)
      now = DateTime.now
      start_datetime = DateTime.parse(start_utc)
      # subtracting two DateTime values gives the difference in days
      days_since_start = (now - start_datetime).to_i
      days_since_start >= delay
    end

  end

end
|
@@ -0,0 +1,215 @@
|
|
1
|
+
module Preservation
|
2
|
+
|
3
|
+
# Transfer preparation
|
4
|
+
#
|
5
|
+
module Transfer
|
6
|
+
|
7
|
+
# Transfer preparation for Pure
|
8
|
+
#
|
9
|
+
class Pure < Ingest
|
10
|
+
|
11
|
+
# @param base_url [String]
|
12
|
+
# @param username [String]
|
13
|
+
# @param password [String]
|
14
|
+
# @param basic_auth [Boolean]
|
15
|
+
def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
  # Runs the parent initializer without arguments, then records the
  # connection settings.
  super()
  @base_url, @basic_auth = base_url, basic_auth

  # Credentials are only kept when basic authentication is switched on.
  return unless basic_auth === true

  @username = username
  @password = password
end
|
24
|
+
|
25
|
+
# For given uuid, if necessary, fetch the metadata,
|
26
|
+
# prepare a directory in the ingest path and populate it with the files and
|
27
|
+
# JSON description file.
|
28
|
+
#
|
29
|
+
# @param uuid [String] uuid to preserve
|
30
|
+
# @param dir_scheme [Symbol] how to make directory name
|
31
|
+
# @param delay [Integer] days to wait (after modification date) before preserving
|
32
|
+
def prepare_dataset(uuid: nil,
                    dir_scheme: :uuid,
                    delay: 0)
  # Fetches the dataset metadata and, when the dataset qualifies, creates
  # <ingest path>/<directory name>, downloads each file into it and writes
  # metadata/metadata.json describing the files.
  #
  # Logs an error and exits when uuid is missing or no metadata is found.
  # Logs and returns without touching the filesystem when the dataset does
  # not qualify, when disk space is insufficient, or when the metadata file
  # already exists.
  if uuid.nil?
    @logger.error 'Missing uuid'
    exit
  end
  dir_base_path = Preservation.ingest_path

  dataset = Puree::Dataset.new base_url: @base_url,
                               username: @username,
                               password: @password,
                               basic_auth: @basic_auth

  dataset.find uuid: uuid
  d = dataset.metadata
  if d.empty?
    @logger.error 'No metadata for ' + uuid
    exit
  end
  # configurable to become more human-readable
  dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)

  # continue only if dir_name is not empty (e.g. because there was no DOI)
  # continue only if there is no DB entry
  # continue only if the dataset has a DOI
  # continue only if there are files for this resource
  # continue only if it is time to preserve
  eligible = !dir_name.nil? &&
             !dir_name.empty? &&
             !Preservation::Report::Transfer.in_db?(dir_name) &&
             !d['doi'].empty? &&
             !d['file'].empty? &&
             Preservation::Temporal.time_to_preserve?(d['modified'], delay)
  unless eligible
    # dir_name may be nil here, hence interpolation instead of concatenation
    @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}"
    return
  end

  dir_file_path = dir_base_path + '/' + dir_name
  dir_metadata_path = dir_file_path + '/metadata/'
  metadata_filename = dir_metadata_path + 'metadata.json'

  # calculate total size of data files
  download_storage_required = d['file'].inject(0) { |total, i| total + i['size'].to_i }

  # do we have enough space in filesystem to fetch data files?
  unless Preservation::Storage.enough_storage_for_download? download_storage_required
    @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
    return
  end

  # has metadata file been created? if so, files and metadata are in place
  # continue only if files not present in ingest location
  if File.size? metadata_filename
    @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid'] +
                 ' because ' + metadata_filename + ' exists'
    return
  end

  @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']

  data = []
  d['file'].each do |f|
    data << package_dataset_metadata(d, f)
    wget_str = Preservation::Builder.build_wget @username,
                                                @password,
                                                f['url']

    Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

    # fetch the file
    Dir.chdir(dir_file_path) do
      # a non-empty file of the same name is removed before fetching
      File.delete(f['name']) if File.size?(f['name'])
      # NOTE(review): the command string is handed to the shell as is. It is
      # built elsewhere from the credentials and the file URL - confirm that
      # those values are escaped where the string is built.
      `#{wget_str}`
    end
  end

  Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)

  pretty = JSON.pretty_generate(data, :indent => ' ')
  File.write(metadata_filename, pretty)
  @logger.info 'Created ' + metadata_filename
end
|
126
|
+
|
127
|
+
private
|
128
|
+
|
129
|
+
def package_dataset_metadata(d, f)
  # Builds the description of one file of a dataset.
  #
  # d is the dataset metadata, f the metadata of the file being described.
  # Returns a Hash keyed by the field names written to the JSON description.
  # Optional fields are left out when their value is nil or empty.
  #
  # One publication lookup is made for every entry in d['publication'].

  # nil counts as empty, so an absent optional field does not raise
  present = ->(value) { !(value.nil? || value.empty?) }
  # "Last, First"
  full_name = ->(person) { person['name']['last'] + ', ' + person['name']['first'] }

  o = {}
  o['filename'] = 'objects/' + f['name']
  o['dc.title'] = d['title']
  o['dc.description'] = d['description'] if present.call(d['description'])
  o['dcterms.created'] = d['created']

  available = d['available'] || {}
  o['dcterms.available'] = Puree::Date.iso(available) if present.call(available['year'])

  o['dc.publisher'] = d['publisher']
  o['dc.identifier'] = d['doi'] if present.call(d['doi'])
  o['dcterms.spatial'] = d['spatial'] if present.call(d['spatial'])

  # temporal coverage is written as start, or start/end when an end is known
  temporal = d['temporal'] || {}
  temporal_start = temporal['start'] || {}
  temporal_end = temporal['end'] || {}
  if present.call(temporal_start['year'])
    temporal_range = [Puree::Date.iso(temporal_start)]
    temporal_range << Puree::Date.iso(temporal_end) if present.call(temporal_end['year'])
    o['dcterms.temporal'] = temporal_range.join('/')
  end

  # people are listed in the order internal, external, other
  person_groups = d['person'] || {}
  people = %w(internal external other).flat_map { |person_type| Array(person_groups[person_type]) }
  creators = people.select { |i| i['role'] == 'Creator' }.map(&full_name)
  contributors = people.select { |i| i['role'] == 'Contributor' }.map(&full_name)
  o['dc.creator'] = creators
  o['dc.contributor'] = contributors unless contributors.empty?

  keywords = Array(d['keyword']).dup
  o['dc.subject'] = keywords unless keywords.empty?

  license_name = (f['license'] || {})['name']
  o['dcterms.license'] = license_name if present.call(license_name)
  # o['dc.format'] = f['mime']

  related = Array(d['publication']).map do |i|
    o_related = {}
    o_related['dc.title'] = i['title']
    o_related['type'] = i['type']
    pub = Puree::Publication.new base_url: @base_url,
                                 username: @username,
                                 password: @password,
                                 basic_auth: @basic_auth
    pub.find uuid: i['uuid']
    doi = pub.doi
    o_related['dc.identifier'] = doi if doi
    o_related
  end
  o['related'] = related unless related.empty?

  o
end
|
210
|
+
|
211
|
+
end
|
212
|
+
|
213
|
+
end
|
214
|
+
|
215
|
+
end
|