preservation 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ module Preservation
2
+
3
+ # Reporting
4
+ #
5
+ module Report
6
+
7
+ # Database
8
+ #
9
+ module Database
10
+
11
# Database connection
#
# The connection is created on first use and memoised; every later call
# returns that same connection, whatever db_path is passed.
#
# @param db_path [String] location of the SQLite database file
# @return [SQLite3::Database]
def self.db_connection(db_path)
  # A missing path is a fatal configuration error: report it on stderr and
  # terminate with a failure status instead of a successful one.
  abort 'Missing db_path' if db_path.nil?

  @db ||= SQLite3::Database.new db_path
end
21
+
22
+ end
23
+
24
+ end
25
+
26
+ end
@@ -0,0 +1,166 @@
1
+ module Preservation
2
+
3
+ module Report
4
+
5
+ # Transfer reporting
6
+ #
7
+ module Transfer
8
+
9
+ # Transfers based on presence (or not) of a particular status
10
+ #
11
+ # @param status_to_find [String]
12
+ # @param status_presence [Boolean]
13
+ def self.status(status_to_find: nil, status_presence: true)
14
+ if status_presence === true
15
+ status_presence = '='
16
+ else
17
+ status_presence = '<>'
18
+ end
19
+
20
+ query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
21
+
22
+ # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
23
+ # and use hex function in DB query
24
+ records = []
25
+ db.results_as_hash = true
26
+ db.execute( query, [ status_to_find ] ) do |row|
27
+ id = row['id']
28
+ uuid = row['uuid']
29
+ bin_path = Preservation::Conversion.hex_to_bin row['hex_path']
30
+ unit_type = row['unit_type']
31
+ status = row['status']
32
+ microservice = row['microservice']
33
+ current = row['current']
34
+ o = {}
35
+ o['path'] = bin_path if !bin_path.empty?
36
+ o['unit_type'] = unit_type if !unit_type.empty?
37
+ o['status'] = status if !status.empty?
38
+ o['microservice'] = microservice if !microservice.empty?
39
+ o['current'] = current if current
40
+ o['id'] = id if id
41
+ o['uuid'] = uuid if !uuid.empty?
42
+
43
+ records << o
44
+ end
45
+
46
+ records
47
+ end
48
+
49
# Current transfer
#
# Builds one Hash from the rows flagged current = 1. Keys are 'path',
# 'unit_type', 'status', 'microservice', 'current', 'id' and 'uuid';
# NULL or empty columns are left out. If several rows are flagged, values
# from later rows overwrite earlier ones.
#
# @return [Hash] empty when no row is flagged as current
def self.current
  query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"

  # Archivematica stores path as BLOB, so it is selected through hex() and
  # converted back to a binary string here.
  connection = db
  connection.results_as_hash = true

  transfer = {}
  connection.execute(query) do |row|
    # Fully qualified: this module has no hex_to_bin of its own
    path = Preservation::Conversion.hex_to_bin row['hex_path']
    transfer['path'] = path unless path.to_s.empty?

    # to_s makes the emptiness check safe for NULL (nil) columns
    %w(unit_type status microservice).each do |column|
      transfer[column] = row[column] unless row[column].to_s.empty?
    end

    transfer['current'] = row['current'] if row['current']
    transfer['id'] = row['id'] if row['id']
    transfer['uuid'] = row['uuid'] unless row['uuid'].to_s.empty?
  end
  transfer
end
77
+
78
# Count of complete transfers
#
# Counts the rows of the unit table whose status is 'COMPLETE'.
#
# @return [Integer]
def self.complete_count
  connection = db
  connection.results_as_hash = true
  connection.get_first_value('SELECT count(*) FROM unit WHERE status = ?', ['COMPLETE'])
end
88
+
89
# Compilation of statistics and data, with focus on exceptions
#
# The report has the keys 'failed', 'incomplete' and 'complete', each a Hash
# with a 'count' ('complete' only when a count was obtained); 'failed' and
# 'incomplete' also carry 'data' when there are records. 'current' is present
# only when there is a current transfer.
#
# @return [Hash]
def self.exception
  incomplete = status(status_to_find: 'COMPLETE', status_presence: false)
  failed = status(status_to_find: 'FAILED', status_presence: true)
  # Each of these runs a database query, so fetch them once and reuse
  current_transfer = current
  completed = complete_count

  report = {}
  report['current'] = current_transfer unless current_transfer.empty?

  report['failed'] = { 'count' => failed.count }
  report['failed']['data'] = failed unless failed.empty?

  report['incomplete'] = { 'count' => incomplete.count }
  report['incomplete']['data'] = incomplete unless incomplete.empty?

  report['complete'] = {}
  report['complete']['count'] = completed if completed

  report
end
107
+
108
# Is it in database?
#
# Compares path_to_find with the path of every row in the unit table.
#
# @param path_to_find [String] directory name within ingest path
# @return [Boolean]
def self.in_db?(path_to_find)
  # Archivematica stores path as BLOB, so it is selected through hex() and
  # converted back to a binary string for the comparison.
  query = 'SELECT hex(path) AS hex_path FROM unit'

  found = false
  db.execute(query) do |row|
    # The shared connection may have been switched to hash rows by another
    # method of this module, so accept both row shapes.
    hex_path = row.is_a?(Hash) ? row['hex_path'] : row[0]
    found = true if Preservation::Conversion.hex_to_bin(hex_path) == path_to_find
  end

  found
end
128
+
129
# Has preservation been done?
#
# A row with 'ingest' in unit_type and 'COMPLETE' in status indicates a
# completed ingest; path_to_find is compared with the path of each such row.
#
# @param path_to_find [String] directory name within ingest path
# @return [Boolean]
def self.preserved?(path_to_find)
  # Archivematica stores path as BLOB, so it is selected through hex() and
  # converted back to a binary string for the comparison.
  query = 'SELECT hex(path) AS hex_path FROM unit WHERE unit_type = ? AND status = ?'

  done = false
  db.execute(query, ['ingest', 'COMPLETE']) do |row|
    # The shared connection may have been switched to hash rows by another
    # method of this module, so accept both row shapes.
    hex_path = row.is_a?(Hash) ? row['hex_path'] : row[0]
    done = true if Preservation::Conversion.hex_to_bin(hex_path) == path_to_find
  end

  done
end
154
+
155
# Db
#
# Connection for the database located at Preservation.db_path.
#
# @return [SQLite3::Database]
def self.db
  database_file = Preservation.db_path
  Preservation::Report::Database.db_connection(database_file)
end
161
+
162
+ end
163
+
164
+ end
165
+
166
+ end
@@ -0,0 +1,50 @@
1
+ module Preservation
2
+
3
+ # Storage
4
+ #
5
+ module Storage
6
+
7
# Free up disk space for completed transfers
#
# Removes every directory reported by get_preserved whose group id matches
# the effective group id of the running process; all others are left alone.
#
# NOTE(review): the previous comment spoke of the directory's *owner*, while
# the check is on the *group* (File::Stat#grpowned?) - confirm which is meant.
def self.cleanup
  candidates = get_preserved
  return if candidates.nil? || candidates.empty?

  candidates.each do |dir|
    next unless File.stat(dir).grpowned?

    FileUtils.remove_dir dir
    # @logger.info 'Deleted ' + dir
  end
end
21
+
22
# Enough storage for download?
#
# The requirement is doubled before it is compared with the free space
# reported for '/', and the scaled figure must be strictly smaller.
#
# @param required_bytes [Integer] total size of the files to fetch
# @return [Boolean]
def self.enough_storage_for_download?(required_bytes)
  safety_factor = 2
  free_bytes = FreeDiskSpace.bytes('/')
  if required_bytes * safety_factor < free_bytes
    true
  else
    false
  end
end
31
+
32
# Collect all paths from DB where preservation has been done
#
# Takes the transfers with status 'COMPLETE' and keeps those whose directory
# still exists under Preservation.ingest_path.
#
# @return [Array<String>]
def self.get_preserved
  ingest_complete = Preservation::Report::Transfer.status(status_to_find: 'COMPLETE',
                                                          status_presence: true)
  ingest_complete.each_with_object([]) do |record, preserved|
    path = record['path']
    # A record without a path cannot be mapped to a directory; an empty one
    # would resolve to the ingest path itself, which must never be listed.
    next if path.nil? || path.empty?

    dir_path = Preservation.ingest_path + '/' + path
    # File.exist? - the exists? alias is gone from current Ruby versions
    preserved << dir_path if File.exist?(dir_path)
  end
end
47
+
48
+ end
49
+
50
+ end
@@ -0,0 +1,21 @@
1
+ module Preservation
2
+
3
+ # Temporal
4
+ #
5
+ module Temporal
6
+
7
# time_to_preserve?
#
# True once the number of whole days elapsed since start_utc has reached
# the delay.
#
# @param start_utc [String]
# @param delay [Integer] days to wait (after start date) before preserving
# @return [Boolean]
def self.time_to_preserve?(start_utc, delay)
  current_moment = DateTime.now
  elapsed = current_moment - DateTime.parse(start_utc)
  # Subtracting two DateTime values gives days; to_i drops the fraction
  whole_days = elapsed.to_i
  if whole_days >= delay
    true
  else
    false
  end
end
18
+
19
+ end
20
+
21
+ end
@@ -0,0 +1,215 @@
1
+ module Preservation
2
+
3
+ # Transfer preparation
4
+ #
5
+ module Transfer
6
+
7
+ # Transfer preparation for Pure
8
+ #
9
+ class Pure < Ingest
10
+
11
# Stores the connection settings used for requests to Pure. The username and
# password are kept only when basic_auth is true.
#
# @param base_url [String]
# @param username [String]
# @param password [String]
# @param basic_auth [Boolean]
def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
  super()
  @base_url = base_url
  @basic_auth = basic_auth
  return unless basic_auth === true

  @username = username
  @password = password
end
24
+
25
# For given uuid, if necessary, fetch the metadata,
# prepare a directory in the ingest path and populate it with the files and
# JSON description file.
#
# The process exits with a failure status when uuid is missing or Pure
# returns no metadata. Nothing is fetched when the directory name cannot be
# built, the directory name is already in the database, the dataset has no
# DOI or no files, the delay has not elapsed, there is not enough free disk
# space, or the metadata file already exists.
#
# @param uuid [String] uuid to preserve
# @param dir_scheme [Symbol] how to make directory name
# @param delay [Integer] days to wait (after modification date) before preserving
def prepare_dataset(uuid: nil,
                    dir_scheme: :uuid,
                    delay: 0)
  if uuid.nil?
    # uuid is nil here, so it cannot be concatenated into the message
    @logger.error 'Missing uuid'
    exit 1
  end
  dir_base_path = Preservation.ingest_path

  dataset = Puree::Dataset.new base_url: @base_url,
                               username: @username,
                               password: @password,
                               basic_auth: @basic_auth

  dataset.find uuid: uuid
  d = dataset.metadata
  if d.empty?
    @logger.error 'No metadata for ' + uuid
    exit 1
  end
  # configurable to become more human-readable
  dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)

  # continue only if dir_name is not empty (e.g. because there was no DOI)
  # continue only if there is no DB entry
  # continue only if the dataset has a DOI
  # continue only if there are files for this resource
  # continue only if it is time to preserve
  ready = !dir_name.nil? &&
          !dir_name.empty? &&
          !Preservation::Report::Transfer.in_db?(dir_name) &&
          !d['doi'].empty? &&
          !d['file'].empty? &&
          Preservation::Temporal.time_to_preserve?(d['modified'], delay)
  unless ready
    # interpolation, because dir_name may be nil on this path
    return @logger.info("Skipping #{dir_name}, Pure UUID #{d['uuid']}")
  end

  dir_file_path = dir_base_path + '/' + dir_name
  dir_metadata_path = dir_file_path + '/metadata/'
  metadata_filename = dir_metadata_path + 'metadata.json'

  # calculate total size of data files
  download_storage_required = d['file'].inject(0) { |total, i| total + i['size'].to_i }

  # do we have enough space in filesystem to fetch data files?
  unless Preservation::Storage.enough_storage_for_download? download_storage_required
    @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
    # actually skip, as the message says, instead of downloading anyway
    return
  end

  # has metadata file been created? if so, files and metadata are in place
  # continue only if files not present in ingest location
  if File.size? metadata_filename
    return @logger.info('Skipping ' + dir_name + ', Pure UUID ' + d['uuid'] +
                        ' because ' + metadata_filename + ' exists')
  end

  @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']

  data = []
  d['file'].each do |f|
    data << package_dataset_metadata(d, f)
    wget_str = Preservation::Builder.build_wget @username,
                                                @password,
                                                f['url']

    Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

    # fetch the file
    Dir.chdir(dir_file_path) do
      # remove a previous, possibly partial, download before fetching again
      File.delete(f['name']) if File.size?(f['name'])
      # NOTE(review): the command string is run through a shell; presumably
      # build_wget quotes the credentials and URL - confirm.
      `#{wget_str}`
    end
  end

  Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)

  pretty = JSON.pretty_generate(data, :indent => ' ')
  File.write(metadata_filename, pretty)
  @logger.info 'Created ' + metadata_filename
end
126
+
127
+ private
128
+
129
# Builds the metadata entry for one file of a dataset.
#
# Title, creation date, publisher and creators are always present; the other
# fields are added only when the dataset (or file) has a value for them.
# Each related publication is looked up in Pure to obtain its DOI.
#
# @param d [Hash] dataset metadata
# @param f [Hash] metadata of one file belonging to the dataset
# @return [Hash]
def package_dataset_metadata(d, f)
  o = { 'filename' => 'objects/' + f['name'], 'dc.title' => d['title'] }
  o['dc.description'] = d['description'] unless d['description'].empty?
  o['dcterms.created'] = d['created']
  o['dcterms.available'] = Puree::Date.iso(d['available']) unless d['available']['year'].empty?
  o['dc.publisher'] = d['publisher']
  o['dc.identifier'] = d['doi'] unless d['doi'].empty?
  o['dcterms.spatial'] = d['spatial'] unless d['spatial'].empty?

  # temporal coverage: start date, optionally followed by '/' and end date
  period = d['temporal']
  unless period['start']['year'].empty?
    bounds = [Puree::Date.iso(period['start'])]
    bounds << Puree::Date.iso(period['end']) unless period['end']['year'].empty?
    o['dcterms.temporal'] = bounds.join('/')
  end

  # people, formatted as 'Last, First' and split by role
  full_name = ->(person) { person['name']['last'] + ', ' + person['name']['first'] }
  creators = []
  contributors = []
  %w(internal external other).each do |person_type|
    d['person'][person_type].each do |person|
      case person['role']
      when 'Creator' then creators << full_name.call(person)
      when 'Contributor' then contributors << full_name.call(person)
      end
    end
  end
  o['dc.creator'] = creators
  o['dc.contributor'] = contributors unless contributors.empty?

  keywords = d['keyword'].each_with_object([]) { |keyword, list| list << keyword }
  o['dc.subject'] = keywords unless keywords.empty?

  o['dcterms.license'] = f['license']['name'] unless f['license']['name'].empty?
  # o['dc.format'] = f['mime']

  # NOTE(review): every publication is fetched from Pure on each call, i.e.
  # once per file of the dataset.
  related = d['publication'].map do |publication|
    entry = { 'dc.title' => publication['title'], 'type' => publication['type'] }
    pub = Puree::Publication.new base_url: @base_url,
                                 username: @username,
                                 password: @password,
                                 basic_auth: @basic_auth
    pub.find uuid: publication['uuid']
    doi = pub.doi
    entry['dc.identifier'] = doi if doi
    entry
  end
  o['related'] = related unless related.empty?

  o
end
210
+
211
+ end
212
+
213
+ end
214
+
215
+ end