preservation 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
1
module Preservation

  # Reporting
  #
  module Report

    # Database
    #
    module Database

      # Database connection
      #
      # Connections are cached per path, so asking for the same path twice
      # returns the same object while a different path gets its own
      # connection (a single memoized connection would silently ignore the
      # new path).
      #
      # When db_path is nil the process is terminated through Kernel#abort:
      # the message goes to stderr and the exit status is non-zero, so a
      # calling script can tell the run failed. SystemExit is still what
      # propagates.
      #
      # @param db_path [String] location of the SQLite database file
      # @return [SQLite3::Database]
      def self.db_connection(db_path)
        abort 'Missing db_path' if db_path.nil?

        @connections ||= {}
        @connections[db_path] ||= SQLite3::Database.new(db_path)
      end

    end

  end

end
@@ -0,0 +1,166 @@
1
module Preservation

  module Report

    # Transfer reporting
    #
    # Queries against the +unit+ table of the database returned by {.db}.
    module Transfer

      # Columns fetched for every transfer record.
      # Archivematica stores path as BLOB, so it is selected through hex()
      # and converted back with Preservation::Conversion.hex_to_bin.
      UNIT_COLUMNS = 'id, uuid, hex(path) as hex_path, unit_type, status, microservice, current'.freeze

      # Transfers based on presence (or not) of a particular status
      #
      # @param status_to_find [String]
      # @param status_presence [Boolean] true selects rows having the status,
      #   any other value selects rows that do not have it
      # @return [Array<Hash>] one record per matching row, see {.row_to_record}
      def self.status(status_to_find: nil, status_presence: true)
        operator = status_presence == true ? '=' : '<>'
        query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE status #{operator} ?"

        connection = db
        # rows are read by column name; the flag stays set on the connection
        connection.results_as_hash = true
        connection.execute(query, [status_to_find]).map { |row| row_to_record(row) }
      end

      # Current transfer
      #
      # If several rows are flagged as current, values from later rows
      # overwrite those from earlier ones.
      #
      # @return [Hash] empty when no row has current = 1
      def self.current
        query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE current = 1"

        connection = db
        connection.results_as_hash = true
        connection.execute(query).each_with_object({}) do |row, record|
          record.merge!(row_to_record(row))
        end
      end

      # Count of complete transfers
      #
      # @return [Integer]
      def self.complete_count
        db.get_first_value('SELECT count(*) FROM unit WHERE status = ?', ['COMPLETE'])
      end

      # Compilation of statistics and data, with focus on exceptions
      #
      # Every underlying query is executed exactly once.
      #
      # @return [Hash]
      def self.exception
        incomplete = status(status_to_find: 'COMPLETE', status_presence: false)
        failed = status(status_to_find: 'FAILED', status_presence: true)
        current_transfer = current
        completed = complete_count

        report = {}
        report['current'] = current_transfer unless current_transfer.empty?
        report['failed'] = { 'count' => failed.count }
        report['failed']['data'] = failed unless failed.empty?
        report['incomplete'] = { 'count' => incomplete.count }
        report['incomplete']['data'] = incomplete unless incomplete.empty?
        report['complete'] = {}
        report['complete']['count'] = completed if completed
        report
      end

      # Is it in database?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.in_db?(path_to_find)
        path_stored?('SELECT hex(path) AS hex_path FROM unit', [], path_to_find)
      end

      # Has preservation been done?
      # @param path_to_find [String] directory name within ingest path
      # @return [Boolean]
      def self.preserved?(path_to_find)
        # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
        # indicates completed
        query = 'SELECT hex(path) AS hex_path FROM unit WHERE unit_type = ? AND status = ?'
        path_stored?(query, %w(ingest COMPLETE), path_to_find)
      end

      # Db
      #
      # @return [SQLite3::Database]
      def self.db
        Preservation::Report::Database.db_connection Preservation.db_path
      end

      # Build the record Hash for one row of a UNIT_COLUMNS query.
      # NULL and empty values are left out instead of raising.
      #
      # @param row [Hash] row keyed by column name
      # @return [Hash]
      def self.row_to_record(row)
        path = Preservation::Conversion.hex_to_bin(row['hex_path'])

        record = {}
        record['path'] = path unless blank?(path)
        record['unit_type'] = row['unit_type'] unless blank?(row['unit_type'])
        record['status'] = row['status'] unless blank?(row['status'])
        record['microservice'] = row['microservice'] unless blank?(row['microservice'])
        record['current'] = row['current'] if row['current']
        record['id'] = row['id'] if row['id']
        record['uuid'] = row['uuid'] unless blank?(row['uuid'])
        record
      end

      # Does any row returned by the query hold the given path?
      #
      # The query must expose the hex encoded path as its first column,
      # named hex_path. Rows are Hashes once results_as_hash has been
      # switched on for the shared connection (status and current do that)
      # and Arrays otherwise, so both shapes are read.
      # NOTE(review): whether hash rows also answer to integer keys depends
      # on the sqlite3 gem version - confirm against the pinned version.
      #
      # @param query [String]
      # @param bind_vars [Array]
      # @param path_to_find [String]
      # @return [Boolean]
      def self.path_stored?(query, bind_vars, path_to_find)
        db.execute(query, bind_vars).any? do |row|
          hex_path = row.is_a?(Hash) ? row['hex_path'] : row[0]
          Preservation::Conversion.hex_to_bin(hex_path) == path_to_find
        end
      end

      # @param value [Object]
      # @return [Boolean] true for nil and for anything reporting itself empty
      def self.blank?(value)
        value.nil? || (value.respond_to?(:empty?) && value.empty?)
      end

      private_class_method :row_to_record, :path_stored?, :blank?

    end

  end

end
@@ -0,0 +1,50 @@
1
module Preservation

  # Storage
  #
  module Storage

    # Free up disk space for completed transfers
    #
    # Removes every directory reported by {.get_preserved} that passes the
    # ownership check.
    def self.cleanup
      get_preserved.each do |dir_path|
        # skip anything that has a different owner to script
        # NOTE(review): grpowned? compares the group of the directory with
        # the effective group of the process; File::Stat#owned? would compare
        # the user - confirm which of the two is intended.
        next unless File.stat(dir_path).grpowned?

        FileUtils.remove_dir dir_path
        # @logger.info 'Deleted ' + dir_path
      end
    end

    # Enough storage for download?
    #
    # @param required_bytes [Integer] size of the data to be fetched
    # @param multiplier [Numeric] safety factor applied to required_bytes
    # @param path [String] path handed to FreeDiskSpace.bytes
    # @return [Boolean]
    def self.enough_storage_for_download?(required_bytes, multiplier: 2, path: '/')
      # scale up the required space using a multiplier
      required_bytes * multiplier < FreeDiskSpace.bytes(path)
    end

    # Collect all paths from DB where preservation has been done
    #
    # Records without a path are ignored: joining an empty name to the
    # ingest path would yield the ingest directory itself, which must never
    # be handed to {.cleanup}.
    #
    # @return [Array<String>] directories that exist below the ingest path
    def self.get_preserved
      ingest_complete = Preservation::Report::Transfer.status(status_to_find: 'COMPLETE',
                                                              status_presence: true)
      ingest_complete.each_with_object([]) do |record, preserved|
        dir_name = record['path']
        next if dir_name.nil? || dir_name.empty?

        dir_path = "#{Preservation.ingest_path}/#{dir_name}"
        preserved << dir_path if File.exist?(dir_path)
      end
    end

  end

end
@@ -0,0 +1,21 @@
1
module Preservation

  # Temporal
  #
  module Temporal

    # time_to_preserve?
    #
    # Compares the number of whole days elapsed since start_utc with delay.
    #
    # @param start_utc [String] date/time in a format DateTime.parse accepts
    # @param delay [Integer] days to wait (after start date) before preserving
    # @param now [DateTime] moment to measure from, defaults to the current time
    # @return [Boolean]
    def self.time_to_preserve?(start_utc, delay, now: DateTime.now)
      start_datetime = DateTime.parse(start_utc)
      days_since_start = (now - start_datetime).to_i # result in days
      days_since_start >= delay
    end

  end

end
@@ -0,0 +1,215 @@
1
+ module Preservation
2
+
3
+ # Transfer preparation
4
+ #
5
+ module Transfer
6
+
7
+ # Transfer preparation for Pure
8
+ #
9
+ class Pure < Ingest
10
+
11
# Credentials are only retained when basic authentication is explicitly
# switched on with +true+.
#
# @param base_url [String]
# @param username [String]
# @param password [String]
# @param basic_auth [Boolean]
def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
  super()
  @base_url = base_url
  @basic_auth = basic_auth
  return unless basic_auth == true

  @username = username
  @password = password
end
24
+
25
# For given uuid, if necessary, fetch the metadata,
# prepare a directory in the ingest path and populate it with the files and
# JSON description file.
#
# The dataset is skipped (with a log entry) when no directory name can be
# built, when it is already in the database, when it has no DOI or no files,
# when it is not yet time to preserve it, when its metadata file already
# exists, or when there is not enough disk space for its files.
# The process exits when uuid is missing or no metadata is found.
#
# @param uuid [String] uuid to preserve
# @param dir_scheme [Symbol] how to make directory name
# @param delay [Integer] days to wait (after modification date) before preserving
def prepare_dataset(uuid: nil,
                    dir_scheme: :uuid,
                    delay: 0)
  if uuid.nil?
    @logger.error 'Missing uuid'
    exit
  end
  dir_base_path = Preservation.ingest_path

  dataset = Puree::Dataset.new base_url: @base_url,
                               username: @username,
                               password: @password,
                               basic_auth: @basic_auth

  dataset.find uuid: uuid
  d = dataset.metadata
  if d.empty?
    @logger.error 'No metadata for ' + uuid
    exit
  end
  # configurable to become more human-readable
  dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)

  # continue only if dir_name is not empty (e.g. because there was no DOI)
  # continue only if there is no DB entry
  # continue only if the dataset has a DOI
  # continue only if there are files for this resource
  # continue only if it is time to preserve
  ready = !dir_name.nil? &&
          !dir_name.empty? &&
          !Preservation::Report::Transfer.in_db?(dir_name) &&
          !d['doi'].to_s.empty? &&
          !Array(d['file']).empty? &&
          Preservation::Temporal.time_to_preserve?(d['modified'], delay)
  unless ready
    # interpolation keeps this safe when dir_name is nil
    @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}"
    return
  end

  dir_file_path = dir_base_path + '/' + dir_name
  dir_metadata_path = dir_file_path + '/metadata/'
  metadata_filename = dir_metadata_path + 'metadata.json'

  # has metadata file been created? if so, files and metadata are in place
  # continue only if files not present in ingest location
  if File.size? metadata_filename
    @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}" \
                 " because #{metadata_filename} exists"
    return
  end

  # calculate total size of data files
  download_storage_required = d['file'].inject(0) { |total, i| total + i['size'].to_i }

  # do we have enough space in filesystem to fetch data files?
  unless Preservation::Storage.enough_storage_for_download? download_storage_required
    @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
    return
  end

  @logger.info "Preparing #{dir_name}, Pure UUID #{d['uuid']}"

  Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

  data = d['file'].map do |f|
    o = package_dataset_metadata d, f
    wget_str = Preservation::Builder.build_wget @username,
                                                @password,
                                                f['url']

    # fetch the file
    Dir.chdir(dir_file_path) do
      # a non-empty file from an earlier run is removed before fetching again
      File.delete(f['name']) if File.size?(f['name'])
      # NOTE(review): the command string is run through the shell and holds
      # a URL taken from the fetched metadata; its exit status is not
      # checked, so a failed download still ends with metadata.json written.
      `#{wget_str}`
    end
    o
  end

  Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)

  pretty = JSON.pretty_generate(data, :indent => ' ')
  File.write(metadata_filename, pretty)
  @logger.info 'Created ' + metadata_filename
end
126
+
127
+ private
128
+
129
# Build the description of one file of a dataset.
#
# Optional values that are absent or empty are left out; dc.creator is
# always present, even when empty. One Puree::Publication lookup is made for
# every related publication to obtain its DOI.
#
# @param d [Hash] dataset metadata
# @param f [Hash] metadata of a single file of the dataset
# @return [Hash]
def package_dataset_metadata(d, f)
  o = {}
  o['filename'] = 'objects/' + f['name']
  o['dc.title'] = d['title']
  o['dc.description'] = d['description'] if metadata_value_present?(d['description'])
  o['dcterms.created'] = d['created']
  available = d['available']
  if available && metadata_value_present?(available['year'])
    o['dcterms.available'] = Puree::Date.iso(available)
  end
  o['dc.publisher'] = d['publisher']
  o['dc.identifier'] = d['doi'] if metadata_value_present?(d['doi'])
  o['dcterms.spatial'] = d['spatial'] if metadata_value_present?(d['spatial'])

  # start, or start/end when an end year is known
  temporal = d['temporal'] || {}
  range_start = temporal['start']
  range_end = temporal['end']
  if range_start && metadata_value_present?(range_start['year'])
    range = [Puree::Date.iso(range_start)]
    range << Puree::Date.iso(range_end) if range_end && metadata_value_present?(range_end['year'])
    o['dcterms.temporal'] = range.join('/')
  end

  # people keep the order internal, external, other
  person = d['person'] || {}
  people = %w(internal external other).flat_map { |person_type| Array(person[person_type]) }
  names_for = lambda do |role|
    people.select { |i| i['role'] == role }
          .map { |i| i['name']['last'] + ', ' + i['name']['first'] }
  end
  o['dc.creator'] = names_for.call('Creator')
  contributors = names_for.call('Contributor')
  o['dc.contributor'] = contributors unless contributors.empty?

  keywords = Array(d['keyword']).dup
  o['dc.subject'] = keywords unless keywords.empty?

  license = f['license']
  if license && metadata_value_present?(license['name'])
    o['dcterms.license'] = license['name']
  end
  # o['dc.format'] = f['mime']

  related = Array(d['publication']).map do |i|
    o_related = { 'dc.title' => i['title'], 'type' => i['type'] }
    pub = Puree::Publication.new base_url: @base_url,
                                 username: @username,
                                 password: @password,
                                 basic_auth: @basic_auth
    pub.find uuid: i['uuid']
    doi = pub.doi
    o_related['dc.identifier'] = doi if doi
    o_related
  end
  o['related'] = related unless related.empty?

  o
end

# @param value [Object]
# @return [Boolean] false for nil and for anything reporting itself empty
def metadata_value_present?(value)
  !(value.nil? || (value.respond_to?(:empty?) && value.empty?))
end
210
+
211
+ end
212
+
213
+ end
214
+
215
+ end