preservation 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +173 -14
- data/lib/preservation.rb +2 -2
- data/lib/preservation/builder.rb +5 -5
- data/lib/preservation/report/database.rb +1 -2
- data/lib/preservation/temporal.rb +3 -4
- data/lib/preservation/transfer/base.rb +42 -0
- data/lib/preservation/transfer/dataset.rb +258 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +4 -4
- metadata +10 -10
- data/lib/preservation/ingest.rb +0 -38
- data/lib/preservation/transfer/pure.rb +0 -259
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54db84bdb0bc782f05420b420200e78b9394a6af
|
4
|
+
data.tar.gz: a243b3e89cdf0fe830df9eea16639094d5854af1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51d73c2067b1d48c7ce8a5eff9659fd0cb0059e59850e6a0d80c9865a9080a9e718e839b0d83aa2bde790fbcdae18d2eb7f26480f7f72ae9a74084fc7f6975f1
|
7
|
+
data.tar.gz: b58bd774f4905d98fee7be08f4a99a1bca5fa3bd115f8136c7a524c92937757e1ddfea63308ed589c4aaa11174b73499d1b43d993fee660df95e1ab533998a6a
|
data/CHANGELOG.md
CHANGED
@@ -4,8 +4,17 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
|
5
5
|
## Unreleased
|
6
6
|
|
7
|
+
## 0.5.0 - 2017-05-23
|
8
|
+
### Changed
|
9
|
+
- Transfer - created as ISO8601 date format.
|
10
|
+
|
11
|
+
### Fixed
|
12
|
+
- Transfer - handling DOIs of related works for both datasets and publications.
|
13
|
+
- Transfer - handling missing DOIs of related works.
|
14
|
+
|
7
15
|
## 0.4.2 - 2017-05-18
|
8
16
|
### Fixed
|
17
|
+
- Transfer - presence check for DOI of a related work.
|
9
18
|
|
10
19
|
## 0.4.1 - 2016-09-30
|
11
20
|
### Fixed
|
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Preservation
|
2
2
|
|
3
|
-
Extraction
|
3
|
+
Extraction from the Pure Research Information System and transformation for
|
4
|
+
loading by Archivematica.
|
5
|
+
|
6
|
+
Includes transfer preparation, reporting and disk space management.
|
4
7
|
|
5
8
|
## Status
|
6
9
|
|
@@ -27,7 +30,9 @@ Or install it yourself as:
|
|
27
30
|
## Usage
|
28
31
|
|
29
32
|
### Configuration
|
30
|
-
|
33
|
+
|
34
|
+
Configure Preservation. If ```log_path``` is omitted, logging (standard library)
|
35
|
+
writes to STDOUT.
|
31
36
|
|
32
37
|
```ruby
|
33
38
|
Preservation.configure do |config|
|
@@ -37,24 +42,129 @@ Preservation.configure do |config|
|
|
37
42
|
end
|
38
43
|
```
|
39
44
|
|
45
|
+
Create a hash for passing to a transfer.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
# Pure host with authentication.
|
49
|
+
config = {
|
50
|
+
url: ENV['PURE_URL'],
|
51
|
+
username: ENV['PURE_USERNAME'],
|
52
|
+
password: ENV['PURE_PASSWORD']
|
53
|
+
}
|
54
|
+
```
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
# Pure host without authentication.
|
58
|
+
config = {
|
59
|
+
url: ENV['PURE_URL']
|
60
|
+
}
|
61
|
+
```
|
40
62
|
|
41
63
|
### Transfer
|
42
|
-
|
64
|
+
|
65
|
+
Configure a transfer to retrieve data from a Pure host.
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
transfer = Preservation::Transfer::Dataset.new config
|
69
|
+
```
|
70
|
+
|
71
|
+
#### Single
|
72
|
+
|
73
|
+
If necessary, fetch the metadata, prepare a directory in the ingest path and
|
74
|
+
populate it with the files and JSON description file.
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
transfer.prepare uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
78
|
+
```
|
79
|
+
|
80
|
+
#### Batch
|
81
|
+
|
82
|
+
For multiple Pure datasets, if necessary, fetch the metadata, prepare a
|
83
|
+
directory in the ingest path and populate it with the files and JSON description
|
84
|
+
file.
|
85
|
+
|
86
|
+
A maximum of 10 will be prepared using the doi_short directory naming scheme.
|
87
|
+
Each dataset will only be prepared if 20 days have elapsed since the metadata
|
88
|
+
record was last modified.
|
43
89
|
|
44
90
|
```ruby
|
45
|
-
transfer
|
46
|
-
|
47
|
-
|
48
|
-
basic_auth: true
|
91
|
+
transfer.prepare_batch max: 10,
|
92
|
+
dir_scheme: :doi_short,
|
93
|
+
delay: 20
|
49
94
|
```
|
50
95
|
|
51
|
-
|
52
|
-
|
96
|
+
#### Directory name
|
97
|
+
|
98
|
+
The following are permitted values for the dir_scheme parameter:
|
53
99
|
|
54
100
|
```ruby
|
55
|
-
|
101
|
+
:uuid_title
|
102
|
+
:title_uuid
|
103
|
+
:date_uuid_title
|
104
|
+
:date_title_uuid
|
105
|
+
:date_time_uuid
|
106
|
+
:date_time_title
|
107
|
+
:date_time_uuid_title
|
108
|
+
:date_time_title_uuid
|
109
|
+
:uuid
|
110
|
+
:doi
|
111
|
+
:doi_short
|
56
112
|
```
|
57
113
|
|
114
|
+
#### Load directory
|
115
|
+
|
116
|
+
A transfer-ready directory, with a name built according to the directory scheme
|
117
|
+
specified, in this case doi_short. This particular example has only one file
|
118
|
+
Ebola_data_Jun15.zip in the dataset.
|
119
|
+
```
|
120
|
+
.
|
121
|
+
├── 10.17635-lancaster-researchdata-6
|
122
|
+
│ ├── Ebola_data_Jun15.zip
|
123
|
+
│ └── metadata
|
124
|
+
│ └── metadata.json
|
125
|
+
```
|
126
|
+
|
127
|
+
metadata.json:
|
128
|
+
|
129
|
+
```json
|
130
|
+
[
|
131
|
+
{
|
132
|
+
"filename": "objects/Ebola_data_Jun15.zip",
|
133
|
+
"dc.title": "Ebolavirus evolution 2013-2015",
|
134
|
+
"dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
|
135
|
+
"dcterms.created": "2015-06-04",
|
136
|
+
"dcterms.available": "2015-06-04",
|
137
|
+
"dc.publisher": "Lancaster University",
|
138
|
+
"dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
|
139
|
+
"dcterms.spatial": [
|
140
|
+
"Guinea, Sierra Leone, Liberia"
|
141
|
+
],
|
142
|
+
"dc.creator": [
|
143
|
+
"Gatherer, Derek"
|
144
|
+
],
|
145
|
+
"dc.contributor": [
|
146
|
+
"Robertson, David",
|
147
|
+
"Lovell, Simon"
|
148
|
+
],
|
149
|
+
"dc.subject": [
|
150
|
+
"Ebolavirus",
|
151
|
+
"evolution",
|
152
|
+
"phylogenetics",
|
153
|
+
"virulence",
|
154
|
+
"Filoviridae",
|
155
|
+
"positive selection"
|
156
|
+
],
|
157
|
+
"dcterms.license": "CC BY",
|
158
|
+
"dc.relation": [
|
159
|
+
"http://dx.doi.org/10.1136/ebmed-2014-110127",
|
160
|
+
"http://dx.doi.org/10.1099/vir.0.067199-0"
|
161
|
+
]
|
162
|
+
}
|
163
|
+
]
|
164
|
+
```
|
165
|
+
|
166
|
+
### Storage
|
167
|
+
|
58
168
|
Free up disk space for completed transfers. Can be done at any time.
|
59
169
|
|
60
170
|
```ruby
|
@@ -62,13 +172,62 @@ Preservation::Storage.cleanup
|
|
62
172
|
```
|
63
173
|
|
64
174
|
### Report
|
175
|
+
|
65
176
|
Can be used for scheduled monitoring of transfers.
|
66
177
|
|
67
178
|
```ruby
|
68
179
|
Preservation::Report::Transfer.exception
|
69
180
|
```
|
70
181
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
182
|
+
Formatted as JSON:
|
183
|
+
|
184
|
+
```json
|
185
|
+
{
|
186
|
+
"pending": {
|
187
|
+
"count": 3,
|
188
|
+
"data": [
|
189
|
+
{
|
190
|
+
"path": "10.17635-lancaster-researchdata-72",
|
191
|
+
"path_timestamp": "2016-09-29 12:08:58 +0100"
|
192
|
+
},
|
193
|
+
{
|
194
|
+
"path": "10.17635-lancaster-researchdata-74",
|
195
|
+
"path_timestamp": "2016-09-29 12:08:59 +0100"
|
196
|
+
},
|
197
|
+
{
|
198
|
+
"path": "10.17635-lancaster-researchdata-75",
|
199
|
+
"path_timestamp": "2016-09-29 12:09:00 +0100"
|
200
|
+
}
|
201
|
+
]
|
202
|
+
},
|
203
|
+
"current": {
|
204
|
+
"path": "10.17635-lancaster-researchdata-90",
|
205
|
+
"unit_type": "ingest",
|
206
|
+
"status": "PROCESSING",
|
207
|
+
"current": 1,
|
208
|
+
"id": 91,
|
209
|
+
"uuid": "ebf048c3-0ca8-409c-94cf-ab3e5d97e901",
|
210
|
+
"path_timestamp": "2016-09-28 17:09:33 +0100"
|
211
|
+
},
|
212
|
+
"failed": {
|
213
|
+
"count": 0
|
214
|
+
},
|
215
|
+
"incomplete": {
|
216
|
+
"count": 1,
|
217
|
+
"data": [
|
218
|
+
{
|
219
|
+
"path": "10.17635-lancaster-researchdata-90",
|
220
|
+
"unit_type": "ingest",
|
221
|
+
"status": "PROCESSING",
|
222
|
+
"current": 1,
|
223
|
+
"id": 91,
|
224
|
+
"uuid": "ebf048c3-0ca8-409c-94cf-ab3e5d97e901",
|
225
|
+
"path_timestamp": "2016-09-28 17:09:33 +0100"
|
226
|
+
}
|
227
|
+
]
|
228
|
+
},
|
229
|
+
"complete": {
|
230
|
+
"count": 78
|
231
|
+
}
|
232
|
+
}
|
233
|
+
```
|
data/lib/preservation.rb
CHANGED
@@ -8,11 +8,11 @@ require 'preservation/configuration'
|
|
8
8
|
require 'preservation/report/database'
|
9
9
|
require 'preservation/report/transfer'
|
10
10
|
require 'preservation/conversion'
|
11
|
-
require 'preservation/ingest'
|
12
11
|
require 'preservation/builder'
|
13
12
|
require 'preservation/storage'
|
14
13
|
require 'preservation/temporal'
|
15
|
-
require 'preservation/transfer/pure'
|
14
|
+
require 'preservation/transfer/base'
|
15
|
+
require 'preservation/transfer/dataset'
|
16
16
|
require 'preservation/version'
|
17
17
|
|
18
18
|
# Top level namespace
|
data/lib/preservation/builder.rb
CHANGED
@@ -35,9 +35,9 @@ module Preservation
|
|
35
35
|
# @param directory_name_scheme [Symbol]
|
36
36
|
# @return [String]
|
37
37
|
def self.build_directory_name(metadata_record, directory_name_scheme)
|
38
|
-
doi = metadata_record[
|
39
|
-
uuid = metadata_record[
|
40
|
-
title = metadata_record[
|
38
|
+
doi = metadata_record[:doi]
|
39
|
+
uuid = metadata_record[:uuid]
|
40
|
+
title = metadata_record[:title].strip.gsub(' ', '-').gsub('/', '-')
|
41
41
|
time = Time.new
|
42
42
|
date = time.strftime("%Y-%m-%d")
|
43
43
|
time = time.strftime("%H:%M:%S")
|
@@ -63,12 +63,12 @@ module Preservation
|
|
63
63
|
when :uuid
|
64
64
|
uuid
|
65
65
|
when :doi
|
66
|
-
if doi.empty?
|
66
|
+
if doi.nil? || doi.empty?
|
67
67
|
return ''
|
68
68
|
end
|
69
69
|
doi.gsub('/', '-')
|
70
70
|
when :doi_short
|
71
|
-
if doi.empty?
|
71
|
+
if doi.nil? || doi.empty?
|
72
72
|
return ''
|
73
73
|
end
|
74
74
|
doi_short_to_remove = 'http://dx.doi.org/'
|
data/lib/preservation/temporal.rb
CHANGED
@@ -6,13 +6,12 @@ module Preservation
|
|
6
6
|
|
7
7
|
# time_to_preserve?
|
8
8
|
#
|
9
|
-
# @param start_utc [
|
9
|
+
# @param start_utc [Time]
|
10
10
|
# @param delay [Integer] days to wait (after start date) before preserving
|
11
11
|
# @return [Boolean]
|
12
12
|
def self.time_to_preserve?(start_utc, delay)
|
13
|
-
now =
|
14
|
-
|
15
|
-
days_since_start = (now - start_datetime).to_i # result in days
|
13
|
+
now = Time.now
|
14
|
+
days_since_start = (now - start_utc).to_i # result in days
|
16
15
|
days_since_start >= delay ? true : false
|
17
16
|
end
|
18
17
|
|
data/lib/preservation/transfer/base.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
module Preservation
|
2
|
+
|
3
|
+
module Transfer
|
4
|
+
|
5
|
+
# Transfer base
|
6
|
+
#
|
7
|
+
class Base
|
8
|
+
|
9
|
+
attr_reader :logger
|
10
|
+
|
11
|
+
def initialize
|
12
|
+
setup_logger
|
13
|
+
check_ingest_path
|
14
|
+
end
|
15
|
+
|
16
|
+
private
|
17
|
+
|
18
|
+
def check_ingest_path
|
19
|
+
if Preservation.ingest_path.nil?
|
20
|
+
@logger.error 'Missing ingest path'
|
21
|
+
exit
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def setup_logger
|
26
|
+
if @logger.nil?
|
27
|
+
if Preservation.log_path.nil?
|
28
|
+
@logger = Logger.new STDOUT
|
29
|
+
else
|
30
|
+
# Keep data for today and the past 20 days
|
31
|
+
@logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
@logger.level = Logger::INFO
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
data/lib/preservation/transfer/dataset.rb
ADDED
@@ -0,0 +1,258 @@
|
|
1
|
+
module Preservation
|
2
|
+
|
3
|
+
# Transfer preparation
|
4
|
+
#
|
5
|
+
module Transfer
|
6
|
+
|
7
|
+
# Transfer preparation for dataset
|
8
|
+
#
|
9
|
+
class Dataset < Preservation::Transfer::Base
|
10
|
+
|
11
|
+
# @param config [Hash]
|
12
|
+
def initialize(config)
|
13
|
+
super()
|
14
|
+
@config = config
|
15
|
+
end
|
16
|
+
|
17
|
+
# For given uuid, if necessary, fetch the metadata,
|
18
|
+
# prepare a directory in the ingest path and populate it with the files and
|
19
|
+
# JSON description file.
|
20
|
+
#
|
21
|
+
# @param uuid [String] uuid to preserve
|
22
|
+
# @param dir_scheme [Symbol] how to make directory name
|
23
|
+
# @param delay [Integer] days to wait (after modification date) before preserving
|
24
|
+
# @return [Boolean] indicates presence of metadata description file
|
25
|
+
def prepare(uuid: nil,
|
26
|
+
dir_scheme: :uuid,
|
27
|
+
delay: 0)
|
28
|
+
success = false
|
29
|
+
|
30
|
+
if uuid.nil?
|
31
|
+
@logger.error 'Missing ' + uuid
|
32
|
+
exit
|
33
|
+
end
|
34
|
+
dir_base_path = Preservation.ingest_path
|
35
|
+
|
36
|
+
dataset_extractor = Puree::Extractor::Dataset.new @config
|
37
|
+
d = dataset_extractor.find uuid: uuid
|
38
|
+
if !d
|
39
|
+
@logger.error 'No metadata for ' + uuid
|
40
|
+
exit
|
41
|
+
end
|
42
|
+
|
43
|
+
metadata_record = {
|
44
|
+
doi: d.doi,
|
45
|
+
uuid: d.uuid,
|
46
|
+
title: d.title
|
47
|
+
}
|
48
|
+
|
49
|
+
# configurable to become more human-readable
|
50
|
+
dir_name = Preservation::Builder.build_directory_name(metadata_record, dir_scheme)
|
51
|
+
|
52
|
+
# continue only if dir_name is not empty (e.g. because there was no DOI)
|
53
|
+
# continue only if there is no DB entry
|
54
|
+
# continue only if the dataset has a DOI
|
55
|
+
# continue only if there are files for this resource
|
56
|
+
# continue only if it is time to preserve
|
57
|
+
if !dir_name.nil? &&
|
58
|
+
!dir_name.empty? &&
|
59
|
+
!Preservation::Report::Transfer.in_db?(dir_name) &&
|
60
|
+
d.doi &&
|
61
|
+
!d.files.empty? &&
|
62
|
+
Preservation::Temporal.time_to_preserve?(d.modified, delay)
|
63
|
+
|
64
|
+
dir_file_path = dir_base_path + '/' + dir_name
|
65
|
+
dir_metadata_path = dir_file_path + '/metadata/'
|
66
|
+
metadata_filename = dir_metadata_path + 'metadata.json'
|
67
|
+
|
68
|
+
# calculate total size of data files
|
69
|
+
download_storage_required = 0
|
70
|
+
d.files.each { |i| download_storage_required += i.size.to_i }
|
71
|
+
|
72
|
+
# do we have enough space in filesystem to fetch data files?
|
73
|
+
if Preservation::Storage.enough_storage_for_download? download_storage_required
|
74
|
+
# @logger.info 'Sufficient disk space for ' + dir_file_path
|
75
|
+
else
|
76
|
+
@logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
|
77
|
+
end
|
78
|
+
|
79
|
+
# has metadata file been created? if so, files and metadata are in place
|
80
|
+
# continue only if files not present in ingest location
|
81
|
+
if !File.size? metadata_filename
|
82
|
+
|
83
|
+
@logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d.uuid
|
84
|
+
|
85
|
+
data = []
|
86
|
+
d.files.each do |f|
|
87
|
+
o = package_metadata d, f
|
88
|
+
data << o
|
89
|
+
wget_str = Preservation::Builder.build_wget @config[:username],
|
90
|
+
@config[:password],
|
91
|
+
f.url
|
92
|
+
|
93
|
+
Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
|
94
|
+
|
95
|
+
# fetch the file
|
96
|
+
Dir.chdir(dir_file_path) do
|
97
|
+
# puts 'Changing dir to ' + Dir.pwd
|
98
|
+
# puts 'Size of ' + f.name + ' is ' + File.size(f.name).to_s
|
99
|
+
if File.size?(f.name)
|
100
|
+
# puts 'Should be deleting ' + f['name']
|
101
|
+
File.delete(f.name)
|
102
|
+
end
|
103
|
+
# puts f.name + ' missing or empty'
|
104
|
+
# puts wget_str
|
105
|
+
`#{wget_str}`
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
|
110
|
+
|
111
|
+
pretty = JSON.pretty_generate( data, :indent => ' ')
|
112
|
+
# puts pretty
|
113
|
+
File.write(metadata_filename,pretty)
|
114
|
+
@logger.info 'Created ' + metadata_filename
|
115
|
+
success = true
|
116
|
+
else
|
117
|
+
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d.uuid +
|
118
|
+
' because ' + metadata_filename + ' exists'
|
119
|
+
end
|
120
|
+
else
|
121
|
+
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d.uuid
|
122
|
+
end
|
123
|
+
success
|
124
|
+
end
|
125
|
+
|
126
|
+
# For multiple datasets, if necessary, fetch the metadata,
|
127
|
+
# prepare a directory in the ingest path and populate it with the files and
|
128
|
+
# JSON description file.
|
129
|
+
#
|
130
|
+
# @param max [Integer] maximum to prepare, omit to set no maximum
|
131
|
+
# @param dir_scheme [Symbol] how to make directory name
|
132
|
+
# @param delay [Integer] days to wait (after modification date) before preserving
|
133
|
+
def prepare_batch(max: nil,
|
134
|
+
dir_scheme: :uuid,
|
135
|
+
delay: 30)
|
136
|
+
collection_extractor = Puree::Extractor::Collection.new config: @config,
|
137
|
+
resource: :dataset
|
138
|
+
count = collection_extractor.count
|
139
|
+
|
140
|
+
max = count if max.nil?
|
141
|
+
|
142
|
+
batch_size = 10
|
143
|
+
num_prepared = 0
|
144
|
+
0.step(count, batch_size) do |n|
|
145
|
+
|
146
|
+
dataset_collection = collection_extractor.find limit: batch_size,
|
147
|
+
offset: n
|
148
|
+
dataset_collection.each do |dataset|
|
149
|
+
success = prepare uuid: dataset.uuid,
|
150
|
+
dir_scheme: dir_scheme.to_sym,
|
151
|
+
delay: delay
|
152
|
+
|
153
|
+
num_prepared += 1 if success
|
154
|
+
exit if num_prepared == max
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
private
|
160
|
+
|
161
|
+
def package_metadata(d, f)
|
162
|
+
o = {}
|
163
|
+
o['filename'] = 'objects/' + f.name
|
164
|
+
o['dc.title'] = d.title
|
165
|
+
if d.description
|
166
|
+
o['dc.description'] = d.description
|
167
|
+
end
|
168
|
+
o['dcterms.created'] = d.created.strftime("%F")
|
169
|
+
if d.available
|
170
|
+
o['dcterms.available'] = d.available.strftime("%F")
|
171
|
+
end
|
172
|
+
o['dc.publisher'] = d.publisher
|
173
|
+
if d.doi
|
174
|
+
o['dc.identifier'] = d.doi
|
175
|
+
end
|
176
|
+
if !d.spatial_places.empty?
|
177
|
+
o['dcterms.spatial'] = d.spatial_places
|
178
|
+
end
|
179
|
+
|
180
|
+
temporal = d.temporal
|
181
|
+
temporal_range = ''
|
182
|
+
if temporal
|
183
|
+
if temporal.start
|
184
|
+
temporal_range << temporal.start.strftime("%F")
|
185
|
+
if temporal.end
|
186
|
+
temporal_range << '/'
|
187
|
+
temporal_range << temporal.end.strftime("%F")
|
188
|
+
end
|
189
|
+
o['dcterms.temporal'] = temporal_range
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
creators = []
|
194
|
+
contributors = []
|
195
|
+
all_persons = []
|
196
|
+
all_persons << d.persons_internal
|
197
|
+
all_persons << d.persons_external
|
198
|
+
all_persons << d.persons_other
|
199
|
+
all_persons.each do |person_type|
|
200
|
+
person_type.each do |i|
|
201
|
+
name = i.name.last_first if i.name
|
202
|
+
if i.role == 'Creator'
|
203
|
+
creators << name if name
|
204
|
+
end
|
205
|
+
if i.role == 'Contributor'
|
206
|
+
contributors << name if name
|
207
|
+
end
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
o['dc.creator'] = creators
|
212
|
+
if !contributors.empty?
|
213
|
+
o['dc.contributor'] = contributors
|
214
|
+
end
|
215
|
+
keywords = []
|
216
|
+
d.keywords.each { |i|
|
217
|
+
keywords << i
|
218
|
+
}
|
219
|
+
if !keywords.empty?
|
220
|
+
o['dc.subject'] = keywords
|
221
|
+
end
|
222
|
+
|
223
|
+
o['dcterms.license'] = f.license.name if f.license
|
224
|
+
# o['dc.format'] = f.mime
|
225
|
+
|
226
|
+
related = []
|
227
|
+
publications = d.publications
|
228
|
+
publications.each do |i|
|
229
|
+
if i.type === 'Dataset'
|
230
|
+
extractor = Puree::Extractor::Dataset.new @config
|
231
|
+
dataset = extractor.find uuid: i.uuid
|
232
|
+
doi = dataset.doi
|
233
|
+
if doi
|
234
|
+
related << doi
|
235
|
+
end
|
236
|
+
end
|
237
|
+
if i.type === 'Publication'
|
238
|
+
extractor = Puree::Extractor::Publication.new @config
|
239
|
+
publication = extractor.find uuid: i.uuid
|
240
|
+
dois = publication.dois
|
241
|
+
if !dois.empty?
|
242
|
+
# Only one needed
|
243
|
+
related << dois[0]
|
244
|
+
end
|
245
|
+
end
|
246
|
+
end
|
247
|
+
if !related.empty?
|
248
|
+
o['dc.relation'] = related
|
249
|
+
end
|
250
|
+
|
251
|
+
o
|
252
|
+
end
|
253
|
+
|
254
|
+
end
|
255
|
+
|
256
|
+
end
|
257
|
+
|
258
|
+
end
|
data/lib/preservation/version.rb
CHANGED
data/preservation.gemspec
CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Preservation::VERSION
|
9
9
|
spec.authors = ["Adrian Albin-Clark"]
|
10
10
|
spec.email = ["a.albin-clark@lancaster.ac.uk"]
|
11
|
-
spec.summary = %q{Extraction
|
12
|
-
|
13
|
-
spec.homepage = "https://
|
11
|
+
spec.summary = %q{Extraction from the Pure Research Information System and transformation for
|
12
|
+
loading by Archivematica.}
|
13
|
+
spec.homepage = "https://github.com/lulibrary/preservation"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -21,6 +21,6 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.required_ruby_version = '~> 2.1'
|
22
22
|
|
23
23
|
spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
|
24
|
-
spec.add_runtime_dependency 'puree', '~>
|
24
|
+
spec.add_runtime_dependency 'puree', '~> 1.3'
|
25
25
|
spec.add_runtime_dependency 'sqlite3', '~> 1.3'
|
26
26
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preservation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adrian Albin-Clark
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: free_disk_space
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.3'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: sqlite3
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,8 +52,7 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
|
-
description:
|
56
|
-
Tools. Includes transfer preparation, reporting and disk space management.
|
55
|
+
description:
|
57
56
|
email:
|
58
57
|
- a.albin-clark@lancaster.ac.uk
|
59
58
|
executables: []
|
@@ -71,15 +70,15 @@ files:
|
|
71
70
|
- lib/preservation/builder.rb
|
72
71
|
- lib/preservation/configuration.rb
|
73
72
|
- lib/preservation/conversion.rb
|
74
|
-
- lib/preservation/ingest.rb
|
75
73
|
- lib/preservation/report/database.rb
|
76
74
|
- lib/preservation/report/transfer.rb
|
77
75
|
- lib/preservation/storage.rb
|
78
76
|
- lib/preservation/temporal.rb
|
79
|
-
- lib/preservation/transfer/pure.rb
|
77
|
+
- lib/preservation/transfer/base.rb
|
78
|
+
- lib/preservation/transfer/dataset.rb
|
80
79
|
- lib/preservation/version.rb
|
81
80
|
- preservation.gemspec
|
82
|
-
homepage: https://
|
81
|
+
homepage: https://github.com/lulibrary/preservation
|
83
82
|
licenses:
|
84
83
|
- MIT
|
85
84
|
metadata: {}
|
@@ -102,5 +101,6 @@ rubyforge_project:
|
|
102
101
|
rubygems_version: 2.2.2
|
103
102
|
signing_key:
|
104
103
|
specification_version: 4
|
105
|
-
summary: Extraction
|
104
|
+
summary: Extraction from the Pure Research Information System and transformation for
|
105
|
+
loading by Archivematica.
|
106
106
|
test_files: []
|
data/lib/preservation/ingest.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest
|
4
|
-
#
|
5
|
-
class Ingest
|
6
|
-
|
7
|
-
attr_reader :logger
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
setup_logger
|
11
|
-
check_ingest_path
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
15
|
-
|
16
|
-
def check_ingest_path
|
17
|
-
if Preservation.ingest_path.nil?
|
18
|
-
@logger.error 'Missing ingest path'
|
19
|
-
exit
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def setup_logger
|
24
|
-
if @logger.nil?
|
25
|
-
if Preservation.log_path.nil?
|
26
|
-
@logger = Logger.new STDOUT
|
27
|
-
else
|
28
|
-
# Keep data for today and the past 20 days
|
29
|
-
@logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
@logger.level = Logger::INFO
|
33
|
-
end
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
38
|
-
|
data/lib/preservation/transfer/pure.rb
DELETED
@@ -1,259 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Transfer preparation
|
4
|
-
#
|
5
|
-
module Transfer
|
6
|
-
|
7
|
-
# Transfer preparation for Pure
|
8
|
-
#
|
9
|
-
class Pure < Ingest
|
10
|
-
|
11
|
-
# @param base_url [String]
|
12
|
-
# @param username [String]
|
13
|
-
# @param password [String]
|
14
|
-
# @param basic_auth [Boolean]
|
15
|
-
def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
|
16
|
-
super()
|
17
|
-
@base_url = base_url
|
18
|
-
@basic_auth = basic_auth
|
19
|
-
if basic_auth === true
|
20
|
-
@username = username
|
21
|
-
@password = password
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# For given uuid, if necessary, fetch the metadata,
|
26
|
-
# prepare a directory in the ingest path and populate it with the files and
|
27
|
-
# JSON description file.
|
28
|
-
#
|
29
|
-
# @param uuid [String] uuid to preserve
|
30
|
-
# @param dir_scheme [Symbol] how to make directory name
|
31
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
32
|
-
# @return [Boolean] indicates presence of metadata description file
|
33
|
-
def prepare_dataset(uuid: nil,
|
34
|
-
dir_scheme: :uuid,
|
35
|
-
delay: 0)
|
36
|
-
success = false
|
37
|
-
|
38
|
-
if uuid.nil?
|
39
|
-
@logger.error 'Missing ' + uuid
|
40
|
-
exit
|
41
|
-
end
|
42
|
-
dir_base_path = Preservation.ingest_path
|
43
|
-
|
44
|
-
dataset = Puree::Dataset.new base_url: @base_url,
|
45
|
-
username: @username,
|
46
|
-
password: @password,
|
47
|
-
basic_auth: @basic_auth
|
48
|
-
|
49
|
-
dataset.find uuid: uuid
|
50
|
-
d = dataset.metadata
|
51
|
-
if d.empty?
|
52
|
-
@logger.error 'No metadata for ' + uuid
|
53
|
-
exit
|
54
|
-
end
|
55
|
-
|
56
|
-
# configurable to become more human-readable
|
57
|
-
dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)
|
58
|
-
|
59
|
-
# continue only if dir_name is not empty (e.g. because there was no DOI)
|
60
|
-
# continue only if there is no DB entry
|
61
|
-
# continue only if the dataset has a DOI
|
62
|
-
# continue only if there are files for this resource
|
63
|
-
# continue only if it is time to preserve
|
64
|
-
if !dir_name.nil? &&
|
65
|
-
!dir_name.empty? &&
|
66
|
-
!Preservation::Report::Transfer.in_db?(dir_name) &&
|
67
|
-
!d['doi'].empty? &&
|
68
|
-
!d['file'].empty? &&
|
69
|
-
Preservation::Temporal.time_to_preserve?(d['modified'], delay)
|
70
|
-
|
71
|
-
dir_file_path = dir_base_path + '/' + dir_name
|
72
|
-
dir_metadata_path = dir_file_path + '/metadata/'
|
73
|
-
metadata_filename = dir_metadata_path + 'metadata.json'
|
74
|
-
|
75
|
-
# calculate total size of data files
|
76
|
-
download_storage_required = 0
|
77
|
-
d['file'].each { |i| download_storage_required += i['size'].to_i }
|
78
|
-
|
79
|
-
# do we have enough space in filesystem to fetch data files?
|
80
|
-
if Preservation::Storage.enough_storage_for_download? download_storage_required
|
81
|
-
# @logger.info 'Sufficient disk space for ' + dir_file_path
|
82
|
-
else
|
83
|
-
@logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
|
84
|
-
end
|
85
|
-
|
86
|
-
# has metadata file been created? if so, files and metadata are in place
|
87
|
-
# continue only if files not present in ingest location
|
88
|
-
if !File.size? metadata_filename
|
89
|
-
|
90
|
-
@logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
|
91
|
-
|
92
|
-
data = []
|
93
|
-
d['file'].each do |f|
|
94
|
-
o = package_dataset_metadata d, f
|
95
|
-
data << o
|
96
|
-
wget_str = Preservation::Builder.build_wget @username,
|
97
|
-
@password,
|
98
|
-
f['url']
|
99
|
-
|
100
|
-
Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
|
101
|
-
|
102
|
-
# fetch the file
|
103
|
-
Dir.chdir(dir_file_path) do
|
104
|
-
# puts 'Changing dir to ' + Dir.pwd
|
105
|
-
# puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
|
106
|
-
if File.size?(f['name'])
|
107
|
-
# puts 'Should be deleting ' + f['name']
|
108
|
-
File.delete(f['name'])
|
109
|
-
end
|
110
|
-
# puts f['name'] + ' missing or empty'
|
111
|
-
# puts wget_str
|
112
|
-
`#{wget_str}`
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
|
117
|
-
|
118
|
-
pretty = JSON.pretty_generate( data, :indent => ' ')
|
119
|
-
# puts pretty
|
120
|
-
File.write(metadata_filename,pretty)
|
121
|
-
@logger.info 'Created ' + metadata_filename
|
122
|
-
success = true
|
123
|
-
else
|
124
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid'] +
|
125
|
-
' because ' + metadata_filename + ' exists'
|
126
|
-
end
|
127
|
-
else
|
128
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
|
129
|
-
end
|
130
|
-
success
|
131
|
-
end
|
132
|
-
|
133
|
-
# For multiple datasets, if necessary, fetch the metadata,
|
134
|
-
# prepare a directory in the ingest path and populate it with the files and
|
135
|
-
# JSON description file.
|
136
|
-
#
|
137
|
-
# @param max [Integer] maximum to prepare, omit to set no maximum
|
138
|
-
# @param dir_scheme [Symbol] how to make directory name
|
139
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
140
|
-
def prepare_dataset_batch(max: nil,
|
141
|
-
dir_scheme: :uuid,
|
142
|
-
delay: 30)
|
143
|
-
collection = Puree::Collection.new resource: :dataset,
|
144
|
-
base_url: @base_url,
|
145
|
-
username: @username,
|
146
|
-
password: @password,
|
147
|
-
basic_auth: @basic_auth
|
148
|
-
count = collection.count
|
149
|
-
|
150
|
-
max = count if max.nil?
|
151
|
-
|
152
|
-
batch_size = 10
|
153
|
-
num_prepared = 0
|
154
|
-
0.step(count, batch_size) do |n|
|
155
|
-
|
156
|
-
minimal_metadata = collection.find limit: batch_size,
|
157
|
-
offset: n,
|
158
|
-
full: false
|
159
|
-
uuids = []
|
160
|
-
minimal_metadata.each do |i|
|
161
|
-
uuids << i['uuid']
|
162
|
-
end
|
163
|
-
|
164
|
-
uuids.each do |uuid|
|
165
|
-
success = prepare_dataset uuid: uuid,
|
166
|
-
dir_scheme: dir_scheme.to_sym,
|
167
|
-
delay: delay
|
168
|
-
|
169
|
-
num_prepared += 1 if success
|
170
|
-
exit if num_prepared == max
|
171
|
-
end
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
private
|
176
|
-
|
177
|
-
def package_dataset_metadata(d, f)
|
178
|
-
o = {}
|
179
|
-
o['filename'] = 'objects/' + f['name']
|
180
|
-
o['dc.title'] = d['title']
|
181
|
-
if !d['description'].empty?
|
182
|
-
o['dc.description'] = d['description']
|
183
|
-
end
|
184
|
-
o['dcterms.created'] = d['created']
|
185
|
-
if !d['available']['year'].empty?
|
186
|
-
o['dcterms.available'] = Puree::Date.iso(d['available'])
|
187
|
-
end
|
188
|
-
o['dc.publisher'] = d['publisher']
|
189
|
-
if !d['doi'].empty?
|
190
|
-
o['dc.identifier'] = d['doi']
|
191
|
-
end
|
192
|
-
if !d['spatial'].empty?
|
193
|
-
o['dcterms.spatial'] = d['spatial']
|
194
|
-
end
|
195
|
-
if !d['temporal']['start']['year'].empty?
|
196
|
-
temporal_range = ''
|
197
|
-
temporal_range << Puree::Date.iso(d['temporal']['start'])
|
198
|
-
if !d['temporal']['end']['year'].empty?
|
199
|
-
temporal_range << '/'
|
200
|
-
temporal_range << Puree::Date.iso(d['temporal']['end'])
|
201
|
-
end
|
202
|
-
o['dcterms.temporal'] = temporal_range
|
203
|
-
end
|
204
|
-
creators = []
|
205
|
-
contributors = []
|
206
|
-
person_types = %w(internal external other)
|
207
|
-
person_types.each do |person_type|
|
208
|
-
d['person'][person_type].each do |i|
|
209
|
-
if i['role'] == 'Creator'
|
210
|
-
creator = i['name']['last'] + ', ' + i['name']['first']
|
211
|
-
creators << creator
|
212
|
-
end
|
213
|
-
if i['role'] == 'Contributor'
|
214
|
-
contributor = i['name']['last'] + ', ' + i['name']['first']
|
215
|
-
contributors << contributor
|
216
|
-
end
|
217
|
-
end
|
218
|
-
end
|
219
|
-
o['dc.creator'] = creators
|
220
|
-
if !contributors.empty?
|
221
|
-
o['dc.contributor'] = contributors
|
222
|
-
end
|
223
|
-
keywords = []
|
224
|
-
d['keyword'].each { |i|
|
225
|
-
keywords << i
|
226
|
-
}
|
227
|
-
if !keywords.empty?
|
228
|
-
o['dc.subject'] = keywords
|
229
|
-
end
|
230
|
-
if !f['license']['name'].empty?
|
231
|
-
o['dcterms.license'] = f['license']['name']
|
232
|
-
end
|
233
|
-
# o['dc.format'] = f['mime']
|
234
|
-
|
235
|
-
related = []
|
236
|
-
publications = d['publication']
|
237
|
-
publications.each do |i|
|
238
|
-
pub = Puree::Publication.new base_url: @base_url,
|
239
|
-
username: @username,
|
240
|
-
password: @password,
|
241
|
-
basic_auth: @basic_auth
|
242
|
-
pub.find uuid: i['uuid']
|
243
|
-
doi = pub.doi
|
244
|
-
if doi
|
245
|
-
related << doi
|
246
|
-
end
|
247
|
-
end
|
248
|
-
if !related.empty?
|
249
|
-
o['dc.relation'] = related
|
250
|
-
end
|
251
|
-
|
252
|
-
o
|
253
|
-
end
|
254
|
-
|
255
|
-
end
|
256
|
-
|
257
|
-
end
|
258
|
-
|
259
|
-
end
|