preservation 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +173 -14
- data/lib/preservation.rb +2 -2
- data/lib/preservation/builder.rb +5 -5
- data/lib/preservation/report/database.rb +1 -2
- data/lib/preservation/temporal.rb +3 -4
- data/lib/preservation/transfer/base.rb +42 -0
- data/lib/preservation/transfer/dataset.rb +258 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +4 -4
- metadata +10 -10
- data/lib/preservation/ingest.rb +0 -38
- data/lib/preservation/transfer/pure.rb +0 -259
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 54db84bdb0bc782f05420b420200e78b9394a6af
|
4
|
+
data.tar.gz: a243b3e89cdf0fe830df9eea16639094d5854af1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 51d73c2067b1d48c7ce8a5eff9659fd0cb0059e59850e6a0d80c9865a9080a9e718e839b0d83aa2bde790fbcdae18d2eb7f26480f7f72ae9a74084fc7f6975f1
|
7
|
+
data.tar.gz: b58bd774f4905d98fee7be08f4a99a1bca5fa3bd115f8136c7a524c92937757e1ddfea63308ed589c4aaa11174b73499d1b43d993fee660df95e1ab533998a6a
|
data/CHANGELOG.md
CHANGED
@@ -4,8 +4,17 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
|
5
5
|
## Unreleased
|
6
6
|
|
7
|
+
## 0.5.0 - 2017-05-23
|
8
|
+
### Changed
|
9
|
+
- Transfer - dcterms.created is written in ISO 8601 date format.
|
10
|
+
|
11
|
+
### Fixed
|
12
|
+
- Transfer - handling DOIs of related works for both datasets and publications.
|
13
|
+
- Transfer - handling missing DOIs of related works.
|
14
|
+
|
7
15
|
## 0.4.2 - 2017-05-18
|
8
16
|
### Fixed
|
17
|
+
- Transfer - presence check for DOI of a related work.
|
9
18
|
|
10
19
|
## 0.4.1 - 2016-09-30
|
11
20
|
### Fixed
|
data/README.md
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# Preservation
|
2
2
|
|
3
|
-
Extraction
|
3
|
+
Extraction from the Pure Research Information System and transformation for
|
4
|
+
loading by Archivematica.
|
5
|
+
|
6
|
+
Includes transfer preparation, reporting and disk space management.
|
4
7
|
|
5
8
|
## Status
|
6
9
|
|
@@ -27,7 +30,9 @@ Or install it yourself as:
|
|
27
30
|
## Usage
|
28
31
|
|
29
32
|
### Configuration
|
30
|
-
|
33
|
+
|
34
|
+
Configure Preservation. If ```log_path``` is omitted, logging (standard library)
|
35
|
+
writes to STDOUT.
|
31
36
|
|
32
37
|
```ruby
|
33
38
|
Preservation.configure do |config|
|
@@ -37,24 +42,129 @@ Preservation.configure do |config|
|
|
37
42
|
end
|
38
43
|
```
|
39
44
|
|
45
|
+
Create a hash for passing to a transfer.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
# Pure host with authentication.
|
49
|
+
config = {
|
50
|
+
url: ENV['PURE_URL'],
|
51
|
+
username: ENV['PURE_USERNAME'],
|
52
|
+
password: ENV['PURE_PASSWORD']
|
53
|
+
}
|
54
|
+
```
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
# Pure host without authentication.
|
58
|
+
config = {
|
59
|
+
url: ENV['PURE_URL']
|
60
|
+
}
|
61
|
+
```
|
40
62
|
|
41
63
|
### Transfer
|
42
|
-
|
64
|
+
|
65
|
+
Configure a transfer to retrieve data from a Pure host.
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
transfer = Preservation::Transfer::Dataset.new config
|
69
|
+
```
|
70
|
+
|
71
|
+
#### Single
|
72
|
+
|
73
|
+
If necessary, fetch the metadata, prepare a directory in the ingest path and
|
74
|
+
populate it with the files and JSON description file.
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
transfer.prepare uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
78
|
+
```
|
79
|
+
|
80
|
+
#### Batch
|
81
|
+
|
82
|
+
For multiple Pure datasets, if necessary, fetch the metadata, prepare a
|
83
|
+
directory in the ingest path and populate it with the files and JSON description
|
84
|
+
file.
|
85
|
+
|
86
|
+
A maximum of 10 will be prepared using the doi_short directory naming scheme.
|
87
|
+
Each dataset will only be prepared if 20 days have elapsed since the metadata
|
88
|
+
record was last modified.
|
43
89
|
|
44
90
|
```ruby
|
45
|
-
transfer
|
46
|
-
|
47
|
-
|
48
|
-
basic_auth: true
|
91
|
+
transfer.prepare_batch max: 10,
|
92
|
+
dir_scheme: :doi_short,
|
93
|
+
delay: 20
|
49
94
|
```
|
50
95
|
|
51
|
-
|
52
|
-
|
96
|
+
#### Directory name
|
97
|
+
|
98
|
+
The following are permitted values for the dir_scheme parameter:
|
53
99
|
|
54
100
|
```ruby
|
55
|
-
|
101
|
+
:uuid_title
|
102
|
+
:title_uuid
|
103
|
+
:date_uuid_title
|
104
|
+
:date_title_uuid
|
105
|
+
:date_time_uuid
|
106
|
+
:date_time_title
|
107
|
+
:date_time_uuid_title
|
108
|
+
:date_time_title_uuid
|
109
|
+
:uuid
|
110
|
+
:doi
|
111
|
+
:doi_short
|
56
112
|
```
|
57
113
|
|
114
|
+
#### Load directory
|
115
|
+
|
116
|
+
A transfer-ready directory has a name built according to the directory scheme
|
117
|
+
specified, in this case doi_short. This particular example has only one file
|
118
|
+
Ebola_data_Jun15.zip in the dataset.
|
119
|
+
```
|
120
|
+
.
|
121
|
+
├── 10.17635-lancaster-researchdata-6
|
122
|
+
│ ├── Ebola_data_Jun15.zip
|
123
|
+
│ └── metadata
|
124
|
+
│ └── metadata.json
|
125
|
+
```
|
126
|
+
|
127
|
+
metadata.json:
|
128
|
+
|
129
|
+
```json
|
130
|
+
[
|
131
|
+
{
|
132
|
+
"filename": "objects/Ebola_data_Jun15.zip",
|
133
|
+
"dc.title": "Ebolavirus evolution 2013-2015",
|
134
|
+
"dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
|
135
|
+
"dcterms.created": "2015-06-04",
|
136
|
+
"dcterms.available": "2015-06-04",
|
137
|
+
"dc.publisher": "Lancaster University",
|
138
|
+
"dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
|
139
|
+
"dcterms.spatial": [
|
140
|
+
"Guinea, Sierra Leone, Liberia"
|
141
|
+
],
|
142
|
+
"dc.creator": [
|
143
|
+
"Gatherer, Derek"
|
144
|
+
],
|
145
|
+
"dc.contributor": [
|
146
|
+
"Robertson, David",
|
147
|
+
"Lovell, Simon"
|
148
|
+
],
|
149
|
+
"dc.subject": [
|
150
|
+
"Ebolavirus",
|
151
|
+
"evolution",
|
152
|
+
"phylogenetics",
|
153
|
+
"virulence",
|
154
|
+
"Filoviridae",
|
155
|
+
"positive selection"
|
156
|
+
],
|
157
|
+
"dcterms.license": "CC BY",
|
158
|
+
"dc.relation": [
|
159
|
+
"http://dx.doi.org/10.1136/ebmed-2014-110127",
|
160
|
+
"http://dx.doi.org/10.1099/vir.0.067199-0"
|
161
|
+
]
|
162
|
+
}
|
163
|
+
]
|
164
|
+
```
|
165
|
+
|
166
|
+
### Storage
|
167
|
+
|
58
168
|
Free up disk space for completed transfers. Can be done at any time.
|
59
169
|
|
60
170
|
```ruby
|
@@ -62,13 +172,62 @@ Preservation::Storage.cleanup
|
|
62
172
|
```
|
63
173
|
|
64
174
|
### Report
|
175
|
+
|
65
176
|
Can be used for scheduled monitoring of transfers.
|
66
177
|
|
67
178
|
```ruby
|
68
179
|
Preservation::Report::Transfer.exception
|
69
180
|
```
|
70
181
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
182
|
+
Formatted as JSON:
|
183
|
+
|
184
|
+
```json
|
185
|
+
{
|
186
|
+
"pending": {
|
187
|
+
"count": 3,
|
188
|
+
"data": [
|
189
|
+
{
|
190
|
+
"path": "10.17635-lancaster-researchdata-72",
|
191
|
+
"path_timestamp": "2016-09-29 12:08:58 +0100"
|
192
|
+
},
|
193
|
+
{
|
194
|
+
"path": "10.17635-lancaster-researchdata-74",
|
195
|
+
"path_timestamp": "2016-09-29 12:08:59 +0100"
|
196
|
+
},
|
197
|
+
{
|
198
|
+
"path": "10.17635-lancaster-researchdata-75",
|
199
|
+
"path_timestamp": "2016-09-29 12:09:00 +0100"
|
200
|
+
}
|
201
|
+
]
|
202
|
+
},
|
203
|
+
"current": {
|
204
|
+
"path": "10.17635-lancaster-researchdata-90",
|
205
|
+
"unit_type": "ingest",
|
206
|
+
"status": "PROCESSING",
|
207
|
+
"current": 1,
|
208
|
+
"id": 91,
|
209
|
+
"uuid": "ebf048c3-0ca8-409c-94cf-ab3e5d97e901",
|
210
|
+
"path_timestamp": "2016-09-28 17:09:33 +0100"
|
211
|
+
},
|
212
|
+
"failed": {
|
213
|
+
"count": 0
|
214
|
+
},
|
215
|
+
"incomplete": {
|
216
|
+
"count": 1,
|
217
|
+
"data": [
|
218
|
+
{
|
219
|
+
"path": "10.17635-lancaster-researchdata-90",
|
220
|
+
"unit_type": "ingest",
|
221
|
+
"status": "PROCESSING",
|
222
|
+
"current": 1,
|
223
|
+
"id": 91,
|
224
|
+
"uuid": "ebf048c3-0ca8-409c-94cf-ab3e5d97e901",
|
225
|
+
"path_timestamp": "2016-09-28 17:09:33 +0100"
|
226
|
+
}
|
227
|
+
]
|
228
|
+
},
|
229
|
+
"complete": {
|
230
|
+
"count": 78
|
231
|
+
}
|
232
|
+
}
|
233
|
+
```
|
data/lib/preservation.rb
CHANGED
@@ -8,11 +8,11 @@ require 'preservation/configuration'
|
|
8
8
|
require 'preservation/report/database'
|
9
9
|
require 'preservation/report/transfer'
|
10
10
|
require 'preservation/conversion'
|
11
|
-
require 'preservation/ingest'
|
12
11
|
require 'preservation/builder'
|
13
12
|
require 'preservation/storage'
|
14
13
|
require 'preservation/temporal'
|
15
|
-
require 'preservation/transfer/
|
14
|
+
require 'preservation/transfer/base'
|
15
|
+
require 'preservation/transfer/dataset'
|
16
16
|
require 'preservation/version'
|
17
17
|
|
18
18
|
# Top level namespace
|
data/lib/preservation/builder.rb
CHANGED
@@ -35,9 +35,9 @@ module Preservation
|
|
35
35
|
# @param directory_name_scheme [Symbol]
|
36
36
|
# @return [String]
|
37
37
|
def self.build_directory_name(metadata_record, directory_name_scheme)
|
38
|
-
doi = metadata_record[
|
39
|
-
uuid = metadata_record[
|
40
|
-
title = metadata_record[
|
38
|
+
doi = metadata_record[:doi]
|
39
|
+
uuid = metadata_record[:uuid]
|
40
|
+
title = metadata_record[:title].strip.gsub(' ', '-').gsub('/', '-')
|
41
41
|
time = Time.new
|
42
42
|
date = time.strftime("%Y-%m-%d")
|
43
43
|
time = time.strftime("%H:%M:%S")
|
@@ -63,12 +63,12 @@ module Preservation
|
|
63
63
|
when :uuid
|
64
64
|
uuid
|
65
65
|
when :doi
|
66
|
-
if doi.empty?
|
66
|
+
if doi.nil? || doi.empty?
|
67
67
|
return ''
|
68
68
|
end
|
69
69
|
doi.gsub('/', '-')
|
70
70
|
when :doi_short
|
71
|
-
if doi.empty?
|
71
|
+
if doi.nil? || doi.empty?
|
72
72
|
return ''
|
73
73
|
end
|
74
74
|
doi_short_to_remove = 'http://dx.doi.org/'
|
@@ -6,13 +6,12 @@ module Preservation
|
|
6
6
|
|
7
7
|
# time_to_preserve?
|
8
8
|
#
|
9
|
-
# @param start_utc [
|
9
|
+
# @param start_utc [Time]
|
10
10
|
# @param delay [Integer] days to wait (after start date) before preserving
|
11
11
|
# @return [Boolean]
|
12
12
|
def self.time_to_preserve?(start_utc, delay)
  # Subtracting one Time from another yields the elapsed time in seconds
  # (a Float), whereas delay is expressed in days. Convert to whole elapsed
  # days before comparing, otherwise a 20 day delay is satisfied after
  # 20 seconds.
  seconds_per_day = 24 * 60 * 60
  days_since_start = ((Time.now - start_utc) / seconds_per_day).to_i
  days_since_start >= delay
end
|
18
17
|
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Preservation
|
2
|
+
|
3
|
+
module Transfer
|
4
|
+
|
5
|
+
# Transfer base
|
6
|
+
#
|
7
|
+
class Base
  # Common set-up shared by transfer classes: builds the logger, then
  # refuses to continue when no ingest path has been configured.

  attr_reader :logger

  # Creates the logger first so that a missing ingest path can be reported
  # through it.
  def initialize
    setup_logger
    check_ingest_path
  end

  private

  # Logs an error and terminates the process (Kernel#exit) when
  # Preservation.ingest_path is nil; otherwise does nothing.
  def check_ingest_path
    return unless Preservation.ingest_path.nil?

    @logger.error 'Missing ingest path'
    exit
  end

  # Builds @logger if it has not been set yet and forces its level to INFO.
  # Logs go to STDOUT when Preservation.log_path is nil, otherwise they are
  # appended to the file at that path.
  def setup_logger
    if @logger.nil?
      log_path = Preservation.log_path
      @logger = if log_path.nil?
                  Logger.new STDOUT
                else
                  # Keep data for today and the past 20 days
                  # NOTE(review): an open File (IO) is passed rather than a
                  # filename; Logger's rotation arguments presumably only
                  # apply to filenames - confirm that rotation happens.
                  Logger.new File.new(log_path, 'a'), 20, 'daily'
                end
    end
    @logger.level = Logger::INFO
  end

end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
@@ -0,0 +1,258 @@
|
|
1
|
+
module Preservation
|
2
|
+
|
3
|
+
# Transfer preparation
|
4
|
+
#
|
5
|
+
module Transfer
|
6
|
+
|
7
|
+
# Transfer preparation for dataset
|
8
|
+
#
|
9
|
+
class Dataset < Preservation::Transfer::Base
|
10
|
+
|
11
|
+
# @param config [Hash]
|
12
|
+
def initialize(config)
  # The parent initializer takes no arguments; the explicit empty
  # parentheses stop Ruby from forwarding config to it.
  super()
  # Kept for the Puree extractors and the download credentials used when
  # preparing transfers.
  @config = config
end
|
16
|
+
|
17
|
+
# For given uuid, if necessary, fetch the metadata,
|
18
|
+
# prepare a directory in the ingest path and populate it with the files and
|
19
|
+
# JSON description file.
|
20
|
+
#
|
21
|
+
# @param uuid [String] uuid to preserve
|
22
|
+
# @param dir_scheme [Symbol] how to make directory name
|
23
|
+
# @param delay [Integer] days to wait (after modification date) before preserving
|
24
|
+
# @return [Boolean] indicates presence of metadata description file
|
25
|
+
def prepare(uuid: nil,
            dir_scheme: :uuid,
            delay: 0)
  if uuid.nil?
    # uuid is nil on this path, so it must not be concatenated into the
    # message (String + nil raises TypeError before anything is logged).
    @logger.error 'Missing uuid'
    exit
  end

  d = Puree::Extractor::Dataset.new(@config).find uuid: uuid
  if !d
    @logger.error "No metadata for #{uuid}"
    exit
  end

  metadata_record = { doi: d.doi, uuid: d.uuid, title: d.title }

  # configurable to become more human-readable
  dir_name = Preservation::Builder.build_directory_name(metadata_record, dir_scheme)

  # continue only if dir_name is not empty (e.g. because there was no DOI)
  # continue only if there is no DB entry
  # continue only if the dataset has a DOI
  # continue only if there are files for this resource
  # continue only if it is time to preserve
  ready = !dir_name.nil? &&
          !dir_name.empty? &&
          !Preservation::Report::Transfer.in_db?(dir_name) &&
          d.doi &&
          !d.files.empty? &&
          Preservation::Temporal.time_to_preserve?(d.modified, delay)
  unless ready
    # Interpolation keeps this log line safe even when dir_name is nil.
    @logger.info "Skipping #{dir_name}, Pure UUID #{d.uuid}"
    return false
  end

  dir_file_path = Preservation.ingest_path + '/' + dir_name
  dir_metadata_path = dir_file_path + '/metadata/'
  metadata_filename = dir_metadata_path + 'metadata.json'

  # calculate total size of data files
  download_storage_required = d.files.inject(0) { |total, file| total + file.size.to_i }

  # do we have enough space in filesystem to fetch data files?
  # If not, really skip the dataset instead of only logging that we do.
  unless Preservation::Storage.enough_storage_for_download? download_storage_required
    @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
    return false
  end

  # has metadata file been created? if so, files and metadata are in place
  # continue only if files not present in ingest location
  if File.size? metadata_filename
    @logger.info "Skipping #{dir_name}, Pure UUID #{d.uuid} because #{metadata_filename} exists"
    return false
  end

  @logger.info "Preparing #{dir_name}, Pure UUID #{d.uuid}"

  # Dir.exist? replaces the deprecated Dir.exists? (removed in Ruby 3.2).
  Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

  data = d.files.map do |f|
    o = package_metadata d, f
    wget_str = Preservation::Builder.build_wget @config[:username],
                                                @config[:password],
                                                f.url

    # fetch the file, replacing any earlier (possibly partial) download
    Dir.chdir(dir_file_path) do
      File.delete(f.name) if File.size?(f.name)
      # NOTE(review): the command string comes from Builder.build_wget and is
      # run through the shell - confirm URL and credentials are escaped there.
      `#{wget_str}`
    end
    o
  end

  Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)

  pretty = JSON.pretty_generate(data, :indent => ' ')
  File.write(metadata_filename, pretty)
  @logger.info 'Created ' + metadata_filename
  true
end
|
125
|
+
|
126
|
+
# For multiple datasets, if necessary, fetch the metadata,
|
127
|
+
# prepare a directory in the ingest path and populate it with the files and
|
128
|
+
# JSON description file.
|
129
|
+
#
|
130
|
+
# @param max [Integer] maximum to prepare, omit to set no maximum
|
131
|
+
# @param dir_scheme [Symbol] how to make directory name
|
132
|
+
# @param delay [Integer] days to wait (after modification date) before preserving
|
133
|
+
def prepare_batch(max: nil,
                  dir_scheme: :uuid,
                  delay: 30)
  collection_extractor = Puree::Extractor::Collection.new config: @config,
                                                          resource: :dataset
  total = collection_extractor.count

  # With no explicit maximum, allow every dataset in the collection.
  max = max.nil? ? total : max

  page_size = 10
  prepared = 0
  # Walk the collection one page of page_size records at a time.
  0.step(total, page_size) do |offset|
    page = collection_extractor.find limit: page_size,
                                     offset: offset
    page.each do |dataset|
      done = prepare uuid: dataset.uuid,
                     dir_scheme: dir_scheme.to_sym,
                     delay: delay

      prepared += 1 if done
      # Reaching the maximum terminates the process (Kernel#exit).
      exit if prepared == max
    end
  end
end
|
158
|
+
|
159
|
+
private
|
160
|
+
|
161
|
+
# Builds the description of one data file, keyed by Dublin Core element
# names, for inclusion in the metadata JSON file.
#
# @param d [Object] dataset metadata as returned by the Puree dataset extractor
# @param f [Object] one of the dataset's files (responds to name and license)
# @return [Hash] element name => value; optional elements are omitted when
#   the dataset has no value for them
def package_metadata(d, f)
  o = {}
  o['filename'] = 'objects/' + f.name
  o['dc.title'] = d.title
  o['dc.description'] = d.description if d.description
  o['dcterms.created'] = d.created.strftime("%F")
  o['dcterms.available'] = d.available.strftime("%F") if d.available
  o['dc.publisher'] = d.publisher
  o['dc.identifier'] = d.doi if d.doi
  o['dcterms.spatial'] = d.spatial_places if !d.spatial_places.empty?

  temporal_range = temporal_range_for d.temporal
  o['dcterms.temporal'] = temporal_range if temporal_range

  # dc.creator is always present, even when empty; contributors are optional.
  o['dc.creator'] = person_names_with_role d, 'Creator'
  contributors = person_names_with_role d, 'Contributor'
  o['dc.contributor'] = contributors if !contributors.empty?

  keywords = d.keywords.to_a
  o['dc.subject'] = keywords if !keywords.empty?

  o['dcterms.license'] = f.license.name if f.license
  # o['dc.format'] = f.mime

  related = related_work_dois d
  o['dc.relation'] = related if !related.empty?

  o
end

# Formats a temporal coverage as "start" or "start/end" (dates as %F).
# Returns nil when there is no coverage or it has no start date.
def temporal_range_for(temporal)
  return nil unless temporal && temporal.start

  range = temporal.start.strftime("%F")
  range += '/' + temporal.end.strftime("%F") if temporal.end
  range
end

# Collects "last, first" names of internal, external and other persons
# holding the given role, skipping persons without a name.
def person_names_with_role(d, role)
  [d.persons_internal, d.persons_external, d.persons_other].flat_map do |people|
    people.select { |person| person.role == role && person.name }
          .map { |person| person.name.last_first }
          .compact
  end
end

# Looks up each related work and returns the DOIs found (one per work).
# A work whose lookup returns nothing, or which has no DOI, is skipped
# rather than raising NoMethodError.
def related_work_dois(d)
  d.publications.each_with_object([]) do |work, dois|
    if work.type == 'Dataset'
      dataset = Puree::Extractor::Dataset.new(@config).find uuid: work.uuid
      doi = dataset ? dataset.doi : nil
      dois << doi if doi
    elsif work.type == 'Publication'
      publication = Puree::Extractor::Publication.new(@config).find uuid: work.uuid
      publication_dois = Array(publication ? publication.dois : nil)
      # Only one needed
      dois << publication_dois[0] if !publication_dois.empty?
    end
  end
end
|
253
|
+
|
254
|
+
end
|
255
|
+
|
256
|
+
end
|
257
|
+
|
258
|
+
end
|
data/lib/preservation/version.rb
CHANGED
data/preservation.gemspec
CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Preservation::VERSION
|
9
9
|
spec.authors = ["Adrian Albin-Clark"]
|
10
10
|
spec.email = ["a.albin-clark@lancaster.ac.uk"]
|
11
|
-
spec.summary = %q{Extraction
|
12
|
-
|
13
|
-
spec.homepage = "https://
|
11
|
+
spec.summary = %q{Extraction from the Pure Research Information System and transformation for
|
12
|
+
loading by Archivematica.}
|
13
|
+
spec.homepage = "https://github.com/lulibrary/preservation"
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -21,6 +21,6 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.required_ruby_version = '~> 2.1'
|
22
22
|
|
23
23
|
spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
|
24
|
-
spec.add_runtime_dependency 'puree', '~>
|
24
|
+
spec.add_runtime_dependency 'puree', '~> 1.3'
|
25
25
|
spec.add_runtime_dependency 'sqlite3', '~> 1.3'
|
26
26
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: preservation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adrian Albin-Clark
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: free_disk_space
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '1.3'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '1.3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: sqlite3
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,8 +52,7 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '1.3'
|
55
|
-
description:
|
56
|
-
Tools. Includes transfer preparation, reporting and disk space management.
|
55
|
+
description:
|
57
56
|
email:
|
58
57
|
- a.albin-clark@lancaster.ac.uk
|
59
58
|
executables: []
|
@@ -71,15 +70,15 @@ files:
|
|
71
70
|
- lib/preservation/builder.rb
|
72
71
|
- lib/preservation/configuration.rb
|
73
72
|
- lib/preservation/conversion.rb
|
74
|
-
- lib/preservation/ingest.rb
|
75
73
|
- lib/preservation/report/database.rb
|
76
74
|
- lib/preservation/report/transfer.rb
|
77
75
|
- lib/preservation/storage.rb
|
78
76
|
- lib/preservation/temporal.rb
|
79
|
-
- lib/preservation/transfer/
|
77
|
+
- lib/preservation/transfer/base.rb
|
78
|
+
- lib/preservation/transfer/dataset.rb
|
80
79
|
- lib/preservation/version.rb
|
81
80
|
- preservation.gemspec
|
82
|
-
homepage: https://
|
81
|
+
homepage: https://github.com/lulibrary/preservation
|
83
82
|
licenses:
|
84
83
|
- MIT
|
85
84
|
metadata: {}
|
@@ -102,5 +101,6 @@ rubyforge_project:
|
|
102
101
|
rubygems_version: 2.2.2
|
103
102
|
signing_key:
|
104
103
|
specification_version: 4
|
105
|
-
summary: Extraction
|
104
|
+
summary: Extraction from the Pure Research Information System and transformation for
|
105
|
+
loading by Archivematica.
|
106
106
|
test_files: []
|
data/lib/preservation/ingest.rb
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Ingest
|
4
|
-
#
|
5
|
-
class Ingest
|
6
|
-
|
7
|
-
attr_reader :logger
|
8
|
-
|
9
|
-
def initialize
|
10
|
-
setup_logger
|
11
|
-
check_ingest_path
|
12
|
-
end
|
13
|
-
|
14
|
-
private
|
15
|
-
|
16
|
-
def check_ingest_path
|
17
|
-
if Preservation.ingest_path.nil?
|
18
|
-
@logger.error 'Missing ingest path'
|
19
|
-
exit
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def setup_logger
|
24
|
-
if @logger.nil?
|
25
|
-
if Preservation.log_path.nil?
|
26
|
-
@logger = Logger.new STDOUT
|
27
|
-
else
|
28
|
-
# Keep data for today and the past 20 days
|
29
|
-
@logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
@logger.level = Logger::INFO
|
33
|
-
end
|
34
|
-
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
38
|
-
|
@@ -1,259 +0,0 @@
|
|
1
|
-
module Preservation
|
2
|
-
|
3
|
-
# Transfer preparation
|
4
|
-
#
|
5
|
-
module Transfer
|
6
|
-
|
7
|
-
# Transfer preparation for Pure
|
8
|
-
#
|
9
|
-
class Pure < Ingest
|
10
|
-
|
11
|
-
# @param base_url [String]
|
12
|
-
# @param username [String]
|
13
|
-
# @param password [String]
|
14
|
-
# @param basic_auth [Boolean]
|
15
|
-
def initialize(base_url: nil, username: nil, password: nil, basic_auth: nil)
|
16
|
-
super()
|
17
|
-
@base_url = base_url
|
18
|
-
@basic_auth = basic_auth
|
19
|
-
if basic_auth === true
|
20
|
-
@username = username
|
21
|
-
@password = password
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
# For given uuid, if necessary, fetch the metadata,
|
26
|
-
# prepare a directory in the ingest path and populate it with the files and
|
27
|
-
# JSON description file.
|
28
|
-
#
|
29
|
-
# @param uuid [String] uuid to preserve
|
30
|
-
# @param dir_scheme [Symbol] how to make directory name
|
31
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
32
|
-
# @return [Boolean] indicates presence of metadata description file
|
33
|
-
def prepare_dataset(uuid: nil,
|
34
|
-
dir_scheme: :uuid,
|
35
|
-
delay: 0)
|
36
|
-
success = false
|
37
|
-
|
38
|
-
if uuid.nil?
|
39
|
-
@logger.error 'Missing ' + uuid
|
40
|
-
exit
|
41
|
-
end
|
42
|
-
dir_base_path = Preservation.ingest_path
|
43
|
-
|
44
|
-
dataset = Puree::Dataset.new base_url: @base_url,
|
45
|
-
username: @username,
|
46
|
-
password: @password,
|
47
|
-
basic_auth: @basic_auth
|
48
|
-
|
49
|
-
dataset.find uuid: uuid
|
50
|
-
d = dataset.metadata
|
51
|
-
if d.empty?
|
52
|
-
@logger.error 'No metadata for ' + uuid
|
53
|
-
exit
|
54
|
-
end
|
55
|
-
|
56
|
-
# configurable to become more human-readable
|
57
|
-
dir_name = Preservation::Builder.build_directory_name(d, dir_scheme)
|
58
|
-
|
59
|
-
# continue only if dir_name is not empty (e.g. because there was no DOI)
|
60
|
-
# continue only if there is no DB entry
|
61
|
-
# continue only if the dataset has a DOI
|
62
|
-
# continue only if there are files for this resource
|
63
|
-
# continue only if it is time to preserve
|
64
|
-
if !dir_name.nil? &&
|
65
|
-
!dir_name.empty? &&
|
66
|
-
!Preservation::Report::Transfer.in_db?(dir_name) &&
|
67
|
-
!d['doi'].empty? &&
|
68
|
-
!d['file'].empty? &&
|
69
|
-
Preservation::Temporal.time_to_preserve?(d['modified'], delay)
|
70
|
-
|
71
|
-
dir_file_path = dir_base_path + '/' + dir_name
|
72
|
-
dir_metadata_path = dir_file_path + '/metadata/'
|
73
|
-
metadata_filename = dir_metadata_path + 'metadata.json'
|
74
|
-
|
75
|
-
# calculate total size of data files
|
76
|
-
download_storage_required = 0
|
77
|
-
d['file'].each { |i| download_storage_required += i['size'].to_i }
|
78
|
-
|
79
|
-
# do we have enough space in filesystem to fetch data files?
|
80
|
-
if Preservation::Storage.enough_storage_for_download? download_storage_required
|
81
|
-
# @logger.info 'Sufficient disk space for ' + dir_file_path
|
82
|
-
else
|
83
|
-
@logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
|
84
|
-
end
|
85
|
-
|
86
|
-
# has metadata file been created? if so, files and metadata are in place
|
87
|
-
# continue only if files not present in ingest location
|
88
|
-
if !File.size? metadata_filename
|
89
|
-
|
90
|
-
@logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
|
91
|
-
|
92
|
-
data = []
|
93
|
-
d['file'].each do |f|
|
94
|
-
o = package_dataset_metadata d, f
|
95
|
-
data << o
|
96
|
-
wget_str = Preservation::Builder.build_wget @username,
|
97
|
-
@password,
|
98
|
-
f['url']
|
99
|
-
|
100
|
-
Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
|
101
|
-
|
102
|
-
# fetch the file
|
103
|
-
Dir.chdir(dir_file_path) do
|
104
|
-
# puts 'Changing dir to ' + Dir.pwd
|
105
|
-
# puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
|
106
|
-
if File.size?(f['name'])
|
107
|
-
# puts 'Should be deleting ' + f['name']
|
108
|
-
File.delete(f['name'])
|
109
|
-
end
|
110
|
-
# puts f['name'] + ' missing or empty'
|
111
|
-
# puts wget_str
|
112
|
-
`#{wget_str}`
|
113
|
-
end
|
114
|
-
end
|
115
|
-
|
116
|
-
Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
|
117
|
-
|
118
|
-
pretty = JSON.pretty_generate( data, :indent => ' ')
|
119
|
-
# puts pretty
|
120
|
-
File.write(metadata_filename,pretty)
|
121
|
-
@logger.info 'Created ' + metadata_filename
|
122
|
-
success = true
|
123
|
-
else
|
124
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid'] +
|
125
|
-
' because ' + metadata_filename + ' exists'
|
126
|
-
end
|
127
|
-
else
|
128
|
-
@logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
|
129
|
-
end
|
130
|
-
success
|
131
|
-
end
|
132
|
-
|
133
|
-
# For multiple datasets, if necessary, fetch the metadata,
|
134
|
-
# prepare a directory in the ingest path and populate it with the files and
|
135
|
-
# JSON description file.
|
136
|
-
#
|
137
|
-
# @param max [Integer] maximum to prepare, omit to set no maximum
|
138
|
-
# @param dir_scheme [Symbol] how to make directory name
|
139
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
140
|
-
def prepare_dataset_batch(max: nil,
|
141
|
-
dir_scheme: :uuid,
|
142
|
-
delay: 30)
|
143
|
-
collection = Puree::Collection.new resource: :dataset,
|
144
|
-
base_url: @base_url,
|
145
|
-
username: @username,
|
146
|
-
password: @password,
|
147
|
-
basic_auth: @basic_auth
|
148
|
-
count = collection.count
|
149
|
-
|
150
|
-
max = count if max.nil?
|
151
|
-
|
152
|
-
batch_size = 10
|
153
|
-
num_prepared = 0
|
154
|
-
0.step(count, batch_size) do |n|
|
155
|
-
|
156
|
-
minimal_metadata = collection.find limit: batch_size,
|
157
|
-
offset: n,
|
158
|
-
full: false
|
159
|
-
uuids = []
|
160
|
-
minimal_metadata.each do |i|
|
161
|
-
uuids << i['uuid']
|
162
|
-
end
|
163
|
-
|
164
|
-
uuids.each do |uuid|
|
165
|
-
success = prepare_dataset uuid: uuid,
|
166
|
-
dir_scheme: dir_scheme.to_sym,
|
167
|
-
delay: delay
|
168
|
-
|
169
|
-
num_prepared += 1 if success
|
170
|
-
exit if num_prepared == max
|
171
|
-
end
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
private
|
176
|
-
|
177
|
-
def package_dataset_metadata(d, f)
|
178
|
-
o = {}
|
179
|
-
o['filename'] = 'objects/' + f['name']
|
180
|
-
o['dc.title'] = d['title']
|
181
|
-
if !d['description'].empty?
|
182
|
-
o['dc.description'] = d['description']
|
183
|
-
end
|
184
|
-
o['dcterms.created'] = d['created']
|
185
|
-
if !d['available']['year'].empty?
|
186
|
-
o['dcterms.available'] = Puree::Date.iso(d['available'])
|
187
|
-
end
|
188
|
-
o['dc.publisher'] = d['publisher']
|
189
|
-
if !d['doi'].empty?
|
190
|
-
o['dc.identifier'] = d['doi']
|
191
|
-
end
|
192
|
-
if !d['spatial'].empty?
|
193
|
-
o['dcterms.spatial'] = d['spatial']
|
194
|
-
end
|
195
|
-
if !d['temporal']['start']['year'].empty?
|
196
|
-
temporal_range = ''
|
197
|
-
temporal_range << Puree::Date.iso(d['temporal']['start'])
|
198
|
-
if !d['temporal']['end']['year'].empty?
|
199
|
-
temporal_range << '/'
|
200
|
-
temporal_range << Puree::Date.iso(d['temporal']['end'])
|
201
|
-
end
|
202
|
-
o['dcterms.temporal'] = temporal_range
|
203
|
-
end
|
204
|
-
creators = []
|
205
|
-
contributors = []
|
206
|
-
person_types = %w(internal external other)
|
207
|
-
person_types.each do |person_type|
|
208
|
-
d['person'][person_type].each do |i|
|
209
|
-
if i['role'] == 'Creator'
|
210
|
-
creator = i['name']['last'] + ', ' + i['name']['first']
|
211
|
-
creators << creator
|
212
|
-
end
|
213
|
-
if i['role'] == 'Contributor'
|
214
|
-
contributor = i['name']['last'] + ', ' + i['name']['first']
|
215
|
-
contributors << contributor
|
216
|
-
end
|
217
|
-
end
|
218
|
-
end
|
219
|
-
o['dc.creator'] = creators
|
220
|
-
if !contributors.empty?
|
221
|
-
o['dc.contributor'] = contributors
|
222
|
-
end
|
223
|
-
keywords = []
|
224
|
-
d['keyword'].each { |i|
|
225
|
-
keywords << i
|
226
|
-
}
|
227
|
-
if !keywords.empty?
|
228
|
-
o['dc.subject'] = keywords
|
229
|
-
end
|
230
|
-
if !f['license']['name'].empty?
|
231
|
-
o['dcterms.license'] = f['license']['name']
|
232
|
-
end
|
233
|
-
# o['dc.format'] = f['mime']
|
234
|
-
|
235
|
-
related = []
|
236
|
-
publications = d['publication']
|
237
|
-
publications.each do |i|
|
238
|
-
pub = Puree::Publication.new base_url: @base_url,
|
239
|
-
username: @username,
|
240
|
-
password: @password,
|
241
|
-
basic_auth: @basic_auth
|
242
|
-
pub.find uuid: i['uuid']
|
243
|
-
doi = pub.doi
|
244
|
-
if doi
|
245
|
-
related << doi
|
246
|
-
end
|
247
|
-
end
|
248
|
-
if !related.empty?
|
249
|
-
o['dc.relation'] = related
|
250
|
-
end
|
251
|
-
|
252
|
-
o
|
253
|
-
end
|
254
|
-
|
255
|
-
end
|
256
|
-
|
257
|
-
end
|
258
|
-
|
259
|
-
end
|