preservation 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37837b8796fc9b31c0c135b966de5837fd99aba9
|
4
|
+
data.tar.gz: eb8bd1f506a30b8035b18d541c9fabc84c67fe29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d1f1e0ea0408952329524d1653da892eeb61f1ab1a3ef4847ee4520b0255c464f7a18c3969f5f3c96b5e0ae69d1b05ae22ea63eb91ced76c75288d7527cd52c
|
7
|
+
data.tar.gz: b559abab2a467dacb0cb8afcd531b9cea7c8415ce3b08c7b653d4711631bf4c30ef40d992e806150b83c17664d3fbe7ec734cc80de2011e3032021643924d06f
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
|
5
5
|
## Unreleased
|
6
6
|
|
7
|
+
## 0.2.0 - 2016-09-17
|
8
|
+
### Changed
|
9
|
+
- Singular uuid rather than an array of uuids as parameter for transfer preparation.
|
10
|
+
- Modules, classes and API.
|
11
|
+
|
7
12
|
## 0.1.0 - 2016-09-13
|
8
13
|
### Added
|
9
14
|
- Transfer preparation.
|
data/PITCHME.md
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
#HSLIDE
|
2
2
|
|
3
3
|
## Rationale
|
4
|
-
Archivematica's
|
4
|
+
Archivematica's <a href="https://github.com/artefactual/automation-tools" target="_blank">Automation Tools</a>
|
5
5
|
work with files and descriptive metadata which must be provided in a certain way.
|
6
6
|
|
7
|
-
|
8
7
|
#HSLIDE
|
9
8
|
|
10
9
|
## Preservation: a way to manage ingest
|
@@ -17,32 +16,33 @@ work with files and descriptive metadata which must be provided in a certain way
|
|
17
16
|
|
18
17
|
#HSLIDE
|
19
18
|
|
20
|
-
## Preservation:
|
19
|
+
## Preservation: transfer
|
20
|
+
|
21
|
+
Create a transfer using the Pure Research Information System as a data source.
|
21
22
|
|
22
|
-
Create an ingestor for Pure.
|
23
23
|
```ruby
|
24
|
-
|
24
|
+
transfer = Preservation::Transfer::Pure.new base_url: ENV['PURE_BASE_URL'],
|
25
|
+
username: ENV['PURE_USERNAME'],
|
26
|
+
password: ENV['PURE_PASSWORD'],
|
27
|
+
basic_auth: true
|
25
28
|
```
|
26
29
|
|
27
|
-
For
|
28
|
-
ingest path and populate it with the files and JSON description file.
|
30
|
+
For a Pure dataset, if necessary, fetch the metadata, prepare
|
31
|
+
a directory in the ingest path and populate it with the files and JSON description file.
|
29
32
|
|
30
33
|
```ruby
|
31
|
-
|
32
|
-
dir_name_scheme: :doi_short,
|
33
|
-
delay: 0
|
34
|
+
transfer.prepare_dataset uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
34
35
|
```
|
35
36
|
|
36
|
-
Free up disk space for completed transfers.
|
37
|
+
Free up disk space for completed transfers. Can be done at any time.
|
37
38
|
|
38
39
|
```ruby
|
39
|
-
|
40
|
+
Preservation::Storage.cleanup
|
40
41
|
```
|
41
42
|
|
42
43
|
#VSLIDE
|
43
44
|
|
44
45
|
## Transfer-ready directory
|
45
|
-
|
46
46
|
```
|
47
47
|
.
|
48
48
|
├── 10.17635-lancaster-researchdata-6
|
@@ -86,7 +86,7 @@ ingest.cleanup_preserved
|
|
86
86
|
"dcterms.license": "CC BY",
|
87
87
|
"related": [
|
88
88
|
{
|
89
|
-
"dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental
|
89
|
+
"dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental and sociological factors, not special attributes of the currently circulating strain of the virus",
|
90
90
|
"type": "Journal article",
|
91
91
|
"dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
|
92
92
|
},
|
@@ -107,8 +107,7 @@ ingest.cleanup_preserved
|
|
107
107
|
Can be used for scheduled monitoring of transfers.
|
108
108
|
|
109
109
|
```ruby
|
110
|
-
|
111
|
-
report.transfer_exception
|
110
|
+
Preservation::Report::Transfer.exception
|
112
111
|
```
|
113
112
|
|
114
113
|
#HSLIDE
|
@@ -123,4 +122,6 @@ report.transfer_exception
|
|
123
122
|
|
124
123
|
## Documentation
|
125
124
|
|
126
|
-
<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
|
125
|
+
<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
|
126
|
+
|
127
|
+
<a href="https://aalbinclark.gitbooks.io/preservation" target="_blank">Detailed usage in GitBook</a>
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
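# Preservation [![Gem Version](https://badge.fury.io/rb/preservation.svg)](https://badge.fury.io/rb/preservation) [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)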
|
2
2
|
|
3
|
-
|
3
|
+
Extraction and Transformation for Loading by Archivematica's Automation Tools.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -19,70 +19,48 @@ Or install it yourself as:
|
|
19
19
|
## Usage
|
20
20
|
|
21
21
|
### Configuration
|
22
|
-
Configure Preservation. If ```log_path``` is omitted, logging (standard library)
|
22
|
+
Configure Preservation. If ```log_path``` is omitted, logging (standard library) writes to STDOUT.
|
23
23
|
|
24
24
|
```ruby
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
end
|
30
|
-
```
|
31
|
-
|
32
|
-
Configure data source.
|
33
|
-
|
34
|
-
```ruby
|
35
|
-
Puree.configure do |config|
|
36
|
-
config.base_url = ENV['PURE_BASE_URL']
|
37
|
-
config.username = ENV['PURE_USERNAME']
|
38
|
-
config.password = ENV['PURE_PASSWORD']
|
39
|
-
config.basic_auth = true
|
25
|
+
Preservation.configure do |config|
|
26
|
+
config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
|
27
|
+
config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
|
28
|
+
config.log_path = ENV['PRESERVATION_LOG_PATH']
|
40
29
|
end
|
41
30
|
```
|
42
31
|
|
43
|
-
### Transfers
|
44
|
-
|
45
|
-
Get some dataset UUIDs for preservation.
|
46
32
|
|
47
|
-
|
48
|
-
|
49
|
-
minimal_metadata = c.find limit: 2,
|
50
|
-
offset: 10,
|
51
|
-
full: false
|
52
|
-
uuids = []
|
53
|
-
minimal_metadata.each do |i|
|
54
|
-
uuids << i['uuid']
|
55
|
-
end
|
56
|
-
```
|
57
|
-
|
58
|
-
Create an ingestor for Pure.
|
33
|
+
### Transfer
|
34
|
+
Create a transfer using the Pure Research Information System as a data source.
|
59
35
|
|
60
36
|
```ruby
|
61
|
-
|
37
|
+
transfer = Preservation::Transfer::Pure.new base_url: ENV['PURE_BASE_URL'],
|
38
|
+
username: ENV['PURE_USERNAME'],
|
39
|
+
password: ENV['PURE_PASSWORD'],
|
40
|
+
basic_auth: true
|
62
41
|
```
|
63
42
|
|
64
|
-
For
|
43
|
+
For a Pure dataset, if necessary, fetch the metadata, prepare
|
65
44
|
a directory in the ingest path and populate it with the files and JSON description file.
|
66
45
|
|
67
46
|
```ruby
|
68
|
-
|
69
|
-
dir_name_scheme: :doi_short,
|
70
|
-
delay: 0
|
47
|
+
transfer.prepare_dataset uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
71
48
|
```
|
72
49
|
|
73
|
-
Free up disk space for completed transfers.
|
50
|
+
Free up disk space for completed transfers. Can be done at any time.
|
74
51
|
|
75
52
|
```ruby
|
76
|
-
|
53
|
+
Preservation::Storage.cleanup
|
77
54
|
```
|
78
55
|
|
79
|
-
###
|
56
|
+
### Report
|
80
57
|
Can be used for scheduled monitoring of transfers.
|
81
58
|
|
82
59
|
```ruby
|
83
|
-
|
84
|
-
report.transfer_exception
|
60
|
+
Preservation::Report::Transfer.exception
|
85
61
|
```
|
86
62
|
|
87
63
|
## Documentation
|
88
|
-
[API in YARD](http://www.rubydoc.info/gems/preservation)
|
64
|
+
[API in YARD](http://www.rubydoc.info/gems/preservation)
|
65
|
+
|
66
|
+
[Detailed usage in GitBook](https://aalbinclark.gitbooks.io/preservation)
|
data/lib/preservation.rb
CHANGED
@@ -5,10 +5,14 @@ require 'logger'
|
|
5
5
|
require 'puree'
|
6
6
|
require 'sqlite3'
|
7
7
|
require 'preservation/configuration'
|
8
|
-
require 'preservation/
|
8
|
+
require 'preservation/report/database'
|
9
|
+
require 'preservation/report/transfer'
|
10
|
+
require 'preservation/conversion'
|
9
11
|
require 'preservation/ingest'
|
10
|
-
require 'preservation/
|
11
|
-
require 'preservation/
|
12
|
+
require 'preservation/builder'
|
13
|
+
require 'preservation/storage'
|
14
|
+
require 'preservation/temporal'
|
15
|
+
require 'preservation/transfer/pure'
|
12
16
|
require 'preservation/version'
|
13
17
|
|
14
18
|
# Top level namespace
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Preservation

  # Builder
  #
  # Stateless helpers that assemble the strings used while preparing a
  # transfer: a wget command line and a transfer directory name.
  module Builder

    # Build wget string
    #
    # Every caller-supplied value is shell-escaped so that characters such as
    # spaces, quotes, `&`, `?` or `$` in a credential or URL reach wget as part
    # of a single argument instead of being interpreted by the shell.
    #
    # @param username [String]
    # @param password [String]
    # @param file_url [String]
    # @return [String] command line of the form
    #   "wget --user U --password P URL --no-check-certificate"
    def self.build_wget(username, password, file_url)
      # Required here so this method works regardless of which files the
      # caller has already loaded.
      require 'shellwords'

      [
        'wget',
        '--user', Shellwords.escape(username),
        '--password', Shellwords.escape(password),
        Shellwords.escape(file_url),
        '--no-check-certificate'
      ].join(' ')
    end

    # Build directory name
    #
    # Composite schemes join their parts with '-----'. Spaces and slashes in
    # the title, and slashes in the DOI, are replaced with '-'.
    #
    # @param metadata_record [Hash] read via the 'doi', 'uuid' and 'title' keys
    # @param directory_name_scheme [Symbol] one of :uuid_title, :title_uuid,
    #   :date_uuid_title, :date_title_uuid, :date_time_uuid, :date_time_title,
    #   :date_time_uuid_title, :date_time_title_uuid, :uuid, :doi, :doi_short;
    #   any other value falls back to the uuid
    # @return [String] directory name; '' for the DOI schemes when the record
    #   has no DOI
    def self.build_directory_name(metadata_record, directory_name_scheme)
      # to_s keeps a missing (nil) doi/title from raising NoMethodError.
      doi = metadata_record['doi'].to_s
      uuid = metadata_record['uuid']
      title = metadata_record['title'].to_s.strip.gsub(' ', '-').gsub('/', '-')
      now = Time.new
      date = now.strftime("%Y-%m-%d")
      time = now.strftime("%H:%M:%S")
      join_str = '-----'

      case directory_name_scheme
      when :uuid_title
        [uuid, title].join(join_str)
      when :title_uuid
        [title, uuid].join(join_str)
      when :date_uuid_title
        [date, uuid, title].join(join_str)
      when :date_title_uuid
        [date, title, uuid].join(join_str)
      when :date_time_uuid
        [date, time, uuid].join(join_str)
      when :date_time_title
        [date, time, title].join(join_str)
      when :date_time_uuid_title
        [date, time, uuid, title].join(join_str)
      when :date_time_title_uuid
        [date, time, title, uuid].join(join_str)
      when :uuid
        uuid
      when :doi
        return '' if doi.empty?

        doi.gsub('/', '-')
      when :doi_short
        return '' if doi.empty?

        # Non-bang gsub: gsub! returns nil when nothing is replaced, which
        # would make this method return nil for a DOI without a slash.
        doi.gsub('http://dx.doi.org/', '').gsub('/', '-')
      else
        uuid
      end
    end

  end

end
|
@@ -1,15 +1,20 @@
|
|
1
1
|
module Preservation
|
2
2
|
|
3
|
-
#
|
3
|
+
# Conversion
|
4
4
|
#
|
5
|
-
module
|
5
|
+
module Conversion
|
6
6
|
# Binary to hexadecimal
|
7
7
|
#
|
8
|
+
# @param s [String] binary string
|
9
|
+
# @return [String] hexadecimal string
|
8
10
|
def self.bin_to_hex(s)
  # Render every byte as exactly two lowercase hex digits. A bare to_s(16)
  # emits a single digit for bytes below 0x10, so the result could not be
  # split back into byte pairs by hex_to_bin.
  s.each_byte.map { |b| format('%02x', b) }.join
end
|
11
13
|
|
12
14
|
# Hexadecimal to binary
|
15
|
+
#
|
16
|
+
# @param s [String] hexadecimal string
|
17
|
+
# @return [String] binary string
|
13
18
|
def self.hex_to_bin(s)
  # Decode two hex digits at a time (a trailing odd digit is ignored).
  # pack('C*') always yields a binary (ASCII-8BIT) string, whereas joining
  # Integer#chr results is tagged US-ASCII when every byte is below 0x80.
  # NOTE(review): a consistent binary encoding presumably matters to callers
  # that bind the value as a BLOB - confirm against the database layer.
  s.scan(/../).map(&:hex).pack('C*')
end
|
data/lib/preservation/ingest.rb
CHANGED
@@ -1,57 +1,21 @@
|
|
1
1
|
module Preservation
|
2
2
|
|
3
|
-
#
|
3
|
+
# Ingest
|
4
4
|
#
|
5
5
|
class Ingest
|
6
6
|
|
7
7
|
attr_reader :logger
|
8
8
|
|
9
9
|
def initialize
|
10
|
-
check_ingest_path
|
11
10
|
setup_logger
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
# Free up disk space for completed transfers
|
16
|
-
#
|
17
|
-
def cleanup_preserved
|
18
|
-
preserved = get_preserved
|
19
|
-
if !preserved.nil? && !preserved.empty?
|
20
|
-
preserved.each do |i|
|
21
|
-
# skip anything that has a different owner to script
|
22
|
-
if File.stat(i).grpowned?
|
23
|
-
FileUtils.remove_dir i
|
24
|
-
@logger.info 'Deleted ' + i
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
11
|
+
check_ingest_path
|
12
|
+
end
|
30
13
|
|
31
14
|
private
|
32
15
|
|
33
|
-
def build_wget(username, password, file_url)
|
34
|
-
# construct wget command with parameters
|
35
|
-
wget_str = ''
|
36
|
-
wget_str << 'wget'
|
37
|
-
wget_str << ' '
|
38
|
-
wget_str << '--user'
|
39
|
-
wget_str << ' '
|
40
|
-
wget_str << username
|
41
|
-
wget_str << ' '
|
42
|
-
wget_str << '--password'
|
43
|
-
wget_str << ' '
|
44
|
-
wget_str << '"' + password + '"'
|
45
|
-
wget_str << ' '
|
46
|
-
wget_str << file_url
|
47
|
-
wget_str << ' '
|
48
|
-
wget_str << '--no-check-certificate'
|
49
|
-
wget_str
|
50
|
-
end
|
51
|
-
|
52
16
|
# Verify that an ingest path has been configured.
#
# Logs an error and terminates the process with a failure status when
# Preservation.ingest_path is nil; otherwise does nothing.
#
# @return [nil]
# @raise [SystemExit] when no ingest path is configured
def check_ingest_path
  return unless Preservation.ingest_path.nil?

  @logger.error 'Missing ingest path'
  # A bare `exit` reports success (status 0); signal the failure to the
  # calling shell/scheduler with a non-zero status instead.
  exit 1
end
|
@@ -68,95 +32,6 @@ module Preservation
|
|
68
32
|
@logger.level = Logger::INFO
|
69
33
|
end
|
70
34
|
|
71
|
-
def setup_report
|
72
|
-
if Preservation.db_path.nil?
|
73
|
-
puts 'Missing db path'
|
74
|
-
exit
|
75
|
-
else
|
76
|
-
@report = IngestReport.new
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def enough_storage_for_download?(required_bytes)
|
81
|
-
# scale up the required space using a multiplier
|
82
|
-
multiplier = 2
|
83
|
-
available = FreeDiskSpace.bytes('/')
|
84
|
-
required_bytes * multiplier < available ? true : false
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_directory_name(metadata_record, directory_name_scheme)
|
88
|
-
doi = metadata_record['doi']
|
89
|
-
uuid = metadata_record['uuid']
|
90
|
-
title = metadata_record['title'].strip.gsub(' ', '-').gsub('/', '-')
|
91
|
-
time = Time.new
|
92
|
-
date = time.strftime("%Y-%m-%d")
|
93
|
-
time = time.strftime("%H:%M:%S")
|
94
|
-
join_str = '-----'
|
95
|
-
|
96
|
-
case directory_name_scheme
|
97
|
-
when :uuid_title
|
98
|
-
[uuid, title].join(join_str)
|
99
|
-
when :title_uuid
|
100
|
-
[title, uuid].join(join_str)
|
101
|
-
when :date_uuid_title
|
102
|
-
[date, uuid, title].join(join_str)
|
103
|
-
when :date_title_uuid
|
104
|
-
[date, title, uuid].join(join_str)
|
105
|
-
when :date_time_uuid
|
106
|
-
[date, time, uuid].join(join_str)
|
107
|
-
when :date_time_title
|
108
|
-
[date, time, title].join(join_str)
|
109
|
-
when :date_time_uuid_title
|
110
|
-
[date, time, uuid, title].join(join_str)
|
111
|
-
when :date_time_title_uuid
|
112
|
-
[date, time, title, uuid].join(join_str)
|
113
|
-
when :uuid
|
114
|
-
uuid
|
115
|
-
when :doi
|
116
|
-
if doi.empty?
|
117
|
-
return ''
|
118
|
-
end
|
119
|
-
doi.gsub('/', '-')
|
120
|
-
when :doi_short
|
121
|
-
if doi.empty?
|
122
|
-
return ''
|
123
|
-
end
|
124
|
-
doi_short_to_remove = 'http://dx.doi.org/'
|
125
|
-
short = doi.gsub(doi_short_to_remove, '')
|
126
|
-
short.gsub!('/', '-')
|
127
|
-
else
|
128
|
-
uuid
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
# time_to_preserve?
|
133
|
-
#
|
134
|
-
# @param start_utc [String]
|
135
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
136
|
-
# @return [Boolean]
|
137
|
-
def time_to_preserve?(start_utc, delay)
|
138
|
-
now = DateTime.now
|
139
|
-
modified_datetime = DateTime.parse(start_utc)
|
140
|
-
days_since_modified = (now - modified_datetime).to_i # result in days
|
141
|
-
days_since_modified >= delay ? true : false
|
142
|
-
end
|
143
|
-
|
144
|
-
# # Collect all paths from DB where preservation has been done
|
145
|
-
# # @return [Array<String>]
|
146
|
-
def get_preserved
|
147
|
-
ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
|
148
|
-
status_presence: true)
|
149
|
-
preserved = []
|
150
|
-
ingest_complete.each do |i|
|
151
|
-
dir_path = Preservation.ingest_path + '/' + i['path']
|
152
|
-
if File.exists?(dir_path)
|
153
|
-
preserved << dir_path
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
preserved
|
158
|
-
end
|
159
|
-
|
160
35
|
end
|
161
36
|
|
162
37
|
end
|