preservation 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/PITCHME.md +18 -17
- data/README.md +21 -43
- data/lib/preservation.rb +7 -3
- data/lib/preservation/builder.rb +84 -0
- data/lib/preservation/{string_util.rb → conversion.rb} +7 -2
- data/lib/preservation/ingest.rb +4 -129
- data/lib/preservation/report/database.rb +26 -0
- data/lib/preservation/report/transfer.rb +166 -0
- data/lib/preservation/storage.rb +50 -0
- data/lib/preservation/temporal.rb +21 -0
- data/lib/preservation/transfer/pure.rb +215 -0
- data/lib/preservation/version.rb +1 -1
- data/preservation.gemspec +3 -3
- metadata +13 -9
- data/lib/preservation/ingest_report.rb +0 -172
- data/lib/preservation/pure_ingest.rb +0 -188
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37837b8796fc9b31c0c135b966de5837fd99aba9
|
4
|
+
data.tar.gz: eb8bd1f506a30b8035b18d541c9fabc84c67fe29
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5d1f1e0ea0408952329524d1653da892eeb61f1ab1a3ef4847ee4520b0255c464f7a18c3969f5f3c96b5e0ae69d1b05ae22ea63eb91ced76c75288d7527cd52c
|
7
|
+
data.tar.gz: b559abab2a467dacb0cb8afcd531b9cea7c8415ce3b08c7b653d4711631bf4c30ef40d992e806150b83c17664d3fbe7ec734cc80de2011e3032021643924d06f
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,11 @@ This project adheres to [Semantic Versioning](http://semver.org/).
|
|
4
4
|
|
5
5
|
## Unreleased
|
6
6
|
|
7
|
+
## 0.2.0 - 2016-09-17
|
8
|
+
### Changed
|
9
|
+
- Singular uuid rather than an array of uuids as parameter for transfer preparation.
|
10
|
+
- Modules, classes and API.
|
11
|
+
|
7
12
|
## 0.1.0 - 2016-09-13
|
8
13
|
### Added
|
9
14
|
- Transfer preparation.
|
data/PITCHME.md
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
#HSLIDE
|
2
2
|
|
3
3
|
## Rationale
|
4
|
-
Archivematica's
|
4
|
+
Archivematica's <a href="https://github.com/artefactual/automation-tools" target="_blank">Automation Tools</a>
|
5
5
|
work with files and descriptive metadata which must be provided in a certain way.
|
6
6
|
|
7
|
-
|
8
7
|
#HSLIDE
|
9
8
|
|
10
9
|
## Preservation: a way to manage ingest
|
@@ -17,32 +16,33 @@ work with files and descriptive metadata which must be provided in a certain way
|
|
17
16
|
|
18
17
|
#HSLIDE
|
19
18
|
|
20
|
-
## Preservation:
|
19
|
+
## Preservation: transfer
|
20
|
+
|
21
|
+
Create a transfer using the Pure Research Information System as a data source.
|
21
22
|
|
22
|
-
Create an ingestor for Pure.
|
23
23
|
```ruby
|
24
|
-
|
24
|
+
transfer = Preservation::Transfer::Pure.new base_url: ENV['PURE_BASE_URL'],
|
25
|
+
username: ENV['PURE_USERNAME'],
|
26
|
+
password: ENV['PURE_PASSWORD'],
|
27
|
+
basic_auth: true
|
25
28
|
```
|
26
29
|
|
27
|
-
For
|
28
|
-
ingest path and populate it with the files and JSON description file.
|
30
|
+
For a Pure dataset, if necessary, fetch the metadata, prepare
|
31
|
+
a directory in the ingest path and populate it with the files and JSON description file.
|
29
32
|
|
30
33
|
```ruby
|
31
|
-
|
32
|
-
dir_name_scheme: :doi_short,
|
33
|
-
delay: 0
|
34
|
+
transfer.prepare_dataset uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
34
35
|
```
|
35
36
|
|
36
|
-
Free up disk space for completed transfers.
|
37
|
+
Free up disk space for completed transfers. Can be done at any time.
|
37
38
|
|
38
39
|
```ruby
|
39
|
-
|
40
|
+
Preservation::Storage.cleanup
|
40
41
|
```
|
41
42
|
|
42
43
|
#VSLIDE
|
43
44
|
|
44
45
|
## Transfer-ready directory
|
45
|
-
|
46
46
|
```
|
47
47
|
.
|
48
48
|
├── 10.17635-lancaster-researchdata-6
|
@@ -86,7 +86,7 @@ ingest.cleanup_preserved
|
|
86
86
|
"dcterms.license": "CC BY",
|
87
87
|
"related": [
|
88
88
|
{
|
89
|
-
"dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental
|
89
|
+
"dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental and sociological factors, not special attributes of the currently circulating strain of the virus",
|
90
90
|
"type": "Journal article",
|
91
91
|
"dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
|
92
92
|
},
|
@@ -107,8 +107,7 @@ ingest.cleanup_preserved
|
|
107
107
|
Can be used for scheduled monitoring of transfers.
|
108
108
|
|
109
109
|
```ruby
|
110
|
-
|
111
|
-
report.transfer_exception
|
110
|
+
Preservation::Report::Transfer.exception
|
112
111
|
```
|
113
112
|
|
114
113
|
#HSLIDE
|
@@ -123,4 +122,6 @@ report.transfer_exception
|
|
123
122
|
|
124
123
|
## Documentation
|
125
124
|
|
126
|
-
<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
|
125
|
+
<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
|
126
|
+
|
127
|
+
<a href="https://aalbinclark.gitbooks.io/preservation" target="_blank">Detailed usage in GitBook</a>
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Preservation [![Gem Version](https://badge.fury.io/rb/preservation.svg)](https://badge.fury.io/rb/preservation) [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)
|
2
2
|
|
3
|
-
|
3
|
+
Extraction and Transformation for Loading by Archivematica's Automation Tools.
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -19,70 +19,48 @@ Or install it yourself as:
|
|
19
19
|
## Usage
|
20
20
|
|
21
21
|
### Configuration
|
22
|
-
Configure Preservation. If ```log_path``` is omitted, logging (standard library)
|
22
|
+
Configure Preservation. If ```log_path``` is omitted, logging (standard library) writes to STDOUT.
|
23
23
|
|
24
24
|
```ruby
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
end
|
30
|
-
```
|
31
|
-
|
32
|
-
Configure data source.
|
33
|
-
|
34
|
-
```ruby
|
35
|
-
Puree.configure do |config|
|
36
|
-
config.base_url = ENV['PURE_BASE_URL']
|
37
|
-
config.username = ENV['PURE_USERNAME']
|
38
|
-
config.password = ENV['PURE_PASSWORD']
|
39
|
-
config.basic_auth = true
|
25
|
+
Preservation.configure do |config|
|
26
|
+
config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
|
27
|
+
config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
|
28
|
+
config.log_path = ENV['PRESERVATION_LOG_PATH']
|
40
29
|
end
|
41
30
|
```
|
42
31
|
|
43
|
-
### Transfers
|
44
|
-
|
45
|
-
Get some dataset UUIDs for preservation.
|
46
32
|
|
47
|
-
|
48
|
-
|
49
|
-
minimal_metadata = c.find limit: 2,
|
50
|
-
offset: 10,
|
51
|
-
full: false
|
52
|
-
uuids = []
|
53
|
-
minimal_metadata.each do |i|
|
54
|
-
uuids << i['uuid']
|
55
|
-
end
|
56
|
-
```
|
57
|
-
|
58
|
-
Create an ingestor for Pure.
|
33
|
+
### Transfer
|
34
|
+
Create a transfer using the Pure Research Information System as a data source.
|
59
35
|
|
60
36
|
```ruby
|
61
|
-
|
37
|
+
transfer = Preservation::Transfer::Pure.new base_url: ENV['PURE_BASE_URL'],
|
38
|
+
username: ENV['PURE_USERNAME'],
|
39
|
+
password: ENV['PURE_PASSWORD'],
|
40
|
+
basic_auth: true
|
62
41
|
```
|
63
42
|
|
64
|
-
For
|
43
|
+
For a Pure dataset, if necessary, fetch the metadata, prepare
|
65
44
|
a directory in the ingest path and populate it with the files and JSON description file.
|
66
45
|
|
67
46
|
```ruby
|
68
|
-
|
69
|
-
dir_name_scheme: :doi_short,
|
70
|
-
delay: 0
|
47
|
+
transfer.prepare_dataset uuid: 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
|
71
48
|
```
|
72
49
|
|
73
|
-
Free up disk space for completed transfers.
|
50
|
+
Free up disk space for completed transfers. Can be done at any time.
|
74
51
|
|
75
52
|
```ruby
|
76
|
-
|
53
|
+
Preservation::Storage.cleanup
|
77
54
|
```
|
78
55
|
|
79
|
-
###
|
56
|
+
### Report
|
80
57
|
Can be used for scheduled monitoring of transfers.
|
81
58
|
|
82
59
|
```ruby
|
83
|
-
|
84
|
-
report.transfer_exception
|
60
|
+
Preservation::Report::Transfer.exception
|
85
61
|
```
|
86
62
|
|
87
63
|
## Documentation
|
88
|
-
[API in YARD](http://www.rubydoc.info/gems/preservation)
|
64
|
+
[API in YARD](http://www.rubydoc.info/gems/preservation)
|
65
|
+
|
66
|
+
[Detailed usage in GitBook](https://aalbinclark.gitbooks.io/preservation)
|
data/lib/preservation.rb
CHANGED
@@ -5,10 +5,14 @@ require 'logger'
|
|
5
5
|
require 'puree'
|
6
6
|
require 'sqlite3'
|
7
7
|
require 'preservation/configuration'
|
8
|
-
require 'preservation/
|
8
|
+
require 'preservation/report/database'
|
9
|
+
require 'preservation/report/transfer'
|
10
|
+
require 'preservation/conversion'
|
9
11
|
require 'preservation/ingest'
|
10
|
-
require 'preservation/
|
11
|
-
require 'preservation/
|
12
|
+
require 'preservation/builder'
|
13
|
+
require 'preservation/storage'
|
14
|
+
require 'preservation/temporal'
|
15
|
+
require 'preservation/transfer/pure'
|
12
16
|
require 'preservation/version'
|
13
17
|
|
14
18
|
# Top level namespace
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'shellwords'

module Preservation

  # Builder
  #
  # Stateless helpers that assemble the download command and the directory
  # name used when preparing a transfer.
  module Builder

    # Build wget string
    #
    # Each caller-supplied value is shell-escaped, so characters such as
    # spaces, quotes, `$`, `&` or `?` in the credentials or URL are passed to
    # wget literally instead of being interpreted by the shell.
    #
    # @param username [String]
    # @param password [String]
    # @param file_url [String]
    # @return [String] command line suitable for passing to a shell
    def self.build_wget(username, password, file_url)
      [
        'wget',
        '--user', Shellwords.escape(username),
        '--password', Shellwords.escape(password),
        Shellwords.escape(file_url),
        '--no-check-certificate'
      ].join(' ')
    end

    # Build directory name
    #
    # Composite schemes (e.g. :date_time_uuid_title) join the named parts, in
    # the order given by the scheme, with '-----'. Unrecognised schemes, and
    # :uuid itself, yield the uuid.
    #
    # @param metadata_record [Hash] read via the 'doi', 'uuid' and 'title' keys
    # @param directory_name_scheme [Symbol]
    # @return [String] empty for :doi / :doi_short when the record has no DOI
    def self.build_directory_name(metadata_record, directory_name_scheme)
      # to_s guards against records where the key is missing (nil).
      doi = metadata_record['doi'].to_s
      uuid = metadata_record['uuid']
      title = metadata_record['title'].to_s.strip.gsub(' ', '-').gsub('/', '-')
      now = Time.new
      parts = {
        uuid: uuid,
        title: title,
        date: now.strftime("%Y-%m-%d"),
        time: now.strftime("%H:%M:%S")
      }
      join_str = '-----'

      case directory_name_scheme
      when :uuid_title, :title_uuid,
           :date_uuid_title, :date_title_uuid,
           :date_time_uuid, :date_time_title,
           :date_time_uuid_title, :date_time_title_uuid
        # The scheme name itself lists the parts in output order.
        directory_name_scheme.to_s.split('_').map { |key| parts[key.to_sym] }.join(join_str)
      when :doi
        doi.gsub('/', '-')
      when :doi_short
        # Non-destructive gsub: gsub! returns nil when nothing is replaced,
        # which previously leaked nil for a DOI without a '/'.
        doi.gsub('http://dx.doi.org/', '').gsub('/', '-')
      else
        uuid
      end
    end

  end

end
|
@@ -1,15 +1,20 @@
|
|
1
1
|
module Preservation
|
2
2
|
|
3
|
-
#
|
3
|
+
# Conversion
|
4
4
|
#
|
5
|
-
module
|
5
|
+
module Conversion
|
6
6
|
# Binary to hexadecimal
|
7
7
|
#
|
8
|
+
# @param s [String] binary string
|
9
|
+
# @return [String] hexadecimal string
|
8
10
|
def self.bin_to_hex(s)
  # Each byte must occupy exactly two hex digits: without zero-padding a byte
  # below 0x10 would emit a single digit and the result could no longer be
  # split back into bytes by hex_to_bin.
  s.each_byte.map { |b| format('%02x', b) }.join
end
|
11
13
|
|
12
14
|
# Hexadecimal to binary
|
15
|
+
#
|
16
|
+
# @param s [String] hexadecimal string
|
17
|
+
# @return [String] binary string
|
13
18
|
def self.hex_to_bin(s)
  # Decode two hex digits at a time into one byte. Integer#chr yields a
  # US-ASCII string for bytes below 0x80 and a binary one otherwise, so the
  # joined result is tagged as binary explicitly to give callers the same
  # encoding regardless of content.
  s.scan(/../).map { |x| x.hex.chr }.join.force_encoding(Encoding::BINARY)
end
|
data/lib/preservation/ingest.rb
CHANGED
@@ -1,57 +1,21 @@
|
|
1
1
|
module Preservation
|
2
2
|
|
3
|
-
#
|
3
|
+
# Ingest
|
4
4
|
#
|
5
5
|
class Ingest
|
6
6
|
|
7
7
|
attr_reader :logger
|
8
8
|
|
9
9
|
def initialize
|
10
|
-
check_ingest_path
|
11
10
|
setup_logger
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
# Free up disk space for completed transfers
|
16
|
-
#
|
17
|
-
def cleanup_preserved
|
18
|
-
preserved = get_preserved
|
19
|
-
if !preserved.nil? && !preserved.empty?
|
20
|
-
preserved.each do |i|
|
21
|
-
# skip anything that has a different owner to script
|
22
|
-
if File.stat(i).grpowned?
|
23
|
-
FileUtils.remove_dir i
|
24
|
-
@logger.info 'Deleted ' + i
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
11
|
+
check_ingest_path
|
12
|
+
end
|
30
13
|
|
31
14
|
private
|
32
15
|
|
33
|
-
def build_wget(username, password, file_url)
|
34
|
-
# construct wget command with parameters
|
35
|
-
wget_str = ''
|
36
|
-
wget_str << 'wget'
|
37
|
-
wget_str << ' '
|
38
|
-
wget_str << '--user'
|
39
|
-
wget_str << ' '
|
40
|
-
wget_str << username
|
41
|
-
wget_str << ' '
|
42
|
-
wget_str << '--password'
|
43
|
-
wget_str << ' '
|
44
|
-
wget_str << '"' + password + '"'
|
45
|
-
wget_str << ' '
|
46
|
-
wget_str << file_url
|
47
|
-
wget_str << ' '
|
48
|
-
wget_str << '--no-check-certificate'
|
49
|
-
wget_str
|
50
|
-
end
|
51
|
-
|
52
16
|
# Abort the process when no ingest path has been configured.
#
# Reads Preservation.ingest_path and logs through @logger (assigned elsewhere
# in the class before this is called - verify the call order in initialize).
def check_ingest_path
  return unless Preservation.ingest_path.nil?

  @logger.error 'Missing ingest path'
  # Exit with a failure status: a bare `exit` reports success (0), which hides
  # the misconfiguration from cron jobs and calling scripts.
  exit 1
end
|
@@ -68,95 +32,6 @@ module Preservation
|
|
68
32
|
@logger.level = Logger::INFO
|
69
33
|
end
|
70
34
|
|
71
|
-
def setup_report
|
72
|
-
if Preservation.db_path.nil?
|
73
|
-
puts 'Missing db path'
|
74
|
-
exit
|
75
|
-
else
|
76
|
-
@report = IngestReport.new
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
def enough_storage_for_download?(required_bytes)
|
81
|
-
# scale up the required space using a multiplier
|
82
|
-
multiplier = 2
|
83
|
-
available = FreeDiskSpace.bytes('/')
|
84
|
-
required_bytes * multiplier < available ? true : false
|
85
|
-
end
|
86
|
-
|
87
|
-
def build_directory_name(metadata_record, directory_name_scheme)
|
88
|
-
doi = metadata_record['doi']
|
89
|
-
uuid = metadata_record['uuid']
|
90
|
-
title = metadata_record['title'].strip.gsub(' ', '-').gsub('/', '-')
|
91
|
-
time = Time.new
|
92
|
-
date = time.strftime("%Y-%m-%d")
|
93
|
-
time = time.strftime("%H:%M:%S")
|
94
|
-
join_str = '-----'
|
95
|
-
|
96
|
-
case directory_name_scheme
|
97
|
-
when :uuid_title
|
98
|
-
[uuid, title].join(join_str)
|
99
|
-
when :title_uuid
|
100
|
-
[title, uuid].join(join_str)
|
101
|
-
when :date_uuid_title
|
102
|
-
[date, uuid, title].join(join_str)
|
103
|
-
when :date_title_uuid
|
104
|
-
[date, title, uuid].join(join_str)
|
105
|
-
when :date_time_uuid
|
106
|
-
[date, time, uuid].join(join_str)
|
107
|
-
when :date_time_title
|
108
|
-
[date, time, title].join(join_str)
|
109
|
-
when :date_time_uuid_title
|
110
|
-
[date, time, uuid, title].join(join_str)
|
111
|
-
when :date_time_title_uuid
|
112
|
-
[date, time, title, uuid].join(join_str)
|
113
|
-
when :uuid
|
114
|
-
uuid
|
115
|
-
when :doi
|
116
|
-
if doi.empty?
|
117
|
-
return ''
|
118
|
-
end
|
119
|
-
doi.gsub('/', '-')
|
120
|
-
when :doi_short
|
121
|
-
if doi.empty?
|
122
|
-
return ''
|
123
|
-
end
|
124
|
-
doi_short_to_remove = 'http://dx.doi.org/'
|
125
|
-
short = doi.gsub(doi_short_to_remove, '')
|
126
|
-
short.gsub!('/', '-')
|
127
|
-
else
|
128
|
-
uuid
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
# time_to_preserve?
|
133
|
-
#
|
134
|
-
# @param start_utc [String]
|
135
|
-
# @param delay [Integer] days to wait (after modification date) before preserving
|
136
|
-
# @return [Boolean]
|
137
|
-
def time_to_preserve?(start_utc, delay)
|
138
|
-
now = DateTime.now
|
139
|
-
modified_datetime = DateTime.parse(start_utc)
|
140
|
-
days_since_modified = (now - modified_datetime).to_i # result in days
|
141
|
-
days_since_modified >= delay ? true : false
|
142
|
-
end
|
143
|
-
|
144
|
-
# # Collect all paths from DB where preservation has been done
|
145
|
-
# # @return [Array<String>]
|
146
|
-
def get_preserved
|
147
|
-
ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
|
148
|
-
status_presence: true)
|
149
|
-
preserved = []
|
150
|
-
ingest_complete.each do |i|
|
151
|
-
dir_path = Preservation.ingest_path + '/' + i['path']
|
152
|
-
if File.exists?(dir_path)
|
153
|
-
preserved << dir_path
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
preserved
|
158
|
-
end
|
159
|
-
|
160
35
|
end
|
161
36
|
|
162
37
|
end
|