preservation 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/CHANGELOG.md +11 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/PITCHME.md +126 -0
- data/README.md +88 -0
- data/Rakefile +1 -0
- data/lib/preservation.rb +24 -0
- data/lib/preservation/configuration.rb +15 -0
- data/lib/preservation/ingest.rb +163 -0
- data/lib/preservation/ingest_report.rb +172 -0
- data/lib/preservation/pure_ingest.rb +188 -0
- data/lib/preservation/string_util.rb +19 -0
- data/lib/preservation/version.rb +5 -0
- data/preservation.gemspec +26 -0
- metadata +103 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: dfcf307b70473079a60f2801c6bd11e4f4c289d8
+  data.tar.gz: 0e13efac8904ccd96fc690644520e20f388a4fce
+SHA512:
+  metadata.gz: 939f44e9a24177232e900953f4b59b64cfe62e0bce056765444dd9a726b64b441cae7b8fc6b3cea233ef9febd732a9a27f0759992cafe4e8494213edc947fffa
+  data.tar.gz: 2a86f065c50cf43c8ea7bcaa707d278f05b7b73b2be53cbebad47e50f38ffa3e8324dc88a71ded669495e6c7f719ca7a7dbd8943da3bcf43a186f6f56df40daf
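The digests above cover the gem's metadata.gz and data.tar.gz members. A minimal verification sketch (not part of the package), assuming preservation-0.1.0.gem has been unpacked (e.g. with `tar -xf preservation-0.1.0.gem`) so that data.tar.gz sits in the current directory:

```ruby
# Check the published SHA512 digest of data.tar.gz with Ruby's standard library.
require 'digest'

expected = '2a86f065c50cf43c8ea7bcaa707d278f05b7b73b2be53cbebad47e50f38ffa3e' \
           '8324dc88a71ded669495e6c7f719ca7a7dbd8943da3bcf43a186f6f56df40daf'
actual = Digest::SHA512.file('data.tar.gz').hexdigest
puts actual == expected ? 'data.tar.gz: checksum OK' : 'data.tar.gz: checksum MISMATCH'
```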
data/.gitignore
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,11 @@
+# Change Log
+All notable changes to this project will be documented in this file.
+This project adheres to [Semantic Versioning](http://semver.org/).
+
+## Unreleased
+
+## 0.1.0 - 2016-09-13
+### Added
+- Transfer preparation.
+- Reporting from transfers database.
+- Disk space management.
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2016 Adrian Albin-Clark
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/PITCHME.md
ADDED
@@ -0,0 +1,126 @@
+#HSLIDE
+
+## Rationale
+Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools)
+work with files and descriptive metadata which must be provided in a certain way.
+
+
+#HSLIDE
+
+## Preservation: a way to manage ingest
+
+#VSLIDE
+
+- Transfer preparation.
+- Reporting from transfers database. <!-- .element: class="fragment" -->
+- Disk space management. <!-- .element: class="fragment" -->
+
+#HSLIDE
+
+## Preservation: ingest
+
+Create an ingestor for Pure.
+```ruby
+ingest = Preservation::PureIngest.new
+```
+
+For each uuid, if necessary, fetch the metadata, prepare a directory in the
+ingest path and populate it with the files and JSON description file.
+
+```ruby
+ingest.prepare_dataset uuids: uuids,
+                       dir_name_scheme: :doi_short,
+                       delay: 0
+```
+
+Free up disk space for completed transfers.
+
+```ruby
+ingest.cleanup_preserved
+```
+
+#VSLIDE
+
+## Transfer-ready directory
+
+```
+.
+├── 10.17635-lancaster-researchdata-6
+│   ├── Ebola_data_Jun15.zip
+│   └── metadata
+│       └── metadata.json
+```
+
+#VSLIDE
+
+## Transfer-ready metadata
+
+```json
+[
+  {
+    "filename": "objects/Ebola_data_Jun15.zip",
+    "dc.title": "Ebolavirus evolution 2013-2015",
+    "dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
+    "dcterms.created": "2015-06-04T16:11:34.713+01:00",
+    "dcterms.available": "2015-06-04",
+    "dc.publisher": "Lancaster University",
+    "dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
+    "dcterms.spatial": [
+      "Guinea, Sierra Leone, Liberia"
+    ],
+    "dc.creator": [
+      "Gatherer, Derek"
+    ],
+    "dc.contributor": [
+      "Robertson, David",
+      "Lovell, Simon"
+    ],
+    "dc.subject": [
+      "Ebolavirus",
+      "evolution",
+      "phylogenetics",
+      "virulence",
+      "Filoviridae",
+      "positive selection"
+    ],
+    "dcterms.license": "CC BY",
+    "related": [
+      {
+        "dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental an$
+        "type": "Journal article",
+        "dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
+      },
+      {
+        "dc.title": "The 2014 Ebola virus disease outbreak in West Africa",
+        "type": "Journal article",
+        "dc.identifier": "http://dx.doi.org/10.1099/vir.0.067199-0"
+      }
+    ]
+  }
+]
+```
+
+#HSLIDE
+
+## Preservation: reporting
+
+Can be used for scheduled monitoring of transfers.
+
+```ruby
+report = Preservation::IngestReport.new
+report.transfer_exception
+```
+
+#HSLIDE
+
+## Location
+
+<a href="https://rubygems.org/gems/preservation" target="_blank">RubyGems</a>
+
+<a href="https://github.com/lulibrary/preservation" target="_blank">GitHub</a>
+
+#HSLIDE
+
+## Documentation
+
+<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
data/README.md
ADDED
@@ -0,0 +1,88 @@
+# Preservation [](https://badge.fury.io/rb/preservation) [](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)
+
+Ingest management for Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools).
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'preservation'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install preservation
+
+## Usage
+
+### Configuration
+Configure Preservation. If ```log_path``` is omitted, logging (standard library) redirects to STDOUT.
+
+```ruby
+Preservation.configure do |config|
+  config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
+  config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
+  config.log_path = ENV['PRESERVATION_LOG_PATH']
+end
+```
+
+Configure data source.
+
+```ruby
+Puree.configure do |config|
+  config.base_url = ENV['PURE_BASE_URL']
+  config.username = ENV['PURE_USERNAME']
+  config.password = ENV['PURE_PASSWORD']
+  config.basic_auth = true
+end
+```
+
+### Transfers
+
+Get some dataset UUIDs for preservation.
+
+```ruby
+c = Puree::Collection.new resource: :dataset
+minimal_metadata = c.find limit: 2,
+                          offset: 10,
+                          full: false
+uuids = []
+minimal_metadata.each do |i|
+  uuids << i['uuid']
+end
+```
+
+Create an ingestor for Pure.
+
+```ruby
+ingest = Preservation::PureIngest.new
+```
+
+For each uuid, if necessary, fetch the metadata, prepare
+a directory in the ingest path and populate it with the files and JSON description file.
+
+```ruby
+ingest.prepare_dataset uuids: uuids,
+                       dir_name_scheme: :doi_short,
+                       delay: 0
+```
+
+Free up disk space for completed transfers.
+
+```ruby
+ingest.cleanup_preserved
+```
+
+### Reporting
+Can be used for scheduled monitoring of transfers.
+
+```ruby
+report = Preservation::IngestReport.new
+report.transfer_exception
+```
+
+## Documentation
+[API in YARD](http://www.rubydoc.info/gems/preservation)
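Taken together, the README snippets above form one short script. A minimal end-to-end sketch, assuming the environment variables named above are set and the gem is installed; nothing here goes beyond what the README itself shows:

```ruby
# End-to-end sketch assembled from the README: configure, prepare transfers,
# reclaim space, then report.
require 'preservation'

Preservation.configure do |config|
  config.db_path     = ENV['ARCHIVEMATICA_DB_PATH']
  config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
  config.log_path    = ENV['PRESERVATION_LOG_PATH']
end

Puree.configure do |config|
  config.base_url   = ENV['PURE_BASE_URL']
  config.username   = ENV['PURE_USERNAME']
  config.password   = ENV['PURE_PASSWORD']
  config.basic_auth = true
end

# Pick some dataset UUIDs, as in the README.
c = Puree::Collection.new resource: :dataset
minimal_metadata = c.find limit: 2, offset: 10, full: false
uuids = []
minimal_metadata.each { |i| uuids << i['uuid'] }

ingest = Preservation::PureIngest.new
ingest.prepare_dataset uuids: uuids, dir_name_scheme: :doi_short, delay: 0
ingest.cleanup_preserved

report = Preservation::IngestReport.new
p report.transfer_exception
```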
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/lib/preservation.rb
ADDED
@@ -0,0 +1,24 @@
+require 'date'
+require 'fileutils'
+require 'free_disk_space'
+require 'logger'
+require 'puree'
+require 'sqlite3'
+require 'preservation/configuration'
+require 'preservation/ingest_report'
+require 'preservation/ingest'
+require 'preservation/pure_ingest'
+require 'preservation/string_util'
+require 'preservation/version'
+
+# Top level namespace
+#
+module Preservation
+
+  class << self
+
+    include Preservation::Configuration
+
+  end
+
+end
data/lib/preservation/ingest.rb
ADDED
@@ -0,0 +1,163 @@
+module Preservation
+
+  # Base class for metadata and file management
+  #
+  class Ingest
+
+    attr_reader :logger
+
+    def initialize
+      check_ingest_path
+      setup_logger
+      setup_report
+    end
+
+    # Free up disk space for completed transfers
+    #
+    def cleanup_preserved
+      preserved = get_preserved
+      if !preserved.nil? && !preserved.empty?
+        preserved.each do |i|
+          # skip anything that has a different owner to script
+          if File.stat(i).grpowned?
+            FileUtils.remove_dir i
+            @logger.info 'Deleted ' + i
+          end
+        end
+      end
+    end
+
+
+    private
+
+    def build_wget(username, password, file_url)
+      # construct wget command with parameters
+      wget_str = ''
+      wget_str << 'wget'
+      wget_str << ' '
+      wget_str << '--user'
+      wget_str << ' '
+      wget_str << username
+      wget_str << ' '
+      wget_str << '--password'
+      wget_str << ' '
+      wget_str << '"' + password + '"'
+      wget_str << ' '
+      wget_str << file_url
+      wget_str << ' '
+      wget_str << '--no-check-certificate'
+      wget_str
+    end
+
+    def check_ingest_path
+      if Preservation.ingest_path.nil?
+        puts 'Missing ingest path'
+        exit
+      end
+    end
+
+    def setup_logger
+      if @logger.nil?
+        if Preservation.log_path.nil?
+          @logger = Logger.new STDOUT
+        else
+          # Keep data for today and the past 20 days
+          @logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
+        end
+      end
+      @logger.level = Logger::INFO
+    end
+
+    def setup_report
+      if Preservation.db_path.nil?
+        puts 'Missing db path'
+        exit
+      else
+        @report = IngestReport.new
+      end
+    end
+
+    def enough_storage_for_download?(required_bytes)
+      # scale up the required space using a multiplier
+      multiplier = 2
+      available = FreeDiskSpace.bytes('/')
+      required_bytes * multiplier < available ? true : false
+    end
+
+    def build_directory_name(metadata_record, directory_name_scheme)
+      doi = metadata_record['doi']
+      uuid = metadata_record['uuid']
+      title = metadata_record['title'].strip.gsub(' ', '-').gsub('/', '-')
+      time = Time.new
+      date = time.strftime("%Y-%m-%d")
+      time = time.strftime("%H:%M:%S")
+      join_str = '-----'
+
+      case directory_name_scheme
+      when :uuid_title
+        [uuid, title].join(join_str)
+      when :title_uuid
+        [title, uuid].join(join_str)
+      when :date_uuid_title
+        [date, uuid, title].join(join_str)
+      when :date_title_uuid
+        [date, title, uuid].join(join_str)
+      when :date_time_uuid
+        [date, time, uuid].join(join_str)
+      when :date_time_title
+        [date, time, title].join(join_str)
+      when :date_time_uuid_title
+        [date, time, uuid, title].join(join_str)
+      when :date_time_title_uuid
+        [date, time, title, uuid].join(join_str)
+      when :uuid
+        uuid
+      when :doi
+        if doi.empty?
+          return ''
+        end
+        doi.gsub('/', '-')
+      when :doi_short
+        if doi.empty?
+          return ''
+        end
+        doi_short_to_remove = 'http://dx.doi.org/'
+        short = doi.gsub(doi_short_to_remove, '')
+        short.gsub!('/', '-')
+      else
+        uuid
+      end
+    end
+
+    # time_to_preserve?
+    #
+    # @param start_utc [String]
+    # @param delay [Integer] days to wait (after modification date) before preserving
+    # @return [Boolean]
+    def time_to_preserve?(start_utc, delay)
+      now = DateTime.now
+      modified_datetime = DateTime.parse(start_utc)
+      days_since_modified = (now - modified_datetime).to_i # result in days
+      days_since_modified >= delay ? true : false
+    end
+
+    # # Collect all paths from DB where preservation has been done
+    # # @return [Array<String>]
+    def get_preserved
+      ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
+                                                status_presence: true)
+      preserved = []
+      ingest_complete.each do |i|
+        dir_path = Preservation.ingest_path + '/' + i['path']
+        if File.exists?(dir_path)
+          preserved << dir_path
+        end
+      end
+
+      preserved
+    end
+
+  end
+
+end
+
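The :doi_short branch of Ingest#build_directory_name above is what produces the transfer directory name shown in PITCHME.md. A standalone sketch of that transformation (the method itself is private, so this only mirrors its string handling):

```ruby
# Mirror of the :doi_short naming scheme: strip the DOI resolver prefix,
# then replace '/' with '-'.
doi = 'http://dx.doi.org/10.17635/lancaster/researchdata/6'
dir_name = doi.gsub('http://dx.doi.org/', '').gsub('/', '-')
puts dir_name # => 10.17635-lancaster-researchdata-6
```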
data/lib/preservation/ingest_report.rb
ADDED
@@ -0,0 +1,172 @@
+module Preservation
+
+  # Ingest reporting
+  #
+  class IngestReport
+
+    def initialize
+      create_db_connection
+    end
+
+    # Transfers based on presence (or not) of a particular status
+    #
+    # @param status_to_find [String]
+    # @param status_presence [Boolean]
+    def transfer_status(status_to_find: nil, status_presence: true)
+      if status_presence === true
+        status_presence = '='
+      else
+        status_presence = '<>'
+      end
+
+      query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
+
+      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
+      # and use hex function in DB query
+      records = []
+      @db.results_as_hash = true
+      @db.execute( query, [ status_to_find ] ) do |row|
+        id = row['id']
+        uuid = row['uuid']
+        bin_path = StringUtil.hex_to_bin row['hex_path']
+        unit_type = row['unit_type']
+        status = row['status']
+        microservice = row['microservice']
+        current = row['current']
+        o = {}
+        o['path'] = bin_path if !bin_path.empty?
+        o['unit_type'] = unit_type if !unit_type.empty?
+        o['status'] = status if !status.empty?
+        o['microservice'] = microservice if !microservice.empty?
+        o['current'] = current if current
+        o['id'] = id if id
+        o['uuid'] = uuid if !uuid.empty?
+
+        records << o
+      end
+
+      records
+    end
+
+    # Current transfer
+    #
+    # @return [Hash]
+    def transfer_current
+      query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"
+
+      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
+      # and use hex function in DB query
+      o = {}
+      @db.results_as_hash = true
+      @db.execute( query ) do |row|
+        id = row['id']
+        uuid = row['uuid']
+        bin_path = hex_to_bin row['hex_path']
+        unit_type = row['unit_type']
+        status = row['status']
+        microservice = row['microservice']
+        current = row['current']
+        o['path'] = bin_path if !bin_path.empty?
+        o['unit_type'] = unit_type if !unit_type.empty?
+        o['status'] = status if !status.empty?
+        o['microservice'] = microservice if !microservice.empty?
+        o['current'] = current if current
+        o['id'] = id if id
+        o['uuid'] = uuid if !uuid.empty?
+      end
+      o
+    end
+
+    # Count of complete transfers
+    #
+    # @return [Integer]
+    def transfer_complete_count
+      query = 'SELECT count(*) FROM unit WHERE status = ?'
+
+      status_to_find = 'COMPLETE'
+      @db.results_as_hash = true
+      @db.get_first_value( query, [status_to_find] )
+    end
+
+    # Compilation of statistics and data, with focus on exceptions
+    #
+    # @return [Hash]
+    def transfer_exception
+      incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
+      failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
+      current = transfer_current
+      complete_count = transfer_complete_count
+      report = {}
+      report['current'] = current if !current.empty?
+      report['failed'] = {}
+      report['failed']['count'] = failed.count
+      report['failed']['data'] = failed if !failed.empty?
+      report['incomplete'] = {}
+      report['incomplete']['count'] = incomplete.count
+      report['incomplete']['data'] = incomplete if !incomplete.empty?
+      report['complete'] = {}
+      report['complete']['count'] = complete_count if complete_count
+      report
+    end
+
+    # Is it in database?
+    # @param path_to_find [String] directory name within ingest path
+    # @return [Boolean]
+    def in_db?(path_to_find)
+      in_db = false
+
+      # Get path out of DB as a hex string
+      query = 'SELECT hex(path) FROM unit'
+
+      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
+      # and use hex function in DB query
+      @db.execute( query ) do |row|
+        bin_path = StringUtil.hex_to_bin row[0]
+        if bin_path === path_to_find
+          in_db = true
+        end
+      end
+
+      in_db
+    end
+
+    # Has preservation been done?
+    # @param path_to_find [String] directory name within ingest path
+    # @return [Boolean]
+    def preserved?(path_to_find)
+      preserved = false
+
+      # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
+      # indicates completed
+      unit_type_to_find = 'ingest'
+      status_to_find = 'COMPLETE'
+
+      # Get path out of DB as a hex string for completed ingests
+      query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'
+
+      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
+      # and use hex function in DB query
+      @db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
+        bin_path = StringUtil.hex_to_bin row[0]
+        if bin_path === path_to_find
+          preserved = true
+        end
+      end
+
+      preserved
+    end
+
+
+    private
+
+    def create_db_connection
+      if Preservation.db_path.nil?
+        puts 'Missing db_path'
+        exit
+      end
+      @db = SQLite3::Database.new Preservation.db_path
+    end
+
+  end
+
+end
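A hypothetical monitoring snippet (not part of the gem) that renders the Hash returned by IngestReport#transfer_exception above as JSON, for example from a scheduled job; it assumes db_path has been configured as in the README:

```ruby
# Render the exception-focused transfer report as pretty-printed JSON.
require 'json'
require 'preservation'

Preservation.configure do |config|
  config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
end

report = Preservation::IngestReport.new
puts JSON.pretty_generate(report.transfer_exception)
```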
data/lib/preservation/pure_ingest.rb
ADDED
@@ -0,0 +1,188 @@
+module Preservation
+
+  # Ingest for Pure
+  #
+  class PureIngest < Ingest
+
+    def initialize
+      super
+    end
+
+    # For each uuid, if necessary, fetch the metadata,
+    # prepare a directory in the ingest path and populate it with the files and
+    # JSON description file.
+    #
+    # @param uuids [Array<String>] uuids to preserve
+    # @param dir_name_scheme [Symbol] method to make directory name
+    # @param delay [Integer] days to wait (after modification date) before preserving
+    def prepare_dataset(uuids: [],
+                        dir_name_scheme: :uuid,
+                        delay: 0)
+      dir_base_path = Preservation.ingest_path
+
+      uuids.each do |uuid|
+        dataset = Puree::Dataset.new
+        dataset.find uuid: uuid
+        d = dataset.metadata
+        if d.empty?
+          @logger.info 'No metadata for ' + uuid
+          next
+        end
+        # configurable to become more human-readable
+        dir_name = build_directory_name(d, dir_name_scheme)
+
+        # continue only if dir_name is not empty (e.g. because there was no DOI)
+        # continue only if there is no DB entry
+        # continue only if the dataset has a DOI
+        # continue only if there are files for this resource
+        # continue only if it is time to preserve
+        if !dir_name.nil? &&
+           !dir_name.empty? &&
+           !@report.in_db?(dir_name) &&
+           !d['doi'].empty? &&
+           !d['file'].empty? &&
+           time_to_preserve?(d['modified'], delay)
+
+          dir_file_path = dir_base_path + '/' + dir_name
+          dir_metadata_path = dir_file_path + '/metadata/'
+          metadata_filename = dir_metadata_path + 'metadata.json'
+
+          # calculate total size of data files
+          download_storage_required = 0
+          d['file'].each { |i| download_storage_required += i['size'].to_i }
+
+          # do we have enough space in filesystem to fetch data files?
+          if enough_storage_for_download? download_storage_required
+            # @logger.info 'Sufficient disk space for ' + dir_file_path
+          else
+            @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
+            next
+          end
+
+          # has metadata file been created? if so, files and metadata are in place
+          # continue only if files not present in ingest location
+          if !File.size? metadata_filename
+
+            @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
+
+            data = []
+            d['file'].each do |f|
+              o = package_dataset_metadata d, f
+              data << o
+              wget_str = build_wget Puree.username,
+                                    Puree.password,
+                                    f['url']
+
+              Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
+
+              # fetch the file
+              Dir.chdir(dir_file_path) do
+                # puts 'Changing dir to ' + Dir.pwd
+                # puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
+                if File.size?(f['name'])
+                  # puts 'Should be deleting ' + f['name']
+                  File.delete(f['name'])
+                end
+                # puts f['name'] + ' missing or empty'
+                # puts wget_str
+                `#{wget_str}`
+              end
+            end
+
+            Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
+
+            pretty = JSON.pretty_generate( data, :indent => ' ')
+            # puts pretty
+            File.write(metadata_filename,pretty)
+            @logger.info 'Created ' + metadata_filename
+          end
+        else
+          @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
+        end
+      end
+    end
+
+    private
+
+    def package_dataset_metadata(d, f)
+      o = {}
+      o['filename'] = 'objects/' + f['name']
+      o['dc.title'] = d['title']
+      if !d['description'].empty?
+        o['dc.description'] = d['description']
+      end
+      o['dcterms.created'] = d['created']
+      if !d['available']['year'].empty?
+        o['dcterms.available'] = Puree::Date.iso(d['available'])
+      end
+      o['dc.publisher'] = d['publisher']
+      if !d['doi'].empty?
+        o['dc.identifier'] = d['doi']
+      end
+      if !d['spatial'].empty?
+        o['dcterms.spatial'] = d['spatial']
+      end
+      if !d['temporal']['start']['year'].empty?
+        temporal_range = ''
+        temporal_range << Puree::Date.iso(d['temporal']['start'])
+        if !d['temporal']['end']['year'].empty?
+          temporal_range << '/'
+          temporal_range << Puree::Date.iso(d['temporal']['end'])
+        end
+        o['dcterms.temporal'] = temporal_range
+      end
+      creators = []
+      contributors = []
+      person_types = %w(internal external other)
+      person_types.each do |person_type|
+        d['person'][person_type].each do |i|
+          if i['role'] == 'Creator'
+            creator = i['name']['last'] + ', ' + i['name']['first']
+            creators << creator
+          end
+          if i['role'] == 'Contributor'
+            contributor = i['name']['last'] + ', ' + i['name']['first']
+            contributors << contributor
+          end
+        end
+      end
+      o['dc.creator'] = creators
+      if !contributors.empty?
+        o['dc.contributor'] = contributors
+      end
+      keywords = []
+      d['keyword'].each { |i|
+        keywords << i
+      }
+      if !keywords.empty?
+        o['dc.subject'] = keywords
+      end
+      if !f['license']['name'].empty?
+        o['dcterms.license'] = f['license']['name']
+      end
+      # o['dc.format'] = f['mime']
+
+      related = []
+      publications = d['publication']
+      publications.each do |i|
+        o_related = {}
+        o_related['dc.title'] = i['title']
+        o_related['type'] = i['type']
+        pub = Puree::Publication.new
+        pub.find uuid: i['uuid']
+        doi = pub.doi
+        if doi
+          o_related['dc.identifier'] = doi
+        end
+        related << o_related
+      end
+      if !related.empty?
+        o['related'] = related
+      end
+
+      o
+    end
+
+  end
+
+end
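For each file, PureIngest shells out to the command string built by Ingest#build_wget and runs it with backticks inside the transfer directory. An illustration only, with hypothetical credentials and file URL; the flag order matches build_wget:

```ruby
# Shape of the shell command executed per file (values are hypothetical).
username = 'pure_user'                                      # hypothetical
password = 'secret'                                         # hypothetical
file_url = 'https://pure.example.org/ws/files/dataset.zip'  # hypothetical
cmd = "wget --user #{username} --password \"#{password}\" #{file_url} --no-check-certificate"
puts cmd
# => wget --user pure_user --password "secret" https://pure.example.org/ws/files/dataset.zip --no-check-certificate
```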
data/lib/preservation/string_util.rb
ADDED
@@ -0,0 +1,19 @@
+module Preservation
+
+  # String utilities
+  #
+  module StringUtil
+    # Binary to hexadecimal
+    #
+    def self.bin_to_hex(s)
+      s.each_byte.map { |b| b.to_s(16) }.join
+    end
+
+    # Hexadecimal to binary
+    def self.hex_to_bin(s)
+      s.scan(/../).map { |x| x.hex.chr }.join
+    end
+
+  end
+
+end
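A small round-trip sketch (assuming the gem is installed) mirroring how IngestReport uses these helpers: Archivematica stores `path` as a BLOB, the report queries select `hex(path)`, and `hex_to_bin` recovers the original directory name for comparison:

```ruby
# bin_to_hex / hex_to_bin round trip for a typical transfer directory name.
require 'preservation/string_util'

path = '10.17635-lancaster-researchdata-6'
hex  = Preservation::StringUtil.bin_to_hex(path)
puts Preservation::StringUtil.hex_to_bin(hex) == path # => true
```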
data/preservation.gemspec
ADDED
@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'preservation/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "preservation"
+  spec.version       = Preservation::VERSION
+  spec.authors       = ["Adrian Albin-Clark"]
+  spec.email         = ["a.albin-clark@lancaster.ac.uk"]
+  spec.summary       = %q{Ingest management for Archivematica's Automation Tools.}
+  spec.description   = %q{Transfer preparation, reporting and disk space management for Archivematica's Automation Tools.}
+  spec.homepage      = "https://rubygems.org/gems/preservation"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.required_ruby_version = '~> 2.1'
+
+  spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
+  spec.add_runtime_dependency 'puree', '~> 0.17'
+  spec.add_runtime_dependency 'sqlite3', '~> 1.3'
+end
metadata
ADDED
@@ -0,0 +1,103 @@
+--- !ruby/object:Gem::Specification
+name: preservation
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Adrian Albin-Clark
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-09-13 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: free_disk_space
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: puree
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.17'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.17'
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+description: Transfer preparation, reporting and disk space management for Archivematica's
+  Automation Tools.
+email:
+- a.albin-clark@lancaster.ac.uk
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- CHANGELOG.md
+- Gemfile
+- LICENSE.txt
+- PITCHME.md
+- README.md
+- Rakefile
+- lib/preservation.rb
+- lib/preservation/configuration.rb
+- lib/preservation/ingest.rb
+- lib/preservation/ingest_report.rb
+- lib/preservation/pure_ingest.rb
+- lib/preservation/string_util.rb
+- lib/preservation/version.rb
+- preservation.gemspec
+homepage: https://rubygems.org/gems/preservation
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - "~>"
+    - !ruby/object:Gem::Version
+      version: '2.1'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Ingest management for Archivematica's Automation Tools.
+test_files: []
+has_rdoc: