preservation 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/CHANGELOG.md +11 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/PITCHME.md +126 -0
- data/README.md +88 -0
- data/Rakefile +1 -0
- data/lib/preservation.rb +24 -0
- data/lib/preservation/configuration.rb +15 -0
- data/lib/preservation/ingest.rb +163 -0
- data/lib/preservation/ingest_report.rb +172 -0
- data/lib/preservation/pure_ingest.rb +188 -0
- data/lib/preservation/string_util.rb +19 -0
- data/lib/preservation/version.rb +5 -0
- data/preservation.gemspec +26 -0
- metadata +103 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: dfcf307b70473079a60f2801c6bd11e4f4c289d8
|
4
|
+
data.tar.gz: 0e13efac8904ccd96fc690644520e20f388a4fce
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 939f44e9a24177232e900953f4b59b64cfe62e0bce056765444dd9a726b64b441cae7b8fc6b3cea233ef9febd732a9a27f0759992cafe4e8494213edc947fffa
|
7
|
+
data.tar.gz: 2a86f065c50cf43c8ea7bcaa707d278f05b7b73b2be53cbebad47e50f38ffa3e8324dc88a71ded669495e6c7f719ca7a7dbd8943da3bcf43a186f6f56df40daf
|
data/.gitignore
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Change Log
|
2
|
+
All notable changes to this project will be documented in this file.
|
3
|
+
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
|
+
|
5
|
+
## Unreleased
|
6
|
+
|
7
|
+
## 0.1.0 - 2016-09-13
|
8
|
+
### Added
|
9
|
+
- Transfer preparation.
|
10
|
+
- Reporting from transfers database.
|
11
|
+
- Disk space management.
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2016 Adrian Albin-Clark
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/PITCHME.md
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
#HSLIDE
|
2
|
+
|
3
|
+
## Rationale
|
4
|
+
Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools)
|
5
|
+
work with files and descriptive metadata which must be provided in a certain way.
|
6
|
+
|
7
|
+
|
8
|
+
#HSLIDE
|
9
|
+
|
10
|
+
## Preservation: a way to manage ingest
|
11
|
+
|
12
|
+
#VSLIDE
|
13
|
+
|
14
|
+
- Transfer preparation.
|
15
|
+
- Reporting from transfers database. <!-- .element: class="fragment" -->
|
16
|
+
- Disk space management. <!-- .element: class="fragment" -->
|
17
|
+
|
18
|
+
#HSLIDE
|
19
|
+
|
20
|
+
## Preservation: ingest
|
21
|
+
|
22
|
+
Create an ingestor for Pure.
|
23
|
+
```ruby
|
24
|
+
ingest = Preservation::PureIngest.new
|
25
|
+
```
|
26
|
+
|
27
|
+
For each uuid, if necessary, fetch the metadata, prepare a directory in the
|
28
|
+
ingest path and populate it with the files and JSON description file.
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
ingest.prepare_dataset uuids: uuids,
|
32
|
+
dir_name_scheme: :doi_short,
|
33
|
+
delay: 0
|
34
|
+
```
|
35
|
+
|
36
|
+
Free up disk space for completed transfers.
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
ingest.cleanup_preserved
|
40
|
+
```
|
41
|
+
|
42
|
+
#VSLIDE
|
43
|
+
|
44
|
+
## Transfer-ready directory
|
45
|
+
|
46
|
+
```
|
47
|
+
.
|
48
|
+
├── 10.17635-lancaster-researchdata-6
|
49
|
+
│ ├── Ebola_data_Jun15.zip
|
50
|
+
│ └── metadata
|
51
|
+
│ └── metadata.json
|
52
|
+
```
|
53
|
+
|
54
|
+
#VSLIDE
|
55
|
+
|
56
|
+
## Transfer-ready metadata
|
57
|
+
|
58
|
+
```json
|
59
|
+
[
|
60
|
+
{
|
61
|
+
"filename": "objects/Ebola_data_Jun15.zip",
|
62
|
+
"dc.title": "Ebolavirus evolution 2013-2015",
|
63
|
+
"dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
|
64
|
+
"dcterms.created": "2015-06-04T16:11:34.713+01:00",
|
65
|
+
"dcterms.available": "2015-06-04",
|
66
|
+
"dc.publisher": "Lancaster University",
|
67
|
+
"dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
|
68
|
+
"dcterms.spatial": [
|
69
|
+
"Guinea, Sierra Leone, Liberia"
|
70
|
+
],
|
71
|
+
"dc.creator": [
|
72
|
+
"Gatherer, Derek"
|
73
|
+
],
|
74
|
+
"dc.contributor": [
|
75
|
+
"Robertson, David",
|
76
|
+
"Lovell, Simon"
|
77
|
+
],
|
78
|
+
"dc.subject": [
|
79
|
+
"Ebolavirus",
|
80
|
+
"evolution",
|
81
|
+
"phylogenetics",
|
82
|
+
"virulence",
|
83
|
+
"Filoviridae",
|
84
|
+
"positive selection"
|
85
|
+
],
|
86
|
+
"dcterms.license": "CC BY",
|
87
|
+
"related": [
|
88
|
+
{
|
89
|
+
"dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental an$
|
90
|
+
"type": "Journal article",
|
91
|
+
"dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
|
92
|
+
},
|
93
|
+
{
|
94
|
+
"dc.title": "The 2014 Ebola virus disease outbreak in West Africa",
|
95
|
+
"type": "Journal article",
|
96
|
+
"dc.identifier": "http://dx.doi.org/10.1099/vir.0.067199-0"
|
97
|
+
}
|
98
|
+
]
|
99
|
+
}
|
100
|
+
]
|
101
|
+
```
|
102
|
+
|
103
|
+
#HSLIDE
|
104
|
+
|
105
|
+
## Preservation: reporting
|
106
|
+
|
107
|
+
Can be used for scheduled monitoring of transfers.
|
108
|
+
|
109
|
+
```ruby
|
110
|
+
report = Preservation::IngestReport.new
|
111
|
+
report.transfer_exception
|
112
|
+
```
|
113
|
+
|
114
|
+
#HSLIDE
|
115
|
+
|
116
|
+
## Location
|
117
|
+
|
118
|
+
<a href="https://rubygems.org/gems/preservation" target="_blank">RubyGems</a>
|
119
|
+
|
120
|
+
<a href="https://github.com/lulibrary/preservation" target="_blank">GitHub</a>
|
121
|
+
|
122
|
+
#HSLIDE
|
123
|
+
|
124
|
+
## Documentation
|
125
|
+
|
126
|
+
<a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
|
data/README.md
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
# Preservation [![Gem Version](https://badge.fury.io/rb/preservation.svg)](https://badge.fury.io/rb/preservation) [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)
|
2
|
+
|
3
|
+
Ingest management for Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'preservation'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install preservation
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
### Configuration
|
22
|
+
Configure Preservation. If ```log_path``` is omitted, logging (standard library) redirects to STDOUT.
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
Preservation.configure do |config|
|
26
|
+
config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
|
27
|
+
config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
|
28
|
+
config.log_path = ENV['PRESERVATION_LOG_PATH']
|
29
|
+
end
|
30
|
+
```
|
31
|
+
|
32
|
+
Configure data source.
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
Puree.configure do |config|
|
36
|
+
config.base_url = ENV['PURE_BASE_URL']
|
37
|
+
config.username = ENV['PURE_USERNAME']
|
38
|
+
config.password = ENV['PURE_PASSWORD']
|
39
|
+
config.basic_auth = true
|
40
|
+
end
|
41
|
+
```
|
42
|
+
|
43
|
+
### Transfers
|
44
|
+
|
45
|
+
Get some dataset UUIDs for preservation.
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
c = Puree::Collection.new resource: :dataset
|
49
|
+
minimal_metadata = c.find limit: 2,
|
50
|
+
offset: 10,
|
51
|
+
full: false
|
52
|
+
uuids = []
|
53
|
+
minimal_metadata.each do |i|
|
54
|
+
uuids << i['uuid']
|
55
|
+
end
|
56
|
+
```
|
57
|
+
|
58
|
+
Create an ingestor for Pure.
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
ingest = Preservation::PureIngest.new
|
62
|
+
```
|
63
|
+
|
64
|
+
For each uuid, if necessary, fetch the metadata, prepare
|
65
|
+
a directory in the ingest path and populate it with the files and JSON description file.
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
ingest.prepare_dataset uuids: uuids,
|
69
|
+
dir_name_scheme: :doi_short,
|
70
|
+
delay: 0
|
71
|
+
```
|
72
|
+
|
73
|
+
Free up disk space for completed transfers.
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
ingest.cleanup_preserved
|
77
|
+
```
|
78
|
+
|
79
|
+
### Reporting
|
80
|
+
Can be used for scheduled monitoring of transfers.
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
report = Preservation::IngestReport.new
|
84
|
+
report.transfer_exception
|
85
|
+
```
|
86
|
+
|
87
|
+
## Documentation
|
88
|
+
[API in YARD](http://www.rubydoc.info/gems/preservation)
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Load Bundler's standard gem tasks (build, install, release).
require "bundler/gem_tasks"
|
data/lib/preservation.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# Standard library
require 'date'
require 'fileutils'
require 'logger'

# Gems
require 'free_disk_space'
require 'puree'
require 'sqlite3'

# Internal
require 'preservation/configuration'
require 'preservation/ingest_report'
require 'preservation/ingest'
require 'preservation/pure_ingest'
require 'preservation/string_util'
require 'preservation/version'

# Top level namespace
#
module Preservation

  # Expose the Configuration accessors (configure, db_path, ingest_path,
  # log_path) as module-level methods on Preservation itself.
  class << self

    include Preservation::Configuration

  end

end
|
@@ -0,0 +1,163 @@
|
|
1
|
+
module Preservation

  # Base class for metadata and file management
  #
  class Ingest

    attr_reader :logger

    def initialize
      check_ingest_path
      setup_logger
      setup_report
    end

    # Free up disk space for completed transfers
    #
    def cleanup_preserved
      preserved = get_preserved
      if !preserved.nil? && !preserved.empty?
        preserved.each do |i|
          # skip anything that has a different owner to script
          if File.stat(i).grpowned?
            FileUtils.remove_dir i
            @logger.info 'Deleted ' + i
          end
        end
      end
    end


    private

    # Build a wget command line for fetching a single file.
    #
    # NOTE(review): username, password and file_url are interpolated into a
    # shell command; callers must supply trusted values (consider
    # Shellwords.escape if that ever changes).
    #
    # @param username [String]
    # @param password [String] wrapped in double quotes in the command
    # @param file_url [String]
    # @return [String] complete wget invocation
    def build_wget(username, password, file_url)
      ['wget',
       '--user', username,
       '--password', %("#{password}"),
       file_url,
       '--no-check-certificate'].join(' ')
    end

    # Abort early when no ingest path has been configured.
    def check_ingest_path
      if Preservation.ingest_path.nil?
        puts 'Missing ingest path'
        exit
      end
    end

    # Create the logger once; logs to STDOUT when no log path is configured.
    def setup_logger
      if @logger.nil?
        if Preservation.log_path.nil?
          @logger = Logger.new STDOUT
        else
          # Keep data for today and the past 20 days
          @logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
        end
      end
      @logger.level = Logger::INFO
    end

    # Abort early when no db path has been configured, otherwise create the
    # report object used to query the transfers database.
    def setup_report
      if Preservation.db_path.nil?
        puts 'Missing db path'
        exit
      else
        @report = IngestReport.new
      end
    end

    # Is there enough free disk space to download required_bytes (with headroom)?
    #
    # @param required_bytes [Integer]
    # @return [Boolean]
    def enough_storage_for_download?(required_bytes)
      # scale up the required space using a multiplier
      multiplier = 2
      available = FreeDiskSpace.bytes('/')
      required_bytes * multiplier < available
    end

    # Make a directory name for a metadata record using the given scheme.
    #
    # @param metadata_record [Hash] must provide 'doi', 'uuid', 'title'
    # @param directory_name_scheme [Symbol]
    # @return [String] empty string when a DOI-based scheme has no DOI
    def build_directory_name(metadata_record, directory_name_scheme)
      doi = metadata_record['doi']
      uuid = metadata_record['uuid']
      title = metadata_record['title'].strip.gsub(' ', '-').gsub('/', '-')
      time = Time.new
      date = time.strftime("%Y-%m-%d")
      time = time.strftime("%H:%M:%S")
      join_str = '-----'

      case directory_name_scheme
      when :uuid_title
        [uuid, title].join(join_str)
      when :title_uuid
        [title, uuid].join(join_str)
      when :date_uuid_title
        [date, uuid, title].join(join_str)
      when :date_title_uuid
        [date, title, uuid].join(join_str)
      when :date_time_uuid
        [date, time, uuid].join(join_str)
      when :date_time_title
        [date, time, title].join(join_str)
      when :date_time_uuid_title
        [date, time, uuid, title].join(join_str)
      when :date_time_title_uuid
        [date, time, title, uuid].join(join_str)
      when :uuid
        uuid
      when :doi
        if doi.empty?
          return ''
        end
        doi.gsub('/', '-')
      when :doi_short
        if doi.empty?
          return ''
        end
        doi_short_to_remove = 'http://dx.doi.org/'
        short = doi.gsub(doi_short_to_remove, '')
        # non-bang gsub: gsub! returns nil when nothing is replaced, which
        # would silently produce no directory name for slash-free DOIs
        short.gsub('/', '-')
      else
        uuid
      end
    end

    # time_to_preserve?
    #
    # @param start_utc [String]
    # @param delay [Integer] days to wait (after modification date) before preserving
    # @return [Boolean]
    def time_to_preserve?(start_utc, delay)
      now = DateTime.now
      modified_datetime = DateTime.parse(start_utc)
      days_since_modified = (now - modified_datetime).to_i # result in days
      days_since_modified >= delay
    end

    # Collect all paths from DB where preservation has been done
    #
    # @return [Array<String>] directories under the ingest path that exist on disk
    def get_preserved
      ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
                                                status_presence: true)
      preserved = []
      ingest_complete.each do |i|
        dir_path = Preservation.ingest_path + '/' + i['path']
        # File.exists? was removed in Ruby 3.2; File.exist? is the supported form
        if File.exist?(dir_path)
          preserved << dir_path
        end
      end

      preserved
    end

  end

end
|
163
|
+
|
@@ -0,0 +1,172 @@
|
|
1
|
+
module Preservation

  # Ingest reporting
  #
  class IngestReport

    def initialize
      create_db_connection
    end

    # Transfers based on presence (or not) of a particular status
    #
    # @param status_to_find [String]
    # @param status_presence [Boolean] true matches the status, false matches its absence
    # @return [Array<Hash>] one Hash per matching transfer
    def transfer_status(status_to_find: nil, status_presence: true)
      # interpolated operator is derived only from the boolean, never from input
      operator = status_presence === true ? '=' : '<>'

      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
      # and use hex function in DB query
      query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{operator} ?"

      records = []
      @db.results_as_hash = true
      @db.execute( query, [ status_to_find ] ) do |row|
        records << unit_row_to_hash(row)
      end

      records
    end

    # Current transfer
    #
    # @return [Hash]
    def transfer_current
      query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"

      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
      # and use hex function in DB query
      o = {}
      @db.results_as_hash = true
      @db.execute( query ) do |row|
        # was calling bare hex_to_bin (undefined in this class) which raised
        # NoMethodError; the shared helper routes through StringUtil.hex_to_bin
        o.merge! unit_row_to_hash(row)
      end
      o
    end

    # Count of complete transfers
    #
    # @return [Integer]
    def transfer_complete_count
      query = 'SELECT count(*) FROM unit WHERE status = ?'

      status_to_find = 'COMPLETE'
      @db.results_as_hash = true
      @db.get_first_value( query, [status_to_find] )
    end

    # Compilation of statistics and data, with focus on exceptions
    #
    # @return [Hash]
    def transfer_exception
      incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
      failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
      current = transfer_current
      complete_count = transfer_complete_count
      report = {}
      report['current'] = current if !current.empty?
      report['failed'] = {}
      report['failed']['count'] = failed.count
      report['failed']['data'] = failed if !failed.empty?
      report['incomplete'] = {}
      report['incomplete']['count'] = incomplete.count
      report['incomplete']['data'] = incomplete if !incomplete.empty?
      report['complete'] = {}
      report['complete']['count'] = complete_count if complete_count
      report
    end

    # Is it in database?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def in_db?(path_to_find)
      in_db = false

      # Get path out of DB as a hex string
      query = 'SELECT hex(path) FROM unit'

      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
      # and use hex function in DB query
      @db.execute( query ) do |row|
        bin_path = StringUtil.hex_to_bin row[0]
        if bin_path === path_to_find
          in_db = true
        end
      end

      in_db
    end

    # Has preservation been done?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def preserved?(path_to_find)
      preserved = false

      # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
      # indicates completed
      unit_type_to_find = 'ingest'
      status_to_find = 'COMPLETE'

      # Get path out of DB as a hex string for completed ingests
      query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'

      # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
      # and use hex function in DB query
      @db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
        bin_path = StringUtil.hex_to_bin row[0]
        if bin_path === path_to_find
          preserved = true
        end
      end

      preserved
    end


    private

    # Convert a unit-table row (path fetched as hex) into a Hash, omitting
    # empty values. Shared by transfer_status and transfer_current.
    #
    # @param row [Hash] row produced with results_as_hash enabled
    # @return [Hash]
    def unit_row_to_hash(row)
      o = {}
      bin_path = StringUtil.hex_to_bin row['hex_path']
      o['path'] = bin_path if !bin_path.empty?
      o['unit_type'] = row['unit_type'] if !row['unit_type'].empty?
      o['status'] = row['status'] if !row['status'].empty?
      o['microservice'] = row['microservice'] if !row['microservice'].empty?
      o['current'] = row['current'] if row['current']
      o['id'] = row['id'] if row['id']
      o['uuid'] = row['uuid'] if !row['uuid'].empty?
      o
    end

    # Abort early when no db path has been configured, otherwise open the
    # Archivematica transfers database.
    def create_db_connection
      if Preservation.db_path.nil?
        puts 'Missing db_path'
        exit
      end
      @db = SQLite3::Database.new Preservation.db_path
    end

  end

end
|
@@ -0,0 +1,188 @@
|
|
1
|
+
module Preservation

  # Ingest for Pure
  #
  class PureIngest < Ingest

    def initialize
      super
    end

    # For each uuid, if necessary, fetch the metadata,
    # prepare a directory in the ingest path and populate it with the files and
    # JSON description file.
    #
    # @param uuids [Array<String>] uuids to preserve
    # @param dir_name_scheme [Symbol] method to make directory name
    # @param delay [Integer] days to wait (after modification date) before preserving
    def prepare_dataset(uuids: [],
                        dir_name_scheme: :uuid,
                        delay: 0)
      dir_base_path = Preservation.ingest_path

      uuids.each do |uuid|
        dataset = Puree::Dataset.new
        dataset.find uuid: uuid
        d = dataset.metadata
        if d.empty?
          @logger.info 'No metadata for ' + uuid
          next
        end
        # configurable to become more human-readable
        dir_name = build_directory_name(d, dir_name_scheme)

        # continue only if dir_name is not empty (e.g. because there was no DOI)
        # continue only if there is no DB entry
        # continue only if the dataset has a DOI
        # continue only if there are files for this resource
        # continue only if it is time to preserve
        if !dir_name.nil? &&
           !dir_name.empty? &&
           !@report.in_db?(dir_name) &&
           !d['doi'].empty? &&
           !d['file'].empty? &&
           time_to_preserve?(d['modified'], delay)

          dir_file_path = dir_base_path + '/' + dir_name
          dir_metadata_path = dir_file_path + '/metadata/'
          metadata_filename = dir_metadata_path + 'metadata.json'

          # calculate total size of data files
          download_storage_required = 0
          d['file'].each { |i| download_storage_required += i['size'].to_i }

          # do we have enough space in filesystem to fetch data files?
          if !enough_storage_for_download? download_storage_required
            @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
            next
          end

          # has metadata file been created? if so, files and metadata are in place
          # continue only if files not present in ingest location
          if !File.size? metadata_filename

            @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']

            data = []
            d['file'].each do |f|
              o = package_dataset_metadata d, f
              data << o
              wget_str = build_wget Puree.username,
                                    Puree.password,
                                    f['url']

              # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the
              # supported form
              Dir.mkdir(dir_file_path) if !Dir.exist?(dir_file_path)

              # fetch the file
              Dir.chdir(dir_file_path) do
                # delete any existing (possibly partial) copy before refetching
                if File.size?(f['name'])
                  File.delete(f['name'])
                end
                `#{wget_str}`
              end
            end

            Dir.mkdir(dir_metadata_path) if !Dir.exist?(dir_metadata_path)

            pretty = JSON.pretty_generate( data, :indent => ' ')
            File.write(metadata_filename,pretty)
            @logger.info 'Created ' + metadata_filename
          else
            @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
          end
        end
      end
    end

    private

    # Build the Dublin-Core-style metadata Hash describing one data file.
    #
    # @param d [Hash] dataset metadata from Puree
    # @param f [Hash] a single file description taken from d['file']
    # @return [Hash]
    def package_dataset_metadata(d, f)
      o = {}
      o['filename'] = 'objects/' + f['name']
      o['dc.title'] = d['title']
      if !d['description'].empty?
        o['dc.description'] = d['description']
      end
      o['dcterms.created'] = d['created']
      if !d['available']['year'].empty?
        o['dcterms.available'] = Puree::Date.iso(d['available'])
      end
      o['dc.publisher'] = d['publisher']
      if !d['doi'].empty?
        o['dc.identifier'] = d['doi']
      end
      if !d['spatial'].empty?
        o['dcterms.spatial'] = d['spatial']
      end
      # express a temporal extent as an ISO 8601 interval (start/end)
      if !d['temporal']['start']['year'].empty?
        temporal_range = ''
        temporal_range << Puree::Date.iso(d['temporal']['start'])
        if !d['temporal']['end']['year'].empty?
          temporal_range << '/'
          temporal_range << Puree::Date.iso(d['temporal']['end'])
        end
        o['dcterms.temporal'] = temporal_range
      end
      creators = []
      contributors = []
      person_types = %w(internal external other)
      person_types.each do |person_type|
        d['person'][person_type].each do |i|
          if i['role'] == 'Creator'
            creator = i['name']['last'] + ', ' + i['name']['first']
            creators << creator
          end
          if i['role'] == 'Contributor'
            contributor = i['name']['last'] + ', ' + i['name']['first']
            contributors << contributor
          end
        end
      end
      o['dc.creator'] = creators
      if !contributors.empty?
        o['dc.contributor'] = contributors
      end
      keywords = []
      d['keyword'].each { |i|
        keywords << i
      }
      if !keywords.empty?
        o['dc.subject'] = keywords
      end
      if !f['license']['name'].empty?
        o['dcterms.license'] = f['license']['name']
      end
      # o['dc.format'] = f['mime']

      # link out to any related publications, resolving each DOI via Pure
      related = []
      publications = d['publication']
      publications.each do |i|
        o_related = {}
        o_related['dc.title'] = i['title']
        o_related['type'] = i['type']
        pub = Puree::Publication.new
        pub.find uuid: i['uuid']
        doi = pub.doi
        if doi
          o_related['dc.identifier'] = doi
        end
        related << o_related
      end
      if !related.empty?
        o['related'] = related
      end

      o
    end

  end

end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Preservation

  # String utilities
  #
  module StringUtil
    # Binary to hexadecimal
    #
    # @param s [String] binary string
    # @return [String] lowercase hex, two digits per byte
    def self.bin_to_hex(s)
      # '%02x' zero-pads each byte so values below 0x10 keep their leading
      # zero; the previous b.to_s(16) dropped it, so output did not
      # round-trip through hex_to_bin
      s.each_byte.map { |b| format('%02x', b) }.join
    end

    # Hexadecimal to binary
    #
    # @param s [String] hex string, two digits per byte
    # @return [String] binary string
    def self.hex_to_bin(s)
      s.scan(/../).map { |x| x.hex.chr }.join
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'preservation/version'

Gem::Specification.new do |spec|
  spec.name          = "preservation"
  spec.version       = Preservation::VERSION
  spec.authors       = ["Adrian Albin-Clark"]
  spec.email         = ["a.albin-clark@lancaster.ac.uk"]
  spec.summary       = %q{Ingest management for Archivematica's Automation Tools.}
  spec.description   = %q{Transfer preparation, reporting and disk space management for Archivematica's Automation Tools.}
  spec.homepage      = "https://rubygems.org/gems/preservation"
  spec.license       = "MIT"

  # Package everything tracked by git; pick up executables and tests by path.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = '~> 2.1'

  spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
  spec.add_runtime_dependency 'puree', '~> 0.17'
  spec.add_runtime_dependency 'sqlite3', '~> 1.3'
end
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: preservation
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Adrian Albin-Clark
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-09-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: free_disk_space
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: puree
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.17'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.17'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: sqlite3
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
description: Transfer preparation, reporting and disk space management for Archivematica's
|
56
|
+
Automation Tools.
|
57
|
+
email:
|
58
|
+
- a.albin-clark@lancaster.ac.uk
|
59
|
+
executables: []
|
60
|
+
extensions: []
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- ".gitignore"
|
64
|
+
- CHANGELOG.md
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE.txt
|
67
|
+
- PITCHME.md
|
68
|
+
- README.md
|
69
|
+
- Rakefile
|
70
|
+
- lib/preservation.rb
|
71
|
+
- lib/preservation/configuration.rb
|
72
|
+
- lib/preservation/ingest.rb
|
73
|
+
- lib/preservation/ingest_report.rb
|
74
|
+
- lib/preservation/pure_ingest.rb
|
75
|
+
- lib/preservation/string_util.rb
|
76
|
+
- lib/preservation/version.rb
|
77
|
+
- preservation.gemspec
|
78
|
+
homepage: https://rubygems.org/gems/preservation
|
79
|
+
licenses:
|
80
|
+
- MIT
|
81
|
+
metadata: {}
|
82
|
+
post_install_message:
|
83
|
+
rdoc_options: []
|
84
|
+
require_paths:
|
85
|
+
- lib
|
86
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - "~>"
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '2.1'
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
requirements: []
|
97
|
+
rubyforge_project:
|
98
|
+
rubygems_version: 2.2.2
|
99
|
+
signing_key:
|
100
|
+
specification_version: 4
|
101
|
+
summary: Ingest management for Archivematica's Automation Tools.
|
102
|
+
test_files: []
|
103
|
+
has_rdoc:
|