preservation 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dfcf307b70473079a60f2801c6bd11e4f4c289d8
4
+ data.tar.gz: 0e13efac8904ccd96fc690644520e20f388a4fce
5
+ SHA512:
6
+ metadata.gz: 939f44e9a24177232e900953f4b59b64cfe62e0bce056765444dd9a726b64b441cae7b8fc6b3cea233ef9febd732a9a27f0759992cafe4e8494213edc947fffa
7
+ data.tar.gz: 2a86f065c50cf43c8ea7bcaa707d278f05b7b73b2be53cbebad47e50f38ffa3e8324dc88a71ded669495e6c7f719ca7a7dbd8943da3bcf43a186f6f56df40daf
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea
data/CHANGELOG.md ADDED
@@ -0,0 +1,11 @@
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file.
3
+ This project adheres to [Semantic Versioning](http://semver.org/).
4
+
5
+ ## Unreleased
6
+
7
+ ## 0.1.0 - 2016-09-13
8
+ ### Added
9
+ - Transfer preparation.
10
+ - Reporting from transfers database.
11
+ - Disk space management.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in preservation.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2016 Adrian Albin-Clark
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/PITCHME.md ADDED
@@ -0,0 +1,126 @@
1
+ #HSLIDE
2
+
3
+ ## Rationale
4
+ Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools)
5
+ work with files and descriptive metadata which must be provided in a certain way.
6
+
7
+
8
+ #HSLIDE
9
+
10
+ ## Preservation: a way to manage ingest
11
+
12
+ #VSLIDE
13
+
14
+ - Transfer preparation.
15
+ - Reporting from transfers database. <!-- .element: class="fragment" -->
16
+ - Disk space management. <!-- .element: class="fragment" -->
17
+
18
+ #HSLIDE
19
+
20
+ ## Preservation: ingest
21
+
22
+ Create an ingestor for Pure.
23
+ ```ruby
24
+ ingest = Preservation::PureIngest.new
25
+ ```
26
+
27
+ For each uuid, if necessary, fetch the metadata, prepare a directory in the
28
+ ingest path and populate it with the files and JSON description file.
29
+
30
+ ```ruby
31
+ ingest.prepare_dataset uuids: uuids,
32
+ dir_name_scheme: :doi_short,
33
+ delay: 0
34
+ ```
35
+
36
+ Free up disk space for completed transfers.
37
+
38
+ ```ruby
39
+ ingest.cleanup_preserved
40
+ ```
41
+
42
+ #VSLIDE
43
+
44
+ ## Transfer-ready directory
45
+
46
+ ```
47
+ .
48
+ ├── 10.17635-lancaster-researchdata-6
49
+ │   ├── Ebola_data_Jun15.zip
50
+ │   └── metadata
51
+ │   └── metadata.json
52
+ ```
53
+
54
+ #VSLIDE
55
+
56
+ ## Transfer-ready metadata
57
+
58
+ ```json
59
+ [
60
+ {
61
+ "filename": "objects/Ebola_data_Jun15.zip",
62
+ "dc.title": "Ebolavirus evolution 2013-2015",
63
+ "dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
64
+ "dcterms.created": "2015-06-04T16:11:34.713+01:00",
65
+ "dcterms.available": "2015-06-04",
66
+ "dc.publisher": "Lancaster University",
67
+ "dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
68
+ "dcterms.spatial": [
69
+ "Guinea, Sierra Leone, Liberia"
70
+ ],
71
+ "dc.creator": [
72
+ "Gatherer, Derek"
73
+ ],
74
+ "dc.contributor": [
75
+ "Robertson, David",
76
+ "Lovell, Simon"
77
+ ],
78
+ "dc.subject": [
79
+ "Ebolavirus",
80
+ "evolution",
81
+ "phylogenetics",
82
+ "virulence",
83
+ "Filoviridae",
84
+ "positive selection"
85
+ ],
86
+ "dcterms.license": "CC BY",
87
+ "related": [
88
+ {
89
+ "dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental and sociological factors, not special attributes of the currently circulating strain of the virus",
90
+ "type": "Journal article",
91
+ "dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
92
+ },
93
+ {
94
+ "dc.title": "The 2014 Ebola virus disease outbreak in West Africa",
95
+ "type": "Journal article",
96
+ "dc.identifier": "http://dx.doi.org/10.1099/vir.0.067199-0"
97
+ }
98
+ ]
99
+ }
100
+ ]
101
+ ```
102
+
103
+ #HSLIDE
104
+
105
+ ## Preservation: reporting
106
+
107
+ Can be used for scheduled monitoring of transfers.
108
+
109
+ ```ruby
110
+ report = Preservation::IngestReport.new
111
+ report.transfer_exception
112
+ ```
113
+
114
+ #HSLIDE
115
+
116
+ ## Location
117
+
118
+ <a href="https://rubygems.org/gems/preservation" target="_blank">RubyGems</a>
119
+
120
+ <a href="https://github.com/lulibrary/preservation" target="_blank">GitHub</a>
121
+
122
+ #HSLIDE
123
+
124
+ ## Documentation
125
+
126
+ <a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
data/README.md ADDED
@@ -0,0 +1,88 @@
1
+ # Preservation [![Gem Version](https://badge.fury.io/rb/preservation.svg)](https://badge.fury.io/rb/preservation) [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)
2
+
3
+ Ingest management for Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'preservation'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install preservation
18
+
19
+ ## Usage
20
+
21
+ ### Configuration
22
+ Configure Preservation. If `log_path` is omitted, log messages are written to STDOUT using the standard library Logger.
23
+
24
+ ```ruby
25
+ Preservation.configure do |config|
26
+ config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
27
+ config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
28
+ config.log_path = ENV['PRESERVATION_LOG_PATH']
29
+ end
30
+ ```
31
+
32
+ Configure data source.
33
+
34
+ ```ruby
35
+ Puree.configure do |config|
36
+ config.base_url = ENV['PURE_BASE_URL']
37
+ config.username = ENV['PURE_USERNAME']
38
+ config.password = ENV['PURE_PASSWORD']
39
+ config.basic_auth = true
40
+ end
41
+ ```
42
+
43
+ ### Transfers
44
+
45
+ Get some dataset UUIDs for preservation.
46
+
47
+ ```ruby
48
+ c = Puree::Collection.new resource: :dataset
49
+ minimal_metadata = c.find limit: 2,
50
+ offset: 10,
51
+ full: false
52
+ uuids = []
53
+ minimal_metadata.each do |i|
54
+ uuids << i['uuid']
55
+ end
56
+ ```
57
+
58
+ Create an ingestor for Pure.
59
+
60
+ ```ruby
61
+ ingest = Preservation::PureIngest.new
62
+ ```
63
+
64
+ For each uuid, if necessary, fetch the metadata, prepare
65
+ a directory in the ingest path and populate it with the files and JSON description file.
66
+
67
+ ```ruby
68
+ ingest.prepare_dataset uuids: uuids,
69
+ dir_name_scheme: :doi_short,
70
+ delay: 0
71
+ ```
72
+
73
+ Free up disk space for completed transfers.
74
+
75
+ ```ruby
76
+ ingest.cleanup_preserved
77
+ ```
78
+
79
+ ### Reporting
80
+ Can be used for scheduled monitoring of transfers.
81
+
82
+ ```ruby
83
+ report = Preservation::IngestReport.new
84
+ report.transfer_exception
85
+ ```
86
+
87
+ ## Documentation
88
+ [API in YARD](http://www.rubydoc.info/gems/preservation)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
require 'date'
require 'fileutils'
require 'json'
require 'logger'
require 'shellwords'

require 'free_disk_space'
require 'puree'
require 'sqlite3'

require 'preservation/configuration'
require 'preservation/ingest_report'
require 'preservation/ingest'
require 'preservation/pure_ingest'
require 'preservation/string_util'
require 'preservation/version'
13
+
14
# Top level namespace
#
# The Configuration module is mixed into the module's singleton, so its
# methods are called directly on Preservation itself.
module Preservation
  extend Preservation::Configuration
end
@@ -0,0 +1,15 @@
1
module Preservation

  # Configuration options
  #
  # Provides three plain read/write settings and a block-style setter.
  module Configuration

    # Path handed to the database connection
    attr_accessor :db_path

    # Base directory in which transfer directories are prepared
    attr_accessor :ingest_path

    # Log file path; nil means no log file has been configured
    attr_accessor :log_path

    # Yields the receiver so several settings can be assigned in one block.
    #
    # @yieldparam config [Object] the object this module is mixed into
    def configure
      yield self
    end

  end

end
@@ -0,0 +1,163 @@
1
module Preservation

  # Base class for metadata and file management
  #
  class Ingest

    # Logger that receives progress and error messages
    attr_reader :logger

    # Checks the configuration, then sets up logging and the transfers report.
    # The process is terminated (SystemExit, status 1) when the ingest path or
    # the database path has not been configured.
    def initialize
      check_ingest_path
      setup_logger
      setup_report
    end

    # Free up disk space for completed transfers
    #
    # Removes every directory returned by get_preserved and logs each removal.
    def cleanup_preserved
      get_preserved.each do |dir_path|
        # File::Stat#grpowned? compares the directory's group with the
        # effective group of this process; anything else is left untouched.
        next unless File.stat(dir_path).grpowned?
        FileUtils.remove_dir dir_path
        @logger.info 'Deleted ' + dir_path
      end
    end


    private

    # Builds the shell command line that downloads one file with wget.
    #
    # Every supplied value is shell-escaped, because the resulting string is
    # executed through a shell and a password or URL may contain characters
    # such as quotes, spaces, '$' or '&'.
    #
    # @param username [String]
    # @param password [String]
    # @param file_url [String]
    # @return [String] command line
    def build_wget(username, password, file_url)
      [
        'wget',
        '--user', Shellwords.escape(username),
        '--password', Shellwords.escape(password),
        Shellwords.escape(file_url),
        # NOTE(review): this switches off TLS certificate verification for the
        # download; kept because existing deployments may depend on it.
        '--no-check-certificate'
      ].join(' ')
    end

    # Terminates with a non-zero status when no ingest path is configured.
    def check_ingest_path
      return unless Preservation.ingest_path.nil?
      puts 'Missing ingest path'
      exit 1
    end

    # Creates the logger once (STDOUT when no log path is configured) and sets
    # its level to INFO.
    def setup_logger
      if @logger.nil?
        if Preservation.log_path.nil?
          @logger = Logger.new STDOUT
        else
          # NOTE(review): the rotation arguments (20, 'daily') are passed along
          # with an already-open File; Logger appears to apply rotation only
          # when it is given a file name, so no rotation is expected here.
          # Confirm against the Logger documentation before relying on it.
          @logger = Logger.new File.new(Preservation.log_path, 'a'), 20, 'daily'
        end
      end
      @logger.level = Logger::INFO
    end

    # Creates the transfers report, terminating with a non-zero status when no
    # database path is configured.
    def setup_report
      if Preservation.db_path.nil?
        puts 'Missing db path'
        exit 1
      end
      @report = IngestReport.new
    end

    # Is there room to download the given number of bytes?
    #
    # @param required_bytes [Integer]
    # @return [Boolean]
    def enough_storage_for_download?(required_bytes)
      # scale up the required space using a multiplier
      multiplier = 2
      # NOTE(review): measures '/', which is only correct when the ingest path
      # lives on the same filesystem - confirm for the deployment.
      available = FreeDiskSpace.bytes('/')
      required_bytes * multiplier < available
    end

    # Builds a directory name for a metadata record.
    #
    # @param metadata_record [Hash] read for the 'doi', 'uuid' and 'title' keys
    # @param directory_name_scheme [Symbol] unknown schemes fall back to the uuid
    # @return [String] empty for the DOI based schemes when there is no DOI
    def build_directory_name(metadata_record, directory_name_scheme)
      # to_s keeps a missing DOI or title from raising for schemes that do not
      # even use that field
      doi = metadata_record['doi'].to_s
      uuid = metadata_record['uuid']
      title = metadata_record['title'].to_s.strip.gsub(' ', '-').gsub('/', '-')
      now = Time.new
      date = now.strftime("%Y-%m-%d")
      time = now.strftime("%H:%M:%S")
      join_str = '-----'

      case directory_name_scheme
      when :uuid_title
        [uuid, title].join(join_str)
      when :title_uuid
        [title, uuid].join(join_str)
      when :date_uuid_title
        [date, uuid, title].join(join_str)
      when :date_title_uuid
        [date, title, uuid].join(join_str)
      when :date_time_uuid
        [date, time, uuid].join(join_str)
      when :date_time_title
        [date, time, title].join(join_str)
      when :date_time_uuid_title
        [date, time, uuid, title].join(join_str)
      when :date_time_title_uuid
        [date, time, title, uuid].join(join_str)
      when :doi
        doi.empty? ? '' : doi.gsub('/', '-')
      when :doi_short
        # Non-destructive gsub: gsub! returns nil when nothing is replaced,
        # which used to turn a slash-free DOI into a nil directory name.
        doi.empty? ? '' : doi.gsub('http://dx.doi.org/', '').gsub('/', '-')
      else
        uuid
      end
    end

    # time_to_preserve?
    #
    # @param start_utc [String]
    # @param delay [Integer] days to wait (after modification date) before preserving
    # @return [Boolean]
    def time_to_preserve?(start_utc, delay)
      modified_datetime = DateTime.parse(start_utc)
      days_since_modified = (DateTime.now - modified_datetime).to_i # result in days
      days_since_modified >= delay
    end

    # Collect all paths from DB where preservation has been done
    #
    # Only directories that still exist below the ingest path are returned.
    #
    # @return [Array<String>]
    def get_preserved
      ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
                                                status_presence: true)
      ingest_complete.each_with_object([]) do |record, preserved|
        path = record['path']
        # Records may lack a path; joining nothing would point at the ingest
        # root itself, which must never become a deletion candidate.
        next if path.nil? || path.empty?
        dir_path = File.join(Preservation.ingest_path, path)
        preserved << dir_path if File.exist?(dir_path)
      end
    end

  end

end
163
+
@@ -0,0 +1,172 @@
1
module Preservation

  # Ingest reporting
  #
  class IngestReport

    # Columns read for every transfer record. Archivematica stores path as a
    # BLOB, so it is selected through the hex function and decoded afterwards.
    UNIT_COLUMNS = 'id, uuid, hex(path) as hex_path, unit_type, status, microservice, current'.freeze

    # Opens the database connection; terminates the process when no database
    # path is configured.
    def initialize
      create_db_connection
    end

    # Transfers based on presence (or not) of a particular status
    #
    # @param status_to_find [String]
    # @param status_presence [Boolean] true selects rows with the status,
    #   anything else selects rows without it
    # @return [Array<Hash>] one record per row, empty fields omitted
    def transfer_status(status_to_find: nil, status_presence: true)
      operator = status_presence === true ? '=' : '<>'
      query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE status #{operator} ?"

      records = []
      @db.execute( query, [ status_to_find ] ) do |row|
        records << build_record(row)
      end
      records
    end

    # Current transfer
    #
    # @return [Hash] empty when no row is flagged as current
    def transfer_current
      query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE current = 1"

      current = {}
      @db.execute( query ) do |row|
        # later rows overwrite the fields they provide, as before
        current.merge!(build_record(row))
      end
      current
    end

    # Count of complete transfers
    #
    # @return [Integer]
    def transfer_complete_count
      query = 'SELECT count(*) FROM unit WHERE status = ?'
      @db.get_first_value( query, [ 'COMPLETE' ] )
    end

    # Compilation of statistics and data, with focus on exceptions
    #
    # @return [Hash] 'failed', 'incomplete' and 'complete' sections, plus
    #   'current' when there is a current transfer
    def transfer_exception
      incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
      failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
      current = transfer_current
      complete_count = transfer_complete_count

      report = {}
      report['current'] = current if !current.empty?
      report['failed'] = { 'count' => failed.count }
      report['failed']['data'] = failed if !failed.empty?
      report['incomplete'] = { 'count' => incomplete.count }
      report['incomplete']['data'] = incomplete if !incomplete.empty?
      report['complete'] = {}
      report['complete']['count'] = complete_count if complete_count
      report
    end

    # Is it in database?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def in_db?(path_to_find)
      path_listed?('SELECT hex(path) AS hex_path FROM unit', [], path_to_find)
    end

    # Has preservation been done?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def preserved?(path_to_find)
      # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
      # indicates completed
      query = 'SELECT hex(path) AS hex_path FROM unit WHERE unit_type = ? AND status = ?'
      path_listed?(query, [ 'ingest', 'COMPLETE' ], path_to_find)
    end


    private

    # Connects to the configured database. Rows are always returned as hashes
    # so every query can read columns by name, independent of call order.
    def create_db_connection
      if Preservation.db_path.nil?
        puts 'Missing db_path'
        exit 1
      end
      @db = SQLite3::Database.new Preservation.db_path
      @db.results_as_hash = true
    end

    # Converts one result row into a record hash, leaving out empty fields.
    # to_s guards against NULL columns, which arrive as nil.
    #
    # @param row [Hash] result row keyed by column name
    # @return [Hash]
    def build_record(row)
      record = {}
      bin_path = StringUtil.hex_to_bin(row['hex_path'].to_s)
      record['path'] = bin_path if !bin_path.empty?
      %w(unit_type status microservice).each do |column|
        record[column] = row[column] if !row[column].to_s.empty?
      end
      record['current'] = row['current'] if row['current']
      record['id'] = row['id'] if row['id']
      record['uuid'] = row['uuid'] if !row['uuid'].to_s.empty?
      record
    end

    # Does any row returned by the query decode to the given path?
    #
    # @param query [String] must select the path as hex_path
    # @param bind_values [Array]
    # @param path_to_find [String]
    # @return [Boolean]
    def path_listed?(query, bind_values, path_to_find)
      found = false
      @db.execute( query, bind_values ) do |row|
        found = true if StringUtil.hex_to_bin(row['hex_path'].to_s) == path_to_find
      end
      found
    end

  end

end
@@ -0,0 +1,188 @@
1
module Preservation

  # Ingest for Pure
  #
  class PureIngest < Ingest

    def initialize
      super
    end

    # For each uuid, if necessary, fetch the metadata,
    # prepare a directory in the ingest path and populate it with the files and
    # JSON description file.
    #
    # The JSON description file doubles as the "prepared" marker, so it is only
    # written once every file has been downloaded successfully.
    #
    # @param uuids [Array<String>] uuids to preserve
    # @param dir_name_scheme [Symbol] method to make directory name
    # @param delay [Integer] days to wait (after modification date) before preserving
    def prepare_dataset(uuids: [],
                        dir_name_scheme: :uuid,
                        delay: 0)
      dir_base_path = Preservation.ingest_path

      uuids.each do |uuid|
        dataset = Puree::Dataset.new
        dataset.find uuid: uuid
        d = dataset.metadata
        if d.empty?
          @logger.info 'No metadata for ' + uuid
          next
        end
        # configurable to become more human-readable
        dir_name = build_directory_name(d, dir_name_scheme)

        unless ready_to_prepare?(d, dir_name, delay)
          # interpolation, because dir_name may be nil at this point
          @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}"
          next
        end

        dir_file_path = dir_base_path + '/' + dir_name
        dir_metadata_path = dir_file_path + '/metadata/'
        metadata_filename = dir_metadata_path + 'metadata.json'

        # do we have enough space in filesystem to fetch data files?
        download_storage_required = d['file'].inject(0) { |sum, f| sum + f['size'].to_i }
        unless enough_storage_for_download? download_storage_required
          @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
          next
        end

        # has metadata file been created? if so, files and metadata are in place
        next if File.size? metadata_filename

        @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
        Dir.mkdir(dir_file_path) if !Dir.exist?(dir_file_path)

        data = []
        all_fetched = true
        d['file'].each do |f|
          data << package_dataset_metadata(d, f)
          all_fetched = false unless fetch_file(dir_file_path, f)
        end

        unless all_fetched
          @logger.error 'Download failed, metadata not written for ' + dir_file_path
          next
        end

        Dir.mkdir(dir_metadata_path) if !Dir.exist?(dir_metadata_path)
        pretty = JSON.pretty_generate( data, :indent => ' ')
        File.write(metadata_filename, pretty)
        @logger.info 'Created ' + metadata_filename
      end
    end

    private

    # Decides whether a dataset should be prepared now.
    #
    # continue only if dir_name is not empty (e.g. because there was no DOI)
    # continue only if there is no DB entry
    # continue only if the dataset has a DOI
    # continue only if there are files for this resource
    # continue only if it is time to preserve
    #
    # @return [Boolean]
    def ready_to_prepare?(d, dir_name, delay)
      !dir_name.nil? &&
        !dir_name.empty? &&
        !@report.in_db?(dir_name) &&
        !d['doi'].empty? &&
        !d['file'].empty? &&
        time_to_preserve?(d['modified'], delay)
    end

    # Downloads one file into the given directory, replacing any non-empty
    # copy already there.
    #
    # @param dir_file_path [String] existing target directory
    # @param f [Hash] file metadata, read for the 'name' and 'url' keys
    # @return [Boolean] true when the download command succeeded
    def fetch_file(dir_file_path, f)
      wget_str = build_wget Puree.username,
                            Puree.password,
                            f['url']
      fetched = false
      Dir.chdir(dir_file_path) do
        # NOTE(review): f['name'] comes from remote metadata and is used as a
        # relative path - presumably a plain file name; verify.
        File.delete(f['name']) if File.size?(f['name'])
        `#{wget_str}`
        fetched = !$?.nil? && $?.success?
      end
      @logger.error 'Failed to fetch ' + f['url'].to_s unless fetched
      fetched
    end

    # Builds the description of one file of a dataset.
    #
    # @param d [Hash] dataset metadata
    # @param f [Hash] file metadata
    # @return [Hash] keys are added in a fixed order; optional ones only when
    #   the source value is not empty
    def package_dataset_metadata(d, f)
      o = {}
      o['filename'] = 'objects/' + f['name']
      o['dc.title'] = d['title']
      o['dc.description'] = d['description'] if !d['description'].empty?
      o['dcterms.created'] = d['created']
      if !d['available']['year'].empty?
        o['dcterms.available'] = Puree::Date.iso(d['available'])
      end
      o['dc.publisher'] = d['publisher']
      o['dc.identifier'] = d['doi'] if !d['doi'].empty?
      o['dcterms.spatial'] = d['spatial'] if !d['spatial'].empty?

      if !d['temporal']['start']['year'].empty?
        # a single date, or start/end when an end year is present
        temporal_range = ''
        temporal_range << Puree::Date.iso(d['temporal']['start'])
        if !d['temporal']['end']['year'].empty?
          temporal_range << '/'
          temporal_range << Puree::Date.iso(d['temporal']['end'])
        end
        o['dcterms.temporal'] = temporal_range
      end

      o['dc.creator'] = people_with_role(d, 'Creator')
      contributors = people_with_role(d, 'Contributor')
      o['dc.contributor'] = contributors if !contributors.empty?

      keywords = d['keyword'].to_a
      o['dc.subject'] = keywords if !keywords.empty?

      if !f['license']['name'].empty?
        o['dcterms.license'] = f['license']['name']
      end
      # o['dc.format'] = f['mime']

      related = related_publications(d)
      o['related'] = related if !related.empty?

      o
    end

    # Names ("Last, First") of everyone holding the given role, taken from the
    # internal, external and other person lists in that order.
    #
    # @param d [Hash] dataset metadata
    # @param role [String]
    # @return [Array<String>]
    def people_with_role(d, role)
      names = []
      %w(internal external other).each do |person_type|
        d['person'][person_type].each do |i|
          next unless i['role'] == role
          names << i['name']['last'] + ', ' + i['name']['first']
        end
      end
      names
    end

    # Title, type and (when one is found) DOI of each related publication.
    # NOTE(review): looks up every publication again for each file of the
    # dataset; the result could be computed once per dataset.
    #
    # @param d [Hash] dataset metadata
    # @return [Array<Hash>]
    def related_publications(d)
      d['publication'].map do |i|
        o_related = {}
        o_related['dc.title'] = i['title']
        o_related['type'] = i['type']
        pub = Puree::Publication.new
        pub.find uuid: i['uuid']
        doi = pub.doi
        o_related['dc.identifier'] = doi if doi
        o_related
      end
    end

  end

end
@@ -0,0 +1,19 @@
1
module Preservation

  # String utilities
  #
  module StringUtil
    # Binary to hexadecimal
    #
    # Every byte becomes exactly two lowercase hex digits, so the result can be
    # decoded again with hex_to_bin (bytes below 0x10 keep their leading zero).
    #
    # @param s [String]
    # @return [String]
    def self.bin_to_hex(s)
      s.unpack('H*').first
    end

    # Hexadecimal to binary
    #
    # Reads the input two hex digits at a time; a trailing unpaired digit is
    # ignored.
    #
    # @param s [String]
    # @return [String]
    def self.hex_to_bin(s)
      s.scan(/../).map { |x| x.hex.chr }.join
    end

  end

end
@@ -0,0 +1,5 @@
1
module Preservation
  # Semantic version number
  #
  VERSION = '0.1.0'
end
@@ -0,0 +1,26 @@
1
# coding: utf-8
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'preservation/version'

Gem::Specification.new do |spec|
  spec.name        = 'preservation'
  spec.version     = Preservation::VERSION
  spec.authors     = ['Adrian Albin-Clark']
  spec.email       = ['a.albin-clark@lancaster.ac.uk']
  spec.summary     = "Ingest management for Archivematica's Automation Tools."
  spec.description = "Transfer preparation, reporting and disk space management for Archivematica's Automation Tools."
  spec.homepage    = 'https://rubygems.org/gems/preservation'
  spec.license     = 'MIT'

  # Package every file tracked by git; executables and tests are picked out
  # of that list by directory.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.required_ruby_version = '~> 2.1'

  spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
  spec.add_runtime_dependency 'puree', '~> 0.17'
  spec.add_runtime_dependency 'sqlite3', '~> 1.3'
end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: preservation
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Adrian Albin-Clark
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-09-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: free_disk_space
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: puree
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.17'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.17'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ description: Transfer preparation, reporting and disk space management for Archivematica's
56
+ Automation Tools.
57
+ email:
58
+ - a.albin-clark@lancaster.ac.uk
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - CHANGELOG.md
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - PITCHME.md
68
+ - README.md
69
+ - Rakefile
70
+ - lib/preservation.rb
71
+ - lib/preservation/configuration.rb
72
+ - lib/preservation/ingest.rb
73
+ - lib/preservation/ingest_report.rb
74
+ - lib/preservation/pure_ingest.rb
75
+ - lib/preservation/string_util.rb
76
+ - lib/preservation/version.rb
77
+ - preservation.gemspec
78
+ homepage: https://rubygems.org/gems/preservation
79
+ licenses:
80
+ - MIT
81
+ metadata: {}
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '2.1'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubyforge_project:
98
+ rubygems_version: 2.2.2
99
+ signing_key:
100
+ specification_version: 4
101
+ summary: Ingest management for Archivematica's Automation Tools.
102
+ test_files: []
103
+ has_rdoc: