preservation 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: dfcf307b70473079a60f2801c6bd11e4f4c289d8
4
+ data.tar.gz: 0e13efac8904ccd96fc690644520e20f388a4fce
5
+ SHA512:
6
+ metadata.gz: 939f44e9a24177232e900953f4b59b64cfe62e0bce056765444dd9a726b64b441cae7b8fc6b3cea233ef9febd732a9a27f0759992cafe4e8494213edc947fffa
7
+ data.tar.gz: 2a86f065c50cf43c8ea7bcaa707d278f05b7b73b2be53cbebad47e50f38ffa3e8324dc88a71ded669495e6c7f719ca7a7dbd8943da3bcf43a186f6f56df40daf
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea
data/CHANGELOG.md ADDED
@@ -0,0 +1,11 @@
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file.
3
+ This project adheres to [Semantic Versioning](http://semver.org/).
4
+
5
+ ## Unreleased
6
+
7
+ ## 0.1.0 - 2016-09-13
8
+ ### Added
9
+ - Transfer preparation.
10
+ - Reporting from transfers database.
11
+ - Disk space management.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are resolved from the public RubyGems index.
source 'https://rubygems.org'

# Dependencies are declared once, in preservation.gemspec, and pulled in here.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2016 Adrian Albin-Clark
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/PITCHME.md ADDED
@@ -0,0 +1,126 @@
1
+ #HSLIDE
2
+
3
+ ## Rationale
4
+ Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools)
5
+ work with files and descriptive metadata which must be provided in a certain way.
6
+
7
+
8
+ #HSLIDE
9
+
10
+ ## Preservation: a way to manage ingest
11
+
12
+ #VSLIDE
13
+
14
+ - Transfer preparation.
15
+ - Reporting from transfers database. <!-- .element: class="fragment" -->
16
+ - Disk space management. <!-- .element: class="fragment" -->
17
+
18
+ #HSLIDE
19
+
20
+ ## Preservation: ingest
21
+
22
+ Create an ingestor for Pure.
23
+ ```ruby
24
+ ingest = Preservation::PureIngest.new
25
+ ```
26
+
27
+ For each uuid, if necessary, fetch the metadata, prepare a directory in the
28
+ ingest path and populate it with the files and JSON description file.
29
+
30
+ ```ruby
31
+ ingest.prepare_dataset uuids: uuids,
32
+ dir_name_scheme: :doi_short,
33
+ delay: 0
34
+ ```
35
+
36
+ Free up disk space for completed transfers.
37
+
38
+ ```ruby
39
+ ingest.cleanup_preserved
40
+ ```
41
+
42
+ #VSLIDE
43
+
44
+ ## Transfer-ready directory
45
+
46
+ ```
47
+ .
48
+ ├── 10.17635-lancaster-researchdata-6
49
+ │   ├── Ebola_data_Jun15.zip
50
+ │   └── metadata
51
+ │       └── metadata.json
52
+ ```
53
+
54
+ #VSLIDE
55
+
56
+ ## Transfer-ready metadata
57
+
58
+ ```json
59
+ [
60
+ {
61
+ "filename": "objects/Ebola_data_Jun15.zip",
62
+ "dc.title": "Ebolavirus evolution 2013-2015",
63
+ "dc.description": "Data used for analysis of selection and evolutionary rate in Zaire Ebolavirus variant Makona",
64
+ "dcterms.created": "2015-06-04T16:11:34.713+01:00",
65
+ "dcterms.available": "2015-06-04",
66
+ "dc.publisher": "Lancaster University",
67
+ "dc.identifier": "http://dx.doi.org/10.17635/lancaster/researchdata/6",
68
+ "dcterms.spatial": [
69
+ "Guinea, Sierra Leone, Liberia"
70
+ ],
71
+ "dc.creator": [
72
+ "Gatherer, Derek"
73
+ ],
74
+ "dc.contributor": [
75
+ "Robertson, David",
76
+ "Lovell, Simon"
77
+ ],
78
+ "dc.subject": [
79
+ "Ebolavirus",
80
+ "evolution",
81
+ "phylogenetics",
82
+ "virulence",
83
+ "Filoviridae",
84
+ "positive selection"
85
+ ],
86
+ "dcterms.license": "CC BY",
87
+ "related": [
88
+ {
89
+ "dc.title": "The unprecedented scale of the West African Ebola virus disease outbreak is due to environmental and sociological factors, not special attributes of the currently circulating strain of the virus",
90
+ "type": "Journal article",
91
+ "dc.identifier": "http://dx.doi.org/10.1136/ebmed-2014-110127"
92
+ },
93
+ {
94
+ "dc.title": "The 2014 Ebola virus disease outbreak in West Africa",
95
+ "type": "Journal article",
96
+ "dc.identifier": "http://dx.doi.org/10.1099/vir.0.067199-0"
97
+ }
98
+ ]
99
+ }
100
+ ]
101
+ ```
102
+
103
+ #HSLIDE
104
+
105
+ ## Preservation: reporting
106
+
107
+ Can be used for scheduled monitoring of transfers.
108
+
109
+ ```ruby
110
+ report = Preservation::IngestReport.new
111
+ report.transfer_exception
112
+ ```
113
+
114
+ #HSLIDE
115
+
116
+ ## Location
117
+
118
+ <a href="https://rubygems.org/gems/preservation" target="_blank">RubyGems</a>
119
+
120
+ <a href="https://github.com/lulibrary/preservation" target="_blank">GitHub</a>
121
+
122
+ #HSLIDE
123
+
124
+ ## Documentation
125
+
126
+ <a href="http://www.rubydoc.info/gems/preservation" target="_blank">API in YARD</a>
data/README.md ADDED
@@ -0,0 +1,88 @@
1
+ # Preservation [![Gem Version](https://badge.fury.io/rb/preservation.svg)](https://badge.fury.io/rb/preservation) [![GitPitch](https://gitpitch.com/assets/badge.svg)](https://gitpitch.com/lulibrary/preservation/master?grs=github&t=sky)
2
+
3
+ Ingest management for Archivematica's [Automation Tools](https://github.com/artefactual/automation-tools).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'preservation'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install preservation
18
+
19
+ ## Usage
20
+
21
+ ### Configuration
22
+ Configure Preservation. If `log_path` is omitted, log messages (written with the standard library Logger) are sent to STDOUT.
23
+
24
+ ```ruby
25
+ Preservation.configure do |config|
26
+ config.db_path = ENV['ARCHIVEMATICA_DB_PATH']
27
+ config.ingest_path = ENV['ARCHIVEMATICA_INGEST_PATH']
28
+ config.log_path = ENV['PRESERVATION_LOG_PATH']
29
+ end
30
+ ```
31
+
32
+ Configure data source.
33
+
34
+ ```ruby
35
+ Puree.configure do |config|
36
+ config.base_url = ENV['PURE_BASE_URL']
37
+ config.username = ENV['PURE_USERNAME']
38
+ config.password = ENV['PURE_PASSWORD']
39
+ config.basic_auth = true
40
+ end
41
+ ```
42
+
43
+ ### Transfers
44
+
45
+ Get some dataset UUIDs for preservation.
46
+
47
+ ```ruby
48
+ c = Puree::Collection.new resource: :dataset
49
+ minimal_metadata = c.find limit: 2,
50
+ offset: 10,
51
+ full: false
52
+ uuids = []
53
+ minimal_metadata.each do |i|
54
+ uuids << i['uuid']
55
+ end
56
+ ```
57
+
58
+ Create an ingestor for Pure.
59
+
60
+ ```ruby
61
+ ingest = Preservation::PureIngest.new
62
+ ```
63
+
64
+ For each uuid, if necessary, fetch the metadata, prepare
65
+ a directory in the ingest path and populate it with the files and JSON description file.
66
+
67
+ ```ruby
68
+ ingest.prepare_dataset uuids: uuids,
69
+ dir_name_scheme: :doi_short,
70
+ delay: 0
71
+ ```
72
+
73
+ Free up disk space for completed transfers.
74
+
75
+ ```ruby
76
+ ingest.cleanup_preserved
77
+ ```
78
+
79
+ ### Reporting
80
+ Can be used for scheduled monitoring of transfers.
81
+
82
+ ```ruby
83
+ report = Preservation::IngestReport.new
84
+ report.transfer_exception
85
+ ```
86
+
87
+ ## Documentation
88
+ [API in YARD](http://www.rubydoc.info/gems/preservation)
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
+ require 'date'
2
+ require 'fileutils'
3
+ require 'free_disk_space'
4
+ require 'logger'
5
+ require 'puree'
6
+ require 'sqlite3'
7
+ require 'preservation/configuration'
8
+ require 'preservation/ingest_report'
9
+ require 'preservation/ingest'
10
+ require 'preservation/pure_ingest'
11
+ require 'preservation/string_util'
12
+ require 'preservation/version'
13
+
14
# Top level namespace
#
# The configuration accessors (db_path, ingest_path, log_path) and the
# configure method are made available directly on the module itself,
# e.g. Preservation.configure { |config| ... }.
#
module Preservation
  extend Preservation::Configuration
end
@@ -0,0 +1,15 @@
1
module Preservation

  # Configuration options
  #
  # Mixed into the object that holds the settings; that object is handed to
  # the block given to #configure.
  #
  module Configuration

    # Path of the transfers database
    attr_accessor :db_path

    # Directory in which transfer directories are prepared
    attr_accessor :ingest_path

    # Path of the log file
    attr_accessor :log_path

    # Yield the receiver so that its settings can be assigned in a block.
    #
    # @yieldparam config [Object] the object this module is mixed into
    # @return [Object] whatever the block returns
    def configure
      yield self
    end

  end

end
@@ -0,0 +1,163 @@
1
require 'shellwords'

module Preservation

  # Base class for metadata and file management
  #
  # Creating an instance requires Preservation.ingest_path and
  # Preservation.db_path to be set; if either is nil a message is printed and
  # the process exits.
  #
  class Ingest

    # Directory name schemes that are built by joining several fields, mapped
    # to the fields (in order) that make up the name.
    JOINED_NAME_SCHEMES = {
      uuid_title:           [:uuid, :title],
      title_uuid:           [:title, :uuid],
      date_uuid_title:      [:date, :uuid, :title],
      date_title_uuid:      [:date, :title, :uuid],
      date_time_uuid:       [:date, :time, :uuid],
      date_time_title:      [:date, :time, :title],
      date_time_uuid_title: [:date, :time, :uuid, :title],
      date_time_title_uuid: [:date, :time, :title, :uuid]
    }.freeze

    attr_reader :logger

    def initialize
      check_ingest_path
      setup_logger
      setup_report
    end

    # Free up disk space for completed transfers
    #
    # Every directory returned by #get_preserved is removed, provided
    # File::Stat#grpowned? is true for it; each removal is logged.
    #
    def cleanup_preserved
      get_preserved.each do |dir_path|
        # NOTE(review): the original intent was "skip anything that has a
        # different owner to script", but grpowned? tests the group, not the
        # owner. Left unchanged; confirm which one is wanted.
        next unless File.stat(dir_path).grpowned?
        FileUtils.remove_dir dir_path
        @logger.info 'Deleted ' + dir_path
      end
    end


    private

    # Construct the wget command line used to fetch one file.
    #
    # Every argument is shell-escaped, so credentials or URLs containing
    # quotes, spaces, '$', '&' etc. are passed to wget literally instead of
    # being interpreted by the shell.
    #
    # @param username [String]
    # @param password [String]
    # @param file_url [String]
    # @return [String] command suitable for execution by a shell
    def build_wget(username, password, file_url)
      Shellwords.join([
        'wget',
        '--user', username.to_s,
        '--password', password.to_s,
        file_url.to_s,
        '--no-check-certificate'
      ])
    end

    # Exit (after printing a message) if no ingest path has been configured.
    def check_ingest_path
      return unless Preservation.ingest_path.nil?
      puts 'Missing ingest path'
      exit
    end

    # Create the logger (once) and set its level to INFO.
    #
    # Without a configured log path, output goes to STDOUT. Otherwise the log
    # file is rotated daily. The path (not an already opened File) is given to
    # Logger because Logger only rotates files that it opens itself.
    # Logger does not delete old daily files; prune them externally if needed.
    def setup_logger
      @logger ||= if Preservation.log_path.nil?
                    Logger.new STDOUT
                  else
                    Logger.new Preservation.log_path, 'daily'
                  end
      @logger.level = Logger::INFO
    end

    # Create the report object, or exit (after printing a message) if no
    # database path has been configured.
    def setup_report
      if Preservation.db_path.nil?
        puts 'Missing db path'
        exit
      end
      @report = IngestReport.new
    end

    # Is there room to download the given number of bytes?
    #
    # @param required_bytes [Integer]
    # @return [Boolean]
    def enough_storage_for_download?(required_bytes)
      # scale up the required space using a multiplier
      multiplier = 2
      # NOTE(review): free space is measured on '/', which is only right when
      # the ingest path lives on the root filesystem; confirm.
      required_bytes * multiplier < FreeDiskSpace.bytes('/')
    end

    # Make a directory name from a metadata record.
    #
    # @param metadata_record [Hash] uses the 'doi', 'uuid' and 'title' keys
    # @param directory_name_scheme [Symbol] one of the JOINED_NAME_SCHEMES
    #   keys, :uuid, :doi or :doi_short; anything else behaves like :uuid
    # @return [String] empty for :doi and :doi_short when there is no DOI
    def build_directory_name(metadata_record, directory_name_scheme)
      doi = metadata_record['doi'].to_s
      now = Time.new
      fields = {
        uuid:  metadata_record['uuid'],
        title: metadata_record['title'].to_s.strip.gsub(' ', '-').gsub('/', '-'),
        date:  now.strftime('%Y-%m-%d'),
        time:  now.strftime('%H:%M:%S')
      }

      joined = JOINED_NAME_SCHEMES[directory_name_scheme]
      return joined.map { |field| fields[field] }.join('-----') if joined

      case directory_name_scheme
      when :doi
        doi.gsub('/', '-')
      when :doi_short
        # Non-destructive gsub: gsub! would return nil when nothing is replaced
        doi.gsub('http://dx.doi.org/', '').gsub('/', '-')
      else
        fields[:uuid]
      end
    end

    # time_to_preserve?
    #
    # @param start_utc [String]
    # @param delay [Integer] days to wait (after modification date) before preserving
    # @return [Boolean]
    def time_to_preserve?(start_utc, delay)
      modified_datetime = DateTime.parse(start_utc)
      # DateTime subtraction gives a number of days; keep whole days only
      days_since_modified = (DateTime.now - modified_datetime).to_i
      days_since_modified >= delay
    end

    # Collect all paths from DB where preservation has been done
    #
    # Only paths which currently exist within the ingest path are returned.
    #
    # @return [Array<String>]
    def get_preserved
      ingest_complete = @report.transfer_status(status_to_find: 'COMPLETE',
                                                status_presence: true)
      ingest_complete.each_with_object([]) do |record, preserved|
        # A record without a path must be skipped rather than treated as '',
        # otherwise the ingest path itself would be reported for deletion.
        next if record['path'].to_s.empty?
        dir_path = Preservation.ingest_path + '/' + record['path']
        preserved << dir_path if File.exist?(dir_path)
      end
    end

  end

end
163
+
@@ -0,0 +1,172 @@
1
module Preservation

  # Ingest reporting
  #
  # Reads the `unit` table of the transfers database configured in
  # Preservation.db_path.
  #
  class IngestReport

    # Columns read for a transfer record. Archivematica stores path as BLOB,
    # so it is fetched through the hex function and decoded afterwards.
    UNIT_COLUMNS = 'id, uuid, hex(path) as hex_path, unit_type, status, microservice, current'.freeze

    def initialize
      create_db_connection
    end

    # Transfers based on presence (or not) of a particular status
    #
    # @param status_to_find [String]
    # @param status_presence [Boolean] true for transfers having the status,
    #   anything else for transfers not having it
    # @return [Array<Hash>]
    def transfer_status(status_to_find: nil, status_presence: true)
      operator = status_presence === true ? '=' : '<>'
      query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE status #{operator} ?"
      fetch_rows(query, [status_to_find]).map { |row| build_record(row) }
    end

    # Current transfer
    #
    # If several rows are flagged as current, their values are merged, later
    # rows taking precedence.
    #
    # @return [Hash]
    def transfer_current
      query = "SELECT #{UNIT_COLUMNS} FROM unit WHERE current = 1"
      fetch_rows(query).each_with_object({}) do |row, current|
        current.merge!(build_record(row))
      end
    end

    # Count of complete transfers
    #
    # @return [Integer]
    def transfer_complete_count
      query = 'SELECT count(*) FROM unit WHERE status = ?'
      @db.results_as_hash = true
      @db.get_first_value( query, ['COMPLETE'] )
    end

    # Compilation of statistics and data, with focus on exceptions
    #
    # @return [Hash]
    def transfer_exception
      incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
      failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
      current = transfer_current
      complete_count = transfer_complete_count

      report = {}
      report['current'] = current unless current.empty?
      report['failed'] = { 'count' => failed.count }
      report['failed']['data'] = failed unless failed.empty?
      report['incomplete'] = { 'count' => incomplete.count }
      report['incomplete']['data'] = incomplete unless incomplete.empty?
      report['complete'] = {}
      report['complete']['count'] = complete_count if complete_count
      report
    end

    # Is it in database?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def in_db?(path_to_find)
      path_recorded?('SELECT hex(path) AS hex_path FROM unit', [], path_to_find)
    end

    # Has preservation been done?
    # @param path_to_find [String] directory name within ingest path
    # @return [Boolean]
    def preserved?(path_to_find)
      # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
      # indicates completed
      query = 'SELECT hex(path) AS hex_path FROM unit WHERE unit_type = ? AND status = ?'
      path_recorded?(query, ['ingest', 'COMPLETE'], path_to_find)
    end


    private

    # Open the database, or exit (after printing a message) if no database
    # path has been configured.
    def create_db_connection
      if Preservation.db_path.nil?
        puts 'Missing db_path'
        exit
      end
      @db = SQLite3::Database.new Preservation.db_path
    end

    # Run a query and return all rows, each as a hash keyed by column name.
    #
    # Rows are always requested as hashes and read by column name, so the
    # result does not depend on which other method used the connection before.
    #
    # @param query [String]
    # @param binds [Array] values for the query placeholders
    # @return [Array<Hash>]
    def fetch_rows(query, binds = [])
      @db.results_as_hash = true
      @db.execute( query, binds )
    end

    # Does any row returned by the query hold the given path?
    #
    # @param query [String] must select the hex-encoded path as hex_path
    # @param binds [Array] values for the query placeholders
    # @param path_to_find [String]
    # @return [Boolean]
    def path_recorded?(query, binds, path_to_find)
      fetch_rows(query, binds).any? do |row|
        StringUtil.hex_to_bin(row['hex_path'].to_s) == path_to_find
      end
    end

    # Convert a database row into a report record, leaving out values which
    # are NULL or empty.
    #
    # @param row [Hash] row containing the UNIT_COLUMNS
    # @return [Hash]
    def build_record(row)
      record = {}
      path = StringUtil.hex_to_bin(row['hex_path'].to_s)
      record['path'] = path unless path.empty?
      %w(unit_type status microservice).each do |column|
        record[column] = row[column] unless row[column].to_s.empty?
      end
      record['current'] = row['current'] if row['current']
      record['id'] = row['id'] if row['id']
      record['uuid'] = row['uuid'] unless row['uuid'].to_s.empty?
      record
    end

  end

end
@@ -0,0 +1,188 @@
1
require 'json'

module Preservation

  # Ingest for Pure
  #
  class PureIngest < Ingest

    # For each uuid, if necessary, fetch the metadata,
    # prepare a directory in the ingest path and populate it with the files and
    # JSON description file.
    #
    # The JSON description file is written last and only when every file was
    # fetched successfully, so that an incomplete directory is prepared again
    # on a later run.
    #
    # @param uuids [Array<String>] uuids to preserve
    # @param dir_name_scheme [Symbol] method to make directory name
    # @param delay [Integer] days to wait (after modification date) before preserving
    def prepare_dataset(uuids: [],
                        dir_name_scheme: :uuid,
                        delay: 0)
      dir_base_path = Preservation.ingest_path

      uuids.each do |uuid|
        dataset = Puree::Dataset.new
        dataset.find uuid: uuid
        d = dataset.metadata
        if d.empty?
          @logger.info 'No metadata for ' + uuid
          next
        end

        # configurable to become more human-readable
        dir_name = build_directory_name(d, dir_name_scheme)

        unless preparable?(d, dir_name, delay)
          # interpolation, as dir_name may be nil here
          @logger.info "Skipping #{dir_name}, Pure UUID #{d['uuid']}"
          next
        end

        dir_file_path = dir_base_path + '/' + dir_name
        dir_metadata_path = dir_file_path + '/metadata/'
        metadata_filename = dir_metadata_path + 'metadata.json'

        # do we have enough space in filesystem to fetch data files?
        download_storage_required = d['file'].inject(0) { |sum, f| sum + f['size'].to_i }
        unless enough_storage_for_download? download_storage_required
          @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
          next
        end

        # has metadata file been created? if so, files and metadata are in place
        next if File.size? metadata_filename

        @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']

        data = []
        failed = []
        d['file'].each do |f|
          data << package_dataset_metadata(d, f)
          failed << f['name'].to_s unless fetch_file(dir_file_path, f)
        end

        unless failed.empty?
          @logger.error 'Failed to fetch ' + failed.join(', ') + ' into ' + dir_file_path +
                        '. Metadata file not created.'
          next
        end

        Dir.mkdir(dir_metadata_path) unless Dir.exist?(dir_metadata_path)
        pretty = JSON.pretty_generate( data, :indent => ' ')
        File.write(metadata_filename, pretty)
        @logger.info 'Created ' + metadata_filename
      end
    end

    private

    # Should a transfer directory be prepared for this dataset?
    #
    # continue only if dir_name is not empty (e.g. because there was no DOI)
    # continue only if there is no DB entry
    # continue only if the dataset has a DOI
    # continue only if there are files for this resource
    # continue only if it is time to preserve
    #
    # @param d [Hash] dataset metadata
    # @param dir_name [String, nil]
    # @param delay [Integer]
    # @return [Boolean]
    def preparable?(d, dir_name, delay)
      !dir_name.nil? &&
        !dir_name.empty? &&
        !@report.in_db?(dir_name) &&
        !d['doi'].empty? &&
        !d['file'].empty? &&
        time_to_preserve?(d['modified'], delay)
    end

    # Fetch one file with wget into the given directory, creating the
    # directory if needed and deleting a non-empty file of the same name first.
    #
    # @param dir_file_path [String]
    # @param f [Hash] file metadata; uses the 'url' and 'name' keys
    # @return [Boolean] whether wget reported success
    def fetch_file(dir_file_path, f)
      wget_str = build_wget Puree.username,
                            Puree.password,
                            f['url']

      Dir.mkdir(dir_file_path) unless Dir.exist?(dir_file_path)

      Dir.chdir(dir_file_path) do
        File.delete(f['name']) if File.size?(f['name'])
        `#{wget_str}`
        $?.success?
      end
    end

    # Build the description of one file of a dataset.
    #
    # @param d [Hash] dataset metadata
    # @param f [Hash] file metadata
    # @return [Hash]
    def package_dataset_metadata(d, f)
      o = {}
      o['filename'] = 'objects/' + f['name']
      o['dc.title'] = d['title']
      o['dc.description'] = d['description'] unless d['description'].empty?
      o['dcterms.created'] = d['created']
      unless d['available']['year'].empty?
        o['dcterms.available'] = Puree::Date.iso(d['available'])
      end
      o['dc.publisher'] = d['publisher']
      o['dc.identifier'] = d['doi'] unless d['doi'].empty?
      o['dcterms.spatial'] = d['spatial'] unless d['spatial'].empty?

      unless d['temporal']['start']['year'].empty?
        # start date, optionally followed by '/' and the end date
        range = [Puree::Date.iso(d['temporal']['start'])]
        unless d['temporal']['end']['year'].empty?
          range << Puree::Date.iso(d['temporal']['end'])
        end
        o['dcterms.temporal'] = range.join('/')
      end

      o['dc.creator'] = people_with_role(d, 'Creator')
      contributors = people_with_role(d, 'Contributor')
      o['dc.contributor'] = contributors unless contributors.empty?

      keywords = d['keyword'].to_a.dup
      o['dc.subject'] = keywords unless keywords.empty?

      unless f['license']['name'].empty?
        o['dcterms.license'] = f['license']['name']
      end
      # o['dc.format'] = f['mime']

      related = related_publications(d)
      o['related'] = related unless related.empty?

      o
    end

    # Names ("Last, First") of the internal, external and other people having
    # the given role, in that order.
    #
    # @param d [Hash] dataset metadata
    # @param role [String]
    # @return [Array<String>]
    def people_with_role(d, role)
      %w(internal external other).flat_map do |person_type|
        matching = d['person'][person_type].select { |person| person['role'] == role }
        matching.map { |person| person['name']['last'] + ', ' + person['name']['first'] }
      end
    end

    # Title, type and (when available) DOI of each publication related to the
    # dataset. The DOI is looked up from the publication itself.
    #
    # @param d [Hash] dataset metadata
    # @return [Array<Hash>]
    def related_publications(d)
      d['publication'].map do |publication|
        entry = {}
        entry['dc.title'] = publication['title']
        entry['type'] = publication['type']
        pub = Puree::Publication.new
        pub.find uuid: publication['uuid']
        doi = pub.doi
        entry['dc.identifier'] = doi if doi && !doi.to_s.empty?
        entry
      end
    end

  end

end
@@ -0,0 +1,19 @@
1
module Preservation

  # String utilities
  #
  module StringUtil
    # Binary to hexadecimal
    #
    # Every byte becomes exactly two lowercase hexadecimal digits (bytes below
    # 0x10 are zero-padded), so the result can be decoded by hex_to_bin.
    #
    # @param s [String]
    # @return [String]
    def self.bin_to_hex(s)
      s.each_byte.map { |byte| format('%02x', byte) }.join
    end

    # Hexadecimal to binary
    #
    # Digits are read in pairs; a trailing unpaired digit is ignored.
    #
    # @param s [String] hexadecimal digits, upper or lower case
    # @return [String]
    def self.hex_to_bin(s)
      s.scan(/../).map { |pair| pair.hex.chr }.join
    end

  end

end
@@ -0,0 +1,5 @@
1
module Preservation
  # Semantic version number of the gem (major.minor.patch)
  #
  VERSION = '0.1.0'
end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'preservation/version'
5
+
6
# Gem specification: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'preservation'
  spec.version     = Preservation::VERSION
  spec.authors     = ['Adrian Albin-Clark']
  spec.email       = ['a.albin-clark@lancaster.ac.uk']
  spec.summary     = "Ingest management for Archivematica's Automation Tools."
  spec.description = "Transfer preparation, reporting and disk space management for Archivematica's Automation Tools."
  spec.homepage    = 'https://rubygems.org/gems/preservation'
  spec.license     = 'MIT'

  # Package every file tracked by git; executables and tests are picked out
  # of that list by directory.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.required_ruby_version = '~> 2.1'

  # Runtime dependencies
  spec.add_runtime_dependency 'free_disk_space', '~> 1.0'
  spec.add_runtime_dependency 'puree', '~> 0.17'
  spec.add_runtime_dependency 'sqlite3', '~> 1.3'
end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: preservation
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Adrian Albin-Clark
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-09-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: free_disk_space
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: puree
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.17'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.17'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ description: Transfer preparation, reporting and disk space management for Archivematica's
56
+ Automation Tools.
57
+ email:
58
+ - a.albin-clark@lancaster.ac.uk
59
+ executables: []
60
+ extensions: []
61
+ extra_rdoc_files: []
62
+ files:
63
+ - ".gitignore"
64
+ - CHANGELOG.md
65
+ - Gemfile
66
+ - LICENSE.txt
67
+ - PITCHME.md
68
+ - README.md
69
+ - Rakefile
70
+ - lib/preservation.rb
71
+ - lib/preservation/configuration.rb
72
+ - lib/preservation/ingest.rb
73
+ - lib/preservation/ingest_report.rb
74
+ - lib/preservation/pure_ingest.rb
75
+ - lib/preservation/string_util.rb
76
+ - lib/preservation/version.rb
77
+ - preservation.gemspec
78
+ homepage: https://rubygems.org/gems/preservation
79
+ licenses:
80
+ - MIT
81
+ metadata: {}
82
+ post_install_message:
83
+ rdoc_options: []
84
+ require_paths:
85
+ - lib
86
+ required_ruby_version: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '2.1'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ requirements: []
97
+ rubyforge_project:
98
+ rubygems_version: 2.2.2
99
+ signing_key:
100
+ specification_version: 4
101
+ summary: Ingest management for Archivematica's Automation Tools.
102
+ test_files: []
103
+ has_rdoc: