preservation 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  module Preservation
2
2
  # Semantic version number
3
3
  #
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
data/preservation.gemspec CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Preservation::VERSION
9
9
  spec.authors = ["Adrian Albin-Clark"]
10
10
  spec.email = ["a.albin-clark@lancaster.ac.uk"]
11
- spec.summary = %q{Ingest management for Archivematica's Automation Tools.}
12
- spec.description = %q{Transfer preparation, reporting and disk space management for Archivematica's Automation Tools.}
13
- spec.homepage = "https://rubygems.org/gems/preservation"
11
+ spec.summary = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools.}
12
+ spec.description = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools. Includes transfer preparation, reporting and disk space management.}
13
+ spec.homepage = "https://aalbinclark.gitbooks.io/preservation"
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files -z`.split("\x0")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preservation
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adrian Albin-Clark
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-13 00:00:00.000000000 Z
11
+ date: 2016-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: free_disk_space
@@ -52,8 +52,8 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.3'
55
- description: Transfer preparation, reporting and disk space management for Archivematica's
56
- Automation Tools.
55
+ description: Extraction and Transformation for Loading by Archivematica's Automation
56
+ Tools. Includes transfer preparation, reporting and disk space management.
57
57
  email:
58
58
  - a.albin-clark@lancaster.ac.uk
59
59
  executables: []
@@ -68,14 +68,18 @@ files:
68
68
  - README.md
69
69
  - Rakefile
70
70
  - lib/preservation.rb
71
+ - lib/preservation/builder.rb
71
72
  - lib/preservation/configuration.rb
73
+ - lib/preservation/conversion.rb
72
74
  - lib/preservation/ingest.rb
73
- - lib/preservation/ingest_report.rb
74
- - lib/preservation/pure_ingest.rb
75
- - lib/preservation/string_util.rb
75
+ - lib/preservation/report/database.rb
76
+ - lib/preservation/report/transfer.rb
77
+ - lib/preservation/storage.rb
78
+ - lib/preservation/temporal.rb
79
+ - lib/preservation/transfer/pure.rb
76
80
  - lib/preservation/version.rb
77
81
  - preservation.gemspec
78
- homepage: https://rubygems.org/gems/preservation
82
+ homepage: https://aalbinclark.gitbooks.io/preservation
79
83
  licenses:
80
84
  - MIT
81
85
  metadata: {}
@@ -98,6 +102,6 @@ rubyforge_project:
98
102
  rubygems_version: 2.2.2
99
103
  signing_key:
100
104
  specification_version: 4
101
- summary: Ingest management for Archivematica's Automation Tools.
105
+ summary: Extraction and Transformation for Loading by Archivematica's Automation Tools.
102
106
  test_files: []
103
107
  has_rdoc:
@@ -1,172 +0,0 @@
1
- module Preservation
2
-
3
- # Ingest reporting
4
- #
5
- class IngestReport
6
-
7
- def initialize
8
- create_db_connection
9
- end
10
-
11
- # Transfers based on presence (or not) of a particular status
12
- #
13
- # @param status_to_find [String]
14
- # @param status_presence [Boolean]
15
- def transfer_status(status_to_find: nil, status_presence: true)
16
- if status_presence === true
17
- status_presence = '='
18
- else
19
- status_presence = '<>'
20
- end
21
-
22
- query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
23
-
24
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
25
- # and use hex function in DB query
26
- records = []
27
- @db.results_as_hash = true
28
- @db.execute( query, [ status_to_find ] ) do |row|
29
- id = row['id']
30
- uuid = row['uuid']
31
- bin_path = StringUtil.hex_to_bin row['hex_path']
32
- unit_type = row['unit_type']
33
- status = row['status']
34
- microservice = row['microservice']
35
- current = row['current']
36
- o = {}
37
- o['path'] = bin_path if !bin_path.empty?
38
- o['unit_type'] = unit_type if !unit_type.empty?
39
- o['status'] = status if !status.empty?
40
- o['microservice'] = microservice if !microservice.empty?
41
- o['current'] = current if current
42
- o['id'] = id if id
43
- o['uuid'] = uuid if !uuid.empty?
44
-
45
- records << o
46
- end
47
-
48
- records
49
- end
50
-
51
- # Current transfer
52
- #
53
- # @return [Hash]
54
- def transfer_current
55
- query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"
56
-
57
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
58
- # and use hex function in DB query
59
- o = {}
60
- @db.results_as_hash = true
61
- @db.execute( query ) do |row|
62
- id = row['id']
63
- uuid = row['uuid']
64
- bin_path = hex_to_bin row['hex_path']
65
- unit_type = row['unit_type']
66
- status = row['status']
67
- microservice = row['microservice']
68
- current = row['current']
69
- o['path'] = bin_path if !bin_path.empty?
70
- o['unit_type'] = unit_type if !unit_type.empty?
71
- o['status'] = status if !status.empty?
72
- o['microservice'] = microservice if !microservice.empty?
73
- o['current'] = current if current
74
- o['id'] = id if id
75
- o['uuid'] = uuid if !uuid.empty?
76
- end
77
- o
78
- end
79
-
80
- # Count of complete transfers
81
- #
82
- # @return [Integer]
83
- def transfer_complete_count
84
- query = 'SELECT count(*) FROM unit WHERE status = ?'
85
-
86
- status_to_find = 'COMPLETE'
87
- @db.results_as_hash = true
88
- @db.get_first_value( query, [status_to_find] )
89
- end
90
-
91
- # Compilation of statistics and data, with focus on exceptions
92
- #
93
- # @return [Hash]
94
- def transfer_exception
95
- incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
96
- failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
97
- current = transfer_current
98
- complete_count = transfer_complete_count
99
- report = {}
100
- report['current'] = current if !current.empty?
101
- report['failed'] = {}
102
- report['failed']['count'] = failed.count
103
- report['failed']['data'] = failed if !failed.empty?
104
- report['incomplete'] = {}
105
- report['incomplete']['count'] = incomplete.count
106
- report['incomplete']['data'] = incomplete if !incomplete.empty?
107
- report['complete'] = {}
108
- report['complete']['count'] = complete_count if complete_count
109
- report
110
- end
111
-
112
- # Is it in database?
113
- # @param path_to_find [String] directory name within ingest path
114
- # @return [Boolean]
115
- def in_db?(path_to_find)
116
- in_db = false
117
-
118
- # Get path out of DB as a hex string
119
- query = 'SELECT hex(path) FROM unit'
120
-
121
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
122
- # and use hex function in DB query
123
- @db.execute( query ) do |row|
124
- bin_path = StringUtil.hex_to_bin row[0]
125
- if bin_path === path_to_find
126
- in_db = true
127
- end
128
- end
129
-
130
- in_db
131
- end
132
-
133
- # Has preservation been done?
134
- # @param path_to_find [String] directory name within ingest path
135
- # @return [Boolean]
136
- def preserved?(path_to_find)
137
- preserved = false
138
-
139
- # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
140
- # indicates completed
141
- unit_type_to_find = 'ingest'
142
- status_to_find = 'COMPLETE'
143
-
144
- # Get path out of DB as a hex string for completed ingests
145
- query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'
146
-
147
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
148
- # and use hex function in DB query
149
- @db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
150
- bin_path = StringUtil.hex_to_bin row[0]
151
- if bin_path === path_to_find
152
- preserved = true
153
- end
154
- end
155
-
156
- preserved
157
- end
158
-
159
-
160
- private
161
-
162
- def create_db_connection
163
- if Preservation.db_path.nil?
164
- puts 'Missing db_path'
165
- exit
166
- end
167
- @db = SQLite3::Database.new Preservation.db_path
168
- end
169
-
170
- end
171
-
172
- end
@@ -1,188 +0,0 @@
1
- module Preservation
2
-
3
- # Ingest for Pure
4
- #
5
- class PureIngest < Ingest
6
-
7
- def initialize
8
- super
9
- end
10
-
11
- # For each uuid, if necessary, fetch the metadata,
12
- # prepare a directory in the ingest path and populate it with the files and
13
- # JSON description file.
14
- #
15
- # @param uuids [Array<String>] uuids to preserve
16
- # @param dir_name_scheme [Symbol] method to make directory name
17
- # @param delay [Integer] days to wait (after modification date) before preserving
18
- def prepare_dataset(uuids: [],
19
- dir_name_scheme: :uuid,
20
- delay: 0)
21
- dir_base_path = Preservation.ingest_path
22
-
23
- uuids.each do |uuid|
24
- dataset = Puree::Dataset.new
25
- dataset.find uuid: uuid
26
- d = dataset.metadata
27
- if d.empty?
28
- @logger.info 'No metadata for ' + uuid
29
- next
30
- end
31
- # configurable to become more human-readable
32
- dir_name = build_directory_name(d, dir_name_scheme)
33
-
34
- # continue only if dir_name is not empty (e.g. because there was no DOI)
35
- # continue only if there is no DB entry
36
- # continue only if the dataset has a DOI
37
- # continue only if there are files for this resource
38
- # continue only if it is time to preserve
39
- if !dir_name.nil? &&
40
- !dir_name.empty? &&
41
- !@report.in_db?(dir_name) &&
42
- !d['doi'].empty? &&
43
- !d['file'].empty? &&
44
- time_to_preserve?(d['modified'], delay)
45
-
46
- dir_file_path = dir_base_path + '/' + dir_name
47
- dir_metadata_path = dir_file_path + '/metadata/'
48
- metadata_filename = dir_metadata_path + 'metadata.json'
49
-
50
- # calculate total size of data files
51
- download_storage_required = 0
52
- d['file'].each { |i| download_storage_required += i['size'].to_i }
53
-
54
- # do we have enough space in filesystem to fetch data files?
55
- if enough_storage_for_download? download_storage_required
56
- # @logger.info 'Sufficient disk space for ' + dir_file_path
57
- else
58
- @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
59
- next
60
- end
61
-
62
- # has metadata file been created? if so, files and metadata are in place
63
- # continue only if files not present in ingest location
64
- if !File.size? metadata_filename
65
-
66
- @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
67
-
68
- data = []
69
- d['file'].each do |f|
70
- o = package_dataset_metadata d, f
71
- data << o
72
- wget_str = build_wget Puree.username,
73
- Puree.password,
74
- f['url']
75
-
76
- Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
77
-
78
- # fetch the file
79
- Dir.chdir(dir_file_path) do
80
- # puts 'Changing dir to ' + Dir.pwd
81
- # puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
82
- if File.size?(f['name'])
83
- # puts 'Should be deleting ' + f['name']
84
- File.delete(f['name'])
85
- end
86
- # puts f['name'] + ' missing or empty'
87
- # puts wget_str
88
- `#{wget_str}`
89
- end
90
- end
91
-
92
- Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
93
-
94
- pretty = JSON.pretty_generate( data, :indent => ' ')
95
- # puts pretty
96
- File.write(metadata_filename,pretty)
97
- @logger.info 'Created ' + metadata_filename
98
- end
99
- else
100
- @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
101
- end
102
- end
103
- end
104
-
105
- private
106
-
107
- def package_dataset_metadata(d, f)
108
- o = {}
109
- o['filename'] = 'objects/' + f['name']
110
- o['dc.title'] = d['title']
111
- if !d['description'].empty?
112
- o['dc.description'] = d['description']
113
- end
114
- o['dcterms.created'] = d['created']
115
- if !d['available']['year'].empty?
116
- o['dcterms.available'] = Puree::Date.iso(d['available'])
117
- end
118
- o['dc.publisher'] = d['publisher']
119
- if !d['doi'].empty?
120
- o['dc.identifier'] = d['doi']
121
- end
122
- if !d['spatial'].empty?
123
- o['dcterms.spatial'] = d['spatial']
124
- end
125
- if !d['temporal']['start']['year'].empty?
126
- temporal_range = ''
127
- temporal_range << Puree::Date.iso(d['temporal']['start'])
128
- if !d['temporal']['end']['year'].empty?
129
- temporal_range << '/'
130
- temporal_range << Puree::Date.iso(d['temporal']['end'])
131
- end
132
- o['dcterms.temporal'] = temporal_range
133
- end
134
- creators = []
135
- contributors = []
136
- person_types = %w(internal external other)
137
- person_types.each do |person_type|
138
- d['person'][person_type].each do |i|
139
- if i['role'] == 'Creator'
140
- creator = i['name']['last'] + ', ' + i['name']['first']
141
- creators << creator
142
- end
143
- if i['role'] == 'Contributor'
144
- contributor = i['name']['last'] + ', ' + i['name']['first']
145
- contributors << contributor
146
- end
147
- end
148
- end
149
- o['dc.creator'] = creators
150
- if !contributors.empty?
151
- o['dc.contributor'] = contributors
152
- end
153
- keywords = []
154
- d['keyword'].each { |i|
155
- keywords << i
156
- }
157
- if !keywords.empty?
158
- o['dc.subject'] = keywords
159
- end
160
- if !f['license']['name'].empty?
161
- o['dcterms.license'] = f['license']['name']
162
- end
163
- # o['dc.format'] = f['mime']
164
-
165
- related = []
166
- publications = d['publication']
167
- publications.each do |i|
168
- o_related = {}
169
- o_related['dc.title'] = i['title']
170
- o_related['type'] = i['type']
171
- pub = Puree::Publication.new
172
- pub.find uuid: i['uuid']
173
- doi = pub.doi
174
- if doi
175
- o_related['dc.identifier'] = doi
176
- end
177
- related << o_related
178
- end
179
- if !related.empty?
180
- o['related'] = related
181
- end
182
-
183
- o
184
- end
185
-
186
- end
187
-
188
- end