preservation 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  module Preservation
2
2
  # Semantic version number
3
3
  #
4
- VERSION = "0.1.0"
4
+ VERSION = "0.2.0"
5
5
  end
data/preservation.gemspec CHANGED
@@ -8,9 +8,9 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Preservation::VERSION
9
9
  spec.authors = ["Adrian Albin-Clark"]
10
10
  spec.email = ["a.albin-clark@lancaster.ac.uk"]
11
- spec.summary = %q{Ingest management for Archivematica's Automation Tools.}
12
- spec.description = %q{Transfer preparation, reporting and disk space management for Archivematica's Automation Tools.}
13
- spec.homepage = "https://rubygems.org/gems/preservation"
11
+ spec.summary = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools.}
12
+ spec.description = %q{Extraction and Transformation for Loading by Archivematica's Automation Tools. Includes transfer preparation, reporting and disk space management.}
13
+ spec.homepage = "https://aalbinclark.gitbooks.io/preservation"
14
14
  spec.license = "MIT"
15
15
 
16
16
  spec.files = `git ls-files -z`.split("\x0")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: preservation
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adrian Albin-Clark
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-13 00:00:00.000000000 Z
11
+ date: 2016-09-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: free_disk_space
@@ -52,8 +52,8 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.3'
55
- description: Transfer preparation, reporting and disk space management for Archivematica's
56
- Automation Tools.
55
+ description: Extraction and Transformation for Loading by Archivematica's Automation
56
+ Tools. Includes transfer preparation, reporting and disk space management.
57
57
  email:
58
58
  - a.albin-clark@lancaster.ac.uk
59
59
  executables: []
@@ -68,14 +68,18 @@ files:
68
68
  - README.md
69
69
  - Rakefile
70
70
  - lib/preservation.rb
71
+ - lib/preservation/builder.rb
71
72
  - lib/preservation/configuration.rb
73
+ - lib/preservation/conversion.rb
72
74
  - lib/preservation/ingest.rb
73
- - lib/preservation/ingest_report.rb
74
- - lib/preservation/pure_ingest.rb
75
- - lib/preservation/string_util.rb
75
+ - lib/preservation/report/database.rb
76
+ - lib/preservation/report/transfer.rb
77
+ - lib/preservation/storage.rb
78
+ - lib/preservation/temporal.rb
79
+ - lib/preservation/transfer/pure.rb
76
80
  - lib/preservation/version.rb
77
81
  - preservation.gemspec
78
- homepage: https://rubygems.org/gems/preservation
82
+ homepage: https://aalbinclark.gitbooks.io/preservation
79
83
  licenses:
80
84
  - MIT
81
85
  metadata: {}
@@ -98,6 +102,6 @@ rubyforge_project:
98
102
  rubygems_version: 2.2.2
99
103
  signing_key:
100
104
  specification_version: 4
101
- summary: Ingest management for Archivematica's Automation Tools.
105
+ summary: Extraction and Transformation for Loading by Archivematica's Automation Tools.
102
106
  test_files: []
103
107
  has_rdoc:
@@ -1,172 +0,0 @@
1
- module Preservation
2
-
3
- # Ingest reporting
4
- #
5
- class IngestReport
6
-
7
- def initialize
8
- create_db_connection
9
- end
10
-
11
- # Transfers based on presence (or not) of a particular status
12
- #
13
- # @param status_to_find [String]
14
- # @param status_presence [Boolean]
15
- def transfer_status(status_to_find: nil, status_presence: true)
16
- if status_presence === true
17
- status_presence = '='
18
- else
19
- status_presence = '<>'
20
- end
21
-
22
- query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE status #{status_presence} ?"
23
-
24
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
25
- # and use hex function in DB query
26
- records = []
27
- @db.results_as_hash = true
28
- @db.execute( query, [ status_to_find ] ) do |row|
29
- id = row['id']
30
- uuid = row['uuid']
31
- bin_path = StringUtil.hex_to_bin row['hex_path']
32
- unit_type = row['unit_type']
33
- status = row['status']
34
- microservice = row['microservice']
35
- current = row['current']
36
- o = {}
37
- o['path'] = bin_path if !bin_path.empty?
38
- o['unit_type'] = unit_type if !unit_type.empty?
39
- o['status'] = status if !status.empty?
40
- o['microservice'] = microservice if !microservice.empty?
41
- o['current'] = current if current
42
- o['id'] = id if id
43
- o['uuid'] = uuid if !uuid.empty?
44
-
45
- records << o
46
- end
47
-
48
- records
49
- end
50
-
51
- # Current transfer
52
- #
53
- # @return [Hash]
54
- def transfer_current
55
- query = "SELECT id, uuid, hex(path) as hex_path, unit_type, status, microservice, current FROM unit WHERE current = 1"
56
-
57
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
58
- # and use hex function in DB query
59
- o = {}
60
- @db.results_as_hash = true
61
- @db.execute( query ) do |row|
62
- id = row['id']
63
- uuid = row['uuid']
64
- bin_path = hex_to_bin row['hex_path']
65
- unit_type = row['unit_type']
66
- status = row['status']
67
- microservice = row['microservice']
68
- current = row['current']
69
- o['path'] = bin_path if !bin_path.empty?
70
- o['unit_type'] = unit_type if !unit_type.empty?
71
- o['status'] = status if !status.empty?
72
- o['microservice'] = microservice if !microservice.empty?
73
- o['current'] = current if current
74
- o['id'] = id if id
75
- o['uuid'] = uuid if !uuid.empty?
76
- end
77
- o
78
- end
79
-
80
- # Count of complete transfers
81
- #
82
- # @return [Integer]
83
- def transfer_complete_count
84
- query = 'SELECT count(*) FROM unit WHERE status = ?'
85
-
86
- status_to_find = 'COMPLETE'
87
- @db.results_as_hash = true
88
- @db.get_first_value( query, [status_to_find] )
89
- end
90
-
91
- # Compilation of statistics and data, with focus on exceptions
92
- #
93
- # @return [Hash]
94
- def transfer_exception
95
- incomplete = transfer_status(status_to_find: 'COMPLETE', status_presence: false)
96
- failed = transfer_status(status_to_find: 'FAILED', status_presence: true)
97
- current = transfer_current
98
- complete_count = transfer_complete_count
99
- report = {}
100
- report['current'] = current if !current.empty?
101
- report['failed'] = {}
102
- report['failed']['count'] = failed.count
103
- report['failed']['data'] = failed if !failed.empty?
104
- report['incomplete'] = {}
105
- report['incomplete']['count'] = incomplete.count
106
- report['incomplete']['data'] = incomplete if !incomplete.empty?
107
- report['complete'] = {}
108
- report['complete']['count'] = complete_count if complete_count
109
- report
110
- end
111
-
112
- # Is it in database?
113
- # @param path_to_find [String] directory name within ingest path
114
- # @return [Boolean]
115
- def in_db?(path_to_find)
116
- in_db = false
117
-
118
- # Get path out of DB as a hex string
119
- query = 'SELECT hex(path) FROM unit'
120
-
121
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
122
- # and use hex function in DB query
123
- @db.execute( query ) do |row|
124
- bin_path = StringUtil.hex_to_bin row[0]
125
- if bin_path === path_to_find
126
- in_db = true
127
- end
128
- end
129
-
130
- in_db
131
- end
132
-
133
- # Has preservation been done?
134
- # @param path_to_find [String] directory name within ingest path
135
- # @return [Boolean]
136
- def preserved?(path_to_find)
137
- preserved = false
138
-
139
- # 'ingest' value in unit_type and 'COMPLETE' value in status DB fields
140
- # indicates completed
141
- unit_type_to_find = 'ingest'
142
- status_to_find = 'COMPLETE'
143
-
144
- # Get path out of DB as a hex string for completed ingests
145
- query = 'SELECT hex(path) FROM unit WHERE unit_type = ? AND status = ?'
146
-
147
- # Archivematica stores path as BLOB, so need to convert path to Hex, to search for it
148
- # and use hex function in DB query
149
- @db.execute( query, [ unit_type_to_find, status_to_find ] ) do |row|
150
- bin_path = StringUtil.hex_to_bin row[0]
151
- if bin_path === path_to_find
152
- preserved = true
153
- end
154
- end
155
-
156
- preserved
157
- end
158
-
159
-
160
- private
161
-
162
- def create_db_connection
163
- if Preservation.db_path.nil?
164
- puts 'Missing db_path'
165
- exit
166
- end
167
- @db = SQLite3::Database.new Preservation.db_path
168
- end
169
-
170
- end
171
-
172
- end
@@ -1,188 +0,0 @@
1
- module Preservation
2
-
3
- # Ingest for Pure
4
- #
5
- class PureIngest < Ingest
6
-
7
- def initialize
8
- super
9
- end
10
-
11
- # For each uuid, if necessary, fetch the metadata,
12
- # prepare a directory in the ingest path and populate it with the files and
13
- # JSON description file.
14
- #
15
- # @param uuids [Array<String>] uuids to preserve
16
- # @param dir_name_scheme [Symbol] method to make directory name
17
- # @param delay [Integer] days to wait (after modification date) before preserving
18
- def prepare_dataset(uuids: [],
19
- dir_name_scheme: :uuid,
20
- delay: 0)
21
- dir_base_path = Preservation.ingest_path
22
-
23
- uuids.each do |uuid|
24
- dataset = Puree::Dataset.new
25
- dataset.find uuid: uuid
26
- d = dataset.metadata
27
- if d.empty?
28
- @logger.info 'No metadata for ' + uuid
29
- next
30
- end
31
- # configurable to become more human-readable
32
- dir_name = build_directory_name(d, dir_name_scheme)
33
-
34
- # continue only if dir_name is not empty (e.g. because there was no DOI)
35
- # continue only if there is no DB entry
36
- # continue only if the dataset has a DOI
37
- # continue only if there are files for this resource
38
- # continue only if it is time to preserve
39
- if !dir_name.nil? &&
40
- !dir_name.empty? &&
41
- !@report.in_db?(dir_name) &&
42
- !d['doi'].empty? &&
43
- !d['file'].empty? &&
44
- time_to_preserve?(d['modified'], delay)
45
-
46
- dir_file_path = dir_base_path + '/' + dir_name
47
- dir_metadata_path = dir_file_path + '/metadata/'
48
- metadata_filename = dir_metadata_path + 'metadata.json'
49
-
50
- # calculate total size of data files
51
- download_storage_required = 0
52
- d['file'].each { |i| download_storage_required += i['size'].to_i }
53
-
54
- # do we have enough space in filesystem to fetch data files?
55
- if enough_storage_for_download? download_storage_required
56
- # @logger.info 'Sufficient disk space for ' + dir_file_path
57
- else
58
- @logger.error 'Insufficient disk space to store files fetched from Pure. Skipping ' + dir_file_path
59
- next
60
- end
61
-
62
- # has metadata file been created? if so, files and metadata are in place
63
- # continue only if files not present in ingest location
64
- if !File.size? metadata_filename
65
-
66
- @logger.info 'Preparing ' + dir_name + ', Pure UUID ' + d['uuid']
67
-
68
- data = []
69
- d['file'].each do |f|
70
- o = package_dataset_metadata d, f
71
- data << o
72
- wget_str = build_wget Puree.username,
73
- Puree.password,
74
- f['url']
75
-
76
- Dir.mkdir(dir_file_path) if !Dir.exists?(dir_file_path)
77
-
78
- # fetch the file
79
- Dir.chdir(dir_file_path) do
80
- # puts 'Changing dir to ' + Dir.pwd
81
- # puts 'Size of ' + f['name'] + ' is ' + File.size(f['name']).to_s
82
- if File.size?(f['name'])
83
- # puts 'Should be deleting ' + f['name']
84
- File.delete(f['name'])
85
- end
86
- # puts f['name'] + ' missing or empty'
87
- # puts wget_str
88
- `#{wget_str}`
89
- end
90
- end
91
-
92
- Dir.mkdir(dir_metadata_path) if !Dir.exists?(dir_metadata_path)
93
-
94
- pretty = JSON.pretty_generate( data, :indent => ' ')
95
- # puts pretty
96
- File.write(metadata_filename,pretty)
97
- @logger.info 'Created ' + metadata_filename
98
- end
99
- else
100
- @logger.info 'Skipping ' + dir_name + ', Pure UUID ' + d['uuid']
101
- end
102
- end
103
- end
104
-
105
- private
106
-
107
- def package_dataset_metadata(d, f)
108
- o = {}
109
- o['filename'] = 'objects/' + f['name']
110
- o['dc.title'] = d['title']
111
- if !d['description'].empty?
112
- o['dc.description'] = d['description']
113
- end
114
- o['dcterms.created'] = d['created']
115
- if !d['available']['year'].empty?
116
- o['dcterms.available'] = Puree::Date.iso(d['available'])
117
- end
118
- o['dc.publisher'] = d['publisher']
119
- if !d['doi'].empty?
120
- o['dc.identifier'] = d['doi']
121
- end
122
- if !d['spatial'].empty?
123
- o['dcterms.spatial'] = d['spatial']
124
- end
125
- if !d['temporal']['start']['year'].empty?
126
- temporal_range = ''
127
- temporal_range << Puree::Date.iso(d['temporal']['start'])
128
- if !d['temporal']['end']['year'].empty?
129
- temporal_range << '/'
130
- temporal_range << Puree::Date.iso(d['temporal']['end'])
131
- end
132
- o['dcterms.temporal'] = temporal_range
133
- end
134
- creators = []
135
- contributors = []
136
- person_types = %w(internal external other)
137
- person_types.each do |person_type|
138
- d['person'][person_type].each do |i|
139
- if i['role'] == 'Creator'
140
- creator = i['name']['last'] + ', ' + i['name']['first']
141
- creators << creator
142
- end
143
- if i['role'] == 'Contributor'
144
- contributor = i['name']['last'] + ', ' + i['name']['first']
145
- contributors << contributor
146
- end
147
- end
148
- end
149
- o['dc.creator'] = creators
150
- if !contributors.empty?
151
- o['dc.contributor'] = contributors
152
- end
153
- keywords = []
154
- d['keyword'].each { |i|
155
- keywords << i
156
- }
157
- if !keywords.empty?
158
- o['dc.subject'] = keywords
159
- end
160
- if !f['license']['name'].empty?
161
- o['dcterms.license'] = f['license']['name']
162
- end
163
- # o['dc.format'] = f['mime']
164
-
165
- related = []
166
- publications = d['publication']
167
- publications.each do |i|
168
- o_related = {}
169
- o_related['dc.title'] = i['title']
170
- o_related['type'] = i['type']
171
- pub = Puree::Publication.new
172
- pub.find uuid: i['uuid']
173
- doi = pub.doi
174
- if doi
175
- o_related['dc.identifier'] = doi
176
- end
177
- related << o_related
178
- end
179
- if !related.empty?
180
- o['related'] = related
181
- end
182
-
183
- o
184
- end
185
-
186
- end
187
-
188
- end