bulkrax 3.4.0 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -5
- data/app/controllers/bulkrax/exporters_controller.rb +1 -1
- data/app/jobs/bulkrax/create_relationships_job.rb +4 -2
- data/app/models/bulkrax/entry.rb +0 -2
- data/app/models/bulkrax/exporter.rb +15 -2
- data/app/models/concerns/bulkrax/dynamic_record_lookup.rb +7 -8
- data/app/models/concerns/bulkrax/export_behavior.rb +0 -22
- data/app/models/concerns/bulkrax/file_set_entry_behavior.rb +5 -1
- data/app/models/concerns/bulkrax/import_behavior.rb +2 -2
- data/app/parsers/bulkrax/application_parser.rb +6 -25
- data/app/parsers/bulkrax/bagit_parser.rb +69 -160
- data/app/parsers/bulkrax/csv_parser.rb +54 -10
- data/app/views/bulkrax/exporters/_downloads.html.erb +8 -0
- data/app/views/bulkrax/exporters/_form.html.erb +3 -0
- data/app/views/bulkrax/exporters/index.html.erb +5 -2
- data/app/views/bulkrax/exporters/show.html.erb +4 -7
- data/lib/bulkrax/version.rb +1 -1
- data/lib/tasks/bulkrax_tasks.rake +28 -4
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eb56d86ee90ae9e1cf0628504694e1301ab8f2d6b24ffa8fd323f8953a8ee956
|
4
|
+
data.tar.gz: 71056b077e300f27eee3bcccd9d7e2bee2fc7bdf2fc6ba9248b69a29f3994f9c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05ea49e6f2c5e73cbddacf35dcaf9de499760d7093e3ae8f3ce4ea5ab28e25d065b7877607436fcbe02d21e17c2df940b0224f1ea7d638a602486ce807d99981
|
7
|
+
data.tar.gz: ecdda29924e09793e62684f16ebcd79cd90ab9e8204d011b89a577ca667a644709ff2df5b525ac6c32355426038c05d9fc3d74c6efb20ee7cfab653d9b89b67a
|
data/README.md
CHANGED
@@ -70,7 +70,7 @@ Bulkrax.setup do |config|
|
|
70
70
|
end
|
71
71
|
```
|
72
72
|
|
73
|
-
The [configuration guide](https://github.com/samvera-labs/bulkrax/wiki/
|
73
|
+
The [configuration guide](https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax) provides detailed instructions on the various available configurations.
|
74
74
|
|
75
75
|
Example:
|
76
76
|
|
@@ -120,7 +120,7 @@ It's unlikely that the incoming import data has fields that exactly match those
|
|
120
120
|
|
121
121
|
By default, a mapping for the OAI parser has been added to map standard oai_dc fields to Hyrax basic_metadata. The other parsers have no default mapping, and will map any incoming fields to Hyrax properties with the same name. Configurations can be added in `config/initializers/bulkrax.rb`.
|
122
122
|
|
123
|
-
Configuring field mappings is documented in the [Bulkrax Configuration Guide](https://github.com/samvera-labs/bulkrax/wiki/
|
123
|
+
Configuring field mappings is documented in the [Bulkrax Configuration Guide](https://github.com/samvera-labs/bulkrax/wiki/Configuring-Bulkrax).
|
124
124
|
|
125
125
|
## Importing Files
|
126
126
|
|
@@ -151,7 +151,7 @@ end
|
|
151
151
|
|
152
152
|
## Customizing Bulkrax
|
153
153
|
|
154
|
-
For further information on how to extend and customize Bulkrax, please see the [Bulkrax Customization Guide](https://github.com/samvera-labs/bulkrax/wiki/Customizing).
|
154
|
+
For further information on how to extend and customize Bulkrax, please see the [Bulkrax Customization Guide](https://github.com/samvera-labs/bulkrax/wiki/Customizing-Bulkrax).
|
155
155
|
|
156
156
|
## How it Works
|
157
157
|
Once you have Bulkrax installed, you will have access to an easy-to-use interface with which you are able to create, edit, delete, run, and re-run imports and exports.
|
@@ -191,8 +191,6 @@ We encourage everyone to help improve this project. Bug reports and pull reques
|
|
191
191
|
|
192
192
|
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](https://contributor-covenant.org) code of conduct.
|
193
193
|
|
194
|
-
All Contributors should have signed the Samvera Contributor License Agreement (CLA)
|
195
|
-
|
196
194
|
## Questions
|
197
195
|
Questions can be sent to support@notch8.com. Please make sure to include "Bulkrax" in the subject line of your email.
|
198
196
|
|
@@ -42,10 +42,12 @@ module Bulkrax
|
|
42
42
|
pending_relationships.each do |rel|
|
43
43
|
raise ::StandardError, %("#{rel}" needs either a child or a parent to create a relationship) if rel.child_id.nil? || rel.parent_id.nil?
|
44
44
|
@child_entry, child_record = find_record(rel.child_id, importer_run_id)
|
45
|
-
|
45
|
+
if child_record
|
46
|
+
child_record.is_a?(::Collection) ? @child_records[:collections] << child_record : @child_records[:works] << child_record
|
47
|
+
end
|
46
48
|
end
|
47
49
|
|
48
|
-
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.
|
50
|
+
if (child_records[:collections].blank? && child_records[:works].blank?) || parent_record.nil?
|
49
51
|
reschedule({ parent_identifier: parent_identifier, importer_run_id: importer_run_id })
|
50
52
|
return false # stop current job from continuing to run after rescheduling
|
51
53
|
end
|
data/app/models/bulkrax/entry.rb
CHANGED
@@ -4,8 +4,6 @@ module Bulkrax
|
|
4
4
|
# Custom error class for collections_created?
|
5
5
|
class CollectionsCreatedError < RuntimeError; end
|
6
6
|
class OAIError < RuntimeError; end
|
7
|
-
# TODO: remove when ApplicationParser#bagit_zip_file_size_check is removed
|
8
|
-
class BagitZipError < RuntimeError; end
|
9
7
|
class Entry < ApplicationRecord
|
10
8
|
include Bulkrax::HasMatchers
|
11
9
|
include Bulkrax::ImportBehavior
|
@@ -124,9 +124,13 @@ module Bulkrax
|
|
124
124
|
end
|
125
125
|
|
126
126
|
def exporter_export_zip_path
|
127
|
-
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_#{self.exporter_runs.last.id}
|
127
|
+
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_#{self.exporter_runs.last.id}")
|
128
128
|
rescue
|
129
|
-
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_0
|
129
|
+
@exporter_export_zip_path ||= File.join(parser.base_path('export'), "export_#{self.id}_0")
|
130
|
+
end
|
131
|
+
|
132
|
+
def exporter_export_zip_files
|
133
|
+
@exporter_export_zip_files ||= Dir["#{exporter_export_zip_path}/**"].map { |zip| Array(zip.split('/').last) }
|
130
134
|
end
|
131
135
|
|
132
136
|
def export_properties
|
@@ -137,5 +141,14 @@ module Bulkrax
|
|
137
141
|
def metadata_only?
|
138
142
|
export_type == 'metadata'
|
139
143
|
end
|
144
|
+
|
145
|
+
def sort_zip_files(zip_files)
|
146
|
+
zip_files.sort_by do |item|
|
147
|
+
number = item.split('_').last.match(/\d+/)&.[](0) || 0.to_s
|
148
|
+
sort_number = number.rjust(4, "0")
|
149
|
+
|
150
|
+
sort_number
|
151
|
+
end
|
152
|
+
end
|
140
153
|
end
|
141
154
|
end
|
@@ -12,15 +12,14 @@ module Bulkrax
|
|
12
12
|
# check for our entry in our current importer first
|
13
13
|
importer_id = ImporterRun.find(importer_run_id).importer_id
|
14
14
|
default_scope = { identifier: identifier, importerexporter_type: 'Bulkrax::Importer' }
|
15
|
-
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
16
15
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
begin
|
17
|
+
# the identifier parameter can be a :source_identifier or the id of an object
|
18
|
+
record = Entry.find_by(default_scope.merge({ importerexporter_id: importer_id })) || Entry.find_by(default_scope)
|
19
|
+
record ||= ActiveFedora::Base.find(identifier)
|
20
|
+
# NameError for if ActiveFedora isn't installed
|
21
|
+
rescue NameError, ActiveFedora::ObjectNotFoundError
|
22
|
+
record = nil
|
24
23
|
end
|
25
24
|
|
26
25
|
# return the found entry here instead of searching for it again in the CreateRelationshipsJob
|
@@ -7,9 +7,6 @@ module Bulkrax
|
|
7
7
|
|
8
8
|
def build_for_exporter
|
9
9
|
build_export_metadata
|
10
|
-
# TODO(alishaevn): determine if the line below is still necessary
|
11
|
-
# the csv and bagit parsers also have write_files methods
|
12
|
-
write_files if export_type == 'full' && !importerexporter.parser_klass.include?('Bagit')
|
13
10
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
14
11
|
raise e
|
15
12
|
rescue StandardError => e
|
@@ -26,25 +23,6 @@ module Bulkrax
|
|
26
23
|
@hyrax_record ||= ActiveFedora::Base.find(self.identifier)
|
27
24
|
end
|
28
25
|
|
29
|
-
def write_files
|
30
|
-
return if hyrax_record.is_a?(Collection)
|
31
|
-
|
32
|
-
file_sets = hyrax_record.file_set? ? Array.wrap(hyrax_record) : hyrax_record.file_sets
|
33
|
-
file_sets << hyrax_record.thumbnail if hyrax_record.thumbnail.present? && hyrax_record.work? && exporter.include_thumbnails
|
34
|
-
file_sets.each do |fs|
|
35
|
-
path = File.join(exporter_export_path, 'files')
|
36
|
-
FileUtils.mkdir_p(path)
|
37
|
-
file = filename(fs)
|
38
|
-
require 'open-uri'
|
39
|
-
io = open(fs.original_file.uri)
|
40
|
-
next if file.blank?
|
41
|
-
File.open(File.join(path, file), 'wb') do |f|
|
42
|
-
f.write(io.read)
|
43
|
-
f.close
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
26
|
# Prepend the file_set id to ensure a unique filename and also one that is not longer than 255 characters
|
49
27
|
def filename(file_set)
|
50
28
|
return if file_set.original_file.blank?
|
@@ -8,10 +8,14 @@ module Bulkrax
|
|
8
8
|
|
9
9
|
def add_path_to_file
|
10
10
|
parsed_metadata['file'].each_with_index do |filename, i|
|
11
|
-
|
11
|
+
next if filename.blank?
|
12
|
+
|
13
|
+
path_to_file = parser.path_to_files(filename: filename)
|
12
14
|
|
13
15
|
parsed_metadata['file'][i] = path_to_file
|
14
16
|
end
|
17
|
+
parsed_metadata['file'].delete('')
|
18
|
+
|
15
19
|
raise ::StandardError, "one or more file paths are invalid: #{parsed_metadata['file'].join(', ')}" unless parsed_metadata['file'].map { |file_path| ::File.file?(file_path) }.all?
|
16
20
|
|
17
21
|
parsed_metadata['file']
|
@@ -12,8 +12,8 @@ module Bulkrax
|
|
12
12
|
raise CollectionsCreatedError unless collections_created?
|
13
13
|
@item = factory.run!
|
14
14
|
add_user_to_permission_templates! if self.class.to_s.include?("Collection")
|
15
|
-
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
|
16
|
-
child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
|
15
|
+
parent_jobs if self.parsed_metadata[related_parents_parsed_mapping]&.join.present?
|
16
|
+
child_jobs if self.parsed_metadata[related_children_parsed_mapping]&.join.present?
|
17
17
|
end
|
18
18
|
rescue RSolr::Error::Http, CollectionsCreatedError => e
|
19
19
|
raise e
|
@@ -247,8 +247,6 @@ module Bulkrax
|
|
247
247
|
def write
|
248
248
|
write_files
|
249
249
|
zip
|
250
|
-
# uncomment next line to debug for faulty zipping during bagit export
|
251
|
-
bagit_zip_file_size_check if importerexporter.parser_klass.include?('Bagit')
|
252
250
|
end
|
253
251
|
|
254
252
|
def unzip(file_to_unzip)
|
@@ -262,30 +260,13 @@ module Bulkrax
|
|
262
260
|
end
|
263
261
|
|
264
262
|
def zip
|
265
|
-
FileUtils.
|
266
|
-
Zip::File.open(exporter_export_zip_path, create: true) do |zip_file|
|
267
|
-
Dir["#{exporter_export_path}/**/**"].each do |file|
|
268
|
-
zip_file.add(file.sub("#{exporter_export_path}/", ''), file)
|
269
|
-
end
|
270
|
-
end
|
271
|
-
end
|
263
|
+
FileUtils.mkdir_p(exporter_export_zip_path)
|
272
264
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
begin
|
279
|
-
raise BagitZipError, "Invalid Bag, file size mismatch for #{file.sub("#{exporter_export_path}/", '')}" if File.size(file) != zipped_file.size
|
280
|
-
rescue BagitZipError => e
|
281
|
-
matched_entry_ids = importerexporter.entry_ids.select do |id|
|
282
|
-
Bulkrax::Entry.find(id).identifier.include?(zipped_file.name.split('/').first)
|
283
|
-
end
|
284
|
-
matched_entry_ids.each do |entry_id|
|
285
|
-
Bulkrax::Entry.find(entry_id).status_info(e)
|
286
|
-
status_info('Complete (with failures)')
|
287
|
-
end
|
288
|
-
end
|
265
|
+
Dir["#{exporter_export_path}/**"].each do |folder|
|
266
|
+
zip_path = "#{exporter_export_zip_path.split('/').last}_#{folder.split('/').last}.zip"
|
267
|
+
Zip::File.open(File.join("#{exporter_export_zip_path}/#{zip_path}"), create: true) do |zip_file|
|
268
|
+
Dir["#{folder}/**/**"].each do |file|
|
269
|
+
zip_file.add(file.sub("#{folder}/", ''), file)
|
289
270
|
end
|
290
271
|
end
|
291
272
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Bulkrax
|
4
|
-
class BagitParser <
|
4
|
+
class BagitParser < CsvParser # rubocop:disable Metrics/ClassLength
|
5
5
|
include ExportBehavior
|
6
6
|
|
7
7
|
def self.export_supported?
|
@@ -20,12 +20,8 @@ module Bulkrax
|
|
20
20
|
rdf_format ? RdfEntry : CsvEntry
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
def file_set_entry_class
|
28
|
-
CsvFileSetEntry
|
23
|
+
def path_to_files(filename:)
|
24
|
+
@path_to_files ||= Dir.glob(File.join(import_file_path, '**/data', filename)).first
|
29
25
|
end
|
30
26
|
|
31
27
|
# Take a random sample of 10 metadata_paths and work out the import fields from that
|
@@ -36,39 +32,41 @@ module Bulkrax
|
|
36
32
|
end.flatten.compact.uniq
|
37
33
|
end
|
38
34
|
|
39
|
-
#
|
40
|
-
# Create an Array of all metadata records, one per file
|
35
|
+
# Create an Array of all metadata records
|
41
36
|
def records(_opts = {})
|
42
37
|
raise StandardError, 'No BagIt records were found' if bags.blank?
|
43
38
|
@records ||= bags.map do |bag|
|
44
39
|
path = metadata_path(bag)
|
45
40
|
raise StandardError, 'No metadata files were found' if path.blank?
|
46
41
|
data = entry_class.read_data(path)
|
47
|
-
|
48
|
-
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
49
|
-
data
|
42
|
+
get_data(bag, data)
|
50
43
|
end
|
44
|
+
|
45
|
+
@records = @records.flatten
|
51
46
|
end
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
|
66
|
-
ImportCollectionJob.perform_now(new_entry.id, current_run.id)
|
67
|
-
increment_counters(index, collection: true)
|
48
|
+
def get_data(bag, data)
|
49
|
+
if entry_class == CsvEntry
|
50
|
+
data = data.map do |data_row|
|
51
|
+
record_data = entry_class.data_for_entry(data_row, source_identifier, self)
|
52
|
+
next record_data if importerexporter.metadata_only?
|
53
|
+
|
54
|
+
record_data[:file] = bag.bag_files.join('|') if ::Hyrax.config.curation_concerns.include? record_data[:model]&.constantize
|
55
|
+
record_data
|
56
|
+
end
|
57
|
+
else
|
58
|
+
data = entry_class.data_for_entry(data, source_identifier, self)
|
59
|
+
data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
|
68
60
|
end
|
61
|
+
|
62
|
+
data
|
69
63
|
end
|
70
64
|
|
71
65
|
def create_works
|
66
|
+
entry_class == CsvEntry ? super : create_rdf_works
|
67
|
+
end
|
68
|
+
|
69
|
+
def create_rdf_works
|
72
70
|
records.each_with_index do |record, index|
|
73
71
|
next unless record_has_source_identifier(record, index)
|
74
72
|
break if limit_reached?(limit, index)
|
@@ -87,19 +85,6 @@ module Bulkrax
|
|
87
85
|
status_info(e)
|
88
86
|
end
|
89
87
|
|
90
|
-
def collections
|
91
|
-
records.map { |r| r[related_parents_parsed_mapping].split(/\s*[;|]\s*/) if r[related_parents_parsed_mapping].present? }.flatten.compact.uniq
|
92
|
-
end
|
93
|
-
|
94
|
-
def collections_total
|
95
|
-
collections.size
|
96
|
-
end
|
97
|
-
|
98
|
-
# TODO: change to differentiate between collection and work records when adding ability to import collection metadata
|
99
|
-
def works_total
|
100
|
-
total
|
101
|
-
end
|
102
|
-
|
103
88
|
def total
|
104
89
|
@total = importer.parser_fields['total'] || 0 if importer?
|
105
90
|
|
@@ -112,18 +97,6 @@ module Bulkrax
|
|
112
97
|
@total = 0
|
113
98
|
end
|
114
99
|
|
115
|
-
def extra_filters
|
116
|
-
output = ""
|
117
|
-
if importerexporter.start_date.present?
|
118
|
-
start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
|
119
|
-
finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
|
120
|
-
output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
|
121
|
-
end
|
122
|
-
output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
|
123
|
-
output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
|
124
|
-
output
|
125
|
-
end
|
126
|
-
|
127
100
|
def current_record_ids
|
128
101
|
@work_ids = []
|
129
102
|
@collection_ids = []
|
@@ -140,78 +113,39 @@ module Bulkrax
|
|
140
113
|
when 'importer'
|
141
114
|
set_ids_for_exporting_from_importer
|
142
115
|
end
|
143
|
-
@work_ids + @collection_ids + @file_set_ids
|
144
|
-
end
|
145
116
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
entry_ids = Importer.find(importerexporter.export_source).entries.pluck(:id)
|
150
|
-
complete_statuses = Status.latest_by_statusable
|
151
|
-
.includes(:statusable)
|
152
|
-
.where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
|
153
|
-
|
154
|
-
complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
|
155
|
-
extra_filters = extra_filters.presence || '*:*'
|
156
|
-
|
157
|
-
{ :@work_ids => ::Hyrax.config.curation_concerns, :@collection_ids => [::Collection], :@file_set_ids => [::FileSet] }.each do |instance_var, models_to_search|
|
158
|
-
instance_variable_set(instance_var, ActiveFedora::SolrService.post(
|
159
|
-
extra_filters.to_s,
|
160
|
-
fq: [
|
161
|
-
%(#{::Solrizer.solr_name(work_identifier)}:("#{complete_entry_identifiers.join('" OR "')}")),
|
162
|
-
"has_model_ssim:(#{models_to_search.join(' OR ')})"
|
163
|
-
],
|
164
|
-
fl: 'id',
|
165
|
-
rows: 2_000_000_000
|
166
|
-
)['response']['docs'].map { |obj| obj['id'] })
|
167
|
-
end
|
117
|
+
find_child_file_sets(@work_ids) if importerexporter.export_from == 'collection' || importerexporter.export_from == 'worktype'
|
118
|
+
|
119
|
+
@work_ids + @collection_ids + @file_set_ids
|
168
120
|
end
|
169
121
|
|
170
122
|
# export methods
|
171
123
|
|
172
|
-
def create_new_entries
|
173
|
-
current_record_ids.each_with_index do |id, index|
|
174
|
-
break if limit_reached?(limit, index)
|
175
|
-
|
176
|
-
this_entry_class = if @collection_ids.include?(id)
|
177
|
-
collection_entry_class
|
178
|
-
elsif @file_set_ids.include?(id)
|
179
|
-
file_set_entry_class
|
180
|
-
else
|
181
|
-
entry_class
|
182
|
-
end
|
183
|
-
new_entry = find_or_create_entry(this_entry_class, id, 'Bulkrax::Exporter')
|
184
|
-
|
185
|
-
begin
|
186
|
-
entry = ExportWorkJob.perform_now(new_entry.id, current_run.id)
|
187
|
-
rescue => e
|
188
|
-
Rails.logger.info("#{e.message} was detected during export")
|
189
|
-
end
|
190
|
-
|
191
|
-
self.headers |= entry.parsed_metadata.keys if entry
|
192
|
-
end
|
193
|
-
end
|
194
|
-
alias create_from_collection create_new_entries
|
195
|
-
alias create_from_importer create_new_entries
|
196
|
-
alias create_from_worktype create_new_entries
|
197
|
-
alias create_from_all create_new_entries
|
198
|
-
|
199
124
|
# rubocop:disable Metrics/MethodLength, Metrics/AbcSize
|
200
125
|
def write_files
|
201
126
|
require 'open-uri'
|
202
127
|
require 'socket'
|
128
|
+
|
129
|
+
folder_count = 1
|
130
|
+
records_in_folder = 0
|
131
|
+
|
203
132
|
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].each do |entry|
|
204
|
-
|
205
|
-
next unless Hyrax.config.curation_concerns.include?(
|
206
|
-
|
133
|
+
record = ActiveFedora::Base.find(entry.identifier)
|
134
|
+
next unless Hyrax.config.curation_concerns.include?(record.class)
|
135
|
+
|
207
136
|
bag_entries = [entry]
|
137
|
+
file_set_entries = Bulkrax::CsvFileSetEntry.where(importerexporter_id: importerexporter.id).where("parsed_metadata LIKE '%#{record.id}%'")
|
138
|
+
file_set_entries.each { |fse| bag_entries << fse }
|
208
139
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
140
|
+
records_in_folder += bag_entries.count
|
141
|
+
if records_in_folder > records_split_count
|
142
|
+
folder_count += 1
|
143
|
+
records_in_folder = bag_entries.count
|
144
|
+
end
|
145
|
+
|
146
|
+
bag ||= BagIt::Bag.new setup_bagit_folder(folder_count, entry.identifier)
|
214
147
|
|
148
|
+
record.file_sets.each do |fs|
|
215
149
|
file_name = filename(fs)
|
216
150
|
next if file_name.blank?
|
217
151
|
io = open(fs.original_file.uri)
|
@@ -226,17 +160,21 @@ module Bulkrax
|
|
226
160
|
end
|
227
161
|
end
|
228
162
|
|
229
|
-
CSV.open(setup_csv_metadata_export_file(entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
|
163
|
+
CSV.open(setup_csv_metadata_export_file(folder_count, entry.identifier), "w", headers: export_headers, write_headers: true) do |csv|
|
230
164
|
bag_entries.each { |csv_entry| csv << csv_entry.parsed_metadata }
|
231
165
|
end
|
232
|
-
|
166
|
+
|
167
|
+
write_triples(folder_count, entry)
|
233
168
|
bag.manifest!(algo: 'sha256')
|
234
169
|
end
|
235
170
|
end
|
236
171
|
# rubocop:enable Metrics/MethodLength, Metrics/AbcSize
|
237
172
|
|
238
|
-
def setup_csv_metadata_export_file(id)
|
239
|
-
File.join(importerexporter.exporter_export_path,
|
173
|
+
def setup_csv_metadata_export_file(folder_count, id)
|
174
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
175
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
176
|
+
|
177
|
+
File.join(path, id, 'metadata.csv')
|
240
178
|
end
|
241
179
|
|
242
180
|
def key_allowed(key)
|
@@ -245,66 +183,31 @@ module Bulkrax
|
|
245
183
|
key != source_identifier.to_s
|
246
184
|
end
|
247
185
|
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
# we don't want access_control_id exported and we want file at the end
|
253
|
-
headers.delete('access_control_id') if headers.include?('access_control_id')
|
186
|
+
def setup_triple_metadata_export_file(folder_count, id)
|
187
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
188
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
254
189
|
|
255
|
-
|
256
|
-
headers.prepend('model')
|
257
|
-
headers.prepend(source_identifier.to_s)
|
258
|
-
headers.prepend('id')
|
259
|
-
|
260
|
-
headers.uniq
|
190
|
+
File.join(path, id, 'metadata.nt')
|
261
191
|
end
|
262
192
|
|
263
|
-
def
|
264
|
-
|
265
|
-
|
266
|
-
@object_names = mapping.values.map { |value| value['object'] }
|
267
|
-
@object_names.uniq!.delete(nil)
|
268
|
-
|
269
|
-
@object_names
|
270
|
-
end
|
271
|
-
|
272
|
-
def sort_headers(headers)
|
273
|
-
# converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
|
274
|
-
# while keeping objects grouped together
|
275
|
-
headers.sort_by do |item|
|
276
|
-
number = item.match(/\d+/)&.[](0) || 0.to_s
|
277
|
-
sort_number = number.rjust(4, "0")
|
278
|
-
object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
|
279
|
-
remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
|
280
|
-
"#{object_prefix}_#{sort_number}_#{remainder}"
|
281
|
-
end
|
282
|
-
end
|
193
|
+
def setup_bagit_folder(folder_count, id)
|
194
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
195
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
283
196
|
|
284
|
-
|
285
|
-
File.join(importerexporter.exporter_export_path, id, 'metadata.nt')
|
197
|
+
File.join(path, id)
|
286
198
|
end
|
287
199
|
|
288
|
-
def
|
289
|
-
File.join(importerexporter.exporter_export_path, id)
|
290
|
-
end
|
291
|
-
|
292
|
-
def write_triples(e)
|
200
|
+
def write_triples(folder_count, e)
|
293
201
|
sd = SolrDocument.find(e.identifier)
|
294
202
|
return if sd.nil?
|
295
203
|
|
296
204
|
req = ActionDispatch::Request.new({ 'HTTP_HOST' => Socket.gethostname })
|
297
205
|
rdf = Hyrax::GraphExporter.new(sd, req).fetch.dump(:ntriples)
|
298
|
-
File.open(setup_triple_metadata_export_file(e.identifier), "w") do |triples|
|
206
|
+
File.open(setup_triple_metadata_export_file(folder_count, e.identifier), "w") do |triples|
|
299
207
|
triples.write(rdf)
|
300
208
|
end
|
301
209
|
end
|
302
210
|
|
303
|
-
def required_elements?(keys)
|
304
|
-
return if keys.blank?
|
305
|
-
!required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
|
306
|
-
end
|
307
|
-
|
308
211
|
# @todo - investigate getting directory structure
|
309
212
|
# @todo - investigate using perform_later, and having the importer check for
|
310
213
|
# DownloadCloudFileJob before it starts
|
@@ -355,5 +258,11 @@ module Bulkrax
|
|
355
258
|
return nil unless bag.valid?
|
356
259
|
bag
|
357
260
|
end
|
261
|
+
|
262
|
+
# use the version of this method from the application parser instead
|
263
|
+
def real_import_file_path
|
264
|
+
return importer_unzip_path if file? && zip?
|
265
|
+
parser_fields['import_file_path']
|
266
|
+
end
|
358
267
|
end
|
359
268
|
end
|
@@ -4,6 +4,7 @@ require 'csv'
|
|
4
4
|
module Bulkrax
|
5
5
|
class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
|
6
6
|
include ErroredEntries
|
7
|
+
include ExportBehavior
|
7
8
|
attr_writer :collections, :file_sets, :works
|
8
9
|
|
9
10
|
def self.export_supported?
|
@@ -207,6 +208,13 @@ module Bulkrax
|
|
207
208
|
@work_ids + @collection_ids + @file_set_ids
|
208
209
|
end
|
209
210
|
|
211
|
+
# find the related file set ids so entries can be made for export
|
212
|
+
def find_child_file_sets(work_ids)
|
213
|
+
work_ids.each do |id|
|
214
|
+
ActiveFedora::Base.find(id).file_set_ids.each { |fs_id| @file_set_ids << fs_id }
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
210
218
|
# Set the following instance variables: @work_ids, @collection_ids, @file_set_ids
|
211
219
|
# @see #current_record_ids
|
212
220
|
def set_ids_for_exporting_from_importer
|
@@ -272,8 +280,8 @@ module Bulkrax
|
|
272
280
|
CsvFileSetEntry
|
273
281
|
end
|
274
282
|
|
275
|
-
#
|
276
|
-
#
|
283
|
+
# TODO: figure out why using the version of this method that's in the bagit parser
|
284
|
+
# breaks specs for the "if importer?" line
|
277
285
|
def total
|
278
286
|
@total = importer.parser_fields['total'] || 0 if importer?
|
279
287
|
@total = limit || current_record_ids.count if exporter?
|
@@ -283,6 +291,10 @@ module Bulkrax
|
|
283
291
|
@total = 0
|
284
292
|
end
|
285
293
|
|
294
|
+
def records_split_count
|
295
|
+
1000
|
296
|
+
end
|
297
|
+
|
286
298
|
# @todo - investigate getting directory structure
|
287
299
|
# @todo - investigate using perform_later, and having the importer check for
|
288
300
|
# DownloadCloudFileJob before it starts
|
@@ -307,9 +319,37 @@ module Bulkrax
|
|
307
319
|
# export methods
|
308
320
|
|
309
321
|
def write_files
|
310
|
-
|
311
|
-
|
312
|
-
|
322
|
+
require 'open-uri'
|
323
|
+
folder_count = 0
|
324
|
+
|
325
|
+
importerexporter.entries.where(identifier: current_record_ids)[0..limit || total].in_groups_of(records_split_count, false) do |group|
|
326
|
+
folder_count += 1
|
327
|
+
|
328
|
+
CSV.open(setup_export_file(folder_count), "w", headers: export_headers, write_headers: true) do |csv|
|
329
|
+
group.each do |entry|
|
330
|
+
csv << entry.parsed_metadata
|
331
|
+
next if importerexporter.metadata_only? || entry.type == 'Bulkrax::CsvCollectionEntry'
|
332
|
+
|
333
|
+
store_files(entry.identifier, folder_count.to_s)
|
334
|
+
end
|
335
|
+
end
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
339
|
+
def store_files(identifier, folder_count)
|
340
|
+
record = ActiveFedora::Base.find(identifier)
|
341
|
+
file_sets = record.file_set? ? Array.wrap(record) : record.file_sets
|
342
|
+
file_sets << record.thumbnail if exporter.include_thumbnails && record.thumbnail.present? && record.work?
|
343
|
+
file_sets.each do |fs|
|
344
|
+
path = File.join(exporter_export_path, folder_count, 'files')
|
345
|
+
FileUtils.mkdir_p(path) unless File.exist? path
|
346
|
+
file = filename(fs)
|
347
|
+
io = open(fs.original_file.uri)
|
348
|
+
next if file.blank?
|
349
|
+
|
350
|
+
File.open(File.join(path, file), 'wb') do |f|
|
351
|
+
f.write(io.read)
|
352
|
+
f.close
|
313
353
|
end
|
314
354
|
end
|
315
355
|
end
|
@@ -356,8 +396,11 @@ module Bulkrax
|
|
356
396
|
end
|
357
397
|
|
358
398
|
# in the parser as it is specific to the format
|
359
|
-
def setup_export_file
|
360
|
-
File.join(importerexporter.exporter_export_path,
|
399
|
+
def setup_export_file(folder_count)
|
400
|
+
path = File.join(importerexporter.exporter_export_path, folder_count.to_s)
|
401
|
+
FileUtils.mkdir_p(path) unless File.exist?(path)
|
402
|
+
|
403
|
+
File.join(path, "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}_#{folder_count}.csv")
|
361
404
|
end
|
362
405
|
|
363
406
|
# Retrieve file paths for [:file] mapping in records
|
@@ -382,10 +425,11 @@ module Bulkrax
|
|
382
425
|
end
|
383
426
|
|
384
427
|
# Retrieve the path where we expect to find the files
|
385
|
-
def path_to_files
|
428
|
+
def path_to_files(**args)
|
429
|
+
filename = args.fetch(:filename, '')
|
430
|
+
|
386
431
|
@path_to_files ||= File.join(
|
387
|
-
zip? ? importer_unzip_path : File.dirname(import_file_path),
|
388
|
-
'files'
|
432
|
+
zip? ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
|
389
433
|
)
|
390
434
|
end
|
391
435
|
|
@@ -29,6 +29,7 @@
|
|
29
29
|
|
30
30
|
<%= form.input :export_source_importer,
|
31
31
|
label: t('bulkrax.exporter.labels.importer'),
|
32
|
+
# required: true,
|
32
33
|
prompt: 'Select from the list',
|
33
34
|
label_html: { class: 'importer export-source-option hidden' },
|
34
35
|
input_html: { class: 'importer export-source-option hidden' },
|
@@ -37,6 +38,7 @@
|
|
37
38
|
<%= form.input :export_source_collection,
|
38
39
|
prompt: 'Start typing ...',
|
39
40
|
label: t('bulkrax.exporter.labels.collection'),
|
41
|
+
# required: true,
|
40
42
|
placeholder: @collection&.title&.first,
|
41
43
|
label_html: { class: 'collection export-source-option hidden' },
|
42
44
|
input_html: {
|
@@ -50,6 +52,7 @@
|
|
50
52
|
|
51
53
|
<%= form.input :export_source_worktype,
|
52
54
|
label: t('bulkrax.exporter.labels.worktype'),
|
55
|
+
# required: true,
|
53
56
|
prompt: 'Select from the list',
|
54
57
|
label_html: { class: 'worktype export-source-option hidden' },
|
55
58
|
input_html: { class: 'worktype export-source-option hidden' },
|
@@ -21,7 +21,7 @@
|
|
21
21
|
<th scope="col">Name</th>
|
22
22
|
<th scope="col">Status</th>
|
23
23
|
<th scope="col">Date Exported</th>
|
24
|
-
<th scope="col"
|
24
|
+
<th scope="col">Downloadable Files</th>
|
25
25
|
<th scope="col"></th>
|
26
26
|
<th scope="col"></th>
|
27
27
|
<th scope="col"></th>
|
@@ -35,7 +35,10 @@
|
|
35
35
|
<td><%= exporter.created_at %></td>
|
36
36
|
<td>
|
37
37
|
<% if File.exist?(exporter.exporter_export_zip_path) %>
|
38
|
-
<%=
|
38
|
+
<%= simple_form_for(exporter, method: :get, url: exporter_download_path(exporter)) do |form| %>
|
39
|
+
<%= render 'downloads', exporter: exporter, form: form %>
|
40
|
+
<%= form.button :submit, value: 'Download', data: { disable_with: false } %>
|
41
|
+
<% end %>
|
39
42
|
<% end%>
|
40
43
|
</td>
|
41
44
|
<td><%= link_to raw('<span class="glyphicon glyphicon-info-sign"></span>'), exporter_path(exporter) %></td>
|
@@ -8,10 +8,11 @@
|
|
8
8
|
<div class='panel-body'>
|
9
9
|
|
10
10
|
<% if File.exist?(@exporter.exporter_export_zip_path) %>
|
11
|
-
|
11
|
+
<%= simple_form_for @exporter, method: :get, url: exporter_download_path(@exporter), html: { class: 'form-inline bulkrax-p-align' } do |form| %>
|
12
12
|
<strong>Download:</strong>
|
13
|
-
<%=
|
14
|
-
|
13
|
+
<%= render 'downloads', exporter: @exporter, form: form %>
|
14
|
+
<%= form.button :submit, value: 'Download', data: { disable_with: false } %>
|
15
|
+
<% end %>
|
15
16
|
<% end %>
|
16
17
|
|
17
18
|
<p class='bulkrax-p-align'>
|
@@ -135,10 +136,6 @@
|
|
135
136
|
<%= page_entries_info(@work_entries) %><br>
|
136
137
|
<%= paginate(@work_entries, param_name: :work_entries_page) %>
|
137
138
|
<br>
|
138
|
-
<% if File.exist?(@exporter.exporter_export_zip_path) %>
|
139
|
-
<%= link_to 'Download', exporter_download_path(@exporter) %>
|
140
|
-
|
|
141
|
-
<% end %>
|
142
139
|
<%= link_to 'Edit', edit_exporter_path(@exporter) %>
|
143
140
|
|
|
144
141
|
<%= link_to 'Back', exporters_path %>
|
data/lib/bulkrax/version.rb
CHANGED
@@ -1,6 +1,30 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
namespace :bulkrax do
|
4
|
+
desc "Remove old exported zips and create new ones with the new file structure"
|
5
|
+
task rerun_all_exporters: :environment do
|
6
|
+
if defined?(::Hyku)
|
7
|
+
Account.find_each do |account|
|
8
|
+
puts "=============== updating #{account.name} ============"
|
9
|
+
next if account.name == "search"
|
10
|
+
switch!(account)
|
11
|
+
|
12
|
+
rerun_exporters_and_delete_zips
|
13
|
+
|
14
|
+
puts "=============== finished updating #{account.name} ============"
|
15
|
+
end
|
16
|
+
else
|
17
|
+
rerun_exporters_and_delete_zips
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def rerun_exporters_and_delete_zips
|
22
|
+
begin
|
23
|
+
Bulkrax::Exporter.all.each { |e| Bulkrax::ExporterJob.perform_later(e.id) }
|
24
|
+
rescue => e
|
25
|
+
puts "(#{e.message})"
|
26
|
+
end
|
27
|
+
|
28
|
+
Dir["tmp/exports/**.zip"].each { |zip_path| FileUtils.rm_rf(zip_path) }
|
29
|
+
end
|
30
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bulkrax
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Kaufman
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -331,6 +331,7 @@ files:
|
|
331
331
|
- app/views/bulkrax/entries/_parsed_metadata.html.erb
|
332
332
|
- app/views/bulkrax/entries/_raw_metadata.html.erb
|
333
333
|
- app/views/bulkrax/entries/show.html.erb
|
334
|
+
- app/views/bulkrax/exporters/_downloads.html.erb
|
334
335
|
- app/views/bulkrax/exporters/_form.html.erb
|
335
336
|
- app/views/bulkrax/exporters/edit.html.erb
|
336
337
|
- app/views/bulkrax/exporters/index.html.erb
|
@@ -404,7 +405,7 @@ homepage: https://github.com/samvera-labs/bulkrax
|
|
404
405
|
licenses:
|
405
406
|
- Apache-2.0
|
406
407
|
metadata: {}
|
407
|
-
post_install_message:
|
408
|
+
post_install_message:
|
408
409
|
rdoc_options: []
|
409
410
|
require_paths:
|
410
411
|
- lib
|
@@ -419,8 +420,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
419
420
|
- !ruby/object:Gem::Version
|
420
421
|
version: '0'
|
421
422
|
requirements: []
|
422
|
-
rubygems_version: 3.
|
423
|
-
signing_key:
|
423
|
+
rubygems_version: 3.0.3
|
424
|
+
signing_key:
|
424
425
|
specification_version: 4
|
425
426
|
summary: Import and export tool for Hyrax and Hyku
|
426
427
|
test_files: []
|