bulkrax 1.0.0
- checksums.yaml +7 -0
- data/LICENSE +205 -0
- data/README.md +202 -0
- data/Rakefile +42 -0
- data/app/assets/config/bulkrax_manifest.js +2 -0
- data/app/assets/javascripts/bulkrax/application.js +14 -0
- data/app/assets/javascripts/bulkrax/bulkrax.js +11 -0
- data/app/assets/javascripts/bulkrax/entries.js +15 -0
- data/app/assets/javascripts/bulkrax/exporters.js +60 -0
- data/app/assets/javascripts/bulkrax/importers.js.erb +166 -0
- data/app/assets/stylesheets/bulkrax/accordion.scss +40 -0
- data/app/assets/stylesheets/bulkrax/application.css +15 -0
- data/app/assets/stylesheets/bulkrax/coderay.scss +264 -0
- data/app/assets/stylesheets/bulkrax/import_export.scss +37 -0
- data/app/controllers/bulkrax/application_controller.rb +8 -0
- data/app/controllers/bulkrax/entries_controller.rb +44 -0
- data/app/controllers/bulkrax/exporters_controller.rb +125 -0
- data/app/controllers/bulkrax/importers_controller.rb +315 -0
- data/app/controllers/concerns/bulkrax/api.rb +29 -0
- data/app/factories/bulkrax/object_factory.rb +230 -0
- data/app/helpers/bulkrax/application_helper.rb +15 -0
- data/app/helpers/bulkrax/exporters_helper.rb +6 -0
- data/app/helpers/bulkrax/importers_helper.rb +13 -0
- data/app/helpers/bulkrax/validation_helper.rb +153 -0
- data/app/jobs/bulkrax/application_job.rb +6 -0
- data/app/jobs/bulkrax/child_relationships_job.rb +128 -0
- data/app/jobs/bulkrax/delete_work_job.rb +16 -0
- data/app/jobs/bulkrax/download_cloud_file_job.rb +18 -0
- data/app/jobs/bulkrax/export_work_job.rb +37 -0
- data/app/jobs/bulkrax/exporter_job.rb +14 -0
- data/app/jobs/bulkrax/import_work_collection_job.rb +41 -0
- data/app/jobs/bulkrax/import_work_job.rb +32 -0
- data/app/jobs/bulkrax/importer_job.rb +26 -0
- data/app/mailers/bulkrax/application_mailer.rb +8 -0
- data/app/matchers/bulkrax/application_matcher.rb +113 -0
- data/app/matchers/bulkrax/bagit_matcher.rb +6 -0
- data/app/matchers/bulkrax/csv_matcher.rb +6 -0
- data/app/matchers/bulkrax/oai_matcher.rb +6 -0
- data/app/models/bulkrax/application_record.rb +7 -0
- data/app/models/bulkrax/csv_collection_entry.rb +19 -0
- data/app/models/bulkrax/csv_entry.rb +163 -0
- data/app/models/bulkrax/entry.rb +104 -0
- data/app/models/bulkrax/exporter.rb +122 -0
- data/app/models/bulkrax/exporter_run.rb +7 -0
- data/app/models/bulkrax/import_failed.rb +13 -0
- data/app/models/bulkrax/importer.rb +155 -0
- data/app/models/bulkrax/importer_run.rb +8 -0
- data/app/models/bulkrax/oai_dc_entry.rb +6 -0
- data/app/models/bulkrax/oai_entry.rb +74 -0
- data/app/models/bulkrax/oai_qualified_dc_entry.rb +6 -0
- data/app/models/bulkrax/oai_set_entry.rb +19 -0
- data/app/models/bulkrax/rdf_collection_entry.rb +19 -0
- data/app/models/bulkrax/rdf_entry.rb +90 -0
- data/app/models/bulkrax/status.rb +25 -0
- data/app/models/bulkrax/xml_entry.rb +73 -0
- data/app/models/concerns/bulkrax/download_behavior.rb +61 -0
- data/app/models/concerns/bulkrax/errored_entries.rb +45 -0
- data/app/models/concerns/bulkrax/export_behavior.rb +58 -0
- data/app/models/concerns/bulkrax/file_factory.rb +140 -0
- data/app/models/concerns/bulkrax/has_local_processing.rb +7 -0
- data/app/models/concerns/bulkrax/has_matchers.rb +155 -0
- data/app/models/concerns/bulkrax/import_behavior.rb +90 -0
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +34 -0
- data/app/models/concerns/bulkrax/status_info.rb +56 -0
- data/app/parsers/bulkrax/application_parser.rb +299 -0
- data/app/parsers/bulkrax/bagit_parser.rb +157 -0
- data/app/parsers/bulkrax/csv_parser.rb +266 -0
- data/app/parsers/bulkrax/oai_dc_parser.rb +130 -0
- data/app/parsers/bulkrax/oai_qualified_dc_parser.rb +9 -0
- data/app/parsers/bulkrax/xml_parser.rb +103 -0
- data/app/views/bulkrax/entries/_parsed_metadata.html.erb +19 -0
- data/app/views/bulkrax/entries/_raw_metadata.html.erb +19 -0
- data/app/views/bulkrax/entries/show.html.erb +63 -0
- data/app/views/bulkrax/exporters/_form.html.erb +120 -0
- data/app/views/bulkrax/exporters/edit.html.erb +23 -0
- data/app/views/bulkrax/exporters/index.html.erb +67 -0
- data/app/views/bulkrax/exporters/new.html.erb +23 -0
- data/app/views/bulkrax/exporters/show.html.erb +124 -0
- data/app/views/bulkrax/importers/_bagit_fields.html.erb +54 -0
- data/app/views/bulkrax/importers/_browse_everything.html.erb +12 -0
- data/app/views/bulkrax/importers/_csv_fields.html.erb +39 -0
- data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +16 -0
- data/app/views/bulkrax/importers/_form.html.erb +35 -0
- data/app/views/bulkrax/importers/_oai_fields.html.erb +42 -0
- data/app/views/bulkrax/importers/_xml_fields.html.erb +60 -0
- data/app/views/bulkrax/importers/edit.html.erb +20 -0
- data/app/views/bulkrax/importers/index.html.erb +77 -0
- data/app/views/bulkrax/importers/new.html.erb +25 -0
- data/app/views/bulkrax/importers/show.html.erb +175 -0
- data/app/views/bulkrax/importers/upload_corrected_entries.html.erb +37 -0
- data/app/views/bulkrax/shared/_bulkrax_errors.html.erb +52 -0
- data/app/views/bulkrax/shared/_bulkrax_field_mapping.html.erb +39 -0
- data/app/views/hyrax/dashboard/sidebar/_bulkrax_sidebar_additions.html.erb +6 -0
- data/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb +19 -0
- data/app/views/layouts/bulkrax/application.html.erb +14 -0
- data/config/locales/bulkrax.en.yml +36 -0
- data/config/routes.rb +18 -0
- data/db/migrate/20181011230201_create_bulkrax_importers.rb +18 -0
- data/db/migrate/20181011230228_create_bulkrax_importer_runs.rb +16 -0
- data/db/migrate/20190325183136_create_bulkrax_entries.rb +16 -0
- data/db/migrate/20190601221109_add_status_to_entry.rb +9 -0
- data/db/migrate/20190715161939_add_collections_to_importer_runs.rb +6 -0
- data/db/migrate/20190715162044_change_collection_ids_on_entries.rb +5 -0
- data/db/migrate/20190729124607_create_bulkrax_exporters.rb +19 -0
- data/db/migrate/20190729134158_create_bulkrax_exporter_runs.rb +14 -0
- data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +12 -0
- data/db/migrate/20191203225129_add_total_collection_records_to_importer_runs.rb +5 -0
- data/db/migrate/20191204191623_add_children_to_importer_runs.rb +6 -0
- data/db/migrate/20191204223857_change_total_records_to_total_work_entries.rb +6 -0
- data/db/migrate/20191212155530_change_entry_last_error.rb +19 -0
- data/db/migrate/20200108194557_add_validate_only_to_bulkrax_importers.rb +5 -0
- data/db/migrate/20200301232856_add_status_to_importers.rb +9 -0
- data/db/migrate/20200312190638_remove_foreign_key_from_bulkrax_entries.rb +5 -0
- data/db/migrate/20200326235838_add_status_to_exporters.rb +7 -0
- data/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb +5 -0
- data/db/migrate/20200818055819_create_bulkrax_statuses.rb +18 -0
- data/db/migrate/20200819054016_move_to_statuses.rb +30 -0
- data/db/migrate/20201106014204_add_date_filter_and_status_to_bulkrax_exporters.rb +7 -0
- data/db/migrate/20201117220007_add_workflow_status_to_bulkrax_exporter.rb +5 -0
- data/db/migrate/20210806044408_remove_unused_last_error.rb +7 -0
- data/db/migrate/20210806065737_increase_text_sizes.rb +12 -0
- data/lib/bulkrax.rb +161 -0
- data/lib/bulkrax/engine.rb +37 -0
- data/lib/bulkrax/version.rb +5 -0
- data/lib/generators/bulkrax/install_generator.rb +80 -0
- data/lib/generators/bulkrax/templates/README +3 -0
- data/lib/generators/bulkrax/templates/app/assets/images/bulkrax/removed.png +0 -0
- data/lib/generators/bulkrax/templates/app/models/concerns/bulkrax/has_local_processing.rb +8 -0
- data/lib/generators/bulkrax/templates/bin/importer +140 -0
- data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +84 -0
- data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +72 -0
- data/lib/tasks/bulkrax_tasks.rake +6 -0
- metadata +388 -0
data/app/parsers/bulkrax/bagit_parser.rb
@@ -0,0 +1,157 @@
# frozen_string_literal: true

module Bulkrax
  class BagitParser < ApplicationParser
    def self.export_supported?
      false # @todo will be supported
    end

    def valid_import?
      return true if import_fields.present?
    rescue => e
      status_info(e)
      false
    end

    def entry_class
      parser_fields['metadata_format'].constantize
    end

    def collection_entry_class
      parser_fields['metadata_format'].gsub('Entry', 'CollectionEntry').constantize
    rescue
      Entry
    end

    # Take a random sample of 10 metadata_paths and work out the import fields from that
    def import_fields
      raise StandardError, 'No metadata files were found' if metadata_paths.blank?
      @import_fields ||= metadata_paths.sample(10).map do |path|
        entry_class.fields_from_data(entry_class.read_data(path))
      end.flatten.compact.uniq
    end

    # Assume a single metadata record per path
    # Create an Array of all metadata records, one per file
    def records(_opts = {})
      raise StandardError, 'No BagIt records were found' if bags.blank?
      @records ||= bags.map do |bag|
        path = metadata_path(bag)
        raise StandardError, 'No metadata files were found' if path.blank?
        data = entry_class.read_data(path)
        data = entry_class.data_for_entry(data, source_identifier)
        data[:file] = bag.bag_files.join('|')
        data
      end
    end

    # Find or create collections referenced by works
    # If the import data also contains records for these works, they will be updated
    # during create works
    def create_collections
      collections.each_with_index do |collection, index|
        next if collection.blank?
        metadata = {
          title: [collection],
          work_identifier => [collection],
          visibility: 'open',
          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
        }
        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
        ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
        increment_counters(index, true)
      end
    end

    def create_works
      records.each_with_index do |record, index|
        next unless record_has_source_identifier(record, index)
        break if limit_reached?(limit, index)

        seen[record[source_identifier]] = true
        new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record)
        if record[:delete].present?
          DeleteWorkJob.send(perform_method, new_entry, current_run)
        else
          ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    rescue StandardError => e
      status_info(e)
    end

    def collections
      records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
    end

    def collections_total
      collections.size
    end

    def total
      metadata_paths.count
    end

    def required_elements?(keys)
      return if keys.blank?
      !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
    end

    # @todo - investigate getting directory structure
    # @todo - investigate using perform_later, and having the importer check for
    #   DownloadCloudFileJob before it starts
    def retrieve_cloud_files(files)
      # There should only be one zip file for Bagit, take the first
      return if files['0'].blank?
      target_file = File.join(path_for_import, files['0']['file_name'].tr(' ', '_'))
      # Now because we want the files in place before the importer runs
      Bulkrax::DownloadCloudFileJob.perform_now(files['0'], target_file)
      return target_file
    end

    private

    def bags
      return @bags if @bags.present?
      new_bag = bag(import_file_path)
      @bags = if new_bag
                [new_bag]
              else
                Dir.glob("#{import_file_path}/**/*").map { |d| bag(d) }
              end
      @bags.delete(nil)
      raise StandardError, 'No valid bags found' if @bags.blank?
      return @bags
    end

    # Gather the paths to all bags; skip any stray files
    def bag_paths
      bags.map(&:bag_dir)
    end

    def metadata_file_name
      raise StandardError, 'The metadata file name must be specified' if parser_fields['metadata_file_name'].blank?
      parser_fields['metadata_file_name']
    end

    # Gather the paths to all metadata files matching the metadata_file_name
    def metadata_paths
      @metadata_paths ||= bag_paths.map do |b|
        Dir.glob("#{b}/**/*").select { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
      end.flatten.compact
    end

    def metadata_path(bag)
      Dir.glob("#{bag.bag_dir}/**/*").detect { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
    end

    def bag(path)
      return nil unless path && File.exist?(File.join(path, 'bagit.txt'))
      bag = BagIt::Bag.new(path)
      return nil unless bag.valid?
      bag
    end
  end
end
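For reference, a minimal sketch (not part of the gem) of the bag-detection rule used by the private #bag helper above: a directory only counts as a bag if it contains bagit.txt and passes the bagit gem's validation.

require 'bagit'

# Returns true when `path` looks like a valid BagIt bag, mirroring BagitParser#bag.
def valid_bag?(path)
  return false unless path && File.exist?(File.join(path, 'bagit.txt'))
  BagIt::Bag.new(path).valid?
end

# valid_bag?('/tmp/my_bag')  # hypothetical path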
data/app/parsers/bulkrax/csv_parser.rb
@@ -0,0 +1,266 @@
# frozen_string_literal: true

require 'csv'
module Bulkrax
  class CsvParser < ApplicationParser
    include ErroredEntries
    def self.export_supported?
      true
    end

    def initialize(importerexporter)
      @importerexporter = importerexporter
    end

    def collections
      # does the CSV contain a collection column?
      return [] unless import_fields.include?(:collection)
      # retrieve a list of unique collections
      records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
    end

    def collections_total
      collections.size
    end

    def records(_opts = {})
      file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
      # data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
      @records ||= entry_class.read_data(file_for_import).map { |record_data| entry_class.data_for_entry(record_data, nil) }
    end

    # We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
    def import_fields
      @import_fields ||= records.inject(:merge).keys.compact.uniq
    end

    def required_elements?(keys)
      return if keys.blank?
      missing_elements(keys).blank?
    end

    def missing_elements(keys)
      required_elements.map(&:to_s) - keys.map(&:to_s)
    end

    def valid_import?
      error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_fields).join(', ')}"
      raise StandardError, error_alert unless required_elements?(import_fields)

      file_paths.is_a?(Array)
    rescue StandardError => e
      status_info(e)
      false
    end

    def create_collections
      collections.each_with_index do |collection, index|
        next if collection.blank?
        metadata = {
          title: [collection],
          work_identifier => [collection],
          visibility: 'open',
          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
        }
        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
        ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
        increment_counters(index, true)
      end
    end

    def create_works
      records.each_with_index do |record, index|
        next unless record_has_source_identifier(record, index)
        break if limit_reached?(limit, index)

        seen[record[source_identifier]] = true
        new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record.to_h.compact)
        if record[:delete].present?
          DeleteWorkJob.send(perform_method, new_entry, current_run)
        else
          ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    rescue StandardError => e
      status_info(e)
    end

    def write_partial_import_file(file)
      import_filename = import_file_path.split('/').last
      partial_import_filename = "#{File.basename(import_filename, '.csv')}_corrected_entries.csv"

      path = File.join(path_for_import, partial_import_filename)
      FileUtils.mv(
        file.path,
        path
      )
      path
    end

    def create_parent_child_relationships
      super
    end

    def extra_filters
      output = ""
      if importerexporter.start_date.present?
        start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
        finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
        output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
      end
      output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
      output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
      output
    end

    def current_work_ids
      case importerexporter.export_from
      when 'collection'
        ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
      when 'worktype'
        ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
      when 'importer'
        entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
        complete_statuses = Bulkrax::Status.latest_by_statusable
                                           .includes(:statusable)
                                           .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
        complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }

        ActiveFedora::SolrService.query("#{work_identifier}_tesim:(#{complete_entry_identifiers.join(' OR ')})#{extra_filters}", rows: 2_000_000_000).map(&:id)
      end
    end

    def create_new_entries
      current_work_ids.each_with_index do |wid, index|
        break if limit_reached?(limit, index)
        new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
        Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
      end
    end
    alias create_from_collection create_new_entries
    alias create_from_importer create_new_entries
    alias create_from_worktype create_new_entries

    def entry_class
      CsvEntry
    end

    def collection_entry_class
      CsvCollectionEntry
    end

    # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
    # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
    def total
      if importer?
        return @total if @total&.positive?
        # windows encoded
        @total = `grep -c ^M #{real_import_file_path}`.to_i - 1
        # unix encoded
        @total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
      elsif exporter?
        @total = importerexporter.entries.count
      else
        @total = 0
      end
      return @total
    rescue StandardError
      @total = 0
    end

    # @todo - investigate getting directory structure
    # @todo - investigate using perform_later, and having the importer check for
    #   DownloadCloudFileJob before it starts
    def retrieve_cloud_files(files)
      files_path = File.join(path_for_import, 'files')
      FileUtils.mkdir_p(files_path) unless File.exist?(files_path)
      files.each_pair do |_key, file|
        # fixes bug where auth headers do not get attached properly
        if file['auth_header'].present?
          file['headers'] ||= {}
          file['headers'].merge!(file['auth_header'])
        end
        # this only works for uniquely named files
        target_file = File.join(files_path, file['file_name'].tr(' ', '_'))
        # Now because we want the files in place before the importer runs
        # Problematic for a large upload
        Bulkrax::DownloadCloudFileJob.perform_now(file, target_file)
      end
      return nil
    end

    # export methods

    def write_files
      CSV.open(setup_export_file, "w", headers: export_headers, write_headers: true) do |csv|
        importerexporter.entries.where(identifier: current_work_ids)[0..limit || total].each do |e|
          csv << e.parsed_metadata
        end
      end
    end

    def key_allowed(key)
      !Bulkrax.reserved_properties.include?(key) &&
        new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
        key != source_identifier.to_s
    end

    # All possible column names
    def export_headers
      headers = ['id']
      headers << source_identifier.to_s
      headers << 'model'
      importerexporter.mapping.each_key { |key| headers << key if key_allowed(key) }
      headers << 'file'
      headers.uniq
    end

    # in the parser as it is specific to the format
    def setup_export_file
      File.join(importerexporter.exporter_export_path, 'export.csv')
    end

    # Retrieve file paths for [:file] mapping in records
    # and check all listed files exist.
    def file_paths
      raise StandardError, 'No records were found' if records.blank?
      @file_paths ||= records.map do |r|
        file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
        next if r[file_mapping].blank?

        r[file_mapping].split(/\s*[:;|]\s*/).map do |f|
          file = File.join(path_to_files, f.tr(' ', '_'))
          if File.exist?(file) # rubocop:disable Style/GuardClause
            file
          else
            raise "File #{file} does not exist"
          end
        end
      end.flatten.compact.uniq
    end

    # Retrieve the path where we expect to find the files
    def path_to_files
      @path_to_files ||= File.join(
        File.file?(import_file_path) ? File.dirname(import_file_path) : import_file_path,
        'files'
      )
    end

    private

    # Override to return the first CSV in the path, if a zip file is supplied
    # We expect a single CSV at the top level of the zip in the CSVParser
    # but we are willing to go look for it if need be
    def real_import_file_path
      if file? && zip?
        unzip(parser_fields['import_file_path'])
        return Dir["#{importer_unzip_path}/**/*.csv"].first
      else
        parser_fields['import_file_path']
      end
    end
  end
end
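A quick illustration (made-up cell value) of the delimiter handling shared by #collections and #file_paths above: several values may be packed into a single CSV cell, separated by ';' or '|' (and ':' as well for file lists), with surrounding whitespace ignored.

row = { collection: 'History; Maps | Rare Books' }
row[:collection].split(/\s*[;|]\s*/)
# => ["History", "Maps", "Rare Books"]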
data/app/parsers/bulkrax/oai_dc_parser.rb
@@ -0,0 +1,130 @@
# frozen_string_literal: true

module Bulkrax
  class OaiDcParser < ApplicationParser
    attr_accessor :headers
    delegate :list_sets, to: :client

    def initialize(importerexporter)
      super
      @headers = { from: importerexporter.user.email }
    end

    def client
      @client ||= OAI::Client.new(importerexporter.parser_fields['base_url'],
                                  headers: headers,
                                  parser: 'libxml',
                                  metadata_prefix: importerexporter.parser_fields['metadata_prefix'])
    rescue StandardError
      raise OAIError
    end

    def collection_name
      @collection_name ||= parser_fields['set'] || 'all'
    end

    def entry_class
      OaiDcEntry
    end

    def collection_entry_class
      OaiSetEntry
    end

    def records(opts = {})
      opts[:set] = collection_name unless collection_name == 'all'

      opts[:from] = importerexporter&.last_imported_at&.strftime("%Y-%m-%d") if importerexporter.last_imported_at && only_updates

      if opts[:quick]
        opts.delete(:quick)
        begin
          @short_records = client.list_identifiers(opts)
        rescue OAI::Exception => e
          return @short_records = [] if e.code == "noRecordsMatch"
          raise e
        end
      else
        begin
          @records ||= client.list_records(opts.merge(metadata_prefix: parser_fields['metadata_prefix']))
        rescue OAI::Exception => e
          return @records = [] if e.code == "noRecordsMatch"
          raise e
        end
      end
    end

    # the set of fields available in the import data
    def import_fields
      ['contributor', 'coverage', 'creator', 'date', 'description', 'format', 'identifier', 'language', 'publisher', 'relation', 'rights', 'source', 'subject', 'title', 'type']
    end

    delegate :list_sets, to: :client

    def create_collections
      metadata = {
        visibility: 'open',
        collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
      }

      collections.each_with_index do |set, index|
        next unless collection_name == 'all' || collection_name == set.spec
        unique_collection_identifier = importerexporter.unique_collection_identifier(set.spec)
        metadata[:title] = [set.name]
        metadata[work_identifier] = [unique_collection_identifier]

        new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
        # perform now to ensure this gets created before work imports start
        ImportWorkCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
        increment_counters(index, true)
      end
    end

    def create_works
      results = self.records(quick: true)
      return if results.blank?
      results.full.each_with_index do |record, index|
        identifier = record.send(source_identifier)
        if identifier.blank?
          if Bulkrax.fill_in_blank_source_identifiers.present?
            identifier = Bulkrax.fill_in_blank_source_identifiers.call(self, index)
          else
            invalid_record("Missing #{source_identifier} for #{record.to_h}\n")
            next
          end
        end

        break if limit_reached?(limit, index)
        seen[identifier] = true
        new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: identifier).first_or_create!
        if record.deleted?
          DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_run)
        else
          ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    end

    def collections
      @collections ||= list_sets
    end

    def collections_total
      if collection_name == 'all'
        collections.count
      else
        1
      end
    end

    def create_parent_child_relationships; end

    def total
      @total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i
    rescue
      @total = 0
    end
  end
end
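A rough usage sketch (endpoint and metadata prefix are placeholder values) of the harvesting calls that #records wraps: list_identifiers is the lightweight request used when the :quick option is set, while list_records pulls full metadata.

require 'oai'

# Connect to a hypothetical OAI-PMH endpoint and walk the record headers only.
client = OAI::Client.new('https://example.org/oai', parser: 'libxml')
client.list_identifiers(metadata_prefix: 'oai_dc').each do |header|
  puts header.identifier
end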