bulkrax 9.3.5 → 9.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -1
- data/app/assets/javascripts/bulkrax/application.js +2 -1
- data/app/assets/javascripts/bulkrax/bulkrax.js +13 -4
- data/app/assets/javascripts/bulkrax/bulkrax_utils.js +96 -0
- data/app/assets/javascripts/bulkrax/datatables.js +1 -0
- data/app/assets/javascripts/bulkrax/entries.js +17 -10
- data/app/assets/javascripts/bulkrax/importers.js.erb +9 -2
- data/app/assets/javascripts/bulkrax/importers_stepper.js +2420 -0
- data/app/assets/stylesheets/bulkrax/application.css +1 -1
- data/app/assets/stylesheets/bulkrax/stepper/_header.scss +83 -0
- data/app/assets/stylesheets/bulkrax/stepper/_mixins.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_navigation.scss +103 -0
- data/app/assets/stylesheets/bulkrax/stepper/_responsive.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper/_review.scss +92 -0
- data/app/assets/stylesheets/bulkrax/stepper/_settings.scss +106 -0
- data/app/assets/stylesheets/bulkrax/stepper/_success.scss +26 -0
- data/app/assets/stylesheets/bulkrax/stepper/_summary.scss +171 -0
- data/app/assets/stylesheets/bulkrax/stepper/_upload.scss +339 -0
- data/app/assets/stylesheets/bulkrax/stepper/_validation.scss +237 -0
- data/app/assets/stylesheets/bulkrax/stepper/_variables.scss +46 -0
- data/app/assets/stylesheets/bulkrax/stepper.scss +32 -0
- data/app/controllers/bulkrax/guided_imports_controller.rb +175 -0
- data/app/controllers/bulkrax/importers_controller.rb +28 -31
- data/app/controllers/concerns/bulkrax/guided_import_demo_scenarios.rb +201 -0
- data/app/controllers/concerns/bulkrax/importer_file_handler.rb +217 -0
- data/app/factories/bulkrax/object_factory.rb +3 -2
- data/app/factories/bulkrax/valkyrie_object_factory.rb +61 -17
- data/app/jobs/bulkrax/importer_job.rb +11 -4
- data/app/models/bulkrax/csv_entry.rb +27 -7
- data/app/models/bulkrax/entry.rb +4 -0
- data/app/models/bulkrax/importer.rb +31 -1
- data/app/models/concerns/bulkrax/has_matchers.rb +2 -2
- data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +6 -5
- data/app/parsers/bulkrax/application_parser.rb +31 -5
- data/app/parsers/bulkrax/csv_parser.rb +42 -10
- data/app/parsers/concerns/bulkrax/csv_parser/csv_template_generation.rb +73 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation.rb +133 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_helpers.rb +282 -0
- data/app/parsers/concerns/bulkrax/csv_parser/csv_validation_hierarchy.rb +96 -0
- data/app/services/bulkrax/csv_template/column_builder.rb +60 -0
- data/app/services/bulkrax/csv_template/column_descriptor.rb +58 -0
- data/app/services/bulkrax/csv_template/csv_builder.rb +83 -0
- data/app/services/bulkrax/csv_template/explanation_builder.rb +57 -0
- data/app/services/bulkrax/csv_template/field_analyzer.rb +56 -0
- data/app/services/bulkrax/csv_template/file_path_generator.rb +47 -0
- data/app/services/bulkrax/csv_template/file_validator.rb +68 -0
- data/app/services/bulkrax/csv_template/mapping_manager.rb +55 -0
- data/app/services/bulkrax/csv_template/model_loader.rb +50 -0
- data/app/services/bulkrax/csv_template/row_builder.rb +35 -0
- data/app/services/bulkrax/csv_template/schema_analyzer.rb +70 -0
- data/app/services/bulkrax/csv_template/split_formatter.rb +44 -0
- data/app/services/bulkrax/csv_template/value_determiner.rb +68 -0
- data/app/services/bulkrax/stepper_response_formatter.rb +347 -0
- data/app/services/bulkrax/validation_error_csv_builder.rb +99 -0
- data/app/validators/bulkrax/csv_row/child_reference.rb +56 -0
- data/app/validators/bulkrax/csv_row/circular_reference.rb +71 -0
- data/app/validators/bulkrax/csv_row/controlled_vocabulary.rb +74 -0
- data/app/validators/bulkrax/csv_row/duplicate_identifier.rb +63 -0
- data/app/validators/bulkrax/csv_row/missing_source_identifier.rb +31 -0
- data/app/validators/bulkrax/csv_row/parent_reference.rb +59 -0
- data/app/validators/bulkrax/csv_row/required_values.rb +64 -0
- data/app/views/bulkrax/guided_imports/new.html.erb +567 -0
- data/app/views/bulkrax/importers/index.html.erb +6 -1
- data/app/views/bulkrax/importers/new.html.erb +1 -1
- data/app/views/bulkrax/importers/show.html.erb +17 -1
- data/config/i18n-tasks.yml +195 -0
- data/config/locales/bulkrax.de.yml +504 -0
- data/config/locales/bulkrax.en.yml +459 -233
- data/config/locales/bulkrax.es.yml +504 -0
- data/config/locales/bulkrax.fr.yml +504 -0
- data/config/locales/bulkrax.it.yml +504 -0
- data/config/locales/bulkrax.pt-BR.yml +504 -0
- data/config/locales/bulkrax.zh.yml +503 -0
- data/config/routes.rb +10 -1
- data/lib/bulkrax/data/demo_scenarios.json +2235 -0
- data/lib/bulkrax/version.rb +1 -1
- data/lib/bulkrax.rb +31 -0
- metadata +55 -16
- data/app/services/bulkrax/sample_csv_service/column_builder.rb +0 -58
- data/app/services/bulkrax/sample_csv_service/column_descriptor.rb +0 -56
- data/app/services/bulkrax/sample_csv_service/csv_builder.rb +0 -82
- data/app/services/bulkrax/sample_csv_service/explanation_builder.rb +0 -51
- data/app/services/bulkrax/sample_csv_service/field_analyzer.rb +0 -54
- data/app/services/bulkrax/sample_csv_service/file_path_generator.rb +0 -16
- data/app/services/bulkrax/sample_csv_service/mapping_manager.rb +0 -36
- data/app/services/bulkrax/sample_csv_service/model_loader.rb +0 -40
- data/app/services/bulkrax/sample_csv_service/row_builder.rb +0 -33
- data/app/services/bulkrax/sample_csv_service/schema_analyzer.rb +0 -69
- data/app/services/bulkrax/sample_csv_service/split_formatter.rb +0 -42
- data/app/services/bulkrax/sample_csv_service/value_determiner.rb +0 -67
- data/app/services/bulkrax/sample_csv_service.rb +0 -78
- /data/{app/services → lib}/wings/custom_queries/find_by_source_identifier.rb +0 -0
|
@@ -4,7 +4,10 @@ module Bulkrax
|
|
|
4
4
|
class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
|
|
5
5
|
include ErroredEntries
|
|
6
6
|
include ExportBehavior
|
|
7
|
+
include CsvParser::CsvTemplateGeneration
|
|
8
|
+
include CsvParser::CsvValidation
|
|
7
9
|
attr_writer :collections, :file_sets, :works
|
|
10
|
+
attr_accessor :validation_mode
|
|
8
11
|
|
|
9
12
|
def self.export_supported?
|
|
10
13
|
true
|
|
@@ -14,12 +17,14 @@ module Bulkrax
|
|
|
14
17
|
return @records if @records.present?
|
|
15
18
|
|
|
16
19
|
file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
|
|
17
|
-
# data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
|
|
18
20
|
csv_data = entry_class.read_data(file_for_import)
|
|
19
|
-
|
|
20
|
-
|
|
21
|
+
unless validation_mode
|
|
22
|
+
importer.parser_fields['total'] = csv_data.count
|
|
23
|
+
importer.save
|
|
24
|
+
end
|
|
21
25
|
|
|
22
26
|
@records = csv_data.map { |record_data| entry_class.data_for_entry(record_data, nil, self) }
|
|
27
|
+
@records
|
|
23
28
|
end
|
|
24
29
|
|
|
25
30
|
# rubocop:disable Metrics/AbcSize
|
|
@@ -95,11 +100,11 @@ module Bulkrax
|
|
|
95
100
|
def missing_elements(record)
|
|
96
101
|
keys_from_record = keys_without_numbers(record.reject { |_, v| v.blank? }.keys.compact.uniq.map(&:to_s))
|
|
97
102
|
keys = []
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
keys << k if keys_from_record.include?(vf)
|
|
103
|
+
mapping_values = importerexporter.mapping.stringify_keys
|
|
104
|
+
mapping_values.each do |k, v|
|
|
105
|
+
from_values = Array.wrap(v.is_a?(Hash) ? (v['from'] || v[:from]) : nil)
|
|
106
|
+
from_values.each do |vf|
|
|
107
|
+
keys << k if vf.present? && keys_from_record.include?(vf.to_s.strip)
|
|
103
108
|
end
|
|
104
109
|
end
|
|
105
110
|
required_elements.map(&:to_s) - keys.uniq.map(&:to_s)
|
|
@@ -360,8 +365,11 @@ module Bulkrax
|
|
|
360
365
|
else
|
|
361
366
|
Bulkrax.multi_value_element_split_on
|
|
362
367
|
end
|
|
368
|
+
files_dir = path_to_files
|
|
369
|
+
raise StandardError, "Record references local files but no files directory could be resolved from the import path" if files_dir.nil?
|
|
370
|
+
|
|
363
371
|
r[file_mapping].split(split_pattern).map do |f|
|
|
364
|
-
file = File.join(
|
|
372
|
+
file = File.join(files_dir, f.strip.tr(' ', '_'))
|
|
365
373
|
if File.exist?(file) # rubocop:disable Style/GuardClause
|
|
366
374
|
file
|
|
367
375
|
else
|
|
@@ -376,8 +384,11 @@ module Bulkrax
|
|
|
376
384
|
filename = args.fetch(:filename, '')
|
|
377
385
|
|
|
378
386
|
return @path_to_files if @path_to_files.present? && filename.blank?
|
|
387
|
+
# The zip file could be either the main import file, or a separate attachments zip file.
|
|
388
|
+
# We want to check for both of those before we determine the path to the files.
|
|
389
|
+
have_zip_file = zip? || (parser_fields['attachments_zip_path'] && zip_file?(parser_fields['attachments_zip_path']))
|
|
379
390
|
@path_to_files = File.join(
|
|
380
|
-
|
|
391
|
+
have_zip_file ? importer_unzip_path : File.dirname(import_file_path), 'files', filename
|
|
381
392
|
)
|
|
382
393
|
|
|
383
394
|
return @path_to_files if File.exist?(@path_to_files)
|
|
@@ -386,8 +397,29 @@ module Bulkrax
|
|
|
386
397
|
File.join(importer_unzip_path, 'files', filename) if file? && zip?
|
|
387
398
|
end
|
|
388
399
|
|
|
400
|
+
# Unzips +file_to_unzip+ using the parent implementation, then passes
# importer_unzip_path to normalize_unzipped_files_structure so that loose
# files are relocated into a `files/` subdirectory.
#
# @param file_to_unzip the zip to extract (forwarded unchanged to +super+)
def unzip(file_to_unzip)
  super
  normalize_unzipped_files_structure(importer_unzip_path)
end
|
|
404
|
+
|
|
389
405
|
private
|
|
390
406
|
|
|
407
|
+
# Guarantees that attachments extracted from a zip live under
# `dest_dir/files/`, whatever layout the archive used. Regular files sitting
# directly in dest_dir (a "flat" zip) are moved into `dest_dir/files/`;
# files ending in `.csv` and subdirectories are left where they are.
#
# @param dest_dir [String] directory the zip was extracted into
# @return [Array<String>, nil] the loose files that were considered, or nil
#   when there was nothing to move
def normalize_unzipped_files_structure(dest_dir)
  loose_files = Dir.glob(File.join(dest_dir, '*')).reject do |path|
    !File.file?(path) || path.end_with?('.csv')
  end
  return if loose_files.empty?

  target_dir = File.join(dest_dir, 'files')
  FileUtils.mkdir_p(target_dir)
  loose_files.each do |path|
    target = File.join(target_dir, File.basename(path))
    # Never overwrite a file that already exists in files/.
    next if File.exist?(target)

    FileUtils.mv(path, target)
  end
end
|
|
422
|
+
|
|
391
423
|
def unique_collection_identifier(collection_hash)
|
|
392
424
|
entry_uid = collection_hash[source_identifier]
|
|
393
425
|
entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
class CsvParser < ApplicationParser
|
|
5
|
+
module CsvTemplateGeneration
|
|
6
|
+
extend ActiveSupport::Concern
|
|
7
|
+
|
|
8
|
+
class_methods do
|
|
9
|
+
# Generate a CSV template for the specified models.
#
# @param models [Array<String>, String] Model names or 'all' for all available models
# @param output [String] Output format: 'file' or 'csv_string'
# @param admin_set_id [String, nil] Optional admin set ID for context
# @param args [Hash] Additional arguments passed to output method (e.g., file_path)
# @return [String] File path (for 'file' output) or CSV string (for 'csv_string' output)
# @raise [NameError] when Hyrax is not loaded
# @raise [ArgumentError] when +output+ is not one of the supported formats
def generate_template(models: [], output: 'file', admin_set_id: nil, **args)
  raise NameError, "Hyrax is not defined" unless defined?(::Hyrax)

  # Only dispatch to the documented renderers. Interpolating an arbitrary
  # value into `send` would reach any `to_*` method (including private ones).
  format = output.to_s
  raise ArgumentError, "Unsupported output format: #{output.inspect}" unless %w[file csv_string].include?(format)

  TemplateContext.new(models: models, admin_set_id: admin_set_id).public_send("to_#{format}", **args)
end
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
##
# State holder for one template-generation run. Exposes the readers that the
# CsvTemplate:: collaborators (column builder, CSV builder) call back into.
class TemplateContext
  attr_reader :mappings, :all_models, :admin_set_id, :field_analyzer, :mapping_manager

  # @param models [Array<String>, String, nil] model names handed to CsvTemplate::ModelLoader
  # @param admin_set_id [String, nil] admin set used by the field analyzer and default path
  def initialize(models: nil, admin_set_id: nil)
    @admin_set_id = admin_set_id
    @mapping_manager = CsvTemplate::MappingManager.new
    @mappings = @mapping_manager.mappings
    @field_analyzer = CsvTemplate::FieldAnalyzer.new(@mappings, admin_set_id)
    @all_models = CsvTemplate::ModelLoader.new(Array.wrap(models)).models
    # Built last: the builder receives this context and may read the state above.
    @csv_builder = CsvTemplate::CsvBuilder.new(self)
  end

  # Writes the template to disk.
  #
  # @param file_path [String, nil] destination; a generated default path is used when nil
  # @return [String] the path that was written
  def to_file(file_path: nil)
    destination = file_path || CsvTemplate::FilePathGenerator.default_path(@admin_set_id)
    @csv_builder.write_to_file(destination)
    destination
  end

  # @return the template as produced by the CSV builder's generate_string
  def to_csv_string
    @csv_builder.generate_string
  end

  # Memoized per-model field metadata.
  #
  # @return [Hash] model => { properties:, required_terms:, controlled_vocab_terms: }
  def field_metadata_for_all_models
    @field_metadata ||= @all_models.each_with_object({}) do |model, metadata|
      metadata[model] = metadata_for(model)
    end
  end

  # Memoized list of headers considered valid for the loaded models.
  #
  # @return [Array<String>]
  def valid_headers_for_models
    @valid_headers ||= compute_valid_headers
  end

  private

  # Field metadata for a single model, with empty arrays for missing sections.
  def metadata_for(model)
    field_list = @field_analyzer.find_or_create_field_list_for(model_name: model)
    {
      properties: field_list.dig(model, "properties") || [],
      required_terms: field_list.dig(model, "required_terms") || [],
      controlled_vocab_terms: field_list.dig(model, "controlled_vocab_terms") || []
    }
  end

  # Columns from the column builder minus ignored properties; on any error
  # falls back to the standard fields plus every model property.
  def compute_valid_headers
    CsvTemplate::ColumnBuilder.new(self).all_columns - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
  rescue StandardError => e
    Rails.logger.error("Error building valid headers: #{e.message}")
    model_fields = field_metadata_for_all_models.values.flat_map { |meta| meta[:properties] }
    (%w[model source_identifier parent parents file] + model_fields).uniq
  end
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
class CsvParser < ApplicationParser
|
|
5
|
+
module CsvValidation
|
|
6
|
+
extend ActiveSupport::Concern
|
|
7
|
+
|
|
8
|
+
included do
  # Lightweight struct used to satisfy the CsvTemplate::ColumnBuilder
  # interface without constructing a full template context.
  # NOTE(review): a constant assigned inside a block is scoped lexically, so
  # this presumably defines CsvValidation::ValidationContext (and re-assigns
  # it on every include) rather than a constant on the including class —
  # confirm that bare `ValidationContext` resolves where it is referenced.
  ValidationContext = Struct.new(:mapping_manager, :field_analyzer, :all_models, :mappings, keyword_init: true)
end
|
|
13
|
+
|
|
14
|
+
class_methods do
|
|
15
|
+
include CsvValidationHelpers
|
|
16
|
+
|
|
17
|
+
# Validate a CSV (and optional zip) without a persisted Importer record.
#
# Parses the CSV, runs header checks, row validators, file validation and
# hierarchy extraction, then assembles a single result hash. The parsed rows
# are attached to the result under :raw_csv_data.
#
# @param csv_file [File, ActionDispatch::Http::UploadedFile, String] path or file object
# @param zip_file [File, ActionDispatch::Http::UploadedFile, nil]
# @param admin_set_id [String, nil]
# @return [Hash] validation result compatible with the guided import UI
def validate_csv(csv_file:, zip_file: nil, admin_set_id: nil)
  raw_csv, headers, mapping_manager, mappings, source_id_key, csv_data, field_metadata, field_analyzer =
    parse_csv_inputs(csv_file, admin_set_id)

  all_ids = csv_data.map { |r| r[:source_identifier] }.compact.to_set
  header_issues = check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer)
  # Same array object as header_issues[:missing_required]: run_validations may
  # append to it in place (missing source identifier) before the result is built.
  missing_required = header_issues[:missing_required]
  notices, row_errors, file_validator, collections, works, file_sets =
    run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id)

  result = assemble_result(
    headers: headers, missing_required: missing_required, header_issues: header_issues,
    row_errors: row_errors, csv_data: csv_data, file_validator: file_validator,
    collections: collections, works: works, file_sets: file_sets, notices: notices
  )
  # May flip the result to valid-with-warnings when rights_statement is the only missing field.
  apply_rights_statement_validation_override!(result, missing_required)
  result[:raw_csv_data] = csv_data
  result
end
|
|
42
|
+
|
|
43
|
+
private
|
|
44
|
+
|
|
45
|
+
# Builds notices, runs row validators, file validator, and hierarchy extraction.
# Returns [notices, row_errors, file_validator, collections, works, file_sets].
#
# Side effect: +missing_required+ is passed to append_missing_source_id!,
# which may append entries to it in place.
def run_validations(csv_data, all_ids, headers, source_id_key, mappings, field_metadata, missing_required, zip_file, admin_set_id) # rubocop:disable Metrics/ParameterLists
  find_record = build_find_record
  notices = []
  append_missing_source_id!(missing_required, headers, source_id_key, csv_data.map { |r| r[:model] }.compact.uniq)
  append_missing_model_notice!(notices, headers, csv_data)

  # The notices array is also handed to the row validators through their context.
  row_errors = run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices)
  file_validator = CsvTemplate::FileValidator.new(csv_data, zip_file, admin_set_id)
  collections, works, file_sets = extract_hierarchy_items(csv_data, all_ids, find_record, mappings)
  [notices, row_errors, file_validator, collections, works, file_sets]
end
|
|
58
|
+
|
|
59
|
+
# Reads the CSV, resolves mappings, parses rows, and builds field metadata.
# Returns the values needed by all subsequent validation steps, in this order:
# [raw_csv, headers, mapping_manager, mappings, source_id_key, csv_data,
#  field_metadata, field_analyzer].
def parse_csv_inputs(csv_file, admin_set_id)
  # Use CsvEntry.read_data so header normalisation is identical to a real import.
  raw_csv = CsvEntry.read_data(csv_file)
  headers = raw_csv.headers.map(&:to_s)

  mapping_manager = CsvTemplate::MappingManager.new
  mappings = mapping_manager.mappings

  # Column keys used to read each row; the parent/children/file keys are only
  # needed by parse_validation_rows and are not returned.
  source_id_key = resolve_validation_key(mapping_manager, flag: 'source_identifier', default: :source_identifier)
  parent_key = resolve_validation_key(mapping_manager, flag: 'related_parents_field_mapping', default: :parents)
  children_key = resolve_validation_key(mapping_manager, flag: 'related_children_field_mapping', default: :children)
  file_key = resolve_validation_key(mapping_manager, key: 'file', default: :file)

  csv_data = parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  all_models = csv_data.map { |r| r[:model] }.compact.uniq
  # Always include the default work type (when configured) in the analysed models.
  all_models |= [Bulkrax.default_work_type] if Bulkrax.default_work_type.present?
  field_analyzer = CsvTemplate::FieldAnalyzer.new(mappings, admin_set_id)
  field_metadata = build_validation_field_metadata(all_models, field_analyzer)

  [raw_csv, headers, mapping_manager, mappings, source_id_key, csv_data, field_metadata, field_analyzer]
end
|
|
82
|
+
|
|
83
|
+
# Performs every header-level check.
#
# @return [Hash] with keys :missing_required, :unrecognized and :empty_columns
def check_headers(headers, raw_csv, mapping_manager, mappings, field_metadata, field_analyzer) # rubocop:disable Metrics/ParameterLists
  known_headers = build_valid_validation_headers(
    mapping_manager, field_analyzer, field_metadata.keys, mappings, field_metadata
  )
  # Headers carrying a numeric suffix (e.g. "creator_2") are always accepted.
  numbered_headers = headers.select { |header| header.match?(/_\d+\z/) }
  accepted_headers = (known_headers + numbered_headers).uniq

  missing_required = find_missing_required_headers(headers, field_metadata, mapping_manager)
  unrecognized = find_unrecognized_validation_headers(headers, accepted_headers)
  empty_columns = find_empty_column_positions(headers, raw_csv)

  { missing_required: missing_required, unrecognized: unrecognized, empty_columns: empty_columns }
end
|
|
97
|
+
|
|
98
|
+
# Delegates to extract_validation_items with the split patterns resolved from
# the mappings; the child pattern falls back to '|' when none is configured.
def extract_hierarchy_items(csv_data, all_ids, find_record, mappings)
  parent_pattern = resolve_parent_split_pattern(mappings)
  child_pattern = resolve_children_split_pattern(mappings) || '|'

  extract_validation_items(
    csv_data, all_ids, find_record,
    parent_split_pattern: parent_pattern,
    child_split_pattern: child_pattern
  )
end
|
|
105
|
+
|
|
106
|
+
# Calls every validator in Bulkrax.csv_row_validators for each record, sharing
# one context hash across all calls, and returns the context's :errors.
#
# Row numbers passed to validators start at 2 (row 1 is the header row).
def run_row_validators(csv_data, all_ids, source_id_key, mappings, field_metadata, find_record, notices = []) # rubocop:disable Metrics/ParameterLists
  parent_pattern = resolve_parent_split_pattern(mappings)
  child_pattern = resolve_children_split_pattern(mappings)
  parent_column = resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
  children_column = resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
  graph = build_relationship_graph(csv_data, mappings)

  shared_context = {
    errors: [],
    warnings: [],
    seen_ids: {},
    all_ids: all_ids,
    source_identifier: source_id_key.to_s,
    parent_split_pattern: parent_pattern,
    child_split_pattern: child_pattern,
    parent_column: parent_column,
    children_column: children_column,
    mappings: mappings,
    field_metadata: field_metadata,
    find_record_by_source_identifier: find_record,
    relationship_graph: graph,
    notices: notices
  }

  csv_data.each.with_index(2) do |record, row_number|
    Bulkrax.csv_row_validators.each do |validator|
      validator.call(record, row_number, shared_context)
    end
  end

  shared_context[:errors]
end
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
end
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Bulkrax
|
|
4
|
+
class CsvParser < ApplicationParser
|
|
5
|
+
# Private helper methods for CsvValidation.
|
|
6
|
+
module CsvValidationHelpers # rubocop:disable Metrics/ModuleLength
|
|
7
|
+
include CsvValidationHierarchy
|
|
8
|
+
|
|
9
|
+
# Looks up the column name for a mapping (by +key+ or by +flag+) and returns
# it as a Symbol, so it can index the parser's symbol-keyed rows.
#
# @param mapping_manager [#resolve_column_name]
# @param key [String, nil] mapping key to resolve
# @param flag [String, nil] mapping flag to resolve
# @param default [Symbol] returned when the manager yields no candidates
# @return [Symbol]
def resolve_validation_key(mapping_manager, key: nil, flag: nil, default:)
  candidates = mapping_manager.resolve_column_name(key: key, flag: flag, default: default.to_s)
  preferred = candidates.first
  preferred.nil? ? default : preferred.to_sym
end
|
|
15
|
+
|
|
16
|
+
# Converts the rows of a CsvEntry.read_data result into the canonical record
# shape used by the validators: the five well-known values looked up by their
# resolved column keys, plus the whole row as a string-keyed hash under
# :raw_row. Any error while parsing is logged and an empty array is returned.
#
# @return [Array<Hash>]
def parse_validation_rows(raw_csv, source_id_key, parent_key, children_key, file_key)
  column_for = {
    source_identifier: source_id_key,
    model: :model,
    parent: parent_key,
    children: children_key,
    file: file_key
  }

  raw_csv.map do |row|
    record = column_for.transform_values { |column| row[column] }
    # Stringify the row's keys so raw_row can be matched against header names.
    record[:raw_row] = row.to_h.transform_keys(&:to_s)
    record
  end
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error parsing rows – #{e.message}")
  []
end
|
|
36
|
+
|
|
37
|
+
# Builds { model => { properties:, required_terms:, controlled_vocab_terms: } }
# from the analyzer's field list for each model; missing sections become [].
def build_validation_field_metadata(all_models, field_analyzer)
  metadata = {}
  all_models.each do |model|
    fields = field_analyzer.find_or_create_field_list_for(model_name: model)
    metadata[model] = {
      properties: fields.dig(model, 'properties') || [],
      required_terms: fields.dig(model, 'required_terms') || [],
      controlled_vocab_terms: fields.dig(model, 'controlled_vocab_terms') || []
    }
  end
  metadata
end
|
|
47
|
+
|
|
48
|
+
# Returns the headers considered valid for validation: every column reported
# by CsvTemplate::ColumnBuilder minus the ignored properties. If that fails
# for any reason the error is logged and a fallback list is returned: the
# standard columns plus each model property mapped to its column name.
def build_valid_validation_headers(mapping_manager, field_analyzer, all_models, mappings, field_metadata)
  context = ValidationContext.new(
    mapping_manager: mapping_manager,
    field_analyzer: field_analyzer,
    all_models: all_models,
    mappings: mappings
  )
  CsvTemplate::ColumnBuilder.new(context).all_columns - CsvTemplate::CsvBuilder::IGNORED_PROPERTIES
rescue StandardError => e
  Rails.logger.error("CsvParser.validate_csv: error building valid headers – #{e.message}")
  properties = field_metadata.values.flat_map { |meta| meta[:properties] }
  mapped_columns = properties.map { |property| mapping_manager.key_to_mapped_column(property) }
  (%w[model source_identifier parents children file] + mapped_columns).uniq
end
|
|
64
|
+
|
|
65
|
+
# Lists required fields that no CSV header provides.
#
# Each header is translated to its mapping key and stripped of a trailing
# numeric suffix before comparison.
#
# @return [Array<Hash>] unique { model:, field: } entries
def find_missing_required_headers(headers, field_metadata, mapping_manager)
  provided_keys = headers.map { |header| mapping_manager.mapped_to_key(header).sub(/_\d+\z/, '') }.uniq

  gaps = field_metadata.flat_map do |model, meta|
    required = meta[:required_terms] || []
    required.reject { |field| provided_keys.include?(field) }
            .map { |field| { model: model, field: field } }
  end
  gaps.uniq
end
|
|
75
|
+
|
|
76
|
+
# Maps each unrecognised header to the spell checker's best suggestion (nil
# when there is none). Blank headers are skipped, as are headers that are
# valid either verbatim or once a trailing numeric suffix is removed.
#
# @return [Hash{String => String, nil}]
def find_unrecognized_validation_headers(headers, valid_headers)
  spell_checker = DidYouMean::SpellChecker.new(dictionary: valid_headers)

  unknown_headers = headers.reject do |header|
    header.blank? ||
      valid_headers.include?(header) ||
      valid_headers.include?(header.sub(/_\d+\z/, ''))
  end

  unknown_headers.to_h { |header| [header, spell_checker.correct(header).first] }
end
|
|
82
|
+
|
|
83
|
+
# Returns the 1-based positions of columns whose header is blank but which
# contain data in at least one row.
#
# @return [Array<Integer>]
def find_empty_column_positions(headers, raw_csv)
  positions = []
  headers.each.with_index(1) do |header, position|
    next unless header.blank?

    column_has_data = raw_csv.any? { |row| row.fields[position - 1].present? }
    positions << position if column_has_data
  end
  positions
end
|
|
90
|
+
|
|
91
|
+
# Appends one { model:, field: } entry per model to +missing_required+ when
# the source identifier column is not among the headers. Does nothing when the
# column is present or when Bulkrax.fill_in_blank_source_identifiers is set.
def append_missing_source_id!(missing_required, headers, source_id_key, all_models)
  column_present = headers.map(&:to_s).include?(source_id_key.to_s)
  return if column_present
  return if Bulkrax.fill_in_blank_source_identifiers.present?

  all_models.each do |model|
    missing_required << { model: model, field: source_id_key.to_s }
  end
end
|
|
99
|
+
|
|
100
|
+
# Pushes a single file-level notice onto +notices+ when a default work type is
# configured and either the `model` column is absent or every row's model is
# blank. The message/suggestion variant is "column_missing" or "column_empty"
# accordingly. Nothing is added when at least one row names a model.
def append_missing_model_notice!(notices, headers, csv_data)
  default_model = Bulkrax.default_work_type
  return if default_model.blank?

  if headers.map(&:to_s).include?('model')
    return unless csv_data.all? { |row| row[:model].blank? }

    variant = 'column_empty'
  else
    variant = 'column_missing'
  end

  i18n_scope = 'bulkrax.importer.guided_import.validation.default_work_type_notice'
  notices << {
    field: 'model',
    default_work_type: default_model,
    message: I18n.t("#{i18n_scope}.message_#{variant}", default_work_type: default_model),
    suggestion: I18n.t("#{i18n_scope}.suggestion_#{variant}")
  }
end
|
|
122
|
+
|
|
123
|
+
# Downgrades "rights_statement is the only missing required field" from an
# error to a warning by flipping the result to valid-with-warnings.
#
# The override is skipped whenever the result is invalid for any other reason
# that assemble_result treats as an error: blank headers, no data rows,
# missing files, or row errors with severity 'error'. Without the last two
# checks a CSV with failing rows (or no rows) would be reported as valid.
#
# @param result [Hash] result hash from assemble_result (mutated in place)
# @param missing_required [Array<Hash>] entries of the form { model:, field: }
# @return [void]
def apply_rights_statement_validation_override!(result, missing_required)
  only_rights = missing_required.present? &&
                missing_required.all? { |h| h[:field].to_s == 'rights_statement' }
  return unless only_rights && !result[:isValid]
  return if result[:headers].blank?
  return if result[:missingFiles]&.any?
  # nil-safe: only an explicit zero row count blocks the override.
  return if result[:rowCount]&.zero?
  return if Array(result[:rowErrors]).any? { |e| e[:severity] == 'error' }

  result[:isValid] = true
  result[:hasWarnings] = true
end
|
|
133
|
+
|
|
134
|
+
# Builds the result hash consumed by the guided import UI.
#
# The result is invalid when required fields are missing, headers are blank,
# there are no rows, files are missing, or any row error has severity 'error'.
# Warnings are flagged for unrecognised headers, empty-header columns with
# data, possibly missing files, row warnings, or any notice.
#
# @return [Hash] camelCase-keyed summary (headers, counts, issues, hierarchy items)
def assemble_result(headers:, missing_required:, header_issues:, row_errors:, csv_data:, file_validator:, collections:, works:, file_sets:, notices: []) # rubocop:disable Metrics/ParameterLists
  blocking_rows = row_errors.select { |entry| entry[:severity] == 'error' }
  advisory_rows = row_errors.select { |entry| entry[:severity] == 'warning' }

  invalid = missing_required.any? || headers.blank? || csv_data.empty? ||
            file_validator.missing_files.any? || blocking_rows.any?
  warned = header_issues[:unrecognized].any? || header_issues[:empty_columns].any? ||
           file_validator.possible_missing_files? || advisory_rows.any? || notices.any?

  summary = {}
  summary[:headers] = headers
  summary[:missingRequired] = missing_required
  summary[:notices] = notices
  summary[:unrecognized] = header_issues[:unrecognized]
  summary[:emptyColumns] = header_issues[:empty_columns]
  summary[:rowCount] = csv_data.length
  summary[:isValid] = !invalid
  summary[:hasWarnings] = warned
  summary[:rowErrors] = row_errors
  summary[:collections] = collections
  summary[:works] = works
  summary[:fileSets] = file_sets
  summary[:totalItems] = csv_data.length
  summary[:fileReferences] = file_validator.count_references
  summary[:missingFiles] = file_validator.missing_files
  summary[:foundFiles] = file_validator.found_files_count
  summary[:zipIncluded] = file_validator.zip_included?
  summary
end
|
|
163
|
+
|
|
164
|
+
# Returns a one-argument lambda that checks whether an identifier resolves to
# an existing record, for use by row validators and hierarchy extraction.
#
# The source identifier field is the CsvParser mapping flagged with
# 'source_identifier' => true (default 'source'); the search field is that
# mapping's first 'search_field' (default "<field>_sim").
def build_find_record
  parser_mappings = Bulkrax.field_mappings['Bulkrax::CsvParser'] || {}

  flagged_entry = parser_mappings.find { |_field, config| config['source_identifier'] == true }
  identifier_field = (flagged_entry && flagged_entry.first) || 'source'

  configured_search = Array.wrap(parser_mappings.dig(identifier_field, 'search_field')).first
  search_field = configured_search.nil? ? "#{identifier_field}_sim" : configured_search.to_s

  lambda do |id|
    find_record_by_source_identifier(id, identifier_field, search_field)
  end
end
|
|
172
|
+
|
|
173
|
+
# Attempt to locate an existing repository record by its identifier.
# The identifier may be a repository object ID or a source_identifier property value.
# Checks the repository directly (by ID, then by property search across the
# collection class and every curation concern) — a Bulkrax Entry record alone
# is not sufficient, as the object may never have been created.
#
# The lookup is best-effort: any error is logged and treated as "not found"
# so that validation can continue, but it is no longer swallowed silently.
#
# @param identifier [String]
# @param work_identifier [String] the source_identifier property name (e.g. "source")
# @param work_identifier_search [String] the Solr field for source_identifier (e.g. "source_sim")
# @return [Boolean] true if a matching repository object is found
def find_record_by_source_identifier(identifier, work_identifier, work_identifier_search)
  return false if identifier.blank?

  return true if Bulkrax.object_factory.find_or_nil(identifier).present?

  [Bulkrax.collection_model_class, *Bulkrax.curation_concerns].any? do |klass|
    Bulkrax.object_factory.search_by_property(
      value: identifier,
      klass: klass,
      search_field: work_identifier_search,
      name_field: work_identifier
    ).present?
  end
rescue StandardError => e
  # Keep the failure visible: a lookup error is otherwise indistinguishable
  # from a genuinely missing record.
  Rails.logger.warn("CsvParser.validate_csv: error looking up #{identifier.inspect} – #{e.message}")
  false
end
|
|
198
|
+
|
|
199
|
+
# Returns the raw CSV column name (String) for a relationship field.
# Looks for the mapping entry flagged with +flag+ and returns its first +from+ value,
# falling back to +default+ when none is found.
#
# +from+ may be a single String or an Array, and the mapping may use String or
# Symbol keys (matching what the split-pattern resolvers accept).
#
# @param mappings [Hash] field mappings keyed by field name
# @param flag [String] flag name, e.g. 'related_parents_field_mapping'
# @param default [String] column name used when no flagged mapping supplies one
# @return [String]
def resolve_relationship_column(mappings, flag, default)
  _field, config = mappings.find { |_k, v| v.is_a?(Hash) && (v[flag] || v[flag.to_sym]) }
  return default unless config

  # Kernel#Array wraps a lone String and leaves an Array untouched.
  Array(config['from'] || config[:from]).first || default
end
|
|
206
|
+
|
|
207
|
+
# Split pattern configured on the 'parents' mapping (String or Symbol keys).
#
# @return [nil] when no split is configured
# @return the Bulkrax default pattern when split is exactly +true+
# @return the configured value otherwise
def resolve_parent_split_pattern(mappings)
  configured = mappings.dig('parents', 'split') || mappings.dig(:parents, :split)

  if configured.blank?
    nil
  elsif configured == true
    Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
  else
    configured
  end
end
|
|
214
|
+
|
|
215
|
+
# Split pattern configured on the 'children' mapping (String or Symbol keys).
#
# @return [nil] when no split is configured
# @return the Bulkrax default pattern when split is exactly +true+
# @return the configured value otherwise
def resolve_children_split_pattern(mappings)
  configured = mappings.dig('children', 'split') || mappings.dig(:children, :split)

  if configured.blank?
    nil
  elsif configured == true
    Bulkrax::DEFAULT_MULTI_VALUE_ELEMENT_SPLIT_ON
  else
    configured
  end
end
|
|
222
|
+
|
|
223
|
+
# Builds { source_identifier => [parent_ids] } across every CSV record, for
# cycle detection over the whole file.
#
# Edges come from two directions:
#   - parents declared on a record (base column plus numbered `<column>_N` columns)
#   - children declared on a record, inverted so that each child lists the
#     declaring record as a parent
def build_relationship_graph(csv_data, mappings)
  # Matches numbered variants of a column, e.g. "parents_1".
  numbered = ->(column) { /\A#{Regexp.escape(column)}_\d+\z/ }

  parent_suffix = numbered.call(
    resolve_relationship_column(mappings, 'related_parents_field_mapping', 'parents')
  )
  children_suffix = numbered.call(
    resolve_relationship_column(mappings, 'related_children_field_mapping', 'children')
  )

  build_parent_edges(csv_data, parent_suffix, resolve_parent_split_pattern(mappings)).tap do |graph|
    invert_child_edges(graph, csv_data, children_suffix, resolve_children_split_pattern(mappings))
  end
end
|
|
240
|
+
|
|
241
|
+
# Maps each record's source identifier to the unique parent ids it declares,
# taken from the :parent value and from raw_row columns matching
# +suffix_pattern+. Records without a source identifier are skipped.
#
# @return [Hash{String => Array<String>}]
def build_parent_edges(csv_data, suffix_pattern, split_pattern)
  edges = {}
  csv_data.each do |record|
    record_id = record[:source_identifier]
    next if record_id.blank?

    declared = split_or_single(record[:parent], split_pattern)
    numbered = suffixed_values(record[:raw_row], suffix_pattern)
    edges[record_id] = (declared + numbered).uniq
  end
  edges
end
|
|
251
|
+
|
|
252
|
+
# Adds inverted child declarations to +graph+ in place: for every child id a
# record declares (via :children or raw_row columns matching +suffix_pattern+),
# the declaring record's id is recorded as one of that child's parents.
# Records without a source identifier are skipped; duplicates are not added.
def invert_child_edges(graph, csv_data, suffix_pattern, split_pattern)
  csv_data.each do |record|
    parent_id = record[:source_identifier]
    next if parent_id.blank?

    declared_children = split_or_single(record[:children], split_pattern) +
                        suffixed_values(record[:raw_row], suffix_pattern)
    declared_children.each do |child_id|
      parents = (graph[child_id] ||= [])
      parents << parent_id unless parents.include?(parent_id)
    end
  end
end
|
|
265
|
+
|
|
266
|
+
# Normalises a cell value into an array of stripped, non-blank strings.
# With a split pattern the value is split on it; without one a present value
# becomes a single-element array and a blank value becomes [].
#
# @return [Array<String>]
def split_or_single(value, split_pattern)
  return value.to_s.split(split_pattern).map(&:strip).reject(&:blank?) if split_pattern
  return [] unless value.present?

  [value.to_s.strip]
end
|
|
275
|
+
|
|
276
|
+
# Collects the stripped, non-blank values of every raw_row column whose name
# matches +suffix_pattern+.
#
# @return [Array<String>]
def suffixed_values(raw_row, suffix_pattern)
  raw_row.filter_map do |column, value|
    next unless column.to_s.match?(suffix_pattern)

    text = value.to_s.strip
    text unless text.blank?
  end
end
|
|
280
|
+
end
|
|
281
|
+
end
|
|
282
|
+
end
|