bulkrax 1.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/app/controllers/bulkrax/exporters_controller.rb +12 -4
  4. data/app/controllers/bulkrax/importers_controller.rb +22 -17
  5. data/app/factories/bulkrax/object_factory.rb +44 -61
  6. data/app/jobs/bulkrax/create_relationships_job.rb +187 -0
  7. data/app/jobs/bulkrax/delete_work_job.rb +6 -2
  8. data/app/jobs/bulkrax/export_work_job.rb +3 -1
  9. data/app/jobs/bulkrax/exporter_job.rb +1 -0
  10. data/app/jobs/bulkrax/{import_work_collection_job.rb → import_collection_job.rb} +2 -2
  11. data/app/jobs/bulkrax/importer_job.rb +16 -1
  12. data/app/matchers/bulkrax/application_matcher.rb +9 -6
  13. data/app/models/bulkrax/csv_collection_entry.rb +8 -6
  14. data/app/models/bulkrax/csv_entry.rb +139 -45
  15. data/app/models/bulkrax/entry.rb +19 -8
  16. data/app/models/bulkrax/exporter.rb +12 -5
  17. data/app/models/bulkrax/importer.rb +22 -5
  18. data/app/models/bulkrax/oai_entry.rb +5 -1
  19. data/app/models/bulkrax/rdf_entry.rb +16 -7
  20. data/app/models/bulkrax/xml_entry.rb +4 -0
  21. data/app/models/concerns/bulkrax/export_behavior.rb +2 -2
  22. data/app/models/concerns/bulkrax/file_factory.rb +2 -1
  23. data/app/models/concerns/bulkrax/has_matchers.rb +59 -16
  24. data/app/models/concerns/bulkrax/import_behavior.rb +35 -5
  25. data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +19 -0
  26. data/app/models/concerns/bulkrax/status_info.rb +4 -4
  27. data/app/parsers/bulkrax/application_parser.rb +59 -84
  28. data/app/parsers/bulkrax/bagit_parser.rb +12 -3
  29. data/app/parsers/bulkrax/csv_parser.rb +117 -62
  30. data/app/parsers/bulkrax/oai_dc_parser.rb +5 -2
  31. data/app/parsers/bulkrax/xml_parser.rb +5 -0
  32. data/app/views/bulkrax/exporters/_form.html.erb +1 -1
  33. data/app/views/bulkrax/exporters/show.html.erb +13 -1
  34. data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +45 -14
  35. data/app/views/bulkrax/importers/edit.html.erb +2 -0
  36. data/app/views/bulkrax/importers/index.html.erb +15 -17
  37. data/app/views/bulkrax/importers/show.html.erb +6 -2
  38. data/config/locales/bulkrax.en.yml +1 -0
  39. data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +5 -1
  40. data/db/migrate/20211004170708_change_bulkrax_statuses_error_message_column_type_to_text.rb +5 -0
  41. data/db/migrate/20211203195233_rename_children_counters_to_relationships.rb +6 -0
  42. data/lib/bulkrax/engine.rb +1 -1
  43. data/lib/bulkrax/version.rb +1 -1
  44. data/lib/bulkrax.rb +9 -17
  45. data/lib/generators/bulkrax/templates/bin/importer +17 -11
  46. data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +3 -1
  47. data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +7 -12
  48. metadata +13 -7
  49. data/app/jobs/bulkrax/child_relationships_job.rb +0 -128
@@ -12,6 +12,8 @@ module Bulkrax
12
12
  raise CollectionsCreatedError unless collections_created?
13
13
  @item = factory.run!
14
14
  end
15
+ parent_jobs if self.parsed_metadata[related_parents_parsed_mapping].present?
16
+ child_jobs if self.parsed_metadata[related_children_parsed_mapping].present?
15
17
  rescue RSolr::Error::Http, CollectionsCreatedError => e
16
18
  raise e
17
19
  rescue StandardError => e
@@ -22,7 +24,19 @@ module Bulkrax
22
24
  return @item
23
25
  end
24
26
 
25
- def find_or_create_collection_ids
27
+ def parent_jobs
28
+ self.parsed_metadata[related_parents_parsed_mapping].each do |parent_identifier|
29
+ CreateRelationshipsJob.perform_later(entry_identifier: self.identifier, parent_identifier: parent_identifier, importer_run: self.last_run)
30
+ end
31
+ end
32
+
33
+ def child_jobs
34
+ self.parsed_metadata[related_children_parsed_mapping].each do |child_identifier|
35
+ CreateRelationshipsJob.perform_later(entry_identifier: self.identifier, child_identifier: child_identifier, importer_run: self.last_run)
36
+ end
37
+ end
38
+
39
+ def find_collection_ids
26
40
  self.collection_ids
27
41
  end
28
42
 
@@ -57,15 +71,27 @@ module Bulkrax
57
71
  end
58
72
 
59
73
  def add_collections
60
- return if find_or_create_collection_ids.blank?
61
- self.parsed_metadata['collections'] = []
62
- self.parsed_metadata['collections'] += find_or_create_collection_ids.map { |c| { id: c } }
74
+ return if find_collection_ids.blank?
75
+
76
+ ActiveSupport::Deprecation.warn(
77
+ 'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
78
+ ' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
79
+ )
80
+ self.parsed_metadata['member_of_collections_attributes'] = {}
81
+ find_collection_ids.each_with_index do |c, i|
82
+ self.parsed_metadata['member_of_collections_attributes'][i.to_s] = { id: c }
83
+ end
63
84
  end
64
85
 
65
86
  def factory
87
+ ActiveSupport::Deprecation.warn(
88
+ 'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
89
+ ' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
90
+ )
66
91
  @factory ||= Bulkrax::ObjectFactory.new(attributes: self.parsed_metadata,
67
92
  source_identifier_value: identifier,
68
93
  work_identifier: parser.work_identifier,
94
+ collection_field_mapping: parser.collection_field_mapping,
69
95
  replace_files: replace_files,
70
96
  user: user,
71
97
  klass: factory_class,
@@ -80,7 +106,11 @@ module Bulkrax
80
106
  else
81
107
  Bulkrax.default_work_type
82
108
  end
83
- fc.constantize
109
+
110
+ # return the name of the collection or work
111
+ fc.tr!(' ', '_')
112
+ fc.downcase! if fc.match?(/[-_]/)
113
+ fc.camelcase.constantize
84
114
  rescue NameError
85
115
  nil
86
116
  rescue
@@ -25,10 +25,29 @@ module Bulkrax
25
25
  if collection
26
26
  current_run.total_collection_entries = index + 1 unless parser.collections_total.positive?
27
27
  else
28
+ # TODO: differentiate between work and collection counts for exporters
28
29
  current_run.total_work_entries = index + 1 unless limit.to_i.positive? || parser.total.positive?
29
30
  end
30
31
  current_run.enqueued_records = index + 1
31
32
  current_run.save!
32
33
  end
34
+
35
+ def keys_without_numbers(keys)
36
+ keys.map { |key| key_without_numbers(key) }
37
+ end
38
+
39
+ def key_without_numbers(key)
40
+ key.gsub(/_\d+/, '').sub(/^\d+_/, '')
41
+ end
42
+
43
+ # Is this a file?
44
+ def file?
45
+ parser_fields&.[]('import_file_path') && File.file?(parser_fields['import_file_path'])
46
+ end
47
+
48
+ # Is this a zip file?
49
+ def zip?
50
+ parser_fields&.[]('import_file_path') && MIME::Types.type_for(parser_fields['import_file_path']).include?('application/zip')
51
+ end
33
52
  end
34
53
  end
@@ -33,13 +33,13 @@ module Bulkrax
33
33
  current_status&.created_at
34
34
  end
35
35
 
36
- def status_info(e = nil)
36
+ def status_info(e = nil, current_run = nil)
37
37
  if e.nil?
38
- self.statuses.create!(status_message: 'Complete', runnable: last_run)
38
+ self.statuses.create!(status_message: 'Complete', runnable: current_run || last_run)
39
39
  elsif e.is_a?(String)
40
- self.statuses.create!(status_message: e, runnable: last_run)
40
+ self.statuses.create!(status_message: e, runnable: current_run || last_run)
41
41
  else
42
- self.statuses.create!(status_message: 'Failed', runnable: last_run, error_class: e.class.to_s, error_message: e.message, error_backtrace: e.backtrace)
42
+ self.statuses.create!(status_message: 'Failed', runnable: current_run || last_run, error_class: e.class.to_s, error_message: e.message, error_backtrace: e.backtrace)
43
43
  end
44
44
  end
45
45
 
@@ -1,15 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bulkrax
4
- class ApplicationParser
5
- attr_accessor :importerexporter
4
+ class ApplicationParser # rubocop:disable Metrics/ClassLength
5
+ attr_accessor :importerexporter, :headers
6
6
  alias importer importerexporter
7
7
  alias exporter importerexporter
8
- delegate :only_updates, :limit, :current_run, :errors,
9
- :seen, :increment_counters, :parser_fields, :user,
10
- :exporter_export_path, :exporter_export_zip_path, :importer_unzip_path, :validate_only,
11
- :status, :status_info, :status_at,
12
- to: :importerexporter
8
+ delegate :only_updates, :limit, :current_run, :errors, :mapping,
9
+ :seen, :increment_counters, :parser_fields, :user, :keys_without_numbers,
10
+ :key_without_numbers, :status, :status_info, :status_at,
11
+ :exporter_export_path, :exporter_export_zip_path, :importer_unzip_path, :validate_only,
12
+ to: :importerexporter
13
13
 
14
14
  def self.parser_fields
15
15
  {}
@@ -25,6 +25,7 @@ module Bulkrax
25
25
 
26
26
  def initialize(importerexporter)
27
27
  @importerexporter = importerexporter
28
+ @headers = []
28
29
  end
29
30
 
30
31
  # @api
@@ -43,20 +44,54 @@ module Bulkrax
43
44
  end
44
45
 
45
46
  def source_identifier
46
- @source_identifier ||= identifier_hash.values.first&.[]("from")&.first&.to_sym || :source_identifier
47
+ @source_identifier ||= get_field_mapping_hash_for('source_identifier')&.values&.first&.[]('from')&.first&.to_sym || :source_identifier
47
48
  end
48
49
 
49
50
  def work_identifier
50
- @work_identifier ||= identifier_hash.keys.first&.to_sym || :source
51
+ @work_identifier ||= get_field_mapping_hash_for('source_identifier')&.keys&.first&.to_sym || :source
51
52
  end
52
53
 
53
- def identifier_hash
54
- @identifier_hash ||= importerexporter.mapping.select do |_, h|
55
- h.key?("source_identifier")
56
- end
57
- raise StandardError, "more than one source_identifier declared: #{@identifier_hash.keys.join(', ')}" if @identifier_hash.length > 1
54
+ def related_parents_raw_mapping
55
+ @related_parents_raw_mapping ||= get_field_mapping_hash_for('related_parents_field_mapping')&.values&.first&.[]('from')&.first
56
+ end
57
+
58
+ def related_parents_parsed_mapping
59
+ @related_parents_parsed_mapping ||= get_field_mapping_hash_for('related_parents_field_mapping')&.keys&.first
60
+ end
61
+
62
+ def related_children_raw_mapping
63
+ @related_children_raw_mapping ||= get_field_mapping_hash_for('related_children_field_mapping')&.values&.first&.[]('from')&.first
64
+ end
65
+
66
+ def related_children_parsed_mapping
67
+ @related_children_parsed_mapping ||= get_field_mapping_hash_for('related_children_field_mapping')&.keys&.first
68
+ end
69
+
70
+ def get_field_mapping_hash_for(key)
71
+ return instance_variable_get("@#{key}_hash") if instance_variable_get("@#{key}_hash").present?
72
+
73
+ instance_variable_set(
74
+ "@#{key}_hash",
75
+ importerexporter.mapping.with_indifferent_access.select { |_, h| h.key?(key) }
76
+ )
77
+ raise StandardError, "more than one #{key} declared: #{instance_variable_get("@#{key}_hash").keys.join(', ')}" if instance_variable_get("@#{key}_hash").length > 1
58
78
 
59
- @identifier_hash
79
+ instance_variable_get("@#{key}_hash")
80
+ end
81
+
82
+ def collection_field_mapping
83
+ ActiveSupport::Deprecation.warn(
84
+ 'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
85
+ ' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
86
+ )
87
+ Bulkrax.collection_field_mapping[self.entry_class.to_s]&.to_sym || :collection
88
+ end
89
+
90
+ def model_field_mappings
91
+ model_mappings = Bulkrax.field_mappings[self.class.to_s]&.dig('model', :from) || []
92
+ model_mappings |= ['model']
93
+
94
+ model_mappings
60
95
  end
61
96
 
62
97
  def perform_method
@@ -91,76 +126,19 @@ module Bulkrax
91
126
  path
92
127
  end
93
128
 
129
+ # Base path for imported and exported files
130
+ def base_path(type = 'import')
131
+ ENV['HYKU_MULTITENANT'] ? File.join(Bulkrax.send("#{type}_path"), Site.instance.account.name) : Bulkrax.send("#{type}_path")
132
+ end
133
+
94
134
  # Path where we'll store the import metadata and files
95
135
  # this is used for uploaded and cloud files
96
136
  def path_for_import
97
- @path_for_import = File.join(Bulkrax.import_path, importerexporter.path_string)
137
+ @path_for_import = File.join(base_path, importerexporter.path_string)
98
138
  FileUtils.mkdir_p(@path_for_import) unless File.exist?(@path_for_import)
99
139
  @path_for_import
100
140
  end
101
141
 
102
- # Optional, only used by certain parsers
103
- # Other parsers should override with a custom or empty method
104
- # Will be skipped unless the #record is a Hash
105
- def create_parent_child_relationships
106
- parents.each do |key, value|
107
- parent = entry_class.where(
108
- identifier: key,
109
- importerexporter_id: importerexporter.id,
110
- importerexporter_type: 'Bulkrax::Importer'
111
- ).first
112
-
113
- # not finding the entries here indicates that the given identifiers are incorrect
114
- # in that case we should log that
115
- children = value.map do |child|
116
- entry_class.where(
117
- identifier: child,
118
- importerexporter_id: importerexporter.id,
119
- importerexporter_type: 'Bulkrax::Importer'
120
- ).first
121
- end.compact.uniq
122
-
123
- if parent.present? && (children.length != value.length)
124
- # Increment the failures for the number we couldn't find
125
- # Because all of our entries have been created by now, if we can't find them, the data is wrong
126
- Rails.logger.error("Expected #{value.length} children for parent entry #{parent.id}, found #{children.length}")
127
- break if children.empty?
128
- Rails.logger.warn("Adding #{children.length} children to parent entry #{parent.id} (expected #{value.length})")
129
- end
130
- parent_id = parent.id
131
- child_entry_ids = children.map(&:id)
132
- ChildRelationshipsJob.perform_later(parent_id, child_entry_ids, current_run.id)
133
- end
134
- rescue StandardError => e
135
- status_info(e)
136
- end
137
-
138
- def parents
139
- @parents ||= setup_parents
140
- end
141
-
142
- def setup_parents
143
- pts = []
144
- records.each do |record|
145
- r = if record.respond_to?(:to_h)
146
- record.to_h
147
- else
148
- record
149
- end
150
- next unless r.is_a?(Hash)
151
- children = if r[:children].is_a?(String)
152
- r[:children].split(/\s*[:;|]\s*/)
153
- else
154
- r[:children]
155
- end
156
- next if children.blank?
157
- pts << {
158
- r[source_identifier] => children
159
- }
160
- end
161
- pts.blank? ? pts : pts.inject(:merge)
162
- end
163
-
164
142
  def setup_export_file
165
143
  raise StandardError, 'must be defined' if exporter?
166
144
  end
@@ -288,12 +266,9 @@ module Bulkrax
288
266
  private
289
267
 
290
268
  def real_import_file_path
291
- if file? && zip?
292
- unzip(parser_fields['import_file_path'])
293
- return importer_unzip_path
294
- else
295
- parser_fields['import_file_path']
296
- end
269
+ return importer_unzip_path if file? && zip?
270
+
271
+ parser_fields['import_file_path']
297
272
  end
298
273
  end
299
274
  end
@@ -40,7 +40,7 @@ module Bulkrax
40
40
  raise StandardError, 'No metadata files were found' if path.blank?
41
41
  data = entry_class.read_data(path)
42
42
  data = entry_class.data_for_entry(data, source_identifier)
43
- data[:file] = bag.bag_files.join('|')
43
+ data[:file] = bag.bag_files.join('|') unless importerexporter.metadata_only?
44
44
  data
45
45
  end
46
46
  end
@@ -58,7 +58,7 @@ module Bulkrax
58
58
  collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
59
59
  }
60
60
  new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
61
- ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
61
+ ImportCollectionJob.perform_now(new_entry.id, current_run.id)
62
62
  increment_counters(index, true)
63
63
  end
64
64
  end
@@ -83,13 +83,22 @@ module Bulkrax
83
83
  end
84
84
 
85
85
  def collections
86
- records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
86
+ ActiveSupport::Deprecation.warn(
87
+ 'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
88
+ ' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
89
+ )
90
+ records.map { |r| r[collection_field_mapping].split(/\s*[;|]\s*/) if r[collection_field_mapping].present? }.flatten.compact.uniq
87
91
  end
88
92
 
89
93
  def collections_total
90
94
  collections.size
91
95
  end
92
96
 
97
+ # TODO: change to differentiate between collection and work records when adding ability to import collection metadata
98
+ def works_total
99
+ total
100
+ end
101
+
93
102
  def total
94
103
  metadata_paths.count
95
104
  end
@@ -2,31 +2,47 @@
2
2
 
3
3
  require 'csv'
4
4
  module Bulkrax
5
- class CsvParser < ApplicationParser
5
+ class CsvParser < ApplicationParser # rubocop:disable Metrics/ClassLength
6
6
  include ErroredEntries
7
7
  def self.export_supported?
8
8
  true
9
9
  end
10
10
 
11
- def initialize(importerexporter)
12
- @importerexporter = importerexporter
11
+ def records(_opts = {})
12
+ file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
13
+ # data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
14
+ csv_data = entry_class.read_data(file_for_import)
15
+ importer.parser_fields['total'] = csv_data.count
16
+ importer.save
17
+ @records ||= csv_data.map { |record_data| entry_class.data_for_entry(record_data, nil) }
13
18
  end
14
19
 
15
20
  def collections
16
- # does the CSV contain a collection column?
17
- return [] unless import_fields.include?(:collection)
21
+ ActiveSupport::Deprecation.warn(
22
+ 'Creating Collections using the collection_field_mapping will no longer be supported as of Bulkrax version 3.0.' \
23
+ ' Please configure Bulkrax to use related_parents_field_mapping and related_children_field_mapping instead.'
24
+ )
18
25
  # retrieve a list of unique collections
19
- records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
26
+ records.map do |r|
27
+ collections = []
28
+ r[collection_field_mapping].split(/\s*[;|]\s*/).each { |title| collections << { title: title } } if r[collection_field_mapping].present?
29
+ model_field_mappings.each do |model_mapping|
30
+ collections << r if r[model_mapping.to_sym]&.downcase == 'collection'
31
+ end
32
+ collections
33
+ end.flatten.compact.uniq
20
34
  end
21
35
 
22
36
  def collections_total
23
37
  collections.size
24
38
  end
25
39
 
26
- def records(_opts = {})
27
- file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
28
- # data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
29
- @records ||= entry_class.read_data(file_for_import).map { |record_data| entry_class.data_for_entry(record_data, nil) }
40
+ def works
41
+ records - collections
42
+ end
43
+
44
+ def works_total
45
+ works.size
30
46
  end
31
47
 
32
48
  # We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
@@ -44,8 +60,9 @@ module Bulkrax
44
60
  end
45
61
 
46
62
  def valid_import?
47
- error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_fields).join(', ')}"
48
- raise StandardError, error_alert unless required_elements?(import_fields)
63
+ import_strings = keys_without_numbers(import_fields.map(&:to_s))
64
+ error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_strings).join(', ')}"
65
+ raise StandardError, error_alert unless required_elements?(import_strings)
49
66
 
50
67
  file_paths.is_a?(Array)
51
68
  rescue StandardError => e
@@ -56,26 +73,26 @@ module Bulkrax
56
73
  def create_collections
57
74
  collections.each_with_index do |collection, index|
58
75
  next if collection.blank?
59
- metadata = {
60
- title: [collection],
61
- work_identifier => [collection],
62
- visibility: 'open',
63
- collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
64
- }
65
- new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
66
- ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
76
+ break if records.find_index(collection).present? && limit_reached?(limit, records.find_index(collection))
77
+
78
+ new_entry = find_or_create_entry(collection_entry_class, unique_collection_identifier(collection), 'Bulkrax::Importer', collection.to_h)
79
+ # TODO: add support for :delete option
80
+ ImportCollectionJob.perform_now(new_entry.id, current_run.id)
67
81
  increment_counters(index, true)
68
82
  end
83
+ importer.record_status
84
+ rescue StandardError => e
85
+ status_info(e)
69
86
  end
70
87
 
71
88
  def create_works
72
- records.each_with_index do |record, index|
73
- next unless record_has_source_identifier(record, index)
74
- break if limit_reached?(limit, index)
89
+ works.each_with_index do |work, index|
90
+ next unless record_has_source_identifier(work, records.find_index(work))
91
+ break if limit_reached?(limit, records.find_index(work))
75
92
 
76
- seen[record[source_identifier]] = true
77
- new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record.to_h.compact)
78
- if record[:delete].present?
93
+ seen[work[source_identifier]] = true
94
+ new_entry = find_or_create_entry(entry_class, work[source_identifier], 'Bulkrax::Importer', work.to_h)
95
+ if work[:delete].present?
79
96
  DeleteWorkJob.send(perform_method, new_entry, current_run)
80
97
  else
81
98
  ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
@@ -99,10 +116,6 @@ module Bulkrax
99
116
  path
100
117
  end
101
118
 
102
- def create_parent_child_relationships
103
- super
104
- end
105
-
106
119
  def extra_filters
107
120
  output = ""
108
121
  if importerexporter.start_date.present?
@@ -117,6 +130,8 @@ module Bulkrax
117
130
 
118
131
  def current_work_ids
119
132
  case importerexporter.export_from
133
+ when 'all'
134
+ ActiveFedora::SolrService.query("has_model_ssim:(#{Hyrax.config.curation_concerns.join(' OR ')}) #{extra_filters}", rows: 2_147_483_647).map(&:id)
120
135
  when 'collection'
121
136
  ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
122
137
  when 'worktype'
@@ -126,9 +141,16 @@ module Bulkrax
126
141
  complete_statuses = Bulkrax::Status.latest_by_statusable
127
142
  .includes(:statusable)
128
143
  .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
129
- complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }
130
144
 
131
- ActiveFedora::SolrService.query("#{work_identifier}_tesim:(#{complete_entry_identifiers.join(' OR ')})#{extra_filters}", rows: 2_000_000_000).map(&:id)
145
+ complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier&.gsub(':', '\:') }
146
+ extra_filters = extra_filters.presence || '*:*'
147
+
148
+ ActiveFedora::SolrService.get(
149
+ extra_filters.to_s,
150
+ fq: "#{work_identifier}_sim:(#{complete_entry_identifiers.join(' OR ')})",
151
+ fl: 'id',
152
+ rows: 2_000_000_000
153
+ )['response']['docs'].map { |obj| obj['id'] }
132
154
  end
133
155
  end
134
156
 
@@ -136,12 +158,18 @@ module Bulkrax
136
158
  current_work_ids.each_with_index do |wid, index|
137
159
  break if limit_reached?(limit, index)
138
160
  new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
139
- Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
161
+ begin
162
+ entry = Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
163
+ rescue => e
164
+ Rails.logger.info("#{e.message} was detected during export")
165
+ end
166
+ self.headers |= entry.parsed_metadata.keys if entry
140
167
  end
141
168
  end
142
169
  alias create_from_collection create_new_entries
143
170
  alias create_from_importer create_new_entries
144
171
  alias create_from_worktype create_new_entries
172
+ alias create_from_all create_new_entries
145
173
 
146
174
  def entry_class
147
175
  CsvEntry
@@ -154,19 +182,11 @@ module Bulkrax
154
182
  # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
155
183
  # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
156
184
  def total
157
- if importer?
158
- return @total if @total&.positive?
159
- # windows enocded
160
- @total = `grep -c ^M #{real_import_file_path}`.to_i - 1
161
- # unix encoded
162
- @total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
163
- elsif exporter?
164
- @total = importerexporter.entries.count
165
- else
166
- @total = 0
167
- end
168
- return @total
169
- rescue StandardErrorr
185
+ @total = importer.parser_fields['total'] || 0 if importer?
186
+ @total = importerexporter.entries.count if exporter?
187
+
188
+ return @total || 0
189
+ rescue StandardError
170
190
  @total = 0
171
191
  end
172
192
 
@@ -201,31 +221,58 @@ module Bulkrax
201
221
  end
202
222
  end
203
223
 
204
- def key_allowed(key)
205
- !Bulkrax.reserved_properties.include?(key) &&
206
- new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
224
+ def export_key_allowed(key)
225
+ new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
207
226
  key != source_identifier.to_s
208
227
  end
209
228
 
210
229
  # All possible column names
211
230
  def export_headers
212
- headers = ['id']
213
- headers << source_identifier.to_s
214
- headers << 'model'
215
- importerexporter.mapping.each_key { |key| headers << key if key_allowed(key) }
216
- headers << 'file'
231
+ headers = sort_headers(self.headers)
232
+
233
+ # we don't want access_control_id exported and we want file at the end
234
+ headers.delete('access_control_id') if headers.include?('access_control_id')
235
+
236
+ # add the headers below at the beginning or end to maintain the preexisting export behavior
237
+ headers.prepend('model')
238
+ headers.prepend(source_identifier.to_s)
239
+ headers.prepend('id')
240
+
217
241
  headers.uniq
218
242
  end
219
243
 
244
+ def object_names
245
+ return @object_names if @object_names
246
+
247
+ @object_names = mapping.values.map { |value| value['object'] }
248
+ @object_names.uniq!.delete(nil)
249
+
250
+ @object_names
251
+ end
252
+
253
+ def sort_headers(headers)
254
+ # converting headers like creator_name_1 to creator_1_name so they get sorted by numerical order
255
+ # while keeping objects grouped together
256
+ headers.sort_by do |item|
257
+ number = item.match(/\d+/)&.[](0) || 0.to_s
258
+ sort_number = number.rjust(4, "0")
259
+ object_prefix = object_names.detect { |o| item.match(/^#{o}/) } || item
260
+ remainder = item.gsub(/^#{object_prefix}_/, '').gsub(/_#{number}/, '')
261
+ "#{object_prefix}_#{sort_number}_#{remainder}"
262
+ end
263
+ end
264
+
220
265
  # in the parser as it is specific to the format
221
266
  def setup_export_file
222
- File.join(importerexporter.exporter_export_path, 'export.csv')
267
+ File.join(importerexporter.exporter_export_path, "export_#{importerexporter.export_source}_from_#{importerexporter.export_from}.csv")
223
268
  end
224
269
 
225
270
  # Retrieve file paths for [:file] mapping in records
226
271
  # and check all listed files exist.
227
272
  def file_paths
228
273
  raise StandardError, 'No records were found' if records.blank?
274
+ return [] if importerexporter.metadata_only?
275
+
229
276
  @file_paths ||= records.map do |r|
230
277
  file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
231
278
  next if r[file_mapping].blank?
@@ -244,23 +291,31 @@ module Bulkrax
244
291
  # Retrieve the path where we expect to find the files
245
292
  def path_to_files
246
293
  @path_to_files ||= File.join(
247
- File.file?(import_file_path) ? File.dirname(import_file_path) : import_file_path,
294
+ zip? ? importer_unzip_path : File.dirname(import_file_path),
248
295
  'files'
249
296
  )
250
297
  end
251
298
 
252
299
  private
253
300
 
301
+ def unique_collection_identifier(collection_hash)
302
+ entry_uid = collection_hash[source_identifier]
303
+ entry_uid ||= if Bulkrax.fill_in_blank_source_identifiers.present?
304
+ Bulkrax.fill_in_blank_source_identifiers.call(self, records.find_index(collection_hash))
305
+ else
306
+ collection_hash[:title].split(/\s*[;|]\s*/).first
307
+ end
308
+
309
+ entry_uid
310
+ end
311
+
254
312
  # Override to return the first CSV in the path, if a zip file is supplied
255
313
  # We expect a single CSV at the top level of the zip in the CSVParser
256
314
  # but we are willing to go look for it if need be
257
315
  def real_import_file_path
258
- if file? && zip?
259
- unzip(parser_fields['import_file_path'])
260
- return Dir["#{importer_unzip_path}/**/*.csv"].first
261
- else
262
- parser_fields['import_file_path']
263
- end
316
+ return Dir["#{importer_unzip_path}/**/*.csv"].first if file? && zip?
317
+
318
+ parser_fields['import_file_path']
264
319
  end
265
320
  end
266
321
  end
@@ -75,7 +75,7 @@ module Bulkrax
75
75
 
76
76
  new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
77
77
  # perform now to ensure this gets created before work imports start
78
- ImportWorkCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
78
+ ImportCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
79
79
  increment_counters(index, true)
80
80
  end
81
81
  end
@@ -119,7 +119,10 @@ module Bulkrax
119
119
  end
120
120
  end
121
121
 
122
- def create_parent_child_relationships; end
122
+ # TODO: change to differentiate between collection and work records when adding ability to import collection metadata
123
+ def works_total
124
+ total
125
+ end
123
126
 
124
127
  def total
125
128
  @total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i