bulkrax 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. checksums.yaml +7 -0
  2. data/LICENSE +205 -0
  3. data/README.md +202 -0
  4. data/Rakefile +42 -0
  5. data/app/assets/config/bulkrax_manifest.js +2 -0
  6. data/app/assets/javascripts/bulkrax/application.js +14 -0
  7. data/app/assets/javascripts/bulkrax/bulkrax.js +11 -0
  8. data/app/assets/javascripts/bulkrax/entries.js +15 -0
  9. data/app/assets/javascripts/bulkrax/exporters.js +60 -0
  10. data/app/assets/javascripts/bulkrax/importers.js.erb +166 -0
  11. data/app/assets/stylesheets/bulkrax/accordion.scss +40 -0
  12. data/app/assets/stylesheets/bulkrax/application.css +15 -0
  13. data/app/assets/stylesheets/bulkrax/coderay.scss +264 -0
  14. data/app/assets/stylesheets/bulkrax/import_export.scss +37 -0
  15. data/app/controllers/bulkrax/application_controller.rb +8 -0
  16. data/app/controllers/bulkrax/entries_controller.rb +44 -0
  17. data/app/controllers/bulkrax/exporters_controller.rb +125 -0
  18. data/app/controllers/bulkrax/importers_controller.rb +315 -0
  19. data/app/controllers/concerns/bulkrax/api.rb +29 -0
  20. data/app/factories/bulkrax/object_factory.rb +230 -0
  21. data/app/helpers/bulkrax/application_helper.rb +15 -0
  22. data/app/helpers/bulkrax/exporters_helper.rb +6 -0
  23. data/app/helpers/bulkrax/importers_helper.rb +13 -0
  24. data/app/helpers/bulkrax/validation_helper.rb +153 -0
  25. data/app/jobs/bulkrax/application_job.rb +6 -0
  26. data/app/jobs/bulkrax/child_relationships_job.rb +128 -0
  27. data/app/jobs/bulkrax/delete_work_job.rb +16 -0
  28. data/app/jobs/bulkrax/download_cloud_file_job.rb +18 -0
  29. data/app/jobs/bulkrax/export_work_job.rb +37 -0
  30. data/app/jobs/bulkrax/exporter_job.rb +14 -0
  31. data/app/jobs/bulkrax/import_work_collection_job.rb +41 -0
  32. data/app/jobs/bulkrax/import_work_job.rb +32 -0
  33. data/app/jobs/bulkrax/importer_job.rb +26 -0
  34. data/app/mailers/bulkrax/application_mailer.rb +8 -0
  35. data/app/matchers/bulkrax/application_matcher.rb +113 -0
  36. data/app/matchers/bulkrax/bagit_matcher.rb +6 -0
  37. data/app/matchers/bulkrax/csv_matcher.rb +6 -0
  38. data/app/matchers/bulkrax/oai_matcher.rb +6 -0
  39. data/app/models/bulkrax/application_record.rb +7 -0
  40. data/app/models/bulkrax/csv_collection_entry.rb +19 -0
  41. data/app/models/bulkrax/csv_entry.rb +163 -0
  42. data/app/models/bulkrax/entry.rb +104 -0
  43. data/app/models/bulkrax/exporter.rb +122 -0
  44. data/app/models/bulkrax/exporter_run.rb +7 -0
  45. data/app/models/bulkrax/import_failed.rb +13 -0
  46. data/app/models/bulkrax/importer.rb +155 -0
  47. data/app/models/bulkrax/importer_run.rb +8 -0
  48. data/app/models/bulkrax/oai_dc_entry.rb +6 -0
  49. data/app/models/bulkrax/oai_entry.rb +74 -0
  50. data/app/models/bulkrax/oai_qualified_dc_entry.rb +6 -0
  51. data/app/models/bulkrax/oai_set_entry.rb +19 -0
  52. data/app/models/bulkrax/rdf_collection_entry.rb +19 -0
  53. data/app/models/bulkrax/rdf_entry.rb +90 -0
  54. data/app/models/bulkrax/status.rb +25 -0
  55. data/app/models/bulkrax/xml_entry.rb +73 -0
  56. data/app/models/concerns/bulkrax/download_behavior.rb +61 -0
  57. data/app/models/concerns/bulkrax/errored_entries.rb +45 -0
  58. data/app/models/concerns/bulkrax/export_behavior.rb +58 -0
  59. data/app/models/concerns/bulkrax/file_factory.rb +140 -0
  60. data/app/models/concerns/bulkrax/has_local_processing.rb +7 -0
  61. data/app/models/concerns/bulkrax/has_matchers.rb +155 -0
  62. data/app/models/concerns/bulkrax/import_behavior.rb +90 -0
  63. data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +34 -0
  64. data/app/models/concerns/bulkrax/status_info.rb +56 -0
  65. data/app/parsers/bulkrax/application_parser.rb +299 -0
  66. data/app/parsers/bulkrax/bagit_parser.rb +157 -0
  67. data/app/parsers/bulkrax/csv_parser.rb +266 -0
  68. data/app/parsers/bulkrax/oai_dc_parser.rb +130 -0
  69. data/app/parsers/bulkrax/oai_qualified_dc_parser.rb +9 -0
  70. data/app/parsers/bulkrax/xml_parser.rb +103 -0
  71. data/app/views/bulkrax/entries/_parsed_metadata.html.erb +19 -0
  72. data/app/views/bulkrax/entries/_raw_metadata.html.erb +19 -0
  73. data/app/views/bulkrax/entries/show.html.erb +63 -0
  74. data/app/views/bulkrax/exporters/_form.html.erb +120 -0
  75. data/app/views/bulkrax/exporters/edit.html.erb +23 -0
  76. data/app/views/bulkrax/exporters/index.html.erb +67 -0
  77. data/app/views/bulkrax/exporters/new.html.erb +23 -0
  78. data/app/views/bulkrax/exporters/show.html.erb +124 -0
  79. data/app/views/bulkrax/importers/_bagit_fields.html.erb +54 -0
  80. data/app/views/bulkrax/importers/_browse_everything.html.erb +12 -0
  81. data/app/views/bulkrax/importers/_csv_fields.html.erb +39 -0
  82. data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +16 -0
  83. data/app/views/bulkrax/importers/_form.html.erb +35 -0
  84. data/app/views/bulkrax/importers/_oai_fields.html.erb +42 -0
  85. data/app/views/bulkrax/importers/_xml_fields.html.erb +60 -0
  86. data/app/views/bulkrax/importers/edit.html.erb +20 -0
  87. data/app/views/bulkrax/importers/index.html.erb +77 -0
  88. data/app/views/bulkrax/importers/new.html.erb +25 -0
  89. data/app/views/bulkrax/importers/show.html.erb +175 -0
  90. data/app/views/bulkrax/importers/upload_corrected_entries.html.erb +37 -0
  91. data/app/views/bulkrax/shared/_bulkrax_errors.html.erb +52 -0
  92. data/app/views/bulkrax/shared/_bulkrax_field_mapping.html.erb +39 -0
  93. data/app/views/hyrax/dashboard/sidebar/_bulkrax_sidebar_additions.html.erb +6 -0
  94. data/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb +19 -0
  95. data/app/views/layouts/bulkrax/application.html.erb +14 -0
  96. data/config/locales/bulkrax.en.yml +36 -0
  97. data/config/routes.rb +18 -0
  98. data/db/migrate/20181011230201_create_bulkrax_importers.rb +18 -0
  99. data/db/migrate/20181011230228_create_bulkrax_importer_runs.rb +16 -0
  100. data/db/migrate/20190325183136_create_bulkrax_entries.rb +16 -0
  101. data/db/migrate/20190601221109_add_status_to_entry.rb +9 -0
  102. data/db/migrate/20190715161939_add_collections_to_importer_runs.rb +6 -0
  103. data/db/migrate/20190715162044_change_collection_ids_on_entries.rb +5 -0
  104. data/db/migrate/20190729124607_create_bulkrax_exporters.rb +19 -0
  105. data/db/migrate/20190729134158_create_bulkrax_exporter_runs.rb +14 -0
  106. data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +12 -0
  107. data/db/migrate/20191203225129_add_total_collection_records_to_importer_runs.rb +5 -0
  108. data/db/migrate/20191204191623_add_children_to_importer_runs.rb +6 -0
  109. data/db/migrate/20191204223857_change_total_records_to_total_work_entries.rb +6 -0
  110. data/db/migrate/20191212155530_change_entry_last_error.rb +19 -0
  111. data/db/migrate/20200108194557_add_validate_only_to_bulkrax_importers.rb +5 -0
  112. data/db/migrate/20200301232856_add_status_to_importers.rb +9 -0
  113. data/db/migrate/20200312190638_remove_foreign_key_from_bulkrax_entries.rb +5 -0
  114. data/db/migrate/20200326235838_add_status_to_exporters.rb +7 -0
  115. data/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb +5 -0
  116. data/db/migrate/20200818055819_create_bulkrax_statuses.rb +18 -0
  117. data/db/migrate/20200819054016_move_to_statuses.rb +30 -0
  118. data/db/migrate/20201106014204_add_date_filter_and_status_to_bulkrax_exporters.rb +7 -0
  119. data/db/migrate/20201117220007_add_workflow_status_to_bulkrax_exporter.rb +5 -0
  120. data/db/migrate/20210806044408_remove_unused_last_error.rb +7 -0
  121. data/db/migrate/20210806065737_increase_text_sizes.rb +12 -0
  122. data/lib/bulkrax.rb +161 -0
  123. data/lib/bulkrax/engine.rb +37 -0
  124. data/lib/bulkrax/version.rb +5 -0
  125. data/lib/generators/bulkrax/install_generator.rb +80 -0
  126. data/lib/generators/bulkrax/templates/README +3 -0
  127. data/lib/generators/bulkrax/templates/app/assets/images/bulkrax/removed.png +0 -0
  128. data/lib/generators/bulkrax/templates/app/models/concerns/bulkrax/has_local_processing.rb +8 -0
  129. data/lib/generators/bulkrax/templates/bin/importer +140 -0
  130. data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +84 -0
  131. data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +72 -0
  132. data/lib/tasks/bulkrax_tasks.rake +6 -0
  133. metadata +388 -0
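
The full source for three of the new parsers follows. For orientation, here is a minimal installation sketch for a Hyrax host application; the generator name matches lib/generators/bulkrax/install_generator.rb above, while the version constraint and migration steps are assumptions about a typical Rails engine setup:

# Gemfile of the host Hyrax application (constraint assumed)
gem 'bulkrax', '~> 1.0'

# Then, from the application root:
#   bundle install
#   rails generate bulkrax:install
#   rails bulkrax:install:migrations   # standard Rails engine task, if the generator does not copy migrations
#   rails db:migrate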
data/app/parsers/bulkrax/bagit_parser.rb
@@ -0,0 +1,157 @@
+ # frozen_string_literal: true
+
+ module Bulkrax
+   class BagitParser < ApplicationParser
+     def self.export_supported?
+       false # @todo will be supported
+     end
+
+     def valid_import?
+       return true if import_fields.present?
+     rescue => e
+       status_info(e)
+       false
+     end
+
+     def entry_class
+       parser_fields['metadata_format'].constantize
+     end
+
+     def collection_entry_class
+       parser_fields['metadata_format'].gsub('Entry', 'CollectionEntry').constantize
+     rescue
+       Entry
+     end
+
+     # Take a random sample of 10 metadata_paths and work out the import fields from that
+     def import_fields
+       raise StandardError, 'No metadata files were found' if metadata_paths.blank?
+       @import_fields ||= metadata_paths.sample(10).map do |path|
+         entry_class.fields_from_data(entry_class.read_data(path))
+       end.flatten.compact.uniq
+     end
+
+     # Assume a single metadata record per path
+     # Create an Array of all metadata records, one per file
+     def records(_opts = {})
+       raise StandardError, 'No BagIt records were found' if bags.blank?
+       @records ||= bags.map do |bag|
+         path = metadata_path(bag)
+         raise StandardError, 'No metadata files were found' if path.blank?
+         data = entry_class.read_data(path)
+         data = entry_class.data_for_entry(data, source_identifier)
+         data[:file] = bag.bag_files.join('|')
+         data
+       end
+     end
+
+     # Find or create collections referenced by works
+     # If the import data also contains records for these works, they will be updated
+     # during create_works
+     def create_collections
+       collections.each_with_index do |collection, index|
+         next if collection.blank?
+         metadata = {
+           title: [collection],
+           work_identifier => [collection],
+           visibility: 'open',
+           collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+         }
+         new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
+         ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
+         increment_counters(index, true)
+       end
+     end
+
+     def create_works
+       records.each_with_index do |record, index|
+         next unless record_has_source_identifier(record, index)
+         break if limit_reached?(limit, index)
+
+         seen[record[source_identifier]] = true
+         new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record)
+         if record[:delete].present?
+           DeleteWorkJob.send(perform_method, new_entry, current_run)
+         else
+           ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
+         end
+         increment_counters(index)
+       end
+       importer.record_status
+     rescue StandardError => e
+       status_info(e)
+     end
+
+     def collections
+       records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
+     end
+
+     def collections_total
+       collections.size
+     end
+
+     def total
+       metadata_paths.count
+     end
+
+     def required_elements?(keys)
+       return if keys.blank?
+       !required_elements.map { |el| keys.map(&:to_s).include?(el) }.include?(false)
+     end
+
+     # @todo - investigate getting directory structure
+     # @todo - investigate using perform_later, and having the importer check for
+     #   DownloadCloudFileJob before it starts
+     def retrieve_cloud_files(files)
+       # There should only be one zip file for BagIt; take the first
+       return if files['0'].blank?
+       target_file = File.join(path_for_import, files['0']['file_name'].tr(' ', '_'))
+       # Download now so the file is in place before the importer runs
+       Bulkrax::DownloadCloudFileJob.perform_now(files['0'], target_file)
+       return target_file
+     end
+
+     private
+
+     def bags
+       return @bags if @bags.present?
+       new_bag = bag(import_file_path)
+       @bags = if new_bag
+                 [new_bag]
+               else
+                 Dir.glob("#{import_file_path}/**/*").map { |d| bag(d) }
+               end
+       @bags.delete(nil)
+       raise StandardError, 'No valid bags found' if @bags.blank?
+       return @bags
+     end
+
+     # Gather the paths to all bags; skip any stray files
+     def bag_paths
+       bags.map(&:bag_dir)
+     end
+
+     def metadata_file_name
+       raise StandardError, 'The metadata file name must be specified' if parser_fields['metadata_file_name'].blank?
+       parser_fields['metadata_file_name']
+     end
+
+     # Gather the paths to all metadata files matching the metadata_file_name
+     def metadata_paths
+       @metadata_paths ||= bag_paths.map do |b|
+         Dir.glob("#{b}/**/*").select { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
+       end.flatten.compact
+     end
+
+     def metadata_path(bag)
+       Dir.glob("#{bag.bag_dir}/**/*").detect { |f| File.file?(f) && f.ends_with?(metadata_file_name) }
+     end
+
+     def bag(path)
+       return nil unless path && File.exist?(File.join(path, 'bagit.txt'))
+       bag = BagIt::Bag.new(path)
+       return nil unless bag.valid?
+       bag
+     end
+   end
+ end
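
A minimal sketch of driving the BagitParser above from the Rails console. The parser_fields keys are the ones the parser reads (import_file_path, metadata_format, metadata_file_name); the remaining Importer attributes and all values are illustrative assumptions:

# Sketch only -- attribute values are assumptions, not documented defaults
importer = Bulkrax::Importer.new(
  name: 'Bag import',
  user: User.find_by(email: 'admin@example.com'),      # hypothetical user
  parser_klass: 'Bulkrax::BagitParser',
  parser_fields: {
    'import_file_path'   => '/data/import/bags',       # a single bag, or a directory of bags
    'metadata_format'    => 'Bulkrax::RdfEntry',       # constantized by entry_class above
    'metadata_file_name' => 'metadata.nt'              # matched against files inside each bag
  }
)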
data/app/parsers/bulkrax/csv_parser.rb
@@ -0,0 +1,266 @@
+ # frozen_string_literal: true
+
+ require 'csv'
+ module Bulkrax
+   class CsvParser < ApplicationParser
+     include ErroredEntries
+     def self.export_supported?
+       true
+     end
+
+     def initialize(importerexporter)
+       @importerexporter = importerexporter
+     end
+
+     def collections
+       # Does the CSV contain a collection column?
+       return [] unless import_fields.include?(:collection)
+       # Retrieve a list of unique collections
+       records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
+     end
+
+     def collections_total
+       collections.size
+     end
+
+     def records(_opts = {})
+       file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
+       # Data for an entry does not need a source_identifier for CSV, because CSVs are read sequentially and mapped after the raw data is read.
+       @records ||= entry_class.read_data(file_for_import).map { |record_data| entry_class.data_for_entry(record_data, nil) }
+     end
+
+     # We could use CsvEntry#fields_from_data(data), but that would mean re-reading the data
+     def import_fields
+       @import_fields ||= records.inject(:merge).keys.compact.uniq
+     end
+
+     def required_elements?(keys)
+       return if keys.blank?
+       missing_elements(keys).blank?
+     end
+
+     def missing_elements(keys)
+       required_elements.map(&:to_s) - keys.map(&:to_s)
+     end
+
+     def valid_import?
+       error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_fields).join(', ')}"
+       raise StandardError, error_alert unless required_elements?(import_fields)
+
+       file_paths.is_a?(Array)
+     rescue StandardError => e
+       status_info(e)
+       false
+     end
+
+     def create_collections
+       collections.each_with_index do |collection, index|
+         next if collection.blank?
+         metadata = {
+           title: [collection],
+           work_identifier => [collection],
+           visibility: 'open',
+           collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+         }
+         new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
+         ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
+         increment_counters(index, true)
+       end
+     end
+
+     def create_works
+       records.each_with_index do |record, index|
+         next unless record_has_source_identifier(record, index)
+         break if limit_reached?(limit, index)
+
+         seen[record[source_identifier]] = true
+         new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record.to_h.compact)
+         if record[:delete].present?
+           DeleteWorkJob.send(perform_method, new_entry, current_run)
+         else
+           ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
+         end
+         increment_counters(index)
+       end
+       importer.record_status
+     rescue StandardError => e
+       status_info(e)
+     end
+
+     def write_partial_import_file(file)
+       import_filename = import_file_path.split('/').last
+       partial_import_filename = "#{File.basename(import_filename, '.csv')}_corrected_entries.csv"
+
+       path = File.join(path_for_import, partial_import_filename)
+       FileUtils.mv(
+         file.path,
+         path
+       )
+       path
+     end
+
+     def create_parent_child_relationships
+       super
+     end
+
+     def extra_filters
+       output = ""
+       if importerexporter.start_date.present?
+         start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
+         finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
+         output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
+       end
+       output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
+       output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
+       output
+     end
+
+     def current_work_ids
+       case importerexporter.export_from
+       when 'collection'
+         ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
+       when 'worktype'
+         ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
+       when 'importer'
+         entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
+         complete_statuses = Bulkrax::Status.latest_by_statusable
+                                            .includes(:statusable)
+                                            .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
+         complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }
+
+         ActiveFedora::SolrService.query("#{work_identifier}_tesim:(#{complete_entry_identifiers.join(' OR ')})#{extra_filters}", rows: 2_000_000_000).map(&:id)
+       end
+     end
+
+     def create_new_entries
+       current_work_ids.each_with_index do |wid, index|
+         break if limit_reached?(limit, index)
+         new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
+         Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
+       end
+     end
+     alias create_from_collection create_new_entries
+     alias create_from_importer create_new_entries
+     alias create_from_worktype create_new_entries
+
+     def entry_class
+       CsvEntry
+     end
+
+     def collection_entry_class
+       CsvCollectionEntry
+     end
+
+     # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
+     # Changed to grep because wc -l counts blank lines and ignores the final unescaped line (which may or may not contain data)
+     def total
+       if importer?
+         return @total if @total&.positive?
+         # Windows-encoded line endings
+         @total = `grep -c ^M #{real_import_file_path}`.to_i - 1
+         # Unix-encoded line endings
+         @total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
+       elsif exporter?
+         @total = importerexporter.entries.count
+       else
+         @total = 0
+       end
+       return @total
+     rescue StandardError
+       @total = 0
+     end
+
+     # @todo - investigate getting directory structure
+     # @todo - investigate using perform_later, and having the importer check for
+     #   DownloadCloudFileJob before it starts
+     def retrieve_cloud_files(files)
+       files_path = File.join(path_for_import, 'files')
+       FileUtils.mkdir_p(files_path) unless File.exist?(files_path)
+       files.each_pair do |_key, file|
+         # Fixes a bug where auth headers do not get attached properly
+         if file['auth_header'].present?
+           file['headers'] ||= {}
+           file['headers'].merge!(file['auth_header'])
+         end
+         # This only works for uniquely named files
+         target_file = File.join(files_path, file['file_name'].tr(' ', '_'))
+         # Download now so the files are in place before the importer runs
+         # (problematic for a large upload)
+         Bulkrax::DownloadCloudFileJob.perform_now(file, target_file)
+       end
+       return nil
+     end
+
+     # Export methods
+
+     def write_files
+       CSV.open(setup_export_file, "w", headers: export_headers, write_headers: true) do |csv|
+         importerexporter.entries.where(identifier: current_work_ids)[0..limit || total].each do |e|
+           csv << e.parsed_metadata
+         end
+       end
+     end
+
+     def key_allowed(key)
+       !Bulkrax.reserved_properties.include?(key) &&
+         new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
+         key != source_identifier.to_s
+     end
+
+     # All possible column names
+     def export_headers
+       headers = ['id']
+       headers << source_identifier.to_s
+       headers << 'model'
+       importerexporter.mapping.each_key { |key| headers << key if key_allowed(key) }
+       headers << 'file'
+       headers.uniq
+     end
+
+     # In the parser, as it is specific to the format
+     def setup_export_file
+       File.join(importerexporter.exporter_export_path, 'export.csv')
+     end
+
+     # Retrieve file paths for the [:file] mapping in records
+     # and check that all listed files exist.
+     def file_paths
+       raise StandardError, 'No records were found' if records.blank?
+       @file_paths ||= records.map do |r|
+         file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
+         next if r[file_mapping].blank?
+
+         r[file_mapping].split(/\s*[:;|]\s*/).map do |f|
+           file = File.join(path_to_files, f.tr(' ', '_'))
+           if File.exist?(file) # rubocop:disable Style/GuardClause
+             file
+           else
+             raise "File #{file} does not exist"
+           end
+         end
+       end.flatten.compact.uniq
+     end
+
+     # Retrieve the path where we expect to find the files
+     def path_to_files
+       @path_to_files ||= File.join(
+         File.file?(import_file_path) ? File.dirname(import_file_path) : import_file_path,
+         'files'
+       )
+     end
+
+     private
+
+     # Override to return the first CSV in the path if a zip file is supplied.
+     # We expect a single CSV at the top level of the zip in the CsvParser,
+     # but we are willing to go look for it if need be.
+     def real_import_file_path
+       if file? && zip?
+         unzip(parser_fields['import_file_path'])
+         return Dir["#{importer_unzip_path}/**/*.csv"].first
+       else
+         parser_fields['import_file_path']
+       end
+     end
+   end
+ end
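
Putting file_paths and path_to_files together, a CSV import might be laid out as below. The file and collection column behaviors (split on ; | or :, files resolved under a sibling files/ directory) come from the code above; the identifier and title headers depend on the configured field mapping and required_elements, so treat them as assumptions:

# Layout on disk: the CSV plus a sibling files/ directory
#   /data/import/works.csv
#   /data/import/files/page_one.tif
#   /data/import/files/page_two.tif
#
# works.csv -- headers are illustrative assumptions
source_identifier,title,collection,file
work_1,First work,collection_a,page_one.tif;page_two.tif
work_2,Second work,collection_a|collection_b,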
data/app/parsers/bulkrax/oai_dc_parser.rb
@@ -0,0 +1,130 @@
+ # frozen_string_literal: true
+
+ module Bulkrax
+   class OaiDcParser < ApplicationParser
+     attr_accessor :headers
+     delegate :list_sets, to: :client
+
+     def initialize(importerexporter)
+       super
+       @headers = { from: importerexporter.user.email }
+     end
+
+     def client
+       @client ||= OAI::Client.new(importerexporter.parser_fields['base_url'],
+                                   headers: headers,
+                                   parser: 'libxml',
+                                   metadata_prefix: importerexporter.parser_fields['metadata_prefix'])
+     rescue StandardError
+       raise OAIError
+     end
+
+     def collection_name
+       @collection_name ||= parser_fields['set'] || 'all'
+     end
+
+     def entry_class
+       OaiDcEntry
+     end
+
+     def collection_entry_class
+       OaiSetEntry
+     end
+
+     def records(opts = {})
+       opts[:set] = collection_name unless collection_name == 'all'
+
+       opts[:from] = importerexporter&.last_imported_at&.strftime("%Y-%m-%d") if importerexporter.last_imported_at && only_updates
+
+       if opts[:quick]
+         opts.delete(:quick)
+         begin
+           @short_records = client.list_identifiers(opts)
+         rescue OAI::Exception => e
+           return @short_records = [] if e.code == "noRecordsMatch"
+           raise e
+         end
+       else
+         begin
+           @records ||= client.list_records(opts.merge(metadata_prefix: parser_fields['metadata_prefix']))
+         rescue OAI::Exception => e
+           return @records = [] if e.code == "noRecordsMatch"
+           raise e
+         end
+       end
+     end
+
+     # The set of fields available in the import data
+     def import_fields
+       ['contributor', 'coverage', 'creator', 'date', 'description', 'format', 'identifier', 'language', 'publisher', 'relation', 'rights', 'source', 'subject', 'title', 'type']
+     end
+
+     def create_collections
+       metadata = {
+         visibility: 'open',
+         collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
+       }
+
+       collections.each_with_index do |set, index|
+         next unless collection_name == 'all' || collection_name == set.spec
+         unique_collection_identifier = importerexporter.unique_collection_identifier(set.spec)
+         metadata[:title] = [set.name]
+         metadata[work_identifier] = [unique_collection_identifier]
+
+         new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
+         # Perform now to ensure this gets created before work imports start
+         ImportWorkCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
+         increment_counters(index, true)
+       end
+     end
+
+     def create_works
+       results = self.records(quick: true)
+       return if results.blank?
+       results.full.each_with_index do |record, index|
+         identifier = record.send(source_identifier)
+         if identifier.blank?
+           if Bulkrax.fill_in_blank_source_identifiers.present?
+             identifier = Bulkrax.fill_in_blank_source_identifiers.call(self, index)
+           else
+             invalid_record("Missing #{source_identifier} for #{record.to_h}\n")
+             next
+           end
+         end
+
+         break if limit_reached?(limit, index)
+         seen[identifier] = true
+         new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: identifier).first_or_create!
+         if record.deleted?
+           DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_run)
+         else
+           ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_run.id)
+         end
+         increment_counters(index)
+       end
+       importer.record_status
+     end
+
+     def collections
+       @collections ||= list_sets
+     end
+
+     def collections_total
+       if collection_name == 'all'
+         collections.count
+       else
+         1
+       end
+     end
+
+     def create_parent_child_relationships; end
+
+     def total
+       @total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i
+     rescue
+       @total = 0
+     end
+   end
+ end
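
For reference, a sketch of the parser_fields this parser reads: base_url, metadata_prefix, and set all appear in the code above, while the surrounding Importer attributes and all values are illustrative assumptions:

# Sketch only -- attribute values are assumptions, not documented defaults
importer = Bulkrax::Importer.new(
  name: 'OAI harvest',
  user: User.find_by(email: 'admin@example.com'),     # hypothetical user
  parser_klass: 'Bulkrax::OaiDcParser',
  parser_fields: {
    'base_url'        => 'https://example.org/oai',   # passed to OAI::Client.new
    'metadata_prefix' => 'oai_dc',
    'set'             => 'my_set'                     # omit to harvest every set ('all')
  }
)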