bulkrax 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (133) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +205 -0
  3. data/README.md +202 -0
  4. data/Rakefile +42 -0
  5. data/app/assets/config/bulkrax_manifest.js +2 -0
  6. data/app/assets/javascripts/bulkrax/application.js +14 -0
  7. data/app/assets/javascripts/bulkrax/bulkrax.js +11 -0
  8. data/app/assets/javascripts/bulkrax/entries.js +15 -0
  9. data/app/assets/javascripts/bulkrax/exporters.js +60 -0
  10. data/app/assets/javascripts/bulkrax/importers.js.erb +166 -0
  11. data/app/assets/stylesheets/bulkrax/accordion.scss +40 -0
  12. data/app/assets/stylesheets/bulkrax/application.css +15 -0
  13. data/app/assets/stylesheets/bulkrax/coderay.scss +264 -0
  14. data/app/assets/stylesheets/bulkrax/import_export.scss +37 -0
  15. data/app/controllers/bulkrax/application_controller.rb +8 -0
  16. data/app/controllers/bulkrax/entries_controller.rb +44 -0
  17. data/app/controllers/bulkrax/exporters_controller.rb +125 -0
  18. data/app/controllers/bulkrax/importers_controller.rb +315 -0
  19. data/app/controllers/concerns/bulkrax/api.rb +29 -0
  20. data/app/factories/bulkrax/object_factory.rb +230 -0
  21. data/app/helpers/bulkrax/application_helper.rb +15 -0
  22. data/app/helpers/bulkrax/exporters_helper.rb +6 -0
  23. data/app/helpers/bulkrax/importers_helper.rb +13 -0
  24. data/app/helpers/bulkrax/validation_helper.rb +153 -0
  25. data/app/jobs/bulkrax/application_job.rb +6 -0
  26. data/app/jobs/bulkrax/child_relationships_job.rb +128 -0
  27. data/app/jobs/bulkrax/delete_work_job.rb +16 -0
  28. data/app/jobs/bulkrax/download_cloud_file_job.rb +18 -0
  29. data/app/jobs/bulkrax/export_work_job.rb +37 -0
  30. data/app/jobs/bulkrax/exporter_job.rb +14 -0
  31. data/app/jobs/bulkrax/import_work_collection_job.rb +41 -0
  32. data/app/jobs/bulkrax/import_work_job.rb +32 -0
  33. data/app/jobs/bulkrax/importer_job.rb +26 -0
  34. data/app/mailers/bulkrax/application_mailer.rb +8 -0
  35. data/app/matchers/bulkrax/application_matcher.rb +113 -0
  36. data/app/matchers/bulkrax/bagit_matcher.rb +6 -0
  37. data/app/matchers/bulkrax/csv_matcher.rb +6 -0
  38. data/app/matchers/bulkrax/oai_matcher.rb +6 -0
  39. data/app/models/bulkrax/application_record.rb +7 -0
  40. data/app/models/bulkrax/csv_collection_entry.rb +19 -0
  41. data/app/models/bulkrax/csv_entry.rb +163 -0
  42. data/app/models/bulkrax/entry.rb +104 -0
  43. data/app/models/bulkrax/exporter.rb +122 -0
  44. data/app/models/bulkrax/exporter_run.rb +7 -0
  45. data/app/models/bulkrax/import_failed.rb +13 -0
  46. data/app/models/bulkrax/importer.rb +155 -0
  47. data/app/models/bulkrax/importer_run.rb +8 -0
  48. data/app/models/bulkrax/oai_dc_entry.rb +6 -0
  49. data/app/models/bulkrax/oai_entry.rb +74 -0
  50. data/app/models/bulkrax/oai_qualified_dc_entry.rb +6 -0
  51. data/app/models/bulkrax/oai_set_entry.rb +19 -0
  52. data/app/models/bulkrax/rdf_collection_entry.rb +19 -0
  53. data/app/models/bulkrax/rdf_entry.rb +90 -0
  54. data/app/models/bulkrax/status.rb +25 -0
  55. data/app/models/bulkrax/xml_entry.rb +73 -0
  56. data/app/models/concerns/bulkrax/download_behavior.rb +61 -0
  57. data/app/models/concerns/bulkrax/errored_entries.rb +45 -0
  58. data/app/models/concerns/bulkrax/export_behavior.rb +58 -0
  59. data/app/models/concerns/bulkrax/file_factory.rb +140 -0
  60. data/app/models/concerns/bulkrax/has_local_processing.rb +7 -0
  61. data/app/models/concerns/bulkrax/has_matchers.rb +155 -0
  62. data/app/models/concerns/bulkrax/import_behavior.rb +90 -0
  63. data/app/models/concerns/bulkrax/importer_exporter_behavior.rb +34 -0
  64. data/app/models/concerns/bulkrax/status_info.rb +56 -0
  65. data/app/parsers/bulkrax/application_parser.rb +299 -0
  66. data/app/parsers/bulkrax/bagit_parser.rb +157 -0
  67. data/app/parsers/bulkrax/csv_parser.rb +266 -0
  68. data/app/parsers/bulkrax/oai_dc_parser.rb +130 -0
  69. data/app/parsers/bulkrax/oai_qualified_dc_parser.rb +9 -0
  70. data/app/parsers/bulkrax/xml_parser.rb +103 -0
  71. data/app/views/bulkrax/entries/_parsed_metadata.html.erb +19 -0
  72. data/app/views/bulkrax/entries/_raw_metadata.html.erb +19 -0
  73. data/app/views/bulkrax/entries/show.html.erb +63 -0
  74. data/app/views/bulkrax/exporters/_form.html.erb +120 -0
  75. data/app/views/bulkrax/exporters/edit.html.erb +23 -0
  76. data/app/views/bulkrax/exporters/index.html.erb +67 -0
  77. data/app/views/bulkrax/exporters/new.html.erb +23 -0
  78. data/app/views/bulkrax/exporters/show.html.erb +124 -0
  79. data/app/views/bulkrax/importers/_bagit_fields.html.erb +54 -0
  80. data/app/views/bulkrax/importers/_browse_everything.html.erb +12 -0
  81. data/app/views/bulkrax/importers/_csv_fields.html.erb +39 -0
  82. data/app/views/bulkrax/importers/_edit_form_buttons.html.erb +16 -0
  83. data/app/views/bulkrax/importers/_form.html.erb +35 -0
  84. data/app/views/bulkrax/importers/_oai_fields.html.erb +42 -0
  85. data/app/views/bulkrax/importers/_xml_fields.html.erb +60 -0
  86. data/app/views/bulkrax/importers/edit.html.erb +20 -0
  87. data/app/views/bulkrax/importers/index.html.erb +77 -0
  88. data/app/views/bulkrax/importers/new.html.erb +25 -0
  89. data/app/views/bulkrax/importers/show.html.erb +175 -0
  90. data/app/views/bulkrax/importers/upload_corrected_entries.html.erb +37 -0
  91. data/app/views/bulkrax/shared/_bulkrax_errors.html.erb +52 -0
  92. data/app/views/bulkrax/shared/_bulkrax_field_mapping.html.erb +39 -0
  93. data/app/views/hyrax/dashboard/sidebar/_bulkrax_sidebar_additions.html.erb +6 -0
  94. data/app/views/hyrax/dashboard/sidebar/_repository_content.html.erb +19 -0
  95. data/app/views/layouts/bulkrax/application.html.erb +14 -0
  96. data/config/locales/bulkrax.en.yml +36 -0
  97. data/config/routes.rb +18 -0
  98. data/db/migrate/20181011230201_create_bulkrax_importers.rb +18 -0
  99. data/db/migrate/20181011230228_create_bulkrax_importer_runs.rb +16 -0
  100. data/db/migrate/20190325183136_create_bulkrax_entries.rb +16 -0
  101. data/db/migrate/20190601221109_add_status_to_entry.rb +9 -0
  102. data/db/migrate/20190715161939_add_collections_to_importer_runs.rb +6 -0
  103. data/db/migrate/20190715162044_change_collection_ids_on_entries.rb +5 -0
  104. data/db/migrate/20190729124607_create_bulkrax_exporters.rb +19 -0
  105. data/db/migrate/20190729134158_create_bulkrax_exporter_runs.rb +14 -0
  106. data/db/migrate/20190731114016_change_importer_and_exporter_to_polymorphic.rb +12 -0
  107. data/db/migrate/20191203225129_add_total_collection_records_to_importer_runs.rb +5 -0
  108. data/db/migrate/20191204191623_add_children_to_importer_runs.rb +6 -0
  109. data/db/migrate/20191204223857_change_total_records_to_total_work_entries.rb +6 -0
  110. data/db/migrate/20191212155530_change_entry_last_error.rb +19 -0
  111. data/db/migrate/20200108194557_add_validate_only_to_bulkrax_importers.rb +5 -0
  112. data/db/migrate/20200301232856_add_status_to_importers.rb +9 -0
  113. data/db/migrate/20200312190638_remove_foreign_key_from_bulkrax_entries.rb +5 -0
  114. data/db/migrate/20200326235838_add_status_to_exporters.rb +7 -0
  115. data/db/migrate/20200601204556_add_invalid_record_to_importer_run.rb +5 -0
  116. data/db/migrate/20200818055819_create_bulkrax_statuses.rb +18 -0
  117. data/db/migrate/20200819054016_move_to_statuses.rb +30 -0
  118. data/db/migrate/20201106014204_add_date_filter_and_status_to_bulkrax_exporters.rb +7 -0
  119. data/db/migrate/20201117220007_add_workflow_status_to_bulkrax_exporter.rb +5 -0
  120. data/db/migrate/20210806044408_remove_unused_last_error.rb +7 -0
  121. data/db/migrate/20210806065737_increase_text_sizes.rb +12 -0
  122. data/lib/bulkrax.rb +161 -0
  123. data/lib/bulkrax/engine.rb +37 -0
  124. data/lib/bulkrax/version.rb +5 -0
  125. data/lib/generators/bulkrax/install_generator.rb +80 -0
  126. data/lib/generators/bulkrax/templates/README +3 -0
  127. data/lib/generators/bulkrax/templates/app/assets/images/bulkrax/removed.png +0 -0
  128. data/lib/generators/bulkrax/templates/app/models/concerns/bulkrax/has_local_processing.rb +8 -0
  129. data/lib/generators/bulkrax/templates/bin/importer +140 -0
  130. data/lib/generators/bulkrax/templates/config/bulkrax_api.yml +84 -0
  131. data/lib/generators/bulkrax/templates/config/initializers/bulkrax.rb +72 -0
  132. data/lib/tasks/bulkrax_tasks.rake +6 -0
  133. metadata +388 -0
# frozen_string_literal: true

module Bulkrax
  # Parser for imports packaged as BagIt bags. Each bag is expected to carry
  # a single metadata file (named by parser_fields['metadata_file_name'])
  # alongside its payload files.
  class BagitParser < ApplicationParser
    # BagIt export is not implemented yet.
    def self.export_supported?
      false # @todo will be supported
    end

    # An import is considered valid when at least one import field can be
    # derived from the sampled metadata files.
    def valid_import?
      return true if import_fields.present?
    rescue => e
      status_info(e)
      false
    end

    # Entry class is configured by name on the importer's parser fields.
    def entry_class
      parser_fields['metadata_format'].constantize
    end

    # Collection variant of the configured entry class; falls back to the
    # generic Entry when no matching class exists.
    def collection_entry_class
      parser_fields['metadata_format'].gsub('Entry', 'CollectionEntry').constantize
    rescue
      Entry
    end

    # Take a random sample of 10 metadata_paths and work out the import fields from that
    def import_fields
      raise StandardError, 'No metadata files were found' if metadata_paths.blank?
      @import_fields ||= metadata_paths.sample(10).map do |metadata_file|
        entry_class.fields_from_data(entry_class.read_data(metadata_file))
      end.flatten.compact.uniq
    end

    # Assume a single metadata record per path
    # Create an Array of all metadata records, one per file
    def records(_opts = {})
      raise StandardError, 'No BagIt records were found' if bags.blank?
      @records ||= bags.map do |bag|
        metadata_file = metadata_path(bag)
        raise StandardError, 'No metadata files were found' if metadata_file.blank?

        record = entry_class.data_for_entry(entry_class.read_data(metadata_file), source_identifier)
        # attach the bag's payload file listing, pipe-separated
        record[:file] = bag.bag_files.join('|')
        record
      end
    end

    # Find or create collections referenced by works
    # If the import data also contains records for these works, they will be updated
    # during create works
    def create_collections
      collections.each_with_index do |collection, index|
        next if collection.blank?

        metadata = {
          title: [collection],
          work_identifier => [collection],
          visibility: 'open',
          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
        }
        entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
        # perform_now so collections exist before work imports begin
        ImportWorkCollectionJob.perform_now(entry.id, current_run.id)
        increment_counters(index, true)
      end
    end

    # Queue an import (or delete) job for every record with a source identifier.
    def create_works
      records.each_with_index do |record, index|
        next unless record_has_source_identifier(record, index)
        break if limit_reached?(limit, index)

        seen[record[source_identifier]] = true
        entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record)
        if record[:delete].present?
          DeleteWorkJob.send(perform_method, entry, current_run)
        else
          ImportWorkJob.send(perform_method, entry.id, current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    rescue StandardError => e
      status_info(e)
    end

    # Unique collection names referenced across the records (split on ';' or '|').
    def collections
      records.flat_map { |record| record[:collection].split(/\s*[;|]\s*/) if record[:collection].present? }.compact.uniq
    end

    def collections_total
      collections.size
    end

    # One record per metadata file.
    def total
      metadata_paths.count
    end

    # True when every required element is present among the supplied keys.
    def required_elements?(keys)
      return if keys.blank?

      key_strings = keys.map(&:to_s)
      required_elements.all? { |element| key_strings.include?(element) }
    end

    # @todo - investigate getting directory structure
    # @todo - investigate using perform_later, and having the importer check for
    #   DownloadCloudFileJob before it starts
    def retrieve_cloud_files(files)
      # There should only be one zip file for Bagit, take the first
      file_info = files['0']
      return if file_info.blank?

      target_file = File.join(path_for_import, file_info['file_name'].tr(' ', '_'))
      # Now because we want the files in place before the importer runs
      Bulkrax::DownloadCloudFileJob.perform_now(file_info, target_file)
      target_file
    end

    private

    # All valid bags under import_file_path; memoized. Raises when none found.
    def bags
      return @bags if @bags.present?

      top_level_bag = bag(import_file_path)
      found = top_level_bag ? [top_level_bag] : Dir.glob("#{import_file_path}/**/*").map { |dir| bag(dir) }
      @bags = found.compact
      raise StandardError, 'No valid bags found' if @bags.blank?

      @bags
    end

    # Gather the paths to all bags; skip any stray files
    def bag_paths
      bags.map(&:bag_dir)
    end

    # Metadata file name configured on the importer; required.
    def metadata_file_name
      raise StandardError, 'The metadata file name must be specified' if parser_fields['metadata_file_name'].blank?
      parser_fields['metadata_file_name']
    end

    # Gather the paths to all metadata files matching the metadata_file_name
    def metadata_paths
      @metadata_paths ||= bag_paths.flat_map do |dir|
        Dir.glob("#{dir}/**/*").select { |file| File.file?(file) && file.ends_with?(metadata_file_name) }
      end
    end

    # First matching metadata file inside the given bag, or nil.
    def metadata_path(bag)
      Dir.glob("#{bag.bag_dir}/**/*").find { |file| File.file?(file) && file.ends_with?(metadata_file_name) }
    end

    # Wrap a path in a BagIt::Bag when it looks like, and validates as, a bag.
    def bag(path)
      return nil unless path && File.exist?(File.join(path, 'bagit.txt'))

      candidate = BagIt::Bag.new(path)
      candidate.valid? ? candidate : nil
    end
  end
end
# frozen_string_literal: true

require 'csv'
module Bulkrax
  # Imports works and collections from CSV files, and exports them back out
  # to a single CSV per exporter run.
  class CsvParser < ApplicationParser
    include ErroredEntries

    # CSV is an export-capable format.
    def self.export_supported?
      true
    end

    # @param importerexporter [Bulkrax::Importer, Bulkrax::Exporter]
    def initialize(importerexporter)
      @importerexporter = importerexporter
    end

    def collections
      # does the CSV contain a collection column?
      return [] unless import_fields.include?(:collection)
      # retrieve a list of unique collections
      records.map { |r| r[:collection].split(/\s*[;|]\s*/) if r[:collection].present? }.flatten.compact.uniq
    end

    def collections_total
      collections.size
    end

    # Parsed rows for the import; memoized.
    def records(_opts = {})
      file_for_import = only_updates ? parser_fields['partial_import_file_path'] : import_file_path
      # data for entry does not need source_identifier for csv, because csvs are read sequentially and mapped after raw data is read.
      @records ||= entry_class.read_data(file_for_import).map { |record_data| entry_class.data_for_entry(record_data, nil) }
    end

    # We could use CsvEntry#fields_from_data(data) but that would mean re-reading the data
    def import_fields
      @import_fields ||= records.inject(:merge).keys.compact.uniq
    end

    # True when no required element is missing from the supplied keys.
    def required_elements?(keys)
      return if keys.blank?
      missing_elements(keys).blank?
    end

    # Required elements not present in keys (compared as strings).
    def missing_elements(keys)
      required_elements.map(&:to_s) - keys.map(&:to_s)
    end

    # Valid when all required elements are present and the file paths resolve.
    # Records the error and returns false on any failure.
    def valid_import?
      error_alert = "Missing at least one required element, missing element(s) are: #{missing_elements(import_fields).join(', ')}"
      raise StandardError, error_alert unless required_elements?(import_fields)

      file_paths.is_a?(Array)
    rescue StandardError => e
      status_info(e)
      false
    end

    # Find or create collections referenced by works; see BagitParser for the
    # parallel implementation.
    def create_collections
      collections.each_with_index do |collection, index|
        next if collection.blank?
        metadata = {
          title: [collection],
          work_identifier => [collection],
          visibility: 'open',
          collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
        }
        new_entry = find_or_create_entry(collection_entry_class, collection, 'Bulkrax::Importer', metadata)
        # perform_now so collections exist before work imports begin
        ImportWorkCollectionJob.perform_now(new_entry.id, current_run.id)
        increment_counters(index, true)
      end
    end

    # Queue an import (or delete) job for every record with a source identifier.
    def create_works
      records.each_with_index do |record, index|
        next unless record_has_source_identifier(record, index)
        break if limit_reached?(limit, index)

        seen[record[source_identifier]] = true
        new_entry = find_or_create_entry(entry_class, record[source_identifier], 'Bulkrax::Importer', record.to_h.compact)
        if record[:delete].present?
          DeleteWorkJob.send(perform_method, new_entry, current_run)
        else
          ImportWorkJob.send(perform_method, new_entry.id, current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    rescue StandardError => e
      status_info(e)
    end

    # Move an uploaded corrected-entries CSV next to the original import file
    # and return its new path.
    def write_partial_import_file(file)
      import_filename = import_file_path.split('/').last
      partial_import_filename = "#{File.basename(import_filename, '.csv')}_corrected_entries.csv"

      path = File.join(path_for_import, partial_import_filename)
      FileUtils.mv(
        file.path,
        path
      )
      path
    end

    # Solr filter clauses derived from the exporter's date/visibility/workflow
    # settings; empty string when no filters apply.
    def extra_filters
      output = ""
      if importerexporter.start_date.present?
        start_dt = importerexporter.start_date.to_datetime.strftime('%FT%TZ')
        finish_dt = importerexporter.finish_date.present? ? importerexporter.finish_date.to_datetime.end_of_day.strftime('%FT%TZ') : "NOW"
        output += " AND system_modified_dtsi:[#{start_dt} TO #{finish_dt}]"
      end
      output += importerexporter.work_visibility.present? ? " AND visibility_ssi:#{importerexporter.work_visibility}" : ""
      output += importerexporter.workflow_status.present? ? " AND workflow_state_name_ssim:#{importerexporter.workflow_status}" : ""
      output
    end

    # IDs of the works to export, resolved per export_from source.
    def current_work_ids
      case importerexporter.export_from
      when 'collection'
        ActiveFedora::SolrService.query("member_of_collection_ids_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
      when 'worktype'
        ActiveFedora::SolrService.query("has_model_ssim:#{importerexporter.export_source + extra_filters}", rows: 2_000_000_000).map(&:id)
      when 'importer'
        entry_ids = Bulkrax::Importer.find(importerexporter.export_source).entries.pluck(:id)
        complete_statuses = Bulkrax::Status.latest_by_statusable
                                           .includes(:statusable)
                                           .where('bulkrax_statuses.statusable_id IN (?) AND bulkrax_statuses.statusable_type = ? AND status_message = ?', entry_ids, 'Bulkrax::Entry', 'Complete')
        complete_entry_identifiers = complete_statuses.map { |s| s.statusable&.identifier }

        ActiveFedora::SolrService.query("#{work_identifier}_tesim:(#{complete_entry_identifiers.join(' OR ')})#{extra_filters}", rows: 2_000_000_000).map(&:id)
      end
    end

    # Create an export entry per work and render it immediately.
    def create_new_entries
      current_work_ids.each_with_index do |wid, index|
        break if limit_reached?(limit, index)
        new_entry = find_or_create_entry(entry_class, wid, 'Bulkrax::Exporter')
        Bulkrax::ExportWorkJob.perform_now(new_entry.id, current_run.id)
      end
    end
    alias create_from_collection create_new_entries
    alias create_from_importer create_new_entries
    alias create_from_worktype create_new_entries

    def entry_class
      CsvEntry
    end

    def collection_entry_class
      CsvCollectionEntry
    end

    # See https://stackoverflow.com/questions/2650517/count-the-number-of-lines-in-a-file-without-reading-entire-file-into-memory
    # Changed to grep as wc -l counts blank lines, and ignores the final unescaped line (which may or may not contain data)
    # NOTE(review): real_import_file_path is interpolated into a shell command;
    # assumes the import path is server-controlled — confirm it cannot carry
    # user-supplied shell metacharacters.
    def total
      if importer?
        return @total if @total&.positive?
        # windows encoded
        @total = `grep -c ^M #{real_import_file_path}`.to_i - 1
        # unix encoded
        @total = `grep -vc ^$ #{real_import_file_path}`.to_i - 1 if @total < 1
      elsif exporter?
        @total = importerexporter.entries.count
      else
        @total = 0
      end
      @total
    rescue StandardError # was `StandardErrorr`: the typo'd constant raised NameError instead of rescuing
      @total = 0
    end

    # @todo - investigate getting directory structure
    # @todo - investigate using perform_later, and having the importer check for
    #   DownloadCloudFileJob before it starts
    def retrieve_cloud_files(files)
      files_path = File.join(path_for_import, 'files')
      FileUtils.mkdir_p(files_path) unless File.exist?(files_path)
      files.each_pair do |_key, file|
        # fixes bug where auth headers do not get attached properly
        if file['auth_header'].present?
          file['headers'] ||= {}
          file['headers'].merge!(file['auth_header'])
        end
        # this only works for uniquely named files
        target_file = File.join(files_path, file['file_name'].tr(' ', '_'))
        # Now because we want the files in place before the importer runs
        # Problematic for a large upload
        Bulkrax::DownloadCloudFileJob.perform_now(file, target_file)
      end
      nil
    end

    # export methods

    # Write the parsed metadata of each selected entry to the export CSV.
    def write_files
      CSV.open(setup_export_file, "w", headers: export_headers, write_headers: true) do |csv|
        importerexporter.entries.where(identifier: current_work_ids)[0..limit || total].each do |e|
          csv << e.parsed_metadata
        end
      end
    end

    # A key is exportable unless reserved, unsupported, or the source identifier.
    def key_allowed(key)
      !Bulkrax.reserved_properties.include?(key) &&
        new_entry(entry_class, 'Bulkrax::Exporter').field_supported?(key) &&
        key != source_identifier.to_s
    end

    # All possible column names
    def export_headers
      headers = ['id']
      headers << source_identifier.to_s
      headers << 'model'
      importerexporter.mapping.each_key { |key| headers << key if key_allowed(key) }
      headers << 'file'
      headers.uniq
    end

    # in the parser as it is specific to the format
    def setup_export_file
      File.join(importerexporter.exporter_export_path, 'export.csv')
    end

    # Retrieve file paths for [:file] mapping in records
    # and check all listed files exist.
    def file_paths
      raise StandardError, 'No records were found' if records.blank?
      @file_paths ||= records.map do |r|
        file_mapping = Bulkrax.field_mappings.dig(self.class.to_s, 'file', :from)&.first&.to_sym || :file
        next if r[file_mapping].blank?

        r[file_mapping].split(/\s*[:;|]\s*/).map do |f|
          file = File.join(path_to_files, f.tr(' ', '_'))
          if File.exist?(file) # rubocop:disable Style/GuardClause
            file
          else
            raise "File #{file} does not exist"
          end
        end
      end.flatten.compact.uniq
    end

    # Retrieve the path where we expect to find the files
    def path_to_files
      @path_to_files ||= File.join(
        File.file?(import_file_path) ? File.dirname(import_file_path) : import_file_path,
        'files'
      )
    end

    private

    # Override to return the first CSV in the path, if a zip file is supplied
    # We expect a single CSV at the top level of the zip in the CSVParser
    # but we are willing to go look for it if need be
    def real_import_file_path
      if file? && zip?
        unzip(parser_fields['import_file_path'])
        return Dir["#{importer_unzip_path}/**/*.csv"].first
      else
        parser_fields['import_file_path']
      end
    end
  end
end
# frozen_string_literal: true

module Bulkrax
  # Imports records harvested over OAI-PMH using the oai_dc metadata prefix.
  class OaiDcParser < ApplicationParser
    attr_accessor :headers
    # Was declared twice in the original file; one delegation suffices.
    delegate :list_sets, to: :client

    # @param importerexporter [Bulkrax::Importer, Bulkrax::Exporter]
    def initialize(importerexporter)
      super
      @headers = { from: importerexporter.user.email }
    end

    # Memoized OAI-PMH client for the configured base URL.
    # @raise [OAIError] when the client cannot be built
    def client
      @client ||= OAI::Client.new(importerexporter.parser_fields['base_url'],
                                  headers: headers,
                                  parser: 'libxml',
                                  metadata_prefix: importerexporter.parser_fields['metadata_prefix'])
    rescue StandardError
      raise OAIError
    end

    # The OAI set to harvest; 'all' when no set was chosen.
    def collection_name
      @collection_name ||= parser_fields['set'] || 'all'
    end

    def entry_class
      OaiDcEntry
    end

    def collection_entry_class
      OaiSetEntry
    end

    # Fetch records (or just identifiers, when opts[:quick] is set) from the
    # OAI endpoint. Returns [] when the endpoint reports noRecordsMatch.
    def records(opts = {})
      opts[:set] = collection_name unless collection_name == 'all'

      # incremental harvest: only records changed since the last import
      opts[:from] = importerexporter&.last_imported_at&.strftime("%Y-%m-%d") if importerexporter.last_imported_at && only_updates

      if opts[:quick]
        opts.delete(:quick)
        begin
          @short_records = client.list_identifiers(opts)
        rescue OAI::Exception => e
          return @short_records = [] if e.code == "noRecordsMatch"
          raise e
        end
      else
        begin
          @records ||= client.list_records(opts.merge(metadata_prefix: parser_fields['metadata_prefix']))
        rescue OAI::Exception => e
          return @records = [] if e.code == "noRecordsMatch"
          raise e
        end
      end
    end

    # the set of fields available in the import data
    def import_fields
      ['contributor', 'coverage', 'creator', 'date', 'description', 'format', 'identifier', 'language', 'publisher', 'relation', 'rights', 'source', 'subject', 'title', 'type']
    end

    # Create a collection entry per harvested OAI set (or the single chosen set).
    def create_collections
      metadata = {
        visibility: 'open',
        collection_type_gid: Hyrax::CollectionType.find_or_create_default_collection_type.gid
      }

      collections.each_with_index do |set, index|
        next unless collection_name == 'all' || collection_name == set.spec
        unique_collection_identifier = importerexporter.unique_collection_identifier(set.spec)
        metadata[:title] = [set.name]
        metadata[work_identifier] = [unique_collection_identifier]

        new_entry = collection_entry_class.where(importerexporter: importerexporter, identifier: unique_collection_identifier, raw_metadata: metadata).first_or_create!
        # perform now to ensure this gets created before work imports start
        ImportWorkCollectionJob.perform_now(new_entry.id, importerexporter.current_run.id)
        increment_counters(index, true)
      end
    end

    # Queue an import (or delete) job per harvested identifier.
    def create_works
      results = self.records(quick: true)
      return if results.blank?
      results.full.each_with_index do |record, index|
        identifier = record.send(source_identifier)
        if identifier.blank?
          if Bulkrax.fill_in_blank_source_identifiers.present?
            identifier = Bulkrax.fill_in_blank_source_identifiers.call(self, index)
          else
            invalid_record("Missing #{source_identifier} for #{record.to_h}\n")
            next
          end
        end

        break if limit_reached?(limit, index)
        seen[identifier] = true
        new_entry = entry_class.where(importerexporter: self.importerexporter, identifier: identifier).first_or_create!
        if record.deleted?
          DeleteWorkJob.send(perform_method, new_entry, importerexporter.current_run)
        else
          ImportWorkJob.send(perform_method, new_entry.id, importerexporter.current_run.id)
        end
        increment_counters(index)
      end
      importer.record_status
    end

    # All sets advertised by the OAI endpoint; memoized.
    def collections
      @collections ||= list_sets
    end

    def collections_total
      if collection_name == 'all'
        collections.count
      else
        1
      end
    end

    # OAI-DC records carry no parent/child information; intentional no-op.
    def create_parent_child_relationships; end

    # Total record count reported by the endpoint's resumptionToken; 0 when
    # the token or its completeListSize attribute is absent.
    def total
      @total ||= records(quick: true).doc.find(".//resumptionToken").to_a.first.attributes["completeListSize"].to_i
    rescue StandardError
      @total = 0
    end
  end
end