hyrax-ingest 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +52 -0
  3. data/config/routes.rb +2 -0
  4. data/lib/hyrax/ingest.rb +12 -0
  5. data/lib/hyrax/ingest/batch_runner.rb +130 -0
  6. data/lib/hyrax/ingest/configuration.rb +54 -0
  7. data/lib/hyrax/ingest/engine.rb +6 -0
  8. data/lib/hyrax/ingest/errors.rb +186 -0
  9. data/lib/hyrax/ingest/fetcher.rb +55 -0
  10. data/lib/hyrax/ingest/fetcher/base.rb +78 -0
  11. data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
  12. data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
  13. data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
  14. data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
  15. data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
  16. data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
  17. data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
  18. data/lib/hyrax/ingest/has_depositor.rb +13 -0
  19. data/lib/hyrax/ingest/has_iteration.rb +18 -0
  20. data/lib/hyrax/ingest/has_logger.rb +29 -0
  21. data/lib/hyrax/ingest/has_report.rb +17 -0
  22. data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
  23. data/lib/hyrax/ingest/has_sip.rb +20 -0
  24. data/lib/hyrax/ingest/ingester.rb +75 -0
  25. data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
  26. data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
  27. data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
  28. data/lib/hyrax/ingest/ingester/base.rb +28 -0
  29. data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
  30. data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
  31. data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
  32. data/lib/hyrax/ingest/reporting.rb +13 -0
  33. data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
  34. data/lib/hyrax/ingest/reporting/report.rb +79 -0
  35. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
  36. data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
  37. data/lib/hyrax/ingest/runner.rb +103 -0
  38. data/lib/hyrax/ingest/sip.rb +92 -0
  39. data/lib/hyrax/ingest/transformer.rb +42 -0
  40. data/lib/hyrax/ingest/transformer/base.rb +12 -0
  41. data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
  42. data/lib/hyrax/ingest/version.rb +5 -0
  43. data/lib/tasks/ingest_tasks.rake +22 -0
  44. metadata +330 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c0d5eeb006d3931a1a7668b71e9a0ce85d603bbff3025aba8552eb42958806d4
4
+ data.tar.gz: 97de1d90af9b90972ceb24df28ffebd9687d9ac63fc18fc4d59ec96e54359949
5
+ SHA512:
6
+ metadata.gz: 1175542851df962db08f1e1568ff5d07a53a9630995d9af0ff1b0c6a7d7cff08a871e4c03d28f479998198bd6e2f8b990874510f8424259440ae21e468429c0c
7
+ data.tar.gz: 12ffb343ec907eefe6c8ac367e8bb963239a32f2aea789189e9e11ed21c8eacb27983d8365adf42505531ff523995b79701d998425f907290d6b74467acb99c8
@@ -0,0 +1,52 @@
1
+ # Hyrax::Ingest
2
+
3
+ [![Build Status](https://travis-ci.org/IUBLibTech/hyrax-ingest.svg?branch=master)](https://travis-ci.org/IUBLibTech/hyrax-ingest)
4
+
5
+ Hyrax::Ingest is an extensible plugin for ingesting content and metadata into
6
+ a [Hyrax](https://github.com/samvera/hyrax) repository based on a declarative
7
+ configuration.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'hyrax-ingest'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install hyrax-ingest
24
+
25
+ ## Overview
26
+
27
+ The Hyrax::Ingest plugin defines "ingest" as the process of mapping content
28
+ and data from one or more sources to one or more persistence layers to be used
29
+ by a Hyrax application.
30
+
31
+
32
+ ## Development
33
+
34
+ After checking out the repo, run `bin/setup` to install dependencies. Then,
35
+ run `rake spec` to run the tests. You can also run `bin/console` for an
36
+ interactive prompt that will allow you to experiment.
37
+
38
+ To install this gem onto your local machine, run `bundle exec rake install`.
39
+ To release a new version, update the version number in `version.rb`, and then
40
+ run `bundle exec rake release`, which will create a git tag for the version,
41
+ push git commits and tags, and push the `.gem` file to
42
+ [rubygems.org](https://rubygems.org).
43
+
44
+ ## Contributing
45
+
46
+ TODO: Add CONTRIBUTING.md per the Samvera standard, and refer to it here.
47
+
48
+ ## License
49
+
50
+ The gem is available as open source under the terms of the [MIT
51
+ License](http://opensource.org/licenses/MIT).
52
+
@@ -0,0 +1,2 @@
1
# The draw block is empty, so this file defines no routes.
Rails.application.routes.draw {}
@@ -0,0 +1,12 @@
1
+ require 'hyrax/ingest/engine'
2
+ require 'hyrax'
3
+
4
module Hyrax
  module Ingest
    # Absolute path of the directory three levels above this file.
    # NOTE(review): with this file at lib/hyrax/ingest.rb that is presumably
    # the gem's root directory -- confirm against the packaged layout.
    # @return [String] the expanded path
    def self.root
      File.expand_path(File.join('..', '..', '..'), __FILE__)
    end
  end
end
@@ -0,0 +1,130 @@
1
+ require 'hyrax/ingest/runner'
2
+ require 'hyrax/ingest/has_report'
3
+ require 'hyrax/ingest/has_logger'
4
+ require 'hyrax/ingest/has_depositor'
5
+ require 'interloper'
6
+
7
module Hyrax
  module Ingest
    # Runs a batch of ingests: builds one Hyrax::Ingest::Runner per iteration,
    # runs them in order, and records batch-level statistics on the report.
    #
    # NOTE(review): `report`, `logger` and `depositor`/`depositor=` are
    # presumably supplied by the HasReport, HasLogger and HasDepositor mixins,
    # and `before`/`after` by Interloper -- confirm against those modules.
    class BatchRunner
      include HasReport
      include HasLogger
      include HasDepositor
      include Interloper

      # Report before and after a batch is ingested.
      before(:run!) do
        logger.info "Starting batch ingest, batch size = #{iterations}."
        report.stat[:batch_size] = iterations
        report.stat[:datetime_started] = DateTime.now
      end

      after(:run!) do
        logger.info "Batch ingest complete."
        report.stat[:datetime_completed] = DateTime.now
        report.write_to_file
      end

      attr_reader :sip_paths

      # @param config_file_path [String] path handed to every Runner.
      # @param sip_paths [Array] one SIP path per iteration; the Runner for
      #   iteration N receives `sip_paths[N]` (nil when out of range).
      # @param shared_sip_path [String, nil] handed to every Runner.
      # @param iterations [Integer, String, nil] number of Runners to build;
      #   when nil or empty the batch size is derived from `sip_paths`.
      # @param depositor [Object, nil] assigned through `depositor=` and
      #   handed to every Runner.
      def initialize(config_file_path:, sip_paths: [], shared_sip_path: nil, iterations: nil, depositor: nil)
        @sip_paths = sip_paths
        @shared_sip_path = shared_sip_path
        @config_file_path = config_file_path
        @iterations = iterations
        self.depositor = depositor
      end

      # Runs every Runner in order. On failure the completion time and the
      # error are recorded on the report and the error is re-raised. The
      # report is written to file whether or not the batch succeeded.
      # @raise [StandardError] whatever error a Runner raised.
      def run!
        runners.each(&:run!)
      rescue StandardError => error
        # TODO: move to callback, but :rescue hook doesn't exist yet in
        # Interloper gem.
        report.stat[:datetime_completed] = DateTime.now
        report.failed_with error
        raise error
      ensure
        # TODO: move to callback, but :ensure hook doesn't exist yet in
        # Interloper gem.
        report.write_to_file
      end

      # Returns an array containing the IDs of new or updated records.
      # Currently only returns the IDs for ActiveFedora records (or
      # subclasses) that are specified at the top level (i.e. not nested) of
      # the ingest configuration.
      # @return [Array] list of IDs
      def ingested_ids
        # Hash#flatten would interleave the model classes (the keys) with the
        # nested ID arrays, so only the values are flattened.
        ingested_ids_by_type.values.flatten
      end

      # Returns a hash containing the IDs of new or updated records, keyed by
      # the model class by which they were saved.
      # Example:
      #   { FileSet => ['123', '456'], Work => ['789'] }
      # Currently only returns the IDs for ActiveFedora records (or
      # subclasses) that are specified at the top level (i.e. not nested) of
      # the ingest configuration.
      # @return [Hash] IDs keyed by the model class by which they were saved.
      def ingested_ids_by_type
        runners.each_with_object({}) do |runner, ids_by_type|
          runner.ingested_ids_by_type.each do |type, ids|
            # Array#| appends and de-duplicates while keeping first-seen order.
            ids_by_type[type] = (ids_by_type[type] || []) | ids
          end
        end
      end

      private

      # Number of Runners to build: the explicit `iterations` option when it
      # is neither nil nor empty, otherwise one per SIP path with a minimum
      # of one.
      # @return [Integer]
      def iterations
        return @iterations.to_i unless @iterations.nil? || @iterations.to_s.empty?

        [1, @sip_paths.count].max
      end

      # Builds (once) one Runner per iteration. Each Runner that responds to
      # `report=` is given this batch's report.
      # @return [Array<Hyrax::Ingest::Runner>]
      def runners
        @runners ||= (0...iterations).map do |iteration|
          Hyrax::Ingest::Runner.new(
            config_file_path: @config_file_path,
            sip_path: @sip_paths[iteration],
            shared_sip_path: @shared_sip_path,
            iteration: iteration,
            depositor: depositor
          ).tap do |runner|
            runner.report = report if runner.respond_to? :report=
          end
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'hyrax/ingest/errors'
2
+ require 'active_support/core_ext/hash/keys'
3
+
4
module Hyrax
  module Ingest
    # Loads an ingest configuration from a YAML file and validates its
    # overall shape on construction.
    class Configuration
      attr_reader :config_file_path

      # @param config_file_path [#to_s] path to the YAML configuration file.
      # @raise [Hyrax::Ingest::Errors::NoConfigFileFound] when no file exists
      #   at the given path.
      # @raise [Hyrax::Ingest::Errors::InvalidConfig] when the file's content
      #   fails validation.
      def initialize(config_file_path:)
        @config_file_path = config_file_path.to_s
        raise Hyrax::Ingest::Errors::NoConfigFileFound.new(@config_file_path) unless File.exist?(@config_file_path)

        validate!
      end

      # @return [Array] Array of hashes, where each hash is the configuration options
      #   for an Ingester instance
      def ingester_configs
        @ingester_configs ||= config[:ingest]
      end

      private

      # @return [Hash] The config hash parsed from yaml file, and with keys
      #   converted from strings to symbols.
      def config
        @config ||= load_config
      end

      # Parses the YAML file. An empty file is treated as an empty mapping so
      # that validation reports the missing 'ingest' key; any other
      # non-mapping document is rejected up front instead of failing with a
      # NoMethodError on #deep_symbolize_keys.
      # @raise [Hyrax::Ingest::Errors::InvalidConfig] when the document is not
      #   a mapping.
      def load_config
        parsed = Psych.load_file(config_file_path) || {}
        unless parsed.respond_to?(:deep_symbolize_keys)
          raise invalid_config("The configuration must be a mapping with a top-level 'ingest' key, but a #{parsed.class} was found instead.")
        end

        parsed.deep_symbolize_keys
      end

      # Validates the configuration.
      # @raise [Hyrax::Ingest::Errors::InvalidConfig] When the configuration is invalid.
      def validate!
        validate_top_level_key!
        validate_ingester_configs_array!
        validate_ingester_config_hashes!
      end

      # @raise [Hyrax::Ingest::Errors::InvalidConfig] When the top level
      #   'ingest' key is missing.
      def validate_top_level_key!
        raise invalid_config("Top-level key 'ingest' is missing.") unless config[:ingest]
      end

      # @raise [Hyrax::Ingest::Errors::InvalidConfig] When the value under
      #   'ingest' cannot be iterated.
      def validate_ingester_configs_array!
        return if config[:ingest].respond_to?(:each)

        raise invalid_config("Value under top-level 'ingest' key must be an array containing the configuration for each ingester you want to use.")
      end

      # @raise [Hyrax::Ingest::Errors::InvalidConfig] When an entry under
      #   'ingest' does not respond to #keys.
      def validate_ingester_config_hashes!
        config[:ingest].each do |ingest_config|
          next if ingest_config.respond_to?(:keys)

          raise invalid_config("Each ingester configuration must be a single key-value pair, where the key is the type of ingester, and the value is a hash containing the configuration for the ingester. But a #{ingest_config.class} was found instead.")
        end
      end

      # Builds an InvalidConfig error for this configuration file.
      def invalid_config(message)
        Hyrax::Ingest::Errors::InvalidConfig.new(config_file_path, message)
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module Hyrax
  module Ingest
    # Rails engine class for this plugin. The class body is empty: it adds
    # nothing beyond what it inherits from ::Rails::Engine.
    class Engine < ::Rails::Engine; end
  end
end
@@ -0,0 +1,186 @@
1
module Hyrax
  module Ingest
    # Base class that allows for rescuing from all Hyrax::Ingest errors.
    # Intentionally no-op.
    class Error < StandardError; end

    # Specific error classes. Each one builds its own message from the
    # arguments given to its constructor.
    module Errors
      # A path that is not a valid SIP path.
      class InvalidSIPPath < Hyrax::Ingest::Error
        # @param invalid_path [#to_s] the offending path.
        def initialize(invalid_path)
          super("#{invalid_path} is not a valid SIP path")
        end
      end

      # No SIP was specified.
      class NoSIPSpecified < Hyrax::Ingest::Error
        # @param _obj [Object] accepted for backward compatibility; unused.
        def initialize(_obj = nil)
          super("No SIP was specified.")
        end
      end

      # No shared SIP was specified.
      class NoSharedSIPSpecified < Hyrax::Ingest::Error
        def initialize
          super("No shared SIP was specified.")
        end
      end

      # An object was supplied where a Hyrax::Ingest::SIP was expected.
      class InvalidSIP < Hyrax::Ingest::Error
        # @param invalid_sip_obj [Object] the object found; its class is reported.
        def initialize(invalid_sip_obj)
          super("Invalid SIP object. SIP must be an instance of Hyrax::Ingest::SIP (or an instance of a subclass) but an instance of #{invalid_sip_obj.class} was found.")
        end
      end

      # An ActiveFedora model type name that is not known.
      class UnknownActiveFedoraModel < Hyrax::Ingest::Error
        # @param af_model_class_name [#to_s]
        def initialize(af_model_class_name)
          super("Unknown ActiveFedora model type '#{af_model_class_name}'")
        end
      end

      # No file in the SIP matched the given string or regular expression.
      class FileNotFoundInSIP < Hyrax::Ingest::Error
        # @param sip_path [#to_s] path of the SIP that was searched.
        # @param string_or_regexp [String, Regexp] the matcher; reported via #inspect.
        def initialize(sip_path, string_or_regexp)
          super("No file matching #{string_or_regexp.inspect} was found in the SIP at path '#{sip_path}'")
        end
      end

      # An assigner class name that matched no available class.
      class UnknownAssignerClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param available_classes [Array, Object] coerced with Array().
        def initialize(class_name, available_classes)
          super("Assigner class '#{class_name}' not found. Available assigner classes are: #{Array(available_classes).join(', ')}")
        end
      end

      # An assigner class name that matched more than one class.
      class AmbiguousAssignerClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param matching_classes [Array, Object] coerced with Array().
        def initialize(class_name, matching_classes)
          super("Assigner class name '#{class_name}' is ambiguous; could mean any one of the following classes: #{Array(matching_classes).join(',')}. Please use namespaces in the class name to be more specific.")
        end
      end

      # A fetcher class name that matched no available class.
      class UnknownFetcherClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param available_classes [Array, Object] coerced with Array().
        def initialize(class_name, available_classes)
          # The list holds fetcher classes; the message previously called
          # them "transformer classes".
          super("Fetcher class '#{class_name}' not found. Available fetcher classes are: #{Array(available_classes).join(', ')}")
        end
      end

      # A fetcher class name that matched more than one class.
      class AmbiguousFetcherClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param matching_classes [Array, Object] coerced with Array().
        def initialize(class_name, matching_classes)
          super("Fetcher class name '#{class_name}' is ambiguous; could mean any one of the following classes: #{Array(matching_classes).join(',')}. Please use namespaces in the class name to be more specific.")
        end
      end

      # A fetcher that does not extend Hyrax::Ingest::Fetcher::Base.
      class InvalidFetcher < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        def initialize(class_name)
          super("Invalid fetcher '#{class_name}'; Fetcher objects must extend Hyrax::Ingest::Fetcher::Base.")
        end
      end

      # One or more config options are missing.
      class MissingConfigOptions < Hyrax::Ingest::Error
        # @param config_options [Array, Object] coerced with Array().
        def initialize(config_options)
          super("Missing config options: '#{Array(config_options).join(', ')}'")
        end
      end

      # An ingester class name that matched no available class.
      class UnknownIngesterClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param available_classes [Array, Object] coerced with Array().
        def initialize(class_name, available_classes)
          super("Ingester class '#{class_name}' not found. Available ingester classes are: #{Array(available_classes).join(', ')}")
        end
      end

      # An ingester class name that matched more than one class.
      class AmbiguousIngesterClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param matching_classes [Array, Object] coerced with Array().
        def initialize(class_name, matching_classes)
          super("Ingester class name '#{class_name}' is ambiguous; could mean any one of the following classes: #{Array(matching_classes).join(',')}. Please use namespaces in the class name to be more specific.")
        end
      end

      # A record lookup that found nothing.
      class RecordNotFound < Hyrax::Ingest::Error
        # @param model_class_name [#to_s]
        # @param where_clause [#to_s] description of the lookup conditions.
        def initialize(model_class_name, where_clause)
          super("Record of type '#{model_class_name}' could not be found where #{where_clause}.")
        end
      end

      # Fetch options from which a single class could not be chosen.
      # NOTE(review): the message says "transformer class" although this error
      # concerns fetch options -- confirm whether "fetcher class" was intended.
      class AmbiguousFetchOptions < Hyrax::Ingest::Error
        # @param ambiguous_options [Array, Object] coerced with Array().
        def initialize(ambiguous_options)
          super("Could not determine which transformer class to use given the following options: #{Array(ambiguous_options).join(', ')}")
        end
      end

      # Required fetch options that were not supplied.
      # NOTE(review): the message says "assignment options" -- confirm whether
      # "fetch options" was intended.
      class MissingRequiredFetchOptions < Hyrax::Ingest::Error
        # @param missing_options [Array, Object] coerced with Array().
        def initialize(missing_options)
          super("Missing required assignment options: #{Array(missing_options).join(', ')}")
        end
      end

      # Fetch options that are not valid.
      class InvalidFetchOptions < Hyrax::Ingest::Error
        # @param invalid_options [Array, Object] coerced with Array().
        def initialize(invalid_options)
          super("Invalid fetch options: #{Array(invalid_options).join(', ')}")
        end
      end

      # An invalid configuration file, with an optional detail message on a
      # second line.
      class InvalidConfig < Hyrax::Ingest::Error
        # @param config_file_path [#to_s]
        # @param msg [#to_s, nil] optional detail appended on its own line.
        def initialize(config_file_path, msg=nil)
          # Drop an absent/empty detail so the message has no trailing newline.
          lines = ["Invalid configuration in '#{config_file_path}'.", msg.to_s].reject(&:empty?)
          super(lines.join("\n"))
        end
      end

      # An ingester class that does not extend Hyrax::Ingest::Ingester::Base.
      class InvalidIngesterClass < Hyrax::Ingest::Error
        # @param invalid_class [#to_s]
        def initialize(invalid_class)
          super("Invalid ingester class #{invalid_class} does not extend Hyrax::Ingest::Ingester::Base")
        end
      end

      # A transformer class name that matched no available class.
      class UnknownTransformerClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param available_classes [Array, Object] coerced with Array().
        def initialize(class_name, available_classes)
          super("Transformer class '#{class_name}' not found. Available transformer classes are: #{Array(available_classes).join(', ')}")
        end
      end

      # A transformer class name that matched more than one class.
      class AmbiguousTransformerClass < Hyrax::Ingest::Error
        # @param class_name [#to_s]
        # @param matching_classes [Array, Object] coerced with Array().
        def initialize(class_name, matching_classes)
          super("Transformer class name '#{class_name}' is ambiguous; could mean any one of the following classes: #{Array(matching_classes).join(',')}. Please use namespaces in the class name to be more specific.")
        end
      end

      # No ingest config file exists at the given path.
      class NoConfigFileFound < Hyrax::Ingest::Error
        # @param path [#to_s]
        def initialize(path)
          super("No ingest config file exists at '#{path}'.")
        end
      end

      # An RDF predicate that is unknown for the given ActiveFedora model.
      class UnknownRdfPredicate < Hyrax::Ingest::Error
        # @param unknown_rdf_predicate [#to_s]
        # @param af_model_class_name [#to_s]
        def initialize(unknown_rdf_predicate, af_model_class_name)
          super("Unknown RDF Predicate '#{unknown_rdf_predicate}' for ActiveFedora model '#{af_model_class_name}'")
        end
      end

      # A transform option that is not recognized.
      class UnrecognizedTransformOption < Hyrax::Ingest::Error
        # @param unrecognized_transform_option [#to_s]
        def initialize(unrecognized_transform_option)
          super("Unknown transform option: '#{unrecognized_transform_option}'")
        end
      end

      # An ingest report that cannot be printed from the given source.
      class UnableToPrintIngestReport < Hyrax::Ingest::Error
        # @param unable_to_read_from_this_thing [#to_s]
        def initialize(unable_to_read_from_this_thing)
          super("Cannot print ingest report from #{unable_to_read_from_this_thing}. To print a report, it must be written to a file.")
        end
      end

      # Reporting config options that are not valid.
      class InvalidReportConfigOptions < Hyrax::Ingest::Error
        # @param invalid_options [Array, Object] coerced with Array().
        def initialize(invalid_options)
          super("Invalid reporting config options: #{Array(invalid_options).join(', ')}")
        end
      end

      # A CSV column header that is unknown.
      class MissingCsvColumn < Hyrax::Ingest::Error
        # @param invalid_column [#to_s]
        def initialize(invalid_column)
          super("Unknown column header for: #{invalid_column}.")
        end
      end

      # Generic configuration error; carries whatever message the raiser gives.
      class ConfigurationError < Hyrax::Ingest::Error; end

      # A value that could not be assigned to an ActiveFedora property.
      class InvalidActiveFedoraPropertyValue < Hyrax::Ingest::Error
        # @param value [#to_s]
        # @param property_name [#to_s]
        # @param rdf_predicate [#to_s]
        def initialize(value, property_name, rdf_predicate)
          super("Could not assign '#{value}' to property #{property_name} (with RDF predicate '#{rdf_predicate}')")
        end
      end
    end
  end
end