hyrax-ingest 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +52 -0
- data/config/routes.rb +2 -0
- data/lib/hyrax/ingest.rb +12 -0
- data/lib/hyrax/ingest/batch_runner.rb +130 -0
- data/lib/hyrax/ingest/configuration.rb +54 -0
- data/lib/hyrax/ingest/engine.rb +6 -0
- data/lib/hyrax/ingest/errors.rb +186 -0
- data/lib/hyrax/ingest/fetcher.rb +55 -0
- data/lib/hyrax/ingest/fetcher/base.rb +78 -0
- data/lib/hyrax/ingest/fetcher/csv_file.rb +89 -0
- data/lib/hyrax/ingest/fetcher/date_time.rb +15 -0
- data/lib/hyrax/ingest/fetcher/literal.rb +24 -0
- data/lib/hyrax/ingest/fetcher/premis_event_type.rb +28 -0
- data/lib/hyrax/ingest/fetcher/rdf_uri.rb +21 -0
- data/lib/hyrax/ingest/fetcher/xml_file.rb +54 -0
- data/lib/hyrax/ingest/fetcher/yaml_file.rb +36 -0
- data/lib/hyrax/ingest/has_depositor.rb +13 -0
- data/lib/hyrax/ingest/has_iteration.rb +18 -0
- data/lib/hyrax/ingest/has_logger.rb +29 -0
- data/lib/hyrax/ingest/has_report.rb +17 -0
- data/lib/hyrax/ingest/has_shared_sip.rb +20 -0
- data/lib/hyrax/ingest/has_sip.rb +20 -0
- data/lib/hyrax/ingest/ingester.rb +75 -0
- data/lib/hyrax/ingest/ingester/active_fedora_base_ingester.rb +136 -0
- data/lib/hyrax/ingest/ingester/active_fedora_file_ingester.rb +17 -0
- data/lib/hyrax/ingest/ingester/active_fedora_property_assigner.rb +67 -0
- data/lib/hyrax/ingest/ingester/base.rb +28 -0
- data/lib/hyrax/ingest/ingester/file_set_ingester.rb +68 -0
- data/lib/hyrax/ingest/ingester/preservation_event_ingester.rb +27 -0
- data/lib/hyrax/ingest/ingester/work_ingester.rb +55 -0
- data/lib/hyrax/ingest/reporting.rb +13 -0
- data/lib/hyrax/ingest/reporting/configuration.rb +22 -0
- data/lib/hyrax/ingest/reporting/report.rb +79 -0
- data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.html.erb +77 -0
- data/lib/hyrax/ingest/reporting/views/hyrax_ingest_report.txt.erb +21 -0
- data/lib/hyrax/ingest/runner.rb +103 -0
- data/lib/hyrax/ingest/sip.rb +92 -0
- data/lib/hyrax/ingest/transformer.rb +42 -0
- data/lib/hyrax/ingest/transformer/base.rb +12 -0
- data/lib/hyrax/ingest/transformer/to_date.rb +33 -0
- data/lib/hyrax/ingest/version.rb +5 -0
- data/lib/tasks/ingest_tasks.rake +22 -0
- metadata +330 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/xml_file'
|
2
|
+
require 'hyrax/ingest/fetcher/yaml_file'
|
3
|
+
require 'hyrax/ingest/fetcher/csv_file'
|
4
|
+
require 'hyrax/ingest/fetcher/literal'
|
5
|
+
require 'hyrax/ingest/fetcher/premis_event_type'
|
6
|
+
require 'hyrax/ingest/fetcher/rdf_uri'
|
7
|
+
require 'hyrax/ingest/fetcher/date_time'
|
8
|
+
require 'hyrax/ingest/errors'
|
9
|
+
|
10
|
+
module Hyrax
  module Ingest
    # Registry and factory for the fetcher classes.
    module Fetcher
      class << self
        # @return [Set<Class>] The set of all fetcher classes.
        def all_classes
          @all_classes ||= Set.new.tap do |all_classes|
            all_classes << Hyrax::Ingest::Fetcher::XMLFile
            all_classes << Hyrax::Ingest::Fetcher::YAMLFile
            all_classes << Hyrax::Ingest::Fetcher::CSVFile
            all_classes << Hyrax::Ingest::Fetcher::Literal
            all_classes << Hyrax::Ingest::Fetcher::PremisEventType
            all_classes << Hyrax::Ingest::Fetcher::RdfUri
            all_classes << Hyrax::Ingest::Fetcher::DateTime
          end
        end

        # Builds a fetcher instance from a class name and constructor options.
        #
        # @param [String] class_name The fetcher class name, with or without
        #   namespaces (see #find_class_by_name).
        # @param [Hash, nil] options The options to pass to the constructor of
        #   the fetcher class. A nil value is treated as an empty hash.
        # @raise [Hyrax::Ingest::Errors::UnknownFetcherClass] See #find_class_by_name.
        # @raise [Hyrax::Ingest::Errors::AmbiguousFetcherClass] See #find_class_by_name.
        # @return An instance of the fetcher class.
        def factory(class_name, options={})
          find_class_by_name(class_name).new(options || {})
        end

        # Looks up a fetcher class by its full name or by a trailing portion
        # of its namespaced name (e.g. "XMLFile" or "Fetcher::XMLFile").
        #
        # @param [String, Symbol] class_name The stringified class name, with
        #   or without namespaces.
        # @raise [Hyrax::Ingest::Errors::UnknownFetcherClass] When there is no
        #   corresponding fetcher class for the given value of the `class_name`
        #   param.
        # @raise [Hyrax::Ingest::Errors::AmbiguousFetcherClass] When the value
        #   of `class_name` param is insufficient in determining a fetcher
        #   class.
        # @return [Class] The appropriate fetcher class.
        def find_class_by_name(class_name)
          requested = class_name.to_s
          # Compare plain strings at a namespace boundary instead of building
          # a regex from the name: a prefix such as "XML", a namespace
          # fragment such as "Ingest", or a regex metacharacter must not match.
          namespaced_suffix = "::#{requested}"
          found_classes = all_classes.select do |class_const|
            full_name = class_const.to_s
            full_name == requested || full_name.end_with?(namespaced_suffix)
          end
          raise Hyrax::Ingest::Errors::UnknownFetcherClass.new(class_name, all_classes) if found_classes.empty?
          raise Hyrax::Ingest::Errors::AmbiguousFetcherClass.new(class_name, found_classes) if found_classes.count > 1
          found_classes.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'hyrax/ingest/errors'
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
require 'interloper'
|
4
|
+
require 'hyrax/ingest/has_logger'
|
5
|
+
require 'hyrax/ingest/has_report'
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Parent class of the fetchers. It holds the `required` flag and, via an
      # after-callback on #fetch, logs and records required values that came
      # back empty.
      class Base
        include Interloper
        include HasReport
        include HasLogger

        # Callback to log missing values that have been marked as required.
        after(:fetch) do
          # Use a flag to only report the missing value once.
          unless @after_fetch_run_once
            report_missing_required_value if required? && fetched_value_is_empty?
            @after_fetch_run_once = true
          end
        end

        # @param [Hash] options
        # @option options [Boolean] :required Whether an empty fetched value
        #   should be reported as missing.
        def initialize(options={})
          # Read the flag instead of deleting it so the caller's hash is left
          # untouched and can be reused to build further fetchers.
          @required = options[:required]
        end

        # Boolean reader for @required.
        def required?; !!@required; end

        # Subclasses should override this method with the logic required to
        # fetch values from a SIP. The overridden method should set
        # @fetched_value, and return it.
        # @see Hyrax::Ingest::Fetcher::XMLFile#fetch
        # @abstract
        def fetch
          @fetched_value
        end

        protected

          # Determines whether the value that was fetched by #fetch is empty
          # or not. By default, the fetched value is empty if it is nil, an
          # empty string, an empty array, an empty hash, or an array
          # containing any combination of those. Override this method in
          # subclasses to change the definition of empty in those contexts.
          # The return value is used for reporting which values are missing,
          # but required.
          # @return [Boolean] True if @fetched_value is considered to be empty; false otherwise.
          def fetched_value_is_empty?
            Array(@fetched_value).all? do |val|
              val.nil? || (val.respond_to?(:empty?) && val.empty?)
            end
          end

          # Reports occurrences of missing required values by logging a
          # warning and appending `params` to the report's
          # :missing_required_values stat for this class.
          # Subclasses should override this method to provide further detail
          # by passing an options hash that will be available for reporting.
          # @see Hyrax::Ingest::Fetcher::XMLFile#report_missing_required_value
          # @param [Hash] params Extra detail to log and record.
          # @example
          #   # Provide additional info to the report.
          #   def report_missing_required_value
          #     super(foo: "bar")
          #   end
          def report_missing_required_value(params={})
            # Strip the namespaces so the log line only shows the class name.
            short_class_name = self.class.to_s.sub(/.*::/, '')
            logger.warn "Missing required value from #{short_class_name} with params = #{params}"
            (report.stat[:missing_required_values][self.class] ||= []) << params
          end
      end
    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'hyrax/ingest/has_iteration'
|
4
|
+
require 'roo'
|
5
|
+
|
6
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetches the value of a single cell from a CSV file found in the SIP.
      #
      # The cell is addressed by the :column option (a header name or a column
      # number) and the :row option (a row number or the keyword 'next', which
      # resolves to the current iteration plus one). The resolved row is then
      # shifted by :row_offset, which falls back to :header_row_number.
      #
      # NOTE(review): unlike the Literal, PremisEventType and YAMLFile
      # fetchers, this class does not declare `inherit_callbacks_for :fetch`;
      # confirm whether the required-value callback defined on Base is
      # expected to run for CSV fetchers.
      class CSVFile < Base
        # :row_offset is intentionally not listed here; the private
        # #row_offset method below supplies it together with its fallback.
        attr_reader :filename, :column, :row, :header_row_number

        include HasSIP
        include HasIteration

        # @param [Hash] options
        # @option options [String] :filename Name of the CSV file within the SIP.
        # @option options [String, Integer] :column Header name or column number.
        # @option options [String, Integer] :row Row number, or the keyword 'next'.
        # @option options [Integer] :header_row_number (1) Row holding the headers.
        # @option options [Integer] :row_offset Amount added to the resolved
        #   row; falls back to :header_row_number when not given.
        # @raise [ArgumentError] When :filename, :column or :row is missing.
        def initialize(options={})
          raise ArgumentError, "Required option :filename is missing" unless options.key?(:filename)
          raise ArgumentError, "Required option :column is missing" unless options.key?(:column)
          raise ArgumentError, "Required option :row is missing" unless options.key?(:row)

          @filename = options.fetch(:filename)
          @column = options.fetch(:column, '').to_s.strip.downcase
          @row = options.fetch(:row, '').to_s.strip.downcase
          @header_row_number = options.fetch(:header_row_number, 1)
          @row_offset = options[:row_offset]
          super
        end

        # @return The value of the addressed cell, memoized once it is truthy.
        def fetch
          @fetched_value ||= cell_value
        end

        private

          # @return [Roo::CSV] Memoized reader for the CSV file in the SIP.
          def roo
            @roo ||= Roo::CSV.new(sip.find_file_path(filename))
          end

          def cell_value
            roo.cell(row_number, column_number)
          end

          # @return The column number from which to retrieve the cell.
          # @raise [ArgumentError] When :column is neither a known header nor
          #   a number.
          def column_number
            @column_number ||= column_number_from_header || specific_column_number
            # TODO: custom error
            raise ArgumentError, "Value for column: option must be a number or a column header; '#{column}' was given." if @column_number.nil?
            @column_number
          end

          # @return [Integer, nil] 1-based position of the header equal to
          #   :column, or nil when no header matches.
          def column_number_from_header
            header_index = headers.index(column)
            header_index + 1 if header_index
          end

          def specific_column_number
            column.to_i if string_is_integer?(column)
          end

          # @return [Integer] The row number from which to retrieve the cell,
          #   including the row offset.
          # @raise [ArgumentError] When :row is neither a number nor 'next'.
          def row_number
            @row_number ||= next_row || specific_row_number
            # TODO: custom error
            raise ArgumentError, "Value for row: option must be a number or the keyword 'next'; #{@row} was given." if @row_number.nil?
            @row_number + row_offset
          end

          def next_row
            iteration + 1 if @row == 'next'
          end

          def specific_row_number
            row.to_i if string_is_integer?(row)
          end

          def row_offset
            @row_offset || header_row_number
          end

          # @return [Array<String>] Downcased header cells.
          def headers
            @headers ||= roo.row(header_row_number).map(&:to_s).map(&:downcase)
          end

          # @param [String] str
          # @return [Boolean] True when `str`, ignoring leading zeros, is the
          #   decimal form of an integer.
          def string_is_integer?(str)
            # Work on a copy: stripping the zeros in place would rewrite
            # @column/@row and distort the values shown in error messages.
            without_leading_zeros = str.to_s.sub(/\A0+/, '')
            without_leading_zeros.to_i.to_s == without_leading_zeros
          end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'hyrax/ingest/errors'
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
|
4
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that returns the literal value it was configured with.
      class Literal < Base
        attr_reader :literal_value

        inherit_callbacks_for :fetch

        # @param [Hash, Object] options Either an options hash holding the
        #   literal under :value, or the literal value itself (any non-Hash).
        def initialize(options={})
          options = { value: options } unless options.is_a? Hash
          # Read instead of delete so a hash supplied by the caller is not
          # modified and can be reused to build further fetchers.
          @literal_value = options[:value]
          super
        end

        # @return The configured literal value.
        def fetch
          @fetched_value = literal_value
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This class adheres to the Hyrax::Ingest::Fetcher::Base interface to work with other classes within the Hyrax::Ingest
|
2
|
+
# gem. It effectively maps PREMIS Event Type abbreviations to the URI that is then associated with a
|
3
|
+
# Hyrax::Preservation::Event model.
|
4
|
+
|
5
|
+
require 'hyrax/ingest/errors'
|
6
|
+
require 'hyrax/ingest/sip'
|
7
|
+
|
8
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that resolves a PREMIS event type abbreviation to the URI of
      # the matching Hyrax::Preservation::PremisEventType.
      class PremisEventType < Base
        inherit_callbacks_for :fetch

        attr_reader :abbr

        # @param [Hash, Object] options Either an options hash holding the
        #   abbreviation under :abbr, or the abbreviation itself (any non-Hash).
        def initialize(options={})
          options = { abbr: options } unless options.is_a? Hash
          # Read instead of delete so a hash supplied by the caller is not
          # modified and can be reused to build further fetchers.
          @abbr = options[:abbr]
          super
        end

        # @return The `uri` of the event type found for the abbreviation,
        #   memoized once it is truthy.
        def fetch
          @fetched_value ||= Hyrax::Preservation::PremisEventType.find_by_abbr(@abbr).uri
        end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rdf'
|
2
|
+
|
3
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that wraps a configured URI string in an RDF::URI.
      #
      # NOTE(review): unlike the Literal, PremisEventType and YAMLFile
      # fetchers, this class does not declare `inherit_callbacks_for :fetch`;
      # confirm whether the required-value callback defined on Base is
      # expected to run for RDF URI fetchers.
      class RdfUri < Base
        attr_reader :uri_str

        # @param [Hash, Object] options Either an options hash holding the URI
        #   string under :uri_str, or the URI string itself (any non-Hash).
        def initialize(options={})
          options = { uri_str: options } unless options.is_a? Hash
          # Read instead of delete so a hash supplied by the caller is not
          # modified and can be reused to build further fetchers.
          @uri_str = options[:uri_str]
          super
        end

        # @return [RDF::URI] The URI built from `uri_str`, memoized.
        def fetch
          @fetched_value ||= ::RDF::URI.new(uri_str)
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'hyrax/ingest/reporting'
|
5
|
+
require 'interloper'
|
6
|
+
|
7
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that evaluates an XPath expression against an XML file from
      # the SIP and returns the text of every matching node.
      class XMLFile < Base
        include HasSIP

        attr_reader :filename, :xpath, :default, :fetched_value

        inherit_callbacks_after :fetch

        # @param [Hash] options
        # @option options [String] :filename Name of the XML file within the SIP.
        # @option options [String] :xpath XPath expression to evaluate.
        # @option options :default ([]) Value returned when nothing matches.
        # @raise [ArgumentError] When :filename or :xpath is missing.
        def initialize(options={})
          unless options.key?(:filename)
            raise ArgumentError, "Required option :filename is missing"
          end
          unless options.key?(:xpath)
            raise ArgumentError, "Required option :xpath is missing"
          end

          @filename = options[:filename]
          @xpath = options[:xpath]
          @default = options[:default] || []
          super
        end

        # Overrides Hyrax::Ingest::Fetcher::Base#fetch
        # @return [Array<String>, Object] The text of each node matched by the
        #   XPath, or the configured default when no node matches.
        def fetch
          return @fetched_value if @fetched_value

          node_texts = noko.xpath(xpath).map(&:text)
          @fetched_value = node_texts.empty? ? default : node_texts
        end

        protected

          # Overrides
          # Hyrax::Ingest::Fetcher::Base#report_missing_required_value,
          # passing along the filename and xpath so they are available to the
          # report.
          def report_missing_required_value
            super(filename: filename, xpath: xpath)
          end

          # @return The parsed XML document with namespaces removed, memoized.
          def noko
            @noko ||= begin
              document = Nokogiri::XML(sip.read_file(filename))
              # TODO: allow using namespaces instead of blindly removing them.
              document.remove_namespaces!
              document
            end
          end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'hyrax/ingest/fetcher/base'
|
2
|
+
require 'hyrax/ingest/has_sip'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
module Hyrax
  module Ingest
    module Fetcher
      # Fetcher that reads a YAML file from the SIP and digs out the value
      # found at the configured path.
      class YAMLFile < Base
        include HasSIP

        inherit_callbacks_for :fetch

        attr_reader :filename, :yaml_path

        # @param [Hash] options
        # @option options [String] :filename Name of the YAML file within the SIP.
        # @option options [Array, Object] :yaml_path Key, or list of keys,
        #   passed to #dig on the parsed document.
        # @raise [ArgumentError] When :filename or :yaml_path is missing.
        def initialize(options={})
          raise ArgumentError, "Required option :filename is missing" unless options.key?(:filename)
          raise ArgumentError, "Required option :yaml_path is missing" unless options.key?(:yaml_path)
          @filename = options[:filename]
          @yaml_path = options[:yaml_path]
          super
        end

        # @return The value found at `yaml_path`, or nil when the path does
        #   not exist in the document. Memoized once it is truthy.
        def fetch
          @fetched_value ||= value_at_yaml_path
        end

        private

          # Digs `yaml_path` out of the parsed document. A document that
          # cannot be dug into (an empty file, or a bare scalar) has no value
          # at any path, so it yields nil like any other missing key instead
          # of raising NoMethodError.
          def value_at_yaml_path
            document = yaml
            document.dig(*yaml_path) if document.respond_to?(:dig)
          end

          # @return [Hash, Array, String] The YAML parsed into Ruby objects.
          def yaml
            # NOTE(review): the file content comes from the SIP. On Psych
            # versions older than 4, YAML.load can instantiate arbitrary Ruby
            # objects; consider YAML.safe_load if SIPs may be untrusted.
            @yaml ||= YAML.load sip.read_file(filename)
          end
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Simple interface for injecting a depositor dependency.
|
2
|
+
require 'hyrax/ingest/sip'
|
3
|
+
require 'hyrax/ingest/errors'
|
4
|
+
|
5
|
+
module Hyrax
  module Ingest
    # Mixin that gives an object a readable and writable `depositor`
    # attribute.
    module HasDepositor
      # Declared directly on the module rather than from an `included` hook.
      # Inside the hook, `attr_accessor` ran against HasDepositor itself (not
      # the including class), so the accessors only came into existence after
      # the first `include` and were absent for objects that `extend` the
      # module.
      attr_accessor :depositor
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Simple interface for tracking the current iteration number.
|
2
|
+
|
3
|
+
require 'hyrax/ingest/sip'
|
4
|
+
require 'hyrax/ingest/errors'
|
5
|
+
|
6
|
+
module Hyrax
  module Ingest
    # Mixin that stores an integer iteration counter on the including object.
    module HasIteration
      # @return [Integer] The stored iteration; 0 when none has been assigned.
      def iteration
        @iteration = 0 unless @iteration
        @iteration
      end

      # Stores the given value after converting it with #to_i.
      # @param iteration Any object responding to #to_i.
      def iteration=(iteration)
        as_integer = iteration.to_i
        @iteration = as_integer
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'logger'
|
2
|
+
|
3
|
+
module Hyrax
  module Ingest
    # Mixin that gives the including object a replaceable logger, built on
    # first use from the module-wide configuration.
    module HasLogger
      # Module-wide logging defaults.
      class Configuration
        # Destination handed to Logger.new for loggers created by default.
        attr_accessor :default_log_to

        def initialize
          @default_log_to = STDOUT
        end
      end

      # @return [Configuration] The memoized, module-wide configuration.
      def self.config
        @config ||= Configuration.new
      end

      # @return [Logger] The assigned logger, or one created on first use
      #   that writes to the configured default destination.
      def logger
        return @logger if @logger

        @logger = Logger.new(HasLogger.config.default_log_to)
      end

      # @param [Logger] logger The logger to use from now on.
      # @raise [Hyrax::Ingest::Errors::InvalidLogger] When the given object
      #   is not a Logger.
      def logger=(logger)
        valid = logger.is_a? Logger
        raise Hyrax::Ingest::Errors::InvalidLogger unless valid

        @logger = logger
      end
    end
  end
end
|