omelette 0.0.1a
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.travis.yml +6 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +75 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/omelette +5 -0
- data/lib/omelette.rb +10 -0
- data/lib/omelette/command_line.rb +10 -0
- data/lib/omelette/importer.rb +284 -0
- data/lib/omelette/importer/context.rb +42 -0
- data/lib/omelette/importer/errors.rb +40 -0
- data/lib/omelette/importer/settings.rb +106 -0
- data/lib/omelette/importer/steps.rb +64 -0
- data/lib/omelette/macros/xpath.rb +14 -0
- data/lib/omelette/null_writer.rb +20 -0
- data/lib/omelette/omeka_json_writer.rb +9 -0
- data/lib/omelette/thread_pool.rb +161 -0
- data/lib/omelette/util.rb +146 -0
- data/lib/omelette/version.rb +3 -0
- data/lib/omelette/xml_reader.rb +27 -0
- data/omelette.gemspec +36 -0
- metadata +213 -0
data/lib/omelette/importer/context.rb
@@ -0,0 +1,42 @@
+# Represents the context of a specific item being imported, passed
+# to importing logic blocks
+#
+class Omelette::Importer
+  class Context
+    def initialize(hash_init = {})
+      # TODO, argument checking for required args?
+
+      self.clipboard = {}
+      self.output_hash = {}
+
+      hash_init.each_pair do |key, value|
+        self.send("#{key}=", value)
+      end
+
+      @skip = false
+    end
+
+    attr_accessor :clipboard, :output_hash, :logger
+    attr_accessor :import_step, :source_item, :settings, :source_item_id
+    # 1-based position in stream of processed records.
+    attr_accessor :position
+
+    # Should we be skipping this record?
+    attr_accessor :skipmessage
+
+    # Set the fact that this record should be skipped, with an
+    # optional message
+    def skip!(msg = '(no message given)')
+      @skipmessage = msg
+      @skip = true
+    end
+
+    # Should we skip this record?
+    def skip?
+      @skip
+    end
+
+  end
+
+
+end
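
A minimal sketch of how a Context is meant to be used from importing logic. The item id and the values below are placeholders, not part of the gem; it assumes `require 'omelette'` loads this file.

    require 'omelette'

    # The importer builds a context like this internally:
    # each hash_init entry becomes a setter call.
    context = Omelette::Importer::Context.new(
      source_item:    nil,        # placeholder for a parsed source record
      source_item_id: 'item-1',   # hypothetical identifier
      position:       1           # 1-based position in the stream
    )

    context.clipboard[:note] = 'scratch space shared between steps'
    context.output_hash['Title'] = ['Example title']

    context.skip! 'missing required field'  # mark the record to be dropped
    context.skip?        # => true
    context.skipmessage  # => "missing required field"
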
data/lib/omelette/importer/errors.rb
@@ -0,0 +1,40 @@
+class Omelette::Importer
+  # Arity error on a passed block
+  class ArityError < ArgumentError;
+  end
+  class NamingError < ArgumentError;
+  end
+
+  # Raised by #load_config_file when config file can not
+  # be processed.
+  #
+  # The exception #message includes an error message formatted
+  # for good display to the developer, in the console.
+  #
+  # Original exception raised when processing config file
+  # can be found in #original. Original exception should ordinarily
+  # have a good stack trace, including the file path of the config
+  # file in question.
+  #
+  # Original config path in #config_file, and line number in config
+  # file that triggered the exception in #config_file_lineno (may be nil)
+  #
+  # A filtered backtrace just DOWN from config file (not including trace
+  # from omelette loading config file itself) can be found in
+  # #config_file_backtrace
+  class ConfigLoadError < StandardError
+    # We'd have #cause in ruby 2.1, filled out for us, but we want
+    # to work before then, so we use our own 'original'
+    attr_reader :original, :config_file, :config_file_lineno, :config_file_backtrace
+
+    def initialize(config_file_path, original_exception)
+      @original = original_exception
+      @config_file = config_file_path
+      @config_file_lineno = Omelette::Util.backtrace_lineno_for_config(config_file_path, original_exception)
+      @config_file_backtrace = Omelette::Util.backtrace_from_config(config_file_path, original_exception)
+      message = "Error loading configuration file #{self.config_file}:#{self.config_file_lineno} #{original_exception.class}:#{original_exception.message}"
+
+      super(message)
+    end
+  end
+end
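
A hedged sketch of consuming ConfigLoadError. The config path and the wrapped error are stand-ins; in real use the importer's own #load_config_file (referenced in the comment above, not shown in this diff) raises this for you.

    require 'omelette'

    begin
      begin
        raise 'something went wrong inside a config file'  # stand-in for the real failure
      rescue => original
        raise Omelette::Importer::ConfigLoadError.new('import_config.rb', original)
      end
    rescue Omelette::Importer::ConfigLoadError => e
      puts e.message            # console-friendly summary built in #initialize
      puts e.config_file        # => "import_config.rb"
      p    e.config_file_lineno # may be nil if the path isn't in the backtrace
      p    e.original           # the wrapped RuntimeError
    end
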
data/lib/omelette/importer/settings.rb
@@ -0,0 +1,106 @@
+require 'hashie'
+require 'concurrent'
+
+class Omelette::Importer
+
+  # A Hash of settings for an Omelette::Importer, which also ends up passed along
+  # to other objects Omelette::Importer interacts with.
+  #
+  # Enhanced with a few features from Hashie, to make it, for
+  # instance, string/symbol indifferent
+  #
+  # method #provide(key, value) is added, to do like settings[key] ||= value,
+  # set only if not already set (but unlike ||=, nil or false can count as already set)
+  #
+  # Also has an interesting 'defaults' system, meant to play along
+  # with configuration file 'provide' statements. There is a built-in hash of
+  # defaults, which will be lazily filled in if accessed and not yet
+  # set. (nil can count as set, though!). If they haven't been lazily
+  # set yet, then #provide will still fill them in. But you can also call
+  # fill_in_defaults! to fill all defaults in, if you know configuration
+  # files have all been loaded, and want to fill them in for inspection.
+  class Settings < Hash
+    include Hashie::Extensions::MergeInitializer # can init with hash
+    include Hashie::Extensions::IndifferentAccess
+
+    def initialize(*args)
+      super
+      self.default_proc = lambda do |hash, key|
+        if self.class.defaults.has_key?(key)
+          return hash[key] = self.class.defaults[key]
+        else
+          return nil
+        end
+      end
+    end
+
+    # a cautious store, which only saves key=value if
+    # there was not already a value for #key. Can be used
+    # to set settings that can be overridden on command line,
+    # or general first-set-wins settings.
+    def provide(key, value)
+      unless has_key? key
+        store(key, value)
+      end
+    end
+
+    # reverse_merge copied from ActiveSupport, pretty straightforward,
+    # modified to make sure we return a Settings
+    def reverse_merge(other_hash)
+      self.class.new(other_hash).merge(self)
+    end
+
+    def reverse_merge!(other_hash)
+      replace(reverse_merge(other_hash))
+    end
+
+    def fill_in_defaults!
+      self.reverse_merge!(self.class.defaults)
+    end
+
+
+    def self.defaults
+      {
+        # Reader defaults
+        'reader_class_name' => 'Omelette::XmlReader',
+        'marc_source.type' => 'binary',
+
+        # Writer defaults
+        'writer_class_name' => 'Omelette::OmekaJsonWriter',
+        'omeka_writer.thread_pool' => 1,
+
+        # Threading and logging
+        'processing_thread_pool' => self.default_processing_thread_pool,
+        'log.batch_size.severity' => 'info',
+
+        # how to post-process the accumulator
+        'allow_nil_values' => false,
+        'allow_duplicate_values' => true,
+
+        'allow_empty_elements' => false
+      }
+    end
+
+    def inspect
+      # Keep any key ending in password out of the inspect
+      self.inject({}) do |hash, (key, value)|
+        if /password\Z/.match(key)
+          hash[key] = '[hidden]'
+        else
+          hash[key] = value
+        end
+        hash
+      end.inspect
+    end
+
+    protected
+    def self.default_processing_thread_pool
+      if ['jruby', 'rbx'].include? ENV['RUBY_ENGINE']
+        [1, Concurrent.processor_count - 1].max
+      else
+        1
+      end
+    end

+  end
+end
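
A minimal sketch of the first-set-wins and lazy-default behavior described in the comments above; the 'omeka_api_url' key is hypothetical, the other keys come from the defaults hash.

    require 'omelette'

    settings = Omelette::Importer::Settings.new('omeka_api_url' => 'http://example.org/api')

    # #provide only stores a value when the key is not already set.
    settings.provide 'omeka_api_url', 'http://other.example.org/api'
    settings[:omeka_api_url]             # => "http://example.org/api" (indifferent access; provide was a no-op)

    # Defaults are materialized lazily on first read...
    settings['reader_class_name']        # => "Omelette::XmlReader"
    # ...so a #provide that happens before the first read still wins over the default.
    settings.provide 'omeka_writer.thread_pool', 4
    settings['omeka_writer.thread_pool'] # => 4

    settings.fill_in_defaults!           # force the remaining defaults in, e.g. before inspecting
    settings['processing_thread_pool']   # => 1 unless RUBY_ENGINE env var says jruby/rbx
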
data/lib/omelette/importer/steps.rb
@@ -0,0 +1,64 @@
+class Omelette::Importer
+  class ToElementStep
+    attr_accessor :element_name, :element_set_name, :element_id, :block, :source_location, :element_map
+    attr_reader :lambda
+
+    def initialize(element_name, element_set_name, element_map, lambda, block, source_location)
+      self.element_name = element_name
+      self.element_set_name = element_set_name
+      self.lambda = lambda
+      self.block = block
+      self.source_location = source_location
+      validate!
+
+      self.element_id = element_map[self.element_set_name][self.element_name]
+    end
+
+    def validate!
+      if self.element_name.nil? || !self.element_name.is_a?(String) || self.element_name.empty?
+        raise NamingError.new("to_element requires the element name (as a string) as the first argument at #{self.source_location}")
+      end
+      if self.element_set_name.nil? || !self.element_set_name.is_a?(String) || self.element_set_name.empty?
+        raise NamingError.new("to_element requires the element set name (as a string) as the second argument at #{self.source_location}")
+      end
+
+      [self.lambda, self.block].each do |proc|
+        if proc && (proc.arity < 2 || proc.arity > 3)
+          raise ArityError.new("error parsing element '#{element_name}': block/proc given to to_element needs 2 or 3 (or variable) arguments #{proc}, (#{self.inspect})")
+        end
+      end
+    end
+
+    def to_element_step?
+      true
+    end
+
+    def lambda=(lam)
+      @lambda = lam
+      @lambda_arity = @lambda ? @lambda.arity : 0
+    end
+
+    def inspect
+      "(to_element #{self.element_name} at #{self.source_location})"
+    end
+
+    # to_element ""
+    def execute(context)
+      accumulator = []
+      item = context.source_item
+      if @lambda
+        if @lambda_arity == 2
+          @lambda.call item, accumulator
+        else
+          @lambda.call item, accumulator, context
+        end
+      end
+
+      if @block
+        @block.call(item, accumulator, context)
+      end
+
+      return accumulator
+    end
+  end
+end
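
For orientation, a sketch that drives a step directly. The element map, the id, and the two-argument lambda are invented here; in normal use the importer's to_element config method (referenced in the error messages above) builds the step.

    require 'omelette'

    # Hypothetical element map: element set name => { element name => Omeka element id }
    element_map = { 'Dublin Core' => { 'Title' => 50 } }

    step = Omelette::Importer::ToElementStep.new(
      'Title', 'Dublin Core', element_map,
      lambda { |item, elements| elements << { text: 'A title' } },  # 2-arg lambda: context not passed
      nil,                                                          # no block
      'example.rb:1'                                                # source location, used in error messages
    )

    context = Omelette::Importer::Context.new(source_item: nil)

    step.element_id        # => 50
    step.execute(context)  # => [{ text: "A title" }]
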
data/lib/omelette/macros/xpath.rb
@@ -0,0 +1,14 @@
+module Omelette::Macros
+  module Xpath
+    def extract_xpath(xpath, options={})
+      options[:html] = false unless options.has_key? :html
+      lambda do |item, elements, context|
+        nodes = item.xpath xpath
+        nodes.map { |node|
+          elements << { html: options[:html], element: { id: context.import_step.element_id }, text: node.to_s.strip }
+        }
+      end
+    end
+    module_function :extract_xpath
+  end
+end
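
A sketch of driving the macro by hand. The XML, XPath, and element id are invented, and the item is assumed to be a Nokogiri document (which the xpath call implies); in a config, the returned lambda would typically be handed to to_element.

    require 'omelette'
    require 'nokogiri'

    item = Nokogiri::XML('<record><title>Example</title></record>')

    # Fake just enough context for the macro: it only reads import_step.element_id.
    context = Omelette::Importer::Context.new
    context.import_step = Struct.new(:element_id).new(50)

    elements  = []
    extractor = Omelette::Macros::Xpath.extract_xpath('//title')
    extractor.call(item, elements, context)

    elements
    # => [{ html: false, element: { id: 50 }, text: "<title>Example</title>" }]
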
data/lib/omelette/null_writer.rb
@@ -0,0 +1,20 @@
+# A Null writer that does absolutely nothing with records given to it,
+# just drops em on the floor.
+class Omelette::NullWriter
+  def initialize(_arg_settings)
+  end
+
+
+  def serialize(_context)
+    # null
+  end
+
+  def put(_context)
+    # null
+  end
+
+  def close
+    # null
+  end
+
+end
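
The null writer is handy for timing reads and transforms without touching Omeka; presumably it is selected through the 'writer_class_name' setting listed in the settings defaults. A sketch:

    require 'omelette'

    settings = Omelette::Importer::Settings.new
    settings.provide 'writer_class_name', 'Omelette::NullWriter'

    writer = Omelette::NullWriter.new(settings)
    writer.put(nil)   # accepts any context and silently discards it
    writer.close
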
data/lib/omelette/thread_pool.rb
@@ -0,0 +1,161 @@
+require 'concurrent'
+require 'thread' # for Queue
+
+module Omelette
+  # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
+  # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
+  # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
+  #
+  # 1) Initialize with chosen pool size -- we create fixed size pools, where
+  #    core and max sizes are the same.
+  #
+  # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
+  #    be created, and work sent to the Omelette::ThreadPool will just be executed
+  #    in the caller thread. We call this a nil threadpool. One situation it can be useful
+  #    is if you are running under MRI, where multi-core parallelism isn't available, so
+  #    an actual threadpool may not be useful. (Although in some cases a thread pool,
+  #    especially one with size 1, can be useful in MRI for I/O blocking operations)
+  #
+  # 3) Use the #maybe_in_threadpool method to send blocks to thread pool for
+  #    execution -- if configured with a nil threadcount, your block will just be
+  #    executed in calling thread. Be careful to not refer to any non-local
+  #    variables in the block, unless the variable has an object you can
+  #    use thread-safely!
+  #
+  # 4) We configure our underlying Concurrent::ThreadPool
+  #    with a work queue that will buffer up to (pool_size*3) tasks. If the queue is full,
+  #    the underlying Concurrent::ThreadPool is set up to use the :caller_runs policy
+  #    meaning the block will end up executing in caller's own thread. With the kind
+  #    of work we're doing, where each unit of work is small and there are many of them--
+  #    the :caller_runs policy serves as an effective 'back pressure' mechanism to keep
+  #    the work queue from getting too large and exhausting memory, when producers are
+  #    faster than consumers.
+  #
+  # 5) Any exceptions raised by pool-executed work are captured and accumulated in a thread-safe
+  #    manner, and can be re-raised in the thread of your choice by calling
+  #    #raise_collected_exception!
+  #
+  # 6) When you are done with the threadpool, you can and must call
+  #    #shutdown_and_wait, which will wait for all current queued work
+  #    to complete, then return. You can not give any more work to the pool
+  #    after you do this. By default it'll wait pretty much forever, which should
+  #    be fine. If you never call shutdown, then queued or in-progress work
+  #    may be abandoned when the program ends, which would be bad.
+  #
+  # 7) We will keep track of total times a block is run in thread pool, and
+  #    total elapsed (wall) time of running all blocks, so an average_execution_ms
+  #    time can be given. #average_execution_ms may be inaccurate if called when
+  #    threads are still executing, as it's not entirely thread safe (may get
+  #    an off by one as to total iterations)
+  class ThreadPool
+    attr_reader :pool_size, :queue_capacity
+
+    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which executes
+    # work in caller thread.
+    def initialize(pool_size)
+      @thread_pool = nil # assume we don't have one
+      @exceptions_caught_queue = [] # start off without exceptions
+      unless pool_size.nil? || pool_size == 0
+        @pool_size = pool_size.to_i
+        @queue_capacity = pool_size * 3
+
+        @thread_pool = Concurrent::ThreadPoolExecutor.new(
+          :min_threads => @pool_size,
+          :max_threads => @pool_size,
+          :max_queue => @queue_capacity,
+          :fallback_policy => :caller_runs
+        )
+
+        # A thread-safe queue to collect exceptions cross-threads.
+        # We really only need to save the first exception, but a queue
+        # is a convenient way to store a value concurrency-safely, and
+        # might as well store all of them.
+        @exceptions_caught_queue = Queue.new
+      end
+    end
+
+    # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
+    # gets executed in the calling thread.
+    #
+    # There are actually two 'maybes':
+    #
+    # * If Omelette::ThreadPool was configured with null thread pool, then ALL
+    #   work will be executed in calling thread.
+    #
+    # * If there is a thread pool, but its work queue is full, then a job
+    #   will be executed in calling thread (because we configured our java
+    #   thread pool with a limited sized queue, and CallerRunsPolicy rejection strategy)
+    #
+    # You can pass arbitrary arguments to the method, that will then be passed
+    # to your block -- similar to how ruby Thread.new works. This is convenient
+    # for creating variables unique to the block that won't be shared outside
+    # the thread:
+    #
+    #     thread_pool.maybe_in_thread_pool(x, y) do |x1, y1|
+    #       100.times do
+    #         something_with(x1)
+    #       end
+    #     end
+    #     x = "something else"
+    #     # If we hadn't passed args with block, and had just
+    #     # used x in the block, it'd be the SAME x as this one,
+    #     # and would be pointing to a different string now!
+    #
+    # Note, that just makes block-local variables, it doesn't
+    # help you with whether a data structure itself is thread safe.
+    def maybe_in_thread_pool(*args)
+      if @thread_pool
+        @thread_pool.post do
+          begin
+            yield(*args)
+          rescue Exception => e
+            collect_exception(e)
+          end
+        end
+      else
+        yield(*args)
+      end
+
+    end
+
+
+    # thread-safe way of storing an exception, to raise
+    # later in a different thread. We don't guarantee
+    # that we can store more than one at a time, only
+    # the first one recorded may be stored.
+    def collect_exception(e)
+      @exceptions_caught_queue.push(e)
+    end
+
+    # If there's a stored collected exception, raise it
+    # again now. Call this to re-raise exceptions caught in
+    # other threads in the thread of your choice.
+    #
+    # If you call this method on a ThreadPool initialized with nil
+    # as a non-functioning threadpool -- then this method is just
+    # a no-op.
+    def raise_collected_exception!
+      unless @exceptions_caught_queue.empty?
+        e = @exceptions_caught_queue.pop
+        raise e
+      end
+    end
+
+    # shutdown threadpool, and wait for all work to complete.
+    # this one is also a no-op if you have a null ThreadPool that
+    # doesn't really have a threadpool at all.
+    #
+    # returns elapsed time in seconds it took to shutdown
+    def shutdown_and_wait
+      start_t = Time.now
+
+      if @thread_pool
+        @thread_pool.shutdown
+        @thread_pool.wait_for_termination
+      end
+
+      return (Time.now - start_t)
+    end
+
+  end
+end
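
A sketch of the lifecycle described above: queue work with #maybe_in_thread_pool, then #shutdown_and_wait, then surface anything a worker raised in the calling thread. The pool size and the work are arbitrary.

    require 'omelette'

    pool = Omelette::ThreadPool.new(4)  # pass nil or 0 for a no-op pool that runs work inline

    results = Queue.new                 # Queue is thread-safe, fine to share across workers

    10.times do |i|
      pool.maybe_in_thread_pool(i) do |n|  # pass i in, so the block doesn't share the loop variable
        results << n * n
      end
    end

    elapsed = pool.shutdown_and_wait    # blocks until all queued work has finished
    pool.raise_collected_exception!     # re-raise here anything a worker raised

    results.size   # => 10
    elapsed        # seconds spent waiting for the pool to drain
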