omelette 0.0.1a

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
# Holds the state for a single item as it travels through an import run;
# instances are handed to the importing logic blocks.
class Omelette::Importer
  class Context
    # Mutable per-item state. clipboard is free-form scratch space for
    # logic blocks; output_hash collects the produced values.
    attr_accessor :clipboard, :output_hash, :logger
    attr_accessor :import_step, :source_item, :settings, :source_item_id
    # 1-based position in stream of processed records.
    attr_accessor :position

    # Human-readable reason recorded when this record is skipped.
    attr_accessor :skipmessage

    # @param hash_init [Hash] attribute-name => value pairs; each pair is
    #   assigned through the matching writer method.
    def initialize(hash_init = {})
      # TODO, argument checking for required args?

      self.clipboard = {}
      self.output_hash = {}

      hash_init.each do |key, value|
        send(:"#{key}=", value)
      end

      @skip = false
    end

    # Flag this record to be skipped, with an optional explanatory message.
    def skip!(msg = '(no message given)')
      @skipmessage = msg
      @skip = true
    end

    # True once #skip! has been called on this context.
    def skip?
      @skip
    end
  end
end
@@ -0,0 +1,40 @@
1
class Omelette::Importer
  # Raised when a block/proc handed to importer config has the wrong arity.
  class ArityError < ArgumentError; end

  # Raised when a required name argument is missing or malformed.
  class NamingError < ArgumentError; end

  # Raised by #load_config_file when a config file cannot be processed.
  #
  # The exception #message is formatted for friendly display to the
  # developer in the console.
  #
  # The exception originally raised while processing the config file is
  # available in #original; it ordinarily carries a good stack trace,
  # including the path of the config file in question. (Ruby 2.1+ would
  # fill in #cause for us, but we keep our own 'original' to support
  # earlier rubies.)
  #
  # #config_file holds the config path, #config_file_lineno the line in
  # that file which triggered the error (may be nil), and
  # #config_file_backtrace a backtrace filtered to frames below the
  # config file (excluding omelette's own loading frames).
  class ConfigLoadError < StandardError
    attr_reader :original, :config_file, :config_file_lineno, :config_file_backtrace

    # @param config_file_path [String] path of the config file being loaded
    # @param original_exception [Exception] error raised while loading it
    def initialize(config_file_path, original_exception)
      @original = original_exception
      @config_file = config_file_path
      @config_file_lineno = Omelette::Util.backtrace_lineno_for_config(config_file_path, original_exception)
      @config_file_backtrace = Omelette::Util.backtrace_from_config(config_file_path, original_exception)

      super("Error loading configuration file #{config_file}:#{config_file_lineno} #{original_exception.class}:#{original_exception.message}")
    end
  end
end
@@ -0,0 +1,106 @@
1
+ require 'hashie'
2
+ require 'concurrent'
3
+
4
class Omelette::Importer

  # A Hash of settings for a Omelette::Importer, which also ends up passed along
  # to other objects Omelette::Importer interacts with.
  #
  # Enhanced with a few features from Hashie, to make it for
  # instance string/symbol indifferent.
  #
  # Method #provide(key, value) is added, to do like settings[key] ||= value,
  # set only if not already set (but unlike ||=, nil or false can count as already set).
  #
  # Also has an interesting 'defaults' system, meant to play along
  # with configuration file 'provide' statements. There is a built-in hash of
  # defaults, which will be lazily filled in if accessed and not yet
  # set. (nil can count as set, though!). If they haven't been lazily
  # set yet, then #provide will still fill them in. But you can also call
  # fill_in_defaults! to fill all defaults in, if you know configuration
  # files have all been loaded, and want to fill them in for inspection.
  class Settings < Hash
    include Hashie::Extensions::MergeInitializer # can init with hash
    include Hashie::Extensions::IndifferentAccess

    def initialize(*args)
      super
      # Lazily copy a built-in default into the hash the first time an
      # unset key is read, so #provide can still win before first read.
      self.default_proc = lambda do |hash, key|
        if self.class.defaults.has_key?(key)
          return hash[key] = self.class.defaults[key]
        else
          return nil
        end
      end
    end

    # A cautious store, which only saves key=value if
    # there was not already a value for #key. Can be used
    # to set settings that can be overridden on command line,
    # or general first-set-wins settings.
    def provide(key, value)
      unless has_key? key
        store(key, value)
      end
    end

    # reverse_merge copied from ActiveSupport, pretty straightforward,
    # modified to make sure we return a Settings.
    def reverse_merge(other_hash)
      self.class.new(other_hash).merge(self)
    end

    def reverse_merge!(other_hash)
      replace(reverse_merge(other_hash))
    end

    # Eagerly materialize every built-in default into the hash.
    def fill_in_defaults!
      self.reverse_merge!(self.class.defaults)
    end

    # Built-in default settings, copied in lazily on read (see
    # initialize) or eagerly via #fill_in_defaults!.
    def self.defaults
      {
        # Reader defaults
        'reader_class_name' => 'Omelette::XmlReader',
        # NOTE(review): 'marc_source.type' looks inherited from a MARC
        # importer; confirm anything still consumes it.
        'marc_source.type' => 'binary',

        # Writer defaults
        'writer_class_name' => 'Omelette::OmekaJsonWriter',
        'omeka_writer.thread_pool' => 1,

        # Threading and logging
        'processing_thread_pool' => self.default_processing_thread_pool,
        'log.batch_size.severity' => 'info',

        # how to post-process the accumulator
        'allow_nil_values' => false,
        'allow_duplicate_values' => true,

        'allow_empty_elements' => false
      }
    end

    def inspect
      # Keep any key ending in password out of the inspect
      self.inject({}) do |hash, (key, value)|
        if /password\Z/.match(key)
          hash[key] = '[hidden]'
        else
          hash[key] = value
        end
        hash
      end.inspect
    end

    # Internal helper: choose a default processing pool size.
    #
    # Bug fix: the interpreter name lives in the RUBY_ENGINE constant;
    # ENV['RUBY_ENGINE'] is essentially never set, so the old check
    # always fell through to 1 even on jruby/rbx.
    #
    # (A `protected` marker formerly preceded this definition; it was
    # removed because protected/private have no effect on singleton
    # (class) methods defined with `def self.`.)
    def self.default_processing_thread_pool
      if ['jruby', 'rbx'].include? RUBY_ENGINE
        [1, Concurrent.processor_count - 1].max
      else
        1
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
class Omelette::Importer
  # One "to_element" step from an importer configuration: holds the
  # element name/set, the extraction lambda and/or block, and resolves
  # the element id from the supplied element map.
  class ToElementStep
    attr_accessor :element_name, :element_set_name, :element_id, :block, :source_location, :element_map
    attr_reader :lambda

    # @param element_name [String] element name (required)
    # @param element_set_name [String] element set containing the element (required)
    # @param element_map [Hash] element_set_name => { element_name => id }
    # @param lambda [Proc, nil] extraction lambda, called with (item, accumulator[, context])
    # @param block [Proc, nil] extraction block, always called with (item, accumulator, context)
    # @param source_location [String] config-file location, used in error messages
    # @raise [NamingError, ArityError] via #validate!
    def initialize(element_name, element_set_name, element_map, lambda, block, source_location)
      self.element_name = element_name
      self.element_set_name = element_set_name
      # Bug fix: populate the declared element_map accessor -- it was
      # never assigned before, so #element_map always returned nil.
      self.element_map = element_map
      self.lambda = lambda
      self.block = block
      self.source_location = source_location
      validate!

      self.element_id = element_map[self.element_set_name][self.element_name]
    end

    # Raise NamingError/ArityError unless the names and proc arities are usable.
    def validate!
      if self.element_name.nil? || !self.element_name.is_a?(String) || self.element_name.empty?
        raise NamingError.new("to_element requires the element name (as a string) as the first argument at #{self.source_location}")
      end
      if self.element_set_name.nil? || !self.element_set_name.is_a?(String) || self.element_set_name.empty?
        raise NamingError.new("to_element requires the element set name (as a string) as the second argument at #{self.source_location}")
      end

      [self.lambda, self.block].each do |proc|
        # Bug fix: a negative arity means optional/splat ("variable")
        # arguments, which the error message already documents as
        # acceptable -- only reject procs with a *fixed* arity outside
        # 2..3. (NOTE(review): a lambda like ->(a, b = 1) has arity -2
        # and now passes validation but could still fail at call time
        # when given 3 args -- confirm that's acceptable.)
        if proc && proc.arity >= 0 && (proc.arity < 2 || proc.arity > 3)
          raise ArityError.new("error parsing element '#{element_name}': block/proc given to to_element needs 2 or 3 (or variable) arguments #{proc}, (#{self.inspect})")
        end
      end
    end

    # Marker predicate so steps can be distinguished by kind.
    def to_element_step?
      true
    end

    # Store the lambda and cache its arity for dispatch in #execute.
    def lambda=(lam)
      @lambda = lam
      @lambda_arity = @lambda ? @lambda.arity : 0
    end

    def inspect
      "(to_element #{self.element_name} at #{self.source_location})"
    end

    # Run the lambda and/or block against the context's source item and
    # return the accumulator of extracted values.
    def execute(context)
      accumulator = []
      item = context.source_item
      if @lambda
        if @lambda_arity == 2
          @lambda.call item, accumulator
        else
          # 3-arg and variable-arity lambdas both receive the full signature.
          @lambda.call item, accumulator, context
        end
      end

      if @block
        @block.call(item, accumulator, context)
      end

      return accumulator
    end
  end
end
@@ -0,0 +1,14 @@
1
module Omelette::Macros
  # Macros for extracting element values from a source item via XPath.
  module Xpath
    # Build an extraction lambda that appends one element hash per node
    # matched by +xpath+ on the source item.
    #
    # @param xpath [String] XPath expression evaluated against the item
    # @param options [Hash] :html (default false) -- flag copied onto each
    #   emitted element hash; the hash is NOT mutated
    # @return [Proc] lambda of (item, elements, context)
    def extract_xpath(xpath, options = {})
      # Bug fix: read the option without writing a default back into the
      # caller's hash (the old code mutated `options` as a side effect).
      html = options.fetch(:html, false)
      lambda do |item, elements, context|
        nodes = item.xpath xpath
        # `each`, not `map`: we iterate purely to append to `elements`.
        nodes.each do |node|
          elements << { html: html, element: { id: context.import_step.element_id }, text: node.to_s.strip }
        end
      end
    end
    module_function :extract_xpath
  end
end
@@ -0,0 +1,20 @@
1
# A no-op writer: accepts every record handed to it and discards it.
class Omelette::NullWriter
  # @param _arg_settings [Object] ignored; present to satisfy the writer API.
  def initialize(_arg_settings); end

  # Intentionally does nothing.
  def serialize(_context); end

  # Intentionally does nothing.
  def put(_context); end

  # Intentionally does nothing.
  def close; end
end
@@ -0,0 +1,9 @@
1
# Write to Omeka using the API.
class Omelette::OmekaJsonWriter
  attr_reader :settings, :thread_pool_size
  attr_reader :batched_queue

  # @param arg_settings [Hash] raw settings, wrapped in an
  #   Omelette::Importer::Settings for indifferent access
  def initialize(arg_settings)
    @settings = Omelette::Importer::Settings.new(arg_settings)
  end
end
@@ -0,0 +1,161 @@
1
+ require 'concurrent'
2
+ require 'thread' # for Queue
3
+
4
module Omelette
  # An abstraction wrapping a Concurrent::ThreadPool in some configuration choices
  # and other apparatus. Concurrent::ThreadPool is a Java ThreadPool executor on
  # jruby for performance, and is ruby-concurrent's own ruby implementation otherwise.
  #
  # 1) Initialize with chosen pool size -- we create fixed size pools, where
  #    core and max sizes are the same.
  #
  # 2) If initialized with nil or 0 for threadcount, no thread pool will actually
  #    be created, and work sent to the Omelette::ThreadPool will just be executed
  #    in the caller thread. We call this a nil threadpool. Useful under MRI
  #    (no multi-core parallelism), although a pool of size 1 can still help
  #    with I/O-blocking operations there.
  #
  # 3) Use #maybe_in_thread_pool to send blocks to the pool for execution --
  #    with a nil threadcount, the block just runs in the calling thread.
  #    Be careful not to refer to non-local variables in the block unless
  #    they are thread-safe!
  #
  # 4) The underlying executor buffers up to (pool_size * 3) tasks; when the
  #    queue is full, the :caller_runs policy executes the block in the
  #    caller's own thread -- effective 'back pressure' that keeps the work
  #    queue from exhausting memory when producers outpace consumers.
  #
  # 5) Exceptions raised by pool-executed work are accumulated thread-safely
  #    and can be re-raised in the thread of your choice via
  #    #raise_collected_exception!
  #
  # 6) When done, you MUST call #shutdown_and_wait, which waits for all
  #    queued work to complete. No more work may be submitted afterwards;
  #    if you never call it, queued or in-progress work may be abandoned
  #    when the program ends.
  class ThreadPool
    attr_reader :pool_size, :queue_capacity

    # First arg is pool size, 0 or nil and we'll be a null/no-op pool which
    # executes work in caller thread.
    def initialize(pool_size)
      @thread_pool = nil # assume we don't have one
      # A thread-safe queue to collect exceptions cross-threads. We really
      # only need the first exception, but a Queue is a convenient
      # concurrency-safe store, so we keep them all. (Fix: previously the
      # null pool used a plain, non-thread-safe Array here.)
      @exceptions_caught_queue = Queue.new
      unless pool_size.nil? || pool_size == 0
        @pool_size = pool_size.to_i
        # Fix: derive capacity from the coerced integer size, so a String
        # pool_size can't yield a bogus capacity ("2" * 3 == "222").
        @queue_capacity = @pool_size * 3

        @thread_pool = Concurrent::ThreadPoolExecutor.new(
          :min_threads => @pool_size,
          :max_threads => @pool_size,
          :max_queue => @queue_capacity,
          :fallback_policy => :caller_runs
        )
      end
    end

    # Pass it a block, MAYBE gets executed in the bg in a thread pool. Maybe
    # gets executed in the calling thread.
    #
    # There are actually two 'maybes':
    #
    # * With a null thread pool, ALL work runs in the calling thread.
    #
    # * With a real pool whose work queue is full, the job runs in the
    #   calling thread (limited-size queue + caller-runs rejection policy).
    #
    # Arbitrary arguments are forwarded to your block -- like Thread.new --
    # which is convenient for giving the block its own local copies of
    # variables rather than sharing outer locals across threads:
    #
    #     thread_pool.maybe_in_thread_pool(x, y) do |x1, y1|
    #       100.times { something_with(x1) }
    #     end
    #     x = "something else"  # block still sees the original value via x1
    #
    # Note: that only makes block-local variables; it doesn't make the data
    # structures themselves thread safe.
    def maybe_in_thread_pool(*args)
      if @thread_pool
        @thread_pool.post do
          begin
            yield(*args)
          rescue Exception => e
            # Deliberately rescue Exception (not just StandardError): any
            # error on a worker thread is stashed for later re-raise on a
            # thread of the caller's choosing, rather than being lost.
            collect_exception(e)
          end
        end
      else
        yield(*args)
      end
    end

    # Thread-safe way of storing an exception, to raise later in a
    # different thread.
    def collect_exception(e)
      @exceptions_caught_queue.push(e)
    end

    # If there's a stored collected exception, raise it again now. Call
    # this to re-raise exceptions caught in other threads in the thread of
    # your choice. No-op when nothing has been collected.
    def raise_collected_exception!
      unless @exceptions_caught_queue.empty?
        e = @exceptions_caught_queue.pop
        raise e
      end
    end

    # Shutdown the threadpool and wait for all queued work to complete;
    # a no-op (beyond timing) for a null ThreadPool.
    #
    # @return [Float] elapsed time in seconds the shutdown took
    def shutdown_and_wait
      start_t = Time.now

      if @thread_pool
        @thread_pool.shutdown
        @thread_pool.wait_for_termination
      end

      return (Time.now - start_t)
    end
  end
end