omelette 0.0.1a

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d5c99f1e799db62ab94213b0919b401f00fa7f0f
4
+ data.tar.gz: 8f89f4fb46dee7a19305f20432acf7ead96aa9bf
5
+ SHA512:
6
+ metadata.gz: 0d5ab5c28afe2ec4b2cd7d137813465c317ba2a3519baa287719a6fa545aaad1912fa322092de188349555c7fbd53c0264c46155842130389bd7ebfa063b3d79
7
+ data.tar.gz: b58aa6854cff01fbf7bd79250e128c8f69aa2c03975ee3376c735395d0626fb921589326c0d98d879861645e843c747ab9ce526e507749c32053db1635619efd
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ /.ruby-version
13
+ /.ruby-gemset
14
+ /.idea
15
+ /omelette-*.gem
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.0
5
+ - 2.4.0
6
+ before_install: gem install bundler -v 1.16.0
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
# Gem sources and dependencies, resolved by Bundler.
source "https://rubygems.org"

# Expand `github: "owner/repo"` shorthand into an HTTPS GitHub URL.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# Specify your gem's dependencies in omelette.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,75 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ omelette (0.0.1a)
5
+ concurrent-ruby (~> 1.0)
6
+ hashie (~> 3.5)
7
+ nokogiri (~> 1.8)
8
+ rest-client (~> 2.0)
9
+ thor (~> 0.20)
10
+ yell (~> 2.0)
11
+
12
+ GEM
13
+ remote: https://rubygems.org/
14
+ specs:
15
+ addressable (2.5.2)
16
+ public_suffix (>= 2.0.2, < 4.0)
17
+ concurrent-ruby (1.0.5)
18
+ crack (0.4.3)
19
+ safe_yaml (~> 1.0.0)
20
+ diff-lcs (1.3)
21
+ domain_name (0.5.20170404)
22
+ unf (>= 0.0.5, < 1.0.0)
23
+ hashdiff (0.3.7)
24
+ hashie (3.5.6)
25
+ http-cookie (1.0.3)
26
+ domain_name (~> 0.5)
27
+ mime-types (3.1)
28
+ mime-types-data (~> 3.2015)
29
+ mime-types-data (3.2016.0521)
30
+ mini_portile2 (2.3.0)
31
+ netrc (0.11.0)
32
+ nokogiri (1.8.1)
33
+ mini_portile2 (~> 2.3.0)
34
+ public_suffix (3.0.1)
35
+ rake (10.5.0)
36
+ rest-client (2.0.2)
37
+ http-cookie (>= 1.0.2, < 2.0)
38
+ mime-types (>= 1.16, < 4.0)
39
+ netrc (~> 0.8)
40
+ rspec (3.7.0)
41
+ rspec-core (~> 3.7.0)
42
+ rspec-expectations (~> 3.7.0)
43
+ rspec-mocks (~> 3.7.0)
44
+ rspec-core (3.7.0)
45
+ rspec-support (~> 3.7.0)
46
+ rspec-expectations (3.7.0)
47
+ diff-lcs (>= 1.2.0, < 2.0)
48
+ rspec-support (~> 3.7.0)
49
+ rspec-mocks (3.7.0)
50
+ diff-lcs (>= 1.2.0, < 2.0)
51
+ rspec-support (~> 3.7.0)
52
+ rspec-support (3.7.0)
53
+ safe_yaml (1.0.4)
54
+ thor (0.20.0)
55
+ unf (0.1.4)
56
+ unf_ext
57
+ unf_ext (0.0.7.4)
58
+ webmock (3.1.0)
59
+ addressable (>= 2.3.6)
60
+ crack (>= 0.3.2)
61
+ hashdiff
62
+ yell (2.0.7)
63
+
64
+ PLATFORMS
65
+ ruby
66
+
67
+ DEPENDENCIES
68
+ bundler (~> 1.16)
69
+ omelette!
70
+ rake (~> 10.0)
71
+ rspec (~> 3.0)
72
+ webmock (~> 3.0)
73
+
74
+ BUNDLED WITH
75
+ 1.16.0
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2017 Dazhi Jiao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # Omelette
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/omelette`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'omelette'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install omelette
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/omelette. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
40
+
41
+ ## Code of Conduct
42
+
43
+ Everyone interacting in the Omelette project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/[USERNAME]/omelette/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
# Rake tasks for the gem: Bundler's packaging tasks plus an RSpec task.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Defines the `spec` task.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the `spec` task.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Interactive console with the gem loaded, for experimenting during development.

require "bundler/setup"
require "omelette"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console can be used as well.

# (To use Pry instead, add pry to the Gemfile first.)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Development bootstrap: installs the gem's dependencies with Bundler.

# Abort on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only.
IFS=$'\n\t'
# Echo input lines and expanded commands as they run.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
data/exe/omelette ADDED
@@ -0,0 +1,5 @@
1
#!/usr/bin/env ruby

# Executable entry point: hands the command-line arguments to the
# Omelette::CommandLine class.

require 'omelette'

Omelette::CommandLine.start ARGV
data/lib/omelette.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'omelette/version'
2
+ require 'omelette/importer'
3
+ require 'omelette/util'
4
+ require 'omelette/macros/xpath'
5
+ require 'omelette/xml_reader'
6
+ require 'omelette/command_line'
7
+
8
# Top-level namespace of the omelette gem. It defines nothing itself here;
# the components are loaded by the `require` statements in this file.
module Omelette
end
@@ -0,0 +1,10 @@
1
+ require 'thor'
2
+
3
module Omelette
  # Command-line interface of the gem, built on Thor.
  class CommandLine < Thor
    desc 'import', 'import a file'
    # Placeholder for the `import` command: it only prints a notice.
    #
    # @param ids [Object, nil] accepted but currently unused
    def import(ids = nil)
      $stdout.puts 'omelette import: To be implemented'
    end
  end
end
@@ -0,0 +1,284 @@
1
+ require 'yell'
2
+ require 'omelette/importer/context'
3
+ require 'omelette/importer/settings'
4
+ require 'omelette/importer/errors'
5
+ require 'omelette/importer/steps'
6
+ require 'omelette/thread_pool'
7
+ require 'omelette/macros/xpath'
8
+
9
# Coordinates an import run: it holds the settings and the configured import
# steps, reads items through a reader, maps each item to an output hash and
# hands the resulting context to a writer.
class Omelette::Importer
  include Omelette::Macros::Xpath

  # Setting keys that tune the post-processing done in
  # #add_elements_to_context!.
  ALLOW_NIL_VALUES = 'allow_nil_values'.freeze
  ALLOW_EMPTY_FIELDS = 'allow_empty_fields'.freeze
  ALLOW_DUPLICATE_VALUES = 'allow_duplicate_values'.freeze

  attr_writer :reader_class, :writer_class, :writer
  attr_reader :logger

  # @param arg_settings [Hash] initial settings, wrapped in a Settings object
  def initialize(arg_settings = {})
    @settings = Settings.new(arg_settings)
    @import_steps = []
    @after_processing_steps = []

    # The logger is configured from the settings, so build it last.
    @logger = create_logger
  end

  # Memoized result of Omelette::Util.build_elements_map for the
  # 'omeka_api_root' setting.
  def elements_map
    @elements_map ||= Omelette::Util.build_elements_map(settings['omeka_api_root'])
  end

  # Pass a string file path, a Pathname, or a File object, for
  # a config file to evaluate in the context of this importer.
  #
  # Can raise:
  # * Errno::ENOENT or Errno::EACCES if file path is not accessible
  # * Omelette::Importer::ConfigLoadError if exception is raised evaluating
  #   the config. The error is built from the config file name and the
  #   original exception.
  def load_config_file(file_path)
    # File and Pathname respond to #to_path; for a File, #to_s would only
    # give its inspect-style representation rather than the real path.
    config_name = file_path.respond_to?(:to_path) ? file_path.to_path : file_path.to_s

    File.open(file_path) do |file|
      begin
        instance_eval(file.read, config_name)
      rescue ScriptError, StandardError => ex
        raise ConfigLoadError.new(config_name, ex)
      end
    end
  end

  # Reads and optionally updates the settings.
  #
  # @param new_settings [Hash, nil] merged into the current settings when given
  # @yield optional block, instance_eval'd against the settings object
  # @return the settings object
  def settings(new_settings = nil, &block)
    @settings.merge!(new_settings) if new_settings
    @settings.instance_eval(&block) if block_given?
    @settings
  end

  # Registers an import step that fills +element_name+ of
  # +element_set_name+ from the given lambda and/or block.
  def to_element(element_name, element_set_name, aLambda = nil, &block)
    location = Omelette::Util.extract_caller_location(caller.first)
    @import_steps << ToElementStep.new(element_name, element_set_name, elements_map, aLambda, block, location)
  end

  # Processes a single item according to the import steps set up in
  # this importer and returns the context's output hash.
  #
  # This is a convenience shortcut for #map_to_context! -- use that one
  # if you want to provide additional context like position, and/or get
  # back the full context.
  def map_item(item)
    context = Context.new(source_item: item, settings: settings)
    map_to_context!(context)
    context.output_hash
  end

  # Runs every import step against +context+, stopping as soon as the
  # context is marked as skipped.
  #
  # @return the same context that was passed in
  def map_to_context!(context)
    @import_steps.each do |import_step|
      break if context.skip?

      # Expose the running step on the context for error reporting.
      context.import_step = import_step
      elements = log_mapping_errors(context, import_step) do
        import_step.execute(context)
      end
      add_elements_to_context!(elements, context) if import_step.to_element_step?

      # Unset the import step after it's finished
      context.import_step = nil
    end
    context
  end

  # Adds the values produced by the current to_element step to the
  # context's output hash under the step's element name, after
  # post-processing them: nil values are removed, empty results are
  # dropped and duplicates are removed, unless the corresponding
  # allow_* setting is truthy.
  #
  # Raises ArgumentError when the context's current import step does not
  # provide an element name.
  def add_elements_to_context!(elements, context)
    import_step = context.import_step
    # Check the step explicitly instead of rescuing NameError around the
    # whole method, which would also hide unrelated programming errors.
    unless import_step.respond_to?(:element_name)
      msg = 'Tried to call add_element_to_context with a non-to_element step'
      msg += import_step.inspect
      logger.error msg
      raise ArgumentError, msg
    end

    elements.compact! unless settings[ALLOW_NIL_VALUES]
    return if elements.empty? && !settings[ALLOW_EMPTY_FIELDS]

    element_name = import_step.element_name
    context.output_hash[element_name] ||= []

    existing_elements = context.output_hash[element_name].concat(elements)
    existing_elements.uniq! unless settings[ALLOW_DUPLICATE_VALUES]
  end

  # Reads every item from a reader built for +files+, maps it (possibly in
  # a thread pool) and passes every non-skipped context to the writer.
  # Afterwards the thread pool is shut down, the writer is closed if it
  # supports #close and the after-processing steps are run.
  #
  # @return [Boolean] false when the writer reports skipped records,
  #   true otherwise
  def process(files)
    settings.fill_in_defaults!

    count = 0
    start_time = batch_start_time = Time.now
    logger.debug "beginning Omelette::Importer#process with settings: #{settings.inspect}"
    reader = reader!(files)
    processing_threads = settings['processing_thread_pool'].to_i
    thread_pool = Omelette::ThreadPool.new(processing_threads)

    logger.info " Indexer with #{processing_threads} processing threads, reader: #{reader.class.name} and writer: #{writer.class.name}"

    show_progress = settings['debug_ascii_progress'].to_s == 'true'
    # nil.to_i is 0; intervals that are not positive disable the feature
    # instead of raising ZeroDivisionError in the modulo checks below.
    progress_interval = settings['solr_writer.batch_size'].to_i
    log_batch_size = settings['log.batch_size'].to_i
    log_batch_size = nil unless log_batch_size > 0

    reader.each do |item, item_id|
      count += 1

      # Surface failures from worker threads as early as possible.
      thread_pool.raise_collected_exception!

      if show_progress && progress_interval > 0 && (count % progress_interval).zero?
        $stderr.write '.'
      end

      context = Context.new source_item: item, source_item_id: item_id, settings: settings, position: count, logger: logger

      if log_batch_size && (count % log_batch_size).zero?
        batch_rps = log_batch_size / (Time.now - batch_start_time)
        overall_rps = count / (Time.now - start_time)
        severity = (settings['log.batch_size.severity'] || 'info').to_s.downcase.to_sym
        logger.send(severity, "Omelette::Importer#process, read #{count} records at id:#{context.source_item_id}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
        batch_start_time = Time.now
      end

      thread_pool.maybe_in_thread_pool(context) do |ctx|
        map_to_context!(ctx)
        if ctx.skip?
          log_skip(ctx)
        else
          writer.put(ctx)
        end
      end
    end

    $stderr.write "\n" if show_progress

    logger.debug 'Shutting down #processing mapper threadpool...'
    thread_pool.shutdown_and_wait
    logger.debug '#processing mapper threadpool shutdown complete.'

    thread_pool.raise_collected_exception!

    writer.close if writer.respond_to?(:close)

    @after_processing_steps.each do |step|
      begin
        step.execute
      rescue Exception => e
        # Rescued only to log; the exception is always re-raised.
        logger.fatal("Unexpected exception #{e} when executing #{step}")
        raise
      end
    end

    elapsed = Time.now - start_time
    avg_rps = (count / elapsed)
    logger.info "finished Indexer#process: #{count} records in #{'%.3f' % elapsed} seconds; #{'%.1f' % avg_rps} records/second overall."

    if writer.respond_to?(:skipped_record_count) && writer.skipped_record_count > 0
      logger.error "Indexer#process returning 'false' due to #{writer.skipped_record_count} skipped records."
      return false
    end

    true
  end

  # The reader class: the one assigned through #reader_class=, otherwise
  # the constant named by the 'reader_class_name' setting. The lookup is
  # done once; the result is nil when the constant cannot be resolved.
  def reader_class
    @reader_class = lookup_class(settings['reader_class_name']) unless defined?(@reader_class)
    @reader_class
  end

  # The class of the current writer.
  def writer_class
    writer.class
  end

  # Instantiates a reader, using the class from #reader_class, initialized
  # with +ids+ and the settings extended by the logger.
  #
  # Raises ArgumentError when no reader class is available.
  def reader!(ids)
    klass = reader_class
    unless klass
      raise ArgumentError, "No reader class available: assign #reader_class= or set 'reader_class_name' (currently #{settings['reader_class_name'].inspect})"
    end

    klass.new(ids, settings.merge('logger' => logger))
  end

  # Instantiates a writer, using the class assigned through #writer_class=
  # or else the constant named by the 'writer_class_name' setting.
  #
  # Raises ArgumentError when no writer class is available.
  def writer!
    klass = @writer_class || lookup_class(settings['writer_class_name'])
    unless klass
      raise ArgumentError, "No writer class available: assign #writer_class= or set 'writer_class_name' (currently #{settings['writer_class_name'].inspect})"
    end

    klass.new(settings.merge('logger' => logger))
  end

  # The writer in use: the assigned one, the 'writer' setting, or a new
  # instance from #writer!. Memoized.
  def writer
    @writer ||= settings['writer'] || writer!
  end

  private

  # Resolves a constant by name, returning nil when that fails.
  def lookup_class(name)
    Object.const_get(name)
  rescue StandardError
    nil
  end

  # Log that the current record is being skipped, using
  # data in context.position and context.skipmessage
  def log_skip(context)
    logger.debug "Skipped record #{context.position}: #{context.skipmessage}"
  end

  # Yields, and when the block raises, logs the failing item and import
  # step before re-raising the original exception.
  def log_mapping_errors(context, import_step)
    yield
  rescue Exception => ex
    # Rescued only to log; the exception is always re-raised below.
    msg = "Unexpected error on record id `#{context.source_item_id}` at file position #{context.position}\n"
    msg += "    while executing #{import_step.inspect}\n"
    msg += Omelette::Util.exception_to_log_message(ex)

    logger.error msg
    begin
      logger.debug "Item: #{context.source_item.to_s}"
    rescue StandardError => item_exception
      logger.debug "(Could not log item, #{item_exception})"
    end
    raise
  end

  # Create logger according to settings
  def create_logger
    logger_level = settings['log.level'] || 'info'

    # log everything to STDERR or specified logfile
    logger = Yell::Logger.new(:null)
    logger.format = logger_format
    logger.level = logger_level

    logger_destination = settings['log.file'] || 'STDERR'
    # We intentionally repeat the logger_level
    # on the adapter, so it will stay there if overall level
    # is changed.
    case logger_destination
    when 'STDERR'
      logger.adapter :stderr, level: logger_level, format: logger_format
    when 'STDOUT'
      logger.adapter :stdout, level: logger_level, format: logger_format
    else
      logger.adapter :file, logger_destination, level: logger_level, format: logger_format
    end

    # ADDITIONALLY log error and higher to a separate file, when configured.
    if settings['log.error_file']
      logger.adapter :file, settings['log.error_file'], :level => 'gte.error'
    end

    logger
  end

  # The log format from the 'log.format' setting ('%d %5L %m' when unset).
  # The string 'false' maps to false and the empty string to nil.
  def logger_format
    format = settings['log.format'] || '%d %5L %m'
    case format
    when 'false' then false
    when '' then nil
    else format
    end
  end
end