syphon 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ Gemfile.lock
2
+ test/config.yml
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ == 0.0.1 2013-11-18
2
+
3
+ * Hi.
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
3
+
4
+ gem 'debugger', '~> 1.6.0', platform: :ruby_19
5
+ gem 'looksee', '~> 1.1.0'
6
+ gem 'ritual', '~> 0.4.0'
7
+ gem 'temporaries', '~> 0.3.0'
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) George Ogata
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,5 @@
1
+ ## Syphon
2
+
3
+ Syphon data from an Arel source into ElasticSearch.
4
+
5
+ ### Work in progress
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'ritual'
@@ -0,0 +1,57 @@
1
module Syphon
  # Folds a flat result set into one document hash per root id.
  #
  # Every row is an array. Column 0 is treated as the root id, and the
  # columns are consumed left to right in the order the schema's fields are
  # declared (nested schemas are walked depth-first). Consecutive rows that
  # share the same value in column 0 are merged into one document, so rows
  # belonging to the same id must be adjacent in +results+.
  class Builder
    include Enumerable

    # results - an object responding to #each that yields row arrays.
    # schema  - an object whose #fields returns name => field pairs.
    def initialize(results, schema)
      @results = results
      @schema = schema
      # Not used by this class; kept so the public reader stays available.
      @nested_docs = {}
    end

    attr_reader :results, :schema, :nested_docs

    # Yields one document hash per run of rows sharing the same first column.
    # Returns an Enumerator when no block is given.
    def each
      return enum_for(:each) unless block_given?

      current_id = nil
      current_document = nil
      results.each do |row|
        id = row[0]
        if current_id.nil? || id != current_id
          # A new root id starts: flush the document built so far.
          yield current_document unless current_document.nil?
          current_document = {}
          current_id = id
        end
        add_to_document(current_document, row)
      end
      yield current_document unless current_document.nil?
    end

    # Merges the columns of +row+ into +document+, starting at column +index+
    # and following the field order of +schema+. Nested fields recurse into
    # their nested schema and produce a nested hash.
    #
    # Returns the index of the first column that was not consumed.
    def add_to_document(document, row, schema = self.schema, index = 0)
      schema.fields.each do |_name, field|
        if field.is_a?(Schema::NestedField)
          nested_doc = {}
          index = add_to_document(nested_doc, row, field.nested_schema, index)
          document[field.name] = combine(document[field.name], nested_doc)
        else
          document[field.name] = combine(document[field.name], row[index])
          index += 1
        end
      end
      index
    end

    private

    # Combines a value already stored in a document with a newly read one.
    #
    # * nothing stored yet (nil/false) -> the incoming value
    # * same value seen again          -> the stored value, unchanged
    # * a different value              -> an array of the distinct values
    #
    # Always returns the value to store. In particular the accumulated array
    # is returned even when +incoming+ is already a member of it.
    def combine(existing, incoming)
      return incoming unless existing

      if existing.is_a?(Array)
        existing << incoming unless existing.include?(incoming)
        existing
      elsif existing == incoming
        existing
      else
        [existing, incoming]
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
module Syphon
  # Mixin for index definition classes. Including it extends the class with
  # ClassMethods and initialises its +pre_sql+ list.
  module Index
    def self.included(base)
      base.extend ClassMethods
      base.pre_sql ||= []
      super
    end

    module ClassMethods
      # SQL statements that sources run on the database connection before
      # importing (see Source#import).
      attr_accessor :pre_sql

      # Subclasses start with their own copy of the parent's pre_sql list.
      def inherited(subclass)
        subclass.pre_sql = pre_sql.dup
        super
      end

      def database_connection
        Syphon.database_connection
      end

      def client
        Syphon.client
      end

      # Alias name of the index: the optional Syphon.index_namespace prefix
      # followed by the class name without a trailing "Index", underscored
      # and pluralized. Memoized.
      def index_name
        @index_name ||= begin
          namespace = Syphon.index_namespace.to_s
          prefix = namespace.empty? ? '' : "#{namespace}_"
          prefix + name.sub(/Index\z/, '').underscore.pluralize
        end
      end

      # Sources registered via define_source, keyed by name.
      def sources
        @sources ||= {}
      end

      # Builds a fresh internal index, then repoints the alias at it.
      #
      # Creates a new internal index, puts the mapping of every source (and
      # imports its data unless options[:schema_only] is set), runs the
      # warmups against the new index, swaps the alias in one update_aliases
      # call, and finally deletes the previously aliased index, if any.
      def build(options = {})
        old_internal_name = internal_index_name
        new_internal_name = new_internal_index_name(index_name)

        client.indices.create(index: new_internal_name)
        sources.each do |_name, source|
          body = source.mapping
          client.indices.put_mapping(index: new_internal_name, type: source.type, body: body)
          source.import(index: new_internal_name) unless options[:schema_only]
        end

        warmups.each { |warmup| warmup.call(new_internal_name) }

        actions = []
        actions << {remove: {index: old_internal_name, alias: index_name}} if old_internal_name
        actions << {add: {index: new_internal_name, alias: index_name}}
        client.indices.update_aliases body: {actions: actions}
        client.indices.delete(index: old_internal_name) if old_internal_name
      end

      # Deletes the internal index the alias currently points at, if any.
      def destroy
        internal_name = internal_index_name
        internal_name && client.indices.delete(index: internal_name)
      end

      # Runs a search through the client. :index defaults to index_name and
      # :type to the type of the default (unnamed) source. The caller's
      # options hash is copied, not modified.
      def search(options = {})
        options = options.dup
        options[:index] ||= index_name
        options[:type] ||= source.type
        client.search(options)
      end

      # Registers (or reopens) the source called +name+ and, when a block is
      # given, configures its schema with it. Returns the source.
      def define_source(name = nil, options = {}, &block)
        source = sources[name] ||= Source.new(self, name, options)
        source.schema.configure(&block) if block
        source
      end

      # Registers a block that build calls with the new internal index name
      # before the alias is switched.
      def define_warmup(&block)
        warmups << block
      end

      def source(name = nil)
        sources[name]
      end

      def warmups
        @warmups ||= []
      end

      protected

      attr_writer :index_name

      private

      # Name of the index the alias currently resolves to, or nil when the
      # client reports the alias as not found.
      def internal_index_name
        internal_name, _alias_info = client.indices.get_alias(name: index_name).first
        internal_name
      rescue Elasticsearch::Transport::Transport::Errors::NotFound
        nil
      end

      # First "<index_name>_<n>" (n = 0, 1, 2, ...) that does not exist yet.
      def new_internal_index_name(index_name)
        suffix = 0
        suffix += 1 while client.indices.exists(index: "#{index_name}_#{suffix}")
        "#{index_name}_#{suffix}"
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Syphon
  # Rails integration: registers the rake tasks and fills in Syphon's
  # settings from the Rails application when they have not been set.
  class Railtie < Rails::Railtie
    rake_tasks do
      require 'syphon/tasks'
    end

    initializer "syphon.initialize" do
      # Database settings: prefer a "<env>_syphon" entry, then the plain
      # environment entry, from ActiveRecord's configurations.
      if Syphon.database_configuration.empty? && defined?(ActiveRecord::Base)
        known_configs = ActiveRecord::Base.configurations
        chosen = known_configs["#{Rails.env}_syphon"] || known_configs[Rails.env]
        Syphon.database_configuration = chosen.symbolize_keys if chosen
      end

      # General settings: config/syphon.yml is rendered through ERB and the
      # section for the current environment is used.
      config_path = "#{Rails.root}/config/syphon.yml"
      if File.exist?(config_path)
        rendered = ERB.new(File.read(config_path)).result
        env_settings = YAML.load(rendered)[Rails.env]
        Syphon.configuration = env_settings.symbolize_keys if env_settings
      end

      # Default namespace: "<underscored app module name>_<env>".
      if Syphon.index_namespace.nil?
        application_name = Rails.application.class.parent_name.underscore
        Syphon.index_namespace = "#{application_name}_#{Rails.env}"
      end
    end
  end
end
@@ -0,0 +1,167 @@
1
module Syphon
  # Describes the fields of a document together with the SQL pieces (FROM
  # relation, joins, conditions, GROUP BY, HAVING) used to select them.
  #
  # SQL pieces may be plain strings or callables; callables are invoked each
  # time a query is generated.
  class Schema
    def initialize(&block)
      @fields = {}
      @relation = nil
      @joins = []
      @conditions = nil
      @group_clause = nil
      @having_clause = nil
      configure(&block) if block
    end

    attr_reader :fields, :joins
    attr_accessor :relation, :conditions, :group_clause, :having_clause

    # Evaluates +block+ against the DSL for this schema. Returns the schema.
    def configure(&block)
      DSL.new(self)._eval(&block)
    end

    # Generates the SELECT statement as a single line of text.
    #
    # options:
    #   :select - replaces the generated select list
    #   :scope  - extra condition ANDed with the schema's conditions
    #   :invert - when truthy, negates the schema's conditions
    #   :order  - ORDER BY expression (string or callable)
    #   :limit  - LIMIT value (value or callable)
    #
    # Runs of whitespace in the result are collapsed to single spaces.
    def query(options = {})
      order = options[:order]
      order_by_fragment = order && "ORDER BY #{query_fragment(order)}"
      limit = options[:limit]
      limit_fragment = limit && "LIMIT #{query_fragment(limit)}"

      select_fragment = options[:select] || select_fragments
      where_clause = where_fragment(options.slice(:scope, :invert))

      clauses = [
        "SELECT #{select_fragment}",
        "FROM #{query_fragment(relation)}",
        joins_fragment,
        where_clause,
        group_by_fragment,
        having_fragment,
        order_by_fragment,
        limit_fragment,
      ]
      clauses.compact.join(' ').strip.gsub(/\s+/, ' ')
    end

    # Hash of field name => field properties, recursing into nested fields.
    def properties
      fields.each_with_object({}) do |(name, field), mapping|
        mapping[name] = field.properties
      end
    end

    private

    def select_fragments
      fields.map { |_name, field| field.select }.join(', ')
    end

    def joins_fragment
      return nil if joins.empty?
      joins.map { |j| query_fragment(j) }.join(' ')
    end

    # Builds the WHERE clause, or nil when there is nothing to filter on.
    # '1' stands in for "no conditions" so that inversion and scoping can be
    # applied uniformly.
    def where_fragment(options)
      fragment = query_fragment(conditions) || '1'
      fragment = "NOT (#{fragment})" if options[:invert]
      scope = options[:scope]
      fragment = "(#{fragment}) AND (#{scope})" if scope
      fragment == '1' ? nil : "WHERE #{fragment}"
    end

    def group_by_fragment
      clause = query_fragment(group_clause)
      clause && "GROUP BY #{clause}"
    end

    def having_fragment
      clause = query_fragment(having_clause)
      clause && "HAVING #{clause}"
    end

    # Resolves a string-or-callable: callables are called, other truthy
    # values are returned as they are, falsy values give nil.
    def query_fragment(string_or_callable)
      if string_or_callable.respond_to?(:call)
        string_or_callable.call
      elsif string_or_callable
        string_or_callable
      end
    end

    # A single field: a name, a type, the SQL expression that produces its
    # value, and its properties (the given options plus :type).
    class Field
      def initialize(name, type, expression, options = {})
        @name = name.to_sym
        @type = type
        @expression = expression
        @properties = options.merge(type: type)
      end

      attr_reader :name, :type, :expression, :properties

      # Select-list entry for this field. When +outer+ is given the column
      # is aliased as "outer[name]".
      def select(outer = nil)
        name = outer ? "#{outer}[#{self.name}]" : self.name
        "#{expression} AS `#{name}`"
      end
    end

    # A field of type :nested whose content is described by its own schema.
    class NestedField < Field
      def initialize(name, options = {}, &block)
        super(name, :nested, nil, options)
        @nested_schema = Schema.new(&block)
      end

      attr_reader :nested_schema

      def properties
        super.merge(properties: nested_schema.properties)
      end

      # Select-list entries for every field of the nested schema, aliased
      # under this field's name. Accepts +outer+ like Field#select so that a
      # nested field can itself sit inside another nested field; the aliases
      # then read "outer[name][inner]".
      def select(outer = nil)
        prefix = outer ? "#{outer}[#{name}]" : name
        nested_schema.fields.map { |_n, f| f.select(prefix) }.join(', ')
      end
    end

    # Receiver of the configuration block passed to Schema#configure.
    DSL = Struct.new(:schema) do
      # Calls the block with the DSL object when it takes exactly one
      # argument, otherwise instance_evals it. Returns the schema.
      def _eval(&block)
        if block.arity == 1
          block.call(self)
        else
          instance_eval(&block)
        end
        schema
      end

      def field(name, type, expression, options = {})
        schema.fields[name.to_sym] = Field.new(name, type, expression, options)
      end

      # One shortcut per type, e.g. `string :name, 'users.name'`.
      %w[string short byte integer long float double date boolean binary geo_point].each do |type|
        define_method(type) do |name, expression, options = {}|
          field(name, type.to_sym, expression, options)
        end
      end

      def nested(name, options = {}, &block)
        schema.fields[name.to_sym] = NestedField.new(name, options, &block)
      end

      # Each of these takes either a string or a block (never both) and
      # stores it in the corresponding schema attribute.
      {
        from: :relation,
        where: :conditions,
        group_by: :group_clause,
        having: :having_clause,
      }.each do |dsl_method, schema_attribute|
        define_method(dsl_method) do |string = nil, &block|
          raise ArgumentError, "both string and block given" if string && block
          schema.public_send("#{schema_attribute}=", string || block)
        end
      end

      # Appends a join clause, given as a string or a block (never both).
      def join(string = nil, &block)
        raise ArgumentError, "both string and block given" if string && block
        schema.joins << (string || block)
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
module Syphon
  # One document type of an index: its schema plus the logic to copy rows
  # from the database into the search index.
  class Source
    # index   - the owning index class (provides client, database_connection,
    #           index_name, pre_sql and name).
    # name    - source name, stored as a symbol (nil stays nil).
    # options - :type overrides the document type; by default it is the
    #           index class name without a trailing "Index", underscored.
    # block   - optional schema configuration block.
    def initialize(index, name, options = {}, &block)
      @index = index
      @name = name && name.to_sym
      @type = options[:type] || default_type
      @schema = Schema.new(&block)
    end

    attr_reader :index, :name, :type, :schema

    def mapping
      {type => {properties: schema.properties}}
    end

    # Imports every row of the schema's query (or options[:query]) into the
    # index options[:index] (default: the index's alias name), sending bulk
    # requests of up to 1000 documents, then refreshes that index. The
    # index's pre_sql statements are run on the connection first.
    def import(options = {})
      db = index.database_connection
      query = options[:query] || schema.query(order: "`#{schema.relation}`.id")
      index.pre_sql.each { |sql| db.query(sql) }
      rows = db.query(query, as: :array, stream: true, cache_rows: false)

      Builder.new(rows, schema).each_slice(1000) do |documents|
        client.bulk body: bulk_indexes(documents, options)
      end
      client.indices.refresh index: options[:index] || index.index_name
    end

    # Re-syncs the documents with the given ids: ids that still produce a
    # row are indexed again, ids that no longer do are deleted.
    #
    # NOTE(review): ids are interpolated into the SQL verbatim, so callers
    # must only pass trusted values.
    def update_ids(ids)
      return if ids.empty?
      query = schema.query(
        scope: "`#{schema.relation}`.id IN (#{ids.join(', ')})",
        order: "`#{schema.relation}`.id",
      )
      # Use the index's connection, as import does, so an index class that
      # overrides database_connection is honoured here too.
      rows = index.database_connection.query(query, as: :array)
      docs = Builder.new(rows, schema).to_a
      body = bulk_indexes(docs) + bulk_deletes(ids, docs)
      client.bulk body: body, refresh: true unless body.empty?
    end

    protected

    def client
      index.client
    end

    private

    # Flat bulk body: an index action followed by the document, for each one.
    def bulk_indexes(documents, options = {})
      documents.flat_map do |document|
        [{index: meta(document[:id], options)}, document]
      end
    end

    # Delete actions for every requested id that has no document.
    def bulk_deletes(ids, documents, options = {})
      ids_to_delete = ids - documents.map { |document| document[:id] }
      ids_to_delete.map do |id|
        {delete: meta(id, options)}
      end
    end

    def meta(id, options = {})
      {_index: options[:index] || index.index_name, _type: type, _id: id}
    end

    def default_type
      @type_name ||= index.name.sub(/Index\z/, '').underscore.to_sym
    end
  end
end
@@ -0,0 +1,26 @@
1
namespace :syphon do
  # Builds the configured index classes, e.g. `rake syphon:build`.
  #
  # The optional +indices+ argument restricts the build to the named index
  # classes, e.g. `rake "syphon:build[UserIndex Search::PostIndex]"`. Names
  # are separated by anything that is not a word character or ':'.
  task :build, [:indices] => :environment do |_task, args|
    require 'set'
    classes = Syphon.index_classes
    indices = args[:indices]
    filtering = indices.present?

    if filtering
      # ':' is part of a name so namespaced classes (Foo::BarIndex) match.
      class_names = indices.scan(/[\w:]+/).to_set
      classes.select! { |c| class_names.include?(c.name) }
    end

    n = classes.size
    if n == 0
      if filtering
        puts "No index classes found matching '#{indices}'. Available: #{Syphon.index_classes.map(&:name).join(', ')}"
      else
        puts "No index classes"
      end
    else
      classes.each_with_index do |klass, i|
        puts "#{i+1}/#{n}: Building #{klass}..."
        klass.build
      end
      puts "Done."
    end
  end
end
@@ -0,0 +1,11 @@
1
module Syphon
  # Version as [major, minor, patch]. The array itself is comparable with
  # other arrays and prints as "major.minor.patch".
  VERSION = [0, 0, 1]

  class << VERSION
    include Comparable

    def to_s
      join('.')
    end
  end

  # Frozen only after the singleton methods above have been attached.
  VERSION.freeze
end
data/lib/syphon.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'active_support/inflector'
2
+ require 'elasticsearch'
3
+ require 'mysql2'
4
+
5
# Top-level namespace and process-wide settings for Syphon.
module Syphon
  autoload :Builder, 'syphon/builder'
  autoload :Index, 'syphon/index'
  autoload :Schema, 'syphon/schema'
  autoload :Source, 'syphon/source'
  autoload :VERSION, 'syphon/version'

  class << self
    attr_writer :configuration, :database_configuration, :index_namespace

    # General settings hash; also passed to Elasticsearch::Client.new.
    def configuration
      @configuration ||= {}
    end

    # Settings hash passed to Mysql2::Client.new.
    def database_configuration
      @database_configuration ||= {}
    end

    # Prefix for index names; falls back to configuration[:index_namespace].
    def index_namespace
      @index_namespace ||= configuration[:index_namespace]
    end

    # Database client, created on first use and then reused.
    def database_connection
      @database_connection ||= Mysql2::Client.new(database_configuration)
    end

    # Search client, created on first use and stored per thread.
    def client
      Thread.current[:syphon_client] ||= Elasticsearch::Client.new(Syphon.configuration)
    end

    # Index classes named by the :index_classes setting, as constants.
    #
    # The setting is looked up under the symbol key first (the form the
    # other settings use) and under the string key as a fallback. Returns an
    # empty array when nothing is configured.
    def index_classes
      names = configuration[:index_classes] || configuration['index_classes']
      Array(names).map(&:constantize)
    end
  end
end
40
+
41
+ require 'syphon/railtie' if defined?(Rails)
data/syphon.gemspec ADDED
@@ -0,0 +1,22 @@
1
# Make lib/ loadable so the version constant can be read below.
lib_directory = File.expand_path('lib', File.dirname(__FILE__))
$:.unshift lib_directory
require 'syphon/version'

Gem::Specification.new do |spec|
  blurb = "Syphon data from an Arel source into ElasticSearch"

  spec.name        = 'syphon'
  spec.version     = Syphon::VERSION
  spec.authors     = ['George Ogata']
  spec.email       = ['george.ogata@gmail.com']
  spec.description = blurb
  spec.summary     = blurb
  spec.homepage    = 'https://github.com/howaboutwe/syphon'

  # File lists come from git, so only tracked files are packaged.
  tracked_binaries = `git ls-files -- bin/*`.split("\n")
  spec.executables = tracked_binaries.map { |path| File.basename(path) }
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")

  spec.add_dependency 'elasticsearch', '~> 0.4.0'
  spec.add_dependency 'activesupport', '~> 3.2.0'
  spec.add_dependency 'mysql2', '~> 0.3.12'

  spec.add_development_dependency 'bundler'
end
@@ -0,0 +1,8 @@
1
+ database:
2
+ adapter: mysql2
3
+ host: localhost
4
+ username: root
5
+ password:
6
+ database: syphon
7
+
8
+ elasticsearch: {}
@@ -0,0 +1,73 @@
1
+ require_relative '../test_helper'
2
+
3
# Specs for Syphon::Builder: how flat rows are folded into documents.
describe Syphon::Builder do
  describe "#each" do
    it "returns a document for each row with a distinct id" do
      schema = Syphon::Schema.new do
        integer :id, 0
        string :name, 'x'
      end
      results = [[1, 'one'], [2, 'two']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, name: 'one'}, {id: 2, name: 'two'}]
    end

    it "builds nested documents for nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :a, 'x'
          integer :b, 'x'
        end
        nested :nested2 do
          integer :a, 'x'
        end
      end
      results = [[1, 10, 11, 12], [2, 20, 21, 22]]
      Syphon::Builder.new(results, schema).to_a.must_equal [
        {id: 1, nested1: {a: 10, b: 11}, nested2: {a: 12}},
        {id: 2, nested1: {a: 20, b: 21}, nested2: {a: 22}},
      ]
    end

    it "merges content from rows with the same root id" do
      schema = Syphon::Schema.new do
        integer :id, 0
        string :name, 'x'
      end
      results = [[1, 'one'], [1, 'two']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, name: ['one', 'two']}]
    end

    it "merges content with the same root id correctly when there are nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :a, 'x'
        end
        nested :nested2 do
          integer :a, 'x'
        end
      end
      # Both rows share root id 1, so each nested field must collect the
      # nested documents of both rows.
      results = [[1, 10, 11], [1, 20, 21]]
      Syphon::Builder.new(results, schema).to_a.must_equal [
        {id: 1, nested1: [{a: 10}, {a: 20}], nested2: [{a: 11}, {a: 21}]},
      ]
    end

    it "supports arrays as nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :id, 0
          string :name, 'x'
        end
      end
      results = [[1, 2, 'a'], [1, 3, 'b']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, nested1: [{id: 2, name: 'a'}, {id: 3, name: 'b'}]}]
    end
  end
end