syphon 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ Gemfile.lock
2
+ test/config.yml
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ == 0.0.1 2013-11-18
2
+
3
+ * Hi.
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
3
+
4
+ gem 'debugger', '~> 1.6.0', platform: :ruby_19
5
+ gem 'looksee', '~> 1.1.0'
6
+ gem 'ritual', '~> 0.4.0'
7
+ gem 'temporaries', '~> 0.3.0'
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) George Ogata
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,5 @@
1
+ ## Syphon
2
+
3
+ Syphon data from an Arel source into ElasticSearch.
4
+
5
+ ### Work in progress
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'ritual'
@@ -0,0 +1,57 @@
1
module Syphon
  # Folds a flat result set into one document (Hash) per root id.
  #
  # Each row is an array of column values laid out in the same order as the
  # schema's fields, with the columns of a nested field's own schema inlined
  # at the nested field's position. The first column of a row is treated as
  # the root id: consecutive rows that share it are merged into a single
  # document, and a field that takes several distinct values across those
  # rows becomes an array of those values.
  class Builder
    include Enumerable

    # results - object responding to #each and yielding row arrays.
    # schema  - object whose #fields yields (name, field) pairs; nested fields
    #           must be Schema::NestedField instances exposing #nested_schema.
    def initialize(results, schema)
      @results = results
      @schema = schema
      @nested_docs = {}
    end

    attr_reader :results, :schema, :nested_docs

    # Yields each assembled document in row order. Returns an Enumerator
    # when no block is given.
    def each
      return enum_for(:each) unless block_given?

      current_id = nil
      current_document = nil
      results.each do |row|
        id = row[0]
        if current_id.nil? || id != current_id
          # A new root id starts: flush the document built so far.
          yield current_document unless current_document.nil?
          current_document = {}
          current_id = id
        end
        add_to_document(current_document, row)
      end
      yield current_document unless current_document.nil?
    end

    # Merges the columns of +row+ starting at +index+ into +document+,
    # following the field order of +schema+ and recursing into nested fields.
    #
    # Returns the index of the first column not consumed, so a caller
    # processing a nested field knows where the following field starts.
    def add_to_document(document, row, schema = self.schema, index = 0)
      schema.fields.each do |_name, field|
        if field.is_a?(Schema::NestedField)
          nested_doc = {}
          index = add_to_document(nested_doc, row, field.nested_schema, index)
          document[field.name] = combine(document[field.name], nested_doc)
        else
          document[field.name] = combine(document[field.name], row[index])
          index += 1
        end
      end
      index
    end

    private

    # Returns the merged value of a field given what the document already
    # holds (+existing+) and the value from the current row (+incoming+).
    #
    # Always returns the value to store. The array branch must return the
    # array itself: `existing << x unless ...` evaluates to nil when the
    # value is already present, which would otherwise erase the field.
    def combine(existing, incoming)
      return incoming unless existing

      if existing.is_a?(Array)
        existing << incoming unless existing.include?(incoming)
        existing
      else
        existing == incoming ? existing : [existing, incoming]
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
module Syphon
  # Mixin for classes that define an Elasticsearch index.
  #
  # Including it extends the class with ClassMethods and gives it an empty
  # +pre_sql+ list. The public index name is an alias; the data lives in an
  # internal index named "<index_name>_<n>", so a rebuild can populate a new
  # internal index and then repoint the alias.
  module Index
    def self.included(base)
      base.extend ClassMethods
      base.pre_sql ||= []
      super
    end

    module ClassMethods
      # SQL statements run against the database connection before an import.
      attr_accessor :pre_sql

      # Subclasses start with their own copy of the parent's pre_sql list.
      def inherited(subclass)
        subclass.pre_sql = pre_sql.dup
        super
      end

      def database_connection
        Syphon.database_connection
      end

      def client
        Syphon.client
      end

      # Alias name: optional "<namespace>_" prefix followed by the class name
      # with a trailing "Index" removed, underscored and pluralized.
      def index_name
        @index_name ||= begin
          namespace = Syphon.index_namespace.to_s
          prefix = namespace.empty? ? '' : "#{namespace}_"
          prefix + name.sub(/Index\z/, '').underscore.pluralize
        end
      end

      # Sources keyed by name (nil is the default source).
      def sources
        @sources ||= {}
      end

      # Builds a fresh internal index and switches the alias over to it.
      #
      # options[:schema_only] - when truthy, mappings are created but no
      #                         data is imported.
      #
      # The previous internal index, if any, is deleted after the alias has
      # been updated.
      def build(options = {})
        old_internal_name = internal_index_name
        new_internal_name = new_internal_index_name(index_name)

        client.indices.create(index: new_internal_name)
        sources.each_value do |source|
          client.indices.put_mapping(index: new_internal_name, type: source.type, body: source.mapping)
          source.import(index: new_internal_name) unless options[:schema_only]
        end

        warmups.each { |warmup| warmup.call(new_internal_name) }

        actions = []
        actions << {remove: {index: old_internal_name, alias: index_name}} if old_internal_name
        actions << {add: {index: new_internal_name, alias: index_name}}
        client.indices.update_aliases body: {actions: actions}
        client.indices.delete(index: old_internal_name) if old_internal_name
      end

      # Deletes the internal index behind the alias, if there is one.
      def destroy
        internal_name = internal_index_name
        client.indices.delete index: internal_name if internal_name
      end

      # Runs a search, defaulting :index to the alias and :type to the
      # default source's type. The caller's hash is left untouched.
      def search(options = {})
        options = options.dup
        options[:index] ||= index_name
        options[:type] ||= source.type
        client.search(options)
      end

      # Returns the source called +name+, creating it on first use, and
      # applies the block (if any) to its schema.
      def define_source(name = nil, options = {}, &block)
        source = sources[name] ||= Source.new(self, name, options)
        source.schema.configure(&block) if block
        source
      end

      # Registers a block that #build calls with the new internal index
      # name before the alias is switched.
      def define_warmup(&block)
        warmups << block
      end

      def source(name = nil)
        sources[name]
      end

      def warmups
        @warmups ||= []
      end

      protected

      attr_writer :index_name

      private

      # Name of the internal index the alias currently points at, or nil
      # when the alias lookup raises NotFound.
      def internal_index_name
        internal_name, _alias_info = client.indices.get_alias(name: index_name).first
        internal_name
      rescue Elasticsearch::Transport::Transport::Errors::NotFound
        nil
      end

      # First "<index_name>_<n>" (n = 0, 1, 2, ...) that does not exist yet.
      def new_internal_index_name(index_name)
        i = 0
        loop do
          candidate = "#{index_name}_#{i}"
          return candidate unless client.indices.exists(index: candidate)
          i += 1
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module Syphon
  # Rails integration: loads the rake tasks and fills in Syphon's
  # configuration from the host application during initialization.
  class Railtie < Rails::Railtie
    rake_tasks do
      require 'syphon/tasks'
    end

    initializer "syphon.initialize" do
      # Database settings: only when none were set explicitly. Prefer a
      # "<env>_syphon" entry, falling back to the plain "<env>" entry.
      if Syphon.database_configuration.empty? && defined?(ActiveRecord::Base)
        all_db_configs = ActiveRecord::Base.configurations
        env_db_config = all_db_configs["#{Rails.env}_syphon"] || all_db_configs[Rails.env]
        Syphon.database_configuration = env_db_config.symbolize_keys if env_db_config
      end

      # General settings: config/syphon.yml is run through ERB, parsed as
      # YAML, and the section for the current environment is used.
      config_path = "#{Rails.root}/config/syphon.yml"
      if File.exist?(config_path)
        rendered = ERB.new(File.read(config_path)).result
        env_config = YAML.load(rendered)[Rails.env]
        Syphon.configuration = env_config.symbolize_keys if env_config
      end

      # Default namespace: "<underscored application module>_<env>".
      if Syphon.index_namespace.nil?
        application_name = Rails.application.class.parent_name.underscore
        Syphon.index_namespace = "#{application_name}_#{Rails.env}"
      end
    end
  end
end
@@ -0,0 +1,167 @@
1
module Syphon
  # Describes the fields of a document and the SQL used to fetch them.
  #
  # A schema is configured through a small DSL (see DSL below): typed field
  # declarations, nested sub-schemas, and the FROM / JOIN / WHERE /
  # GROUP BY / HAVING parts of the query. Each query part may be given as a
  # string or as a callable that is invoked when the query is built.
  class Schema
    def initialize(&block)
      @fields = {}
      @relation = nil
      @joins = []
      @conditions = nil
      @group_clause = nil
      @having_clause = nil
      configure(&block) if block
    end

    attr_reader :fields, :joins
    attr_accessor :relation, :conditions, :group_clause, :having_clause

    # Evaluates the block against this schema's DSL. Returns the schema.
    def configure(&block)
      DSL.new(self)._eval(&block)
    end

    # Builds the SQL query as a single-line string (all whitespace runs are
    # collapsed to one space).
    #
    # options[:select] - replaces the generated select list.
    # options[:scope]  - extra condition ANDed onto the WHERE clause.
    # options[:invert] - when truthy, negates the schema's conditions.
    # options[:order]  - ORDER BY expression (string or callable).
    # options[:limit]  - LIMIT expression (string or callable).
    def query(options = {})
      order = options[:order]
      order_by_fragment = "ORDER BY #{query_fragment(order)}" if order
      limit = options[:limit]
      limit_fragment = "LIMIT #{query_fragment(limit)}" if limit

      select_fragment = options[:select] || select_fragments
      # Pass the two keys explicitly rather than via Hash#slice, which is
      # not part of core Ruby before 2.5.
      where_sql = where_fragment(scope: options[:scope], invert: options[:invert])

      [
        "SELECT #{select_fragment}",
        "FROM #{query_fragment(relation)}",
        joins_fragment,
        where_sql,
        group_by_fragment,
        having_fragment,
        order_by_fragment,
        limit_fragment,
      ].compact.join(' ').strip.gsub(/\s+/, ' ')
    end

    # Mapping properties keyed by field name.
    def properties
      fields.each_with_object({}) do |(name, field), mapping|
        mapping[name] = field.properties
      end
    end

    private

    def select_fragments
      fields.map { |_name, field| field.select }.join(', ')
    end

    def joins_fragment
      return nil if joins.empty?
      joins.map { |j| query_fragment(j) }.join(' ')
    end

    # Returns the WHERE clause, or nil when there is nothing to filter on.
    # '1' stands in for "no conditions" so inversion and scoping compose.
    def where_fragment(options)
      fragment = query_fragment(conditions) || '1'
      fragment = "NOT (#{fragment})" if options[:invert]
      scope = options[:scope]
      fragment = "(#{fragment}) AND (#{scope})" if scope
      fragment == '1' ? nil : "WHERE #{fragment}"
    end

    def group_by_fragment
      clause = query_fragment(group_clause)
      clause && "GROUP BY #{clause}"
    end

    def having_fragment
      clause = query_fragment(having_clause)
      clause && "HAVING #{clause}"
    end

    # Resolves a query part: callables are called, anything else truthy is
    # returned as is, and nil/false yields nil.
    def query_fragment(string_or_callable)
      if string_or_callable.respond_to?(:call)
        string_or_callable.call
      elsif string_or_callable
        string_or_callable
      end
    end

    # A single typed column of the document.
    class Field
      # options are merged into the mapping properties alongside :type.
      def initialize(name, type, expression, options = {})
        @name = name.to_sym
        @type = type
        @expression = expression
        @properties = options.merge(type: type)
      end

      attr_reader :name, :type, :expression, :properties

      # Select-list entry for this field. When +outer+ is given the column
      # is aliased as "outer[name]".
      def select(outer = nil)
        name = outer ? "#{outer}[#{self.name}]" : self.name
        "#{expression} AS `#{name}`"
      end
    end

    # A field whose value is a sub-document described by its own schema.
    class NestedField < Field
      def initialize(name, options = {}, &block)
        super(name, :nested, nil, options)
        @nested_schema = Schema.new(&block)
      end

      attr_reader :nested_schema

      def properties
        super.merge(properties: nested_schema.properties)
      end

      # Select-list entries for every field of the nested schema. Accepts
      # +outer+ like Field#select so nested fields can themselves contain
      # nested fields; aliases then chain as "outer[name][inner]".
      def select(outer = nil)
        prefix = outer ? "#{outer}[#{name}]" : name
        nested_schema.fields.map { |_n, f| f.select(prefix) }.join(', ')
      end
    end

    # Receiver for schema configuration blocks.
    DSL = Struct.new(:schema) do
      # Runs the block: a one-argument block receives the DSL object, any
      # other block is instance_eval'd against it. Returns the schema.
      def _eval(&block)
        if block.arity == 1
          block.call(self)
        else
          instance_eval(&block)
        end
        schema
      end

      def field(name, type, expression, options = {})
        schema.fields[name.to_sym] = Field.new(name, type, expression, options)
      end

      # One declaration method per supported type, e.g.
      #   string :name, 'users.name'
      %w[string short byte integer long float double date boolean binary geo_point].each do |type|
        type_name = type.to_sym
        define_method(type_name) do |name, expression, options = {}|
          field(name, type_name, expression, options)
        end
      end

      def nested(name, options = {}, &block)
        schema.fields[name.to_sym] = NestedField.new(name, options, &block)
      end

      # Query-part setters; each accepts either a string or a block.
      {
        from: :relation,
        where: :conditions,
        group_by: :group_clause,
        having: :having_clause,
      }.each do |dsl_method, schema_attribute|
        writer = "#{schema_attribute}="
        define_method(dsl_method) do |string = nil, &block|
          raise ArgumentError, "both string and block given" if string && block
          schema.public_send(writer, string || block)
        end
      end

      def join(string = nil, &block)
        raise ArgumentError, "both string and block given" if string && block
        schema.joins << (string || block)
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
module Syphon
  # One document type of an index: its schema plus the logic that copies
  # rows from the database into Elasticsearch through the bulk API.
  class Source
    # index   - the owning index class (provides client, database_connection,
    #           pre_sql and index_name).
    # name    - source name, or nil for the default source.
    # options - :type overrides the document type, which otherwise derives
    #           from the index class name.
    def initialize(index, name, options = {}, &block)
      @index = index
      @name = name && name.to_sym
      @type = options[:type] || default_type
      @schema = Schema.new(&block)
    end

    attr_reader :index, :name, :type, :schema

    def mapping
      {type => {properties: schema.properties}}
    end

    # Imports every document, 1000 per bulk request, then refreshes the
    # target index.
    #
    # options[:query] - SQL to run instead of the schema's query ordered by
    #                   the relation's id.
    # options[:index] - index to write to (defaults to index.index_name).
    def import(options = {})
      db = index.database_connection
      query = options[:query] || schema.query(order: "`#{schema.relation}`.id")
      index.pre_sql.each { |sql| db.query(sql) }
      rows = db.query(query, as: :array, stream: true, cache_rows: false)

      Builder.new(rows, schema).each_slice(1000) do |slice|
        client.bulk body: bulk_indexes(slice, options)
      end
      client.indices.refresh index: options[:index] || index.index_name
    end

    # Re-syncs the given ids: rows that still match the schema's query are
    # reindexed, ids that no longer produce a row are deleted.
    #
    # NOTE(review): ids are interpolated into the SQL verbatim; callers
    # must not pass untrusted values.
    def update_ids(ids)
      return if ids.empty?
      query = schema.query(
        scope: "`#{schema.relation}`.id IN (#{ids.join(', ')})",
        order: "`#{schema.relation}`.id",
      )
      # Go through the index so a connection overridden there is honoured,
      # as #import already does.
      rows = index.database_connection.query(query, as: :array)
      docs = Builder.new(rows, schema).to_a
      body = bulk_indexes(docs) + bulk_deletes(ids, docs)
      client.bulk body: body, refresh: true unless body.empty?
    end

    protected

    def client
      index.client
    end

    private

    # Bulk "index" actions: a metadata entry followed by the document.
    def bulk_indexes(documents, options = {})
      documents.flat_map do |document|
        [{index: meta(document[:id], options)}, document]
      end
    end

    # Bulk "delete" actions for the ids that have no matching document.
    # Ids are compared by their string form so that "1" and 1 count as the
    # same id; otherwise string ids would delete freshly indexed documents.
    def bulk_deletes(ids, documents, options = {})
      found = {}
      documents.each { |document| found[document[:id].to_s] = true }
      ids.reject { |id| found.key?(id.to_s) }.map do |id|
        {delete: meta(id, options)}
      end
    end

    def meta(id, options = {})
      {_index: options[:index] || index.index_name, _type: type, _id: id}
    end

    # Index class name without a trailing "Index", underscored.
    def default_type
      @type_name ||= index.name.sub(/Index\z/, '').underscore.to_sym
    end
  end
end
@@ -0,0 +1,26 @@
1
namespace :syphon do
  # Usage:
  #   rake syphon:build                       # every configured index class
  #   rake "syphon:build[UserIndex PostIndex]" # only the named classes
  desc "Build Syphon indices (all, or only the index classes named in the argument)"
  task :build, [:indices] => :environment do |_task, args|
    require 'set'
    classes = Syphon.index_classes

    indices = args[:indices]
    filtered = indices.present?
    if filtered
      # Keep "::" inside names so namespaced classes (Foo::BarIndex) match.
      class_names = indices.scan(/[\w:]+/).to_set
      classes = classes.select { |c| class_names.include?(c.name) }
    end

    n = classes.size
    if n == 0
      if filtered
        puts "No index classes found matching '#{indices}'. Available: #{Syphon.index_classes.map(&:name).join(', ')}"
      else
        puts "No index classes"
      end
    else
      classes.each_with_index do |klass, i|
        puts "#{i+1}/#{n}: Building #{klass}..."
        klass.build
      end
      puts "Done."
    end
  end
end
@@ -0,0 +1,11 @@
1
module Syphon
  # Version as [major, minor, patch]. The array itself is made comparable
  # and renders as a dotted string.
  VERSION = [0, 0, 1]

  class << VERSION
    include Comparable

    def to_s
      join('.')
    end
  end

  # Freeze only after the singleton methods above are attached; a frozen
  # object cannot have singleton methods added to it.
  VERSION.freeze
end
data/lib/syphon.rb ADDED
@@ -0,0 +1,41 @@
1
+ require 'active_support/inflector'
2
+ require 'elasticsearch'
3
+ require 'mysql2'
4
+
5
# Top-level namespace: holds the global configuration and the shared
# database / Elasticsearch connections.
module Syphon
  autoload :Builder, 'syphon/builder'
  autoload :Index, 'syphon/index'
  autoload :Schema, 'syphon/schema'
  autoload :Source, 'syphon/source'
  autoload :VERSION, 'syphon/version'

  class << self
    attr_writer :configuration, :database_configuration, :index_namespace

    # Settings hash; also passed as the options to Elasticsearch::Client.new.
    def configuration
      @configuration ||= {}
    end

    # Options passed to Mysql2::Client.new.
    def database_configuration
      @database_configuration ||= {}
    end

    # Explicitly assigned namespace, else configuration[:index_namespace].
    def index_namespace
      @index_namespace ||= configuration[:index_namespace]
    end

    # Single MySQL connection, created on first use.
    def database_connection
      @database_connection ||= Mysql2::Client.new(database_configuration)
    end

    # Elasticsearch client, one per thread.
    def client
      Thread.current[:syphon_client] ||= Elasticsearch::Client.new(Syphon.configuration)
    end

    # Index classes named in the configuration. The key is looked up as a
    # symbol first (configuration loaded with symbolized keys) and then as
    # a string; an absent key yields an empty list.
    def index_classes
      names = configuration[:index_classes] || configuration['index_classes'] || []
      names.map(&:constantize)
    end
  end
end
40
+
41
+ require 'syphon/railtie' if defined?(Rails)
data/syphon.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ $:.unshift File.expand_path('lib', File.dirname(__FILE__))
2
+ require 'syphon/version'
3
+
4
Gem::Specification.new do |gem|
  gem.name          = 'syphon'
  gem.version       = Syphon::VERSION
  gem.authors       = ['George Ogata']
  gem.email         = ['george.ogata@gmail.com']
  gem.description   = "Syphon data from an Arel source into ElasticSearch"
  gem.summary       = "Syphon data from an Arel source into ElasticSearch"
  gem.homepage      = 'https://github.com/howaboutwe/syphon'
  # Matches the permission notice shipped in the LICENSE file.
  gem.license       = 'MIT'

  # File lists come from git, so the gem must be built from a checkout.
  gem.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  gem.files         = `git ls-files`.split("\n")
  gem.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")

  gem.add_dependency 'elasticsearch', '~> 0.4.0'
  gem.add_dependency 'activesupport', '~> 3.2.0'
  gem.add_dependency 'mysql2', '~> 0.3.12'

  gem.add_development_dependency 'bundler'
end
@@ -0,0 +1,8 @@
1
+ database:
2
+ adapter: mysql2
3
+ host: localhost
4
+ username: root
5
+ password:
6
+ database: syphon
7
+
8
+ elasticsearch: {}
@@ -0,0 +1,73 @@
1
+ require_relative '../test_helper'
2
+
3
# Specs for Syphon::Builder. Field expressions are placeholders: the builder
# only looks at the order and nesting of the schema's fields.
describe Syphon::Builder do
  describe "#each" do
    it "returns a document for each row with a distinct id" do
      schema = Syphon::Schema.new do
        integer :id, 0
        string :name, 'x'
      end
      results = [[1, 'one'], [2, 'two']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, name: 'one'}, {id: 2, name: 'two'}]
    end

    it "builds nested documents for nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :a, 'x'
          integer :b, 'x'
        end
        nested :nested2 do
          integer :a, 'x'
        end
      end
      results = [[1, 10, 11, 12], [2, 20, 21, 22]]
      Syphon::Builder.new(results, schema).to_a.must_equal [
        {id: 1, nested1: {a: 10, b: 11}, nested2: {a: 12}},
        {id: 2, nested1: {a: 20, b: 21}, nested2: {a: 22}},
      ]
    end

    it "merges content from rows with the same root id" do
      schema = Syphon::Schema.new do
        integer :id, 0
        string :name, 'x'
      end
      results = [[1, 'one'], [1, 'two']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, name: ['one', 'two']}]
    end

    it "merges content with the same root id correctly when there are nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :a, 'x'
        end
        nested :nested2 do
          integer :a, 'x'
        end
      end
      # Both rows share root id 1, so each nested field collects two documents.
      results = [[1, 10, 11], [1, 20, 21]]
      Syphon::Builder.new(results, schema).to_a.must_equal [
        {id: 1, nested1: [{a: 10}, {a: 20}], nested2: [{a: 11}, {a: 21}]},
      ]
    end

    it "supports arrays as nested fields" do
      schema = Syphon::Schema.new do
        integer :id, 0
        nested :nested1 do
          integer :id, 0
          string :name, 'x'
        end
      end
      results = [[1, 2, 'a'], [1, 3, 'b']]
      Syphon::Builder.new(results, schema).to_a.
        must_equal [{id: 1, nested1: [{id: 2, name: 'a'}, {id: 3, name: 'b'}]}]
    end
  end
end