wukong-load 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.yardopts +5 -0
- data/Gemfile +16 -0
- data/LICENSE.md +1 -1
- data/README.md +100 -34
- data/bin/wu-load +1 -47
- data/bin/wu-source +4 -0
- data/lib/wukong-load.rb +36 -3
- data/lib/wukong-load/load_runner.rb +64 -0
- data/lib/wukong-load/loader.rb +7 -0
- data/lib/wukong-load/loaders/elasticsearch.rb +151 -0
- data/lib/wukong-load/loaders/kafka.rb +98 -0
- data/lib/wukong-load/loaders/mongodb.rb +123 -0
- data/lib/wukong-load/loaders/sql.rb +169 -0
- data/lib/wukong-load/models/http_request.rb +60 -0
- data/lib/wukong-load/source_driver.rb +46 -0
- data/lib/wukong-load/source_runner.rb +36 -0
- data/lib/wukong-load/version.rb +1 -1
- data/spec/spec_helper.rb +13 -0
- data/spec/wukong-load/loaders/elasticsearch_spec.rb +142 -0
- data/spec/wukong-load/loaders/kafka_spec.rb +72 -0
- data/spec/wukong-load/loaders/mongodb_spec.rb +100 -0
- data/spec/wukong-load/loaders/sql_spec.rb +112 -0
- data/spec/wukong-load/models/http_request_spec.rb +21 -0
- data/wukong-load.gemspec +3 -2
- metadata +26 -10
- data/lib/wukong-load/configuration.rb +0 -8
- data/lib/wukong-load/elasticsearch.rb +0 -99
- data/lib/wukong-load/runner.rb +0 -48
- data/spec/wukong-load/elasticsearch_spec.rb +0 -140
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
  module Load

    # Loads data into Kafka.
    #
    # Uses the `kafka-rb` gem to create a Kafka::MultiProducer to write
    # to Kafka.
    #
    # Allows loading records into a given topic on a given partition.
    # Records can have fields `_topic` and `_partition` which override
    # the given topic and partition on a per-record basis.
    #
    # The names of these fields within each record (`_topic` and
    # `_partition`) can be customized.
    class KafkaLoader < Loader

      field :host,            String,  :default => 'localhost',  :doc => "Kafka broker host"
      field :port,            Integer, :default => 9092,         :doc => "Kafka broker port"
      field :topic,           String,  :default => 'test',       :doc => "Kafka topic"
      field :topic_field,     String,  :default => '_topic',     :doc => "Field within records which names the Kafka topic"
      field :partition,       Integer, :default => 0,            :doc => "Kafka partition"
      field :partition_field, String,  :default => '_partition', :doc => "Field within records which names the Kafka partition"

      description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into a Kafka queue.

        $ cat data.json | wu-load kafka

        By default, wu-load attempts to write each input record to a
        local Kafka broker.

        Input records will be written to a default Kafka topic on a
        default partition. Each record can have _topic and _partition
        fields to override this on a per-record basis.

        The fields used (_topic and _partition) can be changed:

        $ cat data.json | wu-load kafka --host=10.123.123.123 --topic=hits --partition_field=segment_id
      EOF

      # The Kafka producer used to send messages to Kafka.
      attr_accessor :producer

      # Creates the producer.
      #
      # The `kafka-rb` gem is required here, inside the method, rather
      # than at the top of the file.
      #
      # @raise [Error] if the gem cannot be required or the producer
      #   cannot be constructed
      def setup
        begin
          require 'kafka'
        rescue LoadError, StandardError
          # LoadError descends from ScriptError, not StandardError, so a
          # bare `rescue` never sees a missing gem; name it explicitly.
          # StandardError is kept so anything previously rescued still is.
          raise Error.new("Please ensure that the 'kafka-rb' gem is installed and available (in your Gemfile)")
        end
        log.debug("Connecting to Kafka broker at #{host}:#{port}...")
        begin
          self.producer = Kafka::MultiProducer.new(:host => host, :port => port)
        rescue => e
          raise Error.new(e.message)
        end
      end

      # Load a single record into Kafka.
      #
      # The topic and partition are resolved per record (see #topic_for
      # and #partition_for).  Any StandardError raised while sending is
      # passed to `handle_error` together with the record.
      #
      # @param [Hash] record
      def load record
        begin
          topic     = topic_for(record)
          partition = partition_for(record)
          bytes     = producer.send(topic, messages_for(record), :partition => partition)
          log.info("Wrote #{bytes} bytes to #{topic}/#{partition}")
        rescue => e
          handle_error(record, e)
        end
      end

      # :nodoc:
      #
      # Topic named by the record's topic field, else the default topic.
      def topic_for record
        record[topic_field] || self.topic
      end

      # :nodoc:
      #
      # Wraps the JSON-serialized record in a one-element array of
      # Kafka::Message.
      def messages_for record
        [Kafka::Message.new(MultiJson.dump(record))]
      end

      # :nodoc:
      #
      # Partition named by the record's partition field (coerced with
      # `to_i`), else the default partition.
      def partition_for record
        record[partition_field] ? record[partition_field].to_i : partition
      end

      register :kafka_loader

    end
  end
end
|
96
|
+
|
97
|
+
|
98
|
+
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
  module Load

    # Loads data into MongoDB.
    #
    # Uses the 'mongo' gem to connect and write data.
    #
    # Allows loading records into a given database and collection.
    # Records can have fields `_database` and `_collection` which
    # override the given database and collection on a per-record
    # basis.
    #
    # Records can have an `_id` field which indicates an update, not
    # an insert.
    #
    # The names of these fields within each record (`_database`,
    # `_collection`, and `_id`) can be customized.
    class MongoDBLoader < Loader

      field :host,             String,  :default => 'localhost',        :doc => "MongoDB host"
      field :port,             Integer, :default => 27017,              :doc => "Port on MongoDB host"
      field :database,         String,  :default => 'wukong',           :doc => "Default MongoDB database"
      field :collection,       String,  :default => 'streaming_record', :doc => "Default MongoDB collection"
      field :database_field,   String,  :default => '_database',        :doc => "Name of field in each record overriding default MongoDB database"
      field :collection_field, String,  :default => '_collection',      :doc => "Name of field in each record overriding default MongoDB collection"
      field :id_field,         String,  :default => '_id',              :doc => "Name of field in each record providing ID of existing MongoDB record to update"

      description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MongoDB.

        $ cat data.json | wu-load mongodb

        By default, wu-load attempts to write each input record to a
        local MongoDB server.

        Input records will be written to a default database and
        collection. Each record can have _database and _collection
        fields to override this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are written as fields of the
        MongoDB document.

        The fields used (_database, _collection, and _id) can be changed:

        $ cat data.json | wu-load mongodb --host=10.123.123.123 --database=web_events --collection=impressions --id_field=impression_id
      EOF

      # The Mongo::MongoClient we'll use for talking to MongoDB.
      attr_accessor :client

      # Creates the client connection.
      #
      # The 'mongo' gem is required here, inside the method, rather
      # than at the top of the file.  A leading `http://` on the
      # configured host is stripped before connecting.
      #
      # @raise [Error] if the gem cannot be required or the client
      #   cannot be constructed
      def setup
        begin
          require 'mongo'
        rescue LoadError, StandardError
          # LoadError descends from ScriptError, not StandardError, so a
          # bare `rescue` never sees a missing gem; name it explicitly.
          # StandardError is kept so anything previously rescued still is.
          raise Error.new("Please ensure that the 'mongo' gem is installed and available (in your Gemfile)")
        end
        h = host.gsub(%r{^http://},'')
        log.debug("Connecting to MongoDB server at #{h}:#{port}...")
        begin
          self.client = Mongo::MongoClient.new(h, port)
        rescue => e
          raise Error.new(e.message)
        end
      end

      # Load a single record into MongoDB.
      #
      # If the record has an ID, we'll issue an update (as an upsert,
      # so a missing document is created), otherwise an insert.
      #
      # @param [Hash] record
      def load record
        id = id_for(record)
        if id
          res = collection_for(record).update({:_id => id}, record, :upsert => true)
          # The driver's response tells us whether the upsert matched an
          # existing document or created a new one.
          if res['updatedExisting']
            log.info("Updated #{id}")
          else
            log.info("Inserted #{id}")
          end
        else
          res = collection_for(record).insert(record)
          log.info("Inserted #{res}")
        end
      end

      # :nodoc:
      #
      # Database handle for the record, looked up on the client by name.
      def database_for record
        client[database_name_for(record)]
      end

      # :nodoc:
      #
      # Collection handle for the record within its database.
      def collection_for record
        database_for(record)[collection_name_for(record)]
      end

      # :nodoc:
      #
      # Database named by the record's database field, else the default.
      def database_name_for record
        record[database_field] || self.database
      end

      # :nodoc:
      #
      # Collection named by the record's collection field, else the
      # default.
      def collection_name_for record
        record[collection_field] || self.collection
      end

      # :nodoc:
      #
      # Value of the record's ID field, or nil when absent.
      def id_for record
        record[id_field]
      end

      register :mongodb_loader

    end
  end
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
  module Load

    # Loads data into SQL databases.
    #
    # Uses the 'mysql2' gem to connect and write data.  Yes, MySQL !=
    # SQL but we'll get there, I promise...
    #
    # Allows loading records into a given database and table.  Records
    # can have fields `_database` and `_table` which override the
    # given database and table on a per-record basis.
    #
    # Records can have an `_id` field which indicates an update, not
    # an insert.
    #
    # The names of these fields within each record (`_database`,
    # `_table`, and `_id`) can be customized.
    class SQLLoader < Loader

      field :host,           String,  :default => 'localhost',                :doc => "SQL host"
      field :port,           Integer, :default => 3306,                       :doc => "Port on SQL host"
      field :username,       String,  :default => (ENV['USER'] || 'wukong'),  :doc => "User to connect as"
      field :password,       String,                                          :doc => "Password for user"
      field :database,       String,  :default => 'wukong',                   :doc => "Default database"
      field :table,          String,  :default => 'streaming_record',         :doc => "Default table"
      field :database_field, String,  :default => '_database',                :doc => "Name of field in each record overriding default database"
      field :table_field,    String,  :default => '_table',                   :doc => "Name of field in each record overriding default table"
      field :id_field,       String,  :default => '_id',                      :doc => "Name of field in each record providing ID of existing row to update"

      description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MySQL.

        $ cat data.json | wu-load sql

        By default, wu-load attempts to write each input record to a
        local SQL server.

        Input records will be written to a default database and table.
        Each record can have _database and _table fields to override
        this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual columns in the table.

        The fields used (_database, _table, and _id) can be changed:

        $ cat data.json | wu-load sql --host=10.123.123.123 --database=web_events --table=impressions --id_field=impression_id
      EOF

      # The Mysql2::Client we'll use for talking to the SQL server.
      attr_accessor :client

      # Creates the client connection.
      #
      # The 'mysql2' gem is required here, inside the method, rather
      # than at the top of the file.
      #
      # @raise [Error] if the gem cannot be required or the client
      #   cannot be constructed
      def setup
        begin
          require 'mysql2'
        rescue LoadError, StandardError
          # LoadError descends from ScriptError, not StandardError, so a
          # bare `rescue` never sees a missing gem; name it explicitly.
          # StandardError is kept so anything previously rescued still is.
          raise Error.new("Please ensure that the 'mysql2' gem is installed and available (in your Gemfile)")
        end
        log.debug("Connecting to SQL server at <#{host}:#{port}> as <#{username}>#{' using password' if password}...")
        begin
          self.client = Mysql2::Client.new(sql_params)
        rescue => e
          raise Error.new(e.message)
        end
      end

      # :nodoc:
      #
      # Connection options handed to Mysql2::Client.new.  Username and
      # password are only included when set.
      #
      # @return [Hash]
      def sql_params
        {:host => host, :port => port}.tap do |params|
          # These must be assignments: a bare `params[:username]` only
          # reads the (missing) key and the credentials are never sent.
          params[:username] = username if username
          params[:password] = password if password
        end
      end

      # Load a single record into the database.
      #
      # If the record has an ID, we'll issue an update, otherwise an
      # insert.
      #
      # @param [Hash] record
      def load record
        id = id_for(record)
        if id
          perform_query(update_query(record))
          log.info("Updated #{id}")
        else
          perform_query(insert_query(record))
          log.info("Inserted")
        end
      end

      # :nodoc:
      #
      # Builds an INSERT ... ON DUPLICATE KEY UPDATE statement for the
      # record's data fields.
      def insert_query record
        "INSERT INTO #{database_name_for(record)}.#{table_name_for(record)} (#{fields_of(record)}) VALUES (#{values_of(record)}) ON DUPLICATE KEY UPDATE #{fields_and_values_of(record)}"
      end

      # :nodoc:
      #
      # Builds an UPDATE statement for the record's data fields.  The
      # row is matched on a column literally named `id`, whatever
      # `id_field` is called within the record.
      def update_query record
        "UPDATE #{database_name_for(record)}.#{table_name_for(record)} SET #{fields_and_values_of(record)} WHERE `id`=#{id_for(record)}"
      end

      # :nodoc:
      #
      # Sorted names of the record's data fields, i.e. every key except
      # the database, table and ID routing fields.
      def field_names_of record
        record.keys.reject { |key| [database_field, table_field, id_field].include?(key) }.sort
      end

      # :nodoc:
      #
      # Comma-separated, quoted column names for the record.
      def fields_of record
        field_names_of(record).map { |name| identifier_for(name) }.join(', ')
      end

      # :nodoc:
      #
      # Comma-separated SQL literals for the record's data fields, in
      # the same order as #fields_of.
      def values_of record
        field_names_of(record).map { |name| value_for(record[name]) }.join(', ')
      end

      # :nodoc:
      #
      # Comma-separated `column`=value pairs for the record's data
      # fields.
      def fields_and_values_of record
        field_names_of(record).map { |name| [identifier_for(name), value_for(record[name])].join('=') }.join(', ')
      end

      # :nodoc:
      #
      # Quoted database name from the record's database field, else the
      # default database.
      def database_name_for record
        identifier_for(record[database_field] || self.database)
      end

      # :nodoc:
      #
      # Quoted table name from the record's table field, else the
      # default table.
      def table_name_for record
        identifier_for(record[table_field] || self.table)
      end

      # :nodoc:
      #
      # Quotes +thing+ as a MySQL identifier.
      #
      # Identifier names come from record keys and values, so they are
      # untrusted.  Inside a backtick-quoted identifier the only
      # character needing treatment is the backtick itself, which is
      # escaped by doubling; string-literal escaping does not touch
      # backticks and so cannot prevent breaking out of the quotes.
      def identifier_for thing
        '`' + thing.to_s.gsub('`', '``') + '`'
      end

      # :nodoc:
      #
      # Renders +thing+ as a SQL literal: integers as-is, nil as NULL,
      # anything else as an escaped, double-quoted string.
      def value_for thing
        case thing
        # Integer rather than Fixnum: Fixnum was folded into Integer in
        # Ruby 2.4 and the constant is gone from current Rubies.
        when Integer then thing
        when nil     then 'NULL'
        else
          '"' + client.escape(thing.to_s) + '"'
        end
      end

      # :nodoc:
      #
      # SQL literal for the record's ID, or nil when the record has no
      # ID.
      def id_for record
        value_for(record[id_field]) if record[id_field]
      end

      # :nodoc:
      #
      # Sends the query string to the server through the client.
      def perform_query query
        client.query query
      end

      register :sql_loader

    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Wukong

  # Model of a single, generic HTTP request.
  class HttpRequest

    include Gorillib::Model

    field :timestamp,  Integer, :doc => "Timestamp at which the HTTP request was received"
    field :verb,       String,  :doc => "HTTP verb of the request"
    field :path,       String,  :doc => "Absolute path to the resource requested"
    field :params,     Hash,    :doc => "Query parameters contained in the request", :default => {}
    field :headers,    Hash,    :doc => "HTTP headers of the request", :default => {}
    field :ip_address, String,  :doc => "IP address of the client"
    field :body,       String,  :doc => "Body of the request"

    # The URL of this request: the `Host` header joined to the request
    # path.  A missing header or path is treated as an empty string.
    #
    # @return [String]
    def url
      host = headers['Host'] || ''
      File.join(host, path || '')
    end

    # The value of the HTTP `Referer` header, if any.
    #
    # @return [String]
    def referer
      headers['Referer']
    end
    alias_method :referrer, :referer

    # The value of the HTTP `User-Agent` header, if any.
    #
    # @return [String]
    def user_agent
      headers['User-Agent']
    end

    # The value of the HTTP `Cookie` header, if any.
    #
    # @return [String]
    def cookie
      headers['Cookie']
    end

    # The "best" IP address for this request.
    #
    # When the `X-Forwarded-For` header is present and non-blank, its
    # first entry is returned; otherwise (or when the header yields no
    # entries) the request's own `ip_address` is returned.
    #
    # @return [String]
    def best_ip_address
      forwarded = headers['X-Forwarded-For']
      return ip_address if forwarded.blank?

      # The chain lists the client first, followed by proxies in order.
      client_ip = forwarded.split(/\s*,\s*/).first
      client_ip.nil? ? ip_address : client_ip
    end

  end
end
|