wukong-load 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.yardopts +5 -0
- data/Gemfile +16 -0
- data/LICENSE.md +1 -1
- data/README.md +100 -34
- data/bin/wu-load +1 -47
- data/bin/wu-source +4 -0
- data/lib/wukong-load.rb +36 -3
- data/lib/wukong-load/load_runner.rb +64 -0
- data/lib/wukong-load/loader.rb +7 -0
- data/lib/wukong-load/loaders/elasticsearch.rb +151 -0
- data/lib/wukong-load/loaders/kafka.rb +98 -0
- data/lib/wukong-load/loaders/mongodb.rb +123 -0
- data/lib/wukong-load/loaders/sql.rb +169 -0
- data/lib/wukong-load/models/http_request.rb +60 -0
- data/lib/wukong-load/source_driver.rb +46 -0
- data/lib/wukong-load/source_runner.rb +36 -0
- data/lib/wukong-load/version.rb +1 -1
- data/spec/spec_helper.rb +13 -0
- data/spec/wukong-load/loaders/elasticsearch_spec.rb +142 -0
- data/spec/wukong-load/loaders/kafka_spec.rb +72 -0
- data/spec/wukong-load/loaders/mongodb_spec.rb +100 -0
- data/spec/wukong-load/loaders/sql_spec.rb +112 -0
- data/spec/wukong-load/models/http_request_spec.rb +21 -0
- data/wukong-load.gemspec +3 -2
- metadata +26 -10
- data/lib/wukong-load/configuration.rb +0 -8
- data/lib/wukong-load/elasticsearch.rb +0 -99
- data/lib/wukong-load/runner.rb +0 -48
- data/spec/wukong-load/elasticsearch_spec.rb +0 -140
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module Load
|
5
|
+
|
6
|
+
# Loads data into Kafka.
|
7
|
+
#
|
8
|
+
# Uses the `kafka-rb` gem to create a Kafka::MultiProducer to write to
|
9
|
+
# Kafka.
|
10
|
+
#
|
11
|
+
# Allows loading records into a given topic on a given partition.
|
12
|
+
# Records can have fields `_topic` and `_partition` which override
|
13
|
+
# the given topic and partition on a per-record basis.
|
14
|
+
#
|
15
|
+
# The names of these fields within each record (`_topic` and
|
16
|
+
# `_partition`) can be customized.
|
17
|
+
class KafkaLoader < Loader
|
18
|
+
|
19
|
+
field :host, String, :default => 'localhost', :doc => "Kafka broker host"
|
20
|
+
field :port, Integer, :default => 9092, :doc => "Kafka broker port"
|
21
|
+
field :topic, String, :default => 'test', :doc => "Kafka topic"
|
22
|
+
field :topic_field, String, :default => '_topic', :doc => "Field within records which names the Kafka topic"
|
23
|
+
field :partition, Integer, :default => 0, :doc => "Kafka partition"
|
24
|
+
field :partition_field, String, :default => '_partition', :doc => "Field within records which names the Kafka partition"
|
25
|
+
|
26
|
+
description <<-EOF.gsub(/^ {8}/,'')
|
27
|
+
Loads newline-separated, JSON-formatted records over STDIN
|
28
|
+
into a Kafka queue.
|
29
|
+
|
30
|
+
$ cat data.json | wu-load kafka
|
31
|
+
|
32
|
+
By default, wu-load attempts to write each input record to a
|
33
|
+
local Kafka broker.
|
34
|
+
|
35
|
+
Input records will be written to a default Kafka topic on a
|
36
|
+
default partition. Each record can have _topic and _partition
|
37
|
+
fields to override this on a per-record basis.
|
38
|
+
|
39
|
+
The fields used (_topic and _partition) can be changed:
|
40
|
+
|
41
|
+
$ cat data.json | wu-load kafka --host=10.123.123.123 --topic=hits --partition_field=segment_id
|
42
|
+
EOF
|
43
|
+
|
44
|
+
# The Kafka producer used to send messages to Kafka.
|
45
|
+
attr_accessor :producer
|
46
|
+
|
47
|
+
# Creates the producer.
|
48
|
+
# Loads the Kafka client library and builds the producer.
#
# Stores a Kafka::MultiProducer for the configured host and port in
# #producer.
#
# @raise [Error] if the library cannot be loaded or the producer cannot be created
def setup
  begin
    require 'kafka'
  rescue LoadError
    # A missing gem raises LoadError (a ScriptError), which a bare
    # `rescue` does not catch, so it must be named explicitly.
    raise Error.new("Please ensure that the 'kafka-rb' gem is installed and available (in your Gemfile)")
  end
  log.debug("Connecting to Kafka broker at #{host}:#{port}...")
  begin
    self.producer = Kafka::MultiProducer.new(:host => host, :port => port)
  rescue => e
    raise Error.new(e.message)
  end
end
|
61
|
+
|
62
|
+
# Load a single record into Kafka.
|
63
|
+
#
|
64
|
+
# @param [Hash] record
|
65
|
+
# Writes one record to Kafka and logs the outcome.
#
# The topic and partition are resolved per record.  Any StandardError
# raised along the way is passed to handle_error together with the
# record.
#
# @param [Hash] record
def load(record)
  target_topic     = topic_for(record)
  target_partition = partition_for(record)
  written = producer.send(target_topic, messages_for(record), :partition => target_partition)
  log.info("Wrote #{written} bytes to #{target_topic}/#{target_partition}")
rescue => error
  handle_error(record, error)
end
|
75
|
+
|
76
|
+
# :nodoc:
|
77
|
+
# Picks the topic for a record: the record's own value under
# topic_field when truthy, otherwise the loader-wide topic.
#
# @param [Hash] record
def topic_for(record)
  override = record[topic_field]
  override || topic
end
|
80
|
+
|
81
|
+
# :nodoc:
|
82
|
+
# Wraps a record in a one-element array holding a Kafka::Message whose
# payload is the record serialized with MultiJson.
#
# @param [Hash] record
# @return [Array<Kafka::Message>]
def messages_for(record)
  payload = MultiJson.dump(record)
  [Kafka::Message.new(payload)]
end
|
85
|
+
|
86
|
+
# :nodoc:
|
87
|
+
# Picks the partition for a record: the record's own value under
# partition_field (converted with to_i) when truthy, otherwise the
# loader-wide partition.
#
# @param [Hash] record
def partition_for(record)
  override = record[partition_field]
  return partition unless override
  override.to_i
end
|
90
|
+
|
91
|
+
register :kafka_loader
|
92
|
+
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
|
98
|
+
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module Load
|
5
|
+
|
6
|
+
# Loads data into MongoDB.
|
7
|
+
#
|
8
|
+
# Uses the 'mongo' gem to connect and write data.
|
9
|
+
#
|
10
|
+
# Allows loading records into a given database and collection.
|
11
|
+
# Records can have fields `_database` and `_collection` which
|
12
|
+
# override the given database and collection on a per-record
|
13
|
+
# basis.
|
14
|
+
#
|
15
|
+
# Records can have an `_id` field which indicates an update, not
|
16
|
+
# an insert.
|
17
|
+
#
|
18
|
+
# The names of these fields within each record (`_database`,
|
19
|
+
# `_collection`, and `_id`) can be customized.
|
20
|
+
class MongoDBLoader < Loader
|
21
|
+
|
22
|
+
field :host, String, :default => 'localhost', :doc => "MongoDB host"
|
23
|
+
field :port, Integer,:default => 27017, :doc => "Port on MongoDB host"
|
24
|
+
field :database, String, :default => 'wukong', :doc => "Default MongoDB database"
|
25
|
+
field :collection, String, :default => 'streaming_record', :doc => "Default MongoDB collection"
|
26
|
+
field :database_field, String, :default => '_database', :doc => "Name of field in each record overriding default MongoDB database"
|
27
|
+
field :collection_field, String, :default => '_collection', :doc => "Name of field in each record overriding default MongoDB collection"
|
28
|
+
field :id_field, String, :default => '_id', :doc => "Name of field in each record providing ID of existing MongoDB record to update"
|
29
|
+
|
30
|
+
# Usage text for this loader.  The gsub strips the 8-space source
# indentation from every line of the heredoc.
description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MongoDB.

          $ cat data.json | wu-load mongodb

        By default, wu-load attempts to write each input record to a
        local MongoDB server.

        Input records will be written to a default database and
        collection. Each record can have _database and _collection
        fields to override this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual fields in the document.

        The fields used (_database, _collection, and _id) can be changed:

          $ cat data.json | wu-load mongodb --host=10.123.123.123 --database=web_events --collection=impressions --id_field=impression_id
        EOF
|
53
|
+
|
54
|
+
# The Mongo::MongoClient we'll use for talking to MongoDB.
|
55
|
+
attr_accessor :client
|
56
|
+
|
57
|
+
# Creates the client connection.
|
58
|
+
# Loads the MongoDB driver and opens the client connection.
#
# A leading "http://" is stripped from the configured host before
# connecting.  The resulting Mongo::MongoClient is stored in #client.
#
# @raise [Error] if the driver cannot be loaded or the client cannot be created
def setup
  begin
    require 'mongo'
  rescue LoadError
    # A missing gem raises LoadError (a ScriptError), which a bare
    # `rescue` does not catch, so it must be named explicitly.
    raise Error.new("Please ensure that the 'mongo' gem is installed and available (in your Gemfile)")
  end
  h = host.gsub(%r{^http://},'')
  log.debug("Connecting to MongoDB server at #{h}:#{port}...")
  begin
    self.client = Mongo::MongoClient.new(h, port)
  rescue => e
    raise Error.new(e.message)
  end
end
|
72
|
+
|
73
|
+
# Load a single record into MongoDB.
|
74
|
+
#
|
75
|
+
# If the record has an ID, we'll issue an update, otherwise an
|
76
|
+
# insert.
|
77
|
+
#
|
78
|
+
# @param [Hash] record
|
79
|
+
# Writes one record to MongoDB and logs what happened.
#
# With an ID, an upserting update keyed on :_id is issued and the log
# line says "Updated" or "Inserted" depending on the 'updatedExisting'
# entry of the result.  Without an ID, a plain insert is issued and its
# return value is logged.
#
# @param [Hash] record
def load(record)
  doc_id = id_for(record)
  message =
    if doc_id
      outcome = collection_for(record).update({:_id => doc_id}, record, :upsert => true)
      outcome['updatedExisting'] ? "Updated #{doc_id}" : "Inserted #{doc_id}"
    else
      "Inserted #{collection_for(record).insert(record)}"
    end
  log.info(message)
end
|
93
|
+
|
94
|
+
# :nodoc:
|
95
|
+
# Looks up, via the client, the database named for this record.
#
# @param [Hash] record
def database_for(record)
  name = database_name_for(record)
  client[name]
end
|
98
|
+
|
99
|
+
# :nodoc:
|
100
|
+
# Looks up, within the record's database, the collection named for
# this record.
#
# @param [Hash] record
def collection_for(record)
  db = database_for(record)
  db[collection_name_for(record)]
end
|
103
|
+
|
104
|
+
# :nodoc:
|
105
|
+
# Name of the database for a record: the record's own value under
# database_field when truthy, otherwise the loader-wide database.
#
# @param [Hash] record
def database_name_for(record)
  override = record[database_field]
  override || database
end
|
108
|
+
|
109
|
+
# :nodoc:
|
110
|
+
# Name of the collection for a record: the record's own value under
# collection_field when truthy, otherwise the loader-wide collection.
#
# @param [Hash] record
def collection_name_for(record)
  override = record[collection_field]
  override || collection
end
|
113
|
+
|
114
|
+
# :nodoc:
|
115
|
+
# Value stored in the record under id_field (nil when absent).
#
# @param [Hash] record
def id_for(record)
  key = id_field
  record[key]
end
|
118
|
+
|
119
|
+
register :mongodb_loader
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
@@ -0,0 +1,169 @@
|
|
1
|
+
require_relative('../loader')
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module Load
|
5
|
+
|
6
|
+
# Loads data into SQL databases.
|
7
|
+
#
|
8
|
+
# Uses the 'mysql2' gem to connect and write data. Yes, MySQL !=
|
9
|
+
# SQL but we'll get there, I promise...
|
10
|
+
#
|
11
|
+
# Allows loading records into a given database and table. Records
|
12
|
+
# can have fields `_database` and `_table` which override the
|
13
|
+
# given database and table on a per-record basis.
|
14
|
+
#
|
15
|
+
# Records can have an `_id` field which indicates an update, not
|
16
|
+
# an insert.
|
17
|
+
#
|
18
|
+
# The names of these fields within each record (`_database`,
|
19
|
+
# `_table`, and `_id`) can be customized.
|
20
|
+
class SQLLoader < Loader
|
21
|
+
|
22
|
+
field :host, String, :default => 'localhost', :doc => "SQL host"
|
23
|
+
field :port, Integer,:default => 3306, :doc => "Port on SQL host"
|
24
|
+
field :username, String, :default => (ENV['USER'] || 'wukong'), :doc => "User to connect as"
|
25
|
+
field :password, String, :doc => "Password for user"
|
26
|
+
field :database, String, :default => 'wukong', :doc => "Default database"
|
27
|
+
field :table, String, :default => 'streaming_record', :doc => "Default table"
|
28
|
+
field :database_field, String, :default => '_database', :doc => "Name of field in each record overriding default database"
|
29
|
+
field :table_field, String, :default => '_table', :doc => "Name of field in each record overriding default table"
|
30
|
+
field :id_field, String, :default => '_id', :doc => "Name of field in each record providing ID of existing row to update"
|
31
|
+
|
32
|
+
# Usage text for this loader.  The gsub strips the 8-space source
# indentation from every line of the heredoc.
description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into a MySQL database.

          $ cat data.json | wu-load sql

        By default, wu-load attempts to write each input record to a
        local SQL server.

        Input records will be written to a default database and table.
        Each record can have _database and _table fields to override
        this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual columns in the table.

        The fields used (_database, _table, and _id) can be changed:

          $ cat data.json | wu-load sql --host=10.123.123.123 --database=web_events --table=impressions --id_field=impression_id
        EOF
|
55
|
+
|
56
|
+
# The Mysql2::Client we'll use for talking to the SQL server.
|
57
|
+
attr_accessor :client
|
58
|
+
|
59
|
+
# Creates the client connection.
|
60
|
+
# Loads the mysql2 driver and opens the client connection.
#
# The connection options come from #sql_params; the resulting
# Mysql2::Client is stored in #client.
#
# @raise [Error] if the driver cannot be loaded or the client cannot be created
def setup
  begin
    require 'mysql2'
  rescue LoadError
    # A missing gem raises LoadError (a ScriptError), which a bare
    # `rescue` does not catch, so it must be named explicitly.
    raise Error.new("Please ensure that the 'mysql2' gem is installed and available (in your Gemfile)")
  end
  log.debug("Connecting to SQL server at <#{host}:#{port}> as <#{username}>#{' using password' if password}...")
  begin
    self.client = Mysql2::Client.new(sql_params)
  rescue => e
    # Pass the message text, matching the other loaders' setup methods.
    raise Error.new(e.message)
  end
end
|
73
|
+
|
74
|
+
# :nodoc:
|
75
|
+
# Builds the options hash handed to Mysql2::Client.new.
#
# :host and :port are always present; :username and :password are
# added only when set.
#
# @return [Hash]
def sql_params
  {:host => host, :port => port}.tap do |params|
    # These must be assignments: a bare `params[:username]` only reads
    # the hash and leaves the credentials out of the connection options.
    params[:username] = username if username
    params[:password] = password if password
  end
end
|
81
|
+
|
82
|
+
# Load a single record into the database.
|
83
|
+
#
|
84
|
+
# If the record has an ID, we'll issue an update, otherwise an
|
85
|
+
# insert.
|
86
|
+
#
|
87
|
+
# @param [Hash] record
|
88
|
+
# Writes one record to the database and logs what happened.
#
# A record with an ID runs the UPDATE statement; a record without one
# runs the INSERT statement.
#
# @param [Hash] record
def load(record)
  row_id = id_for(record)
  unless row_id
    perform_query(insert_query(record))
    return log.info("Inserted")
  end
  perform_query(update_query(record))
  log.info("Updated #{row_id}")
end
|
98
|
+
|
99
|
+
# :nodoc:
|
100
|
+
# Builds the INSERT ... ON DUPLICATE KEY UPDATE statement for a record.
#
# @param [Hash] record
# @return [String]
def insert_query(record)
  target      = "#{database_name_for(record)}.#{table_name_for(record)}"
  columns     = fields_of(record)
  values      = values_of(record)
  assignments = fields_and_values_of(record)
  "INSERT INTO #{target} (#{columns}) VALUES (#{values}) ON DUPLICATE KEY UPDATE #{assignments}"
end
|
103
|
+
|
104
|
+
# :nodoc:
|
105
|
+
# Builds the UPDATE statement for a record, matching the row on the
# `id` column against the record's ID value.
#
# @param [Hash] record
# @return [String]
def update_query(record)
  target      = "#{database_name_for(record)}.#{table_name_for(record)}"
  assignments = fields_and_values_of(record)
  row_id      = id_for(record)
  "UPDATE #{target} SET #{assignments} WHERE `id`=#{row_id}"
end
|
108
|
+
|
109
|
+
# :nodoc:
|
110
|
+
# Sorted keys of a record, leaving out the keys named by
# database_field, table_field and id_field.
#
# @param [Hash] record
# @return [Array]
def field_names_of(record)
  reserved = [database_field, table_field, id_field]
  record.each_key.reject { |key| reserved.include?(key) }.sort
end
|
113
|
+
|
114
|
+
# :nodoc:
|
115
|
+
# Comma-separated list of the record's quoted column identifiers.
#
# @param [Hash] record
# @return [String]
def fields_of(record)
  quoted = field_names_of(record).map { |column| identifier_for(column) }
  quoted.join(', ')
end
|
118
|
+
|
119
|
+
# :nodoc:
|
120
|
+
# Comma-separated list of the record's SQL-formatted values, in the
# same order as the field names.
#
# @param [Hash] record
# @return [String]
def values_of(record)
  rendered = field_names_of(record).map { |column| value_for(record[column]) }
  rendered.join(', ')
end
|
123
|
+
|
124
|
+
# :nodoc:
|
125
|
+
# Comma-separated list of identifier=value assignments for the
# record's fields.
#
# @param [Hash] record
# @return [String]
def fields_and_values_of(record)
  pairs = field_names_of(record).map do |column|
    "#{identifier_for(column)}=#{value_for(record[column])}"
  end
  pairs.join(', ')
end
|
128
|
+
|
129
|
+
# :nodoc:
|
130
|
+
# Quoted database identifier for a record: the record's own value
# under database_field when truthy, otherwise the loader-wide database.
#
# @param [Hash] record
# @return [String]
def database_name_for(record)
  chosen = record[database_field] || database
  identifier_for(chosen)
end
|
133
|
+
|
134
|
+
# :nodoc:
|
135
|
+
# Quoted table identifier for a record: the record's own value under
# table_field when truthy, otherwise the loader-wide table.
#
# @param [Hash] record
# @return [String]
def table_name_for(record)
  chosen = record[table_field] || table
  identifier_for(chosen)
end
|
138
|
+
|
139
|
+
# :nodoc:
|
140
|
+
# Quotes a database, table or column name as a MySQL identifier.
#
# The name is wrapped in backticks and any backtick inside it is
# doubled, which is how MySQL escapes the quote character within a
# quoted identifier.  String-literal escaping is deliberately not used
# here: it leaves backticks untouched (so a name taken from a record
# could break out of the quoting) and inserts backslashes that MySQL
# treats as literal characters in identifiers.
#
# @param [#to_s] thing the raw name
# @return [String] the backtick-quoted identifier
def identifier_for(thing)
  '`' + thing.to_s.gsub('`', '``') + '`'
end
|
143
|
+
|
144
|
+
# :nodoc:
|
145
|
+
# Formats a Ruby value for direct inclusion in a SQL statement.
#
# Integers are returned unchanged, nil becomes the string 'NULL', and
# anything else is converted with to_s, escaped by the client and
# wrapped in double quotes.
#
# @param [Object] thing
# @return [Integer, String]
def value_for(thing)
  case thing
  # Integer rather than Fixnum: Fixnum was deprecated in Ruby 2.4 and
  # removed in 3.2, and Integer also covers what used to be Bignum.
  when Integer then thing
  when nil     then 'NULL'
  else
    '"' + client.escape(thing.to_s) + '"'
  end
end
|
153
|
+
|
154
|
+
|
155
|
+
# :nodoc:
|
156
|
+
# SQL-formatted ID of a record, or nil when the record has no truthy
# value under id_field.
#
# @param [Hash] record
def id_for(record)
  raw = record[id_field]
  value_for(raw) if raw
end
|
159
|
+
|
160
|
+
# :nodoc:
|
161
|
+
# Runs a SQL statement through the client and returns whatever the
# client returns.
#
# @param [String] query
def perform_query(query)
  client.query(query)
end
|
164
|
+
|
165
|
+
register :sql_loader
|
166
|
+
|
167
|
+
end
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Wukong
|
2
|
+
|
3
|
+
# Represents a generic HTTP request.
|
4
|
+
class HttpRequest
|
5
|
+
|
6
|
+
include Gorillib::Model
|
7
|
+
|
8
|
+
field :timestamp, Integer, :doc => "Timestamp at which the HTTP request was received"
|
9
|
+
field :verb, String, :doc => "HTTP verb of the request"
|
10
|
+
field :path, String, :doc => "Absolute path to the resource requested"
|
11
|
+
field :params, Hash, :doc => "Query parameters contained in the request", :default => {}
|
12
|
+
field :headers, Hash, :doc => "HTTP headers of the request", :default => {}
|
13
|
+
field :ip_address, String, :doc => "IP address of the client"
|
14
|
+
field :body, String, :doc => "Body of the request"
|
15
|
+
|
16
|
+
# Return the URL of this request.
|
17
|
+
#
|
18
|
+
# @return [String]
|
19
|
+
# Joins the 'Host' header and the path with File.join; a missing host
# or path is treated as an empty string.
#
# @return [String]
def url
  host_part = headers['Host'] || ''
  path_part = path || ''
  File.join(host_part, path_part)
end
|
22
|
+
|
23
|
+
# Return the HTTP Referer of this request.
|
24
|
+
#
|
25
|
+
# @return [String]
|
26
|
+
# Value of the 'Referer' header (nil when the header is absent).
#
# @return [String, nil]
def referer
  header_name = 'Referer'
  headers[header_name]
end
|
29
|
+
alias_method :referrer, :referer
|
30
|
+
|
31
|
+
# Return the HTTP User-Agent of this request.
|
32
|
+
#
|
33
|
+
# @return [String]
|
34
|
+
# Value of the 'User-Agent' header (nil when the header is absent).
#
# @return [String, nil]
def user_agent
  header_name = 'User-Agent'
  headers[header_name]
end
|
37
|
+
|
38
|
+
# Return the HTTP Cookie of this request.
|
39
|
+
#
|
40
|
+
# @return [String]
|
41
|
+
# Value of the 'Cookie' header (nil when the header is absent).
#
# @return [String, nil]
def cookie
  header_name = 'Cookie'
  headers[header_name]
end
|
44
|
+
|
45
|
+
# Return the "best" IP address from this request.
|
46
|
+
#
|
47
|
+
# Will return the first IP address in the HTTP X-Forwarded-For chain
|
48
|
+
# if present, otherwise will return the IP address of the request
|
49
|
+
# itself.
|
50
|
+
#
|
51
|
+
# @return [String]
|
52
|
+
# Chooses between the forwarded client address and the request's own.
#
# When the 'X-Forwarded-For' header is blank, or splitting it on
# commas yields nothing, ip_address is returned; otherwise the first
# entry of the chain is returned.
#
# @return [String]
def best_ip_address
  forwarded = headers['X-Forwarded-For']
  return ip_address if forwarded.blank?
  chain = forwarded.split(/\s*,\s*/)
  return ip_address if chain.empty?
  chain.first # client comes first, then proxies in order
end
|
58
|
+
|
59
|
+
end
|
60
|
+
end
|