wukong-load 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,98 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into Kafka.
7
+ #
8
+ # Uses the `kafka-rb` gem to create a Kafka::Producer to write to
9
+ # Kafka.
10
+ #
11
+ # Allows loading records into a given topic on a given partition.
12
+ # Records can have fields `_topic` and `_partition` which override
13
+ # the given topic and partition on a per-record basis.
14
+ #
15
+ # The names of these fields within each record (`_topic` and
16
+ # `_partition`) can be customized.
17
+ class KafkaLoader < Loader
18
+
19
+ field :host, String, :default => 'localhost', :doc => "Kafka broker host"
20
+ field :port, Integer, :default => 9092, :doc => "Kafka broker port"
21
+ field :topic, String, :default => 'test', :doc => "Kafka topic"
22
+ field :topic_field, String, :default => '_topic', :doc => "Field within records which names the Kafka topic"
23
+ field :partition, Integer, :default => 0, :doc => "Kafka partition"
24
+ field :partition_field, String, :default => '_partition', :doc => "Field within records which names the Kafka partition"
25
+
26
+ description <<-EOF.gsub(/^ {8}/,'')
27
+ Loads newline-separated, JSON-formatted records over STDIN
28
+ into a Kafka queue.
29
+
30
+ $ cat data.json | wu-load kafka
31
+
32
+ By default, wu-load attempts to write each input record to a
33
+ local Kafka broker.
34
+
35
+ Input records will be written to a default Kafka topic on a
36
+ default partition. Each record can have _topic and _partition
37
+ fields to override this on a per-record basis.
38
+
39
+ The fields used (_topic and _partition) can be changed:
40
+
41
+ $ cat data.json | wu-load kafka --host=10.123.123.123 --topic=hits --partition_field=segment_id
42
+ EOF
43
+
44
+ # The Kafka producer used to send messages to Kafka.
45
+ attr_accessor :producer
46
+
47
# Creates the producer.
#
# Loads the Kafka client library on demand, then builds a
# Kafka::MultiProducer for the configured host and port and stores it
# in #producer.
#
# @raise [Error] if the library cannot be loaded or the producer
#   cannot be created
def setup
  begin
    require 'kafka'
  rescue LoadError
    # A missing gem raises LoadError, which is not a StandardError and
    # therefore is never caught by a bare `rescue`.
    raise Error.new("Please ensure that the 'kafka-rb' gem is installed and available (in your Gemfile)")
  end
  log.debug("Connecting to Kafka broker at #{host}:#{port}...")
  begin
    self.producer = Kafka::MultiProducer.new(:host => host, :port => port)
  rescue => e
    raise Error.new(e.message)
  end
end
61
+
62
# Load a single record into Kafka.
#
# The topic and partition come from #topic_for and #partition_for.
# The value returned by the producer is logged as a byte count. Any
# StandardError raised along the way is passed to #handle_error
# together with the record.
#
# @param [Hash] record
def load(record)
  destination = topic_for(record)
  slot        = partition_for(record)
  written     = producer.send(destination, messages_for(record), :partition => slot)
  log.info("Wrote #{written} bytes to #{destination}/#{slot}")
rescue => e
  handle_error(record, e)
end
75
+
76
# :nodoc:
#
# Returns the value stored under #topic_field in the record when it
# is truthy, otherwise the loader's default topic.
def topic_for(record)
  override = record[topic_field]
  return override if override
  topic
end
80
+
81
# :nodoc:
#
# Serializes the record with MultiJson and wraps it in a one-element
# array of Kafka::Message.
def messages_for(record)
  payload = MultiJson.dump(record)
  [Kafka::Message.new(payload)]
end
85
+
86
# :nodoc:
#
# Returns the value stored under #partition_field in the record,
# converted with #to_i, when it is truthy; otherwise the loader's
# default partition.
def partition_for(record)
  override = record[partition_field]
  return partition unless override
  override.to_i
end
90
+
91
+ register :kafka_loader
92
+
93
+ end
94
+ end
95
+ end
96
+
97
+
98
+
@@ -0,0 +1,123 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into MongoDB.
7
+ #
8
+ # Uses the 'mongo' gem to connect and write data.
9
+ #
10
+ # Allows loading records into a given database and collection.
11
+ # Records can have fields `_database` and `_collection` which
12
+ # override the given database and collection on a per-record
13
+ # basis.
14
+ #
15
+ # Records can have an `_id` field which indicates an update, not
16
+ # an insert.
17
+ #
18
+ # The names of these fields within each record (`_database`,
19
+ # `_collection`, and `_id`) can be customized.
20
+ class MongoDBLoader < Loader
21
+
22
+ field :host, String, :default => 'localhost', :doc => "MongoDB host"
23
+ field :port, Integer,:default => 27017, :doc => "Port on MongoDB host"
24
+ field :database, String, :default => 'wukong', :doc => "Default MongoDB database"
25
+ field :collection, String, :default => 'streaming_record', :doc => "Default MongoDB collection"
26
+ field :database_field, String, :default => '_database', :doc => "Name of field in each record overriding default MongoDB database"
27
+ field :collection_field, String, :default => '_collection', :doc => "Name of field in each record overriding default MongoDB collection"
28
+ field :id_field, String, :default => '_id', :doc => "Name of field in each record providing ID of existing MongoDB record to update"
29
+
30
# Usage text for this loader; the gsub strips the 8-space heredoc indent.
description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MongoDB.

          $ cat data.json | wu-load mongodb

        By default, wu-load attempts to write each input record to a
        local MongoDB server.

        Input records will be written to a default database and
        collection.  Each record can have _database and _collection
        fields to override this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual columns in the table.

        The fields used (_database, _collection, and _id) can be changed:

          $ cat data.json | wu-load mongodb --host=10.123.123.123 --database=web_events --collection=impressions --id_field=impression_id
EOF
53
+
54
+ # The Mongo::MongoClient we'll use for talking to MongoDB.
55
+ attr_accessor :client
56
+
57
# Creates the client connection.
#
# Loads the 'mongo' library on demand, strips a leading "http://"
# from the configured host, and stores a Mongo::MongoClient for that
# host and port in #client.
#
# @raise [Error] if the library cannot be loaded or the client cannot
#   be created
def setup
  begin
    require 'mongo'
  rescue LoadError
    # A missing gem raises LoadError, which is not a StandardError and
    # therefore is never caught by a bare `rescue`.
    raise Error.new("Please ensure that the 'mongo' gem is installed and available (in your Gemfile)")
  end
  # \A anchors at the start of the whole string (^ would match per line).
  h = host.sub(%r{\Ahttp://}, '')
  log.debug("Connecting to MongoDB server at #{h}:#{port}...")
  begin
    self.client = Mongo::MongoClient.new(h, port)
  rescue => e
    raise Error.new(e.message)
  end
end
72
+
73
# Load a single record into MongoDB.
#
# A record with an ID is written with an upserting update keyed on
# that ID; the log line says "Updated" or "Inserted" depending on the
# 'updatedExisting' entry of the result. A record without an ID is
# inserted and the value returned by the insert is logged.
#
# @param [Hash] record
def load(record)
  record_id = id_for(record)
  unless record_id
    inserted = collection_for(record).insert(record)
    return log.info("Inserted #{inserted}")
  end

  outcome = collection_for(record).update({:_id => record_id}, record, :upsert => true)
  log.info(outcome['updatedExisting'] ? "Updated #{record_id}" : "Inserted #{record_id}")
end
93
+
94
# :nodoc:
#
# Looks up, on the client, the database named by #database_name_for.
def database_for(record)
  db_name = database_name_for(record)
  client[db_name]
end
98
+
99
# :nodoc:
#
# Looks up, on the record's database, the collection named by
# #collection_name_for.
def collection_for(record)
  db = database_for(record)
  db[collection_name_for(record)]
end
103
+
104
# :nodoc:
#
# Returns the value stored under #database_field in the record when
# it is truthy, otherwise the loader's default database.
def database_name_for(record)
  override = record[database_field]
  return override if override
  database
end
108
+
109
# :nodoc:
#
# Returns the value stored under #collection_field in the record when
# it is truthy, otherwise the loader's default collection.
def collection_name_for(record)
  override = record[collection_field]
  return override if override
  collection
end
113
+
114
# :nodoc:
#
# Returns whatever the record stores under #id_field (nil when the
# record is a plain Hash without that key).
def id_for(record)
  key = id_field
  record[key]
end
118
+
119
+ register :mongodb_loader
120
+
121
+ end
122
+ end
123
+ end
@@ -0,0 +1,169 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into SQL databases.
7
+ #
8
+ # Uses the 'mysql2' gem to connect and write data. Yes, MySQL !=
9
+ # SQL but we'll get there, I promise...
10
+ #
11
+ # Allows loading records into a given database and table. Records
12
+ # can have fields `_database` and `_table` which override the
13
+ # given database and table on a per-record basis.
14
+ #
15
+ # Records can have an `_id` field which indicates an update, not
16
+ # an insert.
17
+ #
18
+ # The names of these fields within each record (`_database`,
19
+ # `_table`, and `_id`) can be customized.
20
+ class SQLLoader < Loader
21
+
22
+ field :host, String, :default => 'localhost', :doc => "SQL host"
23
+ field :port, Integer,:default => 3306, :doc => "Port on SQL host"
24
+ field :username, String, :default => (ENV['USER'] || 'wukong'), :doc => "User to connect as"
25
+ field :password, String, :doc => "Password for user"
26
+ field :database, String, :default => 'wukong', :doc => "Default database"
27
+ field :table, String, :default => 'streaming_record', :doc => "Default table"
28
+ field :database_field, String, :default => '_database', :doc => "Name of field in each record overriding default database"
29
+ field :table_field, String, :default => '_table', :doc => "Name of field in each record overriding default table"
30
+ field :id_field, String, :default => '_id', :doc => "Name of field in each record providing ID of existing row to update"
31
+
32
# Usage text for this loader; the gsub strips the 8-space heredoc indent.
description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MySQL.

          $ cat data.json | wu-load sql

        By default, wu-load attempts to write each input record to a
        local SQL server.

        Input records will be written to a default database and table.
        Each record can have _database and _table fields to override
        this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual columns in the table.

        The fields used (_database, _table, and _id) can be changed:

          $ cat data.json | wu-load sql --host=10.123.123.123 --database=web_events --table=impressions --id_field=impression_id
EOF
55
+
56
+ # The Mysql2::Client we'll use for talking to the SQL server.
57
+ attr_accessor :client
58
+
59
# Creates the client connection.
#
# Loads the 'mysql2' library on demand and stores a Mysql2::Client
# built from #sql_params in #client.
#
# @raise [Error] if the library cannot be loaded or the client cannot
#   be created
def setup
  begin
    require 'mysql2'
  rescue LoadError
    # A missing gem raises LoadError, which is not a StandardError and
    # therefore is never caught by a bare `rescue`.
    raise Error.new("Please ensure that the 'mysql2' gem is installed and available (in your Gemfile)")
  end
  log.debug("Connecting to SQL server at <#{host}:#{port}> as <#{username}>#{' using password' if password}...")
  begin
    self.client = Mysql2::Client.new(sql_params)
  rescue => e
    raise Error.new(e.message)
  end
end
73
+
74
# :nodoc:
#
# Builds the options hash passed to the SQL client: always :host and
# :port, plus :username and :password when those are set.
#
# @return [Hash]
def sql_params
  {:host => host, :port => port}.tap do |params|
    # These must be assignments; a bare `params[:username]` only reads
    # the hash and leaves the credentials out.
    params[:username] = username if username
    params[:password] = password if password
  end
end
81
+
82
# Load a single record into the database.
#
# A record with an ID runs the query from #update_query; a record
# without one runs the query from #insert_query. Either way the
# outcome is logged at info level.
#
# @param [Hash] record
def load(record)
  row_id = id_for(record)
  unless row_id
    perform_query(insert_query(record))
    return log.info("Inserted")
  end

  perform_query(update_query(record))
  log.info("Updated #{row_id}")
end
98
+
99
# :nodoc:
#
# Builds an INSERT ... ON DUPLICATE KEY UPDATE statement for the
# record from its quoted database, table, column names and values.
def insert_query(record)
  target  = "#{database_name_for(record)}.#{table_name_for(record)}"
  columns = fields_of(record)
  values  = values_of(record)
  updates = fields_and_values_of(record)
  "INSERT INTO #{target} (#{columns}) VALUES (#{values}) ON DUPLICATE KEY UPDATE #{updates}"
end
103
+
104
# :nodoc:
#
# Builds an UPDATE statement that sets the record's columns on the
# row whose `id` column equals the record's ID.
def update_query(record)
  target      = "#{database_name_for(record)}.#{table_name_for(record)}"
  assignments = fields_and_values_of(record)
  row_id      = id_for(record)
  "UPDATE #{target} SET #{assignments} WHERE `id`=#{row_id}"
end
108
+
109
# :nodoc:
#
# Returns the record's keys in sorted order, leaving out the keys
# named by #database_field, #table_field and #id_field.
def field_names_of(record)
  reserved  = [database_field, table_field, id_field]
  data_keys = record.keys.select { |key| !reserved.include?(key) }
  data_keys.sort
end
113
+
114
# :nodoc:
#
# Returns the record's column names, each passed through
# #identifier_for, joined with ", ".
def fields_of(record)
  quoted = field_names_of(record).map { |column| identifier_for(column) }
  quoted.join(', ')
end
118
+
119
# :nodoc:
#
# Returns the record's values, in column-name order and each passed
# through #value_for, joined with ", ".
def values_of(record)
  literals = field_names_of(record).map { |column| value_for(record[column]) }
  literals.join(', ')
end
123
+
124
# :nodoc:
#
# Returns "column=value" pairs for the record, in column-name order
# and joined with ", ", using #identifier_for and #value_for.
def fields_and_values_of(record)
  pairs = field_names_of(record).map do |column|
    "#{identifier_for(column)}=#{value_for(record[column])}"
  end
  pairs.join(', ')
end
128
+
129
# :nodoc:
#
# Returns the quoted database name: the record's value under
# #database_field when truthy, otherwise the loader's default
# database, passed through #identifier_for.
def database_name_for(record)
  chosen = record[database_field] || database
  identifier_for(chosen)
end
133
+
134
# :nodoc:
#
# Returns the quoted table name: the record's value under
# #table_field when truthy, otherwise the loader's default table,
# passed through #identifier_for.
def table_name_for(record)
  chosen = record[table_field] || table
  identifier_for(chosen)
end
138
+
139
# :nodoc:
#
# Quotes +thing+ as a MySQL identifier (database, table or column
# name) by wrapping its string form in backticks.
#
# Identifier names here can come straight from input records, so any
# backtick inside the name is doubled; otherwise a name containing a
# backtick would terminate the quoted identifier early. String-literal
# escaping (backslashes before quotes) is deliberately not applied:
# inside backticks a backslash is an ordinary character.
#
# @param [#to_s] thing
# @return [String]
def identifier_for thing
  '`' + thing.to_s.gsub('`', '``') + '`'
end
143
+
144
# :nodoc:
#
# Renders +thing+ as a SQL value: integers are returned unchanged,
# nil becomes the string 'NULL', and anything else is converted to a
# string, escaped by the client and wrapped in double quotes.
#
# Integer is matched instead of Fixnum, which no longer exists on
# current Rubies; this also lets large integers through unquoted.
#
# @param [Object] thing
# @return [Integer, String]
def value_for thing
  case thing
  when Integer then thing
  when nil     then 'NULL'
  else
    '"' + client.escape(thing.to_s) + '"'
  end
end
153
+
154
+
155
# :nodoc:
#
# Returns the record's value under #id_field rendered by #value_for,
# or nil when that value is missing or falsy.
def id_for(record)
  raw_id = record[id_field]
  return nil unless raw_id
  value_for(raw_id)
end
159
+
160
# :nodoc:
#
# Hands the query to the client's #query method and returns whatever
# that call returns.
def perform_query(query)
  connection = client
  connection.query(query)
end
164
+
165
+ register :sql_loader
166
+
167
+ end
168
+ end
169
+ end
@@ -0,0 +1,60 @@
1
+ module Wukong
2
+
3
+ # Represents a generic HTTP request.
4
+ class HttpRequest
5
+
6
+ include Gorillib::Model
7
+
8
+ field :timestamp, Integer, :doc => "Timestamp at which the HTTP request was received"
9
+ field :verb, String, :doc => "HTTP verb of the request"
10
+ field :path, String, :doc => "Absolute path to the resource requested"
11
+ field :params, Hash, :doc => "Query parameters contained in the request", :default => {}
12
+ field :headers, Hash, :doc => "HTTP headers of the request", :default => {}
13
+ field :ip_address, String, :doc => "IP address of the client"
14
+ field :body, String, :doc => "Body of the request"
15
+
16
# Return the URL of this request.
#
# Joins the 'Host' header and the path with File.join, treating a
# missing host or path as the empty string.
#
# @return [String]
def url
  host_part = headers['Host'] || ''
  path_part = path || ''
  File.join(host_part, path_part)
end
22
+
23
# Return the HTTP Referer of this request.
#
# Reads the 'Referer' entry of the headers.
#
# @return [String]
def referer
  referer_header = headers['Referer']
  referer_header
end
29
+ alias_method :referrer, :referer
30
+
31
# Return the HTTP User-Agent of this request.
#
# Reads the 'User-Agent' entry of the headers.
#
# @return [String]
def user_agent
  agent_header = headers['User-Agent']
  agent_header
end
37
+
38
# Return the HTTP Cookie of this request.
#
# Reads the 'Cookie' entry of the headers.
#
# @return [String]
def cookie
  cookie_header = headers['Cookie']
  cookie_header
end
44
+
45
# Return the "best" IP address from this request.
#
# Will return the first IP address in the HTTP X-Forwarded-For chain
# if present, otherwise will return the IP address of the request
# itself.
#
# Entries in the chain are stripped of surrounding whitespace and
# empty entries are ignored, so a header such as " 1.2.3.4 , 5.6.7.8"
# or ", 5.6.7.8" still yields a clean address. A missing, empty or
# whitespace-only header falls back to #ip_address.
#
# @return [String]
def best_ip_address
  # client comes first, then proxies in order
  forwarded = headers['X-Forwarded-For'].to_s.split(',').map(&:strip).reject(&:empty?)
  forwarded.first || ip_address
end
58
+
59
+ end
60
+ end