wukong-load 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into Kafka.
7
+ #
8
+ # Uses the `kafka-rb` gem to create a Kafka::Producer to write to
9
+ # Kafka.
10
+ #
11
+ # Allows loading records into a given topic on a given partition.
12
+ # Records can have fields `_topic` and `_partition` which override
13
+ # the given topic and partition on a per-record basis.
14
+ #
15
+ # The names of these fields within each record (`_topic` and
16
+ # `_partition`) can be customized.
17
class KafkaLoader < Loader

  field :host,            String,  :default => 'localhost',  :doc => "Kafka broker host"
  field :port,            Integer, :default => 9092,         :doc => "Kafka broker port"
  field :topic,           String,  :default => 'test',       :doc => "Kafka topic"
  field :topic_field,     String,  :default => '_topic',     :doc => "Field within records which names the Kafka topic"
  field :partition,       Integer, :default => 0,            :doc => "Kafka partition"
  field :partition_field, String,  :default => '_partition', :doc => "Field within records which names the Kafka partition"

  description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into a Kafka queue.

          $ cat data.json | wu-load kafka

        By default, wu-load attempts to write each input record to a
        local Kafka broker.

        Input records will be written to a default Kafka topic on a
        default partition.  Each record can have _topic and _partition
        fields to override this on a per-record basis.

        The fields used (_topic and _partition) can be changed:

          $ cat data.json | wu-load kafka --host=10.123.123.123 --topic=hits --partition_field=segment_id
  EOF

  # The Kafka producer used to send messages to Kafka.
  attr_accessor :producer

  # Requires the Kafka client library and creates the producer.
  #
  # @raise [Error] if the library cannot be loaded or the producer
  #   cannot be created
  def setup
    begin
      require 'kafka'
    rescue LoadError, StandardError
      # LoadError is a ScriptError, so a bare `rescue` would let a
      # missing gem escape without this friendlier message.
      raise Error.new("Please ensure that the 'kafka-rb' gem is installed and available (in your Gemfile)")
    end
    log.debug("Connecting to Kafka broker at #{host}:#{port}...")
    begin
      self.producer = Kafka::MultiProducer.new(:host => host, :port => port)
    rescue => e
      raise Error.new(e.message)
    end
  end

  # Load a single record into Kafka.
  #
  # Any StandardError raised while sending is passed to
  # `handle_error` along with the record.
  #
  # @param [Hash] record
  def load(record)
    topic     = topic_for(record)
    partition = partition_for(record)
    bytes     = producer.send(topic, messages_for(record), :partition => partition)
    log.info("Wrote #{bytes} bytes to #{topic}/#{partition}")
  rescue => e
    handle_error(record, e)
  end

  # Topic for this record: the record's own topic field if set,
  # otherwise the configured default topic.
  #
  # @param [Hash] record
  def topic_for(record)
    record[topic_field] || self.topic
  end

  # Wraps the JSON-serialized record in a single-element array of
  # Kafka messages.
  #
  # @param [Hash] record
  # @return [Array<Kafka::Message>]
  def messages_for(record)
    [Kafka::Message.new(MultiJson.dump(record))]
  end

  # Partition for this record: the record's own partition field
  # (converted with `to_i`) if set, otherwise the configured default.
  #
  # @param [Hash] record
  def partition_for(record)
    override = record[partition_field]
    override ? override.to_i : partition
  end

  register :kafka_loader

end
94
+ end
95
+ end
96
+
97
+
98
+
@@ -0,0 +1,123 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into MongoDB.
7
+ #
8
+ # Uses the 'mongo' gem to connect and write data.
9
+ #
10
+ # Allows loading records into a given database and collection.
11
+ # Records can have fields `_database` and `_collection` which
12
+ # override the given database and collection on a per-record
13
+ # basis.
14
+ #
15
+ # Records can have an `_id` field which indicates an update, not
16
+ # an insert.
17
+ #
18
+ # The names of these fields within each record (`_database`,
19
+ # `_collection`, and `_id`) can be customized.
20
class MongoDBLoader < Loader

  field :host,             String,  :default => 'localhost',        :doc => "MongoDB host"
  field :port,             Integer, :default => 27017,              :doc => "Port on MongoDB host"
  field :database,         String,  :default => 'wukong',           :doc => "Default MongoDB database"
  field :collection,       String,  :default => 'streaming_record', :doc => "Default MongoDB collection"
  field :database_field,   String,  :default => '_database',        :doc => "Name of field in each record overriding default MongoDB database"
  field :collection_field, String,  :default => '_collection',      :doc => "Name of field in each record overriding default MongoDB collection"
  field :id_field,         String,  :default => '_id',              :doc => "Name of field in each record providing ID of existing MongoDB record to update"

  description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MongoDB.

          $ cat data.json | wu-load mongodb

        By default, wu-load attempts to write each input record to a
        local MongoDB server.

        Input records will be written to a default database and
        collection.  Each record can have _database and _collection
        fields to override this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are stored as fields of the
        document.

        The fields used (_database, _collection, and _id) can be changed:

          $ cat data.json | wu-load mongodb --host=10.123.123.123 --database=web_events --collection=impressions --id_field=impression_id
  EOF

  # The Mongo::MongoClient we'll use for talking to MongoDB.
  attr_accessor :client

  # Requires the MongoDB driver and creates the client connection.
  #
  # A leading `http://` on the configured host is dropped before
  # connecting.
  #
  # @raise [Error] if the driver cannot be loaded or the client
  #   cannot be created
  def setup
    begin
      require 'mongo'
    rescue LoadError, StandardError
      # LoadError is a ScriptError, so a bare `rescue` would let a
      # missing gem escape without this friendlier message.
      raise Error.new("Please ensure that the 'mongo' gem is installed and available (in your Gemfile)")
    end
    h = host.sub(%r{\Ahttp://}, '')
    log.debug("Connecting to MongoDB server at #{h}:#{port}...")
    begin
      self.client = Mongo::MongoClient.new(h, port)
    rescue => e
      raise Error.new(e.message)
    end
  end

  # Load a single record into MongoDB.
  #
  # If the record has an ID, we'll issue an upserting update keyed
  # on that ID, otherwise an insert.
  #
  # @param [Hash] record
  def load(record)
    id = id_for(record)
    if id
      res = collection_for(record).update({:_id => id}, record, :upsert => true)
      # The update response says whether an existing document matched.
      log.info(res['updatedExisting'] ? "Updated #{id}" : "Inserted #{id}")
    else
      res = collection_for(record).insert(record)
      log.info("Inserted #{res}")
    end
  end

  # Database handle this record should be written to.
  #
  # @param [Hash] record
  def database_for(record)
    client[database_name_for(record)]
  end

  # Collection handle this record should be written to.
  #
  # @param [Hash] record
  def collection_for(record)
    database_for(record)[collection_name_for(record)]
  end

  # Database name: the record's own database field if set,
  # otherwise the configured default.
  #
  # @param [Hash] record
  def database_name_for(record)
    record[database_field] || self.database
  end

  # Collection name: the record's own collection field if set,
  # otherwise the configured default.
  #
  # @param [Hash] record
  def collection_name_for(record)
    record[collection_field] || self.collection
  end

  # Value of the record's ID field, or nil when it has none.
  #
  # @param [Hash] record
  def id_for(record)
    record[id_field]
  end

  register :mongodb_loader

end
122
+ end
123
+ end
@@ -0,0 +1,169 @@
1
+ require_relative('../loader')
2
+
3
+ module Wukong
4
+ module Load
5
+
6
+ # Loads data into SQL databases.
7
+ #
8
+ # Uses the 'mysql2' gem to connect and write data. Yes, MySQL !=
9
+ # SQL but we'll get there, I promise...
10
+ #
11
+ # Allows loading records into a given database and table. Records
12
+ # can have fields `_database` and `_table` which override the
13
+ # given database and table on a per-record basis.
14
+ #
15
+ # Records can have an `_id` field which indicates an update, not
16
+ # an insert.
17
+ #
18
+ # The names of these fields within each record (`_database`,
19
+ # `_table`, and `_id`) can be customized.
20
class SQLLoader < Loader

  field :host,           String,  :default => 'localhost',               :doc => "SQL host"
  field :port,           Integer, :default => 3306,                      :doc => "Port on SQL host"
  field :username,       String,  :default => (ENV['USER'] || 'wukong'), :doc => "User to connect as"
  field :password,       String,                                         :doc => "Password for user"
  field :database,       String,  :default => 'wukong',                  :doc => "Default database"
  field :table,          String,  :default => 'streaming_record',        :doc => "Default table"
  field :database_field, String,  :default => '_database',               :doc => "Name of field in each record overriding default database"
  field :table_field,    String,  :default => '_table',                  :doc => "Name of field in each record overriding default table"
  field :id_field,       String,  :default => '_id',                     :doc => "Name of field in each record providing ID of existing row to update"

  description <<-EOF.gsub(/^ {8}/,'')
        Loads newline-separated, JSON-formatted records over STDIN
        into MySQL.

          $ cat data.json | wu-load sql

        By default, wu-load attempts to write each input record to a
        local SQL server.

        Input records will be written to a default database and table.
        Each record can have _database and _table fields to override
        this on a per-record basis.

        Records with an _id field will trigger updates, the rest
        inserts.

        All other fields within a record are assumed to be the names
        of actual columns in the table.

        The fields used (_database, _table, and _id) can be changed:

          $ cat data.json | wu-load sql --host=10.123.123.123 --database=web_events --table=impressions --id_field=impression_id
  EOF

  # The Mysql2::Client we'll use for talking to the SQL server.
  attr_accessor :client

  # Requires the MySQL driver and creates the client connection.
  #
  # @raise [Error] if the driver cannot be loaded or the client
  #   cannot be created
  def setup
    begin
      require 'mysql2'
    rescue LoadError, StandardError
      # LoadError is a ScriptError, so a bare `rescue` would let a
      # missing gem escape without this friendlier message.
      raise Error.new("Please ensure that the 'mysql2' gem is installed and available (in your Gemfile)")
    end
    log.debug("Connecting to SQL server at <#{host}:#{port}> as <#{username}>#{' using password' if password}...")
    begin
      self.client = Mysql2::Client.new(sql_params)
    rescue => e
      raise Error.new(e.message)
    end
  end

  # Connection options handed to Mysql2::Client.new.
  #
  # Username and password are only included when set.
  #
  # @return [Hash]
  def sql_params
    {:host => host, :port => port}.tap do |params|
      params[:username] = username if username
      params[:password] = password if password
    end
  end

  # Load a single record into the database.
  #
  # If the record has an ID, we'll issue an update, otherwise an
  # insert.
  #
  # @param [Hash] record
  def load(record)
    id = id_for(record)
    if id
      perform_query(update_query(record))
      log.info("Updated #{id}")
    else
      perform_query(insert_query(record))
      log.info("Inserted")
    end
  end

  # INSERT statement for the record which falls back to updating
  # the same columns on a duplicate key.
  #
  # @param [Hash] record
  # @return [String]
  def insert_query(record)
    "INSERT INTO #{database_name_for(record)}.#{table_name_for(record)} (#{fields_of(record)}) VALUES (#{values_of(record)}) ON DUPLICATE KEY UPDATE #{fields_and_values_of(record)}"
  end

  # UPDATE statement for the row whose `id` column matches the
  # record's ID.
  #
  # @param [Hash] record
  # @return [String]
  def update_query(record)
    "UPDATE #{database_name_for(record)}.#{table_name_for(record)} SET #{fields_and_values_of(record)} WHERE `id`=#{id_for(record)}"
  end

  # Sorted record keys, excluding the database, table and ID
  # routing fields.
  #
  # @param [Hash] record
  # @return [Array]
  def field_names_of(record)
    routing_fields = [database_field, table_field, id_field]
    record.keys.reject { |key| routing_fields.include?(key) }.sort
  end

  # Comma-separated, quoted column names for the record.
  #
  # @param [Hash] record
  # @return [String]
  def fields_of(record)
    field_names_of(record).map { |name| identifier_for(name) }.join(', ')
  end

  # Comma-separated SQL literals for the record's values, in the
  # same order as `fields_of`.
  #
  # @param [Hash] record
  # @return [String]
  def values_of(record)
    field_names_of(record).map { |name| value_for(record[name]) }.join(', ')
  end

  # Comma-separated `column=value` assignments for the record.
  #
  # @param [Hash] record
  # @return [String]
  def fields_and_values_of(record)
    field_names_of(record).map { |name| "#{identifier_for(name)}=#{value_for(record[name])}" }.join(', ')
  end

  # Quoted database name: the record's own database field if set,
  # otherwise the configured default.
  #
  # @param [Hash] record
  # @return [String]
  def database_name_for(record)
    identifier_for(record[database_field] || self.database)
  end

  # Quoted table name: the record's own table field if set,
  # otherwise the configured default.
  #
  # @param [Hash] record
  # @return [String]
  def table_name_for(record)
    identifier_for(record[table_field] || self.table)
  end

  # Quotes a name as a backtick-delimited identifier.
  #
  # Embedded backticks are doubled so that a name taken from a
  # record cannot terminate the quoting early.
  #
  # @param [#to_s] thing
  # @return [String]
  def identifier_for(thing)
    '`' + thing.to_s.gsub('`', '``') + '`'
  end

  # Converts a value into a SQL literal.
  #
  # Integers pass through unquoted, nil becomes NULL, and anything
  # else is stringified, escaped by the client and double-quoted.
  #
  # @param [Object] thing
  # @return [Integer, String]
  def value_for(thing)
    case thing
    when Integer then thing
    when nil     then 'NULL'
    else
      '"' + client.escape(thing.to_s) + '"'
    end
  end

  # SQL literal for the record's ID, or nil when it has none.
  #
  # @param [Hash] record
  def id_for(record)
    value_for(record[id_field]) if record[id_field]
  end

  # Sends a query string to the server through the client.
  #
  # @param [String] query
  def perform_query(query)
    client.query query
  end

  register :sql_loader

end
168
+ end
169
+ end
@@ -0,0 +1,60 @@
1
+ module Wukong
2
+
3
+ # Represents a generic HTTP request.
4
class HttpRequest

  include Gorillib::Model

  field :timestamp,  Integer, :doc => "Timestamp at which the HTTP request was received"
  field :verb,       String,  :doc => "HTTP verb of the request"
  field :path,       String,  :doc => "Absolute path to the resource requested"
  field :params,     Hash,    :doc => "Query parameters contained in the request", :default => {}
  field :headers,    Hash,    :doc => "HTTP headers of the request", :default => {}
  field :ip_address, String,  :doc => "IP address of the client"
  field :body,       String,  :doc => "Body of the request"

  # Return the URL of this request.
  #
  # Built by joining the Host header and the path; either part is
  # treated as an empty string when missing.
  #
  # @return [String]
  def url
    File.join(headers['Host'] || '', path || '')
  end

  # Return the HTTP Referer of this request.
  #
  # @return [String, nil]
  def referer
    headers['Referer']
  end
  alias_method :referrer, :referer

  # Return the HTTP User-Agent of this request.
  #
  # @return [String, nil]
  def user_agent
    headers['User-Agent']
  end

  # Return the HTTP Cookie of this request.
  #
  # @return [String, nil]
  def cookie
    headers['Cookie']
  end

  # Return the "best" IP address from this request.
  #
  # Will return the first non-empty IP address in the HTTP
  # X-Forwarded-For chain if present, otherwise will return the IP
  # address of the request itself.
  #
  # @return [String]
  def best_ip_address
    forwarded = headers['X-Forwarded-For'].to_s
    # Client comes first, then proxies in order.  Entries are
    # stripped and empty ones dropped so that stray whitespace or
    # commas never yield a padded or empty address.
    client_ip = forwarded.split(',').map(&:strip).reject(&:empty?).first
    client_ip || ip_address
  end

end
60
+ end