fluent-plugin-cassandra-cql 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -2,12 +2,11 @@
2
2
 
3
3
  Cassandra output plugin for Fluentd.
4
4
 
5
- Implemented using the cassandra-cql gem and targets CQL 3.0.0
5
+ Implemented using the cassandra-cql gem and targets [CQL 3.0.0](http://www.datastax.com/docs/1.1/references/cql/index)
6
6
  and Cassandra 1.1.x
7
7
 
8
8
  # Raison d'être
9
- Currently, there's another Fluentd Cassandra plugin [see
10
- here](https://github.com/tomitakazutaka/fluent-plugin-cassandra)
9
+ Currently, there's another [Fluentd Cassandra plugin](https://github.com/tomitakazutaka/fluent-plugin-cassandra)
11
10
 
12
11
  It's implemented via the Twitter Cassandra gem, which:
13
12
 
@@ -30,7 +29,8 @@ via RubyGems
30
29
 
31
30
  # create table (column family)
32
31
  CREATE TABLE events (id varchar, ts bigint, payload text, PRIMARY KEY (id, ts)) WITH CLUSTERING ORDER BY (ts DESC);
33
-
32
+
33
+ # NOTE: schema definition should match that specified in the Fluentd.conf configuration file
34
34
 
35
35
  ## Fluentd.conf Configuration
36
36
  <match cassandra.**>
@@ -38,17 +38,21 @@ via RubyGems
38
38
  host 127.0.0.1 # cassandra hostname.
39
39
  port 9160 # cassandra thrift port.
40
40
  keyspace FluentdLoggers # cassandra keyspace
41
- columnfamily events # cassandra column family
41
+ columnfamily spec_events # cassandra column family
42
42
  ttl 60 # cassandra ttl *optional => default is 0*
43
+ schema # cassandra column family schema *hash where keys => column names and values => data types*
44
+ data_keys # comma delimited string of the fluentd hash's keys
45
+ pop_data_keys # keep or pop key/values from the fluentd hash when storing it as json
43
46
  </match>
44
47
 
45
48
  # Tests
46
49
 
47
50
  rake rspec
48
51
 
49
- NOTE: requires that cassandra be installed on the machine running the tests
52
+ NOTE: requires that cassandra be installed on the machine running the
53
+ tests as well as a keyspace named "FluentdLoggers" and a column family
54
+ named "spec_events"
50
55
 
51
56
  # TODOs
52
57
  1) make host and port configurable for tests
53
- 2) make schema definition configurable
54
- 3) add rake task to generate keyspace and columnfamily
58
+ 2) add rake task to generate keyspace and columnfamily
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "fluent-plugin-cassandra-cql"
8
- s.version = "0.0.1"
8
+ s.version = "0.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["obie quelland"]
12
- s.date = "2012-11-07"
12
+ s.date = "2012-11-11"
13
13
  s.description = "Fluent output plugin for Cassandra via CQL version 3.0.0"
14
14
  s.email = "quelland@gmail.com"
15
15
  s.extra_rdoc_files = [
@@ -28,7 +28,8 @@ Gem::Specification.new do |s|
28
28
  "lib/fluent/plugin/out_cassandra.rb",
29
29
  "spec/cassandra_output_spec.rb",
30
30
  "spec/spec.opts",
31
- "spec/spec_helper.rb"
31
+ "spec/spec_helper.rb",
32
+ "spec/support/helpers.rb"
32
33
  ]
33
34
  s.homepage = "http://github.com/obieq/fluent-plugin-cassandra-cql"
34
35
  s.licenses = ["MIT"]
@@ -6,33 +6,42 @@ module Fluent
6
6
 
7
7
  class CassandraOutput < BufferedOutput
8
8
  Fluent::Plugin.register_output('cassandra', self)
9
- include SetTimeKeyMixin
10
- include SetTagKeyMixin
11
9
 
12
- config_param :host, :string
13
- config_param :port, :integer
14
- config_param :keyspace, :string
15
- config_param :columnfamily, :string
16
- config_param :ttl, :integer, :default => 0
10
+ config_param :host, :string
11
+ config_param :port, :integer
12
+ config_param :keyspace, :string
13
+ config_param :columnfamily, :string
14
+ config_param :ttl, :integer, :default => 0
15
+ config_param :schema, :string
16
+ config_param :data_keys, :string
17
+
18
+ # remove keys from the fluentd json event as they're processed
19
+ # for individual columns?
20
+ config_param :pop_data_keys, :bool, :default => true
17
21
 
18
22
  def connection
19
- @connection ||= get_connection
23
+ @connection ||= get_connection(self.host, self.port, self.keyspace)
20
24
  end
21
25
 
22
- #config_set_default :include_time_key, true
23
- #config_set_default :include_tag_key, true
24
- #config_set_default :time_format, "%Y%m%d%H%M%S"
25
-
26
26
  def configure(conf)
27
27
  super
28
28
 
29
- raise ConfigError, "'Host' is required by Cassandra output (ex: localhost, 127.0.0.1, ec2-54-242-141-252.compute-1.amazonaws.com" unless self.keyspace = conf['keyspace']
30
- raise ConfigError, "'Port' is required by Cassandra output (ex: 9160)" unless self.columnfamily = conf['columnfamily']
31
- raise ConfigError, "'Keyspace' is required by Cassandra output (ex: FluentdLoggers)" unless self.keyspace = conf['keyspace']
32
- raise ConfigError, "'ColumnFamily' is required by Cassandra output (ex: events)" unless self.columnfamily = conf['columnfamily']
33
-
34
- #@host = conf.has_key?('host') ? conf['host'] : 'localhost'
35
- #@port = conf.has_key?('port') ? conf['port'] : 9160
29
+ # perform validations
30
+ raise ConfigError, "'Host' is required by Cassandra output (ex: localhost, 127.0.0.1, ec2-54-242-141-252.compute-1.amazonaws.com" if self.host.nil?
31
+ raise ConfigError, "'Port' is required by Cassandra output (ex: 9160)" if self.port.nil?
32
+ raise ConfigError, "'Keyspace' is required by Cassandra output (ex: FluentdLoggers)" if self.keyspace.nil?
33
+ raise ConfigError, "'ColumnFamily' is required by Cassandra output (ex: events)" if self.columnfamily.nil?
34
+ raise ConfigError, "'Schema' is required by Cassandra output (ex: id,ts,payload)" if self.schema.nil?
35
+ raise ConfigError, "'Schema' must contain at least two column names (ex: id,ts,payload)" if self.schema.split(',').count < 2
36
+ raise ConfigError, "'DataKeys' is required by Cassandra output (ex: tag,created_at,data)" if self.data_keys.nil?
37
+
38
+ # convert schema from string to hash
39
+ # NOTE: ok to use eval b/c this isn't a user
40
+ # supplied string
41
+ self.schema = eval(self.schema)
42
+
43
+ # convert data keys from string to array
44
+ self.data_keys = self.data_keys.split(',')
36
45
  end
37
46
 
38
47
  def start
@@ -50,17 +59,49 @@ module Fluent
50
59
 
51
60
  def write(chunk)
52
61
  chunk.msgpack_each { |record|
53
- @connection.execute("INSERT INTO #{self.columnfamily} (id, ts, payload) " +
54
- "VALUES ('#{record['tag']}', #{record['time']}, '#{record.to_json}') " +
55
- "USING TTL #{self.ttl}")
62
+ values = build_insert_values_string(self.schema.keys, self.data_keys, record, self.pop_data_keys)
63
+ cql = "INSERT INTO #{self.columnfamily} (#{self.schema.keys.join(',')}) " +
64
+ "VALUES (#{values}) " +
65
+ "USING TTL #{self.ttl}"
66
+ @connection.execute(cql)
56
67
  }
57
68
  end
58
69
 
59
70
  private
60
71
 
61
- def get_connection
62
- connection_string = "#{self.host}:#{self.port}"
63
- ::CassandraCQL::Database.new(connection_string, {:keyspace => "\"#{self.keyspace}\"", :cql_version => "3.0.0"})
72
+ def get_connection(host, port, keyspace)
73
+ connection_string = "#{host}:#{port}"
74
+ ::CassandraCQL::Database.new(connection_string, {:keyspace => "\"#{keyspace}\"", :cql_version => "3.0.0"})
75
+ end
76
+
77
+ def build_insert_values_string(schema_keys, data_keys, record, pop_data_keys)
78
+ values = data_keys.map.with_index do |key, index|
79
+ if pop_data_keys
80
+ schema[schema_keys[index]] == :string ? "'#{record.delete(key)}'" : record.delete(key)
81
+ else
82
+ schema[schema_keys[index]] == :string ? "'#{record[key]}'" : record[key]
83
+ end
84
+ end
85
+
86
+ # if we have one more schema key than data keys,
87
+ # we can then infer that we should store the event
88
+ # as a string representation of the corresponding
89
+ # json object in the last schema column
90
+ if schema_keys.count == data_keys.count + 1
91
+ values << if record.count > 0
92
+ "'#{record.to_json}'"
93
+ else
94
+ # by this point, the extra schema column has been
95
+ # added to insert cql statement, so we must put
96
+ # something in it
97
+ # TODO: detect this scenario earlier and don't
98
+ # specify the column name/value at all
99
+ # when constructing the cql stmt
100
+ "''"
101
+ end
102
+ end
103
+
104
+ return values.join(',')
64
105
  end
65
106
 
66
107
  end
@@ -1,24 +1,34 @@
1
1
  require 'spec_helper'
2
2
  Fluent::Test.setup
3
3
 
4
+ SPEC_COLUMN_FAMILY = "spec_events"
5
+ DATA_KEYS = "tag,time"
6
+
4
7
  CONFIG = %[
5
8
  host 127.0.0.1
6
9
  port 9160
7
10
  keyspace FluentdLoggers
8
- columnfamily events
11
+ columnfamily #{SPEC_COLUMN_FAMILY}
12
+ ttl 0
13
+ schema {:id => :string, :ts => :bigint, :payload => :string}
14
+ data_keys #{DATA_KEYS}
15
+ pop_data_keys true
9
16
  ]
10
17
 
11
18
  describe Fluent::CassandraOutput do
19
+ include Helpers
20
+
12
21
  let(:driver) { Fluent::Test::BufferedOutputTestDriver.new(Fluent::CassandraOutput, 'test') }
13
22
 
14
23
  after(:each) do
15
24
  d = Fluent::Test::BufferedOutputTestDriver.new(Fluent::CassandraOutput, 'test')
16
25
  d.configure(CONFIG)
17
- d.instance.connection.execute("TRUNCATE events")
26
+ d.instance.connection.execute("TRUNCATE #{SPEC_COLUMN_FAMILY}")
18
27
  end
19
28
 
20
- def add_ttl_to_config(ttl)
21
- return CONFIG + %[ ttl #{ttl}\n]
29
+ def set_config_value(config, config_name, value)
30
+ search_text = config.split("\n").map {|text| text if text.strip!.to_s.start_with? config_name.to_s}.compact![0]
31
+ config.gsub(search_text, "#{config_name} #{value}")
22
32
  end
23
33
 
24
34
  context 'configuring' do
@@ -29,18 +39,17 @@ describe Fluent::CassandraOutput do
29
39
  driver.instance.host.should eq('127.0.0.1')
30
40
  driver.instance.port.should eq(9160)
31
41
  driver.instance.keyspace.should eq('FluentdLoggers')
32
- driver.instance.columnfamily.should eq('events')
42
+ driver.instance.columnfamily.should eq(SPEC_COLUMN_FAMILY)
33
43
  driver.instance.ttl.should eq(0)
34
44
  end
35
45
 
36
46
  it 'should configure ttl' do
37
47
  ttl = 20
38
- driver.configure(add_ttl_to_config(ttl))
48
+ driver.configure(set_config_value(CONFIG, :ttl, ttl))
39
49
  driver.instance.ttl.should eq(ttl)
40
50
  end
41
51
 
42
52
  describe 'exceptions' do
43
-
44
53
  it 'should raise an exception if host is not configured' do
45
54
  expect { driver.configure(CONFIG.gsub("host", "invalid_config_name")) }.to raise_error Fluent::ConfigError
46
55
  end
@@ -56,12 +65,11 @@ describe Fluent::CassandraOutput do
56
65
  it 'should raise an exception if columnfamily is not configured' do
57
66
  expect { driver.configure(CONFIG.gsub("columnfamily", "invalid_config_name")) }.to raise_error Fluent::ConfigError
58
67
  end
59
-
60
68
  end
61
69
 
62
- end
70
+ end # context configuring
63
71
 
64
- context 'fluentd logging' do
72
+ context 'logging' do
65
73
 
66
74
  it 'should start' do
67
75
  driver.configure(CONFIG)
@@ -84,55 +92,67 @@ describe Fluent::CassandraOutput do
84
92
  driver.run
85
93
  end
86
94
 
87
- it 'should write' do
88
- driver.configure(CONFIG)
89
- tag1 = "test1"
90
- tag2 = "test2"
91
- time1 = Time.now.to_i
92
- time2 = Time.now.to_i + 2
93
- record1 = {'tag' => tag1, 'time' => time1, 'a' => 10, 'b' => 'Tesla'}
94
- record2 = {'tag' => tag2, 'time' => time2, 'a' => 20, 'b' => 'Edison'}
95
- records = [record1, record2]
96
-
97
- driver.emit(records[0])
98
- driver.emit(records[1])
99
- driver.run # persists to cassandra
100
-
101
- # query cassandra to verify data was correctly persisted
102
- row_num = records.count # non-zero based index
103
- events = driver.instance.connection.execute("SELECT * FROM events")
104
- events.rows.should eq(records.count)
105
- events.fetch do | event | # events should be sorted desc by tag, then time
106
- row_num -= 1 # zero-based index
107
- hash = event.to_hash
108
- hash['id'].should eq(records[row_num]['tag'])
109
- hash['ts'].should eq(records[row_num]['time'])
110
- hash['payload'].should eq(records[row_num].to_json)
95
+ context 'writing' do
96
+ context 'as json' do
97
+
98
+ describe 'pop no data keys' do
99
+ it 'should store json in last column' do
100
+ driver.configure(set_config_value(CONFIG, :pop_data_keys, false))
101
+ write(driver, SPEC_COLUMN_FAMILY, false)
102
+ end
103
+ end
104
+
105
+ describe 'pop some data keys' do
106
+ it 'should store json in last column' do
107
+ driver.configure(set_config_value(CONFIG, :pop_data_keys, true))
108
+ write(driver, SPEC_COLUMN_FAMILY, false)
109
+ end
110
+ end
111
+
112
+ describe 'pop all data keys' do
113
+ it 'should store empty string in last column' do
114
+ driver.configure(CONFIG)
115
+ write(driver, SPEC_COLUMN_FAMILY, true)
116
+ end
117
+ end
118
+
119
+ end # context as json
120
+
121
+ context 'as columns' do # no need to test popping of keys b/c it makes no difference
122
+
123
+ it 'should write' do
124
+ config = set_config_value(CONFIG, :data_keys, DATA_KEYS + ',a')
125
+ config = set_config_value(config, :pop_data_keys, false)
126
+ driver.configure(config)
127
+ write(driver, SPEC_COLUMN_FAMILY, false)
128
+ end
129
+
130
+ end # context as columns
131
+
132
+ it 'should not locate event after ttl has expired' do
133
+ time = Time.now.to_i
134
+ tag = "ttl_test"
135
+ ttl = 1 # set ttl to 1 second
136
+
137
+ driver.configure(set_config_value(CONFIG, :ttl, ttl))
138
+ driver.emit({'tag' => tag, 'time' => time, 'a' => 1})
139
+ driver.run
140
+
141
+ # verify record... should return in less than one sec if hitting
142
+ # cassandra running on localhost
143
+ events = driver.instance.connection.execute("SELECT * FROM #{SPEC_COLUMN_FAMILY} where ts = #{time}")
144
+ events.rows.should eq(1)
145
+
146
+ # now, sleep long enough for the event to be expired from cassandra
147
+ sleep(ttl + 1)
148
+
149
+ # re-query and verify that no events were returned
150
+ events = driver.instance.connection.execute("SELECT * FROM #{SPEC_COLUMN_FAMILY} where ts = #{time}")
151
+ events.rows.should eq(0)
111
152
  end
112
- end
113
153
 
114
- it 'should not locate event after ttl has expired' do
115
- time = Time.now.to_i
116
- tag = "ttl_test"
117
- ttl = 1 # set ttl to 1 second
154
+ end # context writing
118
155
 
119
- driver.configure(add_ttl_to_config(ttl))
120
- driver.emit({'tag' => tag, 'time' => time, 'a' => 1})
121
- driver.run
122
-
123
- # verify record... should return in less than one sec if hitting
124
- # cassandra running on localhost
125
- events = driver.instance.connection.execute("SELECT * FROM events where ts = #{time}")
126
- events.rows.should eq(1)
127
-
128
- # now, sleep long enough for the event to be expired from cassandra
129
- sleep(ttl)
130
-
131
- # re-query and verify that no events were returned
132
- events = driver.instance.connection.execute("SELECT * FROM events where ts = #{time}")
133
- events.rows.should eq(0)
134
- end
135
-
136
- end
156
+ end # context logging
137
157
 
138
158
  end # CassandraOutput
data/spec/spec_helper.rb CHANGED
@@ -6,4 +6,4 @@ require 'fluent/test'
6
6
  Dir["./lib/**/*.rb"].each {|f| require f}
7
7
 
8
8
  # require the shared example files
9
- #Dir["./spec/support/**/*.rb"].each {|f| require f}
9
+ Dir["./spec/support/**/*.rb"].each {|f| require f}
@@ -0,0 +1,56 @@
1
+ module Helpers
2
+
3
+ def write(driver, column_family_name, tag_and_time_only)
4
+ tag1 = "test1"
5
+ tag2 = "test2"
6
+ time1 = Time.now.to_i
7
+ time2 = Time.now.to_i + 2
8
+
9
+ record1 = {'tag' => tag1, 'time' => time1}
10
+ record2 = {'tag' => tag2, 'time' => time2}
11
+
12
+ unless tag_and_time_only
13
+ record1.merge!({'a' => 10, 'b' => 'Tesla'})
14
+ record2.merge!({'a' => 20, 'b' => 'Edison'})
15
+ end
16
+
17
+ # store both records in an array
18
+ records = [record1, record2]
19
+
20
+ driver.emit(records[0])
21
+ driver.emit(records[1])
22
+ driver.run # persists to cassandra
23
+
24
+ # query cassandra to verify data was correctly persisted
25
+ row_num = records.count # non-zero based index
26
+ events = driver.instance.connection.execute("SELECT * FROM #{column_family_name}")
27
+ events.rows.should eq(records.count)
28
+ events.fetch do | event | # events should be sorted desc by tag, then time
29
+ row_num -= 1 # zero-based index
30
+
31
+ record = records[row_num]
32
+ db_hash = event.to_hash
33
+
34
+ # need to take into account that we've popped both tag and time
35
+ # from the payload data when we saved it
36
+ if driver.instance.pop_data_keys
37
+ db_hash['id'].should eq(record.delete('tag'))
38
+ db_hash['ts'].should eq(record.delete('time'))
39
+ else
40
+ db_hash['id'].should eq(record['tag'])
41
+ db_hash['ts'].should eq(record['time'])
42
+ end
43
+
44
+ if driver.instance.schema.keys.count == driver.instance.data_keys.count + 1 # store as json
45
+ if record.count > 0
46
+ db_hash['payload'].should eq(record.to_json)
47
+ else
48
+ db_hash['payload'].should eq('')
49
+ end
50
+ else
51
+ db_hash['payload'].should eq(record[record.keys[db_hash.keys.index('payload')]])
52
+ end
53
+ end
54
+ end
55
+
56
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-cassandra-cql
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-07 00:00:00.000000000 Z
12
+ date: 2012-11-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: fluentd
@@ -143,6 +143,7 @@ files:
143
143
  - spec/cassandra_output_spec.rb
144
144
  - spec/spec.opts
145
145
  - spec/spec_helper.rb
146
+ - spec/support/helpers.rb
146
147
  homepage: http://github.com/obieq/fluent-plugin-cassandra-cql
147
148
  licenses:
148
149
  - MIT
@@ -158,7 +159,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
158
159
  version: '0'
159
160
  segments:
160
161
  - 0
161
- hash: -709806578343991657
162
+ hash: -4467205590141374709
162
163
  required_rubygems_version: !ruby/object:Gem::Requirement
163
164
  none: false
164
165
  requirements: