fluent-plugin-cassandra-cql 0.0.1 → 0.0.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
data/README.md CHANGED
@@ -2,12 +2,11 @@

  Cassandra output plugin for Fluentd.

- Implemented using the cassandra-cql gem and targets CQL 3.0.0
+ Implemented using the cassandra-cql gem and targets [CQL 3.0.0](http://www.datastax.com/docs/1.1/references/cql/index)
  and Cassandra 1.1.x

  # Raison d'être
- Currently, there's another Fluentd Cassandra plugin [see
- here](https://github.com/tomitakazutaka/fluent-plugin-cassandra)
+ Currently, there's another [Fluentd Cassandra plugin](https://github.com/tomitakazutaka/fluent-plugin-cassandra)

  It's implemented via the Twitter Cassandra gem, which:

@@ -30,7 +29,8 @@ via RubyGems

  # create table (column family)
  CREATE TABLE events (id varchar, ts bigint, payload text, PRIMARY KEY (id, ts)) WITH CLUSTERING ORDER BY (ts DESC);
-
+
+ # NOTE: schema definition should match that specified in the Fluentd.conf configuration file

  ## Fluentd.conf Configuration
  <match cassandra.**>
@@ -38,17 +38,21 @@ via RubyGems
  host 127.0.0.1 # cassandra hostname.
  port 9160 # cassandra thrft port.
  keyspace FluentdLoggers # cassandra keyspace
- columnfamily events # cassandra column family
+ columnfamily spec_events # cassandra column family
  ttl 60 # cassandra ttl *optional => default is 0*
+ schema # cassandra column family schema *hash where keys => column names and values => data types*
+ data_keys # comma delimited string of the fluentd hash's keys
+ pop_data_keys # keep or pop key/values from the fluentd hash when storing it as json
  </match>

  # Tests

  rake rspec

- NOTE: requires that cassandra be installed on the machine running the tests
+ NOTE: requires that cassandra be installed on the machine running the
+ test as well as a keyspace named "FluentdLoggers" and a column family
+ named "spec_events"

  # TODOs
  1) make host and port configurable for tests
- 2) make schema definition configurable
- 3) add rake task to generate keyspace and columnfamily
+ 2) add rake task to generate keyspace and columnfamily
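
For reference, a hedged example of a fully filled-in match section, mirroring the configuration exercised by the gem's specs (the "type cassandra" line and all values are illustrative assumptions, not taken verbatim from the README; the schema hash must agree with the CREATE TABLE shown above):

    <match cassandra.**>
      type cassandra
      host 127.0.0.1
      port 9160
      keyspace FluentdLoggers
      columnfamily spec_events
      ttl 0
      schema {:id => :string, :ts => :bigint, :payload => :string}
      data_keys tag,time
      pop_data_keys true
    </match>
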
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.1
+ 0.0.2
@@ -5,11 +5,11 @@

  Gem::Specification.new do |s|
  s.name = "fluent-plugin-cassandra-cql"
- s.version = "0.0.1"
+ s.version = "0.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["obie quelland"]
- s.date = "2012-11-07"
+ s.date = "2012-11-11"
  s.description = "Fluent output plugin for Cassandra via CQL version 3.0.0"
  s.email = "quelland@gmail.com"
  s.extra_rdoc_files = [
@@ -28,7 +28,8 @@ Gem::Specification.new do |s|
  "lib/fluent/plugin/out_cassandra.rb",
  "spec/cassandra_output_spec.rb",
  "spec/spec.opts",
- "spec/spec_helper.rb"
+ "spec/spec_helper.rb",
+ "spec/support/helpers.rb"
  ]
  s.homepage = "http://github.com/obieq/fluent-plugin-cassandra-cql"
  s.licenses = ["MIT"]
data/lib/fluent/plugin/out_cassandra.rb CHANGED
@@ -6,33 +6,42 @@ module Fluent

  class CassandraOutput < BufferedOutput
  Fluent::Plugin.register_output('cassandra', self)
- include SetTimeKeyMixin
- include SetTagKeyMixin

- config_param :host, :string
- config_param :port, :integer
- config_param :keyspace, :string
- config_param :columnfamily, :string
- config_param :ttl, :integer, :default => 0
+ config_param :host, :string
+ config_param :port, :integer
+ config_param :keyspace, :string
+ config_param :columnfamily, :string
+ config_param :ttl, :integer, :default => 0
+ config_param :schema, :string
+ config_param :data_keys, :string
+
+ # remove keys from the fluentd json event as they're processed
+ # for individual columns?
+ config_param :pop_data_keys, :bool, :default => true

  def connection
- @connection ||= get_connection
+ @connection ||= get_connection(self.host, self.port, self.keyspace)
  end

- #config_set_default :include_time_key, true
- #config_set_default :include_tag_key, true
- #config_set_default :time_format, "%Y%m%d%H%M%S"
-
  def configure(conf)
  super

- raise ConfigError, "'Host' is required by Cassandra output (ex: localhost, 127.0.0.1, ec2-54-242-141-252.compute-1.amazonaws.com" unless self.keyspace = conf['keyspace']
- raise ConfigError, "'Port' is required by Cassandra output (ex: 9160)" unless self.columnfamily = conf['columnfamily']
- raise ConfigError, "'Keyspace' is required by Cassandra output (ex: FluentdLoggers)" unless self.keyspace = conf['keyspace']
- raise ConfigError, "'ColumnFamily' is required by Cassandra output (ex: events)" unless self.columnfamily = conf['columnfamily']
-
- #@host = conf.has_key?('host') ? conf['host'] : 'localhost'
- #@port = conf.has_key?('port') ? conf['port'] : 9160
+ # perform validations
+ raise ConfigError, "'Host' is required by Cassandra output (ex: localhost, 127.0.0.1, ec2-54-242-141-252.compute-1.amazonaws.com" if self.host.nil?
+ raise ConfigError, "'Port' is required by Cassandra output (ex: 9160)" if self.port.nil?
+ raise ConfigError, "'Keyspace' is required by Cassandra output (ex: FluentdLoggers)" if self.keyspace.nil?
+ raise ConfigError, "'ColumnFamily' is required by Cassandra output (ex: events)" if self.columnfamily.nil?
+ raise ConfigError, "'Schema' is required by Cassandra output (ex: id,ts,payload)" if self.schema.nil?
+ raise ConfigError, "'Schema' must contain at least two column names (ex: id,ts,payload)" if self.schema.split(',').count < 2
+ raise ConfigError, "'DataKeys' is required by Cassandra output (ex: tag,created_at,data)" if self.data_keys.nil?
+
+ # convert schema from string to hash
+ # NOTE: ok to use eval b/c this isn't this isn't a user
+ # supplied string
+ self.schema = eval(self.schema)
+
+ # convert data keys from string to array
+ self.data_keys = self.data_keys.split(',')
  end

  def start
@@ -50,17 +59,49 @@ module Fluent

  def write(chunk)
  chunk.msgpack_each { |record|
- @connection.execute("INSERT INTO #{self.columnfamily} (id, ts, payload) " +
- "VALUES ('#{record['tag']}', #{record['time']}, '#{record.to_json}') " +
- "USING TTL #{self.ttl}")
+ values = build_insert_values_string(self.schema.keys, self.data_keys, record, self.pop_data_keys)
+ cql = "INSERT INTO #{self.columnfamily} (#{self.schema.keys.join(',')}) " +
+ "VALUES (#{values}) " +
+ "USING TTL #{self.ttl}"
+ @connection.execute(cql)
  }
  end

  private

- def get_connection
- connection_string = "#{self.host}:#{self.port}"
- ::CassandraCQL::Database.new(connection_string, {:keyspace => "\"#{self.keyspace}\"", :cql_version => "3.0.0"})
+ def get_connection(host, port, keyspace)
+ connection_string = "#{host}:#{port}"
+ ::CassandraCQL::Database.new(connection_string, {:keyspace => "\"#{keyspace}\"", :cql_version => "3.0.0"})
+ end
+
+ def build_insert_values_string(schema_keys, data_keys, record, pop_data_keys)
+ values = data_keys.map.with_index do |key, index|
+ if pop_data_keys
+ schema[schema_keys[index]] == :string ? "'#{record.delete(key)}'" : record.delete(key)
+ else
+ schema[schema_keys[index]] == :string ? "'#{record[key]}'" : record[key]
+ end
+ end
+
+ # if we have one more schema key than data keys,
+ # we can then infer that we should store the event
+ # as a string representation of the corresponding
+ # json object in the last schema column
+ if schema_keys.count == data_keys.count + 1
+ values << if record.count > 0
+ "'#{record.to_json}'"
+ else
+ # by this point, the extra schema column has been
+ # added to insert cql statement, so we must put
+ # something in it
+ # TODO: detect this scenario earlier and don't
+ # specify the column name/value at all
+ # when constructing the cql stmt
+ "''"
+ end
+ end
+
+ return values.join(',')
  end

  end
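
As an illustration of the rewritten write path (a standalone Ruby sketch, not part of the gem; the record values are hypothetical and the empty-payload branch is omitted), this mirrors how build_insert_values_string pops the mapped keys into their columns and serializes whatever is left of the record as JSON:

    require 'json'

    schema    = {:id => :string, :ts => :bigint, :payload => :string}
    data_keys = ['tag', 'time']
    record    = {'tag' => 'test1', 'time' => 1352678400, 'a' => 10, 'b' => 'Tesla'}

    # pop 'tag' and 'time' into the id/ts columns (pop_data_keys => true)
    values = data_keys.map.with_index do |key, index|
      schema[schema.keys[index]] == :string ? "'#{record.delete(key)}'" : record.delete(key)
    end

    # one schema column is left over, so the remaining pairs are stored as JSON
    values << "'#{record.to_json}'" if schema.keys.count == data_keys.count + 1

    puts "INSERT INTO spec_events (#{schema.keys.join(',')}) VALUES (#{values.join(',')}) USING TTL 0"
    # => INSERT INTO spec_events (id,ts,payload) VALUES ('test1',1352678400,'{"a":10,"b":"Tesla"}') USING TTL 0
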
data/spec/cassandra_output_spec.rb CHANGED
@@ -1,24 +1,34 @@
  require 'spec_helper'
  Fluent::Test.setup

+ SPEC_COLUMN_FAMILY = "spec_events"
+ DATA_KEYS = "tag,time"
+
  CONFIG = %[
  host 127.0.0.1
  port 9160
  keyspace FluentdLoggers
- columnfamily events
+ columnfamily #{SPEC_COLUMN_FAMILY}
+ ttl 0
+ schema {:id => :string, :ts => :bigint, :payload => :string}
+ data_keys #{DATA_KEYS}
+ pop_data_keys true
  ]

  describe Fluent::CassandraOutput do
+ include Helpers
+
  let(:driver) { Fluent::Test::BufferedOutputTestDriver.new(Fluent::CassandraOutput, 'test') }

  after(:each) do
  d = Fluent::Test::BufferedOutputTestDriver.new(Fluent::CassandraOutput, 'test')
  d.configure(CONFIG)
- d.instance.connection.execute("TRUNCATE events")
+ d.instance.connection.execute("TRUNCATE #{SPEC_COLUMN_FAMILY}")
  end

- def add_ttl_to_config(ttl)
- return CONFIG + %[ ttl #{ttl}\n]
+ def set_config_value(config, config_name, value)
+ search_text = config.split("\n").map {|text| text if text.strip!.to_s.start_with? config_name.to_s}.compact![0]
+ config.gsub(search_text, "#{config_name} #{value}")
  end

  context 'configuring' do
@@ -29,18 +39,17 @@ describe Fluent::CassandraOutput do
  driver.instance.host.should eq('127.0.0.1')
  driver.instance.port.should eq(9160)
  driver.instance.keyspace.should eq('FluentdLoggers')
- driver.instance.columnfamily.should eq('events')
+ driver.instance.columnfamily.should eq(SPEC_COLUMN_FAMILY)
  driver.instance.ttl.should eq(0)
  end

  it 'should configure ttl' do
  ttl = 20
- driver.configure(add_ttl_to_config(ttl))
+ driver.configure(set_config_value(CONFIG, :ttl, ttl))
  driver.instance.ttl.should eq(ttl)
  end

  describe 'exceptions' do
-
  it 'should raise an exception if host is not configured' do
  expect { driver.configure(CONFIG.gsub("host", "invalid_config_name")) }.to raise_error Fluent::ConfigError
  end
@@ -56,12 +65,11 @@ describe Fluent::CassandraOutput do
  it 'should raise an exception if columnfamily is not configured' do
  expect { driver.configure(CONFIG.gsub("columnfamily", "invalid_config_name")) }.to raise_error Fluent::ConfigError
  end
-
  end

- end
+ end # context configuring

- context 'fluentd logging' do
+ context 'logging' do

  it 'should start' do
  driver.configure(CONFIG)
@@ -84,55 +92,67 @@ describe Fluent::CassandraOutput do
  driver.run
  end

- it 'should write' do
- driver.configure(CONFIG)
- tag1 = "test1"
- tag2 = "test2"
- time1 = Time.now.to_i
- time2 = Time.now.to_i + 2
- record1 = {'tag' => tag1, 'time' => time1, 'a' => 10, 'b' => 'Tesla'}
- record2 = {'tag' => tag2, 'time' => time2, 'a' => 20, 'b' => 'Edison'}
- records = [record1, record2]
-
- driver.emit(records[0])
- driver.emit(records[1])
- driver.run # persists to cassandra
-
- # query cassandra to verify data was correctly persisted
- row_num = records.count # non-zero based index
- events = driver.instance.connection.execute("SELECT * FROM events")
- events.rows.should eq(records.count)
- events.fetch do | event | # events should be sorted desc by tag, then time
- row_num -= 1 # zero-based index
- hash = event.to_hash
- hash['id'].should eq(records[row_num]['tag'])
- hash['ts'].should eq(records[row_num]['time'])
- hash['payload'].should eq(records[row_num].to_json)
+ context 'writing' do
+ context 'as json' do
+
+ describe 'pop no data keys' do
+ it 'should store json in last column' do
+ driver.configure(set_config_value(CONFIG, :pop_data_keys, false))
+ write(driver, SPEC_COLUMN_FAMILY, false)
+ end
+ end
+
+ describe 'pop some data keys' do
+ it 'should store json in last last column' do
+ driver.configure(set_config_value(CONFIG, :pop_data_keys, true))
+ write(driver, SPEC_COLUMN_FAMILY, false)
+ end
+ end
+
+ describe 'pop all data keys' do
+ it 'should store empty string in last column' do
+ driver.configure(CONFIG)
+ write(driver, SPEC_COLUMN_FAMILY, true)
+ end
+ end
+
+ end # context as json
+
+ context 'as columns' do # no need to test popping of keys b/c it makes no difference
+
+ it 'should write' do
+ config = set_config_value(CONFIG, :data_keys, DATA_KEYS + ',a')
+ config = set_config_value(CONFIG, :pop_data_keys, false)
+ driver.configure(config)
+ write(driver, SPEC_COLUMN_FAMILY, false)
+ end
+
+ end # context as columns
+
+ it 'should not locate event after ttl has expired' do
+ time = Time.now.to_i
+ tag = "ttl_test"
+ ttl = 1 # set ttl to 1 second
+
+ driver.configure(set_config_value(CONFIG, :ttl, ttl))
+ driver.emit({'tag' => tag, 'time' => time, 'a' => 1})
+ driver.run
+
+ # verify record... should return in less than one sec if hitting
+ # cassandra running on localhost
+ events = driver.instance.connection.execute("SELECT * FROM #{SPEC_COLUMN_FAMILY} where ts = #{time}")
+ events.rows.should eq(1)
+
+ # now, sleep long enough for the event to be expired from cassandra
+ sleep(ttl + 1)
+
+ # re-query and verify that no events were returned
+ events = driver.instance.connection.execute("SELECT * FROM #{SPEC_COLUMN_FAMILY} where ts = #{time}")
+ events.rows.should eq(0)
  end
- end

- it 'should not locate event after ttl has expired' do
- time = Time.now.to_i
- tag = "ttl_test"
- ttl = 1 # set ttl to 1 second
+ end # context writing

- driver.configure(add_ttl_to_config(ttl))
- driver.emit({'tag' => tag, 'time' => time, 'a' => 1})
- driver.run
-
- # verify record... should return in less than one sec if hitting
- # cassandra running on localhost
- events = driver.instance.connection.execute("SELECT * FROM events where ts = #{time}")
- events.rows.should eq(1)
-
- # now, sleep long enough for the event to be expired from cassandra
- sleep(ttl)
-
- # re-query and verify that no events were returned
- events = driver.instance.connection.execute("SELECT * FROM events where ts = #{time}")
- events.rows.should eq(0)
- end
-
- end
+ end # context logging

  end # CassandraOutput
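
The set_config_value helper above deserves a quick standalone sketch (hypothetical CONFIG, same logic): it picks out the CONFIG line that starts with the given parameter name and substitutes the new value, which is how individual examples override a single setting without duplicating the whole configuration:

    CONFIG = %[
      host 127.0.0.1
      port 9160
      ttl 0
    ]

    def set_config_value(config, config_name, value)
      # locate the existing "name value" line, then swap in the new value
      search_text = config.split("\n").map {|text| text if text.strip!.to_s.start_with? config_name.to_s}.compact![0]
      config.gsub(search_text, "#{config_name} #{value}")
    end

    puts set_config_value(CONFIG, :ttl, 20)   # the ttl line now reads "ttl 20"
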
data/spec/spec_helper.rb CHANGED
@@ -6,4 +6,4 @@ require 'fluent/test'
  Dir["./lib/**/*.rb"].each {|f| require f}

  # require the shared example files
- #Dir["./spec/support/**/*.rb"].each {|f| require f}
+ Dir["./spec/support/**/*.rb"].each {|f| require f}
data/spec/support/helpers.rb ADDED
@@ -0,0 +1,56 @@
+ module Helpers
+
+ def write(driver, column_family_name, tag_and_time_only)
+ tag1 = "test1"
+ tag2 = "test2"
+ time1 = Time.now.to_i
+ time2 = Time.now.to_i + 2
+
+ record1 = {'tag' => tag1, 'time' => time1}
+ record2 = {'tag' => tag2, 'time' => time2}
+
+ unless tag_and_time_only
+ record1.merge!({'a' => 10, 'b' => 'Tesla'})
+ record2.merge!({'a' => 20, 'b' => 'Edison'})
+ end
+
+ # store both records in an array
+ records = [record1, record2]
+
+ driver.emit(records[0])
+ driver.emit(records[1])
+ driver.run # persists to cassandra
+
+ # query cassandra to verify data was correctly persisted
+ row_num = records.count # non-zero based index
+ events = driver.instance.connection.execute("SELECT * FROM #{column_family_name}")
+ events.rows.should eq(records.count)
+ events.fetch do | event | # events should be sorted desc by tag, then time
+ row_num -= 1 # zero-based index
+
+ record = records[row_num]
+ db_hash = event.to_hash
+
+ # need to take in account that we've popped both tag and time
+ # from the payload data when we saved it
+ if driver.instance.pop_data_keys
+ db_hash['id'].should eq(record.delete('tag'))
+ db_hash['ts'].should eq(record.delete('time'))
+ else
+ db_hash['id'].should eq(record['tag'])
+ db_hash['ts'].should eq(record['time'])
+ end
+
+ if driver.instance.schema.keys.count == driver.instance.data_keys.count + 1 # store as json
+ if record.count > 0
+ db_hash['payload'].should eq(record.to_json)
+ else
+ db_hash['payload'].should eq('')
+ end
+ else
+ db_hash['payload'].should eq(record[record.keys[db_hash.keys.index('payload')]])
+ end
+ end
+ end
+
+ end
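
Read together with the plugin's write path, the helper asserts a simple round trip; a hedged illustration with hypothetical values, assuming pop_data_keys is enabled and the spec's three-column schema:

    require 'json'

    # hypothetical emitted fluentd record, shaped like the helper's record1
    emitted = {'tag' => 'test1', 'time' => 1352678400, 'a' => 10, 'b' => 'Tesla'}

    # columns the helper expects to read back from Cassandra
    expected_row = {
      'id'      => emitted.delete('tag'),   # "test1"
      'ts'      => emitted.delete('time'),  # 1352678400
      'payload' => emitted.to_json          # '{"a":10,"b":"Tesla"}'
    }
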
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-cassandra-cql
  version: !ruby/object:Gem::Version
- version: 0.0.1
+ version: 0.0.2
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-11-07 00:00:00.000000000 Z
+ date: 2012-11-11 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: fluentd
@@ -143,6 +143,7 @@ files:
  - spec/cassandra_output_spec.rb
  - spec/spec.opts
  - spec/spec_helper.rb
+ - spec/support/helpers.rb
  homepage: http://github.com/obieq/fluent-plugin-cassandra-cql
  licenses:
  - MIT
@@ -158,7 +159,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
  version: '0'
  segments:
  - 0
- hash: -709806578343991657
+ hash: -4467205590141374709
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements: