quandl_cassandra 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/UPGRADE.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## 0.3.0
2
+
3
+ * Base::Connection methods use #with_connection
4
+ * InsertColumnAttributes uses Batch.insert
5
+ * InsertColumns uses Batch.insert
6
+ * add Quandl::Cassandra::Batch
7
+ * insert_columns_in_batches_with_threads does not care how many rows are sent
8
+
1
9
  ## 0.0.1
2
10
 
3
11
  * Begin
@@ -7,6 +7,7 @@ require "active_support/core_ext/object"
7
7
 
8
8
  require 'cql'
9
9
  require 'scope_composer'
10
+ require 'facter'
10
11
 
11
12
  require 'quandl/logger'
12
13
  require "quandl/data"
@@ -16,6 +17,7 @@ require 'quandl/cassandra/error'
16
17
  require 'quandl/cassandra/types'
17
18
  require 'quandl/cassandra/base'
18
19
  require 'quandl/cassandra/configuration'
20
+ require 'quandl/cassandra/batch'
19
21
 
20
22
  require 'quandl/cassandra_models/column'
21
23
  require 'quandl/cassandra_models/column_attribute'
@@ -12,17 +12,40 @@ module Quandl::Cassandra::Base::Connection
12
12
  end
13
13
 
14
14
  def prepare(statement)
15
- connection.prepare(statement)
15
+ with_connection do |c|
16
+ c.prepare(statement)
17
+ end
16
18
  end
17
19
 
18
20
  def execute_async(statement, query_consistency = nil)
19
21
  query_consistency = consistency unless query_consistency.present?
20
- connection.async.execute( statement, query_consistency )
22
+ with_connection do |c|
23
+ c.async.execute( statement, query_consistency )
24
+ end
21
25
  end
22
26
 
23
- def execute(statement, query_consistency = nil)
24
- query_consistency = consistency unless query_consistency.present?
25
- connection.execute( statement, query_consistency )
27
+ def execute(statement, qconsistency = nil)
28
+ qconsistency = consistency unless qconsistency.present?
29
+ with_connection do |c|
30
+ c.execute( statement, qconsistency )
31
+ end
32
+ end
33
+
34
+ def with_connection(&block)
35
+ begin
36
+ yield(connection)
37
+
38
+ rescue Cql::Io::ConnectionError => e
39
+ Quandl::Logger.error(e)
40
+ reset_connection
41
+ raise Cql::Io::ConnectionError
42
+
43
+ rescue Cql::NotConnectedError => e
44
+ Quandl::Logger.error(e)
45
+ reset_connection
46
+ raise Cql::NotConnectedError
47
+
48
+ end
26
49
  end
27
50
 
28
51
  def connection
@@ -33,6 +56,12 @@ module Quandl::Cassandra::Base::Connection
33
56
  Quandl::Cassandra.configuration.consistency
34
57
  end
35
58
 
59
+ def reset_connection
60
+ connection.close
61
+ @@connection = establish_connection
62
+ true
63
+ end
64
+
36
65
  def establish_connection
37
66
  c = Cql::Client.connect(
38
67
  hosts: Quandl::Cassandra.configuration.hosts,
@@ -10,19 +10,7 @@ module Quandl::Cassandra::Base::Logging
10
10
  Quandl::Logger.debug(statement)
11
11
  super if defined?(super)
12
12
  end
13
-
14
- def execute_async(*args, &block)
15
- statement = args.first.to_s
16
- statement = "#{statement[0..200]} ... #{statement.length} chars" if statement.length > 200
17
- t1 = Time.now
18
- begin
19
- r = super if defined?(super)
20
- ensure
21
- Quandl::Logger.debug("(#{t1.elapsed_ms}) #{statement}")
22
- end
23
- r
24
- end
25
-
13
+
26
14
  def execute(*args, &block)
27
15
  statement = args.first.to_s
28
16
  statement = "#{statement[0..200]} ... #{statement.length} chars" if statement.length > 200
@@ -0,0 +1,9 @@
1
+ require 'quandl/cassandra/batch/logging'
2
+ require 'quandl/cassandra/batch/insert'
3
+
4
+ class Quandl::Cassandra::Batch
5
+
6
+ include Quandl::Cassandra::Batch::Insert
7
+ include Quandl::Cassandra::Batch::Logging if defined?(QUANDL_LOGGER) && QUANDL_LOGGER == true
8
+
9
+ end
@@ -0,0 +1,53 @@
1
+ module Quandl::Cassandra::Batch::Insert
2
+
3
+ extend ActiveSupport::Concern
4
+
5
+ module ClassMethods
6
+
7
+ # Quandl::Cassandra::Batch.insert(rows) do |id, type, time, value|
8
+ # "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
9
+ # end
10
+ def insert(rows, &block)
11
+ insert_in_batches(rows, &block)
12
+ end
13
+
14
+ def batch_size
15
+ Quandl::Cassandra.configuration.batch_size
16
+ end
17
+
18
+
19
+ protected
20
+
21
+ def insert_in_batches_with_threads(rows, &block)
22
+ # split rows into groups by rows_per_thread
23
+ threads = rows.each_slice( rows_per_thread(rows) ).map do |rows_slice|
24
+ Thread.start{ insert_in_batches(rows_slice, &block) }
25
+ end
26
+ threads.each(&:join)
27
+ end
28
+
29
+ def insert_in_batches(rows, &block)
30
+ futures = []
31
+ rows.each_slice( batch_size ).each do |rows_slice|
32
+ statements = rows_slice.collect{|row| block.call( *row ) }
33
+ futures << execute_async_batch(statements)
34
+ end
35
+ futures.collect(&:value)
36
+ end
37
+
38
+ def rows_per_thread(rows)
39
+ r = rows.count / Facter.processorcount.to_i
40
+ r = 1 if r <= 0
41
+ r
42
+ end
43
+
44
+
45
+ private
46
+
47
+ def execute_async_batch(statements)
48
+ batch = %Q{BEGIN UNLOGGED BATCH\n#{statements.join("\n")}\nAPPLY BATCH;}
49
+ future = Quandl::Cassandra::Base.execute_async( batch )
50
+ end
51
+
52
+ end
53
+ end
@@ -0,0 +1,22 @@
1
+ class Quandl::Cassandra::Batch
2
+ module Logging
3
+
4
+ extend ActiveSupport::Concern
5
+
6
+ module ClassMethods
7
+
8
+ def insert(rows, &block)
9
+ # log init
10
+ statement = block.call(rows[0].collect{'?'})
11
+ t1 = Time.now
12
+ # call method
13
+ r = super if defined?(super)
14
+ # log write
15
+ Quandl::Logger.debug("(#{t1.elapsed_ms}) [#{rows.count} rows] BATCH #{statement}")
16
+ r
17
+ end
18
+
19
+ end
20
+
21
+ end
22
+ end
@@ -1,5 +1,5 @@
1
1
  module Quandl
2
2
  module Cassandra
3
- VERSION = '0.2.1'
3
+ VERSION = '0.3.0'
4
4
  end
5
5
  end
@@ -18,39 +18,27 @@ class Quandl::Cassandra::Column::Read::SelectColumns < Quandl::Cassandra::Column
18
18
  end
19
19
 
20
20
  def select_data
21
+ t1 = Time.now
22
+ # fire off the queries
21
23
  prepared = Quandl::Cassandra::Base.prepare( statement )
22
24
  data = {}
25
+ futures = []
23
26
  attributes[:column_ids].each_with_index do | id, index |
24
27
  # pluck column type from collapses
25
28
  type = attributes[:column_collapses][index].to_s
26
29
  # bind and execute query
27
- rows = prepared.execute( id, type, :one )
28
- # collect result
29
- rows.each do |row|
30
+ futures << prepared.async.execute( id, type, Quandl::Cassandra::Base.consistency )
31
+ end
32
+ # collect the results
33
+ futures.each_with_index do |future, index|
34
+ # collect result
35
+ future.value.each do |row|
30
36
  data[row['time']] ||= Array.new( attributes[:column_ids].count )
31
37
  data[row['time']][index] ||= row['value']
32
38
  end
33
39
  end
34
- data.collect(&:flatten)
35
- # # fire off the queries
36
- # futures = []
37
- # attrs[:column_ids].each_with_index do | id, index |
38
- # # pluck column type from collapses
39
- # type = attrs[:column_collapses][index].to_s
40
- # # bind and execute query
41
- # futures << Quandl::Cassandra::Base.connection.execute_async( statement.bind( id, type ) )
42
- # end
43
- # # collect the results
44
- # futures.each_with_index do |future, column_index|
45
- # t1 = Time.now
46
- # rows = JavaDriver::ResultSet.new( future.get_uninterruptibly ).to_a
47
- # rows.each do |row|
48
- # data[row[0]] ||= Array.new(attrs[:column_ids].count)
49
- # data[row[0]][column_index] ||= row[1]
50
- # end
51
- # JCQL::CommonLogger.info "#{cql} (#{attrs[:column_ids][column_index]}) (#{t1.elapsed.microseconds}ms)"
52
- # end
53
- # data
40
+ Quandl::Logger.debug("(#{t1.elapsed_ms}) #{self.class.name}.select_data")
41
+ data
54
42
  end
55
43
 
56
44
  def statement
@@ -5,18 +5,16 @@ class Quandl::Cassandra::Column::Write::InsertColumnAttributes < Quandl::Cassand
5
5
 
6
6
  def perform
7
7
  return if column_ids.blank?
8
- column_ids.each_with_index{|column_id, position|
9
- Quandl::Cassandra::Base.execute( datasets_statement( column_id, position ) )
10
- Quandl::Cassandra::Base.execute( column_attributes_statement( column_id ) )
11
- }
12
- end
13
-
14
- def datasets_statement( column_id, position )
15
- "INSERT INTO datasets (id, column_id, position) VALUES (#{id}, #{column_id}, #{position})"
16
- end
17
-
18
- def column_attributes_statement( column_id )
19
- "INSERT INTO column_attributes ( id, frequency ) VALUES ( #{column_id}, '#{frequency}' )"
8
+ # format data for batch insertion
9
+ rows_values = []
10
+ column_ids.each_with_index do |column_id, position|
11
+ rows_values << [id, column_id, position, frequency]
12
+ end
13
+ # insert data
14
+ Quandl::Cassandra::Batch.insert(rows_values) do |id, column_id, position, frequency|
15
+ %Q{INSERT INTO datasets (id, column_id, position) VALUES (#{id}, #{column_id}, #{position})
16
+ INSERT INTO column_attributes ( id, frequency ) VALUES ( #{column_id}, '#{frequency}' )}
17
+ end
20
18
  end
21
19
 
22
20
  end
@@ -1,30 +1,9 @@
1
1
  class Quandl::Cassandra::Column::Write::InsertColumns < Quandl::Cassandra::Column::Write
2
2
 
3
3
  def perform
4
- insert_columns_in_batches
5
- end
6
-
7
- def insert_columns_in_batches
8
- threads = statement_values.each_slice( statement_values.size / 8 ).map do |threads_slice|
9
- Thread.start do
10
- futures = []
11
- threads_slice.each_slice( Quandl::Cassandra.configuration.batch_size ).each do |batch_slice|
12
- statements = batch_slice.collect{|row| statement( *row ) }
13
- futures << execute_async_batch(statements)
14
- end
15
- futures.collect(&:value)
16
- end
4
+ Quandl::Cassandra::Batch.insert(statement_values) do |id, type, time, value|
5
+ "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
17
6
  end
18
- threads.each(&:join)
19
- end
20
-
21
- def execute_async_batch(statements)
22
- batch = %Q{BEGIN UNLOGGED BATCH\n#{statements.join("\n")}\nAPPLY BATCH;}
23
- future = Quandl::Cassandra::Base.execute_async( batch )
24
- end
25
-
26
- def statement( id, type, time, value )
27
- "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
28
7
  end
29
8
 
30
9
  end
@@ -20,7 +20,7 @@ class Quandl::Cassandra::Dataset < Quandl::Cassandra::Base
20
20
  self.columns[index].assign_attributes(attrs)
21
21
  end
22
22
  end
23
-
23
+
24
24
  def columns
25
25
  @columns ||= column_ids.collect{|cid| Quandl::Cassandra::ColumnAttribute.find(cid) }
26
26
  end
@@ -25,7 +25,8 @@ Gem::Specification.new do |s|
25
25
 
26
26
  s.add_runtime_dependency "activesupport", ">= 3.0.0"
27
27
  s.add_runtime_dependency "activemodel", ">= 3.0.0"
28
-
28
+
29
+ s.add_runtime_dependency "facter", "~> 1.7.3"
29
30
  s.add_runtime_dependency "scope_composer", "~> 0.4"
30
31
  s.add_runtime_dependency "quandl_data", "~> 1.0"
31
32
  s.add_runtime_dependency "quandl_logger", "~> 0.1"
@@ -12,4 +12,12 @@ describe Quandl::Cassandra::Column::Write do
12
12
  data.should eq source_data
13
13
  end
14
14
 
15
+ context "given tiny data array" do
16
+ let(:data){ Quandl::Fabricate::Data.rand( columns: 1, rows: 2, nils: false ) }
17
+ before(:each){ Quandl::Cassandra::Column.write( id: id, data: data ) }
18
+ it "should have written the data" do
19
+ Quandl::Cassandra::Column.read( id: id ).should eq data
20
+ end
21
+ end
22
+
15
23
  end
@@ -41,62 +41,66 @@ describe Quandl::Cassandra::Dataset do
41
41
  its(:table_name){ should eq 'datasets' }
42
42
  end
43
43
 
44
- describe "#save" do
45
-
46
- before(:each){
47
- dataset.data = Quandl::Fabricate::Data.rand(rows: 10, columns: 2, nils: false)
48
- dataset.save
49
- }
44
+ context "given data" do
45
+ before(:each){ dataset.data = Quandl::Fabricate::Data.rand(rows: 10, columns: 2, nils: false) }
46
+
47
+ describe "#save" do
48
+ before(:each){ dataset.save }
50
49
 
51
- subject{ dataset }
50
+ subject{ dataset }
52
51
 
53
- its(:changes){ should be_blank }
52
+ its(:changes){ should be_blank }
53
+ its(:frequency){ should eq 'daily' }
54
54
 
55
- describe ".find" do
56
- subject{ Quandl::Cassandra::Dataset.find(id) }
57
- its(:data){ should eq dataset.data.to_table }
58
- it "data should count and return data" do
59
- subject.data.count.should eq 10
60
- subject.data.to_table.should be_a Quandl::Cassandra::Data
61
- end
62
- it "columns should eq dataset.columns" do
63
- subject.columns.collect{|c| c.id.to_s }.should eq dataset.columns.collect{|c| c.id.to_s }
64
- end
65
- it "column_ids should eq dataset.column_ids" do
66
- subject.column_ids.collect(&:to_s).should eq dataset.column_ids.collect(&:to_s)
55
+ describe ".find" do
56
+ subject{ Quandl::Cassandra::Dataset.find(id) }
57
+ its(:data){ should eq dataset.data.to_table }
58
+ its(:frequency){ should eq 'daily' }
59
+
60
+ it "data should count and return data" do
61
+ subject.data.count.should eq 10
62
+ subject.data.to_table.should be_a Quandl::Cassandra::Data
63
+ end
64
+ it "columns should eq dataset.columns" do
65
+ subject.columns.collect{|c| c.id.to_s }.should eq dataset.columns.collect{|c| c.id.to_s }
66
+ end
67
+ it "column_ids should eq dataset.column_ids" do
68
+ subject.column_ids.collect(&:to_s).should eq dataset.column_ids.collect(&:to_s)
69
+ end
67
70
  end
68
- end
69
71
 
70
- describe "#reload" do
71
- before(:each){
72
- dataset.data.limit(5).to_a
73
- dataset.reload
74
- }
72
+ describe "#reload" do
73
+ before(:each){
74
+ dataset.data.limit(5).to_a
75
+ dataset.reload
76
+ }
75
77
 
76
- describe "#attributes" do
77
- subject{ dataset.attributes }
78
- its([:data]){ should eq nil }
79
- end
78
+ describe "#attributes" do
79
+ subject{ dataset.attributes }
80
+ its([:data]){ should eq nil }
81
+ end
80
82
 
81
- end
83
+ end
82
84
 
83
- describe "#data" do
84
- subject{ dataset.data }
85
- its(:count){ should eq 10 }
86
- end
85
+ describe "#data" do
86
+ subject{ dataset.data }
87
+ its(:count){ should eq 10 }
88
+ end
87
89
 
88
- describe "#column_ids" do
89
- subject{ dataset.column_ids }
90
- its(:count){ should eq 2 }
91
- its(:first){ should be_a Cql::Uuid }
92
- end
90
+ describe "#column_ids" do
91
+ subject{ dataset.column_ids }
92
+ its(:count){ should eq 2 }
93
+ its(:first){ should be_a Cql::Uuid }
94
+ end
93
95
 
94
- describe "#columns" do
95
- subject{ dataset.columns }
96
- its(:count){ should eq 2 }
97
- its(:first){ should be_a Quandl::Cassandra::ColumnAttribute }
98
- end
96
+ describe "#columns" do
97
+ subject{ dataset.columns }
98
+ its(:count){ should eq 2 }
99
+ its(:first){ should be_a Quandl::Cassandra::ColumnAttribute }
100
+ end
99
101
 
102
+ end
103
+
100
104
  end
101
105
 
102
106
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: quandl_cassandra
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-11-03 00:00:00.000000000 Z
12
+ date: 2013-11-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -123,6 +123,22 @@ dependencies:
123
123
  - - '>='
124
124
  - !ruby/object:Gem::Version
125
125
  version: 3.0.0
126
+ - !ruby/object:Gem::Dependency
127
+ name: facter
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ~>
132
+ - !ruby/object:Gem::Version
133
+ version: 1.7.3
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ~>
140
+ - !ruby/object:Gem::Version
141
+ version: 1.7.3
126
142
  - !ruby/object:Gem::Dependency
127
143
  name: scope_composer
128
144
  requirement: !ruby/object:Gem::Requirement
@@ -227,6 +243,9 @@ files:
227
243
  - lib/quandl/cassandra/base/sanitization.rb
228
244
  - lib/quandl/cassandra/base/schema.rb
229
245
  - lib/quandl/cassandra/base/scoping.rb
246
+ - lib/quandl/cassandra/batch.rb
247
+ - lib/quandl/cassandra/batch/insert.rb
248
+ - lib/quandl/cassandra/batch/logging.rb
230
249
  - lib/quandl/cassandra/configuration.rb
231
250
  - lib/quandl/cassandra/error.rb
232
251
  - lib/quandl/cassandra/types.rb