quandl_cassandra 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/UPGRADE.md CHANGED
@@ -1,3 +1,11 @@
1
+ ## 0.3.0
2
+
3
+ * Base::Connection methods use #with_connection
4
+ * InsertColumnAttributes uses Batch.insert
5
+ * InsertColumns uses Batch.insert
6
+ * add Quandl::Cassandra::Batch
7
+ * insert_columns_in_batches_with_threads does not care how many rows are sent
8
+
1
9
  ## 0.0.1
2
10
 
3
11
  * Begin
@@ -7,6 +7,7 @@ require "active_support/core_ext/object"
7
7
 
8
8
  require 'cql'
9
9
  require 'scope_composer'
10
+ require 'facter'
10
11
 
11
12
  require 'quandl/logger'
12
13
  require "quandl/data"
@@ -16,6 +17,7 @@ require 'quandl/cassandra/error'
16
17
  require 'quandl/cassandra/types'
17
18
  require 'quandl/cassandra/base'
18
19
  require 'quandl/cassandra/configuration'
20
+ require 'quandl/cassandra/batch'
19
21
 
20
22
  require 'quandl/cassandra_models/column'
21
23
  require 'quandl/cassandra_models/column_attribute'
@@ -12,17 +12,40 @@ module Quandl::Cassandra::Base::Connection
12
12
  end
13
13
 
14
14
  def prepare(statement)
15
- connection.prepare(statement)
15
+ with_connection do |c|
16
+ c.prepare(statement)
17
+ end
16
18
  end
17
19
 
18
20
  def execute_async(statement, query_consistency = nil)
19
21
  query_consistency = consistency unless query_consistency.present?
20
- connection.async.execute( statement, query_consistency )
22
+ with_connection do |c|
23
+ c.async.execute( statement, query_consistency )
24
+ end
21
25
  end
22
26
 
23
- def execute(statement, query_consistency = nil)
24
- query_consistency = consistency unless query_consistency.present?
25
- connection.execute( statement, query_consistency )
27
+ def execute(statement, qconsistency = nil)
28
+ qconsistency = consistency unless qconsistency.present?
29
+ with_connection do |c|
30
+ c.execute( statement, qconsistency )
31
+ end
32
+ end
33
+
34
+ def with_connection(&block)
35
+ begin
36
+ yield(connection)
37
+
38
+ rescue Cql::Io::ConnectionError => e
39
+ Quandl::Logger.error(e)
40
+ reset_connection
41
+ raise Cql::Io::ConnectionError
42
+
43
+ rescue Cql::NotConnectedError => e
44
+ Quandl::Logger.error(e)
45
+ reset_connection
46
+ raise Cql::NotConnectedError
47
+
48
+ end
26
49
  end
27
50
 
28
51
  def connection
@@ -33,6 +56,12 @@ module Quandl::Cassandra::Base::Connection
33
56
  Quandl::Cassandra.configuration.consistency
34
57
  end
35
58
 
59
+ def reset_connection
60
+ connection.close
61
+ @@connection = establish_connection
62
+ true
63
+ end
64
+
36
65
  def establish_connection
37
66
  c = Cql::Client.connect(
38
67
  hosts: Quandl::Cassandra.configuration.hosts,
@@ -10,19 +10,7 @@ module Quandl::Cassandra::Base::Logging
10
10
  Quandl::Logger.debug(statement)
11
11
  super if defined?(super)
12
12
  end
13
-
14
- def execute_async(*args, &block)
15
- statement = args.first.to_s
16
- statement = "#{statement[0..200]} ... #{statement.length} chars" if statement.length > 200
17
- t1 = Time.now
18
- begin
19
- r = super if defined?(super)
20
- ensure
21
- Quandl::Logger.debug("(#{t1.elapsed_ms}) #{statement}")
22
- end
23
- r
24
- end
25
-
13
+
26
14
  def execute(*args, &block)
27
15
  statement = args.first.to_s
28
16
  statement = "#{statement[0..200]} ... #{statement.length} chars" if statement.length > 200
@@ -0,0 +1,9 @@
1
+ require 'quandl/cassandra/batch/logging'
2
+ require 'quandl/cassandra/batch/insert'
3
+
4
+ class Quandl::Cassandra::Batch
5
+
6
+ include Quandl::Cassandra::Batch::Insert
7
+ include Quandl::Cassandra::Batch::Logging if defined?(QUANDL_LOGGER) && QUANDL_LOGGER == true
8
+
9
+ end
@@ -0,0 +1,53 @@
1
+ module Quandl::Cassandra::Batch::Insert
2
+
3
+ extend ActiveSupport::Concern
4
+
5
+ module ClassMethods
6
+
7
+ # Quandl::Cassandra::Batch.insert(rows) do |id, type, time, value|
8
+ # "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
9
+ # end
10
+ def insert(rows, &block)
11
+ insert_in_batches(rows, &block)
12
+ end
13
+
14
+ def batch_size
15
+ Quandl::Cassandra.configuration.batch_size
16
+ end
17
+
18
+
19
+ protected
20
+
21
+ def insert_in_batches_with_threads(rows, &block)
22
+ # split rows into groups by rows_per_thread
23
+ threads = rows.each_slice( rows_per_thread(rows) ).map do |rows_slice|
24
+ Thread.start{ insert_in_batches(rows_slice, &block) }
25
+ end
26
+ threads.each(&:join)
27
+ end
28
+
29
+ def insert_in_batches(rows, &block)
30
+ futures = []
31
+ rows.each_slice( batch_size ).each do |rows_slice|
32
+ statements = rows_slice.collect{|row| block.call( *row ) }
33
+ futures << execute_async_batch(statements)
34
+ end
35
+ futures.collect(&:value)
36
+ end
37
+
38
+ def rows_per_thread(rows)
39
+ r = rows.count / Facter.processorcount.to_i
40
+ r = 1 if r <= 0
41
+ r
42
+ end
43
+
44
+
45
+ private
46
+
47
+ def execute_async_batch(statements)
48
+ batch = %Q{BEGIN UNLOGGED BATCH\n#{statements.join("\n")}\nAPPLY BATCH;}
49
+ future = Quandl::Cassandra::Base.execute_async( batch )
50
+ end
51
+
52
+ end
53
+ end
@@ -0,0 +1,22 @@
1
+ class Quandl::Cassandra::Batch
2
+ module Logging
3
+
4
+ extend ActiveSupport::Concern
5
+
6
+ module ClassMethods
7
+
8
+ def insert(rows, &block)
9
+ # log init
10
+ statement = block.call(rows[0].collect{'?'})
11
+ t1 = Time.now
12
+ # call method
13
+ r = super if defined?(super)
14
+ # log write
15
+ Quandl::Logger.debug("(#{t1.elapsed_ms}) [#{rows.count} rows] BATCH #{statement}")
16
+ r
17
+ end
18
+
19
+ end
20
+
21
+ end
22
+ end
@@ -1,5 +1,5 @@
1
1
  module Quandl
2
2
  module Cassandra
3
- VERSION = '0.2.1'
3
+ VERSION = '0.3.0'
4
4
  end
5
5
  end
@@ -18,39 +18,27 @@ class Quandl::Cassandra::Column::Read::SelectColumns < Quandl::Cassandra::Column
18
18
  end
19
19
 
20
20
  def select_data
21
+ t1 = Time.now
22
+ # fire off the queries
21
23
  prepared = Quandl::Cassandra::Base.prepare( statement )
22
24
  data = {}
25
+ futures = []
23
26
  attributes[:column_ids].each_with_index do | id, index |
24
27
  # pluck column type from collapses
25
28
  type = attributes[:column_collapses][index].to_s
26
29
  # bind and execute query
27
- rows = prepared.execute( id, type, :one )
28
- # collect result
29
- rows.each do |row|
30
+ futures << prepared.async.execute( id, type, Quandl::Cassandra::Base.consistency )
31
+ end
32
+ # collect the results
33
+ futures.each_with_index do |future, index|
34
+ # collect result
35
+ future.value.each do |row|
30
36
  data[row['time']] ||= Array.new( attributes[:column_ids].count )
31
37
  data[row['time']][index] ||= row['value']
32
38
  end
33
39
  end
34
- data.collect(&:flatten)
35
- # # fire off the queries
36
- # futures = []
37
- # attrs[:column_ids].each_with_index do | id, index |
38
- # # pluck column type from collapses
39
- # type = attrs[:column_collapses][index].to_s
40
- # # bind and execute query
41
- # futures << Quandl::Cassandra::Base.connection.execute_async( statement.bind( id, type ) )
42
- # end
43
- # # collect the results
44
- # futures.each_with_index do |future, column_index|
45
- # t1 = Time.now
46
- # rows = JavaDriver::ResultSet.new( future.get_uninterruptibly ).to_a
47
- # rows.each do |row|
48
- # data[row[0]] ||= Array.new(attrs[:column_ids].count)
49
- # data[row[0]][column_index] ||= row[1]
50
- # end
51
- # JCQL::CommonLogger.info "#{cql} (#{attrs[:column_ids][column_index]}) (#{t1.elapsed.microseconds}ms)"
52
- # end
53
- # data
40
+ Quandl::Logger.debug("(#{t1.elapsed_ms}) #{self.class.name}.select_data")
41
+ data
54
42
  end
55
43
 
56
44
  def statement
@@ -5,18 +5,16 @@ class Quandl::Cassandra::Column::Write::InsertColumnAttributes < Quandl::Cassand
5
5
 
6
6
  def perform
7
7
  return if column_ids.blank?
8
- column_ids.each_with_index{|column_id, position|
9
- Quandl::Cassandra::Base.execute( datasets_statement( column_id, position ) )
10
- Quandl::Cassandra::Base.execute( column_attributes_statement( column_id ) )
11
- }
12
- end
13
-
14
- def datasets_statement( column_id, position )
15
- "INSERT INTO datasets (id, column_id, position) VALUES (#{id}, #{column_id}, #{position})"
16
- end
17
-
18
- def column_attributes_statement( column_id )
19
- "INSERT INTO column_attributes ( id, frequency ) VALUES ( #{column_id}, '#{frequency}' )"
8
+ # format data for batch insertion
9
+ rows_values = []
10
+ column_ids.each_with_index do |column_id, position|
11
+ rows_values << [id, column_id, position, frequency]
12
+ end
13
+ # insert data
14
+ Quandl::Cassandra::Batch.insert(rows_values) do |id, column_id, position, frequency|
15
+ %Q{INSERT INTO datasets (id, column_id, position) VALUES (#{id}, #{column_id}, #{position})
16
+ INSERT INTO column_attributes ( id, frequency ) VALUES ( #{column_id}, '#{frequency}' )}
17
+ end
20
18
  end
21
19
 
22
20
  end
@@ -1,30 +1,9 @@
1
1
  class Quandl::Cassandra::Column::Write::InsertColumns < Quandl::Cassandra::Column::Write
2
2
 
3
3
  def perform
4
- insert_columns_in_batches
5
- end
6
-
7
- def insert_columns_in_batches
8
- threads = statement_values.each_slice( statement_values.size / 8 ).map do |threads_slice|
9
- Thread.start do
10
- futures = []
11
- threads_slice.each_slice( Quandl::Cassandra.configuration.batch_size ).each do |batch_slice|
12
- statements = batch_slice.collect{|row| statement( *row ) }
13
- futures << execute_async_batch(statements)
14
- end
15
- futures.collect(&:value)
16
- end
4
+ Quandl::Cassandra::Batch.insert(statement_values) do |id, type, time, value|
5
+ "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
17
6
  end
18
- threads.each(&:join)
19
- end
20
-
21
- def execute_async_batch(statements)
22
- batch = %Q{BEGIN UNLOGGED BATCH\n#{statements.join("\n")}\nAPPLY BATCH;}
23
- future = Quandl::Cassandra::Base.execute_async( batch )
24
- end
25
-
26
- def statement( id, type, time, value )
27
- "INSERT INTO columns (id, type, time, value) VALUES (#{id}, '#{type}', #{time}, #{value})"
28
7
  end
29
8
 
30
9
  end
@@ -20,7 +20,7 @@ class Quandl::Cassandra::Dataset < Quandl::Cassandra::Base
20
20
  self.columns[index].assign_attributes(attrs)
21
21
  end
22
22
  end
23
-
23
+
24
24
  def columns
25
25
  @columns ||= column_ids.collect{|cid| Quandl::Cassandra::ColumnAttribute.find(cid) }
26
26
  end
@@ -25,7 +25,8 @@ Gem::Specification.new do |s|
25
25
 
26
26
  s.add_runtime_dependency "activesupport", ">= 3.0.0"
27
27
  s.add_runtime_dependency "activemodel", ">= 3.0.0"
28
-
28
+
29
+ s.add_runtime_dependency "facter", "~> 1.7.3"
29
30
  s.add_runtime_dependency "scope_composer", "~> 0.4"
30
31
  s.add_runtime_dependency "quandl_data", "~> 1.0"
31
32
  s.add_runtime_dependency "quandl_logger", "~> 0.1"
@@ -12,4 +12,12 @@ describe Quandl::Cassandra::Column::Write do
12
12
  data.should eq source_data
13
13
  end
14
14
 
15
+ context "given tiny data array" do
16
+ let(:data){ Quandl::Fabricate::Data.rand( columns: 1, rows: 2, nils: false ) }
17
+ before(:each){ Quandl::Cassandra::Column.write( id: id, data: data ) }
18
+ it "should have written the data" do
19
+ Quandl::Cassandra::Column.read( id: id ).should eq data
20
+ end
21
+ end
22
+
15
23
  end
@@ -41,62 +41,66 @@ describe Quandl::Cassandra::Dataset do
41
41
  its(:table_name){ should eq 'datasets' }
42
42
  end
43
43
 
44
- describe "#save" do
45
-
46
- before(:each){
47
- dataset.data = Quandl::Fabricate::Data.rand(rows: 10, columns: 2, nils: false)
48
- dataset.save
49
- }
44
+ context "given data" do
45
+ before(:each){ dataset.data = Quandl::Fabricate::Data.rand(rows: 10, columns: 2, nils: false) }
46
+
47
+ describe "#save" do
48
+ before(:each){ dataset.save }
50
49
 
51
- subject{ dataset }
50
+ subject{ dataset }
52
51
 
53
- its(:changes){ should be_blank }
52
+ its(:changes){ should be_blank }
53
+ its(:frequency){ should eq 'daily' }
54
54
 
55
- describe ".find" do
56
- subject{ Quandl::Cassandra::Dataset.find(id) }
57
- its(:data){ should eq dataset.data.to_table }
58
- it "data should count and return data" do
59
- subject.data.count.should eq 10
60
- subject.data.to_table.should be_a Quandl::Cassandra::Data
61
- end
62
- it "columns should eq dataset.columns" do
63
- subject.columns.collect{|c| c.id.to_s }.should eq dataset.columns.collect{|c| c.id.to_s }
64
- end
65
- it "column_ids should eq dataset.column_ids" do
66
- subject.column_ids.collect(&:to_s).should eq dataset.column_ids.collect(&:to_s)
55
+ describe ".find" do
56
+ subject{ Quandl::Cassandra::Dataset.find(id) }
57
+ its(:data){ should eq dataset.data.to_table }
58
+ its(:frequency){ should eq 'daily' }
59
+
60
+ it "data should count and return data" do
61
+ subject.data.count.should eq 10
62
+ subject.data.to_table.should be_a Quandl::Cassandra::Data
63
+ end
64
+ it "columns should eq dataset.columns" do
65
+ subject.columns.collect{|c| c.id.to_s }.should eq dataset.columns.collect{|c| c.id.to_s }
66
+ end
67
+ it "column_ids should eq dataset.column_ids" do
68
+ subject.column_ids.collect(&:to_s).should eq dataset.column_ids.collect(&:to_s)
69
+ end
67
70
  end
68
- end
69
71
 
70
- describe "#reload" do
71
- before(:each){
72
- dataset.data.limit(5).to_a
73
- dataset.reload
74
- }
72
+ describe "#reload" do
73
+ before(:each){
74
+ dataset.data.limit(5).to_a
75
+ dataset.reload
76
+ }
75
77
 
76
- describe "#attributes" do
77
- subject{ dataset.attributes }
78
- its([:data]){ should eq nil }
79
- end
78
+ describe "#attributes" do
79
+ subject{ dataset.attributes }
80
+ its([:data]){ should eq nil }
81
+ end
80
82
 
81
- end
83
+ end
82
84
 
83
- describe "#data" do
84
- subject{ dataset.data }
85
- its(:count){ should eq 10 }
86
- end
85
+ describe "#data" do
86
+ subject{ dataset.data }
87
+ its(:count){ should eq 10 }
88
+ end
87
89
 
88
- describe "#column_ids" do
89
- subject{ dataset.column_ids }
90
- its(:count){ should eq 2 }
91
- its(:first){ should be_a Cql::Uuid }
92
- end
90
+ describe "#column_ids" do
91
+ subject{ dataset.column_ids }
92
+ its(:count){ should eq 2 }
93
+ its(:first){ should be_a Cql::Uuid }
94
+ end
93
95
 
94
- describe "#columns" do
95
- subject{ dataset.columns }
96
- its(:count){ should eq 2 }
97
- its(:first){ should be_a Quandl::Cassandra::ColumnAttribute }
98
- end
96
+ describe "#columns" do
97
+ subject{ dataset.columns }
98
+ its(:count){ should eq 2 }
99
+ its(:first){ should be_a Quandl::Cassandra::ColumnAttribute }
100
+ end
99
101
 
102
+ end
103
+
100
104
  end
101
105
 
102
106
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: quandl_cassandra
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-11-03 00:00:00.000000000 Z
12
+ date: 2013-11-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -123,6 +123,22 @@ dependencies:
123
123
  - - '>='
124
124
  - !ruby/object:Gem::Version
125
125
  version: 3.0.0
126
+ - !ruby/object:Gem::Dependency
127
+ name: facter
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ~>
132
+ - !ruby/object:Gem::Version
133
+ version: 1.7.3
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ~>
140
+ - !ruby/object:Gem::Version
141
+ version: 1.7.3
126
142
  - !ruby/object:Gem::Dependency
127
143
  name: scope_composer
128
144
  requirement: !ruby/object:Gem::Requirement
@@ -227,6 +243,9 @@ files:
227
243
  - lib/quandl/cassandra/base/sanitization.rb
228
244
  - lib/quandl/cassandra/base/schema.rb
229
245
  - lib/quandl/cassandra/base/scoping.rb
246
+ - lib/quandl/cassandra/batch.rb
247
+ - lib/quandl/cassandra/batch/insert.rb
248
+ - lib/quandl/cassandra/batch/logging.rb
230
249
  - lib/quandl/cassandra/configuration.rb
231
250
  - lib/quandl/cassandra/error.rb
232
251
  - lib/quandl/cassandra/types.rb