quandl_cassandra 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -4,4 +4,5 @@
4
4
  quandl_cassandra-*
5
5
  .rvmrc
6
6
  *.gem
7
- *.log
7
+ *.log
8
+ log/*
@@ -21,7 +21,7 @@ module Cassandra
21
21
  def initialize
22
22
  @hosts = ['localhost']
23
23
  @consistency = :all
24
- @batch_size = 35
24
+ @batch_size = 35 * 4
25
25
  end
26
26
 
27
27
  def to_h
@@ -1,5 +1,5 @@
1
1
  module Quandl
2
2
  module Cassandra
3
- VERSION = '0.2.0'
3
+ VERSION = '0.2.1'
4
4
  end
5
5
  end
@@ -1,7 +1,7 @@
1
1
  class Quandl::Cassandra::Column::Write < Quandl::Strategy::Strategize
2
2
 
3
3
  # strategy attributes
4
- define_attributes :id, :data, :frequency, :column_ids, :frequency_data, :frequency_column_data
4
+ define_attributes :id, :data, :frequency, :column_ids, :frequency_data, :frequency_column_data, :statement_values
5
5
 
6
6
  require_relative 'write/insert_columns'
7
7
  require_relative 'write/insert_column_attributes'
@@ -7,36 +7,31 @@ class Quandl::Cassandra::Column::Write::GroupDataByColumn < Quandl::Cassandra::C
7
7
  # { source: { UUID: [[1,2], [2,4]], UUID: [[1,3],[2,8]] }}
8
8
 
9
9
  def perform
10
- self.frequency_column_data = {}
11
- # for each { frequency: [ [12,3,4], ... ] }
12
- frequency_data.each do |frequency, data|
13
- # assign grouped data to frequency_column_data
14
- self.frequency_column_data[frequency] = group_data_by_column(data)
15
- end
10
+ group_by_statement_values
16
11
  end
17
12
 
18
- def group_data_by_column(data)
19
- column_data = {}
13
+ def group_by_statement_values
20
14
  # for each [ [date, val, val], ... ]
21
- data.each do |row|
22
- # extract date
23
- date = row[0]
24
- # for each [ val, val, ... ]
25
- row[1..-1].each_with_index do |value, index|
26
- # ensure array
27
- column_data[ column_id(index) ] ||= []
28
- # group each each [date, value] by column_id, excluding nil
29
- column_data[ column_id(index) ] << [date, value] unless value.blank?
15
+ self.statement_values = []
16
+ frequency_data.each do |frequency, rows|
17
+ frequency = frequency.to_s
18
+ rows.each do |row|
19
+ # extract date
20
+ date = row[0]
21
+ # for each [ val, val, ... ]
22
+ row[1..-1].each_with_index do |value, index|
23
+ # ensure array
24
+ self.statement_values << [ column_id(index), frequency, date, value] unless value.blank?
25
+ end
30
26
  end
31
27
  end
32
- column_data
33
28
  end
34
29
 
35
30
  def column_id(index)
36
31
  # ensure column_ids is defined
37
32
  self.column_ids ||= Quandl::Cassandra::Dataset.find_column_ids_by_id(id)
38
33
  # ensure column_ids[index] is present
39
- self.column_ids[index] ||= SecureRandom.uuid
34
+ self.column_ids[index] ||= Cql::Uuid.new(SecureRandom.uuid)
40
35
  end
41
36
 
42
37
  end
@@ -1,30 +1,21 @@
1
1
  class Quandl::Cassandra::Column::Write::InsertColumns < Quandl::Cassandra::Column::Write
2
2
 
3
3
  def perform
4
- insert_data_in_batches.collect(&:value)
4
+ insert_columns_in_batches
5
5
  end
6
6
 
7
- def insert_data_in_batches
8
- futures = []
9
- statements = []
10
- frequency_column_data.each do |frequency, column_data|
11
- column_data.each do |column_id, rows|
12
- rows.each do |time_value|
13
- # collect statements
14
- statements << statement( column_id, frequency, time_value[0], time_value[1] )
15
- # after 30 statements are collected, execute a batch insert
16
- if statements.count >= Quandl::Cassandra.configuration.batch_size
17
- # collect the futures
18
- futures << execute_async_batch(statements)
19
- # clear statements
20
- statements = []
21
- end
7
+ def insert_columns_in_batches
8
+ threads = statement_values.each_slice( statement_values.size / 8 ).map do |threads_slice|
9
+ Thread.start do
10
+ futures = []
11
+ threads_slice.each_slice( Quandl::Cassandra.configuration.batch_size ).each do |batch_slice|
12
+ statements = batch_slice.collect{|row| statement( *row ) }
13
+ futures << execute_async_batch(statements)
22
14
  end
15
+ futures.collect(&:value)
23
16
  end
24
17
  end
25
- # execute any remaining statements
26
- futures << execute_async_batch(statements) if statements.count > 0
27
- futures
18
+ threads.each(&:join)
28
19
  end
29
20
 
30
21
  def execute_async_batch(statements)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: quandl_cassandra
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-11-02 00:00:00.000000000 Z
12
+ date: 2013-11-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake