quandl_cassandra 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -4,4 +4,5 @@
4
4
  quandl_cassandra-*
5
5
  .rvmrc
6
6
  *.gem
7
- *.log
7
+ *.log
8
+ log/*
@@ -21,7 +21,7 @@ module Cassandra
21
21
  def initialize
22
22
  @hosts = ['localhost']
23
23
  @consistency = :all
24
- @batch_size = 35
24
+ @batch_size = 35 * 4
25
25
  end
26
26
 
27
27
  def to_h
@@ -1,5 +1,5 @@
1
1
  module Quandl
2
2
  module Cassandra
3
- VERSION = '0.2.0'
3
+ VERSION = '0.2.1'
4
4
  end
5
5
  end
@@ -1,7 +1,7 @@
1
1
  class Quandl::Cassandra::Column::Write < Quandl::Strategy::Strategize
2
2
 
3
3
  # strategy attributes
4
- define_attributes :id, :data, :frequency, :column_ids, :frequency_data, :frequency_column_data
4
+ define_attributes :id, :data, :frequency, :column_ids, :frequency_data, :frequency_column_data, :statement_values
5
5
 
6
6
  require_relative 'write/insert_columns'
7
7
  require_relative 'write/insert_column_attributes'
@@ -7,36 +7,31 @@ class Quandl::Cassandra::Column::Write::GroupDataByColumn < Quandl::Cassandra::C
7
7
  # { source: { UUID: [[1,2], [2,4]], UUID: [[1,3],[2,8]] }}
8
8
 
9
9
  def perform
10
- self.frequency_column_data = {}
11
- # for each { frequency: [ [12,3,4], ... ] }
12
- frequency_data.each do |frequency, data|
13
- # assign grouped data to frequency_column_data
14
- self.frequency_column_data[frequency] = group_data_by_column(data)
15
- end
10
+ group_by_statement_values
16
11
  end
17
12
 
18
- def group_data_by_column(data)
19
- column_data = {}
13
+ def group_by_statement_values
20
14
  # for each [ [date, val, val], ... ]
21
- data.each do |row|
22
- # extract date
23
- date = row[0]
24
- # for each [ val, val, ... ]
25
- row[1..-1].each_with_index do |value, index|
26
- # ensure array
27
- column_data[ column_id(index) ] ||= []
28
- # group each each [date, value] by column_id, excluding nil
29
- column_data[ column_id(index) ] << [date, value] unless value.blank?
15
+ self.statement_values = []
16
+ frequency_data.each do |frequency, rows|
17
+ frequency = frequency.to_s
18
+ rows.each do |row|
19
+ # extract date
20
+ date = row[0]
21
+ # for each [ val, val, ... ]
22
+ row[1..-1].each_with_index do |value, index|
23
+ # ensure array
24
+ self.statement_values << [ column_id(index), frequency, date, value] unless value.blank?
25
+ end
30
26
  end
31
27
  end
32
- column_data
33
28
  end
34
29
 
35
30
  def column_id(index)
36
31
  # ensure column_ids is defined
37
32
  self.column_ids ||= Quandl::Cassandra::Dataset.find_column_ids_by_id(id)
38
33
  # ensure column_ids[index] is present
39
- self.column_ids[index] ||= SecureRandom.uuid
34
+ self.column_ids[index] ||= Cql::Uuid.new(SecureRandom.uuid)
40
35
  end
41
36
 
42
37
  end
@@ -1,30 +1,21 @@
1
1
  class Quandl::Cassandra::Column::Write::InsertColumns < Quandl::Cassandra::Column::Write
2
2
 
3
3
  def perform
4
- insert_data_in_batches.collect(&:value)
4
+ insert_columns_in_batches
5
5
  end
6
6
 
7
- def insert_data_in_batches
8
- futures = []
9
- statements = []
10
- frequency_column_data.each do |frequency, column_data|
11
- column_data.each do |column_id, rows|
12
- rows.each do |time_value|
13
- # collect statements
14
- statements << statement( column_id, frequency, time_value[0], time_value[1] )
15
- # after 30 statements are collected, execute a batch insert
16
- if statements.count >= Quandl::Cassandra.configuration.batch_size
17
- # collect the futures
18
- futures << execute_async_batch(statements)
19
- # clear statements
20
- statements = []
21
- end
7
+ def insert_columns_in_batches
8
+ threads = statement_values.each_slice( statement_values.size / 8 ).map do |threads_slice|
9
+ Thread.start do
10
+ futures = []
11
+ threads_slice.each_slice( Quandl::Cassandra.configuration.batch_size ).each do |batch_slice|
12
+ statements = batch_slice.collect{|row| statement( *row ) }
13
+ futures << execute_async_batch(statements)
22
14
  end
15
+ futures.collect(&:value)
23
16
  end
24
17
  end
25
- # execute any remaining statements
26
- futures << execute_async_batch(statements) if statements.count > 0
27
- futures
18
+ threads.each(&:join)
28
19
  end
29
20
 
30
21
  def execute_async_batch(statements)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: quandl_cassandra
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-11-02 00:00:00.000000000 Z
12
+ date: 2013-11-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake