taps 0.3.11 → 0.3.12

This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
data/lib/taps/config.rb CHANGED
@@ -5,43 +5,43 @@ require 'yaml'
 Sequel.datetime_class = DateTime
 
 module Taps
-  def self.version_yml
-    @@version_yml ||= YAML.load(File.read(File.dirname(__FILE__) + '/../../VERSION.yml'))
-  end
-
-  def self.version
-    version = "#{version_yml[:major]}.#{version_yml[:minor]}.#{version_yml[:patch]}"
-    version += ".#{version_yml[:build]}" if version_yml[:build]
-    version
-  end
-
-  def self.compatible_version
-    "#{version_yml[:major]}.#{version_yml[:minor]}"
-  end
-
-  def self.exiting=(val)
-    @@exiting = val
-  end
-
-  def exiting?
-    (@@exiting ||= false) == true
-  end
-
-  class Config
-    class << self
-      attr_accessor :taps_database_url
-      attr_accessor :login, :password, :database_url, :remote_url
-      attr_accessor :chunksize
-
-      def verify_database_url(db_url=nil)
-        db_url ||= self.database_url
-        db = Sequel.connect(db_url)
-        db.tables
-        db.disconnect
-      rescue Object => e
-        puts "Failed to connect to database:\n #{e.class} -> #{e}"
-        exit 1
-      end
-    end
-  end
+  def self.version_yml
+    @@version_yml ||= YAML.load(File.read(File.dirname(__FILE__) + '/../../VERSION.yml'))
+  end
+
+  def self.version
+    version = "#{version_yml[:major]}.#{version_yml[:minor]}.#{version_yml[:patch]}"
+    version += ".#{version_yml[:build]}" if version_yml[:build]
+    version
+  end
+
+  def self.compatible_version
+    "#{version_yml[:major]}.#{version_yml[:minor]}"
+  end
+
+  def self.exiting=(val)
+    @@exiting = val
+  end
+
+  def exiting?
+    (@@exiting ||= false) == true
+  end
+
+  class Config
+    class << self
+      attr_accessor :taps_database_url
+      attr_accessor :login, :password, :database_url, :remote_url
+      attr_accessor :chunksize
+
+      def verify_database_url(db_url=nil)
+        db_url ||= self.database_url
+        db = Sequel.connect(db_url)
+        db.tables
+        db.disconnect
+      rescue Object => e
+        puts "Failed to connect to database:\n #{e.class} -> #{e}"
+        exit 1
+      end
+    end
+  end
 end
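
The hunk above replaces the whole module body with visibly identical code; functionally this is still the gem's configuration surface: Taps.version is assembled from VERSION.yml, and Taps::Config.verify_database_url opens a short-lived Sequel connection purely to prove the URL works, exiting the process with status 1 if it does not. A minimal usage sketch follows; the require path and the sqlite URL are assumptions for illustration, not part of this release:

    require 'taps/config'

    # hypothetical connection settings; any Sequel-compatible URL would do
    Taps::Config.database_url = 'sqlite://taps_example.db'
    Taps::Config.chunksize    = 1000

    puts Taps.version             # built from VERSION.yml, e.g. "0.3.12"
    puts Taps.compatible_version  # major.minor only

    # connects, lists tables, disconnects; calls exit 1 on any failure
    Taps::Config.verify_database_url
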
@@ -7,301 +7,301 @@ require 'json/pure'
 module Taps
 
   class DataStream
-    class CorruptedData < Exception; end
-
-    DEFAULT_CHUNKSIZE = 1000
-
-    attr_reader :db, :state
-
-    def initialize(db, state)
-      @db = db
-      @state = {
-        :offset => 0,
-        :avg_chunksize => 0,
-        :num_chunksize => 0,
-        :total_chunksize => 0,
-      }.merge(state)
-      @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-      @complete = false
-    end
-
-    def log
-      Taps.log
-    end
-
-    def error=(val)
-      state[:error] = val
-    end
-
-    def error
-      state[:error] || false
-    end
-
-    def table_name
-      state[:table_name].to_sym
-    end
-
-    def table_name_sql
-      table_name.identifier
-    end
-
-    def to_hash
-      state.merge(:klass => self.class.to_s)
-    end
-
-    def to_json
-      to_hash.to_json
-    end
-
-    def string_columns
-      @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
-    end
-
-    def table
-      @table ||= db[table_name_sql]
-    end
-
-    def order_by(name=nil)
-      @order_by ||= begin
-        name ||= table_name
-        Taps::Utils.order_by(db, name)
-      end
-    end
-
-    def increment(row_count)
-      state[:offset] += row_count
-    end
-
-    # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
-    # goes below 100 or maybe if offset is > 1000
-    def fetch_rows
-      state[:chunksize] = fetch_chunksize
-      ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
-      log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
-      rows = Taps::Utils.format_data(ds.all,
-        :string_columns => string_columns)
-      update_chunksize_stats
-      rows
-    end
-
-    def max_chunksize_training
-      20
-    end
-
-    def fetch_chunksize
-      chunksize = state[:chunksize]
-      return chunksize if state[:num_chunksize] < max_chunksize_training
-      return chunksize if state[:avg_chunksize] == 0
-      return chunksize if state[:error]
-      state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
-    end
-
-    def update_chunksize_stats
-      return if state[:num_chunksize] >= max_chunksize_training
-      state[:total_chunksize] += state[:chunksize]
-      state[:num_chunksize] += 1
-      state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
-    end
-
-    def encode_rows(rows)
-      Taps::Utils.base64encode(Marshal.dump(rows))
-    end
-
-    def fetch
-      log.debug "DataStream#fetch state -> #{state.inspect}"
-
-      t1 = Time.now
-      rows = fetch_rows
-      encoded_data = encode_rows(rows)
-      t2 = Time.now
-      elapsed_time = t2 - t1
-
-      @complete = rows == { }
-
-      [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
-    end
-
-    def complete?
-      @complete
-    end
-
-    def fetch_remote(resource, headers)
-      params = fetch_from_resource(resource, headers)
-      encoded_data = params[:encoded_data]
-      json = params[:json]
-
-      rows = parse_encoded_data(encoded_data, json[:checksum])
-      @complete = rows == { }
-
-      # update local state
-      state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
-
-      unless @complete
-        import_rows(rows)
-        rows[:data].size
-      else
-        0
-      end
-    end
-
-    # this one is used inside the server process
-    def fetch_remote_in_server(params)
-      json = self.class.parse_json(params[:json])
-      encoded_data = params[:encoded_data]
-
-      rows = parse_encoded_data(encoded_data, json[:checksum])
-      @complete = rows == { }
-
-      unless @complete
-        import_rows(rows)
-        rows[:data].size
-      else
-        0
-      end
-    end
-
-    def fetch_from_resource(resource, headers)
-      res = nil
-      log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
-      state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
-        state[:chunksize] = c
-        res = resource.post({:state => self.to_json}, headers)
-      end
-
-      begin
-        params = Taps::Multipart.parse(res)
-        params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
-        return params
-      rescue JSON::Parser
-        raise DataStream::CorruptedData.new("Invalid JSON Received")
-      end
-    end
-
-    def self.parse_json(json)
-      hash = JSON.parse(json).symbolize_keys
-      hash[:state].symbolize_keys! if hash.has_key?(:state)
-      hash
-    end
-
-    def parse_encoded_data(encoded_data, checksum)
-      raise DataStream::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
-
-      begin
-        return Marshal.load(Taps::Utils.base64decode(encoded_data))
-      rescue Object => e
-        unless ENV['NO_DUMP_MARSHAL_ERRORS']
-          puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
-          File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
-        end
-        raise
-      end
-    end
-
-    def import_rows(rows)
-      table.import(rows[:header], rows[:data])
-      state[:offset] += rows[:data].size
-    end
-
-    def self.factory(db, state)
-      if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-        Sequel::MySQL.convert_invalid_date_time = :nil
-      end
-
-      if state.has_key?(:klass)
-        return eval(state[:klass]).new(db, state)
-      end
-
-      if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-        DataStreamKeyed.new(db, state)
-      else
-        DataStream.new(db, state)
-      end
-    end
+    class CorruptedData < Exception; end
+
+    DEFAULT_CHUNKSIZE = 1000
+
+    attr_reader :db, :state
+
+    def initialize(db, state)
+      @db = db
+      @state = {
+        :offset => 0,
+        :avg_chunksize => 0,
+        :num_chunksize => 0,
+        :total_chunksize => 0,
+      }.merge(state)
+      @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+      @complete = false
+    end
+
+    def log
+      Taps.log
+    end
+
+    def error=(val)
+      state[:error] = val
+    end
+
+    def error
+      state[:error] || false
+    end
+
+    def table_name
+      state[:table_name].to_sym
+    end
+
+    def table_name_sql
+      table_name.identifier
+    end
+
+    def to_hash
+      state.merge(:klass => self.class.to_s)
+    end
+
+    def to_json
+      to_hash.to_json
+    end
+
+    def string_columns
+      @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
+    end
+
+    def table
+      @table ||= db[table_name_sql]
+    end
+
+    def order_by(name=nil)
+      @order_by ||= begin
+        name ||= table_name
+        Taps::Utils.order_by(db, name)
+      end
+    end
+
+    def increment(row_count)
+      state[:offset] += row_count
+    end
+
+    # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+    # goes below 100 or maybe if offset is > 1000
+    def fetch_rows
+      state[:chunksize] = fetch_chunksize
+      ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+      log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+      rows = Taps::Utils.format_data(ds.all,
+        :string_columns => string_columns)
+      update_chunksize_stats
+      rows
+    end
+
+    def max_chunksize_training
+      20
+    end
+
+    def fetch_chunksize
+      chunksize = state[:chunksize]
+      return chunksize if state[:num_chunksize] < max_chunksize_training
+      return chunksize if state[:avg_chunksize] == 0
+      return chunksize if state[:error]
+      state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+    end
+
+    def update_chunksize_stats
+      return if state[:num_chunksize] >= max_chunksize_training
+      state[:total_chunksize] += state[:chunksize]
+      state[:num_chunksize] += 1
+      state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+    end
+
+    def encode_rows(rows)
+      Taps::Utils.base64encode(Marshal.dump(rows))
+    end
+
+    def fetch
+      log.debug "DataStream#fetch state -> #{state.inspect}"
+
+      t1 = Time.now
+      rows = fetch_rows
+      encoded_data = encode_rows(rows)
+      t2 = Time.now
+      elapsed_time = t2 - t1
+
+      @complete = rows == { }
+
+      [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+    end
+
+    def complete?
+      @complete
+    end
+
+    def fetch_remote(resource, headers)
+      params = fetch_from_resource(resource, headers)
+      encoded_data = params[:encoded_data]
+      json = params[:json]
+
+      rows = parse_encoded_data(encoded_data, json[:checksum])
+      @complete = rows == { }
+
+      # update local state
+      state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+      unless @complete
+        import_rows(rows)
+        rows[:data].size
+      else
+        0
+      end
+    end
+
+    # this one is used inside the server process
+    def fetch_remote_in_server(params)
+      json = self.class.parse_json(params[:json])
+      encoded_data = params[:encoded_data]
+
+      rows = parse_encoded_data(encoded_data, json[:checksum])
+      @complete = rows == { }
+
+      unless @complete
+        import_rows(rows)
+        rows[:data].size
+      else
+        0
+      end
+    end
+
+    def fetch_from_resource(resource, headers)
+      res = nil
+      log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
+      state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
+        state[:chunksize] = c
+        res = resource.post({:state => self.to_json}, headers)
+      end
+
+      begin
+        params = Taps::Multipart.parse(res)
+        params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
+        return params
+      rescue JSON::Parser
+        raise DataStream::CorruptedData.new("Invalid JSON Received")
+      end
+    end
+
+    def self.parse_json(json)
+      hash = JSON.parse(json).symbolize_keys
+      hash[:state].symbolize_keys! if hash.has_key?(:state)
+      hash
+    end
+
+    def parse_encoded_data(encoded_data, checksum)
+      raise DataStream::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
+
+      begin
+        return Marshal.load(Taps::Utils.base64decode(encoded_data))
+      rescue Object => e
+        unless ENV['NO_DUMP_MARSHAL_ERRORS']
+          puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+          File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+        end
+        raise
+      end
+    end
+
+    def import_rows(rows)
+      table.import(rows[:header], rows[:data])
+      state[:offset] += rows[:data].size
+    end
+
+    def self.factory(db, state)
+      if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+        Sequel::MySQL.convert_invalid_date_time = :nil
+      end
+
+      if state.has_key?(:klass)
+        return eval(state[:klass]).new(db, state)
+      end
+
+      if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+        DataStreamKeyed.new(db, state)
+      else
+        DataStream.new(db, state)
+      end
+    end
   end
 
 
   class DataStreamKeyed < DataStream
-    attr_accessor :buffer
-
-    def initialize(db, state)
-      super(db, state)
-      @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
-      @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-      @buffer = []
-    end
-
-    def primary_key
-      state[:primary_key].to_sym
-    end
-
-    def buffer_limit
-      if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
-        state[:last_fetched]
-      else
-        state[:filter]
-      end
-    end
-
-    def calc_limit(chunksize)
-      # we want to not fetch more than is needed while we're
-      # inside sinatra but locally we can select more than
-      # is strictly needed
-      if defined?(Sinatra)
-        (chunksize * 1.1).ceil
-      else
-        (chunksize * 3).ceil
-      end
-    end
-
-    def load_buffer(chunksize)
-      # make sure BasicObject is not polluted by subsequent requires
-      Sequel::BasicObject.remove_methods!
-
-      num = 0
-      loop do
-        limit = calc_limit(chunksize)
-        # we have to use local variables in order for the virtual row filter to work correctly
-        key = primary_key
-        buf_limit = buffer_limit
-        ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
-        log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
-        data = ds.all
-        self.buffer += data
-        num += data.size
-        if data.size > 0
-          # keep a record of the last primary key value in the buffer
-          state[:filter] = self.buffer.last[ primary_key ]
-        end
-
-        break if num >= chunksize or data.size == 0
-      end
-    end
-
-    def fetch_buffered(chunksize)
-      load_buffer(chunksize) if self.buffer.size < chunksize
-      rows = buffer.slice(0, chunksize)
-      state[:last_fetched] = if rows.size > 0
-        rows.last[ primary_key ]
-      else
-        nil
-      end
-      rows
-    end
-
-    def import_rows(rows)
-      table.import(rows[:header], rows[:data])
-    end
-
-    def fetch_rows
-      chunksize = state[:chunksize]
-      Taps::Utils.format_data(fetch_buffered(chunksize) || [],
-        :string_columns => string_columns)
-    end
-
-    def increment(row_count)
-      # pop the rows we just successfully sent off the buffer
-      @buffer.slice!(0, row_count)
-    end
+    attr_accessor :buffer
+
+    def initialize(db, state)
+      super(db, state)
+      @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
+      @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+      @buffer = []
+    end
+
+    def primary_key
+      state[:primary_key].to_sym
+    end
+
+    def buffer_limit
+      if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+        state[:last_fetched]
+      else
+        state[:filter]
+      end
+    end
+
+    def calc_limit(chunksize)
+      # we want to not fetch more than is needed while we're
+      # inside sinatra but locally we can select more than
+      # is strictly needed
+      if defined?(Sinatra)
+        (chunksize * 1.1).ceil
+      else
+        (chunksize * 3).ceil
+      end
+    end
+
+    def load_buffer(chunksize)
+      # make sure BasicObject is not polluted by subsequent requires
+      Sequel::BasicObject.remove_methods!
+
+      num = 0
+      loop do
+        limit = calc_limit(chunksize)
+        # we have to use local variables in order for the virtual row filter to work correctly
+        key = primary_key
+        buf_limit = buffer_limit
+        ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+        log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+        data = ds.all
+        self.buffer += data
+        num += data.size
+        if data.size > 0
+          # keep a record of the last primary key value in the buffer
+          state[:filter] = self.buffer.last[ primary_key ]
+        end
+
+        break if num >= chunksize or data.size == 0
+      end
+    end
+
+    def fetch_buffered(chunksize)
+      load_buffer(chunksize) if self.buffer.size < chunksize
+      rows = buffer.slice(0, chunksize)
+      state[:last_fetched] = if rows.size > 0
+        rows.last[ primary_key ]
+      else
+        nil
+      end
+      rows
+    end
+
+    def import_rows(rows)
+      table.import(rows[:header], rows[:data])
+    end
+
+    def fetch_rows
+      chunksize = state[:chunksize]
+      Taps::Utils.format_data(fetch_buffered(chunksize) || [],
+        :string_columns => string_columns)
+    end
+
+    def increment(row_count)
+      # pop the rows we just successfully sent off the buffer
+      @buffer.slice!(0, row_count)
+    end
   end
 
  end
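
Taken together, the two classes above are the chunked transfer engine of this hunk: DataStream.factory picks DataStreamKeyed when a table has a single integer primary key (paging by key through a local buffer) and falls back to the offset-based DataStream otherwise; fetch pulls one chunk, Marshal-dumps and base64-encodes it, and reports how many rows it carried. A rough sketch of the sending-side loop, using only methods visible in this diff; the require path, database URL, and table name are placeholders, and the real taps client wraps this in HTTP calls to the server:

    require 'sequel'
    require 'taps/data_stream'   # assumed load path for the file shown above

    db = Sequel.connect('sqlite://source_example.db')   # placeholder URL

    # keyed stream if :users has a single integer primary key, offset stream otherwise
    stream = Taps::DataStream.factory(db, :table_name => :users)

    until stream.complete?
      encoded_data, row_count, elapsed = stream.fetch   # one base64/Marshal-encoded chunk
      stream.increment(row_count)                       # advance the offset / drop sent rows from the buffer
      # a client would now POST encoded_data plus stream.to_json to the taps server
    end

Because to_hash folds :klass into the serialized state, the receiving end can rebuild the same kind of stream from the JSON it is handed:

    state_json = stream.to_json                        # state plus :klass, e.g. "Taps::DataStreamKeyed"
    state      = Taps::DataStream.parse_json(state_json)
    resumed    = Taps::DataStream.factory(db, state)   # :klass routes construction back to the right subclass
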