taps 0.3.11 → 0.3.12

data/lib/taps/config.rb CHANGED
@@ -5,43 +5,43 @@ require 'yaml'
  Sequel.datetime_class = DateTime

  module Taps
- def self.version_yml
- @@version_yml ||= YAML.load(File.read(File.dirname(__FILE__) + '/../../VERSION.yml'))
- end
-
- def self.version
- version = "#{version_yml[:major]}.#{version_yml[:minor]}.#{version_yml[:patch]}"
- version += ".#{version_yml[:build]}" if version_yml[:build]
- version
- end
-
- def self.compatible_version
- "#{version_yml[:major]}.#{version_yml[:minor]}"
- end
-
- def self.exiting=(val)
- @@exiting = val
- end
-
- def exiting?
- (@@exiting ||= false) == true
- end
-
- class Config
- class << self
- attr_accessor :taps_database_url
- attr_accessor :login, :password, :database_url, :remote_url
- attr_accessor :chunksize
-
- def verify_database_url(db_url=nil)
- db_url ||= self.database_url
- db = Sequel.connect(db_url)
- db.tables
- db.disconnect
- rescue Object => e
- puts "Failed to connect to database:\n #{e.class} -> #{e}"
- exit 1
- end
- end
- end
+ def self.version_yml
+ @@version_yml ||= YAML.load(File.read(File.dirname(__FILE__) + '/../../VERSION.yml'))
+ end
+
+ def self.version
+ version = "#{version_yml[:major]}.#{version_yml[:minor]}.#{version_yml[:patch]}"
+ version += ".#{version_yml[:build]}" if version_yml[:build]
+ version
+ end
+
+ def self.compatible_version
+ "#{version_yml[:major]}.#{version_yml[:minor]}"
+ end
+
+ def self.exiting=(val)
+ @@exiting = val
+ end
+
+ def exiting?
+ (@@exiting ||= false) == true
+ end
+
+ class Config
+ class << self
+ attr_accessor :taps_database_url
+ attr_accessor :login, :password, :database_url, :remote_url
+ attr_accessor :chunksize
+
+ def verify_database_url(db_url=nil)
+ db_url ||= self.database_url
+ db = Sequel.connect(db_url)
+ db.tables
+ db.disconnect
+ rescue Object => e
+ puts "Failed to connect to database:\n #{e.class} -> #{e}"
+ exit 1
+ end
+ end
+ end
  end
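
For orientation (this example is not part of the diff): the Config class above is a bag of class-level settings that the taps client fills in before a transfer, and verify_database_url exits the process when the target database cannot be reached. A minimal sketch, assuming Sequel is installed and using a placeholder SQLite URL:

  require 'taps/config'

  Taps::Config.database_url = 'sqlite://example.db'   # placeholder URL
  Taps::Config.chunksize    = 1000

  # prints the error and calls exit(1) if the connection fails
  Taps::Config.verify_database_url

  puts Taps.version             # e.g. "0.3.12"
  puts Taps.compatible_version  # "0.3"
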
@@ -7,301 +7,301 @@ require 'json/pure'
  module Taps

  class DataStream
- class CorruptedData < Exception; end
-
- DEFAULT_CHUNKSIZE = 1000
-
- attr_reader :db, :state
-
- def initialize(db, state)
- @db = db
- @state = {
- :offset => 0,
- :avg_chunksize => 0,
- :num_chunksize => 0,
- :total_chunksize => 0,
- }.merge(state)
- @state[:chunksize] ||= DEFAULT_CHUNKSIZE
- @complete = false
- end
-
- def log
- Taps.log
- end
-
- def error=(val)
- state[:error] = val
- end
-
- def error
- state[:error] || false
- end
-
- def table_name
- state[:table_name].to_sym
- end
-
- def table_name_sql
- table_name.identifier
- end
-
- def to_hash
- state.merge(:klass => self.class.to_s)
- end
-
- def to_json
- to_hash.to_json
- end
-
- def string_columns
- @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
- end
-
- def table
- @table ||= db[table_name_sql]
- end
-
- def order_by(name=nil)
- @order_by ||= begin
- name ||= table_name
- Taps::Utils.order_by(db, name)
- end
- end
-
- def increment(row_count)
- state[:offset] += row_count
- end
-
- # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
- # goes below 100 or maybe if offset is > 1000
- def fetch_rows
- state[:chunksize] = fetch_chunksize
- ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
- log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
- rows = Taps::Utils.format_data(ds.all,
- :string_columns => string_columns)
- update_chunksize_stats
- rows
- end
-
- def max_chunksize_training
- 20
- end
-
- def fetch_chunksize
- chunksize = state[:chunksize]
- return chunksize if state[:num_chunksize] < max_chunksize_training
- return chunksize if state[:avg_chunksize] == 0
- return chunksize if state[:error]
- state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
- end
-
- def update_chunksize_stats
- return if state[:num_chunksize] >= max_chunksize_training
- state[:total_chunksize] += state[:chunksize]
- state[:num_chunksize] += 1
- state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
- end
-
- def encode_rows(rows)
- Taps::Utils.base64encode(Marshal.dump(rows))
- end
-
- def fetch
- log.debug "DataStream#fetch state -> #{state.inspect}"
-
- t1 = Time.now
- rows = fetch_rows
- encoded_data = encode_rows(rows)
- t2 = Time.now
- elapsed_time = t2 - t1
-
- @complete = rows == { }
-
- [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
- end
-
- def complete?
- @complete
- end
-
- def fetch_remote(resource, headers)
- params = fetch_from_resource(resource, headers)
- encoded_data = params[:encoded_data]
- json = params[:json]
-
- rows = parse_encoded_data(encoded_data, json[:checksum])
- @complete = rows == { }
-
- # update local state
- state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
-
- unless @complete
- import_rows(rows)
- rows[:data].size
- else
- 0
- end
- end
-
- # this one is used inside the server process
- def fetch_remote_in_server(params)
- json = self.class.parse_json(params[:json])
- encoded_data = params[:encoded_data]
-
- rows = parse_encoded_data(encoded_data, json[:checksum])
- @complete = rows == { }
-
- unless @complete
- import_rows(rows)
- rows[:data].size
- else
- 0
- end
- end
-
- def fetch_from_resource(resource, headers)
- res = nil
- log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
- state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
- state[:chunksize] = c
- res = resource.post({:state => self.to_json}, headers)
- end
-
- begin
- params = Taps::Multipart.parse(res)
- params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
- return params
- rescue JSON::Parser
- raise DataStream::CorruptedData.new("Invalid JSON Received")
- end
- end
-
- def self.parse_json(json)
- hash = JSON.parse(json).symbolize_keys
- hash[:state].symbolize_keys! if hash.has_key?(:state)
- hash
- end
-
- def parse_encoded_data(encoded_data, checksum)
- raise DataStream::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
-
- begin
- return Marshal.load(Taps::Utils.base64decode(encoded_data))
- rescue Object => e
- unless ENV['NO_DUMP_MARSHAL_ERRORS']
- puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
- File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
- end
- raise
- end
- end
-
- def import_rows(rows)
- table.import(rows[:header], rows[:data])
- state[:offset] += rows[:data].size
- end
-
- def self.factory(db, state)
- if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
- Sequel::MySQL.convert_invalid_date_time = :nil
- end
-
- if state.has_key?(:klass)
- return eval(state[:klass]).new(db, state)
- end
-
- if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
- DataStreamKeyed.new(db, state)
- else
- DataStream.new(db, state)
- end
- end
+ class CorruptedData < Exception; end
+
+ DEFAULT_CHUNKSIZE = 1000
+
+ attr_reader :db, :state
+
+ def initialize(db, state)
+ @db = db
+ @state = {
+ :offset => 0,
+ :avg_chunksize => 0,
+ :num_chunksize => 0,
+ :total_chunksize => 0,
+ }.merge(state)
+ @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+ @complete = false
+ end
+
+ def log
+ Taps.log
+ end
+
+ def error=(val)
+ state[:error] = val
+ end
+
+ def error
+ state[:error] || false
+ end
+
+ def table_name
+ state[:table_name].to_sym
+ end
+
+ def table_name_sql
+ table_name.identifier
+ end
+
+ def to_hash
+ state.merge(:klass => self.class.to_s)
+ end
+
+ def to_json
+ to_hash.to_json
+ end
+
+ def string_columns
+ @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
+ end
+
+ def table
+ @table ||= db[table_name_sql]
+ end
+
+ def order_by(name=nil)
+ @order_by ||= begin
+ name ||= table_name
+ Taps::Utils.order_by(db, name)
+ end
+ end
+
+ def increment(row_count)
+ state[:offset] += row_count
+ end
+
+ # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+ # goes below 100 or maybe if offset is > 1000
+ def fetch_rows
+ state[:chunksize] = fetch_chunksize
+ ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+ log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+ rows = Taps::Utils.format_data(ds.all,
+ :string_columns => string_columns)
+ update_chunksize_stats
+ rows
+ end
+
+ def max_chunksize_training
+ 20
+ end
+
+ def fetch_chunksize
+ chunksize = state[:chunksize]
+ return chunksize if state[:num_chunksize] < max_chunksize_training
+ return chunksize if state[:avg_chunksize] == 0
+ return chunksize if state[:error]
+ state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+ end
+
+ def update_chunksize_stats
+ return if state[:num_chunksize] >= max_chunksize_training
+ state[:total_chunksize] += state[:chunksize]
+ state[:num_chunksize] += 1
+ state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+ end
+
+ def encode_rows(rows)
+ Taps::Utils.base64encode(Marshal.dump(rows))
+ end
+
+ def fetch
+ log.debug "DataStream#fetch state -> #{state.inspect}"
+
+ t1 = Time.now
+ rows = fetch_rows
+ encoded_data = encode_rows(rows)
+ t2 = Time.now
+ elapsed_time = t2 - t1
+
+ @complete = rows == { }
+
+ [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+ end
+
+ def complete?
+ @complete
+ end
+
+ def fetch_remote(resource, headers)
+ params = fetch_from_resource(resource, headers)
+ encoded_data = params[:encoded_data]
+ json = params[:json]
+
+ rows = parse_encoded_data(encoded_data, json[:checksum])
+ @complete = rows == { }
+
+ # update local state
+ state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+ unless @complete
+ import_rows(rows)
+ rows[:data].size
+ else
+ 0
+ end
+ end
+
+ # this one is used inside the server process
+ def fetch_remote_in_server(params)
+ json = self.class.parse_json(params[:json])
+ encoded_data = params[:encoded_data]
+
+ rows = parse_encoded_data(encoded_data, json[:checksum])
+ @complete = rows == { }
+
+ unless @complete
+ import_rows(rows)
+ rows[:data].size
+ else
+ 0
+ end
+ end
+
+ def fetch_from_resource(resource, headers)
+ res = nil
+ log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
+ state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
+ state[:chunksize] = c
+ res = resource.post({:state => self.to_json}, headers)
+ end
+
+ begin
+ params = Taps::Multipart.parse(res)
+ params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
+ return params
+ rescue JSON::Parser
+ raise DataStream::CorruptedData.new("Invalid JSON Received")
+ end
+ end
+
+ def self.parse_json(json)
+ hash = JSON.parse(json).symbolize_keys
+ hash[:state].symbolize_keys! if hash.has_key?(:state)
+ hash
+ end
+
+ def parse_encoded_data(encoded_data, checksum)
+ raise DataStream::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
+
+ begin
+ return Marshal.load(Taps::Utils.base64decode(encoded_data))
+ rescue Object => e
+ unless ENV['NO_DUMP_MARSHAL_ERRORS']
+ puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+ File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+ end
+ raise
+ end
+ end
+
+ def import_rows(rows)
+ table.import(rows[:header], rows[:data])
+ state[:offset] += rows[:data].size
+ end
+
+ def self.factory(db, state)
+ if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+ Sequel::MySQL.convert_invalid_date_time = :nil
+ end
+
+ if state.has_key?(:klass)
+ return eval(state[:klass]).new(db, state)
+ end
+
+ if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+ DataStreamKeyed.new(db, state)
+ else
+ DataStream.new(db, state)
+ end
+ end
  end


  class DataStreamKeyed < DataStream
- attr_accessor :buffer
-
- def initialize(db, state)
- super(db, state)
- @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
- @state[:chunksize] ||= DEFAULT_CHUNKSIZE
- @buffer = []
- end
-
- def primary_key
- state[:primary_key].to_sym
- end
-
- def buffer_limit
- if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
- state[:last_fetched]
- else
- state[:filter]
- end
- end
-
- def calc_limit(chunksize)
- # we want to not fetch more than is needed while we're
- # inside sinatra but locally we can select more than
- # is strictly needed
- if defined?(Sinatra)
- (chunksize * 1.1).ceil
- else
- (chunksize * 3).ceil
- end
- end
-
- def load_buffer(chunksize)
- # make sure BasicObject is not polluted by subsequent requires
- Sequel::BasicObject.remove_methods!
-
- num = 0
- loop do
- limit = calc_limit(chunksize)
- # we have to use local variables in order for the virtual row filter to work correctly
- key = primary_key
- buf_limit = buffer_limit
- ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
- log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
- data = ds.all
- self.buffer += data
- num += data.size
- if data.size > 0
- # keep a record of the last primary key value in the buffer
- state[:filter] = self.buffer.last[ primary_key ]
- end
-
- break if num >= chunksize or data.size == 0
- end
- end
-
- def fetch_buffered(chunksize)
- load_buffer(chunksize) if self.buffer.size < chunksize
- rows = buffer.slice(0, chunksize)
- state[:last_fetched] = if rows.size > 0
- rows.last[ primary_key ]
- else
- nil
- end
- rows
- end
-
- def import_rows(rows)
- table.import(rows[:header], rows[:data])
- end
-
- def fetch_rows
- chunksize = state[:chunksize]
- Taps::Utils.format_data(fetch_buffered(chunksize) || [],
- :string_columns => string_columns)
- end
-
- def increment(row_count)
- # pop the rows we just successfully sent off the buffer
- @buffer.slice!(0, row_count)
- end
+ attr_accessor :buffer
+
+ def initialize(db, state)
+ super(db, state)
+ @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
+ @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+ @buffer = []
+ end
+
+ def primary_key
+ state[:primary_key].to_sym
+ end
+
+ def buffer_limit
+ if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+ state[:last_fetched]
+ else
+ state[:filter]
+ end
+ end
+
+ def calc_limit(chunksize)
+ # we want to not fetch more than is needed while we're
+ # inside sinatra but locally we can select more than
+ # is strictly needed
+ if defined?(Sinatra)
+ (chunksize * 1.1).ceil
+ else
+ (chunksize * 3).ceil
+ end
+ end
+
+ def load_buffer(chunksize)
+ # make sure BasicObject is not polluted by subsequent requires
+ Sequel::BasicObject.remove_methods!
+
+ num = 0
+ loop do
+ limit = calc_limit(chunksize)
+ # we have to use local variables in order for the virtual row filter to work correctly
+ key = primary_key
+ buf_limit = buffer_limit
+ ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+ log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+ data = ds.all
+ self.buffer += data
+ num += data.size
+ if data.size > 0
+ # keep a record of the last primary key value in the buffer
+ state[:filter] = self.buffer.last[ primary_key ]
+ end
+
+ break if num >= chunksize or data.size == 0
+ end
+ end
+
+ def fetch_buffered(chunksize)
+ load_buffer(chunksize) if self.buffer.size < chunksize
+ rows = buffer.slice(0, chunksize)
+ state[:last_fetched] = if rows.size > 0
+ rows.last[ primary_key ]
+ else
+ nil
+ end
+ rows
+ end
+
+ def import_rows(rows)
+ table.import(rows[:header], rows[:data])
+ end
+
+ def fetch_rows
+ chunksize = state[:chunksize]
+ Taps::Utils.format_data(fetch_buffered(chunksize) || [],
+ :string_columns => string_columns)
+ end
+
+ def increment(row_count)
+ # pop the rows we just successfully sent off the buffer
+ @buffer.slice!(0, row_count)
+ end
  end
 
  end
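
As a reading aid for the DataStream hunk above (again, not part of the diff): on the sending side a stream is built with DataStream.factory, which picks DataStreamKeyed when the table has a single integer primary key, and fetch is called in a loop until complete? turns true. A rough sketch, with a placeholder database URL and table name, assuming the file defining Taps::DataStream is already loaded:

  require 'sequel'

  db     = Sequel.connect('sqlite://source.db')                  # placeholder URL
  stream = Taps::DataStream.factory(db, :table_name => 'users',  # placeholder table
                                        :chunksize  => 500)

  until stream.complete?
    encoded, row_count, elapsed = stream.fetch   # Marshal-dumped, base64-encoded chunk
    # a real client would now POST the encoded chunk to the taps server;
    # increment advances the offset (or trims the keyed buffer) by the rows sent
    stream.increment(row_count)
  end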