taps 0.3.11 → 0.3.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +3 -3
- data/VERSION.yml +1 -1
- data/bin/schema +28 -28
- data/lib/taps/cli.rb +171 -166
- data/lib/taps/config.rb +39 -39
- data/lib/taps/data_stream.rb +291 -291
- data/lib/taps/db_session.rb +13 -13
- data/lib/taps/log.rb +12 -12
- data/lib/taps/monkey.rb +17 -17
- data/lib/taps/multipart.rb +51 -51
- data/lib/taps/operation.rb +525 -525
- data/lib/taps/schema.rb +58 -58
- data/lib/taps/server.rb +154 -154
- data/lib/taps/utils.rb +145 -145
- data/spec/base.rb +11 -11
- data/spec/cli_spec.rb +5 -5
- data/spec/data_stream_spec.rb +16 -16
- data/spec/operation_spec.rb +21 -21
- data/spec/server_spec.rb +26 -26
- data/spec/utils_spec.rb +49 -49
- metadata +5 -5
data/lib/taps/config.rb
CHANGED
@@ -5,43 +5,43 @@ require 'yaml'
|
|
5
5
|
Sequel.datetime_class = DateTime
|
6
6
|
|
7
7
|
module Taps
  # Returns the parsed contents of VERSION.yml, located two directories above
  # this file. The file is read and parsed once; the result is cached in a
  # class variable for subsequent calls.
  def self.version_yml
    @@version_yml ||= YAML.load(File.read(File.dirname(__FILE__) + '/../../VERSION.yml'))
  end

  # Returns the version string "major.minor.patch", with ".build" appended
  # when VERSION.yml carries a truthy :build entry.
  def self.version
    version = "#{version_yml[:major]}.#{version_yml[:minor]}.#{version_yml[:patch]}"
    version += ".#{version_yml[:build]}" if version_yml[:build]
    version
  end

  # Returns "major.minor" built from VERSION.yml.
  def self.compatible_version
    "#{version_yml[:major]}.#{version_yml[:minor]}"
  end

  # Stores the module-wide "exiting" flag.
  def self.exiting=(val)
    @@exiting = val
  end

  # Module-level reader matching Taps.exiting=. Returns true only when the
  # flag was set to exactly `true`; an unset flag is initialised to false.
  def self.exiting?
    (@@exiting ||= false) == true
  end

  # Instance-level reader, kept so objects that include or extend Taps keep
  # working. It reports the same module-wide flag as Taps.exiting?.
  def exiting?
    Taps.exiting?
  end

  # Process-wide configuration, held as accessors on the Config class itself.
  class Config
    class << self
      attr_accessor :taps_database_url
      attr_accessor :login, :password, :database_url, :remote_url
      attr_accessor :chunksize

      # Checks that a database can be reached by connecting, listing its
      # tables and disconnecting.
      #
      # db_url - connection URL handed to Sequel.connect; defaults to
      #          Config.database_url when nil.
      #
      # On any failure the error class and message are printed and the
      # process exits with status 1.
      def verify_database_url(db_url=nil)
        db_url ||= self.database_url
        db = Sequel.connect(db_url)
        db.tables
        db.disconnect
      # NOTE(review): `rescue Object` is deliberately broad and also matches
      # non-StandardError failures; left unchanged so every connection
      # problem still ends in the same message and exit code.
      rescue Object => e
        puts "Failed to connect to database:\n  #{e.class} -> #{e}"
        exit 1
      end
    end
  end
end
|
data/lib/taps/data_stream.rb
CHANGED
@@ -7,301 +7,301 @@ require 'json/pure'
|
|
7
7
|
module Taps
|
8
8
|
|
9
9
|
# Streams the rows of one table in chunks, either reading them from the local
# database (fetch) or receiving encoded chunks and importing them
# (fetch_remote / fetch_remote_in_server). Progress is kept in the `state`
# hash so that it can be serialised with to_json and rebuilt via factory.
class DataStream
  # Raised when a chunk fails its checksum or its JSON envelope is invalid.
  # NOTE(review): derives from Exception, not StandardError, so a bare
  # `rescue` does not catch it; left unchanged because callers may depend
  # on that.
  class CorruptedData < Exception; end

  DEFAULT_CHUNKSIZE = 1000

  attr_reader :db, :state

  # db    - database handle; used as db[...] and passed to Taps::Utils helpers.
  # state - Hash of stream state. It is merged over the defaults below, and
  #         :chunksize falls back to DEFAULT_CHUNKSIZE when absent.
  def initialize(db, state)
    @db = db
    @state = {
      :offset => 0,
      :avg_chunksize => 0,
      :num_chunksize => 0,
      :total_chunksize => 0,
    }.merge(state)
    @state[:chunksize] ||= DEFAULT_CHUNKSIZE
    @complete = false
  end

  # Shared logger, obtained from Taps.log.
  def log
    Taps.log
  end

  # Records an error marker in the stream state.
  def error=(val)
    state[:error] = val
  end

  # Returns the recorded error marker, or false when none is set.
  def error
    state[:error] || false
  end

  # Table name from the state, as a Symbol.
  def table_name
    state[:table_name].to_sym
  end

  # Table name wrapped with Symbol#identifier for use in queries.
  # NOTE(review): Symbol#identifier is presumably Sequel's core extension - confirm.
  def table_name_sql
    table_name.identifier
  end

  # State hash plus :klass (this object's class name), which factory uses to
  # rebuild the same kind of stream.
  def to_hash
    state.merge(:klass => self.class.to_s)
  end

  # JSON form of to_hash.
  def to_json
    to_hash.to_json
  end

  # Memoised result of Taps::Utils.incorrect_blobs for this table; passed to
  # format_data as :string_columns.
  def string_columns
    @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
  end

  # Memoised dataset for this table.
  def table
    @table ||= db[table_name_sql]
  end

  # Memoised ordering from Taps::Utils.order_by. `name` defaults to
  # table_name and is only consulted on the first call.
  def order_by(name=nil)
    @order_by ||= begin
      name ||= table_name
      Taps::Utils.order_by(db, name)
    end
  end

  # Advances the offset by the number of rows that were handled.
  def increment(row_count)
    state[:offset] += row_count
  end

  # Fetches the next chunk of rows at the current offset and returns the
  # output of Taps::Utils.format_data. Chunksize statistics are updated
  # after each fetch.
  #
  # Original author's note: keep a record of the average chunksize within the
  # first few hundred thousand records, after chunksize goes below 100 or
  # maybe if offset is > 1000
  def fetch_rows
    state[:chunksize] = fetch_chunksize
    ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
    log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
    rows = Taps::Utils.format_data(ds.all,
      :string_columns => string_columns)
    update_chunksize_stats
    rows
  end

  # Number of fetches that contribute to the chunksize average.
  def max_chunksize_training
    20
  end

  # Chooses the chunksize for the next fetch. The current chunksize is kept
  # while still training, when no average exists, or after an error.
  # Otherwise the larger of the average and the current chunksize is used.
  def fetch_chunksize
    chunksize = state[:chunksize]
    return chunksize if state[:num_chunksize] < max_chunksize_training
    return chunksize if state[:avg_chunksize] == 0
    return chunksize if state[:error]
    state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
  end

  # Folds the current chunksize into the running average until
  # max_chunksize_training samples have been collected.
  def update_chunksize_stats
    return if state[:num_chunksize] >= max_chunksize_training
    state[:total_chunksize] += state[:chunksize]
    state[:num_chunksize] += 1
    state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
  end

  # Marshals the rows and base64-encodes the result.
  def encode_rows(rows)
    Taps::Utils.base64encode(Marshal.dump(rows))
  end

  # Fetches and encodes the next chunk.
  #
  # Returns [encoded_data, row_count, elapsed_seconds]. The stream is marked
  # complete when fetch_rows returns an empty hash.
  def fetch
    log.debug "DataStream#fetch state -> #{state.inspect}"

    t1 = Time.now
    rows = fetch_rows
    encoded_data = encode_rows(rows)
    t2 = Time.now
    elapsed_time = t2 - t1

    @complete = rows == { }

    [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
  end

  # True once a fetch produced no rows.
  def complete?
    @complete
  end

  # Pulls one chunk from the remote resource, verifies and decodes it, adopts
  # the remote state (keeping the local chunksize) and imports the rows.
  #
  # Returns the number of rows imported, or 0 when the stream is complete.
  def fetch_remote(resource, headers)
    params = fetch_from_resource(resource, headers)
    encoded_data = params[:encoded_data]
    json = params[:json]

    rows = parse_encoded_data(encoded_data, json[:checksum])
    @complete = rows == { }

    # update local state
    state.merge!(json[:state].merge(:chunksize => state[:chunksize]))

    unless @complete
      import_rows(rows)
      rows[:data].size
    else
      0
    end
  end

  # this one is used inside the server process
  #
  # params - Hash with :json (unparsed JSON text) and :encoded_data.
  #
  # Returns the number of rows imported, or 0 when the chunk was empty.
  def fetch_remote_in_server(params)
    json = self.class.parse_json(params[:json])
    encoded_data = params[:encoded_data]

    rows = parse_encoded_data(encoded_data, json[:checksum])
    @complete = rows == { }

    unless @complete
      import_rows(rows)
      rows[:data].size
    else
      0
    end
  end

  # Posts the current state to the resource and parses the multipart reply.
  # The post runs inside Taps::Utils.calculate_chunksize, which supplies the
  # chunksize to use and whose return value becomes the new chunksize.
  #
  # Returns the parsed params, with :json converted to a symbol-keyed Hash.
  # Raises DataStream::CorruptedData when the JSON part cannot be parsed.
  def fetch_from_resource(resource, headers)
    res = nil
    log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
    state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
      state[:chunksize] = c
      res = resource.post({:state => self.to_json}, headers)
    end

    begin
      params = Taps::Multipart.parse(res)
      params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
      return params
    # JSON::ParserError is the exception JSON.parse raises. JSON::Parser is
    # the parser class itself and never matches a raised error.
    rescue JSON::ParserError
      raise DataStream::CorruptedData.new("Invalid JSON Received")
    end
  end

  # Parses JSON text into a Hash with symbol keys; a nested :state hash is
  # symbolised as well.
  def self.parse_json(json)
    hash = JSON.parse(json).symbolize_keys
    hash[:state].symbolize_keys! if hash.has_key?(:state)
    hash
  end

  # Validates the checksum and decodes a chunk.
  #
  # Raises DataStream::CorruptedData when the checksum does not match. If
  # decoding fails, the raw chunk is written to dump.<pid>.dat (unless
  # NO_DUMP_MARSHAL_ERRORS is set) and the original error is re-raised.
  def parse_encoded_data(encoded_data, checksum)
    raise DataStream::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)

    begin
      # NOTE(review): Marshal.load runs on data received from the remote
      # peer; it is only safe when that peer is trusted.
      return Marshal.load(Taps::Utils.base64decode(encoded_data))
    rescue Object
      unless ENV['NO_DUMP_MARSHAL_ERRORS']
        # Write the dump before announcing it, so the message is only
        # printed once the file actually exists.
        File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
        puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
      end
      raise
    end
  end

  # Inserts the decoded rows and advances the offset by their count.
  def import_rows(rows)
    table.import(rows[:header], rows[:data])
    state[:offset] += rows[:data].size
  end

  # Builds the right stream for a state hash. A :klass entry selects the
  # class directly; otherwise DataStreamKeyed is used when
  # Taps::Utils.single_integer_primary_key is truthy for the table, and
  # DataStream in all other cases.
  def self.factory(db, state)
    if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
      Sequel::MySQL.convert_invalid_date_time = :nil
    end

    if state.has_key?(:klass)
      # NOTE(review): eval runs whatever string arrives in state[:klass].
      # State can be built from remote JSON, so this should be restricted
      # to known stream classes.
      return eval(state[:klass]).new(db, state)
    end

    if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
      DataStreamKeyed.new(db, state)
    else
      DataStream.new(db, state)
    end
  end
end
|
221
221
|
|
222
222
|
|
223
223
|
# DataStream variant that pages through the table by primary-key value
# instead of by offset. Rows are read ahead into an in-memory buffer and
# handed out in chunks.
class DataStreamKeyed < DataStream
  attr_accessor :buffer

  # db    - database handle (see DataStream#initialize).
  # state - Hash of stream state; :primary_key defaults to the first entry
  #         of order_by for the table and :filter defaults to 0.
  def initialize(db, state)
    super(db, state)
    # Merge over @state, not the raw argument: super has already merged the
    # caller's state over the base defaults (:offset, chunksize stats,
    # :chunksize). Merging `state` here would throw those defaults away.
    @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
    @state[:chunksize] ||= DEFAULT_CHUNKSIZE
    @buffer = []
  end

  # Primary-key column name, as a Symbol.
  def primary_key
    state[:primary_key].to_sym
  end

  # Key value above which the next query should start. Normally this is
  # state[:filter]. When the buffer is empty and state[:last_fetched] is set
  # and lower than the filter, last_fetched is used instead.
  # NOTE(review): presumably this covers a resumed stream whose buffered
  # rows were lost before being handed out - confirm.
  def buffer_limit
    if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
      state[:last_fetched]
    else
      state[:filter]
    end
  end

  # Row limit for one buffer-filling query: 1.1x the chunksize when Sinatra
  # is defined, 3x otherwise.
  def calc_limit(chunksize)
    # we want to not fetch more than is needed while we're
    # inside sinatra but locally we can select more than
    # is strictly needed
    if defined?(Sinatra)
      (chunksize * 1.1).ceil
    else
      (chunksize * 3).ceil
    end
  end

  # Appends rows to the buffer until at least `chunksize` rows have been
  # added or a query returns nothing. state[:filter] tracks the primary-key
  # value of the last buffered row.
  def load_buffer(chunksize)
    # make sure BasicObject is not polluted by subsequent requires
    Sequel::BasicObject.remove_methods!

    num = 0
    loop do
      limit = calc_limit(chunksize)
      # we have to use local variables in order for the virtual row filter to work correctly
      key = primary_key
      buf_limit = buffer_limit
      ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
      log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
      data = ds.all
      self.buffer += data
      num += data.size
      if data.size > 0
        # keep a record of the last primary key value in the buffer
        state[:filter] = self.buffer.last[ primary_key ]
      end

      break if num >= chunksize or data.size == 0
    end
  end

  # Returns up to `chunksize` rows from the front of the buffer, refilling
  # it first when it holds fewer than that. The rows stay in the buffer
  # until increment is called. state[:last_fetched] is set to the
  # primary-key value of the last returned row, or nil when none were
  # returned.
  def fetch_buffered(chunksize)
    load_buffer(chunksize) if self.buffer.size < chunksize
    rows = buffer.slice(0, chunksize)
    state[:last_fetched] = if rows.size > 0
      rows.last[ primary_key ]
    else
      nil
    end
    rows
  end

  # Inserts the decoded rows. Unlike DataStream#import_rows, the offset is
  # not advanced.
  def import_rows(rows)
    table.import(rows[:header], rows[:data])
  end

  # Returns the next chunk, taken from the buffer and passed through
  # Taps::Utils.format_data.
  def fetch_rows
    chunksize = state[:chunksize]
    Taps::Utils.format_data(fetch_buffered(chunksize) || [],
      :string_columns => string_columns)
  end

  # Drops the first `row_count` rows from the buffer.
  def increment(row_count)
    # pop the rows we just successfully sent off the buffer
    @buffer.slice!(0, row_count)
  end
end
|
306
306
|
|
307
307
|
end
|