taps-jruby 0.3.14

@@ -0,0 +1,333 @@
+ require 'taps/monkey'
+ require 'taps/multipart'
+ require 'taps/utils'
+ require 'taps/log'
+ require 'taps/errors'
+ require 'json/pure'
+
+ module Taps
+
+   class DataStream
+     DEFAULT_CHUNKSIZE = 1000
+
+     attr_reader :db, :state
+
+     def initialize(db, state)
+       @db = db
+       @state = {
+         :offset => 0,
+         :avg_chunksize => 0,
+         :num_chunksize => 0,
+         :total_chunksize => 0,
+       }.merge(state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @complete = false
+     end
+
+     def log
+       Taps.log
+     end
+
+     def error=(val)
+       state[:error] = val
+     end
+
+     def error
+       state[:error] || false
+     end
+
+     def table_name
+       state[:table_name].to_sym
+     end
+
+     def table_name_sql
+       table_name.identifier
+     end
+
+     def to_hash
+       state.merge(:klass => self.class.to_s)
+     end
+
+     def to_json
+       to_hash.to_json
+     end
+
+     def string_columns
+       @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
+     end
+
+     def table
+       @table ||= db[table_name_sql]
+     end
+
+     def order_by(name=nil)
+       @order_by ||= begin
+         name ||= table_name
+         Taps::Utils.order_by(db, name)
+       end
+     end
+
+     def increment(row_count)
+       state[:offset] += row_count
+     end
+
+     # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+     # goes below 100 or maybe if offset is > 1000
+     def fetch_rows
+       state[:chunksize] = fetch_chunksize
+       ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+       log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+       rows = Taps::Utils.format_data(ds.all,
+         :string_columns => string_columns,
+         :schema => db.schema(table_name),
+         :table => table_name
+       )
+       update_chunksize_stats
+       rows
+     end
+
+     def max_chunksize_training
+       20
+     end
+
+     def fetch_chunksize
+       chunksize = state[:chunksize]
+       return chunksize if state[:num_chunksize] < max_chunksize_training
+       return chunksize if state[:avg_chunksize] == 0
+       return chunksize if state[:error]
+       state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+     end
+
+     def update_chunksize_stats
+       return if state[:num_chunksize] >= max_chunksize_training
+       state[:total_chunksize] += state[:chunksize]
+       state[:num_chunksize] += 1
+       state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+     end
+
+     def encode_rows(rows)
+       Taps::Utils.base64encode(Marshal.dump(rows))
+     end
+
+     def fetch
+       log.debug "DataStream#fetch state -> #{state.inspect}"
+
+       t1 = Time.now
+       rows = fetch_rows
+       encoded_data = encode_rows(rows)
+       t2 = Time.now
+       elapsed_time = t2 - t1
+
+       @complete = rows == { }
+
+       [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+     end
+
+     def complete?
+       @complete
+     end
+
+     def fetch_remote(resource, headers)
+       params = fetch_from_resource(resource, headers)
+       encoded_data = params[:encoded_data]
+       json = params[:json]
+
+       rows = parse_encoded_data(encoded_data, json[:checksum])
+       @complete = rows == { }
+
+       # update local state
+       state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+       unless @complete
+         import_rows(rows)
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     # this one is used inside the server process
+     def fetch_remote_in_server(params)
+       json = self.class.parse_json(params[:json])
+       encoded_data = params[:encoded_data]
+
+       rows = parse_encoded_data(encoded_data, json[:checksum])
+       @complete = rows == { }
+
+       unless @complete
+         import_rows(rows)
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     def fetch_from_resource(resource, headers)
+       res = nil
+       log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
+       state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
+         state[:chunksize] = c
+         res = resource.post({:state => self.to_json}, headers)
+       end
+
+       begin
+         params = Taps::Multipart.parse(res)
+         params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
+         return params
+       rescue JSON::ParserError # json/pure raises JSON::ParserError on malformed input
+         raise Taps::CorruptedData.new("Invalid JSON Received")
+       end
+     end
+
+     def self.parse_json(json)
+       hash = JSON.parse(json).symbolize_keys
+       hash[:state].symbolize_keys! if hash.has_key?(:state)
+       hash
+     end
+
+     def parse_encoded_data(encoded_data, checksum)
+       raise Taps::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
+
+       begin
+         return Marshal.load(Taps::Utils.base64decode(encoded_data))
+       rescue Object => e
+         unless ENV['NO_DUMP_MARSHAL_ERRORS']
+           puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+           File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+         end
+         raise
+       end
+     end
+
+     def import_rows(rows)
+       table.import(rows[:header], rows[:data])
+       state[:offset] += rows[:data].size
+     end
+
+     def verify_stream
+       state[:offset] = table.count
+     end
+
+     def verify_remote_stream(resource, headers)
+       json_raw = resource.post({:state => self.to_json}, headers).to_s
+       json = self.class.parse_json(json_raw)
+
+       self.class.new(db, json[:state])
+     end
+
+     def self.factory(db, state)
+       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+         Sequel::MySQL.convert_invalid_date_time = :nil
+       end
+
+       if state.has_key?(:klass)
+         return eval(state[:klass]).new(db, state)
+       end
+
+       if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+         DataStreamKeyed.new(db, state)
+       else
+         DataStream.new(db, state)
+       end
+     end
+   end
+
+
+   class DataStreamKeyed < DataStream
+     attr_accessor :buffer
+
+     def initialize(db, state)
+       super(db, state)
+       @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @buffer = []
+     end
+
+     def primary_key
+       state[:primary_key].to_sym
+     end
+
+     def buffer_limit
+       if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+         state[:last_fetched]
+       else
+         state[:filter]
+       end
+     end
+
+     def calc_limit(chunksize)
+       # we want to not fetch more than is needed while we're
+       # inside sinatra but locally we can select more than
+       # is strictly needed
+       if defined?(Sinatra)
+         (chunksize * 1.1).ceil
+       else
+         (chunksize * 3).ceil
+       end
+     end
+
+     def load_buffer(chunksize)
+       # make sure BasicObject is not polluted by subsequent requires
+       Sequel::BasicObject.remove_methods!
+
+       num = 0
+       loop do
+         limit = calc_limit(chunksize)
+         # we have to use local variables in order for the virtual row filter to work correctly
+         key = primary_key
+         buf_limit = buffer_limit
+         ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+         log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+         data = ds.all
+         self.buffer += data
+         num += data.size
+         if data.size > 0
+           # keep a record of the last primary key value in the buffer
+           state[:filter] = self.buffer.last[ primary_key ]
+         end
+
+         break if num >= chunksize or data.size == 0
+       end
+     end
+
+     def fetch_buffered(chunksize)
+       load_buffer(chunksize) if self.buffer.size < chunksize
+       rows = buffer.slice(0, chunksize)
+       state[:last_fetched] = if rows.size > 0
+         rows.last[ primary_key ]
+       else
+         nil
+       end
+       rows
+     end
+
+     def import_rows(rows)
+       table.import(rows[:header], rows[:data])
+     end
+
+     def fetch_rows
+       chunksize = state[:chunksize]
+       Taps::Utils.format_data(fetch_buffered(chunksize) || [],
+         :string_columns => string_columns)
+     end
+
+     def increment(row_count)
+       # pop the rows we just successfully sent off the buffer
+       @buffer.slice!(0, row_count)
+     end
+
+     def verify_stream
+       key = primary_key
+       ds = table.order(*order_by)
+       current_filter = ds.max(key.sql_number)
+
+       # set the current filter to the max of the primary key
+       state[:filter] = current_filter
+       # clear out the last_fetched value so it can restart from scratch
+       state[:last_fetched] = nil
+
+       log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
+     end
+   end
+
+ end
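
For context, a minimal sketch of how this class might be driven, assuming the gem is loaded and `db` is an open Sequel connection; the sqlite URL and table name below are made up:

    require 'sequel'

    db     = Sequel.connect('sqlite://example.db')              # hypothetical database
    stream = Taps::DataStream.factory(db, :table_name => :users)

    until stream.complete?
      encoded, row_count, elapsed = stream.fetch                # one Marshal+base64 encoded chunk
      stream.increment(row_count)                               # advance the offset (or pop the keyed buffer)
      Taps.log.debug "chunk of #{row_count} rows in #{elapsed}s"
    end

`factory` picks `DataStreamKeyed` when the table has a single integer primary key, so the same loop works for both strategies.
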
@@ -0,0 +1,20 @@
+ Sequel::Model.db = Sequel.connect(Taps::Config.taps_database_url)
+
+ class DbSession < Sequel::Model
+   plugin :schema
+   set_schema do
+     primary_key :id
+     text :key
+     text :database_url
+     timestamp :started_at
+     timestamp :last_access
+   end
+
+   def conn
+     Sequel.connect(database_url) do |db|
+       yield db if block_given?
+     end
+   end
+ end
+
+ DbSession.create_table! unless DbSession.table_exists?
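
A rough sketch of how the server side might use this model; the key and database URL are placeholders:

    session = DbSession.create(
      :key          => 'abc123',
      :database_url => 'postgres://localhost/clientdb',
      :started_at   => Time.now,
      :last_access  => Time.now
    )

    session.conn do |db|
      db.tables   # the client's database stays open only for the duration of the block
    end
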
@@ -0,0 +1,15 @@
+ module Taps
+   class BaseError < StandardError
+     attr_reader :original_backtrace
+
+     def initialize(message, opts={})
+       @original_backtrace = opts.delete(:backtrace)
+       super(message)
+     end
+   end
+
+   class NotImplemented < BaseError; end
+   class DuplicatePrimaryKeyError < BaseError; end
+   class CorruptedData < BaseError; end
+   class InvalidData < BaseError; end
+ end
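
As an illustration, the `:backtrace` option lets a caller preserve the backtrace of a lower-level error it is wrapping:

    begin
      raise "low-level failure"
    rescue => e
      raise Taps::CorruptedData.new("Invalid JSON Received", :backtrace => e.backtrace)
    end
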
@@ -0,0 +1,15 @@
+ module Taps
+   def self.log=(log)
+     @@log = log
+   end
+
+   def self.log
+     @@log ||= begin
+       require 'logger'
+       log = Logger.new($stderr)
+       log.level = Logger::ERROR
+       log.datetime_format = "%Y-%m-%d %H:%M:%S"
+       log
+     end
+   end
+ end
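
The default logger only prints errors; a sketch of turning on debug output, for example to see the SQL logged by `DataStream#fetch_rows`:

    require 'logger'

    Taps.log.level = Logger::DEBUG   # more verbose output on the default $stderr logger
    # or swap in a different logger entirely:
    Taps.log = Logger.new($stdout)
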
@@ -0,0 +1,21 @@
+ class Hash
+   def symbolize_keys
+     inject({}) do |options, (key, value)|
+       options[(key.to_sym rescue key) || key] = value
+       options
+     end
+   end
+
+   def symbolize_keys!
+     self.replace(symbolize_keys)
+   end
+
+   def symbolize_recursively!
+     self.replace(symbolize_keys)
+     self.each do |k, v|
+       if v.kind_of?(Hash)
+         v.symbolize_keys!
+       end
+     end
+   end
+ end
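
A quick example of what these helpers do with the string-keyed hashes that `JSON.parse` returns; only the top level is converted unless the recursive variant is used:

    h = { 'state' => { 'offset' => 0 } }
    h.symbolize_keys             # => { :state => { 'offset' => 0 } }
    h.symbolize_recursively!     # h is now { :state => { :offset => 0 } }
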
@@ -0,0 +1,73 @@
+ require 'restclient'
+ require 'rack/utils'
+ require 'json/pure'
+ require 'stringio'
+
+ module Taps
+   class Multipart
+     class Container
+       attr_accessor :attachments
+
+       def initialize
+         @attachments = []
+       end
+
+       def attach(opts)
+         mp = Taps::Multipart.new(opts)
+         attachments << mp
+       end
+
+       def generate
+         hash = {}
+         attachments.each do |mp|
+           hash[mp.name] = mp
+         end
+         m = RestClient::Payload::Multipart.new(hash)
+         [m.to_s, m.headers['Content-Type']]
+       end
+     end
+
+     attr_reader :opts
+
+     def initialize(opts={})
+       @opts = opts
+     end
+
+     def name
+       opts[:name]
+     end
+
+     def to_s
+       opts[:payload]
+     end
+
+     def content_type
+       opts[:content_type] || 'text/plain'
+     end
+
+     def original_filename
+       opts[:original_filename]
+     end
+
+     def self.create
+       c = Taps::Multipart::Container.new
+       yield c
+       c.generate
+     end
+
+     # response is a rest-client response
+     def self.parse(response)
+       content = response.to_s
+       env = {
+         'CONTENT_TYPE' => response.headers[:content_type],
+         'CONTENT_LENGTH' => content.size,
+         'rack.input' => StringIO.new(content)
+       }
+
+       params = Rack::Utils::Multipart.parse_multipart(env)
+       params.symbolize_keys!
+       params
+     end
+
+   end
+ end
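
To show how the two halves fit together, a sketch of building a multipart body the way the client might before posting it; the payload strings are placeholders:

    body, content_type = Taps::Multipart.create do |r|
      r.attach :name => :json,
               :payload => '{"checksum":"abc","state":{}}',
               :content_type => 'application/json'
      r.attach :name => :encoded_data,
               :payload => 'base64-marshalled-rows',
               :content_type => 'application/octet-stream'
    end
    # `body` is posted with `content_type` as the Content-Type header;
    # Taps::Multipart.parse reverses this on the receiving end and returns
    # a hash with :json and :encoded_data keys.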