taps-jruby 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,333 @@
+ require 'taps/monkey'
+ require 'taps/multipart'
+ require 'taps/utils'
+ require 'taps/log'
+ require 'taps/errors'
+ require 'json/pure'
+
+ module Taps
+
+ class DataStream
+   DEFAULT_CHUNKSIZE = 1000
+
+   attr_reader :db, :state
+
+   def initialize(db, state)
+     @db = db
+     @state = {
+       :offset => 0,
+       :avg_chunksize => 0,
+       :num_chunksize => 0,
+       :total_chunksize => 0,
+     }.merge(state)
+     @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+     @complete = false
+   end
+
+   def log
+     Taps.log
+   end
+
+   def error=(val)
+     state[:error] = val
+   end
+
+   def error
+     state[:error] || false
+   end
+
+   def table_name
+     state[:table_name].to_sym
+   end
+
+   def table_name_sql
+     table_name.identifier
+   end
+
+   def to_hash
+     state.merge(:klass => self.class.to_s)
+   end
+
+   def to_json
+     to_hash.to_json
+   end
+
+   def string_columns
+     @string_columns ||= Taps::Utils.incorrect_blobs(db, table_name)
+   end
+
+   def table
+     @table ||= db[table_name_sql]
+   end
+
+   def order_by(name=nil)
+     @order_by ||= begin
+       name ||= table_name
+       Taps::Utils.order_by(db, name)
+     end
+   end
+
+   def increment(row_count)
+     state[:offset] += row_count
+   end
+
+   # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+   # goes below 100 or maybe if offset is > 1000
+   def fetch_rows
+     state[:chunksize] = fetch_chunksize
+     ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+     log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+     rows = Taps::Utils.format_data(ds.all,
+       :string_columns => string_columns,
+       :schema => db.schema(table_name),
+       :table => table_name
+     )
+     update_chunksize_stats
+     rows
+   end
+
+   def max_chunksize_training
+     20
+   end
+
+   def fetch_chunksize
+     chunksize = state[:chunksize]
+     return chunksize if state[:num_chunksize] < max_chunksize_training
+     return chunksize if state[:avg_chunksize] == 0
+     return chunksize if state[:error]
+     state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+   end
+
+   def update_chunksize_stats
+     return if state[:num_chunksize] >= max_chunksize_training
+     state[:total_chunksize] += state[:chunksize]
+     state[:num_chunksize] += 1
+     state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+   end
+
+   def encode_rows(rows)
+     Taps::Utils.base64encode(Marshal.dump(rows))
+   end
+
+   def fetch
+     log.debug "DataStream#fetch state -> #{state.inspect}"
+
+     t1 = Time.now
+     rows = fetch_rows
+     encoded_data = encode_rows(rows)
+     t2 = Time.now
+     elapsed_time = t2 - t1
+
+     @complete = rows == { }
+
+     [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+   end
+
+   def complete?
+     @complete
+   end
+
+   def fetch_remote(resource, headers)
+     params = fetch_from_resource(resource, headers)
+     encoded_data = params[:encoded_data]
+     json = params[:json]
+
+     rows = parse_encoded_data(encoded_data, json[:checksum])
+     @complete = rows == { }
+
+     # update local state
+     state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+     unless @complete
+       import_rows(rows)
+       rows[:data].size
+     else
+       0
+     end
+   end
+
+   # this one is used inside the server process
+   def fetch_remote_in_server(params)
+     json = self.class.parse_json(params[:json])
+     encoded_data = params[:encoded_data]
+
+     rows = parse_encoded_data(encoded_data, json[:checksum])
+     @complete = rows == { }
+
+     unless @complete
+       import_rows(rows)
+       rows[:data].size
+     else
+       0
+     end
+   end
+
+   def fetch_from_resource(resource, headers)
+     res = nil
+     log.debug "DataStream#fetch_from_resource state -> #{state.inspect}"
+     state[:chunksize] = Taps::Utils.calculate_chunksize(state[:chunksize]) do |c|
+       state[:chunksize] = c
+       res = resource.post({:state => self.to_json}, headers)
+     end
+
+     begin
+       params = Taps::Multipart.parse(res)
+       params[:json] = self.class.parse_json(params[:json]) if params.has_key?(:json)
+       return params
+     rescue JSON::ParserError
+       raise Taps::CorruptedData.new("Invalid JSON Received")
+     end
+   end
+
+   def self.parse_json(json)
+     hash = JSON.parse(json).symbolize_keys
+     hash[:state].symbolize_keys! if hash.has_key?(:state)
+     hash
+   end
+
+   def parse_encoded_data(encoded_data, checksum)
+     raise Taps::CorruptedData.new("Checksum Failed") unless Taps::Utils.valid_data?(encoded_data, checksum)
+
+     begin
+       return Marshal.load(Taps::Utils.base64decode(encoded_data))
+     rescue Object => e
+       unless ENV['NO_DUMP_MARSHAL_ERRORS']
+         puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+         File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+       end
+       raise
+     end
+   end
+
+   def import_rows(rows)
+     table.import(rows[:header], rows[:data])
+     state[:offset] += rows[:data].size
+   end
+
+   def verify_stream
+     state[:offset] = table.count
+   end
+
+   def verify_remote_stream(resource, headers)
+     json_raw = resource.post({:state => self.to_json}, headers).to_s
+     json = self.class.parse_json(json_raw)
+
+     self.class.new(db, json[:state])
+   end
+
+   def self.factory(db, state)
+     if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+       Sequel::MySQL.convert_invalid_date_time = :nil
+     end
+
+     if state.has_key?(:klass)
+       return eval(state[:klass]).new(db, state)
+     end
+
+     if Taps::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+       DataStreamKeyed.new(db, state)
+     else
+       DataStream.new(db, state)
+     end
+   end
+ end
+
+
+ class DataStreamKeyed < DataStream
+   attr_accessor :buffer
+
+   def initialize(db, state)
+     super(db, state)
+     @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(state)
+     @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+     @buffer = []
+   end
+
+   def primary_key
+     state[:primary_key].to_sym
+   end
+
+   def buffer_limit
+     if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+       state[:last_fetched]
+     else
+       state[:filter]
+     end
+   end
+
+   def calc_limit(chunksize)
+     # we want to not fetch more than is needed while we're
+     # inside sinatra but locally we can select more than
+     # is strictly needed
+     if defined?(Sinatra)
+       (chunksize * 1.1).ceil
+     else
+       (chunksize * 3).ceil
+     end
+   end
+
+   def load_buffer(chunksize)
+     # make sure BasicObject is not polluted by subsequent requires
+     Sequel::BasicObject.remove_methods!
+
+     num = 0
+     loop do
+       limit = calc_limit(chunksize)
+       # we have to use local variables in order for the virtual row filter to work correctly
+       key = primary_key
+       buf_limit = buffer_limit
+       ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+       log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+       data = ds.all
+       self.buffer += data
+       num += data.size
+       if data.size > 0
+         # keep a record of the last primary key value in the buffer
+         state[:filter] = self.buffer.last[ primary_key ]
+       end
+
+       break if num >= chunksize or data.size == 0
+     end
+   end
+
+   def fetch_buffered(chunksize)
+     load_buffer(chunksize) if self.buffer.size < chunksize
+     rows = buffer.slice(0, chunksize)
+     state[:last_fetched] = if rows.size > 0
+       rows.last[ primary_key ]
+     else
+       nil
+     end
+     rows
+   end
+
+   def import_rows(rows)
+     table.import(rows[:header], rows[:data])
+   end
+
+   def fetch_rows
+     chunksize = state[:chunksize]
+     Taps::Utils.format_data(fetch_buffered(chunksize) || [],
+       :string_columns => string_columns)
+   end
+
+   def increment(row_count)
+     # pop the rows we just successfully sent off the buffer
+     @buffer.slice!(0, row_count)
+   end
+
+   def verify_stream
+     key = primary_key
+     ds = table.order(*order_by)
+     current_filter = ds.max(key.sql_number)
+
+     # set the current filter to the max of the primary key
+     state[:filter] = current_filter
+     # clear out the last_fetched value so it can restart from scratch
+     state[:last_fetched] = nil
+
+     log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
+   end
+ end
+
+ end
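
The hunk above appears to be the gem's DataStream/DataStreamKeyed implementation. For orientation, a minimal sketch of how a caller might drive it locally, assuming a Sequel connection and a table named users; the HTTP transport that normally consumes the encoded chunks lives elsewhere in the gem:

    require 'sequel'
    require 'taps/data_stream'   # assumed path for the hunk above

    db = Sequel.connect('sqlite://example.db')   # hypothetical database URL
    stream = Taps::DataStream.factory(db, :table_name => 'users', :chunksize => 500)

    until stream.complete?
      # fetch returns [base64-encoded marshalled rows, row count, elapsed seconds]
      encoded, row_count, elapsed = stream.fetch
      stream.increment(row_count)   # advance the offset (or pop the keyed buffer)
      # the encoded chunk would normally be POSTed to the taps server here
    end

factory picks DataStreamKeyed when the table has a single integer primary key, which lets the query filter on the key value instead of paging with an ever-growing OFFSET.
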
@@ -0,0 +1,20 @@
+ Sequel::Model.db = Sequel.connect(Taps::Config.taps_database_url)
+
+ class DbSession < Sequel::Model
+   plugin :schema
+   set_schema do
+     primary_key :id
+     text :key
+     text :database_url
+     timestamp :started_at
+     timestamp :last_access
+   end
+
+   def conn
+     Sequel.connect(database_url) do |db|
+       yield db if block_given?
+     end
+   end
+ end
+
+ DbSession.create_table! unless DbSession.table_exists?
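
This hunk defines the server-side session model (it assumes Taps::Config.taps_database_url has already been set). A hypothetical sketch of how a stored session might be used to reach the client's database; the key value and table access here are illustrative only:

    session = DbSession.filter(:key => 'abc123').first
    session.last_access = Time.now
    session.save

    # conn opens the stored database_url for the duration of the block
    session.conn do |db|
      db.tables
    end
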
@@ -0,0 +1,15 @@
+ module Taps
+   class BaseError < StandardError
+     attr_reader :original_backtrace
+
+     def initialize(message, opts={})
+       @original_backtrace = opts.delete(:backtrace)
+       super(message)
+     end
+   end
+
+   class NotImplemented < BaseError; end
+   class DuplicatePrimaryKeyError < BaseError; end
+   class CorruptedData < BaseError; end
+   class InvalidData < BaseError; end
+ end
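
The error classes accept an optional :backtrace so the original failure's trace can be carried along in original_backtrace. A small sketch of that pattern, with decode_chunk standing in for any lower-level operation that can fail:

    begin
      decode_chunk(raw)   # hypothetical operation
    rescue StandardError => e
      raise Taps::CorruptedData.new("could not decode chunk", :backtrace => e.backtrace)
    end
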
@@ -0,0 +1,15 @@
+ module Taps
+   def self.log=(log)
+     @@log = log
+   end
+
+   def self.log
+     @@log ||= begin
+       require 'logger'
+       log = Logger.new($stderr)
+       log.level = Logger::ERROR
+       log.datetime_format = "%Y-%m-%d %H:%M:%S"
+       log
+     end
+   end
+ end
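
Taps.log defaults to a stderr Logger at ERROR level, and callers can swap in their own. For example, to see the SQL that DataStream logs at debug level:

    require 'logger'
    require 'taps/log'

    verbose = Logger.new($stderr)
    verbose.level = Logger::DEBUG
    Taps.log = verbose
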
@@ -0,0 +1,21 @@
+ class Hash
+   def symbolize_keys
+     inject({}) do |options, (key, value)|
+       options[(key.to_sym rescue key) || key] = value
+       options
+     end
+   end
+
+   def symbolize_keys!
+     self.replace(symbolize_keys)
+   end
+
+   def symbolize_recursively!
+     self.replace(symbolize_keys)
+     self.each do |k, v|
+       if v.kind_of?(Hash)
+         v.symbolize_keys!
+       end
+     end
+   end
+ end
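
These Hash monkeypatches convert string keys to symbols. Note that symbolize_keys only touches the top level, and symbolize_recursively!, despite the name, only descends into hashes nested one level down. A quick illustration:

    h = { 'a' => 1, 'b' => { 'c' => 2 } }
    h.symbolize_keys          # => { :a => 1, :b => { 'c' => 2 } }
    h.symbolize_recursively!  # h becomes { :a => 1, :b => { :c => 2 } }
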
@@ -0,0 +1,73 @@
+ require 'restclient'
+ require 'rack/utils'
+ require 'json/pure'
+ require 'stringio'
+
+ module Taps
+   class Multipart
+     class Container
+       attr_accessor :attachments
+
+       def initialize
+         @attachments = []
+       end
+
+       def attach(opts)
+         mp = Taps::Multipart.new(opts)
+         attachments << mp
+       end
+
+       def generate
+         hash = {}
+         attachments.each do |mp|
+           hash[mp.name] = mp
+         end
+         m = RestClient::Payload::Multipart.new(hash)
+         [m.to_s, m.headers['Content-Type']]
+       end
+     end
+
+     attr_reader :opts
+
+     def initialize(opts={})
+       @opts = opts
+     end
+
+     def name
+       opts[:name]
+     end
+
+     def to_s
+       opts[:payload]
+     end
+
+     def content_type
+       opts[:content_type] || 'text/plain'
+     end
+
+     def original_filename
+       opts[:original_filename]
+     end
+
+     def self.create
+       c = Taps::Multipart::Container.new
+       yield c
+       c.generate
+     end
+
+     # response is a rest-client response
+     def self.parse(response)
+       content = response.to_s
+       env = {
+         'CONTENT_TYPE' => response.headers[:content_type],
+         'CONTENT_LENGTH' => content.size,
+         'rack.input' => StringIO.new(content)
+       }
+
+       params = Rack::Utils::Multipart.parse_multipart(env)
+       params.symbolize_keys!
+       params
+     end
+
+   end
+ end
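
Taps::Multipart wraps building multipart bodies with rest-client and parsing them back with Rack. A hedged sketch of how a two-part payload might be assembled, assuming encoded_data holds a marshalled chunk and stream is a DataStream; the exact part layout the gem uses may differ:

    body, content_type = Taps::Multipart.create do |r|
      r.attach :name => :encoded_data,
               :payload => encoded_data,
               :content_type => 'application/octet-stream'
      r.attach :name => :json,
               :payload => { :state => stream.to_hash }.to_json,
               :content_type => 'application/json'
    end
    # body and content_type are then sent over HTTP; the receiving side hands
    # the rest-client response to Taps::Multipart.parse to recover the parts.
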