tapsoob 0.2.7-java

@@ -0,0 +1,350 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/log'
+ require 'tapsoob/utils'
+
+ module Tapsoob
+   class DataStream
+     DEFAULT_CHUNKSIZE = 1000
+
+     attr_reader :db, :state
+
+     def initialize(db, state)
+       @db = db
+       @state = {
+         :offset => 0,
+         :avg_chunksize => 0,
+         :num_chunksize => 0,
+         :total_chunksize => 0
+       }.merge(state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @complete = false
+     end
+
+     def log
+       Tapsoob.log
+     end
+
+     def error=(val)
+       state[:error] = val
+     end
+
+     def error
+       state[:error] || false
+     end
+
+     def table_name
+       state[:table_name].to_sym
+     end
+
+     def table_name_sql
+       table_name
+     end
+
+     def to_hash
+       state.merge(:klass => self.class.to_s)
+     end
+
+     def to_json
+       JSON.generate(to_hash)
+     end
+
+     def string_columns
+       @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
+     end
+
+     def table
+       @table ||= db[table_name_sql]
+     end
+
+     def order_by(name=nil)
+       @order_by ||= begin
+         name ||= table_name
+         Tapsoob::Utils.order_by(db, name)
+       end
+     end
+
+     def increment(row_count)
+       state[:offset] += row_count
+     end
+
+     # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+     # goes below 100 or maybe if offset is > 1000
+     def fetch_rows
+       state[:chunksize] = fetch_chunksize
+       ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+       log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+       rows = Tapsoob::Utils.format_data(ds.all,
+         :string_columns => string_columns,
+         :schema => db.schema(table_name),
+         :table => table_name
+       )
+       update_chunksize_stats
+       rows
+     end
+
+     def fetch_file(dump_path)
+       state[:chunksize] = fetch_chunksize
+       ds = JSON.parse(File.read(File.join(dump_path, "data", "#{table_name}.json")))
+       log.debug "DataStream#fetch_file"
+       rows = {
+         :header => ds["header"],
+         :data => ds["data"][state[:offset], (state[:offset] + state[:chunksize])] || [ ]
+       }
+       update_chunksize_stats
+       rows
+     end
+
+     def max_chunksize_training
+       20
+     end
+
+     def fetch_chunksize
+       chunksize = state[:chunksize]
+       return chunksize if state[:num_chunksize] < max_chunksize_training
+       return chunksize if state[:avg_chunksize] == 0
+       return chunksize if state[:error]
+       state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+     end
+
+     def update_chunksize_stats
+       return if state[:num_chunksize] >= max_chunksize_training
+       state[:total_chunksize] += state[:chunksize]
+       state[:num_chunksize] += 1
+       state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+     end
+
+     def encode_rows(rows)
+       Tapsoob::Utils.base64encode(Marshal.dump(rows))
+     end
+
+     def fetch(opts = {})
+       opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+       log.debug "DataStream#fetch state -> #{state.inspect}"
+
+       t1 = Time.now
+       rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+       encoded_data = encode_rows(rows)
+       t2 = Time.now
+       elapsed_time = t2 - t1
+
+       if opts[:type] == "file"
+         @complete = rows[:data] == [ ]
+       else
+         @complete = rows == { }
+       end
+
+       [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+     end
+
+     def complete?
+       @complete
+     end
+
+     def fetch_database(dump_path)
+       params = fetch_from_database
+       encoded_data = params[:encoded_data]
+       json = params[:json]
+
+       rows = parse_encoded_data(encoded_data, json[:checksum])
+
+       @complete = rows == { }
+
+       # update local state
+       state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+       unless @complete
+         Tapsoob::Utils.export_rows(dump_path, table_name, rows)
+         state[:offset] += rows[:data].size
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     def fetch_from_database
+       res = nil
+       log.debug "DataStream#fetch_from_database state -> #{state.inspect}"
+       state[:chunksize] = Tapsoob::Utils.calculate_chunksize(state[:chunksize]) do |c|
+         state[:chunksize] = c.to_i
+         encoded_data = fetch.first
+
+         checksum = Tapsoob::Utils.checksum(encoded_data).to_s
+
+         res = {
+           :json => { :checksum => checksum, :state => to_hash },
+           :encoded_data => encoded_data
+         }
+       end
+
+       res
+     end
+
+     def fetch_data_in_database(params)
+       encoded_data = params[:encoded_data]
+
+       rows = parse_encoded_data(encoded_data, params[:checksum])
+
+       @complete = rows[:data] == [ ]
+
+       unless @complete
+         import_rows(rows)
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     def self.parse_json(json)
+       hash = JSON.parse(json).symbolize_keys
+       hash[:state].symbolize_keys! if hash.has_key?(:state)
+       hash
+     end
+
+     def parse_encoded_data(encoded_data, checksum)
+       raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
+
+       begin
+         return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
+       rescue Object => e
+         unless ENV['NO_DUMP_MARSHAL_ERRORS']
+           puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+           File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+         end
+         raise e
+       end
+     end
+
+     def import_rows(rows)
+       table.import(rows[:header], rows[:data], :commit_every => 100)
+       state[:offset] += rows[:data].size
+     rescue Exception => ex
+       case ex.message
+       when /integer out of range/ then
+         raise Tapsoob::InvalidData, <<-ERROR, []
+ \nDetected integer data that exceeds the maximum allowable size for an integer type.
+ This generally occurs when importing from SQLite due to the fact that SQLite does
+ not enforce maximum values on integer types.
+         ERROR
+       else raise ex
+       end
+     end
+
+     def verify_stream
+       state[:offset] = table.count
+     end
+
+     def self.factory(db, state)
+       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+         Sequel::MySQL.convert_invalid_date_time = :nil
+       end
+
+       if state.has_key?(:klass)
+         return eval(state[:klass]).new(db, state)
+       end
+
+       if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+         DataStreamKeyed.new(db, state)
+       else
+         DataStream.new(db, state)
+       end
+     end
+   end
+
+   class DataStreamKeyed < DataStream
+     attr_accessor :buffer
+
+     def initialize(db, state)
+       super(db, state)
+       @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @buffer = []
+     end
+
+     def primary_key
+       state[:primary_key].to_sym
+     end
+
+     def buffer_limit
+       if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+         state[:last_fetched]
+       else
+         state[:filter]
+       end
+     end
+
+     def calc_limit(chunksize)
+       # we want to not fetch more than is needed while we're
+       # inside sinatra but locally we can select more than
+       # is strictly needed
+       if defined?(Sinatra)
+         (chunksize * 1.1).ceil
+       else
+         (chunksize * 3).ceil
+       end
+     end
+
+     def load_buffer(chunksize)
+       # make sure BasicObject is not polluted by subsequent requires
+       Sequel::BasicObject.remove_methods!
+
+       num = 0
+       loop do
+         limit = calc_limit(chunksize)
+         # we have to use local variables in order for the virtual row filter to work correctly
+         key = primary_key
+         buf_limit = buffer_limit
+         ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+         log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+         data = ds.all
+         self.buffer += data
+         num += data.size
+         if data.size > 0
+           # keep a record of the last primary key value in the buffer
+           state[:filter] = self.buffer.last[ primary_key ]
+         end
+
+         break if num >= chunksize or data.size == 0
+       end
+     end
+
+     def fetch_buffered(chunksize)
+       load_buffer(chunksize) if self.buffer.size < chunksize
+       rows = buffer.slice(0, chunksize)
+       state[:last_fetched] = if rows.size > 0
+         rows.last[ primary_key ]
+       else
+         nil
+       end
+       rows
+     end
+
+     #def import_rows(rows)
+     #  table.import(rows[:header], rows[:data])
+     #end
+
+     #def fetch_rows
+     #  chunksize = state[:chunksize]
+     #  Tapsoob::Utils.format_data(fetch_buffered(chunksize) || [],
+     #    :string_columns => string_columns)
+     #end
+
+     def increment(row_count)
+       # pop the rows we just successfully sent off the buffer
+       @buffer.slice!(0, row_count)
+     end
+
+     def verify_stream
+       key = primary_key
+       ds = table.order(*order_by)
+       current_filter = ds.max(key.sql_number)
+
+       # set the current filter to the max of the primary key
+       state[:filter] = current_filter
+       # clear out the last_fetched value so it can restart from scratch
+       state[:last_fetched] = nil
+
+       log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
+     end
+   end
+ end
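
For context, a caller typically drives this stream by building it through DataStream.factory with a Sequel database handle and a state hash, then pulling chunks until complete? flips. The sketch below is illustrative only: the connection URL, table name, dump path, and require path are placeholder assumptions, and the gem's own commands wire this up through their higher-level options.

  require 'sequel'
  require 'tapsoob/data_stream'   # assumed require path for the file above

  db     = Sequel.connect("sqlite://example.db")   # placeholder source database
  stream = Tapsoob::DataStream.factory(db, :table_name => :users, :chunksize => 500)

  until stream.complete?
    # fetch_database pulls one chunk, verifies its checksum, and writes it into
    # the dump directory via Tapsoob::Utils.export_rows
    exported = stream.fetch_database("/tmp/dump")   # placeholder dump directory
    Tapsoob.log.debug "exported #{exported} rows, offset now #{stream.state[:offset]}"
  end
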
@@ -0,0 +1,16 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module Tapsoob
3
+ class BaseError < StandardError
4
+ attr_reader :original_backtrace
5
+
6
+ def initialize(message, opts = {})
7
+ @original_backtrace = opts.delete(:backtrace)
8
+ super(message)
9
+ end
10
+ end
11
+
12
+ class NotImplemented < BaseError; end
13
+ class DuplicatePrimaryKeyError < BaseError; end
14
+ class CorruptedData < BaseError; end
15
+ class InvalidData < BaseError; end
16
+ end
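
These are the error classes the data stream raises above (CorruptedData on a checksum mismatch, InvalidData on oversized integers). A minimal, purely illustrative sketch of the optional :backtrace payload, with an assumed require path:

  require 'tapsoob/errors'   # assumed require path for the file above

  begin
    raise Tapsoob::CorruptedData.new("Checksum Failed", :backtrace => caller)
  rescue Tapsoob::BaseError => e
    warn e.message
    warn((e.original_backtrace || []).join("\n"))   # backtrace captured at raise time, if one was passed
  end
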
@@ -0,0 +1,16 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module Tapsoob
3
+ def self.log=(log)
4
+ @@log = log
5
+ end
6
+
7
+ def self.log
8
+ @@log ||= begin
9
+ require 'logger'
10
+ log = Logger.new($stderr)
11
+ log.level = Logger::ERROR
12
+ log.datetime_format = "%Y-%m-%d %H:%M:%S"
13
+ log
14
+ end
15
+ end
16
+ end
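
By default Tapsoob.log lazily builds a Logger that writes to $stderr at ERROR level, so the DataStream debug lines stay silent. A caller can swap in its own logger before starting a run; a minimal sketch, with an assumed require path:

  require 'logger'
  require 'tapsoob/log'   # assumed require path for the file above

  Tapsoob.log = Logger.new($stdout)
  Tapsoob.log.level = Logger::DEBUG   # surfaces the "DataStream#... SQL ->" debug output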