tapsoob 0.2.7-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/log'
+ require 'tapsoob/utils'
+
+ module Tapsoob
+   class DataStream
+     DEFAULT_CHUNKSIZE = 1000
+
+     attr_reader :db, :state
+
+     def initialize(db, state)
+       @db = db
+       @state = {
+         :offset => 0,
+         :avg_chunksize => 0,
+         :num_chunksize => 0,
+         :total_chunksize => 0
+       }.merge(state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @complete = false
+     end
+
+     def log
+       Tapsoob.log
+     end
+
+     def error=(val)
+       state[:error] = val
+     end
+
+     def error
+       state[:error] || false
+     end
+
+     def table_name
+       state[:table_name].to_sym
+     end
+
+     def table_name_sql
+       table_name
+     end
+
+     def to_hash
+       state.merge(:klass => self.class.to_s)
+     end
+
+     def to_json
+       JSON.generate(to_hash)
+     end
+
+     def string_columns
+       @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
+     end
+
+     def table
+       @table ||= db[table_name_sql]
+     end
+
+     def order_by(name=nil)
+       @order_by ||= begin
+         name ||= table_name
+         Tapsoob::Utils.order_by(db, name)
+       end
+     end
+
+     def increment(row_count)
+       state[:offset] += row_count
+     end
+
+     # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+     # goes below 100 or maybe if offset is > 1000
+     def fetch_rows
+       state[:chunksize] = fetch_chunksize
+       ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+       log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
+       rows = Tapsoob::Utils.format_data(ds.all,
+         :string_columns => string_columns,
+         :schema => db.schema(table_name),
+         :table => table_name
+       )
+       update_chunksize_stats
+       rows
+     end
+
+     def fetch_file(dump_path)
+       state[:chunksize] = fetch_chunksize
+       ds = JSON.parse(File.read(File.join(dump_path, "data", "#{table_name}.json")))
+       log.debug "DataStream#fetch_file"
+       rows = {
+         :header => ds["header"],
+         :data => ds["data"][state[:offset], (state[:offset] + state[:chunksize])] || [ ]
+       }
+       update_chunksize_stats
+       rows
+     end
+
+     def max_chunksize_training
+       20
+     end
+
+     def fetch_chunksize
+       chunksize = state[:chunksize]
+       return chunksize if state[:num_chunksize] < max_chunksize_training
+       return chunksize if state[:avg_chunksize] == 0
+       return chunksize if state[:error]
+       state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
+     end
+
+     def update_chunksize_stats
+       return if state[:num_chunksize] >= max_chunksize_training
+       state[:total_chunksize] += state[:chunksize]
+       state[:num_chunksize] += 1
+       state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+     end
+
+     def encode_rows(rows)
+       Tapsoob::Utils.base64encode(Marshal.dump(rows))
+     end
+
+     def fetch(opts = {})
+       opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+       log.debug "DataStream#fetch state -> #{state.inspect}"
+
+       t1 = Time.now
+       rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+       encoded_data = encode_rows(rows)
+       t2 = Time.now
+       elapsed_time = t2 - t1
+
+       if opts[:type] == "file"
+         @complete = rows[:data] == [ ]
+       else
+         @complete = rows == { }
+       end
+
+       [encoded_data, (@complete ? 0 : rows[:data].size), elapsed_time]
+     end
+
+     def complete?
+       @complete
+     end
+
+     def fetch_database(dump_path)
+       params = fetch_from_database
+       encoded_data = params[:encoded_data]
+       json = params[:json]
+
+       rows = parse_encoded_data(encoded_data, json[:checksum])
+
+       @complete = rows == { }
+
+       # update local state
+       state.merge!(json[:state].merge(:chunksize => state[:chunksize]))
+
+       unless @complete
+         Tapsoob::Utils.export_rows(dump_path, table_name, rows)
+         state[:offset] += rows[:data].size
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     def fetch_from_database
+       res = nil
+       log.debug "DataStream#fetch_from_database state -> #{state.inspect}"
+       state[:chunksize] = Tapsoob::Utils.calculate_chunksize(state[:chunksize]) do |c|
+         state[:chunksize] = c.to_i
+         encoded_data = fetch.first
+
+         checksum = Tapsoob::Utils.checksum(encoded_data).to_s
+
+         res = {
+           :json => { :checksum => checksum, :state => to_hash },
+           :encoded_data => encoded_data
+         }
+       end
+
+       res
+     end
+
+     def fetch_data_in_database(params)
+       encoded_data = params[:encoded_data]
+
+       rows = parse_encoded_data(encoded_data, params[:checksum])
+
+       @complete = rows[:data] == [ ]
+
+       unless @complete
+         import_rows(rows)
+         rows[:data].size
+       else
+         0
+       end
+     end
+
+     def self.parse_json(json)
+       hash = JSON.parse(json).symbolize_keys
+       hash[:state].symbolize_keys! if hash.has_key?(:state)
+       hash
+     end
+
+     def parse_encoded_data(encoded_data, checksum)
+       raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
+
+       begin
+         return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
+       rescue Object => e
+         unless ENV['NO_DUMP_MARSHAL_ERRORS']
+           puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+           File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+         end
+         raise e
+       end
+     end
+
+     def import_rows(rows)
+       table.import(rows[:header], rows[:data], :commit_every => 100)
+       state[:offset] += rows[:data].size
+     rescue Exception => ex
+       case ex.message
+       when /integer out of range/ then
+         raise Tapsoob::InvalidData, <<-ERROR, []
+ \nDetected integer data that exceeds the maximum allowable size for an integer type.
+ This generally occurs when importing from SQLite due to the fact that SQLite does
+ not enforce maximum values on integer types.
+         ERROR
+       else raise ex
+       end
+     end
+
+     def verify_stream
+       state[:offset] = table.count
+     end
+
+     def self.factory(db, state)
+       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+         Sequel::MySQL.convert_invalid_date_time = :nil
+       end
+
+       if state.has_key?(:klass)
+         return eval(state[:klass]).new(db, state)
+       end
+
+       if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+         DataStreamKeyed.new(db, state)
+       else
+         DataStream.new(db, state)
+       end
+     end
+   end
+
+   class DataStreamKeyed < DataStream
+     attr_accessor :buffer
+
+     def initialize(db, state)
+       super(db, state)
+       @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
+       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+       @buffer = []
+     end
+
+     def primary_key
+       state[:primary_key].to_sym
+     end
+
+     def buffer_limit
+       if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+         state[:last_fetched]
+       else
+         state[:filter]
+       end
+     end
+
+     def calc_limit(chunksize)
+       # we want to not fetch more than is needed while we're
+       # inside sinatra but locally we can select more than
+       # is strictly needed
+       if defined?(Sinatra)
+         (chunksize * 1.1).ceil
+       else
+         (chunksize * 3).ceil
+       end
+     end
+
+     def load_buffer(chunksize)
+       # make sure BasicObject is not polluted by subsequent requires
+       Sequel::BasicObject.remove_methods!
+
+       num = 0
+       loop do
+         limit = calc_limit(chunksize)
+         # we have to use local variables in order for the virtual row filter to work correctly
+         key = primary_key
+         buf_limit = buffer_limit
+         ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+         log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
+         data = ds.all
+         self.buffer += data
+         num += data.size
+         if data.size > 0
+           # keep a record of the last primary key value in the buffer
+           state[:filter] = self.buffer.last[ primary_key ]
+         end
+
+         break if num >= chunksize or data.size == 0
+       end
+     end
+
+     def fetch_buffered(chunksize)
+       load_buffer(chunksize) if self.buffer.size < chunksize
+       rows = buffer.slice(0, chunksize)
+       state[:last_fetched] = if rows.size > 0
+         rows.last[ primary_key ]
+       else
+         nil
+       end
+       rows
+     end
+
+     #def import_rows(rows)
+     #  table.import(rows[:header], rows[:data])
+     #end
+
+     #def fetch_rows
+     #  chunksize = state[:chunksize]
+     #  Tapsoob::Utils.format_data(fetch_buffered(chunksize) || [],
+     #    :string_columns => string_columns)
+     #end
+
+     def increment(row_count)
+       # pop the rows we just successfully sent off the buffer
+       @buffer.slice!(0, row_count)
+     end
+
+     def verify_stream
+       key = primary_key
+       ds = table.order(*order_by)
+       current_filter = ds.max(key.sql_number)
+
+       # set the current filter to the max of the primary key
+       state[:filter] = current_filter
+       # clear out the last_fetched value so it can restart from scratch
+       state[:last_fetched] = nil
+
+       log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
+     end
+   end
+ end
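
For orientation, a minimal usage sketch of the DataStream API added above, assuming a Sequel connection and an existing dump directory; the connection URL, table name, and path are illustrative placeholders rather than anything defined by the gem.

# Illustrative sketch only: drive DataStream.factory and fetch_database to
# export one table chunk by chunk. The URL, table, and path are placeholders.
require 'sequel'
require 'tapsoob/data_stream'

db        = Sequel.connect('sqlite://example.db')   # placeholder database
dump_path = './dump'                                 # placeholder dump directory

stream = Tapsoob::DataStream.factory(db, { :table_name => :users, :chunksize => 500 })

until stream.complete?
  exported = stream.fetch_database(dump_path)        # writes one chunk via Tapsoob::Utils.export_rows
  Tapsoob.log.debug "exported #{exported} rows from #{stream.table_name}"
end
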
@@ -0,0 +1,16 @@
+ # -*- encoding : utf-8 -*-
+ module Tapsoob
+   class BaseError < StandardError
+     attr_reader :original_backtrace
+
+     def initialize(message, opts = {})
+       @original_backtrace = opts.delete(:backtrace)
+       super(message)
+     end
+   end
+
+   class NotImplemented < BaseError; end
+   class DuplicatePrimaryKeyError < BaseError; end
+   class CorruptedData < BaseError; end
+   class InvalidData < BaseError; end
+ end
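
A small sketch of how these error classes can be raised while keeping the backtrace of an underlying failure via the :backtrace option accepted by BaseError; the wrapper method and the ArgumentError below are illustrative, not part of the gem.

# Illustrative sketch: wrap a lower-level failure in a Tapsoob error,
# preserving its backtrace through the :backtrace option shown above.
require 'tapsoob/errors'

def wrap_failure
  yield
rescue StandardError => e
  raise Tapsoob::CorruptedData.new("Checksum Failed", :backtrace => e.backtrace)
end

begin
  wrap_failure { raise ArgumentError, "bad chunk" }   # placeholder failure
rescue Tapsoob::BaseError => err
  puts err.message                        # => "Checksum Failed"
  puts err.original_backtrace.first(3)    # backtrace of the original error
end
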
@@ -0,0 +1,16 @@
+ # -*- encoding : utf-8 -*-
+ module Tapsoob
+   def self.log=(log)
+     @@log = log
+   end
+
+   def self.log
+     @@log ||= begin
+       require 'logger'
+       log = Logger.new($stderr)
+       log.level = Logger::ERROR
+       log.datetime_format = "%Y-%m-%d %H:%M:%S"
+       log
+     end
+   end
+ end
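
The module-level logger defaults to $stderr at ERROR level, so the log.debug calls in DataStream produce no output unless a more verbose logger is assigned. A short sketch of swapping one in; the log file name is a placeholder.

# Illustrative sketch: replace the default logger so DataStream's
# log.debug statements are actually emitted. The file name is a placeholder.
require 'logger'
require 'tapsoob/log'

verbose = Logger.new('tapsoob.log')   # or Logger.new($stdout)
verbose.level = Logger::DEBUG
Tapsoob.log = verbose

Tapsoob.log.debug "logger configured"   # now emitted at DEBUG level
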