tapsoob 0.6.2-java → 0.7.0-java

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,124 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     class Keyed < Base
+       attr_accessor :buffer
+
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
+         @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+         @buffer = []
+       end
+
+       def primary_key
+         state[:primary_key].to_sym
+       end
+
+       def buffer_limit
+         if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
+           state[:last_fetched]
+         else
+           state[:filter]
+         end
+       end
+
+       def calc_limit(chunksize)
+         # we want to not fetch more than is needed while we're
+         # inside sinatra but locally we can select more than
+         # is strictly needed
+         if defined?(Sinatra)
+           (chunksize * 1.1).ceil
+         else
+           (chunksize * 3).ceil
+         end
+       end
+
+       def load_buffer(chunksize)
+         num = 0
+         loop do
+           limit = calc_limit(chunksize)
+           # we have to use local variables in order for the virtual row filter to work correctly
+           key = primary_key
+           buf_limit = buffer_limit
+           ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
+           log.debug "DataStream::Keyed#load_buffer SQL -> #{ds.sql}"
+           data = ds.all
+           self.buffer += data
+           num += data.size
+           if data.any?
+             # keep a record of the last primary key value in the buffer
+             state[:filter] = self.buffer.last[primary_key]
+           end
+
+           break if num >= chunksize || data.empty?
+         end
+       end
+
+       def fetch_buffered(chunksize)
+         load_buffer(chunksize) if buffer.size < chunksize
+         rows = buffer.slice(0, chunksize)
+         state[:last_fetched] = rows.any? ? rows.last[primary_key] : nil
+         rows
+       end
+
+       def increment(row_count)
+         # pop the rows we just successfully sent off the buffer
+         @buffer.slice!(0, row_count)
+       end
+
+       def verify_stream
+         key = primary_key
+         ds = table.order(*order_by)
+         current_filter = ds.max(key.sql_number)
+
+         # set the current filter to the max of the primary key
+         state[:filter] = current_filter
+         # clear out the last_fetched value so it can restart from scratch
+         state[:last_fetched] = nil
+
+         log.debug "DataStream::Keyed#verify_stream -> state: #{state.inspect}"
+       end
+
+       # Calculate PK range for partitioning
+       def self.calculate_pk_ranges(db, table_name, num_partitions)
+         key = Tapsoob::Utils.order_by(db, table_name).first
+         ds = db[table_name.to_sym]
+
+         # Get total row count
+         total_rows = ds.count
+         return [[ds.min(key) || 0, ds.max(key) || 0]] if total_rows == 0 || num_partitions <= 1
+
+         # Calculate target rows per partition
+         rows_per_partition = (total_rows.to_f / num_partitions).ceil
+
+         # Find PK boundaries at percentiles using OFFSET
+         # This ensures even distribution of ROWS, not PK values
+         ranges = []
+         (0...num_partitions).each do |i|
+           # Calculate row offset for this partition's start
+           start_offset = i * rows_per_partition
+           end_offset = [(i + 1) * rows_per_partition - 1, total_rows - 1].min
+
+           # Get the PK value at this row offset
+           start_pk = ds.order(key).limit(1, start_offset).select(key).first
+           start_pk = start_pk ? start_pk[key] : (ds.min(key) || 0)
+
+           # Get the PK value at the end offset (or max for last partition)
+           if i == num_partitions - 1
+             end_pk = ds.max(key) || start_pk
+           else
+             end_pk_row = ds.order(key).limit(1, end_offset).select(key).first
+             end_pk = end_pk_row ? end_pk_row[key] : start_pk
+           end
+
+           ranges << [start_pk, end_pk]
+         end
+
+         ranges
+       end
+     end
+   end
+ end
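The hunk above appears to add Tapsoob::DataStream::Keyed (presumably lib/tapsoob/data_stream/keyed.rb). It carries over the old keyed-buffer streaming logic and gains a calculate_pk_ranges class method that splits a table into row-balanced primary-key ranges using OFFSET lookups. A minimal usage sketch, assuming a Sequel connection and a :users table with a single integer primary key (both hypothetical, not part of the diff):

    # Hypothetical sketch -- the connection URL and :users table are assumptions.
    require 'sequel'
    require 'tapsoob/data_stream/keyed'

    db = Sequel.connect('postgres://localhost/mydb')

    # Split :users into 4 ranges holding roughly equal numbers of rows.
    ranges = Tapsoob::DataStream::Keyed.calculate_pk_ranges(db, :users, 4)

    ranges.each_with_index do |(min_pk, max_pk), i|
      puts "partition #{i}: pk #{min_pk}..#{max_pk}"
    end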
@@ -0,0 +1,64 @@
+ # -*- encoding : utf-8 -*-
+ require 'tapsoob/data_stream/base'
+
+ module Tapsoob
+   module DataStream
+     # DataStream variant for PK-based range partitioning
+     class KeyedPartition < Base
+       def initialize(db, state, opts = {})
+         super(db, state, opts)
+         # :partition_range = [min_pk, max_pk] for this partition
+         # :last_pk = last primary key value fetched
+         @state = {
+           :partition_range => nil,
+           :last_pk => nil
+         }.merge(@state)
+       end
+
+       def primary_key
+         @primary_key ||= Tapsoob::Utils.order_by(db, table_name).first
+       end
+
+       def fetch_rows
+         return {} if state[:partition_range].nil?
+
+         # Only count once on first fetch
+         state[:size] ||= table.count
+
+         min_pk, max_pk = state[:partition_range]
+         chunksize = state[:chunksize]
+
+         # Build query with PK range filter
+         key = primary_key
+         last = state[:last_pk] || (min_pk - 1)
+
+         ds = table.order(*order_by).filter do
+           (Sequel.identifier(key) > last) & (Sequel.identifier(key) >= min_pk) & (Sequel.identifier(key) <= max_pk)
+         end.limit(chunksize)
+
+         data = ds.all
+
+         # Update last_pk for next fetch
+         if data.any?
+           state[:last_pk] = data.last[primary_key]
+         else
+           # No data found in this range - mark partition as complete
+           state[:last_pk] = max_pk
+         end
+
+         Tapsoob::Utils.format_data(db, data,
+           :string_columns => string_columns,
+           :schema => db.schema(table_name),
+           :table => table_name
+         )
+       end
+
+       def complete?
+         return true if state[:partition_range].nil?
+         min_pk, max_pk = state[:partition_range]
+         # Complete when we've fetched past the max PK
+         state[:last_pk] && state[:last_pk] >= max_pk
+       end
+     end
+   end
+ end
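This second hunk appears to add Tapsoob::DataStream::KeyedPartition, which walks exactly one of those PK ranges: fetch_rows pulls the next chunk after state[:last_pk] and complete? reports true once last_pk reaches the range's maximum. A sketch of driving a single partition follows; the :table_name, :chunksize and :partition_range state keys are taken from the code above, while the connection, table, and chunk handling are assumptions:

    # Hypothetical sketch -- connection, table and chunk handling are assumptions.
    require 'sequel'
    require 'tapsoob/data_stream/keyed_partition'

    db = Sequel.connect('postgres://localhost/mydb')
    min_pk, max_pk = 1, 2_500   # one range from Keyed.calculate_pk_ranges

    stream = Tapsoob::DataStream::KeyedPartition.new(db, {
      :table_name      => :users,
      :chunksize       => 500,
      :partition_range => [min_pk, max_pk]
    })

    until stream.complete?
      chunk = stream.fetch_rows   # formatted rows for this PK range (via Tapsoob::Utils.format_data)
      # ... hand the chunk to a writer/importer here ...
    end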
@@ -1,383 +1,12 @@
  # -*- encoding : utf-8 -*-
- require 'tapsoob/log'
- require 'tapsoob/utils'

  module Tapsoob
-   class DataStream
-     DEFAULT_CHUNKSIZE = 1000
-
-     attr_reader :db, :state, :options
-
-     def initialize(db, state, opts = {})
-       @db = db
-       @state = {
-         :offset => 0,
-         :avg_chunksize => 0,
-         :num_chunksize => 0,
-         :total_chunksize => 0
-       }.merge(state)
-       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-       @options = opts
-       @complete = false
-     end
-
-     def log
-       Tapsoob.log.level = Logger::DEBUG if state[:debug]
-       Tapsoob.log
-     end
-
-     def error=(val)
-       state[:error] = val
-     end
-
-     def error
-       state[:error] || false
-     end
-
-     def table_name
-       state[:table_name].to_sym
-     end
-
-     def table_name_sql
-       table_name
-     end
-
-     def to_hash
-       state.merge(:klass => self.class.to_s)
-     end
-
-     def to_json
-       JSON.generate(to_hash)
-     end
-
-     def string_columns
-       @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
-     end
-
-     def table
-       @table ||= db[table_name_sql]
-     end
-
-     def order_by(name=nil)
-       @order_by ||= begin
-         name ||= table_name
-         Tapsoob::Utils.order_by(db, name)
-       end
-     end
-
-     def increment(row_count)
-       state[:offset] += row_count
-     end
-
-     # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
-     # goes below 100 or maybe if offset is > 1000
-     def fetch_rows
-       #state[:chunksize] = fetch_chunksize
-       ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
-       state[:size] = table.count
-       log.debug "DataStream#fetch_rows SQL -> #{ds.sql}"
-       rows = Tapsoob::Utils.format_data(db, ds.all,
-         :string_columns => string_columns,
-         :schema => db.schema(table_name),
-         :table => table_name
-       )
-       update_chunksize_stats
-       rows
-     end
-
-     def fetch_file(dump_path)
-       #state[:chunksize] = fetch_chunksize
-       # Read NDJSON format - each line is a separate JSON chunk
-       file_path = File.join(dump_path, "data", "#{table_name}.json")
-
-       # Parse all chunks and combine them
-       all_data = []
-       table_name_val = nil
-       header_val = nil
-       types_val = nil
-
-       File.readlines(file_path).each do |line|
-         chunk = JSON.parse(line.strip)
-         table_name_val ||= chunk["table_name"]
-         header_val ||= chunk["header"]
-         types_val ||= chunk["types"]
-         all_data.concat(chunk["data"]) if chunk["data"]
-       end
-
-       # Apply skip-duplicates if needed
-       all_data = all_data.uniq if @options[:"skip-duplicates"]
-
-       state[:size] = all_data.size
-       log.debug "DataStream#fetch_file"
-
-       rows = {
-         :table_name => table_name_val,
-         :header => header_val,
-         :data => (all_data[state[:offset], state[:chunksize]] || []),
-         :types => types_val
-       }
-       update_chunksize_stats
-       rows
-     end
-
-     def max_chunksize_training
-       20
-     end
-
-     #def fetch_chunksize
-     #  chunksize = state[:chunksize]
-     #  return chunksize if state[:num_chunksize] < max_chunksize_training
-     #  return chunksize if state[:avg_chunksize] == 0
-     #  return chunksize if state[:error]
-     #  state[:avg_chunksize] > chunksize ? state[:avg_chunksize] : chunksize
-     #end
-
-     def update_chunksize_stats
-       return if state[:num_chunksize] >= max_chunksize_training
-       state[:total_chunksize] += state[:chunksize]
-       state[:num_chunksize] += 1
-       state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
-     end
-
-     def encode_rows(rows)
-       Tapsoob::Utils.base64encode(Marshal.dump(rows))
-     end
-
-     def fetch(opts = {})
-       opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
-
-       log.debug "DataStream#fetch state -> #{state.inspect}"
-
-       t1 = Time.now
-       rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
-       encoded_data = encode_rows(rows)
-       t2 = Time.now
-       elapsed_time = t2 - t1
-
-       state[:offset] += (rows == {} ? 0 : rows[:data].size)
-
-       [encoded_data, (rows == {} ? 0 : rows[:data].size), elapsed_time]
-     end
-
-     def complete?
-       state[:offset] >= state[:size]
-     end
-
-     def fetch_data_from_database(params)
-       encoded_data = params[:encoded_data]
-
-       rows = parse_encoded_data(encoded_data, params[:checksum])
-
-       # update local state
-       state.merge!(params[:state].merge(:chunksize => state[:chunksize]))
-
-       yield rows if block_given?
-       (rows == {} ? 0 : rows[:data].size)
-     end
-
-     def fetch_data_to_database(params)
-       encoded_data = params[:encoded_data]
-
-       rows = parse_encoded_data(encoded_data, params[:checksum])
-
-       import_rows(rows)
-       (rows == {} ? 0 : rows[:data].size)
-     end
-
-     def self.parse_json(json)
-       hash = JSON.parse(json).symbolize_keys
-       hash[:state].symbolize_keys! if hash.has_key?(:state)
-       hash
-     end
-
-     def parse_encoded_data(encoded_data, checksum)
-       raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
-
-       begin
-         return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
-       rescue Object => e
-         unless ENV['NO_DUMP_MARSHAL_ERRORS']
-           puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
-           File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
-         end
-         raise e
-       end
-     end
-
-     def import_rows(rows)
-       columns = rows[:header]
-       data = rows[:data]
-
-       # Only import existing columns
-       if table.columns.size != columns.size
-         existing_columns = table.columns.map(&:to_s)
-         additional_columns = columns - existing_columns
-         additional_columns_idxs = additional_columns.map { |c| columns.index(c) }
-         additional_columns_idxs.reverse.each do |idx|
-           columns.delete_at(idx)
-           rows[:types].delete_at(idx)
-         end
-         data.each_index { |didx| additional_columns_idxs.reverse.each { |idx| data[didx].delete_at(idx) } }
-       end
-
-       # Decode blobs
-       if rows.has_key?(:types) && rows[:types].include?("blob")
-         blob_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == "blob" }
-         data.each_index do |idx|
-           blob_indices.each do |bi|
-             data[idx][bi] = Sequel::SQL::Blob.new(Tapsoob::Utils.base64decode(data[idx][bi])) unless data[idx][bi].nil?
-           end
-         end
-       end
-
-       # Parse date/datetime/time columns
-       if rows.has_key?(:types)
-         %w(date datetime time).each do |type|
-           if rows[:types].include?(type)
-             type_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == type }
-             data.each_index do |idx|
-               type_indices.each do |ti|
-                 data[idx][ti] = Sequel.send("string_to_#{type}".to_sym, data[idx][ti]) unless data[idx][ti].nil?
-               end
-             end
-           end
-         end
-       end
-
-       # Remove id column
-       if @options[:"discard-identity"] && rows[:header].include?("id")
-         columns = rows[:header] - ["id"]
-         data = data.map { |d| d[1..-1] }
-       end
-
-       table.import(columns, data, :commit_every => 100)
-     rescue Exception => ex
-       case ex.message
-       when /integer out of range/ then
-         raise Tapsoob::InvalidData, <<-ERROR, []
-           \nDetected integer data that exceeds the maximum allowable size for an integer type.
-           This generally occurs when importing from SQLite due to the fact that SQLite does
-           not enforce maximum values on integer types.
-         ERROR
-       else raise ex
-       end
-     end
-
-     def verify_stream
-       state[:offset] = table.count
-     end
-
-     def self.factory(db, state, opts)
-       if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
-         Sequel::MySQL.convert_invalid_date_time = :nil
-       end
-
-       if state.has_key?(:klass)
-         return eval(state[:klass]).new(db, state, opts)
-       end
-
-       if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
-         DataStreamKeyed.new(db, state, opts)
-       else
-         DataStream.new(db, state, opts)
-       end
-     end
-   end
-
-   class DataStreamKeyed < DataStream
-     attr_accessor :buffer
-
-     def initialize(db, state, opts = {})
-       super(db, state, opts)
-       @state = { :primary_key => order_by(state[:table_name]).first, :filter => 0 }.merge(@state)
-       @state[:chunksize] ||= DEFAULT_CHUNKSIZE
-       @buffer = []
-     end
-
-     def primary_key
-       state[:primary_key].to_sym
-     end
-
-     def buffer_limit
-       if state[:last_fetched] and state[:last_fetched] < state[:filter] and self.buffer.size == 0
-         state[:last_fetched]
-       else
-         state[:filter]
-       end
-     end
-
-     def calc_limit(chunksize)
-       # we want to not fetch more than is needed while we're
-       # inside sinatra but locally we can select more than
-       # is strictly needed
-       if defined?(Sinatra)
-         (chunksize * 1.1).ceil
-       else
-         (chunksize * 3).ceil
-       end
-     end
-
-     def load_buffer(chunksize)
-       # make sure BasicObject is not polluted by subsequent requires
-       Sequel::BasicObject.remove_methods!
-
-       num = 0
-       loop do
-         limit = calc_limit(chunksize)
-         # we have to use local variables in order for the virtual row filter to work correctly
-         key = primary_key
-         buf_limit = buffer_limit
-         ds = table.order(*order_by).filter { key.sql_number > buf_limit }.limit(limit)
-         log.debug "DataStreamKeyed#load_buffer SQL -> #{ds.sql}"
-         data = ds.all
-         self.buffer += data
-         num += data.size
-         if data.size > 0
-           # keep a record of the last primary key value in the buffer
-           state[:filter] = self.buffer.last[ primary_key ]
-         end
-
-         break if num >= chunksize or data.size == 0
-       end
-     end
-
-     def fetch_buffered(chunksize)
-       load_buffer(chunksize) if self.buffer.size < chunksize
-       rows = buffer.slice(0, chunksize)
-       state[:last_fetched] = if rows.size > 0
-         rows.last[ primary_key ]
-       else
-         nil
-       end
-       rows
-     end
-
-     #def import_rows(rows)
-     #  table.import(rows[:header], rows[:data])
-     #end
-
-     #def fetch_rows
-     #  chunksize = state[:chunksize]
-     #  Tapsoob::Utils.format_data(fetch_buffered(chunksize) || [],
-     #    :string_columns => string_columns)
-     #end
-
-     def increment(row_count)
-       # pop the rows we just successfully sent off the buffer
-       @buffer.slice!(0, row_count)
-     end
-
-     def verify_stream
-       key = primary_key
-       ds = table.order(*order_by)
-       current_filter = ds.max(key.sql_number)
-
-       # set the current filter to the max of the primary key
-       state[:filter] = current_filter
-       # clear out the last_fetched value so it can restart from scratch
-       state[:last_fetched] = nil
-
-       log.debug "DataStreamKeyed#verify_stream -> state: #{state.inspect}"
-     end
+   module DataStream
+     # Require all DataStream classes
+     require 'tapsoob/data_stream/base'
+     require 'tapsoob/data_stream/keyed'
+     require 'tapsoob/data_stream/keyed_partition'
+     require 'tapsoob/data_stream/interleaved'
+     require 'tapsoob/data_stream/file_partition'
    end
  end
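The final hunk reduces the tapsoob/data_stream entry point to a namespace: the old Tapsoob::DataStream class and its DataStreamKeyed subclass are removed, replaced by a Tapsoob::DataStream module that simply requires the split-out base, keyed, keyed_partition, interleaved and file_partition files. Callers that referenced the old constants would presumably migrate to the namespaced classes; a hedged sketch, assuming Base keeps the old DataStream constructor signature (base.rb is not shown in this diff) and using a hypothetical connection and table:

    # Hypothetical migration sketch -- assumes Base preserves the old DataStream API.
    require 'sequel'
    require 'tapsoob/data_stream'   # loads base, keyed, keyed_partition, interleaved, file_partition

    db = Sequel.connect('postgres://localhost/mydb')

    # 0.6.x (removed by this diff):
    #   Tapsoob::DataStream.new(db, :table_name => :users)
    #   Tapsoob::DataStreamKeyed.new(db, :table_name => :users)

    # 0.7.0 namespaced equivalents (Base behaviour assumed; Keyed is shown in the first hunk):
    plain = Tapsoob::DataStream::Base.new(db, :table_name => :users)
    keyed = Tapsoob::DataStream::Keyed.new(db, :table_name => :users)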