tapsoob 0.6.2-java → 0.7.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -2
- data/lib/tapsoob/cli/data_stream.rb +3 -3
- data/lib/tapsoob/cli/root.rb +2 -3
- data/lib/tapsoob/data_stream/base.rb +315 -0
- data/lib/tapsoob/data_stream/file_partition.rb +87 -0
- data/lib/tapsoob/data_stream/interleaved.rb +80 -0
- data/lib/tapsoob/data_stream/keyed.rb +124 -0
- data/lib/tapsoob/data_stream/keyed_partition.rb +64 -0
- data/lib/tapsoob/data_stream.rb +7 -378
- data/lib/tapsoob/operation/base.rb +240 -0
- data/lib/tapsoob/operation/pull.rb +419 -0
- data/lib/tapsoob/operation/push.rb +446 -0
- data/lib/tapsoob/operation.rb +5 -664
- data/lib/tapsoob/progress/bar.rb +0 -4
- data/lib/tapsoob/progress/multi_bar.rb +90 -58
- data/lib/tapsoob/progress/thread_safe_bar.rb +0 -3
- data/lib/tapsoob/progress_event.rb +109 -0
- data/lib/tapsoob/version.rb +1 -1
- data/lib/tasks/tapsoob.rake +2 -2
- metadata +11 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 87a418bd365385b576c8eaeac8c5a54dec4b2254a3f7e30aa89a1e62bb0bb691
+  data.tar.gz: 3191066263768280a015c824411a797a5642052238c48f89cb55d84368bdb13b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5318c4e1212ed88cdb4e9468f7cc7e39024074fc34d76a00b972758d00a1e9ab1f6fb91e7ee15b65b5bc3c1c657509f1aa5db11d37b5113bc191e55f5a6606de
+  data.tar.gz: b6545013c50c34b73d9c0a62d8b2995749187aee051bc296a1f4d2d4754c0504a24d215554b436b4a958e3b46da833603ec291039d2b94b2f81d0ffecd5b5d02
data/README.md
CHANGED

@@ -14,6 +14,23 @@ Tapsoob currently rely on the Sequel ORM (<http://sequel.rubyforge.org/>) so we
 If you're using either Oracle or Oracle XE you will need some extra requirements. If you're using Ruby you'll need to have your ORACLE_HOME environnement variable set properly and the `ruby-oci8` gem installed. However if you're using jRuby you'll need to have the official Oracle JDBC driver (see here for more informations: <http://www.oracle.com/technetwork/articles/dsl/jruby-oracle11g-330825.html>) and it should be loaded prior to using Tapsoob otherwise you won't be able to connect the database.


+## Recent changes
+
+### 0.7.0
+
+### Features
+
+* Introducing a new CLI interface when dumping/loading data using parallelization, much cleaner and better way to keep track of what's going on.
+* Introducing de-facto intra-table parallelization for large tables which makes a tremendous difference (10x speed boost in most cases).
+* When using the `--progress=false` or `--no-progress` option there's now PROGRESS data being outputted to STDERR (only when dumping/loading to/from a directory).
+
+### Internal changes
+
+* Moved all DataStream* related classes into a specific module.
+* Moved all Operation related classes into a specific module.
+* Moved all progress bars related Classes into a specific module.
+
+
 ## Exporting your data

     tapsoob pull [OPTIONS] <dump_path> <database_url>
@@ -60,7 +77,7 @@ If you're using Rails, there's also two Rake tasks provided:
 * `tapsoob:pull` which dumps the database into a new folder under the `db` folder
 * `tapsoob:push` which reads the last dump you made from `tapsoob:pull` from the `db` folder

-##
+## Parallelization support from 0.6.1 onwards

 You can now dump/load a full database or data using parallelization to speed up the process at memory cost and database load like so :

@@ -78,7 +95,6 @@ Your exports can be moved from one machine to another for backups or replication

 ## ToDo

-* Add a compression layer
 * Tests (in progress)

data/lib/tapsoob/cli/data_stream.rb
CHANGED

@@ -27,7 +27,7 @@ module Tapsoob
         opts[:parallel] = 1
       end

-      op = Tapsoob::Operation.factory(:pull, database_url, dump_path, opts)
+      op = Tapsoob::Operation::Base.factory(:pull, database_url, dump_path, opts)
       op.pull_data
     end

@@ -46,7 +46,7 @@ module Tapsoob

       # If dump_path is provided, use the Operation class for proper parallel support
       if dump_path && Dir.exist?(dump_path)
-        op = Tapsoob::Operation.factory(:push, database_url, dump_path, opts)
+        op = Tapsoob::Operation::Base.factory(:push, database_url, dump_path, opts)
         op.push_data
       else
         # STDIN mode: read and import data directly (no parallel support for STDIN)
@@ -66,7 +66,7 @@ module Tapsoob
           db(database_url, opts)[table_name.to_sym].truncate
         end

-        stream = Tapsoob::DataStream.factory(db(database_url, opts), {
+        stream = Tapsoob::DataStream::Base.factory(db(database_url, opts), {
          table_name: table_name,
          chunksize: opts[:default_chunksize]
        }, { :"discard-identity" => opts[:"discard-identity"] || false, :purge => opts[:purge] || false, :debug => opts[:debug] })
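Note: the DataStream::Base API used here (factory, fetch_file, import_rows, complete?) is defined in the new data/lib/tapsoob/data_stream/base.rb shown further down. As a rough sketch of what driving such a stream looks like when loading a dump directory (illustrative only; the gem's actual loop lives in the new Operation classes, and db, dump_path, table_name and opts are placeholders):

    # Illustrative sketch, not the gem's code path: push one table's NDJSON
    # dump into the database using the DataStream::Base methods shown below.
    require 'tapsoob/data_stream/base'

    # db is an already-connected Sequel database handle (placeholder here).
    stream = Tapsoob::DataStream::Base.factory(db, {
      table_name: table_name,
      chunksize:  opts[:default_chunksize]
    }, opts)

    loop do
      rows = stream.fetch_file(dump_path)                # next chunk of NDJSON lines
      stream.import_rows(rows) unless rows[:data].to_a.empty?
      break if stream.complete?                          # all lines of the file read
    end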
data/lib/tapsoob/cli/root.rb
CHANGED

@@ -125,7 +125,7 @@ module Tapsoob
       FileUtils.mkpath "#{dump_path}/data"
       FileUtils.mkpath "#{dump_path}/indexes"

-      Tapsoob::Operation.factory(method, database_url, dump_path, opts).run
+      Tapsoob::Operation::Base.factory(method, database_url, dump_path, opts).run
     end

     def clientresumexfer(method, dump_path, database_url, opts)
@@ -134,7 +134,6 @@ module Tapsoob

       dump_path = dump_path || session.delete(:dump_path)

-      require 'taps/operation'

       newsession = session.merge({
         :default_chunksize => opts[:default_chunksize],
@@ -142,7 +141,7 @@ module Tapsoob
         :resume => true
       })

-      Tapsoob::Operation.factory(method, database_url, dump_path, newsession).run
+      Tapsoob::Operation::Base.factory(method, database_url, dump_path, newsession).run
     end
   end
 end
data/lib/tapsoob/data_stream/base.rb
ADDED

@@ -0,0 +1,315 @@
+# -*- encoding : utf-8 -*-
+require 'tapsoob/log'
+require 'tapsoob/utils'
+
+module Tapsoob
+  module DataStream
+    class Base
+      DEFAULT_CHUNKSIZE = 1000
+
+      attr_reader :db, :state, :options
+
+      def initialize(db, state, opts = {})
+        @db = db
+        @state = {
+          :offset => 0,
+          :avg_chunksize => 0,
+          :num_chunksize => 0,
+          :total_chunksize => 0
+        }.merge(state)
+        @state[:chunksize] ||= DEFAULT_CHUNKSIZE
+        @options = opts
+        @complete = false
+      end
+
+      def log
+        Tapsoob.log.level = Logger::DEBUG if state[:debug]
+        Tapsoob.log
+      end
+
+      def error=(val)
+        state[:error] = val
+      end
+
+      def error
+        state[:error] || false
+      end
+
+      def table_name
+        state[:table_name].to_sym
+      end
+
+      def table_name_sql
+        table_name
+      end
+
+      def to_hash
+        state.merge(:klass => self.class.to_s)
+      end
+
+      def to_json
+        JSON.generate(to_hash)
+      end
+
+      def string_columns
+        @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
+      end
+
+      def table
+        @table ||= db[table_name_sql]
+      end
+
+      def order_by(name=nil)
+        @order_by ||= begin
+          name ||= table_name
+          Tapsoob::Utils.order_by(db, name)
+        end
+      end
+
+      def increment(row_count)
+        state[:offset] += row_count
+      end
+
+      # keep a record of the average chunksize within the first few hundred thousand records, after chunksize
+      # goes below 100 or maybe if offset is > 1000
+      def fetch_rows
+        # Only count once on first fetch
+        state[:size] ||= table.count
+
+        ds = table.order(*order_by).limit(state[:chunksize], state[:offset])
+        log.debug "DataStream::Base#fetch_rows SQL -> #{ds.sql}"
+        rows = Tapsoob::Utils.format_data(db, ds.all,
+          :string_columns => string_columns,
+          :schema => db.schema(table_name),
+          :table => table_name
+        )
+        update_chunksize_stats
+        rows
+      end
+
+      def fetch_file(dump_path)
+        # Stream NDJSON format - read line by line without loading entire file
+        file_path = File.join(dump_path, "data", "#{table_name}.json")
+
+        # Initialize state on first call
+        unless state[:file_initialized]
+          state[:file_initialized] = true
+          state[:lines_read] = 0
+          state[:total_lines] = File.foreach(file_path).count
+        end
+
+        table_name_val = nil
+        header_val = nil
+        types_val = nil
+        data_batch = []
+
+        # Read from current offset
+        File.open(file_path, 'r') do |file|
+          # Skip to current offset
+          state[:lines_read].times { file.gets }
+
+          # Read chunksize worth of lines
+          state[:chunksize].times do
+            break if file.eof?
+            line = file.gets
+            next unless line
+
+            chunk = JSON.parse(line.strip)
+            table_name_val ||= chunk["table_name"]
+            header_val ||= chunk["header"]
+            types_val ||= chunk["types"]
+            data_batch.concat(chunk["data"]) if chunk["data"]
+
+            state[:lines_read] += 1
+          end
+        end
+
+        # Apply skip-duplicates if needed
+        data_batch = data_batch.uniq if @options[:"skip-duplicates"]
+
+        # Don't set state[:size] or state[:offset] here - they're managed separately
+        # for completion tracking based on actual data rows imported
+        log.debug "DataStream::Base#fetch_file: read #{data_batch.size} rows from #{state[:lines_read]} lines (total #{state[:total_lines]} lines in file)"
+
+        rows = {
+          :table_name => table_name_val,
+          :header => header_val,
+          :data => data_batch,
+          :types => types_val
+        }
+        update_chunksize_stats
+        rows
+      end
+
+      def max_chunksize_training
+        20
+      end
+
+      def update_chunksize_stats
+        return if state[:num_chunksize] >= max_chunksize_training
+        state[:total_chunksize] += state[:chunksize]
+        state[:num_chunksize] += 1
+        state[:avg_chunksize] = state[:total_chunksize] / state[:num_chunksize] rescue state[:chunksize]
+      end
+
+      def encode_rows(rows)
+        Tapsoob::Utils.base64encode(Marshal.dump(rows))
+      end
+
+      def fetch(opts = {})
+        opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+        log.debug "DataStream::Base#fetch state -> #{state.inspect}"
+
+        t1 = Time.now
+        rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+        encoded_data = encode_rows(rows)
+        t2 = Time.now
+        elapsed_time = t2 - t1
+
+        # Only increment offset for database fetches
+        # For file fetches, offset is managed by fetch_file (tracks lines read, not rows)
+        if opts[:type] != "file"
+          state[:offset] += (rows == {} ? 0 : rows[:data].size)
+        end
+
+        [encoded_data, (rows == {} ? 0 : rows[:data].size), elapsed_time]
+      end
+
+      def complete?
+        # For file-based loading, check if we've read all lines
+        if state[:file_initialized]
+          result = state[:lines_read] >= state[:total_lines]
+          log.debug "DataStream::Base#complete? (file) lines_read=#{state[:lines_read]} total_lines=#{state[:total_lines]} result=#{result} table=#{table_name}"
+          result
+        else
+          # For database fetching, check offset vs size
+          result = state[:offset] >= state[:size]
+          log.debug "DataStream::Base#complete? (db) offset=#{state[:offset]} size=#{state[:size]} result=#{result} table=#{table_name}"
+          result
+        end
+      end
+
+      def fetch_data_from_database(params)
+        encoded_data = params[:encoded_data]
+
+        rows = parse_encoded_data(encoded_data, params[:checksum])
+
+        # update local state
+        state.merge!(params[:state].merge(:chunksize => state[:chunksize]))
+
+        yield rows if block_given?
+        (rows == {} ? 0 : rows[:data].size)
+      end
+
+      def fetch_data_to_database(params)
+        encoded_data = params[:encoded_data]
+
+        rows = parse_encoded_data(encoded_data, params[:checksum])
+
+        log.debug "DataStream::Base#fetch_data_to_database: importing #{rows[:data] ? rows[:data].size : 0} rows for table #{table_name rescue 'unknown'}"
+        import_rows(rows)
+        (rows == {} ? 0 : rows[:data].size)
+      end
+
+      def self.parse_json(json)
+        hash = JSON.parse(json).symbolize_keys
+        hash[:state].symbolize_keys! if hash.has_key?(:state)
+        hash
+      end
+
+      def parse_encoded_data(encoded_data, checksum)
+        raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)
+
+        begin
+          return Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
+        rescue Object => e
+          unless ENV['NO_DUMP_MARSHAL_ERRORS']
+            puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
+            File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
+          end
+          raise e
+        end
+      end
+
+      def import_rows(rows)
+        columns = rows[:header]
+        data = rows[:data]
+
+        # Only import existing columns
+        if table.columns.size != columns.size
+          existing_columns = table.columns.map(&:to_s)
+          additional_columns = columns - existing_columns
+          additional_columns_idxs = additional_columns.map { |c| columns.index(c) }
+          additional_columns_idxs.reverse.each do |idx|
+            columns.delete_at(idx)
+            rows[:types].delete_at(idx)
+          end
+          data.each_index { |didx| additional_columns_idxs.reverse.each { |idx| data[didx].delete_at(idx) } }
+        end
+
+        # Decode blobs
+        if rows.has_key?(:types) && rows[:types].include?("blob")
+          blob_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == "blob" }
+          data.each_index do |idx|
+            blob_indices.each do |bi|
+              data[idx][bi] = Sequel::SQL::Blob.new(Tapsoob::Utils.base64decode(data[idx][bi])) unless data[idx][bi].nil?
+            end
+          end
+        end
+
+        # Parse date/datetime/time columns
+        if rows.has_key?(:types)
+          %w(date datetime time).each do |type|
+            if rows[:types].include?(type)
+              type_indices = rows[:types].each_index.select { |idx| rows[:types][idx] == type }
+              data.each_index do |idx|
+                type_indices.each do |ti|
+                  data[idx][ti] = Sequel.send("string_to_#{type}".to_sym, data[idx][ti]) unless data[idx][ti].nil?
+                end
+              end
+            end
+          end
+        end
+
+        # Remove id column
+        if @options[:"discard-identity"] && rows[:header].include?("id")
+          columns = rows[:header] - ["id"]
+          data = data.map { |d| d[1..-1] }
+        end
+
+        table.import(columns, data, :commit_every => 100)
+      rescue Exception => ex
+        case ex.message
+        when /integer out of range/ then
+          raise Tapsoob::InvalidData, <<-ERROR, []
+\nDetected integer data that exceeds the maximum allowable size for an integer type.
+This generally occurs when importing from SQLite due to the fact that SQLite does
+not enforce maximum values on integer types.
+          ERROR
+        else raise ex
+        end
+      end
+
+      def verify_stream
+        state[:offset] = table.count
+      end
+
+      def self.factory(db, state, opts)
+        if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
+          Sequel::MySQL.convert_invalid_date_time = :nil
+        end
+
+        if state.has_key?(:klass)
+          return eval(state[:klass]).new(db, state, opts)
+        end
+
+        if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
+          Tapsoob::DataStream::Keyed.new(db, state, opts)
+        else
+          Tapsoob::DataStream::Base.new(db, state, opts)
+        end
+      end
+    end
+  end
+end
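The same class also covers the database-to-file direction through fetch_rows. A minimal sketch of exporting one table with it, using only the methods defined above (illustrative only; the gem's real export path is the new Operation::Pull class, and database_url, dump_path and the :users table are placeholders):

    # Illustrative sketch: stream one table from a database into an NDJSON dump.
    require 'json'
    require 'sequel'
    require 'tapsoob/data_stream/base'

    db = Sequel.connect(database_url)   # placeholder connection
    # factory may hand back a Keyed stream when the table has a single integer PK
    stream = Tapsoob::DataStream::Base.factory(db, { table_name: :users, chunksize: 1000 }, {})

    File.open(File.join(dump_path, "data", "users.json"), "a") do |f|
      loop do
        rows = stream.fetch_rows               # also sets state[:size] on the first call
        break if rows == {} || rows[:data].empty?
        stream.increment(rows[:data].size)     # advance :offset so complete? can terminate
        f.puts(JSON.generate(rows))            # one NDJSON chunk per line
        break if stream.complete?
      end
    end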
data/lib/tapsoob/data_stream/file_partition.rb
ADDED

@@ -0,0 +1,87 @@
+# -*- encoding : utf-8 -*-
+require 'tapsoob/data_stream/base'
+
+module Tapsoob
+  module DataStream
+    # DataStream variant for file-based parallelized loading
+    # Each worker reads a different portion of the NDJSON file
+    class FilePartition < Base
+      def initialize(db, state, opts = {})
+        super(db, state, opts)
+        @state = {
+          :line_range => nil, # [start_line, end_line]
+          :lines_read => 0
+        }.merge(@state)
+
+        # Initialize current_line from line_range if provided
+        if @state[:line_range]
+          start_line, end_line = @state[:line_range]
+          @state[:current_line] = start_line
+        end
+      end
+
+      def fetch_file(dump_path)
+        return {} if state[:line_range].nil?
+
+        file_path = File.join(dump_path, "data", "#{table_name}.json")
+        start_line, end_line = state[:line_range]
+
+        table_name_val = nil
+        header_val = nil
+        types_val = nil
+        data_batch = []
+
+        # Read lines in this worker's range
+        File.open(file_path, 'r') do |file|
+          # Skip to current position
+          state[:current_line].times { file.gets }
+
+          # Read up to chunksize lines, but don't exceed end_line
+          lines_to_read = [state[:chunksize], end_line - state[:current_line] + 1].min
+          log.debug "DataStream::FilePartition#fetch_file: current_line=#{state[:current_line]} end_line=#{end_line} lines_to_read=#{lines_to_read} chunksize=#{state[:chunksize]} table=#{table_name}"
+
+          lines_to_read.times do
+            break if file.eof? || state[:current_line] > end_line
+
+            line = file.gets
+            next unless line
+
+            chunk = JSON.parse(line.strip)
+            table_name_val ||= chunk["table_name"]
+            header_val ||= chunk["header"]
+            types_val ||= chunk["types"]
+            data_batch.concat(chunk["data"]) if chunk["data"]
+
+            state[:current_line] += 1
+          end
+        end
+
+        log.debug "DataStream::FilePartition#fetch_file: read #{data_batch.size} rows in #{state[:current_line] - start_line} lines table=#{table_name}"
+
+        # Apply skip-duplicates if needed
+        data_batch = data_batch.uniq if @options[:"skip-duplicates"]
+
+        state[:size] = end_line - start_line + 1
+        state[:offset] = state[:current_line] - start_line
+
+        rows = {
+          :table_name => table_name_val,
+          :header => header_val,
+          :data => data_batch,
+          :types => types_val
+        }
+
+        update_chunksize_stats
+        rows
+      end
+
+      def complete?
+        return true if state[:line_range].nil?
+        start_line, end_line = state[:line_range]
+        result = state[:current_line] && state[:current_line] > end_line
+        log.debug "DataStream::FilePartition#complete? current_line=#{state[:current_line]} end_line=#{end_line} result=#{result} table=#{table_name}"
+        result
+      end
+    end
+  end
+end
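Nothing in this file decides which [start_line, end_line] slice each worker gets; that coordination happens in the new Operation classes, which are not shown in this excerpt. Purely as a hypothetical illustration of the arithmetic, contiguous ranges over an NDJSON file's line count could be derived like this (line_ranges is an invented helper for illustration, not a gem API):

    # Hypothetical sketch: split total_lines of an NDJSON dump into contiguous
    # [start_line, end_line] ranges, one per worker, matching the :line_range
    # state that FilePartition expects (0-indexed, inclusive).
    def line_ranges(total_lines, num_workers)
      return [] if total_lines.zero?
      per_worker = (total_lines.to_f / num_workers).ceil
      (0...num_workers).map do |w|
        start_line = w * per_worker
        end_line   = [start_line + per_worker - 1, total_lines - 1].min
        start_line <= end_line ? [start_line, end_line] : nil
      end.compact
    end

    # e.g. 10 lines across 3 workers => [[0, 3], [4, 7], [8, 9]]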
data/lib/tapsoob/data_stream/interleaved.rb
ADDED

@@ -0,0 +1,80 @@
+# -*- encoding : utf-8 -*-
+require 'tapsoob/data_stream/base'
+
+module Tapsoob
+  module DataStream
+    # DataStream variant for interleaved chunk-based partitioning (for tables without integer PK)
+    class Interleaved < Base
+      def initialize(db, state, opts = {})
+        super(db, state, opts)
+        # :worker_id = which worker this is (0-indexed)
+        # :num_workers = total number of workers
+        # :chunk_number = current chunk number for this worker
+        @state = {
+          :worker_id => 0,
+          :num_workers => 1,
+          :chunk_number => 0
+        }.merge(@state)
+      end
+
+      def fetch_rows
+        worker_id = state[:worker_id]
+        num_workers = state[:num_workers]
+        chunk_number = state[:chunk_number]
+        chunksize = state[:chunksize]
+
+        # Only count once on first fetch
+        state[:size] ||= table.count
+
+        # Calculate which global chunk this worker should fetch
+        # Worker 0: chunks 0, num_workers, 2*num_workers, ...
+        # Worker 1: chunks 1, num_workers+1, 2*num_workers+1, ...
+        global_chunk_index = (chunk_number * num_workers) + worker_id
+        offset = global_chunk_index * chunksize
+
+        ds = table.order(*order_by).limit(chunksize, offset)
+        log.debug "DataStream::Interleaved#fetch_rows SQL -> #{ds.sql} (worker #{worker_id}/#{num_workers}, chunk #{chunk_number})"
+
+        rows = Tapsoob::Utils.format_data(db, ds.all,
+          :string_columns => string_columns,
+          :schema => db.schema(table_name),
+          :table => table_name
+        )
+
+        update_chunksize_stats
+        rows
+      end
+
+      def fetch(opts = {})
+        opts = (opts.empty? ? { :type => "database", :source => db.uri } : opts)
+
+        log.debug "DataStream::Interleaved#fetch state -> #{state.inspect}"
+
+        t1 = Time.now
+        rows = (opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows)
+        encoded_data = encode_rows(rows)
+        t2 = Time.now
+        elapsed_time = t2 - t1
+
+        row_count = (rows == {} ? 0 : rows[:data].size)
+
+        # Always increment chunk number to avoid infinite loops
+        # Even if we got 0 rows, move to the next chunk position
+        state[:chunk_number] += 1
+        state[:offset] += row_count
+
+        [encoded_data, row_count, elapsed_time]
+      end
+
+      def increment(row_count)
+        # This is called by the old code path - not used in new parallel implementation
+        state[:chunk_number] += 1
+        state[:offset] += row_count
+      end
+
+      def complete?
+        state[:offset] >= state[:size]
+      end
+    end
+  end
+end
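To make the interleaving arithmetic in fetch_rows above concrete, a small worked example with illustrative numbers (4 workers, chunksize 1000):

    # Worked example of the chunk interleaving (illustrative numbers only):
    # with 4 workers and a chunksize of 1_000, worker 1 fetching its third
    # chunk (chunk_number = 2) reads global chunk (2 * 4) + 1 = 9, i.e. an
    # OFFSET of 9_000 rows into the ordered table.
    num_workers  = 4
    chunksize    = 1_000
    worker_id    = 1
    chunk_number = 2

    global_chunk_index = (chunk_number * num_workers) + worker_id   # => 9
    offset             = global_chunk_index * chunksize             # => 9_000
    # Successive chunks for worker 1 therefore land at offsets 1_000, 5_000, 9_000, ...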