tapsoob 0.7.17 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ # Shared examples that any adapter-specific integration suite can include.
2
+ # The including example group must define before(:all) that sets:
3
+ # @src_url, @dst_url — Sequel connection URLs
4
+ # @src_db, @dst_db — connected Sequel::Database objects
5
+ # Individual examples access these via the src_url/dst_url/src_db/dst_db helpers
6
+ # defined in DbHelpers (which delegate to the ivars set in before(:all)).
7
+
8
+ RSpec.shared_examples 'a complete round-trip' do # serial pull/push checks; relies on the src_*/dst_* helpers described above
9
+ it 'pulls without error' do
10
+ expect { pull(src_url, dump_dir) }.not_to raise_error
11
+ end
12
+
13
+ it 'creates schema dump files for every table' do
14
+ pull(src_url, dump_dir)
15
+ src_db.tables.each do |table|
16
+ expect(File).to exist(File.join(dump_dir, 'schemas', "#{table}.rb"))
17
+ end
18
+ end
19
+
20
+ it 'creates data dump files for every seeded table' do
21
+ pull(src_url, dump_dir)
22
+ %i[users orders products documents attachments events large_table null_heavy].each do |table|
23
+ expect(File).to exist(File.join(dump_dir, 'data', "#{table}.json"))
24
+ end
25
+ end
26
+
27
+ it 'pushes without error' do
28
+ pull(src_url, dump_dir)
29
+ expect { push(dst_url, dump_dir) }.not_to raise_error
30
+ end
31
+
32
+ it 'preserves row counts for all tables' do
33
+ round_trip(src_url, dst_url, dump_dir)
34
+ expect_same_counts(src_db, dst_db)
35
+ end
36
+
37
+ it 'preserves NULL values in null_heavy' do
38
+ round_trip(src_url, dst_url, dump_dir)
39
+ null_rows = dst_db[:null_heavy].where(maybe_name: nil).count
40
+ expect(null_rows).to be > 0 # only proves that at least one NULL survived the transfer
41
+ end
42
+
43
+ it 'preserves string content in users.email' do
44
+ round_trip(src_url, dst_url, dump_dir)
45
+ src_emails = src_db[:users].select_map(:email).sort
46
+ dst_emails = dst_db[:users].select_map(:email).sort
47
+ expect(dst_emails).to eq(src_emails)
48
+ end
49
+
50
+ it 'preserves BLOB payloads in attachments' do
51
+ round_trip(src_url, dst_url, dump_dir)
52
+ src_db[:attachments].order(:id).each do |src_row|
53
+ dst_row = dst_db[:attachments][id: src_row[:id]]
54
+ expect(dst_row).not_to be_nil
55
+ expect(dst_row[:payload].to_s.bytes).to eq(src_row[:payload].to_s.bytes)
56
+ end
57
+ end
58
+
59
+ it 'preserves large TEXT bodies in documents' do
60
+ round_trip(src_url, dst_url, dump_dir)
61
+ src_db[:documents].order(:id).each do |src_row|
62
+ dst_row = dst_db[:documents][id: src_row[:id]]
63
+ expect(dst_row).to include(body: src_row[:body]) # a missing row is a clean failure, not a NoMethodError on nil
64
+ end
65
+ end
66
+
67
+ it 'handles the no-PK events table' do
68
+ round_trip(src_url, dst_url, dump_dir)
69
+ expect(dst_db[:events].count).to eq(src_db[:events].count)
70
+ end
71
+ end
72
+
73
+ RSpec.shared_examples 'a parallel round-trip' do |workers:| # workers: worker count, forwarded to round_trip as parallel:
74
+ it "preserves row counts with #{workers} parallel workers" do
75
+ round_trip(src_url, dst_url, dump_dir, parallel: workers)
76
+ expect_same_counts(src_db, dst_db)
77
+ end
78
+
79
+ it "handles the large_table (>100K rows) with #{workers} workers" do
80
+ round_trip(src_url, dst_url, dump_dir, parallel: workers)
81
+ expect(dst_db[:large_table].count).to eq(Fixtures::LARGE_TABLE_ROWS) # constant is defined outside this file
82
+ end
83
+ end
@@ -0,0 +1,163 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe 'Large dataset system tests', :system do
4
+ before(:all) do
5
+ @src_url = DbHelpers.adapt_url(ENV.fetch('SRC_DATABASE_URL', 'sqlite://tmp/tapsoob_system_src.db'))
6
+ @dst_url = DbHelpers.adapt_url(ENV.fetch('DST_DATABASE_URL', 'sqlite://tmp/tapsoob_system_dst.db'))
7
+
8
+ FileUtils.mkdir_p('tmp')
9
+ File.delete('tmp/tapsoob_system_src.db') rescue nil
10
+ File.delete('tmp/tapsoob_system_dst.db') rescue nil
11
+
12
+ @src_db = DbHelpers.connect(@src_url)
13
+ @dst_db = DbHelpers.connect(@dst_url)
14
+
15
+ Fixtures.create_tables(@src_db)
16
+ Fixtures.seed(@src_db)
17
+ end
18
+
19
+ before(:each) do # runs before every example in this group
20
+ Fixtures.drop_tables(@dst_db) # drops fixture tables on the destination (presumably so each push starts from an empty schema)
21
+ end
22
+
23
+ after(:all) do
24
+ Fixtures.drop_tables(@src_db)
25
+ Fixtures.drop_tables(@dst_db)
26
+ DbHelpers.disconnect_all
27
+ File.delete('tmp/tapsoob_system_src.db') rescue nil
28
+ File.delete('tmp/tapsoob_system_dst.db') rescue nil
29
+ end
30
+
31
+ # ── large_table: intra-table parallelization threshold ───────────────────────
32
+
33
+ describe 'large_table (150K rows)' do # row-count checks against Fixtures::LARGE_TABLE_ROWS plus a duplicate-id check
34
+ it 'transfers all rows in serial mode' do
35
+ round_trip(src_url, dst_url, dump_dir) # no parallel: option given
36
+ expect(dst_db[:large_table].count).to eq(Fixtures::LARGE_TABLE_ROWS)
37
+ end
38
+
39
+ it 'transfers all rows with parallel: 2' do
40
+ round_trip(src_url, dst_url, dump_dir, parallel: 2)
41
+ expect(dst_db[:large_table].count).to eq(Fixtures::LARGE_TABLE_ROWS)
42
+ end
43
+
44
+ it 'transfers all rows with parallel: 4' do
45
+ round_trip(src_url, dst_url, dump_dir, parallel: 4)
46
+ expect(dst_db[:large_table].count).to eq(Fixtures::LARGE_TABLE_ROWS)
47
+ end
48
+
49
+ it 'has no duplicate rows after parallel pull' do
50
+ round_trip(src_url, dst_url, dump_dir, parallel: 4)
51
+ total = dst_db[:large_table].count
52
+ distinct = dst_db[:large_table].select(:id).distinct.count # number of distinct id values
53
+ expect(distinct).to eq(total) # equal counts mean no id occurs twice
54
+ end
55
+ end
56
+
57
+ # ── documents: large TEXT columns ────────────────────────────────────────────
58
+
59
+ describe 'documents table (large TEXT)' do
60
+ it 'preserves body content exactly' do
61
+ round_trip(src_url, dst_url, dump_dir)
62
+ src_db[:documents].order(:id).each do |src_row|
63
+ dst_row = dst_db[:documents][id: src_row[:id]]
64
+ expect(dst_row[:body]).to eq(src_row[:body]),
65
+ "body mismatch for document #{src_row[:id]}: " \
66
+ "src=#{src_row[:body]&.length} bytes dst=#{dst_row[:body]&.length} bytes"
67
+ end
68
+ end
69
+
70
+ it 'handles documents with nil body' do
71
+ round_trip(src_url, dst_url, dump_dir)
72
+ expect(dst_db[:documents].where(body: nil).count).to eq(src_db[:documents].where(body: nil).count)
73
+ end
74
+ end
75
+
76
+ # ── attachments: BLOB encoding/decoding ──────────────────────────────────────
77
+
78
+ describe 'attachments table (binary BLOBs up to 256 KB)' do
79
+ it 'preserves every byte of every payload' do
80
+ round_trip(src_url, dst_url, dump_dir)
81
+ mismatch_count = 0
82
+ src_db[:attachments].order(:id).each do |src_row|
83
+ dst_row = dst_db[:attachments][id: src_row[:id]]
84
+ mismatch_count += 1 unless dst_row[:payload].to_s.bytes == src_row[:payload].to_s.bytes
85
+ end
86
+ expect(mismatch_count).to eq(0)
87
+ end
88
+
89
+ it 'preserves size_bytes metadata' do
90
+ round_trip(src_url, dst_url, dump_dir)
91
+ src_db[:attachments].order(:id).each do |src_row|
92
+ dst_row = dst_db[:attachments][id: src_row[:id]]
93
+ expect(dst_row[:size_bytes]).to eq(src_row[:size_bytes])
94
+ end
95
+ end
96
+ end
97
+
98
+ # ── null_heavy: NULL preservation ────────────────────────────────────────────
99
+
100
+ describe 'null_heavy table' do # per-column NULL counts must match between source and destination
101
+ it 'preserves NULLs in every nullable column' do
102
+ round_trip(src_url, dst_url, dump_dir)
103
+ %i[maybe_name maybe_number maybe_score maybe_date maybe_text].each do |col|
104
+ src_nulls = src_db[:null_heavy].where(col => nil).count # rows where this column IS NULL in the source
105
+ dst_nulls = dst_db[:null_heavy].where(col => nil).count
106
+ expect(dst_nulls).to eq(src_nulls), # custom message names the offending column
107
+ "NULL count mismatch for null_heavy.#{col}: src=#{src_nulls} dst=#{dst_nulls}"
108
+ end
109
+ end
110
+ end
111
+
112
+ # ── events: table without primary key ────────────────────────────────────────
113
+
114
+ describe 'events table (no primary key)' do
115
+ it 'uses the Base (non-keyed) stream' do # NOTE(review): only row counts are asserted; the stream class itself is never checked
116
+ round_trip(src_url, dst_url, dump_dir)
117
+ expect(dst_db[:events].count).to eq(src_db[:events].count)
118
+ end
119
+ end
120
+
121
+ # ── adaptive chunksize: very small chunks ────────────────────────────────────
122
+
123
+ describe 'adaptive chunksize under load' do
124
+ it 'completes with chunksize=1 (extreme case)' do
125
+ small_src = DbHelpers.adapt_url('sqlite://tmp/tapsoob_small_src.db')
126
+ small_dst = DbHelpers.adapt_url('sqlite://tmp/tapsoob_small_dst.db')
127
+ small_dir = Dir.mktmpdir
128
+
129
+ begin
130
+ sdb = DbHelpers.connect(small_src)
131
+ sdb.create_table!(:small_test) { primary_key :id; String :v, size: 50 }
132
+ 100.times { |i| sdb[:small_test].insert(v: "row_#{i}") }
133
+
134
+ round_trip(small_src, small_dst, small_dir, default_chunksize: 1)
135
+ expect(DbHelpers.connect(small_dst)[:small_test].count).to eq(100)
136
+ ensure
137
+ FileUtils.rm_rf(small_dir)
138
+ File.delete('tmp/tapsoob_small_src.db') rescue nil
139
+ File.delete('tmp/tapsoob_small_dst.db') rescue nil
140
+ # Reconnect suite DBs after disconnect_all clears the pool
141
+ DbHelpers.disconnect_all
142
+ @src_db = DbHelpers.connect(@src_url)
143
+ @dst_db = DbHelpers.connect(@dst_url)
144
+ end
145
+ end
146
+ end
147
+
148
+ # ── FK order: orders depends on users ────────────────────────────────────────
149
+
150
+ describe 'foreign key dependency ordering' do
151
+ it 'pushes users before orders (table_order.txt respected)' do
152
+ pull(src_url, dump_dir)
153
+ order_file = File.join(dump_dir, 'table_order.txt')
154
+ if File.exist?(order_file) # NOTE(review): when the file is absent the ordering check is skipped silently
155
+ order = File.readlines(order_file).map(&:strip) # presumably one table name per line
156
+ users_idx = order.index('users')
157
+ orders_idx = order.index('orders')
158
+ expect(users_idx).to be < orders_idx if users_idx && orders_idx # also skipped when either table is not listed
159
+ end
160
+ expect { push(dst_url, dump_dir) }.not_to raise_error # the push itself must succeed regardless of the file
161
+ end
162
+ end
163
+ end
@@ -0,0 +1,105 @@
1
+ require 'spec_helper'
2
+ require 'tapsoob/chunksize'
3
+
4
+ RSpec.describe Tapsoob::Chunksize do
5
+ subject(:cs) { described_class.new(1000) } # Chunksize built with an initial size of 1000, shared by the examples below
6
+
7
+ describe '#initialize' do # checks the state exposed right after construction
8
+ it 'stores the initial chunksize' do
9
+ expect(cs.to_i).to eq(1000)
10
+ end
11
+
12
+ it 'starts with zero retries' do
13
+ expect(cs.retries).to eq(0)
14
+ end
15
+
16
+ it 'starts with zero idle_secs' do
17
+ expect(cs.idle_secs).to eq(0.0)
18
+ end
19
+ end
20
+
21
+ describe '#reset_chunksize' do # expected fallback sizes: 10 for 0 or 1 retries, 1 for 2 retries
22
+ context 'with 0 retries (first failure)' do
23
+ it 'resets to 10' do
24
+ expect(cs.reset_chunksize).to eq(10) # retries is left at its initial 0
25
+ end
26
+ end
27
+
28
+ context 'with 1 retry' do
29
+ it 'resets to 10' do
30
+ cs.retries = 1
31
+ expect(cs.reset_chunksize).to eq(10)
32
+ end
33
+ end
34
+
35
+ context 'with 2+ retries' do # only retries = 2 is exercised here
36
+ it 'resets to 1' do
37
+ cs.retries = 2
38
+ expect(cs.reset_chunksize).to eq(1)
39
+ end
40
+ end
41
+ end
42
+
43
+ describe '#diff' do # diff is checked against hand-picked timing values
44
+ before do
45
+ cs.start_time = 0.0
46
+ cs.end_time = 10.0
47
+ cs.time_in_db = 3.0
48
+ cs.idle_secs = 2.0
49
+ end
50
+
51
+ it 'returns end_time - start_time - time_in_db - idle_secs' do
52
+ expect(cs.diff).to eq(5.0) # 10.0 - 0.0 - 3.0 - 2.0
53
+ end
54
+ end
55
+
56
+ describe '#calc_new_chunksize' do
57
+ def make(chunksize, diff_val)
58
+ c = described_class.new(chunksize)
59
+ # manufacture a diff by setting times such that diff == diff_val
60
+ c.start_time = 0.0
61
+ c.end_time = diff_val
62
+ c.time_in_db = 0.0
63
+ c.idle_secs = 0.0
64
+ c
65
+ end
66
+
67
+ it 'halves (roughly) when diff > 3.0' do
68
+ c = make(900, 3.5)
69
+ expect(c.calc_new_chunksize).to eq((900 / 3.0).ceil)
70
+ end
71
+
72
+ it 'decrements by 100 when diff is 1.1..3.0' do
73
+ c = make(900, 2.0)
74
+ expect(c.calc_new_chunksize).to eq(800)
75
+ end
76
+
77
+ it 'doubles when diff < 0.8' do
78
+ c = make(500, 0.5)
79
+ expect(c.calc_new_chunksize).to eq(1000)
80
+ end
81
+
82
+ it 'increments by 100 when diff is 0.8..1.1' do
83
+ c = make(500, 0.9)
84
+ expect(c.calc_new_chunksize).to eq(600)
85
+ end
86
+
87
+ it 'never returns less than 1' do
88
+ c = make(1, 5.0)
89
+ expect(c.calc_new_chunksize).to be >= 1
90
+ end
91
+
92
+ it 'holds chunksize unchanged when retries > 0' do
93
+ c = make(500, 0.5)
94
+ c.retries = 1
95
+ expect(c.calc_new_chunksize).to eq(500)
96
+ end
97
+ end
98
+
99
+ describe '#time_delta' do
100
+ it 'returns elapsed seconds for the block' do
101
+ delta = cs.time_delta { sleep 0.01 } # real 10 ms sleep, so this example depends on the wall clock
102
+ expect(delta).to be_between(0.005, 1.0) # wide bounds to tolerate scheduler jitter
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,220 @@
1
+ require 'spec_helper'
2
+ require 'tapsoob/data_stream'
3
+
4
+ # Shared low-level stream tests run against an in-process SQLite connection.
5
+ # They exercise the fetch/encode/decode cycle, completion logic, and the
6
+ # factory method – all without any filesystem dump.
7
+
8
+ RSpec.describe Tapsoob::DataStream do
9
+ # ── shared DB setup ──────────────────────────────────────────────────────────
10
+
11
+ let(:db) do
12
+ d = connect_sqlite
13
+ d.extension :schema_dumper
14
+ d.create_table(:stream_test) do
15
+ primary_key :id
16
+ String :label, size: 50
17
+ Integer :value
18
+ end
19
+ # Insert 50 rows
20
+ 50.times { |i| d[:stream_test].insert(label: "row_#{i}", value: i) }
21
+ d
22
+ end
23
+
24
+ let(:db_nopk) do
25
+ d = connect_sqlite
26
+ d.extension :schema_dumper
27
+ d.create_table(:nopk_test) do
28
+ String :key, size: 50
29
+ Integer :val
30
+ end
31
+ 20.times { |i| d[:nopk_test].insert(key: "k#{i}", val: i) }
32
+ d
33
+ end
34
+
35
+ after { db.disconnect; db_nopk.disconnect } # NOTE(review): referencing both lazy lets here builds and seeds both databases for every example, even ones that use only one
36
+
37
+ # ── Base ─────────────────────────────────────────────────────────────────────
38
+
39
+ describe Tapsoob::DataStream::Base do # described_class is Base for everything nested in this group
40
+ subject(:stream) do
41
+ described_class.new(db, { table_name: :stream_test, chunksize: 10 }) # 50-row table read 10 rows per fetch
42
+ end
43
+
44
+ describe '#fetch' do
45
+ it 'returns [encoded_data, row_count, elapsed]' do
46
+ encoded, count, elapsed = stream.fetch
47
+ expect(encoded).to be_a(String)
48
+ expect(count).to eq(10)
49
+ expect(elapsed).to be_a(Float)
50
+ end
51
+
52
+ it 'advances offset on each call' do
53
+ stream.fetch
54
+ expect(stream.state[:offset]).to eq(10)
55
+ stream.fetch
56
+ expect(stream.state[:offset]).to eq(20)
57
+ end
58
+ end
59
+
60
+ describe '#complete?' do
61
+ it 'is false before all rows are fetched' do
62
+ stream.fetch
63
+ expect(stream.complete?).to be false
64
+ end
65
+
66
+ it 'is true after all rows are fetched' do
67
+ 5.times { stream.fetch } # 5 fetches x 10 rows = all 50 rows
68
+ expect(stream.complete?).to be true
69
+ end
70
+ end
71
+
72
+ describe '#fetch_data_from_database' do
73
+ it 'yields a hash with :table_name, :header, :data, :types' do
74
+ encoded, _, _ = stream.fetch
75
+ data_params = {
76
+ state: stream.to_hash,
77
+ checksum: Tapsoob::Utils.checksum(encoded).to_s,
78
+ encoded_data: encoded
79
+ }
80
+ yielded = nil
81
+ stream.fetch_data_from_database(data_params) { |rows| yielded = rows }
82
+ expect(yielded).to include(:table_name, :header, :data) # NOTE(review): :types from the description is not asserted
83
+ end
84
+ end
85
+
86
+ describe '#parse_encoded_data' do
87
+ it 'raises CorruptedData on checksum mismatch' do
88
+ encoded, _, _ = stream.fetch
89
+ expect {
90
+ stream.parse_encoded_data(encoded, '0') # '0' stands in for a checksum that cannot match
91
+ }.to raise_error(Tapsoob::CorruptedData)
92
+ end
93
+ end
94
+
95
+ describe '.factory' do
96
+ it 'returns Keyed stream for table with integer PK' do
97
+ result = described_class.factory(db, { table_name: :stream_test, chunksize: 10 }, {})
98
+ expect(result).to be_a(Tapsoob::DataStream::Keyed)
99
+ end
100
+
101
+ it 'returns Base stream for table without integer PK' do
102
+ result = described_class.factory(db_nopk, { table_name: :nopk_test, chunksize: 10 }, {})
103
+ expect(result).to be_a(Tapsoob::DataStream::Base) # NOTE(review): be_a also matches subclasses; if Keyed inherits from Base this cannot tell them apart — confirm
104
+ end
105
+ end
106
+ end
107
+
108
+ # ── Keyed ────────────────────────────────────────────────────────────────────
109
+
110
+ describe Tapsoob::DataStream::Keyed do
111
+ subject(:stream) do
112
+ described_class.new(db, { table_name: :stream_test, chunksize: 10 })
113
+ end
114
+
115
+ it 'fetches all 50 rows without duplicates' do
116
+ all_ids = [] # every id value seen across all chunks
117
+ loop do
118
+ encoded, count, _ = stream.fetch
119
+ break if count == 0 # an empty chunk ends the loop
120
+ data_params = {
121
+ state: stream.to_hash,
122
+ checksum: Tapsoob::Utils.checksum(encoded).to_s,
123
+ encoded_data: encoded
124
+ }
125
+ stream.fetch_data_from_database(data_params) do |rows|
126
+ id_idx = rows[:header].index(:id) # position of the id column within each data row
127
+ all_ids.concat(rows[:data].map { |r| r[id_idx] })
128
+ end
129
+ break if stream.complete?
130
+ end
131
+ expect(all_ids.uniq.size).to eq(50) # 50 distinct ids ...
132
+ expect(all_ids.size).to eq(50) # ... and 50 in total, so none was fetched twice
133
+ end
134
+
135
+ describe '.calculate_pk_ranges' do
136
+ it 'returns the right number of ranges' do
137
+ ranges = described_class.calculate_pk_ranges(db, :stream_test, 4) # third argument is the requested range count
138
+ expect(ranges.size).to eq(4)
139
+ end
140
+
141
+ it 'covers the full PK range' do
142
+ min = db[:stream_test].min(:id)
143
+ max = db[:stream_test].max(:id)
144
+ ranges = described_class.calculate_pk_ranges(db, :stream_test, 4)
145
+ expect(ranges.first.first).to eq(min) # only the outer bounds are checked, not gaps between ranges
146
+ expect(ranges.last.last).to eq(max)
147
+ end
148
+ end
149
+ end
150
+
151
+ # ── KeyedPartition ───────────────────────────────────────────────────────────
152
+
153
+ describe Tapsoob::DataStream::KeyedPartition do
154
+ it 'fetches only rows within its assigned PK range' do
155
+ min = db[:stream_test].min(:id)
156
+ max = db[:stream_test].max(:id)
157
+ mid = (min + max) / 2
158
+
159
+ stream = described_class.new(db, {
160
+ table_name: :stream_test,
161
+ chunksize: 100,
162
+ partition_range: [min, mid]
163
+ })
164
+
165
+ all_ids = []
166
+ until stream.complete?
167
+ encoded, count, _ = stream.fetch
168
+ break if count == 0
169
+ data_params = {
170
+ state: stream.to_hash,
171
+ checksum: Tapsoob::Utils.checksum(encoded).to_s,
172
+ encoded_data: encoded
173
+ }
174
+ stream.fetch_data_from_database(data_params) do |rows|
175
+ id_idx = rows[:header].index(:id)
176
+ all_ids.concat(rows[:data].map { |r| r[id_idx] })
177
+ end
178
+ end
179
+
180
+ expect(all_ids).to all(be_between(min, mid))
181
+ end
182
+ end
183
+
184
+ # ── Interleaved ──────────────────────────────────────────────────────────────
185
+
186
+ describe Tapsoob::DataStream::Interleaved do
187
+ it 'two workers together cover all rows without overlap' do
188
+ worker0 = described_class.new(db, {
189
+ table_name: :stream_test, chunksize: 10, worker_id: 0, num_workers: 2
190
+ })
191
+ worker1 = described_class.new(db, {
192
+ table_name: :stream_test, chunksize: 10, worker_id: 1, num_workers: 2
193
+ })
194
+
195
+ def drain(stream, db)
196
+ ids = []
197
+ until stream.complete?
198
+ encoded, count, _ = stream.fetch
199
+ break if count == 0
200
+ params = {
201
+ state: stream.to_hash,
202
+ checksum: Tapsoob::Utils.checksum(encoded).to_s,
203
+ encoded_data: encoded
204
+ }
205
+ stream.fetch_data_from_database(params) do |rows|
206
+ id_idx = rows[:header].index(:id)
207
+ ids.concat(rows[:data].map { |r| r[id_idx] })
208
+ end
209
+ end
210
+ ids
211
+ end
212
+
213
+ ids0 = drain(worker0, db)
214
+ ids1 = drain(worker1, db)
215
+
216
+ expect((ids0 + ids1).sort).to eq((ids0 + ids1).uniq.sort)
217
+ expect((ids0 + ids1).size).to eq(50)
218
+ end
219
+ end
220
+ end