ingestr 0.6.6__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- ingestr/main.py +29 -4
- ingestr/src/factory.py +2 -0
- ingestr/src/sources.py +1 -3
- ingestr/src/sql_database/__init__.py +62 -28
- ingestr/src/sql_database/arrow_helpers.py +139 -0
- ingestr/src/sql_database/helpers.py +57 -33
- ingestr/src/sql_database/schema_types.py +58 -81
- ingestr/src/version.py +1 -1
- {ingestr-0.6.6.dist-info → ingestr-0.7.0.dist-info}/METADATA +2 -2
- {ingestr-0.6.6.dist-info → ingestr-0.7.0.dist-info}/RECORD +13 -17
- ingestr/main_test.py +0 -875
- ingestr/src/destinations_test.py +0 -113
- ingestr/src/factory_test.py +0 -13
- ingestr/src/gorgias/helpers_test.py +0 -45
- ingestr/src/sources_test.py +0 -96
- {ingestr-0.6.6.dist-info → ingestr-0.7.0.dist-info}/WHEEL +0 -0
- {ingestr-0.6.6.dist-info → ingestr-0.7.0.dist-info}/entry_points.txt +0 -0
- {ingestr-0.6.6.dist-info → ingestr-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main_test.py
DELETED
@@ -1,875 +0,0 @@
-import csv
-import os
-import random
-import shutil
-import string
-
-import duckdb
-from typer.testing import CliRunner
-
-from ingestr.main import app
-
-runner = CliRunner()
-
-
-def get_abs_path(relative_path):
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), relative_path))
-
-
-def invoke_ingest_command(
-    source_uri,
-    source_table,
-    dest_uri,
-    dest_table,
-    inc_strategy=None,
-    inc_key=None,
-    primary_key=None,
-    merge_key=None,
-    interval_start=None,
-    interval_end=None,
-    sql_backend=None,
-    loader_file_format=None,
-):
-    args = [
-        "ingest",
-        "--source-uri",
-        source_uri,
-        "--source-table",
-        source_table,
-        "--dest-uri",
-        dest_uri,
-        "--dest-table",
-        dest_table,
-    ]
-
-    if inc_strategy:
-        args.append("--incremental-strategy")
-        args.append(inc_strategy)
-
-    if inc_key:
-        args.append("--incremental-key")
-        args.append(inc_key)
-
-    if primary_key:
-        args.append("--primary-key")
-        args.append(primary_key)
-
-    if merge_key:
-        args.append("--merge-key")
-        args.append(merge_key)
-
-    if interval_start:
-        args.append("--interval-start")
-        args.append(interval_start)
-
-    if interval_end:
-        args.append("--interval-end")
-        args.append(interval_end)
-
-    if sql_backend:
-        args.append("--sql-backend")
-        args.append(sql_backend)
-
-    if loader_file_format:
-        args.append("--loader-file-format")
-        args.append(loader_file_format)
-
-    result = runner.invoke(
-        app,
-        args,
-        input="y\n",
-        env={"DISABLE_TELEMETRY": "true"},
-    )
-    return result
-
-
-### These are DuckDB-to-DuckDB tests
-def test_create_replace_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_create_replace_{get_random_string(5)}.db"
-
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema CASCADE")
-    conn.execute("CREATE SCHEMA testschema")
-    conn.execute(
-        "CREATE TABLE testschema.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
-    )
-    conn.execute("INSERT INTO testschema.input VALUES (1, 'val1', '2022-01-01')")
-    conn.execute("INSERT INTO testschema.input VALUES (2, 'val2', '2022-02-01')")
-
-    res = conn.sql("select count(*) from testschema.input").fetchall()
-    assert res[0][0] == 2
-
-    result = invoke_ingest_command(
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.input",
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.output",
-    )
-
-    print(result.stdout)
-
-    assert result.exit_code == 0
-
-    res = conn.sql(
-        "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema.output"
-    ).fetchall()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-02-01")
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_append_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_append.db")
-    rel_db_path_to_command = "ingestr/testdata/test_append.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_append CASCADE")
-    conn.execute("CHECKPOINT")
-
-    conn.execute("CREATE SCHEMA testschema_append")
-    conn.execute(
-        "CREATE TABLE testschema_append.input (id INTEGER, val VARCHAR, updated_at DATE)"
-    )
-    conn.execute(
-        "INSERT INTO testschema_append.input VALUES (1, 'val1', '2022-01-01'), (2, 'val2', '2022-01-02')"
-    )
-    conn.execute("CHECKPOINT")
-
-    res = conn.sql("select count(*) from testschema_append.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_append.input",
-            uri,
-            "testschema_append.output",
-            "append",
-            "updated_at",
-            sql_backend="sqlalchemy",
-        )
-        assert res.exit_code == 0
-
-    def get_output_table():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_append.output order by id asc"
-        ).fetchall()
-
-    run()
-
-    res = get_output_table()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-01-02")
-
-    # # run again, nothing should be inserted into the output table
-    run()
-
-    res = get_output_table()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-01-02")
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_merge_with_primary_key_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_merge_with_primary_key.db")
-    rel_db_path_to_command = "ingestr/testdata/test_merge_with_primary_key.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_merge CASCADE")
-    conn.execute("CREATE SCHEMA testschema_merge")
-    conn.execute(
-        "CREATE TABLE testschema_merge.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute("INSERT INTO testschema_merge.input VALUES (1, 'val1', '2022-01-01')")
-    conn.execute("INSERT INTO testschema_merge.input VALUES (2, 'val2', '2022-02-01')")
-
-    res = conn.sql("select count(*) from testschema_merge.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_merge.input",
-            uri,
-            "testschema_merge.output",
-            "merge",
-            "updated_at",
-            "id",
-            sql_backend="sqlalchemy",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_merge.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_merge.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we don't expect any changes since the data hasn't changed
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll modify the source data but not the updated at, the output table should not be updated
-    conn.execute("UPDATE testschema_merge.input SET val = 'val1_modified' WHERE id = 2")
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll insert a new row but with an old date, the new row will not show up
-    conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-01-01')")
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll insert a new row but with a new date, the new row will show up
-    conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-02-02')")
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-02-01"),
-            (3, "val3", "2022-02-02"),
-        ]
-    )
-
-    # we have a new run that inserted rows to this table, so the run count should be 2
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    # we don't care about the run ID
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    ##############################
-    # lastly, let's try modifying the updated_at of an old column, it should be updated in the output table
-    conn.execute(
-        "UPDATE testschema_merge.input SET val='val2_modified', updated_at = '2022-02-03' WHERE id = 2"
-    )
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2_modified", "2022-02-03"),
-            (3, "val3", "2022-02-02"),
-        ]
-    )
-
-    # we have a new run that inserted rows to this table, so the run count should be 2
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc, 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-    # we don't care about the rest of the run IDs
-    assert count_by_run_id[1][1] == 1
-    assert count_by_run_id[2][1] == 1
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_without_primary_key_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_delete_insert_without_primary_key.db")
-    rel_db_path_to_command = (
-        "ingestr/testdata/test_delete_insert_without_primary_key.db"
-    )
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert CASCADE")
-    conn.execute("CREATE SCHEMA testschema_delete_insert")
-    conn.execute(
-        "CREATE TABLE testschema_delete_insert.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01 00:00:00+00:00')"
-    )
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01 00:00:00+00:00')"
-    )
-
-    res = conn.sql("select count(*) from testschema_delete_insert.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_delete_insert.input",
-            uri,
-            "testschema_delete_insert.output",
-            inc_strategy="delete+insert",
-            inc_key="updated_at",
-            sql_backend="sqlalchemy",
-            loader_file_format="jsonl",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(CAST(updated_at AT TIME ZONE 'UTC' AS TIMESTAMP), '%Y-%m-%d %H:%M:%S') from testschema_delete_insert.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run()
-    assert_output_equals(
-        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
-    )
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_delete_insert.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
-    res = run()
-    assert_output_equals(
-        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
-    )
-
-    # we ensure that one of the rows is updated with a new run
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][0] == first_run_id
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[1][0] != first_run_id
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    ##############################
-    # now we'll insert a few more lines for the same day, the new rows should show up
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01 00:00:00+00:00'), (4, 'val4', '2022-02-01 00:00:00+00:00')"
-    )
-    conn.execute("CHECKPOINT")
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01 00:00:00"),
-            (2, "val2", "2022-02-01 00:00:00"),
-            (3, "val3", "2022-02-01 00:00:00"),
-            (4, "val4", "2022-02-01 00:00:00"),
-        ]
-    )
-
-    # the new rows should have a new run ID, there should be 2 distinct runs now
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 2 desc, 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][0] != first_run_id
-    assert count_by_run_id[0][1] == 3  # 2 new rows + 1 old row
-    assert count_by_run_id[1][0] == first_run_id
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_with_timerange_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_delete_insert_with_timerange.db")
-    rel_db_path_to_command = "ingestr/testdata/test_delete_insert_with_timerange.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert_timerange CASCADE")
-    conn.execute("CREATE SCHEMA testschema_delete_insert_timerange")
-    conn.execute(
-        "CREATE TABLE testschema_delete_insert_timerange.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute(
-        """INSERT INTO testschema_delete_insert_timerange.input VALUES
-        (1, 'val1', '2022-01-01T00:00:00Z'),
-        (2, 'val2', '2022-01-01T00:00:00Z'),
-        (3, 'val3', '2022-01-02T00:00:00Z'),
-        (4, 'val4', '2022-01-02T00:00:00Z'),
-        (5, 'val5', '2022-01-03T00:00:00Z'),
-        (6, 'val6', '2022-01-03T00:00:00Z')
-    """
-    )
-
-    res = conn.sql(
-        "select count(*) from testschema_delete_insert_timerange.input"
-    ).fetchall()
-    assert res[0][0] == 6
-
-    def run(start_date: str, end_date: str):
-        res = invoke_ingest_command(
-            uri,
-            "testschema_delete_insert_timerange.input",
-            uri,
-            "testschema_delete_insert_timerange.output",
-            inc_strategy="delete+insert",
-            inc_key="updated_at",
-            interval_start=start_date,
-            interval_end=end_date,
-            sql_backend="sqlalchemy",
-            loader_file_format="jsonl",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_delete_insert_timerange.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run(
-        "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
-    )  # dlt runs them with the end date exclusive
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_delete_insert_timerange.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
-    run(
-        "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
-    )  # dlt runs them with the end date exclusive
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
-
-    # both rows should have a new run ID
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][0] != first_run_id
-    assert count_by_run_id[0][1] == 2
-    ##############################
-
-    ##############################
-    # now run for the day after, new rows should land
-    run("2022-01-02T00:00:00Z", "2022-01-03T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-        ]
-    )
-
-    # there should be 4 rows with 2 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    ##############################
-
-    ##############################
-    # let's bring in the rows for the third day
-    run("2022-01-03T00:00:00Z", "2022-01-04T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-            (5, "val5", "2022-01-03"),
-            (6, "val6", "2022-01-03"),
-        ]
-    )
-
-    # there should be 6 rows with 3 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    assert count_by_run_id[2][1] == 2
-    ##############################
-
-    ##############################
-    # now let's do a backfill for the first day again, the rows should be updated
-    conn.execute(
-        "UPDATE testschema_delete_insert_timerange.input SET val = 'val1_modified' WHERE id = 1"
-    )
-
-    run("2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1_modified", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-            (5, "val5", "2022-01-03"),
-            (6, "val6", "2022-01-03"),
-        ]
-    )
-
-    # there should still be 6 rows with 3 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    assert count_by_run_id[2][1] == 2
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-### These are CSV-to-DuckDB tests
-def test_create_replace_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_create_replace_csv.db")
-    rel_db_path_to_command = "ingestr/testdata/test_create_replace_csv.db"
-    rel_source_path_to_command = "ingestr/testdata/create_replace.csv"
-
-    conn = duckdb.connect(abs_db_path)
-
-    result = invoke_ingest_command(
-        f"csv://{rel_source_path_to_command}",
-        "testschema.input",
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.output",
-    )
-
-    assert result.exit_code == 0
-
-    res = conn.sql(
-        "select symbol, date, is_enabled, name from testschema.output"
-    ).fetchall()
-
-    # read CSV file
-    actual_rows = []
-    with open(get_abs_path("./testdata/create_replace.csv"), "r") as f:
-        reader = csv.reader(f, delimiter=",", quotechar='"')
-        next(reader, None)
-        for row in reader:
-            actual_rows.append(row)
-
-    # compare the CSV file with the DuckDB table
-    assert len(res) == len(actual_rows)
-    for i, row in enumerate(actual_rows):
-        assert res[i] == tuple(row)
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def get_random_string(length):
-    letters = string.ascii_lowercase
-    result_str = "".join(random.choice(letters) for i in range(length))
-    return result_str
-
-
-def test_merge_with_primary_key_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-
-    def run(source: str):
-        res = invoke_ingest_command(
-            source,
-            "whatever",  # table name doesnt matter for CSV
-            uri,
-            "testschema_merge.output",
-            "merge",
-            "date",
-            "symbol",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select symbol, date, is_enabled, name from testschema_merge.output order by symbol asc"
-        ).fetchall()
-
-    def assert_output_equals_to_csv(path: str):
-        res = get_output_rows()
-        actual_rows = []
-        with open(get_abs_path(path), "r") as f:
-            reader = csv.reader(f, delimiter=",", quotechar='"')
-            next(reader, None)
-            for row in reader:
-                actual_rows.append(row)
-
-        assert len(res) == len(actual_rows)
-        for i, row in enumerate(actual_rows):
-            assert res[i] == tuple(row)
-
-    run("csv://ingestr/testdata/merge_part1.csv")
-    assert_output_equals_to_csv("./testdata/merge_part1.csv")
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_merge.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we don't expect any changes since the data hasn't changed
-    run("csv://ingestr/testdata/merge_part1.csv")
-    assert_output_equals_to_csv("./testdata/merge_part1.csv")
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 3
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll run the same ingestion but with a different file this time
-
-    run("csv://ingestr/testdata/merge_part2.csv")
-    assert_output_equals_to_csv("./testdata/merge_expected.csv")
-
-    # let's check the runs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    # we expect that there's a new load ID now
-    assert len(count_by_run_id) == 2
-
-    # there should be only one row with the first load ID
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-
-    # there should be a new run with the rest, 2 rows updated + 1 new row
-    assert count_by_run_id[1][1] == 3
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_without_primary_key_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-
-    def run(source: str):
-        res = invoke_ingest_command(
-            source,
-            "whatever",  # table name doesnt matter for CSV
-            uri,
-            "testschema.output",
-            "delete+insert",
-            "date",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select symbol, date, is_enabled, name from testschema.output order by symbol asc"
-        ).fetchall()
-
-    def assert_output_equals_to_csv(path: str):
-        res = get_output_rows()
-        actual_rows = []
-        with open(get_abs_path(path), "r") as f:
-            reader = csv.reader(f, delimiter=",", quotechar='"')
-            next(reader, None)
-            for row in reader:
-                actual_rows.append(row)
-
-        assert len(res) == len(actual_rows)
-        for i, row in enumerate(actual_rows):
-            assert res[i] == tuple(row)
-
-    run("csv://ingestr/testdata/delete_insert_part1.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we expect the data to be the same, but a new load_id to exist
-    # this is due to the fact that the old data won't be touched, but the ones with the
-    # latest value will be rewritten
-    run("csv://ingestr/testdata/delete_insert_part1.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-    assert count_by_run_id[1][1] == 3
-    ##############################
-
-    ##############################
-    # now we'll run the same ingestion but with a different file this time
-
-    run("csv://ingestr/testdata/delete_insert_part2.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_expected.csv")
-
-    # let's check the runs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    # we expect that there's a new load ID now
-    assert len(count_by_run_id) == 2
-
-    # there should be only one row with the first load ID, oldest date
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-
-    # there should be a new run with the rest, 3 rows updated + 1 new row
-    assert count_by_run_id[1][1] == 4
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
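The deleted suite doubled as a usage reference for the `ingest` command's flags (--source-uri, --source-table, --dest-uri, --dest-table, and the incremental options). A minimal sketch of the same DuckDB-to-DuckDB round trip, built only from calls that appear in the removed helper; the database path and sample row here are hypothetical:

import duckdb
from typer.testing import CliRunner

from ingestr.main import app

runner = CliRunner()

# hypothetical local DuckDB file that acts as both source and destination
conn = duckdb.connect("smoke_test.db")
conn.execute("CREATE SCHEMA IF NOT EXISTS testschema")
conn.execute(
    "CREATE TABLE testschema.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
)
conn.execute("INSERT INTO testschema.input VALUES (1, 'val1', '2022-01-01')")

# invoke the CLI in-process, as the removed invoke_ingest_command helper did
result = runner.invoke(
    app,
    [
        "ingest",
        "--source-uri", "duckdb:///smoke_test.db",
        "--source-table", "testschema.input",
        "--dest-uri", "duckdb:///smoke_test.db",
        "--dest-table", "testschema.output",
    ],
    input="y\n",
    env={"DISABLE_TELEMETRY": "true"},
)
assert result.exit_code == 0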