ingestr 0.6.6__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of ingestr has been flagged as possibly problematic.

ingestr/main_test.py DELETED
@@ -1,875 +0,0 @@
-import csv
-import os
-import random
-import shutil
-import string
-
-import duckdb
-from typer.testing import CliRunner
-
-from ingestr.main import app
-
-runner = CliRunner()
-
-
-def get_abs_path(relative_path):
-    return os.path.abspath(os.path.join(os.path.dirname(__file__), relative_path))
-
-
-def invoke_ingest_command(
-    source_uri,
-    source_table,
-    dest_uri,
-    dest_table,
-    inc_strategy=None,
-    inc_key=None,
-    primary_key=None,
-    merge_key=None,
-    interval_start=None,
-    interval_end=None,
-    sql_backend=None,
-    loader_file_format=None,
-):
-    args = [
-        "ingest",
-        "--source-uri",
-        source_uri,
-        "--source-table",
-        source_table,
-        "--dest-uri",
-        dest_uri,
-        "--dest-table",
-        dest_table,
-    ]
-
-    if inc_strategy:
-        args.append("--incremental-strategy")
-        args.append(inc_strategy)
-
-    if inc_key:
-        args.append("--incremental-key")
-        args.append(inc_key)
-
-    if primary_key:
-        args.append("--primary-key")
-        args.append(primary_key)
-
-    if merge_key:
-        args.append("--merge-key")
-        args.append(merge_key)
-
-    if interval_start:
-        args.append("--interval-start")
-        args.append(interval_start)
-
-    if interval_end:
-        args.append("--interval-end")
-        args.append(interval_end)
-
-    if sql_backend:
-        args.append("--sql-backend")
-        args.append(sql_backend)
-
-    if loader_file_format:
-        args.append("--loader-file-format")
-        args.append(loader_file_format)
-
-    result = runner.invoke(
-        app,
-        args,
-        input="y\n",
-        env={"DISABLE_TELEMETRY": "true"},
-    )
-    return result
-
-
-### These are DuckDB-to-DuckDB tests
-def test_create_replace_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_create_replace_{get_random_string(5)}.db"
-
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema CASCADE")
-    conn.execute("CREATE SCHEMA testschema")
-    conn.execute(
-        "CREATE TABLE testschema.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
-    )
-    conn.execute("INSERT INTO testschema.input VALUES (1, 'val1', '2022-01-01')")
-    conn.execute("INSERT INTO testschema.input VALUES (2, 'val2', '2022-02-01')")
-
-    res = conn.sql("select count(*) from testschema.input").fetchall()
-    assert res[0][0] == 2
-
-    result = invoke_ingest_command(
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.input",
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.output",
-    )
-
-    print(result.stdout)
-
-    assert result.exit_code == 0
-
-    res = conn.sql(
-        "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema.output"
-    ).fetchall()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-02-01")
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_append_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_append.db")
-    rel_db_path_to_command = "ingestr/testdata/test_append.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_append CASCADE")
-    conn.execute("CHECKPOINT")
-
-    conn.execute("CREATE SCHEMA testschema_append")
-    conn.execute(
-        "CREATE TABLE testschema_append.input (id INTEGER, val VARCHAR, updated_at DATE)"
-    )
-    conn.execute(
-        "INSERT INTO testschema_append.input VALUES (1, 'val1', '2022-01-01'), (2, 'val2', '2022-01-02')"
-    )
-    conn.execute("CHECKPOINT")
-
-    res = conn.sql("select count(*) from testschema_append.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_append.input",
-            uri,
-            "testschema_append.output",
-            "append",
-            "updated_at",
-            sql_backend="sqlalchemy",
-        )
-        assert res.exit_code == 0
-
-    def get_output_table():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_append.output order by id asc"
-        ).fetchall()
-
-    run()
-
-    res = get_output_table()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-01-02")
-
-    # run again, nothing should be inserted into the output table
-    run()
-
-    res = get_output_table()
-    assert len(res) == 2
-    assert res[0] == (1, "val1", "2022-01-01")
-    assert res[1] == (2, "val2", "2022-01-02")
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_merge_with_primary_key_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_merge_with_primary_key.db")
-    rel_db_path_to_command = "ingestr/testdata/test_merge_with_primary_key.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_merge CASCADE")
-    conn.execute("CREATE SCHEMA testschema_merge")
-    conn.execute(
-        "CREATE TABLE testschema_merge.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute("INSERT INTO testschema_merge.input VALUES (1, 'val1', '2022-01-01')")
-    conn.execute("INSERT INTO testschema_merge.input VALUES (2, 'val2', '2022-02-01')")
-
-    res = conn.sql("select count(*) from testschema_merge.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_merge.input",
-            uri,
-            "testschema_merge.output",
-            "merge",
-            "updated_at",
-            "id",
-            sql_backend="sqlalchemy",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_merge.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_merge.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we don't expect any changes since the data hasn't changed
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll modify the source data but not the updated at, the output table should not be updated
-    conn.execute("UPDATE testschema_merge.input SET val = 'val1_modified' WHERE id = 2")
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll insert a new row but with an old date, the new row will not show up
-    conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-01-01')")
-
-    run()
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll insert a new row but with a new date, the new row will show up
-    conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-02-02')")
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-02-01"),
-            (3, "val3", "2022-02-02"),
-        ]
-    )
-
-    # we have a new run that inserted rows to this table, so the run count should be 2
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[0][0] == first_run_id
-    # we don't care about the run ID
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    ##############################
-    # lastly, let's try modifying the updated_at of an old column, it should be updated in the output table
-    conn.execute(
-        "UPDATE testschema_merge.input SET val='val2_modified', updated_at = '2022-02-03' WHERE id = 2"
-    )
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2_modified", "2022-02-03"),
-            (3, "val3", "2022-02-02"),
-        ]
-    )
-
-    # we have a new run that inserted rows to this table, so the run count should be 2
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc, 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-    # we don't care about the rest of the run IDs
-    assert count_by_run_id[1][1] == 1
-    assert count_by_run_id[2][1] == 1
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_without_primary_key_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_delete_insert_without_primary_key.db")
-    rel_db_path_to_command = (
-        "ingestr/testdata/test_delete_insert_without_primary_key.db"
-    )
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert CASCADE")
-    conn.execute("CREATE SCHEMA testschema_delete_insert")
-    conn.execute(
-        "CREATE TABLE testschema_delete_insert.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01 00:00:00+00:00')"
-    )
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01 00:00:00+00:00')"
-    )
-
-    res = conn.sql("select count(*) from testschema_delete_insert.input").fetchall()
-    assert res[0][0] == 2
-
-    def run():
-        res = invoke_ingest_command(
-            uri,
-            "testschema_delete_insert.input",
-            uri,
-            "testschema_delete_insert.output",
-            inc_strategy="delete+insert",
-            inc_key="updated_at",
-            sql_backend="sqlalchemy",
-            loader_file_format="jsonl",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(CAST(updated_at AT TIME ZONE 'UTC' AS TIMESTAMP), '%Y-%m-%d %H:%M:%S') from testschema_delete_insert.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run()
-    assert_output_equals(
-        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
-    )
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_delete_insert.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
-    res = run()
-    assert_output_equals(
-        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
-    )
-
-    # we ensure that one of the rows is updated with a new run
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][0] == first_run_id
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[1][0] != first_run_id
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    ##############################
-    # now we'll insert a few more lines for the same day, the new rows should show up
-    conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01 00:00:00+00:00'), (4, 'val4', '2022-02-01 00:00:00+00:00')"
-    )
-    conn.execute("CHECKPOINT")
-
-    run()
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01 00:00:00"),
-            (2, "val2", "2022-02-01 00:00:00"),
-            (3, "val3", "2022-02-01 00:00:00"),
-            (4, "val4", "2022-02-01 00:00:00"),
-        ]
-    )
-
-    # the new rows should have a new run ID, there should be 2 distinct runs now
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 2 desc, 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][0] != first_run_id
-    assert count_by_run_id[0][1] == 3  # 2 new rows + 1 old row
-    assert count_by_run_id[1][0] == first_run_id
-    assert count_by_run_id[1][1] == 1
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_with_timerange_duckdb_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_delete_insert_with_timerange.db")
-    rel_db_path_to_command = "ingestr/testdata/test_delete_insert_with_timerange.db"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-    conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert_timerange CASCADE")
-    conn.execute("CREATE SCHEMA testschema_delete_insert_timerange")
-    conn.execute(
-        "CREATE TABLE testschema_delete_insert_timerange.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
-    )
-    conn.execute(
-        """INSERT INTO testschema_delete_insert_timerange.input VALUES
-        (1, 'val1', '2022-01-01T00:00:00Z'),
-        (2, 'val2', '2022-01-01T00:00:00Z'),
-        (3, 'val3', '2022-01-02T00:00:00Z'),
-        (4, 'val4', '2022-01-02T00:00:00Z'),
-        (5, 'val5', '2022-01-03T00:00:00Z'),
-        (6, 'val6', '2022-01-03T00:00:00Z')
-        """
-    )
-
-    res = conn.sql(
-        "select count(*) from testschema_delete_insert_timerange.input"
-    ).fetchall()
-    assert res[0][0] == 6
-
-    def run(start_date: str, end_date: str):
-        res = invoke_ingest_command(
-            uri,
-            "testschema_delete_insert_timerange.input",
-            uri,
-            "testschema_delete_insert_timerange.output",
-            inc_strategy="delete+insert",
-            inc_key="updated_at",
-            interval_start=start_date,
-            interval_end=end_date,
-            sql_backend="sqlalchemy",
-            loader_file_format="jsonl",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_delete_insert_timerange.output order by id asc"
-        ).fetchall()
-
-    def assert_output_equals(expected):
-        res = get_output_rows()
-        assert len(res) == len(expected)
-        for i, row in enumerate(expected):
-            assert res[i] == row
-
-    run(
-        "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
-    )  # dlt runs them with the end date exclusive
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_delete_insert_timerange.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
-    run(
-        "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
-    )  # dlt runs them with the end date exclusive
-    assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
-
-    # both rows should have a new run ID
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][0] != first_run_id
-    assert count_by_run_id[0][1] == 2
-    ##############################
-
-    ##############################
-    # now run for the day after, new rows should land
-    run("2022-01-02T00:00:00Z", "2022-01-03T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-        ]
-    )
-
-    # there should be 4 rows with 2 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    ##############################
-
-    ##############################
-    # let's bring in the rows for the third day
-    run("2022-01-03T00:00:00Z", "2022-01-04T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-            (5, "val5", "2022-01-03"),
-            (6, "val6", "2022-01-03"),
-        ]
-    )
-
-    # there should be 6 rows with 3 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    assert count_by_run_id[2][1] == 2
-    ##############################
-
-    ##############################
-    # now let's do a backfill for the first day again, the rows should be updated
-    conn.execute(
-        "UPDATE testschema_delete_insert_timerange.input SET val = 'val1_modified' WHERE id = 1"
-    )
-
-    run("2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z")
-    assert_output_equals(
-        [
-            (1, "val1_modified", "2022-01-01"),
-            (2, "val2", "2022-01-01"),
-            (3, "val3", "2022-01-02"),
-            (4, "val4", "2022-01-02"),
-            (5, "val5", "2022-01-03"),
-            (6, "val6", "2022-01-03"),
-        ]
-    )
-
-    # there should still be 6 rows with 3 distinct run IDs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
-    ).fetchall()
-    assert len(count_by_run_id) == 3
-    assert count_by_run_id[0][1] == 2
-    assert count_by_run_id[1][1] == 2
-    assert count_by_run_id[2][1] == 2
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-### These are CSV-to-DuckDB tests
-def test_create_replace_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    abs_db_path = get_abs_path("./testdata/test_create_replace_csv.db")
-    rel_db_path_to_command = "ingestr/testdata/test_create_replace_csv.db"
-    rel_source_path_to_command = "ingestr/testdata/create_replace.csv"
-
-    conn = duckdb.connect(abs_db_path)
-
-    result = invoke_ingest_command(
-        f"csv://{rel_source_path_to_command}",
-        "testschema.input",
-        f"duckdb:///{rel_db_path_to_command}",
-        "testschema.output",
-    )
-
-    assert result.exit_code == 0
-
-    res = conn.sql(
-        "select symbol, date, is_enabled, name from testschema.output"
-    ).fetchall()
-
-    # read CSV file
-    actual_rows = []
-    with open(get_abs_path("./testdata/create_replace.csv"), "r") as f:
-        reader = csv.reader(f, delimiter=",", quotechar='"')
-        next(reader, None)
-        for row in reader:
-            actual_rows.append(row)
-
-    # compare the CSV file with the DuckDB table
-    assert len(res) == len(actual_rows)
-    for i, row in enumerate(actual_rows):
-        assert res[i] == tuple(row)
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def get_random_string(length):
-    letters = string.ascii_lowercase
-    result_str = "".join(random.choice(letters) for i in range(length))
-    return result_str
-
-
-def test_merge_with_primary_key_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-
-    def run(source: str):
-        res = invoke_ingest_command(
-            source,
-            "whatever",  # table name doesn't matter for CSV
-            uri,
-            "testschema_merge.output",
-            "merge",
-            "date",
-            "symbol",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select symbol, date, is_enabled, name from testschema_merge.output order by symbol asc"
-        ).fetchall()
-
-    def assert_output_equals_to_csv(path: str):
-        res = get_output_rows()
-        actual_rows = []
-        with open(get_abs_path(path), "r") as f:
-            reader = csv.reader(f, delimiter=",", quotechar='"')
-            next(reader, None)
-            for row in reader:
-                actual_rows.append(row)
-
-        assert len(res) == len(actual_rows)
-        for i, row in enumerate(actual_rows):
-            assert res[i] == tuple(row)
-
-    run("csv://ingestr/testdata/merge_part1.csv")
-    assert_output_equals_to_csv("./testdata/merge_part1.csv")
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema_merge.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we don't expect any changes since the data hasn't changed
-    run("csv://ingestr/testdata/merge_part1.csv")
-    assert_output_equals_to_csv("./testdata/merge_part1.csv")
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
-    ).fetchall()
-    assert len(count_by_run_id) == 1
-    assert count_by_run_id[0][1] == 3
-    assert count_by_run_id[0][0] == first_run_id
-    ##############################
-
-    ##############################
-    # now we'll run the same ingestion but with a different file this time
-
-    run("csv://ingestr/testdata/merge_part2.csv")
-    assert_output_equals_to_csv("./testdata/merge_expected.csv")
-
-    # let's check the runs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    # we expect that there's a new load ID now
-    assert len(count_by_run_id) == 2
-
-    # there should be only one row with the first load ID
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-
-    # there should be a new run with the rest, 2 rows updated + 1 new row
-    assert count_by_run_id[1][1] == 3
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
-
-
-def test_delete_insert_without_primary_key_csv_to_duckdb():
-    try:
-        shutil.rmtree(get_abs_path("../pipeline_data"))
-    except Exception:
-        pass
-
-    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
-    abs_db_path = get_abs_path(f"./testdata/{dbname}")
-    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
-    uri = f"duckdb:///{rel_db_path_to_command}"
-
-    conn = duckdb.connect(abs_db_path)
-
-    def run(source: str):
-        res = invoke_ingest_command(
-            source,
-            "whatever",  # table name doesn't matter for CSV
-            uri,
-            "testschema.output",
-            "delete+insert",
-            "date",
-        )
-        assert res.exit_code == 0
-        return res
-
-    def get_output_rows():
-        conn.execute("CHECKPOINT")
-        return conn.sql(
-            "select symbol, date, is_enabled, name from testschema.output order by symbol asc"
-        ).fetchall()
-
-    def assert_output_equals_to_csv(path: str):
-        res = get_output_rows()
-        actual_rows = []
-        with open(get_abs_path(path), "r") as f:
-            reader = csv.reader(f, delimiter=",", quotechar='"')
-            next(reader, None)
-            for row in reader:
-                actual_rows.append(row)
-
-        assert len(res) == len(actual_rows)
-        for i, row in enumerate(actual_rows):
-            assert res[i] == tuple(row)
-
-    run("csv://ingestr/testdata/delete_insert_part1.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
-
-    first_run_id = conn.sql(
-        "select _dlt_load_id from testschema.output limit 1"
-    ).fetchall()[0][0]
-
-    ##############################
-    # we'll run again, we expect the data to be the same, but a new load_id to exist
-    # this is due to the fact that the old data won't be touched, but the ones with the
-    # latest value will be rewritten
-    run("csv://ingestr/testdata/delete_insert_part1.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
-
-    # we also ensure that the other rows were not touched
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    assert len(count_by_run_id) == 2
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-    assert count_by_run_id[1][1] == 3
-    ##############################
-
-    ##############################
-    # now we'll run the same ingestion but with a different file this time
-
-    run("csv://ingestr/testdata/delete_insert_part2.csv")
-    assert_output_equals_to_csv("./testdata/delete_insert_expected.csv")
-
-    # let's check the runs
-    count_by_run_id = conn.sql(
-        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
-    ).fetchall()
-
-    # we expect that there's a new load ID now
-    assert len(count_by_run_id) == 2
-
-    # there should be only one row with the first load ID, oldest date
-    assert count_by_run_id[0][1] == 1
-    assert count_by_run_id[0][0] == first_run_id
-
-    # there should be a new run with the rest, 3 rows updated + 1 new row
-    assert count_by_run_id[1][1] == 4
-    ##############################
-
-    try:
-        os.remove(abs_db_path)
-    except Exception:
-        pass
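
For context on what was removed: these tests drove the ingestr CLI in-process through typer's CliRunner rather than spawning a subprocess. Below is a minimal sketch of that same pattern, assuming the app object, flags, confirmation prompt, and DISABLE_TELEMETRY variable seen in the deleted helper above; the database path and table names are illustrative placeholders, not values from the package.

# Sketch only: mirrors the invocation pattern of the deleted tests.
from typer.testing import CliRunner

from ingestr.main import app

runner = CliRunner()

result = runner.invoke(
    app,
    [
        "ingest",
        "--source-uri", "duckdb:///ingestr/testdata/example.db",  # placeholder path
        "--source-table", "testschema.input",
        "--dest-uri", "duckdb:///ingestr/testdata/example.db",  # placeholder path
        "--dest-table", "testschema.output",
    ],
    input="y\n",  # answer the confirmation prompt, as the deleted tests did
    env={"DISABLE_TELEMETRY": "true"},
)
assert result.exit_code == 0, result.stdout

Running the command in-process like this keeps the tests fast and lets assertions inspect exit_code and stdout directly, which is why the deleted suite wrapped every scenario around a single invoke_ingest_command helper.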