ingestr 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr has been flagged as potentially problematic.
- ingestr/main.py +16 -3
- ingestr/main_test.py +287 -7
- ingestr/src/factory.py +6 -0
- ingestr/src/gorgias/__init__.py +587 -0
- ingestr/src/gorgias/helpers.py +149 -0
- ingestr/src/gorgias/helpers_test.py +45 -0
- ingestr/src/sources.py +95 -3
- ingestr/src/version.py +1 -1
- ingestr/testdata/create_replace.csv +21 -0
- ingestr/testdata/delete_insert_expected.csv +6 -0
- ingestr/testdata/delete_insert_part1.csv +5 -0
- ingestr/testdata/delete_insert_part2.csv +6 -0
- ingestr/testdata/merge_expected.csv +5 -0
- ingestr/testdata/merge_part1.csv +4 -0
- ingestr/testdata/merge_part2.csv +5 -0
- {ingestr-0.6.0.dist-info → ingestr-0.6.2.dist-info}/METADATA +3 -2
- {ingestr-0.6.0.dist-info → ingestr-0.6.2.dist-info}/RECORD +20 -15
- {ingestr-0.6.0.dist-info → ingestr-0.6.2.dist-info}/WHEEL +1 -1
- ingestr/testdata/test_append.db +0 -0
- ingestr/testdata/test_create_replace.db +0 -0
- ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- ingestr/testdata/test_merge_with_primary_key.db +0 -0
- {ingestr-0.6.0.dist-info → ingestr-0.6.2.dist-info}/entry_points.txt +0 -0
- {ingestr-0.6.0.dist-info → ingestr-0.6.2.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -90,6 +90,7 @@ class IncrementalStrategy(str, Enum):
     append = "append"
     delete_insert = "delete+insert"
     merge = "merge"
+    none = "none"


 class LoaderFileFormat(str, Enum):
@@ -136,7 +137,7 @@ def ingest(
         ),
     ] = None,  # type: ignore
     incremental_key: Annotated[
-        str,
+        Optional[str],
        typer.Option(
            help="The incremental key from the table to be used for incremental strategies",
            envvar="INCREMENTAL_KEY",
@@ -257,6 +258,16 @@ def ingest(
         full_refresh=full_refresh,
     )

+    if source.handles_incrementality():
+        incremental_strategy = IncrementalStrategy.none
+        incremental_key = None
+
+    incremental_strategy_text = (
+        incremental_strategy.value
+        if incremental_strategy.value != IncrementalStrategy.none
+        else "Platform-specific"
+    )
+
     print()
     print("[bold green]Initiated the pipeline with the following:[/bold green]")
     print(
@@ -266,7 +277,7 @@ def ingest(
         f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
     )
     print(
-        f"[bold yellow] Incremental Strategy:[/bold yellow] {
+        f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy_text}"
     )
     print(
         f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
@@ -317,7 +328,9 @@ def ingest(
             uri=dest_uri,
             table=dest_table,
         ),
-        write_disposition=incremental_strategy.value
+        write_disposition=incremental_strategy.value
+        if incremental_strategy.value != IncrementalStrategy.none
+        else None,  # type: ignore
         primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
         loader_file_format=loader_file_format.value
         if loader_file_format is not None
ingestr/main_test.py
CHANGED
@@ -1,5 +1,8 @@
+import csv
 import os
+import random
 import shutil
+import string

 import duckdb
 from typer.testing import CliRunner
@@ -80,9 +83,17 @@ def invoke_ingest_command(
     return result


-
-
-
+### These are DuckDB-to-DuckDB tests
+def test_create_replace_duckdb_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_create_replace_{get_random_string(5)}.db"
+
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"

     conn = duckdb.connect(abs_db_path)
     conn.execute("DROP SCHEMA IF EXISTS testschema CASCADE")
@@ -103,6 +114,8 @@ def test_create_replace():
         "testschema.output",
     )

+    print(result.stdout)
+
     assert result.exit_code == 0

     res = conn.sql(
@@ -112,8 +125,13 @@ def test_create_replace():
     assert res[0] == (1, "val1", "2022-01-01")
     assert res[1] == (2, "val2", "2022-02-01")

+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+

-def
+def test_append_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -172,8 +190,13 @@ def test_append():
     assert res[0] == (1, "val1", "2022-01-01")
     assert res[1] == (2, "val2", "2022-01-02")

+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass

-
+
+def test_merge_with_primary_key_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -325,8 +348,13 @@ def test_merge_with_primary_key():
     assert count_by_run_id[2][1] == 1
     ##############################

+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+

-def
+def test_delete_insert_without_primary_key_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -435,8 +463,13 @@ def test_delete_insert_without_primary_key():
     assert count_by_run_id[1][1] == 1
     ##############################

+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass

-
+
+def test_delete_insert_with_timerange_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -593,3 +626,250 @@ def test_delete_insert_with_timerange():
     assert count_by_run_id[1][1] == 2
     assert count_by_run_id[2][1] == 2
     ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+### These are CSV-to-DuckDB tests
+def test_create_replace_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    abs_db_path = get_abs_path("./testdata/test_create_replace_csv.db")
+    rel_db_path_to_command = "ingestr/testdata/test_create_replace_csv.db"
+    rel_source_path_to_command = "ingestr/testdata/create_replace.csv"
+
+    conn = duckdb.connect(abs_db_path)
+
+    result = invoke_ingest_command(
+        f"csv://{rel_source_path_to_command}",
+        "testschema.input",
+        f"duckdb:///{rel_db_path_to_command}",
+        "testschema.output",
+    )
+
+    assert result.exit_code == 0
+
+    res = conn.sql(
+        "select symbol, date, is_enabled, name from testschema.output"
+    ).fetchall()
+
+    # read CSV file
+    actual_rows = []
+    with open(get_abs_path("./testdata/create_replace.csv"), "r") as f:
+        reader = csv.reader(f, delimiter=",", quotechar='"')
+        next(reader, None)
+        for row in reader:
+            actual_rows.append(row)
+
+    # compare the CSV file with the DuckDB table
+    assert len(res) == len(actual_rows)
+    for i, row in enumerate(actual_rows):
+        assert res[i] == tuple(row)
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+def get_random_string(length):
+    letters = string.ascii_lowercase
+    result_str = "".join(random.choice(letters) for i in range(length))
+    return result_str
+
+
+def test_merge_with_primary_key_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
+    uri = f"duckdb:///{rel_db_path_to_command}"
+
+    conn = duckdb.connect(abs_db_path)
+
+    def run(source: str):
+        res = invoke_ingest_command(
+            source,
+            "whatever",  # table name doesnt matter for CSV
+            uri,
+            "testschema_merge.output",
+            "merge",
+            "date",
+            "symbol",
+        )
+        assert res.exit_code == 0
+        return res
+
+    def get_output_rows():
+        conn.execute("CHECKPOINT")
+        return conn.sql(
+            "select symbol, date, is_enabled, name from testschema_merge.output order by symbol asc"
+        ).fetchall()
+
+    def assert_output_equals_to_csv(path: str):
+        res = get_output_rows()
+        actual_rows = []
+        with open(get_abs_path(path), "r") as f:
+            reader = csv.reader(f, delimiter=",", quotechar='"')
+            next(reader, None)
+            for row in reader:
+                actual_rows.append(row)
+
+        assert len(res) == len(actual_rows)
+        for i, row in enumerate(actual_rows):
+            assert res[i] == tuple(row)
+
+    run("csv://ingestr/testdata/merge_part1.csv")
+    assert_output_equals_to_csv("./testdata/merge_part1.csv")
+
+    first_run_id = conn.sql(
+        "select _dlt_load_id from testschema_merge.output limit 1"
+    ).fetchall()[0][0]
+
+    ##############################
+    # we'll run again, we don't expect any changes since the data hasn't changed
+    run("csv://ingestr/testdata/merge_part1.csv")
+    assert_output_equals_to_csv("./testdata/merge_part1.csv")
+
+    # we also ensure that the other rows were not touched
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
+    ).fetchall()
+    assert len(count_by_run_id) == 1
+    assert count_by_run_id[0][1] == 3
+    assert count_by_run_id[0][0] == first_run_id
+    ##############################
+
+    ##############################
+    # now we'll run the same ingestion but with a different file this time
+
+    run("csv://ingestr/testdata/merge_part2.csv")
+    assert_output_equals_to_csv("./testdata/merge_expected.csv")
+
+    # let's check the runs
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    # we expect that there's a new load ID now
+    assert len(count_by_run_id) == 2
+
+    # there should be only one row with the first load ID
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+
+    # there should be a new run with the rest, 2 rows updated + 1 new row
+    assert count_by_run_id[1][1] == 3
+    ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+def test_delete_insert_without_primary_key_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
+    uri = f"duckdb:///{rel_db_path_to_command}"
+
+    conn = duckdb.connect(abs_db_path)
+
+    def run(source: str):
+        res = invoke_ingest_command(
+            source,
+            "whatever",  # table name doesnt matter for CSV
+            uri,
+            "testschema.output",
+            "delete+insert",
+            "date",
+        )
+        assert res.exit_code == 0
+        return res
+
+    def get_output_rows():
+        conn.execute("CHECKPOINT")
+        return conn.sql(
+            "select symbol, date, is_enabled, name from testschema.output order by symbol asc"
+        ).fetchall()
+
+    def assert_output_equals_to_csv(path: str):
+        res = get_output_rows()
+        actual_rows = []
+        with open(get_abs_path(path), "r") as f:
+            reader = csv.reader(f, delimiter=",", quotechar='"')
+            next(reader, None)
+            for row in reader:
+                actual_rows.append(row)
+
+        assert len(res) == len(actual_rows)
+        for i, row in enumerate(actual_rows):
+            assert res[i] == tuple(row)
+
+    run("csv://ingestr/testdata/delete_insert_part1.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
+
+    first_run_id = conn.sql(
+        "select _dlt_load_id from testschema.output limit 1"
+    ).fetchall()[0][0]
+
+    ##############################
+    # we'll run again, we expect the data to be the same, but a new load_id to exist
+    # this is due to the fact that the old data won't be touched, but the ones with the
+    # latest value will be rewritten
+    run("csv://ingestr/testdata/delete_insert_part1.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
+
+    # we also ensure that the other rows were not touched
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    assert len(count_by_run_id) == 2
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+    assert count_by_run_id[1][1] == 3
+    ##############################
+
+    ##############################
+    # now we'll run the same ingestion but with a different file this time
+
+    run("csv://ingestr/testdata/delete_insert_part2.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_expected.csv")
+
+    # let's check the runs
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    # we expect that there's a new load ID now
+    assert len(count_by_run_id) == 2
+
+    # there should be only one row with the first load ID, oldest date
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+
+    # there should be a new run with the rest, 3 rows updated + 1 new row
+    assert count_by_run_id[1][1] == 4
+    ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
ingestr/src/factory.py
CHANGED
@@ -16,6 +16,7 @@ from ingestr.src.destinations import (
 )
 from ingestr.src.sources import (
     GoogleSheetsSource,
+    GorgiasSource,
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
@@ -45,6 +46,9 @@ class SourceProtocol(Protocol):
     def dlt_source(self, uri: str, table: str, **kwargs):
         pass

+    def handles_incrementality(self) -> bool:
+        pass
+

 class DestinationProtocol(Protocol):
     def dlt_dest(self, uri: str, **kwargs) -> Destination:
@@ -94,6 +98,8 @@ class SourceDestinationFactory:
             return GoogleSheetsSource()
         elif self.source_scheme == "shopify":
             return ShopifySource()
+        elif self.source_scheme == "gorgias":
+            return GorgiasSource()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
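The handles_incrementality() method added to SourceProtocol is what the new logic in main.py calls before deciding on a write disposition. A hedged sketch of how concrete sources might answer it; the method bodies below are illustrative placeholders, not the actual code in ingestr/src/sources.py:

class LocalCsvSource:
    def handles_incrementality(self) -> bool:
        # A plain file has no cursor of its own, so ingestr applies whatever
        # strategy the user chose (append, merge, delete+insert).
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        ...


class GorgiasSource:
    def handles_incrementality(self) -> bool:
        # Platform APIs such as Gorgias track their own update cursors, so the CLI
        # reports the strategy as "Platform-specific" and passes no
        # write_disposition to dlt.
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        ...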