ingestr 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

ingestr/main.py CHANGED
@@ -90,6 +90,7 @@ class IncrementalStrategy(str, Enum):
     append = "append"
     delete_insert = "delete+insert"
     merge = "merge"
+    none = "none"
 
 
 class LoaderFileFormat(str, Enum):
@@ -136,7 +137,7 @@ def ingest(
         ),
     ] = None, # type: ignore
     incremental_key: Annotated[
-        str,
+        Optional[str],
         typer.Option(
             help="The incremental key from the table to be used for incremental strategies",
             envvar="INCREMENTAL_KEY",
@@ -257,6 +258,16 @@ def ingest(
         full_refresh=full_refresh,
     )
 
+    if source.handles_incrementality():
+        incremental_strategy = IncrementalStrategy.none
+        incremental_key = None
+
+    incremental_strategy_text = (
+        incremental_strategy.value
+        if incremental_strategy.value != IncrementalStrategy.none
+        else "Platform-specific"
+    )
+
     print()
     print("[bold green]Initiated the pipeline with the following:[/bold green]")
     print(
@@ -266,7 +277,7 @@ def ingest(
         f"[bold yellow]  Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
     )
     print(
-        f"[bold yellow]  Incremental Strategy:[/bold yellow] {incremental_strategy.value}"
+        f"[bold yellow]  Incremental Strategy:[/bold yellow] {incremental_strategy_text}"
     )
     print(
         f"[bold yellow]  Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
@@ -317,7 +328,9 @@ def ingest(
             uri=dest_uri,
             table=dest_table,
         ),
-        write_disposition=incremental_strategy.value, # type: ignore
+        write_disposition=incremental_strategy.value
+        if incremental_strategy.value != IncrementalStrategy.none
+        else None, # type: ignore
         primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
         loader_file_format=loader_file_format.value
         if loader_file_format is not None
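
The net effect of the main.py changes: a source can now declare that it handles incrementality itself, in which case ingestr drops its own write disposition and incremental key and reports the strategy as "Platform-specific". A minimal sketch of that flow, assuming a hypothetical GorgiasLikeSource (only IncrementalStrategy and handles_incrementality() come from the diff above; the real GorgiasSource's behavior is not shown in this diff):

from enum import Enum

class IncrementalStrategy(str, Enum):
    append = "append"
    delete_insert = "delete+insert"
    merge = "merge"
    none = "none"  # new in 0.6.2

class GorgiasLikeSource:
    """Hypothetical source whose platform API already returns only new records."""

    def handles_incrementality(self) -> bool:
        return True

source = GorgiasLikeSource()
incremental_strategy = IncrementalStrategy.merge  # whatever the user passed
incremental_key = "updated_at"

if source.handles_incrementality():
    # the platform tracks its own cursor, so ingestr steps aside
    incremental_strategy = IncrementalStrategy.none
    incremental_key = None

# mirrors the new CLI summary line: "Platform-specific" instead of "none"
incremental_strategy_text = (
    incremental_strategy.value
    if incremental_strategy.value != IncrementalStrategy.none
    else "Platform-specific"
)
print(incremental_strategy_text)  # Platform-specific
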
ingestr/main_test.py CHANGED
@@ -1,5 +1,8 @@
+import csv
 import os
+import random
 import shutil
+import string
 
 import duckdb
 from typer.testing import CliRunner
@@ -80,9 +83,17 @@ def invoke_ingest_command(
     return result
 
 
-def test_create_replace():
-    abs_db_path = get_abs_path("./testdata/test_create_replace.db")
-    rel_db_path_to_command = "ingestr/testdata/test_create_replace.db"
+### These are DuckDB-to-DuckDB tests
+def test_create_replace_duckdb_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_create_replace_{get_random_string(5)}.db"
+
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
 
     conn = duckdb.connect(abs_db_path)
     conn.execute("DROP SCHEMA IF EXISTS testschema CASCADE")
@@ -103,6 +114,8 @@ def test_create_replace():
         "testschema.output",
     )
 
+    print(result.stdout)
+
     assert result.exit_code == 0
 
     res = conn.sql(
@@ -112,8 +125,13 @@ def test_create_replace():
     assert res[0] == (1, "val1", "2022-01-01")
     assert res[1] == (2, "val2", "2022-02-01")
 
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
 
-def test_append():
+def test_append_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -172,8 +190,13 @@ def test_append():
     assert res[0] == (1, "val1", "2022-01-01")
     assert res[1] == (2, "val2", "2022-01-02")
 
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
 
-def test_merge_with_primary_key():
+
+def test_merge_with_primary_key_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -325,8 +348,13 @@ def test_merge_with_primary_key():
     assert count_by_run_id[2][1] == 1
     ##############################
 
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
 
-def test_delete_insert_without_primary_key():
+def test_delete_insert_without_primary_key_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -435,8 +463,13 @@ def test_delete_insert_without_primary_key():
     assert count_by_run_id[1][1] == 1
     ##############################
 
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
 
-def test_delete_insert_with_timerange():
+
+def test_delete_insert_with_timerange_duckdb_to_duckdb():
     try:
         shutil.rmtree(get_abs_path("../pipeline_data"))
     except Exception:
@@ -593,3 +626,250 @@ def test_delete_insert_with_timerange():
     assert count_by_run_id[1][1] == 2
     assert count_by_run_id[2][1] == 2
     ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+### These are CSV-to-DuckDB tests
+def test_create_replace_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    abs_db_path = get_abs_path("./testdata/test_create_replace_csv.db")
+    rel_db_path_to_command = "ingestr/testdata/test_create_replace_csv.db"
+    rel_source_path_to_command = "ingestr/testdata/create_replace.csv"
+
+    conn = duckdb.connect(abs_db_path)
+
+    result = invoke_ingest_command(
+        f"csv://{rel_source_path_to_command}",
+        "testschema.input",
+        f"duckdb:///{rel_db_path_to_command}",
+        "testschema.output",
+    )
+
+    assert result.exit_code == 0
+
+    res = conn.sql(
+        "select symbol, date, is_enabled, name from testschema.output"
+    ).fetchall()
+
+    # read the CSV file
+    actual_rows = []
+    with open(get_abs_path("./testdata/create_replace.csv"), "r") as f:
+        reader = csv.reader(f, delimiter=",", quotechar='"')
+        next(reader, None)
+        for row in reader:
+            actual_rows.append(row)
+
+    # compare the CSV file with the DuckDB table
+    assert len(res) == len(actual_rows)
+    for i, row in enumerate(actual_rows):
+        assert res[i] == tuple(row)
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+def get_random_string(length):
+    letters = string.ascii_lowercase
+    result_str = "".join(random.choice(letters) for i in range(length))
+    return result_str
+
+
+def test_merge_with_primary_key_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
+    uri = f"duckdb:///{rel_db_path_to_command}"
+
+    conn = duckdb.connect(abs_db_path)
+
+    def run(source: str):
+        res = invoke_ingest_command(
+            source,
+            "whatever",  # the table name doesn't matter for CSV
+            uri,
+            "testschema_merge.output",
+            "merge",
+            "date",
+            "symbol",
+        )
+        assert res.exit_code == 0
+        return res
+
+    def get_output_rows():
+        conn.execute("CHECKPOINT")
+        return conn.sql(
+            "select symbol, date, is_enabled, name from testschema_merge.output order by symbol asc"
+        ).fetchall()
+
+    def assert_output_equals_to_csv(path: str):
+        res = get_output_rows()
+        actual_rows = []
+        with open(get_abs_path(path), "r") as f:
+            reader = csv.reader(f, delimiter=",", quotechar='"')
+            next(reader, None)
+            for row in reader:
+                actual_rows.append(row)
+
+        assert len(res) == len(actual_rows)
+        for i, row in enumerate(actual_rows):
+            assert res[i] == tuple(row)
+
+    run("csv://ingestr/testdata/merge_part1.csv")
+    assert_output_equals_to_csv("./testdata/merge_part1.csv")
+
+    first_run_id = conn.sql(
+        "select _dlt_load_id from testschema_merge.output limit 1"
+    ).fetchall()[0][0]
+
+    ##############################
+    # we'll run again; we don't expect any changes since the data hasn't changed
+    run("csv://ingestr/testdata/merge_part1.csv")
+    assert_output_equals_to_csv("./testdata/merge_part1.csv")
+
+    # we also ensure that the other rows were not touched
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
+    ).fetchall()
+    assert len(count_by_run_id) == 1
+    assert count_by_run_id[0][1] == 3
+    assert count_by_run_id[0][0] == first_run_id
+    ##############################
+
+    ##############################
+    # now we'll run the same ingestion but with a different file this time
+
+    run("csv://ingestr/testdata/merge_part2.csv")
+    assert_output_equals_to_csv("./testdata/merge_expected.csv")
+
+    # let's check the runs
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    # we expect that there's a new load ID now
+    assert len(count_by_run_id) == 2
+
+    # there should be only one row with the first load ID
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+
+    # there should be a new run with the rest, 2 rows updated + 1 new row
+    assert count_by_run_id[1][1] == 3
+    ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
+
+
+def test_delete_insert_without_primary_key_csv_to_duckdb():
+    try:
+        shutil.rmtree(get_abs_path("../pipeline_data"))
+    except Exception:
+        pass
+
+    dbname = f"test_merge_with_primary_key_csv{get_random_string(5)}.db"
+    abs_db_path = get_abs_path(f"./testdata/{dbname}")
+    rel_db_path_to_command = f"ingestr/testdata/{dbname}"
+    uri = f"duckdb:///{rel_db_path_to_command}"
+
+    conn = duckdb.connect(abs_db_path)
+
+    def run(source: str):
+        res = invoke_ingest_command(
+            source,
+            "whatever",  # the table name doesn't matter for CSV
+            uri,
+            "testschema.output",
+            "delete+insert",
+            "date",
+        )
+        assert res.exit_code == 0
+        return res
+
+    def get_output_rows():
+        conn.execute("CHECKPOINT")
+        return conn.sql(
+            "select symbol, date, is_enabled, name from testschema.output order by symbol asc"
+        ).fetchall()
+
+    def assert_output_equals_to_csv(path: str):
+        res = get_output_rows()
+        actual_rows = []
+        with open(get_abs_path(path), "r") as f:
+            reader = csv.reader(f, delimiter=",", quotechar='"')
+            next(reader, None)
+            for row in reader:
+                actual_rows.append(row)
+
+        assert len(res) == len(actual_rows)
+        for i, row in enumerate(actual_rows):
+            assert res[i] == tuple(row)
+
+    run("csv://ingestr/testdata/delete_insert_part1.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
+
+    first_run_id = conn.sql(
+        "select _dlt_load_id from testschema.output limit 1"
+    ).fetchall()[0][0]
+
+    ##############################
+    # we'll run again; we expect the data to be the same, but a new load_id to exist,
+    # because the old data won't be touched while the rows with the
+    # latest value will be rewritten
+    run("csv://ingestr/testdata/delete_insert_part1.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_part1.csv")
+
+    # we also ensure that the other rows were not touched
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    assert len(count_by_run_id) == 2
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+    assert count_by_run_id[1][1] == 3
+    ##############################
+
+    ##############################
+    # now we'll run the same ingestion but with a different file this time
+
+    run("csv://ingestr/testdata/delete_insert_part2.csv")
+    assert_output_equals_to_csv("./testdata/delete_insert_expected.csv")
+
+    # let's check the runs
+    count_by_run_id = conn.sql(
+        "select _dlt_load_id, count(*) from testschema.output group by 1 order by 1 asc"
+    ).fetchall()
+
+    # we expect that there's a new load ID now
+    assert len(count_by_run_id) == 2
+
+    # there should be only one row with the first load ID, the oldest date
+    assert count_by_run_id[0][1] == 1
+    assert count_by_run_id[0][0] == first_run_id
+
+    # there should be a new run with the rest, 3 rows updated + 1 new row
+    assert count_by_run_id[1][1] == 4
+    ##############################
+
+    try:
+        os.remove(abs_db_path)
+    except Exception:
+        pass
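
The new CSV tests repeat one comparison idiom: load a file through ingestr, then assert the destination table matches the CSV row-for-row. A standalone sketch of that idiom, assuming the destination columns round-trip as strings the way the fixtures above do (the function name and paths are illustrative, not part of the release):

import csv
import duckdb

def rows_match_csv(db_path: str, query: str, csv_path: str) -> bool:
    """Compare a DuckDB query result against a CSV file, header skipped."""
    conn = duckdb.connect(db_path)
    conn.execute("CHECKPOINT")  # flush pending writes before reading
    res = conn.sql(query).fetchall()

    with open(csv_path, "r") as f:
        reader = csv.reader(f, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        expected = [tuple(row) for row in reader]

    # tuple-for-tuple equality, as in the tests; assumes string-typed columns
    return res == expected

# illustrative usage mirroring the test layout:
# rows_match_csv(
#     "testdata/example.db",
#     "select symbol, date, is_enabled, name from testschema.output order by symbol asc",
#     "testdata/merge_part1.csv",
# )
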
ingestr/src/factory.py CHANGED
@@ -16,6 +16,7 @@ from ingestr.src.destinations import (
 )
 from ingestr.src.sources import (
     GoogleSheetsSource,
+    GorgiasSource,
     LocalCsvSource,
     MongoDbSource,
     NotionSource,
@@ -45,6 +46,9 @@ class SourceProtocol(Protocol):
     def dlt_source(self, uri: str, table: str, **kwargs):
         pass
 
+    def handles_incrementality(self) -> bool:
+        pass
+
 
 class DestinationProtocol(Protocol):
     def dlt_dest(self, uri: str, **kwargs) -> Destination:
@@ -94,6 +98,8 @@ class SourceDestinationFactory:
             return GoogleSheetsSource()
         elif self.source_scheme == "shopify":
             return ShopifySource()
+        elif self.source_scheme == "gorgias":
+            return GorgiasSource()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
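
For source authors, the widened SourceProtocol means every source is now expected to answer handles_incrementality(). A minimal conforming sketch; the concrete class and its behavior are hypothetical, only the protocol methods come from the diff above:

from typing import Protocol

class SourceProtocol(Protocol):
    def dlt_source(self, uri: str, table: str, **kwargs):
        ...

    def handles_incrementality(self) -> bool:
        ...

class CsvLikeSource:
    """Hypothetical source: plain files carry no platform-side cursor."""

    def dlt_source(self, uri: str, table: str, **kwargs):
        raise NotImplementedError  # would build and return a dlt source here

    def handles_incrementality(self) -> bool:
        # False: ingestr should apply append/merge/delete+insert itself
        return False

Returning True instead, as a platform source like GorgiasSource presumably would, triggers the main.py branch above that forces the strategy to "none" and clears the incremental key.
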