ingestr 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. The information is provided for informational purposes only.

Potentially problematic release: this version of ingestr might be problematic.

ingestr/main.py CHANGED
@@ -1,186 +1,274 @@
1
1
  import hashlib
2
+ from datetime import datetime
3
+ from typing import Optional
2
4
 
3
5
  import dlt
6
+ import humanize
4
7
  import typer
8
+ from rich.console import Console
9
+ from typing_extensions import Annotated
5
10
 
6
11
  from ingestr.src.factory import SourceDestinationFactory
7
- from rich import print
8
- from dlt.common.pipeline import LoadInfo
9
- import humanize
10
-
11
- app = typer.Typer(name="ingestr")
12
-
13
-
14
- @app.command()
15
- def ingest(
16
- source_uri: str = None, # type: ignore
17
- dest_uri: str = None, # type: ignore
18
- source_table: str = None, # type: ignore
19
- dest_table: str = None, # type: ignore
20
- incremental_key: str = None, # type: ignore
21
- incremental_strategy: str = "replace", # type: ignore
22
- ):
23
- if not source_uri:
24
- typer.echo("Please provide a source URI")
25
- raise typer.Abort()
26
-
27
- if not dest_uri:
28
- typer.echo("Please provide a destination URI")
29
- raise typer.Abort()
12
+ from ingestr.src.telemetry.event import track
30
13
 
31
- if not source_table:
32
- print("[bold red]Please provide a source table [\red bold]")
33
- raise typer.Abort()
14
+ app = typer.Typer(
15
+ name="ingestr",
16
+ help="ingestr is the CLI tool to ingest data from one source to another",
17
+ rich_markup_mode="rich",
18
+ )
34
19
 
35
- if not dest_table:
36
- typer.echo("Please provide a destination table")
37
- raise typer.Abort()
20
+ console = Console()
21
+ print = console.print
38
22
 
23
+ DATE_FORMATS = [
24
+ "%Y-%m-%d",
25
+ "%Y-%m-%dT%H:%M:%S",
26
+ "%Y-%m-%dT%H:%M:%S%z",
27
+ "%Y-%m-%d %H:%M:%S",
28
+ "%Y-%m-%dT%H:%M:%S.%f",
29
+ "%Y-%m-%dT%H:%M:%S.%f%z",
30
+ ]
39
31
 
40
- factory = SourceDestinationFactory(source_uri, dest_uri)
41
- source = factory.get_source()
42
- destination = factory.get_destination()
43
32
 
44
- m = hashlib.sha256()
45
- m.update(dest_table.encode("utf-8"))
46
-
47
- pipeline = dlt.pipeline(
48
- pipeline_name=m.hexdigest(),
49
- destination=destination.dlt_dest(
50
- uri=dest_uri,
33
+ @app.command()
34
+ def ingest(
35
+ source_uri: Annotated[
36
+ str, typer.Option(help="The URI of the [green]source[/green]")
37
+ ], # type: ignore
38
+ dest_uri: Annotated[
39
+ str, typer.Option(help="The URI of the [cyan]destination[/cyan]")
40
+ ], # type: ignore
41
+ source_table: Annotated[
42
+ str, typer.Option(help="The table name in the [green]source[/green] to fetch")
43
+ ], # type: ignore
44
+ dest_table: Annotated[
45
+ str,
46
+ typer.Option(
47
+ help="The table in the [cyan]destination[/cyan] to save the data into"
48
+ ),
49
+ ] = None, # type: ignore
50
+ incremental_key: Annotated[
51
+ str,
52
+ typer.Option(
53
+ help="The incremental key from the table to be used for incremental strategies"
54
+ ),
55
+ ] = None, # type: ignore
56
+ incremental_strategy: Annotated[
57
+ str,
58
+ typer.Option(
59
+ help="The incremental strategy to use, must be one of 'replace', 'append', 'delete+insert', or 'merge'"
51
60
  ),
52
- progress=dlt.progress.log(dump_system_stats=False),
53
- pipelines_dir="pipeline_data",
61
+ ] = "replace", # type: ignore
62
+ interval_start: Annotated[
63
+ Optional[datetime],
64
+ typer.Option(
65
+ help="The start of the interval the incremental key will cover",
66
+ formats=DATE_FORMATS,
67
+ ),
68
+ ] = None, # type: ignore
69
+ interval_end: Annotated[
70
+ Optional[datetime],
71
+ typer.Option(
72
+ help="The end of the interval the incremental key will cover",
73
+ formats=DATE_FORMATS,
74
+ ),
75
+ ] = None, # type: ignore
76
+ primary_key: Annotated[Optional[list[str]], typer.Option(help="The merge ")] = None, # type: ignore
77
+ ):
78
+ track(
79
+ "command_triggered",
80
+ {
81
+ "command": "ingest",
82
+ },
54
83
  )
55
84
 
56
- print()
57
- print(f"[bold green]Initiated pipeline, starting...[/bold green]")
58
- print()
85
+ try:
86
+ if not dest_table:
87
+ print()
88
+ print(
89
+ "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
90
+ )
91
+ dest_table = source_table
92
+
93
+ merge_key = None
94
+ if incremental_strategy == "delete+insert":
95
+ merge_key = incremental_key
96
+ incremental_strategy = "merge"
97
+
98
+ factory = SourceDestinationFactory(source_uri, dest_uri)
99
+ source = factory.get_source()
100
+ destination = factory.get_destination()
101
+
102
+ m = hashlib.sha256()
103
+ m.update(dest_table.encode("utf-8"))
104
+
105
+ pipeline = dlt.pipeline(
106
+ pipeline_name=m.hexdigest(),
107
+ destination=destination.dlt_dest(
108
+ uri=dest_uri,
109
+ ),
110
+ progress=dlt.progress.log(dump_system_stats=False),
111
+ pipelines_dir="pipeline_data",
112
+ dataset_name="testschema",
113
+ )
59
114
 
60
- incremental = []
61
- if incremental_key:
62
- incremental = [incremental_key]
115
+ print()
116
+ print("[bold green]Initiated the pipeline with the following:[/bold green]")
117
+ print(
118
+ f"[bold yellow] Source:[/bold yellow] {factory.source_scheme} / {source_table}"
119
+ )
120
+ print(
121
+ f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
122
+ )
123
+ print(
124
+ f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy}"
125
+ )
126
+ print(
127
+ f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
128
+ )
129
+ print()
130
+
131
+ continuePipeline = typer.confirm("Are you sure you would like to continue?")
132
+ if not continuePipeline:
133
+ track("command_finished", {"command": "ingest", "status": "aborted"})
134
+ raise typer.Abort()
135
+
136
+ print()
137
+ print("[bold green]Starting the ingestion...[/bold green]")
138
+ print()
139
+
140
+ run_info = pipeline.run(
141
+ source.dlt_source(
142
+ uri=source_uri,
143
+ table=source_table,
144
+ incremental_key=incremental_key,
145
+ merge_key=merge_key,
146
+ interval_start=interval_start,
147
+ interval_end=interval_end,
148
+ ),
149
+ **destination.dlt_run_params(
150
+ uri=dest_uri,
151
+ table=dest_table,
152
+ ),
153
+ write_disposition=incremental_strategy, # type: ignore
154
+ primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
155
+ )
63
156
 
64
- run_info = pipeline.run(
65
- source.dlt_source(
66
- uri=source_uri,
67
- table=source_table,
68
- incremental_key=incremental_key,
69
- incremental_strategy=incremental_strategy,
70
- ),
71
- **destination.dlt_run_params(
72
- uri=dest_uri,
73
- table=dest_table,
74
- ),
75
- write_disposition=incremental_strategy, # type: ignore
76
- primary_key=incremental,
77
- )
157
+ elapsedHuman = ""
158
+ if run_info.started_at:
159
+ elapsed = run_info.finished_at - run_info.started_at
160
+ elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
78
161
 
79
- print()
80
- print(f"[bold green]Successfully finished loading data from '{factory.source_scheme}' to '{factory.destination_scheme}'. [/bold green]")
81
- # typer.echo(printLoadInfo(run_info))
82
-
83
-
84
- def printLoadInfo(info: LoadInfo):
85
- msg = f"Pipeline {info.pipeline.pipeline_name} load step completed in "
86
- if info.started_at:
87
- elapsed = info.finished_at - info.started_at
88
- msg += humanize.precisedelta(elapsed)
89
- else:
90
- msg += "---"
91
- msg += (
92
- f"\n{len(info.loads_ids)} load package(s) were loaded to destination"
93
- f" {info.destination_name} and into dataset {info.dataset_name}\n"
94
- )
95
- if info.staging_name:
96
- msg += (
97
- f"The {info.staging_name} staging destination used"
98
- f" {info.staging_displayable_credentials} location to stage data\n"
162
+ print()
163
+ print(
164
+ f"[bold green]Successfully finished loading data from '{factory.source_scheme}' to '{factory.destination_scheme}' {elapsedHuman} [/bold green]"
165
+ )
166
+ print()
167
+ track(
168
+ "command_finished",
169
+ {
170
+ "command": "ingest",
171
+ "status": "success",
172
+ },
99
173
  )
100
174
 
101
- msg += (
102
- f"The {info.destination_name} destination used"
103
- f" {info.destination_displayable_credentials} location to store data"
104
- )
105
- msg += info._load_packages_asstr(info.load_packages, 0)
106
- return msg
175
+ except Exception as e:
176
+ track(
177
+ "command_finished",
178
+ {"command": "ingest", "status": "failed", "error": str(e)},
179
+ )
180
+ raise
107
181
 
108
182
 
109
183
  @app.command()
110
184
  def example_uris():
185
+ track(
186
+ "command_triggered",
187
+ {
188
+ "command": "example-uris",
189
+ },
190
+ )
191
+
111
192
  print()
112
193
  typer.echo(
113
- f"Following are some example URI formats for supported sources and destinations:"
194
+ "Following are some example URI formats for supported sources and destinations:"
114
195
  )
115
196
 
116
197
  print()
117
198
  print(
118
- f"[bold green]Postgres:[/bold green] [white]postgres://user:password@host:port/dbname?sslmode=require [/white]"
199
+ "[bold green]Postgres:[/bold green] [white]postgres://user:password@host:port/dbname?sslmode=require [/white]"
119
200
  )
120
201
  print(
121
- f"[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql[/white dim]"
202
+ "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql[/white dim]"
122
203
  )
123
204
 
124
205
  print()
125
206
  print(
126
- f"[bold green]BigQuery:[/bold green] [white]bigquery://project-id?credentials_path=/path/to/credentials.json&location=US [/white]"
207
+ "[bold green]BigQuery:[/bold green] [white]bigquery://project-id?credentials_path=/path/to/credentials.json&location=US [/white]"
127
208
  )
128
209
  print(
129
- f"[white dim]└── https://github.com/googleapis/python-bigquery-sqlalchemy?tab=readme-ov-file#connection-string-parameters[/white dim]"
210
+ "[white dim]└── https://github.com/googleapis/python-bigquery-sqlalchemy?tab=readme-ov-file#connection-string-parameters[/white dim]"
130
211
  )
131
212
 
132
213
  print()
133
214
  print(
134
- f"[bold green]Snowflake:[/bold green] [white]snowflake://user:password@account/dbname?warehouse=COMPUTE_WH [/white]"
215
+ "[bold green]Snowflake:[/bold green] [white]snowflake://user:password@account/dbname?warehouse=COMPUTE_WH [/white]"
135
216
  )
136
217
  print(
137
- f"[white dim]└── https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#connection-parameters"
218
+ "[white dim]└── https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#connection-parameters"
138
219
  )
139
220
 
140
221
  print()
141
222
  print(
142
- f"[bold green]Redshift:[/bold green] [white]redshift://user:password@host:port/dbname?sslmode=require [/white]"
223
+ "[bold green]Redshift:[/bold green] [white]redshift://user:password@host:port/dbname?sslmode=require [/white]"
143
224
  )
144
225
  print(
145
- f"[white dim]└── https://aws.amazon.com/blogs/big-data/use-the-amazon-redshift-sqlalchemy-dialect-to-interact-with-amazon-redshift/[/white dim]"
226
+ "[white dim]└── https://aws.amazon.com/blogs/big-data/use-the-amazon-redshift-sqlalchemy-dialect-to-interact-with-amazon-redshift/[/white dim]"
146
227
  )
147
228
 
148
229
  print()
149
230
  print(
150
- f"[bold green]Databricks:[/bold green] [white]databricks://token:<access_token>@<server_hostname>?http_path=<http_path>&catalog=<catalog>&schema=<schema>[/white]"
231
+ "[bold green]Databricks:[/bold green] [white]databricks://token:<access_token>@<server_hostname>?http_path=<http_path>&catalog=<catalog>&schema=<schema>[/white]"
151
232
  )
152
- print(f"[white dim]└── https://docs.databricks.com/en/dev-tools/sqlalchemy.html")
233
+ print("[white dim]└── https://docs.databricks.com/en/dev-tools/sqlalchemy.html")
153
234
 
154
235
  print()
155
236
  print(
156
- f"[bold green]Microsoft SQL Server:[/bold green] [white]mssql://user:password@host:port/dbname?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes [/white]"
237
+ "[bold green]Microsoft SQL Server:[/bold green] [white]mssql://user:password@host:port/dbname?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes [/white]"
157
238
  )
158
239
  print(
159
- f"[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#microsoft-sql-server"
240
+ "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#microsoft-sql-server"
160
241
  )
161
242
 
162
243
  print()
163
244
  print(
164
- f"[bold green]MySQL:[/bold green] [white]mysql://user:password@host:port/dbname [/white]"
245
+ "[bold green]MySQL:[/bold green] [white]mysql://user:password@host:port/dbname [/white]"
165
246
  )
166
247
  print(
167
- f"[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#mysql[/white dim]"
248
+ "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#mysql[/white dim]"
168
249
  )
169
250
 
170
251
  print()
171
- print(f"[bold green]DuckDB:[/bold green] [white]duckdb://path/to/database [/white]")
172
- print(f"[white dim]└── https://github.com/Mause/duckdb_engine[/white dim]")
252
+ print("[bold green]DuckDB:[/bold green] [white]duckdb://path/to/database [/white]")
253
+ print("[white dim]└── https://github.com/Mause/duckdb_engine[/white dim]")
173
254
 
174
255
  print()
175
- print(f"[bold green]SQLite:[/bold green] [white]sqlite://path/to/database [/white]")
256
+ print("[bold green]SQLite:[/bold green] [white]sqlite://path/to/database [/white]")
176
257
  print(
177
- f"[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#sqlite[/white dim]"
258
+ "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#sqlite[/white dim]"
178
259
  )
179
260
 
180
261
  print()
181
262
  typer.echo(
182
263
  "These are all coming from SQLAlchemy's URI format, so they should be familiar to most users."
183
264
  )
265
+ track(
266
+ "command_finished",
267
+ {
268
+ "command": "example-uris",
269
+ "status": "success",
270
+ },
271
+ )
184
272
 
185
273
 
186
274
  def main():
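
Note: the rewritten `ingest` command above normalizes the user-facing `delete+insert` strategy onto dlt's `merge` disposition (reusing the incremental key as the merge key) and derives the pipeline name from a SHA-256 of the destination table. A minimal sketch of those two steps, with hypothetical helper names and only the standard library:

    import hashlib
    from typing import Optional, Tuple

    def normalize_strategy(
        incremental_strategy: str, incremental_key: Optional[str]
    ) -> Tuple[str, Optional[str]]:
        # "delete+insert" is expressed through dlt's "merge" disposition: the
        # incremental key doubles as the merge key, so each run deletes and
        # re-inserts the rows that fall inside the loaded interval.
        merge_key = None
        if incremental_strategy == "delete+insert":
            merge_key = incremental_key
            incremental_strategy = "merge"
        return incremental_strategy, merge_key

    def pipeline_name(dest_table: str) -> str:
        # Hashing the destination table keeps the dlt pipeline name stable
        # across runs that target the same table.
        return hashlib.sha256(dest_table.encode("utf-8")).hexdigest()

    print(normalize_strategy("delete+insert", "updated_at"))  # ('merge', 'updated_at')
    print(pipeline_name("testschema.output")[:12])
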
ingestr/main_test.py ADDED
@@ -0,0 +1,579 @@
1
+ import os
2
+ import shutil
3
+
4
+ import duckdb
5
+ import pytest
6
+ from typer.testing import CliRunner
7
+
8
+ from ingestr.main import app
9
+
10
+ runner = CliRunner()
11
+
12
+
13
+ def get_abs_path(relative_path):
14
+ return os.path.abspath(os.path.join(os.path.dirname(__file__), relative_path))
15
+
16
+
17
+ def invoke_ingest_command(
18
+ source_uri,
19
+ source_table,
20
+ dest_uri,
21
+ dest_table,
22
+ inc_strategy=None,
23
+ inc_key=None,
24
+ primary_key=None,
25
+ merge_key=None,
26
+ interval_start=None,
27
+ interval_end=None,
28
+ ):
29
+ args = [
30
+ "ingest",
31
+ "--source-uri",
32
+ source_uri,
33
+ "--source-table",
34
+ source_table,
35
+ "--dest-uri",
36
+ dest_uri,
37
+ "--dest-table",
38
+ dest_table,
39
+ ]
40
+
41
+ if inc_strategy:
42
+ args.append("--incremental-strategy")
43
+ args.append(inc_strategy)
44
+
45
+ if inc_key:
46
+ args.append("--incremental-key")
47
+ args.append(inc_key)
48
+
49
+ if primary_key:
50
+ args.append("--primary-key")
51
+ args.append(primary_key)
52
+
53
+ if merge_key:
54
+ args.append("--merge-key")
55
+ args.append(merge_key)
56
+
57
+ if interval_start:
58
+ args.append("--interval-start")
59
+ args.append(interval_start)
60
+
61
+ if interval_end:
62
+ args.append("--interval-end")
63
+ args.append(interval_end)
64
+
65
+ result = runner.invoke(
66
+ app,
67
+ args,
68
+ input="y\n",
69
+ env={"DISABLE_TELEMETRY": "true"},
70
+ )
71
+ return result
72
+
73
+
74
+ def test_create_replace():
75
+ abs_db_path = get_abs_path("./testdata/test_create_replace.db")
76
+ rel_db_path_to_command = "ingestr/testdata/test_create_replace.db"
77
+
78
+ conn = duckdb.connect(abs_db_path)
79
+ conn.execute("DROP SCHEMA IF EXISTS testschema CASCADE")
80
+ conn.execute("CREATE SCHEMA testschema")
81
+ conn.execute(
82
+ "CREATE TABLE testschema.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
83
+ )
84
+ conn.execute("INSERT INTO testschema.input VALUES (1, 'val1', '2022-01-01')")
85
+ conn.execute("INSERT INTO testschema.input VALUES (2, 'val2', '2022-02-01')")
86
+
87
+ res = conn.sql("select count(*) from testschema.input").fetchall()
88
+ assert res[0][0] == 2
89
+
90
+ result = invoke_ingest_command(
91
+ f"duckdb:///{rel_db_path_to_command}",
92
+ "testschema.input",
93
+ f"duckdb:///{rel_db_path_to_command}",
94
+ "testschema.output",
95
+ )
96
+
97
+ assert result.exit_code == 0
98
+
99
+ res = conn.sql(
100
+ "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema.output"
101
+ ).fetchall()
102
+ assert len(res) == 2
103
+ assert res[0] == (1, "val1", "2022-01-01")
104
+ assert res[1] == (2, "val2", "2022-02-01")
105
+
106
+
107
+ @pytest.mark.skip(
108
+ reason="this doesn't work at the moment due to a bug with dlt: https://github.com/dlt-hub/dlt/issues/971"
109
+ )
110
+ def test_append():
111
+ try:
112
+ shutil.rmtree(get_abs_path("../pipeline_data"))
113
+ except Exception:
114
+ pass
115
+
116
+ abs_db_path = get_abs_path("./testdata/test_append.db")
117
+ rel_db_path_to_command = "ingestr/testdata/test_append.db"
118
+ uri = f"duckdb:///{rel_db_path_to_command}"
119
+
120
+ conn = duckdb.connect(abs_db_path)
121
+ conn.execute("DROP SCHEMA IF EXISTS testschema_append CASCADE")
122
+ conn.execute("CHECKPOINT")
123
+
124
+ conn.execute("CREATE SCHEMA testschema_append")
125
+ conn.execute(
126
+ "CREATE TABLE testschema_append.input (id INTEGER, val VARCHAR, updated_at DATE)"
127
+ )
128
+ conn.execute(
129
+ "INSERT INTO testschema_append.input VALUES (1, 'val1', '2022-01-01'), (2, 'val2', '2022-01-02')"
130
+ )
131
+ conn.execute("CHECKPOINT")
132
+
133
+ res = conn.sql("select count(*) from testschema_append.input").fetchall()
134
+ assert res[0][0] == 2
135
+
136
+ def run():
137
+ res = invoke_ingest_command(
138
+ uri,
139
+ "testschema_append.input",
140
+ uri,
141
+ "testschema_append.output",
142
+ "append",
143
+ "updated_at",
144
+ )
145
+ assert res.exit_code == 0
146
+
147
+ def get_output_table():
148
+ conn.execute("CHECKPOINT")
149
+ return conn.sql(
150
+ "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_append.output"
151
+ ).fetchall()
152
+
153
+ run()
154
+
155
+ res = get_output_table()
156
+ assert len(res) == 2
157
+ assert res[0] == (1, "val1", "2022-01-01")
158
+ assert res[1] == (2, "val2", "2022-01-02")
159
+
160
+ # # run again, nothing should be inserted into the output table
161
+ run()
162
+
163
+ res = get_output_table()
164
+ assert len(res) == 2
165
+ assert res[0] == (1, "val1", "2022-01-01")
166
+ assert res[1] == (2, "val2", "2022-02-01")
167
+
168
+
169
+ def test_merge_with_primary_key():
170
+ try:
171
+ shutil.rmtree(get_abs_path("../pipeline_data"))
172
+ except Exception:
173
+ pass
174
+
175
+ abs_db_path = get_abs_path("./testdata/test_merge_with_primary_key.db")
176
+ rel_db_path_to_command = "ingestr/testdata/test_merge_with_primary_key.db"
177
+ uri = f"duckdb:///{rel_db_path_to_command}"
178
+
179
+ conn = duckdb.connect(abs_db_path)
180
+ conn.execute("DROP SCHEMA IF EXISTS testschema_merge CASCADE")
181
+ conn.execute("CREATE SCHEMA testschema_merge")
182
+ conn.execute(
183
+ "CREATE TABLE testschema_merge.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
184
+ )
185
+ conn.execute("INSERT INTO testschema_merge.input VALUES (1, 'val1', '2022-01-01')")
186
+ conn.execute("INSERT INTO testschema_merge.input VALUES (2, 'val2', '2022-02-01')")
187
+
188
+ res = conn.sql("select count(*) from testschema_merge.input").fetchall()
189
+ assert res[0][0] == 2
190
+
191
+ def run():
192
+ res = invoke_ingest_command(
193
+ uri,
194
+ "testschema_merge.input",
195
+ uri,
196
+ "testschema_merge.output",
197
+ "merge",
198
+ "updated_at",
199
+ "id",
200
+ )
201
+ assert res.exit_code == 0
202
+ return res
203
+
204
+ def get_output_rows():
205
+ conn.execute("CHECKPOINT")
206
+ return conn.sql(
207
+ "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_merge.output order by id asc"
208
+ ).fetchall()
209
+
210
+ def assert_output_equals(expected):
211
+ res = get_output_rows()
212
+ assert len(res) == len(expected)
213
+ for i, row in enumerate(expected):
214
+ assert res[i] == row
215
+
216
+ run()
217
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
218
+
219
+ first_run_id = conn.sql(
220
+ "select _dlt_load_id from testschema_merge.output limit 1"
221
+ ).fetchall()[0][0]
222
+
223
+ ##############################
224
+ # we'll run again, we don't expect any changes since the data hasn't changed
225
+ run()
226
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
227
+
228
+ # we also ensure that the other rows were not touched
229
+ count_by_run_id = conn.sql(
230
+ "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
231
+ ).fetchall()
232
+ assert len(count_by_run_id) == 1
233
+ assert count_by_run_id[0][1] == 2
234
+ assert count_by_run_id[0][0] == first_run_id
235
+ ##############################
236
+
237
+ ##############################
238
+ # now we'll modify the source data but not the updated at, the output table should not be updated
239
+ conn.execute("UPDATE testschema_merge.input SET val = 'val1_modified' WHERE id = 2")
240
+
241
+ run()
242
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
243
+
244
+ # we also ensure that the other rows were not touched
245
+ count_by_run_id = conn.sql(
246
+ "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
247
+ ).fetchall()
248
+ assert len(count_by_run_id) == 1
249
+ assert count_by_run_id[0][1] == 2
250
+ assert count_by_run_id[0][0] == first_run_id
251
+ ##############################
252
+
253
+ ##############################
254
+ # now we'll insert a new row but with an old date, the new row will not show up
255
+ conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-01-01')")
256
+
257
+ run()
258
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
259
+
260
+ # we also ensure that the other rows were not touched
261
+ count_by_run_id = conn.sql(
262
+ "select _dlt_load_id, count(*) from testschema_merge.output group by 1"
263
+ ).fetchall()
264
+ assert len(count_by_run_id) == 1
265
+ assert count_by_run_id[0][1] == 2
266
+ assert count_by_run_id[0][0] == first_run_id
267
+ ##############################
268
+
269
+ ##############################
270
+ # now we'll insert a new row but with a new date, the new row will show up
271
+ conn.execute("INSERT INTO testschema_merge.input VALUES (3, 'val3', '2022-02-02')")
272
+
273
+ run()
274
+ assert_output_equals(
275
+ [
276
+ (1, "val1", "2022-01-01"),
277
+ (2, "val2", "2022-02-01"),
278
+ (3, "val3", "2022-02-02"),
279
+ ]
280
+ )
281
+
282
+ # we have a new run that inserted rows to this table, so the run count should be 2
283
+ count_by_run_id = conn.sql(
284
+ "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc"
285
+ ).fetchall()
286
+ assert len(count_by_run_id) == 2
287
+ assert count_by_run_id[0][1] == 2
288
+ assert count_by_run_id[0][0] == first_run_id
289
+ # we don't care about the run ID
290
+ assert count_by_run_id[1][1] == 1
291
+ ##############################
292
+
293
+ ##############################
294
+ # lastly, let's try modifying the updated_at of an old column, it should be updated in the output table
295
+ conn.execute(
296
+ "UPDATE testschema_merge.input SET val='val2_modified', updated_at = '2022-02-03' WHERE id = 2"
297
+ )
298
+
299
+ run()
300
+ assert_output_equals(
301
+ [
302
+ (1, "val1", "2022-01-01"),
303
+ (2, "val2_modified", "2022-02-03"),
304
+ (3, "val3", "2022-02-02"),
305
+ ]
306
+ )
307
+
308
+ # we have a new run that inserted rows to this table, so the run count should be 2
309
+ count_by_run_id = conn.sql(
310
+ "select _dlt_load_id, count(*) from testschema_merge.output group by 1 order by 2 desc, 1 asc"
311
+ ).fetchall()
312
+ assert len(count_by_run_id) == 3
313
+ assert count_by_run_id[0][1] == 1
314
+ assert count_by_run_id[0][0] == first_run_id
315
+ # we don't care about the rest of the run IDs
316
+ assert count_by_run_id[1][1] == 1
317
+ assert count_by_run_id[2][1] == 1
318
+ ##############################
319
+
320
+
321
+ def test_delete_insert_without_primary_key():
322
+ try:
323
+ shutil.rmtree(get_abs_path("../pipeline_data"))
324
+ except Exception:
325
+ pass
326
+
327
+ abs_db_path = get_abs_path("./testdata/test_delete_insert_without_primary_key.db")
328
+ rel_db_path_to_command = (
329
+ "ingestr/testdata/test_delete_insert_without_primary_key.db"
330
+ )
331
+ uri = f"duckdb:///{rel_db_path_to_command}"
332
+
333
+ conn = duckdb.connect(abs_db_path)
334
+ conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert CASCADE")
335
+ conn.execute("CREATE SCHEMA testschema_delete_insert")
336
+ conn.execute(
337
+ "CREATE TABLE testschema_delete_insert.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
338
+ )
339
+ conn.execute(
340
+ "INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01')"
341
+ )
342
+ conn.execute(
343
+ "INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01')"
344
+ )
345
+
346
+ res = conn.sql("select count(*) from testschema_delete_insert.input").fetchall()
347
+ assert res[0][0] == 2
348
+
349
+ def run():
350
+ res = invoke_ingest_command(
351
+ uri,
352
+ "testschema_delete_insert.input",
353
+ uri,
354
+ "testschema_delete_insert.output",
355
+ inc_strategy="delete+insert",
356
+ inc_key="updated_at",
357
+ )
358
+ assert res.exit_code == 0
359
+ return res
360
+
361
+ def get_output_rows():
362
+ conn.execute("CHECKPOINT")
363
+ return conn.sql(
364
+ "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_delete_insert.output order by id asc"
365
+ ).fetchall()
366
+
367
+ def assert_output_equals(expected):
368
+ res = get_output_rows()
369
+ assert len(res) == len(expected)
370
+ for i, row in enumerate(expected):
371
+ assert res[i] == row
372
+
373
+ run()
374
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
375
+
376
+ first_run_id = conn.sql(
377
+ "select _dlt_load_id from testschema_delete_insert.output limit 1"
378
+ ).fetchall()[0][0]
379
+
380
+ ##############################
381
+ # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
382
+ run()
383
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-02-01")])
384
+
385
+ # we ensure that one of the rows is updated with a new run
386
+ count_by_run_id = conn.sql(
387
+ "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 1 asc"
388
+ ).fetchall()
389
+ assert len(count_by_run_id) == 2
390
+ assert count_by_run_id[0][0] == first_run_id
391
+ assert count_by_run_id[0][1] == 1
392
+ assert count_by_run_id[1][0] != first_run_id
393
+ assert count_by_run_id[1][1] == 1
394
+ ##############################
395
+
396
+ ##############################
397
+ # now we'll insert a few more lines for the same day, the new rows should show up
398
+ conn.execute(
399
+ "INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01'), (4, 'val4', '2022-02-01')"
400
+ )
401
+ conn.execute("CHECKPOINT")
402
+
403
+ run()
404
+ assert_output_equals(
405
+ [
406
+ (1, "val1", "2022-01-01"),
407
+ (2, "val2", "2022-02-01"),
408
+ (3, "val3", "2022-02-01"),
409
+ (4, "val4", "2022-02-01"),
410
+ ]
411
+ )
412
+
413
+ # the new rows should have a new run ID, there should be 2 distinct runs now
414
+ count_by_run_id = conn.sql(
415
+ "select _dlt_load_id, count(*) from testschema_delete_insert.output group by 1 order by 2 desc, 1 asc"
416
+ ).fetchall()
417
+ assert len(count_by_run_id) == 2
418
+ assert count_by_run_id[0][0] != first_run_id
419
+ assert count_by_run_id[0][1] == 3 # 2 new rows + 1 old row
420
+ assert count_by_run_id[1][0] == first_run_id
421
+ assert count_by_run_id[1][1] == 1
422
+ ##############################
423
+
424
+
425
+ def test_delete_insert_with_timerange():
426
+ try:
427
+ shutil.rmtree(get_abs_path("../pipeline_data"))
428
+ except Exception:
429
+ pass
430
+
431
+ abs_db_path = get_abs_path("./testdata/test_delete_insert_with_timerange.db")
432
+ rel_db_path_to_command = "ingestr/testdata/test_delete_insert_with_timerange.db"
433
+ uri = f"duckdb:///{rel_db_path_to_command}"
434
+
435
+ conn = duckdb.connect(abs_db_path)
436
+ conn.execute("DROP SCHEMA IF EXISTS testschema_delete_insert_timerange CASCADE")
437
+ conn.execute("CREATE SCHEMA testschema_delete_insert_timerange")
438
+ conn.execute(
439
+ "CREATE TABLE testschema_delete_insert_timerange.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP)"
440
+ )
441
+ conn.execute(
442
+ """INSERT INTO testschema_delete_insert_timerange.input VALUES
443
+ (1, 'val1', '2022-01-01'),
444
+ (2, 'val2', '2022-01-01'),
445
+ (3, 'val3', '2022-01-02'),
446
+ (4, 'val4', '2022-01-02'),
447
+ (5, 'val5', '2022-01-03'),
448
+ (6, 'val6', '2022-01-03')
449
+ """
450
+ )
451
+
452
+ res = conn.sql(
453
+ "select count(*) from testschema_delete_insert_timerange.input"
454
+ ).fetchall()
455
+ assert res[0][0] == 6
456
+
457
+ def run(start_date: str, end_date: str):
458
+ res = invoke_ingest_command(
459
+ uri,
460
+ "testschema_delete_insert_timerange.input",
461
+ uri,
462
+ "testschema_delete_insert_timerange.output",
463
+ inc_strategy="delete+insert",
464
+ inc_key="updated_at",
465
+ interval_start=start_date,
466
+ interval_end=end_date,
467
+ )
468
+ assert res.exit_code == 0
469
+ return res
470
+
471
+ def get_output_rows():
472
+ conn.execute("CHECKPOINT")
473
+ return conn.sql(
474
+ "select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_delete_insert_timerange.output order by id asc"
475
+ ).fetchall()
476
+
477
+ def assert_output_equals(expected):
478
+ res = get_output_rows()
479
+ assert len(res) == len(expected)
480
+ for i, row in enumerate(expected):
481
+ assert res[i] == row
482
+
483
+ run(
484
+ "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
485
+ ) # dlt runs them with the end date exclusive
486
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
487
+
488
+ first_run_id = conn.sql(
489
+ "select _dlt_load_id from testschema_delete_insert_timerange.output limit 1"
490
+ ).fetchall()[0][0]
491
+
492
+ ##############################
493
+ # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
494
+ run(
495
+ "2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z"
496
+ ) # dlt runs them with the end date exclusive
497
+ assert_output_equals([(1, "val1", "2022-01-01"), (2, "val2", "2022-01-01")])
498
+
499
+ # both rows should have a new run ID
500
+ count_by_run_id = conn.sql(
501
+ "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
502
+ ).fetchall()
503
+ assert len(count_by_run_id) == 1
504
+ assert count_by_run_id[0][0] != first_run_id
505
+ assert count_by_run_id[0][1] == 2
506
+ ##############################
507
+
508
+ ##############################
509
+ # now run for the day after, new rows should land
510
+ run("2022-01-02T00:00:00Z", "2022-01-03T00:00:00Z")
511
+ assert_output_equals(
512
+ [
513
+ (1, "val1", "2022-01-01"),
514
+ (2, "val2", "2022-01-01"),
515
+ (3, "val3", "2022-01-02"),
516
+ (4, "val4", "2022-01-02"),
517
+ ]
518
+ )
519
+
520
+ # there should be 4 rows with 2 distinct run IDs
521
+ count_by_run_id = conn.sql(
522
+ "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
523
+ ).fetchall()
524
+ assert len(count_by_run_id) == 2
525
+ assert count_by_run_id[0][1] == 2
526
+ assert count_by_run_id[1][1] == 2
527
+ ##############################
528
+
529
+ ##############################
530
+ # let's bring in the rows for the third day
531
+ run("2022-01-03T00:00:00Z", "2022-01-04T00:00:00Z")
532
+ assert_output_equals(
533
+ [
534
+ (1, "val1", "2022-01-01"),
535
+ (2, "val2", "2022-01-01"),
536
+ (3, "val3", "2022-01-02"),
537
+ (4, "val4", "2022-01-02"),
538
+ (5, "val5", "2022-01-03"),
539
+ (6, "val6", "2022-01-03"),
540
+ ]
541
+ )
542
+
543
+ # there should be 6 rows with 3 distinct run IDs
544
+ count_by_run_id = conn.sql(
545
+ "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
546
+ ).fetchall()
547
+ assert len(count_by_run_id) == 3
548
+ assert count_by_run_id[0][1] == 2
549
+ assert count_by_run_id[1][1] == 2
550
+ assert count_by_run_id[2][1] == 2
551
+ ##############################
552
+
553
+ ##############################
554
+ # now let's do a backfill for the first day again, the rows should be updated
555
+ conn.execute(
556
+ "UPDATE testschema_delete_insert_timerange.input SET val = 'val1_modified' WHERE id = 1"
557
+ )
558
+
559
+ run("2022-01-01T00:00:00Z", "2022-01-02T00:00:00Z")
560
+ assert_output_equals(
561
+ [
562
+ (1, "val1_modified", "2022-01-01"),
563
+ (2, "val2", "2022-01-01"),
564
+ (3, "val3", "2022-01-02"),
565
+ (4, "val4", "2022-01-02"),
566
+ (5, "val5", "2022-01-03"),
567
+ (6, "val6", "2022-01-03"),
568
+ ]
569
+ )
570
+
571
+ # there should still be 6 rows with 3 distinct run IDs
572
+ count_by_run_id = conn.sql(
573
+ "select _dlt_load_id, count(*) from testschema_delete_insert_timerange.output group by 1 order by 1 asc"
574
+ ).fetchall()
575
+ assert len(count_by_run_id) == 3
576
+ assert count_by_run_id[0][1] == 2
577
+ assert count_by_run_id[1][1] == 2
578
+ assert count_by_run_id[2][1] == 2
579
+ ##############################
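
Note: the new test module drives the CLI end-to-end with `typer.testing.CliRunner`, answering the confirmation prompt via `input="y\n"` and disabling telemetry through the environment. A self-contained sketch of that pattern against a toy command (the command and assertions are illustrative, not ingestr's):

    import typer
    from typer.testing import CliRunner

    app = typer.Typer()

    @app.command()
    def ingest(source_uri: str = typer.Option(...), dest_uri: str = typer.Option(...)):
        # Mirror the confirmation step the real `ingest` command performs.
        if not typer.confirm("Are you sure you would like to continue?"):
            raise typer.Abort()
        typer.echo(f"copied {source_uri} -> {dest_uri}")

    runner = CliRunner()

    def test_ingest_confirms_before_running():
        result = runner.invoke(
            app,
            ["--source-uri", "duckdb:///in.db", "--dest-uri", "duckdb:///out.db"],
            input="y\n",                        # answer the confirmation prompt
            env={"DISABLE_TELEMETRY": "true"},  # same switch the real tests set
        )
        assert result.exit_code == 0
        assert "copied" in result.output
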
ingestr/src/factory.py CHANGED
@@ -3,6 +3,7 @@ from urllib.parse import urlparse
3
3
 
4
4
  from ingestr.src.destinations import (
5
5
  BigQueryDestination,
6
+ DatabricksDestination,
6
7
  DuckDBDestination,
7
8
  MsSQLDestination,
8
9
  PostgresDestination,
@@ -59,12 +60,13 @@ class SourceDestinationFactory:
59
60
  def get_destination(self) -> DestinationProtocol:
60
61
  match: dict[str, DestinationProtocol] = {
61
62
  "bigquery": BigQueryDestination(),
63
+ "databricks": DatabricksDestination(),
64
+ "duckdb": DuckDBDestination(),
65
+ "mssql": MsSQLDestination(),
62
66
  "postgres": PostgresDestination(),
63
67
  "postgresql": PostgresDestination(),
64
- "snowflake": SnowflakeDestination(),
65
68
  "redshift": RedshiftDestination(),
66
- "duckdb": DuckDBDestination(),
67
- "mssql": MsSQLDestination(),
69
+ "snowflake": SnowflakeDestination(),
68
70
  }
69
71
 
70
72
  if self.destination_scheme in match:
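
Note: the factory change above adds `DatabricksDestination` and reorders the scheme-to-destination mapping alphabetically. The dispatch itself is a plain dict keyed by the URI scheme; a stripped-down sketch (class and function names here are illustrative, not ingestr's actual API):

    from typing import Protocol
    from urllib.parse import urlparse

    class DestinationProtocol(Protocol):
        def dlt_dest(self, uri: str) -> str: ...

    class DuckDBDestination:
        def dlt_dest(self, uri: str) -> str:
            return f"duckdb destination for {uri}"

    class PostgresDestination:
        def dlt_dest(self, uri: str) -> str:
            return f"postgres destination for {uri}"

    def get_destination(dest_uri: str) -> DestinationProtocol:
        scheme = urlparse(dest_uri).scheme
        match: dict[str, DestinationProtocol] = {
            "duckdb": DuckDBDestination(),
            "postgres": PostgresDestination(),
            "postgresql": PostgresDestination(),
        }
        if scheme not in match:
            raise ValueError(f"Unsupported destination scheme: {scheme}")
        return match[scheme]

    print(get_destination("duckdb:///local.db").dlt_dest("duckdb:///local.db"))
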
ingestr/src/sources.py CHANGED
@@ -1,7 +1,6 @@
1
1
  from typing import Callable
2
2
 
3
3
  import dlt
4
- import pendulum
5
4
 
6
5
  from ingestr.src.sql_database import sql_table
7
6
 
@@ -19,10 +18,14 @@ class SqlSource:
19
18
 
20
19
  incremental = None
21
20
  if kwargs.get("incremental_key"):
21
+ start_value = kwargs.get("interval_start")
22
+ end_value = kwargs.get("interval_end")
23
+
22
24
  incremental = dlt.sources.incremental(
23
25
  kwargs.get("incremental_key", ""),
24
- primary_key=(),
25
- initial_value=pendulum.now(),
26
+ # primary_key=(),
27
+ initial_value=start_value,
28
+ end_value=end_value,
26
29
  )
27
30
 
28
31
  table_instance = self.table_builder(
@@ -30,6 +33,7 @@ class SqlSource:
30
33
  schema=table_fields[-2],
31
34
  table=table_fields[-1],
32
35
  incremental=incremental,
36
+ merge_key=kwargs.get("merge_key"),
33
37
  )
34
38
 
35
39
  return table_instance
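
Note: above, the hard-coded `pendulum.now()` initial value is replaced by the user-supplied `interval_start`/`interval_end`, and a `merge_key` is threaded through to the table resource. A hedged sketch of how such an incremental hint is built with dlt (the wrapper function is hypothetical):

    import dlt

    def build_incremental(incremental_key, interval_start=None, interval_end=None):
        # Without an incremental key there is nothing to filter on.
        if not incremental_key:
            return None
        # dlt keeps rows whose cursor column is >= initial_value and, when
        # given, < end_value, and remembers the last seen value between runs.
        return dlt.sources.incremental(
            incremental_key,
            initial_value=interval_start,
            end_value=interval_end,
        )

    hint = build_incremental("updated_at", "2022-01-01", "2022-01-02")
    print(hint.cursor_path, hint.initial_value, hint.end_value)
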
ingestr/src/sources_test.py CHANGED
@@ -22,11 +22,12 @@ class SqlSourceTest(unittest.TestCase):
22
22
  table = "schema.table"
23
23
 
24
24
  # monkey patch the sql_table function
25
- def sql_table(credentials, schema, table, incremental):
25
+ def sql_table(credentials, schema, table, incremental, merge_key):
26
26
  self.assertEqual(credentials, uri)
27
27
  self.assertEqual(schema, "schema")
28
28
  self.assertEqual(table, "table")
29
29
  self.assertIsNone(incremental)
30
+ self.assertIsNone(merge_key)
30
31
  return dlt.resource()
31
32
 
32
33
  source = SqlSource(table_builder=sql_table)
@@ -39,12 +40,13 @@ class SqlSourceTest(unittest.TestCase):
39
40
  incremental_key = "id"
40
41
 
41
42
  # monkey patch the sql_table function
42
- def sql_table(credentials, schema, table, incremental):
43
+ def sql_table(credentials, schema, table, incremental, merge_key):
43
44
  self.assertEqual(credentials, uri)
44
45
  self.assertEqual(schema, "schema")
45
46
  self.assertEqual(table, "table")
46
47
  self.assertIsInstance(incremental, dlt.sources.incremental)
47
48
  self.assertEqual(incremental.cursor_path, incremental_key)
49
+ self.assertIsNone(merge_key)
48
50
  return dlt.resource()
49
51
 
50
52
  source = SqlSource(table_builder=sql_table)
ingestr/src/sql_database/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""
2
2
 
3
- from typing import Any, Iterable, List, Optional, Union
3
+ from typing import Any, Optional, Union
4
4
 
5
5
  import dlt
6
6
  from dlt.sources import DltResource
@@ -9,7 +9,6 @@ from sqlalchemy import MetaData, Table
9
9
  from sqlalchemy.engine import Engine
10
10
 
11
11
  from .helpers import (
12
- SqlDatabaseTableConfiguration,
13
12
  engine_from_credentials,
14
13
  get_primary_key,
15
14
  table_rows,
@@ -17,51 +16,6 @@ from .helpers import (
17
16
  from .schema_types import table_to_columns
18
17
 
19
18
 
20
- @dlt.source
21
- def sql_database(
22
- credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
23
- schema: Optional[str] = dlt.config.value,
24
- metadata: Optional[MetaData] = None,
25
- table_names: Optional[List[str]] = dlt.config.value,
26
- detect_precision_hints: Optional[bool] = dlt.config.value,
27
- ) -> Iterable[DltResource]:
28
- """
29
- A DLT source which loads data from an SQL database using SQLAlchemy.
30
- Resources are automatically created for each table in the schema or from the given list of tables.
31
-
32
- Args:
33
- credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
34
- schema (Optional[str]): Name of the database schema to load (if different from default).
35
- metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
36
- table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
37
- detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
38
- This is disabled by default.
39
- Returns:
40
- Iterable[DltResource]: A list of DLT resources for each table to be loaded.
41
- """
42
-
43
- # set up alchemy engine
44
- engine = engine_from_credentials(credentials)
45
- engine.execution_options(stream_results=True)
46
- metadata = metadata or MetaData(schema=schema)
47
-
48
- # use provided tables or all tables
49
- if table_names:
50
- tables = [Table(name, metadata, autoload_with=engine) for name in table_names]
51
- else:
52
- metadata.reflect(bind=engine)
53
- tables = list(metadata.tables.values())
54
-
55
- for table in tables:
56
- yield dlt.resource(
57
- table_rows,
58
- name=table.name,
59
- primary_key=get_primary_key(table),
60
- spec=SqlDatabaseTableConfiguration,
61
- columns=table_to_columns(table) if detect_precision_hints else None,
62
- )(engine, table)
63
-
64
-
65
19
  def sql_table(
66
20
  credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
67
21
  table: str = dlt.config.value,
@@ -69,6 +23,7 @@ def sql_table(
69
23
  metadata: Optional[MetaData] = None,
70
24
  incremental: Optional[dlt.sources.incremental[Any]] = None,
71
25
  detect_precision_hints: Optional[bool] = dlt.config.value,
26
+ merge_key: Optional[str] = None,
72
27
  ) -> DltResource:
73
28
  """
74
29
  A dlt resource which loads data from an SQL database table using SQLAlchemy.
@@ -101,4 +56,5 @@ def sql_table(
101
56
  name=table_obj.name,
102
57
  primary_key=get_primary_key(table_obj),
103
58
  columns=table_to_columns(table_obj) if detect_precision_hints else None,
59
+ merge_key=merge_key, # type: ignore
104
60
  )(engine, table_obj, incremental=incremental)
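
Note: the trimmed-down `sql_table` above now forwards an optional `merge_key` to `dlt.resource`, alongside the primary-key and column hints. A minimal illustration of a resource carrying such a merge hint (the table name and rows are made up):

    import dlt

    @dlt.resource(name="output", write_disposition="merge", merge_key="updated_at")
    def rows():
        # With a merge disposition and a merge key, loading this resource
        # replaces previously loaded rows that share the same merge-key values.
        yield {"id": 1, "val": "val1", "updated_at": "2022-01-01"}
        yield {"id": 2, "val": "val2", "updated_at": "2022-02-01"}

    if __name__ == "__main__":
        pipeline = dlt.pipeline(pipeline_name="merge_demo", destination="duckdb")
        print(pipeline.run(rows()))
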
ingestr/src/sql_database/helpers.py CHANGED
@@ -63,7 +63,6 @@ class TableLoader:
63
63
  query = query.order_by(order_by)
64
64
  if self.last_value is None:
65
65
  return query
66
-
67
66
  return query.where(filter_op(self.cursor_column, self.last_value))
68
67
 
69
68
  def load_rows(self) -> Iterator[List[TDataItem]]:
ingestr/src/telemetry/event.py ADDED
@@ -0,0 +1,14 @@
1
+ import os
2
+
3
+ import machineid
4
+ import rudderstack.analytics as rudder_analytics # type: ignore
5
+
6
+ rudder_analytics.write_key = "2cUr13DDQcX2x2kAfMEfdrKvrQa"
7
+ rudder_analytics.dataPlaneUrl = "https://getbruinbumlky.dataplane.rudderstack.com"
8
+
9
+
10
+ def track(event_name, event_properties):
11
+ if os.environ.get("DISABLE_TELEMETRY", False):
12
+ return
13
+
14
+ rudder_analytics.track(machineid.hashed_id(), event_name, event_properties)
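
Note: the new telemetry module above sends anonymous events keyed by a hashed machine ID via RudderStack, and skips sending whenever `DISABLE_TELEMETRY` is set; since the check is plain string truthiness, any non-empty value (even "false") disables it. A small opt-out sketch, assuming the package is installed:

    import os

    # Opt out before calling track(); any non-empty value works because the
    # module only checks truthiness of the environment variable.
    os.environ["DISABLE_TELEMETRY"] = "true"

    from ingestr.src.telemetry.event import track

    # With the variable set, this returns immediately and sends nothing.
    track("command_triggered", {"command": "ingest"})
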
Binary file
Binary file
ingestr-0.0.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ingestr
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
5
5
  Project-URL: Homepage, https://github.com/bruin-data/ingestr
6
6
  Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -16,11 +16,14 @@ Classifier: Topic :: Database
16
16
  Requires-Python: >=3.9
17
17
  Requires-Dist: databricks-sql-connector==2.9.3
18
18
  Requires-Dist: dlt==0.4.3
19
+ Requires-Dist: duckdb-engine==0.11.1
19
20
  Requires-Dist: duckdb==0.9.2
21
+ Requires-Dist: google-cloud-bigquery-storage
20
22
  Requires-Dist: pendulum==3.0.0
21
23
  Requires-Dist: psycopg2==2.9.9
22
24
  Requires-Dist: pyodbc==5.1.0
23
25
  Requires-Dist: rich==13.7.0
26
+ Requires-Dist: rudder-sdk-python==2.0.2
24
27
  Requires-Dist: snowflake-sqlalchemy==1.5.1
25
28
  Requires-Dist: sqlalchemy-bigquery==1.9.0
26
29
  Requires-Dist: sqlalchemy2-stubs==0.0.2a38
@@ -32,18 +35,19 @@ Description-Content-Type: text/markdown
32
35
  <div align="center">
33
36
  <img src="./resources/ingestr.svg" width="500" />
34
37
  <p>Ingest & copy data from any source to any destination without any code</p>
38
+ <img src="./resources/demo.gif" width="500" />
35
39
  </div>
36
40
 
41
+
37
42
  -----
38
43
 
39
44
  Ingestr is a command-line application that allows you to ingest data from any source into any destination using simple command-line flags, no code necessary.
40
45
 
41
- - ✨ copy data from your Postges / Mongo / BigQuery or any other source into any destination
42
- - ➕ incremental loading
46
+ - ✨ copy data from your database into any destination
47
+ - ➕ incremental loading: `append`, `merge` or `delete+insert`
43
48
  - 🐍 single-command installation
44
- - 💅 Docker image for easy installation & usage
45
49
 
46
- ingestr takes away the complexity of managing any backend or writing any code for ingesting data, simply run the command and watch the magic.
50
+ ingestr takes away the complexity of managing any backend or writing any code for ingesting data, simply run the command and watch the data land on its destination.
47
51
 
48
52
 
49
53
  ## Installation
@@ -67,6 +71,10 @@ This command will:
67
71
  - get the table `public.some_data` from the Postgres instance.
68
72
  - upload this data to your BigQuery warehouse under the schema `ingestr` and table `some_data`.
69
73
 
74
+ ## Documentation
75
+ You can see the full documentation [here](https://bruindata.github.com/ingestr).
76
+
77
+
70
78
  ## Supported Sources & Destinations
71
79
 
72
80
  | Database | Source | Destination |
@@ -79,4 +87,9 @@ This command will:
79
87
  | DuckDB | ✅ | ✅ |
80
88
  | Microsoft SQL Server | ✅ | ✅ |
81
89
  | SQLite | ✅ | ❌ |
82
- | MySQL | ✅ | ❌ |
90
+ | MySQL | ✅ | ❌ |
91
+
92
+ More to come soon!
93
+
94
+ ## Acknowledgements
95
+ This project would not have been possible without the amazing work done by the [SQLAlchemy](https://www.sqlalchemy.org/) and [dlt](https://dlthub.com/) teams. We relied on their work to connect to various sources and destinations, and built `ingestr` as a simple, opinionated wrapper around their work.
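
Note: the README excerpt above refers to an example command whose body falls outside the changed hunk. Based on the flags declared in `main.py` and exercised in `main_test.py`, an invocation of that shape would look roughly like the following; the URIs, credentials and table names are placeholders, not values taken from the package:

    # Illustrative only: assembles the argv for `ingestr ingest` the same way
    # the test helper in main_test.py does.
    args = [
        "ingestr", "ingest",
        "--source-uri", "postgresql://admin:admin@localhost:5432/web?sslmode=disable",
        "--source-table", "public.some_data",
        "--dest-uri", "bigquery://my-project?credentials_path=/path/to/credentials.json",
        "--dest-table", "ingestr.some_data",
        "--incremental-strategy", "delete+insert",
        "--incremental-key", "updated_at",
    ]
    print(" ".join(args))
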
ingestr-0.0.4.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
1
+ ingestr/main.py,sha256=fogv6KlxO2Y0fYEl36s0YVTF1S7TCmksQvL_siVI7rE,8703
2
+ ingestr/main_test.py,sha256=IQx2bjTnuixoY9ndgErBTaP3QE5AWvsLtv7ZPi8vJtA,19285
3
+ ingestr/src/destinations.py,sha256=LyA_26S3tHMeJiFwxX3XYV39lLOKqKACL0wWn3IGyP4,2673
4
+ ingestr/src/destinations_test.py,sha256=rgEk8EpAntFbSOwXovC4prv3RA22mwq8pIO6sZ_rYzg,4212
5
+ ingestr/src/factory.py,sha256=iBmp2spbUkkvOfwdRf6uo_5j9fasTTSWdS79Kc4jRQw,2141
6
+ ingestr/src/sources.py,sha256=WdbkY0S54H3rNy8kgOH3VBfE5oB0TSsOCCY5GBDU8Ss,1130
7
+ ingestr/src/sources_test.py,sha256=l_Obs7z3_WulTVa2ZeCDYOTT97Mj1nd1AgA3P5OivV0,1929
8
+ ingestr/src/sql_database/__init__.py,sha256=_hOl_RpOzzd2Sf6r0ETzcjr8gD9fjXRfrgr8QM4pe6w,2606
9
+ ingestr/src/sql_database/helpers.py,sha256=1yw-E9uTr4_6VnFxYpFBZkA76nC3UmXYwFxoxMRnhC0,4441
10
+ ingestr/src/sql_database/schema_types.py,sha256=PCIdLrT5Xc4vmoaf6OJSeXLlyN05alwgcQ-TDXd8hbQ,2153
11
+ ingestr/src/sql_database/settings.py,sha256=PaLPayAb1QGHHcPlrZ7eJ1fonDA6-sOGh-ZiueIFhRg,76
12
+ ingestr/src/telemetry/event.py,sha256=ByPdlu5YYRXKM6hnZa48uAKvvz5jOq4bP-OAn3Nc7bQ,426
13
+ ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
14
+ ingestr/testdata/test_append.db,sha256=OI0K5lwvwJpbTKdv3MkIq1RKiGA4SHclCXThPW0q4Xo,1060864
15
+ ingestr/testdata/test_create_replace.db,sha256=-ByzuQxPW5wa7hiVUHxvckGgEn1NpGZgN2zogsg80-U,536576
16
+ ingestr/testdata/test_delete_insert_with_timerange.db,sha256=ClL0WO4f3lq7kaEDHs8LTKKl5pdJUB9HncvamRZqsjY,1585152
17
+ ingestr/testdata/test_delete_insert_without_primary_key.db,sha256=feNVnNzfJY-DXQF84eBmW62YNKfB8WxXXt66Or9HqZ4,1847296
18
+ ingestr/testdata/test_merge_with_primary_key.db,sha256=t5fm_kKLtl4G2l8PLdXqOs3eAwEv8keS-sD7fwlRQDQ,1847296
19
+ ingestr-0.0.4.dist-info/METADATA,sha256=lgJ-p08vbVwbgCpWPhpWFFJoaTPeKSKrOVqIvlzHjso,3563
20
+ ingestr-0.0.4.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
21
+ ingestr-0.0.4.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
22
+ ingestr-0.0.4.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
23
+ ingestr-0.0.4.dist-info/RECORD,,
ingestr-0.0.2.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
1
- ingestr/main.py,sha256=CtBKVI13T1iGYQvOMjjvR66WlbntXuTmRe23k3GiIDo,5982
2
- ingestr/src/destinations.py,sha256=LyA_26S3tHMeJiFwxX3XYV39lLOKqKACL0wWn3IGyP4,2673
3
- ingestr/src/destinations_test.py,sha256=rgEk8EpAntFbSOwXovC4prv3RA22mwq8pIO6sZ_rYzg,4212
4
- ingestr/src/factory.py,sha256=YMIZSWY9ojrrl-M5_VC5eDR7TdBRF9Wem9pDCOw5DbU,2063
5
- ingestr/src/sources.py,sha256=8UeHfi1XY0C8us0fHQObkjLXDkWA_fAWhfBltUW2hTQ,956
6
- ingestr/src/sources_test.py,sha256=gcoEFpakLeYSl9cliwoSAakzr1R9pVml5V4H9ibMCZA,1825
7
- ingestr/src/sql_database/__init__.py,sha256=mLiaDGaz3h5LRCnmjPTUNhp1fIjgU5rCcBOQ2r3Iu1Q,4643
8
- ingestr/src/sql_database/helpers.py,sha256=r3b5MDpMjpV4U94gCKSOUH9t7YdP0OCAOqgpjbe7iSs,4442
9
- ingestr/src/sql_database/schema_types.py,sha256=PCIdLrT5Xc4vmoaf6OJSeXLlyN05alwgcQ-TDXd8hbQ,2153
10
- ingestr/src/sql_database/settings.py,sha256=PaLPayAb1QGHHcPlrZ7eJ1fonDA6-sOGh-ZiueIFhRg,76
11
- ingestr/src/testdata/fakebqcredentials.json,sha256=scc6TUc963KAbKTLZCfcmqVzbtzDCW1_8JNRnyAXyy8,628
12
- ingestr-0.0.2.dist-info/METADATA,sha256=SjqtV4QYsYp8nSRueNkUA_pdxMS2Mg0RONRo-Y2gdGQ,2972
13
- ingestr-0.0.2.dist-info/WHEEL,sha256=TJPnKdtrSue7xZ_AVGkp9YXcvDrobsjBds1du3Nx6dc,87
14
- ingestr-0.0.2.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
15
- ingestr-0.0.2.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
16
- ingestr-0.0.2.dist-info/RECORD,,