ingestr 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- {ingestr-0.3.1 → ingestr-0.3.3}/PKG-INFO +1 -1
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/notion.md +1 -18
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/main.py +58 -23
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/__init__.py +1 -1
- ingestr-0.3.3/ingestr/src/version.py +1 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_append.db +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_create_replace.db +0 -0
- ingestr-0.3.1/ingestr/testdata/test_merge_with_primary_key.db → ingestr-0.3.3/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- ingestr-0.3.1/ingestr/testdata/test_delete_insert_with_timerange.db → ingestr-0.3.3/ingestr/testdata/test_merge_with_primary_key.db +0 -0
- ingestr-0.3.1/docs/supported-sources/images/notion_example.png +0 -0
- ingestr-0.3.1/ingestr/src/version.py +0 -1
- {ingestr-0.3.1 → ingestr-0.3.3}/.dockerignore +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/.github/workflows/docker.yml +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/.gitignore +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/Dockerfile +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/LICENSE.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/Makefile +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/README.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/config.mjs +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/commands/example-uris.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/commands/ingest.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/index.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/overview.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/main_test.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/destinations.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/destinations_test.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/factory.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/factory_test.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/mongodb/__init__.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/mongodb/helpers.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/__init__.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/client.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/helpers/database.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/notion/settings.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sources.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sources_test.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/__init__.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/helpers.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/override.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/sql_database/schema_types.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/package-lock.json +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/package.json +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/pyproject.toml +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/requirements-dev.txt +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/requirements.txt +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/resources/demo.gif +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/resources/demo.tape +0 -0
- {ingestr-0.3.1 → ingestr-0.3.3}/resources/ingestr.svg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -25,24 +25,7 @@ Once you complete the guide, you should have an API key, and the table ID to con
|
|
|
25
25
|
ingestr ingest --source-uri 'notion://?api_key=secret_12345' --source-table 'bfeaafc0c25f40a9asdasd672a9456f3' --dest-uri duckdb:///notion.duckdb --dest-table 'notion.output'
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
The result of this command will be a
|
|
29
|
-
|
|
30
|
-
Take a look at the following Notion table:
|
|
31
|
-

|
|
32
|
-
|
|
33
|
-
Ingesting this table using ingestr will create a bunch of new tables with quite a lot of details in them. The following query is a reconstruction of the table as it looks on Notion:
|
|
34
|
-
|
|
35
|
-
```sql
|
|
36
|
-
select n.text__content, s.text__content, o.properties__numerical_value__number, r.text__content
|
|
37
|
-
from notion.output o
|
|
38
|
-
join notion.output__properties__name__title n on n._dlt_parent_id = o._dlt_id
|
|
39
|
-
join notion.output__properties__another_col__rich_text r on r._dlt_parent_id = o._dlt_id
|
|
40
|
-
join notion.output__properties__second_value__rich_text s on s._dlt_parent_id = o._dlt_id
|
|
41
|
-
order by 1;
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
Take this as a starting point and play around with the data.
|
|
45
|
-
|
|
28
|
+
The result of this command will be a table in the `notion.duckdb` database with JSON columns.
|
|
46
29
|
|
|
47
30
|
> [!CAUTION]
|
|
48
31
|
> Notion does not support incremental loading, which means every time you run the command, it will copy the entire table from Notion to the destination. This can be slow for large tables.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
3
4
|
from typing import Optional
|
|
4
5
|
|
|
5
6
|
import dlt
|
|
@@ -41,6 +42,9 @@ PARQUET_SUPPORTED_DESTINATIONS = [
|
|
|
41
42
|
"synapse",
|
|
42
43
|
]
|
|
43
44
|
|
|
45
|
+
# these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
|
|
46
|
+
JSON_RETURNING_SOURCES = ["notion"]
|
|
47
|
+
|
|
44
48
|
|
|
45
49
|
class SpinnerCollector(Collector):
|
|
46
50
|
status: Status
|
|
@@ -80,6 +84,30 @@ class SpinnerCollector(Collector):
|
|
|
80
84
|
self.status.stop()
|
|
81
85
|
|
|
82
86
|
|
|
87
|
+
class IncrementalStrategy(str, Enum):
|
|
88
|
+
create_replace = "replace"
|
|
89
|
+
append = "append"
|
|
90
|
+
delete_insert = "delete+insert"
|
|
91
|
+
merge = "merge"
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class LoaderFileFormat(str, Enum):
|
|
95
|
+
jsonl = "jsonl"
|
|
96
|
+
parquet = "parquet"
|
|
97
|
+
insert_values = "insert_values"
|
|
98
|
+
csv = "csv"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class SqlBackend(str, Enum):
|
|
102
|
+
sqlalchemy = "sqlalchemy"
|
|
103
|
+
pyarrow = "pyarrow"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class Progress(str, Enum):
|
|
107
|
+
interactive = "interactive"
|
|
108
|
+
log = "log"
|
|
109
|
+
|
|
110
|
+
|
|
83
111
|
@app.command()
|
|
84
112
|
def ingest(
|
|
85
113
|
source_uri: Annotated[
|
|
@@ -114,12 +142,12 @@ def ingest(
|
|
|
114
142
|
),
|
|
115
143
|
] = None, # type: ignore
|
|
116
144
|
incremental_strategy: Annotated[
|
|
117
|
-
|
|
145
|
+
IncrementalStrategy,
|
|
118
146
|
typer.Option(
|
|
119
|
-
help="The incremental strategy to use
|
|
147
|
+
help="The incremental strategy to use",
|
|
120
148
|
envvar="INCREMENTAL_STRATEGY",
|
|
121
149
|
),
|
|
122
|
-
] =
|
|
150
|
+
] = IncrementalStrategy.create_replace, # type: ignore
|
|
123
151
|
interval_start: Annotated[
|
|
124
152
|
Optional[datetime],
|
|
125
153
|
typer.Option(
|
|
@@ -158,26 +186,26 @@ def ingest(
|
|
|
158
186
|
),
|
|
159
187
|
] = False, # type: ignore
|
|
160
188
|
progress: Annotated[
|
|
161
|
-
|
|
189
|
+
Progress,
|
|
162
190
|
typer.Option(
|
|
163
191
|
help="The progress display type, must be one of 'interactive', 'log'",
|
|
164
192
|
envvar="PROGRESS",
|
|
165
193
|
),
|
|
166
|
-
] =
|
|
194
|
+
] = Progress.interactive, # type: ignore
|
|
167
195
|
sql_backend: Annotated[
|
|
168
|
-
|
|
196
|
+
SqlBackend,
|
|
169
197
|
typer.Option(
|
|
170
|
-
help="The SQL backend to use
|
|
198
|
+
help="The SQL backend to use",
|
|
171
199
|
envvar="SQL_BACKEND",
|
|
172
200
|
),
|
|
173
|
-
] =
|
|
201
|
+
] = SqlBackend.pyarrow, # type: ignore
|
|
174
202
|
loader_file_format: Annotated[
|
|
175
|
-
Optional[
|
|
203
|
+
Optional[LoaderFileFormat],
|
|
176
204
|
typer.Option(
|
|
177
|
-
help="The file format to use when loading data
|
|
205
|
+
help="The file format to use when loading data",
|
|
178
206
|
envvar="LOADER_FILE_FORMAT",
|
|
179
207
|
),
|
|
180
|
-
] =
|
|
208
|
+
] = None, # type: ignore
|
|
181
209
|
):
|
|
182
210
|
track(
|
|
183
211
|
"command_triggered",
|
|
@@ -207,15 +235,15 @@ def ingest(
|
|
|
207
235
|
original_incremental_strategy = incremental_strategy
|
|
208
236
|
|
|
209
237
|
merge_key = None
|
|
210
|
-
if incremental_strategy == "delete+insert":
|
|
238
|
+
if incremental_strategy == IncrementalStrategy.delete_insert:
|
|
211
239
|
merge_key = incremental_key
|
|
212
|
-
incremental_strategy = "merge"
|
|
240
|
+
incremental_strategy = IncrementalStrategy.merge
|
|
213
241
|
|
|
214
242
|
m = hashlib.sha256()
|
|
215
243
|
m.update(dest_table.encode("utf-8"))
|
|
216
244
|
|
|
217
245
|
progressInstance: Collector = SpinnerCollector()
|
|
218
|
-
if progress == "log":
|
|
246
|
+
if progress == Progress.log:
|
|
219
247
|
progressInstance = LogCollector()
|
|
220
248
|
|
|
221
249
|
pipeline = dlt.pipeline(
|
|
@@ -237,7 +265,7 @@ def ingest(
|
|
|
237
265
|
f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
|
|
238
266
|
)
|
|
239
267
|
print(
|
|
240
|
-
f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy}"
|
|
268
|
+
f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy.value}"
|
|
241
269
|
)
|
|
242
270
|
print(
|
|
243
271
|
f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
|
|
@@ -263,19 +291,24 @@ def ingest(
|
|
|
263
291
|
merge_key=merge_key,
|
|
264
292
|
interval_start=interval_start,
|
|
265
293
|
interval_end=interval_end,
|
|
266
|
-
sql_backend=sql_backend,
|
|
294
|
+
sql_backend=sql_backend.value,
|
|
267
295
|
)
|
|
268
296
|
|
|
269
|
-
if original_incremental_strategy == "delete+insert":
|
|
297
|
+
if original_incremental_strategy == IncrementalStrategy.delete_insert:
|
|
270
298
|
dlt_source.incremental.primary_key = ()
|
|
271
299
|
|
|
272
300
|
if (
|
|
273
301
|
factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
|
|
274
|
-
and loader_file_format
|
|
302
|
+
and loader_file_format is None
|
|
275
303
|
):
|
|
276
|
-
loader_file_format =
|
|
277
|
-
|
|
278
|
-
|
|
304
|
+
loader_file_format = LoaderFileFormat.parquet
|
|
305
|
+
|
|
306
|
+
# if the source is a JSON returning source, we cannot use Parquet loader for BigQuery
|
|
307
|
+
if (
|
|
308
|
+
factory.destination_scheme == "bigquery"
|
|
309
|
+
and factory.source_scheme in JSON_RETURNING_SOURCES
|
|
310
|
+
):
|
|
311
|
+
loader_file_format = None
|
|
279
312
|
|
|
280
313
|
run_info = pipeline.run(
|
|
281
314
|
dlt_source,
|
|
@@ -283,9 +316,11 @@ def ingest(
|
|
|
283
316
|
uri=dest_uri,
|
|
284
317
|
table=dest_table,
|
|
285
318
|
),
|
|
286
|
-
write_disposition=incremental_strategy, # type: ignore
|
|
319
|
+
write_disposition=incremental_strategy.value, # type: ignore
|
|
287
320
|
primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
|
|
288
|
-
loader_file_format=loader_file_format
|
|
321
|
+
loader_file_format=loader_file_format.value
|
|
322
|
+
if loader_file_format is not None
|
|
323
|
+
else None, # type: ignore
|
|
289
324
|
)
|
|
290
325
|
|
|
291
326
|
destination.post_load()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.3"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|