ingestr 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- {ingestr-0.3.0 → ingestr-0.3.2}/PKG-INFO +1 -1
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/notion.md +1 -18
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/main.py +41 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/main_test.py +16 -1
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/notion/__init__.py +1 -1
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sources.py +1 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sources_test.py +4 -2
- ingestr-0.3.2/ingestr/src/version.py +1 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/test_append.db +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/test_create_replace.db +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/test_merge_with_primary_key.db +0 -0
- ingestr-0.3.0/burakdb +0 -0
- ingestr-0.3.0/docs/supported-sources/images/notion_example.png +0 -0
- ingestr-0.3.0/ingestr/src/version.py +0 -1
- {ingestr-0.3.0 → ingestr-0.3.2}/.dockerignore +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/.github/workflows/docker.yml +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/.gitignore +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/Dockerfile +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/LICENSE.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/Makefile +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/README.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/.vitepress/config.mjs +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/commands/example-uris.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/commands/ingest.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/index.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/overview.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/snowflake.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/destinations.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/destinations_test.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/factory.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/factory_test.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/mongodb/__init__.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/mongodb/helpers.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/notion/helpers/__init__.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/notion/helpers/client.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/notion/helpers/database.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/notion/settings.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sql_database/__init__.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sql_database/helpers.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sql_database/override.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/sql_database/schema_types.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/package-lock.json +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/package.json +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/pyproject.toml +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/requirements-dev.txt +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/requirements.txt +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/resources/demo.gif +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/resources/demo.tape +0 -0
- {ingestr-0.3.0 → ingestr-0.3.2}/resources/ingestr.svg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: ingestr
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
|
|
5
5
|
Project-URL: Homepage, https://github.com/bruin-data/ingestr
|
|
6
6
|
Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
|
|
@@ -25,24 +25,7 @@ Once you complete the guide, you should have an API key, and the table ID to con
|
|
|
25
25
|
ingestr ingest --source-uri 'notion://?api_key=secret_12345' --source-table 'bfeaafc0c25f40a9asdasd672a9456f3' --dest-uri duckdb:///notion.duckdb --dest-table 'notion.output'
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
The result of this command will be a
|
|
29
|
-
|
|
30
|
-
Take a look at the following Notion table:
|
|
31
|
-

|
|
32
|
-
|
|
33
|
-
Ingesting this table using ingestr will create a bunch of new tables with quite a lot of details in them. The following query is a reconstruction of the table as it looks on Notion:
|
|
34
|
-
|
|
35
|
-
```sql
|
|
36
|
-
select n.text__content, s.text__content, o.properties__numerical_value__number, r.text__content
|
|
37
|
-
from notion.output o
|
|
38
|
-
join notion.output__properties__name__title n on n._dlt_parent_id = o._dlt_id
|
|
39
|
-
join notion.output__properties__another_col__rich_text r on r._dlt_parent_id = o._dlt_id
|
|
40
|
-
join notion.output__properties__second_value__rich_text s on s._dlt_parent_id = o._dlt_id
|
|
41
|
-
order by 1;
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
Take this as a starting point and play around with the data.
|
|
45
|
-
|
|
28
|
+
The result of this command will be a table in the `notion.duckdb` database with JSON columns.
|
|
46
29
|
|
|
47
30
|
> [!CAUTION]
|
|
48
31
|
> Notion does not support incremental loading, which means every time you run the command, it will copy the entire table from Notion to the destination. This can be slow for large tables.
|
|
@@ -32,6 +32,18 @@ DATE_FORMATS = [
|
|
|
32
32
|
"%Y-%m-%dT%H:%M:%S.%f%z",
|
|
33
33
|
]
|
|
34
34
|
|
|
35
|
+
# https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
|
|
36
|
+
PARQUET_SUPPORTED_DESTINATIONS = [
|
|
37
|
+
"bigquery",
|
|
38
|
+
"duckdb",
|
|
39
|
+
"snowflake",
|
|
40
|
+
"databricks",
|
|
41
|
+
"synapse",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
# these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
|
|
45
|
+
JSON_RETURNING_SOURCES = ["notion"]
|
|
46
|
+
|
|
35
47
|
|
|
36
48
|
class SpinnerCollector(Collector):
|
|
37
49
|
status: Status
|
|
@@ -155,6 +167,20 @@ def ingest(
|
|
|
155
167
|
envvar="PROGRESS",
|
|
156
168
|
),
|
|
157
169
|
] = "interactive", # type: ignore
|
|
170
|
+
sql_backend: Annotated[
|
|
171
|
+
Optional[str],
|
|
172
|
+
typer.Option(
|
|
173
|
+
help="The SQL backend to use, must be one of 'sqlalchemy', 'pyarrow'",
|
|
174
|
+
envvar="SQL_BACKEND",
|
|
175
|
+
),
|
|
176
|
+
] = "pyarrow", # type: ignore
|
|
177
|
+
loader_file_format: Annotated[
|
|
178
|
+
Optional[str],
|
|
179
|
+
typer.Option(
|
|
180
|
+
help="The file format to use when loading data, must be one of 'jsonl', 'parquet', 'default'",
|
|
181
|
+
envvar="LOADER_FILE_FORMAT",
|
|
182
|
+
),
|
|
183
|
+
] = "default", # type: ignore
|
|
158
184
|
):
|
|
159
185
|
track(
|
|
160
186
|
"command_triggered",
|
|
@@ -240,11 +266,25 @@ def ingest(
|
|
|
240
266
|
merge_key=merge_key,
|
|
241
267
|
interval_start=interval_start,
|
|
242
268
|
interval_end=interval_end,
|
|
269
|
+
sql_backend=sql_backend,
|
|
243
270
|
)
|
|
244
271
|
|
|
245
272
|
if original_incremental_strategy == "delete+insert":
|
|
246
273
|
dlt_source.incremental.primary_key = ()
|
|
247
274
|
|
|
275
|
+
if (
|
|
276
|
+
factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
|
|
277
|
+
and loader_file_format == "default"
|
|
278
|
+
):
|
|
279
|
+
loader_file_format = "parquet"
|
|
280
|
+
|
|
281
|
+
# if the source is a JSON returning source, we cannot use Parquet loader for BigQuery
|
|
282
|
+
if factory.destination_scheme == 'bigquery' and factory.source_scheme in JSON_RETURNING_SOURCES:
|
|
283
|
+
loader_file_format = "jsonl"
|
|
284
|
+
|
|
285
|
+
elif loader_file_format == "default":
|
|
286
|
+
loader_file_format = "jsonl"
|
|
287
|
+
|
|
248
288
|
run_info = pipeline.run(
|
|
249
289
|
dlt_source,
|
|
250
290
|
**destination.dlt_run_params(
|
|
@@ -253,6 +293,7 @@ def ingest(
|
|
|
253
293
|
),
|
|
254
294
|
write_disposition=incremental_strategy, # type: ignore
|
|
255
295
|
primary_key=(primary_key if primary_key and len(primary_key) > 0 else None), # type: ignore
|
|
296
|
+
loader_file_format=loader_file_format, # type: ignore
|
|
256
297
|
)
|
|
257
298
|
|
|
258
299
|
destination.post_load()
|
|
@@ -24,6 +24,8 @@ def invoke_ingest_command(
|
|
|
24
24
|
merge_key=None,
|
|
25
25
|
interval_start=None,
|
|
26
26
|
interval_end=None,
|
|
27
|
+
sql_backend=None,
|
|
28
|
+
loader_file_format=None,
|
|
27
29
|
):
|
|
28
30
|
args = [
|
|
29
31
|
"ingest",
|
|
@@ -61,6 +63,14 @@ def invoke_ingest_command(
|
|
|
61
63
|
args.append("--interval-end")
|
|
62
64
|
args.append(interval_end)
|
|
63
65
|
|
|
66
|
+
if sql_backend:
|
|
67
|
+
args.append("--sql-backend")
|
|
68
|
+
args.append(sql_backend)
|
|
69
|
+
|
|
70
|
+
if loader_file_format:
|
|
71
|
+
args.append("--loader-file-format")
|
|
72
|
+
args.append(loader_file_format)
|
|
73
|
+
|
|
64
74
|
result = runner.invoke(
|
|
65
75
|
app,
|
|
66
76
|
args,
|
|
@@ -93,7 +103,6 @@ def test_create_replace():
|
|
|
93
103
|
"testschema.output",
|
|
94
104
|
)
|
|
95
105
|
|
|
96
|
-
print(result.stdout)
|
|
97
106
|
assert result.exit_code == 0
|
|
98
107
|
|
|
99
108
|
res = conn.sql(
|
|
@@ -138,6 +147,7 @@ def test_append():
|
|
|
138
147
|
"testschema_append.output",
|
|
139
148
|
"append",
|
|
140
149
|
"updated_at",
|
|
150
|
+
sql_backend="sqlalchemy",
|
|
141
151
|
)
|
|
142
152
|
assert res.exit_code == 0
|
|
143
153
|
|
|
@@ -194,6 +204,7 @@ def test_merge_with_primary_key():
|
|
|
194
204
|
"merge",
|
|
195
205
|
"updated_at",
|
|
196
206
|
"id",
|
|
207
|
+
sql_backend="sqlalchemy",
|
|
197
208
|
)
|
|
198
209
|
assert res.exit_code == 0
|
|
199
210
|
return res
|
|
@@ -351,6 +362,8 @@ def test_delete_insert_without_primary_key():
|
|
|
351
362
|
"testschema_delete_insert.output",
|
|
352
363
|
inc_strategy="delete+insert",
|
|
353
364
|
inc_key="updated_at",
|
|
365
|
+
sql_backend="sqlalchemy",
|
|
366
|
+
loader_file_format="jsonl",
|
|
354
367
|
)
|
|
355
368
|
assert res.exit_code == 0
|
|
356
369
|
return res
|
|
@@ -465,6 +478,8 @@ def test_delete_insert_with_timerange():
|
|
|
465
478
|
inc_key="updated_at",
|
|
466
479
|
interval_start=start_date,
|
|
467
480
|
interval_end=end_date,
|
|
481
|
+
sql_backend="sqlalchemy",
|
|
482
|
+
loader_file_format="jsonl",
|
|
468
483
|
)
|
|
469
484
|
assert res.exit_code == 0
|
|
470
485
|
return res
|
|
@@ -22,10 +22,11 @@ class SqlSourceTest(unittest.TestCase):
|
|
|
22
22
|
table = "schema.table"
|
|
23
23
|
|
|
24
24
|
# monkey patch the sql_table function
|
|
25
|
-
def sql_table(credentials, schema, table, incremental, merge_key):
|
|
25
|
+
def sql_table(credentials, schema, table, incremental, merge_key, backend):
|
|
26
26
|
self.assertEqual(credentials, uri)
|
|
27
27
|
self.assertEqual(schema, "schema")
|
|
28
28
|
self.assertEqual(table, "table")
|
|
29
|
+
self.assertEqual(backend, "sqlalchemy")
|
|
29
30
|
self.assertIsNone(incremental)
|
|
30
31
|
self.assertIsNone(merge_key)
|
|
31
32
|
return dlt.resource()
|
|
@@ -40,10 +41,11 @@ class SqlSourceTest(unittest.TestCase):
|
|
|
40
41
|
incremental_key = "id"
|
|
41
42
|
|
|
42
43
|
# monkey patch the sql_table function
|
|
43
|
-
def sql_table(credentials, schema, table, incremental, merge_key):
|
|
44
|
+
def sql_table(credentials, schema, table, incremental, merge_key, backend):
|
|
44
45
|
self.assertEqual(credentials, uri)
|
|
45
46
|
self.assertEqual(schema, "schema")
|
|
46
47
|
self.assertEqual(table, "table")
|
|
48
|
+
self.assertEqual(backend, "sqlalchemy")
|
|
47
49
|
self.assertIsInstance(incremental, dlt.sources.incremental)
|
|
48
50
|
self.assertEqual(incremental.cursor_path, incremental_key)
|
|
49
51
|
self.assertIsNone(merge_key)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.2"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
ingestr-0.3.0/burakdb
DELETED
|
Binary file
|
|
Binary file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.3.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|