ingestr 0.2.6__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- {ingestr-0.2.6 → ingestr-0.3.0}/Makefile +1 -1
- {ingestr-0.2.6 → ingestr-0.3.0}/PKG-INFO +87 -23
- {ingestr-0.2.6 → ingestr-0.3.0}/README.md +78 -14
- ingestr-0.3.0/burakdb +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/.vitepress/config.mjs +29 -20
- ingestr-0.3.0/docs/supported-sources/images/notion_example.png +0 -0
- ingestr-0.3.0/docs/supported-sources/notion.md +49 -0
- ingestr-0.3.0/docs/supported-sources/overview.md +84 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/snowflake.md +3 -2
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/main.py +13 -4
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/main_test.py +16 -11
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/destinations.py +2 -1
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/factory.py +3 -1
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/mongodb/__init__.py +1 -1
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/mongodb/helpers.py +5 -5
- ingestr-0.3.0/ingestr/src/notion/__init__.py +55 -0
- ingestr-0.3.0/ingestr/src/notion/helpers/__init__.py +0 -0
- ingestr-0.3.0/ingestr/src/notion/helpers/client.py +164 -0
- ingestr-0.3.0/ingestr/src/notion/helpers/database.py +78 -0
- ingestr-0.3.0/ingestr/src/notion/settings.py +3 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/sources.py +24 -0
- ingestr-0.3.0/ingestr/src/sql_database/__init__.py +172 -0
- ingestr-0.3.0/ingestr/src/sql_database/helpers.py +258 -0
- ingestr-0.3.0/ingestr/src/sql_database/override.py +9 -0
- ingestr-0.3.0/ingestr/src/sql_database/schema_types.py +162 -0
- ingestr-0.3.0/ingestr/src/version.py +1 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/testdata/test_append.db +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/testdata/test_create_replace.db +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- ingestr-0.2.6/ingestr/testdata/test_merge_with_primary_key.db → ingestr-0.3.0/ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- ingestr-0.2.6/ingestr/testdata/test_delete_insert_without_primary_key.db → ingestr-0.3.0/ingestr/testdata/test_merge_with_primary_key.db +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/pyproject.toml +12 -1
- ingestr-0.3.0/requirements-dev.txt +9 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/requirements.txt +8 -8
- ingestr-0.2.6/docs/supported-sources/overview.md +0 -19
- ingestr-0.2.6/ingestr/src/sql_database/__init__.py +0 -60
- ingestr-0.2.6/ingestr/src/sql_database/helpers.py +0 -128
- ingestr-0.2.6/ingestr/src/sql_database/schema_types.py +0 -54
- ingestr-0.2.6/ingestr/src/sql_database/settings.py +0 -3
- ingestr-0.2.6/ingestr/src/version.py +0 -1
- ingestr-0.2.6/requirements-dev.txt +0 -10
- {ingestr-0.2.6 → ingestr-0.3.0}/.dockerignore +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/.github/workflows/deploy-docs.yml +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/.github/workflows/docker.yml +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/.gitignore +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/Dockerfile +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/LICENSE.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/.vitepress/theme/custom.css +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/.vitepress/theme/index.js +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/commands/example-uris.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/commands/ingest.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/getting-started/core-concepts.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/getting-started/incremental-loading.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/getting-started/quickstart.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/getting-started/telemetry.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/index.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/bigquery.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/csv.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/databricks.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/duckdb.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/mongodb.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/mssql.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/mysql.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/oracle.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/postgres.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/redshift.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/sqlite.md +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/destinations_test.py +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/factory_test.py +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/sources_test.py +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/telemetry/event.py +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/testdata/fakebqcredentials.json +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/ingestr/testdata/.gitignore +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/package-lock.json +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/package.json +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/resources/demo.gif +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/resources/demo.tape +0 -0
- {ingestr-0.2.6 → ingestr-0.3.0}/resources/ingestr.svg +0 -0
{ingestr-0.2.6 → ingestr-0.3.0}/Makefile

```diff
@@ -22,7 +22,7 @@ test-specific: venv
 
 lint-ci:
 	ruff check ingestr --fix && ruff format ingestr
-	mypy
+	mypy --config-file pyproject.toml --explicit-package-bases ingestr
 
 lint: venv
 	. venv/bin/activate; $(MAKE) lint-ci
```
{ingestr-0.2.6 → ingestr-0.3.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ingestr
-Version: 0.2.6
+Version: 0.3.0
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -16,26 +16,26 @@ Classifier: Topic :: Database
 Requires-Python: >=3.9
 Requires-Dist: cx-oracle==8.3.0
 Requires-Dist: databricks-sql-connector==2.9.3
-Requires-Dist: dlt==0.4.
-Requires-Dist: duckdb-engine==0.11.
-Requires-Dist: duckdb==0.10.
+Requires-Dist: dlt==0.4.8
+Requires-Dist: duckdb-engine==0.11.5
+Requires-Dist: duckdb==0.10.2
 Requires-Dist: google-cloud-bigquery-storage==2.24.0
 Requires-Dist: pendulum==3.0.0
 Requires-Dist: psycopg2-binary==2.9.9
 Requires-Dist: py-machineid==0.5.1
-Requires-Dist: pymongo==4.6.
+Requires-Dist: pymongo==4.6.3
 Requires-Dist: pymysql==1.1.0
 Requires-Dist: pyodbc==5.1.0
 Requires-Dist: redshift-connector==2.1.0
-Requires-Dist: rich==13.7.
+Requires-Dist: rich==13.7.1
 Requires-Dist: rudder-sdk-python==2.1.0
-Requires-Dist: snowflake-sqlalchemy==1.5.
-Requires-Dist: sqlalchemy-bigquery==1.
+Requires-Dist: snowflake-sqlalchemy==1.5.3
+Requires-Dist: sqlalchemy-bigquery==1.11.0
 Requires-Dist: sqlalchemy-redshift==0.8.14
 Requires-Dist: sqlalchemy2-stubs==0.0.2a38
 Requires-Dist: sqlalchemy==1.4.52
 Requires-Dist: tqdm==4.66.2
-Requires-Dist: typer==0.
+Requires-Dist: typer==0.12.3
 Description-Content-Type: text/markdown
 
 <div align="center">
@@ -91,20 +91,84 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
 
 ## Supported Sources & Destinations
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+<table>
+<tr>
+<th></th>
+<th>Source</th>
+<th>Destination</th>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Databases</strong></td>
+</tr>
+<tr>
+<td>Postgres</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>BigQuery</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Snowflake</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Redshift</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Databricks</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>DuckDB</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Microsoft SQL Server</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Local CSV file</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>MongoDB</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>Oracle</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>SQLite</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>MySQL</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
+</tr>
+<tr>
+<td>Notion</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+</table>
 
 More to come soon!
 
```
{ingestr-0.2.6 → ingestr-0.3.0}/README.md

```diff
@@ -51,20 +51,84 @@ Join our Slack community [here](https://join.slack.com/t/bruindatacommunity/shar
 
 ## Supported Sources & Destinations
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+<table>
+<tr>
+<th></th>
+<th>Source</th>
+<th>Destination</th>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Databases</strong></td>
+</tr>
+<tr>
+<td>Postgres</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>BigQuery</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Snowflake</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Redshift</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Databricks</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>DuckDB</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Microsoft SQL Server</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Local CSV file</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>MongoDB</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>Oracle</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>SQLite</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>MySQL</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
+</tr>
+<tr>
+<td>Notion</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+</table>
 
 More to come soon!
 
```
ingestr-0.3.0/burakdb ADDED

Binary file
{ingestr-0.2.6 → ingestr-0.3.0}/docs/.vitepress/config.mjs

```diff
@@ -4,20 +4,17 @@ import { defineConfig } from "vitepress";
 export default defineConfig({
   title: "ingestr",
   description: "Ingest & copy data between any source and any destination",
-  base:
+  base: "/ingestr/",
   head: [
+    ["script", { async: "", src: "https://www.googletagmanager.com/gtag/js?id=G-MZJ20PP4MJ" }],
     [
-
-      { async: '', src: 'https://www.googletagmanager.com/gtag/js?id=G-MZJ20PP4MJ' }
-    ],
-    [
-      'script',
+      "script",
       {},
       `window.dataLayer = window.dataLayer || [];
       function gtag(){dataLayer.push(arguments);}
       gtag('js', new Date());
-      gtag('config', 'G-MZJ20PP4MJ')
-    ]
+      gtag('config', 'G-MZJ20PP4MJ');`,
+    ],
   ],
   themeConfig: {
     // https://vitepress.dev/reference/default-theme-config
@@ -46,19 +46,31 @@ export default defineConfig({
       {
         text: "Sources & Destinations",
         items: [
-          { text: "AWS Redshift", link: "/supported-sources/redshift.md" },
-          { text: "Databricks", link: "/supported-sources/databricks.md" },
-          { text: "DuckDB", link: "/supported-sources/duckdb.md" },
-          { text: "Google BigQuery", link: "/supported-sources/bigquery.md" },
-          { text: "Local CSV Files", link: "/supported-sources/csv.md" },
-          { text: "Microsoft SQL Server", link: "/supported-sources/mssql.md" },
-          { text: "MongoDB", link: "/supported-sources/mongodb.md" },
-          { text: "MySQL", link: "/supported-sources/mysql.md" },
-          { text: "Oracle", link: "/supported-sources/oracle.md" },
           { text: "Overview", link: "/supported-sources/overview.md" },
-          {
-
-
+          {
+            text: "Databases",
+            collapsed: false,
+            items: [
+              { text: "AWS Redshift", link: "/supported-sources/redshift.md" },
+              { text: "Databricks", link: "/supported-sources/databricks.md" },
+              { text: "DuckDB", link: "/supported-sources/duckdb.md" },
+              { text: "Google BigQuery", link: "/supported-sources/bigquery.md" },
+              { text: "Local CSV Files", link: "/supported-sources/csv.md" },
+              { text: "Microsoft SQL Server", link: "/supported-sources/mssql.md" },
+              { text: "MongoDB", link: "/supported-sources/mongodb.md" },
+              { text: "MySQL", link: "/supported-sources/mysql.md" },
+              { text: "Oracle", link: "/supported-sources/oracle.md" },
+              { text: "Postgres", link: "/supported-sources/postgres.md" },
+              { text: "Snowflake", link: "/supported-sources/snowflake.md" },
+              { text: "SQLite", link: "/supported-sources/sqlite.md" },
+            ],
+          },
+
+          {
+            text: "Platforms",
+            collapsed: false,
+            items: [{ text: "Notion", link: "/supported-sources/notion.md" }],
+          },
         ],
       },
     ],
```
ingestr-0.3.0/docs/supported-sources/images/notion_example.png ADDED

Binary file
ingestr-0.3.0/docs/supported-sources/notion.md ADDED

````diff
@@ -0,0 +1,49 @@
+# Notion
+[Notion](https://www.notion.so/) is an all-in-one workspace for note-taking, project management, and database management.
+
+ingestr supports Notion as a source.
+
+## URI Format
+The URI format for Notion is as follows:
+
+```plaintext
+notion://?api_key=token
+```
+
+URI parameters:
+- `api_key`: the integration token used for authentication with the Notion API
+
+The URI is used to connect to the Notion API for extracting data. More details on setting up Notion integrations can be found [here](https://developers.notion.com/docs/getting-started).
+
+## Setting up a Notion Integration
+
+Notion requires a few steps to set up an integration; please follow the guide dltHub [has built here](https://dlthub.com/docs/dlt-ecosystem/verified-sources/notion#setup-guide).
+
+Once you complete the guide, you should have an API key and the ID of the table to connect to. Let's say your API token is `secret_12345` and the database you'd like to connect to is `bfeaafc0c25f40a9asdasd672a9456f3`; here's a sample command that will copy the data from the Notion table into a DuckDB database:
+
+```sh
+ingestr ingest --source-uri 'notion://?api_key=secret_12345' --source-table 'bfeaafc0c25f40a9asdasd672a9456f3' --dest-uri duckdb:///notion.duckdb --dest-table 'notion.output'
+```
+
+The result of this command will be a set of tables in the `notion.duckdb` database. The Notion integration creates several extra tables in the schema to keep track of additional information about every field in a database. Take some time to play around with the data and understand how it's structured, and take a good look at the `_dlt_parent_id` column in the tables to understand the relationships between them.
+
+Take a look at the following Notion table:
+
+
+Ingesting this table using ingestr will create several new tables with quite a lot of detail in them. The following query is a reconstruction of the table as it looks on Notion:
+
+```sql
+select n.text__content, s.text__content, o.properties__numerical_value__number, r.text__content
+from notion.output o
+join notion.output__properties__name__title n on n._dlt_parent_id = o._dlt_id
+join notion.output__properties__another_col__rich_text r on r._dlt_parent_id = o._dlt_id
+join notion.output__properties__second_value__rich_text s on s._dlt_parent_id = o._dlt_id
+order by 1;
+```
+
+Take this as a starting point and play around with the data.
+
+
+> [!CAUTION]
+> Notion does not support incremental loading, which means every time you run the command, it will copy the entire table from Notion to the destination. This can be slow for large tables.
+
````
ingestr-0.3.0/docs/supported-sources/overview.md ADDED

```diff
@@ -0,0 +1,84 @@
+# Supported Sources & Destinations
+ingestr supports the following sources and destinations:
+
+
+<table>
+<tr>
+<th></th>
+<th>Source</th>
+<th>Destination</th>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Databases</strong></td>
+</tr>
+<tr>
+<td>Postgres</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>BigQuery</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Snowflake</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Redshift</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Databricks</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>DuckDB</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Microsoft SQL Server</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>Local CSV file</td>
+<td>✅</td>
+<td>✅</td>
+</tr>
+<tr>
+<td>MongoDB</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>Oracle</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>SQLite</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td>MySQL</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+<tr>
+<td colspan="3" style='text-align:center;'><strong>Platforms</strong></td>
+</tr>
+<tr>
+<td>Notion</td>
+<td>✅</td>
+<td>❌</td>
+</tr>
+</table>
+
+More to come soon!
```
{ingestr-0.2.6 → ingestr-0.3.0}/docs/supported-sources/snowflake.md

````diff
@@ -7,7 +7,7 @@ ingestr supports Snowflake as both a source and destination.
 The URI format for Snowflake is as follows:
 
 ```plaintext
-snowflake://user:password@account/dbname?warehouse=COMPUTE_WH
+snowflake://user:password@account/dbname?warehouse=COMPUTE_WH&role=data_scientist
 ```
 
 URI parameters:
@@ -15,6 +15,7 @@ URI parameters:
 - `password`: the password for the user
 - `account`: your Snowflake account identifier
 - `dbname`: the name of the database to connect to
-- `warehouse`: the name of the warehouse to use
+- `warehouse`: the name of the warehouse to use (optional)
+- `role`: the name of the role to use (optional)
 
 The same URI structure can be used both for sources and destinations. You can read more about SQLAlchemy's Snowflake dialect [here](https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#connection-parameters).
````
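The new `role` parameter slots into a full command like any other URI option. As a hedged illustration in the style of the docs' other examples, here is a hypothetical invocation; the account, credentials, warehouse, and table names below are placeholders, not values from this release:

```sh
# Hypothetical: copy a Snowflake table into DuckDB while assuming a role.
# Every identifier here is made up for illustration.
ingestr ingest \
    --source-uri 'snowflake://jdoe:secret@my_account/analytics?warehouse=COMPUTE_WH&role=data_scientist' \
    --source-table 'public.events' \
    --dest-uri 'duckdb:///events.duckdb' \
    --dest-table 'raw.events'
```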
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/main.py

```diff
@@ -5,7 +5,7 @@ from typing import Optional
 import dlt
 import humanize
 import typer
-from dlt.common.runtime.collector import Collector
+from dlt.common.runtime.collector import Collector, LogCollector
 from rich.console import Console
 from rich.status import Status
 from typing_extensions import Annotated
@@ -34,8 +34,6 @@ DATE_FORMATS = [
 
 
 class SpinnerCollector(Collector):
-    """A Collector that shows progress with `tqdm` progress bars"""
-
     status: Status
     current_step: str
     started: bool
@@ -150,6 +148,13 @@ def ingest(
             envvar="FULL_REFRESH",
         ),
     ] = False,  # type: ignore
+    progress: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The progress display type, must be one of 'interactive', 'log'",
+            envvar="PROGRESS",
+        ),
+    ] = "interactive",  # type: ignore
 ):
     track(
         "command_triggered",
@@ -186,12 +191,16 @@ def ingest(
     m = hashlib.sha256()
     m.update(dest_table.encode("utf-8"))
 
+    progressInstance: Collector = SpinnerCollector()
+    if progress == "log":
+        progressInstance = LogCollector()
+
     pipeline = dlt.pipeline(
         pipeline_name=m.hexdigest(),
         destination=destination.dlt_dest(
             uri=dest_uri,
         ),
-        progress=
+        progress=progressInstance,
         pipelines_dir="pipeline_data",
         full_refresh=full_refresh,
     )
```
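Taken together, these hunks give `ingest` a `--progress` option (also settable via the `PROGRESS` environment variable) that swaps the default spinner for dlt's `LogCollector`. A sketch of how it would be used, with placeholder URIs and table names:

```sh
# Hypothetical: --progress log prints periodic progress lines instead of an
# animated spinner, which suits CI logs; the default remains "interactive".
ingestr ingest \
    --source-uri 'postgresql://user:pass@localhost:5432/mydb' \
    --source-table 'public.input' \
    --dest-uri 'duckdb:///output.duckdb' \
    --dest-table 'raw.output' \
    --progress log
```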
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/main_test.py

```diff
@@ -93,6 +93,7 @@ def test_create_replace():
         "testschema.output",
     )
 
+    print(result.stdout)
     assert result.exit_code == 0
 
     res = conn.sql(
@@ -333,10 +334,10 @@ def test_delete_insert_without_primary_key():
         "CREATE TABLE testschema_delete_insert.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
     )
     conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01')"
+        "INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01 00:00:00+00:00')"
    )
     conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01')"
+        "INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01 00:00:00+00:00')"
     )
 
     res = conn.sql("select count(*) from testschema_delete_insert.input").fetchall()
@@ -357,7 +358,7 @@ def test_delete_insert_without_primary_key():
     def get_output_rows():
         conn.execute("CHECKPOINT")
         return conn.sql(
-            "select id, val, strftime(updated_at, '%Y-%m-%d')
+            "select id, val, strftime(CAST(updated_at AT TIME ZONE 'UTC' AS TIMESTAMP), '%Y-%m-%d %H:%M:%S') from testschema_delete_insert.output order by id asc"
         ).fetchall()
 
     def assert_output_equals(expected):
@@ -367,7 +368,9 @@ def test_delete_insert_without_primary_key():
             assert res[i] == row
 
     run()
-    assert_output_equals(
+    assert_output_equals(
+        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
+    )
 
     first_run_id = conn.sql(
         "select _dlt_load_id from testschema_delete_insert.output limit 1"
@@ -375,8 +378,10 @@ def test_delete_insert_without_primary_key():
 
     ##############################
     # we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
-    run()
-    assert_output_equals(
+    res = run()
+    assert_output_equals(
+        [(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
+    )
 
     # we ensure that one of the rows is updated with a new run
     count_by_run_id = conn.sql(
@@ -392,17 +397,17 @@ def test_delete_insert_without_primary_key():
     ##############################
     # now we'll insert a few more lines for the same day, the new rows should show up
     conn.execute(
-        "INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01'), (4, 'val4', '2022-02-01')"
+        "INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01 00:00:00+00:00'), (4, 'val4', '2022-02-01 00:00:00+00:00')"
     )
     conn.execute("CHECKPOINT")
 
     run()
     assert_output_equals(
         [
-            (1, "val1", "2022-01-01"),
-            (2, "val2", "2022-02-01"),
-            (3, "val3", "2022-02-01"),
-            (4, "val4", "2022-02-01"),
+            (1, "val1", "2022-01-01 00:00:00"),
+            (2, "val2", "2022-02-01 00:00:00"),
+            (3, "val3", "2022-02-01 00:00:00"),
+            (4, "val4", "2022-02-01 00:00:00"),
         ]
     )
 
```
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/destinations.py

```diff
@@ -177,7 +177,8 @@ class CsvDestination(GenericSqlDestination):
         )
 
         output_path = self.uri.split("://")[1]
-
+        if output_path.count("/") > 1:
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
         with gzip.open(first_file_path, "rt", encoding="utf-8") as jsonl_file:  # type: ignore
             with open(output_path, "w", newline="") as csv_file:
```
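The added guard creates missing parent directories before the CSV file is opened for writing. Assuming the `csv://` destination scheme documented in `docs/supported-sources/csv.md` (an assumption; the exact URI shape is not shown in this diff), a nested output path that previously required the directory to exist might look like:

```sh
# Hypothetical: "exports/daily/users.csv" contains more than one slash, so the
# new code runs os.makedirs on exports/daily/ before writing the file.
ingestr ingest \
    --source-uri 'sqlite:///input.db' \
    --source-table 'main.users' \
    --dest-uri 'csv://exports/daily/users.csv' \
    --dest-table 'main.users'
```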
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/factory.py

```diff
@@ -14,7 +14,7 @@ from ingestr.src.destinations import (
     SnowflakeDestination,
     SynapseDestination,
 )
-from ingestr.src.sources import LocalCsvSource, MongoDbSource, SqlSource
+from ingestr.src.sources import LocalCsvSource, MongoDbSource, NotionSource, SqlSource
 
 SQL_SOURCE_SCHEMES = [
     "bigquery",
@@ -80,6 +80,8 @@ class SourceDestinationFactory:
             return LocalCsvSource()
         elif self.source_scheme == "mongodb":
             return MongoDbSource()
+        elif self.source_scheme == "notion":
+            return NotionSource()
         else:
             raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
 
```
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/mongodb/__init__.py

```diff
@@ -70,7 +70,7 @@ def mongodb_collection(
     collection: str = dlt.config.value,
     incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
     write_disposition: Optional[str] = dlt.config.value,
-    parallel: Optional[bool] =
+    parallel: Optional[bool] = False,
 ) -> Any:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.
```
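With `parallel` now defaulting to `False`, a plain invocation of the MongoDB source needs no extra configuration. A hypothetical command-line equivalent (the connection string and names are placeholders, and the `database.collection` form of `--source-table` is an assumption based on the MongoDB docs page listed above):

```sh
# Hypothetical: load one MongoDB collection into DuckDB. MongoDB is
# source-only per the support matrix, so the destination must be a ✅ one.
ingestr ingest \
    --source-uri 'mongodb://user:pass@localhost:27017' \
    --source-table 'mydb.mycollection' \
    --dest-uri 'duckdb:///mongo.duckdb' \
    --dest-table 'raw.mycollection'
```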
{ingestr-0.2.6 → ingestr-0.3.0}/ingestr/src/mongodb/helpers.py

```diff
@@ -83,7 +83,7 @@ class CollectionLoaderParallell(CollectionLoader):
     def _get_cursor(self) -> TCursor:
         cursor = self.collection.find(filter=self._filter_op)
         if self._sort_op:
-            cursor = cursor.sort(self._sort_op)
+            cursor = cursor.sort(self._sort_op)
         return cursor
 
     @dlt.defer
@@ -155,11 +155,11 @@ class MongoDbCollectionConfiguration(BaseConfiguration):
 
 @configspec
 class MongoDbCollectionResourceConfiguration(BaseConfiguration):
-    connection_url: str
-    database: Optional[str]
-    collection: str
+    connection_url: str = dlt.secrets.value
+    database: Optional[str] = dlt.config.value
+    collection: str = dlt.config.value
     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-    write_disposition: Optional[str] =
+    write_disposition: Optional[str] = dlt.config.value
     parallel: Optional[bool] = False
 
 
```