ingestr-0.2.5-py3-none-any.whl → ingestr-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic. Click here for more details.
- ingestr/main.py +32 -16
- ingestr/main_test.py +18 -17
- ingestr/src/destinations.py +2 -1
- ingestr/src/factory.py +3 -1
- ingestr/src/mongodb/__init__.py +1 -1
- ingestr/src/mongodb/helpers.py +5 -5
- ingestr/src/notion/__init__.py +55 -0
- ingestr/src/notion/helpers/__init__.py +0 -0
- ingestr/src/notion/helpers/client.py +164 -0
- ingestr/src/notion/helpers/database.py +78 -0
- ingestr/src/notion/settings.py +3 -0
- ingestr/src/sources.py +24 -0
- ingestr/src/sql_database/__init__.py +125 -13
- ingestr/src/sql_database/helpers.py +162 -30
- ingestr/src/sql_database/override.py +9 -0
- ingestr/src/sql_database/schema_types.py +135 -27
- ingestr/src/version.py +1 -1
- ingestr/testdata/test_append.db +0 -0
- ingestr/testdata/test_create_replace.db +0 -0
- ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- ingestr/testdata/test_merge_with_primary_key.db +0 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/METADATA +89 -25
- ingestr-0.3.0.dist-info/RECORD +33 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/WHEEL +1 -1
- ingestr/src/sql_database/settings.py +0 -3
- ingestr-0.2.5.dist-info/RECORD +0 -27
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Optional
|
|
|
5
5
|
import dlt
|
|
6
6
|
import humanize
|
|
7
7
|
import typer
|
|
8
|
-
from dlt.common.runtime.collector import Collector
|
|
8
|
+
from dlt.common.runtime.collector import Collector, LogCollector
|
|
9
9
|
from rich.console import Console
|
|
10
10
|
from rich.status import Status
|
|
11
11
|
from typing_extensions import Annotated
|
|
@@ -34,8 +34,6 @@ DATE_FORMATS = [
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
class SpinnerCollector(Collector):
|
|
37
|
-
"""A Collector that shows progress with `tqdm` progress bars"""
|
|
38
|
-
|
|
39
37
|
status: Status
|
|
40
38
|
current_step: str
|
|
41
39
|
started: bool
|
|
@@ -150,6 +148,13 @@ def ingest(
|
|
|
150
148
|
envvar="FULL_REFRESH",
|
|
151
149
|
),
|
|
152
150
|
] = False, # type: ignore
|
|
151
|
+
progress: Annotated[
|
|
152
|
+
Optional[str],
|
|
153
|
+
typer.Option(
|
|
154
|
+
help="The progress display type, must be one of 'interactive', 'log'",
|
|
155
|
+
envvar="PROGRESS",
|
|
156
|
+
),
|
|
157
|
+
] = "interactive", # type: ignore
|
|
153
158
|
):
|
|
154
159
|
track(
|
|
155
160
|
"command_triggered",
|
|
@@ -172,24 +177,30 @@ def ingest(
|
|
|
172
177
|
)
|
|
173
178
|
dest_table = source_table
|
|
174
179
|
|
|
180
|
+
factory = SourceDestinationFactory(source_uri, dest_uri)
|
|
181
|
+
source = factory.get_source()
|
|
182
|
+
destination = factory.get_destination()
|
|
183
|
+
|
|
184
|
+
original_incremental_strategy = incremental_strategy
|
|
185
|
+
|
|
175
186
|
merge_key = None
|
|
176
187
|
if incremental_strategy == "delete+insert":
|
|
177
188
|
merge_key = incremental_key
|
|
178
189
|
incremental_strategy = "merge"
|
|
179
190
|
|
|
180
|
-
factory = SourceDestinationFactory(source_uri, dest_uri)
|
|
181
|
-
source = factory.get_source()
|
|
182
|
-
destination = factory.get_destination()
|
|
183
|
-
|
|
184
191
|
m = hashlib.sha256()
|
|
185
192
|
m.update(dest_table.encode("utf-8"))
|
|
186
193
|
|
|
194
|
+
progressInstance: Collector = SpinnerCollector()
|
|
195
|
+
if progress == "log":
|
|
196
|
+
progressInstance = LogCollector()
|
|
197
|
+
|
|
187
198
|
pipeline = dlt.pipeline(
|
|
188
199
|
pipeline_name=m.hexdigest(),
|
|
189
200
|
destination=destination.dlt_dest(
|
|
190
201
|
uri=dest_uri,
|
|
191
202
|
),
|
|
192
|
-
progress=SpinnerCollector(),
|
|
203
|
+
progress=progressInstance,
|
|
193
204
|
pipelines_dir="pipeline_data",
|
|
194
205
|
full_refresh=full_refresh,
|
|
195
206
|
)
|
|
@@ -222,15 +233,20 @@ def ingest(
|
|
|
222
233
|
if factory.source_scheme == "sqlite":
|
|
223
234
|
source_table = "main." + source_table.split(".")[-1]
|
|
224
235
|
|
|
236
|
+
dlt_source = source.dlt_source(
|
|
237
|
+
uri=source_uri,
|
|
238
|
+
table=source_table,
|
|
239
|
+
incremental_key=incremental_key,
|
|
240
|
+
merge_key=merge_key,
|
|
241
|
+
interval_start=interval_start,
|
|
242
|
+
interval_end=interval_end,
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
if original_incremental_strategy == "delete+insert":
|
|
246
|
+
dlt_source.incremental.primary_key = ()
|
|
247
|
+
|
|
225
248
|
run_info = pipeline.run(
|
|
226
|
-
source.dlt_source(
|
227
|
-
uri=source_uri,
|
|
228
|
-
table=source_table,
|
|
229
|
-
incremental_key=incremental_key,
|
|
230
|
-
merge_key=merge_key,
|
|
231
|
-
interval_start=interval_start,
|
|
232
|
-
interval_end=interval_end,
|
|
233
|
-
),
|
|
249
|
+
dlt_source,
|
|
234
250
|
**destination.dlt_run_params(
|
|
235
251
|
uri=dest_uri,
|
|
236
252
|
table=dest_table,
|
ingestr/main_test.py
CHANGED
|
@@ -2,7 +2,6 @@ import os
|
|
|
2
2
|
import shutil
|
|
3
3
|
|
|
4
4
|
import duckdb
|
|
5
|
-
import pytest
|
|
6
5
|
from typer.testing import CliRunner
|
|
7
6
|
|
|
8
7
|
from ingestr.main import app
|
|
@@ -94,6 +93,7 @@ def test_create_replace():
|
|
|
94
93
|
"testschema.output",
|
|
95
94
|
)
|
|
96
95
|
|
|
96
|
+
print(result.stdout)
|
|
97
97
|
assert result.exit_code == 0
|
|
98
98
|
|
|
99
99
|
res = conn.sql(
|
|
@@ -104,9 +104,6 @@ def test_create_replace():
|
|
|
104
104
|
assert res[1] == (2, "val2", "2022-02-01")
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
@pytest.mark.skip(
|
|
108
|
-
reason="this doesn't work at the moment due to a bug with dlt: https://github.com/dlt-hub/dlt/issues/971"
|
|
109
|
-
)
|
|
110
107
|
def test_append():
|
|
111
108
|
try:
|
|
112
109
|
shutil.rmtree(get_abs_path("../pipeline_data"))
|
|
@@ -147,7 +144,7 @@ def test_append():
|
|
|
147
144
|
def get_output_table():
|
|
148
145
|
conn.execute("CHECKPOINT")
|
|
149
146
|
return conn.sql(
|
|
150
|
-
"select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_append.output"
|
|
147
|
+
"select id, val, strftime(updated_at, '%Y-%m-%d') as updated_at from testschema_append.output order by id asc"
|
|
151
148
|
).fetchall()
|
|
152
149
|
|
|
153
150
|
run()
|
|
@@ -163,7 +160,7 @@ def test_append():
|
|
|
163
160
|
res = get_output_table()
|
|
164
161
|
assert len(res) == 2
|
|
165
162
|
assert res[0] == (1, "val1", "2022-01-01")
|
|
166
|
-
assert res[1] == (2, "val2", "2022-02-01")
|
|
163
|
+
assert res[1] == (2, "val2", "2022-01-02")
|
|
167
164
|
|
|
168
165
|
|
|
169
166
|
def test_merge_with_primary_key():
|
|
@@ -337,10 +334,10 @@ def test_delete_insert_without_primary_key():
|
|
|
337
334
|
"CREATE TABLE testschema_delete_insert.input (id INTEGER, val VARCHAR, updated_at TIMESTAMP WITH TIME ZONE)"
|
|
338
335
|
)
|
|
339
336
|
conn.execute(
|
|
340
|
-
"INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01')"
|
|
337
|
+
"INSERT INTO testschema_delete_insert.input VALUES (1, 'val1', '2022-01-01 00:00:00+00:00')"
|
|
341
338
|
)
|
|
342
339
|
conn.execute(
|
|
343
|
-
"INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01')"
|
|
340
|
+
"INSERT INTO testschema_delete_insert.input VALUES (2, 'val2', '2022-02-01 00:00:00+00:00')"
|
|
344
341
|
)
|
|
345
342
|
|
|
346
343
|
res = conn.sql("select count(*) from testschema_delete_insert.input").fetchall()
|
|
@@ -361,7 +358,7 @@ def test_delete_insert_without_primary_key():
|
|
|
361
358
|
def get_output_rows():
|
|
362
359
|
conn.execute("CHECKPOINT")
|
|
363
360
|
return conn.sql(
|
|
364
|
-
"select id, val, strftime(updated_at, '%Y-%m-%d')
|
|
361
|
+
"select id, val, strftime(CAST(updated_at AT TIME ZONE 'UTC' AS TIMESTAMP), '%Y-%m-%d %H:%M:%S') from testschema_delete_insert.output order by id asc"
|
|
365
362
|
).fetchall()
|
|
366
363
|
|
|
367
364
|
def assert_output_equals(expected):
|
|
@@ -371,7 +368,9 @@ def test_delete_insert_without_primary_key():
|
|
|
371
368
|
assert res[i] == row
|
|
372
369
|
|
|
373
370
|
run()
|
|
374
|
-
assert_output_equals(
|
|
371
|
+
assert_output_equals(
|
|
372
|
+
[(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
|
|
373
|
+
)
|
|
375
374
|
|
|
376
375
|
first_run_id = conn.sql(
|
|
377
376
|
"select _dlt_load_id from testschema_delete_insert.output limit 1"
|
|
@@ -379,8 +378,10 @@ def test_delete_insert_without_primary_key():
|
|
|
379
378
|
|
|
380
379
|
##############################
|
|
381
380
|
# we'll run again, since this is a delete+insert, we expect the run ID to change for the last one
|
|
382
|
-
run()
|
|
383
|
-
assert_output_equals(
|
|
381
|
+
res = run()
|
|
382
|
+
assert_output_equals(
|
|
383
|
+
[(1, "val1", "2022-01-01 00:00:00"), (2, "val2", "2022-02-01 00:00:00")]
|
|
384
|
+
)
|
|
384
385
|
|
|
385
386
|
# we ensure that one of the rows is updated with a new run
|
|
386
387
|
count_by_run_id = conn.sql(
|
|
@@ -396,17 +397,17 @@ def test_delete_insert_without_primary_key():
|
|
|
396
397
|
##############################
|
|
397
398
|
# now we'll insert a few more lines for the same day, the new rows should show up
|
|
398
399
|
conn.execute(
|
|
399
|
-
"INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01'), (4, 'val4', '2022-02-01')"
|
|
400
|
+
"INSERT INTO testschema_delete_insert.input VALUES (3, 'val3', '2022-02-01 00:00:00+00:00'), (4, 'val4', '2022-02-01 00:00:00+00:00')"
|
|
400
401
|
)
|
|
401
402
|
conn.execute("CHECKPOINT")
|
|
402
403
|
|
|
403
404
|
run()
|
|
404
405
|
assert_output_equals(
|
|
405
406
|
[
|
|
406
|
-
(1, "val1", "2022-01-01"),
|
|
407
|
-
(2, "val2", "2022-02-01"),
|
|
408
|
-
(3, "val3", "2022-02-01"),
|
|
409
|
-
(4, "val4", "2022-02-01"),
|
|
407
|
+
(1, "val1", "2022-01-01 00:00:00"),
|
|
408
|
+
(2, "val2", "2022-02-01 00:00:00"),
|
|
409
|
+
(3, "val3", "2022-02-01 00:00:00"),
|
|
410
|
+
(4, "val4", "2022-02-01 00:00:00"),
|
|
410
411
|
]
|
|
411
412
|
)
|
|
412
413
|
|
ingestr/src/destinations.py
CHANGED
|
@@ -177,7 +177,8 @@ class CsvDestination(GenericSqlDestination):
|
|
|
177
177
|
)
|
|
178
178
|
|
|
179
179
|
output_path = self.uri.split("://")[1]
|
|
180
|
-
|
|
180
|
+
if output_path.count("/") > 1:
|
|
181
|
+
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
181
182
|
|
|
182
183
|
with gzip.open(first_file_path, "rt", encoding="utf-8") as jsonl_file: # type: ignore
|
|
183
184
|
with open(output_path, "w", newline="") as csv_file:
|
ingestr/src/factory.py
CHANGED
|
@@ -14,7 +14,7 @@ from ingestr.src.destinations import (
|
|
|
14
14
|
SnowflakeDestination,
|
|
15
15
|
SynapseDestination,
|
|
16
16
|
)
|
|
17
|
-
from ingestr.src.sources import LocalCsvSource, MongoDbSource, SqlSource
|
|
17
|
+
from ingestr.src.sources import LocalCsvSource, MongoDbSource, NotionSource, SqlSource
|
|
18
18
|
|
|
19
19
|
SQL_SOURCE_SCHEMES = [
|
|
20
20
|
"bigquery",
|
|
@@ -80,6 +80,8 @@ class SourceDestinationFactory:
|
|
|
80
80
|
return LocalCsvSource()
|
|
81
81
|
elif self.source_scheme == "mongodb":
|
|
82
82
|
return MongoDbSource()
|
|
83
|
+
elif self.source_scheme == "notion":
|
|
84
|
+
return NotionSource()
|
|
83
85
|
else:
|
|
84
86
|
raise ValueError(f"Unsupported source scheme: {self.source_scheme}")
|
|
85
87
|
|
ingestr/src/mongodb/__init__.py
CHANGED
|
@@ -70,7 +70,7 @@ def mongodb_collection(
|
|
|
70
70
|
collection: str = dlt.config.value,
|
|
71
71
|
incremental: Optional[dlt.sources.incremental] = None, # type: ignore[type-arg]
|
|
72
72
|
write_disposition: Optional[str] = dlt.config.value,
|
|
73
|
-
parallel: Optional[bool] =
|
|
73
|
+
parallel: Optional[bool] = False,
|
|
74
74
|
) -> Any:
|
|
75
75
|
"""
|
|
76
76
|
A DLT source which loads a collection from a mongo database using PyMongo.
|
ingestr/src/mongodb/helpers.py
CHANGED
|
@@ -83,7 +83,7 @@ class CollectionLoaderParallell(CollectionLoader):
|
|
|
83
83
|
def _get_cursor(self) -> TCursor:
|
|
84
84
|
cursor = self.collection.find(filter=self._filter_op)
|
|
85
85
|
if self._sort_op:
|
|
86
|
-
cursor = cursor.sort(self._sort_op)
|
|
86
|
+
cursor = cursor.sort(self._sort_op)
|
|
87
87
|
return cursor
|
|
88
88
|
|
|
89
89
|
@dlt.defer
|
|
@@ -155,11 +155,11 @@ class MongoDbCollectionConfiguration(BaseConfiguration):
|
|
|
155
155
|
|
|
156
156
|
@configspec
|
|
157
157
|
class MongoDbCollectionResourceConfiguration(BaseConfiguration):
|
|
158
|
-
connection_url: str
|
|
159
|
-
database: Optional[str]
|
|
160
|
-
collection: str
|
|
158
|
+
connection_url: str = dlt.secrets.value
|
|
159
|
+
database: Optional[str] = dlt.config.value
|
|
160
|
+
collection: str = dlt.config.value
|
|
161
161
|
incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg]
|
|
162
|
-
write_disposition: Optional[str] =
|
|
162
|
+
write_disposition: Optional[str] = dlt.config.value
|
|
163
163
|
parallel: Optional[bool] = False
|
|
164
164
|
|
|
165
165
|
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""A source that extracts data from Notion API"""
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Iterator, List, Optional
|
|
4
|
+
|
|
5
|
+
import dlt
|
|
6
|
+
from dlt.sources import DltResource
|
|
7
|
+
|
|
8
|
+
from .helpers.client import NotionClient
|
|
9
|
+
from .helpers.database import NotionDatabase
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dlt.source
|
|
13
|
+
def notion_databases(
|
|
14
|
+
database_ids: Optional[List[Dict[str, str]]] = None,
|
|
15
|
+
api_key: str = dlt.secrets.value,
|
|
16
|
+
) -> Iterator[DltResource]:
|
|
17
|
+
"""
|
|
18
|
+
Retrieves data from Notion databases.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
database_ids (List[Dict[str, str]], optional): A list of dictionaries
|
|
22
|
+
each containing a database id and a name.
|
|
23
|
+
Defaults to None. If None, the function will generate all databases
|
|
24
|
+
in the workspace that are accessible to the integration.
|
|
25
|
+
api_key (str): The Notion API secret key.
|
|
26
|
+
|
|
27
|
+
Yields:
|
|
28
|
+
DltResource: Data resources from Notion databases.
|
|
29
|
+
"""
|
|
30
|
+
notion_client = NotionClient(api_key)
|
|
31
|
+
|
|
32
|
+
if database_ids is None:
|
|
33
|
+
search_results = notion_client.search(
|
|
34
|
+
filter_criteria={"value": "database", "property": "object"}
|
|
35
|
+
)
|
|
36
|
+
database_ids = [
|
|
37
|
+
{"id": result["id"], "use_name": result["title"][0]["plain_text"]}
|
|
38
|
+
for result in search_results
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
for database in database_ids:
|
|
42
|
+
if "use_name" not in database:
|
|
43
|
+
# Fetch the database details from Notion
|
|
44
|
+
details = notion_client.get_database(database["id"])
|
|
45
|
+
|
|
46
|
+
# Extract the name/title from the details
|
|
47
|
+
database["use_name"] = details["title"][0]["plain_text"]
|
|
48
|
+
|
|
49
|
+
notion_database = NotionDatabase(database["id"], notion_client)
|
|
50
|
+
yield dlt.resource( # type: ignore
|
|
51
|
+
notion_database.query(),
|
|
52
|
+
primary_key="id",
|
|
53
|
+
name=database["use_name"],
|
|
54
|
+
write_disposition="replace",
|
|
55
|
+
)
|
|
File without changes
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from typing import Any, Dict, Iterator, Optional
|
|
2
|
+
|
|
3
|
+
from dlt.sources.helpers import requests
|
|
4
|
+
|
|
5
|
+
from ..settings import API_URL
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NotionClient:
|
|
9
|
+
"""A client to interact with the Notion API.
|
|
10
|
+
|
|
11
|
+
Attributes:
|
|
12
|
+
api_key (str): The Notion API secret key.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
def __init__(self, api_key: Optional[str] = None):
|
|
16
|
+
self.api_key = api_key
|
|
17
|
+
|
|
18
|
+
def _create_headers(self) -> Dict[str, str]:
|
|
19
|
+
headers = {
|
|
20
|
+
"accept": "application/json",
|
|
21
|
+
"Notion-Version": "2022-06-28",
|
|
22
|
+
"Authorization": f"Bearer {self.api_key}",
|
|
23
|
+
}
|
|
24
|
+
return headers
|
|
25
|
+
|
|
26
|
+
def _filter_out_none_values(self, dict_in: Dict[str, Any]) -> Dict[str, Any]:
|
|
27
|
+
return {k: v for k, v in dict_in.items() if v is not None}
|
|
28
|
+
|
|
29
|
+
def get_endpoint(
|
|
30
|
+
self, resource: str, resource_id: str, subresource: Optional[str] = None
|
|
31
|
+
) -> str:
|
|
32
|
+
"""Returns the endpoint for a given resource.
|
|
33
|
+
|
|
34
|
+
Args:
|
|
35
|
+
resource (str): The resource to get the endpoint for.
|
|
36
|
+
resource_id (str): The id of the resource.
|
|
37
|
+
subresource (str, optional): The subresource to get the endpoint for.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
str: The endpoint for the resource.
|
|
41
|
+
"""
|
|
42
|
+
url = f"{API_URL}/{resource}/{resource_id}"
|
|
43
|
+
if subresource:
|
|
44
|
+
url += f"/{subresource}"
|
|
45
|
+
return url
|
|
46
|
+
|
|
47
|
+
def fetch_resource(
|
|
48
|
+
self, resource: str, resource_id: str, subresource: Optional[str] = None
|
|
49
|
+
) -> Any:
|
|
50
|
+
"""Fetches a resource from the Notion API.
|
|
51
|
+
|
|
52
|
+
Args:
|
|
53
|
+
resource (str): The resource to fetch.
|
|
54
|
+
resource_id (str): The id of the resource.
|
|
55
|
+
subresource (str, optional): The subresource to fetch. Defaults to None.
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
Any: The resource from the Notion API.
|
|
59
|
+
"""
|
|
60
|
+
url = self.get_endpoint(resource, resource_id, subresource)
|
|
61
|
+
headers = self._create_headers()
|
|
62
|
+
response = requests.get(url, headers=headers)
|
|
63
|
+
response.raise_for_status()
|
|
64
|
+
return response.json()
|
|
65
|
+
|
|
66
|
+
def send_payload(
|
|
67
|
+
self,
|
|
68
|
+
resource: str,
|
|
69
|
+
resource_id: str,
|
|
70
|
+
subresource: Optional[str] = None,
|
|
71
|
+
query_params: Optional[Dict[str, Any]] = None,
|
|
72
|
+
payload: Optional[Dict[str, Any]] = None,
|
|
73
|
+
) -> Any:
|
|
74
|
+
"""Sends a payload to the Notion API using the POST method.
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
resource (str): The resource to send the payload to.
|
|
78
|
+
resource_id (str): The id of the resource.
|
|
79
|
+
subresource (str, optional): The subresource to send the payload to.
|
|
80
|
+
Defaults to None.
|
|
81
|
+
query_params (Dict[str, Any], optional): The query parameters to send
|
|
82
|
+
with the payload. Defaults to None.
|
|
83
|
+
payload (Dict[str, Any], optional): The payload to send. Defaults to None.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
Any: The response from the Notion API.
|
|
87
|
+
|
|
88
|
+
Raises:
|
|
89
|
+
requests.HTTPError: If the response from the Notion API is not 200.
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
url = self.get_endpoint(resource, resource_id, subresource)
|
|
93
|
+
headers = self._create_headers()
|
|
94
|
+
|
|
95
|
+
if payload is None:
|
|
96
|
+
payload = {}
|
|
97
|
+
|
|
98
|
+
filtered_payload = self._filter_out_none_values(payload)
|
|
99
|
+
|
|
100
|
+
response = requests.post(
|
|
101
|
+
url, headers=headers, params=query_params, json=filtered_payload
|
|
102
|
+
)
|
|
103
|
+
response.raise_for_status()
|
|
104
|
+
return response.json()
|
|
105
|
+
|
|
106
|
+
def search(
|
|
107
|
+
self,
|
|
108
|
+
query: Optional[str] = None,
|
|
109
|
+
filter_criteria: Optional[Dict[str, Any]] = None,
|
|
110
|
+
sort: Optional[Dict[str, Any]] = None,
|
|
111
|
+
start_cursor: Optional[str] = None,
|
|
112
|
+
page_size: Optional[int] = None,
|
|
113
|
+
) -> Iterator[Dict[str, Any]]:
|
|
114
|
+
"""Searches all parent or child pages and databases that have been
|
|
115
|
+
shared with an integration.
|
|
116
|
+
|
|
117
|
+
Notion API Reference. Search:
|
|
118
|
+
https://developers.notion.com/reference/post-search
|
|
119
|
+
|
|
120
|
+
Args:
|
|
121
|
+
query (str, optional): The string to search for. Defaults to None.
|
|
122
|
+
filter_criteria (Dict[str, Any], optional): The filter to apply to
|
|
123
|
+
the results.
|
|
124
|
+
sort (Dict[str, Any], optional): The sort to apply to the results.
|
|
125
|
+
start_cursor (str, optional): The cursor to start the query at.
|
|
126
|
+
Defaults to None.
|
|
127
|
+
page_size (int, optional): The number of results to return.
|
|
128
|
+
Defaults to None.
|
|
129
|
+
|
|
130
|
+
Yields:
|
|
131
|
+
Dict[str, Any]: A result from the search.
|
|
132
|
+
"""
|
|
133
|
+
has_more = True
|
|
134
|
+
|
|
135
|
+
while has_more:
|
|
136
|
+
payload = {
|
|
137
|
+
"query": query,
|
|
138
|
+
"sort": sort,
|
|
139
|
+
"filter": filter_criteria,
|
|
140
|
+
"start_cursor": start_cursor,
|
|
141
|
+
"page_size": page_size,
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
filtered_payload = self._filter_out_none_values(payload)
|
|
145
|
+
|
|
146
|
+
response = self.send_payload("search", "", payload=filtered_payload)
|
|
147
|
+
|
|
148
|
+
for result in response.get("results", []):
|
|
149
|
+
yield result
|
|
150
|
+
|
|
151
|
+
next_cursor = response.get("next_cursor")
|
|
152
|
+
has_more = next_cursor is not None
|
|
153
|
+
start_cursor = next_cursor
|
|
154
|
+
|
|
155
|
+
def get_database(self, database_id: str) -> Any:
|
|
156
|
+
"""Fetches the details of a specific database by its ID.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
database_id (str): The ID of the database to fetch.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
Any: The details of the database.
|
|
163
|
+
"""
|
|
164
|
+
return self.fetch_resource("databases", database_id)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from typing import Any, Dict, Iterable, Optional
|
|
2
|
+
|
|
3
|
+
from dlt.common.typing import TDataItem
|
|
4
|
+
|
|
5
|
+
from .client import NotionClient
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NotionDatabase:
|
|
9
|
+
"""
|
|
10
|
+
A class to represent a Notion database.
|
|
11
|
+
|
|
12
|
+
Attributes:
|
|
13
|
+
database_id (str): The ID of the Notion database.
|
|
14
|
+
notion_client (NotionClient): A client to interact with the Notion API.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self, database_id: str, notion_client: NotionClient):
|
|
18
|
+
self.database_id = database_id
|
|
19
|
+
self.notion_client = notion_client
|
|
20
|
+
|
|
21
|
+
def get_structure(self) -> Any:
|
|
22
|
+
"""Retrieves the structure of the database.
|
|
23
|
+
|
|
24
|
+
Notion API Reference. Retrieve a database:
|
|
25
|
+
https://developers.notion.com/reference/retrieve-a-database
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Any: The structure of the database.
|
|
29
|
+
"""
|
|
30
|
+
return self.notion_client.fetch_resource("databases", self.database_id)
|
|
31
|
+
|
|
32
|
+
def query(
|
|
33
|
+
self,
|
|
34
|
+
filter_properties: Optional[Dict[str, Any]] = None,
|
|
35
|
+
filter_criteria: Optional[Dict[str, Any]] = None,
|
|
36
|
+
sorts: Optional[Dict[str, Any]] = None,
|
|
37
|
+
start_cursor: Optional[str] = None,
|
|
38
|
+
page_size: Optional[int] = None,
|
|
39
|
+
) -> Iterable[TDataItem]:
|
|
40
|
+
"""Queries the database for records.
|
|
41
|
+
|
|
42
|
+
Notion API Reference. Query a database:
|
|
43
|
+
https://developers.notion.com/reference/post-database-query
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
filter_properties (Dict[str, Any], optional): A dictionary of
|
|
47
|
+
properties to filter the records by. Defaults to None.
|
|
48
|
+
filter_criteria (Dict[str, Any], optional): A dictionary of filters
|
|
49
|
+
to apply to the records. Defaults to None.
|
|
50
|
+
sorts (Dict[str, Any], optional): A dictionary of sorts to apply
|
|
51
|
+
to the records. Defaults to None.
|
|
52
|
+
start_cursor (str, optional): The cursor to start the query at.
|
|
53
|
+
Defaults to None.
|
|
54
|
+
page_size (int, optional): The number of records to return.
|
|
55
|
+
Defaults to None.
|
|
56
|
+
|
|
57
|
+
Yields:
|
|
58
|
+
List[Dict[str, Any]]: A record from the database.
|
|
59
|
+
"""
|
|
60
|
+
while True:
|
|
61
|
+
payload = {
|
|
62
|
+
"filter": filter_criteria,
|
|
63
|
+
"sorts": sorts,
|
|
64
|
+
"start_cursor": start_cursor,
|
|
65
|
+
"page_size": page_size,
|
|
66
|
+
}
|
|
67
|
+
response = self.notion_client.send_payload(
|
|
68
|
+
"databases",
|
|
69
|
+
self.database_id,
|
|
70
|
+
subresource="query",
|
|
71
|
+
query_params=filter_properties,
|
|
72
|
+
payload=payload,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
yield response.get("results", [])
|
|
76
|
+
if not response.get("has_more"):
|
|
77
|
+
break
|
|
78
|
+
start_cursor = response.get("next_cursor")
|
ingestr/src/sources.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
from typing import Callable
|
|
3
|
+
from urllib.parse import parse_qs, urlparse
|
|
3
4
|
|
|
4
5
|
import dlt
|
|
5
6
|
|
|
6
7
|
from ingestr.src.mongodb import mongodb_collection
|
|
8
|
+
from ingestr.src.notion import notion_databases
|
|
7
9
|
from ingestr.src.sql_database import sql_table
|
|
8
10
|
|
|
9
11
|
|
|
@@ -104,3 +106,25 @@ class LocalCsvSource:
|
|
|
104
106
|
csv_file,
|
|
105
107
|
merge_key=kwargs.get("merge_key"), # type: ignore
|
|
106
108
|
)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class NotionSource:
|
|
112
|
+
table_builder: Callable
|
|
113
|
+
|
|
114
|
+
def __init__(self, table_builder=notion_databases) -> None:
|
|
115
|
+
self.table_builder = table_builder
|
|
116
|
+
|
|
117
|
+
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
118
|
+
if kwargs.get("incremental_key"):
|
|
119
|
+
raise ValueError("Incremental loads are not supported for Notion")
|
|
120
|
+
|
|
121
|
+
source_fields = urlparse(uri)
|
|
122
|
+
source_params = parse_qs(source_fields.query)
|
|
123
|
+
api_key = source_params.get("api_key")
|
|
124
|
+
if not api_key:
|
|
125
|
+
raise ValueError("api_key in the URI is required to connect to Notion")
|
|
126
|
+
|
|
127
|
+
return self.table_builder(
|
|
128
|
+
database_ids=[{"id": table}],
|
|
129
|
+
api_key=api_key[0],
|
|
130
|
+
)
|