ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin_max/__init__.py +6 -4
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +37 -10
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +508 -27
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +107 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +2933 -245
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.13.dist-info/RECORD +0 -115
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/blob.py
CHANGED
@@ -1,11 +1,15 @@
 import warnings
 from typing import Tuple, TypeAlias
-from urllib.parse import ParseResult
+from urllib.parse import ParseResult, urlparse
 
 BucketName: TypeAlias = str
 FileGlob: TypeAlias = str
 
 
+class UnsupportedEndpointError(Exception):
+    pass
+
+
 def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
     """
     parse the URI of a blob storage and
@@ -14,13 +18,16 @@ def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
     Supports the following Forms:
     - uri: "gs://"
       table: "bucket-name/file-glob"
+    - uri: "gs://uri-bucket-name" (uri-bucket-name is preferred)
+      table: "gs://table-bucket-name/file-glob"
+    - uri: "gs://"
+      table: "gs://bucket-name/file-glob"
     - uri: gs://bucket-name/file-glob
       table: None
     - uri: "gs://bucket-name"
       table: "file-glob"
 
-    The first form is the prefered method. Other forms are supported
-    for backward compatibility, but discouraged.
+    The first form is the prefered method. Other forms are supported but discouraged.
     """
 
     table = table.strip()
@@ -34,16 +41,36 @@ def parse_uri(uri: ParseResult, table: str) -> Tuple[BucketName, FileGlob]:
         )
         return host, uri.path.lstrip("/")
 
+    table_uri = urlparse(table)
+
     if host != "":
-
-
-
-
-        )
-        return host, table.lstrip("/")
+        return host, table_uri.path.lstrip("/")
+
+    if table_uri.hostname:
+        return table_uri.hostname, table_uri.path.lstrip("/")
 
-    parts =
+    parts = table_uri.path.lstrip("/").split("/", maxsplit=1)
     if len(parts) != 2:
         return "", parts[0]
 
     return parts[0], parts[1]
+
+
+def parse_endpoint(path: str) -> str:
+    """
+    Parse the endpoint kind from the URI.
+
+    kind is a file format. one of [csv, jsonl, parquet]
+    """
+    file_extension = path.split(".")[-1]
+    if file_extension == "gz":
+        file_extension = path.split(".")[-2]
+    if file_extension == "csv":
+        endpoint = "read_csv"
+    elif file_extension == "jsonl":
+        endpoint = "read_jsonl"
+    elif file_extension == "parquet":
+        endpoint = "read_parquet"
+    else:
+        raise UnsupportedEndpointError(f"Unsupported file format: {file_extension}")
+    return endpoint
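To make the new parsing behavior concrete, here is a minimal sketch of how the reworked parse_uri resolves the bucket and glob, and how parse_endpoint maps file extensions to reader endpoints. This is based only on the branches visible in the hunks above; the bucket names and paths are illustrative, not taken from the diff.

from urllib.parse import urlparse

from ingestr.src.blob import parse_endpoint, parse_uri

# Preferred form: bucket in the URI, glob in the table argument.
assert parse_uri(urlparse("gs://my-bucket"), "events/*.csv") == ("my-bucket", "events/*.csv")

# Fully qualified table URIs are now parsed too; the URI's bucket wins.
assert parse_uri(
    urlparse("gs://uri-bucket"), "gs://table-bucket/data/*.parquet"
) == ("uri-bucket", "data/*.parquet")

# parse_endpoint looks through a .gz suffix to pick the reader endpoint.
assert parse_endpoint("events/2024/dump.csv.gz") == "read_csv"
assert parse_endpoint("events/2024/dump.parquet") == "read_parquet"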
ingestr/src/buildinfo.py
CHANGED
@@ -1 +1 @@
-version = "v0.13.13"
+version = "v0.14.104"
ingestr/src/chess/__init__.py
CHANGED
@@ -75,7 +75,7 @@ def players_archives(players: List[str]) -> Iterator[List[TDataItem]]:
 
 
 @dlt.resource(
-    write_disposition="
+    write_disposition="replace", columns={"end_time": {"data_type": "timestamp"}}
 )
 def players_games(
     players: List[str], start_month: str = None, end_month: str = None
ingestr/src/clickup/__init__.py
ADDED
@@ -0,0 +1,85 @@
+"""Simple ClickUp source."""
+
+from datetime import datetime
+from typing import Iterable
+
+import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
+from dlt.sources import DltResource
+
+from .helpers import ClickupClient
+
+
+@dlt.source(max_table_nesting=0)
+def clickup_source(
+    api_token: str = dlt.secrets.value,
+    start_date: datetime = None,
+    end_date: datetime = None,
+) -> Iterable[DltResource]:
+    client = ClickupClient(api_token)
+
+    @dlt.resource(
+        name="user",
+        primary_key="id",
+        write_disposition="merge",
+    )
+    def user() -> Iterable[dict]:
+        data = client.get("/user")
+        yield data["user"]
+
+    @dlt.resource(name="teams", primary_key="id", write_disposition="merge")
+    def teams() -> Iterable[dict]:
+        for team in client.get_teams():
+            yield team
+
+    @dlt.resource(name="spaces", primary_key="id", write_disposition="merge")
+    def spaces() -> Iterable[dict]:
+        for space in client.get_spaces():
+            yield space
+
+    @dlt.resource(name="lists", write_disposition="merge", primary_key="id")
+    def lists() -> Iterable[dict]:
+        for list in client.get_lists():
+            yield list
+
+    @dlt.resource(
+        name="tasks",
+        write_disposition="merge",
+        primary_key="id",
+        columns={"date_updated": {"data_type": "timestamp"}},
+    )
+    def tasks(
+        date_updated: dlt.sources.incremental[str] = dlt.sources.incremental(
+            "date_updated",
+            initial_value=ensure_pendulum_datetime(start_date).in_timezone("UTC"),
+            range_end="closed",
+            range_start="closed",
+        ),
+    ) -> Iterable[dict]:
+        if date_updated.last_value:
+            start = ensure_pendulum_datetime(date_updated.last_value).in_timezone("UTC")
+        else:
+            start = ensure_pendulum_datetime(start_date).in_timezone("UTC")
+
+        if date_updated.end_value is None:
+            end = pendulum.now("UTC")
+        else:
+            end = date_updated.end_value.in_timezone("UTC")
+
+        for list_obj in client.get_lists():
+            for task in client.paginated(
+                f"/list/{list_obj['id']}/task", "tasks", {"page_size": 100}
+            ):
+                task_dt = ensure_pendulum_datetime(int(task["date_updated"]) / 1000)
+                if task_dt >= start and task_dt <= end:
+                    task["date_updated"] = task_dt
+                    yield task
+
+    return (
+        user,
+        teams,
+        spaces,
+        lists,
+        tasks,
+    )
ingestr/src/clickup/helpers.py
ADDED
@@ -0,0 +1,47 @@
+from typing import Iterable, Optional
+
+from ..http_client import create_client
+
+
+class ClickupClient:
+    def __init__(self, api_token: str):
+        self.session = create_client()
+        self.base_url = "https://api.clickup.com/api/v2"
+        self.headers = {"Authorization": api_token}
+
+    def get(self, endpoint: str, params: Optional[dict] = None) -> dict:
+        url = f"{self.base_url}{endpoint}"
+        resp = self.session.get(url, headers=self.headers, params=params or {})
+        resp.raise_for_status()
+        return resp.json()
+
+    def paginated(
+        self, endpoint: str, key: str, params: Optional[dict] = None
+    ) -> Iterable[dict]:
+        page = 0
+        params = params or {}
+        while True:
+            params["page"] = page
+            data = self.get(endpoint, params)
+            items = data.get(key, data)
+            if not items:
+                break
+            for item in items:
+                yield item
+            if data.get("last_page") or len(items) < params.get("page_size", 100):
+                break
+            page += 1
+
+    def get_teams(self):
+        data = self.get("/team")
+        return data.get("teams", [])
+
+    def get_spaces(self):
+        for team in self.get_teams():
+            for space in self.paginated(f"/team/{team['id']}/space", "spaces"):
+                yield space
+
+    def get_lists(self):
+        for space in self.get_spaces():
+            for lst in self.paginated(f"/space/{space['id']}/list", "lists"):
+                yield lst
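A short usage sketch of the new ClickUp source with dlt. The pipeline name, destination, dataset name, and token below are illustrative placeholders, not part of the diff.

import dlt
import pendulum

from ingestr.src.clickup import clickup_source

pipeline = dlt.pipeline(
    pipeline_name="clickup",      # illustrative
    destination="duckdb",
    dataset_name="clickup_raw",
)

# `tasks` loads incrementally on date_updated starting from start_date;
# user/teams/spaces/lists are merged on their primary keys.
info = pipeline.run(
    clickup_source(
        api_token="pk_...",        # placeholder token
        start_date=pendulum.datetime(2024, 1, 1),
    )
)
print(info)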
ingestr/src/collector/spinner.py
ADDED
@@ -0,0 +1,43 @@
+from typing import Optional
+
+from dlt.common.runtime.collector import Collector
+from rich.status import Status
+
+
+class SpinnerCollector(Collector):
+    status: Status
+    current_step: str
+    started: bool
+
+    def __init__(self) -> None:
+        self.status = Status("Ingesting data...", spinner="dots")
+        self.started = False
+
+    def update(
+        self,
+        name: str,
+        inc: int = 1,
+        total: Optional[int] = None,
+        message: Optional[str] = None,  # type: ignore
+        label: str = "",
+        **kwargs,
+    ) -> None:
+        self.status.update(self.current_step)
+
+    def _start(self, step: str) -> None:
+        self.current_step = self.__step_to_label(step)
+        self.status.start()
+
+    def __step_to_label(self, step: str) -> str:
+        verb = step.split(" ")[0].lower()
+        if verb.startswith("normalize"):
+            return "Normalizing the data"
+        elif verb.startswith("load"):
+            return "Loading the data to the destination"
+        elif verb.startswith("extract"):
+            return "Extracting the data from the source"
+
+        return f"{verb.capitalize()} the data"
+
+    def _stop(self) -> None:
+        self.status.stop()
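SpinnerCollector is a dlt progress Collector, so it can be handed straight to a pipeline. A minimal sketch, assuming dlt's standard `progress` argument; the pipeline name and destination are illustrative.

import dlt

from ingestr.src.collector.spinner import SpinnerCollector

# Replaces dlt's default progress output with a rich spinner whose label
# tracks the current step (extract / normalize / load).
pipeline = dlt.pipeline(
    pipeline_name="ingest",        # illustrative
    destination="duckdb",
    progress=SpinnerCollector(),
)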
ingestr/src/couchbase_source/__init__.py
ADDED
@@ -0,0 +1,118 @@
+"""Source that loads data from Couchbase buckets, supports incremental loads."""
+
+from typing import Optional
+
+import dlt
+from dlt.sources import DltResource
+
+from .helpers import (
+    CouchbaseConfiguration,
+    client_from_credentials,
+    fetch_documents,
+)
+
+
+@dlt.source(max_table_nesting=0)
+def couchbase_source(
+    connection_string: str = dlt.secrets.value,
+    username: str = dlt.secrets.value,
+    password: str = dlt.secrets.value,
+    bucket: str = dlt.config.value,
+    scope: Optional[str] = dlt.config.value,
+    collection: Optional[str] = dlt.config.value,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    write_disposition: Optional[str] = dlt.config.value,
+    limit: Optional[int] = None,
+) -> DltResource:
+    """
+    A DLT source which loads data from a Couchbase bucket using Couchbase Python SDK.
+
+    Args:
+        connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+        username (str): Couchbase username
+        password (str): Couchbase password
+        bucket (str): Bucket name to load data from
+        scope (Optional[str]): Scope name (defaults to '_default')
+        collection (Optional[str]): Collection name (defaults to '_default')
+        incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+            E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
+        write_disposition (str): Write disposition of the resource.
+        limit (Optional[int]): The maximum number of documents to load.
+
+    Returns:
+        DltResource: A DLT resource for the Couchbase collection.
+    """
+    # Set up Couchbase client
+    cluster = client_from_credentials(connection_string, username, password)
+
+    resource_name = f"{bucket}_{scope}_{collection}"
+
+    return dlt.resource(  # type: ignore[call-overload, arg-type]
+        fetch_documents,
+        name=resource_name,
+        primary_key="id",
+        write_disposition=write_disposition or "replace",
+        spec=CouchbaseConfiguration,
+        max_table_nesting=0,
+    )(
+        cluster=cluster,
+        bucket_name=bucket,
+        scope_name=scope,
+        collection_name=collection,
+        incremental=incremental,
+        limit=limit,
+    )
+
+
+@dlt.resource(
+    name=lambda args: f"{args['bucket']}_{args['scope']}_{args['collection']}",
+    standalone=True,
+    spec=CouchbaseConfiguration,  # type: ignore[arg-type]
+)
+def couchbase_collection(
+    connection_string: str = dlt.secrets.value,
+    username: str = dlt.secrets.value,
+    password: str = dlt.secrets.value,
+    bucket: str = dlt.config.value,
+    scope: Optional[str] = dlt.config.value,
+    collection: Optional[str] = dlt.config.value,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    write_disposition: Optional[str] = dlt.config.value,
+    limit: Optional[int] = None,
+    chunk_size: Optional[int] = 1000,
+) -> DltResource:
+    """
+    A DLT resource which loads a collection from Couchbase.
+
+    Args:
+        connection_string (str): Couchbase connection string (e.g., 'couchbase://localhost')
+        username (str): Couchbase username
+        password (str): Couchbase password
+        bucket (str): Bucket name to load data from
+        scope (Optional[str]): Scope name (defaults to '_default')
+        collection (Optional[str]): Collection name (defaults to '_default')
+        incremental (Optional[dlt.sources.incremental]): Option to enable incremental loading.
+        write_disposition (str): Write disposition of the resource.
+        limit (Optional[int]): The maximum number of documents to load.
+        chunk_size (Optional[int]): The number of documents to load in each batch.
+
+    Returns:
+        DltResource: A DLT resource for the Couchbase collection.
+    """
+    # Set up Couchbase client
+    cluster = client_from_credentials(connection_string, username, password)
+
+    return dlt.resource(  # type: ignore[call-overload]
+        fetch_documents,
+        name=f"{bucket}_{scope}_{collection}",
+        primary_key="id",
+        write_disposition=write_disposition or "replace",
+    )(
+        cluster=cluster,
+        bucket_name=bucket,
+        scope_name=scope,
+        collection_name=collection,
+        incremental=incremental,
+        limit=limit,
+        chunk_size=chunk_size,
+    )
ingestr/src/couchbase_source/helpers.py
ADDED
@@ -0,0 +1,135 @@
+"""Helper functions for Couchbase source."""
+
+from datetime import datetime, timedelta
+from typing import Any, Dict, Iterator, Optional
+
+import dlt
+from couchbase.auth import PasswordAuthenticator  # type: ignore[import-untyped]
+from couchbase.cluster import Cluster  # type: ignore[import-untyped]
+from couchbase.options import (  # type: ignore[import-untyped]
+    ClusterOptions,
+    QueryOptions,
+)
+from dlt.common.configuration import configspec
+from dlt.common.time import ensure_pendulum_datetime
+
+
+@configspec
+class CouchbaseConfiguration:
+    """Configuration for Couchbase source."""
+
+    connection_string: str = dlt.secrets.value
+    username: str = dlt.secrets.value
+    password: str = dlt.secrets.value
+    bucket: str = dlt.config.value
+    scope: Optional[str] = dlt.config.value
+    collection: Optional[str] = dlt.config.value
+
+
+def client_from_credentials(
+    connection_string: str, username: str, password: str
+) -> Cluster:
+    """
+    Create a Couchbase cluster client from credentials.
+
+    Args:
+        connection_string: Couchbase connection string
+            - Local/self-hosted: 'couchbase://localhost'
+            - Capella (cloud): 'couchbases://your-instance.cloud.couchbase.com'
+        username: Couchbase username
+        password: Couchbase password
+
+    Returns:
+        Cluster: Connected Couchbase cluster instance
+    """
+    auth = PasswordAuthenticator(username, password)
+    options = ClusterOptions(auth)
+
+    # Apply wan_development profile for Capella (couchbases://) connections
+    # This helps avoid latency issues when accessing from different networks
+    if connection_string.startswith("couchbases://"):
+        options.apply_profile("wan_development")
+
+    cluster = Cluster(connection_string, options)
+    cluster.wait_until_ready(timedelta(seconds=30))
+
+    return cluster
+
+
+def fetch_documents(
+    cluster: Cluster,
+    bucket_name: str,
+    scope_name: str,
+    collection_name: str,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    limit: Optional[int] = None,
+    chunk_size: Optional[int] = 1000,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch documents from a Couchbase collection using N1QL queries.
+
+    Args:
+        cluster: Couchbase cluster instance
+        bucket_name: Name of the bucket
+        scope_name: Name of the scope
+        collection_name: Name of the collection
+        incremental: Incremental loading configuration
+        limit: Maximum number of documents to fetch
+        chunk_size: Number of documents to fetch per batch
+
+    Yields:
+        Dict[str, Any]: Document data
+    """
+    # Build N1QL query with full path
+    full_collection_path = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"
+    n1ql_query = f"SELECT META().id as id, c.* FROM {full_collection_path} c"
+
+    # Add incremental filter if provided
+    if incremental and incremental.cursor_path:
+        where_clause = f" WHERE {incremental.cursor_path} >= $start_value"
+        if incremental.end_value is not None:
+            where_clause += f" AND {incremental.cursor_path} < $end_value"
+        n1ql_query += where_clause
+
+    # Add limit if provided
+    if limit:
+        n1ql_query += f" LIMIT {limit}"
+
+    # Execute query
+    try:
+        query_options = QueryOptions()
+
+        # Add parameters if incremental
+        if incremental and incremental.cursor_path:
+            named_parameters = {"start_value": incremental.last_value}
+            if incremental.end_value is not None:
+                named_parameters["end_value"] = incremental.end_value
+            query_options = QueryOptions(named_parameters=named_parameters)
+
+        result = cluster.query(n1ql_query, query_options)
+
+        # Yield documents
+        count = 0
+        for row in result:
+            doc = dict(row)
+
+            # Convert datetime fields to proper format
+            if (
+                incremental
+                and incremental.cursor_path
+                and incremental.cursor_path in doc
+            ):
+                cursor_value = doc[incremental.cursor_path]
+                if isinstance(cursor_value, (str, datetime)):
+                    doc[incremental.cursor_path] = ensure_pendulum_datetime(
+                        cursor_value
+                    )
+
+            yield doc
+
+            count += 1
+            if limit and count >= limit:
+                break
+
+    except Exception as e:
+        raise Exception(f"Error executing Couchbase N1QL query: {str(e)}")
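A sketch of incremental loading with the standalone Couchbase resource. The connection details, bucket layout, and cursor field below are placeholders; the `incremental` argument is pushed down into the N1QL WHERE clause built in fetch_documents above.

import dlt
import pendulum

from ingestr.src.couchbase_source import couchbase_collection

hotels = couchbase_collection(
    connection_string="couchbases://example.cloud.couchbase.com",  # placeholder
    username="ingest",
    password="...",
    bucket="travel-sample",
    scope="inventory",
    collection="hotel",
    # Becomes "WHERE updated_at >= $start_value" in the generated query.
    incremental=dlt.sources.incremental(
        "updated_at", initial_value=pendulum.datetime(2024, 1, 1)
    ),
    write_disposition="merge",
)

pipeline = dlt.pipeline(pipeline_name="couchbase", destination="duckdb")
pipeline.run(hotels)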
ingestr/src/cursor/__init__.py
ADDED
@@ -0,0 +1,83 @@
+"""
+This source provides data extraction from Cursor via the REST API.
+
+It fetches team member information from the Cursor API.
+"""
+
+from typing import Any, Iterable, Optional
+
+import dlt
+from dlt.common.typing import TDataItem
+
+from .helpers import CursorClient
+
+
+@dlt.source
+def cursor_source() -> Any:
+    """
+    The main function that fetches data from Cursor API.
+
+    Returns:
+        Sequence[DltResource]: A sequence of DltResource objects containing the fetched data.
+    """
+    return [
+        team_members,
+        daily_usage_data,
+        team_spend,
+        filtered_usage_events,
+    ]
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def team_members(
+    api_key: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    members = client.get_team_members()
+    yield from members
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def daily_usage_data(
+    api_key: str = dlt.secrets.value,
+    start_date: Optional[int] = dlt.config.value,
+    end_date: Optional[int] = dlt.config.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_daily_usage_data(start_date=start_date, end_date=end_date)
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def team_spend(
+    api_key: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_team_spend()
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def filtered_usage_events(
+    api_key: str = dlt.secrets.value,
+    start_date: Optional[int] = dlt.config.value,
+    end_date: Optional[int] = dlt.config.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_filtered_usage_events(
+        start_date=start_date, end_date=end_date
+    )
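And a sketch of running the Cursor source. The api_key is expected to come from dlt secrets (for example secrets.toml) rather than being passed in code, and the resource selection here is illustrative.

import dlt

from ingestr.src.cursor import cursor_source

# All four resources are full refresh (write_disposition="replace").
pipeline = dlt.pipeline(pipeline_name="cursor", destination="duckdb")
pipeline.run(cursor_source().with_resources("team_members", "team_spend"))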