ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr might be problematic.
- ingestr/main.py +22 -3
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +169 -1
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +26 -23
- ingestr/src/facebook_ads/helpers.py +47 -1
- ingestr/src/factory.py +48 -0
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +9 -0
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -163
- ingestr/src/frankfurter/helpers.py +3 -3
- ingestr/src/freshdesk/__init__.py +25 -8
- ingestr/src/freshdesk/freshdesk_client.py +40 -5
- ingestr/src/fundraiseup/__init__.py +49 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +6 -4
- ingestr/src/google_analytics/__init__.py +1 -1
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/hubspot/__init__.py +6 -12
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/klaviyo/__init__.py +5 -5
- ingestr/src/linear/__init__.py +553 -116
- ingestr/src/linear/helpers.py +77 -38
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +5 -2
- ingestr/src/mongodb/helpers.py +384 -10
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -1
- ingestr/src/smartsheets/__init__.py +33 -5
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/sources.py +1418 -54
- ingestr/src/stripe_analytics/__init__.py +2 -19
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/tests/unit/test_smartsheets.py +6 -9
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/couchbase_source/helpers.py
ADDED
@@ -0,0 +1,135 @@
+"""Helper functions for Couchbase source."""
+
+from datetime import datetime, timedelta
+from typing import Any, Dict, Iterator, Optional
+
+import dlt
+from couchbase.auth import PasswordAuthenticator  # type: ignore[import-untyped]
+from couchbase.cluster import Cluster  # type: ignore[import-untyped]
+from couchbase.options import (  # type: ignore[import-untyped]
+    ClusterOptions,
+    QueryOptions,
+)
+from dlt.common.configuration import configspec
+from dlt.common.time import ensure_pendulum_datetime
+
+
+@configspec
+class CouchbaseConfiguration:
+    """Configuration for Couchbase source."""
+
+    connection_string: str = dlt.secrets.value
+    username: str = dlt.secrets.value
+    password: str = dlt.secrets.value
+    bucket: str = dlt.config.value
+    scope: Optional[str] = dlt.config.value
+    collection: Optional[str] = dlt.config.value
+
+
+def client_from_credentials(
+    connection_string: str, username: str, password: str
+) -> Cluster:
+    """
+    Create a Couchbase cluster client from credentials.
+
+    Args:
+        connection_string: Couchbase connection string
+            - Local/self-hosted: 'couchbase://localhost'
+            - Capella (cloud): 'couchbases://your-instance.cloud.couchbase.com'
+        username: Couchbase username
+        password: Couchbase password
+
+    Returns:
+        Cluster: Connected Couchbase cluster instance
+    """
+    auth = PasswordAuthenticator(username, password)
+    options = ClusterOptions(auth)
+
+    # Apply wan_development profile for Capella (couchbases://) connections
+    # This helps avoid latency issues when accessing from different networks
+    if connection_string.startswith("couchbases://"):
+        options.apply_profile("wan_development")
+
+    cluster = Cluster(connection_string, options)
+    cluster.wait_until_ready(timedelta(seconds=30))
+
+    return cluster
+
+
+def fetch_documents(
+    cluster: Cluster,
+    bucket_name: str,
+    scope_name: str,
+    collection_name: str,
+    incremental: Optional[dlt.sources.incremental] = None,  # type: ignore[type-arg]
+    limit: Optional[int] = None,
+    chunk_size: Optional[int] = 1000,
+) -> Iterator[Dict[str, Any]]:
+    """
+    Fetch documents from a Couchbase collection using N1QL queries.
+
+    Args:
+        cluster: Couchbase cluster instance
+        bucket_name: Name of the bucket
+        scope_name: Name of the scope
+        collection_name: Name of the collection
+        incremental: Incremental loading configuration
+        limit: Maximum number of documents to fetch
+        chunk_size: Number of documents to fetch per batch
+
+    Yields:
+        Dict[str, Any]: Document data
+    """
+    # Build N1QL query with full path
+    full_collection_path = f"`{bucket_name}`.`{scope_name}`.`{collection_name}`"
+    n1ql_query = f"SELECT META().id as id, c.* FROM {full_collection_path} c"
+
+    # Add incremental filter if provided
+    if incremental and incremental.cursor_path:
+        where_clause = f" WHERE {incremental.cursor_path} >= $start_value"
+        if incremental.end_value is not None:
+            where_clause += f" AND {incremental.cursor_path} < $end_value"
+        n1ql_query += where_clause
+
+    # Add limit if provided
+    if limit:
+        n1ql_query += f" LIMIT {limit}"
+
+    # Execute query
+    try:
+        query_options = QueryOptions()
+
+        # Add parameters if incremental
+        if incremental and incremental.cursor_path:
+            named_parameters = {"start_value": incremental.last_value}
+            if incremental.end_value is not None:
+                named_parameters["end_value"] = incremental.end_value
+            query_options = QueryOptions(named_parameters=named_parameters)
+
+        result = cluster.query(n1ql_query, query_options)
+
+        # Yield documents
+        count = 0
+        for row in result:
+            doc = dict(row)
+
+            # Convert datetime fields to proper format
+            if (
+                incremental
+                and incremental.cursor_path
+                and incremental.cursor_path in doc
+            ):
+                cursor_value = doc[incremental.cursor_path]
+                if isinstance(cursor_value, (str, datetime)):
+                    doc[incremental.cursor_path] = ensure_pendulum_datetime(
+                        cursor_value
+                    )
+
+            yield doc
+
+            count += 1
+            if limit and count >= limit:
+                break
+
+    except Exception as e:
+        raise Exception(f"Error executing Couchbase N1QL query: {str(e)}")
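
These helpers are the building blocks the new Couchbase source uses to connect and stream documents. The sketch below shows how they might be wired into a standalone dlt resource outside of ingestr; the connection string, credentials, bucket/scope/collection names, and the "updated_at" cursor field are illustrative assumptions, not values taken from this release.

# A minimal sketch of using the new Couchbase helpers directly with dlt.
# All connection values and the "updated_at" cursor field are assumptions.
import dlt

from ingestr.src.couchbase_source.helpers import client_from_credentials, fetch_documents


@dlt.resource(name="orders", write_disposition="merge", primary_key="id")
def couchbase_orders(
    updated_at=dlt.sources.incremental("updated_at", initial_value="2024-01-01T00:00:00Z"),
):
    # client_from_credentials applies the wan_development profile for couchbases:// URIs
    cluster = client_from_credentials(
        "couchbase://localhost", "Administrator", "password"  # hypothetical credentials
    )
    # fetch_documents builds a parametrized N1QL query and filters on the cursor column
    yield from fetch_documents(
        cluster,
        bucket_name="travel-sample",   # hypothetical bucket
        scope_name="inventory",        # hypothetical scope
        collection_name="orders",      # hypothetical collection
        incremental=updated_at,
    )


if __name__ == "__main__":
    pipeline = dlt.pipeline(pipeline_name="couchbase_demo", destination="duckdb")
    print(pipeline.run(couchbase_orders()))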
ingestr/src/cursor/__init__.py
ADDED
@@ -0,0 +1,83 @@
+"""
+This source provides data extraction from Cursor via the REST API.
+
+It fetches team member information from the Cursor API.
+"""
+
+from typing import Any, Iterable, Optional
+
+import dlt
+from dlt.common.typing import TDataItem
+
+from .helpers import CursorClient
+
+
+@dlt.source
+def cursor_source() -> Any:
+    """
+    The main function that fetches data from Cursor API.
+
+    Returns:
+        Sequence[DltResource]: A sequence of DltResource objects containing the fetched data.
+    """
+    return [
+        team_members,
+        daily_usage_data,
+        team_spend,
+        filtered_usage_events,
+    ]
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def team_members(
+    api_key: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    members = client.get_team_members()
+    yield from members
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def daily_usage_data(
+    api_key: str = dlt.secrets.value,
+    start_date: Optional[int] = dlt.config.value,
+    end_date: Optional[int] = dlt.config.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_daily_usage_data(start_date=start_date, end_date=end_date)
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def team_spend(
+    api_key: str = dlt.secrets.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_team_spend()
+
+
+@dlt.resource(
+    write_disposition="replace",
+    max_table_nesting=0,
+)
+def filtered_usage_events(
+    api_key: str = dlt.secrets.value,
+    start_date: Optional[int] = dlt.config.value,
+    end_date: Optional[int] = dlt.config.value,
+) -> Iterable[TDataItem]:
+    client = CursorClient(api_key=api_key)
+
+    yield from client.get_filtered_usage_events(
+        start_date=start_date, end_date=end_date
+    )
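
The new Cursor source exposes four replace-disposition resources. A minimal sketch of running it through dlt directly, assuming the API key is resolvable through dlt's secrets (for example via secrets.toml); ingestr normally wires this from its CLI instead.

# Sketch only: run the new Cursor source with a local duckdb pipeline.
import dlt

from ingestr.src.cursor import cursor_source

pipeline = dlt.pipeline(
    pipeline_name="cursor_demo", destination="duckdb", dataset_name="cursor"
)

# Load only team members and spend; the other resources work the same way.
info = pipeline.run(cursor_source().with_resources("team_members", "team_spend"))
print(info)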
ingestr/src/cursor/helpers.py
ADDED
@@ -0,0 +1,188 @@
+"""Cursor source helpers"""
+
+from typing import Any, Callable, Dict, Iterator, List, Optional
+
+import requests
+
+REQUEST_TIMEOUT = 30
+
+
+class CursorClient:
+    """Cursor REST API client with API key authentication."""
+
+    def __init__(
+        self,
+        api_key: str,
+        base_url: str = "https://api.cursor.com",
+        timeout: int = REQUEST_TIMEOUT,
+    ):
+        """
+        Initialize Cursor client with API key authentication.
+
+        Args:
+            api_key: API key for authentication
+            base_url: Cursor API base URL
+            timeout: Request timeout in seconds
+        """
+        self.base_url = base_url.rstrip("/")
+        self.timeout = timeout
+        self.api_key = api_key
+
+    def _make_request(
+        self,
+        endpoint: str,
+        method: str = "POST",
+        json_data: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Make HTTP request to Cursor API.
+
+        Args:
+            endpoint: API endpoint path
+            method: HTTP method (default: POST)
+            json_data: JSON data for request body
+
+        Returns:
+            JSON response data
+        """
+        url = f"{self.base_url}/{endpoint.lstrip('/')}"
+
+        if json_data is not None:
+            response = requests.request(
+                method=method,
+                url=url,
+                auth=(self.api_key, ""),
+                timeout=self.timeout,
+                headers={"Content-Type": "application/json"},
+                json=json_data,
+            )
+        else:
+            response = requests.request(
+                method=method,
+                url=url,
+                auth=(self.api_key, ""),
+                timeout=self.timeout,
+                headers={"Content-Type": "application/json"},
+                json={},
+            )
+
+        response.raise_for_status()
+        return response.json()
+
+    def _paginate(
+        self,
+        endpoint: str,
+        data_key: str,
+        base_payload: Optional[Dict[str, Any]] = None,
+        page_size: Optional[int] = 100,
+        has_next_page_check: Optional[Callable[[Dict[str, Any]], bool]] = None,
+    ) -> Iterator[Dict[str, Any]]:
+        """
+        Generic pagination helper for API endpoints.
+
+        Args:
+            endpoint: API endpoint to call
+            data_key: Key in response containing the data array
+            base_payload: Base payload to include in each request
+            page_size: Number of results per page (default: 100)
+            has_next_page_check: Optional function to check if there's a next page from response
+
+        Yields:
+            Individual records from the paginated response
+        """
+        page = 1
+        base_payload = base_payload or {}
+
+        while True:
+            payload = base_payload.copy()
+
+            if page_size:
+                payload["pageSize"] = page_size
+                payload["page"] = page
+
+            result = self._make_request(endpoint, json_data=payload)
+            data = result.get(data_key, [])
+
+            if not data:
+                break
+
+            for record in data:
+                yield record
+
+            # If page_size is not set, we get all data in one request
+            if not page_size:
+                break
+
+            # Custom check for next page if provided
+            if has_next_page_check:
+                if not has_next_page_check(result):
+                    break
+            # Default: if we got less data than page_size, we've reached the end
+            elif len(data) < page_size:
+                break
+
+            page += 1
+
+    def get_team_members(self) -> List[Dict[str, Any]]:
+        response = self._make_request("teams/members", method="GET")
+        return response.get("teamMembers", [])
+
+    def get_daily_usage_data(
+        self,
+        start_date: Optional[int] = None,
+        end_date: Optional[int] = None,
+        page_size: Optional[int] = 100,
+    ) -> Iterator[Dict[str, Any]]:
+        base_payload = {}
+        if start_date is not None:
+            base_payload["startDate"] = start_date
+        if end_date is not None:
+            base_payload["endDate"] = end_date
+
+        yield from self._paginate(
+            endpoint="teams/daily-usage-data",
+            data_key="data",
+            base_payload=base_payload,
+            page_size=page_size,
+        )
+
+    def get_team_spend(
+        self,
+        page_size: Optional[int] = 100,
+    ) -> Iterator[Dict[str, Any]]:
+        def check_has_next_page(response: Dict[str, Any]) -> bool:
+            current_page = response.get("currentPage", 1)
+            total_pages = response.get("totalPages", 1)
+            return current_page < total_pages
+
+        yield from self._paginate(
+            endpoint="teams/spend",
+            data_key="teamMemberSpend",
+            page_size=page_size,
+            has_next_page_check=check_has_next_page,
+        )
+
+    def get_filtered_usage_events(
+        self,
+        start_date: Optional[int] = None,
+        end_date: Optional[int] = None,
+        page_size: Optional[int] = 100,
+    ) -> Iterator[Dict[str, Any]]:
+        base_payload = {}
+        if start_date is not None:
+            base_payload["startDate"] = start_date
+        if end_date is not None:
+            base_payload["endDate"] = end_date
+
+        # Custom check for hasNextPage
+        def check_has_next_page(response: Dict[str, Any]) -> bool:
+            pagination = response.get("pagination", {})
+            return pagination.get("hasNextPage", False)
+
+        yield from self._paginate(
+            endpoint="teams/filtered-usage-events",
+            data_key="usageEvents",
+            base_payload=base_payload,
+            page_size=page_size,
+            has_next_page_check=check_has_next_page,
+        )
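
The client can also be exercised on its own: every list endpoint goes through _paginate, which stops either when a page comes back shorter than page_size or when the endpoint-specific has_next_page_check reports no further pages. A small sketch follows; treating start_date/end_date as epoch-millisecond timestamps and the printed field names are assumptions, not facts from this release.

# Sketch only: iterate Cursor usage events for a fixed window.
# The epoch-millisecond interpretation and the field names are assumptions.
from ingestr.src.cursor.helpers import CursorClient

client = CursorClient(api_key="key_...")  # hypothetical API key

start_ms = 1_735_689_600_000  # 2025-01-01T00:00:00Z
end_ms = 1_738_368_000_000    # 2025-02-01T00:00:00Z

for event in client.get_filtered_usage_events(start_date=start_ms, end_date=end_ms):
    print(event)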
ingestr/src/destinations.py
CHANGED
@@ -19,12 +19,26 @@ from dlt.destinations.impl.clickhouse.configuration import (
     ClickHouseCredentials,
 )

+from ingestr.src.elasticsearch.helpers import elasticsearch_insert
 from ingestr.src.errors import MissingValueError
 from ingestr.src.loader import load_dlt_file
+from ingestr.src.mongodb.helpers import mongodb_insert


 class GenericSqlDestination:
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        if uri.startswith("databricks://"):
+            p = urlparse(uri)
+            q = parse_qs(p.query)
+            schema = q.get("schema", [None])[0]
+            if not schema:
+                raise ValueError("Databricks requires schema in the URI.")
+            res = {
+                "dataset_name": schema,
+                "table_name": table,
+            }
+            return res
+
         table_fields = table.split(".")
         if len(table_fields) != 2:
             raise ValueError("Table name must be in the format <schema>.<table>")
@@ -147,6 +161,24 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)


+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
 def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
     # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
     tup = struct.unpack(
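
The MotherDuck destination pulls the service token from a token query parameter and treats the URI path, if present, as the database name. A small sketch of that decomposition with a made-up URI; the md:// scheme and the token value are assumptions, and ingestr's accepted scheme may differ.

# Hypothetical MotherDuck-style URI; only the token/path extraction is shown.
from urllib.parse import parse_qs, urlparse

uri = "md:///my_db?token=abc123"  # scheme and values are assumptions

parsed = urlparse(uri)
token = parse_qs(parsed.query).get("token", [None])[0]
database = parsed.path.lstrip("/") or None

print(token)     # abc123 -> MotherDuckCredentials "password"
print(database)  # my_db  -> MotherDuckCredentials "database"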
@@ -252,7 +284,26 @@ class MsSQLDestination(GenericSqlDestination):

 class DatabricksDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-
+        p = urlparse(uri)
+        q = parse_qs(p.query)
+        access_token = p.password
+        server_hostname = p.hostname
+        http_path = q.get("http_path", [None])[0]
+        catalog = q.get("catalog", [None])[0]
+        schema = q.get("schema", [None])[0]
+
+        creds = {
+            "access_token": access_token,
+            "server_hostname": server_hostname,
+            "http_path": http_path,
+            "catalog": catalog,
+            "schema": schema,
+        }
+
+        return dlt.destinations.databricks(
+            credentials=creds,
+            **kwargs,
+        )


 class SynapseDestination(GenericSqlDestination):
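
Both Databricks changes key off the same URI shape: the access token rides in the password slot, the workspace host in the hostname, and http_path, catalog, and schema arrive as query parameters (with schema doubling as the dataset name in dlt_run_params). An illustration of how such a URI decomposes; the URI itself is hypothetical.

# Illustration of the URI shape the Databricks destination now parses.
# The URI below is hypothetical; only its structure matters.
from urllib.parse import parse_qs, urlparse

uri = (
    "databricks://:dapiXXXX@adb-1234567890123456.7.azuredatabricks.net"
    "?http_path=/sql/1.0/warehouses/abc123&catalog=main&schema=analytics"
)

p = urlparse(uri)
q = parse_qs(p.query)

print(p.password)                     # dapiXXXX     -> access_token
print(p.hostname)                     # adb-...net   -> server_hostname
print(q.get("http_path", [None])[0])  # /sql/1.0/... -> http_path
print(q.get("catalog", [None])[0])    # main         -> catalog
print(q.get("schema", [None])[0])     # analytics    -> schema / dataset_name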
@@ -538,6 +589,76 @@ class MySqlDestination(GenericSqlDestination):
     }


+class TrinoTypeMapper:
+    """Custom type mapper for Trino to handle unsupported types."""
+
+    @staticmethod
+    def create_type_mapper():
+        """Create a custom type mapper for Trino."""
+        from dlt.destinations.impl.sqlalchemy.type_mapper import SqlalchemyTypeMapper
+        from sqlalchemy import BigInteger, Text
+        from sqlalchemy.sql import sqltypes
+
+        class CustomTrinoTypeMapper(SqlalchemyTypeMapper):
+            """Custom type mapper that converts unsupported Trino types."""
+
+            def to_destination_type(self, column, table=None):
+                # Handle special cases before calling parent
+                data_type = column.get("data_type", "")
+
+                # Convert JSON to VARCHAR for Trino's Iceberg catalog
+                if data_type == "json":
+                    # Use TEXT (unlimited VARCHAR) for JSON data
+                    return Text()
+
+                # Convert BINARY to VARCHAR
+                if data_type == "binary":
+                    return Text()
+
+                # Handle integer types - always use BIGINT for Trino
+                # Note: dlt uses "bigint" internally, not "integer"
+                if data_type in ["bigint", "integer", "int"]:
+                    return BigInteger()
+
+                # For other types, try parent mapper
+                try:
+                    type_ = super().to_destination_type(column, table)
+                except Exception:
+                    # If parent can't handle it, default to TEXT
+                    return Text()
+
+                # Convert any INTEGER type to BIGINT
+                if isinstance(type_, sqltypes.Integer) and not isinstance(
+                    type_, sqltypes.BigInteger
+                ):
+                    return BigInteger()
+
+                # Ensure VARCHAR types don't have constraints that Trino doesn't support
+                if isinstance(type_, sqltypes.String):
+                    # Return TEXT for unlimited string
+                    return Text()
+
+                return type_
+
+        return CustomTrinoTypeMapper
+
+
+class TrinoDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        # Import required modules
+        from dlt.destinations.impl.sqlalchemy.factory import (
+            sqlalchemy as sqlalchemy_factory,
+        )
+
+        # Create the destination with custom type mapper
+        # We need to use the factory to properly configure the type mapper
+        dest = sqlalchemy_factory(
+            credentials=uri, type_mapper=TrinoTypeMapper.create_type_mapper(), **kwargs
+        )
+
+        return dest
+
+
 class BlobStorageDestination(abc.ABC):
     @abc.abstractmethod
     def credentials(self, params: dict) -> FileSystemCredentials:
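
The effect of the Trino mapper is easiest to see as a plain mapping from dlt's logical type names to the SQLAlchemy types Trino will accept: json and binary columns collapse to unbounded TEXT, and every integer width is widened to BIGINT. A standalone illustration of those rules follows; it mirrors the conversions above but deliberately does not use dlt's mapper classes.

# Standalone illustration of the Trino conversion rules; not the dlt type mapper itself.
from sqlalchemy import BigInteger, Text


def trino_friendly_type(data_type: str):
    """Map a dlt logical data type name to a Trino-safe SQLAlchemy type."""
    if data_type in ("json", "binary"):
        return Text()          # store as unbounded VARCHAR in Trino's Iceberg catalog
    if data_type in ("bigint", "integer", "int"):
        return BigInteger()    # always widen integers to BIGINT
    return None                # everything else falls through to the default mapper


for t in ("json", "binary", "bigint", "timestamp"):
    print(t, "->", trino_friendly_type(t))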
@@ -654,3 +775,50 @@ class GCSDestination(BlobStorageDestination):
         credentials = json.loads(base64.b64decode(credentials_base64[0]).decode())  # type: ignore

         return credentials
+
+
+class ElasticsearchDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import urlparse
+
+        parsed_uri = urlparse(uri)
+
+        # Extract connection details from URI
+        scheme = parsed_uri.scheme or "http"
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 9200
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        # Build connection string
+        if username and password:
+            connection_string = f"{scheme}://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"{scheme}://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+        return elasticsearch_insert(connection_string=connection_string)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
+
+
+class MongoDBDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        return mongodb_insert(uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass