ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/destinations.py
CHANGED
@@ -1,21 +1,44 @@
+import abc
 import base64
 import csv
+import datetime
 import json
 import os
 import shutil
+import struct
 import tempfile
 from urllib.parse import parse_qs, quote, urlparse

 import dlt
-import
+import dlt.destinations.impl.filesystem.filesystem
 from dlt.common.configuration.specs import AwsCredentials
+from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+from dlt.common.schema import Schema
+from dlt.common.storages.configuration import FileSystemCredentials
 from dlt.destinations.impl.clickhouse.configuration import (
     ClickHouseCredentials,
 )

+from ingestr.src.elasticsearch.helpers import elasticsearch_insert
+from ingestr.src.errors import MissingValueError
+from ingestr.src.loader import load_dlt_file
+from ingestr.src.mongodb.helpers import mongodb_insert
+

 class GenericSqlDestination:
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        if uri.startswith("databricks://"):
+            p = urlparse(uri)
+            q = parse_qs(p.query)
+            schema = q.get("schema", [None])[0]
+            if not schema:
+                raise ValueError("Databricks requires schema in the URI.")
+            res = {
+                "dataset_name": schema,
+                "table_name": table,
+            }
+            return res
+
         table_fields = table.split(".")
         if len(table_fields) != 2:
             raise ValueError("Table name must be in the format <schema>.<table>")
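Note: the new `databricks://` branch in `GenericSqlDestination.dlt_run_params` takes the dlt dataset name from the `schema` query parameter instead of requiring a `<schema>.<table>` table identifier. A minimal sketch of that parsing, with a made-up URI:

    from urllib.parse import parse_qs, urlparse

    # Hypothetical URI, for illustration only.
    uri = "databricks://token:dapi123@adb-1234.azuredatabricks.net?http_path=/sql/1.0/wh&schema=analytics"
    q = parse_qs(urlparse(uri).query)
    schema = q.get("schema", [None])[0]
    print(schema)  # "analytics" -> dlt_run_params returns {"dataset_name": "analytics", "table_name": table}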
@@ -59,9 +82,30 @@ class BigQueryDestination:
                 base64.b64decode(credentials_base64[0]).decode("utf-8")
             )

+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            if not staging_bucket.startswith("gs://"):
+                raise ValueError("Staging bucket must start with gs://")
+
+            os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = staging_bucket
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PROJECT_ID"] = (
+                credentials.get("project_id", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PRIVATE_KEY"] = (
+                credentials.get("private_key", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__CLIENT_EMAIL"] = (
+                credentials.get("client_email", None)
+            )
+
+        project_id = None
+        if source_fields.hostname:
+            project_id = source_fields.hostname
+
         return dlt.destinations.bigquery(
             credentials=credentials,  # type: ignore
-            location=location,
+            location=location,  # type: ignore
+            project_id=project_id,
             **kwargs,
         )

@@ -77,12 +121,24 @@ class BigQueryDestination:
             "table_name": table_fields[-1],
         }

+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            res["staging"] = "filesystem"
+
         return res

     def post_load(self):
         pass


+class CrateDBDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        uri = uri.replace("cratedb://", "postgres://")
+        import dlt_cratedb.impl.cratedb.factory
+
+        return dlt_cratedb.impl.cratedb.factory.cratedb(credentials=uri, **kwargs)
+
+
 class PostgresDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
         return dlt.destinations.postgres(credentials=uri, **kwargs)
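Note: the two BigQuery hunks above add optional GCS staging: `dlt_dest` validates the `gs://` bucket and exports the `DESTINATION__FILESYSTEM__*` variables from the service-account JSON, while `dlt_run_params` marks the run as filesystem-staged. A hedged usage sketch (the URI and bucket are placeholders, and the call shape is not verified against the full file):

    dest = BigQueryDestination()
    params = dest.dlt_run_params(
        uri="bigquery://my-project?credentials_path=/tmp/sa.json",  # placeholder URI
        table="raw.events",
        staging_bucket="gs://my-staging-bucket",
    )
    # params now carries "staging": "filesystem" alongside the dataset/table fields.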
@@ -105,14 +161,149 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)


+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
+def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
+    # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
+    tup = struct.unpack(
+        "<6hI2h", dto_value
+    )  # e.g., (2017, 3, 16, 10, 35, 18, 500000000, -6, 0)
+    return datetime.datetime(
+        tup[0],
+        tup[1],
+        tup[2],
+        tup[3],
+        tup[4],
+        tup[5],
+        tup[6] // 1000,
+        datetime.timezone(datetime.timedelta(hours=tup[7], minutes=tup[8])),
+    )
+
+
+# MSSQL_COPT_SS_ACCESS_TOKEN is a connection attribute used to pass
+# an Azure Active Directory access token to the SQL Server ODBC driver.
+MSSQL_COPT_SS_ACCESS_TOKEN = 1256
+
+
+def serialize_azure_token(token):
+    # https://github.com/mkleehammer/pyodbc/issues/228#issuecomment-494773723
+    encoded = token.encode("utf_16_le")
+    return struct.pack("<i", len(encoded)) + encoded
+
+
+def build_mssql_dest():
+    # https://github.com/bruin-data/ingestr/issues/293
+
+    from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration
+    from dlt.destinations.impl.mssql.mssql import (
+        HINT_TO_MSSQL_ATTR,
+        MsSqlJobClient,
+    )
+    from dlt.destinations.impl.mssql.sql_client import (
+        PyOdbcMsSqlClient,
+    )
+
+    class OdbcMsSqlClient(PyOdbcMsSqlClient):
+        SKIP_CREDENTIALS = {"PWD", "AUTHENTICATION", "UID"}
+
+        def open_connection(self):
+            cfg = self.credentials._get_odbc_dsn_dict()
+            if (
+                cfg.get("AUTHENTICATION", "").strip().lower()
+                != "activedirectoryaccesstoken"
+            ):
+                return super().open_connection()
+
+            import pyodbc  # type: ignore
+
+            dsn = ";".join(
+                [f"{k}={v}" for k, v in cfg.items() if k not in self.SKIP_CREDENTIALS]
+            )
+
+            self._conn = pyodbc.connect(
+                dsn,
+                timeout=self.credentials.connect_timeout,
+                attrs_before={
+                    MSSQL_COPT_SS_ACCESS_TOKEN: serialize_azure_token(cfg["PWD"]),
+                },
+            )
+
+            # https://github.com/mkleehammer/pyodbc/wiki/Using-an-Output-Converter-function
+            self._conn.add_output_converter(-155, handle_datetimeoffset)
+            self._conn.autocommit = True
+            return self._conn
+
+    class MsSqlClient(MsSqlJobClient):
+        def __init__(
+            self,
+            schema: Schema,
+            config: MsSqlClientConfiguration,
+            capabilities: DestinationCapabilitiesContext,
+        ) -> None:
+            sql_client = OdbcMsSqlClient(
+                config.normalize_dataset_name(schema),
+                config.normalize_staging_dataset_name(schema),
+                config.credentials,
+                capabilities,
+            )
+            super(MsSqlJobClient, self).__init__(schema, config, sql_client)
+            self.config: MsSqlClientConfiguration = config
+            self.sql_client = sql_client
+            self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {}
+            self.type_mapper = capabilities.get_type_mapper()
+
+    class MsSqlDestImpl(dlt.destinations.mssql):
+        @property
+        def client_class(self):
+            return MsSqlClient
+
+    return MsSqlDestImpl
+
+
 class MsSQLDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-
+        cls = build_mssql_dest()
+        return cls(credentials=uri, **kwargs)


 class DatabricksDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-
+        p = urlparse(uri)
+        q = parse_qs(p.query)
+        access_token = p.password
+        server_hostname = p.hostname
+        http_path = q.get("http_path", [None])[0]
+        catalog = q.get("catalog", [None])[0]
+        schema = q.get("schema", [None])[0]
+
+        creds = {
+            "access_token": access_token,
+            "server_hostname": server_hostname,
+            "http_path": http_path,
+            "catalog": catalog,
+            "schema": schema,
+        }
+
+        return dlt.destinations.databricks(
+            credentials=creds,
+            **kwargs,
+        )


 class SynapseDestination(GenericSqlDestination):
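Note: `handle_datetimeoffset` decodes SQL Server DATETIMEOFFSET values that pyodbc delivers as a packed struct, and `serialize_azure_token` length-prefixes a UTF-16-LE token the way the ODBC driver expects. A small self-check, assuming the two helpers from the hunk above are in scope:

    import struct

    # Pack the sample tuple from the code comment: (2017, 3, 16, 10, 35, 18, 500000000, -6, 0).
    raw = struct.pack("<6hI2h", 2017, 3, 16, 10, 35, 18, 500000000, -6, 0)
    print(handle_datetimeoffset(raw))  # 2017-03-16 10:35:18.500000-06:00

    blob = serialize_azure_token("token")  # placeholder token string
    assert blob[:4] == struct.pack("<i", 2 * len("token"))  # little-endian byte-length prefix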
@@ -184,11 +375,9 @@ class CsvDestination(GenericSqlDestination):
         if output_path.count("/") > 1:
             os.makedirs(os.path.dirname(output_path), exist_ok=True)

-        table = pyarrow.parquet.read_table(first_file_path)
-        rows = table.to_pylist()
         with open(output_path, "w", newline="") as csv_file:
             csv_writer = None
-            for row in
+            for row in load_dlt_file(first_file_path):
                 row = filter_keys(row)
                 if csv_writer is None:
                     csv_writer = csv.DictWriter(csv_file, fieldnames=row.keys())
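Note: the CSV destination now streams rows through `load_dlt_file` instead of materializing the whole load file with pyarrow, while still creating the `DictWriter` lazily from the first row. A reduced, standalone sketch of that pattern (an in-memory iterator stands in for `load_dlt_file`):

    import csv
    import io

    rows = iter([{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])  # stand-in for load_dlt_file(path)
    buf = io.StringIO()
    writer = None
    for row in rows:
        if writer is None:
            writer = csv.DictWriter(buf, fieldnames=row.keys())
            writer.writeheader()
        writer.writerow(row)
    print(buf.getvalue())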
@@ -211,43 +400,64 @@ class AthenaDestination:
         if not bucket.startswith("s3://"):
             bucket = f"s3://{bucket}"

-
-        if query_result_path:
-            if not query_result_path.startswith("s3://"):
-                query_result_path = f"s3://{query_result_path}"
-        else:
-            query_result_path = bucket
+        bucket = bucket.rstrip("/")

-
-        if not
-            raise ValueError("
+        dest_table = kwargs.get("dest_table", None)
+        if not dest_table:
+            raise ValueError("A destination table is required to connect to Athena.")

-
-        if
-            raise ValueError(
+        dest_table_fields = dest_table.split(".")
+        if len(dest_table_fields) != 2:
+            raise ValueError(
+                f"Table name must be in the format <schema>.<table>, given: {dest_table}"
+            )

-
+        query_result_path = f"{bucket}/{dest_table_fields[0]}_staging/metadata"

+        access_key_id = source_params.get("access_key_id", [None])[0]
+        secret_access_key = source_params.get("secret_access_key", [None])[0]
+        session_token = source_params.get("session_token", [None])[0]
+        profile_name = source_params.get("profile", ["default"])[0]
         region_name = source_params.get("region_name", [None])[0]
+
+        if not access_key_id and not secret_access_key:
+            import botocore.session  # type: ignore
+
+            session = botocore.session.Session(profile=profile_name)
+            default = session.get_credentials()
+            if not profile_name:
+                raise ValueError(
+                    "You have to either provide access_key_id and secret_access_key pair or a valid AWS profile name."
+                )
+            access_key_id = default.access_key
+            secret_access_key = default.secret_key
+            session_token = default.token
+            if region_name is None:
+                region_name = session.get_config_variable("region")
+
         if not region_name:
             raise ValueError("The region_name is required to connect to Athena.")

         os.environ["DESTINATION__BUCKET_URL"] = bucket
-
-
-
-
+        if access_key_id and secret_access_key:
+            os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
+            os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
+                secret_access_key
+            )
+        if session_token:
+            os.environ["DESTINATION__CREDENTIALS__AWS_SESSION_TOKEN"] = session_token

-        credentials = AwsCredentials(
-            aws_access_key_id=access_key_id,
-            aws_secret_access_key=secret_access_key,
-            region_name=region_name,
-        )
         return dlt.destinations.athena(
             query_result_bucket=query_result_path,
-            athena_work_group=
-            credentials=
+            athena_work_group=source_params.get("workgroup", [None])[0],  # type: ignore
+            credentials=AwsCredentials(
+                aws_access_key_id=access_key_id,  # type: ignore
+                aws_secret_access_key=secret_access_key,  # type: ignore
+                aws_session_token=session_token,
+                region_name=region_name,
+            ),
             destination_name=bucket,
+            force_iceberg=True,
         )

     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
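Note: the rewritten Athena destination derives the query-result path from the destination schema (`{bucket}/{schema}_staging/metadata`) and resolves credentials from the URI parameters first, falling back to a botocore profile. A hedged sketch of that fallback, limited to documented botocore calls:

    import botocore.session

    profile_name = "default"  # the hunk defaults the "profile" parameter to "default"
    session = botocore.session.Session(profile=profile_name)
    creds = session.get_credentials()  # None if the profile resolves no credentials
    if creds is not None:
        access_key_id, secret_access_key, session_token = creds.access_key, creds.secret_key, creds.token
    region_name = session.get_config_variable("region")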
@@ -297,14 +507,16 @@ class ClickhouseDestination:
             raise ValueError(
                 "The TCP port of the ClickHouse server is required to establish a connection."
            )
-
+
         query_params = parse_qs(parsed_uri.query)
         secure = int(query_params["secure"][0]) if "secure" in query_params else 1

         http_port = (
             int(query_params["http_port"][0])
             if "http_port" in query_params
-            else 8443
+            else 8443
+            if secure == 1
+            else 8123
         )

         if secure not in (0, 1):
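Note: the ClickHouse HTTP port default now tracks the `secure` flag: 8443 when TLS is on (the default), 8123 when `secure=0`, unless `http_port` is set explicitly. For example:

    from urllib.parse import parse_qs

    query_params = parse_qs("secure=0")  # e.g. clickhouse://...?secure=0
    secure = int(query_params["secure"][0]) if "secure" in query_params else 1
    http_port = (
        int(query_params["http_port"][0])
        if "http_port" in query_params
        else 8443 if secure == 1 else 8123
    )
    print(http_port)  # 8123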
@@ -335,3 +547,278 @@ class ClickhouseDestination:

     def post_load(self):
         pass
+
+
+class BlobFSClient(dlt.destinations.impl.filesystem.filesystem.FilesystemClient):
+    @property
+    def dataset_path(self):
+        # override to remove dataset path
+        return self.bucket_path
+
+
+class BlobFS(dlt.destinations.filesystem):
+    @property
+    def client_class(self):
+        return BlobFSClient
+
+
+class SqliteDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        return {
+            # https://dlthub.com/docs/dlt-ecosystem/destinations/sqlalchemy#dataset-files
+            "dataset_name": "main",
+            "table_name": table,
+        }
+
+
+class MySqlDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        parsed = urlparse(uri)
+        database = parsed.path.lstrip("/")
+        if not database:
+            raise ValueError("You need to specify a database")
+        return {
+            "dataset_name": database,
+            "table_name": table,
+        }
+
+
+class TrinoTypeMapper:
+    """Custom type mapper for Trino to handle unsupported types."""
+
+    @staticmethod
+    def create_type_mapper():
+        """Create a custom type mapper for Trino."""
+        from dlt.destinations.impl.sqlalchemy.type_mapper import SqlalchemyTypeMapper
+        from sqlalchemy import BigInteger, Text
+        from sqlalchemy.sql import sqltypes
+
+        class CustomTrinoTypeMapper(SqlalchemyTypeMapper):
+            """Custom type mapper that converts unsupported Trino types."""
+
+            def to_destination_type(self, column, table=None):
+                # Handle special cases before calling parent
+                data_type = column.get("data_type", "")
+
+                # Convert JSON to VARCHAR for Trino's Iceberg catalog
+                if data_type == "json":
+                    # Use TEXT (unlimited VARCHAR) for JSON data
+                    return Text()
+
+                # Convert BINARY to VARCHAR
+                if data_type == "binary":
+                    return Text()
+
+                # Handle integer types - always use BIGINT for Trino
+                # Note: dlt uses "bigint" internally, not "integer"
+                if data_type in ["bigint", "integer", "int"]:
+                    return BigInteger()
+
+                # For other types, try parent mapper
+                try:
+                    type_ = super().to_destination_type(column, table)
+                except Exception:
+                    # If parent can't handle it, default to TEXT
+                    return Text()
+
+                # Convert any INTEGER type to BIGINT
+                if isinstance(type_, sqltypes.Integer) and not isinstance(
+                    type_, sqltypes.BigInteger
+                ):
+                    return BigInteger()
+
+                # Ensure VARCHAR types don't have constraints that Trino doesn't support
+                if isinstance(type_, sqltypes.String):
+                    # Return TEXT for unlimited string
+                    return Text()
+
+                return type_
+
+        return CustomTrinoTypeMapper
+
+
+class TrinoDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        # Import required modules
+        from dlt.destinations.impl.sqlalchemy.factory import (
+            sqlalchemy as sqlalchemy_factory,
+        )
+
+        # Create the destination with custom type mapper
+        # We need to use the factory to properly configure the type mapper
+        dest = sqlalchemy_factory(
+            credentials=uri, type_mapper=TrinoTypeMapper.create_type_mapper(), **kwargs
+        )
+
+        return dest
+
+
+class BlobStorageDestination(abc.ABC):
+    @abc.abstractmethod
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Build credentials for the blob storage destination."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def protocol(self) -> str:
+        """The protocol used for the blob storage destination."""
+        pass
+
+    def dlt_dest(self, uri: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        creds = self.credentials(params)
+
+        dest_table = kwargs["dest_table"]
+
+        # only validate if dest_table is not a full URI
+        if not parsed_uri.netloc:
+            dest_table = self.validate_table(dest_table)
+
+        table_parts = dest_table.split("/")
+
+        if parsed_uri.path.strip("/"):
+            path_parts = parsed_uri.path.strip("/ ").split("/")
+            table_parts = path_parts + table_parts
+
+        if parsed_uri.netloc:
+            table_parts.insert(0, parsed_uri.netloc.strip())
+
+        base_path = "/".join(table_parts[:-1])
+
+        opts = {
+            "bucket_url": f"{self.protocol}://{base_path}",
+            "credentials": creds,
+            # supresses dlt warnings about dataset name normalization.
+            # we don't use dataset names in S3 so it's fine to disable this.
+            "enable_dataset_name_normalization": False,
+        }
+        layout = params.get("layout", [None])[0]
+        if layout is not None:
+            opts["layout"] = layout
+
+        return BlobFS(**opts)  # type: ignore
+
+    def validate_table(self, table: str):
+        table = table.strip("/ ")
+        if len(table.split("/")) < 2:
+            raise ValueError("Table name must be in the format {bucket-name}/{path}")
+        return table
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        table_parts = table.split("/")
+        return {
+            "table_name": table_parts[-1].strip(),
+        }
+
+    def post_load(self) -> None:
+        pass
+
+
+class S3Destination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "s3"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        access_key_id = params.get("access_key_id", [None])[0]
+        if access_key_id is None:
+            raise MissingValueError("access_key_id", "S3")
+
+        secret_access_key = params.get("secret_access_key", [None])[0]
+        if secret_access_key is None:
+            raise MissingValueError("secret_access_key", "S3")
+
+        endpoint_url = params.get("endpoint_url", [None])[0]
+        if endpoint_url is not None:
+            parsed_endpoint = urlparse(endpoint_url)
+            if not parsed_endpoint.scheme or not parsed_endpoint.netloc:
+                raise ValueError("Invalid endpoint_url. Must be a valid URL.")
+
+        return AwsCredentials(
+            aws_access_key_id=access_key_id,
+            aws_secret_access_key=secret_access_key,
+            endpoint_url=endpoint_url,
+        )
+
+
+class GCSDestination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "gs"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Builds GCS credentials from the provided parameters."""
+        credentials_path = params.get("credentials_path")
+        credentials_base64 = params.get("credentials_base64")
+        credentials_available = any(
+            map(
+                lambda x: x is not None,
+                [credentials_path, credentials_base64],
+            )
+        )
+        if credentials_available is False:
+            raise MissingValueError("credentials_path or credentials_base64", "GCS")
+
+        credentials = None
+        if credentials_path:
+            with open(credentials_path[0], "r") as f:
+                credentials = json.load(f)
+        else:
+            credentials = json.loads(base64.b64decode(credentials_base64[0]).decode())  # type: ignore
+
+        return credentials
+
+
+class ElasticsearchDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import urlparse
+
+        parsed_uri = urlparse(uri)
+
+        # Extract connection details from URI
+        scheme = parsed_uri.scheme or "http"
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 9200
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        # Build connection string
+        if username and password:
+            connection_string = f"{scheme}://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"{scheme}://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+        return elasticsearch_insert(connection_string=connection_string)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
+
+
+class MongoDBDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        return mongodb_insert(uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
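Note: for the new blob-storage destinations (`S3Destination`, `GCSDestination`), the destination table doubles as the object path: everything before the last `/` becomes the filesystem bucket URL and the final segment becomes the dlt table name. A small sketch of that split, with a made-up bucket and path:

    dest_table = "my-bucket/exports/events"  # {bucket-name}/{path}, as validate_table expects
    table_parts = dest_table.split("/")
    bucket_url = "s3://" + "/".join(table_parts[:-1])
    table_name = table_parts[-1].strip()
    print(bucket_url, table_name)  # s3://my-bucket/exports events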