omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
@@ -0,0 +1,982 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import base64
|
|
3
|
+
import csv
|
|
4
|
+
import datetime
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import shutil
|
|
8
|
+
import struct
|
|
9
|
+
import tempfile
|
|
10
|
+
from urllib.parse import parse_qs, quote, urlparse
|
|
11
|
+
|
|
12
|
+
import dlt
|
|
13
|
+
import dlt.destinations.impl.filesystem.filesystem
|
|
14
|
+
import requests
|
|
15
|
+
from dlt.common.configuration.specs import AwsCredentials
|
|
16
|
+
from dlt.common.destination.capabilities import DestinationCapabilitiesContext
|
|
17
|
+
from dlt.common.schema import Schema
|
|
18
|
+
from dlt.common.storages.configuration import FileSystemCredentials
|
|
19
|
+
from dlt.destinations.impl.clickhouse.configuration import (
|
|
20
|
+
ClickHouseCredentials,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from omniload.src.elasticsearch.helpers import elasticsearch_insert
|
|
24
|
+
from omniload.src.errors import MissingValueError
|
|
25
|
+
from omniload.src.loader import load_dlt_file
|
|
26
|
+
from omniload.src.mongodb import mongodb_insert
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class GenericSqlDestination:
    """Shared behaviour for destinations addressed as ``<schema>.<table>``."""

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Translate a qualified table name into dataset/table run parameters.

        Args:
            uri: Destination URI; not inspected by this implementation.
            table: Table name in the form ``<schema>.<table>``.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            A dict with the keys ``dataset_name`` and ``table_name``.

        Raises:
            ValueError: If ``table`` does not consist of exactly two
                dot-separated parts.
        """
        parts = table.split(".")
        if len(parts) != 2:
            raise ValueError("Table name must be in the format <schema>.<table>")

        dataset_name, table_name = parts
        return {"dataset_name": dataset_name, "table_name": table_name}

    def post_load(self):
        """No-op hook; subclasses override it when they need follow-up work."""
        return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BigQueryDestination:
    """Builds the dlt BigQuery destination and its run parameters from a URI."""

    # Maps the env-var suffix read by the staging filesystem destination to the
    # key holding the value in the service-account credentials dict.
    _STAGING_CREDENTIAL_FIELDS = (
        ("PROJECT_ID", "project_id"),
        ("PRIVATE_KEY", "private_key"),
        ("CLIENT_EMAIL", "client_email"),
    )

    def dlt_dest(self, uri: str, **kwargs):
        """Create the ``dlt.destinations.bigquery`` destination for ``uri``.

        Recognised query parameters are ``credentials_path`` (path to a JSON
        credentials file), ``credentials_base64`` (base64-encoded JSON
        credentials) and ``location``. The URI hostname, when present, is used
        as the project id.

        When ``staging_bucket`` is passed in ``kwargs`` the staging filesystem
        destination is configured through ``DESTINATION__FILESYSTEM__*``
        environment variables.

        Args:
            uri: BigQuery destination URI.
            **kwargs: Extra options; forwarded unchanged to the dlt factory.

        Returns:
            The configured dlt BigQuery destination.

        Raises:
            ValueError: If more than one ``location`` is given or the staging
                bucket does not start with ``gs://``.
        """
        source_fields = urlparse(uri)
        source_params = parse_qs(source_fields.query)

        cred_path = source_params.get("credentials_path")
        credentials_base64 = source_params.get("credentials_base64")

        location = None
        loc_params = source_params.get("location")
        if loc_params:
            if len(loc_params) > 1:
                raise ValueError("Only one location is allowed")
            location = loc_params[0]

        # Following dlt's pattern (like google_analytics), we let dlt's credential resolution
        # handle defaults automatically. When credentials_path or credentials_base64 are not
        # provided, dlt will use Application Default Credentials via GcpServiceAccountCredentials.
        credentials = None
        if cred_path:
            with open(cred_path[0], "r", encoding="utf-8") as f:
                credentials = json.load(f)
        elif credentials_base64:
            credentials = json.loads(
                base64.b64decode(credentials_base64[0]).decode("utf-8")
            )

        staging_bucket = kwargs.get("staging_bucket", None)
        if staging_bucket:
            if not staging_bucket.startswith("gs://"):
                raise ValueError("Staging bucket must start with gs://")

            os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = staging_bucket
            if credentials:
                for env_suffix, cred_key in self._STAGING_CREDENTIAL_FIELDS:
                    value = credentials.get(cred_key)
                    # os.environ only accepts strings; assigning None raises
                    # TypeError, so fields absent from the credentials are skipped.
                    if value is not None:
                        os.environ[
                            f"DESTINATION__FILESYSTEM__CREDENTIALS__{env_suffix}"
                        ] = value

        project_id = source_fields.hostname or None

        return dlt.destinations.bigquery(
            credentials=credentials,  # type: ignore
            location=location,  # type: ignore
            project_id=project_id,
            **kwargs,
        )

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Derive dataset/table names (and staging mode) for a load.

        Args:
            uri: Destination URI; not inspected here.
            table: ``<dataset>.<table>`` or ``<project>.<dataset>.<table>``.
            **kwargs: Only ``staging_bucket`` is read.

        Returns:
            A dict with ``dataset_name`` and ``table_name``; ``staging`` is set
            to ``"filesystem"`` when a staging bucket was supplied.

        Raises:
            ValueError: If ``table`` has neither two nor three parts.
        """
        table_fields = table.split(".")
        if len(table_fields) not in (2, 3):
            raise ValueError(
                "Table name must be in the format <dataset>.<table> or <project>.<dataset>.<table>"
            )

        # Negative indexing makes the optional leading project part irrelevant.
        res = {
            "dataset_name": table_fields[-2],
            "table_name": table_fields[-1],
        }

        if kwargs.get("staging_bucket", None):
            res["staging"] = "filesystem"

        return res

    def post_load(self):
        """No post-load work is required for BigQuery."""
        pass
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class CrateDBDestination(GenericSqlDestination):
    """Destination for CrateDB, created through the ``dlt_cratedb`` factory."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the CrateDB dlt destination for ``uri``.

        Every ``cratedb://`` occurrence in the URI is rewritten to
        ``postgres://`` before it is passed on as the credentials.
        """
        # Function-scope import: dlt_cratedb is only loaded when this
        # destination is actually used.
        from dlt_cratedb.impl.cratedb.factory import cratedb as cratedb_factory

        postgres_uri = uri.replace("cratedb://", "postgres://")
        return cratedb_factory(credentials=postgres_uri, **kwargs)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class PostgresDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.postgres``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt Postgres destination, using ``uri`` as credentials."""
        make_destination = dlt.destinations.postgres
        return make_destination(credentials=uri, **kwargs)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class SnowflakeDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.snowflake``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt Snowflake destination, using ``uri`` as credentials."""
        make_destination = dlt.destinations.snowflake
        return make_destination(credentials=uri, **kwargs)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class RedshiftDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.redshift``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt Redshift destination for ``uri``.

        Every ``redshift://`` occurrence in the URI is rewritten to
        ``postgresql://`` before it is passed on as the credentials.
        """
        postgres_style_uri = uri.replace("redshift://", "postgresql://")
        return dlt.destinations.redshift(credentials=postgres_style_uri, **kwargs)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class DuckDBDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.duckdb``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt DuckDB destination for ``uri``.

        ``dest_table`` and ``staging_bucket`` are removed from ``kwargs``
        before the remaining options are forwarded to the dlt factory.
        """
        for dropped_option in ("dest_table", "staging_bucket"):
            kwargs.pop(dropped_option, None)
        return dlt.destinations.duckdb(uri, **kwargs)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class MotherduckDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.motherduck``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt MotherDuck destination for ``uri``.

        The ``token`` query parameter becomes the credentials password. The
        database name is taken from the URI path, falling back to the network
        location when the path is empty; it is omitted when both are empty.
        """
        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials

        uri_parts = urlparse(uri)
        token_values = parse_qs(uri_parts.query).get("token", [None])
        credentials = {"password": token_values[0]}

        database = uri_parts.path.lstrip("/") or uri_parts.netloc
        if database:
            credentials["database"] = database

        return dlt.destinations.motherduck(
            MotherDuckCredentials(credentials), **kwargs
        )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
    """Decode a packed datetime-with-offset value into an aware ``datetime``.

    The buffer is read as little-endian ``<6hI2h``: six 16-bit date/time
    fields, a 32-bit fractional-second field and two 16-bit offset fields.

    Args:
        dto_value: The 20-byte packed value.

    Returns:
        A timezone-aware ``datetime`` carrying the encoded UTC offset.
    """
    # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
    (
        year,
        month,
        day,
        hour,
        minute,
        second,
        fraction,
        offset_hours,
        offset_minutes,
    ) = struct.unpack("<6hI2h", dto_value)
    # e.g., (2017, 3, 16, 10, 35, 18, 500000000, -6, 0)

    utc_offset = datetime.timedelta(hours=offset_hours, minutes=offset_minutes)
    # The fraction is divided by 1000 to fit datetime's microsecond field.
    return datetime.datetime(
        year,
        month,
        day,
        hour,
        minute,
        second,
        fraction // 1000,
        datetime.timezone(utc_offset),
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# MSSQL_COPT_SS_ACCESS_TOKEN is a connection attribute used to pass
# an Azure Active Directory access token to the SQL Server ODBC driver.
# It is the key of the ``attrs_before`` mapping handed to ``pyodbc.connect``
# in ``build_mssql_dest``; the value is produced by ``serialize_azure_token``.
MSSQL_COPT_SS_ACCESS_TOKEN = 1256
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def serialize_azure_token(token):
    """Pack an access token as a length-prefixed UTF-16-LE byte string.

    Args:
        token: The access token as text.

    Returns:
        A little-endian 32-bit byte count followed by the UTF-16-LE bytes.
    """
    # https://github.com/mkleehammer/pyodbc/issues/228#issuecomment-494773723
    payload = token.encode("utf_16_le")
    length_prefix = struct.pack("<i", len(payload))
    return length_prefix + payload
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def build_mssql_dest():
    """Build a ``dlt.destinations.mssql`` subclass that supports token auth.

    The dlt MSSQL classes are imported inside the function and three local
    classes are defined on top of them:

    * ``OdbcMsSqlClient`` opens the connection itself when the DSN requests
      ``ActiveDirectoryAccessToken`` authentication.
    * ``MsSqlClient`` is a job client wired to ``OdbcMsSqlClient``.
    * ``MsSqlDestImpl`` is the destination whose ``client_class`` is
      ``MsSqlClient``.

    Returns:
        The ``MsSqlDestImpl`` class (not an instance).
    """
    # https://github.com/bruin-data/ingestr/issues/293

    from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration
    from dlt.destinations.impl.mssql.mssql import (
        HINT_TO_MSSQL_ATTR,
        MsSqlJobClient,
    )
    from dlt.destinations.impl.mssql.sql_client import (
        PyOdbcMsSqlClient,
    )

    class OdbcMsSqlClient(PyOdbcMsSqlClient):
        # DSN keys left out of the connection string on the access-token path;
        # the token itself (taken from PWD) is passed via attrs_before instead.
        SKIP_CREDENTIALS = {"PWD", "AUTHENTICATION", "UID"}

        def open_connection(self):
            """Open the ODBC connection, using an access token when requested."""
            cfg = self.credentials.get_odbc_dsn_dict()
            # Any authentication mode other than ActiveDirectoryAccessToken
            # (compared case-insensitively) is handled by the parent class.
            if (
                cfg.get("AUTHENTICATION", "").strip().lower()
                != "activedirectoryaccesstoken"
            ):
                return super().open_connection()

            import pyodbc  # type: ignore

            dsn = ";".join(
                [f"{k}={v}" for k, v in cfg.items() if k not in self.SKIP_CREDENTIALS]
            )

            # The PWD entry is treated as the access token and is handed to the
            # driver as a pre-connect attribute in its serialized form.
            self._conn = pyodbc.connect(
                dsn,
                timeout=self.credentials.connect_timeout,
                attrs_before={
                    MSSQL_COPT_SS_ACCESS_TOKEN: serialize_azure_token(cfg["PWD"]),
                },
            )

            # https://github.com/mkleehammer/pyodbc/wiki/Using-an-Output-Converter-function
            # NOTE(review): -155 is presumably the ODBC type code for
            # DATETIMEOFFSET columns -- confirm against the linked wiki page.
            self._conn.add_output_converter(-155, handle_datetimeoffset)
            self._conn.autocommit = True
            return self._conn

    class MsSqlClient(MsSqlJobClient):
        def __init__(
            self,
            schema: Schema,
            config: MsSqlClientConfiguration,
            capabilities: DestinationCapabilitiesContext,
        ) -> None:
            """Set up the job client around an ``OdbcMsSqlClient`` instance."""
            sql_client = OdbcMsSqlClient(
                config.normalize_dataset_name(schema),
                config.normalize_staging_dataset_name(schema),
                config.credentials,
                capabilities,
            )
            # super(MsSqlJobClient, self) starts the MRO lookup *after*
            # MsSqlJobClient, so MsSqlJobClient.__init__ itself is skipped and
            # the attributes below are assigned here instead.
            super(MsSqlJobClient, self).__init__(schema, config, sql_client)
            self.config: MsSqlClientConfiguration = config
            self.sql_client = sql_client
            self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {}
            self.type_mapper = capabilities.get_type_mapper()

    class MsSqlDestImpl(dlt.destinations.mssql):
        @property
        def client_class(self):
            """Job client class used by this destination."""
            return MsSqlClient

    return MsSqlDestImpl
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class MsSQLDestination(GenericSqlDestination):
    """Destination for SQL Server using the class from ``build_mssql_dest``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Instantiate the customised MSSQL destination with ``uri`` as credentials."""
        destination_cls = build_mssql_dest()
        return destination_cls(credentials=uri, **kwargs)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def get_databricks_oauth_token(
    server_hostname: str, client_id: str, client_secret: str
) -> str:
    """
    Obtain a Databricks access token using OAuth M2M client credentials.

    Performs the OAuth 2.0 client-credentials grant against the workspace's
    ``/oidc/v1/token`` endpoint, authenticating with HTTP basic auth.

    Args:
        server_hostname: The Databricks workspace hostname (e.g., dbc-xxx.cloud.databricks.com)
        client_id: The service principal's client ID (application ID)
        client_secret: The OAuth secret for the service principal

    Returns:
        The access token string

    Raises:
        ValueError: If any input is empty, the request cannot be sent, the
            endpoint answers with an HTTP error, the body is not JSON, or the
            JSON has no ``access_token`` entry
    """
    # Validate in declaration order so the first missing argument is reported.
    required_inputs = (
        ("server_hostname", server_hostname),
        ("client_id", client_id),
        ("client_secret", client_secret),
    )
    for label, value in required_inputs:
        if not value:
            raise ValueError(f"{label} is required for OAuth token exchange")

    token_url = f"https://{server_hostname}/oidc/v1/token"
    form_fields = {
        "grant_type": "client_credentials",
        "scope": "all-apis",
    }

    try:
        response = requests.post(
            token_url,
            data=form_fields,
            auth=(client_id, client_secret),
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            timeout=30,
        )
    except requests.exceptions.RequestException as e:
        raise ValueError(
            f"Failed to connect to Databricks OAuth endpoint at {token_url}: {e}"
        ) from e

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Only the status code is reported; the response body is left out.
        raise ValueError(
            f"Failed to obtain Databricks OAuth token: HTTP {response.status_code}"
        ) from e

    try:
        token_data = response.json()
    except (json.JSONDecodeError, ValueError) as e:
        raise ValueError("Invalid JSON response from Databricks OAuth endpoint") from e

    if "access_token" not in token_data:
        raise ValueError("Databricks OAuth response missing 'access_token' field")

    return token_data["access_token"]
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
class DatabricksDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.databricks``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt Databricks destination for ``uri``.

        The hostname comes from the URI; ``http_path``, ``catalog``,
        ``client_id`` and ``client_secret`` are read from the query string.
        When both ``client_id`` and ``client_secret`` are present they are
        exchanged for an access token; otherwise the URI password is used as
        the access token.

        Raises:
            ValueError: If the URI has no hostname, or carries neither a
                password nor a client id/secret pair.
        """
        parsed = urlparse(uri)
        params = parse_qs(parsed.query)

        def first_value(name):
            # parse_qs yields lists; take the first value or None when absent.
            return params.get(name, [None])[0]

        server_hostname = parsed.hostname
        if not server_hostname:
            raise ValueError("Databricks URI must include a server hostname")

        # Check for OAuth M2M credentials (client_id and client_secret)
        client_id = first_value("client_id")
        client_secret = first_value("client_secret")

        access_token: str
        if client_id and client_secret:
            # OAuth M2M authentication: exchange client credentials for access token
            access_token = get_databricks_oauth_token(
                server_hostname, client_id, client_secret
            )
        elif parsed.password:
            # Traditional token-based authentication
            access_token = parsed.password
        else:
            raise ValueError(
                "Databricks URI must include an access token or client_id/client_secret"
            )

        return dlt.destinations.databricks(
            credentials={
                "access_token": access_token,
                "server_hostname": server_hostname,
                "http_path": first_value("http_path"),
                "catalog": first_value("catalog"),
            },
            **kwargs,
        )

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Resolve the dataset and table names for a load.

        A ``<schema>.<table>`` name wins over the ``schema`` query parameter
        of the URI; a bare table name requires that parameter.

        Raises:
            ValueError: If no schema can be determined for ``table``.
        """
        schema_from_uri = parse_qs(urlparse(uri).query).get("schema", [None])[0]
        name_parts = table.split(".")

        # If table is in schema.table format, use that (overrides URI schema)
        if len(name_parts) == 2:
            dataset_name, table_name = name_parts
            return {"dataset_name": dataset_name, "table_name": table_name}

        # If table is just a table name, use schema from URI
        if len(name_parts) == 1 and schema_from_uri:
            return {"dataset_name": schema_from_uri, "table_name": name_parts[0]}

        raise ValueError(
            "Table name must be in the format <schema>.<table>, or specify schema in the URI"
        )
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
class SynapseDestination(GenericSqlDestination):
    """Destination backed by ``dlt.destinations.synapse``."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the dlt Synapse destination, using ``uri`` as credentials."""
        make_destination = dlt.destinations.synapse
        return make_destination(credentials=uri, **kwargs)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class CustomCsvDestination(dlt.destinations.filesystem):
    # Subclass of dlt's filesystem destination with no overrides. It is
    # instantiated by CsvDestination.dlt_dest with a local file:// bucket URL.
    pass
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
class CsvDestination(GenericSqlDestination):
    """Destination that ends up writing the loaded table to one CSV file.

    ``dlt_dest`` points a filesystem destination at a temporary directory;
    ``post_load`` then reads the first file produced for the table and
    rewrites its rows as CSV at the path taken from the URI.
    """

    temp_path: str  # temporary directory used as the filesystem bucket
    actual_path: str  # URI passed to dlt_dest, with csv:// mapped to file://
    uri: str  # URI passed to dlt_run_params; its path part is the CSV output
    dataset_name: str
    table_name: str

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Split ``table`` into dataset/table names and remember them.

        The names and ``uri`` are stored on the instance because ``post_load``
        needs them to locate the exported file and the output path.

        Raises:
            ValueError: If ``table`` is not of the form ``<schema>.<table>``.
        """
        table_fields = table.split(".")
        if len(table_fields) != 2:
            raise ValueError("Table name must be in the format <schema>.<table>")

        res = {
            "dataset_name": table_fields[-2],
            "table_name": table_fields[-1],
        }

        self.dataset_name = res["dataset_name"]
        self.table_name = res["table_name"]
        self.uri = uri

        return res

    def dlt_dest(self, uri: str, **kwargs):
        """Return a filesystem destination writing into a fresh temp directory."""
        if uri.startswith("csv://"):
            uri = uri.replace("csv://", "file://")

        temp_path = tempfile.mkdtemp()
        self.actual_path = uri
        self.temp_path = temp_path
        return CustomCsvDestination(bucket_url=f"file://{temp_path}", **kwargs)

    # I dislike this implementation quite a bit since it ties the implementation to some internal details on how dlt works
    # I would prefer a custom destination that allows me to do this easily but dlt seems to have a lot of internal details that are not documented
    # I tried to make it work with a nicer destination implementation but I couldn't, so I decided to go with this hack to experiment
    # if anyone has a better idea on how to do this, I am open to contributions or suggestions
    def post_load(self):
        """Convert the exported table file into the requested CSV file.

        Rows are streamed from the first file found under
        ``<temp_path>/<dataset_name>/<table_name>``; keys starting with
        ``_dlt_`` are dropped. When a later row introduces new columns the CSV
        written so far is rewritten with the widened header. The temporary
        directory is removed afterwards, whether or not the export succeeded.
        """

        def find_first_file(path):
            # First regular file in directory-listing order, or None.
            for entry in os.listdir(path):
                full_path = os.path.join(path, entry)
                if os.path.isfile(full_path):
                    return full_path

            return None

        def filter_keys(dictionary):
            # Drop dlt's bookkeeping columns.
            return {
                key: value
                for key, value in dictionary.items()
                if not key.startswith("_dlt_")
            }

        def _rewrite_csv_with_fieldnames(path, fieldnames):
            # Re-emit the CSV at ``path`` with a new header, padding columns
            # that are missing from the existing rows with "". The new content
            # is written to a sibling temp file and swapped in with os.replace.
            tmp_fd, tmp_path = tempfile.mkstemp(
                suffix=".csv", dir=os.path.dirname(path) or "."
            )
            try:
                os.close(tmp_fd)
                with (
                    open(path, "r", newline="") as old,
                    open(tmp_path, "w", newline="") as new,
                ):
                    reader = csv.DictReader(old)
                    writer = csv.DictWriter(new, fieldnames=fieldnames, restval="")
                    writer.writeheader()
                    for r in reader:
                        writer.writerow(r)
                os.replace(tmp_path, path)
            except BaseException:
                os.unlink(tmp_path)
                raise

        # A dict is used as an insertion-ordered set of column names.
        fieldnames = {}
        csv_writer = None
        csv_file = None

        try:
            # The lookup happens inside the try block so the temporary
            # directory is removed even when the exported file is missing.
            first_file_path = find_first_file(
                f"{self.temp_path}/{self.dataset_name}/{self.table_name}"
            )

            output_path = self.uri.split("://")[1]
            # Create the parent directory whenever the path has one, so
            # single-level paths such as "out/file.csv" also work.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            for row in load_dlt_file(first_file_path):
                row = filter_keys(row)
                new_fields = False
                for key in row:
                    if key not in fieldnames:
                        fieldnames[key] = None
                        new_fields = True

                if csv_writer is None:
                    csv_file = open(output_path, "w", newline="")
                    csv_writer = csv.DictWriter(
                        csv_file, fieldnames=fieldnames, restval=""
                    )
                    csv_writer.writeheader()
                elif new_fields:
                    # Close first so the rewrite sees all rows written so far,
                    # then continue appending after the widened header.
                    csv_file.close()
                    _rewrite_csv_with_fieldnames(output_path, list(fieldnames))
                    csv_file = open(output_path, "a", newline="")
                    csv_writer = csv.DictWriter(
                        csv_file, fieldnames=fieldnames, restval=""
                    )

                csv_writer.writerow(row)
        finally:
            if csv_file:
                csv_file.close()
            shutil.rmtree(self.temp_path)
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
class AthenaDestination:
    """Destination adapter that loads data into AWS Athena as Iceberg tables."""

    def dlt_dest(self, uri: str, **kwargs):
        """Build the dlt Athena destination described by ``uri``.

        Settings are read from the query string of ``uri``: ``bucket``
        (required), ``access_key_id``, ``secret_access_key``,
        ``session_token``, ``profile``, ``region_name`` and ``workgroup``.
        When neither ``access_key_id`` nor ``secret_access_key`` is supplied,
        credentials (and the region, if it was not given) are resolved through
        botocore using the requested profile (``default`` when unspecified).

        Args:
            uri: Athena connection URI carrying the settings above.
            **kwargs: Must contain ``dest_table`` in ``<schema>.<table>`` form.

        Returns:
            The configured ``dlt.destinations.athena`` destination.

        Raises:
            ValueError: If the bucket, destination table, credentials or
                region cannot be determined, or the table name is malformed.

        Side effects:
            Exports the bucket URL and, when available, the resolved
            credentials through ``DESTINATION__*`` environment variables.
        """
        # Percent-encode everything but the URL delimiters so parse_qs()
        # decodes the values back verbatim (a raw "+" would become a space).
        encoded_uri = quote(uri, safe=":/?&=")
        source_fields = urlparse(encoded_uri)
        source_params = parse_qs(source_fields.query)

        bucket = source_params.get("bucket", [None])[0]
        if not bucket:
            raise ValueError("A bucket is required to connect to Athena.")

        if not bucket.startswith("s3://"):
            bucket = f"s3://{bucket}"

        bucket = bucket.rstrip("/")

        dest_table = kwargs.get("dest_table", None)
        if not dest_table:
            raise ValueError("A destination table is required to connect to Athena.")

        dest_table_fields = dest_table.split(".")
        if len(dest_table_fields) != 2:
            raise ValueError(
                f"Table name must be in the format <schema>.<table>, given: {dest_table}"
            )

        # Query results are stored under "<schema>_staging/metadata" in the bucket.
        query_result_path = f"{bucket}/{dest_table_fields[0]}_staging/metadata"

        access_key_id = source_params.get("access_key_id", [None])[0]
        secret_access_key = source_params.get("secret_access_key", [None])[0]
        session_token = source_params.get("session_token", [None])[0]
        profile_name = source_params.get("profile", ["default"])[0]
        region_name = source_params.get("region_name", [None])[0]

        if not access_key_id and not secret_access_key:
            import botocore.session  # type: ignore

            missing_credentials = (
                "You have to either provide access_key_id and secret_access_key pair or a valid AWS profile name."
            )

            # Validate the profile before it is used to build the session.
            if not profile_name:
                raise ValueError(missing_credentials)

            session = botocore.session.Session(profile=profile_name)
            default = session.get_credentials()
            # get_credentials() returns None when nothing could be resolved;
            # fail with a clear message instead of an AttributeError below.
            if default is None:
                raise ValueError(missing_credentials)

            access_key_id = default.access_key
            secret_access_key = default.secret_key
            session_token = default.token
            if region_name is None:
                region_name = session.get_config_variable("region")

        if not region_name:
            raise ValueError("The region_name is required to connect to Athena.")

        os.environ["DESTINATION__BUCKET_URL"] = bucket
        if access_key_id and secret_access_key:
            os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
            os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
                secret_access_key
            )
        if session_token:
            os.environ["DESTINATION__CREDENTIALS__AWS_SESSION_TOKEN"] = session_token

        return dlt.destinations.athena(
            query_result_bucket=query_result_path,
            athena_work_group=source_params.get("workgroup", [None])[0],  # type: ignore
            credentials=AwsCredentials(
                aws_access_key_id=access_key_id,  # type: ignore
                aws_secret_access_key=secret_access_key,  # type: ignore
                aws_session_token=session_token,
                region_name=region_name,
            ),
            destination_name=bucket,
            force_iceberg=True,
        )

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Return the dlt run parameters for ``table``.

        Args:
            uri: Destination URI (not used here).
            table: Target table in ``<schema>.<table>`` form.

        Returns:
            A dict with ``table_format`` set to ``"iceberg"``, the schema as
            ``dataset_name`` and the table as ``table_name``.

        Raises:
            ValueError: If ``table`` does not have exactly two dot-separated
                parts.
        """
        table_fields = table.split(".")
        if len(table_fields) != 2:
            raise ValueError("Table name must be in the format <schema>.<table>")
        return {
            "table_format": "iceberg",
            "dataset_name": table_fields[-2],
            "table_name": table_fields[-1],
        }

    def post_load(self):
        """No post-load work is performed for this destination."""
        pass
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
class ClickhouseDestination:
|
|
618
|
+
def dlt_dest(self, uri: str, **kwargs):
|
|
619
|
+
parsed_uri = urlparse(uri)
|
|
620
|
+
|
|
621
|
+
if "dest_table" in kwargs:
|
|
622
|
+
table = kwargs["dest_table"]
|
|
623
|
+
database = table.split(".")[0]
|
|
624
|
+
else:
|
|
625
|
+
database = parsed_uri.path.lstrip("/")
|
|
626
|
+
|
|
627
|
+
username = parsed_uri.username
|
|
628
|
+
if not username:
|
|
629
|
+
raise ValueError(
|
|
630
|
+
"A username is required to connect to the ClickHouse database."
|
|
631
|
+
)
|
|
632
|
+
|
|
633
|
+
password = parsed_uri.password
|
|
634
|
+
if not password:
|
|
635
|
+
raise ValueError(
|
|
636
|
+
"A password is required to authenticate with the ClickHouse database."
|
|
637
|
+
)
|
|
638
|
+
|
|
639
|
+
host = parsed_uri.hostname
|
|
640
|
+
if not host:
|
|
641
|
+
raise ValueError(
|
|
642
|
+
"The hostname or IP address of the ClickHouse server is required to establish a connection."
|
|
643
|
+
)
|
|
644
|
+
|
|
645
|
+
port = parsed_uri.port
|
|
646
|
+
if not port:
|
|
647
|
+
raise ValueError(
|
|
648
|
+
"The TCP port of the ClickHouse server is required to establish a connection."
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
query_params = parse_qs(parsed_uri.query)
|
|
652
|
+
secure = int(query_params["secure"][0]) if "secure" in query_params else 1
|
|
653
|
+
|
|
654
|
+
default_http_port = 8443 if secure == 1 else 8123
|
|
655
|
+
http_port = (
|
|
656
|
+
int(query_params["http_port"][0])
|
|
657
|
+
if "http_port" in query_params
|
|
658
|
+
else default_http_port
|
|
659
|
+
)
|
|
660
|
+
|
|
661
|
+
if secure not in (0, 1):
|
|
662
|
+
raise ValueError(
|
|
663
|
+
"Invalid value for secure. Set to `1` for a secure HTTPS connection or `0` for a non-secure HTTP connection."
|
|
664
|
+
)
|
|
665
|
+
|
|
666
|
+
credentials = ClickHouseCredentials(
|
|
667
|
+
{
|
|
668
|
+
"host": host,
|
|
669
|
+
"port": port,
|
|
670
|
+
"username": username,
|
|
671
|
+
"password": password,
|
|
672
|
+
"database": database,
|
|
673
|
+
"http_port": http_port,
|
|
674
|
+
"secure": secure,
|
|
675
|
+
}
|
|
676
|
+
)
|
|
677
|
+
return dlt.destinations.clickhouse(credentials=credentials)
|
|
678
|
+
|
|
679
|
+
def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
|
|
680
|
+
table_fields = table.split(".")
|
|
681
|
+
if len(table_fields) != 2:
|
|
682
|
+
raise ValueError("Table name must be in the format <schema>.<table>")
|
|
683
|
+
return {
|
|
684
|
+
"table_name": table_fields[-1],
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
def post_load(self):
|
|
688
|
+
pass
|
|
689
|
+
|
|
690
|
+
@staticmethod
|
|
691
|
+
def engine_settings(uri: str) -> dict[str, str]:
|
|
692
|
+
parsed_uri = urlparse(uri)
|
|
693
|
+
query_params = parse_qs(parsed_uri.query)
|
|
694
|
+
return {
|
|
695
|
+
key[len("engine.") :]: query_params[key][0]
|
|
696
|
+
for key in query_params
|
|
697
|
+
if key.startswith("engine.")
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
@staticmethod
|
|
701
|
+
def engine_type(uri: str) -> str | None:
|
|
702
|
+
parsed_uri = urlparse(uri)
|
|
703
|
+
query_params = parse_qs(parsed_uri.query)
|
|
704
|
+
values = query_params.get("engine")
|
|
705
|
+
if values:
|
|
706
|
+
return values[0]
|
|
707
|
+
return None
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
class BlobFSClient(dlt.destinations.impl.filesystem.filesystem.FilesystemClient):
    """Filesystem client whose dataset path is the bucket path itself."""

    @property
    def dataset_path(self):
        """Return the bucket path, overriding the parent to drop the dataset path."""
        return self.bucket_path
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
class BlobFS(dlt.destinations.filesystem):
    """Filesystem destination that uses ``BlobFSClient`` as its client."""

    @property
    def client_class(self):
        """Return the client class used by this destination."""
        return BlobFSClient
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
class SqliteDestination(GenericSqlDestination):
    """SQLite destination built on dlt's SQLAlchemy destination."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return a SQLAlchemy destination that uses ``uri`` as credentials."""
        return dlt.destinations.sqlalchemy(credentials=uri)

    def dlt_run_params(self, uri: str, table: str, **kwargs):
        """Return run parameters with the dataset fixed to ``"main"``."""
        # https://dlthub.com/docs/dlt-ecosystem/destinations/sqlalchemy#dataset-files
        run_params = {"dataset_name": "main"}
        run_params["table_name"] = table
        return run_params
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
class MySqlDestination(GenericSqlDestination):
    """MySQL destination built on dlt's SQLAlchemy destination."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return a SQLAlchemy destination that uses ``uri`` as credentials."""
        return dlt.destinations.sqlalchemy(credentials=uri)

    def dlt_run_params(self, uri: str, table: str, **kwargs):
        """Return run parameters using the URI path as the dataset name.

        Raises:
            ValueError: If the URI path names no database.
        """
        database = urlparse(uri).path.lstrip("/")
        if database:
            return {"dataset_name": database, "table_name": table}
        raise ValueError("You need to specify a database")
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
class TrinoTypeMapper:
    """Factory for a SQLAlchemy type mapper adapted to Trino."""

    @staticmethod
    def create_type_mapper():
        """Build and return the Trino type mapper class.

        Returns:
            A ``SqlalchemyTypeMapper`` subclass (the class, not an instance)
            that maps JSON, binary and string columns to ``Text`` and every
            integer column to ``BigInteger``.
        """
        from dlt.destinations.impl.sqlalchemy.type_mapper import SqlalchemyTypeMapper
        from sqlalchemy import BigInteger, Text
        from sqlalchemy.sql import sqltypes

        # dlt data types stored as unlimited text (JSON is converted to
        # VARCHAR for Trino's Iceberg catalog, and so is BINARY).
        text_data_types = ("json", "binary")
        # dlt uses "bigint" internally; the other spellings are accepted too.
        integer_data_types = ("bigint", "integer", "int")

        class CustomTrinoTypeMapper(SqlalchemyTypeMapper):
            """Type mapper that replaces types Trino does not support."""

            def to_destination_type(self, column, table=None):
                """Return the SQLAlchemy type to use for ``column``."""
                data_type = column.get("data_type", "")

                # Special cases are resolved without consulting the parent.
                if data_type in text_data_types:
                    return Text()
                if data_type in integer_data_types:
                    return BigInteger()

                try:
                    mapped = super().to_destination_type(column, table)
                except Exception:
                    # Whatever the parent cannot map falls back to TEXT.
                    return Text()

                if isinstance(mapped, sqltypes.BigInteger):
                    return mapped
                if isinstance(mapped, sqltypes.Integer):
                    # Any narrower integer type is widened to BIGINT.
                    return BigInteger()
                if isinstance(mapped, sqltypes.String):
                    # Unlimited TEXT avoids VARCHAR constraints that Trino
                    # doesn't support.
                    return Text()
                return mapped

        return CustomTrinoTypeMapper
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
class TrinoDestination(GenericSqlDestination):
    """Trino destination built on dlt's SQLAlchemy destination."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return a SQLAlchemy destination that uses the Trino type mapper.

        ``uri`` is passed as the credentials and all keyword arguments are
        forwarded to the factory unchanged.
        """
        from dlt.destinations.impl.sqlalchemy.factory import (
            sqlalchemy as sqlalchemy_factory,
        )

        # The factory is called directly so the custom type mapper can be
        # configured on the destination.
        trino_mapper = TrinoTypeMapper.create_type_mapper()
        return sqlalchemy_factory(credentials=uri, type_mapper=trino_mapper, **kwargs)
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
class BlobStorageDestination(abc.ABC):
    """Base class for destinations that write files to blob storage.

    Subclasses supply the URL ``protocol`` and the ``credentials`` builder.
    """

    @abc.abstractmethod
    def credentials(self, params: dict) -> FileSystemCredentials:
        """Build credentials for the blob storage destination."""
        pass

    @property
    @abc.abstractmethod
    def protocol(self) -> str:
        """The protocol used for the blob storage destination."""
        pass

    def dlt_dest(self, uri: str, **kwargs):
        """Build the ``BlobFS`` destination for ``uri`` and ``dest_table``.

        The bucket URL is ``<protocol>://`` followed by the URI netloc, the
        URI path and every segment of ``dest_table`` except the last one. An
        optional ``layout`` query parameter is forwarded to the destination.

        Raises:
            KeyError: If ``dest_table`` is not among the keyword arguments.
            ValueError: If the URI has no netloc and ``dest_table`` is not in
                ``{bucket-name}/{path}`` form.
        """
        parsed = urlparse(uri)
        query = parse_qs(parsed.query)
        creds = self.credentials(query)

        dest_table = kwargs["dest_table"]
        netloc = parsed.netloc

        # Without a netloc in the URI the table must carry the bucket itself,
        # so it is validated (and trimmed) first.
        if not netloc:
            dest_table = self.validate_table(dest_table)

        segments = []
        if netloc:
            segments.append(netloc.strip())
        if parsed.path.strip("/"):
            segments.extend(parsed.path.strip("/ ").split("/"))
        segments.extend(dest_table.split("/"))

        # The last segment is the table name; everything before it is the
        # location inside the storage.
        base_path = "/".join(segments[:-1])

        options = {
            "bucket_url": f"{self.protocol}://{base_path}",
            "credentials": creds,
            # suppresses dlt warnings about dataset name normalization.
            # we don't use dataset names in S3 so it's fine to disable this.
            "enable_dataset_name_normalization": False,
        }
        layout = query.get("layout", [None])[0]
        if layout is not None:
            options["layout"] = layout

        return BlobFS(**options)  # type: ignore

    def validate_table(self, table: str):
        """Trim slashes and spaces from ``table`` and check its shape.

        Raises:
            ValueError: If the trimmed value has fewer than two
                ``/``-separated parts.
        """
        cleaned = table.strip("/ ")
        if cleaned.count("/") < 1:
            raise ValueError("Table name must be in the format {bucket-name}/{path}")
        return cleaned

    def dlt_run_params(self, uri: str, table: str, **kwargs):
        """Return run parameters naming the last path segment of ``table``."""
        return {
            "table_name": table.rsplit("/", 1)[-1].strip(),
        }

    def post_load(self) -> None:
        """No post-load work is performed for this destination."""
        pass
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
class S3Destination(BlobStorageDestination):
    """Blob storage destination for S3 (``s3://`` URLs)."""

    @property
    def protocol(self) -> str:
        """The URL scheme used for S3 bucket URLs."""
        return "s3"

    def credentials(self, params: dict) -> FileSystemCredentials:
        """Build AWS credentials from parsed query parameters.

        Args:
            params: Mapping of parameter name to list of values. It must
                contain ``access_key_id`` and ``secret_access_key`` and may
                contain ``endpoint_url``.

        Raises:
            MissingValueError: If a required key is absent.
            ValueError: If ``endpoint_url`` lacks a scheme or a netloc.
        """

        def first(name):
            # First value supplied for the parameter, or None when absent.
            return params.get(name, [None])[0]

        access_key_id = first("access_key_id")
        if access_key_id is None:
            raise MissingValueError("access_key_id", "S3")

        secret_access_key = first("secret_access_key")
        if secret_access_key is None:
            raise MissingValueError("secret_access_key", "S3")

        endpoint_url = first("endpoint_url")
        if endpoint_url is not None:
            endpoint = urlparse(endpoint_url)
            if not (endpoint.scheme and endpoint.netloc):
                raise ValueError("Invalid endpoint_url. Must be a valid URL.")

        return AwsCredentials(
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key,
            endpoint_url=endpoint_url,
        )
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
class GCSDestination(BlobStorageDestination):
    """Blob storage destination for Google Cloud Storage (``gs://`` URLs)."""

    @property
    def protocol(self) -> str:
        """The URL scheme used for GCS bucket URLs."""
        return "gs"

    def credentials(self, params: dict) -> FileSystemCredentials:
        """Builds GCS credentials from the provided parameters.

        Args:
            params: Mapping of parameter name to list of values, containing
                ``credentials_path`` (path to a JSON file) and/or
                ``credentials_base64`` (base64-encoded JSON). The path takes
                precedence when both are present.

        Returns:
            The parsed JSON credentials.

        Raises:
            MissingValueError: If neither parameter is present.
        """
        credentials_path = params.get("credentials_path")
        credentials_base64 = params.get("credentials_base64")

        if credentials_path is None and credentials_base64 is None:
            raise MissingValueError("credentials_path or credentials_base64", "GCS")

        if credentials_path:
            with open(credentials_path[0], "r") as handle:
                return json.load(handle)

        decoded = base64.b64decode(credentials_base64[0]).decode()  # type: ignore
        return json.loads(decoded)
|
|
936
|
+
|
|
937
|
+
|
|
938
|
+
class ElasticsearchDestination:
    """Destination adapter for Elasticsearch."""

    def dlt_dest(self, uri: str, **kwargs):
        """Build the Elasticsearch destination described by ``uri``.

        The connection string is rebuilt from the URI as
        ``scheme://[user:password@]host:port[?query]``. Missing parts default
        to scheme ``http``, host ``localhost`` and port ``9200``. Credentials
        are included only when both username and password are present.

        Args:
            uri: Elasticsearch connection URI.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The result of ``elasticsearch_insert`` for the connection string.
        """
        from urllib.parse import urlparse

        parsed_uri = urlparse(uri)

        # Extract connection details from URI
        scheme = parsed_uri.scheme or "http"
        host = parsed_uri.hostname or "localhost"
        port = parsed_uri.port or 9200
        username = parsed_uri.username
        password = parsed_uri.password

        # urlparse() strips the square brackets from IPv6 literals; restore
        # them so the rebuilt "host:port" pair stays a valid authority.
        if ":" in host:
            host = f"[{host}]"

        # Build connection string
        if username and password:
            connection_string = f"{scheme}://{username}:{password}@{host}:{port}"
        else:
            connection_string = f"{scheme}://{host}:{port}"

        # Add query parameters if any
        if parsed_uri.query:
            connection_string += f"?{parsed_uri.query}"

        return elasticsearch_insert(connection_string=connection_string)

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Return run parameters that use ``table`` as the table name."""
        return {
            "table_name": table,
        }

    def post_load(self):
        """No post-load work is performed for this destination."""
        pass
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
class MongoDBDestination:
    """Destination adapter for MongoDB."""

    def dlt_dest(self, uri: str, **kwargs):
        """Return the result of ``mongodb_insert`` for ``uri``."""
        return mongodb_insert(uri)

    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
        """Return run parameters that use ``table`` as the table name."""
        run_params = {}
        run_params["table_name"] = table
        return run_params

    def post_load(self):
        """No post-load work is performed for this destination."""
        pass
|