ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin_max/__init__.py +6 -4
  10. ingestr/src/appsflyer/__init__.py +325 -0
  11. ingestr/src/appsflyer/client.py +49 -45
  12. ingestr/src/appstore/__init__.py +1 -0
  13. ingestr/src/arrow/__init__.py +9 -1
  14. ingestr/src/asana_source/__init__.py +1 -1
  15. ingestr/src/attio/__init__.py +102 -0
  16. ingestr/src/attio/helpers.py +65 -0
  17. ingestr/src/blob.py +37 -10
  18. ingestr/src/buildinfo.py +1 -1
  19. ingestr/src/chess/__init__.py +1 -1
  20. ingestr/src/clickup/__init__.py +85 -0
  21. ingestr/src/clickup/helpers.py +47 -0
  22. ingestr/src/collector/spinner.py +43 -0
  23. ingestr/src/couchbase_source/__init__.py +118 -0
  24. ingestr/src/couchbase_source/helpers.py +135 -0
  25. ingestr/src/cursor/__init__.py +83 -0
  26. ingestr/src/cursor/helpers.py +188 -0
  27. ingestr/src/destinations.py +508 -27
  28. ingestr/src/docebo/__init__.py +589 -0
  29. ingestr/src/docebo/client.py +435 -0
  30. ingestr/src/docebo/helpers.py +97 -0
  31. ingestr/src/elasticsearch/__init__.py +80 -0
  32. ingestr/src/elasticsearch/helpers.py +138 -0
  33. ingestr/src/errors.py +8 -0
  34. ingestr/src/facebook_ads/__init__.py +47 -28
  35. ingestr/src/facebook_ads/helpers.py +59 -37
  36. ingestr/src/facebook_ads/settings.py +2 -0
  37. ingestr/src/facebook_ads/utils.py +39 -0
  38. ingestr/src/factory.py +107 -2
  39. ingestr/src/filesystem/__init__.py +8 -3
  40. ingestr/src/filters.py +46 -3
  41. ingestr/src/fluxx/__init__.py +9906 -0
  42. ingestr/src/fluxx/helpers.py +209 -0
  43. ingestr/src/frankfurter/__init__.py +157 -0
  44. ingestr/src/frankfurter/helpers.py +48 -0
  45. ingestr/src/freshdesk/__init__.py +89 -0
  46. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  47. ingestr/src/freshdesk/settings.py +9 -0
  48. ingestr/src/fundraiseup/__init__.py +95 -0
  49. ingestr/src/fundraiseup/client.py +81 -0
  50. ingestr/src/github/__init__.py +41 -6
  51. ingestr/src/github/helpers.py +5 -5
  52. ingestr/src/google_analytics/__init__.py +22 -4
  53. ingestr/src/google_analytics/helpers.py +124 -6
  54. ingestr/src/google_sheets/__init__.py +4 -4
  55. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  56. ingestr/src/hostaway/__init__.py +302 -0
  57. ingestr/src/hostaway/client.py +288 -0
  58. ingestr/src/http/__init__.py +35 -0
  59. ingestr/src/http/readers.py +114 -0
  60. ingestr/src/http_client.py +24 -0
  61. ingestr/src/hubspot/__init__.py +66 -23
  62. ingestr/src/hubspot/helpers.py +52 -22
  63. ingestr/src/hubspot/settings.py +14 -7
  64. ingestr/src/influxdb/__init__.py +46 -0
  65. ingestr/src/influxdb/client.py +34 -0
  66. ingestr/src/intercom/__init__.py +142 -0
  67. ingestr/src/intercom/helpers.py +674 -0
  68. ingestr/src/intercom/settings.py +279 -0
  69. ingestr/src/isoc_pulse/__init__.py +159 -0
  70. ingestr/src/jira_source/__init__.py +340 -0
  71. ingestr/src/jira_source/helpers.py +439 -0
  72. ingestr/src/jira_source/settings.py +170 -0
  73. ingestr/src/kafka/__init__.py +4 -1
  74. ingestr/src/kinesis/__init__.py +139 -0
  75. ingestr/src/kinesis/helpers.py +82 -0
  76. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  77. ingestr/src/linear/__init__.py +634 -0
  78. ingestr/src/linear/helpers.py +111 -0
  79. ingestr/src/linkedin_ads/helpers.py +0 -1
  80. ingestr/src/mailchimp/__init__.py +126 -0
  81. ingestr/src/mailchimp/helpers.py +226 -0
  82. ingestr/src/mailchimp/settings.py +164 -0
  83. ingestr/src/masking.py +344 -0
  84. ingestr/src/mixpanel/__init__.py +62 -0
  85. ingestr/src/mixpanel/client.py +99 -0
  86. ingestr/src/monday/__init__.py +246 -0
  87. ingestr/src/monday/helpers.py +392 -0
  88. ingestr/src/monday/settings.py +328 -0
  89. ingestr/src/mongodb/__init__.py +72 -8
  90. ingestr/src/mongodb/helpers.py +915 -38
  91. ingestr/src/partition.py +32 -0
  92. ingestr/src/phantombuster/__init__.py +65 -0
  93. ingestr/src/phantombuster/client.py +87 -0
  94. ingestr/src/pinterest/__init__.py +82 -0
  95. ingestr/src/pipedrive/__init__.py +198 -0
  96. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  97. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  98. ingestr/src/pipedrive/helpers/pages.py +115 -0
  99. ingestr/src/pipedrive/settings.py +27 -0
  100. ingestr/src/pipedrive/typing.py +3 -0
  101. ingestr/src/plusvibeai/__init__.py +335 -0
  102. ingestr/src/plusvibeai/helpers.py +544 -0
  103. ingestr/src/plusvibeai/settings.py +252 -0
  104. ingestr/src/quickbooks/__init__.py +117 -0
  105. ingestr/src/resource.py +40 -0
  106. ingestr/src/revenuecat/__init__.py +83 -0
  107. ingestr/src/revenuecat/helpers.py +237 -0
  108. ingestr/src/salesforce/__init__.py +15 -8
  109. ingestr/src/shopify/__init__.py +1 -17
  110. ingestr/src/smartsheets/__init__.py +82 -0
  111. ingestr/src/snapchat_ads/__init__.py +489 -0
  112. ingestr/src/snapchat_ads/client.py +72 -0
  113. ingestr/src/snapchat_ads/helpers.py +535 -0
  114. ingestr/src/socrata_source/__init__.py +83 -0
  115. ingestr/src/socrata_source/helpers.py +85 -0
  116. ingestr/src/socrata_source/settings.py +8 -0
  117. ingestr/src/solidgate/__init__.py +219 -0
  118. ingestr/src/solidgate/helpers.py +154 -0
  119. ingestr/src/sources.py +2933 -245
  120. ingestr/src/stripe_analytics/__init__.py +49 -21
  121. ingestr/src/stripe_analytics/helpers.py +286 -1
  122. ingestr/src/stripe_analytics/settings.py +62 -10
  123. ingestr/src/telemetry/event.py +10 -9
  124. ingestr/src/tiktok_ads/__init__.py +12 -6
  125. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  126. ingestr/src/trustpilot/__init__.py +48 -0
  127. ingestr/src/trustpilot/client.py +48 -0
  128. ingestr/src/wise/__init__.py +68 -0
  129. ingestr/src/wise/client.py +63 -0
  130. ingestr/src/zoom/__init__.py +99 -0
  131. ingestr/src/zoom/helpers.py +102 -0
  132. ingestr/tests/unit/test_smartsheets.py +133 -0
  133. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
  134. ingestr-0.14.104.dist-info/RECORD +203 -0
  135. ingestr/src/appsflyer/_init_.py +0 -24
  136. ingestr-0.13.13.dist-info/RECORD +0 -115
  137. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  138. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  139. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,22 +1,44 @@
+import abc
 import base64
 import csv
+import datetime
 import json
 import os
 import shutil
+import struct
 import tempfile
 from urllib.parse import parse_qs, quote, urlparse
 
 import dlt
+import dlt.destinations.impl.filesystem.filesystem
 from dlt.common.configuration.specs import AwsCredentials
+from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+from dlt.common.schema import Schema
+from dlt.common.storages.configuration import FileSystemCredentials
 from dlt.destinations.impl.clickhouse.configuration import (
     ClickHouseCredentials,
 )
 
+from ingestr.src.elasticsearch.helpers import elasticsearch_insert
+from ingestr.src.errors import MissingValueError
 from ingestr.src.loader import load_dlt_file
+from ingestr.src.mongodb.helpers import mongodb_insert
 
 
 class GenericSqlDestination:
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        if uri.startswith("databricks://"):
+            p = urlparse(uri)
+            q = parse_qs(p.query)
+            schema = q.get("schema", [None])[0]
+            if not schema:
+                raise ValueError("Databricks requires schema in the URI.")
+            res = {
+                "dataset_name": schema,
+                "table_name": table,
+            }
+            return res
+
         table_fields = table.split(".")
         if len(table_fields) != 2:
             raise ValueError("Table name must be in the format <schema>.<table>")
@@ -60,13 +82,29 @@ class BigQueryDestination:
                 base64.b64decode(credentials_base64[0]).decode("utf-8")
             )
 
+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            if not staging_bucket.startswith("gs://"):
+                raise ValueError("Staging bucket must start with gs://")
+
+            os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = staging_bucket
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PROJECT_ID"] = (
+                credentials.get("project_id", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PRIVATE_KEY"] = (
+                credentials.get("private_key", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__CLIENT_EMAIL"] = (
+                credentials.get("client_email", None)
+            )
+
         project_id = None
         if source_fields.hostname:
             project_id = source_fields.hostname
 
         return dlt.destinations.bigquery(
             credentials=credentials,  # type: ignore
-            location=location,
+            location=location,  # type: ignore
             project_id=project_id,
             **kwargs,
         )
@@ -83,12 +121,24 @@ class BigQueryDestination:
             "table_name": table_fields[-1],
         }
 
+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            res["staging"] = "filesystem"
+
         return res
 
     def post_load(self):
         pass
 
 
+class CrateDBDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        uri = uri.replace("cratedb://", "postgres://")
+        import dlt_cratedb.impl.cratedb.factory
+
+        return dlt_cratedb.impl.cratedb.factory.cratedb(credentials=uri, **kwargs)
+
+
 class PostgresDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
         return dlt.destinations.postgres(credentials=uri, **kwargs)
@@ -111,14 +161,149 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)
 
 
+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
+def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
+    # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
+    tup = struct.unpack(
+        "<6hI2h", dto_value
+    )  # e.g., (2017, 3, 16, 10, 35, 18, 500000000, -6, 0)
+    return datetime.datetime(
+        tup[0],
+        tup[1],
+        tup[2],
+        tup[3],
+        tup[4],
+        tup[5],
+        tup[6] // 1000,
+        datetime.timezone(datetime.timedelta(hours=tup[7], minutes=tup[8])),
+    )
+
+
+# MSSQL_COPT_SS_ACCESS_TOKEN is a connection attribute used to pass
+# an Azure Active Directory access token to the SQL Server ODBC driver.
+MSSQL_COPT_SS_ACCESS_TOKEN = 1256
+
+
+def serialize_azure_token(token):
+    # https://github.com/mkleehammer/pyodbc/issues/228#issuecomment-494773723
+    encoded = token.encode("utf_16_le")
+    return struct.pack("<i", len(encoded)) + encoded
+
+
+def build_mssql_dest():
+    # https://github.com/bruin-data/ingestr/issues/293
+
+    from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration
+    from dlt.destinations.impl.mssql.mssql import (
+        HINT_TO_MSSQL_ATTR,
+        MsSqlJobClient,
+    )
+    from dlt.destinations.impl.mssql.sql_client import (
+        PyOdbcMsSqlClient,
+    )
+
+    class OdbcMsSqlClient(PyOdbcMsSqlClient):
+        SKIP_CREDENTIALS = {"PWD", "AUTHENTICATION", "UID"}
+
+        def open_connection(self):
+            cfg = self.credentials._get_odbc_dsn_dict()
+            if (
+                cfg.get("AUTHENTICATION", "").strip().lower()
+                != "activedirectoryaccesstoken"
+            ):
+                return super().open_connection()
+
+            import pyodbc  # type: ignore
+
+            dsn = ";".join(
+                [f"{k}={v}" for k, v in cfg.items() if k not in self.SKIP_CREDENTIALS]
+            )
+
+            self._conn = pyodbc.connect(
+                dsn,
+                timeout=self.credentials.connect_timeout,
+                attrs_before={
+                    MSSQL_COPT_SS_ACCESS_TOKEN: serialize_azure_token(cfg["PWD"]),
+                },
+            )
+
+            # https://github.com/mkleehammer/pyodbc/wiki/Using-an-Output-Converter-function
+            self._conn.add_output_converter(-155, handle_datetimeoffset)
+            self._conn.autocommit = True
+            return self._conn
+
+    class MsSqlClient(MsSqlJobClient):
+        def __init__(
+            self,
+            schema: Schema,
+            config: MsSqlClientConfiguration,
+            capabilities: DestinationCapabilitiesContext,
+        ) -> None:
+            sql_client = OdbcMsSqlClient(
+                config.normalize_dataset_name(schema),
+                config.normalize_staging_dataset_name(schema),
+                config.credentials,
+                capabilities,
+            )
+            super(MsSqlJobClient, self).__init__(schema, config, sql_client)
+            self.config: MsSqlClientConfiguration = config
+            self.sql_client = sql_client
+            self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {}
+            self.type_mapper = capabilities.get_type_mapper()
+
+    class MsSqlDestImpl(dlt.destinations.mssql):
+        @property
+        def client_class(self):
+            return MsSqlClient
+
+    return MsSqlDestImpl
+
+
 class MsSQLDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-        return dlt.destinations.mssql(credentials=uri, **kwargs)
+        cls = build_mssql_dest()
+        return cls(credentials=uri, **kwargs)
 
 
 class DatabricksDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-        return dlt.destinations.databricks(credentials=uri, **kwargs)
+        p = urlparse(uri)
+        q = parse_qs(p.query)
+        access_token = p.password
+        server_hostname = p.hostname
+        http_path = q.get("http_path", [None])[0]
+        catalog = q.get("catalog", [None])[0]
+        schema = q.get("schema", [None])[0]
+
+        creds = {
+            "access_token": access_token,
+            "server_hostname": server_hostname,
+            "http_path": http_path,
+            "catalog": catalog,
+            "schema": schema,
+        }
+
+        return dlt.destinations.databricks(
+            credentials=creds,
+            **kwargs,
+        )
 
 
 class SynapseDestination(GenericSqlDestination):
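Note: the new DatabricksDestination reads every connection detail from the URI itself (the password field carries the access token; http_path, catalog and schema arrive as query parameters). A minimal sketch of that decomposition, using a purely hypothetical connection string rather than anything shipped with the package:

    from urllib.parse import parse_qs, urlparse

    # hypothetical URI, mirroring the parsing in DatabricksDestination.dlt_dest
    uri = "databricks://:dapiEXAMPLETOKEN@adb-123.azuredatabricks.net?http_path=/sql/1.0/warehouses/abc&catalog=main&schema=analytics"
    p = urlparse(uri)
    q = parse_qs(p.query)
    print(p.password)                     # access token -> "dapiEXAMPLETOKEN"
    print(p.hostname)                     # server hostname -> "adb-123.azuredatabricks.net"
    print(q.get("http_path", [None])[0])  # "/sql/1.0/warehouses/abc"
    print(q.get("catalog", [None])[0])    # "main"
    print(q.get("schema", [None])[0])     # "analytics"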
@@ -215,43 +400,64 @@ class AthenaDestination:
         if not bucket.startswith("s3://"):
             bucket = f"s3://{bucket}"
 
-        query_result_path = source_params.get("query_results_path", [None])[0]
-        if query_result_path:
-            if not query_result_path.startswith("s3://"):
-                query_result_path = f"s3://{query_result_path}"
-        else:
-            query_result_path = bucket
+        bucket = bucket.rstrip("/")
 
-        access_key_id = source_params.get("access_key_id", [None])[0]
-        if not access_key_id:
-            raise ValueError("The AWS access_key_id is required to connect to Athena.")
+        dest_table = kwargs.get("dest_table", None)
+        if not dest_table:
+            raise ValueError("A destination table is required to connect to Athena.")
 
-        secret_access_key = source_params.get("secret_access_key", [None])[0]
-        if not secret_access_key:
-            raise ValueError("The AWS secret_access_key is required to connect Athena")
+        dest_table_fields = dest_table.split(".")
+        if len(dest_table_fields) != 2:
+            raise ValueError(
+                f"Table name must be in the format <schema>.<table>, given: {dest_table}"
+            )
 
-        work_group = source_params.get("workgroup", [None])[0]
+        query_result_path = f"{bucket}/{dest_table_fields[0]}_staging/metadata"
 
+        access_key_id = source_params.get("access_key_id", [None])[0]
+        secret_access_key = source_params.get("secret_access_key", [None])[0]
+        session_token = source_params.get("session_token", [None])[0]
+        profile_name = source_params.get("profile", ["default"])[0]
         region_name = source_params.get("region_name", [None])[0]
+
+        if not access_key_id and not secret_access_key:
+            import botocore.session  # type: ignore
+
+            session = botocore.session.Session(profile=profile_name)
+            default = session.get_credentials()
+            if not profile_name:
+                raise ValueError(
+                    "You have to either provide access_key_id and secret_access_key pair or a valid AWS profile name."
+                )
+            access_key_id = default.access_key
+            secret_access_key = default.secret_key
+            session_token = default.token
+            if region_name is None:
+                region_name = session.get_config_variable("region")
+
         if not region_name:
             raise ValueError("The region_name is required to connect to Athena.")
 
         os.environ["DESTINATION__BUCKET_URL"] = bucket
-        os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
-        os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
-            secret_access_key
-        )
+        if access_key_id and secret_access_key:
+            os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
+            os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
+                secret_access_key
+            )
+        if session_token:
+            os.environ["DESTINATION__CREDENTIALS__AWS_SESSION_TOKEN"] = session_token
 
-        credentials = AwsCredentials(
-            aws_access_key_id=access_key_id,
-            aws_secret_access_key=secret_access_key,
-            region_name=region_name,
-        )
         return dlt.destinations.athena(
             query_result_bucket=query_result_path,
-            athena_work_group=work_group,
-            credentials=credentials,
+            athena_work_group=source_params.get("workgroup", [None])[0],  # type: ignore
+            credentials=AwsCredentials(
+                aws_access_key_id=access_key_id,  # type: ignore
+                aws_secret_access_key=secret_access_key,  # type: ignore
+                aws_session_token=session_token,
+                region_name=region_name,
+            ),
             destination_name=bucket,
+            force_iceberg=True,
        )
 
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
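Note: the reworked AthenaDestination falls back to a named AWS profile when no access_key_id/secret_access_key pair is supplied in the URI. A minimal sketch of that botocore-based resolution, assuming a locally configured "default" profile with a region set:

    import botocore.session

    session = botocore.session.Session(profile="default")
    creds = session.get_credentials()  # access key, secret key, optional session token
    region = session.get_config_variable("region")
    if creds is None or not region:
        raise ValueError("the profile is missing credentials or a region")
    print(creds.access_key, creds.token is not None, region)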
@@ -341,3 +547,278 @@ class ClickhouseDestination:
 
     def post_load(self):
         pass
+
+
+class BlobFSClient(dlt.destinations.impl.filesystem.filesystem.FilesystemClient):
+    @property
+    def dataset_path(self):
+        # override to remove dataset path
+        return self.bucket_path
+
+
+class BlobFS(dlt.destinations.filesystem):
+    @property
+    def client_class(self):
+        return BlobFSClient
+
+
+class SqliteDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        return {
+            # https://dlthub.com/docs/dlt-ecosystem/destinations/sqlalchemy#dataset-files
+            "dataset_name": "main",
+            "table_name": table,
+        }
+
+
+class MySqlDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        parsed = urlparse(uri)
+        database = parsed.path.lstrip("/")
+        if not database:
+            raise ValueError("You need to specify a database")
+        return {
+            "dataset_name": database,
+            "table_name": table,
+        }
+
+
+class TrinoTypeMapper:
+    """Custom type mapper for Trino to handle unsupported types."""
+
+    @staticmethod
+    def create_type_mapper():
+        """Create a custom type mapper for Trino."""
+        from dlt.destinations.impl.sqlalchemy.type_mapper import SqlalchemyTypeMapper
+        from sqlalchemy import BigInteger, Text
+        from sqlalchemy.sql import sqltypes
+
+        class CustomTrinoTypeMapper(SqlalchemyTypeMapper):
+            """Custom type mapper that converts unsupported Trino types."""
+
+            def to_destination_type(self, column, table=None):
+                # Handle special cases before calling parent
+                data_type = column.get("data_type", "")
+
+                # Convert JSON to VARCHAR for Trino's Iceberg catalog
+                if data_type == "json":
+                    # Use TEXT (unlimited VARCHAR) for JSON data
+                    return Text()
+
+                # Convert BINARY to VARCHAR
+                if data_type == "binary":
+                    return Text()
+
+                # Handle integer types - always use BIGINT for Trino
+                # Note: dlt uses "bigint" internally, not "integer"
+                if data_type in ["bigint", "integer", "int"]:
+                    return BigInteger()
+
+                # For other types, try parent mapper
+                try:
+                    type_ = super().to_destination_type(column, table)
+                except Exception:
+                    # If parent can't handle it, default to TEXT
+                    return Text()
+
+                # Convert any INTEGER type to BIGINT
+                if isinstance(type_, sqltypes.Integer) and not isinstance(
+                    type_, sqltypes.BigInteger
+                ):
+                    return BigInteger()
+
+                # Ensure VARCHAR types don't have constraints that Trino doesn't support
+                if isinstance(type_, sqltypes.String):
+                    # Return TEXT for unlimited string
+                    return Text()
+
+                return type_
+
+        return CustomTrinoTypeMapper
+
+
+class TrinoDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        # Import required modules
+        from dlt.destinations.impl.sqlalchemy.factory import (
+            sqlalchemy as sqlalchemy_factory,
+        )
+
+        # Create the destination with custom type mapper
+        # We need to use the factory to properly configure the type mapper
+        dest = sqlalchemy_factory(
+            credentials=uri, type_mapper=TrinoTypeMapper.create_type_mapper(), **kwargs
+        )
+
+        return dest
+
+
+class BlobStorageDestination(abc.ABC):
+    @abc.abstractmethod
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Build credentials for the blob storage destination."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def protocol(self) -> str:
+        """The protocol used for the blob storage destination."""
+        pass
+
+    def dlt_dest(self, uri: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        creds = self.credentials(params)
+
+        dest_table = kwargs["dest_table"]
+
+        # only validate if dest_table is not a full URI
+        if not parsed_uri.netloc:
+            dest_table = self.validate_table(dest_table)
+
+        table_parts = dest_table.split("/")
+
+        if parsed_uri.path.strip("/"):
+            path_parts = parsed_uri.path.strip("/ ").split("/")
+            table_parts = path_parts + table_parts
+
+        if parsed_uri.netloc:
+            table_parts.insert(0, parsed_uri.netloc.strip())
+
+        base_path = "/".join(table_parts[:-1])
+
+        opts = {
+            "bucket_url": f"{self.protocol}://{base_path}",
+            "credentials": creds,
+            # supresses dlt warnings about dataset name normalization.
+            # we don't use dataset names in S3 so it's fine to disable this.
+            "enable_dataset_name_normalization": False,
+        }
+        layout = params.get("layout", [None])[0]
+        if layout is not None:
+            opts["layout"] = layout
+
+        return BlobFS(**opts)  # type: ignore
+
+    def validate_table(self, table: str):
+        table = table.strip("/ ")
+        if len(table.split("/")) < 2:
+            raise ValueError("Table name must be in the format {bucket-name}/{path}")
+        return table
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        table_parts = table.split("/")
+        return {
+            "table_name": table_parts[-1].strip(),
+        }
+
+    def post_load(self) -> None:
+        pass
+
+
+class S3Destination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "s3"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        access_key_id = params.get("access_key_id", [None])[0]
+        if access_key_id is None:
+            raise MissingValueError("access_key_id", "S3")
+
+        secret_access_key = params.get("secret_access_key", [None])[0]
+        if secret_access_key is None:
+            raise MissingValueError("secret_access_key", "S3")
+
+        endpoint_url = params.get("endpoint_url", [None])[0]
+        if endpoint_url is not None:
+            parsed_endpoint = urlparse(endpoint_url)
+            if not parsed_endpoint.scheme or not parsed_endpoint.netloc:
+                raise ValueError("Invalid endpoint_url. Must be a valid URL.")
+
+        return AwsCredentials(
+            aws_access_key_id=access_key_id,
+            aws_secret_access_key=secret_access_key,
+            endpoint_url=endpoint_url,
+        )
+
+
+class GCSDestination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "gs"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Builds GCS credentials from the provided parameters."""
+        credentials_path = params.get("credentials_path")
+        credentials_base64 = params.get("credentials_base64")
+        credentials_available = any(
+            map(
+                lambda x: x is not None,
+                [credentials_path, credentials_base64],
+            )
+        )
+        if credentials_available is False:
+            raise MissingValueError("credentials_path or credentials_base64", "GCS")
+
+        credentials = None
+        if credentials_path:
+            with open(credentials_path[0], "r") as f:
+                credentials = json.load(f)
+        else:
+            credentials = json.loads(base64.b64decode(credentials_base64[0]).decode())  # type: ignore
+
+        return credentials
+
+
+class ElasticsearchDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import urlparse
+
+        parsed_uri = urlparse(uri)
+
+        # Extract connection details from URI
+        scheme = parsed_uri.scheme or "http"
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 9200
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        # Build connection string
+        if username and password:
+            connection_string = f"{scheme}://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"{scheme}://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+        return elasticsearch_insert(connection_string=connection_string)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
+
+
+class MongoDBDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        return mongodb_insert(uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
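Note: handle_datetimeoffset (added in the destinations.py diff above) decodes the 20-byte value the SQL Server ODBC driver returns for DATETIMEOFFSET columns: six shorts for year through second, an unsigned int of nanoseconds, and two shorts for the timezone offset. A minimal round-trip sketch of that decoding, using a hand-packed sample value rather than a real driver result:

    import datetime
    import struct

    # pack the example from the diff: 2017-03-16 10:35:18.5 at UTC-06:00
    raw = struct.pack("<6hI2h", 2017, 3, 16, 10, 35, 18, 500_000_000, -6, 0)
    tup = struct.unpack("<6hI2h", raw)
    value = datetime.datetime(
        tup[0], tup[1], tup[2], tup[3], tup[4], tup[5],
        tup[6] // 1000,  # nanoseconds -> microseconds
        datetime.timezone(datetime.timedelta(hours=tup[7], minutes=tup[8])),
    )
    print(value)  # 2017-03-16 10:35:18.500000-06:00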