ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,21 +1,44 @@
+import abc
 import base64
 import csv
+import datetime
 import json
 import os
 import shutil
+import struct
 import tempfile
 from urllib.parse import parse_qs, quote, urlparse
 
 import dlt
-import pyarrow.parquet  # type: ignore
+import dlt.destinations.impl.filesystem.filesystem
 from dlt.common.configuration.specs import AwsCredentials
+from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+from dlt.common.schema import Schema
+from dlt.common.storages.configuration import FileSystemCredentials
 from dlt.destinations.impl.clickhouse.configuration import (
     ClickHouseCredentials,
 )
 
+from ingestr.src.elasticsearch.helpers import elasticsearch_insert
+from ingestr.src.errors import MissingValueError
+from ingestr.src.loader import load_dlt_file
+from ingestr.src.mongodb.helpers import mongodb_insert
+
 
 class GenericSqlDestination:
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        if uri.startswith("databricks://"):
+            p = urlparse(uri)
+            q = parse_qs(p.query)
+            schema = q.get("schema", [None])[0]
+            if not schema:
+                raise ValueError("Databricks requires schema in the URI.")
+            res = {
+                "dataset_name": schema,
+                "table_name": table,
+            }
+            return res
+
         table_fields = table.split(".")
         if len(table_fields) != 2:
             raise ValueError("Table name must be in the format <schema>.<table>")
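
For illustration, a minimal sketch of how the new databricks:// branch above resolves the dataset name; the URI below is a made-up placeholder:

    from urllib.parse import parse_qs, urlparse

    # hypothetical URI; only the schema query parameter matters for dataset_name
    uri = "databricks://token:dapi-xxxx@adb-1234.azuredatabricks.net?http_path=/sql/1.0/warehouses/abc&catalog=main&schema=analytics"
    p = urlparse(uri)
    q = parse_qs(p.query)

    schema = q.get("schema", [None])[0]
    print(schema)  # "analytics" -> used as dlt's dataset_name, with `table` as table_name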
@@ -59,9 +82,30 @@ class BigQueryDestination:
             base64.b64decode(credentials_base64[0]).decode("utf-8")
         )
 
+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            if not staging_bucket.startswith("gs://"):
+                raise ValueError("Staging bucket must start with gs://")
+
+            os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = staging_bucket
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PROJECT_ID"] = (
+                credentials.get("project_id", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__PRIVATE_KEY"] = (
+                credentials.get("private_key", None)
+            )
+            os.environ["DESTINATION__FILESYSTEM__CREDENTIALS__CLIENT_EMAIL"] = (
+                credentials.get("client_email", None)
+            )
+
+        project_id = None
+        if source_fields.hostname:
+            project_id = source_fields.hostname
+
         return dlt.destinations.bigquery(
             credentials=credentials,  # type: ignore
-            location=location,
+            location=location,  # type: ignore
+            project_id=project_id,
             **kwargs,
         )
 
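
For illustration, one way to produce the credentials_base64 value that the BigQuery destination decodes in the context above; the file name is a placeholder:

    import base64

    # encode a service-account JSON file so it can be passed as credentials_base64
    with open("service-account.json", "rb") as f:
        credentials_base64 = base64.b64encode(f.read()).decode("utf-8")
    # the destination then recovers the JSON via base64.b64decode(...).decode("utf-8")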
@@ -77,12 +121,24 @@ class BigQueryDestination:
             "table_name": table_fields[-1],
         }
 
+        staging_bucket = kwargs.get("staging_bucket", None)
+        if staging_bucket:
+            res["staging"] = "filesystem"
+
         return res
 
     def post_load(self):
         pass
 
 
+class CrateDBDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        uri = uri.replace("cratedb://", "postgres://")
+        import dlt_cratedb.impl.cratedb.factory
+
+        return dlt_cratedb.impl.cratedb.factory.cratedb(credentials=uri, **kwargs)
+
+
 class PostgresDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
         return dlt.destinations.postgres(credentials=uri, **kwargs)
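
For illustration, the new CrateDB destination above only rewrites the URI scheme so the connection string can be consumed as PostgreSQL-style credentials (CrateDB is compatible with the PostgreSQL wire protocol); host and credentials here are made up:

    uri = "cratedb://crate:secret@my-cluster.example.com:5432/doc"
    uri = uri.replace("cratedb://", "postgres://")
    print(uri)  # postgres://crate:secret@my-cluster.example.com:5432/doc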
@@ -105,14 +161,149 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)
 
 
+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
+def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
+    # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
+    tup = struct.unpack(
+        "<6hI2h", dto_value
+    )  # e.g., (2017, 3, 16, 10, 35, 18, 500000000, -6, 0)
+    return datetime.datetime(
+        tup[0],
+        tup[1],
+        tup[2],
+        tup[3],
+        tup[4],
+        tup[5],
+        tup[6] // 1000,
+        datetime.timezone(datetime.timedelta(hours=tup[7], minutes=tup[8])),
+    )
+
+
+# MSSQL_COPT_SS_ACCESS_TOKEN is a connection attribute used to pass
+# an Azure Active Directory access token to the SQL Server ODBC driver.
+MSSQL_COPT_SS_ACCESS_TOKEN = 1256
+
+
+def serialize_azure_token(token):
+    # https://github.com/mkleehammer/pyodbc/issues/228#issuecomment-494773723
+    encoded = token.encode("utf_16_le")
+    return struct.pack("<i", len(encoded)) + encoded
+
+
+def build_mssql_dest():
+    # https://github.com/bruin-data/ingestr/issues/293
+
+    from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration
+    from dlt.destinations.impl.mssql.mssql import (
+        HINT_TO_MSSQL_ATTR,
+        MsSqlJobClient,
+    )
+    from dlt.destinations.impl.mssql.sql_client import (
+        PyOdbcMsSqlClient,
+    )
+
+    class OdbcMsSqlClient(PyOdbcMsSqlClient):
+        SKIP_CREDENTIALS = {"PWD", "AUTHENTICATION", "UID"}
+
+        def open_connection(self):
+            cfg = self.credentials._get_odbc_dsn_dict()
+            if (
+                cfg.get("AUTHENTICATION", "").strip().lower()
+                != "activedirectoryaccesstoken"
+            ):
+                return super().open_connection()
+
+            import pyodbc  # type: ignore
+
+            dsn = ";".join(
+                [f"{k}={v}" for k, v in cfg.items() if k not in self.SKIP_CREDENTIALS]
+            )
+
+            self._conn = pyodbc.connect(
+                dsn,
+                timeout=self.credentials.connect_timeout,
+                attrs_before={
+                    MSSQL_COPT_SS_ACCESS_TOKEN: serialize_azure_token(cfg["PWD"]),
+                },
+            )
+
+            # https://github.com/mkleehammer/pyodbc/wiki/Using-an-Output-Converter-function
+            self._conn.add_output_converter(-155, handle_datetimeoffset)
+            self._conn.autocommit = True
+            return self._conn
+
+    class MsSqlClient(MsSqlJobClient):
+        def __init__(
+            self,
+            schema: Schema,
+            config: MsSqlClientConfiguration,
+            capabilities: DestinationCapabilitiesContext,
+        ) -> None:
+            sql_client = OdbcMsSqlClient(
+                config.normalize_dataset_name(schema),
+                config.normalize_staging_dataset_name(schema),
+                config.credentials,
+                capabilities,
+            )
+            super(MsSqlJobClient, self).__init__(schema, config, sql_client)
+            self.config: MsSqlClientConfiguration = config
+            self.sql_client = sql_client
+            self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {}
+            self.type_mapper = capabilities.get_type_mapper()
+
+    class MsSqlDestImpl(dlt.destinations.mssql):
+        @property
+        def client_class(self):
+            return MsSqlClient
+
+    return MsSqlDestImpl
+
+
 class MsSQLDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-        return dlt.destinations.mssql(credentials=uri, **kwargs)
+        cls = build_mssql_dest()
+        return cls(credentials=uri, **kwargs)
 
 
 class DatabricksDestination(GenericSqlDestination):
     def dlt_dest(self, uri: str, **kwargs):
-        return dlt.destinations.databricks(credentials=uri, **kwargs)
+        p = urlparse(uri)
+        q = parse_qs(p.query)
+        access_token = p.password
+        server_hostname = p.hostname
+        http_path = q.get("http_path", [None])[0]
+        catalog = q.get("catalog", [None])[0]
+        schema = q.get("schema", [None])[0]
+
+        creds = {
+            "access_token": access_token,
+            "server_hostname": server_hostname,
+            "http_path": http_path,
+            "catalog": catalog,
+            "schema": schema,
+        }
+
+        return dlt.destinations.databricks(
+            credentials=creds,
+            **kwargs,
+        )
 
 
 class SynapseDestination(GenericSqlDestination):
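
For illustration, a quick check of the DATETIMEOFFSET output converter added above, fed a hand-built buffer with the sample values from the referenced pyodbc issue (assumes handle_datetimeoffset from the hunk is in scope):

    import datetime
    import struct

    # 2017-03-16 10:35:18.5 at UTC-06:00, packed the way the ODBC driver returns it
    raw = struct.pack("<6hI2h", 2017, 3, 16, 10, 35, 18, 500000000, -6, 0)
    print(handle_datetimeoffset(raw))  # 2017-03-16 10:35:18.500000-06:00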
@@ -184,11 +375,9 @@ class CsvDestination(GenericSqlDestination):
         if output_path.count("/") > 1:
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
-        table = pyarrow.parquet.read_table(first_file_path)
-        rows = table.to_pylist()
         with open(output_path, "w", newline="") as csv_file:
             csv_writer = None
-            for row in rows:
+            for row in load_dlt_file(first_file_path):
                 row = filter_keys(row)
                 if csv_writer is None:
                     csv_writer = csv.DictWriter(csv_file, fieldnames=row.keys())
@@ -211,43 +400,64 @@ class AthenaDestination:
         if not bucket.startswith("s3://"):
             bucket = f"s3://{bucket}"
 
-        query_result_path = source_params.get("query_results_path", [None])[0]
-        if query_result_path:
-            if not query_result_path.startswith("s3://"):
-                query_result_path = f"s3://{query_result_path}"
-        else:
-            query_result_path = bucket
+        bucket = bucket.rstrip("/")
 
-        access_key_id = source_params.get("access_key_id", [None])[0]
-        if not access_key_id:
-            raise ValueError("The AWS access_key_id is required to connect to Athena.")
+        dest_table = kwargs.get("dest_table", None)
+        if not dest_table:
+            raise ValueError("A destination table is required to connect to Athena.")
 
-        secret_access_key = source_params.get("secret_access_key", [None])[0]
-        if not secret_access_key:
-            raise ValueError("The AWS secret_access_key is required to connect Athena")
+        dest_table_fields = dest_table.split(".")
+        if len(dest_table_fields) != 2:
+            raise ValueError(
+                f"Table name must be in the format <schema>.<table>, given: {dest_table}"
+            )
 
-        work_group = source_params.get("workgroup", [None])[0]
+        query_result_path = f"{bucket}/{dest_table_fields[0]}_staging/metadata"
 
+        access_key_id = source_params.get("access_key_id", [None])[0]
+        secret_access_key = source_params.get("secret_access_key", [None])[0]
+        session_token = source_params.get("session_token", [None])[0]
+        profile_name = source_params.get("profile", ["default"])[0]
         region_name = source_params.get("region_name", [None])[0]
+
+        if not access_key_id and not secret_access_key:
+            import botocore.session  # type: ignore
+
+            session = botocore.session.Session(profile=profile_name)
+            default = session.get_credentials()
+            if not profile_name:
+                raise ValueError(
+                    "You have to either provide access_key_id and secret_access_key pair or a valid AWS profile name."
+                )
+            access_key_id = default.access_key
+            secret_access_key = default.secret_key
+            session_token = default.token
+            if region_name is None:
+                region_name = session.get_config_variable("region")
+
         if not region_name:
             raise ValueError("The region_name is required to connect to Athena.")
 
         os.environ["DESTINATION__BUCKET_URL"] = bucket
-        os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
-        os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
-            secret_access_key
-        )
+        if access_key_id and secret_access_key:
+            os.environ["DESTINATION__CREDENTIALS__AWS_ACCESS_KEY_ID"] = access_key_id
+            os.environ["DESTINATION__CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = (
+                secret_access_key
+            )
+        if session_token:
+            os.environ["DESTINATION__CREDENTIALS__AWS_SESSION_TOKEN"] = session_token
 
-        credentials = AwsCredentials(
-            aws_access_key_id=access_key_id,
-            aws_secret_access_key=secret_access_key,
-            region_name=region_name,
-        )
         return dlt.destinations.athena(
             query_result_bucket=query_result_path,
-            athena_work_group=work_group,
-            credentials=credentials,
+            athena_work_group=source_params.get("workgroup", [None])[0],  # type: ignore
+            credentials=AwsCredentials(
+                aws_access_key_id=access_key_id,  # type: ignore
+                aws_secret_access_key=secret_access_key,  # type: ignore
+                aws_session_token=session_token,
+                region_name=region_name,
+            ),
            destination_name=bucket,
+            force_iceberg=True,
         )
 
     def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
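
For illustration, how the Athena query-results staging path is now derived from the bucket and the destination table instead of a separate query_results_path parameter (names are made up):

    bucket = "s3://my-athena-bucket/".rstrip("/")
    dest_table = "raw.events"

    schema, _table = dest_table.split(".")
    query_result_path = f"{bucket}/{schema}_staging/metadata"
    print(query_result_path)  # s3://my-athena-bucket/raw_staging/metadata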
@@ -297,14 +507,16 @@ class ClickhouseDestination:
             raise ValueError(
                 "The TCP port of the ClickHouse server is required to establish a connection."
             )
-
+
         query_params = parse_qs(parsed_uri.query)
         secure = int(query_params["secure"][0]) if "secure" in query_params else 1
 
         http_port = (
             int(query_params["http_port"][0])
             if "http_port" in query_params
-            else 8443 if secure == 1 else 8123
+            else 8443
+            if secure == 1
+            else 8123
         )
 
         if secure not in (0, 1):
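
For illustration, the http_port fallback reformatted above picks 8443 for secure connections and 8123 otherwise, unless http_port is given explicitly (URIs are made up):

    from urllib.parse import parse_qs, urlparse

    for uri in (
        "clickhouse://user:pass@ch.example.com:9440/db?secure=1",
        "clickhouse://user:pass@ch.example.com:9000/db?secure=0",
        "clickhouse://user:pass@ch.example.com:9000/db?secure=0&http_port=8124",
    ):
        q = parse_qs(urlparse(uri).query)
        secure = int(q["secure"][0]) if "secure" in q else 1
        http_port = (
            int(q["http_port"][0]) if "http_port" in q else 8443 if secure == 1 else 8123
        )
        print(http_port)  # 8443, then 8123, then 8124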
@@ -335,3 +547,278 @@ class ClickhouseDestination:
 
     def post_load(self):
         pass
+
+
+class BlobFSClient(dlt.destinations.impl.filesystem.filesystem.FilesystemClient):
+    @property
+    def dataset_path(self):
+        # override to remove dataset path
+        return self.bucket_path
+
+
+class BlobFS(dlt.destinations.filesystem):
+    @property
+    def client_class(self):
+        return BlobFSClient
+
+
+class SqliteDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        return {
+            # https://dlthub.com/docs/dlt-ecosystem/destinations/sqlalchemy#dataset-files
+            "dataset_name": "main",
+            "table_name": table,
+        }
+
+
+class MySqlDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        return dlt.destinations.sqlalchemy(credentials=uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        parsed = urlparse(uri)
+        database = parsed.path.lstrip("/")
+        if not database:
+            raise ValueError("You need to specify a database")
+        return {
+            "dataset_name": database,
+            "table_name": table,
+        }
+
+
+class TrinoTypeMapper:
+    """Custom type mapper for Trino to handle unsupported types."""
+
+    @staticmethod
+    def create_type_mapper():
+        """Create a custom type mapper for Trino."""
+        from dlt.destinations.impl.sqlalchemy.type_mapper import SqlalchemyTypeMapper
+        from sqlalchemy import BigInteger, Text
+        from sqlalchemy.sql import sqltypes
+
+        class CustomTrinoTypeMapper(SqlalchemyTypeMapper):
+            """Custom type mapper that converts unsupported Trino types."""
+
+            def to_destination_type(self, column, table=None):
+                # Handle special cases before calling parent
+                data_type = column.get("data_type", "")
+
+                # Convert JSON to VARCHAR for Trino's Iceberg catalog
+                if data_type == "json":
+                    # Use TEXT (unlimited VARCHAR) for JSON data
+                    return Text()
+
+                # Convert BINARY to VARCHAR
+                if data_type == "binary":
+                    return Text()
+
+                # Handle integer types - always use BIGINT for Trino
+                # Note: dlt uses "bigint" internally, not "integer"
+                if data_type in ["bigint", "integer", "int"]:
+                    return BigInteger()
+
+                # For other types, try parent mapper
+                try:
+                    type_ = super().to_destination_type(column, table)
+                except Exception:
+                    # If parent can't handle it, default to TEXT
+                    return Text()
+
+                # Convert any INTEGER type to BIGINT
+                if isinstance(type_, sqltypes.Integer) and not isinstance(
+                    type_, sqltypes.BigInteger
+                ):
+                    return BigInteger()
+
+                # Ensure VARCHAR types don't have constraints that Trino doesn't support
+                if isinstance(type_, sqltypes.String):
+                    # Return TEXT for unlimited string
+                    return Text()
+
+                return type_
+
+        return CustomTrinoTypeMapper
+
+
+class TrinoDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        # Import required modules
+        from dlt.destinations.impl.sqlalchemy.factory import (
+            sqlalchemy as sqlalchemy_factory,
+        )
+
+        # Create the destination with custom type mapper
+        # We need to use the factory to properly configure the type mapper
+        dest = sqlalchemy_factory(
+            credentials=uri, type_mapper=TrinoTypeMapper.create_type_mapper(), **kwargs
+        )
+
+        return dest
+
+
+class BlobStorageDestination(abc.ABC):
+    @abc.abstractmethod
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Build credentials for the blob storage destination."""
+        pass
+
+    @property
+    @abc.abstractmethod
+    def protocol(self) -> str:
+        """The protocol used for the blob storage destination."""
+        pass
+
+    def dlt_dest(self, uri: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        creds = self.credentials(params)
+
+        dest_table = kwargs["dest_table"]
+
+        # only validate if dest_table is not a full URI
+        if not parsed_uri.netloc:
+            dest_table = self.validate_table(dest_table)
+
+        table_parts = dest_table.split("/")
+
+        if parsed_uri.path.strip("/"):
+            path_parts = parsed_uri.path.strip("/ ").split("/")
+            table_parts = path_parts + table_parts
+
+        if parsed_uri.netloc:
+            table_parts.insert(0, parsed_uri.netloc.strip())
+
+        base_path = "/".join(table_parts[:-1])
+
+        opts = {
+            "bucket_url": f"{self.protocol}://{base_path}",
+            "credentials": creds,
+            # supresses dlt warnings about dataset name normalization.
+            # we don't use dataset names in S3 so it's fine to disable this.
+            "enable_dataset_name_normalization": False,
+        }
+        layout = params.get("layout", [None])[0]
+        if layout is not None:
+            opts["layout"] = layout
+
+        return BlobFS(**opts)  # type: ignore
+
+    def validate_table(self, table: str):
+        table = table.strip("/ ")
+        if len(table.split("/")) < 2:
+            raise ValueError("Table name must be in the format {bucket-name}/{path}")
+        return table
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs):
+        table_parts = table.split("/")
+        return {
+            "table_name": table_parts[-1].strip(),
+        }
+
+    def post_load(self) -> None:
+        pass
+
+
+class S3Destination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "s3"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        access_key_id = params.get("access_key_id", [None])[0]
+        if access_key_id is None:
+            raise MissingValueError("access_key_id", "S3")
+
+        secret_access_key = params.get("secret_access_key", [None])[0]
+        if secret_access_key is None:
+            raise MissingValueError("secret_access_key", "S3")
+
+        endpoint_url = params.get("endpoint_url", [None])[0]
+        if endpoint_url is not None:
+            parsed_endpoint = urlparse(endpoint_url)
+            if not parsed_endpoint.scheme or not parsed_endpoint.netloc:
+                raise ValueError("Invalid endpoint_url. Must be a valid URL.")
+
+        return AwsCredentials(
+            aws_access_key_id=access_key_id,
+            aws_secret_access_key=secret_access_key,
+            endpoint_url=endpoint_url,
+        )
+
+
+class GCSDestination(BlobStorageDestination):
+    @property
+    def protocol(self) -> str:
+        return "gs"
+
+    def credentials(self, params: dict) -> FileSystemCredentials:
+        """Builds GCS credentials from the provided parameters."""
+        credentials_path = params.get("credentials_path")
+        credentials_base64 = params.get("credentials_base64")
+        credentials_available = any(
+            map(
+                lambda x: x is not None,
+                [credentials_path, credentials_base64],
+            )
+        )
+        if credentials_available is False:
+            raise MissingValueError("credentials_path or credentials_base64", "GCS")
+
+        credentials = None
+        if credentials_path:
+            with open(credentials_path[0], "r") as f:
+                credentials = json.load(f)
+        else:
+            credentials = json.loads(base64.b64decode(credentials_base64[0]).decode())  # type: ignore
+
+        return credentials
+
+
+class ElasticsearchDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import urlparse
+
+        parsed_uri = urlparse(uri)
+
+        # Extract connection details from URI
+        scheme = parsed_uri.scheme or "http"
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 9200
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        # Build connection string
+        if username and password:
+            connection_string = f"{scheme}://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"{scheme}://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+        return elasticsearch_insert(connection_string=connection_string)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
+
+
+class MongoDBDestination:
+    def dlt_dest(self, uri: str, **kwargs):
+        return mongodb_insert(uri)
+
+    def dlt_run_params(self, uri: str, table: str, **kwargs) -> dict:
+        return {
+            "table_name": table,
+        }
+
+    def post_load(self):
+        pass
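
For illustration, how BlobStorageDestination.dlt_dest above splits a destination table into a bucket URL and a table name when the URI itself carries no bucket (all names are made up, and the validation step is elided):

    from urllib.parse import urlparse

    uri = "s3://?access_key_id=KEY&secret_access_key=SECRET"
    dest_table = "my-bucket/exports/daily/events"

    parsed = urlparse(uri)                      # no netloc, so dest_table holds the bucket
    table_parts = dest_table.strip("/ ").split("/")
    base_path = "/".join(table_parts[:-1])

    print(f"s3://{base_path}")  # bucket_url  -> s3://my-bucket/exports/daily
    print(table_parts[-1])      # table_name  -> events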