ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sources.py CHANGED
@@ -3,6 +3,7 @@ import csv
  import json
  import os
  import re
+ import sys
  import tempfile
  from datetime import date, datetime, timedelta, timezone
  from typing import (
@@ -13,100 +14,39 @@ from typing import (
  List,
  Literal,
  Optional,
+ TypeAlias,
  Union,
  )
- from urllib.parse import ParseResult, parse_qs, quote, urlparse
+ from urllib.parse import ParseResult, parse_qs, urlencode, urlparse

- import dlt
- import gcsfs # type: ignore
+ import fsspec # type: ignore
  import pendulum
- import s3fs # type: ignore
- from dlt.common.configuration.specs import (
- AwsCredentials,
- )
- from dlt.common.libs.sql_alchemy import (
- Engine,
- MetaData,
- )
  from dlt.common.time import ensure_pendulum_datetime
- from dlt.common.typing import TDataItem, TSecretStrValue
  from dlt.extract import Incremental
+ from dlt.extract.exceptions import ResourcesNotFoundError
+ from dlt.sources import incremental as dlt_incremental
  from dlt.sources.credentials import (
  ConnectionStringCredentials,
  )
- from dlt.sources.sql_database import sql_table
- from dlt.sources.sql_database.helpers import TableLoader
- from dlt.sources.sql_database.schema_types import (
- ReflectionLevel,
- SelectAny,
- Table,
- TTypeAdapter,
- )
- from google.ads.googleads.client import GoogleAdsClient # type: ignore
- from sqlalchemy import Column
- from sqlalchemy import types as sa

  from ingestr.src import blob
- from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
- from ingestr.src.adjust.adjust_helpers import parse_filters
- from ingestr.src.airtable import airtable_source
- from ingestr.src.appsflyer._init_ import appsflyer_source
- from ingestr.src.appstore import app_store
- from ingestr.src.appstore.client import AppStoreConnectClient
- from ingestr.src.arrow import memory_mapped_arrow
- from ingestr.src.asana_source import asana_source
- from ingestr.src.chess import source
- from ingestr.src.dynamodb import dynamodb
  from ingestr.src.errors import (
  InvalidBlobTableError,
  MissingValueError,
  UnsupportedResourceError,
  )
- from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
- from ingestr.src.filesystem import readers
- from ingestr.src.filters import table_adapter_exclude_columns
- from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
- from ingestr.src.google_ads import google_ads
- from ingestr.src.google_analytics import google_analytics
- from ingestr.src.google_sheets import google_spreadsheet
- from ingestr.src.gorgias import gorgias_source
- from ingestr.src.hubspot import hubspot
- from ingestr.src.kafka import kafka_consumer
- from ingestr.src.kafka.helpers import KafkaCredentials
- from ingestr.src.klaviyo._init_ import klaviyo_source
- from ingestr.src.linkedin_ads import linked_in_ads_source
- from ingestr.src.linkedin_ads.dimension_time_enum import (
- Dimension,
- TimeGranularity,
- )
- from ingestr.src.mongodb import mongodb_collection
- from ingestr.src.notion import notion_databases
- from ingestr.src.shopify import shopify_source
- from ingestr.src.slack import slack_source
- from ingestr.src.sql_database.callbacks import (
- chained_query_adapter_callback,
- custom_query_variable_subsitution,
- limit_callback,
- type_adapter_callback,
- )
- from ingestr.src.stripe_analytics import stripe_source
  from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
- from ingestr.src.tiktok_ads import tiktok_source
- from ingestr.src.time import isotime
- from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
- from ingestr.src.zendesk.helpers.credentials import (
- ZendeskCredentialsOAuth,
- ZendeskCredentialsToken,
- )
-
- TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
- TQueryAdapter = Callable[[SelectAny, Table], SelectAny]


  class SqlSource:
  table_builder: Callable

- def __init__(self, table_builder=sql_table) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from dlt.sources.sql_database import sql_table
+
+ table_builder = sql_table
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -115,13 +55,16 @@ class SqlSource:
  def dlt_source(self, uri: str, table: str, **kwargs):
  table_fields = TableDefinition(dataset="custom", table="custom")
  if not table.startswith("query:"):
- table_fields = table_string_to_dataclass(table)
+ if uri.startswith("spanner://"):
+ table_fields = TableDefinition(dataset="", table=table)
+ else:
+ table_fields = table_string_to_dataclass(table)

  incremental = None
  if kwargs.get("incremental_key"):
  start_value = kwargs.get("interval_start")
  end_value = kwargs.get("interval_end")
- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=start_value,
  end_value=end_value,
@@ -129,9 +72,143 @@ class SqlSource:
  range_start="closed",
  )

+ engine_adapter_callback = None
+
+ if uri.startswith("md://") or uri.startswith("motherduck://"):
+ parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)
+ # Convert md:// URI to duckdb:///md: format
+ if parsed_uri.path:
+ db_path = parsed_uri.path
+ else:
+ db_path = ""
+
+ token = query_params.get("token", [""])[0]
+ if not token:
+ raise ValueError("Token is required for MotherDuck connection")
+ uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
+
  if uri.startswith("mysql://"):
  uri = uri.replace("mysql://", "mysql+pymysql://")

+ # Monkey patch cx_Oracle to use oracledb (thin mode, no client libraries required)
+ if uri.startswith("oracle+") or uri.startswith("oracle://"):
+ try:
+ import oracledb # type: ignore[import-not-found]
+
+ # SQLAlchemy's cx_oracle dialect checks for version >= 5.2
+ # oracledb has a different versioning scheme, so we need to patch it
+ oracledb.version = "8.3.0" # type: ignore[assignment]
+ sys.modules["cx_Oracle"] = oracledb # type: ignore[assignment]
+ except ImportError:
+ # oracledb not installed, will fail later with a clear error
+ pass
+
+ # Process Snowflake private key authentication
+ if uri.startswith("snowflake://"):
+ parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)
+
+ if "private_key" in query_params:
+ from dlt.common.libs.cryptography import decode_private_key
+
+ private_key = query_params["private_key"][0]
+ passphrase = query_params.get("private_key_passphrase", [None])[0]
+ decoded_key = decode_private_key(private_key, passphrase)
+
+ query_params["private_key"] = [base64.b64encode(decoded_key).decode()]
+ if "private_key_passphrase" in query_params:
+ del query_params["private_key_passphrase"]
+
+ # Rebuild URI
+ uri = parsed_uri._replace(
+ query=urlencode(query_params, doseq=True)
+ ).geturl()
+
+ # clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
+ if uri.startswith("clickhouse://"):
+ parsed_uri = urlparse(uri)
+
+ query_params = parse_qs(parsed_uri.query)
+
+ if "http_port" in query_params:
+ del query_params["http_port"]
+
+ if "secure" not in query_params:
+ query_params["secure"] = ["1"]
+
+ uri = parsed_uri._replace(
+ scheme="clickhouse+native",
+ query=urlencode(query_params, doseq=True),
+ ).geturl()
+
+ if uri.startswith("db2://"):
+ uri = uri.replace("db2://", "db2+ibm_db://")
+
+ if uri.startswith("spanner://"):
+ parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)
+
+ project_id_param = query_params.get("project_id")
+ instance_id_param = query_params.get("instance_id")
+ database_param = query_params.get("database")
+
+ cred_path = query_params.get("credentials_path")
+ cred_base64 = query_params.get("credentials_base64")
+
+ if not project_id_param or not instance_id_param or not database_param:
+ raise ValueError(
+ "project_id, instance_id and database are required in the URI to get data from Google Spanner"
+ )
+
+ project_id = project_id_param[0]
+ instance_id = instance_id_param[0]
+ database = database_param[0]
+
+ if not cred_path and not cred_base64:
+ raise ValueError(
+ "credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
+ )
+ if cred_path:
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
+ elif cred_base64:
+ credentials = json.loads(
+ base64.b64decode(cred_base64[0]).decode("utf-8")
+ )
+ temp = tempfile.NamedTemporaryFile(
+ mode="w", delete=False, suffix=".json"
+ )
+ json.dump(credentials, temp)
+ temp.close()
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
+
+ uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
+
+ def eng_callback(engine):
+ return engine.execution_options(read_only=True)
+
+ engine_adapter_callback = eng_callback
+ from dlt.common.libs.sql_alchemy import (
+ Engine,
+ MetaData,
+ )
+ from dlt.sources.sql_database.schema_types import (
+ ReflectionLevel,
+ SelectAny,
+ Table,
+ TTypeAdapter,
+ )
+ from sqlalchemy import Column
+ from sqlalchemy import types as sa
+
+ from ingestr.src.filters import table_adapter_exclude_columns
+ from ingestr.src.sql_database.callbacks import (
+ chained_query_adapter_callback,
+ custom_query_variable_subsitution,
+ limit_callback,
+ type_adapter_callback,
+ )
+
  query_adapters = []
  if kwargs.get("sql_limit"):
  query_adapters.append(
@@ -150,6 +227,13 @@ class SqlSource:
  defer_table_reflect = True
  query_value = table.split(":", 1)[1]

+ TableBackend: TypeAlias = Literal[
+ "sqlalchemy", "pyarrow", "pandas", "connectorx"
+ ]
+ TQueryAdapter: TypeAlias = Callable[[SelectAny, Table], SelectAny]
+ import dlt
+ from dlt.common.typing import TDataItem
+
  # this is a very hacky version of the table_rows function. it is built this way to go around the dlt's table loader.
  # I didn't want to write a full fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
  # this is by no means a production ready solution, but it works for now.
@@ -167,6 +251,9 @@ class SqlSource:
  backend_kwargs: Dict[str, Any] = None, # type: ignore
  type_adapter_callback: Optional[TTypeAdapter] = None,
  included_columns: Optional[List[str]] = None,
+ excluded_columns: Optional[
+ List[str]
+ ] = None, # Added for dlt 1.16.0 compatibility
  query_adapter_callback: Optional[TQueryAdapter] = None,
  resolve_foreign_keys: bool = False,
  ) -> Iterator[TDataItem]:
@@ -200,6 +287,8 @@ class SqlSource:
  *cols,
  )

+ from dlt.sources.sql_database.helpers import TableLoader
+
  loader = TableLoader(
  engine,
  backend,
@@ -220,8 +309,54 @@ class SqlSource:
  # override the query adapters, the only one we want is the one here in the case of custom queries
  query_adapters = [custom_query_variable_subsitution(query_value, kwargs)]

+ credentials = ConnectionStringCredentials(uri)
+ if uri.startswith("mssql://"):
+ parsed_uri = urlparse(uri)
+ params = parse_qs(parsed_uri.query)
+ params = {k.lower(): v for k, v in params.items()}
+ if params.get("authentication") == ["ActiveDirectoryAccessToken"]:
+ import pyodbc # type: ignore
+ from sqlalchemy import create_engine
+
+ from ingestr.src.destinations import (
+ MSSQL_COPT_SS_ACCESS_TOKEN,
+ handle_datetimeoffset,
+ serialize_azure_token,
+ )
+
+ cfg = {
+ "DRIVER": params.get("driver", ["ODBC Driver 18 for SQL Server"])[
+ 0
+ ],
+ "SERVER": f"{parsed_uri.hostname},{parsed_uri.port or 1433}",
+ "DATABASE": parsed_uri.path.lstrip("/"),
+ }
+ for k, v in params.items():
+ if k.lower() not in ["driver", "authentication", "connect_timeout"]:
+ cfg[k.upper()] = v[0]
+
+ token = serialize_azure_token(parsed_uri.password)
+ dsn = ";".join([f"{k}={v}" for k, v in cfg.items()])
+
+ def creator():
+ connection = pyodbc.connect(
+ dsn,
+ autocommit=True,
+ timeout=kwargs.get("connect_timeout", 30),
+ attrs_before={
+ MSSQL_COPT_SS_ACCESS_TOKEN: token,
+ },
+ )
+ connection.add_output_converter(-155, handle_datetimeoffset)
+ return connection
+
+ credentials = create_engine(
+ "mssql+pyodbc://",
+ creator=creator,
+ )
+
  builder_res = self.table_builder(
- credentials=ConnectionStringCredentials(uri),
+ credentials=credentials,
  schema=table_fields.dataset,
  table=table_fields.table,
  incremental=incremental,
@@ -234,6 +369,7 @@ class SqlSource:
  kwargs.get("sql_exclude_columns", [])
  ),
  defer_table_reflect=defer_table_reflect,
+ engine_adapter_callback=engine_adapter_callback,
  )

  return builder_res
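Aside, for readers of the SqlSource hunks above: the new code normalizes several vendor URIs before handing them to dlt's sql_table. A hedged, runnable sketch of the MotherDuck branch with hypothetical values (the token, database name, and output are illustrative, not taken from the package):

  from urllib.parse import parse_qs, urlparse

  uri = "md:///my_db?token=tok_abc"  # hypothetical source URI
  parsed = urlparse(uri)
  token = parse_qs(parsed.query).get("token", [""])[0]
  print(f"duckdb:///md:{parsed.path}?motherduck_token={token}")
  # -> duckdb:///md:/my_db?motherduck_token=tok_abc

The ClickHouse, DB2, Oracle, Snowflake, and Spanner branches follow the same pattern: rewrite the URI (or patch the driver) and fall through to the common sql_table builder.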
@@ -242,7 +378,12 @@ class SqlSource:
  class ArrowMemoryMappedSource:
  table_builder: Callable

- def __init__(self, table_builder=memory_mapped_arrow) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.arrow import memory_mapped_arrow
+
+ table_builder = memory_mapped_arrow
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -254,7 +395,7 @@ class ArrowMemoryMappedSource:
  start_value = kwargs.get("interval_start")
  end_value = kwargs.get("interval_end")

- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=start_value,
  end_value=end_value,
@@ -287,37 +428,199 @@ class ArrowMemoryMappedSource:
  class MongoDbSource:
  table_builder: Callable

- def __init__(self, table_builder=mongodb_collection) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.mongodb import mongodb_collection
+
+ table_builder = mongodb_collection
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
  return False

  def dlt_source(self, uri: str, table: str, **kwargs):
- table_fields = table_string_to_dataclass(table)
+ # Check if this is a custom query format (collection:query)
+ if ":" in table:
+ collection_name, query_json = table.split(":", 1)

- incremental = None
- if kwargs.get("incremental_key"):
- start_value = kwargs.get("interval_start")
- end_value = kwargs.get("interval_end")
+ # Parse the query using MongoDB's extended JSON parser
+ # First, convert MongoDB shell syntax to Extended JSON format
+ from bson import json_util

- incremental = dlt.sources.incremental(
- kwargs.get("incremental_key", ""),
- initial_value=start_value,
- end_value=end_value,
- range_end="closed",
- range_start="closed",
+ from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json
+
+ # Convert MongoDB shell constructs to Extended JSON v2 format
+ converted_query = convert_mongo_shell_to_extended_json(query_json)
+
+ try:
+ query = json_util.loads(converted_query)
+ except Exception as e:
+ raise ValueError(f"Invalid MongoDB query format: {e}")
+
+ # Validate that it's a list for aggregation pipeline
+ if not isinstance(query, list):
+ raise ValueError(
+ "Query must be a JSON array representing a MongoDB aggregation pipeline"
+ )
+
+ # Check for incremental load requirements
+ incremental = None
+ if kwargs.get("incremental_key"):
+ start_value = kwargs.get("interval_start")
+ end_value = kwargs.get("interval_end")
+
+ # Validate that incremental key is present in the pipeline
+ incremental_key = kwargs.get("incremental_key")
+ self._validate_incremental_query(query, str(incremental_key))
+
+ incremental = dlt_incremental(
+ str(incremental_key),
+ initial_value=start_value,
+ end_value=end_value,
+ )
+
+ # Substitute interval parameters in the query
+ query = self._substitute_interval_params(query, kwargs)
+
+ # Parse collection name to get database and collection
+ if "." in collection_name:
+ # Handle database.collection format
+ table_fields = table_string_to_dataclass(collection_name)
+ database = table_fields.dataset
+ collection = table_fields.table
+ else:
+ # Single collection name, use default database
+ database = None
+ collection = collection_name
+
+ table_instance = self.table_builder(
+ connection_url=uri,
+ database=database,
+ collection=collection,
+ parallel=False,
+ incremental=incremental,
+ custom_query=query,
  )
+ table_instance.max_table_nesting = 1
+ return table_instance
+ else:
+ # Default behavior for simple collection names
+ table_fields = table_string_to_dataclass(table)

- table_instance = self.table_builder(
- connection_url=uri,
- database=table_fields.dataset,
- collection=table_fields.table,
- parallel=True,
- incremental=incremental,
- )
+ incremental = None
+ if kwargs.get("incremental_key"):
+ start_value = kwargs.get("interval_start")
+ end_value = kwargs.get("interval_end")

- return table_instance
+ incremental = dlt_incremental(
+ kwargs.get("incremental_key", ""),
+ initial_value=start_value,
+ end_value=end_value,
+ )
+
+ table_instance = self.table_builder(
+ connection_url=uri,
+ database=table_fields.dataset,
+ collection=table_fields.table,
+ parallel=False,
+ incremental=incremental,
+ )
+ table_instance.max_table_nesting = 1
+
+ return table_instance
+
+ def _validate_incremental_query(self, query: list, incremental_key: str):
+ """Validate that incremental key is projected in the aggregation pipeline"""
+ # Check if there's a $project stage and if incremental_key is included
+ has_project = False
+ incremental_key_projected = False
+
+ for stage in query:
+ if "$project" in stage:
+ has_project = True
+ project_stage = stage["$project"]
+ if isinstance(project_stage, dict):
+ # Check if incremental_key is explicitly included
+ if incremental_key in project_stage:
+ if project_stage[incremental_key] not in [0, False]:
+ incremental_key_projected = True
+ # If there are only inclusions (1 or True values) and incremental_key is not included
+ elif any(v in [1, True] for v in project_stage.values()):
+ # This is an inclusion projection, incremental_key must be explicitly included
+ incremental_key_projected = False
+ # If there are only exclusions (0 or False values) and incremental_key is not excluded
+ elif all(
+ v in [0, False]
+ for v in project_stage.values()
+ if v in [0, False, 1, True]
+ ):
+ # This is an exclusion projection, incremental_key is included by default
+ if incremental_key not in project_stage:
+ incremental_key_projected = True
+ else:
+ incremental_key_projected = project_stage[
+ incremental_key
+ ] not in [0, False]
+ else:
+ # Mixed or unclear projection, assume incremental_key needs to be explicit
+ incremental_key_projected = False
+
+ # If there's a $project stage but incremental_key is not projected, raise error
+ if has_project and not incremental_key_projected:
+ raise ValueError(
+ f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
+ )
+
+ def _substitute_interval_params(self, query: list, kwargs: dict):
+ """Substitute :interval_start and :interval_end placeholders with actual datetime values"""
+ from dlt.common.time import ensure_pendulum_datetime
+
+ # Get interval values and convert them to datetime objects
+ interval_start = kwargs.get("interval_start")
+ interval_end = kwargs.get("interval_end")
+
+ # Convert string dates to datetime objects if needed
+ if interval_start is not None:
+ if isinstance(interval_start, str):
+ pendulum_dt = ensure_pendulum_datetime(interval_start)
+ interval_start = (
+ pendulum_dt.to_datetime()
+ if hasattr(pendulum_dt, "to_datetime")
+ else pendulum_dt
+ )
+ elif hasattr(interval_start, "to_datetime"):
+ interval_start = interval_start.to_datetime()
+
+ if interval_end is not None:
+ if isinstance(interval_end, str):
+ pendulum_dt = ensure_pendulum_datetime(interval_end)
+ interval_end = (
+ pendulum_dt.to_datetime()
+ if hasattr(pendulum_dt, "to_datetime")
+ else pendulum_dt
+ )
+ elif hasattr(interval_end, "to_datetime"):
+ interval_end = interval_end.to_datetime()
+
+ # Deep copy the query and replace placeholders with actual datetime objects
+ def replace_placeholders(obj):
+ if isinstance(obj, dict):
+ result = {}
+ for key, value in obj.items():
+ if value == ":interval_start" and interval_start is not None:
+ result[key] = interval_start
+ elif value == ":interval_end" and interval_end is not None:
+ result[key] = interval_end
+ else:
+ result[key] = replace_placeholders(value)
+ return result
+ elif isinstance(obj, list):
+ return [replace_placeholders(item) for item in obj]
+ else:
+ return obj
+
+ return replace_placeholders(query)


  class LocalCsvSource:
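Aside: the MongoDB hunk above accepts a "collection:pipeline" table string, where the pipeline is a JSON array and :interval_start / :interval_end placeholders are substituted at runtime. A hedged sketch with hypothetical database, collection, and field names (not from the package):

  # database "mydb", collection "orders", incremental key "updated_at" -- all hypothetical
  table = (
      'mydb.orders:'
      '[{"$match": {"updated_at": {"$gte": ":interval_start"}}},'
      ' {"$project": {"updated_at": 1, "total": 1}}]'
  )

Per the validation code above, an inclusion-style $project stage must list the incremental key explicitly, otherwise the source raises a ValueError.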
@@ -326,7 +629,7 @@ class LocalCsvSource:

  def dlt_source(self, uri: str, table: str, **kwargs):
  def csv_file(
- incremental: Optional[dlt.sources.incremental[Any]] = None,
+ incremental: Optional[dlt_incremental[Any]] = None,
  ):
  file_path = uri.split("://")[1]
  myFile = open(file_path, "r")
@@ -357,6 +660,7 @@ class LocalCsvSource:
  if inc_value < incremental.start_value:
  continue

+ dictionary = self.remove_empty_columns(dictionary)
  page.append(dictionary)
  current_items += 1
  else:
@@ -367,11 +671,13 @@ class LocalCsvSource:
  if page:
  yield page

- return dlt.resource(
+ from dlt import resource
+
+ return resource(
  csv_file,
  merge_key=kwargs.get("merge_key"), # type: ignore
  )(
- incremental=dlt.sources.incremental(
+ incremental=dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=kwargs.get("interval_start"),
  end_value=kwargs.get("interval_end"),
@@ -380,11 +686,19 @@ class LocalCsvSource:
  )
  )

+ def remove_empty_columns(self, row: Dict[str, str]) -> Dict[str, str]:
+ return {k: v for k, v in row.items() if v.strip() != ""}
+

  class NotionSource:
  table_builder: Callable

- def __init__(self, table_builder=notion_databases) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.notion import notion_databases
+
+ table_builder = notion_databases
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -411,6 +725,11 @@ class ShopifySource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "Shopify takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
  source_fields = urlparse(uri)
  source_params = parse_qs(source_fields.query)
  api_key = source_params.get("api_key")
@@ -444,6 +763,8 @@ class ShopifySource:
  f"Table name '{table}' is not supported for Shopify source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ from ingestr.src.shopify import shopify_source
+
  return shopify_source(
  private_app_password=api_key[0],
  shop_url=f"https://{source_fields.netloc}",
@@ -488,6 +809,8 @@ class GorgiasSource:
  if kwargs.get("interval_end"):
  date_args["end_date"] = kwargs.get("interval_end")

+ from ingestr.src.gorgias import gorgias_source
+
  return gorgias_source(
  domain=source_fields.netloc,
  email=email[0],
@@ -499,7 +822,12 @@ class GorgiasSource:
  class GoogleSheetsSource:
  table_builder: Callable

- def __init__(self, table_builder=google_spreadsheet) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.google_sheets import google_spreadsheet
+
+ table_builder = google_spreadsheet
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -580,6 +908,8 @@ class ChessSource:
  f"Resource '{table}' is not supported for Chess source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ from ingestr.src.chess import source
+
  return source(players=list_players, **date_args).with_resources(
  table_mapping[table]
  )
@@ -603,40 +933,74 @@ class StripeAnalyticsSource:
  if not api_key:
  raise ValueError("api_key in the URI is required to connect to Stripe")

- endpoint = None
- table = str.capitalize(table)
+ table = table.lower()

- if table in [
- "Subscription",
- "Account",
- "Coupon",
- "Customer",
- "Product",
- "Price",
- "BalanceTransaction",
- "Invoice",
- "Event",
- ]:
- endpoint = table
+ from ingestr.src.stripe_analytics.settings import ENDPOINTS
+
+ endpoint = None
+ incremental = False
+ sync = False
+
+ table_fields = table.split(":")
+ if len(table_fields) == 1:
+ endpoint = table_fields[0]
+ elif len(table_fields) == 2:
+ endpoint = table_fields[0]
+ sync = table_fields[1] == "sync"
+ elif len(table_fields) == 3:
+ endpoint = table_fields[0]
+ sync = table_fields[1] == "sync"
+ incremental = table_fields[2] == "incremental"
  else:
  raise ValueError(
- f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ "Invalid Stripe table format. Expected: stripe:<endpoint> or stripe:<endpoint>:<sync> or stripe:<endpoint>:<sync>:<incremental>"
  )

- date_args = {}
- if kwargs.get("interval_start"):
- date_args["start_date"] = kwargs.get("interval_start")
-
- if kwargs.get("interval_end"):
- date_args["end_date"] = kwargs.get("interval_end")
-
- return stripe_source(
- endpoints=[
- endpoint,
- ],
- stripe_secret_key=api_key[0],
- **date_args,
- ).with_resources(endpoint)
+ if incremental and not sync:
+ raise ValueError("incremental loads must be used with sync loading")
+
+ if incremental:
+ from ingestr.src.stripe_analytics import incremental_stripe_source
+
+ def nullable_date(date_str: Optional[str]):
+ if date_str:
+ return ensure_pendulum_datetime(date_str)
+ return None
+
+ endpoint = ENDPOINTS[endpoint]
+ return incremental_stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ initial_start_date=nullable_date(kwargs.get("interval_start", None)),
+ end_date=nullable_date(kwargs.get("interval_end", None)),
+ ).with_resources(endpoint)
+ else:
+ endpoint = ENDPOINTS[endpoint]
+ if sync:
+ from ingestr.src.stripe_analytics import stripe_source
+
+ return stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ ).with_resources(endpoint)
+ else:
+ from ingestr.src.stripe_analytics import async_stripe_source
+
+ return async_stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ max_workers=kwargs.get("extract_parallelism", 4),
+ ).with_resources(endpoint)
+
+ raise ValueError(
+ f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )


  class FacebookAdsSource:
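Aside: as the parsing in the Stripe hunk above reads the table string, three forms are accepted; the endpoint must be a key of stripe_analytics.settings.ENDPOINTS. A hedged illustration, assuming "invoice" is such a key (hypothetical):

  table = "invoice"                   # async load via async_stripe_source
  table = "invoice:sync"              # synchronous load via stripe_source
  table = "invoice:sync:incremental"  # incremental load via incremental_stripe_source
  # "incremental" without "sync" raises ValueError, per the check above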
@@ -662,17 +1026,76 @@ class FacebookAdsSource:
  "access_token and accound_id are required to connect to Facebook Ads."
  )

+ from ingestr.src.facebook_ads import (
+ facebook_ads_source,
+ facebook_insights_source,
+ )
+
+ insights_max_wait_to_finish_seconds = source_params.get(
+ "insights_max_wait_to_finish_seconds", [60 * 60 * 4]
+ )
+ insights_max_wait_to_start_seconds = source_params.get(
+ "insights_max_wait_to_start_seconds", [60 * 30]
+ )
+ insights_max_async_sleep_seconds = source_params.get(
+ "insights_max_async_sleep_seconds", [20]
+ )
+
  endpoint = None
  if table in ["campaigns", "ad_sets", "ad_creatives", "ads", "leads"]:
  endpoint = table
- elif table in "facebook_insights":
+ elif table == "facebook_insights":
  return facebook_insights_source(
  access_token=access_token[0],
  account_id=account_id[0],
+ start_date=kwargs.get("interval_start"),
+ end_date=kwargs.get("interval_end"),
+ insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds[
+ 0
+ ],
+ insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds[
+ 0
+ ],
+ insights_max_async_sleep_seconds=insights_max_async_sleep_seconds[0],
  ).with_resources("facebook_insights")
+ elif table.startswith("facebook_insights:"):
+ # Parse custom breakdowns and metrics from table name
+ # Supported formats:
+ # facebook_insights:breakdown_type
+ # facebook_insights:breakdown_type:metric1,metric2...
+ parts = table.split(":")
+
+ if len(parts) < 2 or len(parts) > 3:
+ raise ValueError(
+ "Invalid facebook_insights format. Expected: facebook_insights:breakdown_type or facebook_insights:breakdown_type:metric1,metric2..."
+ )
+
+ breakdown_type = parts[1].strip()
+ if not breakdown_type:
+ raise ValueError(
+ "Breakdown type must be provided in format: facebook_insights:breakdown_type"
+ )
+
+ # Validate breakdown type against available options from settings
+
+ from ingestr.src.facebook_ads.helpers import (
+ parse_insights_table_to_source_kwargs,
+ )
+
+ source_kwargs = {
+ "access_token": access_token[0],
+ "account_id": account_id[0],
+ "start_date": kwargs.get("interval_start"),
+ "end_date": kwargs.get("interval_end"),
+ }
+
+ source_kwargs.update(parse_insights_table_to_source_kwargs(table))
+ return facebook_insights_source(**source_kwargs).with_resources(
+ "facebook_insights"
+ )
  else:
  raise ValueError(
- "fResource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ f"Resource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

  return facebook_ads_source(
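Aside: per the in-code comments in the hunk above, the insights table string now carries an optional breakdown type and metric list. A hedged illustration with hypothetical breakdown and metric names (the accepted values come from the facebook_ads settings, not shown in this diff):

  table = "facebook_insights"                            # default breakdowns/metrics
  table = "facebook_insights:country"                    # custom breakdown type
  table = "facebook_insights:country:impressions,spend"  # custom breakdown plus metrics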
@@ -719,6 +1142,8 @@ class SlackSource:
  if kwargs.get("interval_end"):
  date_args["end_date"] = kwargs.get("interval_end")

+ from ingestr.src.slack import slack_source
+
  return slack_source(
  access_token=api_key[0],
  table_per_channel=False,
@@ -729,7 +1154,7 @@ class SlackSource:

  class HubspotSource:
  def handles_incrementality(self) -> bool:
- return True
+ return False

  # hubspot://?api_key=<api_key>
  def dlt_source(self, uri: str, table: str, **kwargs):
@@ -747,7 +1172,35 @@ class HubspotSource:
  raise ValueError("api_key in the URI is required to connect to Hubspot")

  endpoint = None
- if table in ["contacts", "companies", "deals", "tickets", "products", "quotes"]:
+
+ from ingestr.src.hubspot import hubspot
+
+ if table.startswith("custom:"):
+ fields = table.split(":", 2)
+ if len(fields) != 2 and len(fields) != 3:
+ raise ValueError(
+ "Invalid Hubspot custom table format. Expected format: custom:<custom_object_type> or custom:<custom_object_type>:<associations>"
+ )
+
+ if len(fields) == 2:
+ endpoint = fields[1]
+ else:
+ endpoint = f"{fields[1]}:{fields[2]}"
+
+ return hubspot(
+ api_key=api_key[0],
+ custom_object=endpoint,
+ ).with_resources("custom")
+
+ elif table in [
+ "contacts",
+ "companies",
+ "deals",
+ "tickets",
+ "products",
+ "quotes",
+ "schemas",
+ ]:
  endpoint = table
  else:
  raise ValueError(
@@ -772,20 +1225,31 @@ class AirtableSource:
  if not table:
  raise ValueError("Source table is required to connect to Airtable")

- tables = table.split(",")
-
  source_parts = urlparse(uri)
  source_fields = parse_qs(source_parts.query)
- base_id = source_fields.get("base_id")
  access_token = source_fields.get("access_token")

- if not base_id or not access_token:
+ if not access_token:
  raise ValueError(
- "base_id and access_token in the URI are required to connect to Airtable"
+ "access_token in the URI is required to connect to Airtable"
  )

+ base_id = source_fields.get("base_id", [None])[0]
+ clean_table = table
+
+ table_fields = table.split("/")
+ if len(table_fields) == 2:
+ clean_table = table_fields[1]
+ if not base_id:
+ base_id = table_fields[0]
+
+ if not base_id:
+ raise ValueError("base_id in the URI is required to connect to Airtable")
+
+ from ingestr.src.airtable import airtable_source
+
  return airtable_source(
- base_id=base_id[0], table_names=tables, access_token=access_token[0]
+ base_id=base_id, table_names=[clean_table], access_token=access_token[0]
  )
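Aside: the reworked AirtableSource resolves the base either from the URI or from a "base/table" table string. A hedged illustration with hypothetical IDs:

  table = "appXXXXXXXXXXXXXX/my_table"  # base id carried in the table string
  # or: URI contains base_id=appXXXXXXXXXXXXXX and table is just "my_table"
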
@@ -831,12 +1295,66 @@ class KlaviyoSource:
  )

  start_date = kwargs.get("interval_start") or "2000-01-01"
+
+ from ingestr.src.klaviyo import klaviyo_source
+
  return klaviyo_source(
  api_key=api_key[0],
  start_date=start_date,
  ).with_resources(resource)


+ class MixpanelSource:
+ def handles_incrementality(self) -> bool:
+ return True
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "Mixpanel takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
+ parsed = urlparse(uri)
+ params = parse_qs(parsed.query)
+ username = params.get("username")
+ password = params.get("password")
+ project_id = params.get("project_id")
+ server = params.get("server", ["eu"])
+
+ if not username or not password or not project_id:
+ raise ValueError(
+ "username, password, project_id are required to connect to Mixpanel"
+ )
+
+ if table not in ["events", "profiles"]:
+ raise ValueError(
+ f"Resource '{table}' is not supported for Mixpanel source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )
+
+ start_date = kwargs.get("interval_start")
+ if start_date:
+ start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
+ else:
+ start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
+
+ end_date = kwargs.get("interval_end")
+ if end_date:
+ end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
+ else:
+ end_date = pendulum.now().in_timezone("UTC")
+
+ from ingestr.src.mixpanel import mixpanel_source
+
+ return mixpanel_source(
+ username=username[0],
+ password=password[0],
+ project_id=project_id[0],
+ start_date=start_date,
+ end_date=end_date,
+ server=server[0],
+ ).with_resources(table)
+
+
  class KafkaSource:
  def handles_incrementality(self) -> bool:
  return False
@@ -864,6 +1382,9 @@ class KafkaSource:
  raise ValueError("group_id in the URI is required to connect to kafka")

  start_date = kwargs.get("interval_start")
+ from ingestr.src.kafka import kafka_consumer
+ from ingestr.src.kafka.helpers import KafkaCredentials
+
  return kafka_consumer(
  topics=[table],
  credentials=KafkaCredentials(
@@ -919,6 +1440,9 @@ class AdjustSource:
  if kwargs.get("interval_end"):
  end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))

+ from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
+ from ingestr.src.adjust.adjust_helpers import parse_filters
+
  dimensions = None
  metrics = None
  filters = []
@@ -966,6 +1490,8 @@ class AppsflyerSource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ from ingestr.src.appsflyer import appsflyer_source
+
  if kwargs.get("incremental_key"):
  raise ValueError(
  "Appsflyer_Source takes care of incrementality on its own, you should not provide incremental_key"
@@ -978,22 +1504,27 @@ class AppsflyerSource:
  if not api_key:
  raise ValueError("api_key in the URI is required to connect to Appsflyer")

- resource = None
- if table in ["campaigns", "creatives"]:
- resource = table
- else:
- raise ValueError(
- f"Resource '{table}' is not supported for Appsflyer source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
- )
-
- start_date = kwargs.get("interval_start") or "2024-01-02"
- end_date = kwargs.get("interval_end") or "2024-01-29"
+ start_date = kwargs.get("interval_start")
+ end_date = kwargs.get("interval_end")
+ dimensions = []
+ metrics = []
+ if table.startswith("custom:"):
+ fields = table.split(":", 3)
+ if len(fields) != 3:
+ raise ValueError(
+ "Invalid Adjust custom table format. Expected format: custom:<dimensions>:<metrics>"
+ )
+ dimensions = fields[1].split(",")
+ metrics = fields[2].split(",")
+ table = "custom"

  return appsflyer_source(
  api_key=api_key[0],
- start_date=start_date,
- end_date=end_date,
- ).with_resources(resource)
+ start_date=start_date.strftime("%Y-%m-%d") if start_date else None, # type: ignore
+ end_date=end_date.strftime("%Y-%m-%d") if end_date else None, # type: ignore
+ dimensions=dimensions,
+ metrics=metrics,
+ ).with_resources(table)


  class ZendeskSource:
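Aside: the AppsFlyer hunk above adds a custom-report table form, custom:<dimensions>:<metrics>, whose pieces are split into lists before being passed to appsflyer_source. A hedged illustration with hypothetical dimension and metric names:

  table = "custom:app_id,geo:impressions,clicks"
  # -> dimensions=["app_id", "geo"], metrics=["impressions", "clicks"], resource "custom"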
@@ -1018,6 +1549,12 @@ class ZendeskSource:
  if not subdomain:
  raise ValueError("Subdomain is required to connect with Zendesk")

+ from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
+ from ingestr.src.zendesk.helpers.credentials import (
+ ZendeskCredentialsOAuth,
+ ZendeskCredentialsToken,
+ )
+
  if not source_fields.username and source_fields.password:
  oauth_token = source_fields.password
  if not oauth_token:
@@ -1076,7 +1613,7 @@ class ZendeskSource:
  ).with_resources(table)
  else:
  raise ValueError(
- "fResource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ f"Resource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )


@@ -1091,7 +1628,7 @@ class S3Source:
  )

  parsed_uri = urlparse(uri)
- source_fields = parse_qs(quote(parsed_uri.query, safe="=&"))
+ source_fields = parse_qs(parsed_uri.query)
  access_key_id = source_fields.get("access_key_id")
  if not access_key_id:
  raise ValueError("access_key_id is required to connect to S3")
@@ -1106,22 +1643,34 @@ class S3Source:

  bucket_url = f"s3://{bucket_name}/"

+ import s3fs # type: ignore
+
  fs = s3fs.S3FileSystem(
  key=access_key_id[0],
  secret=secret_access_key[0],
  )

- file_extension = path_to_file.split(".")[-1]
- if file_extension == "csv":
- endpoint = "read_csv"
- elif file_extension == "jsonl":
- endpoint = "read_jsonl"
- elif file_extension == "parquet":
- endpoint = "read_parquet"
+ endpoint: Optional[str] = None
+ if "#" in table:
+ _, endpoint = table.split("#")
+ if endpoint not in ["csv", "jsonl", "parquet"]:
+ raise ValueError(
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
+ )
+ endpoint = f"read_{endpoint}"
  else:
- raise ValueError(
- "S3 Source only supports specific formats files: csv, jsonl, parquet"
- )
+ try:
+ endpoint = blob.parse_endpoint(path_to_file)
+ except blob.UnsupportedEndpointError:
+ raise ValueError(
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
+ )
+ except Exception as e:
+ raise ValueError(
+ f"Failed to parse endpoint from path: {path_to_file}"
+ ) from e
+
+ from ingestr.src.filesystem import readers

  return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
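Aside: per the S3 hunk above, a "#csv", "#jsonl", or "#parquet" suffix on the table string forces the reader; without it the format is inferred from the path via blob.parse_endpoint(). A hedged illustration with a hypothetical bucket and path:

  table = "my-bucket/exports/2024/events#csv"  # forces the read_csv resource
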
@@ -1132,6 +1681,11 @@ class TikTokSource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "TikTok takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
  endpoint = "custom_reports"

  parsed_uri = urlparse(uri)
@@ -1217,6 +1771,8 @@ class TikTokSource:
  filter_name = list(filters.keys())[0]
  filter_value = list(map(int, filters[list(filters.keys())[0]]))

+ from ingestr.src.tiktok_ads import tiktok_source
+
  return tiktok_source(
  start_date=start_date,
  end_date=end_date,
@@ -1265,20 +1821,83 @@ class AsanaSource:
  f"Resource '{table}' is not supported for Asana source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ import dlt
+
+ from ingestr.src.asana_source import asana_source
+
  dlt.secrets["sources.asana_source.access_token"] = access_token[0]
+
  src = asana_source()
  src.workspaces.add_filter(lambda w: w["gid"] == workspace)
  return src.with_resources(table)


- class DynamoDBSource:
- AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
-
- def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
- # try to infer from URI
- matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
- if matches is not None:
- return matches[1]
+ class JiraSource:
+ resources = [
+ "projects",
+ "issues",
+ "users",
+ "issue_types",
+ "statuses",
+ "priorities",
+ "resolutions",
+ "project_versions",
+ "project_components",
+ "events",
+ ]
+
+ def handles_incrementality(self) -> bool:
+ return True
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ parsed_uri = urlparse(uri)
+ params = parse_qs(parsed_uri.query)
+
+ base_url = f"https://{parsed_uri.netloc}"
+ email = params.get("email")
+ api_token = params.get("api_token")
+
+ if not email:
+ raise ValueError("email must be specified in the URI query parameters")
+
+ if not api_token:
+ raise ValueError("api_token is required for connecting to Jira")
+
+ flags = {
+ "skip_archived": False,
+ }
+ if ":" in table:
+ table, rest = table.split(":", 1) # type: ignore
+ for k in rest.split(":"):
+ flags[k] = True
+
+ if table not in self.resources:
+ raise ValueError(
+ f"Resource '{table}' is not supported for Jira source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )
+
+ import dlt
+
+ from ingestr.src.jira_source import jira_source
+
+ dlt.secrets["sources.jira_source.base_url"] = base_url
+ dlt.secrets["sources.jira_source.email"] = email[0]
+ dlt.secrets["sources.jira_source.api_token"] = api_token[0]
+
+ src = jira_source()
+ if flags["skip_archived"]:
+ src.projects.add_filter(lambda p: not p.get("archived", False))
+ return src.with_resources(table)
+
+
+ class DynamoDBSource:
+ AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
+
+ def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
+ # try to infer from URI
+ matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
+ if matches is not None:
+ return matches[1]

  # else obtain region from query string
  region = parse_qs(uri.query).get("region")
@@ -1301,7 +1920,7 @@ class DynamoDBSource:
  if not region:
  raise ValueError("region is required to connect to Dynamodb")

- qs = parse_qs(quote(parsed_uri.query, safe="=&"))
+ qs = parse_qs(parsed_uri.query)
  access_key = qs.get("access_key_id")

  if not access_key:
@@ -1311,6 +1930,9 @@ class DynamoDBSource:
  if not secret_key:
  raise ValueError("secret_access_key is required to connect to Dynamodb")

+ from dlt.common.configuration.specs import AwsCredentials
+ from dlt.common.typing import TSecretStrValue
+
  creds = AwsCredentials(
  aws_access_key_id=access_key[0],
  aws_secret_access_key=TSecretStrValue(secret_key[0]),
@@ -1321,8 +1943,11 @@ class DynamoDBSource:
  incremental = None
  incremental_key = kwargs.get("incremental_key")

+ from ingestr.src.dynamodb import dynamodb
+ from ingestr.src.time import isotime
+
  if incremental_key:
- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  incremental_key.strip(),
  initial_value=isotime(kwargs.get("interval_start")),
  end_value=isotime(kwargs.get("interval_end")),
@@ -1334,47 +1959,127 @@ class DynamoDBSource:
1334
1959
  return dynamodb(table, creds, incremental)
1335
1960
 
1336
1961
 
1962
+ class DoceboSource:
1963
+ def handles_incrementality(self) -> bool:
1964
+ return False
1965
+
1966
+ def dlt_source(self, uri: str, table: str, **kwargs):
1967
+ # docebo://?base_url=https://yourcompany.docebosaas.com&client_id=xxx&client_secret=xxx
1968
+ # Optional: &username=xxx&password=xxx for password grant type
1969
+
1970
+ if kwargs.get("incremental_key"):
1971
+ raise ValueError("Incremental loads are not yet supported for Docebo")
1972
+
1973
+ parsed_uri = urlparse(uri)
1974
+ source_params = parse_qs(parsed_uri.query)
1975
+
1976
+ base_url = source_params.get("base_url")
1977
+ if not base_url:
1978
+ raise ValueError("base_url is required to connect to Docebo")
1979
+
1980
+ client_id = source_params.get("client_id")
1981
+ if not client_id:
1982
+ raise ValueError("client_id is required to connect to Docebo")
1983
+
1984
+ client_secret = source_params.get("client_secret")
1985
+ if not client_secret:
1986
+ raise ValueError("client_secret is required to connect to Docebo")
1987
+
1988
+ # Username and password are optional (uses client_credentials grant if not provided)
1989
+ username = source_params.get("username", [None])[0]
1990
+ password = source_params.get("password", [None])[0]
1991
+
1992
+ # Supported tables
1993
+ supported_tables = [
1994
+ "users",
1995
+ "courses",
1996
+ "user_fields",
1997
+ "branches",
1998
+ "groups",
1999
+ "group_members",
2000
+ "course_fields",
2001
+ "learning_objects",
2002
+ "learning_plans",
2003
+ "learning_plan_enrollments",
2004
+ "learning_plan_course_enrollments",
2005
+ "course_enrollments",
2006
+ "sessions",
2007
+ "categories",
2008
+ "certifications",
2009
+ "external_training",
2010
+ "survey_answers",
2011
+ ]
2012
+ if table not in supported_tables:
2013
+ raise ValueError(
2014
+ f"Resource '{table}' is not supported for Docebo source. Supported tables: {', '.join(supported_tables)}"
2015
+ )
2016
+
2017
+ from ingestr.src.docebo import docebo_source
2018
+
2019
+ return docebo_source(
2020
+ base_url=base_url[0],
2021
+ client_id=client_id[0],
2022
+ client_secret=client_secret[0],
2023
+ username=username,
2024
+ password=password,
2025
+ ).with_resources(table)
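For orientation, a hedged usage sketch for the Docebo source class above (the URI values are placeholders, and the import path is an assumption since this diff does not show the module name):

    # Illustrative only; DoceboSource is assumed to live in ingestr's sources module.
    from ingestr.src.sources import DoceboSource  # assumed path

    uri = (
        "docebo://?base_url=https://example.docebosaas.com"
        "&client_id=CLIENT_ID&client_secret=CLIENT_SECRET"
    )
    # Returns a dlt source restricted to the `users` resource.
    source = DoceboSource().dlt_source(uri, "users")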
2026
+
2027
+
1337
2028
  class GoogleAnalyticsSource:
1338
2029
  def handles_incrementality(self) -> bool:
1339
2030
  return True
1340
2031
 
1341
2032
  def dlt_source(self, uri: str, table: str, **kwargs):
1342
- parse_uri = urlparse(uri)
1343
- source_fields = parse_qs(parse_uri.query)
1344
- cred_path = source_fields.get("credentials_path")
1345
-
1346
- if not cred_path:
1347
- raise ValueError("credentials_path is required to connect Google Analytics")
1348
- credentials = {}
2033
+ import ingestr.src.google_analytics.helpers as helpers
1349
2034
 
1350
- with open(cred_path[0], "r") as f:
1351
- credentials = json.load(f)
2035
+ if kwargs.get("incremental_key"):
2036
+ raise ValueError(
2037
+ "Google Analytics takes care of incrementality on its own, you should not provide incremental_key"
2038
+ )
1352
2039
 
1353
- property_id = source_fields.get("property_id")
1354
- if not property_id:
1355
- raise ValueError("property_id is required to connect to Google Analytics")
2040
+ result = helpers.parse_google_analytics_uri(uri)
2041
+ credentials = result["credentials"]
2042
+ property_id = result["property_id"]
1356
2043
 
1357
2044
  fields = table.split(":")
1358
- if len(fields) != 3:
2045
+ if len(fields) != 3 and len(fields) != 4:
1359
2046
  raise ValueError(
1360
- "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
2047
+ "Invalid table format. Expected format: <report_type>:<dimensions>:<metrics> or <report_type>:<dimensions>:<metrics>:<minute_ranges>"
1361
2048
  )
1362
2049
 
1363
- dimensions = fields[1].replace(" ", "").split(",")
1364
-
1365
- datetime = ""
1366
- for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
1367
- if dimension_datetime in dimensions:
1368
- datetime = dimension_datetime
1369
- break
1370
- else:
2050
+ report_type = fields[0]
2051
+ if report_type not in ["custom", "realtime"]:
1371
2052
  raise ValueError(
1372
- "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
2053
+ "Invalid report type. Expected format: <report_type>:<dimensions>:<metrics>. Available report types: custom, realtime"
1373
2054
  )
1374
2055
 
2056
+ dimensions = fields[1].replace(" ", "").split(",")
1375
2057
  metrics = fields[2].replace(" ", "").split(",")
2058
+
2059
+ minute_range_objects = []
2060
+ if len(fields) == 4:
2061
+ minute_range_objects = (
2062
+ helpers.convert_minutes_ranges_to_minute_range_objects(fields[3])
2063
+ )
2064
+
2065
+ datetime = ""
2066
+ resource_name = fields[0].lower()
2067
+ if resource_name == "custom":
2068
+ for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
2069
+ if dimension_datetime in dimensions:
2070
+ datetime = dimension_datetime
2071
+ break
2072
+ else:
2073
+ raise ValueError(
2074
+ "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
2075
+ )
2076
+
1376
2077
  queries = [
1377
- {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
2078
+ {
2079
+ "resource_name": resource_name,
2080
+ "dimensions": dimensions,
2081
+ "metrics": metrics,
2082
+ }
1378
2083
  ]
1379
2084
 
1380
2085
  start_date = pendulum.now().subtract(days=30).start_of("day")
@@ -1385,14 +2090,17 @@ class GoogleAnalyticsSource:
1385
2090
  if kwargs.get("interval_end") is not None:
1386
2091
  end_date = pendulum.instance(kwargs.get("interval_end")) # type: ignore
1387
2092
 
2093
+ from ingestr.src.google_analytics import google_analytics
2094
+
1388
2095
  return google_analytics(
1389
- property_id=property_id[0],
2096
+ property_id=property_id,
1390
2097
  start_date=start_date,
1391
2098
  end_date=end_date,
1392
2099
  datetime_dimension=datetime,
1393
2100
  queries=queries,
1394
2101
  credentials=credentials,
1395
- ).with_resources("basic_report")
2102
+ minute_range_objects=minute_range_objects if minute_range_objects else None,
2103
+ ).with_resources(resource_name)
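The Google Analytics table specifier now carries the report type, dimensions, metrics and, for realtime reports, optional minute ranges. An illustrative breakdown of the format parsed above (the dimension and metric names are examples, not requirements):

    # "custom:<dimensions>:<metrics>" or "realtime:<dimensions>:<metrics>[:<minute_ranges>]"
    custom_table = "custom:date,country:activeUsers,sessions"
    realtime_table = "realtime:country:activeUsers:0-4,5-9"

    fields = custom_table.split(":")
    dimensions = fields[1].replace(" ", "").split(",")  # ['date', 'country']
    metrics = fields[2].replace(" ", "").split(",")     # ['activeUsers', 'sessions']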
1396
2104
 
1397
2105
 
1398
2106
  class GitHubSource:
@@ -1422,12 +2130,34 @@ class GitHubSource:
1422
2130
 
1423
2131
  access_token = source_fields.get("access_token", [""])[0]
1424
2132
 
2133
+ from ingestr.src.github import (
2134
+ github_reactions,
2135
+ github_repo_events,
2136
+ github_stargazers,
2137
+ )
2138
+
1425
2139
  if table in ["issues", "pull_requests"]:
1426
2140
  return github_reactions(
1427
2141
  owner=owner, name=repo, access_token=access_token
1428
2142
  ).with_resources(table)
1429
2143
  elif table == "repo_events":
1430
- return github_repo_events(owner=owner, name=repo, access_token=access_token)
2144
+ start_date = kwargs.get("interval_start") or pendulum.now().subtract(
2145
+ days=30
2146
+ )
2147
+ end_date = kwargs.get("interval_end") or None
2148
+
2149
+ if isinstance(start_date, str):
2150
+ start_date = pendulum.parse(start_date)
2151
+ if isinstance(end_date, str):
2152
+ end_date = pendulum.parse(end_date)
2153
+
2154
+ return github_repo_events(
2155
+ owner=owner,
2156
+ name=repo,
2157
+ access_token=access_token,
2158
+ start_date=start_date,
2159
+ end_date=end_date,
2160
+ )
1431
2161
  elif table == "stargazers":
1432
2162
  return github_stargazers(owner=owner, name=repo, access_token=access_token)
1433
2163
  else:
@@ -1454,6 +2184,8 @@ class AppleAppStoreSource:
1454
2184
  else:
1455
2185
  key = base64.b64decode(key_base64[0]).decode() # type: ignore
1456
2186
 
2187
+ from ingestr.src.appstore.client import AppStoreConnectClient
2188
+
1457
2189
  return AppStoreConnectClient(key.encode(), key_id, issuer_id)
1458
2190
 
1459
2191
  def dlt_source(self, uri: str, table: str, **kwargs):
@@ -1494,6 +2226,8 @@ class AppleAppStoreSource:
1494
2226
  if app_ids is None:
1495
2227
  raise MissingValueError("app_id", "App Store")
1496
2228
 
2229
+ from ingestr.src.appstore import app_store
2230
+
1497
2231
  src = app_store(
1498
2232
  client,
1499
2233
  app_ids,
@@ -1550,21 +2284,24 @@ class GCSSource:
1550
2284
  # (The RECOMMENDED way of passing service account credentials)
1551
2285
  # directly with gcsfs. As a workaround, we construct the GCSFileSystem
1552
2286
  # and pass it directly to filesystem.readers.
2287
+ import gcsfs # type: ignore
2288
+
1553
2289
  fs = gcsfs.GCSFileSystem(
1554
2290
  token=credentials,
1555
2291
  )
1556
2292
 
1557
- file_extension = path_to_file.split(".")[-1]
1558
- if file_extension == "csv":
1559
- endpoint = "read_csv"
1560
- elif file_extension == "jsonl":
1561
- endpoint = "read_jsonl"
1562
- elif file_extension == "parquet":
1563
- endpoint = "read_parquet"
1564
- else:
2293
+ try:
2294
+ endpoint = blob.parse_endpoint(path_to_file)
2295
+ except blob.UnsupportedEndpointError:
1565
2296
  raise ValueError(
1566
2297
  "GCS Source only supports specific formats files: csv, jsonl, parquet"
1567
2298
  )
2299
+ except Exception as e:
2300
+ raise ValueError(
2301
+ f"Failed to parse endpoint from path: {path_to_file}"
2302
+ ) from e
2303
+
2304
+ from ingestr.src.filesystem import readers
1568
2305
 
1569
2306
  return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
1570
2307
 
@@ -1573,7 +2310,9 @@ class GoogleAdsSource:
1573
2310
  def handles_incrementality(self) -> bool:
1574
2311
  return True
1575
2312
 
1576
- def init_client(self, params: Dict[str, List[str]]) -> GoogleAdsClient:
2313
+ def init_client(self, params: Dict[str, List[str]]):
2314
+ from google.ads.googleads.client import GoogleAdsClient # type: ignore
2315
+
1577
2316
  dev_token = params.get("dev_token")
1578
2317
  if dev_token is None or len(dev_token) == 0:
1579
2318
  raise MissingValueError("dev_token", "Google Ads")
@@ -1627,6 +2366,7 @@ class GoogleAdsSource:
1627
2366
  raise MissingValueError("customer_id", "Google Ads")
1628
2367
 
1629
2368
  params = parse_qs(parsed_uri.query)
2369
+
1630
2370
  client = self.init_client(params)
1631
2371
 
1632
2372
  start_date = kwargs.get("interval_start") or datetime.now(
@@ -1648,6 +2388,8 @@ class GoogleAdsSource:
1648
2388
  report_spec = table
1649
2389
  table = "daily_report"
1650
2390
 
2391
+ from ingestr.src.google_ads import google_ads
2392
+
1651
2393
  src = google_ads(
1652
2394
  client,
1653
2395
  customer_id,
@@ -1667,6 +2409,11 @@ class LinkedInAdsSource:
1667
2409
  return True
1668
2410
 
1669
2411
  def dlt_source(self, uri: str, table: str, **kwargs):
2412
+ if kwargs.get("incremental_key"):
2413
+ raise ValueError(
2414
+ "LinkedIn Ads takes care of incrementality on its own, you should not provide incremental_key"
2415
+ )
2416
+
1670
2417
  parsed_uri = urlparse(uri)
1671
2418
  source_fields = parse_qs(parsed_uri.query)
1672
2419
 
@@ -1712,6 +2459,12 @@ class LinkedInAdsSource:
1712
2459
  "'date' or 'month' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
1713
2460
  )
1714
2461
 
2462
+ from ingestr.src.linkedin_ads import linked_in_ads_source
2463
+ from ingestr.src.linkedin_ads.dimension_time_enum import (
2464
+ Dimension,
2465
+ TimeGranularity,
2466
+ )
2467
+
1715
2468
  if "date" in dimensions:
1716
2469
  time_granularity = TimeGranularity.daily
1717
2470
  dimensions.remove("date")
@@ -1737,3 +2490,2170 @@ class LinkedInAdsSource:
1737
2490
  metrics=metrics,
1738
2491
  time_granularity=time_granularity,
1739
2492
  ).with_resources("custom_reports")
2493
+
2494
+
2495
+ class ClickupSource:
2496
+ def handles_incrementality(self) -> bool:
2497
+ return True
2498
+
2499
+ def dlt_source(self, uri: str, table: str, **kwargs):
2500
+ if kwargs.get("incremental_key"):
2501
+ raise ValueError(
2502
+ "ClickUp takes care of incrementality on its own, you should not provide incremental_key"
2503
+ )
2504
+
2505
+ parsed_uri = urlparse(uri)
2506
+ params = parse_qs(parsed_uri.query)
2507
+ api_token = params.get("api_token")
2508
+
2509
+ if api_token is None:
2510
+ raise MissingValueError("api_token", "ClickUp")
2511
+
2512
+ interval_start = kwargs.get("interval_start")
2513
+ interval_end = kwargs.get("interval_end")
2514
+ start_date = (
2515
+ ensure_pendulum_datetime(interval_start).in_timezone("UTC")
2516
+ if interval_start
2517
+ else pendulum.datetime(2020, 1, 1, tz="UTC")
2518
+ )
2519
+ end_date = (
2520
+ ensure_pendulum_datetime(interval_end).in_timezone("UTC")
2521
+ if interval_end
2522
+ else None
2523
+ )
2524
+
2525
+ from ingestr.src.clickup import clickup_source
2526
+
2527
+ if table not in {"user", "teams", "lists", "tasks", "spaces"}:
2528
+ raise UnsupportedResourceError(table, "ClickUp")
2529
+
2530
+ return clickup_source(
2531
+ api_token=api_token[0], start_date=start_date, end_date=end_date
2532
+ ).with_resources(table)
2533
+
2534
+
2535
+ class AppLovinSource:
2536
+ def handles_incrementality(self) -> bool:
2537
+ return True
2538
+
2539
+ def dlt_source(self, uri: str, table: str, **kwargs):
2540
+ if kwargs.get("incremental_key") is not None:
2541
+ raise ValueError(
2542
+ "Applovin takes care of incrementality on its own, you should not provide incremental_key"
2543
+ )
2544
+
2545
+ parsed_uri = urlparse(uri)
2546
+ params = parse_qs(parsed_uri.query)
2547
+
2548
+ api_key = params.get("api_key", None)
2549
+ if api_key is None:
2550
+ raise MissingValueError("api_key", "AppLovin")
2551
+
2552
+ interval_start = kwargs.get("interval_start")
2553
+ interval_end = kwargs.get("interval_end")
2554
+
2555
+ now = datetime.now()
2556
+ start_date = (
2557
+ interval_start if interval_start is not None else now - timedelta(days=1)
2558
+ )
2559
+ end_date = interval_end
2560
+
2561
+ custom_report = None
2562
+ if table.startswith("custom:"):
2563
+ custom_report = table
2564
+ table = "custom_report"
2565
+
2566
+ from ingestr.src.applovin import applovin_source
2567
+
2568
+ src = applovin_source(
2569
+ api_key[0],
2570
+ start_date.strftime("%Y-%m-%d"),
2571
+ end_date.strftime("%Y-%m-%d") if end_date else None,
2572
+ custom_report,
2573
+ )
2574
+
2575
+ if table not in src.resources:
2576
+ raise UnsupportedResourceError(table, "AppLovin")
2577
+
2578
+ return src.with_resources(table)
2579
+
2580
+
2581
+ class ApplovinMaxSource:
2582
+ # expected uri format: applovinmax://?api_key=<api_key>
2583
+ # expected table format: user_ad_revenue:app_id_1,app_id_2
2584
+
2585
+ def handles_incrementality(self) -> bool:
2586
+ return True
2587
+
2588
+ def dlt_source(self, uri: str, table: str, **kwargs):
2589
+ if kwargs.get("incremental_key"):
2590
+ raise ValueError(
2591
+ "AppLovin Max takes care of incrementality on its own, you should not provide incremental_key"
2592
+ )
2593
+
2594
+ parsed_uri = urlparse(uri)
2595
+ params = parse_qs(parsed_uri.query)
2596
+
2597
+ api_key = params.get("api_key")
2598
+ if api_key is None:
2599
+ raise ValueError("api_key is required to connect to AppLovin Max API.")
2600
+
2601
+ AVAILABLE_TABLES = ["user_ad_revenue"]
2602
+
2603
+ table_fields = table.split(":")
2604
+ requested_table = table_fields[0]
2605
+
2606
+ if len(table_fields) != 2:
2607
+ raise ValueError(
2608
+ "Invalid table format. Expected format is user_ad_revenue:app_id_1,app_id_2"
2609
+ )
2610
+
2611
+ if requested_table not in AVAILABLE_TABLES:
2612
+ raise ValueError(
2613
+ f"Table name '{requested_table}' is not supported for AppLovin Max source yet."
2614
+ f"Only '{AVAILABLE_TABLES}' are currently supported. "
2615
+ "If you need additional tables, please create a GitHub issue at "
2616
+ "https://github.com/bruin-data/ingestr"
2617
+ )
2618
+
2619
+ applications = [
2620
+ i for i in table_fields[1].replace(" ", "").split(",") if i.strip()
2621
+ ]
2622
+ if len(applications) == 0:
2623
+ raise ValueError("At least one application id is required")
2624
+
2625
+ if len(applications) != len(set(applications)):
2626
+ raise ValueError("Application ids must be unique.")
2627
+
2628
+ interval_start = kwargs.get("interval_start")
2629
+ interval_end = kwargs.get("interval_end")
2630
+
2631
+ now = pendulum.now("UTC")
2632
+ default_start = now.subtract(days=30).date()
2633
+
2634
+ start_date = (
2635
+ interval_start.date() if interval_start is not None else default_start
2636
+ )
2637
+
2638
+ end_date = interval_end.date() if interval_end is not None else None
2639
+
2640
+ from ingestr.src.applovin_max import applovin_max_source
2641
+
2642
+ return applovin_max_source(
2643
+ start_date=start_date,
2644
+ end_date=end_date,
2645
+ api_key=api_key[0],
2646
+ applications=applications,
2647
+ ).with_resources(requested_table)
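As the comment at the top of the class notes, the AppLovin Max table argument bundles the report name and the application ids. A small illustrative example of the expected format (the app ids are placeholders):

    # "user_ad_revenue:app_id_1,app_id_2"
    table = "user_ad_revenue:com.example.app1,com.example.app2"
    requested_table, app_list = table.split(":", 1)
    applications = [a for a in app_list.replace(" ", "").split(",") if a.strip()]
    # requested_table == "user_ad_revenue"
    # applications == ["com.example.app1", "com.example.app2"]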
2648
+
2649
+
2650
+ class SalesforceSource:
2651
+ def handles_incrementality(self) -> bool:
2652
+ return True
2653
+
2654
+ def dlt_source(self, uri: str, table: str, **kwargs):
2655
+ if kwargs.get("incremental_key"):
2656
+ raise ValueError(
2657
+ "Salesforce takes care of incrementality on its own, you should not provide incremental_key"
2658
+ )
2659
+
2660
+ params = parse_qs(urlparse(uri).query)
2661
+ creds = {
2662
+ "username": params.get("username", [None])[0],
2663
+ "password": params.get("password", [None])[0],
2664
+ "token": params.get("token", [None])[0],
2665
+ "domain": params.get("domain", [None])[0],
2666
+ }
2667
+ for k, v in creds.items():
2668
+ if v is None:
2669
+ raise MissingValueError(k, "Salesforce")
2670
+
2671
+ from ingestr.src.salesforce import salesforce_source
2672
+
2673
+ src = salesforce_source(**creds) # type: ignore
2674
+
2675
+ if table.startswith("custom:"):
2676
+ custom_object = table.split(":")[1]
2677
+ src = salesforce_source(**creds, custom_object=custom_object)
2678
+ return src.with_resources("custom")
2679
+
2680
+ if table not in src.resources:
2681
+ raise UnsupportedResourceError(table, "Salesforce")
2682
+
2683
+ return src.with_resources(table)
2684
+
2685
+
2686
+ class PersonioSource:
2687
+ def handles_incrementality(self) -> bool:
2688
+ return True
2689
+
2690
+ # personio://?client_id=123&client_secret=123
2691
+ def dlt_source(self, uri: str, table: str, **kwargs):
2692
+ if kwargs.get("incremental_key"):
2693
+ raise ValueError(
2694
+ "Personio takes care of incrementality on its own, you should not provide incremental_key"
2695
+ )
2696
+
2697
+ parsed_uri = urlparse(uri)
2698
+ params = parse_qs(parsed_uri.query)
2699
+
2700
+ client_id = params.get("client_id")
2701
+ client_secret = params.get("client_secret")
2702
+
2703
+ interval_start = kwargs.get("interval_start")
2704
+ interval_end = kwargs.get("interval_end")
2705
+
2706
+ interval_start_date = (
2707
+ interval_start if interval_start is not None else "2018-01-01"
2708
+ )
2709
+
2710
+ interval_end_date = (
2711
+ interval_end.strftime("%Y-%m-%d") if interval_end is not None else None
2712
+ )
2713
+
2714
+ if client_id is None:
2715
+ raise MissingValueError("client_id", "Personio")
2716
+ if client_secret is None:
2717
+ raise MissingValueError("client_secret", "Personio")
2718
+ if table not in [
2719
+ "employees",
2720
+ "absences",
2721
+ "absence_types",
2722
+ "attendances",
2723
+ "projects",
2724
+ "document_categories",
2725
+ "employees_absences_balance",
2726
+ "custom_reports_list",
2727
+ ]:
2728
+ raise UnsupportedResourceError(table, "Personio")
2729
+
2730
+ from ingestr.src.personio import personio_source
2731
+
2732
+ return personio_source(
2733
+ client_id=client_id[0],
2734
+ client_secret=client_secret[0],
2735
+ start_date=interval_start_date,
2736
+ end_date=interval_end_date,
2737
+ ).with_resources(table)
2738
+
2739
+
2740
+ class KinesisSource:
2741
+ def handles_incrementality(self) -> bool:
2742
+ return True
2743
+
2744
+ def dlt_source(self, uri: str, table: str, **kwargs):
2745
+ # kinesis://?aws_access_key_id=<AccessKeyId>&aws_secret_access_key=<SecretAccessKey>&region_name=<Region>
2746
+ # source table = stream name
2747
+ parsed_uri = urlparse(uri)
2748
+ params = parse_qs(parsed_uri.query)
2749
+
2750
+ aws_access_key_id = params.get("aws_access_key_id")
2751
+ if aws_access_key_id is None:
2752
+ raise MissingValueError("aws_access_key_id", "Kinesis")
2753
+
2754
+ aws_secret_access_key = params.get("aws_secret_access_key")
2755
+ if aws_secret_access_key is None:
2756
+ raise MissingValueError("aws_secret_access_key", "Kinesis")
2757
+
2758
+ region_name = params.get("region_name")
2759
+ if region_name is None:
2760
+ raise MissingValueError("region_name", "Kinesis")
2761
+
2762
+ start_date = kwargs.get("interval_start")
2763
+ if start_date is not None:
2764
+ # the resource will read all messages after this timestamp.
2765
+ start_date = ensure_pendulum_datetime(start_date)
2766
+
2767
+ from dlt.common.configuration.specs import AwsCredentials
2768
+
2769
+ from ingestr.src.kinesis import kinesis_stream
2770
+
2771
+ credentials = AwsCredentials(
2772
+ aws_access_key_id=aws_access_key_id[0],
2773
+ aws_secret_access_key=aws_secret_access_key[0],
2774
+ region_name=region_name[0],
2775
+ )
2776
+
2777
+ return kinesis_stream(
2778
+ stream_name=table, credentials=credentials, initial_at_timestamp=start_date
2779
+ )
2780
+
2781
+
2782
+ class PipedriveSource:
2783
+ def handles_incrementality(self) -> bool:
2784
+ return True
2785
+
2786
+ def dlt_source(self, uri: str, table: str, **kwargs):
2787
+ if kwargs.get("incremental_key"):
2788
+ raise ValueError(
2789
+ "Pipedrive takes care of incrementality on its own, you should not provide incremental_key"
2790
+ )
2791
+
2792
+ parsed_uri = urlparse(uri)
2793
+ params = parse_qs(parsed_uri.query)
2794
+ api_key = params.get("api_token")
2795
+ if api_key is None:
2796
+ raise MissingValueError("api_token", "Pipedrive")
2797
+
2798
+ start_date = kwargs.get("interval_start")
2799
+ if start_date is not None:
2800
+ start_date = ensure_pendulum_datetime(start_date)
2801
+ else:
2802
+ start_date = pendulum.parse("2000-01-01")
2803
+
2804
+ if table not in [
2805
+ "users",
2806
+ "activities",
2807
+ "persons",
2808
+ "organizations",
2809
+ "products",
2810
+ "stages",
2811
+ "deals",
2812
+ ]:
2813
+ raise UnsupportedResourceError(table, "Pipedrive")
2814
+
2815
+ from ingestr.src.pipedrive import pipedrive_source
2816
+
2817
+ return pipedrive_source(
2818
+ pipedrive_api_key=api_key, since_timestamp=start_date
2819
+ ).with_resources(table)
2820
+
2821
+
2822
+ class FrankfurterSource:
2823
+ def handles_incrementality(self) -> bool:
2824
+ return True
2825
+
2826
+ def dlt_source(self, uri: str, table: str, **kwargs):
2827
+ if kwargs.get("incremental_key"):
2828
+ raise ValueError(
2829
+ "Frankfurter takes care of incrementality on its own, you should not provide incremental_key"
2830
+ )
2831
+
2832
+ from ingestr.src.frankfurter import frankfurter_source
2833
+ from ingestr.src.frankfurter.helpers import validate_currency, validate_dates
2834
+
2835
+ parsed_uri = urlparse(uri)
2836
+ source_params = parse_qs(parsed_uri.query)
2837
+ base_currency = source_params.get("base", [None])[0]
2838
+
2839
+ if not base_currency:
2840
+ base_currency = "USD"
2841
+
2842
+ validate_currency(base_currency)
2843
+
2844
+ if kwargs.get("interval_start"):
2845
+ start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
2846
+ else:
2847
+ start_date = pendulum.yesterday()
2848
+
2849
+ if kwargs.get("interval_end"):
2850
+ end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
2851
+ else:
2852
+ end_date = None
2853
+
2854
+ validate_dates(start_date=start_date, end_date=end_date)
2855
+
2856
+ src = frankfurter_source(
2857
+ start_date=start_date,
2858
+ end_date=end_date,
2859
+ base_currency=base_currency,
2860
+ )
2861
+
2862
+ if table not in src.resources:
2863
+ raise UnsupportedResourceError(table, "Frankfurter")
2864
+
2865
+ return src.with_resources(table)
2866
+
2867
+
2868
+ class FreshdeskSource:
2869
+ # freshdesk://domain?api_key=<api_key>
2870
+ def handles_incrementality(self) -> bool:
2871
+ return True
2872
+
2873
+ def dlt_source(self, uri: str, table: str, **kwargs):
2874
+ if kwargs.get("incremental_key"):
2875
+ raise ValueError(
2876
+ "Freshdesk takes care of incrementality on its own, you should not provide incremental_key"
2877
+ )
2878
+
2879
+ parsed_uri = urlparse(uri)
2880
+ domain = parsed_uri.netloc
2881
+ query = parsed_uri.query
2882
+ params = parse_qs(query)
2883
+
2884
+ if not domain:
2885
+ raise MissingValueError("domain", "Freshdesk")
2886
+
2887
+ if "." in domain:
2888
+ domain = domain.split(".")[0]
2889
+
2890
+ api_key = params.get("api_key")
2891
+ if api_key is None:
2892
+ raise MissingValueError("api_key", "Freshdesk")
2893
+
2894
+ start_date = kwargs.get("interval_start")
2895
+ if start_date is not None:
2896
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
2897
+ else:
2898
+ start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
2899
+
2900
+ end_date = kwargs.get("interval_end")
2901
+ if end_date is not None:
2902
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
2903
+ else:
2904
+ end_date = None
2905
+
2906
+ custom_query: Optional[str] = None
2907
+ if ":" in table:
2908
+ table, custom_query = table.split(":", 1)
2909
+
2910
+ if table not in [
2911
+ "agents",
2912
+ "companies",
2913
+ "contacts",
2914
+ "groups",
2915
+ "roles",
2916
+ "tickets",
2917
+ ]:
2918
+ raise UnsupportedResourceError(table, "Freshdesk")
2919
+
2920
+ if custom_query and table != "tickets":
2921
+ raise ValueError(f"Custom query is not supported for {table}")
2922
+
2923
+ from ingestr.src.freshdesk import freshdesk_source
2924
+
2925
+ return freshdesk_source(
2926
+ api_secret_key=api_key[0],
2927
+ domain=domain,
2928
+ start_date=start_date,
2929
+ end_date=end_date,
2930
+ query=custom_query,
2931
+ ).with_resources(table)
2932
+
2933
+
2934
+ class TrustpilotSource:
2935
+ # trustpilot://<business_unit_id>?api_key=<api_key>
2936
+ def handles_incrementality(self) -> bool:
2937
+ return True
2938
+
2939
+ def dlt_source(self, uri: str, table: str, **kwargs):
2940
+ if kwargs.get("incremental_key"):
2941
+ raise ValueError(
2942
+ "Trustpilot takes care of incrementality on its own, you should not provide incremental_key"
2943
+ )
2944
+
2945
+ parsed_uri = urlparse(uri)
2946
+ business_unit_id = parsed_uri.netloc
2947
+ params = parse_qs(parsed_uri.query)
2948
+
2949
+ if not business_unit_id:
2950
+ raise MissingValueError("business_unit_id", "Trustpilot")
2951
+
2952
+ api_key = params.get("api_key")
2953
+ if api_key is None:
2954
+ raise MissingValueError("api_key", "Trustpilot")
2955
+
2956
+ start_date = kwargs.get("interval_start")
2957
+ if start_date is None:
2958
+ start_date = ensure_pendulum_datetime("2000-01-01").in_tz("UTC").isoformat()
2959
+ else:
2960
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC").isoformat()
2961
+
2962
+ end_date = kwargs.get("interval_end")
2963
+
2964
+ if end_date is not None:
2965
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC").isoformat()
2966
+
2967
+ if table not in ["reviews"]:
2968
+ raise UnsupportedResourceError(table, "Trustpilot")
2969
+
2970
+ from ingestr.src.trustpilot import trustpilot_source
2971
+
2972
+ return trustpilot_source(
2973
+ business_unit_id=business_unit_id,
2974
+ api_key=api_key[0],
2975
+ start_date=start_date,
2976
+ end_date=end_date,
2977
+ ).with_resources(table)
2978
+
2979
+
2980
+ class PhantombusterSource:
2981
+ def handles_incrementality(self) -> bool:
2982
+ return True
2983
+
2984
+ def dlt_source(self, uri: str, table: str, **kwargs):
2985
+ if kwargs.get("incremental_key"):
2986
+ raise ValueError(
2987
+ "Phantombuster takes care of incrementality on its own, you should not provide incremental_key"
2988
+ )
2989
+
2990
+ # phantombuster://?api_key=<api_key>
2991
+ # source table = phantom_results:agent_id
2992
+ parsed_uri = urlparse(uri)
2993
+ params = parse_qs(parsed_uri.query)
2994
+ api_key = params.get("api_key")
2995
+ if api_key is None:
2996
+ raise MissingValueError("api_key", "Phantombuster")
2997
+
2998
+ table_fields = table.replace(" ", "").split(":")
2999
+ table_name = table_fields[0]
3000
+
3001
+ agent_id = table_fields[1] if len(table_fields) > 1 else None
3002
+
3003
+ if table_name not in ["completed_phantoms"]:
3004
+ raise UnsupportedResourceError(table_name, "Phantombuster")
3005
+
3006
+ if not agent_id:
3007
+ raise MissingValueError("agent_id", "Phantombuster")
3008
+
3009
+ start_date = kwargs.get("interval_start")
3010
+ if start_date is None:
3011
+ start_date = ensure_pendulum_datetime("2018-01-01").in_tz("UTC")
3012
+ else:
3013
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3014
+
3015
+ end_date = kwargs.get("interval_end")
3016
+ if end_date is not None:
3017
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3018
+
3019
+ from ingestr.src.phantombuster import phantombuster_source
3020
+
3021
+ return phantombuster_source(
3022
+ api_key=api_key[0],
3023
+ agent_id=agent_id,
3024
+ start_date=start_date,
3025
+ end_date=end_date,
3026
+ ).with_resources(table_name)
3027
+
3028
+
3029
+ class ElasticsearchSource:
3030
+ def handles_incrementality(self) -> bool:
3031
+ return False
3032
+
3033
+ def dlt_source(self, uri: str, table: str, **kwargs):
3034
+ from ingestr.src.elasticsearch import elasticsearch_source
3035
+
3036
+ incremental = None
3037
+ if kwargs.get("incremental_key"):
3038
+ start_value = kwargs.get("interval_start")
3039
+ end_value = kwargs.get("interval_end")
3040
+
3041
+ incremental = dlt_incremental(
3042
+ kwargs.get("incremental_key", ""),
3043
+ initial_value=start_value,
3044
+ end_value=end_value,
3045
+ range_end="closed",
3046
+ range_start="closed",
3047
+ )
3048
+
3049
+ # elasticsearch://localhost:9200?secure=true&verify_certs=false
3050
+ parsed = urlparse(uri)
3051
+
3052
+ index = table
3053
+ if not index:
3054
+ raise ValueError(
3055
+ "Table name must be provided which is the index name in elasticsearch"
3056
+ )
3057
+
3058
+ query_params = parsed.query
3059
+ params = parse_qs(query_params)
3060
+
3061
+ secure = True
3062
+ if "secure" in params:
3063
+ secure = params["secure"][0].capitalize() == "True"
3064
+
3065
+ verify_certs = True
3066
+ if "verify_certs" in params:
3067
+ verify_certs = params["verify_certs"][0].capitalize() == "True"
3068
+
3069
+ scheme = "https" if secure else "http"
3070
+ netloc = parsed.netloc
3071
+ connection_url = f"{scheme}://{netloc}"
3072
+
3073
+ return elasticsearch_source(
3074
+ connection_url=connection_url,
3075
+ index=index,
3076
+ verify_certs=verify_certs,
3077
+ incremental=incremental,
3078
+ ).with_resources(table)
3079
+
3080
+
3081
+ class AttioSource:
3082
+ def handles_incrementality(self) -> bool:
3083
+ return False
3084
+
3085
+ def dlt_source(self, uri: str, table: str, **kwargs):
3086
+ parsed_uri = urlparse(uri)
3087
+ query_params = parse_qs(parsed_uri.query)
3088
+ api_key = query_params.get("api_key")
3089
+
3090
+ if api_key is None:
3091
+ raise MissingValueError("api_key", "Attio")
3092
+
3093
+ parts = table.replace(" ", "").split(":")
3094
+ table_name = parts[0]
3095
+ params = parts[1:]
3096
+
3097
+ from ingestr.src.attio import attio_source
3098
+
3099
+ try:
3100
+ return attio_source(api_key=api_key[0], params=params).with_resources(
3101
+ table_name
3102
+ )
3103
+ except ResourcesNotFoundError:
3104
+ raise UnsupportedResourceError(table_name, "Attio")
3105
+
3106
+
3107
+ class SmartsheetSource:
3108
+ def handles_incrementality(self) -> bool:
3109
+ return False
3110
+
3111
+ # smartsheet://?access_token=<access_token>
3112
+ def dlt_source(self, uri: str, table: str, **kwargs):
3113
+ if kwargs.get("incremental_key"):
3114
+ raise ValueError("Incremental loads are not supported for Smartsheet")
3115
+
3116
+ if not table:
3117
+ raise ValueError(
3118
+ "Source table (sheet_id) is required to connect to Smartsheet"
3119
+ )
3120
+
3121
+ source_parts = urlparse(uri)
3122
+ source_fields = parse_qs(source_parts.query)
3123
+ access_token = source_fields.get("access_token")
3124
+
3125
+ if not access_token:
3126
+ raise ValueError(
3127
+ "access_token in the URI is required to connect to Smartsheet"
3128
+ )
3129
+
3130
+ from ingestr.src.smartsheets import smartsheet_source
3131
+
3132
+ return smartsheet_source(
3133
+ access_token=access_token[0],
3134
+ sheet_id=table, # table is now a single sheet_id
3135
+ )
3136
+
3137
+
3138
+ class SolidgateSource:
3139
+ def handles_incrementality(self) -> bool:
3140
+ return True
3141
+
3142
+ def dlt_source(self, uri: str, table: str, **kwargs):
3143
+ if kwargs.get("incremental_key"):
3144
+ raise ValueError(
3145
+ "Solidgate takes care of incrementality on its own, you should not provide incremental_key"
3146
+ )
3147
+
3148
+ parsed_uri = urlparse(uri)
3149
+ query_params = parse_qs(parsed_uri.query)
3150
+ public_key = query_params.get("public_key")
3151
+ secret_key = query_params.get("secret_key")
3152
+
3153
+ if public_key is None:
3154
+ raise MissingValueError("public_key", "Solidgate")
3155
+
3156
+ if secret_key is None:
3157
+ raise MissingValueError("secret_key", "Solidgate")
3158
+
3159
+ table_name = table.replace(" ", "")
3160
+
3161
+ start_date = kwargs.get("interval_start")
3162
+ if start_date is None:
3163
+ start_date = pendulum.yesterday().in_tz("UTC")
3164
+ else:
3165
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3166
+
3167
+ end_date = kwargs.get("interval_end")
3168
+
3169
+ if end_date is not None:
3170
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3171
+
3172
+ from ingestr.src.solidgate import solidgate_source
3173
+
3174
+ try:
3175
+ return solidgate_source(
3176
+ public_key=public_key[0],
3177
+ secret_key=secret_key[0],
3178
+ start_date=start_date,
3179
+ end_date=end_date,
3180
+ ).with_resources(table_name)
3181
+ except ResourcesNotFoundError:
3182
+ raise UnsupportedResourceError(table_name, "Solidgate")
3183
+
3184
+
3185
+ class SFTPSource:
3186
+ def handles_incrementality(self) -> bool:
3187
+ return True
3188
+
3189
+ def dlt_source(self, uri: str, table: str, **kwargs):
3190
+ parsed_uri = urlparse(uri)
3191
+ host = parsed_uri.hostname
3192
+ if not host:
3193
+ raise MissingValueError("host", "SFTP URI")
3194
+ port = parsed_uri.port or 22
3195
+ username = parsed_uri.username
3196
+ password = parsed_uri.password
3197
+
3198
+ params: Dict[str, Any] = {
3199
+ "host": host,
3200
+ "port": port,
3201
+ "username": username,
3202
+ "password": password,
3203
+ "look_for_keys": False,
3204
+ "allow_agent": False,
3205
+ }
3206
+
3207
+ try:
3208
+ fs = fsspec.filesystem("sftp", **params)
3209
+ except Exception as e:
3210
+ raise ConnectionError(
3211
+ f"Failed to connect or authenticate to sftp server {host}:{port}. Error: {e}"
3212
+ )
3213
+ bucket_url = f"sftp://{host}:{port}"
3214
+
3215
+ if table.startswith("/"):
3216
+ file_glob = table
3217
+ else:
3218
+ file_glob = f"/{table}"
3219
+
3220
+ try:
3221
+ endpoint = blob.parse_endpoint(table)
3222
+ except blob.UnsupportedEndpointError:
3223
+ raise ValueError(
3224
+ "SFTP Source only supports specific formats files: csv, jsonl, parquet"
3225
+ )
3226
+ except Exception as e:
3227
+ raise ValueError(f"Failed to parse endpoint from path: {table}") from e
3228
+
3229
+ from ingestr.src.filesystem import readers
3230
+
3231
+ dlt_source_resource = readers(bucket_url, fs, file_glob)
3232
+ return dlt_source_resource.with_resources(endpoint)
3233
+
3234
+
3235
+ class QuickBooksSource:
3236
+ def handles_incrementality(self) -> bool:
3237
+ return True
3238
+
3239
+ # quickbooks://?company_id=<company_id>&client_id=<client_id>&client_secret=<client_secret>&refresh_token=<refresh>&access_token=<access_token>&environment=<env>&minor_version=<version>
3240
+ def dlt_source(self, uri: str, table: str, **kwargs):
3241
+ if kwargs.get("incremental_key"):
3242
+ raise ValueError(
3243
+ "QuickBooks takes care of incrementality on its own, you should not provide incremental_key"
3244
+ )
3245
+
3246
+ parsed_uri = urlparse(uri)
3247
+
3248
+ params = parse_qs(parsed_uri.query)
3249
+ company_id = params.get("company_id")
3250
+ client_id = params.get("client_id")
3251
+ client_secret = params.get("client_secret")
3252
+ refresh_token = params.get("refresh_token")
3253
+ environment = params.get("environment", ["production"])
3254
+ minor_version = params.get("minor_version", [None])
3255
+
3256
+ if not client_id or not client_id[0].strip():
3257
+ raise MissingValueError("client_id", "QuickBooks")
3258
+
3259
+ if not client_secret or not client_secret[0].strip():
3260
+ raise MissingValueError("client_secret", "QuickBooks")
3261
+
3262
+ if not refresh_token or not refresh_token[0].strip():
3263
+ raise MissingValueError("refresh_token", "QuickBooks")
3264
+
3265
+ if not company_id or not company_id[0].strip():
3266
+ raise MissingValueError("company_id", "QuickBooks")
3267
+
3268
+ if environment[0] not in ["production", "sandbox"]:
3269
+ raise ValueError(
3270
+ "Invalid environment. Must be either 'production' or 'sandbox'."
3271
+ )
3272
+
3273
+ from ingestr.src.quickbooks import quickbooks_source
3274
+
3275
+ table_name = table.replace(" ", "")
3276
+ table_mapping = {
3277
+ "customers": "customer",
3278
+ "invoices": "invoice",
3279
+ "accounts": "account",
3280
+ "vendors": "vendor",
3281
+ "payments": "payment",
3282
+ }
3283
+ if table_name in table_mapping:
3284
+ table_name = table_mapping[table_name]
3285
+
3286
+ start_date = kwargs.get("interval_start")
3287
+ if start_date is None:
3288
+ start_date = ensure_pendulum_datetime("2025-01-01").in_tz("UTC")
3289
+ else:
3290
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3291
+
3292
+ end_date = kwargs.get("interval_end")
3293
+
3294
+ if end_date is not None:
3295
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3296
+
3297
+ return quickbooks_source(
3298
+ company_id=company_id[0],
3299
+ start_date=start_date,
3300
+ end_date=end_date,
3301
+ client_id=client_id[0],
3302
+ client_secret=client_secret[0],
3303
+ refresh_token=refresh_token[0],
3304
+ environment=environment[0],
3305
+ minor_version=minor_version[0],
3306
+ object=table_name,
3307
+ ).with_resources(table_name)
3308
+
3309
+
3310
+ class IsocPulseSource:
3311
+ def handles_incrementality(self) -> bool:
3312
+ return True
3313
+
3314
+ def dlt_source(self, uri: str, table: str, **kwargs):
3315
+ if kwargs.get("incremental_key"):
3316
+ raise ValueError(
3317
+ "Internet Society Pulse takes care of incrementality on its own, you should not provide incremental_key"
3318
+ )
3319
+
3320
+ parsed_uri = urlparse(uri)
3321
+ params = parse_qs(parsed_uri.query)
3322
+ token = params.get("token")
3323
+ if not token or not token[0].strip():
3324
+ raise MissingValueError("token", "Internet Society Pulse")
3325
+
3326
+ start_date = kwargs.get("interval_start")
3327
+ if start_date is None:
3328
+ start_date = pendulum.now().in_tz("UTC").subtract(days=30)
3329
+
3330
+ end_date = kwargs.get("interval_end")
3331
+
3332
+ metric = table
3333
+ opts = []
3334
+ if ":" in metric:
3335
+ metric, *opts = metric.strip().split(":")
3336
+ opts = [opt.strip() for opt in opts]
3337
+
3338
+ from ingestr.src.isoc_pulse import pulse_source
3339
+
3340
+ src = pulse_source(
3341
+ token=token[0],
3342
+ start_date=start_date.strftime("%Y-%m-%d"),
3343
+ end_date=end_date.strftime("%Y-%m-%d") if end_date else None,
3344
+ metric=metric,
3345
+ opts=opts,
3346
+ )
3347
+ return src.with_resources(metric)
3348
+
3349
+
3350
+ class PinterestSource:
3351
+ def handles_incrementality(self) -> bool:
3352
+ return True
3353
+
3354
+ def dlt_source(self, uri: str, table: str, **kwargs):
3355
+ if kwargs.get("incremental_key"):
3356
+ raise ValueError(
3357
+ "Pinterest takes care of incrementality on its own, you should not provide incremental_key"
3358
+ )
3359
+
3360
+ parsed = urlparse(uri)
3361
+ params = parse_qs(parsed.query)
3362
+ access_token = params.get("access_token")
3363
+
3364
+ if not access_token:
3365
+ raise MissingValueError("access_token", "Pinterest")
3366
+
3367
+ start_date = kwargs.get("interval_start")
3368
+ if start_date is not None:
3369
+ start_date = ensure_pendulum_datetime(start_date)
3370
+ else:
3371
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3372
+
3373
+ end_date = kwargs.get("interval_end")
3374
+ if end_date is not None:
3375
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3376
+
3377
+ from ingestr.src.pinterest import pinterest_source
3378
+
3379
+ if table not in {"pins", "boards"}:
3380
+ raise UnsupportedResourceError(table, "Pinterest")
3381
+
3382
+ return pinterest_source(
3383
+ access_token=access_token[0],
3384
+ start_date=start_date,
3385
+ end_date=end_date,
3386
+ ).with_resources(table)
3387
+
3388
+
3389
+ class FluxxSource:
3390
+ def handles_incrementality(self) -> bool:
3391
+ return True
3392
+
3393
+ def dlt_source(self, uri: str, table: str, **kwargs):
3394
+ if kwargs.get("incremental_key"):
3395
+ raise ValueError(
3396
+ "Fluxx takes care of incrementality on its own, you should not provide incremental_key"
3397
+ )
3398
+
3399
+ # Parse URI: fluxx://instance?client_id=xxx&client_secret=xxx
3400
+ parsed_uri = urlparse(uri)
3401
+ source_params = parse_qs(parsed_uri.query)
3402
+
3403
+ instance = parsed_uri.hostname
3404
+ if not instance:
3405
+ raise ValueError(
3406
+ "Instance is required in the URI (e.g., fluxx://mycompany.preprod)"
3407
+ )
3408
+
3409
+ client_id = source_params.get("client_id")
3410
+ if not client_id:
3411
+ raise ValueError("client_id in the URI is required to connect to Fluxx")
3412
+
3413
+ client_secret = source_params.get("client_secret")
3414
+ if not client_secret:
3415
+ raise ValueError("client_secret in the URI is required to connect to Fluxx")
3416
+
3417
+ # Parse date parameters
3418
+ start_date = kwargs.get("interval_start")
3419
+ if start_date:
3420
+ start_date = ensure_pendulum_datetime(start_date)
3421
+
3422
+ end_date = kwargs.get("interval_end")
3423
+ if end_date:
3424
+ end_date = ensure_pendulum_datetime(end_date)
3425
+
3426
+ # Import Fluxx source
3427
+ from ingestr.src.fluxx import fluxx_source
3428
+
3429
+ # Parse table specification for custom column selection
3430
+ # Format: "resource_name:field1,field2,field3" or "resource_name"
3431
+ resources = None
3432
+ custom_fields = {}
3433
+
3434
+ if table:
3435
+ # Handle single resource with custom fields or multiple resources
3436
+ if ":" in table and table.count(":") == 1:
3437
+ # Single resource with custom fields: "grant_request:id,name,amount"
3438
+ resource_name, field_list = table.split(":", 1)
3439
+ resource_name = resource_name.strip()
3440
+ fields = [f.strip() for f in field_list.split(",")]
3441
+ resources = [resource_name]
3442
+ custom_fields[resource_name] = fields
3443
+ else:
3444
+ # Multiple resources or single resource without custom fields
3445
+ # Support comma-separated list: "grant_request,user"
3446
+ resources = [r.strip() for r in table.split(",")]
3447
+
3448
+ return fluxx_source(
3449
+ instance=instance,
3450
+ client_id=client_id[0],
3451
+ client_secret=client_secret[0],
3452
+ start_date=start_date,
3453
+ end_date=end_date,
3454
+ resources=resources,
3455
+ custom_fields=custom_fields,
3456
+ )
3457
+
3458
+
3459
+ class LinearSource:
3460
+ def handles_incrementality(self) -> bool:
3461
+ return True
3462
+
3463
+ def dlt_source(self, uri: str, table: str, **kwargs):
3464
+ if kwargs.get("incremental_key"):
3465
+ raise ValueError(
3466
+ "Linear takes care of incrementality on its own, you should not provide incremental_key"
3467
+ )
3468
+
3469
+ parsed_uri = urlparse(uri)
3470
+ params = parse_qs(parsed_uri.query)
3471
+ api_key = params.get("api_key")
3472
+ if api_key is None:
3473
+ raise MissingValueError("api_key", "Linear")
3474
+
3475
+ if table not in [
3476
+ "issues",
3477
+ "projects",
3478
+ "teams",
3479
+ "users",
3480
+ "workflow_states",
3481
+ "cycles",
3482
+ "attachments",
3483
+ "comments",
3484
+ "documents",
3485
+ "external_users",
3486
+ "initiative",
3487
+ "integrations",
3488
+ "labels",
3489
+ "organization",
3490
+ "project_updates",
3491
+ "team_memberships",
3492
+ "initiative_to_project",
3493
+ "project_milestone",
3494
+ "project_status",
3495
+ ]:
3496
+ raise UnsupportedResourceError(table, "Linear")
3497
+
3498
+ start_date = kwargs.get("interval_start")
3499
+ if start_date is not None:
3500
+ start_date = ensure_pendulum_datetime(start_date)
3501
+ else:
3502
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3503
+
3504
+ end_date = kwargs.get("interval_end")
3505
+ if end_date is not None:
3506
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3507
+
3508
+ from ingestr.src.linear import linear_source
3509
+
3510
+ return linear_source(
3511
+ api_key=api_key[0],
3512
+ start_date=start_date,
3513
+ end_date=end_date,
3514
+ ).with_resources(table)
3515
+
3516
+
3517
+ class RevenueCatSource:
3518
+ def handles_incrementality(self) -> bool:
3519
+ return True
3520
+
3521
+ def dlt_source(self, uri: str, table: str, **kwargs):
3522
+ if kwargs.get("incremental_key"):
3523
+ raise ValueError(
3524
+ "RevenueCat takes care of incrementality on its own, you should not provide incremental_key"
3525
+ )
3526
+
3527
+ parsed_uri = urlparse(uri)
3528
+ params = parse_qs(parsed_uri.query)
3529
+
3530
+ api_key = params.get("api_key")
3531
+ if api_key is None:
3532
+ raise MissingValueError("api_key", "RevenueCat")
3533
+
3534
+ project_id = params.get("project_id")
3535
+ if project_id is None and table != "projects":
3536
+ raise MissingValueError("project_id", "RevenueCat")
3537
+
3538
+ if table not in [
3539
+ "customers",
3540
+ "products",
3541
+ "entitlements",
3542
+ "offerings",
3543
+ "subscriptions",
3544
+ "purchases",
3545
+ "projects",
3546
+ ]:
3547
+ raise UnsupportedResourceError(table, "RevenueCat")
3548
+
3549
+ start_date = kwargs.get("interval_start")
3550
+ if start_date is not None:
3551
+ start_date = ensure_pendulum_datetime(start_date)
3552
+ else:
3553
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3554
+
3555
+ end_date = kwargs.get("interval_end")
3556
+ if end_date is not None:
3557
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3558
+
3559
+ from ingestr.src.revenuecat import revenuecat_source
3560
+
3561
+ return revenuecat_source(
3562
+ api_key=api_key[0],
3563
+ project_id=project_id[0] if project_id is not None else None,
3564
+ ).with_resources(table)
3565
+
3566
+
3567
+ class ZoomSource:
3568
+ def handles_incrementality(self) -> bool:
3569
+ return True
3570
+
3571
+ def dlt_source(self, uri: str, table: str, **kwargs):
3572
+ if kwargs.get("incremental_key"):
3573
+ raise ValueError(
3574
+ "Zoom takes care of incrementality on its own, you should not provide incremental_key"
3575
+ )
3576
+
3577
+ parsed = urlparse(uri)
3578
+ params = parse_qs(parsed.query)
3579
+ client_id = params.get("client_id")
3580
+ client_secret = params.get("client_secret")
3581
+ account_id = params.get("account_id")
3582
+
3583
+ if not (client_id and client_secret and account_id):
3584
+ raise MissingValueError(
3585
+ "client_id/client_secret/account_id",
3586
+ "Zoom",
3587
+ )
3588
+
3589
+ start_date = kwargs.get("interval_start")
3590
+ if start_date is not None:
3591
+ start_date = ensure_pendulum_datetime(start_date)
3592
+ else:
3593
+ start_date = pendulum.datetime(2020, 1, 26).in_tz("UTC")
3594
+
3595
+ end_date = kwargs.get("interval_end")
3596
+ if end_date is not None:
3597
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3598
+
3599
+ from ingestr.src.zoom import zoom_source
3600
+
3601
+ if table not in {"meetings", "users", "participants"}:
3602
+ raise UnsupportedResourceError(table, "Zoom")
3603
+
3604
+ return zoom_source(
3605
+ client_id=client_id[0],
3606
+ client_secret=client_secret[0],
3607
+ account_id=account_id[0],
3608
+ start_date=start_date,
3609
+ end_date=end_date,
3610
+ ).with_resources(table)
3611
+
3612
+
3613
+ class InfluxDBSource:
3614
+ def handles_incrementality(self) -> bool:
3615
+ return True
3616
+
3617
+ def dlt_source(self, uri: str, table: str, **kwargs):
3618
+ if kwargs.get("incremental_key"):
3619
+ raise ValueError(
3620
+ "InfluxDB takes care of incrementality on its own, you should not provide incremental_key"
3621
+ )
3622
+
3623
+ parsed_uri = urlparse(uri)
3624
+ params = parse_qs(parsed_uri.query)
3625
+ host = parsed_uri.hostname
3626
+ port = parsed_uri.port
3627
+
3628
+ secure = params.get("secure", ["true"])[0].lower() != "false"
3629
+ scheme = "https" if secure else "http"
3630
+
3631
+ if port:
3632
+ host_url = f"{scheme}://{host}:{port}"
3633
+ else:
3634
+ host_url = f"{scheme}://{host}"
3635
+
3636
+ token = params.get("token")
3637
+ org = params.get("org")
3638
+ bucket = params.get("bucket")
3639
+
3640
+ if not host:
3641
+ raise MissingValueError("host", "InfluxDB")
3642
+ if not token:
3643
+ raise MissingValueError("token", "InfluxDB")
3644
+ if not org:
3645
+ raise MissingValueError("org", "InfluxDB")
3646
+ if not bucket:
3647
+ raise MissingValueError("bucket", "InfluxDB")
3648
+
3649
+ start_date = kwargs.get("interval_start")
3650
+ if start_date is not None:
3651
+ start_date = ensure_pendulum_datetime(start_date)
3652
+ else:
3653
+ start_date = pendulum.datetime(2024, 1, 1).in_tz("UTC")
3654
+
3655
+ end_date = kwargs.get("interval_end")
3656
+ if end_date is not None:
3657
+ end_date = ensure_pendulum_datetime(end_date)
3658
+
3659
+ from ingestr.src.influxdb import influxdb_source
3660
+
3661
+ return influxdb_source(
3662
+ measurement=table,
3663
+ host=host_url,
3664
+ org=org[0],
3665
+ bucket=bucket[0],
3666
+ token=token[0],
3667
+ secure=secure,
3668
+ start_date=start_date,
3669
+ end_date=end_date,
3670
+ ).with_resources(table)
3671
+
3672
+
3673
+ class WiseSource:
3674
+ def handles_incrementality(self) -> bool:
3675
+ return True
3676
+
3677
+ def dlt_source(self, uri: str, table: str, **kwargs):
3678
+ parsed = urlparse(uri)
3679
+ params = parse_qs(parsed.query)
3680
+ api_key = params.get("api_key")
3681
+
3682
+ if not api_key:
3683
+ raise MissingValueError("api_key", "Wise")
3684
+
3685
+ if table not in ["profiles", "transfers", "balances"]:
3686
+ raise ValueError(
3687
+ f"Resource '{table}' is not supported for Wise source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
3688
+ )
3689
+
3690
+ start_date = kwargs.get("interval_start")
3691
+ if start_date:
3692
+ start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
3693
+ else:
3694
+ start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
3695
+
3696
+ end_date = kwargs.get("interval_end")
3697
+ if end_date:
3698
+ end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
3699
+ else:
3700
+ end_date = None
3701
+
3702
+ from ingestr.src.wise import wise_source
3703
+
3704
+ return wise_source(
3705
+ api_key=api_key[0],
3706
+ start_date=start_date,
3707
+ end_date=end_date,
3708
+ ).with_resources(table)
3709
+
3710
+
3711
+ class FundraiseupSource:
3712
+ def handles_incrementality(self) -> bool:
3713
+ return True
3714
+
3715
+ def dlt_source(self, uri: str, table: str, **kwargs):
3716
+ parsed_uri = urlparse(uri)
3717
+ params = parse_qs(parsed_uri.query)
3718
+
3719
+ api_key = params.get("api_key")
3720
+ if api_key is None:
3721
+ raise MissingValueError("api_key", "Fundraiseup")
3722
+
3723
+ from ingestr.src.fundraiseup import fundraiseup_source
3724
+
3725
+ src = fundraiseup_source(api_key=api_key[0])
3726
+ if table not in src.resources:
3727
+ raise UnsupportedResourceError(table, "Fundraiseup")
3728
+ return src.with_resources(table)
3729
+
3730
+
3731
+ class AnthropicSource:
3732
+ def handles_incrementality(self) -> bool:
3733
+ return True
3734
+
3735
+ def dlt_source(self, uri: str, table: str, **kwargs):
3736
+ # anthropic://?api_key=<admin_api_key>
3737
+ parsed_uri = urlparse(uri)
3738
+ params = parse_qs(parsed_uri.query)
3739
+
3740
+ api_key = params.get("api_key")
3741
+ if api_key is None:
3742
+ raise MissingValueError("api_key", "Anthropic")
3743
+
3744
+ if table not in [
3745
+ "claude_code_usage",
3746
+ "usage_report",
3747
+ "cost_report",
3748
+ "organization",
3749
+ "workspaces",
3750
+ "api_keys",
3751
+ "invites",
3752
+ "users",
3753
+ "workspace_members",
3754
+ ]:
3755
+ raise UnsupportedResourceError(table, "Anthropic")
3756
+
3757
+ # Get start and end dates from kwargs
3758
+ start_date = kwargs.get("interval_start")
3759
+ if start_date:
3760
+ start_date = ensure_pendulum_datetime(start_date)
3761
+ else:
3762
+ # Default to 2023-01-01
3763
+ start_date = pendulum.datetime(2023, 1, 1)
3764
+
3765
+ end_date = kwargs.get("interval_end")
3766
+ if end_date:
3767
+ end_date = ensure_pendulum_datetime(end_date)
3768
+ else:
3769
+ end_date = None
3770
+
3771
+ from ingestr.src.anthropic import anthropic_source
3772
+
3773
+ return anthropic_source(
3774
+ api_key=api_key[0],
3775
+ initial_start_date=start_date,
3776
+ end_date=end_date,
3777
+ ).with_resources(table)
3778
+
3779
+
3780
+ class PlusVibeAISource:
3781
+ resources = [
3782
+ "campaigns",
3783
+ "leads",
3784
+ "email_accounts",
3785
+ "emails",
3786
+ "blocklist",
3787
+ "webhooks",
3788
+ "tags",
3789
+ ]
3790
+
3791
+ def handles_incrementality(self) -> bool:
3792
+ return True
3793
+
3794
+ def dlt_source(self, uri: str, table: str, **kwargs):
3795
+ # plusvibeai://?api_key=<key>&workspace_id=<id>
3796
+ parsed_uri = urlparse(uri)
3797
+ params = parse_qs(parsed_uri.query)
3798
+
3799
+ api_key = params.get("api_key")
3800
+ workspace_id = params.get("workspace_id")
3801
+
3802
+ if not api_key:
3803
+ raise MissingValueError("api_key", "PlusVibeAI")
3804
+
3805
+ if not workspace_id:
3806
+ raise MissingValueError("workspace_id", "PlusVibeAI")
3807
+
3808
+ if table not in self.resources:
3809
+ raise UnsupportedResourceError(table, "PlusVibeAI")
3810
+
3811
+ import dlt
3812
+
3813
+ from ingestr.src.plusvibeai import plusvibeai_source
3814
+
3815
+ dlt.secrets["sources.plusvibeai.api_key"] = api_key[0]
3816
+ dlt.secrets["sources.plusvibeai.workspace_id"] = workspace_id[0]
3817
+
3818
+ # Handle custom base URL if provided
3819
+ base_url = params.get("base_url", ["https://api.plusvibe.ai"])[0]
3820
+ dlt.secrets["sources.plusvibeai.base_url"] = base_url
3821
+
3822
+ src = plusvibeai_source()
3823
+ return src.with_resources(table)
3824
+
3825
+
3826
+ class IntercomSource:
3827
+ def handles_incrementality(self) -> bool:
3828
+ return True
3829
+
3830
+ def dlt_source(self, uri: str, table: str, **kwargs):
3831
+ # intercom://?access_token=<token>&region=<us|eu|au>
3832
+ # OR intercom://?oauth_token=<token>&region=<us|eu|au>
3833
+ parsed_uri = urlparse(uri)
3834
+ params = parse_qs(parsed_uri.query)
3835
+
3836
+ # Check for authentication
3837
+ access_token = params.get("access_token")
3838
+ oauth_token = params.get("oauth_token")
3839
+ region = params.get("region", ["us"])[0]
3840
+
3841
+ if not access_token and not oauth_token:
3842
+ raise MissingValueError("access_token or oauth_token", "Intercom")
3843
+
3844
+ # Validate table/resource
3845
+ supported_tables = [
3846
+ "contacts",
3847
+ "companies",
3848
+ "conversations",
3849
+ "tickets",
3850
+ "tags",
3851
+ "segments",
3852
+ "teams",
3853
+ "admins",
3854
+ "articles",
3855
+ "data_attributes",
3856
+ ]
3857
+
3858
+ if table not in supported_tables:
3859
+ raise UnsupportedResourceError(table, "Intercom")
3860
+
3861
+ # Get date parameters
3862
+ start_date = kwargs.get("interval_start")
3863
+ if start_date:
3864
+ start_date = ensure_pendulum_datetime(start_date)
3865
+ else:
3866
+ start_date = pendulum.datetime(2020, 1, 1)
3867
+
3868
+ end_date = kwargs.get("interval_end")
3869
+ if end_date:
3870
+ end_date = ensure_pendulum_datetime(end_date)
3871
+
3872
+ # Import and initialize the source
3873
+ from ingestr.src.intercom import (
3874
+ IntercomCredentialsAccessToken,
3875
+ IntercomCredentialsOAuth,
3876
+ TIntercomCredentials,
3877
+ intercom_source,
3878
+ )
3879
+
3880
+ credentials: TIntercomCredentials
3881
+ if access_token:
3882
+ credentials = IntercomCredentialsAccessToken(
3883
+ access_token=access_token[0], region=region
3884
+ )
3885
+ else:
3886
+ if not oauth_token:
3887
+ raise MissingValueError("oauth_token", "Intercom")
3888
+ credentials = IntercomCredentialsOAuth(
3889
+ oauth_token=oauth_token[0], region=region
3890
+ )
3891
+
3892
+ return intercom_source(
3893
+ credentials=credentials,
3894
+ start_date=start_date,
3895
+ end_date=end_date,
3896
+ ).with_resources(table)
3897
+
3898
+
3899
+ class HttpSource:
3900
+ """Source for reading CSV, JSON, and Parquet files from HTTP URLs"""
3901
+
3902
+ def handles_incrementality(self) -> bool:
3903
+ return False
3904
+
3905
+ def dlt_source(self, uri: str, table: str, **kwargs):
3906
+ """
3907
+ Create a dlt source for reading files from HTTP URLs.
3908
+
3909
+ URI format: http://example.com/file.csv or https://example.com/file.json
3910
+
3911
+ Args:
3912
+ uri: HTTP(S) URL to the file
3913
+ table: Not used for HTTP source (files are read directly)
3914
+ **kwargs: Additional arguments:
3915
+ - file_format: Optional file format override ('csv', 'json', 'parquet')
3916
+ - chunksize: Number of records to process at once (default varies by format)
3917
+ - merge_key: Merge key for the resource
3918
+
3919
+ Returns:
3920
+ DltResource for the HTTP file
3921
+ """
3922
+ from ingestr.src.http import http_source
3923
+
3924
+        # Strip a duplicated scheme (e.g. "https://https://...") so the real URL remains
3925
+ url = uri
3926
+ if uri.startswith("http://http://") or uri.startswith("https://https://"):
3927
+ url = uri.split("://", 1)[1]
3928
+
3929
+ file_format = kwargs.get("file_format")
3930
+ chunksize = kwargs.get("chunksize")
3931
+ merge_key = kwargs.get("merge_key")
3932
+
3933
+ reader_kwargs = {}
3934
+ if chunksize is not None:
3935
+ reader_kwargs["chunksize"] = chunksize
3936
+
3937
+ source = http_source(url=url, file_format=file_format, **reader_kwargs)
3938
+
3939
+ if merge_key:
3940
+ source.apply_hints(merge_key=merge_key)
3941
+
3942
+ return source
3943
+
3944
+
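The duplicated-scheme handling above can be exercised on its own; this sketch wraps the same check in a throwaway helper (the function name and URLs are illustrative):

def strip_duplicated_scheme(uri: str) -> str:
    # Same condition as HttpSource.dlt_source: only an exactly duplicated
    # scheme (http://http:// or https://https://) is collapsed.
    if uri.startswith("http://http://") or uri.startswith("https://https://"):
        return uri.split("://", 1)[1]
    return uri

print(strip_duplicated_scheme("https://https://example.com/data.csv"))
# https://example.com/data.csv
print(strip_duplicated_scheme("https://example.com/data.csv"))
# https://example.com/data.csv  (unchanged)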
3945
+ class MondaySource:
3946
+ def handles_incrementality(self) -> bool:
3947
+ return False
3948
+
3949
+ def dlt_source(self, uri: str, table: str, **kwargs):
3950
+ parsed_uri = urlparse(uri)
3951
+ query_params = parse_qs(parsed_uri.query)
3952
+ api_token = query_params.get("api_token")
3953
+
3954
+ if api_token is None:
3955
+ raise MissingValueError("api_token", "Monday")
3956
+
3957
+ parts = table.replace(" ", "").split(":")
3958
+ table_name = parts[0]
3959
+ params = parts[1:]
3960
+
3961
+ # Get interval_start and interval_end from kwargs (command line args)
3962
+ interval_start = kwargs.get("interval_start")
3963
+ interval_end = kwargs.get("interval_end")
3964
+
3965
+ # Convert datetime to string format YYYY-MM-DD
3966
+ start_date = interval_start.strftime("%Y-%m-%d") if interval_start else None
3967
+ end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None
3968
+
3969
+ from ingestr.src.monday import monday_source
3970
+
3971
+ try:
3972
+ return monday_source(
3973
+ api_token=api_token[0],
3974
+ params=params,
3975
+ start_date=start_date,
3976
+ end_date=end_date,
3977
+ ).with_resources(table_name)
3978
+ except ResourcesNotFoundError:
3979
+ raise UnsupportedResourceError(table_name, "Monday")
3980
+
3981
+
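The table argument handling above strips spaces and splits on ":"; a quick illustration with a made-up table spec:

# Hypothetical --source-table value.
table = "boards : 123 : 456"
parts = table.replace(" ", "").split(":")
table_name, params = parts[0], parts[1:]
print(table_name, params)  # boards ['123', '456']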
3982
+ class MailchimpSource:
3983
+ def handles_incrementality(self) -> bool:
3984
+ return False
3985
+
3986
+ def dlt_source(self, uri: str, table: str, **kwargs):
3987
+ parsed_uri = urlparse(uri)
3988
+ query_params = parse_qs(parsed_uri.query)
3989
+ api_key = query_params.get("api_key")
3990
+ server = query_params.get("server")
3991
+
3992
+ if api_key is None:
3993
+ raise MissingValueError("api_key", "Mailchimp")
3994
+ if server is None:
3995
+ raise MissingValueError("server", "Mailchimp")
3996
+
3997
+ from ingestr.src.mailchimp import mailchimp_source
3998
+
3999
+ try:
4000
+ return mailchimp_source(
4001
+ api_key=api_key[0],
4002
+ server=server[0],
4003
+ ).with_resources(table)
4004
+ except ResourcesNotFoundError:
4005
+ raise UnsupportedResourceError(table, "Mailchimp")
4006
+
4007
+
4008
+ class AlliumSource:
4009
+ def handles_incrementality(self) -> bool:
4010
+ return False
4011
+
4012
+ def dlt_source(self, uri: str, table: str, **kwargs):
4013
+ parsed_uri = urlparse(uri)
4014
+ query_params = parse_qs(parsed_uri.query)
4015
+ api_key = query_params.get("api_key")
4016
+
4017
+ if api_key is None:
4018
+ raise MissingValueError("api_key", "Allium")
4019
+
4020
+ # Extract query_id and custom parameters from table parameter
4021
+ # Format: query_id or query:query_id or query:query_id:param1=value1&param2=value2
4022
+ query_id = table
4023
+ custom_params = {}
4024
+ limit = None
4025
+ compute_profile = None
4026
+
4027
+ if ":" in table:
4028
+ parts = table.split(":", 2) # Split into max 3 parts
4029
+ if len(parts) >= 2:
4030
+ query_id = parts[1]
4031
+ if len(parts) == 3:
4032
+ # Parse custom parameters from query string format
4033
+ param_string = parts[2]
4034
+ for param in param_string.split("&"):
4035
+ if "=" in param:
4036
+ key, value = param.split("=", 1)
4037
+ # Extract run_config parameters
4038
+ if key == "limit":
4039
+ limit = int(value)
4040
+ elif key == "compute_profile":
4041
+ compute_profile = value
4042
+ else:
4043
+ custom_params[key] = value
4044
+
4045
+ # Extract parameters from interval_start and interval_end
4046
+ # Default: 2 days ago 00:00 to yesterday 00:00
4047
+ now = pendulum.now()
4048
+ default_start = now.subtract(days=2).start_of("day")
4049
+ default_end = now.subtract(days=1).start_of("day")
4050
+
4051
+ parameters = {}
4052
+ interval_start = kwargs.get("interval_start")
4053
+ interval_end = kwargs.get("interval_end")
4054
+
4055
+ start_date = interval_start if interval_start is not None else default_start
4056
+ end_date = interval_end if interval_end is not None else default_end
4057
+
4058
+ parameters["start_date"] = start_date.strftime("%Y-%m-%d")
4059
+ parameters["end_date"] = end_date.strftime("%Y-%m-%d")
4060
+ parameters["start_timestamp"] = str(int(start_date.timestamp()))
4061
+ parameters["end_timestamp"] = str(int(end_date.timestamp()))
4062
+
4063
+ # Merge custom parameters (they override default parameters)
4064
+ parameters.update(custom_params)
4065
+
4066
+ from ingestr.src.allium import allium_source
4067
+
4068
+ return allium_source(
4069
+ api_key=api_key[0],
4070
+ query_id=query_id,
4071
+ parameters=parameters if parameters else None,
4072
+ limit=limit,
4073
+ compute_profile=compute_profile,
4074
+ )
4075
+
4076
+
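A standalone trace of the table-spec parsing above; the query id, limit, compute profile and the extra "chain" parameter are invented for illustration:

# Hypothetical spec: query:<query_id>:<run-config options>
table = "query:abc123:limit=500&compute_profile=XL&chain=ethereum"

query_id, custom_params, limit, compute_profile = table, {}, None, None
if ":" in table:
    parts = table.split(":", 2)  # at most 3 parts
    if len(parts) >= 2:
        query_id = parts[1]
    if len(parts) == 3:
        for param in parts[2].split("&"):
            if "=" in param:
                key, value = param.split("=", 1)
                if key == "limit":
                    limit = int(value)
                elif key == "compute_profile":
                    compute_profile = value
                else:
                    custom_params[key] = value

print(query_id, limit, compute_profile, custom_params)
# abc123 500 XL {'chain': 'ethereum'}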
4077
+ class CouchbaseSource:
4078
+ table_builder: Callable
4079
+
4080
+ def __init__(self, table_builder=None) -> None:
4081
+ if table_builder is None:
4082
+ from ingestr.src.couchbase_source import couchbase_collection
4083
+
4084
+ table_builder = couchbase_collection
4085
+
4086
+ self.table_builder = table_builder
4087
+
4088
+ def handles_incrementality(self) -> bool:
4089
+ return False
4090
+
4091
+ def dlt_source(self, uri: str, table: str, **kwargs):
4092
+ """
4093
+ Create a dlt source for reading data from Couchbase.
4094
+
4095
+ URI formats:
4096
+ - couchbase://username:password@host
4097
+ - couchbase://username:password@host/bucket
4098
+ - couchbase://username:password@host?ssl=true
4099
+ - couchbases://username:password@host (SSL enabled)
4100
+
4101
+ Table formats:
4102
+ - bucket.scope.collection (when bucket not in URI)
4103
+ - scope.collection (when bucket specified in URI path)
4104
+
4105
+ Note: If password contains special characters (@, :, /, etc.), they must be URL-encoded.
4106
+
4107
+ Examples:
4108
+ Local/Self-hosted:
4109
+ - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
4110
+ - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
4111
+ - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"
4112
+
4113
+ Capella (Cloud):
4114
+ - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
4115
+ - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"
4116
+
4117
+ To encode password in Python:
4118
+ from urllib.parse import quote
4119
+ encoded_pwd = quote("MyPass@123!", safe='')
4120
+ uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"
4121
+
4122
+ Args:
4123
+ uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
4124
+ table: Format depends on URI:
4125
+ - bucket.scope.collection (if bucket not in URI)
4126
+ - scope.collection (if bucket in URI path)
4127
+ **kwargs: Additional arguments:
4128
+ - limit: Maximum number of documents to fetch
4129
+ - incremental_key: Field to use for incremental loading
4130
+ - interval_start: Start value for incremental loading
4131
+ - interval_end: End value for incremental loading
4132
+
4133
+ Returns:
4134
+ DltResource for the Couchbase collection
4135
+ """
4136
+ # Parse the URI to extract connection details
4137
+        # Note: urlparse leaves userinfo percent-encoded, so the password is decoded explicitly below
4138
+
4139
+ parsed = urlparse(uri)
4140
+
4141
+ # Extract username and password from URI
4142
+        # unquote() reverses the percent-encoding in the password (urlparse returns it raw)
4143
+ from urllib.parse import unquote
4144
+
4145
+ username = parsed.username
4146
+ password = unquote(parsed.password) if parsed.password else None
4147
+
4148
+ if not username or not password:
4149
+ raise ValueError(
4150
+ "Username and password must be provided in the URI.\n"
4151
+ "Format: couchbase://username:password@host\n"
4152
+ "If password has special characters (@, :, /), URL-encode them.\n"
4153
+ "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
4154
+ )
4155
+
4156
+ # Reconstruct connection string without credentials
4157
+ scheme = parsed.scheme
4158
+ netloc = parsed.netloc
4159
+
4160
+ # Remove username:password@ from netloc if present
4161
+ if "@" in netloc:
4162
+ netloc = netloc.split("@", 1)[1]
4163
+
4164
+ # Parse query parameters from URI
4165
+ from urllib.parse import parse_qs
4166
+
4167
+ query_params = parse_qs(parsed.query)
4168
+
4169
+ # Check if SSL is requested via URI query parameter (?ssl=true)
4170
+ if "ssl" in query_params:
4171
+ ssl_value = query_params["ssl"][0].lower()
4172
+ use_ssl = ssl_value in ("true", "1", "yes")
4173
+
4174
+ # Apply SSL scheme based on parameter
4175
+ if use_ssl and scheme == "couchbase":
4176
+ scheme = "couchbases"
4177
+
4178
+ connection_string = f"{scheme}://{netloc}"
4179
+
4180
+ # Extract bucket from URI path if present (e.g., couchbase://host/bucket)
4181
+ bucket_from_uri = None
4182
+ if parsed.path and parsed.path.strip("/"):
4183
+ bucket_from_uri = parsed.path.strip("/").split("/")[0]
4184
+
4185
+ # Parse table format: can be "scope.collection" or "bucket.scope.collection"
4186
+ table_parts = table.split(".")
4187
+
4188
+ if len(table_parts) == 3:
4189
+ # Format: bucket.scope.collection
4190
+ bucket, scope, collection = table_parts
4191
+ elif len(table_parts) == 2:
4192
+ # Format: scope.collection (bucket from URI)
4193
+ if bucket_from_uri:
4194
+ bucket = bucket_from_uri
4195
+ scope, collection = table_parts
4196
+ else:
4197
+ raise ValueError(
4198
+ "Table format is 'scope.collection' but no bucket specified in URI.\n"
4199
+ f"Either use URI format: couchbase://user:pass@host/bucket\n"
4200
+ f"Or use table format: bucket.scope.collection\n"
4201
+ f"Got table: {table}"
4202
+ )
4203
+ else:
4204
+ raise ValueError(
4205
+ "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
4206
+ f"Got: {table}\n"
4207
+ "Examples:\n"
4208
+ " - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
4209
+ " - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
4210
+ )
4211
+
4212
+ # Handle incremental loading
4213
+ incremental = None
4214
+ if kwargs.get("incremental_key"):
4215
+ start_value = kwargs.get("interval_start")
4216
+ end_value = kwargs.get("interval_end")
4217
+
4218
+ incremental = dlt_incremental(
4219
+ kwargs.get("incremental_key", ""),
4220
+ initial_value=start_value,
4221
+ end_value=end_value,
4222
+ range_end="closed",
4223
+ range_start="closed",
4224
+ )
4225
+
4226
+ # Get optional parameters
4227
+ limit = kwargs.get("limit")
4228
+
4229
+ table_instance = self.table_builder(
4230
+ connection_string=connection_string,
4231
+ username=username,
4232
+ password=password,
4233
+ bucket=bucket,
4234
+ scope=scope,
4235
+ collection=collection,
4236
+ incremental=incremental,
4237
+ limit=limit,
4238
+ )
4239
+ table_instance.max_table_nesting = 1
4240
+
4241
+ return table_instance
4242
+
4243
+
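Tying the docstring together, a short sketch of percent-encoding a password, reading it back with urlparse/unquote, and splitting a scope.collection table; host, credentials and names are placeholders:

from urllib.parse import quote, urlparse, unquote

encoded_pwd = quote("MyPass@123!", safe="")
uri = f"couchbase://admin:{encoded_pwd}@localhost/travel-sample?ssl=true"

parsed = urlparse(uri)
print(parsed.username, unquote(parsed.password))  # admin MyPass@123!
print(parsed.path.strip("/"))                     # travel-sample (bucket taken from the URI path)

table = "inventory.airport"                       # scope.collection form
scope, collection = table.split(".")
print(scope, collection)                          # inventory airport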
4244
+ class CursorSource:
4245
+ resources = [
4246
+ "team_members",
4247
+ "daily_usage_data",
4248
+ "team_spend",
4249
+ "filtered_usage_events",
4250
+ ]
4251
+
4252
+ def handles_incrementality(self) -> bool:
4253
+ return True
4254
+
4255
+ def dlt_source(self, uri: str, table: str, **kwargs):
4256
+ # cursor://?api_key=<api_key>
4257
+ parsed_uri = urlparse(uri)
4258
+ params = parse_qs(parsed_uri.query)
4259
+
4260
+ api_key = params.get("api_key")
4261
+
4262
+ if not api_key:
4263
+ raise MissingValueError("api_key", "Cursor")
4264
+
4265
+ if table not in self.resources:
4266
+ raise UnsupportedResourceError(table, "Cursor")
4267
+
4268
+ import dlt
4269
+
4270
+ from ingestr.src.cursor import cursor_source
4271
+
4272
+ dlt.secrets["sources.cursor.api_key"] = api_key[0]
4273
+
4274
+ # Handle interval_start and interval_end for daily_usage_data and filtered_usage_events (optional)
4275
+ if table in ["daily_usage_data", "filtered_usage_events"]:
4276
+ interval_start = kwargs.get("interval_start")
4277
+ interval_end = kwargs.get("interval_end")
4278
+
4279
+            # Both bounds are optional; the date window is applied only when both are provided
4280
+ if interval_start is not None and interval_end is not None:
4281
+ # Convert datetime to epoch milliseconds
4282
+ start_ms = int(interval_start.timestamp() * 1000)
4283
+ end_ms = int(interval_end.timestamp() * 1000)
4284
+
4285
+ dlt.config["sources.cursor.start_date"] = start_ms
4286
+ dlt.config["sources.cursor.end_date"] = end_ms
4287
+
4288
+ src = cursor_source()
4289
+ return src.with_resources(table)
4290
+
4291
+
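The epoch-millisecond conversion applied to the usage tables above, sketched with stdlib datetimes; the dates are arbitrary examples:

from datetime import datetime, timezone

interval_start = datetime(2024, 1, 1, tzinfo=timezone.utc)   # example lower bound
interval_end = datetime(2024, 1, 31, tzinfo=timezone.utc)    # example upper bound

start_ms = int(interval_start.timestamp() * 1000)
end_ms = int(interval_end.timestamp() * 1000)
print(start_ms, end_ms)  # 1704067200000 1706659200000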
4292
+ class SocrataSource:
4293
+ def handles_incrementality(self) -> bool:
4294
+ return False
4295
+
4296
+ def dlt_source(self, uri: str, table: str, **kwargs):
4297
+ """
4298
+ Creates a DLT source for Socrata open data platform.
4299
+
4300
+ URI format: socrata://domain?app_token=TOKEN
4301
+ Table: dataset_id (e.g., "6udu-fhnu")
4302
+
4303
+ Args:
4304
+ uri: Socrata connection URI with domain and optional auth params
4305
+ table: Dataset ID (e.g., "6udu-fhnu")
4306
+ **kwargs: Additional arguments:
4307
+ - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
4308
+ - interval_start: Start date for initial load
4309
+ - interval_end: End date for load
4310
+ - primary_key: Primary key field for merge operations
4311
+
4312
+ Returns:
4313
+ DltResource for the Socrata dataset
4314
+ """
4315
+ from urllib.parse import parse_qs, urlparse
4316
+
4317
+ parsed = urlparse(uri)
4318
+
4319
+ domain = parsed.netloc
4320
+ if not domain:
4321
+ raise ValueError(
4322
+ "Domain must be provided in the URI.\n"
4323
+ "Format: socrata://domain?app_token=TOKEN\n"
4324
+ "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
4325
+ )
4326
+
4327
+ query_params = parse_qs(parsed.query)
4328
+
4329
+ dataset_id = table
4330
+ if not dataset_id:
4331
+ raise ValueError(
4332
+ "Dataset ID must be provided as the table parameter.\n"
4333
+ "Example: --source-table 6udu-fhnu"
4334
+ )
4335
+
4336
+ app_token = query_params.get("app_token", [None])[0]
4337
+ username = query_params.get("username", [None])[0]
4338
+ password = query_params.get("password", [None])[0]
4339
+
4340
+ incremental = None
4341
+ if kwargs.get("incremental_key"):
4342
+ start_value = kwargs.get("interval_start")
4343
+ end_value = kwargs.get("interval_end")
4344
+
4345
+ if start_value:
4346
+ start_value = (
4347
+ start_value.isoformat()
4348
+ if hasattr(start_value, "isoformat")
4349
+ else str(start_value)
4350
+ )
4351
+
4352
+ if end_value:
4353
+ end_value = (
4354
+ end_value.isoformat()
4355
+ if hasattr(end_value, "isoformat")
4356
+ else str(end_value)
4357
+ )
4358
+
4359
+ incremental = dlt_incremental(
4360
+ kwargs.get("incremental_key", ""),
4361
+ initial_value=start_value,
4362
+ end_value=end_value,
4363
+ range_end="open",
4364
+ range_start="closed",
4365
+ )
4366
+
4367
+ primary_key = kwargs.get("primary_key")
4368
+
4369
+ from ingestr.src.socrata_source import source
4370
+
4371
+ return source(
4372
+ domain=domain,
4373
+ dataset_id=dataset_id,
4374
+ app_token=app_token,
4375
+ username=username,
4376
+ password=password,
4377
+ incremental=incremental,
4378
+ primary_key=primary_key,
4379
+ ).with_resources("dataset")
4380
+
4381
+
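A minimal sketch of the ISO-8601 normalization applied to the incremental bounds above; the helper name and values are illustrative:

from datetime import datetime

def to_iso(value):
    # Same fallback as above: use isoformat() when available, otherwise str().
    return value.isoformat() if hasattr(value, "isoformat") else str(value)

print(to_iso(datetime(2024, 5, 1)))  # 2024-05-01T00:00:00
print(to_iso("2024-05-01"))          # 2024-05-01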
4382
+ class HostawaySource:
4383
+ def handles_incrementality(self) -> bool:
4384
+ return True
4385
+
4386
+ def dlt_source(self, uri: str, table: str, **kwargs):
4387
+ if kwargs.get("incremental_key"):
4388
+ raise ValueError(
4389
+ "Hostaway takes care of incrementality on its own, you should not provide incremental_key"
4390
+ )
4391
+
4392
+ source_parts = urlparse(uri)
4393
+ source_params = parse_qs(source_parts.query)
4394
+ api_key = source_params.get("api_key")
4395
+
4396
+ if not api_key:
4397
+ raise ValueError("api_key in the URI is required to connect to Hostaway")
4398
+
4399
+ match table:
4400
+ case "listings":
4401
+ resource_name = "listings"
4402
+ case "listing_fee_settings":
4403
+ resource_name = "listing_fee_settings"
4404
+ case "listing_agreements":
4405
+ resource_name = "listing_agreements"
4406
+ case "listing_pricing_settings":
4407
+ resource_name = "listing_pricing_settings"
4408
+ case "cancellation_policies":
4409
+ resource_name = "cancellation_policies"
4410
+ case "cancellation_policies_airbnb":
4411
+ resource_name = "cancellation_policies_airbnb"
4412
+ case "cancellation_policies_marriott":
4413
+ resource_name = "cancellation_policies_marriott"
4414
+ case "cancellation_policies_vrbo":
4415
+ resource_name = "cancellation_policies_vrbo"
4416
+ case "reservations":
4417
+ resource_name = "reservations"
4418
+ case "finance_fields":
4419
+ resource_name = "finance_fields"
4420
+ case "reservation_payment_methods":
4421
+ resource_name = "reservation_payment_methods"
4422
+ case "reservation_rental_agreements":
4423
+ resource_name = "reservation_rental_agreements"
4424
+ case "listing_calendars":
4425
+ resource_name = "listing_calendars"
4426
+ case "conversations":
4427
+ resource_name = "conversations"
4428
+ case "message_templates":
4429
+ resource_name = "message_templates"
4430
+ case "bed_types":
4431
+ resource_name = "bed_types"
4432
+ case "property_types":
4433
+ resource_name = "property_types"
4434
+ case "countries":
4435
+ resource_name = "countries"
4436
+ case "account_tax_settings":
4437
+ resource_name = "account_tax_settings"
4438
+ case "user_groups":
4439
+ resource_name = "user_groups"
4440
+ case "guest_payment_charges":
4441
+ resource_name = "guest_payment_charges"
4442
+ case "coupons":
4443
+ resource_name = "coupons"
4444
+ case "webhook_reservations":
4445
+ resource_name = "webhook_reservations"
4446
+ case "tasks":
4447
+ resource_name = "tasks"
4448
+ case _:
4449
+ raise ValueError(
4450
+ f"Resource '{table}' is not supported for Hostaway source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
4451
+ )
4452
+
4453
+ start_date = kwargs.get("interval_start")
4454
+ if start_date:
4455
+ start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
4456
+ else:
4457
+ start_date = pendulum.datetime(1970, 1, 1).in_timezone("UTC")
4458
+
4459
+ end_date = kwargs.get("interval_end")
4460
+ if end_date:
4461
+ end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
4462
+
4463
+ from ingestr.src.hostaway import hostaway_source
4464
+
4465
+ return hostaway_source(
4466
+ api_key=api_key[0],
4467
+ start_date=start_date,
4468
+ end_date=end_date,
4469
+ ).with_resources(resource_name)
4470
+
4471
+
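The start-date fallback above, shown in isolation with pendulum (already used by this module); the Unix-epoch default matches the code:

import pendulum

# With no interval_start supplied, the source falls back to the Unix epoch in UTC.
start_date = pendulum.datetime(1970, 1, 1).in_timezone("UTC")
print(start_date.isoformat())  # 1970-01-01T00:00:00+00:00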
4472
+ class SnapchatAdsSource:
4473
+ resources = [
4474
+ "organizations",
4475
+ "fundingsources",
4476
+ "billingcenters",
4477
+ "adaccounts",
4478
+ "invoices",
4479
+ "transactions",
4480
+ "members",
4481
+ "roles",
4482
+ "campaigns",
4483
+ "adsquads",
4484
+ "ads",
4485
+ "event_details",
4486
+ "creatives",
4487
+ "segments",
4488
+ "campaigns_stats",
4489
+ "ad_accounts_stats",
4490
+ "ads_stats",
4491
+ "ad_squads_stats",
4492
+ ]
4493
+
4494
+ def handles_incrementality(self) -> bool:
4495
+ return True
4496
+
4497
+ def dlt_source(self, uri: str, table: str, **kwargs):
4498
+ parsed_uri = urlparse(uri)
4499
+ source_fields = parse_qs(parsed_uri.query)
4500
+
4501
+ refresh_token = source_fields.get("refresh_token")
4502
+ if not refresh_token:
4503
+ raise ValueError("refresh_token is required to connect to Snapchat Ads")
4504
+
4505
+ client_id = source_fields.get("client_id")
4506
+ if not client_id:
4507
+ raise ValueError("client_id is required to connect to Snapchat Ads")
4508
+
4509
+ client_secret = source_fields.get("client_secret")
4510
+ if not client_secret:
4511
+ raise ValueError("client_secret is required to connect to Snapchat Ads")
4512
+
4513
+ organization_id = source_fields.get("organization_id")
4514
+
4515
+ # Resources that support ad_account_id filtering
4516
+ ad_account_resources = [
4517
+ "invoices",
4518
+ "campaigns",
4519
+ "adsquads",
4520
+ "ads",
4521
+ "event_details",
4522
+ "creatives",
4523
+ "segments",
4524
+ ]
4525
+
4526
+ # Stats resources
4527
+ stats_resources = [
4528
+ "campaigns_stats",
4529
+ "ad_accounts_stats",
4530
+ "ads_stats",
4531
+ "ad_squads_stats",
4532
+ ]
4533
+
4534
+ # Parse table name
4535
+ stats_config = None
4536
+ ad_account_id = None
4537
+
4538
+ if ":" in table:
4539
+ parts = table.split(":")
4540
+ resource_name = parts[0]
4541
+
4542
+ if resource_name in stats_resources:
4543
+ # Stats table format:
4544
+ # resource_name:granularity:fields:options (all accounts)
4545
+ # resource_name:ad_account_id:granularity:fields:options (specific account)
4546
+
4547
+ def parse_options(options_str: str) -> dict:
4548
+ """Parse key=value,key=value options string."""
4549
+ result = {}
4550
+ for option in options_str.split(","):
4551
+ if "=" in option:
4552
+ key, value = option.split("=", 1)
4553
+ result[key] = value
4554
+ return result
4555
+
4556
+ if len(parts) >= 2:
4557
+ valid_granularities = ["TOTAL", "DAY", "HOUR", "LIFETIME"]
4558
+
4559
+ if parts[1].upper() in valid_granularities:
4560
+ # Format: resource_name:granularity:fields:options
4561
+ stats_config = {
4562
+ "granularity": parts[1].upper(),
4563
+ "fields": parts[2]
4564
+ if len(parts) > 2
4565
+ else "impressions,spend",
4566
+ }
4567
+ if len(parts) > 3:
4568
+ stats_config.update(parse_options(parts[3]))
4569
+ else:
4570
+ # Format: resource_name:ad_account_id:granularity:fields:options
4571
+ ad_account_id = parts[1]
4572
+ stats_config = {
4573
+ "granularity": parts[2].upper()
4574
+ if len(parts) > 2
4575
+ else "DAY",
4576
+ "fields": parts[3]
4577
+ if len(parts) > 3
4578
+ else "impressions,spend",
4579
+ }
4580
+ if len(parts) > 4:
4581
+ stats_config.update(parse_options(parts[4]))
4582
+ else:
4583
+ # Just resource_name, use defaults
4584
+ stats_config = {
4585
+ "granularity": "DAY",
4586
+ "fields": "impressions,spend",
4587
+ }
4588
+ else:
4589
+ # Non-stats table with ad_account_id: resource_name:ad_account_id
4590
+ ad_account_id = parts[1] if len(parts) > 1 else None
4591
+ if not ad_account_id:
4592
+ raise ValueError(
4593
+ f"ad_account_id must be provided in format '{resource_name}:ad_account_id'"
4594
+ )
4595
+ else:
4596
+ resource_name = table
4597
+ if resource_name in stats_resources:
4598
+ # Stats resource with default config
4599
+ stats_config = {
4600
+ "granularity": "DAY",
4601
+ "fields": "impressions,spend",
4602
+ }
4603
+
4604
+ # Validation for non-stats resources
4605
+ if resource_name not in stats_resources:
4606
+ account_id_required = (
4607
+ resource_name in ad_account_resources
4608
+ and ad_account_id is None
4609
+ and not organization_id
4610
+ )
4611
+ if account_id_required:
4612
+ raise ValueError(
4613
+ f"organization_id is required for '{resource_name}' table when no specific ad_account_id is provided"
4614
+ )
4615
+
4616
+ if not organization_id and table != "organizations":
4617
+ raise ValueError(
4618
+ f"organization_id is required for table '{table}'. Only 'organizations' table does not require organization_id."
4619
+ )
4620
+ else:
4621
+ # Stats resources need either ad_account_id or organization_id
4622
+ if not ad_account_id and not organization_id:
4623
+ raise ValueError(
4624
+ f"organization_id is required for '{resource_name}' when ad_account_id is not provided"
4625
+ )
4626
+
4627
+ if resource_name not in self.resources:
4628
+ raise UnsupportedResourceError(table, "Snapchat Ads")
4629
+
4630
+ from ingestr.src.snapchat_ads import snapchat_ads_source
4631
+
4632
+ source_kwargs: dict[str, Any] = {
4633
+ "refresh_token": refresh_token[0],
4634
+ "client_id": client_id[0],
4635
+ "client_secret": client_secret[0],
4636
+ }
4637
+
4638
+ if organization_id:
4639
+ source_kwargs["organization_id"] = organization_id[0]
4640
+
4641
+ if ad_account_id:
4642
+ source_kwargs["ad_account_id"] = ad_account_id
4643
+
4644
+ # Add interval_start and interval_end for client-side filtering
4645
+ interval_start = kwargs.get("interval_start")
4646
+ if interval_start:
4647
+ source_kwargs["start_date"] = interval_start
4648
+
4649
+ interval_end = kwargs.get("interval_end")
4650
+ if interval_end:
4651
+ source_kwargs["end_date"] = interval_end
4652
+
4653
+ # Add stats_config for stats resource
4654
+ if stats_config:
4655
+ source_kwargs["stats_config"] = stats_config
4656
+
4657
+ source = snapchat_ads_source(**source_kwargs)
4658
+
4659
+ return source.with_resources(resource_name)
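
A standalone decoder for the colon-separated stats table spec handled above; the parsing rules mirror the code, while the account id and the "report_dimension" option are placeholders:

def parse_stats_table(table: str):
    valid_granularities = ["TOTAL", "DAY", "HOUR", "LIFETIME"]

    def parse_options(options_str: str) -> dict:
        # key=value,key=value -> dict
        result = {}
        for option in options_str.split(","):
            if "=" in option:
                key, value = option.split("=", 1)
                result[key] = value
        return result

    parts = table.split(":")
    resource_name, ad_account_id = parts[0], None
    config = {"granularity": "DAY", "fields": "impressions,spend"}

    if len(parts) >= 2:
        if parts[1].upper() in valid_granularities:
            # resource:granularity[:fields[:options]]
            config["granularity"] = parts[1].upper()
            if len(parts) > 2:
                config["fields"] = parts[2]
            if len(parts) > 3:
                config.update(parse_options(parts[3]))
        else:
            # resource:ad_account_id[:granularity[:fields[:options]]]
            ad_account_id = parts[1]
            if len(parts) > 2:
                config["granularity"] = parts[2].upper()
            if len(parts) > 3:
                config["fields"] = parts[3]
            if len(parts) > 4:
                config.update(parse_options(parts[4]))
    return resource_name, ad_account_id, config

# Hypothetical spec: the account id and the report_dimension option are made up.
print(parse_stats_table("campaigns_stats:acc-123:DAY:impressions,spend:report_dimension=country"))
# ('campaigns_stats', 'acc-123',
#  {'granularity': 'DAY', 'fields': 'impressions,spend', 'report_dimension': 'country'})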