ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin_max/__init__.py +6 -4
  10. ingestr/src/appsflyer/__init__.py +325 -0
  11. ingestr/src/appsflyer/client.py +49 -45
  12. ingestr/src/appstore/__init__.py +1 -0
  13. ingestr/src/arrow/__init__.py +9 -1
  14. ingestr/src/asana_source/__init__.py +1 -1
  15. ingestr/src/attio/__init__.py +102 -0
  16. ingestr/src/attio/helpers.py +65 -0
  17. ingestr/src/blob.py +37 -10
  18. ingestr/src/buildinfo.py +1 -1
  19. ingestr/src/chess/__init__.py +1 -1
  20. ingestr/src/clickup/__init__.py +85 -0
  21. ingestr/src/clickup/helpers.py +47 -0
  22. ingestr/src/collector/spinner.py +43 -0
  23. ingestr/src/couchbase_source/__init__.py +118 -0
  24. ingestr/src/couchbase_source/helpers.py +135 -0
  25. ingestr/src/cursor/__init__.py +83 -0
  26. ingestr/src/cursor/helpers.py +188 -0
  27. ingestr/src/destinations.py +508 -27
  28. ingestr/src/docebo/__init__.py +589 -0
  29. ingestr/src/docebo/client.py +435 -0
  30. ingestr/src/docebo/helpers.py +97 -0
  31. ingestr/src/elasticsearch/__init__.py +80 -0
  32. ingestr/src/elasticsearch/helpers.py +138 -0
  33. ingestr/src/errors.py +8 -0
  34. ingestr/src/facebook_ads/__init__.py +47 -28
  35. ingestr/src/facebook_ads/helpers.py +59 -37
  36. ingestr/src/facebook_ads/settings.py +2 -0
  37. ingestr/src/facebook_ads/utils.py +39 -0
  38. ingestr/src/factory.py +107 -2
  39. ingestr/src/filesystem/__init__.py +8 -3
  40. ingestr/src/filters.py +46 -3
  41. ingestr/src/fluxx/__init__.py +9906 -0
  42. ingestr/src/fluxx/helpers.py +209 -0
  43. ingestr/src/frankfurter/__init__.py +157 -0
  44. ingestr/src/frankfurter/helpers.py +48 -0
  45. ingestr/src/freshdesk/__init__.py +89 -0
  46. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  47. ingestr/src/freshdesk/settings.py +9 -0
  48. ingestr/src/fundraiseup/__init__.py +95 -0
  49. ingestr/src/fundraiseup/client.py +81 -0
  50. ingestr/src/github/__init__.py +41 -6
  51. ingestr/src/github/helpers.py +5 -5
  52. ingestr/src/google_analytics/__init__.py +22 -4
  53. ingestr/src/google_analytics/helpers.py +124 -6
  54. ingestr/src/google_sheets/__init__.py +4 -4
  55. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  56. ingestr/src/hostaway/__init__.py +302 -0
  57. ingestr/src/hostaway/client.py +288 -0
  58. ingestr/src/http/__init__.py +35 -0
  59. ingestr/src/http/readers.py +114 -0
  60. ingestr/src/http_client.py +24 -0
  61. ingestr/src/hubspot/__init__.py +66 -23
  62. ingestr/src/hubspot/helpers.py +52 -22
  63. ingestr/src/hubspot/settings.py +14 -7
  64. ingestr/src/influxdb/__init__.py +46 -0
  65. ingestr/src/influxdb/client.py +34 -0
  66. ingestr/src/intercom/__init__.py +142 -0
  67. ingestr/src/intercom/helpers.py +674 -0
  68. ingestr/src/intercom/settings.py +279 -0
  69. ingestr/src/isoc_pulse/__init__.py +159 -0
  70. ingestr/src/jira_source/__init__.py +340 -0
  71. ingestr/src/jira_source/helpers.py +439 -0
  72. ingestr/src/jira_source/settings.py +170 -0
  73. ingestr/src/kafka/__init__.py +4 -1
  74. ingestr/src/kinesis/__init__.py +139 -0
  75. ingestr/src/kinesis/helpers.py +82 -0
  76. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  77. ingestr/src/linear/__init__.py +634 -0
  78. ingestr/src/linear/helpers.py +111 -0
  79. ingestr/src/linkedin_ads/helpers.py +0 -1
  80. ingestr/src/mailchimp/__init__.py +126 -0
  81. ingestr/src/mailchimp/helpers.py +226 -0
  82. ingestr/src/mailchimp/settings.py +164 -0
  83. ingestr/src/masking.py +344 -0
  84. ingestr/src/mixpanel/__init__.py +62 -0
  85. ingestr/src/mixpanel/client.py +99 -0
  86. ingestr/src/monday/__init__.py +246 -0
  87. ingestr/src/monday/helpers.py +392 -0
  88. ingestr/src/monday/settings.py +328 -0
  89. ingestr/src/mongodb/__init__.py +72 -8
  90. ingestr/src/mongodb/helpers.py +915 -38
  91. ingestr/src/partition.py +32 -0
  92. ingestr/src/phantombuster/__init__.py +65 -0
  93. ingestr/src/phantombuster/client.py +87 -0
  94. ingestr/src/pinterest/__init__.py +82 -0
  95. ingestr/src/pipedrive/__init__.py +198 -0
  96. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  97. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  98. ingestr/src/pipedrive/helpers/pages.py +115 -0
  99. ingestr/src/pipedrive/settings.py +27 -0
  100. ingestr/src/pipedrive/typing.py +3 -0
  101. ingestr/src/plusvibeai/__init__.py +335 -0
  102. ingestr/src/plusvibeai/helpers.py +544 -0
  103. ingestr/src/plusvibeai/settings.py +252 -0
  104. ingestr/src/quickbooks/__init__.py +117 -0
  105. ingestr/src/resource.py +40 -0
  106. ingestr/src/revenuecat/__init__.py +83 -0
  107. ingestr/src/revenuecat/helpers.py +237 -0
  108. ingestr/src/salesforce/__init__.py +15 -8
  109. ingestr/src/shopify/__init__.py +1 -17
  110. ingestr/src/smartsheets/__init__.py +82 -0
  111. ingestr/src/snapchat_ads/__init__.py +489 -0
  112. ingestr/src/snapchat_ads/client.py +72 -0
  113. ingestr/src/snapchat_ads/helpers.py +535 -0
  114. ingestr/src/socrata_source/__init__.py +83 -0
  115. ingestr/src/socrata_source/helpers.py +85 -0
  116. ingestr/src/socrata_source/settings.py +8 -0
  117. ingestr/src/solidgate/__init__.py +219 -0
  118. ingestr/src/solidgate/helpers.py +154 -0
  119. ingestr/src/sources.py +2933 -245
  120. ingestr/src/stripe_analytics/__init__.py +49 -21
  121. ingestr/src/stripe_analytics/helpers.py +286 -1
  122. ingestr/src/stripe_analytics/settings.py +62 -10
  123. ingestr/src/telemetry/event.py +10 -9
  124. ingestr/src/tiktok_ads/__init__.py +12 -6
  125. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  126. ingestr/src/trustpilot/__init__.py +48 -0
  127. ingestr/src/trustpilot/client.py +48 -0
  128. ingestr/src/wise/__init__.py +68 -0
  129. ingestr/src/wise/client.py +63 -0
  130. ingestr/src/zoom/__init__.py +99 -0
  131. ingestr/src/zoom/helpers.py +102 -0
  132. ingestr/tests/unit/test_smartsheets.py +133 -0
  133. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
  134. ingestr-0.14.104.dist-info/RECORD +203 -0
  135. ingestr/src/appsflyer/_init_.py +0 -24
  136. ingestr-0.13.13.dist-info/RECORD +0 -115
  137. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  138. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  139. {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sources.py CHANGED
@@ -3,6 +3,7 @@ import csv
  import json
  import os
  import re
+ import sys
  import tempfile
  from datetime import date, datetime, timedelta, timezone
  from typing import (
@@ -13,104 +14,39 @@ from typing import (
  List,
  Literal,
  Optional,
+ TypeAlias,
  Union,
  )
- from urllib.parse import ParseResult, parse_qs, quote, urlencode, urlparse
+ from urllib.parse import ParseResult, parse_qs, urlencode, urlparse

- import dlt
- import gcsfs # type: ignore
+ import fsspec # type: ignore
  import pendulum
- import s3fs # type: ignore
- from dlt.common.configuration.specs import (
- AwsCredentials,
- )
- from dlt.common.libs.sql_alchemy import (
- Engine,
- MetaData,
- )
  from dlt.common.time import ensure_pendulum_datetime
- from dlt.common.typing import TDataItem, TSecretStrValue
  from dlt.extract import Incremental
+ from dlt.extract.exceptions import ResourcesNotFoundError
+ from dlt.sources import incremental as dlt_incremental
  from dlt.sources.credentials import (
  ConnectionStringCredentials,
  )
- from dlt.sources.sql_database import sql_table
- from dlt.sources.sql_database.helpers import TableLoader
- from dlt.sources.sql_database.schema_types import (
- ReflectionLevel,
- SelectAny,
- Table,
- TTypeAdapter,
- )
- from google.ads.googleads.client import GoogleAdsClient # type: ignore
- from sqlalchemy import Column
- from sqlalchemy import types as sa

  from ingestr.src import blob
- from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
- from ingestr.src.adjust.adjust_helpers import parse_filters
- from ingestr.src.airtable import airtable_source
- from ingestr.src.applovin import applovin_source
- from ingestr.src.applovin_max import applovin_max_source
- from ingestr.src.appsflyer._init_ import appsflyer_source
- from ingestr.src.appstore import app_store
- from ingestr.src.appstore.client import AppStoreConnectClient
- from ingestr.src.arrow import memory_mapped_arrow
- from ingestr.src.asana_source import asana_source
- from ingestr.src.chess import source
- from ingestr.src.dynamodb import dynamodb
  from ingestr.src.errors import (
  InvalidBlobTableError,
  MissingValueError,
  UnsupportedResourceError,
  )
- from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
- from ingestr.src.filesystem import readers
- from ingestr.src.filters import table_adapter_exclude_columns
- from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
- from ingestr.src.google_ads import google_ads
- from ingestr.src.google_analytics import google_analytics
- from ingestr.src.google_sheets import google_spreadsheet
- from ingestr.src.gorgias import gorgias_source
- from ingestr.src.hubspot import hubspot
- from ingestr.src.kafka import kafka_consumer
- from ingestr.src.kafka.helpers import KafkaCredentials
- from ingestr.src.klaviyo._init_ import klaviyo_source
- from ingestr.src.linkedin_ads import linked_in_ads_source
- from ingestr.src.linkedin_ads.dimension_time_enum import (
- Dimension,
- TimeGranularity,
- )
- from ingestr.src.mongodb import mongodb_collection
- from ingestr.src.notion import notion_databases
- from ingestr.src.personio import personio_source
- from ingestr.src.salesforce import salesforce_source
- from ingestr.src.shopify import shopify_source
- from ingestr.src.slack import slack_source
- from ingestr.src.sql_database.callbacks import (
- chained_query_adapter_callback,
- custom_query_variable_subsitution,
- limit_callback,
- type_adapter_callback,
- )
- from ingestr.src.stripe_analytics import stripe_source
  from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
- from ingestr.src.tiktok_ads import tiktok_source
- from ingestr.src.time import isotime
- from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
- from ingestr.src.zendesk.helpers.credentials import (
- ZendeskCredentialsOAuth,
- ZendeskCredentialsToken,
- )
-
- TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
- TQueryAdapter = Callable[[SelectAny, Table], SelectAny]


  class SqlSource:
  table_builder: Callable

- def __init__(self, table_builder=sql_table) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from dlt.sources.sql_database import sql_table
+
+ table_builder = sql_table
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -119,13 +55,16 @@ class SqlSource:
  def dlt_source(self, uri: str, table: str, **kwargs):
  table_fields = TableDefinition(dataset="custom", table="custom")
  if not table.startswith("query:"):
- table_fields = table_string_to_dataclass(table)
+ if uri.startswith("spanner://"):
+ table_fields = TableDefinition(dataset="", table=table)
+ else:
+ table_fields = table_string_to_dataclass(table)

  incremental = None
  if kwargs.get("incremental_key"):
  start_value = kwargs.get("interval_start")
  end_value = kwargs.get("interval_end")
- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=start_value,
  end_value=end_value,
@@ -133,36 +72,62 @@ class SqlSource:
  range_start="closed",
  )

+ engine_adapter_callback = None
+
+ if uri.startswith("md://") or uri.startswith("motherduck://"):
+ parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)
+ # Convert md:// URI to duckdb:///md: format
+ if parsed_uri.path:
+ db_path = parsed_uri.path
+ else:
+ db_path = ""
+
+ token = query_params.get("token", [""])[0]
+ if not token:
+ raise ValueError("Token is required for MotherDuck connection")
+ uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
+
  if uri.startswith("mysql://"):
  uri = uri.replace("mysql://", "mysql+pymysql://")

- # clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
- if uri.startswith("clickhouse://"):
+ # Monkey patch cx_Oracle to use oracledb (thin mode, no client libraries required)
+ if uri.startswith("oracle+") or uri.startswith("oracle://"):
+ try:
+ import oracledb # type: ignore[import-not-found]
+
+ # SQLAlchemy's cx_oracle dialect checks for version >= 5.2
+ # oracledb has a different versioning scheme, so we need to patch it
+ oracledb.version = "8.3.0" # type: ignore[assignment]
+ sys.modules["cx_Oracle"] = oracledb # type: ignore[assignment]
+ except ImportError:
+ # oracledb not installed, will fail later with a clear error
+ pass
+
+ # Process Snowflake private key authentication
+ if uri.startswith("snowflake://"):
  parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)

- username = parsed_uri.username
- if not username:
- raise ValueError(
- "A username is required to connect to the ClickHouse database."
- )
+ if "private_key" in query_params:
+ from dlt.common.libs.cryptography import decode_private_key

- password = parsed_uri.password
- if not password:
- raise ValueError(
- "A password is required to authenticate with the ClickHouse database."
- )
+ private_key = query_params["private_key"][0]
+ passphrase = query_params.get("private_key_passphrase", [None])[0]
+ decoded_key = decode_private_key(private_key, passphrase)

- host = parsed_uri.hostname
- if not host:
- raise ValueError(
- "The hostname or IP address of the ClickHouse server is required to establish a connection."
- )
+ query_params["private_key"] = [base64.b64encode(decoded_key).decode()]
+ if "private_key_passphrase" in query_params:
+ del query_params["private_key_passphrase"]

- port = parsed_uri.port
- if not port:
- raise ValueError(
- "The TCP port of the ClickHouse server is required to establish a connection."
- )
+ # Rebuild URI
+ uri = parsed_uri._replace(
+ query=urlencode(query_params, doseq=True)
+ ).geturl()
+
+ # clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
+ if uri.startswith("clickhouse://"):
+ parsed_uri = urlparse(uri)

  query_params = parse_qs(parsed_uri.query)

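For orientation, the MotherDuck branch above rewrites an `md://` source URI into DuckDB's `md:` form before it reaches SQLAlchemy. A minimal standalone sketch of that rewrite (illustrative only; the real logic lives inside `SqlSource.dlt_source`, and the example URI is made up):

```python
from urllib.parse import parse_qs, urlparse

def motherduck_to_duckdb(uri: str) -> str:
    # Mirrors the md:// handling shown in the hunk above.
    parsed = urlparse(uri)
    db_path = parsed.path or ""
    token = parse_qs(parsed.query).get("token", [""])[0]
    if not token:
        raise ValueError("Token is required for MotherDuck connection")
    return f"duckdb:///md:{db_path}?motherduck_token={token}"

print(motherduck_to_duckdb("md:///analytics?token=SECRET"))
# duckdb:///md:/analytics?motherduck_token=SECRET
```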
@@ -177,6 +142,73 @@ class SqlSource:
  query=urlencode(query_params, doseq=True),
  ).geturl()

+ if uri.startswith("db2://"):
+ uri = uri.replace("db2://", "db2+ibm_db://")
+
+ if uri.startswith("spanner://"):
+ parsed_uri = urlparse(uri)
+ query_params = parse_qs(parsed_uri.query)
+
+ project_id_param = query_params.get("project_id")
+ instance_id_param = query_params.get("instance_id")
+ database_param = query_params.get("database")
+
+ cred_path = query_params.get("credentials_path")
+ cred_base64 = query_params.get("credentials_base64")
+
+ if not project_id_param or not instance_id_param or not database_param:
+ raise ValueError(
+ "project_id, instance_id and database are required in the URI to get data from Google Spanner"
+ )
+
+ project_id = project_id_param[0]
+ instance_id = instance_id_param[0]
+ database = database_param[0]
+
+ if not cred_path and not cred_base64:
+ raise ValueError(
+ "credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
+ )
+ if cred_path:
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
+ elif cred_base64:
+ credentials = json.loads(
+ base64.b64decode(cred_base64[0]).decode("utf-8")
+ )
+ temp = tempfile.NamedTemporaryFile(
+ mode="w", delete=False, suffix=".json"
+ )
+ json.dump(credentials, temp)
+ temp.close()
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
+
+ uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
+
+ def eng_callback(engine):
+ return engine.execution_options(read_only=True)
+
+ engine_adapter_callback = eng_callback
+ from dlt.common.libs.sql_alchemy import (
+ Engine,
+ MetaData,
+ )
+ from dlt.sources.sql_database.schema_types import (
+ ReflectionLevel,
+ SelectAny,
+ Table,
+ TTypeAdapter,
+ )
+ from sqlalchemy import Column
+ from sqlalchemy import types as sa
+
+ from ingestr.src.filters import table_adapter_exclude_columns
+ from ingestr.src.sql_database.callbacks import (
+ chained_query_adapter_callback,
+ custom_query_variable_subsitution,
+ limit_callback,
+ type_adapter_callback,
+ )
+
  query_adapters = []
  if kwargs.get("sql_limit"):
  query_adapters.append(
@@ -195,6 +227,13 @@ class SqlSource:
  defer_table_reflect = True
  query_value = table.split(":", 1)[1]

+ TableBackend: TypeAlias = Literal[
+ "sqlalchemy", "pyarrow", "pandas", "connectorx"
+ ]
+ TQueryAdapter: TypeAlias = Callable[[SelectAny, Table], SelectAny]
+ import dlt
+ from dlt.common.typing import TDataItem
+
  # this is a very hacky version of the table_rows function. it is built this way to go around the dlt's table loader.
  # I didn't want to write a full fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
  # this is by no means a production ready solution, but it works for now.
@@ -212,6 +251,9 @@ class SqlSource:
  backend_kwargs: Dict[str, Any] = None, # type: ignore
  type_adapter_callback: Optional[TTypeAdapter] = None,
  included_columns: Optional[List[str]] = None,
+ excluded_columns: Optional[
+ List[str]
+ ] = None, # Added for dlt 1.16.0 compatibility
  query_adapter_callback: Optional[TQueryAdapter] = None,
  resolve_foreign_keys: bool = False,
  ) -> Iterator[TDataItem]:
@@ -245,6 +287,8 @@ class SqlSource:
  *cols,
  )

+ from dlt.sources.sql_database.helpers import TableLoader
+
  loader = TableLoader(
  engine,
  backend,
@@ -265,8 +309,54 @@ class SqlSource:
  # override the query adapters, the only one we want is the one here in the case of custom queries
  query_adapters = [custom_query_variable_subsitution(query_value, kwargs)]

+ credentials = ConnectionStringCredentials(uri)
+ if uri.startswith("mssql://"):
+ parsed_uri = urlparse(uri)
+ params = parse_qs(parsed_uri.query)
+ params = {k.lower(): v for k, v in params.items()}
+ if params.get("authentication") == ["ActiveDirectoryAccessToken"]:
+ import pyodbc # type: ignore
+ from sqlalchemy import create_engine
+
+ from ingestr.src.destinations import (
+ MSSQL_COPT_SS_ACCESS_TOKEN,
+ handle_datetimeoffset,
+ serialize_azure_token,
+ )
+
+ cfg = {
+ "DRIVER": params.get("driver", ["ODBC Driver 18 for SQL Server"])[
+ 0
+ ],
+ "SERVER": f"{parsed_uri.hostname},{parsed_uri.port or 1433}",
+ "DATABASE": parsed_uri.path.lstrip("/"),
+ }
+ for k, v in params.items():
+ if k.lower() not in ["driver", "authentication", "connect_timeout"]:
+ cfg[k.upper()] = v[0]
+
+ token = serialize_azure_token(parsed_uri.password)
+ dsn = ";".join([f"{k}={v}" for k, v in cfg.items()])
+
+ def creator():
+ connection = pyodbc.connect(
+ dsn,
+ autocommit=True,
+ timeout=kwargs.get("connect_timeout", 30),
+ attrs_before={
+ MSSQL_COPT_SS_ACCESS_TOKEN: token,
+ },
+ )
+ connection.add_output_converter(-155, handle_datetimeoffset)
+ return connection
+
+ credentials = create_engine(
+ "mssql+pyodbc://",
+ creator=creator,
+ )
+
  builder_res = self.table_builder(
- credentials=ConnectionStringCredentials(uri),
+ credentials=credentials,
  schema=table_fields.dataset,
  table=table_fields.table,
  incremental=incremental,
@@ -279,6 +369,7 @@ class SqlSource:
  kwargs.get("sql_exclude_columns", [])
  ),
  defer_table_reflect=defer_table_reflect,
+ engine_adapter_callback=engine_adapter_callback,
  )

  return builder_res
@@ -287,7 +378,12 @@ class ArrowMemoryMappedSource:
  class ArrowMemoryMappedSource:
  table_builder: Callable

- def __init__(self, table_builder=memory_mapped_arrow) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.arrow import memory_mapped_arrow
+
+ table_builder = memory_mapped_arrow
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -299,7 +395,7 @@ class ArrowMemoryMappedSource:
  start_value = kwargs.get("interval_start")
  end_value = kwargs.get("interval_end")

- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=start_value,
  end_value=end_value,
@@ -332,37 +428,199 @@ class MongoDbSource:
  class MongoDbSource:
  table_builder: Callable

- def __init__(self, table_builder=mongodb_collection) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.mongodb import mongodb_collection
+
+ table_builder = mongodb_collection
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
  return False

  def dlt_source(self, uri: str, table: str, **kwargs):
- table_fields = table_string_to_dataclass(table)
+ # Check if this is a custom query format (collection:query)
+ if ":" in table:
+ collection_name, query_json = table.split(":", 1)

- incremental = None
- if kwargs.get("incremental_key"):
- start_value = kwargs.get("interval_start")
- end_value = kwargs.get("interval_end")
+ # Parse the query using MongoDB's extended JSON parser
+ # First, convert MongoDB shell syntax to Extended JSON format
+ from bson import json_util

- incremental = dlt.sources.incremental(
- kwargs.get("incremental_key", ""),
- initial_value=start_value,
- end_value=end_value,
- range_end="closed",
- range_start="closed",
+ from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json
+
+ # Convert MongoDB shell constructs to Extended JSON v2 format
+ converted_query = convert_mongo_shell_to_extended_json(query_json)
+
+ try:
+ query = json_util.loads(converted_query)
+ except Exception as e:
+ raise ValueError(f"Invalid MongoDB query format: {e}")
+
+ # Validate that it's a list for aggregation pipeline
+ if not isinstance(query, list):
+ raise ValueError(
+ "Query must be a JSON array representing a MongoDB aggregation pipeline"
+ )
+
+ # Check for incremental load requirements
+ incremental = None
+ if kwargs.get("incremental_key"):
+ start_value = kwargs.get("interval_start")
+ end_value = kwargs.get("interval_end")
+
+ # Validate that incremental key is present in the pipeline
+ incremental_key = kwargs.get("incremental_key")
+ self._validate_incremental_query(query, str(incremental_key))
+
+ incremental = dlt_incremental(
+ str(incremental_key),
+ initial_value=start_value,
+ end_value=end_value,
+ )
+
+ # Substitute interval parameters in the query
+ query = self._substitute_interval_params(query, kwargs)
+
+ # Parse collection name to get database and collection
+ if "." in collection_name:
+ # Handle database.collection format
+ table_fields = table_string_to_dataclass(collection_name)
+ database = table_fields.dataset
+ collection = table_fields.table
+ else:
+ # Single collection name, use default database
+ database = None
+ collection = collection_name
+
+ table_instance = self.table_builder(
+ connection_url=uri,
+ database=database,
+ collection=collection,
+ parallel=False,
+ incremental=incremental,
+ custom_query=query,
  )
+ table_instance.max_table_nesting = 1
+ return table_instance
+ else:
+ # Default behavior for simple collection names
+ table_fields = table_string_to_dataclass(table)

- table_instance = self.table_builder(
- connection_url=uri,
- database=table_fields.dataset,
- collection=table_fields.table,
- parallel=True,
- incremental=incremental,
- )
+ incremental = None
+ if kwargs.get("incremental_key"):
+ start_value = kwargs.get("interval_start")
+ end_value = kwargs.get("interval_end")

- return table_instance
+ incremental = dlt_incremental(
+ kwargs.get("incremental_key", ""),
+ initial_value=start_value,
+ end_value=end_value,
+ )
+
+ table_instance = self.table_builder(
+ connection_url=uri,
+ database=table_fields.dataset,
+ collection=table_fields.table,
+ parallel=False,
+ incremental=incremental,
+ )
+ table_instance.max_table_nesting = 1
+
+ return table_instance
+
+ def _validate_incremental_query(self, query: list, incremental_key: str):
+ """Validate that incremental key is projected in the aggregation pipeline"""
+ # Check if there's a $project stage and if incremental_key is included
+ has_project = False
+ incremental_key_projected = False
+
+ for stage in query:
+ if "$project" in stage:
+ has_project = True
+ project_stage = stage["$project"]
+ if isinstance(project_stage, dict):
+ # Check if incremental_key is explicitly included
+ if incremental_key in project_stage:
+ if project_stage[incremental_key] not in [0, False]:
+ incremental_key_projected = True
+ # If there are only inclusions (1 or True values) and incremental_key is not included
+ elif any(v in [1, True] for v in project_stage.values()):
+ # This is an inclusion projection, incremental_key must be explicitly included
+ incremental_key_projected = False
+ # If there are only exclusions (0 or False values) and incremental_key is not excluded
+ elif all(
+ v in [0, False]
+ for v in project_stage.values()
+ if v in [0, False, 1, True]
+ ):
+ # This is an exclusion projection, incremental_key is included by default
+ if incremental_key not in project_stage:
+ incremental_key_projected = True
+ else:
+ incremental_key_projected = project_stage[
+ incremental_key
+ ] not in [0, False]
+ else:
+ # Mixed or unclear projection, assume incremental_key needs to be explicit
+ incremental_key_projected = False
+
+ # If there's a $project stage but incremental_key is not projected, raise error
+ if has_project and not incremental_key_projected:
+ raise ValueError(
+ f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
+ )
+
+ def _substitute_interval_params(self, query: list, kwargs: dict):
+ """Substitute :interval_start and :interval_end placeholders with actual datetime values"""
+ from dlt.common.time import ensure_pendulum_datetime
+
+ # Get interval values and convert them to datetime objects
+ interval_start = kwargs.get("interval_start")
+ interval_end = kwargs.get("interval_end")
+
+ # Convert string dates to datetime objects if needed
+ if interval_start is not None:
+ if isinstance(interval_start, str):
+ pendulum_dt = ensure_pendulum_datetime(interval_start)
+ interval_start = (
+ pendulum_dt.to_datetime()
+ if hasattr(pendulum_dt, "to_datetime")
+ else pendulum_dt
+ )
+ elif hasattr(interval_start, "to_datetime"):
+ interval_start = interval_start.to_datetime()
+
+ if interval_end is not None:
+ if isinstance(interval_end, str):
+ pendulum_dt = ensure_pendulum_datetime(interval_end)
+ interval_end = (
+ pendulum_dt.to_datetime()
+ if hasattr(pendulum_dt, "to_datetime")
+ else pendulum_dt
+ )
+ elif hasattr(interval_end, "to_datetime"):
+ interval_end = interval_end.to_datetime()
+
+ # Deep copy the query and replace placeholders with actual datetime objects
+ def replace_placeholders(obj):
+ if isinstance(obj, dict):
+ result = {}
+ for key, value in obj.items():
+ if value == ":interval_start" and interval_start is not None:
+ result[key] = interval_start
+ elif value == ":interval_end" and interval_end is not None:
+ result[key] = interval_end
+ else:
+ result[key] = replace_placeholders(value)
+ return result
+ elif isinstance(obj, list):
+ return [replace_placeholders(item) for item in obj]
+ else:
+ return obj
+
+ return replace_placeholders(query)


  class LocalCsvSource:
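The MongoDB hunk above accepts a table string of the form "collection:aggregation-pipeline", with `:interval_start` / `:interval_end` placeholders that `_substitute_interval_params` swaps for the run's interval datetimes. A hedged illustration of what such a table value could look like (database, collection, and field names are made up):

```python
# Hypothetical MongoDB table value: collection name, a colon, then a JSON
# aggregation pipeline. "updated_at" is assumed to be the incremental key,
# so it is kept in the $project stage as _validate_incremental_query requires.
table = (
    'mydb.orders:[{"$match": {"updated_at": {"$gte": ":interval_start", '
    '"$lt": ":interval_end"}}}, {"$project": {"updated_at": 1, "total": 1}}]'
)
```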
@@ -371,7 +629,7 @@ class LocalCsvSource:

  def dlt_source(self, uri: str, table: str, **kwargs):
  def csv_file(
- incremental: Optional[dlt.sources.incremental[Any]] = None,
+ incremental: Optional[dlt_incremental[Any]] = None,
  ):
  file_path = uri.split("://")[1]
  myFile = open(file_path, "r")
@@ -413,11 +671,13 @@ class LocalCsvSource:
  if page:
  yield page

- return dlt.resource(
+ from dlt import resource
+
+ return resource(
  csv_file,
  merge_key=kwargs.get("merge_key"), # type: ignore
  )(
- incremental=dlt.sources.incremental(
+ incremental=dlt_incremental(
  kwargs.get("incremental_key", ""),
  initial_value=kwargs.get("interval_start"),
  end_value=kwargs.get("interval_end"),
@@ -433,7 +693,12 @@ class LocalCsvSource:
  class NotionSource:
  table_builder: Callable

- def __init__(self, table_builder=notion_databases) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.notion import notion_databases
+
+ table_builder = notion_databases
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -460,6 +725,11 @@ class ShopifySource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "Shopify takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
  source_fields = urlparse(uri)
  source_params = parse_qs(source_fields.query)
  api_key = source_params.get("api_key")
@@ -493,6 +763,8 @@ class ShopifySource:
  f"Table name '{table}' is not supported for Shopify source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ from ingestr.src.shopify import shopify_source
+
  return shopify_source(
  private_app_password=api_key[0],
  shop_url=f"https://{source_fields.netloc}",
@@ -537,6 +809,8 @@ class GorgiasSource:
  if kwargs.get("interval_end"):
  date_args["end_date"] = kwargs.get("interval_end")

+ from ingestr.src.gorgias import gorgias_source
+
  return gorgias_source(
  domain=source_fields.netloc,
  email=email[0],
@@ -548,7 +822,12 @@ class GoogleSheetsSource:
  class GoogleSheetsSource:
  table_builder: Callable

- def __init__(self, table_builder=google_spreadsheet) -> None:
+ def __init__(self, table_builder=None) -> None:
+ if table_builder is None:
+ from ingestr.src.google_sheets import google_spreadsheet
+
+ table_builder = google_spreadsheet
+
  self.table_builder = table_builder

  def handles_incrementality(self) -> bool:
@@ -629,6 +908,8 @@ class ChessSource:
  f"Resource '{table}' is not supported for Chess source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ from ingestr.src.chess import source
+
  return source(players=list_players, **date_args).with_resources(
  table_mapping[table]
  )
@@ -652,40 +933,74 @@ class StripeAnalyticsSource:
  if not api_key:
  raise ValueError("api_key in the URI is required to connect to Stripe")

+ table = table.lower()
+
+ from ingestr.src.stripe_analytics.settings import ENDPOINTS
+
  endpoint = None
- table = str.capitalize(table)
+ incremental = False
+ sync = False

- if table in [
- "Subscription",
- "Account",
- "Coupon",
- "Customer",
- "Product",
- "Price",
- "BalanceTransaction",
- "Invoice",
- "Event",
- ]:
- endpoint = table
+ table_fields = table.split(":")
+ if len(table_fields) == 1:
+ endpoint = table_fields[0]
+ elif len(table_fields) == 2:
+ endpoint = table_fields[0]
+ sync = table_fields[1] == "sync"
+ elif len(table_fields) == 3:
+ endpoint = table_fields[0]
+ sync = table_fields[1] == "sync"
+ incremental = table_fields[2] == "incremental"
  else:
  raise ValueError(
- f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ "Invalid Stripe table format. Expected: stripe:<endpoint> or stripe:<endpoint>:<sync> or stripe:<endpoint>:<sync>:<incremental>"
  )

- date_args = {}
- if kwargs.get("interval_start"):
- date_args["start_date"] = kwargs.get("interval_start")
-
- if kwargs.get("interval_end"):
- date_args["end_date"] = kwargs.get("interval_end")
-
- return stripe_source(
- endpoints=[
- endpoint,
- ],
- stripe_secret_key=api_key[0],
- **date_args,
- ).with_resources(endpoint)
+ if incremental and not sync:
+ raise ValueError("incremental loads must be used with sync loading")
+
+ if incremental:
+ from ingestr.src.stripe_analytics import incremental_stripe_source
+
+ def nullable_date(date_str: Optional[str]):
+ if date_str:
+ return ensure_pendulum_datetime(date_str)
+ return None
+
+ endpoint = ENDPOINTS[endpoint]
+ return incremental_stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ initial_start_date=nullable_date(kwargs.get("interval_start", None)),
+ end_date=nullable_date(kwargs.get("interval_end", None)),
+ ).with_resources(endpoint)
+ else:
+ endpoint = ENDPOINTS[endpoint]
+ if sync:
+ from ingestr.src.stripe_analytics import stripe_source
+
+ return stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ ).with_resources(endpoint)
+ else:
+ from ingestr.src.stripe_analytics import async_stripe_source
+
+ return async_stripe_source(
+ endpoints=[
+ endpoint,
+ ],
+ stripe_secret_key=api_key[0],
+ max_workers=kwargs.get("extract_parallelism", 4),
+ ).with_resources(endpoint)
+
+ raise ValueError(
+ f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )


  class FacebookAdsSource:
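The Stripe hunk above parses the table string into an endpoint plus optional `sync` and `incremental` modifiers. A small sketch of that parsing, assuming a hypothetical "charge" endpoint key exists in `ENDPOINTS` (not verified here):

```python
# Illustrative restatement of the table parsing shown above.
endpoint, *modifiers = "charge:sync:incremental".lower().split(":")
sync = len(modifiers) >= 1 and modifiers[0] == "sync"
incremental = len(modifiers) >= 2 and modifiers[1] == "incremental"
# -> endpoint="charge", sync=True, incremental=True
# (incremental loading is only allowed together with sync loading)
```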
@@ -711,17 +1026,76 @@ class FacebookAdsSource:
  "access_token and accound_id are required to connect to Facebook Ads."
  )

+ from ingestr.src.facebook_ads import (
+ facebook_ads_source,
+ facebook_insights_source,
+ )
+
+ insights_max_wait_to_finish_seconds = source_params.get(
+ "insights_max_wait_to_finish_seconds", [60 * 60 * 4]
+ )
+ insights_max_wait_to_start_seconds = source_params.get(
+ "insights_max_wait_to_start_seconds", [60 * 30]
+ )
+ insights_max_async_sleep_seconds = source_params.get(
+ "insights_max_async_sleep_seconds", [20]
+ )
+
  endpoint = None
  if table in ["campaigns", "ad_sets", "ad_creatives", "ads", "leads"]:
  endpoint = table
- elif table in "facebook_insights":
+ elif table == "facebook_insights":
  return facebook_insights_source(
  access_token=access_token[0],
  account_id=account_id[0],
+ start_date=kwargs.get("interval_start"),
+ end_date=kwargs.get("interval_end"),
+ insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds[
+ 0
+ ],
+ insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds[
+ 0
+ ],
+ insights_max_async_sleep_seconds=insights_max_async_sleep_seconds[0],
  ).with_resources("facebook_insights")
+ elif table.startswith("facebook_insights:"):
+ # Parse custom breakdowns and metrics from table name
+ # Supported formats:
+ # facebook_insights:breakdown_type
+ # facebook_insights:breakdown_type:metric1,metric2...
+ parts = table.split(":")
+
+ if len(parts) < 2 or len(parts) > 3:
+ raise ValueError(
+ "Invalid facebook_insights format. Expected: facebook_insights:breakdown_type or facebook_insights:breakdown_type:metric1,metric2..."
+ )
+
+ breakdown_type = parts[1].strip()
+ if not breakdown_type:
+ raise ValueError(
+ "Breakdown type must be provided in format: facebook_insights:breakdown_type"
+ )
+
+ # Validate breakdown type against available options from settings
+
+ from ingestr.src.facebook_ads.helpers import (
+ parse_insights_table_to_source_kwargs,
+ )
+
+ source_kwargs = {
+ "access_token": access_token[0],
+ "account_id": account_id[0],
+ "start_date": kwargs.get("interval_start"),
+ "end_date": kwargs.get("interval_end"),
+ }
+
+ source_kwargs.update(parse_insights_table_to_source_kwargs(table))
+ return facebook_insights_source(**source_kwargs).with_resources(
+ "facebook_insights"
+ )
  else:
  raise ValueError(
- "fResource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ f"Resource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

  return facebook_ads_source(
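For reference, the custom-insights branch above accepts table strings of the following shapes; the breakdown and metric names below are placeholders, not a list of what the source actually supports:

```python
# Illustrative Facebook Ads table values handled by the hunk above:
#   "facebook_insights"                       # default insights report
#   "facebook_insights:country"               # custom breakdown type
#   "facebook_insights:country:spend,clicks"  # custom breakdown + metric list
```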
@@ -768,6 +1142,8 @@ class SlackSource:
  if kwargs.get("interval_end"):
  date_args["end_date"] = kwargs.get("interval_end")

+ from ingestr.src.slack import slack_source
+
  return slack_source(
  access_token=api_key[0],
  table_per_channel=False,
@@ -778,7 +1154,7 @@ class SlackSource:

  class HubspotSource:
  def handles_incrementality(self) -> bool:
- return True
+ return False

  # hubspot://?api_key=<api_key>
  def dlt_source(self, uri: str, table: str, **kwargs):
@@ -796,7 +1172,35 @@ class HubspotSource:
  raise ValueError("api_key in the URI is required to connect to Hubspot")

  endpoint = None
- if table in ["contacts", "companies", "deals", "tickets", "products", "quotes"]:
+
+ from ingestr.src.hubspot import hubspot
+
+ if table.startswith("custom:"):
+ fields = table.split(":", 2)
+ if len(fields) != 2 and len(fields) != 3:
+ raise ValueError(
+ "Invalid Hubspot custom table format. Expected format: custom:<custom_object_type> or custom:<custom_object_type>:<associations>"
+ )
+
+ if len(fields) == 2:
+ endpoint = fields[1]
+ else:
+ endpoint = f"{fields[1]}:{fields[2]}"
+
+ return hubspot(
+ api_key=api_key[0],
+ custom_object=endpoint,
+ ).with_resources("custom")
+
+ elif table in [
+ "contacts",
+ "companies",
+ "deals",
+ "tickets",
+ "products",
+ "quotes",
+ "schemas",
+ ]:
  endpoint = table
  else:
  raise ValueError(
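The HubSpot hunk above adds a `custom:` table prefix for custom objects. An illustrative set of table values ("cars" is a made-up custom object type, not a HubSpot default):

```python
# Illustrative HubSpot table values accepted by the branch above:
#   "contacts"               # built-in endpoint
#   "custom:cars"            # custom object type
#   "custom:cars:contacts"   # custom object type plus associations
```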
@@ -821,20 +1225,31 @@ class AirtableSource:
  if not table:
  raise ValueError("Source table is required to connect to Airtable")

- tables = table.split(",")
-
  source_parts = urlparse(uri)
  source_fields = parse_qs(source_parts.query)
- base_id = source_fields.get("base_id")
  access_token = source_fields.get("access_token")

- if not base_id or not access_token:
+ if not access_token:
  raise ValueError(
- "base_id and access_token in the URI are required to connect to Airtable"
+ "access_token in the URI is required to connect to Airtable"
  )

+ base_id = source_fields.get("base_id", [None])[0]
+ clean_table = table
+
+ table_fields = table.split("/")
+ if len(table_fields) == 2:
+ clean_table = table_fields[1]
+ if not base_id:
+ base_id = table_fields[0]
+
+ if not base_id:
+ raise ValueError("base_id in the URI is required to connect to Airtable")
+
+ from ingestr.src.airtable import airtable_source
+
  return airtable_source(
- base_id=base_id[0], table_names=tables, access_token=access_token[0]
+ base_id=base_id, table_names=[clean_table], access_token=access_token[0]
  )

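The Airtable hunk above now lets the base id ride along in the table string. An illustrative pair of table values (the base id is fake):

```python
# Illustrative Airtable table values for the branch above:
#   table="appXYZ123/my_table"   # base id taken from the table string
#   table="my_table"             # base id must then come from ?base_id=appXYZ123 in the URI
```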
@@ -880,12 +1295,66 @@ class KlaviyoSource:
  )

  start_date = kwargs.get("interval_start") or "2000-01-01"
+
+ from ingestr.src.klaviyo import klaviyo_source
+
  return klaviyo_source(
  api_key=api_key[0],
  start_date=start_date,
  ).with_resources(resource)


+ class MixpanelSource:
+ def handles_incrementality(self) -> bool:
+ return True
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "Mixpanel takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
+ parsed = urlparse(uri)
+ params = parse_qs(parsed.query)
+ username = params.get("username")
+ password = params.get("password")
+ project_id = params.get("project_id")
+ server = params.get("server", ["eu"])
+
+ if not username or not password or not project_id:
+ raise ValueError(
+ "username, password, project_id are required to connect to Mixpanel"
+ )
+
+ if table not in ["events", "profiles"]:
+ raise ValueError(
+ f"Resource '{table}' is not supported for Mixpanel source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )
+
+ start_date = kwargs.get("interval_start")
+ if start_date:
+ start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
+ else:
+ start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
+
+ end_date = kwargs.get("interval_end")
+ if end_date:
+ end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
+ else:
+ end_date = pendulum.now().in_timezone("UTC")
+
+ from ingestr.src.mixpanel import mixpanel_source
+
+ return mixpanel_source(
+ username=username[0],
+ password=password[0],
+ project_id=project_id[0],
+ start_date=start_date,
+ end_date=end_date,
+ server=server[0],
+ ).with_resources(table)
+
+
  class KafkaSource:
  def handles_incrementality(self) -> bool:
  return False
@@ -913,6 +1382,9 @@ class KafkaSource:
  raise ValueError("group_id in the URI is required to connect to kafka")

  start_date = kwargs.get("interval_start")
+ from ingestr.src.kafka import kafka_consumer
+ from ingestr.src.kafka.helpers import KafkaCredentials
+
  return kafka_consumer(
  topics=[table],
  credentials=KafkaCredentials(
@@ -968,6 +1440,9 @@ class AdjustSource:
  if kwargs.get("interval_end"):
  end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))

+ from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
+ from ingestr.src.adjust.adjust_helpers import parse_filters
+
  dimensions = None
  metrics = None
  filters = []
@@ -1015,6 +1490,8 @@ class AppsflyerSource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ from ingestr.src.appsflyer import appsflyer_source
+
  if kwargs.get("incremental_key"):
  raise ValueError(
  "Appsflyer_Source takes care of incrementality on its own, you should not provide incremental_key"
@@ -1027,22 +1504,27 @@ class AppsflyerSource:
  if not api_key:
  raise ValueError("api_key in the URI is required to connect to Appsflyer")

- resource = None
- if table in ["campaigns", "creatives"]:
- resource = table
- else:
- raise ValueError(
- f"Resource '{table}' is not supported for Appsflyer source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
- )
-
- start_date = kwargs.get("interval_start") or "2024-01-02"
- end_date = kwargs.get("interval_end") or "2024-01-29"
+ start_date = kwargs.get("interval_start")
+ end_date = kwargs.get("interval_end")
+ dimensions = []
+ metrics = []
+ if table.startswith("custom:"):
+ fields = table.split(":", 3)
+ if len(fields) != 3:
+ raise ValueError(
+ "Invalid Adjust custom table format. Expected format: custom:<dimensions>:<metrics>"
+ )
+ dimensions = fields[1].split(",")
+ metrics = fields[2].split(",")
+ table = "custom"

  return appsflyer_source(
  api_key=api_key[0],
- start_date=start_date,
- end_date=end_date,
- ).with_resources(resource)
+ start_date=start_date.strftime("%Y-%m-%d") if start_date else None, # type: ignore
+ end_date=end_date.strftime("%Y-%m-%d") if end_date else None, # type: ignore
+ dimensions=dimensions,
+ metrics=metrics,
+ ).with_resources(table)


  class ZendeskSource:
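The AppsFlyer hunk above adds a `custom:<dimensions>:<metrics>` table format. An illustrative value, with placeholder dimension and metric names:

```python
# Illustrative AppsFlyer table value for the custom branch above:
#   "custom:app_id,geo:impressions,clicks,installs"
# -> dimensions=["app_id", "geo"], metrics=["impressions", "clicks", "installs"]
```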
@@ -1067,6 +1549,12 @@ class ZendeskSource:
  if not subdomain:
  raise ValueError("Subdomain is required to connect with Zendesk")

+ from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
+ from ingestr.src.zendesk.helpers.credentials import (
+ ZendeskCredentialsOAuth,
+ ZendeskCredentialsToken,
+ )
+
  if not source_fields.username and source_fields.password:
  oauth_token = source_fields.password
  if not oauth_token:
@@ -1125,7 +1613,7 @@ class ZendeskSource:
  ).with_resources(table)
  else:
  raise ValueError(
- "fResource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ f"Resource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )


@@ -1140,7 +1628,7 @@ class S3Source:
  )

  parsed_uri = urlparse(uri)
- source_fields = parse_qs(quote(parsed_uri.query, safe="=&"))
+ source_fields = parse_qs(parsed_uri.query)
  access_key_id = source_fields.get("access_key_id")
  if not access_key_id:
  raise ValueError("access_key_id is required to connect to S3")
@@ -1155,22 +1643,34 @@ class S3Source:

  bucket_url = f"s3://{bucket_name}/"

+ import s3fs # type: ignore
+
  fs = s3fs.S3FileSystem(
  key=access_key_id[0],
  secret=secret_access_key[0],
  )

- file_extension = path_to_file.split(".")[-1]
- if file_extension == "csv":
- endpoint = "read_csv"
- elif file_extension == "jsonl":
- endpoint = "read_jsonl"
- elif file_extension == "parquet":
- endpoint = "read_parquet"
+ endpoint: Optional[str] = None
+ if "#" in table:
+ _, endpoint = table.split("#")
+ if endpoint not in ["csv", "jsonl", "parquet"]:
+ raise ValueError(
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
+ )
+ endpoint = f"read_{endpoint}"
  else:
- raise ValueError(
- "S3 Source only supports specific formats files: csv, jsonl, parquet"
- )
+ try:
+ endpoint = blob.parse_endpoint(path_to_file)
+ except blob.UnsupportedEndpointError:
+ raise ValueError(
+ "S3 Source only supports specific formats files: csv, jsonl, parquet"
+ )
+ except Exception as e:
+ raise ValueError(
+ f"Failed to parse endpoint from path: {path_to_file}"
+ ) from e
+
+ from ingestr.src.filesystem import readers

  return readers(bucket_url, fs, path_to_file).with_resources(endpoint)

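The S3 hunk above lets the table string carry an explicit file format after a `#`, falling back to `blob.parse_endpoint` otherwise. Illustrative table values (bucket and paths are made up):

```python
# Illustrative S3 table values for the format handling above:
#   "my-bucket/events/2024/*.csv"     # format inferred from the path
#   "my-bucket/exports/data#parquet"  # explicit format, mapped to the read_parquet resource
```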
@@ -1181,6 +1681,11 @@ class TikTokSource:
  return True

  def dlt_source(self, uri: str, table: str, **kwargs):
+ if kwargs.get("incremental_key"):
+ raise ValueError(
+ "TikTok takes care of incrementality on its own, you should not provide incremental_key"
+ )
+
  endpoint = "custom_reports"

  parsed_uri = urlparse(uri)
@@ -1266,6 +1771,8 @@ class TikTokSource:
  filter_name = list(filters.keys())[0]
  filter_value = list(map(int, filters[list(filters.keys())[0]]))

+ from ingestr.src.tiktok_ads import tiktok_source
+
  return tiktok_source(
  start_date=start_date,
  end_date=end_date,
@@ -1314,15 +1821,78 @@ class AsanaSource:
  f"Resource '{table}' is not supported for Asana source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
  )

+ import dlt
+
+ from ingestr.src.asana_source import asana_source
+
  dlt.secrets["sources.asana_source.access_token"] = access_token[0]
+
  src = asana_source()
  src.workspaces.add_filter(lambda w: w["gid"] == workspace)
  return src.with_resources(table)


- class DynamoDBSource:
- AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
-
+ class JiraSource:
+ resources = [
+ "projects",
+ "issues",
+ "users",
+ "issue_types",
+ "statuses",
+ "priorities",
+ "resolutions",
+ "project_versions",
+ "project_components",
+ "events",
+ ]
+
+ def handles_incrementality(self) -> bool:
+ return True
+
+ def dlt_source(self, uri: str, table: str, **kwargs):
+ parsed_uri = urlparse(uri)
+ params = parse_qs(parsed_uri.query)
+
+ base_url = f"https://{parsed_uri.netloc}"
+ email = params.get("email")
+ api_token = params.get("api_token")
+
+ if not email:
+ raise ValueError("email must be specified in the URI query parameters")
+
+ if not api_token:
+ raise ValueError("api_token is required for connecting to Jira")
+
+ flags = {
+ "skip_archived": False,
+ }
+ if ":" in table:
+ table, rest = table.split(":", 1) # type: ignore
+ for k in rest.split(":"):
+ flags[k] = True
+
+ if table not in self.resources:
+ raise ValueError(
+ f"Resource '{table}' is not supported for Jira source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+ )
+
+ import dlt
+
+ from ingestr.src.jira_source import jira_source
+
+ dlt.secrets["sources.jira_source.base_url"] = base_url
+ dlt.secrets["sources.jira_source.email"] = email[0]
+ dlt.secrets["sources.jira_source.api_token"] = api_token[0]
+
+ src = jira_source()
+ if flags["skip_archived"]:
+ src.projects.add_filter(lambda p: not p.get("archived", False))
+ return src.with_resources(table)
+
+
+ class DynamoDBSource:
+ AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
+
  def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
  # try to infer from URI
  matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
@@ -1350,7 +1920,7 @@ class DynamoDBSource:
  if not region:
  raise ValueError("region is required to connect to Dynamodb")

- qs = parse_qs(quote(parsed_uri.query, safe="=&"))
+ qs = parse_qs(parsed_uri.query)
  access_key = qs.get("access_key_id")

  if not access_key:
@@ -1360,6 +1930,9 @@ class DynamoDBSource:
  if not secret_key:
  raise ValueError("secret_access_key is required to connect to Dynamodb")

+ from dlt.common.configuration.specs import AwsCredentials
+ from dlt.common.typing import TSecretStrValue
+
  creds = AwsCredentials(
  aws_access_key_id=access_key[0],
  aws_secret_access_key=TSecretStrValue(secret_key[0]),
@@ -1370,8 +1943,11 @@ class DynamoDBSource:
  incremental = None
  incremental_key = kwargs.get("incremental_key")

+ from ingestr.src.dynamodb import dynamodb
+ from ingestr.src.time import isotime
+
  if incremental_key:
- incremental = dlt.sources.incremental(
+ incremental = dlt_incremental(
  incremental_key.strip(),
  initial_value=isotime(kwargs.get("interval_start")),
  end_value=isotime(kwargs.get("interval_end")),
@@ -1383,47 +1959,127 @@ class DynamoDBSource:
1383
1959
  return dynamodb(table, creds, incremental)
1384
1960
 
1385
1961
 
1962
+ class DoceboSource:
1963
+ def handles_incrementality(self) -> bool:
1964
+ return False
1965
+
1966
+ def dlt_source(self, uri: str, table: str, **kwargs):
1967
+ # docebo://?base_url=https://yourcompany.docebosaas.com&client_id=xxx&client_secret=xxx
1968
+ # Optional: &username=xxx&password=xxx for password grant type
1969
+
1970
+ if kwargs.get("incremental_key"):
1971
+ raise ValueError("Incremental loads are not yet supported for Docebo")
1972
+
1973
+ parsed_uri = urlparse(uri)
1974
+ source_params = parse_qs(parsed_uri.query)
1975
+
1976
+ base_url = source_params.get("base_url")
1977
+ if not base_url:
1978
+ raise ValueError("base_url is required to connect to Docebo")
1979
+
1980
+ client_id = source_params.get("client_id")
1981
+ if not client_id:
1982
+ raise ValueError("client_id is required to connect to Docebo")
1983
+
1984
+ client_secret = source_params.get("client_secret")
1985
+ if not client_secret:
1986
+ raise ValueError("client_secret is required to connect to Docebo")
1987
+
1988
+ # Username and password are optional (uses client_credentials grant if not provided)
1989
+ username = source_params.get("username", [None])[0]
1990
+ password = source_params.get("password", [None])[0]
1991
+
1992
+ # Supported tables
1993
+ supported_tables = [
1994
+ "users",
1995
+ "courses",
1996
+ "user_fields",
1997
+ "branches",
1998
+ "groups",
1999
+ "group_members",
2000
+ "course_fields",
2001
+ "learning_objects",
2002
+ "learning_plans",
2003
+ "learning_plan_enrollments",
2004
+ "learning_plan_course_enrollments",
2005
+ "course_enrollments",
2006
+ "sessions",
2007
+ "categories",
2008
+ "certifications",
2009
+ "external_training",
2010
+ "survey_answers",
2011
+ ]
2012
+ if table not in supported_tables:
2013
+ raise ValueError(
2014
+ f"Resource '{table}' is not supported for Docebo source. Supported tables: {', '.join(supported_tables)}"
2015
+ )
2016
+
2017
+ from ingestr.src.docebo import docebo_source
2018
+
2019
+ return docebo_source(
2020
+ base_url=base_url[0],
2021
+ client_id=client_id[0],
2022
+ client_secret=client_secret[0],
2023
+ username=username,
2024
+ password=password,
2025
+ ).with_resources(table)
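A minimal sketch of why the code above indexes [0] into every query parameter: parse_qs returns a list of values per key. The base_url mirrors the comment above; the credentials are placeholders.

    # Illustrative only: parse_qs yields lists, hence params.get("x")[0].
    from urllib.parse import parse_qs, urlparse

    uri = "docebo://?base_url=https://yourcompany.docebosaas.com&client_id=abc&client_secret=xyz"
    params = parse_qs(urlparse(uri).query)
    assert params["base_url"] == ["https://yourcompany.docebosaas.com"]
    assert params["client_id"][0] == "abc"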
2026
+
2027
+
1386
2028
  class GoogleAnalyticsSource:
1387
2029
  def handles_incrementality(self) -> bool:
1388
2030
  return True
1389
2031
 
1390
2032
  def dlt_source(self, uri: str, table: str, **kwargs):
1391
- parse_uri = urlparse(uri)
1392
- source_fields = parse_qs(parse_uri.query)
1393
- cred_path = source_fields.get("credentials_path")
1394
-
1395
- if not cred_path:
1396
- raise ValueError("credentials_path is required to connect Google Analytics")
1397
- credentials = {}
2033
+ import ingestr.src.google_analytics.helpers as helpers
1398
2034
 
1399
- with open(cred_path[0], "r") as f:
1400
- credentials = json.load(f)
2035
+ if kwargs.get("incremental_key"):
2036
+ raise ValueError(
2037
+ "Google Analytics takes care of incrementality on its own, you should not provide incremental_key"
2038
+ )
1401
2039
 
1402
- property_id = source_fields.get("property_id")
1403
- if not property_id:
1404
- raise ValueError("property_id is required to connect to Google Analytics")
2040
+ result = helpers.parse_google_analytics_uri(uri)
2041
+ credentials = result["credentials"]
2042
+ property_id = result["property_id"]
1405
2043
 
1406
2044
  fields = table.split(":")
1407
- if len(fields) != 3:
2045
+ if len(fields) != 3 and len(fields) != 4:
1408
2046
  raise ValueError(
1409
- "Invalid table format. Expected format: custom:<dimensions>:<metrics>"
2047
+ "Invalid table format. Expected format: <report_type>:<dimensions>:<metrics> or <report_type>:<dimensions>:<metrics>:<minute_ranges>"
1410
2048
  )
1411
2049
 
1412
- dimensions = fields[1].replace(" ", "").split(",")
1413
-
1414
- datetime = ""
1415
- for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
1416
- if dimension_datetime in dimensions:
1417
- datetime = dimension_datetime
1418
- break
1419
- else:
2050
+ report_type = fields[0]
2051
+ if report_type not in ["custom", "realtime"]:
1420
2052
  raise ValueError(
1421
- "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
2053
+ "Invalid report type. Expected format: <report_type>:<dimensions>:<metrics>. Available report types: custom, realtime"
1422
2054
  )
1423
2055
 
2056
+ dimensions = fields[1].replace(" ", "").split(",")
1424
2057
  metrics = fields[2].replace(" ", "").split(",")
2058
+
2059
+ minute_range_objects = []
2060
+ if len(fields) == 4:
2061
+ minute_range_objects = (
2062
+ helpers.convert_minutes_ranges_to_minute_range_objects(fields[3])
2063
+ )
2064
+
2065
+ datetime = ""
2066
+ resource_name = fields[0].lower()
2067
+ if resource_name == "custom":
2068
+ for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
2069
+ if dimension_datetime in dimensions:
2070
+ datetime = dimension_datetime
2071
+ break
2072
+ else:
2073
+ raise ValueError(
2074
+ "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
2075
+ )
2076
+
1425
2077
  queries = [
1426
- {"resource_name": "custom", "dimensions": dimensions, "metrics": metrics}
2078
+ {
2079
+ "resource_name": resource_name,
2080
+ "dimensions": dimensions,
2081
+ "metrics": metrics,
2082
+ }
1427
2083
  ]
1428
2084
 
1429
2085
  start_date = pendulum.now().subtract(days=30).start_of("day")
@@ -1434,14 +2090,17 @@ class GoogleAnalyticsSource:
1434
2090
  if kwargs.get("interval_end") is not None:
1435
2091
  end_date = pendulum.instance(kwargs.get("interval_end")) # type: ignore
1436
2092
 
2093
+ from ingestr.src.google_analytics import google_analytics
2094
+
1437
2095
  return google_analytics(
1438
- property_id=property_id[0],
2096
+ property_id=property_id,
1439
2097
  start_date=start_date,
1440
2098
  end_date=end_date,
1441
2099
  datetime_dimension=datetime,
1442
2100
  queries=queries,
1443
2101
  credentials=credentials,
1444
- ).with_resources("basic_report")
2102
+ minute_range_objects=minute_range_objects if minute_range_objects else None,
2103
+ ).with_resources(resource_name)
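A minimal sketch of how the <report_type>:<dimensions>:<metrics> table string above is decomposed; the dimension and metric names are illustrative.

    # Illustrative only: how the table argument is split into its parts.
    table = "custom:date,country:activeUsers,sessions"
    fields = table.split(":")
    report_type = fields[0]
    dimensions = fields[1].replace(" ", "").split(",")
    metrics = fields[2].replace(" ", "").split(",")
    assert report_type == "custom"
    assert dimensions == ["date", "country"]
    assert metrics == ["activeUsers", "sessions"]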
1445
2104
 
1446
2105
 
1447
2106
  class GitHubSource:
@@ -1471,12 +2130,34 @@ class GitHubSource:
1471
2130
 
1472
2131
  access_token = source_fields.get("access_token", [""])[0]
1473
2132
 
2133
+ from ingestr.src.github import (
2134
+ github_reactions,
2135
+ github_repo_events,
2136
+ github_stargazers,
2137
+ )
2138
+
1474
2139
  if table in ["issues", "pull_requests"]:
1475
2140
  return github_reactions(
1476
2141
  owner=owner, name=repo, access_token=access_token
1477
2142
  ).with_resources(table)
1478
2143
  elif table == "repo_events":
1479
- return github_repo_events(owner=owner, name=repo, access_token=access_token)
2144
+ start_date = kwargs.get("interval_start") or pendulum.now().subtract(
2145
+ days=30
2146
+ )
2147
+ end_date = kwargs.get("interval_end") or None
2148
+
2149
+ if isinstance(start_date, str):
2150
+ start_date = pendulum.parse(start_date)
2151
+ if isinstance(end_date, str):
2152
+ end_date = pendulum.parse(end_date)
2153
+
2154
+ return github_repo_events(
2155
+ owner=owner,
2156
+ name=repo,
2157
+ access_token=access_token,
2158
+ start_date=start_date,
2159
+ end_date=end_date,
2160
+ )
1480
2161
  elif table == "stargazers":
1481
2162
  return github_stargazers(owner=owner, name=repo, access_token=access_token)
1482
2163
  else:
@@ -1503,6 +2184,8 @@ class AppleAppStoreSource:
1503
2184
  else:
1504
2185
  key = base64.b64decode(key_base64[0]).decode() # type: ignore
1505
2186
 
2187
+ from ingestr.src.appstore.client import AppStoreConnectClient
2188
+
1506
2189
  return AppStoreConnectClient(key.encode(), key_id, issuer_id)
1507
2190
 
1508
2191
  def dlt_source(self, uri: str, table: str, **kwargs):
@@ -1543,6 +2226,8 @@ class AppleAppStoreSource:
1543
2226
  if app_ids is None:
1544
2227
  raise MissingValueError("app_id", "App Store")
1545
2228
 
2229
+ from ingestr.src.appstore import app_store
2230
+
1546
2231
  src = app_store(
1547
2232
  client,
1548
2233
  app_ids,
@@ -1599,21 +2284,24 @@ class GCSSource:
1599
2284
  # (The RECOMMENDED way of passing service account credentials)
1600
2285
  # directly with gcsfs. As a workaround, we construct the GCSFileSystem
1601
2286
  # and pass it directly to filesystem.readers.
2287
+ import gcsfs # type: ignore
2288
+
1602
2289
  fs = gcsfs.GCSFileSystem(
1603
2290
  token=credentials,
1604
2291
  )
1605
2292
 
1606
- file_extension = path_to_file.split(".")[-1]
1607
- if file_extension == "csv":
1608
- endpoint = "read_csv"
1609
- elif file_extension == "jsonl":
1610
- endpoint = "read_jsonl"
1611
- elif file_extension == "parquet":
1612
- endpoint = "read_parquet"
1613
- else:
2293
+ try:
2294
+ endpoint = blob.parse_endpoint(path_to_file)
2295
+ except blob.UnsupportedEndpointError:
1614
2296
  raise ValueError(
1615
2297
  "GCS Source only supports specific formats files: csv, jsonl, parquet"
1616
2298
  )
2299
+ except Exception as e:
2300
+ raise ValueError(
2301
+ f"Failed to parse endpoint from path: {path_to_file}"
2302
+ ) from e
2303
+
2304
+ from ingestr.src.filesystem import readers
1617
2305
 
1618
2306
  return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
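A sketch of the extension-to-endpoint mapping that blob.parse_endpoint presumably performs, based on the inline logic it replaces above; the helper name endpoint_for and the sample path are illustrative only.

    # Illustrative only: assumed equivalent of the removed inline mapping.
    def endpoint_for(path: str) -> str:
        mapping = {"csv": "read_csv", "jsonl": "read_jsonl", "parquet": "read_parquet"}
        ext = path.rsplit(".", 1)[-1]
        if ext not in mapping:
            raise ValueError("unsupported file format: " + ext)
        return mapping[ext]

    assert endpoint_for("events/2024/data.parquet") == "read_parquet"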
1619
2307
 
@@ -1622,7 +2310,9 @@ class GoogleAdsSource:
1622
2310
  def handles_incrementality(self) -> bool:
1623
2311
  return True
1624
2312
 
1625
- def init_client(self, params: Dict[str, List[str]]) -> GoogleAdsClient:
2313
+ def init_client(self, params: Dict[str, List[str]]):
2314
+ from google.ads.googleads.client import GoogleAdsClient # type: ignore
2315
+
1626
2316
  dev_token = params.get("dev_token")
1627
2317
  if dev_token is None or len(dev_token) == 0:
1628
2318
  raise MissingValueError("dev_token", "Google Ads")
@@ -1676,6 +2366,7 @@ class GoogleAdsSource:
1676
2366
  raise MissingValueError("customer_id", "Google Ads")
1677
2367
 
1678
2368
  params = parse_qs(parsed_uri.query)
2369
+
1679
2370
  client = self.init_client(params)
1680
2371
 
1681
2372
  start_date = kwargs.get("interval_start") or datetime.now(
@@ -1697,6 +2388,8 @@ class GoogleAdsSource:
1697
2388
  report_spec = table
1698
2389
  table = "daily_report"
1699
2390
 
2391
+ from ingestr.src.google_ads import google_ads
2392
+
1700
2393
  src = google_ads(
1701
2394
  client,
1702
2395
  customer_id,
@@ -1716,6 +2409,11 @@ class LinkedInAdsSource:
1716
2409
  return True
1717
2410
 
1718
2411
  def dlt_source(self, uri: str, table: str, **kwargs):
2412
+ if kwargs.get("incremental_key"):
2413
+ raise ValueError(
2414
+ "LinkedIn Ads takes care of incrementality on its own, you should not provide incremental_key"
2415
+ )
2416
+
1719
2417
  parsed_uri = urlparse(uri)
1720
2418
  source_fields = parse_qs(parsed_uri.query)
1721
2419
 
@@ -1761,6 +2459,12 @@ class LinkedInAdsSource:
1761
2459
  "'date' or 'month' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
1762
2460
  )
1763
2461
 
2462
+ from ingestr.src.linkedin_ads import linked_in_ads_source
2463
+ from ingestr.src.linkedin_ads.dimension_time_enum import (
2464
+ Dimension,
2465
+ TimeGranularity,
2466
+ )
2467
+
1764
2468
  if "date" in dimensions:
1765
2469
  time_granularity = TimeGranularity.daily
1766
2470
  dimensions.remove("date")
@@ -1788,6 +2492,46 @@ class LinkedInAdsSource:
1788
2492
  ).with_resources("custom_reports")
1789
2493
 
1790
2494
 
2495
+ class ClickupSource:
2496
+ def handles_incrementality(self) -> bool:
2497
+ return True
2498
+
2499
+ def dlt_source(self, uri: str, table: str, **kwargs):
2500
+ if kwargs.get("incremental_key"):
2501
+ raise ValueError(
2502
+ "ClickUp takes care of incrementality on its own, you should not provide incremental_key"
2503
+ )
2504
+
2505
+ parsed_uri = urlparse(uri)
2506
+ params = parse_qs(parsed_uri.query)
2507
+ api_token = params.get("api_token")
2508
+
2509
+ if api_token is None:
2510
+ raise MissingValueError("api_token", "ClickUp")
2511
+
2512
+ interval_start = kwargs.get("interval_start")
2513
+ interval_end = kwargs.get("interval_end")
2514
+ start_date = (
2515
+ ensure_pendulum_datetime(interval_start).in_timezone("UTC")
2516
+ if interval_start
2517
+ else pendulum.datetime(2020, 1, 1, tz="UTC")
2518
+ )
2519
+ end_date = (
2520
+ ensure_pendulum_datetime(interval_end).in_timezone("UTC")
2521
+ if interval_end
2522
+ else None
2523
+ )
2524
+
2525
+ from ingestr.src.clickup import clickup_source
2526
+
2527
+ if table not in {"user", "teams", "lists", "tasks", "spaces"}:
2528
+ raise UnsupportedResourceError(table, "ClickUp")
2529
+
2530
+ return clickup_source(
2531
+ api_token=api_token[0], start_date=start_date, end_date=end_date
2532
+ ).with_resources(table)
2533
+
2534
+
1791
2535
  class AppLovinSource:
1792
2536
  def handles_incrementality(self) -> bool:
1793
2537
  return True
@@ -1819,6 +2563,8 @@ class AppLovinSource:
1819
2563
  custom_report = table
1820
2564
  table = "custom_report"
1821
2565
 
2566
+ from ingestr.src.applovin import applovin_source
2567
+
1822
2568
  src = applovin_source(
1823
2569
  api_key[0],
1824
2570
  start_date.strftime("%Y-%m-%d"),
@@ -1833,20 +2579,25 @@ class AppLovinSource:
1833
2579
 
1834
2580
 
1835
2581
  class ApplovinMaxSource:
1836
- #expected uri format: applovinmax://?api_key=<api_key>
1837
- #expected table format: user_ad_revenue:app_id_1,app_id_2
2582
+ # expected uri format: applovinmax://?api_key=<api_key>
2583
+ # expected table format: user_ad_revenue:app_id_1,app_id_2
1838
2584
 
1839
2585
  def handles_incrementality(self) -> bool:
1840
2586
  return True
1841
2587
 
1842
2588
  def dlt_source(self, uri: str, table: str, **kwargs):
2589
+ if kwargs.get("incremental_key"):
2590
+ raise ValueError(
2591
+ "AppLovin Max takes care of incrementality on its own, you should not provide incremental_key"
2592
+ )
2593
+
1843
2594
  parsed_uri = urlparse(uri)
1844
2595
  params = parse_qs(parsed_uri.query)
1845
2596
 
1846
2597
  api_key = params.get("api_key")
1847
2598
  if api_key is None:
1848
2599
  raise ValueError("api_key is required to connect to AppLovin Max API.")
1849
-
2600
+
1850
2601
  AVAILABLE_TABLES = ["user_ad_revenue"]
1851
2602
 
1852
2603
  table_fields = table.split(":")
@@ -1856,7 +2607,7 @@ class ApplovinMaxSource:
1856
2607
  raise ValueError(
1857
2608
  "Invalid table format. Expected format is user_ad_revenue:app_id_1,app_id_2"
1858
2609
  )
1859
-
2610
+
1860
2611
  if requested_table not in AVAILABLE_TABLES:
1861
2612
  raise ValueError(
1862
2613
  f"Table name '{requested_table}' is not supported for AppLovin Max source yet."
@@ -1864,17 +2615,15 @@ class ApplovinMaxSource:
1864
2615
  "If you need additional tables, please create a GitHub issue at "
1865
2616
  "https://github.com/bruin-data/ingestr"
1866
2617
  )
1867
-
1868
- applications = [i for i in table_fields[1].replace(" ", "").split(",") if i.strip()]
2618
+
2619
+ applications = [
2620
+ i for i in table_fields[1].replace(" ", "").split(",") if i.strip()
2621
+ ]
1869
2622
  if len(applications) == 0:
1870
- raise ValueError(
1871
- "At least one application id is required"
1872
- )
1873
-
2623
+ raise ValueError("At least one application id is required")
2624
+
1874
2625
  if len(applications) != len(set(applications)):
1875
- raise ValueError(
1876
- "Application ids must be unique."
1877
- )
2626
+ raise ValueError("Application ids must be unique.")
1878
2627
 
1879
2628
  interval_start = kwargs.get("interval_start")
1880
2629
  interval_end = kwargs.get("interval_end")
@@ -1888,6 +2637,8 @@ class ApplovinMaxSource:
1888
2637
 
1889
2638
  end_date = interval_end.date() if interval_end is not None else None
1890
2639
 
2640
+ from ingestr.src.applovin_max import applovin_max_source
2641
+
1891
2642
  return applovin_max_source(
1892
2643
  start_date=start_date,
1893
2644
  end_date=end_date,
@@ -1911,13 +2662,21 @@ class SalesforceSource:
1911
2662
  "username": params.get("username", [None])[0],
1912
2663
  "password": params.get("password", [None])[0],
1913
2664
  "token": params.get("token", [None])[0],
2665
+ "domain": params.get("domain", [None])[0],
1914
2666
  }
1915
2667
  for k, v in creds.items():
1916
2668
  if v is None:
1917
2669
  raise MissingValueError(k, "Salesforce")
1918
2670
 
2671
+ from ingestr.src.salesforce import salesforce_source
2672
+
1919
2673
  src = salesforce_source(**creds) # type: ignore
1920
2674
 
2675
+ if table.startswith("custom:"):
2676
+ custom_object = table.split(":")[1]
2677
+ src = salesforce_source(**creds, custom_object=custom_object)
2678
+ return src.with_resources("custom")
2679
+
1921
2680
  if table not in src.resources:
1922
2681
  raise UnsupportedResourceError(table, "Salesforce")
1923
2682
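A small sketch of the custom-object table syntax handled above; the object name Invoice__c is hypothetical.

    # Illustrative only: "custom:<ObjectName>" routes to the "custom" resource.
    table = "custom:Invoice__c"
    if table.startswith("custom:"):
        custom_object = table.split(":")[1]
    assert custom_object == "Invoice__c"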
 
@@ -1930,6 +2689,11 @@ class PersonioSource:
1930
2689
 
1931
2690
  # personio://?client_id=123&client_secret=123
1932
2691
  def dlt_source(self, uri: str, table: str, **kwargs):
2692
+ if kwargs.get("incremental_key"):
2693
+ raise ValueError(
2694
+ "Personio takes care of incrementality on its own, you should not provide incremental_key"
2695
+ )
2696
+
1933
2697
  parsed_uri = urlparse(uri)
1934
2698
  params = parse_qs(parsed_uri.query)
1935
2699
 
@@ -1963,9 +2727,1933 @@ class PersonioSource:
1963
2727
  ]:
1964
2728
  raise UnsupportedResourceError(table, "Personio")
1965
2729
 
2730
+ from ingestr.src.personio import personio_source
2731
+
1966
2732
  return personio_source(
1967
2733
  client_id=client_id[0],
1968
2734
  client_secret=client_secret[0],
1969
2735
  start_date=interval_start_date,
1970
2736
  end_date=interval_end_date,
1971
2737
  ).with_resources(table)
2738
+
2739
+
2740
+ class KinesisSource:
2741
+ def handles_incrementality(self) -> bool:
2742
+ return True
2743
+
2744
+ def dlt_source(self, uri: str, table: str, **kwargs):
2745
+ # kinesis://?aws_access_key_id=<AccessKeyId>&aws_secret_access_key=<SecretAccessKey>&region_name=<Region>
2746
+ # source table = stream name
2747
+ parsed_uri = urlparse(uri)
2748
+ params = parse_qs(parsed_uri.query)
2749
+
2750
+ aws_access_key_id = params.get("aws_access_key_id")
2751
+ if aws_access_key_id is None:
2752
+ raise MissingValueError("aws_access_key_id", "Kinesis")
2753
+
2754
+ aws_secret_access_key = params.get("aws_secret_access_key")
2755
+ if aws_secret_access_key is None:
2756
+ raise MissingValueError("aws_secret_access_key", "Kinesis")
2757
+
2758
+ region_name = params.get("region_name")
2759
+ if region_name is None:
2760
+ raise MissingValueError("region_name", "Kinesis")
2761
+
2762
+ start_date = kwargs.get("interval_start")
2763
+ if start_date is not None:
2764
+ # the resource will read all messages after this timestamp.
2765
+ start_date = ensure_pendulum_datetime(start_date)
2766
+
2767
+ from dlt.common.configuration.specs import AwsCredentials
2768
+
2769
+ from ingestr.src.kinesis import kinesis_stream
2770
+
2771
+ credentials = AwsCredentials(
2772
+ aws_access_key_id=aws_access_key_id[0],
2773
+ aws_secret_access_key=aws_secret_access_key[0],
2774
+ region_name=region_name[0],
2775
+ )
2776
+
2777
+ return kinesis_stream(
2778
+ stream_name=table, credentials=credentials, initial_at_timestamp=start_date
2779
+ )
2780
+
2781
+
2782
+ class PipedriveSource:
2783
+ def handles_incrementality(self) -> bool:
2784
+ return True
2785
+
2786
+ def dlt_source(self, uri: str, table: str, **kwargs):
2787
+ if kwargs.get("incremental_key"):
2788
+ raise ValueError(
2789
+ "Pipedrive takes care of incrementality on its own, you should not provide incremental_key"
2790
+ )
2791
+
2792
+ parsed_uri = urlparse(uri)
2793
+ params = parse_qs(parsed_uri.query)
2794
+ api_key = params.get("api_token")
2795
+ if api_key is None:
2796
+ raise MissingValueError("api_token", "Pipedrive")
2797
+
2798
+ start_date = kwargs.get("interval_start")
2799
+ if start_date is not None:
2800
+ start_date = ensure_pendulum_datetime(start_date)
2801
+ else:
2802
+ start_date = pendulum.parse("2000-01-01")
2803
+
2804
+ if table not in [
2805
+ "users",
2806
+ "activities",
2807
+ "persons",
2808
+ "organizations",
2809
+ "products",
2810
+ "stages",
2811
+ "deals",
2812
+ ]:
2813
+ raise UnsupportedResourceError(table, "Pipedrive")
2814
+
2815
+ from ingestr.src.pipedrive import pipedrive_source
2816
+
2817
+ return pipedrive_source(
2818
+ pipedrive_api_key=api_key[0], since_timestamp=start_date
2819
+ ).with_resources(table)
2820
+
2821
+
2822
+ class FrankfurterSource:
2823
+ def handles_incrementality(self) -> bool:
2824
+ return True
2825
+
2826
+ def dlt_source(self, uri: str, table: str, **kwargs):
2827
+ if kwargs.get("incremental_key"):
2828
+ raise ValueError(
2829
+ "Frankfurter takes care of incrementality on its own, you should not provide incremental_key"
2830
+ )
2831
+
2832
+ from ingestr.src.frankfurter import frankfurter_source
2833
+ from ingestr.src.frankfurter.helpers import validate_currency, validate_dates
2834
+
2835
+ parsed_uri = urlparse(uri)
2836
+ source_params = parse_qs(parsed_uri.query)
2837
+ base_currency = source_params.get("base", [None])[0]
2838
+
2839
+ if not base_currency:
2840
+ base_currency = "USD"
2841
+
2842
+ validate_currency(base_currency)
2843
+
2844
+ if kwargs.get("interval_start"):
2845
+ start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
2846
+ else:
2847
+ start_date = pendulum.yesterday()
2848
+
2849
+ if kwargs.get("interval_end"):
2850
+ end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
2851
+ else:
2852
+ end_date = None
2853
+
2854
+ validate_dates(start_date=start_date, end_date=end_date)
2855
+
2856
+ src = frankfurter_source(
2857
+ start_date=start_date,
2858
+ end_date=end_date,
2859
+ base_currency=base_currency,
2860
+ )
2861
+
2862
+ if table not in src.resources:
2863
+ raise UnsupportedResourceError(table, "Frankfurter")
2864
+
2865
+ return src.with_resources(table)
2866
+
2867
+
2868
+ class FreshdeskSource:
2869
+ # freshdesk://domain?api_key=<api_key>
2870
+ def handles_incrementality(self) -> bool:
2871
+ return True
2872
+
2873
+ def dlt_source(self, uri: str, table: str, **kwargs):
2874
+ if kwargs.get("incremental_key"):
2875
+ raise ValueError(
2876
+ "Freshdesk takes care of incrementality on its own, you should not provide incremental_key"
2877
+ )
2878
+
2879
+ parsed_uri = urlparse(uri)
2880
+ domain = parsed_uri.netloc
2881
+ query = parsed_uri.query
2882
+ params = parse_qs(query)
2883
+
2884
+ if not domain:
2885
+ raise MissingValueError("domain", "Freshdesk")
2886
+
2887
+ if "." in domain:
2888
+ domain = domain.split(".")[0]
2889
+
2890
+ api_key = params.get("api_key")
2891
+ if api_key is None:
2892
+ raise MissingValueError("api_key", "Freshdesk")
2893
+
2894
+ start_date = kwargs.get("interval_start")
2895
+ if start_date is not None:
2896
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
2897
+ else:
2898
+ start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
2899
+
2900
+ end_date = kwargs.get("interval_end")
2901
+ if end_date is not None:
2902
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
2903
+ else:
2904
+ end_date = None
2905
+
2906
+ custom_query: Optional[str] = None
2907
+ if ":" in table:
2908
+ table, custom_query = table.split(":", 1)
2909
+
2910
+ if table not in [
2911
+ "agents",
2912
+ "companies",
2913
+ "contacts",
2914
+ "groups",
2915
+ "roles",
2916
+ "tickets",
2917
+ ]:
2918
+ raise UnsupportedResourceError(table, "Freshdesk")
2919
+
2920
+ if custom_query and table != "tickets":
2921
+ raise ValueError(f"Custom query is not supported for {table}")
2922
+
2923
+ from ingestr.src.freshdesk import freshdesk_source
2924
+
2925
+ return freshdesk_source(
2926
+ api_secret_key=api_key[0],
2927
+ domain=domain,
2928
+ start_date=start_date,
2929
+ end_date=end_date,
2930
+ query=custom_query,
2931
+ ).with_resources(table)
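A minimal sketch of the "tickets:<query>" form handled above, showing why maxsplit=1 keeps colons inside the query intact; the query string itself is hypothetical.

    # Illustrative only: only the first ":" separates the table from the query.
    table = "tickets:type:'Question' AND priority:3"   # hypothetical query
    custom_query = None
    if ":" in table:
        table, custom_query = table.split(":", 1)
    assert table == "tickets"
    assert custom_query == "type:'Question' AND priority:3"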
2932
+
2933
+
2934
+ class TrustpilotSource:
2935
+ # trustpilot://<business_unit_id>?api_key=<api_key>
2936
+ def handles_incrementality(self) -> bool:
2937
+ return True
2938
+
2939
+ def dlt_source(self, uri: str, table: str, **kwargs):
2940
+ if kwargs.get("incremental_key"):
2941
+ raise ValueError(
2942
+ "Trustpilot takes care of incrementality on its own, you should not provide incremental_key"
2943
+ )
2944
+
2945
+ parsed_uri = urlparse(uri)
2946
+ business_unit_id = parsed_uri.netloc
2947
+ params = parse_qs(parsed_uri.query)
2948
+
2949
+ if not business_unit_id:
2950
+ raise MissingValueError("business_unit_id", "Trustpilot")
2951
+
2952
+ api_key = params.get("api_key")
2953
+ if api_key is None:
2954
+ raise MissingValueError("api_key", "Trustpilot")
2955
+
2956
+ start_date = kwargs.get("interval_start")
2957
+ if start_date is None:
2958
+ start_date = ensure_pendulum_datetime("2000-01-01").in_tz("UTC").isoformat()
2959
+ else:
2960
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC").isoformat()
2961
+
2962
+ end_date = kwargs.get("interval_end")
2963
+
2964
+ if end_date is not None:
2965
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC").isoformat()
2966
+
2967
+ if table not in ["reviews"]:
2968
+ raise UnsupportedResourceError(table, "Trustpilot")
2969
+
2970
+ from ingestr.src.trustpilot import trustpilot_source
2971
+
2972
+ return trustpilot_source(
2973
+ business_unit_id=business_unit_id,
2974
+ api_key=api_key[0],
2975
+ start_date=start_date,
2976
+ end_date=end_date,
2977
+ ).with_resources(table)
2978
+
2979
+
2980
+ class PhantombusterSource:
2981
+ def handles_incrementality(self) -> bool:
2982
+ return True
2983
+
2984
+ def dlt_source(self, uri: str, table: str, **kwargs):
2985
+ if kwargs.get("incremental_key"):
2986
+ raise ValueError(
2987
+ "Phantombuster takes care of incrementality on its own, you should not provide incremental_key"
2988
+ )
2989
+
2990
+ # phantombuster://?api_key=<api_key>
2991
+ # source table = completed_phantoms:agent_id
2992
+ parsed_uri = urlparse(uri)
2993
+ params = parse_qs(parsed_uri.query)
2994
+ api_key = params.get("api_key")
2995
+ if api_key is None:
2996
+ raise MissingValueError("api_key", "Phantombuster")
2997
+
2998
+ table_fields = table.replace(" ", "").split(":")
2999
+ table_name = table_fields[0]
3000
+
3001
+ agent_id = table_fields[1] if len(table_fields) > 1 else None
3002
+
3003
+ if table_name not in ["completed_phantoms"]:
3004
+ raise UnsupportedResourceError(table_name, "Phantombuster")
3005
+
3006
+ if not agent_id:
3007
+ raise MissingValueError("agent_id", "Phantombuster")
3008
+
3009
+ start_date = kwargs.get("interval_start")
3010
+ if start_date is None:
3011
+ start_date = ensure_pendulum_datetime("2018-01-01").in_tz("UTC")
3012
+ else:
3013
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3014
+
3015
+ end_date = kwargs.get("interval_end")
3016
+ if end_date is not None:
3017
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3018
+
3019
+ from ingestr.src.phantombuster import phantombuster_source
3020
+
3021
+ return phantombuster_source(
3022
+ api_key=api_key[0],
3023
+ agent_id=agent_id,
3024
+ start_date=start_date,
3025
+ end_date=end_date,
3026
+ ).with_resources(table_name)
3027
+
3028
+
3029
+ class ElasticsearchSource:
3030
+ def handles_incrementality(self) -> bool:
3031
+ return False
3032
+
3033
+ def dlt_source(self, uri: str, table: str, **kwargs):
3034
+ from ingestr.src.elasticsearch import elasticsearch_source
3035
+
3036
+ incremental = None
3037
+ if kwargs.get("incremental_key"):
3038
+ start_value = kwargs.get("interval_start")
3039
+ end_value = kwargs.get("interval_end")
3040
+
3041
+ incremental = dlt_incremental(
3042
+ kwargs.get("incremental_key", ""),
3043
+ initial_value=start_value,
3044
+ end_value=end_value,
3045
+ range_end="closed",
3046
+ range_start="closed",
3047
+ )
3048
+
3049
+ # elasticsearch://localhost:9200?secure=true&verify_certs=false
3050
+ parsed = urlparse(uri)
3051
+
3052
+ index = table
3053
+ if not index:
3054
+ raise ValueError(
3055
+ "Table name must be provided which is the index name in elasticsearch"
3056
+ )
3057
+
3058
+ query_params = parsed.query
3059
+ params = parse_qs(query_params)
3060
+
3061
+ secure = True
3062
+ if "secure" in params:
3063
+ secure = params["secure"][0].capitalize() == "True"
3064
+
3065
+ verify_certs = True
3066
+ if "verify_certs" in params:
3067
+ verify_certs = params["verify_certs"][0].capitalize() == "True"
3068
+
3069
+ scheme = "https" if secure else "http"
3070
+ netloc = parsed.netloc
3071
+ connection_url = f"{scheme}://{netloc}"
3072
+
3073
+ return elasticsearch_source(
3074
+ connection_url=connection_url,
3075
+ index=index,
3076
+ verify_certs=verify_certs,
3077
+ incremental=incremental,
3078
+ ).with_resources(table)
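A minimal sketch of how the secure and verify_certs query flags above are interpreted; the URI mirrors the comment in the code, and the host is a placeholder.

    # Illustrative only: mirrors the boolean parsing and scheme selection above.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse("elasticsearch://localhost:9200?secure=false&verify_certs=false")
    params = parse_qs(parsed.query)
    secure = params["secure"][0].capitalize() == "True"              # False
    verify_certs = params["verify_certs"][0].capitalize() == "True"  # False
    connection_url = ("https" if secure else "http") + "://" + parsed.netloc
    assert connection_url == "http://localhost:9200"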
3079
+
3080
+
3081
+ class AttioSource:
3082
+ def handles_incrementality(self) -> bool:
3083
+ return False
3084
+
3085
+ def dlt_source(self, uri: str, table: str, **kwargs):
3086
+ parsed_uri = urlparse(uri)
3087
+ query_params = parse_qs(parsed_uri.query)
3088
+ api_key = query_params.get("api_key")
3089
+
3090
+ if api_key is None:
3091
+ raise MissingValueError("api_key", "Attio")
3092
+
3093
+ parts = table.replace(" ", "").split(":")
3094
+ table_name = parts[0]
3095
+ params = parts[1:]
3096
+
3097
+ from ingestr.src.attio import attio_source
3098
+
3099
+ try:
3100
+ return attio_source(api_key=api_key[0], params=params).with_resources(
3101
+ table_name
3102
+ )
3103
+ except ResourcesNotFoundError:
3104
+ raise UnsupportedResourceError(table_name, "Attio")
3105
+
3106
+
3107
+ class SmartsheetSource:
3108
+ def handles_incrementality(self) -> bool:
3109
+ return False
3110
+
3111
+ # smartsheet://?access_token=<access_token>
3112
+ def dlt_source(self, uri: str, table: str, **kwargs):
3113
+ if kwargs.get("incremental_key"):
3114
+ raise ValueError("Incremental loads are not supported for Smartsheet")
3115
+
3116
+ if not table:
3117
+ raise ValueError(
3118
+ "Source table (sheet_id) is required to connect to Smartsheet"
3119
+ )
3120
+
3121
+ source_parts = urlparse(uri)
3122
+ source_fields = parse_qs(source_parts.query)
3123
+ access_token = source_fields.get("access_token")
3124
+
3125
+ if not access_token:
3126
+ raise ValueError(
3127
+ "access_token in the URI is required to connect to Smartsheet"
3128
+ )
3129
+
3130
+ from ingestr.src.smartsheets import smartsheet_source
3131
+
3132
+ return smartsheet_source(
3133
+ access_token=access_token[0],
3134
+ sheet_id=table, # table is now a single sheet_id
3135
+ )
3136
+
3137
+
3138
+ class SolidgateSource:
3139
+ def handles_incrementality(self) -> bool:
3140
+ return True
3141
+
3142
+ def dlt_source(self, uri: str, table: str, **kwargs):
3143
+ if kwargs.get("incremental_key"):
3144
+ raise ValueError(
3145
+ "Solidgate takes care of incrementality on its own, you should not provide incremental_key"
3146
+ )
3147
+
3148
+ parsed_uri = urlparse(uri)
3149
+ query_params = parse_qs(parsed_uri.query)
3150
+ public_key = query_params.get("public_key")
3151
+ secret_key = query_params.get("secret_key")
3152
+
3153
+ if public_key is None:
3154
+ raise MissingValueError("public_key", "Solidgate")
3155
+
3156
+ if secret_key is None:
3157
+ raise MissingValueError("secret_key", "Solidgate")
3158
+
3159
+ table_name = table.replace(" ", "")
3160
+
3161
+ start_date = kwargs.get("interval_start")
3162
+ if start_date is None:
3163
+ start_date = pendulum.yesterday().in_tz("UTC")
3164
+ else:
3165
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3166
+
3167
+ end_date = kwargs.get("interval_end")
3168
+
3169
+ if end_date is not None:
3170
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3171
+
3172
+ from ingestr.src.solidgate import solidgate_source
3173
+
3174
+ try:
3175
+ return solidgate_source(
3176
+ public_key=public_key[0],
3177
+ secret_key=secret_key[0],
3178
+ start_date=start_date,
3179
+ end_date=end_date,
3180
+ ).with_resources(table_name)
3181
+ except ResourcesNotFoundError:
3182
+ raise UnsupportedResourceError(table_name, "Solidgate")
3183
+
3184
+
3185
+ class SFTPSource:
3186
+ def handles_incrementality(self) -> bool:
3187
+ return True
3188
+
3189
+ def dlt_source(self, uri: str, table: str, **kwargs):
3190
+ parsed_uri = urlparse(uri)
3191
+ host = parsed_uri.hostname
3192
+ if not host:
3193
+ raise MissingValueError("host", "SFTP URI")
3194
+ port = parsed_uri.port or 22
3195
+ username = parsed_uri.username
3196
+ password = parsed_uri.password
3197
+
3198
+ params: Dict[str, Any] = {
3199
+ "host": host,
3200
+ "port": port,
3201
+ "username": username,
3202
+ "password": password,
3203
+ "look_for_keys": False,
3204
+ "allow_agent": False,
3205
+ }
3206
+
3207
+ try:
3208
+ fs = fsspec.filesystem("sftp", **params)
3209
+ except Exception as e:
3210
+ raise ConnectionError(
3211
+ f"Failed to connect or authenticate to sftp server {host}:{port}. Error: {e}"
3212
+ )
3213
+ bucket_url = f"sftp://{host}:{port}"
3214
+
3215
+ if table.startswith("/"):
3216
+ file_glob = table
3217
+ else:
3218
+ file_glob = f"/{table}"
3219
+
3220
+ try:
3221
+ endpoint = blob.parse_endpoint(table)
3222
+ except blob.UnsupportedEndpointError:
3223
+ raise ValueError(
3224
+ "SFTP Source only supports specific formats files: csv, jsonl, parquet"
3225
+ )
3226
+ except Exception as e:
3227
+ raise ValueError(f"Failed to parse endpoint from path: {table}") from e
3228
+
3229
+ from ingestr.src.filesystem import readers
3230
+
3231
+ dlt_source_resource = readers(bucket_url, fs, file_glob)
3232
+ return dlt_source_resource.with_resources(endpoint)
3233
+
3234
+
3235
+ class QuickBooksSource:
3236
+ def handles_incrementality(self) -> bool:
3237
+ return True
3238
+
3239
+ # quickbooks://?company_id=<company_id>&client_id=<client_id>&client_secret=<client_secret>&refresh_token=<refresh>&access_token=<access_token>&environment=<env>&minor_version=<version>
3240
+ def dlt_source(self, uri: str, table: str, **kwargs):
3241
+ if kwargs.get("incremental_key"):
3242
+ raise ValueError(
3243
+ "QuickBooks takes care of incrementality on its own, you should not provide incremental_key"
3244
+ )
3245
+
3246
+ parsed_uri = urlparse(uri)
3247
+
3248
+ params = parse_qs(parsed_uri.query)
3249
+ company_id = params.get("company_id")
3250
+ client_id = params.get("client_id")
3251
+ client_secret = params.get("client_secret")
3252
+ refresh_token = params.get("refresh_token")
3253
+ environment = params.get("environment", ["production"])
3254
+ minor_version = params.get("minor_version", [None])
3255
+
3256
+ if not client_id or not client_id[0].strip():
3257
+ raise MissingValueError("client_id", "QuickBooks")
3258
+
3259
+ if not client_secret or not client_secret[0].strip():
3260
+ raise MissingValueError("client_secret", "QuickBooks")
3261
+
3262
+ if not refresh_token or not refresh_token[0].strip():
3263
+ raise MissingValueError("refresh_token", "QuickBooks")
3264
+
3265
+ if not company_id or not company_id[0].strip():
3266
+ raise MissingValueError("company_id", "QuickBooks")
3267
+
3268
+ if environment[0] not in ["production", "sandbox"]:
3269
+ raise ValueError(
3270
+ "Invalid environment. Must be either 'production' or 'sandbox'."
3271
+ )
3272
+
3273
+ from ingestr.src.quickbooks import quickbooks_source
3274
+
3275
+ table_name = table.replace(" ", "")
3276
+ table_mapping = {
3277
+ "customers": "customer",
3278
+ "invoices": "invoice",
3279
+ "accounts": "account",
3280
+ "vendors": "vendor",
3281
+ "payments": "payment",
3282
+ }
3283
+ if table_name in table_mapping:
3284
+ table_name = table_mapping[table_name]
3285
+
3286
+ start_date = kwargs.get("interval_start")
3287
+ if start_date is None:
3288
+ start_date = ensure_pendulum_datetime("2025-01-01").in_tz("UTC")
3289
+ else:
3290
+ start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
3291
+
3292
+ end_date = kwargs.get("interval_end")
3293
+
3294
+ if end_date is not None:
3295
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3296
+
3297
+ return quickbooks_source(
3298
+ company_id=company_id[0],
3299
+ start_date=start_date,
3300
+ end_date=end_date,
3301
+ client_id=client_id[0],
3302
+ client_secret=client_secret[0],
3303
+ refresh_token=refresh_token[0],
3304
+ environment=environment[0],
3305
+ minor_version=minor_version[0],
3306
+ object=table_name,
3307
+ ).with_resources(table_name)
3308
+
3309
+
3310
+ class IsocPulseSource:
3311
+ def handles_incrementality(self) -> bool:
3312
+ return True
3313
+
3314
+ def dlt_source(self, uri: str, table: str, **kwargs):
3315
+ if kwargs.get("incremental_key"):
3316
+ raise ValueError(
3317
+ "Internet Society Pulse takes care of incrementality on its own, you should not provide incremental_key"
3318
+ )
3319
+
3320
+ parsed_uri = urlparse(uri)
3321
+ params = parse_qs(parsed_uri.query)
3322
+ token = params.get("token")
3323
+ if not token or not token[0].strip():
3324
+ raise MissingValueError("token", "Internet Society Pulse")
3325
+
3326
+ start_date = kwargs.get("interval_start")
3327
+ if start_date is None:
3328
+ start_date = pendulum.now().in_tz("UTC").subtract(days=30)
3329
+
3330
+ end_date = kwargs.get("interval_end")
3331
+
3332
+ metric = table
3333
+ opts = []
3334
+ if ":" in metric:
3335
+ metric, *opts = metric.strip().split(":")
3336
+ opts = [opt.strip() for opt in opts]
3337
+
3338
+ from ingestr.src.isoc_pulse import pulse_source
3339
+
3340
+ src = pulse_source(
3341
+ token=token[0],
3342
+ start_date=start_date.strftime("%Y-%m-%d"),
3343
+ end_date=end_date.strftime("%Y-%m-%d") if end_date else None,
3344
+ metric=metric,
3345
+ opts=opts,
3346
+ )
3347
+ return src.with_resources(metric)
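A minimal sketch of the "metric:opt:..." star-unpacking above; the metric name and options are hypothetical.

    # Illustrative only: the first segment is the metric, the rest are options.
    metric = "https:IN:v6"   # hypothetical metric with two options
    opts = []
    if ":" in metric:
        metric, *opts = metric.strip().split(":")
        opts = [opt.strip() for opt in opts]
    assert metric == "https"
    assert opts == ["IN", "v6"]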
3348
+
3349
+
3350
+ class PinterestSource:
3351
+ def handles_incrementality(self) -> bool:
3352
+ return True
3353
+
3354
+ def dlt_source(self, uri: str, table: str, **kwargs):
3355
+ if kwargs.get("incremental_key"):
3356
+ raise ValueError(
3357
+ "Pinterest takes care of incrementality on its own, you should not provide incremental_key"
3358
+ )
3359
+
3360
+ parsed = urlparse(uri)
3361
+ params = parse_qs(parsed.query)
3362
+ access_token = params.get("access_token")
3363
+
3364
+ if not access_token:
3365
+ raise MissingValueError("access_token", "Pinterest")
3366
+
3367
+ start_date = kwargs.get("interval_start")
3368
+ if start_date is not None:
3369
+ start_date = ensure_pendulum_datetime(start_date)
3370
+ else:
3371
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3372
+
3373
+ end_date = kwargs.get("interval_end")
3374
+ if end_date is not None:
3375
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3376
+
3377
+ from ingestr.src.pinterest import pinterest_source
3378
+
3379
+ if table not in {"pins", "boards"}:
3380
+ raise UnsupportedResourceError(table, "Pinterest")
3381
+
3382
+ return pinterest_source(
3383
+ access_token=access_token[0],
3384
+ start_date=start_date,
3385
+ end_date=end_date,
3386
+ ).with_resources(table)
3387
+
3388
+
3389
+ class FluxxSource:
3390
+ def handles_incrementality(self) -> bool:
3391
+ return True
3392
+
3393
+ def dlt_source(self, uri: str, table: str, **kwargs):
3394
+ if kwargs.get("incremental_key"):
3395
+ raise ValueError(
3396
+ "Fluxx takes care of incrementality on its own, you should not provide incremental_key"
3397
+ )
3398
+
3399
+ # Parse URI: fluxx://instance?client_id=xxx&client_secret=xxx
3400
+ parsed_uri = urlparse(uri)
3401
+ source_params = parse_qs(parsed_uri.query)
3402
+
3403
+ instance = parsed_uri.hostname
3404
+ if not instance:
3405
+ raise ValueError(
3406
+ "Instance is required in the URI (e.g., fluxx://mycompany.preprod)"
3407
+ )
3408
+
3409
+ client_id = source_params.get("client_id")
3410
+ if not client_id:
3411
+ raise ValueError("client_id in the URI is required to connect to Fluxx")
3412
+
3413
+ client_secret = source_params.get("client_secret")
3414
+ if not client_secret:
3415
+ raise ValueError("client_secret in the URI is required to connect to Fluxx")
3416
+
3417
+ # Parse date parameters
3418
+ start_date = kwargs.get("interval_start")
3419
+ if start_date:
3420
+ start_date = ensure_pendulum_datetime(start_date)
3421
+
3422
+ end_date = kwargs.get("interval_end")
3423
+ if end_date:
3424
+ end_date = ensure_pendulum_datetime(end_date)
3425
+
3426
+ # Import Fluxx source
3427
+ from ingestr.src.fluxx import fluxx_source
3428
+
3429
+ # Parse table specification for custom column selection
3430
+ # Format: "resource_name:field1,field2,field3" or "resource_name"
3431
+ resources = None
3432
+ custom_fields = {}
3433
+
3434
+ if table:
3435
+ # Handle single resource with custom fields or multiple resources
3436
+ if ":" in table and table.count(":") == 1:
3437
+ # Single resource with custom fields: "grant_request:id,name,amount"
3438
+ resource_name, field_list = table.split(":", 1)
3439
+ resource_name = resource_name.strip()
3440
+ fields = [f.strip() for f in field_list.split(",")]
3441
+ resources = [resource_name]
3442
+ custom_fields[resource_name] = fields
3443
+ else:
3444
+ # Multiple resources or single resource without custom fields
3445
+ # Support comma-separated list: "grant_request,user"
3446
+ resources = [r.strip() for r in table.split(",")]
3447
+
3448
+ return fluxx_source(
3449
+ instance=instance,
3450
+ client_id=client_id[0],
3451
+ client_secret=client_secret[0],
3452
+ start_date=start_date,
3453
+ end_date=end_date,
3454
+ resources=resources,
3455
+ custom_fields=custom_fields,
3456
+ )
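A minimal sketch of the two table formats accepted above, using the resource names from the comments; the field names are illustrative.

    # Illustrative only: single resource with a field list vs. comma-separated resources.
    def parse_table(table):
        if ":" in table and table.count(":") == 1:
            resource_name, field_list = table.split(":", 1)
            fields = [f.strip() for f in field_list.split(",")]
            return [resource_name.strip()], {resource_name.strip(): fields}
        return [r.strip() for r in table.split(",")], {}

    assert parse_table("grant_request:id,name,amount") == (
        ["grant_request"], {"grant_request": ["id", "name", "amount"]}
    )
    assert parse_table("grant_request,user") == (["grant_request", "user"], {})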
3457
+
3458
+
3459
+ class LinearSource:
3460
+ def handles_incrementality(self) -> bool:
3461
+ return True
3462
+
3463
+ def dlt_source(self, uri: str, table: str, **kwargs):
3464
+ if kwargs.get("incremental_key"):
3465
+ raise ValueError(
3466
+ "Linear takes care of incrementality on its own, you should not provide incremental_key"
3467
+ )
3468
+
3469
+ parsed_uri = urlparse(uri)
3470
+ params = parse_qs(parsed_uri.query)
3471
+ api_key = params.get("api_key")
3472
+ if api_key is None:
3473
+ raise MissingValueError("api_key", "Linear")
3474
+
3475
+ if table not in [
3476
+ "issues",
3477
+ "projects",
3478
+ "teams",
3479
+ "users",
3480
+ "workflow_states",
3481
+ "cycles",
3482
+ "attachments",
3483
+ "comments",
3484
+ "documents",
3485
+ "external_users",
3486
+ "initiative",
3487
+ "integrations",
3488
+ "labels",
3489
+ "organization",
3490
+ "project_updates",
3491
+ "team_memberships",
3492
+ "initiative_to_project",
3493
+ "project_milestone",
3494
+ "project_status",
3495
+ ]:
3496
+ raise UnsupportedResourceError(table, "Linear")
3497
+
3498
+ start_date = kwargs.get("interval_start")
3499
+ if start_date is not None:
3500
+ start_date = ensure_pendulum_datetime(start_date)
3501
+ else:
3502
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3503
+
3504
+ end_date = kwargs.get("interval_end")
3505
+ if end_date is not None:
3506
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3507
+
3508
+ from ingestr.src.linear import linear_source
3509
+
3510
+ return linear_source(
3511
+ api_key=api_key[0],
3512
+ start_date=start_date,
3513
+ end_date=end_date,
3514
+ ).with_resources(table)
3515
+
3516
+
3517
+ class RevenueCatSource:
3518
+ def handles_incrementality(self) -> bool:
3519
+ return True
3520
+
3521
+ def dlt_source(self, uri: str, table: str, **kwargs):
3522
+ if kwargs.get("incremental_key"):
3523
+ raise ValueError(
3524
+ "RevenueCat takes care of incrementality on its own, you should not provide incremental_key"
3525
+ )
3526
+
3527
+ parsed_uri = urlparse(uri)
3528
+ params = parse_qs(parsed_uri.query)
3529
+
3530
+ api_key = params.get("api_key")
3531
+ if api_key is None:
3532
+ raise MissingValueError("api_key", "RevenueCat")
3533
+
3534
+ project_id = params.get("project_id")
3535
+ if project_id is None and table != "projects":
3536
+ raise MissingValueError("project_id", "RevenueCat")
3537
+
3538
+ if table not in [
3539
+ "customers",
3540
+ "products",
3541
+ "entitlements",
3542
+ "offerings",
3543
+ "subscriptions",
3544
+ "purchases",
3545
+ "projects",
3546
+ ]:
3547
+ raise UnsupportedResourceError(table, "RevenueCat")
3548
+
3549
+ start_date = kwargs.get("interval_start")
3550
+ if start_date is not None:
3551
+ start_date = ensure_pendulum_datetime(start_date)
3552
+ else:
3553
+ start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
3554
+
3555
+ end_date = kwargs.get("interval_end")
3556
+ if end_date is not None:
3557
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3558
+
3559
+ from ingestr.src.revenuecat import revenuecat_source
3560
+
3561
+ return revenuecat_source(
3562
+ api_key=api_key[0],
3563
+ project_id=project_id[0] if project_id is not None else None,
3564
+ ).with_resources(table)
3565
+
3566
+
3567
+ class ZoomSource:
3568
+ def handles_incrementality(self) -> bool:
3569
+ return True
3570
+
3571
+ def dlt_source(self, uri: str, table: str, **kwargs):
3572
+ if kwargs.get("incremental_key"):
3573
+ raise ValueError(
3574
+ "Zoom takes care of incrementality on its own, you should not provide incremental_key"
3575
+ )
3576
+
3577
+ parsed = urlparse(uri)
3578
+ params = parse_qs(parsed.query)
3579
+ client_id = params.get("client_id")
3580
+ client_secret = params.get("client_secret")
3581
+ account_id = params.get("account_id")
3582
+
3583
+ if not (client_id and client_secret and account_id):
3584
+ raise MissingValueError(
3585
+ "client_id/client_secret/account_id",
3586
+ "Zoom",
3587
+ )
3588
+
3589
+ start_date = kwargs.get("interval_start")
3590
+ if start_date is not None:
3591
+ start_date = ensure_pendulum_datetime(start_date)
3592
+ else:
3593
+ start_date = pendulum.datetime(2020, 1, 26).in_tz("UTC")
3594
+
3595
+ end_date = kwargs.get("interval_end")
3596
+ if end_date is not None:
3597
+ end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
3598
+
3599
+ from ingestr.src.zoom import zoom_source
3600
+
3601
+ if table not in {"meetings", "users", "participants"}:
3602
+ raise UnsupportedResourceError(table, "Zoom")
3603
+
3604
+ return zoom_source(
3605
+ client_id=client_id[0],
3606
+ client_secret=client_secret[0],
3607
+ account_id=account_id[0],
3608
+ start_date=start_date,
3609
+ end_date=end_date,
3610
+ ).with_resources(table)
3611
+
3612
+
3613
+ class InfluxDBSource:
3614
+ def handles_incrementality(self) -> bool:
3615
+ return True
3616
+
3617
+ def dlt_source(self, uri: str, table: str, **kwargs):
3618
+ if kwargs.get("incremental_key"):
3619
+ raise ValueError(
3620
+ "InfluxDB takes care of incrementality on its own, you should not provide incremental_key"
3621
+ )
3622
+
3623
+ parsed_uri = urlparse(uri)
3624
+ params = parse_qs(parsed_uri.query)
3625
+ host = parsed_uri.hostname
3626
+ port = parsed_uri.port
3627
+
3628
+ secure = params.get("secure", ["true"])[0].lower() != "false"
3629
+ scheme = "https" if secure else "http"
3630
+
3631
+ if port:
3632
+ host_url = f"{scheme}://{host}:{port}"
3633
+ else:
3634
+ host_url = f"{scheme}://{host}"
3635
+
3636
+ token = params.get("token")
3637
+ org = params.get("org")
3638
+ bucket = params.get("bucket")
3639
+
3640
+ if not host:
3641
+ raise MissingValueError("host", "InfluxDB")
3642
+ if not token:
3643
+ raise MissingValueError("token", "InfluxDB")
3644
+ if not org:
3645
+ raise MissingValueError("org", "InfluxDB")
3646
+ if not bucket:
3647
+ raise MissingValueError("bucket", "InfluxDB")
3648
+
3649
+ start_date = kwargs.get("interval_start")
3650
+ if start_date is not None:
3651
+ start_date = ensure_pendulum_datetime(start_date)
3652
+ else:
3653
+ start_date = pendulum.datetime(2024, 1, 1).in_tz("UTC")
3654
+
3655
+ end_date = kwargs.get("interval_end")
3656
+ if end_date is not None:
3657
+ end_date = ensure_pendulum_datetime(end_date)
3658
+
3659
+ from ingestr.src.influxdb import influxdb_source
3660
+
3661
+ return influxdb_source(
3662
+ measurement=table,
3663
+ host=host_url,
3664
+ org=org[0],
3665
+ bucket=bucket[0],
3666
+ token=token[0],
3667
+ secure=secure,
3668
+ start_date=start_date,
3669
+ end_date=end_date,
3670
+ ).with_resources(table)
3671
+
3672
+
3673
+ class WiseSource:
3674
+ def handles_incrementality(self) -> bool:
3675
+ return True
3676
+
3677
+ def dlt_source(self, uri: str, table: str, **kwargs):
3678
+ parsed = urlparse(uri)
3679
+ params = parse_qs(parsed.query)
3680
+ api_key = params.get("api_key")
3681
+
3682
+ if not api_key:
3683
+ raise MissingValueError("api_key", "Wise")
3684
+
3685
+ if table not in ["profiles", "transfers", "balances"]:
3686
+ raise ValueError(
3687
+ f"Resource '{table}' is not supported for Wise source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
3688
+ )
3689
+
3690
+ start_date = kwargs.get("interval_start")
3691
+ if start_date:
3692
+ start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
3693
+ else:
3694
+ start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
3695
+
3696
+ end_date = kwargs.get("interval_end")
3697
+ if end_date:
3698
+ end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
3699
+ else:
3700
+ end_date = None
3701
+
3702
+ from ingestr.src.wise import wise_source
3703
+
3704
+ return wise_source(
3705
+ api_key=api_key[0],
3706
+ start_date=start_date,
3707
+ end_date=end_date,
3708
+ ).with_resources(table)
3709
+
3710
+
3711
+ class FundraiseupSource:
3712
+ def handles_incrementality(self) -> bool:
3713
+ return True
3714
+
3715
+ def dlt_source(self, uri: str, table: str, **kwargs):
3716
+ parsed_uri = urlparse(uri)
3717
+ params = parse_qs(parsed_uri.query)
3718
+
3719
+ api_key = params.get("api_key")
3720
+ if api_key is None:
3721
+ raise MissingValueError("api_key", "Fundraiseup")
3722
+
3723
+ from ingestr.src.fundraiseup import fundraiseup_source
3724
+
3725
+ src = fundraiseup_source(api_key=api_key[0])
3726
+ if table not in src.resources:
3727
+ raise UnsupportedResourceError(table, "Fundraiseup")
3728
+ return src.with_resources(table)
3729
+
3730
+
3731
+ class AnthropicSource:
3732
+ def handles_incrementality(self) -> bool:
3733
+ return True
3734
+
3735
+ def dlt_source(self, uri: str, table: str, **kwargs):
3736
+ # anthropic://?api_key=<admin_api_key>
3737
+ parsed_uri = urlparse(uri)
3738
+ params = parse_qs(parsed_uri.query)
3739
+
3740
+ api_key = params.get("api_key")
3741
+ if api_key is None:
3742
+ raise MissingValueError("api_key", "Anthropic")
3743
+
3744
+ if table not in [
3745
+ "claude_code_usage",
3746
+ "usage_report",
3747
+ "cost_report",
3748
+ "organization",
3749
+ "workspaces",
3750
+ "api_keys",
3751
+ "invites",
3752
+ "users",
3753
+ "workspace_members",
3754
+ ]:
3755
+ raise UnsupportedResourceError(table, "Anthropic")
3756
+
3757
+ # Get start and end dates from kwargs
3758
+ start_date = kwargs.get("interval_start")
3759
+ if start_date:
3760
+ start_date = ensure_pendulum_datetime(start_date)
3761
+ else:
3762
+ # Default to 2023-01-01
3763
+ start_date = pendulum.datetime(2023, 1, 1)
3764
+
3765
+ end_date = kwargs.get("interval_end")
3766
+ if end_date:
3767
+ end_date = ensure_pendulum_datetime(end_date)
3768
+ else:
3769
+ end_date = None
3770
+
3771
+ from ingestr.src.anthropic import anthropic_source
3772
+
3773
+ return anthropic_source(
3774
+ api_key=api_key[0],
3775
+ initial_start_date=start_date,
3776
+ end_date=end_date,
3777
+ ).with_resources(table)
3778
+
3779
+
3780
+ class PlusVibeAISource:
3781
+ resources = [
3782
+ "campaigns",
3783
+ "leads",
3784
+ "email_accounts",
3785
+ "emails",
3786
+ "blocklist",
3787
+ "webhooks",
3788
+ "tags",
3789
+ ]
3790
+
3791
+ def handles_incrementality(self) -> bool:
3792
+ return True
3793
+
3794
+ def dlt_source(self, uri: str, table: str, **kwargs):
3795
+ # plusvibeai://?api_key=<key>&workspace_id=<id>
3796
+ parsed_uri = urlparse(uri)
3797
+ params = parse_qs(parsed_uri.query)
3798
+
3799
+ api_key = params.get("api_key")
3800
+ workspace_id = params.get("workspace_id")
3801
+
3802
+ if not api_key:
3803
+ raise MissingValueError("api_key", "PlusVibeAI")
3804
+
3805
+ if not workspace_id:
3806
+ raise MissingValueError("workspace_id", "PlusVibeAI")
3807
+
3808
+ if table not in self.resources:
3809
+ raise UnsupportedResourceError(table, "PlusVibeAI")
3810
+
3811
+ import dlt
3812
+
3813
+ from ingestr.src.plusvibeai import plusvibeai_source
3814
+
3815
+ dlt.secrets["sources.plusvibeai.api_key"] = api_key[0]
3816
+ dlt.secrets["sources.plusvibeai.workspace_id"] = workspace_id[0]
3817
+
3818
+ # Handle custom base URL if provided
3819
+ base_url = params.get("base_url", ["https://api.plusvibe.ai"])[0]
3820
+ dlt.secrets["sources.plusvibeai.base_url"] = base_url
3821
+
3822
+ src = plusvibeai_source()
3823
+ return src.with_resources(table)
3824
+
3825
+
3826
+ class IntercomSource:
3827
+ def handles_incrementality(self) -> bool:
3828
+ return True
3829
+
3830
+ def dlt_source(self, uri: str, table: str, **kwargs):
3831
+ # intercom://?access_token=<token>&region=<us|eu|au>
3832
+ # OR intercom://?oauth_token=<token>&region=<us|eu|au>
3833
+ parsed_uri = urlparse(uri)
3834
+ params = parse_qs(parsed_uri.query)
3835
+
3836
+ # Check for authentication
3837
+ access_token = params.get("access_token")
3838
+ oauth_token = params.get("oauth_token")
3839
+ region = params.get("region", ["us"])[0]
3840
+
3841
+ if not access_token and not oauth_token:
3842
+ raise MissingValueError("access_token or oauth_token", "Intercom")
3843
+
3844
+ # Validate table/resource
3845
+ supported_tables = [
3846
+ "contacts",
3847
+ "companies",
3848
+ "conversations",
3849
+ "tickets",
3850
+ "tags",
3851
+ "segments",
3852
+ "teams",
3853
+ "admins",
3854
+ "articles",
3855
+ "data_attributes",
3856
+ ]
3857
+
3858
+ if table not in supported_tables:
3859
+ raise UnsupportedResourceError(table, "Intercom")
3860
+
3861
+ # Get date parameters
3862
+ start_date = kwargs.get("interval_start")
3863
+ if start_date:
3864
+ start_date = ensure_pendulum_datetime(start_date)
3865
+ else:
3866
+ start_date = pendulum.datetime(2020, 1, 1)
3867
+
3868
+ end_date = kwargs.get("interval_end")
3869
+ if end_date:
3870
+ end_date = ensure_pendulum_datetime(end_date)
3871
+
3872
+ # Import and initialize the source
3873
+ from ingestr.src.intercom import (
3874
+ IntercomCredentialsAccessToken,
3875
+ IntercomCredentialsOAuth,
3876
+ TIntercomCredentials,
3877
+ intercom_source,
3878
+ )
3879
+
3880
+ credentials: TIntercomCredentials
3881
+ if access_token:
3882
+ credentials = IntercomCredentialsAccessToken(
3883
+ access_token=access_token[0], region=region
3884
+ )
3885
+ else:
3886
+ if not oauth_token:
3887
+ raise MissingValueError("oauth_token", "Intercom")
3888
+ credentials = IntercomCredentialsOAuth(
3889
+ oauth_token=oauth_token[0], region=region
3890
+ )
3891
+
3892
+ return intercom_source(
3893
+ credentials=credentials,
3894
+ start_date=start_date,
3895
+ end_date=end_date,
3896
+ ).with_resources(table)
3897
+
3898
+
3899
+ class HttpSource:
3900
+ """Source for reading CSV, JSON, and Parquet files from HTTP URLs"""
3901
+
3902
+ def handles_incrementality(self) -> bool:
3903
+ return False
3904
+
3905
+ def dlt_source(self, uri: str, table: str, **kwargs):
3906
+ """
3907
+ Create a dlt source for reading files from HTTP URLs.
3908
+
3909
+ URI format: http://example.com/file.csv or https://example.com/file.json
3910
+
3911
+ Args:
3912
+ uri: HTTP(S) URL to the file
3913
+ table: Not used for HTTP source (files are read directly)
3914
+ **kwargs: Additional arguments:
3915
+ - file_format: Optional file format override ('csv', 'json', 'parquet')
3916
+ - chunksize: Number of records to process at once (default varies by format)
3917
+ - merge_key: Merge key for the resource
3918
+
3919
+ Returns:
3920
+ DltResource for the HTTP file
3921
+ """
3922
+ from ingestr.src.http import http_source
3923
+
3924
+ # Extract the actual URL (remove the http:// or https:// scheme if duplicated)
3925
+ url = uri
3926
+ if uri.startswith("http://http://") or uri.startswith("https://https://"):
3927
+ url = uri.split("://", 1)[1]
3928
+
3929
+ file_format = kwargs.get("file_format")
3930
+ chunksize = kwargs.get("chunksize")
3931
+ merge_key = kwargs.get("merge_key")
3932
+
3933
+ reader_kwargs = {}
3934
+ if chunksize is not None:
3935
+ reader_kwargs["chunksize"] = chunksize
3936
+
3937
+ source = http_source(url=url, file_format=file_format, **reader_kwargs)
3938
+
3939
+ if merge_key:
3940
+ source.apply_hints(merge_key=merge_key)
3941
+
3942
+ return source
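A minimal sketch of the duplicated-scheme stripping above; the URL is a placeholder.

    # Illustrative only: a doubled scheme is collapsed back to a single one.
    uri = "https://https://example.com/data.csv"
    url = uri
    if uri.startswith("http://http://") or uri.startswith("https://https://"):
        url = uri.split("://", 1)[1]
    assert url == "https://example.com/data.csv"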
3943
+
3944
+
3945
+ class MondaySource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         parsed_uri = urlparse(uri)
+         query_params = parse_qs(parsed_uri.query)
+         api_token = query_params.get("api_token")
+
+         if api_token is None:
+             raise MissingValueError("api_token", "Monday")
+
+         parts = table.replace(" ", "").split(":")
+         table_name = parts[0]
+         params = parts[1:]
+
+         # Get interval_start and interval_end from kwargs (command line args)
+         interval_start = kwargs.get("interval_start")
+         interval_end = kwargs.get("interval_end")
+
+         # Convert datetime to string format YYYY-MM-DD
+         start_date = interval_start.strftime("%Y-%m-%d") if interval_start else None
+         end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None
+
+         from ingestr.src.monday import monday_source
+
+         try:
+             return monday_source(
+                 api_token=api_token[0],
+                 params=params,
+                 start_date=start_date,
+                 end_date=end_date,
+             ).with_resources(table_name)
+         except ResourcesNotFoundError:
+             raise UnsupportedResourceError(table_name, "Monday")
+
+
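The table string above is split into a resource name plus optional extra parameters; a sketch where the resource name and parameter value are hypothetical:

    table = "boards:123456"                       # hypothetical resource and parameter
    parts = table.replace(" ", "").split(":")
    table_name, params = parts[0], parts[1:]      # -> "boards", ["123456"]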
+ class MailchimpSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         parsed_uri = urlparse(uri)
+         query_params = parse_qs(parsed_uri.query)
+         api_key = query_params.get("api_key")
+         server = query_params.get("server")
+
+         if api_key is None:
+             raise MissingValueError("api_key", "Mailchimp")
+         if server is None:
+             raise MissingValueError("server", "Mailchimp")
+
+         from ingestr.src.mailchimp import mailchimp_source
+
+         try:
+             return mailchimp_source(
+                 api_key=api_key[0],
+                 server=server[0],
+             ).with_resources(table)
+         except ResourcesNotFoundError:
+             raise UnsupportedResourceError(table, "Mailchimp")
+
+
+ class AlliumSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         parsed_uri = urlparse(uri)
+         query_params = parse_qs(parsed_uri.query)
+         api_key = query_params.get("api_key")
+
+         if api_key is None:
+             raise MissingValueError("api_key", "Allium")
+
+         # Extract query_id and custom parameters from table parameter
+         # Format: query_id or query:query_id or query:query_id:param1=value1&param2=value2
+         query_id = table
+         custom_params = {}
+         limit = None
+         compute_profile = None
+
+         if ":" in table:
+             parts = table.split(":", 2)  # Split into max 3 parts
+             if len(parts) >= 2:
+                 query_id = parts[1]
+             if len(parts) == 3:
+                 # Parse custom parameters from query string format
+                 param_string = parts[2]
+                 for param in param_string.split("&"):
+                     if "=" in param:
+                         key, value = param.split("=", 1)
+                         # Extract run_config parameters
+                         if key == "limit":
+                             limit = int(value)
+                         elif key == "compute_profile":
+                             compute_profile = value
+                         else:
+                             custom_params[key] = value
+
+         # Extract parameters from interval_start and interval_end
+         # Default: 2 days ago 00:00 to yesterday 00:00
+         now = pendulum.now()
+         default_start = now.subtract(days=2).start_of("day")
+         default_end = now.subtract(days=1).start_of("day")
+
+         parameters = {}
+         interval_start = kwargs.get("interval_start")
+         interval_end = kwargs.get("interval_end")
+
+         start_date = interval_start if interval_start is not None else default_start
+         end_date = interval_end if interval_end is not None else default_end
+
+         parameters["start_date"] = start_date.strftime("%Y-%m-%d")
+         parameters["end_date"] = end_date.strftime("%Y-%m-%d")
+         parameters["start_timestamp"] = str(int(start_date.timestamp()))
+         parameters["end_timestamp"] = str(int(end_date.timestamp()))
+
+         # Merge custom parameters (they override default parameters)
+         parameters.update(custom_params)
+
+         from ingestr.src.allium import allium_source
+
+         return allium_source(
+             api_key=api_key[0],
+             query_id=query_id,
+             parameters=parameters if parameters else None,
+             limit=limit,
+             compute_profile=compute_profile,
+         )
+
+
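A sketch of how one such table string is decomposed by the parsing above; the query id and the chain parameter are made-up values:

    table = "query:abc123:limit=100&chain=ethereum"
    parts = table.split(":", 2)        # ["query", "abc123", "limit=100&chain=ethereum"]
    # query_id -> "abc123"; limit=100 is pulled out as a run-config value,
    # chain=ethereum lands in custom_params and is merged over the default date parameters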
+ class CouchbaseSource:
+     table_builder: Callable
+
+     def __init__(self, table_builder=None) -> None:
+         if table_builder is None:
+             from ingestr.src.couchbase_source import couchbase_collection
+
+             table_builder = couchbase_collection
+
+         self.table_builder = table_builder
+
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Create a dlt source for reading data from Couchbase.
+
+         URI formats:
+             - couchbase://username:password@host
+             - couchbase://username:password@host/bucket
+             - couchbase://username:password@host?ssl=true
+             - couchbases://username:password@host (SSL enabled)
+
+         Table formats:
+             - bucket.scope.collection (when bucket not in URI)
+             - scope.collection (when bucket specified in URI path)
+
+         Note: If the password contains special characters (@, :, /, etc.), they must be URL-encoded.
+
+         Examples:
+             Local/Self-hosted:
+                 - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
+                 - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
+                 - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"
+
+             Capella (Cloud):
+                 - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
+                 - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"
+
+         To encode a password in Python:
+             from urllib.parse import quote
+             encoded_pwd = quote("MyPass@123!", safe='')
+             uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"
+
+         Args:
+             uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
+             table: Format depends on URI:
+                 - bucket.scope.collection (if bucket not in URI)
+                 - scope.collection (if bucket in URI path)
+             **kwargs: Additional arguments:
+                 - limit: Maximum number of documents to fetch
+                 - incremental_key: Field to use for incremental loading
+                 - interval_start: Start value for incremental loading
+                 - interval_end: End value for incremental loading
+
+         Returns:
+             DltResource for the Couchbase collection
+         """
+         # Parse the URI to extract connection details
+         # urlparse automatically decodes URL-encoded credentials
+
+         parsed = urlparse(uri)
+
+         # Extract username and password from the URI
+         # Note: urlparse automatically decodes URL-encoded characters in username/password
+         from urllib.parse import unquote
+
+         username = parsed.username
+         password = unquote(parsed.password) if parsed.password else None
+
+         if not username or not password:
+             raise ValueError(
+                 "Username and password must be provided in the URI.\n"
+                 "Format: couchbase://username:password@host\n"
+                 "If the password has special characters (@, :, /), URL-encode them.\n"
+                 "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
+             )
+
+         # Reconstruct the connection string without credentials
+         scheme = parsed.scheme
+         netloc = parsed.netloc
+
+         # Remove username:password@ from netloc if present
+         if "@" in netloc:
+             netloc = netloc.split("@", 1)[1]
+
+         # Parse query parameters from the URI
+         from urllib.parse import parse_qs
+
+         query_params = parse_qs(parsed.query)
+
+         # Check if SSL is requested via the URI query parameter (?ssl=true)
+         if "ssl" in query_params:
+             ssl_value = query_params["ssl"][0].lower()
+             use_ssl = ssl_value in ("true", "1", "yes")
+
+             # Apply SSL scheme based on the parameter
+             if use_ssl and scheme == "couchbase":
+                 scheme = "couchbases"
+
+         connection_string = f"{scheme}://{netloc}"
+
+         # Extract bucket from the URI path if present (e.g., couchbase://host/bucket)
+         bucket_from_uri = None
+         if parsed.path and parsed.path.strip("/"):
+             bucket_from_uri = parsed.path.strip("/").split("/")[0]
+
+         # Parse table format: can be "scope.collection" or "bucket.scope.collection"
+         table_parts = table.split(".")
+
+         if len(table_parts) == 3:
+             # Format: bucket.scope.collection
+             bucket, scope, collection = table_parts
+         elif len(table_parts) == 2:
+             # Format: scope.collection (bucket from URI)
+             if bucket_from_uri:
+                 bucket = bucket_from_uri
+                 scope, collection = table_parts
+             else:
+                 raise ValueError(
+                     "Table format is 'scope.collection' but no bucket specified in URI.\n"
+                     "Either use URI format: couchbase://user:pass@host/bucket\n"
+                     "Or use table format: bucket.scope.collection\n"
+                     f"Got table: {table}"
+                 )
+         else:
+             raise ValueError(
+                 "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
+                 f"Got: {table}\n"
+                 "Examples:\n"
+                 "  - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
+                 "  - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
+             )
+
+         # Handle incremental loading
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="closed",
+                 range_start="closed",
+             )
+
+         # Get optional parameters
+         limit = kwargs.get("limit")
+
+         table_instance = self.table_builder(
+             connection_string=connection_string,
+             username=username,
+             password=password,
+             bucket=bucket,
+             scope=scope,
+             collection=collection,
+             incremental=incremental,
+             limit=limit,
+         )
+         table_instance.max_table_nesting = 1
+
+         return table_instance
+
+
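Tracing one URI/table pair through the logic above; host, credentials, and names are placeholders:

    from urllib.parse import urlparse

    uri = "couchbase://admin:pass@localhost/mybucket?ssl=true"
    parsed = urlparse(uri)
    # ssl=true upgrades the scheme            -> connection_string "couchbases://localhost"
    # the /mybucket path supplies the bucket  -> bucket "mybucket"
    # table "myscope.mycollection"            -> scope "myscope", collection "mycollection"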
+ class CursorSource:
+     resources = [
+         "team_members",
+         "daily_usage_data",
+         "team_spend",
+         "filtered_usage_events",
+     ]
+
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         # cursor://?api_key=<api_key>
+         parsed_uri = urlparse(uri)
+         params = parse_qs(parsed_uri.query)
+
+         api_key = params.get("api_key")
+
+         if not api_key:
+             raise MissingValueError("api_key", "Cursor")
+
+         if table not in self.resources:
+             raise UnsupportedResourceError(table, "Cursor")
+
+         import dlt
+
+         from ingestr.src.cursor import cursor_source
+
+         dlt.secrets["sources.cursor.api_key"] = api_key[0]
+
+         # Handle interval_start and interval_end for daily_usage_data and filtered_usage_events (optional)
+         if table in ["daily_usage_data", "filtered_usage_events"]:
+             interval_start = kwargs.get("interval_start")
+             interval_end = kwargs.get("interval_end")
+
+             # Both are optional, but if one is provided, both should be provided
+             if interval_start is not None and interval_end is not None:
+                 # Convert datetime to epoch milliseconds
+                 start_ms = int(interval_start.timestamp() * 1000)
+                 end_ms = int(interval_end.timestamp() * 1000)
+
+                 dlt.config["sources.cursor.start_date"] = start_ms
+                 dlt.config["sources.cursor.end_date"] = end_ms
+
+         src = cursor_source()
+         return src.with_resources(table)
+
+
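The optional interval handling above passes epoch milliseconds into the dlt config; a minimal sketch with a placeholder date:

    import pendulum

    interval_start = pendulum.datetime(2024, 1, 1)        # placeholder start
    start_ms = int(interval_start.timestamp() * 1000)     # -> 1704067200000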
+ class SocrataSource:
+     def handles_incrementality(self) -> bool:
+         return False
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         """
+         Creates a dlt source for the Socrata open data platform.
+
+         URI format: socrata://domain?app_token=TOKEN
+         Table: dataset_id (e.g., "6udu-fhnu")
+
+         Args:
+             uri: Socrata connection URI with domain and optional auth params
+             table: Dataset ID (e.g., "6udu-fhnu")
+             **kwargs: Additional arguments:
+                 - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
+                 - interval_start: Start date for the initial load
+                 - interval_end: End date for the load
+                 - primary_key: Primary key field for merge operations
+
+         Returns:
+             DltResource for the Socrata dataset
+         """
+         from urllib.parse import parse_qs, urlparse
+
+         parsed = urlparse(uri)
+
+         domain = parsed.netloc
+         if not domain:
+             raise ValueError(
+                 "Domain must be provided in the URI.\n"
+                 "Format: socrata://domain?app_token=TOKEN\n"
+                 "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
+             )
+
+         query_params = parse_qs(parsed.query)
+
+         dataset_id = table
+         if not dataset_id:
+             raise ValueError(
+                 "Dataset ID must be provided as the table parameter.\n"
+                 "Example: --source-table 6udu-fhnu"
+             )
+
+         app_token = query_params.get("app_token", [None])[0]
+         username = query_params.get("username", [None])[0]
+         password = query_params.get("password", [None])[0]
+
+         incremental = None
+         if kwargs.get("incremental_key"):
+             start_value = kwargs.get("interval_start")
+             end_value = kwargs.get("interval_end")
+
+             if start_value:
+                 start_value = (
+                     start_value.isoformat()
+                     if hasattr(start_value, "isoformat")
+                     else str(start_value)
+                 )
+
+             if end_value:
+                 end_value = (
+                     end_value.isoformat()
+                     if hasattr(end_value, "isoformat")
+                     else str(end_value)
+                 )
+
+             incremental = dlt_incremental(
+                 kwargs.get("incremental_key", ""),
+                 initial_value=start_value,
+                 end_value=end_value,
+                 range_end="open",
+                 range_start="closed",
+             )
+
+         primary_key = kwargs.get("primary_key")
+
+         from ingestr.src.socrata_source import source
+
+         return source(
+             domain=domain,
+             dataset_id=dataset_id,
+             app_token=app_token,
+             username=username,
+             password=password,
+             incremental=incremental,
+             primary_key=primary_key,
+         ).with_resources("dataset")
+
+
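The incremental bounds above are normalised to ISO-8601 strings before being handed to the dlt incremental; a small sketch with a placeholder date:

    import pendulum

    start = pendulum.datetime(2024, 1, 1)
    start_value = start.isoformat() if hasattr(start, "isoformat") else str(start)
    # -> "2024-01-01T00:00:00+00:00", used as initial_value with range_start="closed"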
+ class HostawaySource:
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         if kwargs.get("incremental_key"):
+             raise ValueError(
+                 "Hostaway takes care of incrementality on its own; do not provide incremental_key"
+             )
+
+         source_parts = urlparse(uri)
+         source_params = parse_qs(source_parts.query)
+         api_key = source_params.get("api_key")
+
+         if not api_key:
+             raise ValueError("api_key in the URI is required to connect to Hostaway")
+
+         match table:
+             case "listings":
+                 resource_name = "listings"
+             case "listing_fee_settings":
+                 resource_name = "listing_fee_settings"
+             case "listing_agreements":
+                 resource_name = "listing_agreements"
+             case "listing_pricing_settings":
+                 resource_name = "listing_pricing_settings"
+             case "cancellation_policies":
+                 resource_name = "cancellation_policies"
+             case "cancellation_policies_airbnb":
+                 resource_name = "cancellation_policies_airbnb"
+             case "cancellation_policies_marriott":
+                 resource_name = "cancellation_policies_marriott"
+             case "cancellation_policies_vrbo":
+                 resource_name = "cancellation_policies_vrbo"
+             case "reservations":
+                 resource_name = "reservations"
+             case "finance_fields":
+                 resource_name = "finance_fields"
+             case "reservation_payment_methods":
+                 resource_name = "reservation_payment_methods"
+             case "reservation_rental_agreements":
+                 resource_name = "reservation_rental_agreements"
+             case "listing_calendars":
+                 resource_name = "listing_calendars"
+             case "conversations":
+                 resource_name = "conversations"
+             case "message_templates":
+                 resource_name = "message_templates"
+             case "bed_types":
+                 resource_name = "bed_types"
+             case "property_types":
+                 resource_name = "property_types"
+             case "countries":
+                 resource_name = "countries"
+             case "account_tax_settings":
+                 resource_name = "account_tax_settings"
+             case "user_groups":
+                 resource_name = "user_groups"
+             case "guest_payment_charges":
+                 resource_name = "guest_payment_charges"
+             case "coupons":
+                 resource_name = "coupons"
+             case "webhook_reservations":
+                 resource_name = "webhook_reservations"
+             case "tasks":
+                 resource_name = "tasks"
+             case _:
+                 raise ValueError(
+                     f"Resource '{table}' is not supported for the Hostaway source yet; if you are interested in it, please create a GitHub issue at https://github.com/bruin-data/ingestr"
+                 )
+
+         start_date = kwargs.get("interval_start")
+         if start_date:
+             start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
+         else:
+             start_date = pendulum.datetime(1970, 1, 1).in_timezone("UTC")
+
+         end_date = kwargs.get("interval_end")
+         if end_date:
+             end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
+
+         from ingestr.src.hostaway import hostaway_source
+
+         return hostaway_source(
+             api_key=api_key[0],
+             start_date=start_date,
+             end_date=end_date,
+         ).with_resources(resource_name)
+
+
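Every case in the match statement above maps the table name to itself, so an equivalent and more compact form would be a membership check (behaviour unchanged; the set below is abbreviated for illustration):

    supported = {
        "listings", "listing_fee_settings", "listing_agreements",
        # ... the remaining table names listed in the match statement ...
        "webhook_reservations", "tasks",
    }
    if table not in supported:
        raise ValueError(f"Resource '{table}' is not supported for the Hostaway source yet")
    resource_name = table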
+ class SnapchatAdsSource:
+     resources = [
+         "organizations",
+         "fundingsources",
+         "billingcenters",
+         "adaccounts",
+         "invoices",
+         "transactions",
+         "members",
+         "roles",
+         "campaigns",
+         "adsquads",
+         "ads",
+         "event_details",
+         "creatives",
+         "segments",
+         "campaigns_stats",
+         "ad_accounts_stats",
+         "ads_stats",
+         "ad_squads_stats",
+     ]
+
+     def handles_incrementality(self) -> bool:
+         return True
+
+     def dlt_source(self, uri: str, table: str, **kwargs):
+         parsed_uri = urlparse(uri)
+         source_fields = parse_qs(parsed_uri.query)
+
+         refresh_token = source_fields.get("refresh_token")
+         if not refresh_token:
+             raise ValueError("refresh_token is required to connect to Snapchat Ads")
+
+         client_id = source_fields.get("client_id")
+         if not client_id:
+             raise ValueError("client_id is required to connect to Snapchat Ads")
+
+         client_secret = source_fields.get("client_secret")
+         if not client_secret:
+             raise ValueError("client_secret is required to connect to Snapchat Ads")
+
+         organization_id = source_fields.get("organization_id")
+
+         # Resources that support ad_account_id filtering
+         ad_account_resources = [
+             "invoices",
+             "campaigns",
+             "adsquads",
+             "ads",
+             "event_details",
+             "creatives",
+             "segments",
+         ]
+
+         # Stats resources
+         stats_resources = [
+             "campaigns_stats",
+             "ad_accounts_stats",
+             "ads_stats",
+             "ad_squads_stats",
+         ]
+
+         # Parse table name
+         stats_config = None
+         ad_account_id = None
+
+         if ":" in table:
+             parts = table.split(":")
+             resource_name = parts[0]
+
+             if resource_name in stats_resources:
+                 # Stats table format:
+                 #   resource_name:granularity:fields:options (all accounts)
+                 #   resource_name:ad_account_id:granularity:fields:options (specific account)
+
+                 def parse_options(options_str: str) -> dict:
+                     """Parse key=value,key=value options string."""
+                     result = {}
+                     for option in options_str.split(","):
+                         if "=" in option:
+                             key, value = option.split("=", 1)
+                             result[key] = value
+                     return result
+
+                 if len(parts) >= 2:
+                     valid_granularities = ["TOTAL", "DAY", "HOUR", "LIFETIME"]
+
+                     if parts[1].upper() in valid_granularities:
+                         # Format: resource_name:granularity:fields:options
+                         stats_config = {
+                             "granularity": parts[1].upper(),
+                             "fields": parts[2]
+                             if len(parts) > 2
+                             else "impressions,spend",
+                         }
+                         if len(parts) > 3:
+                             stats_config.update(parse_options(parts[3]))
+                     else:
+                         # Format: resource_name:ad_account_id:granularity:fields:options
+                         ad_account_id = parts[1]
+                         stats_config = {
+                             "granularity": parts[2].upper()
+                             if len(parts) > 2
+                             else "DAY",
+                             "fields": parts[3]
+                             if len(parts) > 3
+                             else "impressions,spend",
+                         }
+                         if len(parts) > 4:
+                             stats_config.update(parse_options(parts[4]))
+                 else:
+                     # Just resource_name, use defaults
+                     stats_config = {
+                         "granularity": "DAY",
+                         "fields": "impressions,spend",
+                     }
+             else:
+                 # Non-stats table with ad_account_id: resource_name:ad_account_id
+                 ad_account_id = parts[1] if len(parts) > 1 else None
+                 if not ad_account_id:
+                     raise ValueError(
+                         f"ad_account_id must be provided in format '{resource_name}:ad_account_id'"
+                     )
+         else:
+             resource_name = table
+             if resource_name in stats_resources:
+                 # Stats resource with default config
+                 stats_config = {
+                     "granularity": "DAY",
+                     "fields": "impressions,spend",
+                 }
+
+         # Validation for non-stats resources
+         if resource_name not in stats_resources:
+             account_id_required = (
+                 resource_name in ad_account_resources
+                 and ad_account_id is None
+                 and not organization_id
+             )
+             if account_id_required:
+                 raise ValueError(
+                     f"organization_id is required for '{resource_name}' table when no specific ad_account_id is provided"
+                 )
+
+             if not organization_id and table != "organizations":
+                 raise ValueError(
+                     f"organization_id is required for table '{table}'. Only 'organizations' table does not require organization_id."
+                 )
+         else:
+             # Stats resources need either ad_account_id or organization_id
+             if not ad_account_id and not organization_id:
+                 raise ValueError(
+                     f"organization_id is required for '{resource_name}' when ad_account_id is not provided"
+                 )
+
+         if resource_name not in self.resources:
+             raise UnsupportedResourceError(table, "Snapchat Ads")
+
+         from ingestr.src.snapchat_ads import snapchat_ads_source
+
+         source_kwargs: dict[str, Any] = {
+             "refresh_token": refresh_token[0],
+             "client_id": client_id[0],
+             "client_secret": client_secret[0],
+         }
+
+         if organization_id:
+             source_kwargs["organization_id"] = organization_id[0]
+
+         if ad_account_id:
+             source_kwargs["ad_account_id"] = ad_account_id
+
+         # Add interval_start and interval_end for client-side filtering
+         interval_start = kwargs.get("interval_start")
+         if interval_start:
+             source_kwargs["start_date"] = interval_start
+
+         interval_end = kwargs.get("interval_end")
+         if interval_end:
+             source_kwargs["end_date"] = interval_end
+
+         # Add stats_config for stats resource
+         if stats_config:
+             source_kwargs["stats_config"] = stats_config
+
+         source = snapchat_ads_source(**source_kwargs)
+
+         return source.with_resources(resource_name)
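Two sketches of how the stats table format above is interpreted; the field list and the account id are placeholders:

    table = "campaigns_stats:DAY:impressions,spend"
    # parts[1] is a valid granularity -> all-accounts form:
    #   stats_config == {"granularity": "DAY", "fields": "impressions,spend"}

    table = "campaigns_stats:1234-abcd:HOUR"
    # parts[1] is not a granularity -> treated as ad_account_id:
    #   ad_account_id == "1234-abcd", stats_config == {"granularity": "HOUR", "fields": "impressions,spend"}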