ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin/__init__.py +262 -0
- ingestr/src/applovin_max/__init__.py +117 -0
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +38 -11
- ingestr/src/buildinfo.py +1 -0
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +520 -33
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +116 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/loader.py +69 -0
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/personio/__init__.py +331 -0
- ingestr/src/personio/helpers.py +86 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +156 -0
- ingestr/src/salesforce/helpers.py +64 -0
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +3132 -212
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/version.py +6 -1
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- ingestr-0.14.104.dist-info/METADATA +563 -0
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.2.dist-info/METADATA +0 -302
- ingestr-0.13.2.dist-info/RECORD +0 -107
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sources.py
CHANGED
@@ -3,6 +3,7 @@ import csv
 import json
 import os
 import re
+import sys
 import tempfile
 from datetime import date, datetime, timedelta, timezone
 from typing import (
@@ -13,100 +14,39 @@ from typing import (
     List,
     Literal,
     Optional,
+    TypeAlias,
     Union,
 )
-from urllib.parse import ParseResult, parse_qs,
+from urllib.parse import ParseResult, parse_qs, urlencode, urlparse
 
-import
-import gcsfs  # type: ignore
+import fsspec  # type: ignore
 import pendulum
-import s3fs  # type: ignore
-from dlt.common.configuration.specs import (
-    AwsCredentials,
-)
-from dlt.common.libs.sql_alchemy import (
-    Engine,
-    MetaData,
-)
 from dlt.common.time import ensure_pendulum_datetime
-from dlt.common.typing import TDataItem, TSecretStrValue
 from dlt.extract import Incremental
+from dlt.extract.exceptions import ResourcesNotFoundError
+from dlt.sources import incremental as dlt_incremental
 from dlt.sources.credentials import (
     ConnectionStringCredentials,
 )
-from dlt.sources.sql_database import sql_table
-from dlt.sources.sql_database.helpers import TableLoader
-from dlt.sources.sql_database.schema_types import (
-    ReflectionLevel,
-    SelectAny,
-    Table,
-    TTypeAdapter,
-)
-from google.ads.googleads.client import GoogleAdsClient  # type: ignore
-from sqlalchemy import Column
-from sqlalchemy import types as sa
 
 from ingestr.src import blob
-from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
-from ingestr.src.adjust.adjust_helpers import parse_filters
-from ingestr.src.airtable import airtable_source
-from ingestr.src.appsflyer._init_ import appsflyer_source
-from ingestr.src.appstore import app_store
-from ingestr.src.appstore.client import AppStoreConnectClient
-from ingestr.src.arrow import memory_mapped_arrow
-from ingestr.src.asana_source import asana_source
-from ingestr.src.chess import source
-from ingestr.src.dynamodb import dynamodb
 from ingestr.src.errors import (
     InvalidBlobTableError,
     MissingValueError,
     UnsupportedResourceError,
 )
-from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
-from ingestr.src.filesystem import readers
-from ingestr.src.filters import table_adapter_exclude_columns
-from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
-from ingestr.src.google_ads import google_ads
-from ingestr.src.google_analytics import google_analytics
-from ingestr.src.google_sheets import google_spreadsheet
-from ingestr.src.gorgias import gorgias_source
-from ingestr.src.hubspot import hubspot
-from ingestr.src.kafka import kafka_consumer
-from ingestr.src.kafka.helpers import KafkaCredentials
-from ingestr.src.klaviyo._init_ import klaviyo_source
-from ingestr.src.linkedin_ads import linked_in_ads_source
-from ingestr.src.linkedin_ads.dimension_time_enum import (
-    Dimension,
-    TimeGranularity,
-)
-from ingestr.src.mongodb import mongodb_collection
-from ingestr.src.notion import notion_databases
-from ingestr.src.shopify import shopify_source
-from ingestr.src.slack import slack_source
-from ingestr.src.sql_database.callbacks import (
-    chained_query_adapter_callback,
-    custom_query_variable_subsitution,
-    limit_callback,
-    type_adapter_callback,
-)
-from ingestr.src.stripe_analytics import stripe_source
 from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
-from ingestr.src.tiktok_ads import tiktok_source
-from ingestr.src.time import isotime
-from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
-from ingestr.src.zendesk.helpers.credentials import (
-    ZendeskCredentialsOAuth,
-    ZendeskCredentialsToken,
-)
-
-TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
-TQueryAdapter = Callable[[SelectAny, Table], SelectAny]
 
 
 class SqlSource:
     table_builder: Callable
 
-    def __init__(self, table_builder=
+    def __init__(self, table_builder=None) -> None:
+        if table_builder is None:
+            from dlt.sources.sql_database import sql_table
+
+            table_builder = sql_table
+
         self.table_builder = table_builder
 
     def handles_incrementality(self) -> bool:
@@ -115,13 +55,16 @@ class SqlSource:
     def dlt_source(self, uri: str, table: str, **kwargs):
         table_fields = TableDefinition(dataset="custom", table="custom")
         if not table.startswith("query:"):
-
+            if uri.startswith("spanner://"):
+                table_fields = TableDefinition(dataset="", table=table)
+            else:
+                table_fields = table_string_to_dataclass(table)
 
         incremental = None
         if kwargs.get("incremental_key"):
             start_value = kwargs.get("interval_start")
             end_value = kwargs.get("interval_end")
-            incremental =
+            incremental = dlt_incremental(
                 kwargs.get("incremental_key", ""),
                 initial_value=start_value,
                 end_value=end_value,
@@ -129,9 +72,143 @@ class SqlSource:
                 range_start="closed",
             )
 
+        engine_adapter_callback = None
+
+        if uri.startswith("md://") or uri.startswith("motherduck://"):
+            parsed_uri = urlparse(uri)
+            query_params = parse_qs(parsed_uri.query)
+            # Convert md:// URI to duckdb:///md: format
+            if parsed_uri.path:
+                db_path = parsed_uri.path
+            else:
+                db_path = ""
+
+            token = query_params.get("token", [""])[0]
+            if not token:
+                raise ValueError("Token is required for MotherDuck connection")
+            uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
+
         if uri.startswith("mysql://"):
             uri = uri.replace("mysql://", "mysql+pymysql://")
 
+        # Monkey patch cx_Oracle to use oracledb (thin mode, no client libraries required)
+        if uri.startswith("oracle+") or uri.startswith("oracle://"):
+            try:
+                import oracledb  # type: ignore[import-not-found]
+
+                # SQLAlchemy's cx_oracle dialect checks for version >= 5.2
+                # oracledb has a different versioning scheme, so we need to patch it
+                oracledb.version = "8.3.0"  # type: ignore[assignment]
+                sys.modules["cx_Oracle"] = oracledb  # type: ignore[assignment]
+            except ImportError:
+                # oracledb not installed, will fail later with a clear error
+                pass
+
+        # Process Snowflake private key authentication
+        if uri.startswith("snowflake://"):
+            parsed_uri = urlparse(uri)
+            query_params = parse_qs(parsed_uri.query)
+
+            if "private_key" in query_params:
+                from dlt.common.libs.cryptography import decode_private_key
+
+                private_key = query_params["private_key"][0]
+                passphrase = query_params.get("private_key_passphrase", [None])[0]
+                decoded_key = decode_private_key(private_key, passphrase)
+
+                query_params["private_key"] = [base64.b64encode(decoded_key).decode()]
+                if "private_key_passphrase" in query_params:
+                    del query_params["private_key_passphrase"]
+
+                # Rebuild URI
+                uri = parsed_uri._replace(
+                    query=urlencode(query_params, doseq=True)
+                ).geturl()
+
+        # clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
+        if uri.startswith("clickhouse://"):
+            parsed_uri = urlparse(uri)
+
+            query_params = parse_qs(parsed_uri.query)
+
+            if "http_port" in query_params:
+                del query_params["http_port"]
+
+            if "secure" not in query_params:
+                query_params["secure"] = ["1"]
+
+            uri = parsed_uri._replace(
+                scheme="clickhouse+native",
+                query=urlencode(query_params, doseq=True),
+            ).geturl()
+
+        if uri.startswith("db2://"):
+            uri = uri.replace("db2://", "db2+ibm_db://")
+
+        if uri.startswith("spanner://"):
+            parsed_uri = urlparse(uri)
+            query_params = parse_qs(parsed_uri.query)
+
+            project_id_param = query_params.get("project_id")
+            instance_id_param = query_params.get("instance_id")
+            database_param = query_params.get("database")
+
+            cred_path = query_params.get("credentials_path")
+            cred_base64 = query_params.get("credentials_base64")
+
+            if not project_id_param or not instance_id_param or not database_param:
+                raise ValueError(
+                    "project_id, instance_id and database are required in the URI to get data from Google Spanner"
+                )
+
+            project_id = project_id_param[0]
+            instance_id = instance_id_param[0]
+            database = database_param[0]
+
+            if not cred_path and not cred_base64:
+                raise ValueError(
+                    "credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
+                )
+            if cred_path:
+                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
+            elif cred_base64:
+                credentials = json.loads(
+                    base64.b64decode(cred_base64[0]).decode("utf-8")
+                )
+                temp = tempfile.NamedTemporaryFile(
+                    mode="w", delete=False, suffix=".json"
+                )
+                json.dump(credentials, temp)
+                temp.close()
+                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
+
+            uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
+
+            def eng_callback(engine):
+                return engine.execution_options(read_only=True)
+
+            engine_adapter_callback = eng_callback
+        from dlt.common.libs.sql_alchemy import (
+            Engine,
+            MetaData,
+        )
+        from dlt.sources.sql_database.schema_types import (
+            ReflectionLevel,
+            SelectAny,
+            Table,
+            TTypeAdapter,
+        )
+        from sqlalchemy import Column
+        from sqlalchemy import types as sa
+
+        from ingestr.src.filters import table_adapter_exclude_columns
+        from ingestr.src.sql_database.callbacks import (
+            chained_query_adapter_callback,
+            custom_query_variable_subsitution,
+            limit_callback,
+            type_adapter_callback,
+        )
+
         query_adapters = []
         if kwargs.get("sql_limit"):
             query_adapters.append(
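The hunk above rewrites MotherDuck URIs into DuckDB connection strings before handing them to SQLAlchemy. Below is a minimal standalone sketch of that rewrite, using only the standard library; the helper name and the example URI are illustrative and not part of ingestr.

```python
from urllib.parse import parse_qs, urlparse


def motherduck_to_duckdb_uri(uri: str) -> str:
    """Rewrite md://...?token=... into duckdb:///md:<path>?motherduck_token=..."""
    parsed = urlparse(uri)
    query = parse_qs(parsed.query)
    db_path = parsed.path or ""  # may be empty when no database path is given
    token = query.get("token", [""])[0]
    if not token:
        raise ValueError("Token is required for MotherDuck connection")
    return f"duckdb:///md:{db_path}?motherduck_token={token}"


print(motherduck_to_duckdb_uri("md:///analytics?token=abc123"))
# duckdb:///md:/analytics?motherduck_token=abc123
```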
@@ -150,6 +227,13 @@ class SqlSource:
             defer_table_reflect = True
             query_value = table.split(":", 1)[1]
 
+            TableBackend: TypeAlias = Literal[
+                "sqlalchemy", "pyarrow", "pandas", "connectorx"
+            ]
+            TQueryAdapter: TypeAlias = Callable[[SelectAny, Table], SelectAny]
+            import dlt
+            from dlt.common.typing import TDataItem
+
             # this is a very hacky version of the table_rows function. it is built this way to go around the dlt's table loader.
             # I didn't want to write a full fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
             # this is by no means a production ready solution, but it works for now.
@@ -167,6 +251,9 @@ class SqlSource:
                 backend_kwargs: Dict[str, Any] = None,  # type: ignore
                 type_adapter_callback: Optional[TTypeAdapter] = None,
                 included_columns: Optional[List[str]] = None,
+                excluded_columns: Optional[
+                    List[str]
+                ] = None,  # Added for dlt 1.16.0 compatibility
                 query_adapter_callback: Optional[TQueryAdapter] = None,
                 resolve_foreign_keys: bool = False,
             ) -> Iterator[TDataItem]:
@@ -200,6 +287,8 @@ class SqlSource:
                     *cols,
                 )
 
+                from dlt.sources.sql_database.helpers import TableLoader
+
                 loader = TableLoader(
                     engine,
                     backend,
@@ -220,8 +309,54 @@ class SqlSource:
             # override the query adapters, the only one we want is the one here in the case of custom queries
             query_adapters = [custom_query_variable_subsitution(query_value, kwargs)]
 
+        credentials = ConnectionStringCredentials(uri)
+        if uri.startswith("mssql://"):
+            parsed_uri = urlparse(uri)
+            params = parse_qs(parsed_uri.query)
+            params = {k.lower(): v for k, v in params.items()}
+            if params.get("authentication") == ["ActiveDirectoryAccessToken"]:
+                import pyodbc  # type: ignore
+                from sqlalchemy import create_engine
+
+                from ingestr.src.destinations import (
+                    MSSQL_COPT_SS_ACCESS_TOKEN,
+                    handle_datetimeoffset,
+                    serialize_azure_token,
+                )
+
+                cfg = {
+                    "DRIVER": params.get("driver", ["ODBC Driver 18 for SQL Server"])[
+                        0
+                    ],
+                    "SERVER": f"{parsed_uri.hostname},{parsed_uri.port or 1433}",
+                    "DATABASE": parsed_uri.path.lstrip("/"),
+                }
+                for k, v in params.items():
+                    if k.lower() not in ["driver", "authentication", "connect_timeout"]:
+                        cfg[k.upper()] = v[0]
+
+                token = serialize_azure_token(parsed_uri.password)
+                dsn = ";".join([f"{k}={v}" for k, v in cfg.items()])
+
+                def creator():
+                    connection = pyodbc.connect(
+                        dsn,
+                        autocommit=True,
+                        timeout=kwargs.get("connect_timeout", 30),
+                        attrs_before={
+                            MSSQL_COPT_SS_ACCESS_TOKEN: token,
+                        },
+                    )
+                    connection.add_output_converter(-155, handle_datetimeoffset)
+                    return connection
+
+                credentials = create_engine(
+                    "mssql+pyodbc://",
+                    creator=creator,
+                )
+
         builder_res = self.table_builder(
-            credentials=
+            credentials=credentials,
             schema=table_fields.dataset,
             table=table_fields.table,
             incremental=incremental,
@@ -234,6 +369,7 @@ class SqlSource:
                 kwargs.get("sql_exclude_columns", [])
             ),
             defer_table_reflect=defer_table_reflect,
+            engine_adapter_callback=engine_adapter_callback,
         )
 
         return builder_res
@@ -242,7 +378,12 @@ class SqlSource:
 class ArrowMemoryMappedSource:
     table_builder: Callable
 
-    def __init__(self, table_builder=
+    def __init__(self, table_builder=None) -> None:
+        if table_builder is None:
+            from ingestr.src.arrow import memory_mapped_arrow
+
+            table_builder = memory_mapped_arrow
+
         self.table_builder = table_builder
 
     def handles_incrementality(self) -> bool:
@@ -254,7 +395,7 @@ class ArrowMemoryMappedSource:
             start_value = kwargs.get("interval_start")
             end_value = kwargs.get("interval_end")
 
-            incremental =
+            incremental = dlt_incremental(
                 kwargs.get("incremental_key", ""),
                 initial_value=start_value,
                 end_value=end_value,
@@ -287,37 +428,199 @@ class ArrowMemoryMappedSource:
 class MongoDbSource:
     table_builder: Callable
 
-    def __init__(self, table_builder=
+    def __init__(self, table_builder=None) -> None:
+        if table_builder is None:
+            from ingestr.src.mongodb import mongodb_collection
+
+            table_builder = mongodb_collection
+
         self.table_builder = table_builder
 
     def handles_incrementality(self) -> bool:
         return False
 
     def dlt_source(self, uri: str, table: str, **kwargs):
-
+        # Check if this is a custom query format (collection:query)
+        if ":" in table:
+            collection_name, query_json = table.split(":", 1)
 
-
-
-
-            end_value = kwargs.get("interval_end")
+            # Parse the query using MongoDB's extended JSON parser
+            # First, convert MongoDB shell syntax to Extended JSON format
+            from bson import json_util
 
-
-
-
-
-
-
+            from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json
+
+            # Convert MongoDB shell constructs to Extended JSON v2 format
+            converted_query = convert_mongo_shell_to_extended_json(query_json)
+
+            try:
+                query = json_util.loads(converted_query)
+            except Exception as e:
+                raise ValueError(f"Invalid MongoDB query format: {e}")
+
+            # Validate that it's a list for aggregation pipeline
+            if not isinstance(query, list):
+                raise ValueError(
+                    "Query must be a JSON array representing a MongoDB aggregation pipeline"
+                )
+
+            # Check for incremental load requirements
+            incremental = None
+            if kwargs.get("incremental_key"):
+                start_value = kwargs.get("interval_start")
+                end_value = kwargs.get("interval_end")
+
+                # Validate that incremental key is present in the pipeline
+                incremental_key = kwargs.get("incremental_key")
+                self._validate_incremental_query(query, str(incremental_key))
+
+                incremental = dlt_incremental(
+                    str(incremental_key),
+                    initial_value=start_value,
+                    end_value=end_value,
+                )
+
+            # Substitute interval parameters in the query
+            query = self._substitute_interval_params(query, kwargs)
+
+            # Parse collection name to get database and collection
+            if "." in collection_name:
+                # Handle database.collection format
+                table_fields = table_string_to_dataclass(collection_name)
+                database = table_fields.dataset
+                collection = table_fields.table
+            else:
+                # Single collection name, use default database
+                database = None
+                collection = collection_name
+
+            table_instance = self.table_builder(
+                connection_url=uri,
+                database=database,
+                collection=collection,
+                parallel=False,
+                incremental=incremental,
+                custom_query=query,
             )
+            table_instance.max_table_nesting = 1
+            return table_instance
+        else:
+            # Default behavior for simple collection names
+            table_fields = table_string_to_dataclass(table)
 
-
-
-
-
-            parallel=True,
-            incremental=incremental,
-        )
+            incremental = None
+            if kwargs.get("incremental_key"):
+                start_value = kwargs.get("interval_start")
+                end_value = kwargs.get("interval_end")
 
-
+                incremental = dlt_incremental(
+                    kwargs.get("incremental_key", ""),
+                    initial_value=start_value,
+                    end_value=end_value,
+                )
+
+            table_instance = self.table_builder(
+                connection_url=uri,
+                database=table_fields.dataset,
+                collection=table_fields.table,
+                parallel=False,
+                incremental=incremental,
+            )
+            table_instance.max_table_nesting = 1
+
+            return table_instance
+
+    def _validate_incremental_query(self, query: list, incremental_key: str):
+        """Validate that incremental key is projected in the aggregation pipeline"""
+        # Check if there's a $project stage and if incremental_key is included
+        has_project = False
+        incremental_key_projected = False
+
+        for stage in query:
+            if "$project" in stage:
+                has_project = True
+                project_stage = stage["$project"]
+                if isinstance(project_stage, dict):
+                    # Check if incremental_key is explicitly included
+                    if incremental_key in project_stage:
+                        if project_stage[incremental_key] not in [0, False]:
+                            incremental_key_projected = True
+                    # If there are only inclusions (1 or True values) and incremental_key is not included
+                    elif any(v in [1, True] for v in project_stage.values()):
+                        # This is an inclusion projection, incremental_key must be explicitly included
+                        incremental_key_projected = False
+                    # If there are only exclusions (0 or False values) and incremental_key is not excluded
+                    elif all(
+                        v in [0, False]
+                        for v in project_stage.values()
+                        if v in [0, False, 1, True]
+                    ):
+                        # This is an exclusion projection, incremental_key is included by default
+                        if incremental_key not in project_stage:
+                            incremental_key_projected = True
+                        else:
+                            incremental_key_projected = project_stage[
+                                incremental_key
+                            ] not in [0, False]
+                else:
+                    # Mixed or unclear projection, assume incremental_key needs to be explicit
+                    incremental_key_projected = False
+
+        # If there's a $project stage but incremental_key is not projected, raise error
+        if has_project and not incremental_key_projected:
+            raise ValueError(
+                f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
+            )
+
+    def _substitute_interval_params(self, query: list, kwargs: dict):
+        """Substitute :interval_start and :interval_end placeholders with actual datetime values"""
+        from dlt.common.time import ensure_pendulum_datetime
+
+        # Get interval values and convert them to datetime objects
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        # Convert string dates to datetime objects if needed
+        if interval_start is not None:
+            if isinstance(interval_start, str):
+                pendulum_dt = ensure_pendulum_datetime(interval_start)
+                interval_start = (
+                    pendulum_dt.to_datetime()
+                    if hasattr(pendulum_dt, "to_datetime")
+                    else pendulum_dt
+                )
+            elif hasattr(interval_start, "to_datetime"):
+                interval_start = interval_start.to_datetime()
+
+        if interval_end is not None:
+            if isinstance(interval_end, str):
+                pendulum_dt = ensure_pendulum_datetime(interval_end)
+                interval_end = (
+                    pendulum_dt.to_datetime()
+                    if hasattr(pendulum_dt, "to_datetime")
+                    else pendulum_dt
+                )
+            elif hasattr(interval_end, "to_datetime"):
+                interval_end = interval_end.to_datetime()
+
+        # Deep copy the query and replace placeholders with actual datetime objects
+        def replace_placeholders(obj):
+            if isinstance(obj, dict):
+                result = {}
+                for key, value in obj.items():
+                    if value == ":interval_start" and interval_start is not None:
+                        result[key] = interval_start
+                    elif value == ":interval_end" and interval_end is not None:
+                        result[key] = interval_end
+                    else:
+                        result[key] = replace_placeholders(value)
+                return result
+            elif isinstance(obj, list):
+                return [replace_placeholders(item) for item in obj]
+            else:
+                return obj
+
+        return replace_placeholders(query)
 
 
 class LocalCsvSource:
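The MongoDB hunk above accepts a `collection:pipeline` table string and swaps `:interval_start` / `:interval_end` placeholders inside the aggregation pipeline for real datetimes. A minimal standalone sketch of that substitution idea, assuming a plain-Python pipeline and timezone-aware datetimes (names here are illustrative, not ingestr's):

```python
from datetime import datetime, timezone


def replace_placeholders(obj, interval_start=None, interval_end=None):
    """Recursively swap ':interval_start' / ':interval_end' strings for datetime values."""
    if isinstance(obj, dict):
        out = {}
        for key, value in obj.items():
            if value == ":interval_start" and interval_start is not None:
                out[key] = interval_start
            elif value == ":interval_end" and interval_end is not None:
                out[key] = interval_end
            else:
                out[key] = replace_placeholders(value, interval_start, interval_end)
        return out
    if isinstance(obj, list):
        return [replace_placeholders(item, interval_start, interval_end) for item in obj]
    return obj


pipeline = [{"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}}]
resolved = replace_placeholders(
    pipeline,
    interval_start=datetime(2024, 1, 1, tzinfo=timezone.utc),
    interval_end=datetime(2024, 2, 1, tzinfo=timezone.utc),
)
print(resolved)
```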
@@ -326,7 +629,7 @@ class LocalCsvSource:
 
     def dlt_source(self, uri: str, table: str, **kwargs):
         def csv_file(
-            incremental: Optional[
+            incremental: Optional[dlt_incremental[Any]] = None,
         ):
             file_path = uri.split("://")[1]
             myFile = open(file_path, "r")
@@ -357,6 +660,7 @@ class LocalCsvSource:
                     if inc_value < incremental.start_value:
                         continue
 
+                    dictionary = self.remove_empty_columns(dictionary)
                     page.append(dictionary)
                     current_items += 1
                 else:
@@ -367,11 +671,13 @@ class LocalCsvSource:
             if page:
                 yield page
 
-
+        from dlt import resource
+
+        return resource(
             csv_file,
             merge_key=kwargs.get("merge_key"),  # type: ignore
         )(
-            incremental=
+            incremental=dlt_incremental(
                 kwargs.get("incremental_key", ""),
                 initial_value=kwargs.get("interval_start"),
                 end_value=kwargs.get("interval_end"),
@@ -380,11 +686,19 @@ class LocalCsvSource:
             )
         )
 
+    def remove_empty_columns(self, row: Dict[str, str]) -> Dict[str, str]:
+        return {k: v for k, v in row.items() if v.strip() != ""}
+
 
 class NotionSource:
     table_builder: Callable
 
-    def __init__(self, table_builder=
+    def __init__(self, table_builder=None) -> None:
+        if table_builder is None:
+            from ingestr.src.notion import notion_databases
+
+            table_builder = notion_databases
+
         self.table_builder = table_builder
 
     def handles_incrementality(self) -> bool:
@@ -411,6 +725,11 @@ class ShopifySource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Shopify takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         source_fields = urlparse(uri)
         source_params = parse_qs(source_fields.query)
         api_key = source_params.get("api_key")
@@ -444,6 +763,8 @@ class ShopifySource:
                 f"Table name '{table}' is not supported for Shopify source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )
 
+        from ingestr.src.shopify import shopify_source
+
         return shopify_source(
             private_app_password=api_key[0],
             shop_url=f"https://{source_fields.netloc}",
@@ -488,6 +809,8 @@ class GorgiasSource:
         if kwargs.get("interval_end"):
             date_args["end_date"] = kwargs.get("interval_end")
 
+        from ingestr.src.gorgias import gorgias_source
+
         return gorgias_source(
             domain=source_fields.netloc,
             email=email[0],
@@ -499,7 +822,12 @@ class GorgiasSource:
 class GoogleSheetsSource:
     table_builder: Callable
 
-    def __init__(self, table_builder=
+    def __init__(self, table_builder=None) -> None:
+        if table_builder is None:
+            from ingestr.src.google_sheets import google_spreadsheet
+
+            table_builder = google_spreadsheet
+
         self.table_builder = table_builder
 
     def handles_incrementality(self) -> bool:
@@ -580,6 +908,8 @@ class ChessSource:
                 f"Resource '{table}' is not supported for Chess source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )
 
+        from ingestr.src.chess import source
+
         return source(players=list_players, **date_args).with_resources(
             table_mapping[table]
         )
@@ -603,40 +933,74 @@ class StripeAnalyticsSource:
         if not api_key:
             raise ValueError("api_key in the URI is required to connect to Stripe")
 
-
-        table = str.capitalize(table)
+        table = table.lower()
 
-
-
-
-
-
-
-
-
-
-
-
-
+        from ingestr.src.stripe_analytics.settings import ENDPOINTS
+
+        endpoint = None
+        incremental = False
+        sync = False
+
+        table_fields = table.split(":")
+        if len(table_fields) == 1:
+            endpoint = table_fields[0]
+        elif len(table_fields) == 2:
+            endpoint = table_fields[0]
+            sync = table_fields[1] == "sync"
+        elif len(table_fields) == 3:
+            endpoint = table_fields[0]
+            sync = table_fields[1] == "sync"
+            incremental = table_fields[2] == "incremental"
         else:
             raise ValueError(
-
+                "Invalid Stripe table format. Expected: stripe:<endpoint> or stripe:<endpoint>:<sync> or stripe:<endpoint>:<sync>:<incremental>"
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if incremental and not sync:
+            raise ValueError("incremental loads must be used with sync loading")
+
+        if incremental:
+            from ingestr.src.stripe_analytics import incremental_stripe_source
+
+            def nullable_date(date_str: Optional[str]):
+                if date_str:
+                    return ensure_pendulum_datetime(date_str)
+                return None
+
+            endpoint = ENDPOINTS[endpoint]
+            return incremental_stripe_source(
+                endpoints=[
+                    endpoint,
+                ],
+                stripe_secret_key=api_key[0],
+                initial_start_date=nullable_date(kwargs.get("interval_start", None)),
+                end_date=nullable_date(kwargs.get("interval_end", None)),
+            ).with_resources(endpoint)
+        else:
+            endpoint = ENDPOINTS[endpoint]
+            if sync:
+                from ingestr.src.stripe_analytics import stripe_source
+
+                return stripe_source(
+                    endpoints=[
+                        endpoint,
+                    ],
+                    stripe_secret_key=api_key[0],
+                ).with_resources(endpoint)
+            else:
+                from ingestr.src.stripe_analytics import async_stripe_source
+
+                return async_stripe_source(
+                    endpoints=[
+                        endpoint,
+                    ],
+                    stripe_secret_key=api_key[0],
+                    max_workers=kwargs.get("extract_parallelism", 4),
+                ).with_resources(endpoint)
+
+        raise ValueError(
+            f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+        )
 
 
 class FacebookAdsSource:
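The Stripe hunk above parses the table string as `<endpoint>[:<sync>[:<incremental>]]`. A small standalone sketch of that parsing logic, following the same rules as the diff (the function name is illustrative, not ingestr's):

```python
def parse_stripe_table(table: str):
    """Split a Stripe table string into (endpoint, sync, incremental)."""
    endpoint, sync, incremental = None, False, False
    fields = table.lower().split(":")
    if len(fields) == 1:
        endpoint = fields[0]
    elif len(fields) == 2:
        endpoint = fields[0]
        sync = fields[1] == "sync"
    elif len(fields) == 3:
        endpoint = fields[0]
        sync = fields[1] == "sync"
        incremental = fields[2] == "incremental"
    else:
        raise ValueError("Expected <endpoint>, <endpoint>:<sync> or <endpoint>:<sync>:<incremental>")
    if incremental and not sync:
        raise ValueError("incremental loads must be used with sync loading")
    return endpoint, sync, incremental


print(parse_stripe_table("charges"))                   # ('charges', False, False)
print(parse_stripe_table("charges:sync"))              # ('charges', True, False)
print(parse_stripe_table("charges:sync:incremental"))  # ('charges', True, True)
```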
@@ -662,17 +1026,76 @@ class FacebookAdsSource:
                 "access_token and accound_id are required to connect to Facebook Ads."
             )
 
+        from ingestr.src.facebook_ads import (
+            facebook_ads_source,
+            facebook_insights_source,
+        )
+
+        insights_max_wait_to_finish_seconds = source_params.get(
+            "insights_max_wait_to_finish_seconds", [60 * 60 * 4]
+        )
+        insights_max_wait_to_start_seconds = source_params.get(
+            "insights_max_wait_to_start_seconds", [60 * 30]
+        )
+        insights_max_async_sleep_seconds = source_params.get(
+            "insights_max_async_sleep_seconds", [20]
+        )
+
         endpoint = None
         if table in ["campaigns", "ad_sets", "ad_creatives", "ads", "leads"]:
             endpoint = table
-        elif table
+        elif table == "facebook_insights":
             return facebook_insights_source(
                 access_token=access_token[0],
                 account_id=account_id[0],
+                start_date=kwargs.get("interval_start"),
+                end_date=kwargs.get("interval_end"),
+                insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds[
+                    0
+                ],
+                insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds[
+                    0
+                ],
+                insights_max_async_sleep_seconds=insights_max_async_sleep_seconds[0],
             ).with_resources("facebook_insights")
+        elif table.startswith("facebook_insights:"):
+            # Parse custom breakdowns and metrics from table name
+            # Supported formats:
+            # facebook_insights:breakdown_type
+            # facebook_insights:breakdown_type:metric1,metric2...
+            parts = table.split(":")
+
+            if len(parts) < 2 or len(parts) > 3:
+                raise ValueError(
+                    "Invalid facebook_insights format. Expected: facebook_insights:breakdown_type or facebook_insights:breakdown_type:metric1,metric2..."
+                )
+
+            breakdown_type = parts[1].strip()
+            if not breakdown_type:
+                raise ValueError(
+                    "Breakdown type must be provided in format: facebook_insights:breakdown_type"
+                )
+
+            # Validate breakdown type against available options from settings
+
+            from ingestr.src.facebook_ads.helpers import (
+                parse_insights_table_to_source_kwargs,
+            )
+
+            source_kwargs = {
+                "access_token": access_token[0],
+                "account_id": account_id[0],
+                "start_date": kwargs.get("interval_start"),
+                "end_date": kwargs.get("interval_end"),
+            }
+
+            source_kwargs.update(parse_insights_table_to_source_kwargs(table))
+            return facebook_insights_source(**source_kwargs).with_resources(
+                "facebook_insights"
+            )
         else:
             raise ValueError(
-                "
+                f"Resource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )
 
         return facebook_ads_source(
@@ -719,6 +1142,8 @@ class SlackSource:
         if kwargs.get("interval_end"):
             date_args["end_date"] = kwargs.get("interval_end")
 
+        from ingestr.src.slack import slack_source
+
         return slack_source(
             access_token=api_key[0],
             table_per_channel=False,
@@ -729,7 +1154,7 @@ class SlackSource:
 
 class HubspotSource:
     def handles_incrementality(self) -> bool:
-        return
+        return False
 
     # hubspot://?api_key=<api_key>
     def dlt_source(self, uri: str, table: str, **kwargs):
@@ -747,7 +1172,35 @@ class HubspotSource:
             raise ValueError("api_key in the URI is required to connect to Hubspot")
 
         endpoint = None
-
+
+        from ingestr.src.hubspot import hubspot
+
+        if table.startswith("custom:"):
+            fields = table.split(":", 2)
+            if len(fields) != 2 and len(fields) != 3:
+                raise ValueError(
+                    "Invalid Hubspot custom table format. Expected format: custom:<custom_object_type> or custom:<custom_object_type>:<associations>"
+                )
+
+            if len(fields) == 2:
+                endpoint = fields[1]
+            else:
+                endpoint = f"{fields[1]}:{fields[2]}"
+
+            return hubspot(
+                api_key=api_key[0],
+                custom_object=endpoint,
+            ).with_resources("custom")
+
+        elif table in [
+            "contacts",
+            "companies",
+            "deals",
+            "tickets",
+            "products",
+            "quotes",
+            "schemas",
+        ]:
             endpoint = table
         else:
             raise ValueError(
@@ -772,20 +1225,31 @@ class AirtableSource:
         if not table:
             raise ValueError("Source table is required to connect to Airtable")
 
-        tables = table.split(",")
-
         source_parts = urlparse(uri)
         source_fields = parse_qs(source_parts.query)
-        base_id = source_fields.get("base_id")
         access_token = source_fields.get("access_token")
 
-        if not
+        if not access_token:
             raise ValueError(
-                "
+                "access_token in the URI is required to connect to Airtable"
             )
 
+        base_id = source_fields.get("base_id", [None])[0]
+        clean_table = table
+
+        table_fields = table.split("/")
+        if len(table_fields) == 2:
+            clean_table = table_fields[1]
+            if not base_id:
+                base_id = table_fields[0]
+
+        if not base_id:
+            raise ValueError("base_id in the URI is required to connect to Airtable")
+
+        from ingestr.src.airtable import airtable_source
+
         return airtable_source(
-            base_id=base_id
+            base_id=base_id, table_names=[clean_table], access_token=access_token[0]
         )
 
 
@@ -831,12 +1295,66 @@ class KlaviyoSource:
             )
 
         start_date = kwargs.get("interval_start") or "2000-01-01"
+
+        from ingestr.src.klaviyo import klaviyo_source
+
         return klaviyo_source(
             api_key=api_key[0],
             start_date=start_date,
         ).with_resources(resource)
 
 
+class MixpanelSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Mixpanel takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed = urlparse(uri)
+        params = parse_qs(parsed.query)
+        username = params.get("username")
+        password = params.get("password")
+        project_id = params.get("project_id")
+        server = params.get("server", ["eu"])
+
+        if not username or not password or not project_id:
+            raise ValueError(
+                "username, password, project_id are required to connect to Mixpanel"
+            )
+
+        if table not in ["events", "profiles"]:
+            raise ValueError(
+                f"Resource '{table}' is not supported for Mixpanel source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+        start_date = kwargs.get("interval_start")
+        if start_date:
+            start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date:
+            end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
+        else:
+            end_date = pendulum.now().in_timezone("UTC")
+
+        from ingestr.src.mixpanel import mixpanel_source
+
+        return mixpanel_source(
+            username=username[0],
+            password=password[0],
+            project_id=project_id[0],
+            start_date=start_date,
+            end_date=end_date,
+            server=server[0],
+        ).with_resources(table)
+
+
 class KafkaSource:
     def handles_incrementality(self) -> bool:
         return False
@@ -864,6 +1382,9 @@ class KafkaSource:
             raise ValueError("group_id in the URI is required to connect to kafka")
 
         start_date = kwargs.get("interval_start")
+        from ingestr.src.kafka import kafka_consumer
+        from ingestr.src.kafka.helpers import KafkaCredentials
+
         return kafka_consumer(
             topics=[table],
             credentials=KafkaCredentials(
@@ -919,6 +1440,9 @@ class AdjustSource:
         if kwargs.get("interval_end"):
             end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
 
+        from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
+        from ingestr.src.adjust.adjust_helpers import parse_filters
+
         dimensions = None
         metrics = None
         filters = []
@@ -966,6 +1490,8 @@ class AppsflyerSource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
+        from ingestr.src.appsflyer import appsflyer_source
+
         if kwargs.get("incremental_key"):
             raise ValueError(
                 "Appsflyer_Source takes care of incrementality on its own, you should not provide incremental_key"
@@ -978,22 +1504,27 @@ class AppsflyerSource:
         if not api_key:
             raise ValueError("api_key in the URI is required to connect to Appsflyer")
 
-
-
-
-
-
-
-        )
-
-
-
+        start_date = kwargs.get("interval_start")
+        end_date = kwargs.get("interval_end")
+        dimensions = []
+        metrics = []
+        if table.startswith("custom:"):
+            fields = table.split(":", 3)
+            if len(fields) != 3:
+                raise ValueError(
+                    "Invalid Adjust custom table format. Expected format: custom:<dimensions>:<metrics>"
+                )
+            dimensions = fields[1].split(",")
+            metrics = fields[2].split(",")
+            table = "custom"
 
         return appsflyer_source(
             api_key=api_key[0],
-            start_date=start_date,
-            end_date=end_date,
-
+            start_date=start_date.strftime("%Y-%m-%d") if start_date else None,  # type: ignore
+            end_date=end_date.strftime("%Y-%m-%d") if end_date else None,  # type: ignore
+            dimensions=dimensions,
+            metrics=metrics,
+        ).with_resources(table)
 
 
 class ZendeskSource:
@@ -1018,6 +1549,12 @@ class ZendeskSource:
         if not subdomain:
             raise ValueError("Subdomain is required to connect with Zendesk")
 
+        from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
+        from ingestr.src.zendesk.helpers.credentials import (
+            ZendeskCredentialsOAuth,
+            ZendeskCredentialsToken,
+        )
+
         if not source_fields.username and source_fields.password:
             oauth_token = source_fields.password
             if not oauth_token:
@@ -1076,7 +1613,7 @@ class ZendeskSource:
             ).with_resources(table)
         else:
             raise ValueError(
-                "
+                f"Resource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )
 
 
@@ -1091,7 +1628,7 @@ class S3Source:
             )
 
         parsed_uri = urlparse(uri)
-        source_fields = parse_qs(
+        source_fields = parse_qs(parsed_uri.query)
         access_key_id = source_fields.get("access_key_id")
         if not access_key_id:
             raise ValueError("access_key_id is required to connect to S3")
@@ -1106,22 +1643,34 @@ class S3Source:
 
         bucket_url = f"s3://{bucket_name}/"
 
+        import s3fs  # type: ignore
+
         fs = s3fs.S3FileSystem(
             key=access_key_id[0],
             secret=secret_access_key[0],
         )
 
-
-        if
-            endpoint = "
-
-
-
-
+        endpoint: Optional[str] = None
+        if "#" in table:
+            _, endpoint = table.split("#")
+            if endpoint not in ["csv", "jsonl", "parquet"]:
+                raise ValueError(
+                    "S3 Source only supports specific formats files: csv, jsonl, parquet"
+                )
+            endpoint = f"read_{endpoint}"
         else:
-
-
-
+            try:
+                endpoint = blob.parse_endpoint(path_to_file)
+            except blob.UnsupportedEndpointError:
+                raise ValueError(
+                    "S3 Source only supports specific formats files: csv, jsonl, parquet"
+                )
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to parse endpoint from path: {path_to_file}"
+                ) from e
+
+        from ingestr.src.filesystem import readers
 
         return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
 
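The S3 hunk above resolves the reader resource (`read_csv`, `read_jsonl`, `read_parquet`) from a `#format` fragment on the table string, falling back to `blob.parse_endpoint` on the file path. A minimal sketch of the same idea; the extension-based fallback here is a simplified stand-in for ingestr's `blob.parse_endpoint`, and the function name is illustrative:

```python
SUPPORTED_FORMATS = ("csv", "jsonl", "parquet")


def resolve_reader_endpoint(table: str) -> str:
    """Map 'bucket/path#csv' (or a plain path) to a read_<format> endpoint."""
    if "#" in table:
        _, fmt = table.split("#", 1)
    else:
        # simplified stand-in for blob.parse_endpoint: infer the format from the extension
        fmt = table.rsplit(".", 1)[-1]
    if fmt not in SUPPORTED_FORMATS:
        raise ValueError("S3 Source only supports specific formats files: csv, jsonl, parquet")
    return f"read_{fmt}"


print(resolve_reader_endpoint("my-bucket/events/2024/*.parquet#parquet"))  # read_parquet
print(resolve_reader_endpoint("my-bucket/events.csv"))                     # read_csv
```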
@@ -1132,6 +1681,11 @@ class TikTokSource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "TikTok takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         endpoint = "custom_reports"
 
         parsed_uri = urlparse(uri)
@@ -1217,6 +1771,8 @@ class TikTokSource:
             filter_name = list(filters.keys())[0]
             filter_value = list(map(int, filters[list(filters.keys())[0]]))
 
+        from ingestr.src.tiktok_ads import tiktok_source
+
         return tiktok_source(
             start_date=start_date,
             end_date=end_date,
@@ -1265,20 +1821,83 @@ class AsanaSource:
                 f"Resource '{table}' is not supported for Asana source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )
 
+        import dlt
+
+        from ingestr.src.asana_source import asana_source
+
         dlt.secrets["sources.asana_source.access_token"] = access_token[0]
+
         src = asana_source()
         src.workspaces.add_filter(lambda w: w["gid"] == workspace)
         return src.with_resources(table)
 
 
-class
-
-
-
-
-
-
-
+class JiraSource:
+    resources = [
+        "projects",
+        "issues",
+        "users",
+        "issue_types",
+        "statuses",
+        "priorities",
+        "resolutions",
+        "project_versions",
+        "project_components",
+        "events",
+    ]
+
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        base_url = f"https://{parsed_uri.netloc}"
+        email = params.get("email")
+        api_token = params.get("api_token")
+
+        if not email:
+            raise ValueError("email must be specified in the URI query parameters")
+
+        if not api_token:
+            raise ValueError("api_token is required for connecting to Jira")
+
+        flags = {
+            "skip_archived": False,
+        }
+        if ":" in table:
+            table, rest = table.split(":", 1)  # type: ignore
+            for k in rest.split(":"):
+                flags[k] = True
+
+        if table not in self.resources:
+            raise ValueError(
+                f"Resource '{table}' is not supported for Jira source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+        import dlt
+
+        from ingestr.src.jira_source import jira_source
+
+        dlt.secrets["sources.jira_source.base_url"] = base_url
+        dlt.secrets["sources.jira_source.email"] = email[0]
+        dlt.secrets["sources.jira_source.api_token"] = api_token[0]
+
+        src = jira_source()
+        if flags["skip_archived"]:
+            src.projects.add_filter(lambda p: not p.get("archived", False))
+        return src.with_resources(table)
+
+
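Note: a small illustration (not part of the diff) of the table flag syntax the new JiraSource accepts; the table value below is hypothetical.

    table = "projects:skip_archived"  # hypothetical "<resource>:<flag>" value
    flags = {"skip_archived": False}
    if ":" in table:
        table, rest = table.split(":", 1)
        for k in rest.split(":"):
            flags[k] = True
    print(table, flags)  # projects {'skip_archived': True}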
+class DynamoDBSource:
+    AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
+
+    def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
+        # try to infer from URI
+        matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
+        if matches is not None:
+            return matches[1]
 
         # else obtain region from query string
         region = parse_qs(uri.query).get("region")
@@ -1301,7 +1920,7 @@ class DynamoDBSource:
         if not region:
             raise ValueError("region is required to connect to Dynamodb")
 
-        qs = parse_qs(
+        qs = parse_qs(parsed_uri.query)
         access_key = qs.get("access_key_id")
 
         if not access_key:
@@ -1311,6 +1930,9 @@ class DynamoDBSource:
         if not secret_key:
             raise ValueError("secret_access_key is required to connect to Dynamodb")
 
+        from dlt.common.configuration.specs import AwsCredentials
+        from dlt.common.typing import TSecretStrValue
+
         creds = AwsCredentials(
             aws_access_key_id=access_key[0],
             aws_secret_access_key=TSecretStrValue(secret_key[0]),
@@ -1321,8 +1943,11 @@ class DynamoDBSource:
         incremental = None
         incremental_key = kwargs.get("incremental_key")
 
+        from ingestr.src.dynamodb import dynamodb
+        from ingestr.src.time import isotime
+
         if incremental_key:
-            incremental =
+            incremental = dlt_incremental(
                 incremental_key.strip(),
                 initial_value=isotime(kwargs.get("interval_start")),
                 end_value=isotime(kwargs.get("interval_end")),
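Note: a standalone sketch (not part of the diff) of the endpoint-based region inference used by the new DynamoDBSource; the host below is a hypothetical example.

    import re

    AWS_ENDPOINT_PATTERN = re.compile(r".*\.(.+)\.amazonaws\.com")
    match = AWS_ENDPOINT_PATTERN.match("dynamodb.eu-west-1.amazonaws.com")
    print(match[1] if match else None)  # eu-west-1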
@@ -1334,47 +1959,127 @@ class DynamoDBSource:
         return dynamodb(table, creds, incremental)
 
 
+class DoceboSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        # docebo://?base_url=https://yourcompany.docebosaas.com&client_id=xxx&client_secret=xxx
+        # Optional: &username=xxx&password=xxx for password grant type
+
+        if kwargs.get("incremental_key"):
+            raise ValueError("Incremental loads are not yet supported for Docebo")
+
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+
+        base_url = source_params.get("base_url")
+        if not base_url:
+            raise ValueError("base_url is required to connect to Docebo")
+
+        client_id = source_params.get("client_id")
+        if not client_id:
+            raise ValueError("client_id is required to connect to Docebo")
+
+        client_secret = source_params.get("client_secret")
+        if not client_secret:
+            raise ValueError("client_secret is required to connect to Docebo")
+
+        # Username and password are optional (uses client_credentials grant if not provided)
+        username = source_params.get("username", [None])[0]
+        password = source_params.get("password", [None])[0]
+
+        # Supported tables
+        supported_tables = [
+            "users",
+            "courses",
+            "user_fields",
+            "branches",
+            "groups",
+            "group_members",
+            "course_fields",
+            "learning_objects",
+            "learning_plans",
+            "learning_plan_enrollments",
+            "learning_plan_course_enrollments",
+            "course_enrollments",
+            "sessions",
+            "categories",
+            "certifications",
+            "external_training",
+            "survey_answers",
+        ]
+        if table not in supported_tables:
+            raise ValueError(
+                f"Resource '{table}' is not supported for Docebo source. Supported tables: {', '.join(supported_tables)}"
+            )
+
+        from ingestr.src.docebo import docebo_source
+
+        return docebo_source(
+            base_url=base_url[0],
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            username=username,
+            password=password,
+        ).with_resources(table)
+
+
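Note: a quick sketch (not part of the diff) of how the Docebo URI documented in the comment above is parsed with urlparse/parse_qs; the host and credentials are hypothetical.

    from urllib.parse import parse_qs, urlparse

    uri = "docebo://?base_url=https://example.docebosaas.com&client_id=abc&client_secret=xyz"
    params = parse_qs(urlparse(uri).query)
    print(params["base_url"][0])   # https://example.docebosaas.com
    print(params["client_id"][0])  # abc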
 class GoogleAnalyticsSource:
     def handles_incrementality(self) -> bool:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
-
-        source_fields = parse_qs(parse_uri.query)
-        cred_path = source_fields.get("credentials_path")
-
-        if not cred_path:
-            raise ValueError("credentials_path is required to connect Google Analytics")
-        credentials = {}
+        import ingestr.src.google_analytics.helpers as helpers
 
-
-
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Google Analytics takes care of incrementality on its own, you should not provide incremental_key"
+            )
 
-
-
-
+        result = helpers.parse_google_analytics_uri(uri)
+        credentials = result["credentials"]
+        property_id = result["property_id"]
 
         fields = table.split(":")
-        if len(fields) != 3:
+        if len(fields) != 3 and len(fields) != 4:
             raise ValueError(
-                "Invalid table format. Expected format:
+                "Invalid table format. Expected format: <report_type>:<dimensions>:<metrics> or <report_type>:<dimensions>:<metrics>:<minute_ranges>"
             )
 
-
-
-        datetime = ""
-        for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
-            if dimension_datetime in dimensions:
-                datetime = dimension_datetime
-                break
-        else:
+        report_type = fields[0]
+        if report_type not in ["custom", "realtime"]:
             raise ValueError(
-                "
+                "Invalid report type. Expected format: <report_type>:<dimensions>:<metrics>. Available report types: custom, realtime"
             )
 
+        dimensions = fields[1].replace(" ", "").split(",")
         metrics = fields[2].replace(" ", "").split(",")
+
+        minute_range_objects = []
+        if len(fields) == 4:
+            minute_range_objects = (
+                helpers.convert_minutes_ranges_to_minute_range_objects(fields[3])
+            )
+
+        datetime = ""
+        resource_name = fields[0].lower()
+        if resource_name == "custom":
+            for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+                if dimension_datetime in dimensions:
+                    datetime = dimension_datetime
+                    break
+            else:
+                raise ValueError(
+                    "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+                )
+
         queries = [
-            {
+            {
+                "resource_name": resource_name,
+                "dimensions": dimensions,
+                "metrics": metrics,
+            }
         ]
 
         start_date = pendulum.now().subtract(days=30).start_of("day")
@@ -1385,14 +2090,17 @@ class GoogleAnalyticsSource:
         if kwargs.get("interval_end") is not None:
             end_date = pendulum.instance(kwargs.get("interval_end"))  # type: ignore
 
+        from ingestr.src.google_analytics import google_analytics
+
         return google_analytics(
-            property_id=property_id
+            property_id=property_id,
             start_date=start_date,
             end_date=end_date,
             datetime_dimension=datetime,
             queries=queries,
             credentials=credentials,
-
+            minute_range_objects=minute_range_objects if minute_range_objects else None,
+        ).with_resources(resource_name)
 
 
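Note: a minimal sketch (not part of the diff) of the <report_type>:<dimensions>:<metrics> table format that the reworked GoogleAnalyticsSource expects; the dimensions and metrics below are hypothetical.

    table = "custom:date,country:activeUsers,sessions"
    fields = table.split(":")
    report_type = fields[0]
    dimensions = fields[1].replace(" ", "").split(",")
    metrics = fields[2].replace(" ", "").split(",")
    print(report_type, dimensions, metrics)
    # custom ['date', 'country'] ['activeUsers', 'sessions']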
 class GitHubSource:
@@ -1422,12 +2130,34 @@ class GitHubSource:
 
         access_token = source_fields.get("access_token", [""])[0]
 
+        from ingestr.src.github import (
+            github_reactions,
+            github_repo_events,
+            github_stargazers,
+        )
+
         if table in ["issues", "pull_requests"]:
             return github_reactions(
                 owner=owner, name=repo, access_token=access_token
             ).with_resources(table)
         elif table == "repo_events":
-
+            start_date = kwargs.get("interval_start") or pendulum.now().subtract(
+                days=30
+            )
+            end_date = kwargs.get("interval_end") or None
+
+            if isinstance(start_date, str):
+                start_date = pendulum.parse(start_date)
+            if isinstance(end_date, str):
+                end_date = pendulum.parse(end_date)
+
+            return github_repo_events(
+                owner=owner,
+                name=repo,
+                access_token=access_token,
+                start_date=start_date,
+                end_date=end_date,
+            )
         elif table == "stargazers":
             return github_stargazers(owner=owner, name=repo, access_token=access_token)
         else:
@@ -1454,6 +2184,8 @@ class AppleAppStoreSource:
         else:
             key = base64.b64decode(key_base64[0]).decode()  # type: ignore
 
+        from ingestr.src.appstore.client import AppStoreConnectClient
+
         return AppStoreConnectClient(key.encode(), key_id, issuer_id)
 
     def dlt_source(self, uri: str, table: str, **kwargs):
@@ -1494,6 +2226,8 @@ class AppleAppStoreSource:
         if app_ids is None:
             raise MissingValueError("app_id", "App Store")
 
+        from ingestr.src.appstore import app_store
+
         src = app_store(
             client,
             app_ids,
@@ -1550,21 +2284,24 @@ class GCSSource:
         # (The RECOMMENDED way of passing service account credentials)
         # directly with gcsfs. As a workaround, we construct the GCSFileSystem
         # and pass it directly to filesystem.readers.
+        import gcsfs  # type: ignore
+
         fs = gcsfs.GCSFileSystem(
             token=credentials,
         )
 
-
-
-
-        elif file_extension == "jsonl":
-            endpoint = "read_jsonl"
-        elif file_extension == "parquet":
-            endpoint = "read_parquet"
-        else:
+        try:
+            endpoint = blob.parse_endpoint(path_to_file)
+        except blob.UnsupportedEndpointError:
             raise ValueError(
                 "GCS Source only supports specific formats files: csv, jsonl, parquet"
             )
+        except Exception as e:
+            raise ValueError(
+                f"Failed to parse endpoint from path: {path_to_file}"
+            ) from e
+
+        from ingestr.src.filesystem import readers
 
         return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
 
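Note: blob.parse_endpoint now drives reader selection for both GCSSource and S3Source; its implementation lives in ingestr/src/blob.py and is not shown in this hunk. The sketch below is only a guess at the behaviour implied by the calling code (file extension mapped to a read_* resource, UnsupportedEndpointError otherwise), not the package's actual helper.

    class UnsupportedEndpointError(Exception):
        pass

    def parse_endpoint_sketch(path: str) -> str:
        # hypothetical stand-in for blob.parse_endpoint
        ext = path.rsplit(".", 1)[-1].lower()
        if ext not in ("csv", "jsonl", "parquet"):
            raise UnsupportedEndpointError(ext)
        return f"read_{ext}"

    print(parse_endpoint_sketch("exports/2024/*.parquet"))  # read_parquet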
@@ -1573,7 +2310,9 @@ class GoogleAdsSource:
     def handles_incrementality(self) -> bool:
         return True
 
-    def init_client(self, params: Dict[str, List[str]])
+    def init_client(self, params: Dict[str, List[str]]):
+        from google.ads.googleads.client import GoogleAdsClient  # type: ignore
+
         dev_token = params.get("dev_token")
         if dev_token is None or len(dev_token) == 0:
             raise MissingValueError("dev_token", "Google Ads")
@@ -1627,6 +2366,7 @@ class GoogleAdsSource:
             raise MissingValueError("customer_id", "Google Ads")
 
         params = parse_qs(parsed_uri.query)
+
         client = self.init_client(params)
 
         start_date = kwargs.get("interval_start") or datetime.now(
@@ -1648,6 +2388,8 @@ class GoogleAdsSource:
             report_spec = table
             table = "daily_report"
 
+        from ingestr.src.google_ads import google_ads
+
         src = google_ads(
             client,
             customer_id,
@@ -1667,6 +2409,11 @@ class LinkedInAdsSource:
         return True
 
     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "LinkedIn Ads takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         parsed_uri = urlparse(uri)
         source_fields = parse_qs(parsed_uri.query)
 
@@ -1712,6 +2459,12 @@ class LinkedInAdsSource:
                 "'date' or 'month' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
             )
 
+        from ingestr.src.linkedin_ads import linked_in_ads_source
+        from ingestr.src.linkedin_ads.dimension_time_enum import (
+            Dimension,
+            TimeGranularity,
+        )
+
         if "date" in dimensions:
             time_granularity = TimeGranularity.daily
             dimensions.remove("date")
@@ -1737,3 +2490,2170 @@ class LinkedInAdsSource:
             metrics=metrics,
             time_granularity=time_granularity,
         ).with_resources("custom_reports")
+
+
+class ClickupSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "ClickUp takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_token = params.get("api_token")
+
+        if api_token is None:
+            raise MissingValueError("api_token", "ClickUp")
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+        start_date = (
+            ensure_pendulum_datetime(interval_start).in_timezone("UTC")
+            if interval_start
+            else pendulum.datetime(2020, 1, 1, tz="UTC")
+        )
+        end_date = (
+            ensure_pendulum_datetime(interval_end).in_timezone("UTC")
+            if interval_end
+            else None
+        )
+
+        from ingestr.src.clickup import clickup_source
+
+        if table not in {"user", "teams", "lists", "tasks", "spaces"}:
+            raise UnsupportedResourceError(table, "ClickUp")
+
+        return clickup_source(
+            api_token=api_token[0], start_date=start_date, end_date=end_date
+        ).with_resources(table)
+
+
+class AppLovinSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key") is not None:
+            raise ValueError(
+                "Applovin takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        api_key = params.get("api_key", None)
+        if api_key is None:
+            raise MissingValueError("api_key", "AppLovin")
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        now = datetime.now()
+        start_date = (
+            interval_start if interval_start is not None else now - timedelta(days=1)
+        )
+        end_date = interval_end
+
+        custom_report = None
+        if table.startswith("custom:"):
+            custom_report = table
+            table = "custom_report"
+
+        from ingestr.src.applovin import applovin_source
+
+        src = applovin_source(
+            api_key[0],
+            start_date.strftime("%Y-%m-%d"),
+            end_date.strftime("%Y-%m-%d") if end_date else None,
+            custom_report,
+        )
+
+        if table not in src.resources:
+            raise UnsupportedResourceError(table, "AppLovin")
+
+        return src.with_resources(table)
+
+
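Note: a short sketch (not part of the diff) of the custom-report table handling in the AppLovinSource above; the report spec string below is hypothetical.

    table = "custom:report_type=network"  # hypothetical custom report spec
    custom_report = None
    if table.startswith("custom:"):
        custom_report = table
        table = "custom_report"
    print(table, custom_report)  # custom_report custom:report_type=network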
+class ApplovinMaxSource:
+    # expected uri format: applovinmax://?api_key=<api_key>
+    # expected table format: user_ad_revenue:app_id_1,app_id_2
+
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "AppLovin Max takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise ValueError("api_key is required to connect to AppLovin Max API.")
+
+        AVAILABLE_TABLES = ["user_ad_revenue"]
+
+        table_fields = table.split(":")
+        requested_table = table_fields[0]
+
+        if len(table_fields) != 2:
+            raise ValueError(
+                "Invalid table format. Expected format is user_ad_revenue:app_id_1,app_id_2"
+            )
+
+        if requested_table not in AVAILABLE_TABLES:
+            raise ValueError(
+                f"Table name '{requested_table}' is not supported for AppLovin Max source yet."
+                f"Only '{AVAILABLE_TABLES}' are currently supported. "
+                "If you need additional tables, please create a GitHub issue at "
+                "https://github.com/bruin-data/ingestr"
+            )
+
+        applications = [
+            i for i in table_fields[1].replace(" ", "").split(",") if i.strip()
+        ]
+        if len(applications) == 0:
+            raise ValueError("At least one application id is required")
+
+        if len(applications) != len(set(applications)):
+            raise ValueError("Application ids must be unique.")
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        now = pendulum.now("UTC")
+        default_start = now.subtract(days=30).date()
+
+        start_date = (
+            interval_start.date() if interval_start is not None else default_start
+        )
+
+        end_date = interval_end.date() if interval_end is not None else None
+
+        from ingestr.src.applovin_max import applovin_max_source
+
+        return applovin_max_source(
+            start_date=start_date,
+            end_date=end_date,
+            api_key=api_key[0],
+            applications=applications,
+        ).with_resources(requested_table)
+
+
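Note: a sketch (not part of the diff) of the table parsing in ApplovinMaxSource above, using the format from its inline comment; the application ids are hypothetical.

    table = "user_ad_revenue:app_id_1,app_id_2"
    table_fields = table.split(":")
    requested_table = table_fields[0]
    applications = [i for i in table_fields[1].replace(" ", "").split(",") if i.strip()]
    print(requested_table, applications)  # user_ad_revenue ['app_id_1', 'app_id_2']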
+class SalesforceSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Salesforce takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        params = parse_qs(urlparse(uri).query)
+        creds = {
+            "username": params.get("username", [None])[0],
+            "password": params.get("password", [None])[0],
+            "token": params.get("token", [None])[0],
+            "domain": params.get("domain", [None])[0],
+        }
+        for k, v in creds.items():
+            if v is None:
+                raise MissingValueError(k, "Salesforce")
+
+        from ingestr.src.salesforce import salesforce_source
+
+        src = salesforce_source(**creds)  # type: ignore
+
+        if table.startswith("custom:"):
+            custom_object = table.split(":")[1]
+            src = salesforce_source(**creds, custom_object=custom_object)
+            return src.with_resources("custom")
+
+        if table not in src.resources:
+            raise UnsupportedResourceError(table, "Salesforce")
+
+        return src.with_resources(table)
+
+
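Note: a sketch (not part of the diff) of the custom-object table handling in SalesforceSource above; the object name below is hypothetical.

    table = "custom:MyObject__c"  # hypothetical custom object
    if table.startswith("custom:"):
        custom_object = table.split(":")[1]
        print(custom_object)  # MyObject__c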
+class PersonioSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    # applovin://?client_id=123&client_secret=123
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Personio takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        client_id = params.get("client_id")
+        client_secret = params.get("client_secret")
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        interval_start_date = (
+            interval_start if interval_start is not None else "2018-01-01"
+        )
+
+        interval_end_date = (
+            interval_end.strftime("%Y-%m-%d") if interval_end is not None else None
+        )
+
+        if client_id is None:
+            raise MissingValueError("client_id", "Personio")
+        if client_secret is None:
+            raise MissingValueError("client_secret", "Personio")
+        if table not in [
+            "employees",
+            "absences",
+            "absence_types",
+            "attendances",
+            "projects",
+            "document_categories",
+            "employees_absences_balance",
+            "custom_reports_list",
+        ]:
+            raise UnsupportedResourceError(table, "Personio")
+
+        from ingestr.src.personio import personio_source
+
+        return personio_source(
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            start_date=interval_start_date,
+            end_date=interval_end_date,
+        ).with_resources(table)
+
+
+class KinesisSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        # kinesis://?aws_access_key_id=<AccessKeyId>&aws_secret_access_key=<SecretAccessKey>&region_name=<Region>
+        # source table = stream name
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        aws_access_key_id = params.get("aws_access_key_id")
+        if aws_access_key_id is None:
+            raise MissingValueError("aws_access_key_id", "Kinesis")
+
+        aws_secret_access_key = params.get("aws_secret_access_key")
+        if aws_secret_access_key is None:
+            raise MissingValueError("aws_secret_access_key", "Kinesis")
+
+        region_name = params.get("region_name")
+        if region_name is None:
+            raise MissingValueError("region_name", "Kinesis")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            # the resource will read all messages after this timestamp.
+            start_date = ensure_pendulum_datetime(start_date)
+
+        from dlt.common.configuration.specs import AwsCredentials
+
+        from ingestr.src.kinesis import kinesis_stream
+
+        credentials = AwsCredentials(
+            aws_access_key_id=aws_access_key_id[0],
+            aws_secret_access_key=aws_secret_access_key[0],
+            region_name=region_name[0],
+        )
+
+        return kinesis_stream(
+            stream_name=table, credentials=credentials, initial_at_timestamp=start_date
+        )
+
+
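Note: a sketch (not part of the diff) of how the Kinesis URI documented in the comment above is parsed; the credentials and region are hypothetical, and the source table is the stream name.

    from urllib.parse import parse_qs, urlparse

    uri = "kinesis://?aws_access_key_id=AKIAEXAMPLE&aws_secret_access_key=secret&region_name=eu-west-1"
    params = parse_qs(urlparse(uri).query)
    print(params["region_name"][0])  # eu-west-1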
+class PipedriveSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Pipedrive takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_token")
+        if api_key is None:
+            raise MissingValueError("api_token", "Pipedrive")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.parse("2000-01-01")
+
+        if table not in [
+            "users",
+            "activities",
+            "persons",
+            "organizations",
+            "products",
+            "stages",
+            "deals",
+        ]:
+            raise UnsupportedResourceError(table, "Pipedrive")
+
+        from ingestr.src.pipedrive import pipedrive_source
+
+        return pipedrive_source(
+            pipedrive_api_key=api_key, since_timestamp=start_date
+        ).with_resources(table)
+
+
+class FrankfurterSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Frankfurter takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        from ingestr.src.frankfurter import frankfurter_source
+        from ingestr.src.frankfurter.helpers import validate_currency, validate_dates
+
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+        base_currency = source_params.get("base", [None])[0]
+
+        if not base_currency:
+            base_currency = "USD"
+
+        validate_currency(base_currency)
+
+        if kwargs.get("interval_start"):
+            start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
+        else:
+            start_date = pendulum.yesterday()
+
+        if kwargs.get("interval_end"):
+            end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
+        else:
+            end_date = None
+
+        validate_dates(start_date=start_date, end_date=end_date)
+
+        src = frankfurter_source(
+            start_date=start_date,
+            end_date=end_date,
+            base_currency=base_currency,
+        )
+
+        if table not in src.resources:
+            raise UnsupportedResourceError(table, "Frankfurter")
+
+        return src.with_resources(table)
+
+
+class FreshdeskSource:
+    # freshdesk://domain?api_key=<api_key>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Freshdesk takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        domain = parsed_uri.netloc
+        query = parsed_uri.query
+        params = parse_qs(query)
+
+        if not domain:
+            raise MissingValueError("domain", "Freshdesk")
+
+        if "." in domain:
+            domain = domain.split(".")[0]
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Freshdesk")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+        else:
+            end_date = None
+
+        custom_query: Optional[str] = None
+        if ":" in table:
+            table, custom_query = table.split(":", 1)
+
+        if table not in [
+            "agents",
+            "companies",
+            "contacts",
+            "groups",
+            "roles",
+            "tickets",
+        ]:
+            raise UnsupportedResourceError(table, "Freshdesk")
+
+        if custom_query and table != "tickets":
+            raise ValueError(f"Custom query is not supported for {table}")
+
+        from ingestr.src.freshdesk import freshdesk_source
+
+        return freshdesk_source(
+            api_secret_key=api_key[0],
+            domain=domain,
+            start_date=start_date,
+            end_date=end_date,
+            query=custom_query,
+        ).with_resources(table)
+
+
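Note: a sketch (not part of the diff) of the "table:custom_query" split in FreshdeskSource above, which the code only allows for the tickets table; the query string is hypothetical.

    table = "tickets:updated_since=2024-01-01"  # hypothetical custom query
    custom_query = None
    if ":" in table:
        table, custom_query = table.split(":", 1)
    print(table, custom_query)  # tickets updated_since=2024-01-01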
+class TrustpilotSource:
+    # trustpilot://<business_unit_id>?api_key=<api_key>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Trustpilot takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        business_unit_id = parsed_uri.netloc
+        params = parse_qs(parsed_uri.query)
+
+        if not business_unit_id:
+            raise MissingValueError("business_unit_id", "Trustpilot")
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Trustpilot")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2000-01-01").in_tz("UTC").isoformat()
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC").isoformat()
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC").isoformat()
+
+        if table not in ["reviews"]:
+            raise UnsupportedResourceError(table, "Trustpilot")
+
+        from ingestr.src.trustpilot import trustpilot_source
+
+        return trustpilot_source(
+            business_unit_id=business_unit_id,
+            api_key=api_key[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
+class PhantombusterSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Phantombuster takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        # phantombuster://?api_key=<api_key>
+        # source table = phantom_results:agent_id
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Phantombuster")
+
+        table_fields = table.replace(" ", "").split(":")
+        table_name = table_fields[0]
+
+        agent_id = table_fields[1] if len(table_fields) > 1 else None
+
+        if table_name not in ["completed_phantoms"]:
+            raise UnsupportedResourceError(table_name, "Phantombuster")
+
+        if not agent_id:
+            raise MissingValueError("agent_id", "Phantombuster")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2018-01-01").in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.phantombuster import phantombuster_source
+
+        return phantombuster_source(
+            api_key=api_key[0],
+            agent_id=agent_id,
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table_name)
+
+
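Note: a sketch (not part of the diff) of the table parsing in PhantombusterSource above; the agent id is hypothetical. The code accepts only the completed_phantoms table, with the agent id appended after a colon.

    table = "completed_phantoms:1234567890"  # hypothetical agent id
    table_fields = table.replace(" ", "").split(":")
    table_name = table_fields[0]
    agent_id = table_fields[1] if len(table_fields) > 1 else None
    print(table_name, agent_id)  # completed_phantoms 1234567890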
+class ElasticsearchSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        from ingestr.src.elasticsearch import elasticsearch_source
+
+        incremental = None
+        if kwargs.get("incremental_key"):
+            start_value = kwargs.get("interval_start")
+            end_value = kwargs.get("interval_end")
+
+            incremental = dlt_incremental(
+                kwargs.get("incremental_key", ""),
+                initial_value=start_value,
+                end_value=end_value,
+                range_end="closed",
+                range_start="closed",
+            )
+
+        # elasticsearch://localhost:9200?secure=true&verify_certs=false
+        parsed = urlparse(uri)
+
+        index = table
+        if not index:
+            raise ValueError(
+                "Table name must be provided which is the index name in elasticsearch"
+            )
+
+        query_params = parsed.query
+        params = parse_qs(query_params)
+
+        secure = True
+        if "secure" in params:
+            secure = params["secure"][0].capitalize() == "True"
+
+        verify_certs = True
+        if "verify_certs" in params:
+            verify_certs = params["verify_certs"][0].capitalize() == "True"
+
+        scheme = "https" if secure else "http"
+        netloc = parsed.netloc
+        connection_url = f"{scheme}://{netloc}"
+
+        return elasticsearch_source(
+            connection_url=connection_url,
+            index=index,
+            verify_certs=verify_certs,
+            incremental=incremental,
+        ).with_resources(table)
+
+
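Note: a sketch (not part of the diff) of the connection-URL handling in ElasticsearchSource above, using the hypothetical URI from its inline comment; the table is the index name.

    from urllib.parse import parse_qs, urlparse

    uri = "elasticsearch://localhost:9200?secure=false&verify_certs=false"
    parsed = urlparse(uri)
    params = parse_qs(parsed.query)
    secure = params.get("secure", ["true"])[0].capitalize() == "True"
    print(f"{'https' if secure else 'http'}://{parsed.netloc}")  # http://localhost:9200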
+class AttioSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        query_params = parse_qs(parsed_uri.query)
+        api_key = query_params.get("api_key")
+
+        if api_key is None:
+            raise MissingValueError("api_key", "Attio")
+
+        parts = table.replace(" ", "").split(":")
+        table_name = parts[0]
+        params = parts[1:]
+
+        from ingestr.src.attio import attio_source
+
+        try:
+            return attio_source(api_key=api_key[0], params=params).with_resources(
+                table_name
+            )
+        except ResourcesNotFoundError:
+            raise UnsupportedResourceError(table_name, "Attio")
+
+
+class SmartsheetSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    # smartsheet://?access_token=<access_token>
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError("Incremental loads are not supported for Smartsheet")
+
+        if not table:
+            raise ValueError(
+                "Source table (sheet_id) is required to connect to Smartsheet"
+            )
+
+        source_parts = urlparse(uri)
+        source_fields = parse_qs(source_parts.query)
+        access_token = source_fields.get("access_token")
+
+        if not access_token:
+            raise ValueError(
+                "access_token in the URI is required to connect to Smartsheet"
+            )
+
+        from ingestr.src.smartsheets import smartsheet_source
+
+        return smartsheet_source(
+            access_token=access_token[0],
+            sheet_id=table,  # table is now a single sheet_id
+        )
+
+
+class SolidgateSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Solidgate takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        query_params = parse_qs(parsed_uri.query)
+        public_key = query_params.get("public_key")
+        secret_key = query_params.get("secret_key")
+
+        if public_key is None:
+            raise MissingValueError("public_key", "Solidgate")
+
+        if secret_key is None:
+            raise MissingValueError("secret_key", "Solidgate")
+
+        table_name = table.replace(" ", "")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = pendulum.yesterday().in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.solidgate import solidgate_source
+
+        try:
+            return solidgate_source(
+                public_key=public_key[0],
+                secret_key=secret_key[0],
+                start_date=start_date,
+                end_date=end_date,
+            ).with_resources(table_name)
+        except ResourcesNotFoundError:
+            raise UnsupportedResourceError(table_name, "Solidgate")
+
+
+class SFTPSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        host = parsed_uri.hostname
+        if not host:
+            raise MissingValueError("host", "SFTP URI")
+        port = parsed_uri.port or 22
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        params: Dict[str, Any] = {
+            "host": host,
+            "port": port,
+            "username": username,
+            "password": password,
+            "look_for_keys": False,
+            "allow_agent": False,
+        }
+
+        try:
+            fs = fsspec.filesystem("sftp", **params)
+        except Exception as e:
+            raise ConnectionError(
+                f"Failed to connect or authenticate to sftp server {host}:{port}. Error: {e}"
+            )
+        bucket_url = f"sftp://{host}:{port}"
+
+        if table.startswith("/"):
+            file_glob = table
+        else:
+            file_glob = f"/{table}"
+
+        try:
+            endpoint = blob.parse_endpoint(table)
+        except blob.UnsupportedEndpointError:
+            raise ValueError(
+                "SFTP Source only supports specific formats files: csv, jsonl, parquet"
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to parse endpoint from path: {table}") from e
+
+        from ingestr.src.filesystem import readers
+
+        dlt_source_resource = readers(bucket_url, fs, file_glob)
+        return dlt_source_resource.with_resources(endpoint)
+
+
+class QuickBooksSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    # quickbooks://?company_id=<company_id>&client_id=<client_id>&client_secret=<client_secret>&refresh_token=<refresh>&access_token=<access_token>&environment=<env>&minor_version=<version>
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "QuickBooks takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+
+        params = parse_qs(parsed_uri.query)
+        company_id = params.get("company_id")
+        client_id = params.get("client_id")
+        client_secret = params.get("client_secret")
+        refresh_token = params.get("refresh_token")
+        environment = params.get("environment", ["production"])
+        minor_version = params.get("minor_version", [None])
+
+        if not client_id or not client_id[0].strip():
+            raise MissingValueError("client_id", "QuickBooks")
+
+        if not client_secret or not client_secret[0].strip():
+            raise MissingValueError("client_secret", "QuickBooks")
+
+        if not refresh_token or not refresh_token[0].strip():
+            raise MissingValueError("refresh_token", "QuickBooks")
+
+        if not company_id or not company_id[0].strip():
+            raise MissingValueError("company_id", "QuickBooks")
+
+        if environment[0] not in ["production", "sandbox"]:
+            raise ValueError(
+                "Invalid environment. Must be either 'production' or 'sandbox'."
+            )
+
+        from ingestr.src.quickbooks import quickbooks_source
+
+        table_name = table.replace(" ", "")
+        table_mapping = {
+            "customers": "customer",
+            "invoices": "invoice",
+            "accounts": "account",
+            "vendors": "vendor",
+            "payments": "payment",
+        }
+        if table_name in table_mapping:
+            table_name = table_mapping[table_name]
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2025-01-01").in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        return quickbooks_source(
+            company_id=company_id[0],
+            start_date=start_date,
+            end_date=end_date,
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            refresh_token=refresh_token[0],
+            environment=environment[0],
+            minor_version=minor_version[0],
+            object=table_name,
+        ).with_resources(table_name)
+
+
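Note: a sketch (not part of the diff) of the table-name mapping in QuickBooksSource above; plural table names are translated to the singular object names passed to quickbooks_source.

    table_mapping = {
        "customers": "customer",
        "invoices": "invoice",
        "accounts": "account",
        "vendors": "vendor",
        "payments": "payment",
    }
    table_name = "invoices".replace(" ", "")
    table_name = table_mapping.get(table_name, table_name)
    print(table_name)  # invoice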
+class IsocPulseSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Internet Society Pulse takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        token = params.get("token")
+        if not token or not token[0].strip():
+            raise MissingValueError("token", "Internet Society Pulse")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = pendulum.now().in_tz("UTC").subtract(days=30)
+
+        end_date = kwargs.get("interval_end")
+
+        metric = table
+        opts = []
+        if ":" in metric:
+            metric, *opts = metric.strip().split(":")
+            opts = [opt.strip() for opt in opts]
+
+        from ingestr.src.isoc_pulse import pulse_source
+
+        src = pulse_source(
+            token=token[0],
+            start_date=start_date.strftime("%Y-%m-%d"),
+            end_date=end_date.strftime("%Y-%m-%d") if end_date else None,
+            metric=metric,
+            opts=opts,
+        )
+        return src.with_resources(metric)
+
+
+class PinterestSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Pinterest takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed = urlparse(uri)
+        params = parse_qs(parsed.query)
+        access_token = params.get("access_token")
+
+        if not access_token:
+            raise MissingValueError("access_token", "Pinterest")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.pinterest import pinterest_source
+
+        if table not in {"pins", "boards"}:
+            raise UnsupportedResourceError(table, "Pinterest")
+
+        return pinterest_source(
+            access_token=access_token[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
+class FluxxSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Fluxx takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        # Parse URI: fluxx://instance?client_id=xxx&client_secret=xxx
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+
+        instance = parsed_uri.hostname
+        if not instance:
+            raise ValueError(
+                "Instance is required in the URI (e.g., fluxx://mycompany.preprod)"
+            )
+
+        client_id = source_params.get("client_id")
+        if not client_id:
+            raise ValueError("client_id in the URI is required to connect to Fluxx")
+
+        client_secret = source_params.get("client_secret")
+        if not client_secret:
+            raise ValueError("client_secret in the URI is required to connect to Fluxx")
+
+        # Parse date parameters
+        start_date = kwargs.get("interval_start")
+        if start_date:
+            start_date = ensure_pendulum_datetime(start_date)
+
+        end_date = kwargs.get("interval_end")
+        if end_date:
+            end_date = ensure_pendulum_datetime(end_date)
+
+        # Import Fluxx source
+        from ingestr.src.fluxx import fluxx_source
+
+        # Parse table specification for custom column selection
+        # Format: "resource_name:field1,field2,field3" or "resource_name"
+        resources = None
+        custom_fields = {}
+
+        if table:
+            # Handle single resource with custom fields or multiple resources
+            if ":" in table and table.count(":") == 1:
+                # Single resource with custom fields: "grant_request:id,name,amount"
+                resource_name, field_list = table.split(":", 1)
+                resource_name = resource_name.strip()
+                fields = [f.strip() for f in field_list.split(",")]
+                resources = [resource_name]
+                custom_fields[resource_name] = fields
+            else:
+                # Multiple resources or single resource without custom fields
+                # Support comma-separated list: "grant_request,user"
+                resources = [r.strip() for r in table.split(",")]
+
+        return fluxx_source(
+            instance=instance,
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            start_date=start_date,
+            end_date=end_date,
+            resources=resources,
+            custom_fields=custom_fields,
+        )
+
+
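Note: a sketch (not part of the diff) of the FluxxSource table parsing above, covering the single-resource-with-fields case using the value from its inline comment.

    table = "grant_request:id,name,amount"
    resources = None
    custom_fields = {}
    if ":" in table and table.count(":") == 1:
        resource_name, field_list = table.split(":", 1)
        resources = [resource_name.strip()]
        custom_fields[resources[0]] = [f.strip() for f in field_list.split(",")]
    else:
        resources = [r.strip() for r in table.split(",")]
    print(resources, custom_fields)
    # ['grant_request'] {'grant_request': ['id', 'name', 'amount']}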
class LinearSource:
|
|
3460
|
+
def handles_incrementality(self) -> bool:
|
|
3461
|
+
return True
|
|
3462
|
+
|
|
3463
|
+
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
3464
|
+
if kwargs.get("incremental_key"):
|
|
3465
|
+
raise ValueError(
|
|
3466
|
+
"Linear takes care of incrementality on its own, you should not provide incremental_key"
|
|
3467
|
+
)
|
|
3468
|
+
|
|
3469
|
+
parsed_uri = urlparse(uri)
|
|
3470
|
+
params = parse_qs(parsed_uri.query)
|
|
3471
|
+
api_key = params.get("api_key")
|
|
3472
|
+
if api_key is None:
|
|
3473
|
+
raise MissingValueError("api_key", "Linear")
|
|
3474
|
+
|
|
3475
|
+
if table not in [
|
|
3476
|
+
"issues",
|
|
3477
|
+
"projects",
|
|
3478
|
+
"teams",
|
|
3479
|
+
"users",
|
|
3480
|
+
"workflow_states",
|
|
3481
|
+
"cycles",
|
|
3482
|
+
"attachments",
|
|
3483
|
+
"comments",
|
|
3484
|
+
"documents",
|
|
3485
|
+
"external_users",
|
|
3486
|
+
"initiative",
|
|
3487
|
+
"integrations",
|
|
3488
|
+
"labels",
|
|
3489
|
+
"organization",
|
|
3490
|
+
"project_updates",
|
|
3491
|
+
"team_memberships",
|
|
3492
|
+
"initiative_to_project",
|
|
3493
|
+
"project_milestone",
|
|
3494
|
+
"project_status",
|
|
3495
|
+
]:
|
|
3496
|
+
raise UnsupportedResourceError(table, "Linear")
|
|
3497
|
+
|
|
3498
|
+
start_date = kwargs.get("interval_start")
|
|
3499
|
+
if start_date is not None:
|
|
3500
|
+
start_date = ensure_pendulum_datetime(start_date)
|
|
3501
|
+
else:
|
|
3502
|
+
start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
|
|
3503
|
+
|
|
3504
|
+
end_date = kwargs.get("interval_end")
|
|
3505
|
+
if end_date is not None:
|
|
3506
|
+
end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
|
|
3507
|
+
|
|
3508
|
+
from ingestr.src.linear import linear_source
|
|
3509
|
+
|
|
3510
|
+
return linear_source(
|
|
3511
|
+
api_key=api_key[0],
|
|
3512
|
+
start_date=start_date,
|
|
3513
|
+
end_date=end_date,
|
|
3514
|
+
).with_resources(table)
|
|
3515
|
+
|
|
3516
|
+
class RevenueCatSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        if kwargs.get("incremental_key"):
            raise ValueError(
                "RevenueCat takes care of incrementality on its own, you should not provide incremental_key"
            )

        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "RevenueCat")

        project_id = params.get("project_id")
        if project_id is None and table != "projects":
            raise MissingValueError("project_id", "RevenueCat")

        if table not in [
            "customers",
            "products",
            "entitlements",
            "offerings",
            "subscriptions",
            "purchases",
            "projects",
        ]:
            raise UnsupportedResourceError(table, "RevenueCat")

        start_date = kwargs.get("interval_start")
        if start_date is not None:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")

        end_date = kwargs.get("interval_end")
        if end_date is not None:
            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")

        from ingestr.src.revenuecat import revenuecat_source

        return revenuecat_source(
            api_key=api_key[0],
            project_id=project_id[0] if project_id is not None else None,
        ).with_resources(table)

class ZoomSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        if kwargs.get("incremental_key"):
            raise ValueError(
                "Zoom takes care of incrementality on its own, you should not provide incremental_key"
            )

        parsed = urlparse(uri)
        params = parse_qs(parsed.query)
        client_id = params.get("client_id")
        client_secret = params.get("client_secret")
        account_id = params.get("account_id")

        if not (client_id and client_secret and account_id):
            raise MissingValueError(
                "client_id/client_secret/account_id",
                "Zoom",
            )

        start_date = kwargs.get("interval_start")
        if start_date is not None:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            start_date = pendulum.datetime(2020, 1, 26).in_tz("UTC")

        end_date = kwargs.get("interval_end")
        if end_date is not None:
            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")

        from ingestr.src.zoom import zoom_source

        if table not in {"meetings", "users", "participants"}:
            raise UnsupportedResourceError(table, "Zoom")

        return zoom_source(
            client_id=client_id[0],
            client_secret=client_secret[0],
            account_id=account_id[0],
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

class InfluxDBSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        if kwargs.get("incremental_key"):
            raise ValueError(
                "InfluxDB takes care of incrementality on its own, you should not provide incremental_key"
            )

        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)
        host = parsed_uri.hostname
        port = parsed_uri.port

        secure = params.get("secure", ["true"])[0].lower() != "false"
        scheme = "https" if secure else "http"

        if port:
            host_url = f"{scheme}://{host}:{port}"
        else:
            host_url = f"{scheme}://{host}"

        token = params.get("token")
        org = params.get("org")
        bucket = params.get("bucket")

        if not host:
            raise MissingValueError("host", "InfluxDB")
        if not token:
            raise MissingValueError("token", "InfluxDB")
        if not org:
            raise MissingValueError("org", "InfluxDB")
        if not bucket:
            raise MissingValueError("bucket", "InfluxDB")

        start_date = kwargs.get("interval_start")
        if start_date is not None:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            start_date = pendulum.datetime(2024, 1, 1).in_tz("UTC")

        end_date = kwargs.get("interval_end")
        if end_date is not None:
            end_date = ensure_pendulum_datetime(end_date)

        from ingestr.src.influxdb import influxdb_source

        return influxdb_source(
            measurement=table,
            host=host_url,
            org=org[0],
            bucket=bucket[0],
            token=token[0],
            secure=secure,
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

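# Usage sketch (illustrative host, token, org and bucket): how the secure flag and
# optional port above turn into the host URL passed to influxdb_source.
def _example_influxdb_host_url() -> str:
    from urllib.parse import parse_qs, urlparse

    uri = "influxdb://localhost:8086?token=t&org=my-org&bucket=my-bucket&secure=false"
    parsed = urlparse(uri)
    params = parse_qs(parsed.query)
    secure = params.get("secure", ["true"])[0].lower() != "false"
    scheme = "https" if secure else "http"
    port = f":{parsed.port}" if parsed.port else ""
    return f"{scheme}://{parsed.hostname}{port}"  # -> "http://localhost:8086"
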
class WiseSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed = urlparse(uri)
        params = parse_qs(parsed.query)
        api_key = params.get("api_key")

        if not api_key:
            raise MissingValueError("api_key", "Wise")

        if table not in ["profiles", "transfers", "balances"]:
            raise ValueError(
                f"Resource '{table}' is not supported for Wise source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
            )

        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
        else:
            start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
        else:
            end_date = None

        from ingestr.src.wise import wise_source

        return wise_source(
            api_key=api_key[0],
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

class FundraiseupSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Fundraiseup")

        from ingestr.src.fundraiseup import fundraiseup_source

        src = fundraiseup_source(api_key=api_key[0])
        if table not in src.resources:
            raise UnsupportedResourceError(table, "Fundraiseup")
        return src.with_resources(table)

class AnthropicSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # anthropic://?api_key=<admin_api_key>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Anthropic")

        if table not in [
            "claude_code_usage",
            "usage_report",
            "cost_report",
            "organization",
            "workspaces",
            "api_keys",
            "invites",
            "users",
            "workspace_members",
        ]:
            raise UnsupportedResourceError(table, "Anthropic")

        # Get start and end dates from kwargs
        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            # Default to 2023-01-01
            start_date = pendulum.datetime(2023, 1, 1)

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date)
        else:
            end_date = None

        from ingestr.src.anthropic import anthropic_source

        return anthropic_source(
            api_key=api_key[0],
            initial_start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

class PlusVibeAISource:
    resources = [
        "campaigns",
        "leads",
        "email_accounts",
        "emails",
        "blocklist",
        "webhooks",
        "tags",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # plusvibeai://?api_key=<key>&workspace_id=<id>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        workspace_id = params.get("workspace_id")

        if not api_key:
            raise MissingValueError("api_key", "PlusVibeAI")

        if not workspace_id:
            raise MissingValueError("workspace_id", "PlusVibeAI")

        if table not in self.resources:
            raise UnsupportedResourceError(table, "PlusVibeAI")

        import dlt

        from ingestr.src.plusvibeai import plusvibeai_source

        dlt.secrets["sources.plusvibeai.api_key"] = api_key[0]
        dlt.secrets["sources.plusvibeai.workspace_id"] = workspace_id[0]

        # Handle custom base URL if provided
        base_url = params.get("base_url", ["https://api.plusvibe.ai"])[0]
        dlt.secrets["sources.plusvibeai.base_url"] = base_url

        src = plusvibeai_source()
        return src.with_resources(table)

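# Usage sketch (illustrative params dict): credentials are wired to the dlt source via
# dlt.secrets rather than function arguments, with base_url falling back to the hosted
# API when the URI does not override it; the default below mirrors the code above.
def _example_plusvibeai_base_url(params: dict) -> str:
    # params is the parse_qs() result from the source URI.
    return params.get("base_url", ["https://api.plusvibe.ai"])[0]
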
class IntercomSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # intercom://?access_token=<token>&region=<us|eu|au>
        # OR intercom://?oauth_token=<token>&region=<us|eu|au>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        # Check for authentication
        access_token = params.get("access_token")
        oauth_token = params.get("oauth_token")
        region = params.get("region", ["us"])[0]

        if not access_token and not oauth_token:
            raise MissingValueError("access_token or oauth_token", "Intercom")

        # Validate table/resource
        supported_tables = [
            "contacts",
            "companies",
            "conversations",
            "tickets",
            "tags",
            "segments",
            "teams",
            "admins",
            "articles",
            "data_attributes",
        ]

        if table not in supported_tables:
            raise UnsupportedResourceError(table, "Intercom")

        # Get date parameters
        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            start_date = pendulum.datetime(2020, 1, 1)

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date)

        # Import and initialize the source
        from ingestr.src.intercom import (
            IntercomCredentialsAccessToken,
            IntercomCredentialsOAuth,
            TIntercomCredentials,
            intercom_source,
        )

        credentials: TIntercomCredentials
        if access_token:
            credentials = IntercomCredentialsAccessToken(
                access_token=access_token[0], region=region
            )
        else:
            if not oauth_token:
                raise MissingValueError("oauth_token", "Intercom")
            credentials = IntercomCredentialsOAuth(
                oauth_token=oauth_token[0], region=region
            )

        return intercom_source(
            credentials=credentials,
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

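# Usage sketch (illustrative tokens): either access_token or oauth_token selects the
# credential type above; region defaults to "us" when the URI omits it.
def _example_intercom_credential_choice(params: dict) -> str:
    access_token = params.get("access_token")
    oauth_token = params.get("oauth_token")
    region = params.get("region", ["us"])[0]
    if access_token:
        return f"access token auth in region {region}"
    if oauth_token:
        return f"oauth auth in region {region}"
    raise ValueError("access_token or oauth_token is required")
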
class HttpSource:
    """Source for reading CSV, JSON, and Parquet files from HTTP URLs"""

    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Create a dlt source for reading files from HTTP URLs.

        URI format: http://example.com/file.csv or https://example.com/file.json

        Args:
            uri: HTTP(S) URL to the file
            table: Not used for HTTP source (files are read directly)
            **kwargs: Additional arguments:
                - file_format: Optional file format override ('csv', 'json', 'parquet')
                - chunksize: Number of records to process at once (default varies by format)
                - merge_key: Merge key for the resource

        Returns:
            DltResource for the HTTP file
        """
        from ingestr.src.http import http_source

        # Extract the actual URL (remove the http:// or https:// scheme if duplicated)
        url = uri
        if uri.startswith("http://http://") or uri.startswith("https://https://"):
            url = uri.split("://", 1)[1]

        file_format = kwargs.get("file_format")
        chunksize = kwargs.get("chunksize")
        merge_key = kwargs.get("merge_key")

        reader_kwargs = {}
        if chunksize is not None:
            reader_kwargs["chunksize"] = chunksize

        source = http_source(url=url, file_format=file_format, **reader_kwargs)

        if merge_key:
            source.apply_hints(merge_key=merge_key)

        return source

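# Usage sketch (illustrative URLs): the duplicated-scheme guard above only strips the
# outer prefix when the URI literally repeats it, leaving normal URLs untouched.
def _example_http_url_normalization() -> tuple:
    def normalize(uri: str) -> str:
        if uri.startswith("http://http://") or uri.startswith("https://https://"):
            return uri.split("://", 1)[1]
        return uri

    return (
        normalize("https://https://example.com/data.csv"),  # -> "https://example.com/data.csv"
        normalize("https://example.com/data.csv"),  # unchanged
    )
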
class MondaySource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_token = query_params.get("api_token")

        if api_token is None:
            raise MissingValueError("api_token", "Monday")

        parts = table.replace(" ", "").split(":")
        table_name = parts[0]
        params = parts[1:]

        # Get interval_start and interval_end from kwargs (command line args)
        interval_start = kwargs.get("interval_start")
        interval_end = kwargs.get("interval_end")

        # Convert datetime to string format YYYY-MM-DD
        start_date = interval_start.strftime("%Y-%m-%d") if interval_start else None
        end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None

        from ingestr.src.monday import monday_source

        try:
            return monday_source(
                api_token=api_token[0],
                params=params,
                start_date=start_date,
                end_date=end_date,
            ).with_resources(table_name)
        except ResourcesNotFoundError:
            raise UnsupportedResourceError(table_name, "Monday")

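# Usage sketch (illustrative table string): a --source-table value with trailing
# ":key=value" segments splits into the resource name plus extra params above.
def _example_monday_table_split() -> tuple:
    table = "boards: board_kind=public"
    parts = table.replace(" ", "").split(":")
    return parts[0], parts[1:]  # -> ("boards", ["board_kind=public"])
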
class MailchimpSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_key = query_params.get("api_key")
        server = query_params.get("server")

        if api_key is None:
            raise MissingValueError("api_key", "Mailchimp")
        if server is None:
            raise MissingValueError("server", "Mailchimp")

        from ingestr.src.mailchimp import mailchimp_source

        try:
            return mailchimp_source(
                api_key=api_key[0],
                server=server[0],
            ).with_resources(table)
        except ResourcesNotFoundError:
            raise UnsupportedResourceError(table, "Mailchimp")

class AlliumSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_key = query_params.get("api_key")

        if api_key is None:
            raise MissingValueError("api_key", "Allium")

        # Extract query_id and custom parameters from table parameter
        # Format: query_id or query:query_id or query:query_id:param1=value1&param2=value2
        query_id = table
        custom_params = {}
        limit = None
        compute_profile = None

        if ":" in table:
            parts = table.split(":", 2)  # Split into max 3 parts
            if len(parts) >= 2:
                query_id = parts[1]
            if len(parts) == 3:
                # Parse custom parameters from query string format
                param_string = parts[2]
                for param in param_string.split("&"):
                    if "=" in param:
                        key, value = param.split("=", 1)
                        # Extract run_config parameters
                        if key == "limit":
                            limit = int(value)
                        elif key == "compute_profile":
                            compute_profile = value
                        else:
                            custom_params[key] = value

        # Extract parameters from interval_start and interval_end
        # Default: 2 days ago 00:00 to yesterday 00:00
        now = pendulum.now()
        default_start = now.subtract(days=2).start_of("day")
        default_end = now.subtract(days=1).start_of("day")

        parameters = {}
        interval_start = kwargs.get("interval_start")
        interval_end = kwargs.get("interval_end")

        start_date = interval_start if interval_start is not None else default_start
        end_date = interval_end if interval_end is not None else default_end

        parameters["start_date"] = start_date.strftime("%Y-%m-%d")
        parameters["end_date"] = end_date.strftime("%Y-%m-%d")
        parameters["start_timestamp"] = str(int(start_date.timestamp()))
        parameters["end_timestamp"] = str(int(end_date.timestamp()))

        # Merge custom parameters (they override default parameters)
        parameters.update(custom_params)

        from ingestr.src.allium import allium_source

        return allium_source(
            api_key=api_key[0],
            query_id=query_id,
            parameters=parameters if parameters else None,
            limit=limit,
            compute_profile=compute_profile,
        )

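# Usage sketch (illustrative query id and params): how the "query:<id>:<params>" table
# format is taken apart above, with limit pulled out of the trailing parameter string
# and the rest forwarded as custom query parameters.
def _example_allium_table_parse() -> tuple:
    table = "query:abc123:limit=500&chain=ethereum"
    parts = table.split(":", 2)
    query_id = parts[1]
    limit, custom = None, {}
    for param in parts[2].split("&"):
        key, value = param.split("=", 1)
        if key == "limit":
            limit = int(value)
        else:
            custom[key] = value
    return query_id, limit, custom  # -> ("abc123", 500, {"chain": "ethereum"})
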
class CouchbaseSource:
    table_builder: Callable

    def __init__(self, table_builder=None) -> None:
        if table_builder is None:
            from ingestr.src.couchbase_source import couchbase_collection

            table_builder = couchbase_collection

        self.table_builder = table_builder

    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Create a dlt source for reading data from Couchbase.

        URI formats:
        - couchbase://username:password@host
        - couchbase://username:password@host/bucket
        - couchbase://username:password@host?ssl=true
        - couchbases://username:password@host (SSL enabled)

        Table formats:
        - bucket.scope.collection (when bucket not in URI)
        - scope.collection (when bucket specified in URI path)

        Note: If password contains special characters (@, :, /, etc.), they must be URL-encoded.

        Examples:
            Local/Self-hosted:
            - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
            - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
            - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"

            Capella (Cloud):
            - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
            - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"

            To encode password in Python:
                from urllib.parse import quote
                encoded_pwd = quote("MyPass@123!", safe='')
                uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"

        Args:
            uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
            table: Format depends on URI:
                - bucket.scope.collection (if bucket not in URI)
                - scope.collection (if bucket in URI path)
            **kwargs: Additional arguments:
                - limit: Maximum number of documents to fetch
                - incremental_key: Field to use for incremental loading
                - interval_start: Start value for incremental loading
                - interval_end: End value for incremental loading

        Returns:
            DltResource for the Couchbase collection
        """
        # Parse the URI to extract connection details
        # urlparse automatically decodes URL-encoded credentials

        parsed = urlparse(uri)

        # Extract username and password from URI
        # Note: urlparse automatically decodes URL-encoded characters in username/password
        from urllib.parse import unquote

        username = parsed.username
        password = unquote(parsed.password) if parsed.password else None

        if not username or not password:
            raise ValueError(
                "Username and password must be provided in the URI.\n"
                "Format: couchbase://username:password@host\n"
                "If password has special characters (@, :, /), URL-encode them.\n"
                "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
            )

        # Reconstruct connection string without credentials
        scheme = parsed.scheme
        netloc = parsed.netloc

        # Remove username:password@ from netloc if present
        if "@" in netloc:
            netloc = netloc.split("@", 1)[1]

        # Parse query parameters from URI
        from urllib.parse import parse_qs

        query_params = parse_qs(parsed.query)

        # Check if SSL is requested via URI query parameter (?ssl=true)
        if "ssl" in query_params:
            ssl_value = query_params["ssl"][0].lower()
            use_ssl = ssl_value in ("true", "1", "yes")

            # Apply SSL scheme based on parameter
            if use_ssl and scheme == "couchbase":
                scheme = "couchbases"

        connection_string = f"{scheme}://{netloc}"

        # Extract bucket from URI path if present (e.g., couchbase://host/bucket)
        bucket_from_uri = None
        if parsed.path and parsed.path.strip("/"):
            bucket_from_uri = parsed.path.strip("/").split("/")[0]

        # Parse table format: can be "scope.collection" or "bucket.scope.collection"
        table_parts = table.split(".")

        if len(table_parts) == 3:
            # Format: bucket.scope.collection
            bucket, scope, collection = table_parts
        elif len(table_parts) == 2:
            # Format: scope.collection (bucket from URI)
            if bucket_from_uri:
                bucket = bucket_from_uri
                scope, collection = table_parts
            else:
                raise ValueError(
                    "Table format is 'scope.collection' but no bucket specified in URI.\n"
                    f"Either use URI format: couchbase://user:pass@host/bucket\n"
                    f"Or use table format: bucket.scope.collection\n"
                    f"Got table: {table}"
                )
        else:
            raise ValueError(
                "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
                f"Got: {table}\n"
                "Examples:\n"
                "  - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
                "  - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
            )

        # Handle incremental loading
        incremental = None
        if kwargs.get("incremental_key"):
            start_value = kwargs.get("interval_start")
            end_value = kwargs.get("interval_end")

            incremental = dlt_incremental(
                kwargs.get("incremental_key", ""),
                initial_value=start_value,
                end_value=end_value,
                range_end="closed",
                range_start="closed",
            )

        # Get optional parameters
        limit = kwargs.get("limit")

        table_instance = self.table_builder(
            connection_string=connection_string,
            username=username,
            password=password,
            bucket=bucket,
            scope=scope,
            collection=collection,
            incremental=incremental,
            limit=limit,
        )
        table_instance.max_table_nesting = 1

        return table_instance

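# Usage sketch (illustrative credentials and bucket): an URL-encoded password is decoded
# with unquote, and the bucket can come from either the URI path or the table name, as
# in the parsing above.
def _example_couchbase_uri_parts() -> tuple:
    from urllib.parse import unquote, urlparse

    uri = "couchbase://admin:MyPass%40123@localhost/travel-sample?ssl=true"
    parsed = urlparse(uri)
    password = unquote(parsed.password)  # -> "MyPass@123"
    bucket = parsed.path.strip("/")  # -> "travel-sample"
    scope, collection = "inventory.airport".split(".")
    return password, bucket, scope, collection
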
class CursorSource:
    resources = [
        "team_members",
        "daily_usage_data",
        "team_spend",
        "filtered_usage_events",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # cursor://?api_key=<api_key>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")

        if not api_key:
            raise MissingValueError("api_key", "Cursor")

        if table not in self.resources:
            raise UnsupportedResourceError(table, "Cursor")

        import dlt

        from ingestr.src.cursor import cursor_source

        dlt.secrets["sources.cursor.api_key"] = api_key[0]

        # Handle interval_start and interval_end for daily_usage_data and filtered_usage_events (optional)
        if table in ["daily_usage_data", "filtered_usage_events"]:
            interval_start = kwargs.get("interval_start")
            interval_end = kwargs.get("interval_end")

            # Both are optional, but if one is provided, both should be provided
            if interval_start is not None and interval_end is not None:
                # Convert datetime to epoch milliseconds
                start_ms = int(interval_start.timestamp() * 1000)
                end_ms = int(interval_end.timestamp() * 1000)

                dlt.config["sources.cursor.start_date"] = start_ms
                dlt.config["sources.cursor.end_date"] = end_ms

        src = cursor_source()
        return src.with_resources(table)

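# Usage sketch (illustrative window): interval_start/interval_end are converted to epoch
# milliseconds before being placed into dlt.config for the Cursor usage resources above.
def _example_cursor_epoch_ms() -> tuple:
    from datetime import datetime, timezone

    start = datetime(2024, 1, 1, tzinfo=timezone.utc)
    end = datetime(2024, 1, 31, tzinfo=timezone.utc)
    return int(start.timestamp() * 1000), int(end.timestamp() * 1000)
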
class SocrataSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Creates a DLT source for Socrata open data platform.

        URI format: socrata://domain?app_token=TOKEN
        Table: dataset_id (e.g., "6udu-fhnu")

        Args:
            uri: Socrata connection URI with domain and optional auth params
            table: Dataset ID (e.g., "6udu-fhnu")
            **kwargs: Additional arguments:
                - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
                - interval_start: Start date for initial load
                - interval_end: End date for load
                - primary_key: Primary key field for merge operations

        Returns:
            DltResource for the Socrata dataset
        """
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(uri)

        domain = parsed.netloc
        if not domain:
            raise ValueError(
                "Domain must be provided in the URI.\n"
                "Format: socrata://domain?app_token=TOKEN\n"
                "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
            )

        query_params = parse_qs(parsed.query)

        dataset_id = table
        if not dataset_id:
            raise ValueError(
                "Dataset ID must be provided as the table parameter.\n"
                "Example: --source-table 6udu-fhnu"
            )

        app_token = query_params.get("app_token", [None])[0]
        username = query_params.get("username", [None])[0]
        password = query_params.get("password", [None])[0]

        incremental = None
        if kwargs.get("incremental_key"):
            start_value = kwargs.get("interval_start")
            end_value = kwargs.get("interval_end")

            if start_value:
                start_value = (
                    start_value.isoformat()
                    if hasattr(start_value, "isoformat")
                    else str(start_value)
                )

            if end_value:
                end_value = (
                    end_value.isoformat()
                    if hasattr(end_value, "isoformat")
                    else str(end_value)
                )

            incremental = dlt_incremental(
                kwargs.get("incremental_key", ""),
                initial_value=start_value,
                end_value=end_value,
                range_end="open",
                range_start="closed",
            )

        primary_key = kwargs.get("primary_key")

        from ingestr.src.socrata_source import source

        return source(
            domain=domain,
            dataset_id=dataset_id,
            app_token=app_token,
            username=username,
            password=password,
            incremental=incremental,
            primary_key=primary_key,
        ).with_resources("dataset")

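# Usage sketch (illustrative bounds): datetime interval bounds are passed to the Socrata
# incremental as ISO-8601 strings, while plain string bounds are forwarded unchanged.
def _example_socrata_incremental_bounds() -> tuple:
    from datetime import datetime, timezone

    def to_str(value):
        return value.isoformat() if hasattr(value, "isoformat") else str(value)

    start = datetime(2024, 1, 1, tzinfo=timezone.utc)
    end = "2024-02-01T00:00:00"
    return to_str(start), to_str(end)
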
class HostawaySource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        if kwargs.get("incremental_key"):
            raise ValueError(
                "Hostaway takes care of incrementality on its own, you should not provide incremental_key"
            )

        source_parts = urlparse(uri)
        source_params = parse_qs(source_parts.query)
        api_key = source_params.get("api_key")

        if not api_key:
            raise ValueError("api_key in the URI is required to connect to Hostaway")

        match table:
            case "listings":
                resource_name = "listings"
            case "listing_fee_settings":
                resource_name = "listing_fee_settings"
            case "listing_agreements":
                resource_name = "listing_agreements"
            case "listing_pricing_settings":
                resource_name = "listing_pricing_settings"
            case "cancellation_policies":
                resource_name = "cancellation_policies"
            case "cancellation_policies_airbnb":
                resource_name = "cancellation_policies_airbnb"
            case "cancellation_policies_marriott":
                resource_name = "cancellation_policies_marriott"
            case "cancellation_policies_vrbo":
                resource_name = "cancellation_policies_vrbo"
            case "reservations":
                resource_name = "reservations"
            case "finance_fields":
                resource_name = "finance_fields"
            case "reservation_payment_methods":
                resource_name = "reservation_payment_methods"
            case "reservation_rental_agreements":
                resource_name = "reservation_rental_agreements"
            case "listing_calendars":
                resource_name = "listing_calendars"
            case "conversations":
                resource_name = "conversations"
            case "message_templates":
                resource_name = "message_templates"
            case "bed_types":
                resource_name = "bed_types"
            case "property_types":
                resource_name = "property_types"
            case "countries":
                resource_name = "countries"
            case "account_tax_settings":
                resource_name = "account_tax_settings"
            case "user_groups":
                resource_name = "user_groups"
            case "guest_payment_charges":
                resource_name = "guest_payment_charges"
            case "coupons":
                resource_name = "coupons"
            case "webhook_reservations":
                resource_name = "webhook_reservations"
            case "tasks":
                resource_name = "tasks"
            case _:
                raise ValueError(
                    f"Resource '{table}' is not supported for Hostaway source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
                )

        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
        else:
            start_date = pendulum.datetime(1970, 1, 1).in_timezone("UTC")

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")

        from ingestr.src.hostaway import hostaway_source

        return hostaway_source(
            api_key=api_key[0],
            start_date=start_date,
            end_date=end_date,
        ).with_resources(resource_name)

class SnapchatAdsSource:
    resources = [
        "organizations",
        "fundingsources",
        "billingcenters",
        "adaccounts",
        "invoices",
        "transactions",
        "members",
        "roles",
        "campaigns",
        "adsquads",
        "ads",
        "event_details",
        "creatives",
        "segments",
        "campaigns_stats",
        "ad_accounts_stats",
        "ads_stats",
        "ad_squads_stats",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        source_fields = parse_qs(parsed_uri.query)

        refresh_token = source_fields.get("refresh_token")
        if not refresh_token:
            raise ValueError("refresh_token is required to connect to Snapchat Ads")

        client_id = source_fields.get("client_id")
        if not client_id:
            raise ValueError("client_id is required to connect to Snapchat Ads")

        client_secret = source_fields.get("client_secret")
        if not client_secret:
            raise ValueError("client_secret is required to connect to Snapchat Ads")

        organization_id = source_fields.get("organization_id")

        # Resources that support ad_account_id filtering
        ad_account_resources = [
            "invoices",
            "campaigns",
            "adsquads",
            "ads",
            "event_details",
            "creatives",
            "segments",
        ]

        # Stats resources
        stats_resources = [
            "campaigns_stats",
            "ad_accounts_stats",
            "ads_stats",
            "ad_squads_stats",
        ]

        # Parse table name
        stats_config = None
        ad_account_id = None

        if ":" in table:
            parts = table.split(":")
            resource_name = parts[0]

            if resource_name in stats_resources:
                # Stats table format:
                # resource_name:granularity:fields:options (all accounts)
                # resource_name:ad_account_id:granularity:fields:options (specific account)

                def parse_options(options_str: str) -> dict:
                    """Parse key=value,key=value options string."""
                    result = {}
                    for option in options_str.split(","):
                        if "=" in option:
                            key, value = option.split("=", 1)
                            result[key] = value
                    return result

                if len(parts) >= 2:
                    valid_granularities = ["TOTAL", "DAY", "HOUR", "LIFETIME"]

                    if parts[1].upper() in valid_granularities:
                        # Format: resource_name:granularity:fields:options
                        stats_config = {
                            "granularity": parts[1].upper(),
                            "fields": parts[2]
                            if len(parts) > 2
                            else "impressions,spend",
                        }
                        if len(parts) > 3:
                            stats_config.update(parse_options(parts[3]))
                    else:
                        # Format: resource_name:ad_account_id:granularity:fields:options
                        ad_account_id = parts[1]
                        stats_config = {
                            "granularity": parts[2].upper()
                            if len(parts) > 2
                            else "DAY",
                            "fields": parts[3]
                            if len(parts) > 3
                            else "impressions,spend",
                        }
                        if len(parts) > 4:
                            stats_config.update(parse_options(parts[4]))
                else:
                    # Just resource_name, use defaults
                    stats_config = {
                        "granularity": "DAY",
                        "fields": "impressions,spend",
                    }
            else:
                # Non-stats table with ad_account_id: resource_name:ad_account_id
                ad_account_id = parts[1] if len(parts) > 1 else None
                if not ad_account_id:
                    raise ValueError(
                        f"ad_account_id must be provided in format '{resource_name}:ad_account_id'"
                    )
        else:
            resource_name = table
            if resource_name in stats_resources:
                # Stats resource with default config
                stats_config = {
                    "granularity": "DAY",
                    "fields": "impressions,spend",
                }

        # Validation for non-stats resources
        if resource_name not in stats_resources:
            account_id_required = (
                resource_name in ad_account_resources
                and ad_account_id is None
                and not organization_id
            )
            if account_id_required:
                raise ValueError(
                    f"organization_id is required for '{resource_name}' table when no specific ad_account_id is provided"
                )

            if not organization_id and table != "organizations":
                raise ValueError(
                    f"organization_id is required for table '{table}'. Only 'organizations' table does not require organization_id."
                )
        else:
            # Stats resources need either ad_account_id or organization_id
            if not ad_account_id and not organization_id:
                raise ValueError(
                    f"organization_id is required for '{resource_name}' when ad_account_id is not provided"
                )

        if resource_name not in self.resources:
            raise UnsupportedResourceError(table, "Snapchat Ads")

        from ingestr.src.snapchat_ads import snapchat_ads_source

        source_kwargs: dict[str, Any] = {
            "refresh_token": refresh_token[0],
            "client_id": client_id[0],
            "client_secret": client_secret[0],
        }

        if organization_id:
            source_kwargs["organization_id"] = organization_id[0]

        if ad_account_id:
            source_kwargs["ad_account_id"] = ad_account_id

        # Add interval_start and interval_end for client-side filtering
        interval_start = kwargs.get("interval_start")
        if interval_start:
            source_kwargs["start_date"] = interval_start

        interval_end = kwargs.get("interval_end")
        if interval_end:
            source_kwargs["end_date"] = interval_end

        # Add stats_config for stats resource
        if stats_config:
            source_kwargs["stats_config"] = stats_config

        source = snapchat_ads_source(**source_kwargs)

        return source.with_resources(resource_name)