ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin_max/__init__.py +6 -4
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +37 -10
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +508 -27
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +107 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +2933 -245
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.13.dist-info/RECORD +0 -115
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sources.py
CHANGED
@@ -3,6 +3,7 @@ import csv
 import json
 import os
 import re
+import sys
 import tempfile
 from datetime import date, datetime, timedelta, timezone
 from typing import (
@@ -13,104 +14,39 @@ from typing import (
|
|
|
13
14
|
List,
|
|
14
15
|
Literal,
|
|
15
16
|
Optional,
|
|
17
|
+
TypeAlias,
|
|
16
18
|
Union,
|
|
17
19
|
)
|
|
18
|
-
from urllib.parse import ParseResult, parse_qs,
|
|
20
|
+
from urllib.parse import ParseResult, parse_qs, urlencode, urlparse
|
|
19
21
|
|
|
20
|
-
import
|
|
21
|
-
import gcsfs # type: ignore
|
|
22
|
+
import fsspec # type: ignore
|
|
22
23
|
import pendulum
|
|
23
|
-
import s3fs # type: ignore
|
|
24
|
-
from dlt.common.configuration.specs import (
|
|
25
|
-
AwsCredentials,
|
|
26
|
-
)
|
|
27
|
-
from dlt.common.libs.sql_alchemy import (
|
|
28
|
-
Engine,
|
|
29
|
-
MetaData,
|
|
30
|
-
)
|
|
31
24
|
from dlt.common.time import ensure_pendulum_datetime
|
|
32
|
-
from dlt.common.typing import TDataItem, TSecretStrValue
|
|
33
25
|
from dlt.extract import Incremental
|
|
26
|
+
from dlt.extract.exceptions import ResourcesNotFoundError
|
|
27
|
+
from dlt.sources import incremental as dlt_incremental
|
|
34
28
|
from dlt.sources.credentials import (
|
|
35
29
|
ConnectionStringCredentials,
|
|
36
30
|
)
|
|
37
|
-
from dlt.sources.sql_database import sql_table
|
|
38
|
-
from dlt.sources.sql_database.helpers import TableLoader
|
|
39
|
-
from dlt.sources.sql_database.schema_types import (
|
|
40
|
-
ReflectionLevel,
|
|
41
|
-
SelectAny,
|
|
42
|
-
Table,
|
|
43
|
-
TTypeAdapter,
|
|
44
|
-
)
|
|
45
|
-
from google.ads.googleads.client import GoogleAdsClient # type: ignore
|
|
46
|
-
from sqlalchemy import Column
|
|
47
|
-
from sqlalchemy import types as sa
|
|
48
31
|
|
|
49
32
|
from ingestr.src import blob
|
|
50
|
-
from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
|
|
51
|
-
from ingestr.src.adjust.adjust_helpers import parse_filters
|
|
52
|
-
from ingestr.src.airtable import airtable_source
|
|
53
|
-
from ingestr.src.applovin import applovin_source
|
|
54
|
-
from ingestr.src.applovin_max import applovin_max_source
|
|
55
|
-
from ingestr.src.appsflyer._init_ import appsflyer_source
|
|
56
|
-
from ingestr.src.appstore import app_store
|
|
57
|
-
from ingestr.src.appstore.client import AppStoreConnectClient
|
|
58
|
-
from ingestr.src.arrow import memory_mapped_arrow
|
|
59
|
-
from ingestr.src.asana_source import asana_source
|
|
60
|
-
from ingestr.src.chess import source
|
|
61
|
-
from ingestr.src.dynamodb import dynamodb
|
|
62
33
|
from ingestr.src.errors import (
|
|
63
34
|
InvalidBlobTableError,
|
|
64
35
|
MissingValueError,
|
|
65
36
|
UnsupportedResourceError,
|
|
66
37
|
)
|
|
67
|
-
from ingestr.src.facebook_ads import facebook_ads_source, facebook_insights_source
|
|
68
|
-
from ingestr.src.filesystem import readers
|
|
69
|
-
from ingestr.src.filters import table_adapter_exclude_columns
|
|
70
|
-
from ingestr.src.github import github_reactions, github_repo_events, github_stargazers
|
|
71
|
-
from ingestr.src.google_ads import google_ads
|
|
72
|
-
from ingestr.src.google_analytics import google_analytics
|
|
73
|
-
from ingestr.src.google_sheets import google_spreadsheet
|
|
74
|
-
from ingestr.src.gorgias import gorgias_source
|
|
75
|
-
from ingestr.src.hubspot import hubspot
|
|
76
|
-
from ingestr.src.kafka import kafka_consumer
|
|
77
|
-
from ingestr.src.kafka.helpers import KafkaCredentials
|
|
78
|
-
from ingestr.src.klaviyo._init_ import klaviyo_source
|
|
79
|
-
from ingestr.src.linkedin_ads import linked_in_ads_source
|
|
80
|
-
from ingestr.src.linkedin_ads.dimension_time_enum import (
|
|
81
|
-
Dimension,
|
|
82
|
-
TimeGranularity,
|
|
83
|
-
)
|
|
84
|
-
from ingestr.src.mongodb import mongodb_collection
|
|
85
|
-
from ingestr.src.notion import notion_databases
|
|
86
|
-
from ingestr.src.personio import personio_source
|
|
87
|
-
from ingestr.src.salesforce import salesforce_source
|
|
88
|
-
from ingestr.src.shopify import shopify_source
|
|
89
|
-
from ingestr.src.slack import slack_source
|
|
90
|
-
from ingestr.src.sql_database.callbacks import (
|
|
91
|
-
chained_query_adapter_callback,
|
|
92
|
-
custom_query_variable_subsitution,
|
|
93
|
-
limit_callback,
|
|
94
|
-
type_adapter_callback,
|
|
95
|
-
)
|
|
96
|
-
from ingestr.src.stripe_analytics import stripe_source
|
|
97
38
|
from ingestr.src.table_definition import TableDefinition, table_string_to_dataclass
|
|
98
|
-
from ingestr.src.tiktok_ads import tiktok_source
|
|
99
|
-
from ingestr.src.time import isotime
|
|
100
|
-
from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
|
|
101
|
-
from ingestr.src.zendesk.helpers.credentials import (
|
|
102
|
-
ZendeskCredentialsOAuth,
|
|
103
|
-
ZendeskCredentialsToken,
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
|
|
107
|
-
TQueryAdapter = Callable[[SelectAny, Table], SelectAny]
|
|
108
39
|
|
|
109
40
|
|
|
110
41
|
class SqlSource:
|
|
111
42
|
table_builder: Callable
|
|
112
43
|
|
|
113
|
-
def __init__(self, table_builder=
|
|
44
|
+
def __init__(self, table_builder=None) -> None:
|
|
45
|
+
if table_builder is None:
|
|
46
|
+
from dlt.sources.sql_database import sql_table
|
|
47
|
+
|
|
48
|
+
table_builder = sql_table
|
|
49
|
+
|
|
114
50
|
self.table_builder = table_builder
|
|
115
51
|
|
|
116
52
|
def handles_incrementality(self) -> bool:
|
|
@@ -119,13 +55,16 @@ class SqlSource:
|
|
|
119
55
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
120
56
|
table_fields = TableDefinition(dataset="custom", table="custom")
|
|
121
57
|
if not table.startswith("query:"):
|
|
122
|
-
|
|
58
|
+
if uri.startswith("spanner://"):
|
|
59
|
+
table_fields = TableDefinition(dataset="", table=table)
|
|
60
|
+
else:
|
|
61
|
+
table_fields = table_string_to_dataclass(table)
|
|
123
62
|
|
|
124
63
|
incremental = None
|
|
125
64
|
if kwargs.get("incremental_key"):
|
|
126
65
|
start_value = kwargs.get("interval_start")
|
|
127
66
|
end_value = kwargs.get("interval_end")
|
|
128
|
-
incremental =
|
|
67
|
+
incremental = dlt_incremental(
|
|
129
68
|
kwargs.get("incremental_key", ""),
|
|
130
69
|
initial_value=start_value,
|
|
131
70
|
end_value=end_value,
|
|
@@ -133,36 +72,62 @@ class SqlSource:
|
|
|
133
72
|
range_start="closed",
|
|
134
73
|
)
|
|
135
74
|
|
|
75
|
+
engine_adapter_callback = None
|
|
76
|
+
|
|
77
|
+
if uri.startswith("md://") or uri.startswith("motherduck://"):
|
|
78
|
+
parsed_uri = urlparse(uri)
|
|
79
|
+
query_params = parse_qs(parsed_uri.query)
|
|
80
|
+
# Convert md:// URI to duckdb:///md: format
|
|
81
|
+
if parsed_uri.path:
|
|
82
|
+
db_path = parsed_uri.path
|
|
83
|
+
else:
|
|
84
|
+
db_path = ""
|
|
85
|
+
|
|
86
|
+
token = query_params.get("token", [""])[0]
|
|
87
|
+
if not token:
|
|
88
|
+
raise ValueError("Token is required for MotherDuck connection")
|
|
89
|
+
uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
|
|
90
|
+
|
|
136
91
|
if uri.startswith("mysql://"):
|
|
137
92
|
uri = uri.replace("mysql://", "mysql+pymysql://")
|
|
138
93
|
|
|
139
|
-
#
|
|
140
|
-
if uri.startswith("
|
|
94
|
+
# Monkey patch cx_Oracle to use oracledb (thin mode, no client libraries required)
|
|
95
|
+
if uri.startswith("oracle+") or uri.startswith("oracle://"):
|
|
96
|
+
try:
|
|
97
|
+
import oracledb # type: ignore[import-not-found]
|
|
98
|
+
|
|
99
|
+
# SQLAlchemy's cx_oracle dialect checks for version >= 5.2
|
|
100
|
+
# oracledb has a different versioning scheme, so we need to patch it
|
|
101
|
+
oracledb.version = "8.3.0" # type: ignore[assignment]
|
|
102
|
+
sys.modules["cx_Oracle"] = oracledb # type: ignore[assignment]
|
|
103
|
+
except ImportError:
|
|
104
|
+
# oracledb not installed, will fail later with a clear error
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
# Process Snowflake private key authentication
|
|
108
|
+
if uri.startswith("snowflake://"):
|
|
141
109
|
parsed_uri = urlparse(uri)
|
|
110
|
+
query_params = parse_qs(parsed_uri.query)
|
|
142
111
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
raise ValueError(
|
|
146
|
-
"A username is required to connect to the ClickHouse database."
|
|
147
|
-
)
|
|
112
|
+
if "private_key" in query_params:
|
|
113
|
+
from dlt.common.libs.cryptography import decode_private_key
|
|
148
114
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
"A password is required to authenticate with the ClickHouse database."
|
|
153
|
-
)
|
|
115
|
+
private_key = query_params["private_key"][0]
|
|
116
|
+
passphrase = query_params.get("private_key_passphrase", [None])[0]
|
|
117
|
+
decoded_key = decode_private_key(private_key, passphrase)
|
|
154
118
|
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
"The hostname or IP address of the ClickHouse server is required to establish a connection."
|
|
159
|
-
)
|
|
119
|
+
query_params["private_key"] = [base64.b64encode(decoded_key).decode()]
|
|
120
|
+
if "private_key_passphrase" in query_params:
|
|
121
|
+
del query_params["private_key_passphrase"]
|
|
160
122
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
123
|
+
# Rebuild URI
|
|
124
|
+
uri = parsed_uri._replace(
|
|
125
|
+
query=urlencode(query_params, doseq=True)
|
|
126
|
+
).geturl()
|
|
127
|
+
|
|
128
|
+
# clickhouse://<username>:<password>@<host>:<port>?secure=<secure>
|
|
129
|
+
if uri.startswith("clickhouse://"):
|
|
130
|
+
parsed_uri = urlparse(uri)
|
|
166
131
|
|
|
167
132
|
query_params = parse_qs(parsed_uri.query)
|
|
168
133
|
|
|
@@ -177,6 +142,73 @@ class SqlSource:
|
|
|
177
142
|
query=urlencode(query_params, doseq=True),
|
|
178
143
|
).geturl()
|
|
179
144
|
|
|
145
|
+
if uri.startswith("db2://"):
|
|
146
|
+
uri = uri.replace("db2://", "db2+ibm_db://")
|
|
147
|
+
|
|
148
|
+
if uri.startswith("spanner://"):
|
|
149
|
+
parsed_uri = urlparse(uri)
|
|
150
|
+
query_params = parse_qs(parsed_uri.query)
|
|
151
|
+
|
|
152
|
+
project_id_param = query_params.get("project_id")
|
|
153
|
+
instance_id_param = query_params.get("instance_id")
|
|
154
|
+
database_param = query_params.get("database")
|
|
155
|
+
|
|
156
|
+
cred_path = query_params.get("credentials_path")
|
|
157
|
+
cred_base64 = query_params.get("credentials_base64")
|
|
158
|
+
|
|
159
|
+
if not project_id_param or not instance_id_param or not database_param:
|
|
160
|
+
raise ValueError(
|
|
161
|
+
"project_id, instance_id and database are required in the URI to get data from Google Spanner"
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
project_id = project_id_param[0]
|
|
165
|
+
instance_id = instance_id_param[0]
|
|
166
|
+
database = database_param[0]
|
|
167
|
+
|
|
168
|
+
if not cred_path and not cred_base64:
|
|
169
|
+
raise ValueError(
|
|
170
|
+
"credentials_path or credentials_base64 is required in the URI to get data from Google Sheets"
|
|
171
|
+
)
|
|
172
|
+
if cred_path:
|
|
173
|
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path[0]
|
|
174
|
+
elif cred_base64:
|
|
175
|
+
credentials = json.loads(
|
|
176
|
+
base64.b64decode(cred_base64[0]).decode("utf-8")
|
|
177
|
+
)
|
|
178
|
+
temp = tempfile.NamedTemporaryFile(
|
|
179
|
+
mode="w", delete=False, suffix=".json"
|
|
180
|
+
)
|
|
181
|
+
json.dump(credentials, temp)
|
|
182
|
+
temp.close()
|
|
183
|
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp.name
|
|
184
|
+
|
|
185
|
+
uri = f"spanner+spanner:///projects/{project_id}/instances/{instance_id}/databases/{database}"
|
|
186
|
+
|
|
187
|
+
def eng_callback(engine):
|
|
188
|
+
return engine.execution_options(read_only=True)
|
|
189
|
+
|
|
190
|
+
engine_adapter_callback = eng_callback
|
|
191
|
+
from dlt.common.libs.sql_alchemy import (
|
|
192
|
+
Engine,
|
|
193
|
+
MetaData,
|
|
194
|
+
)
|
|
195
|
+
from dlt.sources.sql_database.schema_types import (
|
|
196
|
+
ReflectionLevel,
|
|
197
|
+
SelectAny,
|
|
198
|
+
Table,
|
|
199
|
+
TTypeAdapter,
|
|
200
|
+
)
|
|
201
|
+
from sqlalchemy import Column
|
|
202
|
+
from sqlalchemy import types as sa
|
|
203
|
+
|
|
204
|
+
from ingestr.src.filters import table_adapter_exclude_columns
|
|
205
|
+
from ingestr.src.sql_database.callbacks import (
|
|
206
|
+
chained_query_adapter_callback,
|
|
207
|
+
custom_query_variable_subsitution,
|
|
208
|
+
limit_callback,
|
|
209
|
+
type_adapter_callback,
|
|
210
|
+
)
|
|
211
|
+
|
|
180
212
|
query_adapters = []
|
|
181
213
|
if kwargs.get("sql_limit"):
|
|
182
214
|
query_adapters.append(
|
|
@@ -195,6 +227,13 @@ class SqlSource:
|
|
|
195
227
|
defer_table_reflect = True
|
|
196
228
|
query_value = table.split(":", 1)[1]
|
|
197
229
|
|
|
230
|
+
TableBackend: TypeAlias = Literal[
|
|
231
|
+
"sqlalchemy", "pyarrow", "pandas", "connectorx"
|
|
232
|
+
]
|
|
233
|
+
TQueryAdapter: TypeAlias = Callable[[SelectAny, Table], SelectAny]
|
|
234
|
+
import dlt
|
|
235
|
+
from dlt.common.typing import TDataItem
|
|
236
|
+
|
|
198
237
|
# this is a very hacky version of the table_rows function. it is built this way to go around the dlt's table loader.
|
|
199
238
|
# I didn't want to write a full fledged sqlalchemy source for now, and wanted to benefit from the existing stuff to begin with.
|
|
200
239
|
# this is by no means a production ready solution, but it works for now.
|
|
@@ -212,6 +251,9 @@ class SqlSource:
|
|
|
212
251
|
backend_kwargs: Dict[str, Any] = None, # type: ignore
|
|
213
252
|
type_adapter_callback: Optional[TTypeAdapter] = None,
|
|
214
253
|
included_columns: Optional[List[str]] = None,
|
|
254
|
+
excluded_columns: Optional[
|
|
255
|
+
List[str]
|
|
256
|
+
] = None, # Added for dlt 1.16.0 compatibility
|
|
215
257
|
query_adapter_callback: Optional[TQueryAdapter] = None,
|
|
216
258
|
resolve_foreign_keys: bool = False,
|
|
217
259
|
) -> Iterator[TDataItem]:
|
|
@@ -245,6 +287,8 @@ class SqlSource:
|
|
|
245
287
|
*cols,
|
|
246
288
|
)
|
|
247
289
|
|
|
290
|
+
from dlt.sources.sql_database.helpers import TableLoader
|
|
291
|
+
|
|
248
292
|
loader = TableLoader(
|
|
249
293
|
engine,
|
|
250
294
|
backend,
|
|
@@ -265,8 +309,54 @@ class SqlSource:
|
|
|
265
309
|
# override the query adapters, the only one we want is the one here in the case of custom queries
|
|
266
310
|
query_adapters = [custom_query_variable_subsitution(query_value, kwargs)]
|
|
267
311
|
|
|
312
|
+
credentials = ConnectionStringCredentials(uri)
|
|
313
|
+
if uri.startswith("mssql://"):
|
|
314
|
+
parsed_uri = urlparse(uri)
|
|
315
|
+
params = parse_qs(parsed_uri.query)
|
|
316
|
+
params = {k.lower(): v for k, v in params.items()}
|
|
317
|
+
if params.get("authentication") == ["ActiveDirectoryAccessToken"]:
|
|
318
|
+
import pyodbc # type: ignore
|
|
319
|
+
from sqlalchemy import create_engine
|
|
320
|
+
|
|
321
|
+
from ingestr.src.destinations import (
|
|
322
|
+
MSSQL_COPT_SS_ACCESS_TOKEN,
|
|
323
|
+
handle_datetimeoffset,
|
|
324
|
+
serialize_azure_token,
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
cfg = {
|
|
328
|
+
"DRIVER": params.get("driver", ["ODBC Driver 18 for SQL Server"])[
|
|
329
|
+
0
|
|
330
|
+
],
|
|
331
|
+
"SERVER": f"{parsed_uri.hostname},{parsed_uri.port or 1433}",
|
|
332
|
+
"DATABASE": parsed_uri.path.lstrip("/"),
|
|
333
|
+
}
|
|
334
|
+
for k, v in params.items():
|
|
335
|
+
if k.lower() not in ["driver", "authentication", "connect_timeout"]:
|
|
336
|
+
cfg[k.upper()] = v[0]
|
|
337
|
+
|
|
338
|
+
token = serialize_azure_token(parsed_uri.password)
|
|
339
|
+
dsn = ";".join([f"{k}={v}" for k, v in cfg.items()])
|
|
340
|
+
|
|
341
|
+
def creator():
|
|
342
|
+
connection = pyodbc.connect(
|
|
343
|
+
dsn,
|
|
344
|
+
autocommit=True,
|
|
345
|
+
timeout=kwargs.get("connect_timeout", 30),
|
|
346
|
+
attrs_before={
|
|
347
|
+
MSSQL_COPT_SS_ACCESS_TOKEN: token,
|
|
348
|
+
},
|
|
349
|
+
)
|
|
350
|
+
connection.add_output_converter(-155, handle_datetimeoffset)
|
|
351
|
+
return connection
|
|
352
|
+
|
|
353
|
+
credentials = create_engine(
|
|
354
|
+
"mssql+pyodbc://",
|
|
355
|
+
creator=creator,
|
|
356
|
+
)
|
|
357
|
+
|
|
268
358
|
builder_res = self.table_builder(
|
|
269
|
-
credentials=
|
|
359
|
+
credentials=credentials,
|
|
270
360
|
schema=table_fields.dataset,
|
|
271
361
|
table=table_fields.table,
|
|
272
362
|
incremental=incremental,
|
|
@@ -279,6 +369,7 @@ class SqlSource:
|
|
|
279
369
|
kwargs.get("sql_exclude_columns", [])
|
|
280
370
|
),
|
|
281
371
|
defer_table_reflect=defer_table_reflect,
|
|
372
|
+
engine_adapter_callback=engine_adapter_callback,
|
|
282
373
|
)
|
|
283
374
|
|
|
284
375
|
return builder_res
|
|
@@ -287,7 +378,12 @@ class SqlSource:
|
|
|
287
378
|
class ArrowMemoryMappedSource:
|
|
288
379
|
table_builder: Callable
|
|
289
380
|
|
|
290
|
-
def __init__(self, table_builder=
|
|
381
|
+
def __init__(self, table_builder=None) -> None:
|
|
382
|
+
if table_builder is None:
|
|
383
|
+
from ingestr.src.arrow import memory_mapped_arrow
|
|
384
|
+
|
|
385
|
+
table_builder = memory_mapped_arrow
|
|
386
|
+
|
|
291
387
|
self.table_builder = table_builder
|
|
292
388
|
|
|
293
389
|
def handles_incrementality(self) -> bool:
|
|
@@ -299,7 +395,7 @@ class ArrowMemoryMappedSource:
|
|
|
299
395
|
start_value = kwargs.get("interval_start")
|
|
300
396
|
end_value = kwargs.get("interval_end")
|
|
301
397
|
|
|
302
|
-
incremental =
|
|
398
|
+
incremental = dlt_incremental(
|
|
303
399
|
kwargs.get("incremental_key", ""),
|
|
304
400
|
initial_value=start_value,
|
|
305
401
|
end_value=end_value,
|
|
@@ -332,37 +428,199 @@ class ArrowMemoryMappedSource:
|
|
|
332
428
|
class MongoDbSource:
|
|
333
429
|
table_builder: Callable
|
|
334
430
|
|
|
335
|
-
def __init__(self, table_builder=
|
|
431
|
+
def __init__(self, table_builder=None) -> None:
|
|
432
|
+
if table_builder is None:
|
|
433
|
+
from ingestr.src.mongodb import mongodb_collection
|
|
434
|
+
|
|
435
|
+
table_builder = mongodb_collection
|
|
436
|
+
|
|
336
437
|
self.table_builder = table_builder
|
|
337
438
|
|
|
338
439
|
def handles_incrementality(self) -> bool:
|
|
339
440
|
return False
|
|
340
441
|
|
|
341
442
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
342
|
-
|
|
443
|
+
# Check if this is a custom query format (collection:query)
|
|
444
|
+
if ":" in table:
|
|
445
|
+
collection_name, query_json = table.split(":", 1)
|
|
343
446
|
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
end_value = kwargs.get("interval_end")
|
|
447
|
+
# Parse the query using MongoDB's extended JSON parser
|
|
448
|
+
# First, convert MongoDB shell syntax to Extended JSON format
|
|
449
|
+
from bson import json_util
|
|
348
450
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
451
|
+
from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json
|
|
452
|
+
|
|
453
|
+
# Convert MongoDB shell constructs to Extended JSON v2 format
|
|
454
|
+
converted_query = convert_mongo_shell_to_extended_json(query_json)
|
|
455
|
+
|
|
456
|
+
try:
|
|
457
|
+
query = json_util.loads(converted_query)
|
|
458
|
+
except Exception as e:
|
|
459
|
+
raise ValueError(f"Invalid MongoDB query format: {e}")
|
|
460
|
+
|
|
461
|
+
# Validate that it's a list for aggregation pipeline
|
|
462
|
+
if not isinstance(query, list):
|
|
463
|
+
raise ValueError(
|
|
464
|
+
"Query must be a JSON array representing a MongoDB aggregation pipeline"
|
|
465
|
+
)
|
|
466
|
+
|
|
467
|
+
# Check for incremental load requirements
|
|
468
|
+
incremental = None
|
|
469
|
+
if kwargs.get("incremental_key"):
|
|
470
|
+
start_value = kwargs.get("interval_start")
|
|
471
|
+
end_value = kwargs.get("interval_end")
|
|
472
|
+
|
|
473
|
+
# Validate that incremental key is present in the pipeline
|
|
474
|
+
incremental_key = kwargs.get("incremental_key")
|
|
475
|
+
self._validate_incremental_query(query, str(incremental_key))
|
|
476
|
+
|
|
477
|
+
incremental = dlt_incremental(
|
|
478
|
+
str(incremental_key),
|
|
479
|
+
initial_value=start_value,
|
|
480
|
+
end_value=end_value,
|
|
481
|
+
)
|
|
482
|
+
|
|
483
|
+
# Substitute interval parameters in the query
|
|
484
|
+
query = self._substitute_interval_params(query, kwargs)
|
|
485
|
+
|
|
486
|
+
# Parse collection name to get database and collection
|
|
487
|
+
if "." in collection_name:
|
|
488
|
+
# Handle database.collection format
|
|
489
|
+
table_fields = table_string_to_dataclass(collection_name)
|
|
490
|
+
database = table_fields.dataset
|
|
491
|
+
collection = table_fields.table
|
|
492
|
+
else:
|
|
493
|
+
# Single collection name, use default database
|
|
494
|
+
database = None
|
|
495
|
+
collection = collection_name
|
|
496
|
+
|
|
497
|
+
table_instance = self.table_builder(
|
|
498
|
+
connection_url=uri,
|
|
499
|
+
database=database,
|
|
500
|
+
collection=collection,
|
|
501
|
+
parallel=False,
|
|
502
|
+
incremental=incremental,
|
|
503
|
+
custom_query=query,
|
|
355
504
|
)
|
|
505
|
+
table_instance.max_table_nesting = 1
|
|
506
|
+
return table_instance
|
|
507
|
+
else:
|
|
508
|
+
# Default behavior for simple collection names
|
|
509
|
+
table_fields = table_string_to_dataclass(table)
|
|
356
510
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
parallel=True,
|
|
362
|
-
incremental=incremental,
|
|
363
|
-
)
|
|
511
|
+
incremental = None
|
|
512
|
+
if kwargs.get("incremental_key"):
|
|
513
|
+
start_value = kwargs.get("interval_start")
|
|
514
|
+
end_value = kwargs.get("interval_end")
|
|
364
515
|
|
|
365
|
-
|
|
516
|
+
incremental = dlt_incremental(
|
|
517
|
+
kwargs.get("incremental_key", ""),
|
|
518
|
+
initial_value=start_value,
|
|
519
|
+
end_value=end_value,
|
|
520
|
+
)
|
|
521
|
+
|
|
522
|
+
table_instance = self.table_builder(
|
|
523
|
+
connection_url=uri,
|
|
524
|
+
database=table_fields.dataset,
|
|
525
|
+
collection=table_fields.table,
|
|
526
|
+
parallel=False,
|
|
527
|
+
incremental=incremental,
|
|
528
|
+
)
|
|
529
|
+
table_instance.max_table_nesting = 1
|
|
530
|
+
|
|
531
|
+
return table_instance
|
|
532
|
+
|
|
533
|
+
def _validate_incremental_query(self, query: list, incremental_key: str):
|
|
534
|
+
"""Validate that incremental key is projected in the aggregation pipeline"""
|
|
535
|
+
# Check if there's a $project stage and if incremental_key is included
|
|
536
|
+
has_project = False
|
|
537
|
+
incremental_key_projected = False
|
|
538
|
+
|
|
539
|
+
for stage in query:
|
|
540
|
+
if "$project" in stage:
|
|
541
|
+
has_project = True
|
|
542
|
+
project_stage = stage["$project"]
|
|
543
|
+
if isinstance(project_stage, dict):
|
|
544
|
+
# Check if incremental_key is explicitly included
|
|
545
|
+
if incremental_key in project_stage:
|
|
546
|
+
if project_stage[incremental_key] not in [0, False]:
|
|
547
|
+
incremental_key_projected = True
|
|
548
|
+
# If there are only inclusions (1 or True values) and incremental_key is not included
|
|
549
|
+
elif any(v in [1, True] for v in project_stage.values()):
|
|
550
|
+
# This is an inclusion projection, incremental_key must be explicitly included
|
|
551
|
+
incremental_key_projected = False
|
|
552
|
+
# If there are only exclusions (0 or False values) and incremental_key is not excluded
|
|
553
|
+
elif all(
|
|
554
|
+
v in [0, False]
|
|
555
|
+
for v in project_stage.values()
|
|
556
|
+
if v in [0, False, 1, True]
|
|
557
|
+
):
|
|
558
|
+
# This is an exclusion projection, incremental_key is included by default
|
|
559
|
+
if incremental_key not in project_stage:
|
|
560
|
+
incremental_key_projected = True
|
|
561
|
+
else:
|
|
562
|
+
incremental_key_projected = project_stage[
|
|
563
|
+
incremental_key
|
|
564
|
+
] not in [0, False]
|
|
565
|
+
else:
|
|
566
|
+
# Mixed or unclear projection, assume incremental_key needs to be explicit
|
|
567
|
+
incremental_key_projected = False
|
|
568
|
+
|
|
569
|
+
# If there's a $project stage but incremental_key is not projected, raise error
|
|
570
|
+
if has_project and not incremental_key_projected:
|
|
571
|
+
raise ValueError(
|
|
572
|
+
f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
|
|
573
|
+
)
|
|
574
|
+
|
|
575
|
+
def _substitute_interval_params(self, query: list, kwargs: dict):
|
|
576
|
+
"""Substitute :interval_start and :interval_end placeholders with actual datetime values"""
|
|
577
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
578
|
+
|
|
579
|
+
# Get interval values and convert them to datetime objects
|
|
580
|
+
interval_start = kwargs.get("interval_start")
|
|
581
|
+
interval_end = kwargs.get("interval_end")
|
|
582
|
+
|
|
583
|
+
# Convert string dates to datetime objects if needed
|
|
584
|
+
if interval_start is not None:
|
|
585
|
+
if isinstance(interval_start, str):
|
|
586
|
+
pendulum_dt = ensure_pendulum_datetime(interval_start)
|
|
587
|
+
interval_start = (
|
|
588
|
+
pendulum_dt.to_datetime()
|
|
589
|
+
if hasattr(pendulum_dt, "to_datetime")
|
|
590
|
+
else pendulum_dt
|
|
591
|
+
)
|
|
592
|
+
elif hasattr(interval_start, "to_datetime"):
|
|
593
|
+
interval_start = interval_start.to_datetime()
|
|
594
|
+
|
|
595
|
+
if interval_end is not None:
|
|
596
|
+
if isinstance(interval_end, str):
|
|
597
|
+
pendulum_dt = ensure_pendulum_datetime(interval_end)
|
|
598
|
+
interval_end = (
|
|
599
|
+
pendulum_dt.to_datetime()
|
|
600
|
+
if hasattr(pendulum_dt, "to_datetime")
|
|
601
|
+
else pendulum_dt
|
|
602
|
+
)
|
|
603
|
+
elif hasattr(interval_end, "to_datetime"):
|
|
604
|
+
interval_end = interval_end.to_datetime()
|
|
605
|
+
|
|
606
|
+
# Deep copy the query and replace placeholders with actual datetime objects
|
|
607
|
+
def replace_placeholders(obj):
|
|
608
|
+
if isinstance(obj, dict):
|
|
609
|
+
result = {}
|
|
610
|
+
for key, value in obj.items():
|
|
611
|
+
if value == ":interval_start" and interval_start is not None:
|
|
612
|
+
result[key] = interval_start
|
|
613
|
+
elif value == ":interval_end" and interval_end is not None:
|
|
614
|
+
result[key] = interval_end
|
|
615
|
+
else:
|
|
616
|
+
result[key] = replace_placeholders(value)
|
|
617
|
+
return result
|
|
618
|
+
elif isinstance(obj, list):
|
|
619
|
+
return [replace_placeholders(item) for item in obj]
|
|
620
|
+
else:
|
|
621
|
+
return obj
|
|
622
|
+
|
|
623
|
+
return replace_placeholders(query)
|
|
366
624
|
|
|
367
625
|
|
|
368
626
|
class LocalCsvSource:
|
|
@@ -371,7 +629,7 @@ class LocalCsvSource:
|
|
|
371
629
|
|
|
372
630
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
373
631
|
def csv_file(
|
|
374
|
-
incremental: Optional[
|
|
632
|
+
incremental: Optional[dlt_incremental[Any]] = None,
|
|
375
633
|
):
|
|
376
634
|
file_path = uri.split("://")[1]
|
|
377
635
|
myFile = open(file_path, "r")
|
|
@@ -413,11 +671,13 @@ class LocalCsvSource:
|
|
|
413
671
|
if page:
|
|
414
672
|
yield page
|
|
415
673
|
|
|
416
|
-
|
|
674
|
+
from dlt import resource
|
|
675
|
+
|
|
676
|
+
return resource(
|
|
417
677
|
csv_file,
|
|
418
678
|
merge_key=kwargs.get("merge_key"), # type: ignore
|
|
419
679
|
)(
|
|
420
|
-
incremental=
|
|
680
|
+
incremental=dlt_incremental(
|
|
421
681
|
kwargs.get("incremental_key", ""),
|
|
422
682
|
initial_value=kwargs.get("interval_start"),
|
|
423
683
|
end_value=kwargs.get("interval_end"),
|
|
@@ -433,7 +693,12 @@ class LocalCsvSource:
|
|
|
433
693
|
class NotionSource:
|
|
434
694
|
table_builder: Callable
|
|
435
695
|
|
|
436
|
-
def __init__(self, table_builder=
|
|
696
|
+
def __init__(self, table_builder=None) -> None:
|
|
697
|
+
if table_builder is None:
|
|
698
|
+
from ingestr.src.notion import notion_databases
|
|
699
|
+
|
|
700
|
+
table_builder = notion_databases
|
|
701
|
+
|
|
437
702
|
self.table_builder = table_builder
|
|
438
703
|
|
|
439
704
|
def handles_incrementality(self) -> bool:
|
|
@@ -460,6 +725,11 @@ class ShopifySource:
|
|
|
460
725
|
return True
|
|
461
726
|
|
|
462
727
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
728
|
+
if kwargs.get("incremental_key"):
|
|
729
|
+
raise ValueError(
|
|
730
|
+
"Shopify takes care of incrementality on its own, you should not provide incremental_key"
|
|
731
|
+
)
|
|
732
|
+
|
|
463
733
|
source_fields = urlparse(uri)
|
|
464
734
|
source_params = parse_qs(source_fields.query)
|
|
465
735
|
api_key = source_params.get("api_key")
|
|
@@ -493,6 +763,8 @@ class ShopifySource:
|
|
|
493
763
|
f"Table name '{table}' is not supported for Shopify source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
494
764
|
)
|
|
495
765
|
|
|
766
|
+
from ingestr.src.shopify import shopify_source
|
|
767
|
+
|
|
496
768
|
return shopify_source(
|
|
497
769
|
private_app_password=api_key[0],
|
|
498
770
|
shop_url=f"https://{source_fields.netloc}",
|
|
@@ -537,6 +809,8 @@ class GorgiasSource:
|
|
|
537
809
|
if kwargs.get("interval_end"):
|
|
538
810
|
date_args["end_date"] = kwargs.get("interval_end")
|
|
539
811
|
|
|
812
|
+
from ingestr.src.gorgias import gorgias_source
|
|
813
|
+
|
|
540
814
|
return gorgias_source(
|
|
541
815
|
domain=source_fields.netloc,
|
|
542
816
|
email=email[0],
|
|
@@ -548,7 +822,12 @@ class GorgiasSource:
|
|
|
548
822
|
class GoogleSheetsSource:
|
|
549
823
|
table_builder: Callable
|
|
550
824
|
|
|
551
|
-
def __init__(self, table_builder=
|
|
825
|
+
def __init__(self, table_builder=None) -> None:
|
|
826
|
+
if table_builder is None:
|
|
827
|
+
from ingestr.src.google_sheets import google_spreadsheet
|
|
828
|
+
|
|
829
|
+
table_builder = google_spreadsheet
|
|
830
|
+
|
|
552
831
|
self.table_builder = table_builder
|
|
553
832
|
|
|
554
833
|
def handles_incrementality(self) -> bool:
|
|
@@ -629,6 +908,8 @@ class ChessSource:
|
|
|
629
908
|
f"Resource '{table}' is not supported for Chess source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
630
909
|
)
|
|
631
910
|
|
|
911
|
+
from ingestr.src.chess import source
|
|
912
|
+
|
|
632
913
|
return source(players=list_players, **date_args).with_resources(
|
|
633
914
|
table_mapping[table]
|
|
634
915
|
)
|
|
@@ -652,40 +933,74 @@ class StripeAnalyticsSource:
|
|
|
652
933
|
if not api_key:
|
|
653
934
|
raise ValueError("api_key in the URI is required to connect to Stripe")
|
|
654
935
|
|
|
936
|
+
table = table.lower()
|
|
937
|
+
|
|
938
|
+
from ingestr.src.stripe_analytics.settings import ENDPOINTS
|
|
939
|
+
|
|
655
940
|
endpoint = None
|
|
656
|
-
|
|
941
|
+
incremental = False
|
|
942
|
+
sync = False
|
|
657
943
|
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
"
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
"
|
|
667
|
-
"
|
|
668
|
-
]:
|
|
669
|
-
endpoint = table
|
|
944
|
+
table_fields = table.split(":")
|
|
945
|
+
if len(table_fields) == 1:
|
|
946
|
+
endpoint = table_fields[0]
|
|
947
|
+
elif len(table_fields) == 2:
|
|
948
|
+
endpoint = table_fields[0]
|
|
949
|
+
sync = table_fields[1] == "sync"
|
|
950
|
+
elif len(table_fields) == 3:
|
|
951
|
+
endpoint = table_fields[0]
|
|
952
|
+
sync = table_fields[1] == "sync"
|
|
953
|
+
incremental = table_fields[2] == "incremental"
|
|
670
954
|
else:
|
|
671
955
|
raise ValueError(
|
|
672
|
-
|
|
956
|
+
"Invalid Stripe table format. Expected: stripe:<endpoint> or stripe:<endpoint>:<sync> or stripe:<endpoint>:<sync>:<incremental>"
|
|
673
957
|
)
|
|
674
958
|
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
959
|
+
if incremental and not sync:
|
|
960
|
+
raise ValueError("incremental loads must be used with sync loading")
|
|
961
|
+
|
|
962
|
+
if incremental:
|
|
963
|
+
from ingestr.src.stripe_analytics import incremental_stripe_source
|
|
964
|
+
|
|
965
|
+
def nullable_date(date_str: Optional[str]):
|
|
966
|
+
if date_str:
|
|
967
|
+
return ensure_pendulum_datetime(date_str)
|
|
968
|
+
return None
|
|
969
|
+
|
|
970
|
+
endpoint = ENDPOINTS[endpoint]
|
|
971
|
+
return incremental_stripe_source(
|
|
972
|
+
endpoints=[
|
|
973
|
+
endpoint,
|
|
974
|
+
],
|
|
975
|
+
stripe_secret_key=api_key[0],
|
|
976
|
+
initial_start_date=nullable_date(kwargs.get("interval_start", None)),
|
|
977
|
+
end_date=nullable_date(kwargs.get("interval_end", None)),
|
|
978
|
+
).with_resources(endpoint)
|
|
979
|
+
else:
|
|
980
|
+
endpoint = ENDPOINTS[endpoint]
|
|
981
|
+
if sync:
|
|
982
|
+
from ingestr.src.stripe_analytics import stripe_source
|
|
983
|
+
|
|
984
|
+
return stripe_source(
|
|
985
|
+
endpoints=[
|
|
986
|
+
endpoint,
|
|
987
|
+
],
|
|
988
|
+
stripe_secret_key=api_key[0],
|
|
989
|
+
).with_resources(endpoint)
|
|
990
|
+
else:
|
|
991
|
+
from ingestr.src.stripe_analytics import async_stripe_source
|
|
992
|
+
|
|
993
|
+
return async_stripe_source(
|
|
994
|
+
endpoints=[
|
|
995
|
+
endpoint,
|
|
996
|
+
],
|
|
997
|
+
stripe_secret_key=api_key[0],
|
|
998
|
+
max_workers=kwargs.get("extract_parallelism", 4),
|
|
999
|
+
).with_resources(endpoint)
|
|
1000
|
+
|
|
1001
|
+
raise ValueError(
|
|
1002
|
+
f"Resource '{table}' is not supported for stripe source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
1003
|
+
)
|
|
689
1004
|
|
|
690
1005
|
|
|
691
1006
|
class FacebookAdsSource:
|
|
@@ -711,17 +1026,76 @@ class FacebookAdsSource:
|
|
|
711
1026
|
"access_token and accound_id are required to connect to Facebook Ads."
|
|
712
1027
|
)
|
|
713
1028
|
|
|
1029
|
+
from ingestr.src.facebook_ads import (
|
|
1030
|
+
facebook_ads_source,
|
|
1031
|
+
facebook_insights_source,
|
|
1032
|
+
)
|
|
1033
|
+
|
|
1034
|
+
insights_max_wait_to_finish_seconds = source_params.get(
|
|
1035
|
+
"insights_max_wait_to_finish_seconds", [60 * 60 * 4]
|
|
1036
|
+
)
|
|
1037
|
+
insights_max_wait_to_start_seconds = source_params.get(
|
|
1038
|
+
"insights_max_wait_to_start_seconds", [60 * 30]
|
|
1039
|
+
)
|
|
1040
|
+
insights_max_async_sleep_seconds = source_params.get(
|
|
1041
|
+
"insights_max_async_sleep_seconds", [20]
|
|
1042
|
+
)
|
|
1043
|
+
|
|
714
1044
|
endpoint = None
|
|
715
1045
|
if table in ["campaigns", "ad_sets", "ad_creatives", "ads", "leads"]:
|
|
716
1046
|
endpoint = table
|
|
717
|
-
elif table
|
|
1047
|
+
elif table == "facebook_insights":
|
|
718
1048
|
return facebook_insights_source(
|
|
719
1049
|
access_token=access_token[0],
|
|
720
1050
|
account_id=account_id[0],
|
|
1051
|
+
start_date=kwargs.get("interval_start"),
|
|
1052
|
+
end_date=kwargs.get("interval_end"),
|
|
1053
|
+
insights_max_wait_to_finish_seconds=insights_max_wait_to_finish_seconds[
|
|
1054
|
+
0
|
|
1055
|
+
],
|
|
1056
|
+
insights_max_wait_to_start_seconds=insights_max_wait_to_start_seconds[
|
|
1057
|
+
0
|
|
1058
|
+
],
|
|
1059
|
+
insights_max_async_sleep_seconds=insights_max_async_sleep_seconds[0],
|
|
721
1060
|
).with_resources("facebook_insights")
|
|
1061
|
+
elif table.startswith("facebook_insights:"):
|
|
1062
|
+
# Parse custom breakdowns and metrics from table name
|
|
1063
|
+
# Supported formats:
|
|
1064
|
+
# facebook_insights:breakdown_type
|
|
1065
|
+
# facebook_insights:breakdown_type:metric1,metric2...
|
|
1066
|
+
parts = table.split(":")
|
|
1067
|
+
|
|
1068
|
+
if len(parts) < 2 or len(parts) > 3:
|
|
1069
|
+
raise ValueError(
|
|
1070
|
+
"Invalid facebook_insights format. Expected: facebook_insights:breakdown_type or facebook_insights:breakdown_type:metric1,metric2..."
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
breakdown_type = parts[1].strip()
|
|
1074
|
+
if not breakdown_type:
|
|
1075
|
+
raise ValueError(
|
|
1076
|
+
"Breakdown type must be provided in format: facebook_insights:breakdown_type"
|
|
1077
|
+
)
|
|
1078
|
+
|
|
1079
|
+
# Validate breakdown type against available options from settings
|
|
1080
|
+
|
|
1081
|
+
from ingestr.src.facebook_ads.helpers import (
|
|
1082
|
+
parse_insights_table_to_source_kwargs,
|
|
1083
|
+
)
|
|
1084
|
+
|
|
1085
|
+
source_kwargs = {
|
|
1086
|
+
"access_token": access_token[0],
|
|
1087
|
+
"account_id": account_id[0],
|
|
1088
|
+
"start_date": kwargs.get("interval_start"),
|
|
1089
|
+
"end_date": kwargs.get("interval_end"),
|
|
1090
|
+
}
|
|
1091
|
+
|
|
1092
|
+
source_kwargs.update(parse_insights_table_to_source_kwargs(table))
|
|
1093
|
+
return facebook_insights_source(**source_kwargs).with_resources(
|
|
1094
|
+
"facebook_insights"
|
|
1095
|
+
)
|
|
722
1096
|
else:
|
|
723
1097
|
raise ValueError(
|
|
724
|
-
"
|
|
1098
|
+
f"Resource '{table}' is not supported for Facebook Ads source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
725
1099
|
)
|
|
726
1100
|
|
|
727
1101
|
return facebook_ads_source(
|
|
@@ -768,6 +1142,8 @@ class SlackSource:
|
|
|
768
1142
|
if kwargs.get("interval_end"):
|
|
769
1143
|
date_args["end_date"] = kwargs.get("interval_end")
|
|
770
1144
|
|
|
1145
|
+
from ingestr.src.slack import slack_source
|
|
1146
|
+
|
|
771
1147
|
return slack_source(
|
|
772
1148
|
access_token=api_key[0],
|
|
773
1149
|
table_per_channel=False,
|
|
@@ -778,7 +1154,7 @@ class SlackSource:
|
|
|
778
1154
|
|
|
779
1155
|
class HubspotSource:
|
|
780
1156
|
def handles_incrementality(self) -> bool:
|
|
781
|
-
return
|
|
1157
|
+
return False
|
|
782
1158
|
|
|
783
1159
|
# hubspot://?api_key=<api_key>
|
|
784
1160
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
@@ -796,7 +1172,35 @@ class HubspotSource:
|
|
|
796
1172
|
raise ValueError("api_key in the URI is required to connect to Hubspot")
|
|
797
1173
|
|
|
798
1174
|
endpoint = None
|
|
799
|
-
|
|
1175
|
+
|
|
1176
|
+
from ingestr.src.hubspot import hubspot
|
|
1177
|
+
|
|
1178
|
+
if table.startswith("custom:"):
|
|
1179
|
+
fields = table.split(":", 2)
|
|
1180
|
+
if len(fields) != 2 and len(fields) != 3:
|
|
1181
|
+
raise ValueError(
|
|
1182
|
+
"Invalid Hubspot custom table format. Expected format: custom:<custom_object_type> or custom:<custom_object_type>:<associations>"
|
|
1183
|
+
)
|
|
1184
|
+
|
|
1185
|
+
if len(fields) == 2:
|
|
1186
|
+
endpoint = fields[1]
|
|
1187
|
+
else:
|
|
1188
|
+
endpoint = f"{fields[1]}:{fields[2]}"
|
|
1189
|
+
|
|
1190
|
+
return hubspot(
|
|
1191
|
+
api_key=api_key[0],
|
|
1192
|
+
custom_object=endpoint,
|
|
1193
|
+
).with_resources("custom")
|
|
1194
|
+
|
|
1195
|
+
elif table in [
|
|
1196
|
+
"contacts",
|
|
1197
|
+
"companies",
|
|
1198
|
+
"deals",
|
|
1199
|
+
"tickets",
|
|
1200
|
+
"products",
|
|
1201
|
+
"quotes",
|
|
1202
|
+
"schemas",
|
|
1203
|
+
]:
|
|
800
1204
|
endpoint = table
|
|
801
1205
|
else:
|
|
802
1206
|
raise ValueError(
|
|
@@ -821,20 +1225,31 @@ class AirtableSource:
|
|
|
821
1225
|
if not table:
|
|
822
1226
|
raise ValueError("Source table is required to connect to Airtable")
|
|
823
1227
|
|
|
824
|
-
tables = table.split(",")
|
|
825
|
-
|
|
826
1228
|
source_parts = urlparse(uri)
|
|
827
1229
|
source_fields = parse_qs(source_parts.query)
|
|
828
|
-
base_id = source_fields.get("base_id")
|
|
829
1230
|
access_token = source_fields.get("access_token")
|
|
830
1231
|
|
|
831
|
-
if not
|
|
1232
|
+
if not access_token:
|
|
832
1233
|
raise ValueError(
|
|
833
|
-
"
|
|
1234
|
+
"access_token in the URI is required to connect to Airtable"
|
|
834
1235
|
)
|
|
835
1236
|
|
|
1237
|
+
base_id = source_fields.get("base_id", [None])[0]
|
|
1238
|
+
clean_table = table
|
|
1239
|
+
|
|
1240
|
+
table_fields = table.split("/")
|
|
1241
|
+
if len(table_fields) == 2:
|
|
1242
|
+
clean_table = table_fields[1]
|
|
1243
|
+
if not base_id:
|
|
1244
|
+
base_id = table_fields[0]
|
|
1245
|
+
|
|
1246
|
+
if not base_id:
|
|
1247
|
+
raise ValueError("base_id in the URI is required to connect to Airtable")
|
|
1248
|
+
|
|
1249
|
+
from ingestr.src.airtable import airtable_source
|
|
1250
|
+
|
|
836
1251
|
return airtable_source(
|
|
837
|
-
base_id=base_id
|
|
1252
|
+
base_id=base_id, table_names=[clean_table], access_token=access_token[0]
|
|
838
1253
|
)
|
|
839
1254
|
|
|
840
1255
|
|
|
@@ -880,12 +1295,66 @@ class KlaviyoSource:
|
|
|
880
1295
|
)
|
|
881
1296
|
|
|
882
1297
|
start_date = kwargs.get("interval_start") or "2000-01-01"
|
|
1298
|
+
|
|
1299
|
+
from ingestr.src.klaviyo import klaviyo_source
|
|
1300
|
+
|
|
883
1301
|
return klaviyo_source(
|
|
884
1302
|
api_key=api_key[0],
|
|
885
1303
|
start_date=start_date,
|
|
886
1304
|
).with_resources(resource)
|
|
887
1305
|
|
|
888
1306
|
|
|
1307
|
+
class MixpanelSource:
|
|
1308
|
+
def handles_incrementality(self) -> bool:
|
|
1309
|
+
return True
|
|
1310
|
+
|
|
1311
|
+
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
1312
|
+
if kwargs.get("incremental_key"):
|
|
1313
|
+
raise ValueError(
|
|
1314
|
+
"Mixpanel takes care of incrementality on its own, you should not provide incremental_key"
|
|
1315
|
+
)
|
|
1316
|
+
|
|
1317
|
+
parsed = urlparse(uri)
|
|
1318
|
+
params = parse_qs(parsed.query)
|
|
1319
|
+
username = params.get("username")
|
|
1320
|
+
password = params.get("password")
|
|
1321
|
+
project_id = params.get("project_id")
|
|
1322
|
+
server = params.get("server", ["eu"])
|
|
1323
|
+
|
|
1324
|
+
if not username or not password or not project_id:
|
|
1325
|
+
raise ValueError(
|
|
1326
|
+
"username, password, project_id are required to connect to Mixpanel"
|
|
1327
|
+
)
|
|
1328
|
+
|
|
1329
|
+
if table not in ["events", "profiles"]:
|
|
1330
|
+
raise ValueError(
|
|
1331
|
+
f"Resource '{table}' is not supported for Mixpanel source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
1332
|
+
)
|
|
1333
|
+
|
|
1334
|
+
start_date = kwargs.get("interval_start")
|
|
1335
|
+
if start_date:
|
|
1336
|
+
start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
|
|
1337
|
+
else:
|
|
1338
|
+
start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")
|
|
1339
|
+
|
|
1340
|
+
end_date = kwargs.get("interval_end")
|
|
1341
|
+
if end_date:
|
|
1342
|
+
end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
|
|
1343
|
+
else:
|
|
1344
|
+
end_date = pendulum.now().in_timezone("UTC")
|
|
1345
|
+
|
|
1346
|
+
from ingestr.src.mixpanel import mixpanel_source
|
|
1347
|
+
|
|
1348
|
+
return mixpanel_source(
|
|
1349
|
+
username=username[0],
|
|
1350
|
+
password=password[0],
|
|
1351
|
+
project_id=project_id[0],
|
|
1352
|
+
start_date=start_date,
|
|
1353
|
+
end_date=end_date,
|
|
1354
|
+
server=server[0],
|
|
1355
|
+
).with_resources(table)
|
|
1356
|
+
|
|
1357
|
+
|
|
889
1358
|
class KafkaSource:
|
|
890
1359
|
def handles_incrementality(self) -> bool:
|
|
891
1360
|
return False
|
|
@@ -913,6 +1382,9 @@ class KafkaSource:
|
|
|
913
1382
|
raise ValueError("group_id in the URI is required to connect to kafka")
|
|
914
1383
|
|
|
915
1384
|
start_date = kwargs.get("interval_start")
|
|
1385
|
+
from ingestr.src.kafka import kafka_consumer
|
|
1386
|
+
from ingestr.src.kafka.helpers import KafkaCredentials
|
|
1387
|
+
|
|
916
1388
|
return kafka_consumer(
|
|
917
1389
|
topics=[table],
|
|
918
1390
|
credentials=KafkaCredentials(
|
|
@@ -968,6 +1440,9 @@ class AdjustSource:
|
|
|
968
1440
|
if kwargs.get("interval_end"):
|
|
969
1441
|
end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
|
|
970
1442
|
|
|
1443
|
+
from ingestr.src.adjust import REQUIRED_CUSTOM_DIMENSIONS, adjust_source
|
|
1444
|
+
from ingestr.src.adjust.adjust_helpers import parse_filters
|
|
1445
|
+
|
|
971
1446
|
dimensions = None
|
|
972
1447
|
metrics = None
|
|
973
1448
|
filters = []
|
|
@@ -1015,6 +1490,8 @@ class AppsflyerSource:
|
|
|
1015
1490
|
return True
|
|
1016
1491
|
|
|
1017
1492
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
1493
|
+
from ingestr.src.appsflyer import appsflyer_source
|
|
1494
|
+
|
|
1018
1495
|
if kwargs.get("incremental_key"):
|
|
1019
1496
|
raise ValueError(
|
|
1020
1497
|
"Appsflyer_Source takes care of incrementality on its own, you should not provide incremental_key"
|
|
@@ -1027,22 +1504,27 @@ class AppsflyerSource:
|
|
|
1027
1504
|
if not api_key:
|
|
1028
1505
|
raise ValueError("api_key in the URI is required to connect to Appsflyer")
|
|
1029
1506
|
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
)
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1507
|
+
start_date = kwargs.get("interval_start")
|
|
1508
|
+
end_date = kwargs.get("interval_end")
|
|
1509
|
+
dimensions = []
|
|
1510
|
+
metrics = []
|
|
1511
|
+
if table.startswith("custom:"):
|
|
1512
|
+
fields = table.split(":", 3)
|
|
1513
|
+
if len(fields) != 3:
|
|
1514
|
+
raise ValueError(
|
|
1515
|
+
"Invalid Adjust custom table format. Expected format: custom:<dimensions>:<metrics>"
|
|
1516
|
+
)
|
|
1517
|
+
dimensions = fields[1].split(",")
|
|
1518
|
+
metrics = fields[2].split(",")
|
|
1519
|
+
table = "custom"
|
|
1040
1520
|
|
|
1041
1521
|
return appsflyer_source(
|
|
1042
1522
|
api_key=api_key[0],
|
|
1043
|
-
start_date=start_date,
|
|
1044
|
-
end_date=end_date,
|
|
1045
|
-
|
|
1523
|
+
start_date=start_date.strftime("%Y-%m-%d") if start_date else None, # type: ignore
|
|
1524
|
+
end_date=end_date.strftime("%Y-%m-%d") if end_date else None, # type: ignore
|
|
1525
|
+
dimensions=dimensions,
|
|
1526
|
+
metrics=metrics,
|
|
1527
|
+
).with_resources(table)
|
|
1046
1528
|
|
|
1047
1529
|
|
|
1048
1530
|
class ZendeskSource:
|
|
@@ -1067,6 +1549,12 @@ class ZendeskSource:
|
|
|
1067
1549
|
if not subdomain:
|
|
1068
1550
|
raise ValueError("Subdomain is required to connect with Zendesk")
|
|
1069
1551
|
|
|
1552
|
+
from ingestr.src.zendesk import zendesk_chat, zendesk_support, zendesk_talk
|
|
1553
|
+
from ingestr.src.zendesk.helpers.credentials import (
|
|
1554
|
+
ZendeskCredentialsOAuth,
|
|
1555
|
+
ZendeskCredentialsToken,
|
|
1556
|
+
)
|
|
1557
|
+
|
|
1070
1558
|
if not source_fields.username and source_fields.password:
|
|
1071
1559
|
oauth_token = source_fields.password
|
|
1072
1560
|
if not oauth_token:
|
|
@@ -1125,7 +1613,7 @@ class ZendeskSource:
|
|
|
1125
1613
|
).with_resources(table)
|
|
1126
1614
|
else:
|
|
1127
1615
|
raise ValueError(
|
|
1128
|
-
"
|
|
1616
|
+
f"Resource '{table}' is not supported for Zendesk source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
|
|
1129
1617
|
)
|
|
1130
1618
|
|
|
1131
1619
|
|
|
@@ -1140,7 +1628,7 @@ class S3Source:
|
|
|
1140
1628
|
)
|
|
1141
1629
|
|
|
1142
1630
|
parsed_uri = urlparse(uri)
|
|
1143
|
-
source_fields = parse_qs(
|
|
1631
|
+
source_fields = parse_qs(parsed_uri.query)
|
|
1144
1632
|
access_key_id = source_fields.get("access_key_id")
|
|
1145
1633
|
if not access_key_id:
|
|
1146
1634
|
raise ValueError("access_key_id is required to connect to S3")
|
|
@@ -1155,22 +1643,34 @@ class S3Source:
|
|
|
1155
1643
|
|
|
1156
1644
|
bucket_url = f"s3://{bucket_name}/"
|
|
1157
1645
|
|
|
1646
|
+
import s3fs # type: ignore
|
|
1647
|
+
|
|
1158
1648
|
fs = s3fs.S3FileSystem(
|
|
1159
1649
|
key=access_key_id[0],
|
|
1160
1650
|
secret=secret_access_key[0],
|
|
1161
1651
|
)
|
|
1162
1652
|
|
|
1163
|
-
|
|
1164
|
-
if
|
|
1165
|
-
endpoint = "
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1653
|
+
endpoint: Optional[str] = None
|
|
1654
|
+
if "#" in table:
|
|
1655
|
+
_, endpoint = table.split("#")
|
|
1656
|
+
if endpoint not in ["csv", "jsonl", "parquet"]:
|
|
1657
|
+
raise ValueError(
|
|
1658
|
+
"S3 Source only supports specific formats files: csv, jsonl, parquet"
|
|
1659
|
+
)
|
|
1660
|
+
endpoint = f"read_{endpoint}"
|
|
1170
1661
|
else:
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1662
|
+
try:
|
|
1663
|
+
endpoint = blob.parse_endpoint(path_to_file)
|
|
1664
|
+
except blob.UnsupportedEndpointError:
|
|
1665
|
+
raise ValueError(
|
|
1666
|
+
"S3 Source only supports specific formats files: csv, jsonl, parquet"
|
|
1667
|
+
)
|
|
1668
|
+
except Exception as e:
|
|
1669
|
+
raise ValueError(
|
|
1670
|
+
f"Failed to parse endpoint from path: {path_to_file}"
|
|
1671
|
+
) from e
|
|
1672
|
+
|
|
1673
|
+
from ingestr.src.filesystem import readers
|
|
1174
1674
|
|
|
1175
1675
|
return readers(bucket_url, fs, path_to_file).with_resources(endpoint)
|
|
1176
1676
|
|
|
@@ -1181,6 +1681,11 @@ class TikTokSource:
|
|
|
1181
1681
|
return True
|
|
1182
1682
|
|
|
1183
1683
|
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
1684
|
+
if kwargs.get("incremental_key"):
|
|
1685
|
+
raise ValueError(
|
|
1686
|
+
"TikTok takes care of incrementality on its own, you should not provide incremental_key"
|
|
1687
|
+
)
|
|
1688
|
+
|
|
1184
1689
|
endpoint = "custom_reports"
|
|
1185
1690
|
|
|
1186
1691
|
parsed_uri = urlparse(uri)
|
|
@@ -1266,6 +1771,8 @@ class TikTokSource:
|
|
|
1266
1771
|
filter_name = list(filters.keys())[0]
|
|
1267
1772
|
filter_value = list(map(int, filters[list(filters.keys())[0]]))
|
|
1268
1773
|
|
|
1774
|
+
from ingestr.src.tiktok_ads import tiktok_source
|
|
1775
|
+
|
|
1269
1776
|
return tiktok_source(
|
|
1270
1777
|
start_date=start_date,
|
|
1271
1778
|
end_date=end_date,
|
|
@@ -1314,15 +1821,78 @@ class AsanaSource:
                 f"Resource '{table}' is not supported for Asana source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
             )

+        import dlt
+
+        from ingestr.src.asana_source import asana_source
+
         dlt.secrets["sources.asana_source.access_token"] = access_token[0]
+
         src = asana_source()
         src.workspaces.add_filter(lambda w: w["gid"] == workspace)
         return src.with_resources(table)


-class
-
-
+class JiraSource:
+    resources = [
+        "projects",
+        "issues",
+        "users",
+        "issue_types",
+        "statuses",
+        "priorities",
+        "resolutions",
+        "project_versions",
+        "project_components",
+        "events",
+    ]
+
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        base_url = f"https://{parsed_uri.netloc}"
+        email = params.get("email")
+        api_token = params.get("api_token")
+
+        if not email:
+            raise ValueError("email must be specified in the URI query parameters")
+
+        if not api_token:
+            raise ValueError("api_token is required for connecting to Jira")
+
+        flags = {
+            "skip_archived": False,
+        }
+        if ":" in table:
+            table, rest = table.split(":", 1)  # type: ignore
+            for k in rest.split(":"):
+                flags[k] = True
+
+        if table not in self.resources:
+            raise ValueError(
+                f"Resource '{table}' is not supported for Jira source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
+            )
+
+        import dlt
+
+        from ingestr.src.jira_source import jira_source
+
+        dlt.secrets["sources.jira_source.base_url"] = base_url
+        dlt.secrets["sources.jira_source.email"] = email[0]
+        dlt.secrets["sources.jira_source.api_token"] = api_token[0]
+
+        src = jira_source()
+        if flags["skip_archived"]:
+            src.projects.add_filter(lambda p: not p.get("archived", False))
+        return src.with_resources(table)
+
+
+class DynamoDBSource:
+    AWS_ENDPOINT_PATTERN = re.compile(".*\.(.+)\.amazonaws\.com")
+
     def infer_aws_region(self, uri: ParseResult) -> Optional[str]:
         # try to infer from URI
         matches = self.AWS_ENDPOINT_PATTERN.match(uri.netloc)
@@ -1350,7 +1920,7 @@ class DynamoDBSource:
         if not region:
             raise ValueError("region is required to connect to Dynamodb")

-        qs = parse_qs(
+        qs = parse_qs(parsed_uri.query)
         access_key = qs.get("access_key_id")

         if not access_key:
@@ -1360,6 +1930,9 @@ class DynamoDBSource:
         if not secret_key:
             raise ValueError("secret_access_key is required to connect to Dynamodb")

+        from dlt.common.configuration.specs import AwsCredentials
+        from dlt.common.typing import TSecretStrValue
+
         creds = AwsCredentials(
             aws_access_key_id=access_key[0],
             aws_secret_access_key=TSecretStrValue(secret_key[0]),
@@ -1370,8 +1943,11 @@ class DynamoDBSource:
         incremental = None
         incremental_key = kwargs.get("incremental_key")

+        from ingestr.src.dynamodb import dynamodb
+        from ingestr.src.time import isotime
+
         if incremental_key:
-            incremental =
+            incremental = dlt_incremental(
                 incremental_key.strip(),
                 initial_value=isotime(kwargs.get("interval_start")),
                 end_value=isotime(kwargs.get("interval_end")),
@@ -1383,47 +1959,127 @@ class DynamoDBSource:
         return dynamodb(table, creds, incremental)


+class DoceboSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        # docebo://?base_url=https://yourcompany.docebosaas.com&client_id=xxx&client_secret=xxx
+        # Optional: &username=xxx&password=xxx for password grant type
+
+        if kwargs.get("incremental_key"):
+            raise ValueError("Incremental loads are not yet supported for Docebo")
+
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+
+        base_url = source_params.get("base_url")
+        if not base_url:
+            raise ValueError("base_url is required to connect to Docebo")
+
+        client_id = source_params.get("client_id")
+        if not client_id:
+            raise ValueError("client_id is required to connect to Docebo")
+
+        client_secret = source_params.get("client_secret")
+        if not client_secret:
+            raise ValueError("client_secret is required to connect to Docebo")
+
+        # Username and password are optional (uses client_credentials grant if not provided)
+        username = source_params.get("username", [None])[0]
+        password = source_params.get("password", [None])[0]
+
+        # Supported tables
+        supported_tables = [
+            "users",
+            "courses",
+            "user_fields",
+            "branches",
+            "groups",
+            "group_members",
+            "course_fields",
+            "learning_objects",
+            "learning_plans",
+            "learning_plan_enrollments",
+            "learning_plan_course_enrollments",
+            "course_enrollments",
+            "sessions",
+            "categories",
+            "certifications",
+            "external_training",
+            "survey_answers",
+        ]
+        if table not in supported_tables:
+            raise ValueError(
+                f"Resource '{table}' is not supported for Docebo source. Supported tables: {', '.join(supported_tables)}"
+            )
+
+        from ingestr.src.docebo import docebo_source
+
+        return docebo_source(
+            base_url=base_url[0],
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            username=username,
+            password=password,
+        ).with_resources(table)
+
+
 class GoogleAnalyticsSource:
     def handles_incrementality(self) -> bool:
         return True

     def dlt_source(self, uri: str, table: str, **kwargs):
-
-        source_fields = parse_qs(parse_uri.query)
-        cred_path = source_fields.get("credentials_path")
-
-        if not cred_path:
-            raise ValueError("credentials_path is required to connect Google Analytics")
-        credentials = {}
+        import ingestr.src.google_analytics.helpers as helpers

-
-
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Google Analytics takes care of incrementality on its own, you should not provide incremental_key"
+            )

-
-
-
+        result = helpers.parse_google_analytics_uri(uri)
+        credentials = result["credentials"]
+        property_id = result["property_id"]

         fields = table.split(":")
-        if len(fields) != 3:
+        if len(fields) != 3 and len(fields) != 4:
             raise ValueError(
-                "Invalid table format. Expected format:
+                "Invalid table format. Expected format: <report_type>:<dimensions>:<metrics> or <report_type>:<dimensions>:<metrics>:<minute_ranges>"
             )

-
-
-        datetime = ""
-        for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
-            if dimension_datetime in dimensions:
-                datetime = dimension_datetime
-                break
-        else:
+        report_type = fields[0]
+        if report_type not in ["custom", "realtime"]:
             raise ValueError(
-                "
+                "Invalid report type. Expected format: <report_type>:<dimensions>:<metrics>. Available report types: custom, realtime"
             )

+        dimensions = fields[1].replace(" ", "").split(",")
         metrics = fields[2].replace(" ", "").split(",")
+
+        minute_range_objects = []
+        if len(fields) == 4:
+            minute_range_objects = (
+                helpers.convert_minutes_ranges_to_minute_range_objects(fields[3])
+            )
+
+        datetime = ""
+        resource_name = fields[0].lower()
+        if resource_name == "custom":
+            for dimension_datetime in ["date", "dateHourMinute", "dateHour"]:
+                if dimension_datetime in dimensions:
+                    datetime = dimension_datetime
+                    break
+            else:
+                raise ValueError(
+                    "You must provide at least one dimension: [dateHour, dateHourMinute, date]"
+                )
+
         queries = [
-            {
+            {
+                "resource_name": resource_name,
+                "dimensions": dimensions,
+                "metrics": metrics,
+            }
         ]

         start_date = pendulum.now().subtract(days=30).start_of("day")
@@ -1434,14 +2090,17 @@ class GoogleAnalyticsSource:
         if kwargs.get("interval_end") is not None:
             end_date = pendulum.instance(kwargs.get("interval_end"))  # type: ignore

+        from ingestr.src.google_analytics import google_analytics
+
         return google_analytics(
-            property_id=property_id
+            property_id=property_id,
             start_date=start_date,
             end_date=end_date,
             datetime_dimension=datetime,
             queries=queries,
             credentials=credentials,
-
+            minute_range_objects=minute_range_objects if minute_range_objects else None,
+        ).with_resources(resource_name)


 class GitHubSource:
@@ -1471,12 +2130,34 @@ class GitHubSource:

         access_token = source_fields.get("access_token", [""])[0]

+        from ingestr.src.github import (
+            github_reactions,
+            github_repo_events,
+            github_stargazers,
+        )
+
         if table in ["issues", "pull_requests"]:
             return github_reactions(
                 owner=owner, name=repo, access_token=access_token
             ).with_resources(table)
         elif table == "repo_events":
-
+            start_date = kwargs.get("interval_start") or pendulum.now().subtract(
+                days=30
+            )
+            end_date = kwargs.get("interval_end") or None
+
+            if isinstance(start_date, str):
+                start_date = pendulum.parse(start_date)
+            if isinstance(end_date, str):
+                end_date = pendulum.parse(end_date)
+
+            return github_repo_events(
+                owner=owner,
+                name=repo,
+                access_token=access_token,
+                start_date=start_date,
+                end_date=end_date,
+            )
         elif table == "stargazers":
             return github_stargazers(owner=owner, name=repo, access_token=access_token)
         else:
@@ -1503,6 +2184,8 @@ class AppleAppStoreSource:
         else:
             key = base64.b64decode(key_base64[0]).decode()  # type: ignore

+        from ingestr.src.appstore.client import AppStoreConnectClient
+
         return AppStoreConnectClient(key.encode(), key_id, issuer_id)

     def dlt_source(self, uri: str, table: str, **kwargs):
@@ -1543,6 +2226,8 @@ class AppleAppStoreSource:
         if app_ids is None:
             raise MissingValueError("app_id", "App Store")

+        from ingestr.src.appstore import app_store
+
         src = app_store(
             client,
             app_ids,
@@ -1599,21 +2284,24 @@ class GCSSource:
         # (The RECOMMENDED way of passing service account credentials)
         # directly with gcsfs. As a workaround, we construct the GCSFileSystem
         # and pass it directly to filesystem.readers.
+        import gcsfs  # type: ignore
+
         fs = gcsfs.GCSFileSystem(
             token=credentials,
         )

-
-
-
-        elif file_extension == "jsonl":
-            endpoint = "read_jsonl"
-        elif file_extension == "parquet":
-            endpoint = "read_parquet"
-        else:
+        try:
+            endpoint = blob.parse_endpoint(path_to_file)
+        except blob.UnsupportedEndpointError:
             raise ValueError(
                 "GCS Source only supports specific formats files: csv, jsonl, parquet"
             )
+        except Exception as e:
+            raise ValueError(
+                f"Failed to parse endpoint from path: {path_to_file}"
+            ) from e
+
+        from ingestr.src.filesystem import readers

         return readers(bucket_url, fs, path_to_file).with_resources(endpoint)

@@ -1622,7 +2310,9 @@ class GoogleAdsSource:
     def handles_incrementality(self) -> bool:
         return True

-    def init_client(self, params: Dict[str, List[str]])
+    def init_client(self, params: Dict[str, List[str]]):
+        from google.ads.googleads.client import GoogleAdsClient  # type: ignore
+
         dev_token = params.get("dev_token")
         if dev_token is None or len(dev_token) == 0:
             raise MissingValueError("dev_token", "Google Ads")
@@ -1676,6 +2366,7 @@ class GoogleAdsSource:
            raise MissingValueError("customer_id", "Google Ads")

         params = parse_qs(parsed_uri.query)
+
         client = self.init_client(params)

         start_date = kwargs.get("interval_start") or datetime.now(
@@ -1697,6 +2388,8 @@ class GoogleAdsSource:
             report_spec = table
             table = "daily_report"

+        from ingestr.src.google_ads import google_ads
+
         src = google_ads(
             client,
             customer_id,
@@ -1716,6 +2409,11 @@ class LinkedInAdsSource:
         return True

     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "LinkedIn Ads takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         parsed_uri = urlparse(uri)
         source_fields = parse_qs(parsed_uri.query)

@@ -1761,6 +2459,12 @@ class LinkedInAdsSource:
                 "'date' or 'month' is required to connect to LinkedIn Ads, please provide at least one of these dimensions."
             )

+        from ingestr.src.linkedin_ads import linked_in_ads_source
+        from ingestr.src.linkedin_ads.dimension_time_enum import (
+            Dimension,
+            TimeGranularity,
+        )
+
         if "date" in dimensions:
             time_granularity = TimeGranularity.daily
             dimensions.remove("date")
@@ -1788,6 +2492,46 @@ class LinkedInAdsSource:
         ).with_resources("custom_reports")


+class ClickupSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "ClickUp takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_token = params.get("api_token")
+
+        if api_token is None:
+            raise MissingValueError("api_token", "ClickUp")
+
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+        start_date = (
+            ensure_pendulum_datetime(interval_start).in_timezone("UTC")
+            if interval_start
+            else pendulum.datetime(2020, 1, 1, tz="UTC")
+        )
+        end_date = (
+            ensure_pendulum_datetime(interval_end).in_timezone("UTC")
+            if interval_end
+            else None
+        )
+
+        from ingestr.src.clickup import clickup_source
+
+        if table not in {"user", "teams", "lists", "tasks", "spaces"}:
+            raise UnsupportedResourceError(table, "ClickUp")
+
+        return clickup_source(
+            api_token=api_token[0], start_date=start_date, end_date=end_date
+        ).with_resources(table)
+
+
 class AppLovinSource:
     def handles_incrementality(self) -> bool:
         return True
@@ -1819,6 +2563,8 @@ class AppLovinSource:
             custom_report = table
             table = "custom_report"

+        from ingestr.src.applovin import applovin_source
+
         src = applovin_source(
             api_key[0],
             start_date.strftime("%Y-%m-%d"),
@@ -1833,20 +2579,25 @@ class AppLovinSource:


 class ApplovinMaxSource:
-    #expected uri format: applovinmax://?api_key=<api_key>
-    #expected table format: user_ad_revenue:app_id_1,app_id_2
+    # expected uri format: applovinmax://?api_key=<api_key>
+    # expected table format: user_ad_revenue:app_id_1,app_id_2

     def handles_incrementality(self) -> bool:
         return True

     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "AppLovin Max takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         parsed_uri = urlparse(uri)
         params = parse_qs(parsed_uri.query)

         api_key = params.get("api_key")
         if api_key is None:
             raise ValueError("api_key is required to connect to AppLovin Max API.")
-
+
         AVAILABLE_TABLES = ["user_ad_revenue"]

         table_fields = table.split(":")
@@ -1856,7 +2607,7 @@ class ApplovinMaxSource:
             raise ValueError(
                 "Invalid table format. Expected format is user_ad_revenue:app_id_1,app_id_2"
             )
-
+
         if requested_table not in AVAILABLE_TABLES:
             raise ValueError(
                 f"Table name '{requested_table}' is not supported for AppLovin Max source yet."
@@ -1864,17 +2615,15 @@ class ApplovinMaxSource:
                 "If you need additional tables, please create a GitHub issue at "
                 "https://github.com/bruin-data/ingestr"
             )
-
-        applications = [
+
+        applications = [
+            i for i in table_fields[1].replace(" ", "").split(",") if i.strip()
+        ]
         if len(applications) == 0:
-            raise ValueError(
-
-            )
-
+            raise ValueError("At least one application id is required")
+
         if len(applications) != len(set(applications)):
-            raise ValueError(
-                "Application ids must be unique."
-            )
+            raise ValueError("Application ids must be unique.")

         interval_start = kwargs.get("interval_start")
         interval_end = kwargs.get("interval_end")
@@ -1888,6 +2637,8 @@ class ApplovinMaxSource:

         end_date = interval_end.date() if interval_end is not None else None

+        from ingestr.src.applovin_max import applovin_max_source
+
         return applovin_max_source(
             start_date=start_date,
             end_date=end_date,
@@ -1911,13 +2662,21 @@ class SalesforceSource:
             "username": params.get("username", [None])[0],
             "password": params.get("password", [None])[0],
             "token": params.get("token", [None])[0],
+            "domain": params.get("domain", [None])[0],
         }
         for k, v in creds.items():
             if v is None:
                 raise MissingValueError(k, "Salesforce")

+        from ingestr.src.salesforce import salesforce_source
+
         src = salesforce_source(**creds)  # type: ignore

+        if table.startswith("custom:"):
+            custom_object = table.split(":")[1]
+            src = salesforce_source(**creds, custom_object=custom_object)
+            return src.with_resources("custom")
+
         if table not in src.resources:
             raise UnsupportedResourceError(table, "Salesforce")

@@ -1930,6 +2689,11 @@ class PersonioSource:

     # applovin://?client_id=123&client_secret=123
     def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Personio takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
         parsed_uri = urlparse(uri)
         params = parse_qs(parsed_uri.query)

@@ -1963,9 +2727,1933 @@ class PersonioSource:
         ]:
             raise UnsupportedResourceError(table, "Personio")

+        from ingestr.src.personio import personio_source
+
         return personio_source(
             client_id=client_id[0],
             client_secret=client_secret[0],
             start_date=interval_start_date,
             end_date=interval_end_date,
         ).with_resources(table)
+
+
+class KinesisSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        # kinesis://?aws_access_key_id=<AccessKeyId>&aws_secret_access_key=<SecretAccessKey>&region_name=<Region>
+        # source table = stream name
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        aws_access_key_id = params.get("aws_access_key_id")
+        if aws_access_key_id is None:
+            raise MissingValueError("aws_access_key_id", "Kinesis")
+
+        aws_secret_access_key = params.get("aws_secret_access_key")
+        if aws_secret_access_key is None:
+            raise MissingValueError("aws_secret_access_key", "Kinesis")
+
+        region_name = params.get("region_name")
+        if region_name is None:
+            raise MissingValueError("region_name", "Kinesis")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            # the resource will read all messages after this timestamp.
+            start_date = ensure_pendulum_datetime(start_date)
+
+        from dlt.common.configuration.specs import AwsCredentials
+
+        from ingestr.src.kinesis import kinesis_stream
+
+        credentials = AwsCredentials(
+            aws_access_key_id=aws_access_key_id[0],
+            aws_secret_access_key=aws_secret_access_key[0],
+            region_name=region_name[0],
+        )
+
+        return kinesis_stream(
+            stream_name=table, credentials=credentials, initial_at_timestamp=start_date
+        )
+
+
+class PipedriveSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Pipedrive takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_token")
+        if api_key is None:
+            raise MissingValueError("api_token", "Pipedrive")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.parse("2000-01-01")
+
+        if table not in [
+            "users",
+            "activities",
+            "persons",
+            "organizations",
+            "products",
+            "stages",
+            "deals",
+        ]:
+            raise UnsupportedResourceError(table, "Pipedrive")
+
+        from ingestr.src.pipedrive import pipedrive_source
+
+        return pipedrive_source(
+            pipedrive_api_key=api_key, since_timestamp=start_date
+        ).with_resources(table)
+
+
+class FrankfurterSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Frankfurter takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        from ingestr.src.frankfurter import frankfurter_source
+        from ingestr.src.frankfurter.helpers import validate_currency, validate_dates
+
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+        base_currency = source_params.get("base", [None])[0]
+
+        if not base_currency:
+            base_currency = "USD"
+
+        validate_currency(base_currency)
+
+        if kwargs.get("interval_start"):
+            start_date = ensure_pendulum_datetime(str(kwargs.get("interval_start")))
+        else:
+            start_date = pendulum.yesterday()
+
+        if kwargs.get("interval_end"):
+            end_date = ensure_pendulum_datetime(str(kwargs.get("interval_end")))
+        else:
+            end_date = None
+
+        validate_dates(start_date=start_date, end_date=end_date)
+
+        src = frankfurter_source(
+            start_date=start_date,
+            end_date=end_date,
+            base_currency=base_currency,
+        )
+
+        if table not in src.resources:
+            raise UnsupportedResourceError(table, "Frankfurter")
+
+        return src.with_resources(table)
+
+
+class FreshdeskSource:
+    # freshdesk://domain?api_key=<api_key>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Freshdesk takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        domain = parsed_uri.netloc
+        query = parsed_uri.query
+        params = parse_qs(query)
+
+        if not domain:
+            raise MissingValueError("domain", "Freshdesk")
+
+        if "." in domain:
+            domain = domain.split(".")[0]
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Freshdesk")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+        else:
+            end_date = None
+
+        custom_query: Optional[str] = None
+        if ":" in table:
+            table, custom_query = table.split(":", 1)
+
+        if table not in [
+            "agents",
+            "companies",
+            "contacts",
+            "groups",
+            "roles",
+            "tickets",
+        ]:
+            raise UnsupportedResourceError(table, "Freshdesk")
+
+        if custom_query and table != "tickets":
+            raise ValueError(f"Custom query is not supported for {table}")
+
+        from ingestr.src.freshdesk import freshdesk_source
+
+        return freshdesk_source(
+            api_secret_key=api_key[0],
+            domain=domain,
+            start_date=start_date,
+            end_date=end_date,
+            query=custom_query,
+        ).with_resources(table)
+
+
+class TrustpilotSource:
+    # trustpilot://<business_unit_id>?api_key=<api_key>
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Trustpilot takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        business_unit_id = parsed_uri.netloc
+        params = parse_qs(parsed_uri.query)
+
+        if not business_unit_id:
+            raise MissingValueError("business_unit_id", "Trustpilot")
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Trustpilot")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2000-01-01").in_tz("UTC").isoformat()
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC").isoformat()
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC").isoformat()
+
+        if table not in ["reviews"]:
+            raise UnsupportedResourceError(table, "Trustpilot")
+
+        from ingestr.src.trustpilot import trustpilot_source
+
+        return trustpilot_source(
+            business_unit_id=business_unit_id,
+            api_key=api_key[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
+class PhantombusterSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Phantombuster takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        # phantombuster://?api_key=<api_key>
+        # source table = phantom_results:agent_id
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Phantombuster")
+
+        table_fields = table.replace(" ", "").split(":")
+        table_name = table_fields[0]
+
+        agent_id = table_fields[1] if len(table_fields) > 1 else None
+
+        if table_name not in ["completed_phantoms"]:
+            raise UnsupportedResourceError(table_name, "Phantombuster")
+
+        if not agent_id:
+            raise MissingValueError("agent_id", "Phantombuster")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2018-01-01").in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.phantombuster import phantombuster_source
+
+        return phantombuster_source(
+            api_key=api_key[0],
+            agent_id=agent_id,
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table_name)
+
+
+class ElasticsearchSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        from ingestr.src.elasticsearch import elasticsearch_source
+
+        incremental = None
+        if kwargs.get("incremental_key"):
+            start_value = kwargs.get("interval_start")
+            end_value = kwargs.get("interval_end")
+
+            incremental = dlt_incremental(
+                kwargs.get("incremental_key", ""),
+                initial_value=start_value,
+                end_value=end_value,
+                range_end="closed",
+                range_start="closed",
+            )
+
+        # elasticsearch://localhost:9200?secure=true&verify_certs=false
+        parsed = urlparse(uri)
+
+        index = table
+        if not index:
+            raise ValueError(
+                "Table name must be provided which is the index name in elasticsearch"
+            )
+
+        query_params = parsed.query
+        params = parse_qs(query_params)
+
+        secure = True
+        if "secure" in params:
+            secure = params["secure"][0].capitalize() == "True"
+
+        verify_certs = True
+        if "verify_certs" in params:
+            verify_certs = params["verify_certs"][0].capitalize() == "True"
+
+        scheme = "https" if secure else "http"
+        netloc = parsed.netloc
+        connection_url = f"{scheme}://{netloc}"
+
+        return elasticsearch_source(
+            connection_url=connection_url,
+            index=index,
+            verify_certs=verify_certs,
+            incremental=incremental,
+        ).with_resources(table)
+
+
+class AttioSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        query_params = parse_qs(parsed_uri.query)
+        api_key = query_params.get("api_key")
+
+        if api_key is None:
+            raise MissingValueError("api_key", "Attio")
+
+        parts = table.replace(" ", "").split(":")
+        table_name = parts[0]
+        params = parts[1:]
+
+        from ingestr.src.attio import attio_source
+
+        try:
+            return attio_source(api_key=api_key[0], params=params).with_resources(
+                table_name
+            )
+        except ResourcesNotFoundError:
+            raise UnsupportedResourceError(table_name, "Attio")
+
+
+class SmartsheetSource:
+    def handles_incrementality(self) -> bool:
+        return False
+
+    # smartsheet://?access_token=<access_token>
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError("Incremental loads are not supported for Smartsheet")
+
+        if not table:
+            raise ValueError(
+                "Source table (sheet_id) is required to connect to Smartsheet"
+            )
+
+        source_parts = urlparse(uri)
+        source_fields = parse_qs(source_parts.query)
+        access_token = source_fields.get("access_token")
+
+        if not access_token:
+            raise ValueError(
+                "access_token in the URI is required to connect to Smartsheet"
+            )
+
+        from ingestr.src.smartsheets import smartsheet_source
+
+        return smartsheet_source(
+            access_token=access_token[0],
+            sheet_id=table,  # table is now a single sheet_id
+        )
+
+
+class SolidgateSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Solidgate takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        query_params = parse_qs(parsed_uri.query)
+        public_key = query_params.get("public_key")
+        secret_key = query_params.get("secret_key")
+
+        if public_key is None:
+            raise MissingValueError("public_key", "Solidgate")
+
+        if secret_key is None:
+            raise MissingValueError("secret_key", "Solidgate")
+
+        table_name = table.replace(" ", "")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = pendulum.yesterday().in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.solidgate import solidgate_source
+
+        try:
+            return solidgate_source(
+                public_key=public_key[0],
+                secret_key=secret_key[0],
+                start_date=start_date,
+                end_date=end_date,
+            ).with_resources(table_name)
+        except ResourcesNotFoundError:
+            raise UnsupportedResourceError(table_name, "Solidgate")
+
+
+class SFTPSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        parsed_uri = urlparse(uri)
+        host = parsed_uri.hostname
+        if not host:
+            raise MissingValueError("host", "SFTP URI")
+        port = parsed_uri.port or 22
+        username = parsed_uri.username
+        password = parsed_uri.password
+
+        params: Dict[str, Any] = {
+            "host": host,
+            "port": port,
+            "username": username,
+            "password": password,
+            "look_for_keys": False,
+            "allow_agent": False,
+        }
+
+        try:
+            fs = fsspec.filesystem("sftp", **params)
+        except Exception as e:
+            raise ConnectionError(
+                f"Failed to connect or authenticate to sftp server {host}:{port}. Error: {e}"
+            )
+        bucket_url = f"sftp://{host}:{port}"
+
+        if table.startswith("/"):
+            file_glob = table
+        else:
+            file_glob = f"/{table}"
+
+        try:
+            endpoint = blob.parse_endpoint(table)
+        except blob.UnsupportedEndpointError:
+            raise ValueError(
+                "SFTP Source only supports specific formats files: csv, jsonl, parquet"
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to parse endpoint from path: {table}") from e
+
+        from ingestr.src.filesystem import readers
+
+        dlt_source_resource = readers(bucket_url, fs, file_glob)
+        return dlt_source_resource.with_resources(endpoint)
+
+
+class QuickBooksSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    # quickbooks://?company_id=<company_id>&client_id=<client_id>&client_secret=<client_secret>&refresh_token=<refresh>&access_token=<access_token>&environment=<env>&minor_version=<version>
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "QuickBooks takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+
+        params = parse_qs(parsed_uri.query)
+        company_id = params.get("company_id")
+        client_id = params.get("client_id")
+        client_secret = params.get("client_secret")
+        refresh_token = params.get("refresh_token")
+        environment = params.get("environment", ["production"])
+        minor_version = params.get("minor_version", [None])
+
+        if not client_id or not client_id[0].strip():
+            raise MissingValueError("client_id", "QuickBooks")
+
+        if not client_secret or not client_secret[0].strip():
+            raise MissingValueError("client_secret", "QuickBooks")
+
+        if not refresh_token or not refresh_token[0].strip():
+            raise MissingValueError("refresh_token", "QuickBooks")
+
+        if not company_id or not company_id[0].strip():
+            raise MissingValueError("company_id", "QuickBooks")
+
+        if environment[0] not in ["production", "sandbox"]:
+            raise ValueError(
+                "Invalid environment. Must be either 'production' or 'sandbox'."
+            )
+
+        from ingestr.src.quickbooks import quickbooks_source
+
+        table_name = table.replace(" ", "")
+        table_mapping = {
+            "customers": "customer",
+            "invoices": "invoice",
+            "accounts": "account",
+            "vendors": "vendor",
+            "payments": "payment",
+        }
+        if table_name in table_mapping:
+            table_name = table_mapping[table_name]
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = ensure_pendulum_datetime("2025-01-01").in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        return quickbooks_source(
+            company_id=company_id[0],
+            start_date=start_date,
+            end_date=end_date,
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            refresh_token=refresh_token[0],
+            environment=environment[0],
+            minor_version=minor_version[0],
+            object=table_name,
+        ).with_resources(table_name)
+
+
+class IsocPulseSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Internet Society Pulse takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        token = params.get("token")
+        if not token or not token[0].strip():
+            raise MissingValueError("token", "Internet Society Pulse")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is None:
+            start_date = pendulum.now().in_tz("UTC").subtract(days=30)
+
+        end_date = kwargs.get("interval_end")
+
+        metric = table
+        opts = []
+        if ":" in metric:
+            metric, *opts = metric.strip().split(":")
+            opts = [opt.strip() for opt in opts]
+
+        from ingestr.src.isoc_pulse import pulse_source
+
+        src = pulse_source(
+            token=token[0],
+            start_date=start_date.strftime("%Y-%m-%d"),
+            end_date=end_date.strftime("%Y-%m-%d") if end_date else None,
+            metric=metric,
+            opts=opts,
+        )
+        return src.with_resources(metric)
+
+
+class PinterestSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Pinterest takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed = urlparse(uri)
+        params = parse_qs(parsed.query)
+        access_token = params.get("access_token")
+
+        if not access_token:
+            raise MissingValueError("access_token", "Pinterest")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.pinterest import pinterest_source
+
+        if table not in {"pins", "boards"}:
+            raise UnsupportedResourceError(table, "Pinterest")
+
+        return pinterest_source(
+            access_token=access_token[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
+class FluxxSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Fluxx takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        # Parse URI: fluxx://instance?client_id=xxx&client_secret=xxx
+        parsed_uri = urlparse(uri)
+        source_params = parse_qs(parsed_uri.query)
+
+        instance = parsed_uri.hostname
+        if not instance:
+            raise ValueError(
+                "Instance is required in the URI (e.g., fluxx://mycompany.preprod)"
+            )
+
+        client_id = source_params.get("client_id")
+        if not client_id:
+            raise ValueError("client_id in the URI is required to connect to Fluxx")
+
+        client_secret = source_params.get("client_secret")
+        if not client_secret:
+            raise ValueError("client_secret in the URI is required to connect to Fluxx")
+
+        # Parse date parameters
+        start_date = kwargs.get("interval_start")
+        if start_date:
+            start_date = ensure_pendulum_datetime(start_date)
+
+        end_date = kwargs.get("interval_end")
+        if end_date:
+            end_date = ensure_pendulum_datetime(end_date)
+
+        # Import Fluxx source
+        from ingestr.src.fluxx import fluxx_source
+
+        # Parse table specification for custom column selection
+        # Format: "resource_name:field1,field2,field3" or "resource_name"
+        resources = None
+        custom_fields = {}
+
+        if table:
+            # Handle single resource with custom fields or multiple resources
+            if ":" in table and table.count(":") == 1:
+                # Single resource with custom fields: "grant_request:id,name,amount"
+                resource_name, field_list = table.split(":", 1)
+                resource_name = resource_name.strip()
+                fields = [f.strip() for f in field_list.split(",")]
+                resources = [resource_name]
+                custom_fields[resource_name] = fields
+            else:
+                # Multiple resources or single resource without custom fields
+                # Support comma-separated list: "grant_request,user"
+                resources = [r.strip() for r in table.split(",")]
+
+        return fluxx_source(
+            instance=instance,
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            start_date=start_date,
+            end_date=end_date,
+            resources=resources,
+            custom_fields=custom_fields,
+        )
+
+
+class LinearSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Linear takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "Linear")
+
+        if table not in [
+            "issues",
+            "projects",
+            "teams",
+            "users",
+            "workflow_states",
+            "cycles",
+            "attachments",
+            "comments",
+            "documents",
+            "external_users",
+            "initiative",
+            "integrations",
+            "labels",
+            "organization",
+            "project_updates",
+            "team_memberships",
+            "initiative_to_project",
+            "project_milestone",
+            "project_status",
+        ]:
+            raise UnsupportedResourceError(table, "Linear")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.linear import linear_source
+
+        return linear_source(
+            api_key=api_key[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
+class RevenueCatSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "RevenueCat takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed_uri = urlparse(uri)
+        params = parse_qs(parsed_uri.query)
+
+        api_key = params.get("api_key")
+        if api_key is None:
+            raise MissingValueError("api_key", "RevenueCat")
+
+        project_id = params.get("project_id")
+        if project_id is None and table != "projects":
+            raise MissingValueError("project_id", "RevenueCat")
+
+        if table not in [
+            "customers",
+            "products",
+            "entitlements",
+            "offerings",
+            "subscriptions",
+            "purchases",
+            "projects",
+        ]:
+            raise UnsupportedResourceError(table, "RevenueCat")
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 1).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.revenuecat import revenuecat_source
+
+        return revenuecat_source(
+            api_key=api_key[0],
+            project_id=project_id[0] if project_id is not None else None,
+        ).with_resources(table)
+
+
+class ZoomSource:
+    def handles_incrementality(self) -> bool:
+        return True
+
+    def dlt_source(self, uri: str, table: str, **kwargs):
+        if kwargs.get("incremental_key"):
+            raise ValueError(
+                "Zoom takes care of incrementality on its own, you should not provide incremental_key"
+            )
+
+        parsed = urlparse(uri)
+        params = parse_qs(parsed.query)
+        client_id = params.get("client_id")
+        client_secret = params.get("client_secret")
+        account_id = params.get("account_id")
+
+        if not (client_id and client_secret and account_id):
+            raise MissingValueError(
+                "client_id/client_secret/account_id",
+                "Zoom",
+            )
+
+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date)
+        else:
+            start_date = pendulum.datetime(2020, 1, 26).in_tz("UTC")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+
+        from ingestr.src.zoom import zoom_source
+
+        if table not in {"meetings", "users", "participants"}:
+            raise UnsupportedResourceError(table, "Zoom")
+
+        return zoom_source(
+            client_id=client_id[0],
+            client_secret=client_secret[0],
+            account_id=account_id[0],
+            start_date=start_date,
+            end_date=end_date,
+        ).with_resources(table)
+
+
class InfluxDBSource:
|
|
3614
|
+
def handles_incrementality(self) -> bool:
|
|
3615
|
+
return True
|
|
3616
|
+
|
|
3617
|
+
def dlt_source(self, uri: str, table: str, **kwargs):
|
|
3618
|
+
if kwargs.get("incremental_key"):
|
|
3619
|
+
raise ValueError(
|
|
3620
|
+
"InfluxDB takes care of incrementality on its own, you should not provide incremental_key"
|
|
3621
|
+
)
|
|
3622
|
+
|
|
3623
|
+
parsed_uri = urlparse(uri)
|
|
3624
|
+
params = parse_qs(parsed_uri.query)
|
|
3625
|
+
host = parsed_uri.hostname
|
|
3626
|
+
port = parsed_uri.port
|
|
3627
|
+
|
|
3628
|
+
secure = params.get("secure", ["true"])[0].lower() != "false"
|
|
3629
|
+
scheme = "https" if secure else "http"
|
|
3630
|
+
|
|
3631
|
+
if port:
|
|
3632
|
+
host_url = f"{scheme}://{host}:{port}"
|
|
3633
|
+
else:
|
|
3634
|
+
host_url = f"{scheme}://{host}"
|
|
3635
|
+
|
|
3636
|
+
token = params.get("token")
|
|
3637
|
+
org = params.get("org")
|
|
3638
|
+
bucket = params.get("bucket")
|
|
3639
|
+
|
|
3640
|
+
if not host:
|
|
3641
|
+
raise MissingValueError("host", "InfluxDB")
|
|
3642
|
+
if not token:
|
|
3643
|
+
raise MissingValueError("token", "InfluxDB")
|
|
3644
|
+
if not org:
|
|
3645
|
+
raise MissingValueError("org", "InfluxDB")
|
|
3646
|
+
if not bucket:
|
|
3647
|
+
raise MissingValueError("bucket", "InfluxDB")
|
|
3648
|
+
|
|
3649
|
+
start_date = kwargs.get("interval_start")
|
|
3650
|
+
if start_date is not None:
|
|
3651
|
+
start_date = ensure_pendulum_datetime(start_date)
|
|
3652
|
+
else:
|
|
3653
|
+
start_date = pendulum.datetime(2024, 1, 1).in_tz("UTC")
|
|
3654
|
+
|
|
3655
|
+
end_date = kwargs.get("interval_end")
|
|
3656
|
+
if end_date is not None:
|
|
3657
|
+
end_date = ensure_pendulum_datetime(end_date)
|
|
3658
|
+
|
|
3659
|
+
from ingestr.src.influxdb import influxdb_source
|
|
3660
|
+
|
|
3661
|
+
return influxdb_source(
|
|
3662
|
+
measurement=table,
|
|
3663
|
+
host=host_url,
|
|
3664
|
+
org=org[0],
|
|
3665
|
+
bucket=bucket[0],
|
|
3666
|
+
token=token[0],
|
|
3667
|
+
secure=secure,
|
|
3668
|
+
start_date=start_date,
|
|
3669
|
+
end_date=end_date,
|
|
3670
|
+
).with_resources(table)
|
|
3671
|
+
|
|
3672
|
+
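# Illustrative sketch (not part of the original diff): how the secure flag and
# host URL are derived above; the host name is a placeholder.
#
#   parse_qs("secure=false")["secure"][0].lower() != "false"   # -> False -> scheme "http"
#   # secure defaults to ["true"] when the parameter is absent   -> scheme "https"
#   # host_url becomes "https://influx.example.com:8086" when a port is present,
#   # otherwise "https://influx.example.com".
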
class WiseSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed = urlparse(uri)
        params = parse_qs(parsed.query)
        api_key = params.get("api_key")

        if not api_key:
            raise MissingValueError("api_key", "Wise")

        if table not in ["profiles", "transfers", "balances"]:
            raise ValueError(
                f"Resource '{table}' is not supported for Wise source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
            )

        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
        else:
            start_date = pendulum.datetime(2020, 1, 1).in_timezone("UTC")

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")
        else:
            end_date = None

        from ingestr.src.wise import wise_source

        return wise_source(
            api_key=api_key[0],
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

class FundraiseupSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Fundraiseup")

        from ingestr.src.fundraiseup import fundraiseup_source

        src = fundraiseup_source(api_key=api_key[0])
        if table not in src.resources:
            raise UnsupportedResourceError(table, "Fundraiseup")
        return src.with_resources(table)

class AnthropicSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # anthropic://?api_key=<admin_api_key>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        if api_key is None:
            raise MissingValueError("api_key", "Anthropic")

        if table not in [
            "claude_code_usage",
            "usage_report",
            "cost_report",
            "organization",
            "workspaces",
            "api_keys",
            "invites",
            "users",
            "workspace_members",
        ]:
            raise UnsupportedResourceError(table, "Anthropic")

        # Get start and end dates from kwargs
        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            # Default to 2023-01-01
            start_date = pendulum.datetime(2023, 1, 1)

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date)
        else:
            end_date = None

        from ingestr.src.anthropic import anthropic_source

        return anthropic_source(
            api_key=api_key[0],
            initial_start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

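# Illustrative sketch (not part of the original diff): a hypothetical Anthropic
# admin-key URI; only the tables listed in the validation above are accepted.
#
#   src = AnthropicSource().dlt_source(
#       "anthropic://?api_key=sk-ant-admin-placeholder",
#       "usage_report",
#   )
#   # interval_start defaults to 2023-01-01 when not provided.
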
class PlusVibeAISource:
    resources = [
        "campaigns",
        "leads",
        "email_accounts",
        "emails",
        "blocklist",
        "webhooks",
        "tags",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # plusvibeai://?api_key=<key>&workspace_id=<id>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")
        workspace_id = params.get("workspace_id")

        if not api_key:
            raise MissingValueError("api_key", "PlusVibeAI")

        if not workspace_id:
            raise MissingValueError("workspace_id", "PlusVibeAI")

        if table not in self.resources:
            raise UnsupportedResourceError(table, "PlusVibeAI")

        import dlt

        from ingestr.src.plusvibeai import plusvibeai_source

        dlt.secrets["sources.plusvibeai.api_key"] = api_key[0]
        dlt.secrets["sources.plusvibeai.workspace_id"] = workspace_id[0]

        # Handle custom base URL if provided
        base_url = params.get("base_url", ["https://api.plusvibe.ai"])[0]
        dlt.secrets["sources.plusvibeai.base_url"] = base_url

        src = plusvibeai_source()
        return src.with_resources(table)

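# Illustrative sketch (not part of the original diff): the class above passes
# configuration to the source through dlt's secret store rather than function
# arguments; the values below are placeholders.
#
#   dlt.secrets["sources.plusvibeai.api_key"] = "pv_key"
#   dlt.secrets["sources.plusvibeai.workspace_id"] = "ws_1"
#   dlt.secrets["sources.plusvibeai.base_url"] = "https://api.plusvibe.ai"  # default
#   src = plusvibeai_source().with_resources("campaigns")
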
class IntercomSource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # intercom://?access_token=<token>&region=<us|eu|au>
        # OR intercom://?oauth_token=<token>&region=<us|eu|au>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        # Check for authentication
        access_token = params.get("access_token")
        oauth_token = params.get("oauth_token")
        region = params.get("region", ["us"])[0]

        if not access_token and not oauth_token:
            raise MissingValueError("access_token or oauth_token", "Intercom")

        # Validate table/resource
        supported_tables = [
            "contacts",
            "companies",
            "conversations",
            "tickets",
            "tags",
            "segments",
            "teams",
            "admins",
            "articles",
            "data_attributes",
        ]

        if table not in supported_tables:
            raise UnsupportedResourceError(table, "Intercom")

        # Get date parameters
        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date)
        else:
            start_date = pendulum.datetime(2020, 1, 1)

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date)

        # Import and initialize the source
        from ingestr.src.intercom import (
            IntercomCredentialsAccessToken,
            IntercomCredentialsOAuth,
            TIntercomCredentials,
            intercom_source,
        )

        credentials: TIntercomCredentials
        if access_token:
            credentials = IntercomCredentialsAccessToken(
                access_token=access_token[0], region=region
            )
        else:
            if not oauth_token:
                raise MissingValueError("oauth_token", "Intercom")
            credentials = IntercomCredentialsOAuth(
                oauth_token=oauth_token[0], region=region
            )

        return intercom_source(
            credentials=credentials,
            start_date=start_date,
            end_date=end_date,
        ).with_resources(table)

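# Illustrative sketch (not part of the original diff): credential selection above
# prefers access_token over oauth_token; the token value is a placeholder.
#
#   src = IntercomSource().dlt_source(
#       "intercom://?access_token=tok_123&region=eu",
#       "conversations",
#   )
#   # region defaults to "us" when the query parameter is absent.
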
class HttpSource:
    """Source for reading CSV, JSON, and Parquet files from HTTP URLs"""

    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Create a dlt source for reading files from HTTP URLs.

        URI format: http://example.com/file.csv or https://example.com/file.json

        Args:
            uri: HTTP(S) URL to the file
            table: Not used for HTTP source (files are read directly)
            **kwargs: Additional arguments:
                - file_format: Optional file format override ('csv', 'json', 'parquet')
                - chunksize: Number of records to process at once (default varies by format)
                - merge_key: Merge key for the resource

        Returns:
            DltResource for the HTTP file
        """
        from ingestr.src.http import http_source

        # Extract the actual URL (remove the http:// or https:// scheme if duplicated)
        url = uri
        if uri.startswith("http://http://") or uri.startswith("https://https://"):
            url = uri.split("://", 1)[1]

        file_format = kwargs.get("file_format")
        chunksize = kwargs.get("chunksize")
        merge_key = kwargs.get("merge_key")

        reader_kwargs = {}
        if chunksize is not None:
            reader_kwargs["chunksize"] = chunksize

        source = http_source(url=url, file_format=file_format, **reader_kwargs)

        if merge_key:
            source.apply_hints(merge_key=merge_key)

        return source

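# Illustrative sketch (not part of the original diff): the duplicated-scheme
# cleanup above, shown on a hypothetical URL.
#
#   uri = "https://https://example.com/data.csv"
#   uri.split("://", 1)[1]   # -> "https://example.com/data.csv"
#   # A normal URL such as "https://example.com/data.csv" passes through unchanged.
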
class MondaySource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_token = query_params.get("api_token")

        if api_token is None:
            raise MissingValueError("api_token", "Monday")

        parts = table.replace(" ", "").split(":")
        table_name = parts[0]
        params = parts[1:]

        # Get interval_start and interval_end from kwargs (command line args)
        interval_start = kwargs.get("interval_start")
        interval_end = kwargs.get("interval_end")

        # Convert datetime to string format YYYY-MM-DD
        start_date = interval_start.strftime("%Y-%m-%d") if interval_start else None
        end_date = interval_end.strftime("%Y-%m-%d") if interval_end else None

        from ingestr.src.monday import monday_source

        try:
            return monday_source(
                api_token=api_token[0],
                params=params,
                start_date=start_date,
                end_date=end_date,
            ).with_resources(table_name)
        except ResourcesNotFoundError:
            raise UnsupportedResourceError(table_name, "Monday")

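# Illustrative sketch (not part of the original diff): how the table string is
# split above; "boards" and the trailing values are hypothetical.
#
#   "boards: 123, 456".replace(" ", "").split(":")   # -> ["boards", "123,456"]
#   # table_name -> "boards", params -> ["123,456"]
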
class MailchimpSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_key = query_params.get("api_key")
        server = query_params.get("server")

        if api_key is None:
            raise MissingValueError("api_key", "Mailchimp")
        if server is None:
            raise MissingValueError("server", "Mailchimp")

        from ingestr.src.mailchimp import mailchimp_source

        try:
            return mailchimp_source(
                api_key=api_key[0],
                server=server[0],
            ).with_resources(table)
        except ResourcesNotFoundError:
            raise UnsupportedResourceError(table, "Mailchimp")

class AlliumSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        query_params = parse_qs(parsed_uri.query)
        api_key = query_params.get("api_key")

        if api_key is None:
            raise MissingValueError("api_key", "Allium")

        # Extract query_id and custom parameters from table parameter
        # Format: query_id or query:query_id or query:query_id:param1=value1&param2=value2
        query_id = table
        custom_params = {}
        limit = None
        compute_profile = None

        if ":" in table:
            parts = table.split(":", 2)  # Split into max 3 parts
            if len(parts) >= 2:
                query_id = parts[1]
            if len(parts) == 3:
                # Parse custom parameters from query string format
                param_string = parts[2]
                for param in param_string.split("&"):
                    if "=" in param:
                        key, value = param.split("=", 1)
                        # Extract run_config parameters
                        if key == "limit":
                            limit = int(value)
                        elif key == "compute_profile":
                            compute_profile = value
                        else:
                            custom_params[key] = value

        # Extract parameters from interval_start and interval_end
        # Default: 2 days ago 00:00 to yesterday 00:00
        now = pendulum.now()
        default_start = now.subtract(days=2).start_of("day")
        default_end = now.subtract(days=1).start_of("day")

        parameters = {}
        interval_start = kwargs.get("interval_start")
        interval_end = kwargs.get("interval_end")

        start_date = interval_start if interval_start is not None else default_start
        end_date = interval_end if interval_end is not None else default_end

        parameters["start_date"] = start_date.strftime("%Y-%m-%d")
        parameters["end_date"] = end_date.strftime("%Y-%m-%d")
        parameters["start_timestamp"] = str(int(start_date.timestamp()))
        parameters["end_timestamp"] = str(int(end_date.timestamp()))

        # Merge custom parameters (they override default parameters)
        parameters.update(custom_params)

        from ingestr.src.allium import allium_source

        return allium_source(
            api_key=api_key[0],
            query_id=query_id,
            parameters=parameters if parameters else None,
            limit=limit,
            compute_profile=compute_profile,
        )

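# Illustrative sketch (not part of the original diff): the table-string parsing
# above, shown on a hypothetical value.
#
#   table = "query:abc123:limit=100&compute_profile=small&chain=ethereum"
#   # -> query_id "abc123", limit 100, compute_profile "small",
#   #    custom_params {"chain": "ethereum"}; start/end dates come from the
#   #    run's interval or default to the 2-days-ago / yesterday window.
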
class CouchbaseSource:
    table_builder: Callable

    def __init__(self, table_builder=None) -> None:
        if table_builder is None:
            from ingestr.src.couchbase_source import couchbase_collection

            table_builder = couchbase_collection

        self.table_builder = table_builder

    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Create a dlt source for reading data from Couchbase.

        URI formats:
        - couchbase://username:password@host
        - couchbase://username:password@host/bucket
        - couchbase://username:password@host?ssl=true
        - couchbases://username:password@host (SSL enabled)

        Table formats:
        - bucket.scope.collection (when bucket not in URI)
        - scope.collection (when bucket specified in URI path)

        Note: If password contains special characters (@, :, /, etc.), they must be URL-encoded.

        Examples:
            Local/Self-hosted:
            - couchbase://admin:password123@localhost with table "mybucket.myscope.mycollection"
            - couchbase://admin:password123@localhost/mybucket with table "myscope.mycollection"
            - couchbase://admin:password123@localhost?ssl=true with table "mybucket._default._default"

            Capella (Cloud):
            - couchbases://user:pass@cb.xxx.cloud.couchbase.com with table "travel-sample.inventory.airport"
            - couchbase://user:pass@cb.xxx.cloud.couchbase.com/travel-sample?ssl=true with table "inventory.airport"

            To encode password in Python:
                from urllib.parse import quote
                encoded_pwd = quote("MyPass@123!", safe='')
                uri = f"couchbase://admin:{encoded_pwd}@localhost?ssl=true"

        Args:
            uri: Couchbase connection URI (can include /bucket path and ?ssl=true query parameter)
            table: Format depends on URI:
                - bucket.scope.collection (if bucket not in URI)
                - scope.collection (if bucket in URI path)
            **kwargs: Additional arguments:
                - limit: Maximum number of documents to fetch
                - incremental_key: Field to use for incremental loading
                - interval_start: Start value for incremental loading
                - interval_end: End value for incremental loading

        Returns:
            DltResource for the Couchbase collection
        """
        # Parse the URI to extract connection details
        # urlparse automatically decodes URL-encoded credentials

        parsed = urlparse(uri)

        # Extract username and password from URI
        # Note: urlparse automatically decodes URL-encoded characters in username/password
        from urllib.parse import unquote

        username = parsed.username
        password = unquote(parsed.password) if parsed.password else None

        if not username or not password:
            raise ValueError(
                "Username and password must be provided in the URI.\n"
                "Format: couchbase://username:password@host\n"
                "If password has special characters (@, :, /), URL-encode them.\n"
                "Example: couchbase://admin:MyPass%40123@localhost for password 'MyPass@123'"
            )

        # Reconstruct connection string without credentials
        scheme = parsed.scheme
        netloc = parsed.netloc

        # Remove username:password@ from netloc if present
        if "@" in netloc:
            netloc = netloc.split("@", 1)[1]

        # Parse query parameters from URI
        from urllib.parse import parse_qs

        query_params = parse_qs(parsed.query)

        # Check if SSL is requested via URI query parameter (?ssl=true)
        if "ssl" in query_params:
            ssl_value = query_params["ssl"][0].lower()
            use_ssl = ssl_value in ("true", "1", "yes")

            # Apply SSL scheme based on parameter
            if use_ssl and scheme == "couchbase":
                scheme = "couchbases"

        connection_string = f"{scheme}://{netloc}"

        # Extract bucket from URI path if present (e.g., couchbase://host/bucket)
        bucket_from_uri = None
        if parsed.path and parsed.path.strip("/"):
            bucket_from_uri = parsed.path.strip("/").split("/")[0]

        # Parse table format: can be "scope.collection" or "bucket.scope.collection"
        table_parts = table.split(".")

        if len(table_parts) == 3:
            # Format: bucket.scope.collection
            bucket, scope, collection = table_parts
        elif len(table_parts) == 2:
            # Format: scope.collection (bucket from URI)
            if bucket_from_uri:
                bucket = bucket_from_uri
                scope, collection = table_parts
            else:
                raise ValueError(
                    "Table format is 'scope.collection' but no bucket specified in URI.\n"
                    f"Either use URI format: couchbase://user:pass@host/bucket\n"
                    f"Or use table format: bucket.scope.collection\n"
                    f"Got table: {table}"
                )
        else:
            raise ValueError(
                "Table format must be 'bucket.scope.collection' or 'scope.collection' (with bucket in URI). "
                f"Got: {table}\n"
                "Examples:\n"
                "  - URI: couchbase://user:pass@host, Table: travel-sample.inventory.airport\n"
                "  - URI: couchbase://user:pass@host/travel-sample, Table: inventory.airport"
            )

        # Handle incremental loading
        incremental = None
        if kwargs.get("incremental_key"):
            start_value = kwargs.get("interval_start")
            end_value = kwargs.get("interval_end")

            incremental = dlt_incremental(
                kwargs.get("incremental_key", ""),
                initial_value=start_value,
                end_value=end_value,
                range_end="closed",
                range_start="closed",
            )

        # Get optional parameters
        limit = kwargs.get("limit")

        table_instance = self.table_builder(
            connection_string=connection_string,
            username=username,
            password=password,
            bucket=bucket,
            scope=scope,
            collection=collection,
            incremental=incremental,
            limit=limit,
        )
        table_instance.max_table_nesting = 1

        return table_instance

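# Illustrative sketch (not part of the original diff): a hypothetical Capella-style
# URI and table, matching the parsing rules documented above.
#
#   uri = "couchbases://user:pass@cb.example.cloud.couchbase.com"
#   table = "travel-sample.inventory.airport"
#   # -> connection_string "couchbases://cb.example.cloud.couchbase.com",
#   #    bucket "travel-sample", scope "inventory", collection "airport"
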
class CursorSource:
    resources = [
        "team_members",
        "daily_usage_data",
        "team_spend",
        "filtered_usage_events",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        # cursor://?api_key=<api_key>
        parsed_uri = urlparse(uri)
        params = parse_qs(parsed_uri.query)

        api_key = params.get("api_key")

        if not api_key:
            raise MissingValueError("api_key", "Cursor")

        if table not in self.resources:
            raise UnsupportedResourceError(table, "Cursor")

        import dlt

        from ingestr.src.cursor import cursor_source

        dlt.secrets["sources.cursor.api_key"] = api_key[0]

        # Handle interval_start and interval_end for daily_usage_data and filtered_usage_events (optional)
        if table in ["daily_usage_data", "filtered_usage_events"]:
            interval_start = kwargs.get("interval_start")
            interval_end = kwargs.get("interval_end")

            # Both are optional, but if one is provided, both should be provided
            if interval_start is not None and interval_end is not None:
                # Convert datetime to epoch milliseconds
                start_ms = int(interval_start.timestamp() * 1000)
                end_ms = int(interval_end.timestamp() * 1000)

                dlt.config["sources.cursor.start_date"] = start_ms
                dlt.config["sources.cursor.end_date"] = end_ms

        src = cursor_source()
        return src.with_resources(table)

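# Illustrative sketch (not part of the original diff): the epoch-millisecond
# conversion used above for daily_usage_data / filtered_usage_events.
#
#   import pendulum
#   int(pendulum.datetime(2024, 1, 1).timestamp() * 1000)   # -> 1704067200000
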
class SocrataSource:
    def handles_incrementality(self) -> bool:
        return False

    def dlt_source(self, uri: str, table: str, **kwargs):
        """
        Creates a DLT source for Socrata open data platform.

        URI format: socrata://domain?app_token=TOKEN
        Table: dataset_id (e.g., "6udu-fhnu")

        Args:
            uri: Socrata connection URI with domain and optional auth params
            table: Dataset ID (e.g., "6udu-fhnu")
            **kwargs: Additional arguments:
                - incremental_key: Field to use for incremental loading (e.g., ":updated_at")
                - interval_start: Start date for initial load
                - interval_end: End date for load
                - primary_key: Primary key field for merge operations

        Returns:
            DltResource for the Socrata dataset
        """
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(uri)

        domain = parsed.netloc
        if not domain:
            raise ValueError(
                "Domain must be provided in the URI.\n"
                "Format: socrata://domain?app_token=TOKEN\n"
                "Example: socrata://evergreen.data.socrata.com?app_token=mytoken"
            )

        query_params = parse_qs(parsed.query)

        dataset_id = table
        if not dataset_id:
            raise ValueError(
                "Dataset ID must be provided as the table parameter.\n"
                "Example: --source-table 6udu-fhnu"
            )

        app_token = query_params.get("app_token", [None])[0]
        username = query_params.get("username", [None])[0]
        password = query_params.get("password", [None])[0]

        incremental = None
        if kwargs.get("incremental_key"):
            start_value = kwargs.get("interval_start")
            end_value = kwargs.get("interval_end")

            if start_value:
                start_value = (
                    start_value.isoformat()
                    if hasattr(start_value, "isoformat")
                    else str(start_value)
                )

            if end_value:
                end_value = (
                    end_value.isoformat()
                    if hasattr(end_value, "isoformat")
                    else str(end_value)
                )

            incremental = dlt_incremental(
                kwargs.get("incremental_key", ""),
                initial_value=start_value,
                end_value=end_value,
                range_end="open",
                range_start="closed",
            )

        primary_key = kwargs.get("primary_key")

        from ingestr.src.socrata_source import source

        return source(
            domain=domain,
            dataset_id=dataset_id,
            app_token=app_token,
            username=username,
            password=password,
            incremental=incremental,
            primary_key=primary_key,
        ).with_resources("dataset")

class HostawaySource:
    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        if kwargs.get("incremental_key"):
            raise ValueError(
                "Hostaway takes care of incrementality on its own, you should not provide incremental_key"
            )

        source_parts = urlparse(uri)
        source_params = parse_qs(source_parts.query)
        api_key = source_params.get("api_key")

        if not api_key:
            raise ValueError("api_key in the URI is required to connect to Hostaway")

        match table:
            case "listings":
                resource_name = "listings"
            case "listing_fee_settings":
                resource_name = "listing_fee_settings"
            case "listing_agreements":
                resource_name = "listing_agreements"
            case "listing_pricing_settings":
                resource_name = "listing_pricing_settings"
            case "cancellation_policies":
                resource_name = "cancellation_policies"
            case "cancellation_policies_airbnb":
                resource_name = "cancellation_policies_airbnb"
            case "cancellation_policies_marriott":
                resource_name = "cancellation_policies_marriott"
            case "cancellation_policies_vrbo":
                resource_name = "cancellation_policies_vrbo"
            case "reservations":
                resource_name = "reservations"
            case "finance_fields":
                resource_name = "finance_fields"
            case "reservation_payment_methods":
                resource_name = "reservation_payment_methods"
            case "reservation_rental_agreements":
                resource_name = "reservation_rental_agreements"
            case "listing_calendars":
                resource_name = "listing_calendars"
            case "conversations":
                resource_name = "conversations"
            case "message_templates":
                resource_name = "message_templates"
            case "bed_types":
                resource_name = "bed_types"
            case "property_types":
                resource_name = "property_types"
            case "countries":
                resource_name = "countries"
            case "account_tax_settings":
                resource_name = "account_tax_settings"
            case "user_groups":
                resource_name = "user_groups"
            case "guest_payment_charges":
                resource_name = "guest_payment_charges"
            case "coupons":
                resource_name = "coupons"
            case "webhook_reservations":
                resource_name = "webhook_reservations"
            case "tasks":
                resource_name = "tasks"
            case _:
                raise ValueError(
                    f"Resource '{table}' is not supported for Hostaway source yet, if you are interested in it please create a GitHub issue at https://github.com/bruin-data/ingestr"
                )

        start_date = kwargs.get("interval_start")
        if start_date:
            start_date = ensure_pendulum_datetime(start_date).in_timezone("UTC")
        else:
            start_date = pendulum.datetime(1970, 1, 1).in_timezone("UTC")

        end_date = kwargs.get("interval_end")
        if end_date:
            end_date = ensure_pendulum_datetime(end_date).in_timezone("UTC")

        from ingestr.src.hostaway import hostaway_source

        return hostaway_source(
            api_key=api_key[0],
            start_date=start_date,
            end_date=end_date,
        ).with_resources(resource_name)

class SnapchatAdsSource:
    resources = [
        "organizations",
        "fundingsources",
        "billingcenters",
        "adaccounts",
        "invoices",
        "transactions",
        "members",
        "roles",
        "campaigns",
        "adsquads",
        "ads",
        "event_details",
        "creatives",
        "segments",
        "campaigns_stats",
        "ad_accounts_stats",
        "ads_stats",
        "ad_squads_stats",
    ]

    def handles_incrementality(self) -> bool:
        return True

    def dlt_source(self, uri: str, table: str, **kwargs):
        parsed_uri = urlparse(uri)
        source_fields = parse_qs(parsed_uri.query)

        refresh_token = source_fields.get("refresh_token")
        if not refresh_token:
            raise ValueError("refresh_token is required to connect to Snapchat Ads")

        client_id = source_fields.get("client_id")
        if not client_id:
            raise ValueError("client_id is required to connect to Snapchat Ads")

        client_secret = source_fields.get("client_secret")
        if not client_secret:
            raise ValueError("client_secret is required to connect to Snapchat Ads")

        organization_id = source_fields.get("organization_id")

        # Resources that support ad_account_id filtering
        ad_account_resources = [
            "invoices",
            "campaigns",
            "adsquads",
            "ads",
            "event_details",
            "creatives",
            "segments",
        ]

        # Stats resources
        stats_resources = [
            "campaigns_stats",
            "ad_accounts_stats",
            "ads_stats",
            "ad_squads_stats",
        ]

        # Parse table name
        stats_config = None
        ad_account_id = None

        if ":" in table:
            parts = table.split(":")
            resource_name = parts[0]

            if resource_name in stats_resources:
                # Stats table format:
                # resource_name:granularity:fields:options (all accounts)
                # resource_name:ad_account_id:granularity:fields:options (specific account)

                def parse_options(options_str: str) -> dict:
                    """Parse key=value,key=value options string."""
                    result = {}
                    for option in options_str.split(","):
                        if "=" in option:
                            key, value = option.split("=", 1)
                            result[key] = value
                    return result

                if len(parts) >= 2:
                    valid_granularities = ["TOTAL", "DAY", "HOUR", "LIFETIME"]

                    if parts[1].upper() in valid_granularities:
                        # Format: resource_name:granularity:fields:options
                        stats_config = {
                            "granularity": parts[1].upper(),
                            "fields": parts[2]
                            if len(parts) > 2
                            else "impressions,spend",
                        }
                        if len(parts) > 3:
                            stats_config.update(parse_options(parts[3]))
                    else:
                        # Format: resource_name:ad_account_id:granularity:fields:options
                        ad_account_id = parts[1]
                        stats_config = {
                            "granularity": parts[2].upper()
                            if len(parts) > 2
                            else "DAY",
                            "fields": parts[3]
                            if len(parts) > 3
                            else "impressions,spend",
                        }
                        if len(parts) > 4:
                            stats_config.update(parse_options(parts[4]))
                else:
                    # Just resource_name, use defaults
                    stats_config = {
                        "granularity": "DAY",
                        "fields": "impressions,spend",
                    }
            else:
                # Non-stats table with ad_account_id: resource_name:ad_account_id
                ad_account_id = parts[1] if len(parts) > 1 else None
                if not ad_account_id:
                    raise ValueError(
                        f"ad_account_id must be provided in format '{resource_name}:ad_account_id'"
                    )
        else:
            resource_name = table
            if resource_name in stats_resources:
                # Stats resource with default config
                stats_config = {
                    "granularity": "DAY",
                    "fields": "impressions,spend",
                }

        # Validation for non-stats resources
        if resource_name not in stats_resources:
            account_id_required = (
                resource_name in ad_account_resources
                and ad_account_id is None
                and not organization_id
            )
            if account_id_required:
                raise ValueError(
                    f"organization_id is required for '{resource_name}' table when no specific ad_account_id is provided"
                )

            if not organization_id and table != "organizations":
                raise ValueError(
                    f"organization_id is required for table '{table}'. Only 'organizations' table does not require organization_id."
                )
        else:
            # Stats resources need either ad_account_id or organization_id
            if not ad_account_id and not organization_id:
                raise ValueError(
                    f"organization_id is required for '{resource_name}' when ad_account_id is not provided"
                )

        if resource_name not in self.resources:
            raise UnsupportedResourceError(table, "Snapchat Ads")

        from ingestr.src.snapchat_ads import snapchat_ads_source

        source_kwargs: dict[str, Any] = {
            "refresh_token": refresh_token[0],
            "client_id": client_id[0],
            "client_secret": client_secret[0],
        }

        if organization_id:
            source_kwargs["organization_id"] = organization_id[0]

        if ad_account_id:
            source_kwargs["ad_account_id"] = ad_account_id

        # Add interval_start and interval_end for client-side filtering
        interval_start = kwargs.get("interval_start")
        if interval_start:
            source_kwargs["start_date"] = interval_start

        interval_end = kwargs.get("interval_end")
        if interval_end:
            source_kwargs["end_date"] = interval_end

        # Add stats_config for stats resource
        if stats_config:
            source_kwargs["stats_config"] = stats_config

        source = snapchat_ads_source(**source_kwargs)

        return source.with_resources(resource_name)