ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin_max/__init__.py +6 -4
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +37 -10
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +508 -27
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +107 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +2933 -245
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.13.dist-info/RECORD +0 -115
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/conftest.py
ADDED
@@ -0,0 +1,72 @@
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+from main_test import DESTINATIONS, SOURCES  # type: ignore
+
+
+def pytest_configure(config):
+    if is_master(config):
+        config.shared_directory = tempfile.mkdtemp()
+
+
+def pytest_configure_node(node):
+    """xdist hook"""
+    node.workerinput["shared_directory"] = node.config.shared_directory
+
+
+@pytest.fixture(scope="session")
+def shared_directory(request):
+    if is_master(request.config):
+        return request.config.shared_directory
+    else:
+        return request.config.workerinput["shared_directory"]
+
+
+def is_master(config):
+    """True if the code running the given pytest.config object is running in a xdist master
+    node or not running xdist at all.
+    """
+    return not hasattr(config, "workerinput")
+
+
+def start_containers(config):
+    if hasattr(config, "workerinput"):
+        return
+
+    unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+    for container in unique_containers:
+        container.container_lock_dir = config.shared_directory
+
+    with ThreadPoolExecutor() as executor:
+        for container in unique_containers:
+            executor.submit(container.start_fully)
+        # futures = [
+        #     executor.submit(container.start_fully) for container in unique_containers
+        # ]
+        # # Wait for all futures to complete
+        # for future in futures:
+        #     future.result()
+
+
+def stop_containers(config):
+    if hasattr(config, "workerinput"):
+        return
+
+    should_manage_containers = os.environ.get("PYTEST_XDIST_WORKER", "gw0") == "gw0"
+    if not should_manage_containers:
+        return
+
+    unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+
+    for container in unique_containers:
+        container.stop_fully()
+
+
+def pytest_sessionstart(session):
+    start_containers(session.config)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    stop_containers(session.config)
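The new conftest.py coordinates container-backed test dependencies across pytest-xdist workers: only the coordinating process (the one without a `workerinput` attribute) creates the shared lock directory and starts or stops containers, and workers receive that directory through `workerinput`. The container objects themselves come from `main_test.SOURCES` and `main_test.DESTINATIONS`, which are not part of this diff; the sketch below is a hypothetical minimal interface such an object would need to expose for this conftest to work. The class name and the docker invocation are assumptions, not ingestr's actual test helpers.

# Hypothetical sketch of the container interface conftest.py relies on.
# Only container_lock_dir, start_fully, and stop_fully appear in the diff;
# everything else here is assumed.
import subprocess
from typing import Optional


class ContainerFixture:
    def __init__(self, image: str, name: str) -> None:
        self.image = image
        self.name = name
        self.container_lock_dir: Optional[str] = None  # set by start_containers()

    def start_fully(self) -> None:
        # The real helpers would also wait for readiness and use the shared
        # lock directory to avoid double-starts across xdist workers.
        subprocess.run(
            ["docker", "run", "-d", "--rm", "--name", self.name, self.image],
            check=True,
        )

    def stop_fully(self) -> None:
        subprocess.run(["docker", "stop", self.name], check=False)
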
ingestr/main.py
CHANGED
@@ -1,16 +1,22 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional

 import typer
-from dlt.common.runtime.collector import Collector
 from rich.console import Console
-from rich.status import Status
 from typing_extensions import Annotated

-from ingestr.src.filters import cast_set_to_list
 from ingestr.src.telemetry.event import track

+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -32,56 +38,18 @@ DATE_FORMATS = [

 # https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
 PARQUET_SUPPORTED_DESTINATIONS = [
-    "bigquery",
+    "athena",
     "duckdb",
     "snowflake",
     "databricks",
     "synapse",
+    "s3",
 ]

 # these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
 JSON_RETURNING_SOURCES = ["notion"]


-class SpinnerCollector(Collector):
-    status: Status
-    current_step: str
-    started: bool
-
-    def __init__(self) -> None:
-        self.status = Status("Ingesting data...", spinner="dots")
-        self.started = False
-
-    def update(
-        self,
-        name: str,
-        inc: int = 1,
-        total: Optional[int] = None,
-        message: Optional[str] = None,  # type: ignore
-        label: str = "",
-        **kwargs,
-    ) -> None:
-        self.status.update(self.current_step)
-
-    def _start(self, step: str) -> None:
-        self.current_step = self.__step_to_label(step)
-        self.status.start()
-
-    def __step_to_label(self, step: str) -> str:
-        verb = step.split(" ")[0].lower()
-        if verb.startswith("normalize"):
-            return "Normalizing the data"
-        elif verb.startswith("load"):
-            return "Loading the data to the destination"
-        elif verb.startswith("extract"):
-            return "Extracting the data from the source"
-
-        return f"{verb.capitalize()} the data"
-
-    def _stop(self) -> None:
-        self.status.stop()
-
-
 class IncrementalStrategy(str, Enum):
     create_replace = "replace"
     append = "append"
@@ -99,6 +67,7 @@ class LoaderFileFormat(str, Enum):


 class SqlBackend(str, Enum):
+    default = "default"
     sqlalchemy = "sqlalchemy"
     pyarrow = "pyarrow"
     connectorx = "connectorx"
@@ -124,40 +93,44 @@ class SqlReflectionLevel(str, Enum):
 def ingest(
     source_uri: Annotated[
         str,
-        typer.Option(
+        typer.Option(
+            help="The URI of the [green]source[/green]",
+            envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"],
+        ),
     ],  # type: ignore
     dest_uri: Annotated[
         str,
         typer.Option(
-            help="The URI of the [cyan]destination[/cyan]",
+            help="The URI of the [cyan]destination[/cyan]",
+            envvar=["DESTINATION_URI", "INGESTR_DESTINATION_URI"],
         ),
     ],  # type: ignore
     source_table: Annotated[
         str,
         typer.Option(
             help="The table name in the [green]source[/green] to fetch",
-            envvar="SOURCE_TABLE",
+            envvar=["SOURCE_TABLE", "INGESTR_SOURCE_TABLE"],
         ),
     ],  # type: ignore
     dest_table: Annotated[
         str,
         typer.Option(
             help="The table in the [cyan]destination[/cyan] to save the data into",
-            envvar="DESTINATION_TABLE",
+            envvar=["DESTINATION_TABLE", "INGESTR_DESTINATION_TABLE"],
         ),
     ] = None,  # type: ignore
     incremental_key: Annotated[
         Optional[str],
         typer.Option(
             help="The incremental key from the table to be used for incremental strategies",
-            envvar="INCREMENTAL_KEY",
+            envvar=["INCREMENTAL_KEY", "INGESTR_INCREMENTAL_KEY"],
         ),
     ] = None,  # type: ignore
     incremental_strategy: Annotated[
         IncrementalStrategy,
         typer.Option(
             help="The incremental strategy to use",
-            envvar="INCREMENTAL_STRATEGY",
+            envvar=["INCREMENTAL_STRATEGY", "INGESTR_INCREMENTAL_STRATEGY"],
         ),
     ] = IncrementalStrategy.create_replace,  # type: ignore
     interval_start: Annotated[
@@ -165,7 +138,7 @@ def ingest(
         typer.Option(
             help="The start of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_START",
+            envvar=["INTERVAL_START", "INGESTR_INTERVAL_START"],
         ),
     ] = None,  # type: ignore
     interval_end: Annotated[
@@ -173,128 +146,149 @@ def ingest(
         typer.Option(
             help="The end of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_END",
+            envvar=["INTERVAL_END", "INGESTR_INTERVAL_END"],
         ),
     ] = None,  # type: ignore
     primary_key: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The key that will be used to deduplicate the resulting table",
-            envvar="PRIMARY_KEY",
+            envvar=["PRIMARY_KEY", "INGESTR_PRIMARY_KEY"],
         ),
     ] = None,  # type: ignore
     partition_by: Annotated[
         Optional[str],
         typer.Option(
             help="The partition key to be used for partitioning the destination table",
-            envvar="PARTITION_BY",
+            envvar=["PARTITION_BY", "INGESTR_PARTITION_BY"],
         ),
     ] = None,  # type: ignore
     cluster_by: Annotated[
         Optional[str],
         typer.Option(
             help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
-            envvar="CLUSTER_BY",
+            envvar=["CLUSTER_BY", "INGESTR_CLUSTER_BY"],
         ),
     ] = None,  # type: ignore
     yes: Annotated[
         Optional[bool],
         typer.Option(
             help="Skip the confirmation prompt and ingest right away",
-            envvar="SKIP_CONFIRMATION",
+            envvar=["SKIP_CONFIRMATION", "INGESTR_SKIP_CONFIRMATION"],
        ),
     ] = False,  # type: ignore
     full_refresh: Annotated[
         bool,
         typer.Option(
             help="Ignore the state and refresh the destination table completely",
-            envvar="FULL_REFRESH",
+            envvar=["FULL_REFRESH", "INGESTR_FULL_REFRESH"],
         ),
     ] = False,  # type: ignore
     progress: Annotated[
         Progress,
         typer.Option(
             help="The progress display type, must be one of 'interactive', 'log'",
-            envvar="PROGRESS",
+            envvar=["PROGRESS", "INGESTR_PROGRESS"],
         ),
     ] = Progress.interactive,  # type: ignore
     sql_backend: Annotated[
         SqlBackend,
         typer.Option(
             help="The SQL backend to use",
-            envvar="SQL_BACKEND",
+            envvar=["SQL_BACKEND", "INGESTR_SQL_BACKEND"],
         ),
-    ] = SqlBackend.
+    ] = SqlBackend.default,  # type: ignore
     loader_file_format: Annotated[
         Optional[LoaderFileFormat],
         typer.Option(
             help="The file format to use when loading data",
-            envvar="LOADER_FILE_FORMAT",
+            envvar=["LOADER_FILE_FORMAT", "INGESTR_LOADER_FILE_FORMAT"],
         ),
     ] = None,  # type: ignore
     page_size: Annotated[
         Optional[int],
         typer.Option(
             help="The page size to be used when fetching data from SQL sources",
-            envvar="PAGE_SIZE",
+            envvar=["PAGE_SIZE", "INGESTR_PAGE_SIZE"],
         ),
     ] = 50000,  # type: ignore
     loader_file_size: Annotated[
         Optional[int],
         typer.Option(
             help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
-            envvar="LOADER_FILE_SIZE",
+            envvar=["LOADER_FILE_SIZE", "INGESTR_LOADER_FILE_SIZE"],
         ),
     ] = 100000,  # type: ignore
     schema_naming: Annotated[
         SchemaNaming,
         typer.Option(
             help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
-            envvar="SCHEMA_NAMING",
+            envvar=["SCHEMA_NAMING", "INGESTR_SCHEMA_NAMING"],
         ),
     ] = SchemaNaming.default,  # type: ignore
     pipelines_dir: Annotated[
         Optional[str],
         typer.Option(
             help="The path to store dlt-related pipeline metadata. By default, ingestr will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
-            envvar="PIPELINES_DIR",
+            envvar=["PIPELINES_DIR", "INGESTR_PIPELINES_DIR"],
         ),
     ] = None,  # type: ignore
     extract_parallelism: Annotated[
         Optional[int],
         typer.Option(
             help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
-            envvar="EXTRACT_PARALLELISM",
+            envvar=["EXTRACT_PARALLELISM", "INGESTR_EXTRACT_PARALLELISM"],
         ),
     ] = 5,  # type: ignore
     sql_reflection_level: Annotated[
         SqlReflectionLevel,
         typer.Option(
             help="The reflection level to use when reflecting the table schema from the source",
-            envvar="SQL_REFLECTION_LEVEL",
+            envvar=["SQL_REFLECTION_LEVEL", "INGESTR_SQL_REFLECTION_LEVEL"],
         ),
     ] = SqlReflectionLevel.full,  # type: ignore
     sql_limit: Annotated[
         Optional[int],
         typer.Option(
             help="The limit to use when fetching data from the source",
-            envvar="SQL_LIMIT",
+            envvar=["SQL_LIMIT", "INGESTR_SQL_LIMIT"],
         ),
     ] = None,  # type: ignore
     sql_exclude_columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The columns to exclude from the source table",
-            envvar="SQL_EXCLUDE_COLUMNS",
+            envvar=["SQL_EXCLUDE_COLUMNS", "INGESTR_SQL_EXCLUDE_COLUMNS"],
         ),
     ] = [],  # type: ignore
     columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The column types to be used for the destination table in the format of 'column_name:column_type'",
-            envvar="
+            envvar=["INGESTR_COLUMNS"],
+        ),
+    ] = None,  # type: ignore
+    yield_limit: Annotated[
+        Optional[int],
+        typer.Option(
+            help="Limit the number of pages yielded from the source",
+            envvar=["YIELD_LIMIT", "INGESTR_YIELD_LIMIT"],
+        ),
+    ] = None,  # type: ignore
+    staging_bucket: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The staging bucket to be used for the ingestion, must be prefixed with 'gs://' or 's3://'",
+            envvar=["STAGING_BUCKET", "INGESTR_STAGING_BUCKET"],
         ),
     ] = None,  # type: ignore
+    mask: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
+            envvar=["MASK", "INGESTR_MASK"],
+        ),
+    ] = [],  # type: ignore
 ):
     import hashlib
     import tempfile
@@ -303,14 +297,22 @@ def ingest(
     import dlt
     import humanize
     import typer
-    from dlt.common.data_types import TDataType
-    from dlt.common.destination import Destination
     from dlt.common.pipeline import LoadInfo
     from dlt.common.runtime.collector import Collector, LogCollector
     from dlt.common.schema.typing import TColumnSchema

+    import ingestr.src.partition as partition
+    import ingestr.src.resource as resource
+    from ingestr.src.collector.spinner import SpinnerCollector
+    from ingestr.src.destinations import AthenaDestination
     from ingestr.src.factory import SourceDestinationFactory
-    from ingestr.src.
+    from ingestr.src.filters import (
+        cast_set_to_list,
+        cast_spanner_types,
+        create_masking_filter,
+        handle_mysql_empty_dates,
+    )
+    from ingestr.src.sources import MongoDbSource

     def report_errors(run_info: LoadInfo):
         for load_package in run_info.load_packages:
@@ -345,7 +347,7 @@ def ingest(
         return (source_table, dest_table)

     def validate_loader_file_format(
-        dlt_dest
+        dlt_dest, loader_file_format: Optional[LoaderFileFormat]
     ):
         if (
             loader_file_format
@@ -357,17 +359,11 @@ def ingest(
         )
         raise typer.Abort()

-    def 
-        if hasattr(source, "selected_resources") and source.selected_resources:
-            resource_names = list(source.selected_resources.keys())
-            for res in resource_names:
-                executable(source.resources[res])
-        else:
-            executable(source)
-
-    def parse_columns(columns: list[str]) -> dict[str, TDataType]:
+    def parse_columns(columns: list[str]) -> dict:
         from typing import cast, get_args

+        from dlt.common.data_types import TDataType
+
         possible_types = get_args(TDataType)

         types: dict[str, TDataType] = {}
@@ -400,6 +396,7 @@ def ingest(
     dlt.config["data_writer.file_max_items"] = loader_file_size
     dlt.config["extract.workers"] = extract_parallelism
     dlt.config["extract.max_parallel_items"] = extract_parallelism
+    dlt.config["load.raise_on_max_retries"] = 15
     if schema_naming != SchemaNaming.default:
         dlt.config["schema.naming"] = schema_naming.value

@@ -451,7 +448,9 @@ def ingest(
         pipelines_dir = tempfile.mkdtemp()
         is_pipelines_dir_temp = True

-    dlt_dest = destination.dlt_dest(
+    dlt_dest = destination.dlt_dest(
+        uri=dest_uri, dest_table=dest_table, staging_bucket=staging_bucket
+    )
     validate_loader_file_format(dlt_dest, loader_file_format)

     if partition_by:
@@ -473,7 +472,7 @@ def ingest(

         column_hints[key]["primary_key"] = True

-    pipeline = dlt.pipeline(
+    pipeline = dlt.pipeline(  # type: ignore
         pipeline_name=m.hexdigest(),
         destination=dlt_dest,
         progress=progressInstance,
@@ -510,6 +509,7 @@ def ingest(
     print(
         f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
     )
+    print(f"[bold yellow] Pipeline ID:[/bold yellow] {m.hexdigest()}")
     print()

     if not yes:
@@ -539,6 +539,15 @@ def ingest(
     if interval_end:
         interval_end = interval_end.date()  # type: ignore

+    if factory.source_scheme.startswith("spanner"):
+        # we tend to use the 'pyarrow' backend in general, however, it has issues with JSON objects, so we override it to 'sqlalchemy' for Spanner.
+        if sql_backend.value == SqlBackend.default:
+            sql_backend = SqlBackend.sqlalchemy
+
+    # this allows us to identify the cases where the user does not have a preference, so that for some sources we can override it.
+    if sql_backend == SqlBackend.default:
+        sql_backend = SqlBackend.pyarrow
+
     dlt_source = source.dlt_source(
         uri=source_uri,
         table=source_table,
@@ -551,22 +560,55 @@ def ingest(
         sql_reflection_level=sql_reflection_level.value,
         sql_limit=sql_limit,
         sql_exclude_columns=sql_exclude_columns,
+        extract_parallelism=extract_parallelism,
     )

-
+    resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
+    if factory.source_scheme.startswith("mysql"):
+        resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
+
+    if factory.source_scheme.startswith("spanner"):
+        resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
+
+    if factory.source_scheme.startswith(
+        "mmap"
+    ) and factory.destination_scheme.startswith("clickhouse"):
+        # https://github.com/dlt-hub/dlt/issues/2248
+        # TODO(turtledev): only apply for write dispositions that actually cause an exception.
+        # TODO(turtledev): make batch size configurable
+        import ingestr.src.arrow as arrow
+
+        resource.for_each(dlt_source, lambda x: x.add_map(arrow.as_list))
+
+    if mask:
+        masking_filter = create_masking_filter(mask)
+        resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
+
+    if yield_limit:
+        resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
+
+    if isinstance(source, MongoDbSource):
+        from ingestr.src.resource import TypeHintMap
+
+        resource.for_each(
+            dlt_source, lambda x: x.add_map(TypeHintMap().type_hint_map)
+        )

     def col_h(x):
         if column_hints:
             x.apply_hints(columns=column_hints)

-
+    resource.for_each(dlt_source, col_h)
+
+    if isinstance(destination, AthenaDestination) and partition_by:
+        partition.apply_athena_hints(dlt_source, partition_by, column_hints)

     if original_incremental_strategy == IncrementalStrategy.delete_insert:

         def set_primary_key(x):
             x.incremental.primary_key = ()

-
+        resource.for_each(dlt_source, set_primary_key)

     if (
         factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
@@ -585,6 +627,10 @@ def ingest(
     if incremental_strategy != IncrementalStrategy.none:
         write_disposition = incremental_strategy.value

+    if factory.source_scheme == "influxdb":
+        if primary_key:
+            write_disposition = "merge"
+
     start_time = datetime.now()

     run_info: LoadInfo = pipeline.run(
@@ -592,6 +638,7 @@ def ingest(
         **destination.dlt_run_params(
             uri=dest_uri,
             table=dest_table,
+            staging_bucket=staging_bucket,
         ),
         write_disposition=write_disposition,  # type: ignore
         primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
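Several of the new code paths above apply per-resource hooks through `resource.for_each(dlt_source, ...)` from the new `ingestr/src/resource.py` module, replacing the helper that used to be inlined in `ingest()`. Reconstructed from that removed inline helper, a minimal sketch of such a `for_each` looks like the following; the actual `resource.py` is not shown in this excerpt and may differ.

# Minimal sketch of a for_each helper, based on the inline helper this diff
# removes from main.py; the real ingestr/src/resource.py may add more.
from typing import Any, Callable


def for_each(source: Any, executable: Callable[[Any], Any]) -> None:
    # dlt sources expose their selected resources; standalone resources do not.
    if hasattr(source, "selected_resources") and source.selected_resources:
        for name in list(source.selected_resources.keys()):
            executable(source.resources[name])
    else:
        executable(source)
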
ingestr/src/adjust/__init__.py
CHANGED
@@ -46,7 +46,7 @@ def adjust_source(
     filters: Optional[dict] = None,
 ) -> Sequence[DltResource]:
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def campaigns():
+    def campaigns() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -57,12 +57,12 @@ def adjust_source(
         )

     @dlt.resource(write_disposition="replace", primary_key="id")
-    def events():
+    def events() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield adjust_api.fetch_events()

     @dlt.resource(write_disposition="merge", merge_key="day")
-    def creatives():
+    def creatives() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -95,7 +95,7 @@ def adjust_source(
         primary_key=dimensions,
         columns=type_hints,
     )
-    def custom():
+    def custom() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
ingestr/src/adjust/adjust_helpers.py
CHANGED

@@ -36,7 +36,7 @@ class AdjustAPI:
     def __init__(self, api_key):
         self.api_key = api_key
         self.request_client = Client(
-            request_timeout=
+            request_timeout=1000,  # Adjust support recommends 1000 seconds of read timeout.
             raise_for_status=False,
             retry_condition=retry_on_limit,
             request_max_attempts=12,
@@ -82,7 +82,9 @@ class AdjustAPI:
             items = result.get("rows", [])
             yield items
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )

     def fetch_events(self):
         headers = {"Authorization": f"Bearer {self.api_key}"}
@@ -93,7 +95,9 @@ class AdjustAPI:
             result = response.json()
             yield result
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )


 def parse_filters(filters_raw: str) -> dict:
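The `Client(...)` call above configures dlt's HTTP requests helper to retry calls for which the `retry_condition` predicate returns True, up to `request_max_attempts` times, with a long read timeout. The `retry_on_limit` predicate is defined elsewhere in the adjust source and is not part of this excerpt, so its body in the sketch below is an assumption; only the `Client` keyword arguments are taken from the diff.

# Sketch only: the real retry_on_limit lives in the adjust source package and
# may inspect the response differently; this shows the general shape.
from typing import Optional

from dlt.sources.helpers.requests import Client
from requests import Response


def retry_on_limit(
    response: Optional[Response], exception: Optional[BaseException]
) -> bool:
    # Retry when the Adjust API signals rate limiting (assumed: HTTP 429).
    return response is not None and response.status_code == 429


request_client = Client(
    request_timeout=1000,  # long read timeout, per the comment in the diff
    raise_for_status=False,
    retry_condition=retry_on_limit,
    request_max_attempts=12,
)
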
ingestr/src/airtable/__init__.py
CHANGED
@@ -9,7 +9,7 @@ import pyairtable
 from dlt.sources import DltResource


-@dlt.source
+@dlt.source(max_table_nesting=1)
 def airtable_source(
     base_id: str = dlt.config.value,
     table_names: Optional[List[str]] = dlt.config.value,
@@ -50,12 +50,13 @@ def airtable_resource(
         It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
         table (Dict[str, Any]): Metadata about an airtable, does not contain the actual records
     """
+
     primary_key_id = table["primaryFieldId"]
     primary_key_field = [
         field for field in table["fields"] if field["id"] == primary_key_id
     ][0]
     table_name: str = table["name"]
-    primary_key: List[str] = [primary_key_field[
+    primary_key: List[str] = [f"fields__{primary_key_field['name']}".lower()]
     air_table = api.table(base_id, table["id"])

     # Table.iterate() supports rich customization options, such as chunk size, fields, cell format, timezone, locale, and view