omniload 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/__init__.py +13 -0
- omniload/conftest.py +73 -0
- omniload/main.py +809 -0
- omniload/main_test.py +6468 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/adjust/adjust_helpers_test.py +15 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/anthropic/helpers_test.py +522 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appsflyer/client_test.py +56 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/blob_test.py +36 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/couchbase_source/helpers_test.py +170 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/destinations_test.py +227 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/docebo/helpers_test.py +190 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/helpers_test.py +208 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/factory_test.py +13 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fireflies/helpers_test.py +515 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/fluxx/helpers_test.py +287 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/fundraiseup/client_test.py +463 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/reports_test.py +212 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_analytics/helpers_test.py +78 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/gorgias/helpers_test.py +45 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/helpers_test.py +249 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/klaviyo/helpers_test.py +36 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/linkedin_ads/helpers_test.py +141 -0
- omniload/src/loader.py +69 -0
- omniload/src/loader_test.py +73 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/helpers_test.py +303 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/masking_test.py +386 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/mongodb/helpers_test.py +455 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/primer/helpers_test.py +81 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/reddit_ads/helpers_test.py +181 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/revenuecat/helpers_test.py +158 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/helpers_test.py +49 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/smartsheets/test_smartsheets.py +133 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/helpers_test.py +248 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sources_test.py +290 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/helpers_test.py +130 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/tiktok_ads/tiktok_helpers_test.py +56 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload-0.0.0.dist-info/METADATA +243 -0
- omniload-0.0.0.dist-info/RECORD +239 -0
- omniload-0.0.0.dist-info/WHEEL +5 -0
- omniload-0.0.0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dist-info/licenses/LICENSE.Apache-2.0 +176 -0
- omniload-0.0.0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dist-info/licenses/NOTICE +35 -0
- omniload-0.0.0.dist-info/top_level.txt +1 -0
omniload/main.py
ADDED
|
@@ -0,0 +1,809 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from typing_extensions import Annotated
|
|
9
|
+
|
|
10
|
+
# duckdb-engine is an optional dependency. When it can be imported, its
# DuckDBEngineWarning category is silenced; otherwise nothing is configured.
try:
    from duckdb_engine import DuckDBEngineWarning
except ImportError:
    # duckdb-engine not installed
    pass
else:
    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
|
|
17
|
+
|
|
18
|
+
# Typer application object; CLI commands in this module are registered on it
# with `@app.command()`.
app = typer.Typer(
    name="omniload",
    help="omniload is the CLI tool to ingest data from one source to another",
    rich_markup_mode="rich",
    pretty_exceptions_enable=False,
)

# Shared rich console for this module.
console = Console()
# NOTE: this rebinds the module-level name `print` to `console.print`, so the
# `print(...)` calls below go through rich and may carry markup such as
# "[bold red]...[/bold red]".
print = console.print
|
|
27
|
+
|
|
28
|
+
# Date/time format strings handed to typer.Option(formats=...) for the
# `interval_start` and `interval_end` options of `ingest`.
DATE_FORMATS = [
    "%Y-%m-%d",  # date only
    "%Y-%m-%dT%H:%M:%S",  # "T"-separated date and time
    "%Y-%m-%dT%H:%M:%S%z",  # ... with UTC offset
    "%Y-%m-%d %H:%M:%S",  # space-separated date and time
    "%Y-%m-%dT%H:%M:%S.%f",  # ... with fractional seconds
    "%Y-%m-%dT%H:%M:%S.%f%z",  # ... with fractional seconds and UTC offset
]
|
|
36
|
+
|
|
37
|
+
# Destination schemes that are treated as Parquet-capable. The list follows
# https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
# and `ingest` tests `factory.destination_scheme` for membership in it.
PARQUET_SUPPORTED_DESTINATIONS = [
    # "athena" and "bigquery" were previously fused into the single entry
    # "athenabigquery", so neither scheme ever matched the membership test.
    "athena",
    "bigquery",
    "duckdb",
    "snowflake",
    "databricks",
    "synapse",
    "s3",
]
|
|
46
|
+
|
|
47
|
+
# These sources always return JSON, which means they cannot be used with the
# Parquet loader for BigQuery.
JSON_RETURNING_SOURCES = ["notion"]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class IncrementalStrategy(str, Enum):
    """Values accepted by the `incremental_strategy` option of `ingest`.

    `create_replace` is the option's default.
    """

    create_replace = "replace"
    append = "append"
    # `ingest` rewrites this strategy to `merge` and passes the incremental
    # key to the source as the merge key.
    delete_insert = "delete+insert"
    merge = "merge"
    scd2 = "scd2"
    # Assigned by `ingest` when the source reports that it handles
    # incrementality itself; shown to the user as "Platform-specific".
    none = "none"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class LoaderFileFormat(str, Enum):
    """Values accepted by the `loader_file_format` option of `ingest`.

    `ingest` aborts when the chosen value is not among the loader file
    formats reported by the destination's capabilities.
    """

    jsonl = "jsonl"
    parquet = "parquet"
    insert_values = "insert_values"
    csv = "csv"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SqlBackend(str, Enum):
    """Values accepted by the `sql_backend` option of `ingest`."""

    # Means "no explicit choice": `ingest` resolves it to `sqlalchemy` when the
    # source scheme starts with "spanner" and to `pyarrow` otherwise.
    default = "default"
    sqlalchemy = "sqlalchemy"
    pyarrow = "pyarrow"
    connectorx = "connectorx"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Progress(str, Enum):
    """Values accepted by the `progress` option of `ingest`."""

    # Default of the option; `ingest` then reports through a SpinnerCollector.
    interactive = "interactive"
    # Makes `ingest` report through dlt's LogCollector instead.
    log = "log"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class SchemaNaming(str, Enum):
    """Values accepted by the `schema_naming` option of `ingest`.

    Any value other than `default` is written to `dlt.config["schema.naming"]`.
    """

    default = "default"
    direct = "direct"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class SqlReflectionLevel(str, Enum):
    """Values accepted by the `sql_reflection_level` option of `ingest`.

    The option defaults to `full`; the chosen value is forwarded to the
    source's `dlt_source(...)` call as `sql_reflection_level`.
    """

    minimal = "minimal"
    full = "full"
    full_with_precision = "full_with_precision"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@app.command()
|
|
91
|
+
def ingest(
|
|
92
|
+
source_uri: Annotated[
|
|
93
|
+
str,
|
|
94
|
+
typer.Option(
|
|
95
|
+
help="The URI of the [green]source[/green]",
|
|
96
|
+
envvar=["SOURCE_URI", "OMNILOAD_SOURCE_URI"],
|
|
97
|
+
),
|
|
98
|
+
], # type: ignore
|
|
99
|
+
dest_uri: Annotated[
|
|
100
|
+
str,
|
|
101
|
+
typer.Option(
|
|
102
|
+
help="The URI of the [cyan]destination[/cyan]",
|
|
103
|
+
envvar=["DESTINATION_URI", "OMNILOAD_DESTINATION_URI"],
|
|
104
|
+
),
|
|
105
|
+
], # type: ignore
|
|
106
|
+
source_table: Annotated[
|
|
107
|
+
str,
|
|
108
|
+
typer.Option(
|
|
109
|
+
help="The table name in the [green]source[/green] to fetch",
|
|
110
|
+
envvar=["SOURCE_TABLE", "OMNILOAD_SOURCE_TABLE"],
|
|
111
|
+
),
|
|
112
|
+
], # type: ignore
|
|
113
|
+
dest_table: Annotated[
|
|
114
|
+
str,
|
|
115
|
+
typer.Option(
|
|
116
|
+
help="The table in the [cyan]destination[/cyan] to save the data into",
|
|
117
|
+
envvar=["DESTINATION_TABLE", "OMNILOAD_DESTINATION_TABLE"],
|
|
118
|
+
),
|
|
119
|
+
] = None, # type: ignore
|
|
120
|
+
incremental_key: Annotated[
|
|
121
|
+
Optional[str],
|
|
122
|
+
typer.Option(
|
|
123
|
+
help="The incremental key from the table to be used for incremental strategies",
|
|
124
|
+
envvar=["INCREMENTAL_KEY", "OMNILOAD_INCREMENTAL_KEY"],
|
|
125
|
+
),
|
|
126
|
+
] = None, # type: ignore
|
|
127
|
+
incremental_strategy: Annotated[
|
|
128
|
+
IncrementalStrategy,
|
|
129
|
+
typer.Option(
|
|
130
|
+
help="The incremental strategy to use",
|
|
131
|
+
envvar=["INCREMENTAL_STRATEGY", "OMNILOAD_INCREMENTAL_STRATEGY"],
|
|
132
|
+
),
|
|
133
|
+
] = IncrementalStrategy.create_replace, # type: ignore
|
|
134
|
+
interval_start: Annotated[
|
|
135
|
+
Optional[datetime],
|
|
136
|
+
typer.Option(
|
|
137
|
+
help="The start of the interval the incremental key will cover",
|
|
138
|
+
formats=DATE_FORMATS,
|
|
139
|
+
envvar=["INTERVAL_START", "OMNILOAD_INTERVAL_START"],
|
|
140
|
+
),
|
|
141
|
+
] = None, # type: ignore
|
|
142
|
+
interval_end: Annotated[
|
|
143
|
+
Optional[datetime],
|
|
144
|
+
typer.Option(
|
|
145
|
+
help="The end of the interval the incremental key will cover",
|
|
146
|
+
formats=DATE_FORMATS,
|
|
147
|
+
envvar=["INTERVAL_END", "OMNILOAD_INTERVAL_END"],
|
|
148
|
+
),
|
|
149
|
+
] = None, # type: ignore
|
|
150
|
+
primary_key: Annotated[
|
|
151
|
+
Optional[list[str]],
|
|
152
|
+
typer.Option(
|
|
153
|
+
help="The key that will be used to deduplicate the resulting table",
|
|
154
|
+
envvar=["PRIMARY_KEY", "OMNILOAD_PRIMARY_KEY"],
|
|
155
|
+
),
|
|
156
|
+
] = None, # type: ignore
|
|
157
|
+
partition_by: Annotated[
|
|
158
|
+
Optional[str],
|
|
159
|
+
typer.Option(
|
|
160
|
+
help="The partition key to be used for partitioning the destination table",
|
|
161
|
+
envvar=["PARTITION_BY", "OMNILOAD_PARTITION_BY"],
|
|
162
|
+
),
|
|
163
|
+
] = None, # type: ignore
|
|
164
|
+
cluster_by: Annotated[
|
|
165
|
+
Optional[str],
|
|
166
|
+
typer.Option(
|
|
167
|
+
help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
|
|
168
|
+
envvar=["CLUSTER_BY", "OMNILOAD_CLUSTER_BY"],
|
|
169
|
+
),
|
|
170
|
+
] = None, # type: ignore
|
|
171
|
+
dry_run: Annotated[
|
|
172
|
+
Optional[bool],
|
|
173
|
+
typer.Option(
|
|
174
|
+
help="Display data transfer plan but don't invoke it",
|
|
175
|
+
envvar=["DRY_RUN", "OMNILOAD_DRY_RUN"],
|
|
176
|
+
),
|
|
177
|
+
] = False, # type: ignore
|
|
178
|
+
full_refresh: Annotated[
|
|
179
|
+
bool,
|
|
180
|
+
typer.Option(
|
|
181
|
+
help="Ignore the state and refresh the destination table completely",
|
|
182
|
+
envvar=["FULL_REFRESH", "OMNILOAD_FULL_REFRESH"],
|
|
183
|
+
),
|
|
184
|
+
] = False, # type: ignore
|
|
185
|
+
progress: Annotated[
|
|
186
|
+
Progress,
|
|
187
|
+
typer.Option(
|
|
188
|
+
help="The progress display type, must be one of 'interactive', 'log'",
|
|
189
|
+
envvar=["PROGRESS", "OMNILOAD_PROGRESS"],
|
|
190
|
+
),
|
|
191
|
+
] = Progress.interactive, # type: ignore
|
|
192
|
+
sql_backend: Annotated[
|
|
193
|
+
SqlBackend,
|
|
194
|
+
typer.Option(
|
|
195
|
+
help="The SQL backend to use",
|
|
196
|
+
envvar=["SQL_BACKEND", "OMNILOAD_SQL_BACKEND"],
|
|
197
|
+
),
|
|
198
|
+
] = SqlBackend.default, # type: ignore
|
|
199
|
+
loader_file_format: Annotated[
|
|
200
|
+
Optional[LoaderFileFormat],
|
|
201
|
+
typer.Option(
|
|
202
|
+
help="The file format to use when loading data",
|
|
203
|
+
envvar=["LOADER_FILE_FORMAT", "OMNILOAD_LOADER_FILE_FORMAT"],
|
|
204
|
+
),
|
|
205
|
+
] = None, # type: ignore
|
|
206
|
+
page_size: Annotated[
|
|
207
|
+
Optional[int],
|
|
208
|
+
typer.Option(
|
|
209
|
+
help="The page size to be used when fetching data from SQL sources",
|
|
210
|
+
envvar=["PAGE_SIZE", "OMNILOAD_PAGE_SIZE"],
|
|
211
|
+
),
|
|
212
|
+
] = 50000, # type: ignore
|
|
213
|
+
loader_file_size: Annotated[
|
|
214
|
+
Optional[int],
|
|
215
|
+
typer.Option(
|
|
216
|
+
help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
|
|
217
|
+
envvar=["LOADER_FILE_SIZE", "OMNILOAD_LOADER_FILE_SIZE"],
|
|
218
|
+
),
|
|
219
|
+
] = 100000, # type: ignore
|
|
220
|
+
schema_naming: Annotated[
|
|
221
|
+
SchemaNaming,
|
|
222
|
+
typer.Option(
|
|
223
|
+
help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
|
|
224
|
+
envvar=["SCHEMA_NAMING", "OMNILOAD_SCHEMA_NAMING"],
|
|
225
|
+
),
|
|
226
|
+
] = SchemaNaming.default, # type: ignore
|
|
227
|
+
pipelines_dir: Annotated[
|
|
228
|
+
Optional[str],
|
|
229
|
+
typer.Option(
|
|
230
|
+
help="The path to store dlt-related pipeline metadata. By default, omniload will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
|
|
231
|
+
envvar=["PIPELINES_DIR", "OMNILOAD_PIPELINES_DIR"],
|
|
232
|
+
),
|
|
233
|
+
] = None, # type: ignore
|
|
234
|
+
extract_parallelism: Annotated[
|
|
235
|
+
Optional[int],
|
|
236
|
+
typer.Option(
|
|
237
|
+
help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
|
|
238
|
+
envvar=["EXTRACT_PARALLELISM", "OMNILOAD_EXTRACT_PARALLELISM"],
|
|
239
|
+
),
|
|
240
|
+
] = 5, # type: ignore
|
|
241
|
+
sql_reflection_level: Annotated[
|
|
242
|
+
SqlReflectionLevel,
|
|
243
|
+
typer.Option(
|
|
244
|
+
help="The reflection level to use when reflecting the table schema from the source",
|
|
245
|
+
envvar=["SQL_REFLECTION_LEVEL", "OMNILOAD_SQL_REFLECTION_LEVEL"],
|
|
246
|
+
),
|
|
247
|
+
] = SqlReflectionLevel.full, # type: ignore
|
|
248
|
+
sql_limit: Annotated[
|
|
249
|
+
Optional[int],
|
|
250
|
+
typer.Option(
|
|
251
|
+
help="The limit to use when fetching data from the source",
|
|
252
|
+
envvar=["SQL_LIMIT", "OMNILOAD_SQL_LIMIT"],
|
|
253
|
+
),
|
|
254
|
+
] = None, # type: ignore
|
|
255
|
+
sql_exclude_columns: Annotated[
|
|
256
|
+
Optional[list[str]],
|
|
257
|
+
typer.Option(
|
|
258
|
+
help="The columns to exclude from the source table",
|
|
259
|
+
envvar=["SQL_EXCLUDE_COLUMNS", "OMNILOAD_SQL_EXCLUDE_COLUMNS"],
|
|
260
|
+
),
|
|
261
|
+
] = [], # type: ignore
|
|
262
|
+
columns: Annotated[
|
|
263
|
+
Optional[list[str]],
|
|
264
|
+
typer.Option(
|
|
265
|
+
help="The column types to be used for the destination table in the format of 'column_name:column_type'",
|
|
266
|
+
envvar=["OMNILOAD_COLUMNS"],
|
|
267
|
+
),
|
|
268
|
+
] = None, # type: ignore
|
|
269
|
+
yield_limit: Annotated[
|
|
270
|
+
Optional[int],
|
|
271
|
+
typer.Option(
|
|
272
|
+
help="Limit the number of pages yielded from the source",
|
|
273
|
+
envvar=["YIELD_LIMIT", "OMNILOAD_YIELD_LIMIT"],
|
|
274
|
+
),
|
|
275
|
+
] = None, # type: ignore
|
|
276
|
+
staging_bucket: Annotated[
|
|
277
|
+
Optional[str],
|
|
278
|
+
typer.Option(
|
|
279
|
+
help="The staging bucket to be used for the ingestion, must be prefixed with 'gs://' or 's3://'",
|
|
280
|
+
envvar=["STAGING_BUCKET", "OMNILOAD_STAGING_BUCKET"],
|
|
281
|
+
),
|
|
282
|
+
] = None, # type: ignore
|
|
283
|
+
mask: Annotated[
|
|
284
|
+
Optional[list[str]],
|
|
285
|
+
typer.Option(
|
|
286
|
+
help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
|
|
287
|
+
envvar=["MASK", "OMNILOAD_MASK"],
|
|
288
|
+
),
|
|
289
|
+
] = [], # type: ignore
|
|
290
|
+
):
|
|
291
|
+
import hashlib
|
|
292
|
+
import tempfile
|
|
293
|
+
import time
|
|
294
|
+
from datetime import datetime
|
|
295
|
+
|
|
296
|
+
import dlt
|
|
297
|
+
import humanize
|
|
298
|
+
import typer
|
|
299
|
+
from dlt.common.pipeline import LoadInfo
|
|
300
|
+
from dlt.common.runtime.collector import Collector, LogCollector
|
|
301
|
+
from dlt.common.schema.typing import TColumnSchema
|
|
302
|
+
from dlt.pipeline.exceptions import PipelineStepFailed
|
|
303
|
+
|
|
304
|
+
import omniload.src.partition as partition
|
|
305
|
+
import omniload.src.resource as resource
|
|
306
|
+
from omniload.src.collector.spinner import SpinnerCollector
|
|
307
|
+
from omniload.src.destinations import AthenaDestination, ClickhouseDestination
|
|
308
|
+
from omniload.src.factory import SourceDestinationFactory
|
|
309
|
+
from omniload.src.filters import (
|
|
310
|
+
cast_set_to_list,
|
|
311
|
+
cast_spanner_types,
|
|
312
|
+
create_masking_filter,
|
|
313
|
+
handle_mysql_empty_dates,
|
|
314
|
+
)
|
|
315
|
+
from omniload.src.sources import MongoDbSource
|
|
316
|
+
|
|
317
|
+
def report_errors(run_info: LoadInfo):
    """Print the failed jobs of a load and exit with status 1 if there are any.

    Load packages are scanned in order and packages without failed jobs are
    skipped. For the first package that has failed jobs, every job id and its
    failure message is printed, then `typer.Exit(1)` is raised. The function
    returns normally when no package contains a failed job.
    """
    for package in run_info.load_packages:
        failures = package.jobs["failed_jobs"]
        if failures:
            print()
            print("[bold red]Failed jobs:[/bold red]")
            print()
            for failed in failures:
                job_id = failed.job_file_info.job_id()
                print(f"[bold red] {job_id}[/bold red]")
                print(f" [bold yellow]Error:[/bold yellow] {failed.failed_message}")

            raise typer.Exit(1)
|
|
331
|
+
|
|
332
|
+
def validate_source_dest_tables(
    source_table: str, dest_table: str
) -> tuple[str, str]:
    """Return the `(source_table, dest_table)` pair to use for the run.

    When a destination table is given, both names are returned unchanged.
    Otherwise the source table must consist of exactly two dot-separated
    parts (`schema.table`) and is reused as the destination table, after a
    notice is printed.

    Raises:
        typer.Abort: if no destination table is given and the source table
            is not in `schema.table` form.
    """
    if dest_table:
        return (source_table, dest_table)

    # Exactly one dot is the same as "splits into exactly two parts".
    if source_table.count(".") != 1:
        print(
            "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
        )
        raise typer.Abort()

    print()
    print(
        "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
    )
    return (source_table, source_table)
|
|
348
|
+
|
|
349
|
+
def validate_loader_file_format(
    dlt_dest, loader_file_format: Optional[LoaderFileFormat]
):
    """Abort unless the destination supports the requested loader file format.

    Nothing is checked when no format was requested. Otherwise the format's
    value must be listed in
    `dlt_dest.capabilities().supported_loader_file_formats`.

    Raises:
        typer.Abort: if the requested format is not supported; the available
            formats are printed first.
    """
    if not loader_file_format:
        return

    requested = loader_file_format.value
    if requested in dlt_dest.capabilities().supported_loader_file_formats:
        return

    print(
        f"[red]Loader file format {requested} is not supported by the destination, available formats: {dlt_dest.capabilities().supported_loader_file_formats}.[/red]"
    )
    raise typer.Abort()
|
|
361
|
+
|
|
362
|
+
def parse_columns(columns: list[str]) -> dict:
    """Parse `column_name:column_type` definitions into a name -> type mapping.

    Each item of *columns* may hold several comma-separated definitions.
    Whitespace around a definition, a column name or a column type is
    ignored, and empty definitions (for example after a trailing comma) are
    skipped.

    A type is accepted when it is one of the literal values of dlt's
    `TDataType` or the custom type `bigdecimal`.

    Raises:
        typer.Abort: if a definition is not of the form
            `column_name:column_type`, or if its type is not supported. An
            explanatory message is printed first.
    """
    from typing import cast, get_args

    from dlt.common.data_types import TDataType

    possible_types = get_args(TDataType)
    custom_types = ("bigdecimal",)

    types: dict[str, TDataType | str] = {}
    for column in columns:
        for candidate in column.split(","):
            candidate = candidate.strip()
            if not candidate:
                continue

            # partition() never raises, so a malformed definition is reported
            # with a readable message instead of a tuple-unpacking ValueError.
            column_name, separator, column_type = candidate.partition(":")
            column_name = column_name.strip()
            column_type = column_type.strip()
            if not separator or not column_name or ":" in column_type:
                print(
                    f"[red]Column definition '{candidate}' is invalid, expected the format 'column_name:column_type'.[/red]"
                )
                raise typer.Abort()

            if (
                column_type not in possible_types
                and column_type not in custom_types
            ):
                print(
                    f"[red]Column type '{column_type}' is not supported, supported types: {possible_types + custom_types}.[/red]"
                )
                raise typer.Abort()

            types[column_name] = (
                cast(TDataType, column_type)
                if column_type in possible_types
                else column_type
            )
    return types
|
|
388
|
+
|
|
389
|
+
clean_sql_exclude_columns = []
|
|
390
|
+
if sql_exclude_columns:
|
|
391
|
+
for col in sql_exclude_columns:
|
|
392
|
+
for possible_col in col.split(","):
|
|
393
|
+
clean_sql_exclude_columns.append(possible_col.strip())
|
|
394
|
+
sql_exclude_columns = clean_sql_exclude_columns
|
|
395
|
+
|
|
396
|
+
dlt.config["data_writer.buffer_max_items"] = page_size
|
|
397
|
+
dlt.config["data_writer.file_max_items"] = loader_file_size
|
|
398
|
+
dlt.config["extract.workers"] = extract_parallelism
|
|
399
|
+
dlt.config["extract.max_parallel_items"] = extract_parallelism
|
|
400
|
+
dlt.config["load.raise_on_max_retries"] = 15
|
|
401
|
+
if schema_naming != SchemaNaming.default:
|
|
402
|
+
dlt.config["schema.naming"] = schema_naming.value
|
|
403
|
+
|
|
404
|
+
try:
|
|
405
|
+
(source_table, dest_table) = validate_source_dest_tables(
|
|
406
|
+
source_table, dest_table
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
factory = SourceDestinationFactory(source_uri, dest_uri)
|
|
410
|
+
|
|
411
|
+
source = factory.get_source()
|
|
412
|
+
destination = factory.get_destination()
|
|
413
|
+
|
|
414
|
+
column_hints: dict[str, TColumnSchema] = {}
|
|
415
|
+
original_incremental_strategy = incremental_strategy
|
|
416
|
+
|
|
417
|
+
column_types = parse_columns(columns) if columns else None
|
|
418
|
+
if column_types:
|
|
419
|
+
for column_name, column_type in column_types.items():
|
|
420
|
+
if column_type == "bigdecimal":
|
|
421
|
+
column_hints[column_name] = {
|
|
422
|
+
"data_type": "decimal",
|
|
423
|
+
"precision": 76,
|
|
424
|
+
"scale": 38,
|
|
425
|
+
}
|
|
426
|
+
else:
|
|
427
|
+
column_hints[column_name] = {"data_type": column_type}
|
|
428
|
+
|
|
429
|
+
merge_key = None
|
|
430
|
+
if incremental_strategy == IncrementalStrategy.delete_insert:
|
|
431
|
+
merge_key = incremental_key
|
|
432
|
+
incremental_strategy = IncrementalStrategy.merge
|
|
433
|
+
if incremental_key:
|
|
434
|
+
if incremental_key not in column_hints:
|
|
435
|
+
column_hints[incremental_key] = {}
|
|
436
|
+
|
|
437
|
+
column_hints[incremental_key]["merge_key"] = True
|
|
438
|
+
|
|
439
|
+
m = hashlib.sha256()
|
|
440
|
+
m.update(dest_table.encode("utf-8"))
|
|
441
|
+
|
|
442
|
+
progressInstance: Collector = SpinnerCollector()
|
|
443
|
+
if progress == Progress.log:
|
|
444
|
+
progressInstance = LogCollector()
|
|
445
|
+
|
|
446
|
+
is_pipelines_dir_temp = False
|
|
447
|
+
if pipelines_dir is None:
|
|
448
|
+
pipelines_dir = tempfile.mkdtemp()
|
|
449
|
+
is_pipelines_dir_temp = True
|
|
450
|
+
|
|
451
|
+
dlt_dest = destination.dlt_dest(
|
|
452
|
+
uri=dest_uri, dest_table=dest_table, staging_bucket=staging_bucket
|
|
453
|
+
)
|
|
454
|
+
validate_loader_file_format(dlt_dest, loader_file_format)
|
|
455
|
+
|
|
456
|
+
if partition_by:
|
|
457
|
+
if partition_by not in column_hints:
|
|
458
|
+
column_hints[partition_by] = {}
|
|
459
|
+
|
|
460
|
+
column_hints[partition_by]["partition"] = True
|
|
461
|
+
|
|
462
|
+
if cluster_by:
|
|
463
|
+
if cluster_by not in column_hints:
|
|
464
|
+
column_hints[cluster_by] = {}
|
|
465
|
+
|
|
466
|
+
column_hints[cluster_by]["cluster"] = True
|
|
467
|
+
|
|
468
|
+
if primary_key:
|
|
469
|
+
for key in primary_key:
|
|
470
|
+
if key not in column_hints:
|
|
471
|
+
column_hints[key] = {}
|
|
472
|
+
|
|
473
|
+
column_hints[key]["primary_key"] = True
|
|
474
|
+
|
|
475
|
+
pipeline = dlt.pipeline( # type: ignore
|
|
476
|
+
pipeline_name=m.hexdigest(),
|
|
477
|
+
destination=dlt_dest,
|
|
478
|
+
progress=progressInstance,
|
|
479
|
+
pipelines_dir=pipelines_dir,
|
|
480
|
+
refresh="drop_resources" if full_refresh else None,
|
|
481
|
+
)
|
|
482
|
+
|
|
483
|
+
if source.handles_incrementality():
|
|
484
|
+
incremental_strategy = IncrementalStrategy.none
|
|
485
|
+
incremental_key = None
|
|
486
|
+
|
|
487
|
+
incremental_strategy_text = (
|
|
488
|
+
incremental_strategy.value
|
|
489
|
+
if incremental_strategy.value != IncrementalStrategy.none
|
|
490
|
+
else "Platform-specific"
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
source_table_print = source_table.split(":")[0]
|
|
494
|
+
|
|
495
|
+
print()
|
|
496
|
+
print("[bold green]Initiated the pipeline with the following:[/bold green]")
|
|
497
|
+
print(
|
|
498
|
+
f"[bold yellow] Source:[/bold yellow] {factory.source_scheme} / {source_table_print}"
|
|
499
|
+
)
|
|
500
|
+
print(
|
|
501
|
+
f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
|
|
502
|
+
)
|
|
503
|
+
print(
|
|
504
|
+
f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy_text}"
|
|
505
|
+
)
|
|
506
|
+
print(
|
|
507
|
+
f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
|
|
508
|
+
)
|
|
509
|
+
print(
|
|
510
|
+
f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
|
|
511
|
+
)
|
|
512
|
+
print(f"[bold yellow] Pipeline ID:[/bold yellow] {m.hexdigest()}")
|
|
513
|
+
print()
|
|
514
|
+
|
|
515
|
+
if dry_run:
|
|
516
|
+
typer.echo("Skipping data transfer, because `--dry-run` was selected.")
|
|
517
|
+
raise typer.Exit(0)
|
|
518
|
+
|
|
519
|
+
print()
|
|
520
|
+
print("[bold green]Starting the ingestion...[/bold green]")
|
|
521
|
+
|
|
522
|
+
if factory.source_scheme == "sqlite":
|
|
523
|
+
source_table = "main." + source_table.split(".")[-1]
|
|
524
|
+
|
|
525
|
+
if (
|
|
526
|
+
incremental_key
|
|
527
|
+
and incremental_key in column_hints
|
|
528
|
+
and "data_type" in column_hints[incremental_key]
|
|
529
|
+
and column_hints[incremental_key]["data_type"] == "date"
|
|
530
|
+
):
|
|
531
|
+
# By default, omniload treats the start and end dates as datetime objects. While this worked fine for many cases, if the
|
|
532
|
+
# incremental field is a date, the start and end dates cannot be compared to the incremental field, and the ingestion would fail.
|
|
533
|
+
# In order to eliminate this, we have introduced a new option to omniload, --columns, which allows the user to specify the column types for the destination table.
|
|
534
|
+
# This way, omniload will know the data type of the incremental field, and will be able to convert the start and end dates to the correct data type before running the ingestion.
|
|
535
|
+
if interval_start:
|
|
536
|
+
interval_start = interval_start.date() # type: ignore
|
|
537
|
+
if interval_end:
|
|
538
|
+
interval_end = interval_end.date() # type: ignore
|
|
539
|
+
|
|
540
|
+
if factory.source_scheme.startswith("spanner"):
|
|
541
|
+
# we tend to use the 'pyarrow' backend in general, however, it has issues with JSON objects, so we override it to 'sqlalchemy' for Spanner.
|
|
542
|
+
if sql_backend.value == SqlBackend.default:
|
|
543
|
+
sql_backend = SqlBackend.sqlalchemy
|
|
544
|
+
|
|
545
|
+
# this allows us to identify the cases where the user does not have a preference, so that for some sources we can override it.
|
|
546
|
+
if sql_backend == SqlBackend.default:
|
|
547
|
+
sql_backend = SqlBackend.pyarrow
|
|
548
|
+
|
|
549
|
+
dlt_source = source.dlt_source(
|
|
550
|
+
uri=source_uri,
|
|
551
|
+
table=source_table,
|
|
552
|
+
incremental_key=incremental_key,
|
|
553
|
+
merge_key=merge_key,
|
|
554
|
+
interval_start=interval_start,
|
|
555
|
+
interval_end=interval_end,
|
|
556
|
+
sql_backend=sql_backend.value,
|
|
557
|
+
page_size=page_size,
|
|
558
|
+
sql_reflection_level=sql_reflection_level.value,
|
|
559
|
+
sql_limit=sql_limit,
|
|
560
|
+
sql_exclude_columns=sql_exclude_columns,
|
|
561
|
+
extract_parallelism=extract_parallelism,
|
|
562
|
+
column_types=column_types,
|
|
563
|
+
)
|
|
564
|
+
|
|
565
|
+
resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
|
|
566
|
+
if factory.source_scheme.startswith("mysql"):
|
|
567
|
+
resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
|
|
568
|
+
|
|
569
|
+
if factory.source_scheme.startswith("spanner"):
|
|
570
|
+
resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
|
|
571
|
+
|
|
572
|
+
if factory.source_scheme.startswith(
|
|
573
|
+
"mmap"
|
|
574
|
+
) and factory.destination_scheme.startswith("clickhouse"):
|
|
575
|
+
# https://github.com/dlt-hub/dlt/issues/2248
|
|
576
|
+
# TODO(turtledev): only apply for write dispositions that actually cause an exception.
|
|
577
|
+
# TODO(turtledev): make batch size configurable
|
|
578
|
+
import omniload.src.arrow as arrow
|
|
579
|
+
|
|
580
|
+
resource.for_each(dlt_source, lambda x: x.add_map(arrow.as_list))
|
|
581
|
+
|
|
582
|
+
if mask:
|
|
583
|
+
masking_filter = create_masking_filter(mask)
|
|
584
|
+
resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
|
|
585
|
+
|
|
586
|
+
if yield_limit:
|
|
587
|
+
resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
|
|
588
|
+
|
|
589
|
+
if isinstance(source, MongoDbSource):
|
|
590
|
+
from omniload.src.resource import TypeHintMap
|
|
591
|
+
|
|
592
|
+
resource.for_each(
|
|
593
|
+
dlt_source, lambda x: x.add_map(TypeHintMap().type_hint_map)
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
def col_h(x):
    """Apply ``column_hints`` from the enclosing scope to resource ``x``.

    Does nothing when ``column_hints`` is empty or unset.
    """
    if not column_hints:
        return
    x.apply_hints(columns=column_hints)
|
|
599
|
+
|
|
600
|
+
resource.for_each(dlt_source, col_h)
|
|
601
|
+
|
|
602
|
+
if isinstance(destination, AthenaDestination) and partition_by:
|
|
603
|
+
partition.apply_athena_hints(dlt_source, partition_by, column_hints)
|
|
604
|
+
|
|
605
|
+
if isinstance(destination, ClickhouseDestination):
|
|
606
|
+
from dlt.destinations.adapters import clickhouse_adapter
|
|
607
|
+
|
|
608
|
+
settings = ClickhouseDestination.engine_settings(dest_uri)
|
|
609
|
+
engine_type = ClickhouseDestination.engine_type(dest_uri)
|
|
610
|
+
|
|
611
|
+
def apply_clickhouse_adapter(x):
    """Call ``clickhouse_adapter`` on ``x`` with the settings derived from ``dest_uri``."""
    # ``table_engine_type`` is only forwarded when an engine type was resolved;
    # otherwise the argument is omitted entirely.
    engine_kwargs = {"table_engine_type": engine_type} if engine_type else {}
    clickhouse_adapter(x, settings=settings, **engine_kwargs)
|
|
616
|
+
|
|
617
|
+
resource.for_each(
|
|
618
|
+
dlt_source,
|
|
619
|
+
apply_clickhouse_adapter,
|
|
620
|
+
)
|
|
621
|
+
|
|
622
|
+
if original_incremental_strategy == IncrementalStrategy.delete_insert:
|
|
623
|
+
|
|
624
|
+
def set_primary_key(x):
    """Reset the primary key held by ``x.incremental`` to an empty tuple."""
    incremental = x.incremental
    incremental.primary_key = ()
|
|
626
|
+
|
|
627
|
+
resource.for_each(dlt_source, set_primary_key)
|
|
628
|
+
|
|
629
|
+
if (
|
|
630
|
+
factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
|
|
631
|
+
and loader_file_format is None
|
|
632
|
+
):
|
|
633
|
+
loader_file_format = LoaderFileFormat.parquet
|
|
634
|
+
|
|
635
|
+
# if the source is a JSON returning source, we cannot use Parquet loader for BigQuery
|
|
636
|
+
if (
|
|
637
|
+
factory.destination_scheme == "bigquery"
|
|
638
|
+
and factory.source_scheme in JSON_RETURNING_SOURCES
|
|
639
|
+
):
|
|
640
|
+
loader_file_format = None
|
|
641
|
+
|
|
642
|
+
write_disposition = None
|
|
643
|
+
if incremental_strategy != IncrementalStrategy.none:
|
|
644
|
+
write_disposition = incremental_strategy.value
|
|
645
|
+
|
|
646
|
+
if factory.source_scheme == "influxdb":
|
|
647
|
+
if primary_key:
|
|
648
|
+
write_disposition = "merge"
|
|
649
|
+
|
|
650
|
+
start_time = datetime.now()
|
|
651
|
+
|
|
652
|
+
def run_pipeline():
    """Run the pipeline once over ``dlt_source`` and return the result of ``pipeline.run``."""
    destination_params = destination.dlt_run_params(
        uri=dest_uri,
        table=dest_table,
        staging_bucket=staging_bucket,
    )

    # A missing or empty primary key is forwarded as None.
    effective_primary_key = None
    if primary_key and len(primary_key) > 0:
        effective_primary_key = primary_key

    # The loader file format is forwarded via its ``.value``, or None when unset.
    file_format = None
    if loader_file_format is not None:
        file_format = loader_file_format.value  # type: ignore

    return pipeline.run(
        dlt_source,
        **destination_params,
        write_disposition=write_disposition,  # type: ignore
        primary_key=effective_primary_key,  # type: ignore
        loader_file_format=file_format,  # type: ignore
    )
|
|
668
|
+
|
|
669
|
+
# Databricks concurrency error patterns that are safe to retry
|
|
670
|
+
DATABRICKS_RETRYABLE_ERRORS = [
|
|
671
|
+
"SCHEMA_ALREADY_EXISTS",
|
|
672
|
+
"DELTA_METADATA_CHANGED",
|
|
673
|
+
"MetadataChangedException",
|
|
674
|
+
]
|
|
675
|
+
|
|
676
|
+
def is_databricks_retryable_error(exception: Exception) -> bool:
    """Tell whether ``exception`` matches a retryable Databricks error pattern.

    Returns True only when the destination scheme is ``"databricks"`` and the
    string form of ``exception`` contains one of ``DATABRICKS_RETRYABLE_ERRORS``.
    """
    if factory.destination_scheme == "databricks":
        message = str(exception)
        for pattern in DATABRICKS_RETRYABLE_ERRORS:
            if pattern in message:
                return True
    return False
|
|
681
|
+
|
|
682
|
+
max_retries = 3
|
|
683
|
+
for attempt in range(max_retries):
|
|
684
|
+
try:
|
|
685
|
+
run_info: LoadInfo = run_pipeline()
|
|
686
|
+
break
|
|
687
|
+
except PipelineStepFailed as e:
|
|
688
|
+
if is_databricks_retryable_error(e) and attempt < max_retries - 1:
|
|
689
|
+
delay = (attempt + 1) * 2 # 2s, 4s backoff
|
|
690
|
+
print(
|
|
691
|
+
f"[yellow]Databricks concurrency error, retrying in {delay}s (attempt {attempt + 1}/{max_retries})...[/yellow]"
|
|
692
|
+
)
|
|
693
|
+
time.sleep(delay)
|
|
694
|
+
continue
|
|
695
|
+
raise
|
|
696
|
+
|
|
697
|
+
report_errors(run_info)
|
|
698
|
+
|
|
699
|
+
destination.post_load()
|
|
700
|
+
|
|
701
|
+
end_time = datetime.now()
|
|
702
|
+
elapsedHuman = ""
|
|
703
|
+
elapsed = end_time - start_time
|
|
704
|
+
elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
|
|
705
|
+
|
|
706
|
+
if is_pipelines_dir_temp:
|
|
707
|
+
import shutil
|
|
708
|
+
|
|
709
|
+
shutil.rmtree(pipelines_dir)
|
|
710
|
+
|
|
711
|
+
print(
|
|
712
|
+
f"[bold green]Successfully finished loading data from '{factory.source_scheme}' to '{factory.destination_scheme}' {elapsedHuman} [/bold green]"
|
|
713
|
+
)
|
|
714
|
+
print()
|
|
715
|
+
|
|
716
|
+
except Exception:
|
|
717
|
+
raise
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
@app.command()
def example_uris():
    # NOTE: deliberately no docstring here; Typer would surface it as the
    # command's help text, which the command currently does not have.
    #
    # Each entry pairs the example URI line with the reference link that is
    # printed directly beneath it.
    examples = (
        (
            "[bold green]Postgres:[/bold green] [white]postgres://user:password@host:port/dbname?sslmode=require [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql[/white dim]",
        ),
        (
            "[bold green]BigQuery:[/bold green] [white]bigquery://project-id?credentials_path=/path/to/credentials.json&location=US [/white]",
            "[white dim]└── https://github.com/googleapis/python-bigquery-sqlalchemy?tab=readme-ov-file#connection-string-parameters[/white dim]",
        ),
        (
            "[bold green]Snowflake:[/bold green] [white]snowflake://user:password@account/dbname?warehouse=COMPUTE_WH [/white]",
            "[white dim]└── https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#connection-parameters",
        ),
        (
            "[bold green]Redshift:[/bold green] [white]redshift://user:password@host:port/dbname?sslmode=require [/white]",
            "[white dim]└── https://aws.amazon.com/blogs/big-data/use-the-amazon-redshift-sqlalchemy-dialect-to-interact-with-amazon-redshift/[/white dim]",
        ),
        (
            "[bold green]Databricks:[/bold green] [white]databricks://token:<access_token>@<server_hostname>?http_path=<http_path>&catalog=<catalog>&schema=<schema>[/white]",
            "[white dim]└── https://docs.databricks.com/en/dev-tools/sqlalchemy.html",
        ),
        (
            "[bold green]Microsoft SQL Server:[/bold green] [white]mssql://user:password@host:port/dbname?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#microsoft-sql-server",
        ),
        (
            "[bold green]MySQL:[/bold green] [white]mysql://user:password@host:port/dbname [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#mysql[/white dim]",
        ),
        (
            "[bold green]DuckDB:[/bold green] [white]duckdb://path/to/database [/white]",
            "[white dim]└── https://github.com/Mause/duckdb_engine[/white dim]",
        ),
        (
            "[bold green]SQLite:[/bold green] [white]sqlite://path/to/database [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#sqlite[/white dim]",
        ),
    )

    print()
    typer.echo(
        "Following are some example URI formats for supported sources and destinations:"
    )

    # Every example is preceded by a blank line, then the URI, then its link.
    for uri_line, reference_line in examples:
        print()
        print(uri_line)
        print(reference_line)

    print()
    typer.echo(
        "These are all coming from SQLAlchemy's URI format, so they should be familiar to most users."
    )
|
|
795
|
+
|
|
796
|
+
|
|
797
|
+
@app.command()
def version():
    # NOTE: no docstring on purpose; Typer would expose it as the command help.
    # The package version is imported at call time rather than at module import.
    from omniload import __version__ as package_version  # type: ignore

    print(f"v{package_version}")
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def main():
    """Entry point: invoke the module-level ``app``."""
    app()


# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    main()
|