omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
omniload/main.py ADDED
@@ -0,0 +1,810 @@
1
+ import warnings
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from typing_extensions import Annotated
9
+
10
# duckdb-engine is an optional dependency: its warning category can only be
# silenced when the package is importable.
try:
    from duckdb_engine import DuckDBEngineWarning
except ImportError:
    # duckdb-engine not installed
    pass
else:
    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
17
+
18
# All output in this module goes through rich: the module-level ``print`` name
# is rebound to the console's print so markup such as [red]...[/red] is rendered.
console = Console()
print = console.print

# The Typer application that the commands below are registered on.
app = typer.Typer(
    name="omniload",
    help="omniload is the CLI tool to ingest data from one source to another",
    rich_markup_mode="rich",
    pretty_exceptions_enable=False,
)
27
+
28
# strptime patterns handed to the interval_start / interval_end options
# (via ``formats=DATE_FORMATS``) of the ingest command.
DATE_FORMATS = [
    "%Y-%m-%d",  # date only
    "%Y-%m-%dT%H:%M:%S",  # 'T'-separated timestamp
    "%Y-%m-%dT%H:%M:%S%z",  # 'T'-separated timestamp with UTC offset
    "%Y-%m-%d %H:%M:%S",  # space-separated timestamp
    "%Y-%m-%dT%H:%M:%S.%f",  # 'T'-separated timestamp with fractional seconds
    "%Y-%m-%dT%H:%M:%S.%f%z",  # fractional seconds and UTC offset
]
36
+
37
# https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
# Destination schemes for which parquet is picked as the loader file format
# when the user did not choose one explicitly.
# NOTE: "athena" and "bigquery" used to be fused into the single entry
# "athenabigquery" (a missing comma between two adjacent string literals), so
# neither scheme ever matched. They are separate entries now.
PARQUET_SUPPORTED_DESTINATIONS = [
    "athena",
    "bigquery",
    "duckdb",
    "snowflake",
    "databricks",
    "synapse",
    "s3",
]

# these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
JSON_RETURNING_SOURCES = ["notion"]
49
+
50
+
51
class IncrementalStrategy(str, Enum):
    """Values accepted by the ``incremental_strategy`` option of ``ingest``."""

    # default of the ``incremental_strategy`` option
    create_replace = "replace"
    append = "append"
    # ``ingest`` rewrites this to ``merge`` and uses the incremental key as merge key
    delete_insert = "delete+insert"
    merge = "merge"
    scd2 = "scd2"
    # forced by ``ingest`` when the source handles incrementality itself;
    # no write disposition is passed to the pipeline in that case
    none = "none"
58
+
59
+
60
class LoaderFileFormat(str, Enum):
    """Values accepted by the ``loader_file_format`` option of ``ingest``.

    ``ingest`` checks the chosen value against the destination's
    ``supported_loader_file_formats`` before running.
    """

    jsonl = "jsonl"
    parquet = "parquet"
    insert_values = "insert_values"
    csv = "csv"
65
+
66
+
67
class SqlBackend(str, Enum):
    """Values accepted by the ``sql_backend`` option of ``ingest``."""

    # "no preference": ``ingest`` resolves it to ``sqlalchemy`` for Spanner
    # sources and to ``pyarrow`` for everything else
    default = "default"
    sqlalchemy = "sqlalchemy"
    pyarrow = "pyarrow"
    connectorx = "connectorx"
72
+
73
+
74
class Progress(str, Enum):
    """Values accepted by the ``progress`` option of ``ingest``."""

    # default; ``ingest`` uses its SpinnerCollector for it
    interactive = "interactive"
    # ``ingest`` switches to dlt's LogCollector for it
    log = "log"
77
+
78
+
79
class SchemaNaming(str, Enum):
    """Values accepted by the ``schema_naming`` option of ``ingest``.

    Any value other than ``default`` is written to ``dlt.config["schema.naming"]``.
    """

    default = "default"
    direct = "direct"
82
+
83
+
84
class SqlReflectionLevel(str, Enum):
    """Values accepted by the ``sql_reflection_level`` option of ``ingest``.

    The selected value is forwarded unchanged to the source as
    ``sql_reflection_level``.
    """

    minimal = "minimal"
    # default of the ``sql_reflection_level`` option
    full = "full"
    full_with_precision = "full_with_precision"
88
+
89
+
90
+ @app.command()
91
+ def ingest(
92
+ source_uri: Annotated[
93
+ str,
94
+ typer.Option(
95
+ help="The URI of the [green]source[/green]",
96
+ envvar=["SOURCE_URI", "OMNILOAD_SOURCE_URI"],
97
+ ),
98
+ ], # type: ignore
99
+ dest_uri: Annotated[
100
+ str,
101
+ typer.Option(
102
+ help="The URI of the [cyan]destination[/cyan]",
103
+ envvar=["DESTINATION_URI", "OMNILOAD_DESTINATION_URI"],
104
+ ),
105
+ ], # type: ignore
106
+ source_table: Annotated[
107
+ str,
108
+ typer.Option(
109
+ help="The table name in the [green]source[/green] to fetch",
110
+ envvar=["SOURCE_TABLE", "OMNILOAD_SOURCE_TABLE"],
111
+ ),
112
+ ], # type: ignore
113
+ dest_table: Annotated[
114
+ str,
115
+ typer.Option(
116
+ help="The table in the [cyan]destination[/cyan] to save the data into",
117
+ envvar=["DESTINATION_TABLE", "OMNILOAD_DESTINATION_TABLE"],
118
+ ),
119
+ ] = None, # type: ignore
120
+ incremental_key: Annotated[
121
+ Optional[str],
122
+ typer.Option(
123
+ help="The incremental key from the table to be used for incremental strategies",
124
+ envvar=["INCREMENTAL_KEY", "OMNILOAD_INCREMENTAL_KEY"],
125
+ ),
126
+ ] = None, # type: ignore
127
+ incremental_strategy: Annotated[
128
+ IncrementalStrategy,
129
+ typer.Option(
130
+ help="The incremental strategy to use",
131
+ envvar=["INCREMENTAL_STRATEGY", "OMNILOAD_INCREMENTAL_STRATEGY"],
132
+ ),
133
+ ] = IncrementalStrategy.create_replace, # type: ignore
134
+ interval_start: Annotated[
135
+ Optional[datetime],
136
+ typer.Option(
137
+ help="The start of the interval the incremental key will cover",
138
+ formats=DATE_FORMATS,
139
+ envvar=["INTERVAL_START", "OMNILOAD_INTERVAL_START"],
140
+ ),
141
+ ] = None, # type: ignore
142
+ interval_end: Annotated[
143
+ Optional[datetime],
144
+ typer.Option(
145
+ help="The end of the interval the incremental key will cover",
146
+ formats=DATE_FORMATS,
147
+ envvar=["INTERVAL_END", "OMNILOAD_INTERVAL_END"],
148
+ ),
149
+ ] = None, # type: ignore
150
+ primary_key: Annotated[
151
+ Optional[list[str]],
152
+ typer.Option(
153
+ help="The key that will be used to deduplicate the resulting table",
154
+ envvar=["PRIMARY_KEY", "OMNILOAD_PRIMARY_KEY"],
155
+ ),
156
+ ] = None, # type: ignore
157
+ partition_by: Annotated[
158
+ Optional[str],
159
+ typer.Option(
160
+ help="The partition key to be used for partitioning the destination table",
161
+ envvar=["PARTITION_BY", "OMNILOAD_PARTITION_BY"],
162
+ ),
163
+ ] = None, # type: ignore
164
+ cluster_by: Annotated[
165
+ Optional[str],
166
+ typer.Option(
167
+ help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
168
+ envvar=["CLUSTER_BY", "OMNILOAD_CLUSTER_BY"],
169
+ ),
170
+ ] = None, # type: ignore
171
+ yes: Annotated[
172
+ Optional[bool],
173
+ typer.Option(
174
+ help="Skip the confirmation prompt and ingest right away",
175
+ envvar=["SKIP_CONFIRMATION", "OMNILOAD_SKIP_CONFIRMATION"],
176
+ ),
177
+ ] = False, # type: ignore
178
+ full_refresh: Annotated[
179
+ bool,
180
+ typer.Option(
181
+ help="Ignore the state and refresh the destination table completely",
182
+ envvar=["FULL_REFRESH", "OMNILOAD_FULL_REFRESH"],
183
+ ),
184
+ ] = False, # type: ignore
185
+ progress: Annotated[
186
+ Progress,
187
+ typer.Option(
188
+ help="The progress display type, must be one of 'interactive', 'log'",
189
+ envvar=["PROGRESS", "OMNILOAD_PROGRESS"],
190
+ ),
191
+ ] = Progress.interactive, # type: ignore
192
+ sql_backend: Annotated[
193
+ SqlBackend,
194
+ typer.Option(
195
+ help="The SQL backend to use",
196
+ envvar=["SQL_BACKEND", "OMNILOAD_SQL_BACKEND"],
197
+ ),
198
+ ] = SqlBackend.default, # type: ignore
199
+ loader_file_format: Annotated[
200
+ Optional[LoaderFileFormat],
201
+ typer.Option(
202
+ help="The file format to use when loading data",
203
+ envvar=["LOADER_FILE_FORMAT", "OMNILOAD_LOADER_FILE_FORMAT"],
204
+ ),
205
+ ] = None, # type: ignore
206
+ page_size: Annotated[
207
+ Optional[int],
208
+ typer.Option(
209
+ help="The page size to be used when fetching data from SQL sources",
210
+ envvar=["PAGE_SIZE", "OMNILOAD_PAGE_SIZE"],
211
+ ),
212
+ ] = 50000, # type: ignore
213
+ loader_file_size: Annotated[
214
+ Optional[int],
215
+ typer.Option(
216
+ help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
217
+ envvar=["LOADER_FILE_SIZE", "OMNILOAD_LOADER_FILE_SIZE"],
218
+ ),
219
+ ] = 100000, # type: ignore
220
+ schema_naming: Annotated[
221
+ SchemaNaming,
222
+ typer.Option(
223
+ help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
224
+ envvar=["SCHEMA_NAMING", "OMNILOAD_SCHEMA_NAMING"],
225
+ ),
226
+ ] = SchemaNaming.default, # type: ignore
227
+ pipelines_dir: Annotated[
228
+ Optional[str],
229
+ typer.Option(
230
+ help="The path to store dlt-related pipeline metadata. By default, omniload will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
231
+ envvar=["PIPELINES_DIR", "OMNILOAD_PIPELINES_DIR"],
232
+ ),
233
+ ] = None, # type: ignore
234
+ extract_parallelism: Annotated[
235
+ Optional[int],
236
+ typer.Option(
237
+ help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
238
+ envvar=["EXTRACT_PARALLELISM", "OMNILOAD_EXTRACT_PARALLELISM"],
239
+ ),
240
+ ] = 5, # type: ignore
241
+ sql_reflection_level: Annotated[
242
+ SqlReflectionLevel,
243
+ typer.Option(
244
+ help="The reflection level to use when reflecting the table schema from the source",
245
+ envvar=["SQL_REFLECTION_LEVEL", "OMNILOAD_SQL_REFLECTION_LEVEL"],
246
+ ),
247
+ ] = SqlReflectionLevel.full, # type: ignore
248
+ sql_limit: Annotated[
249
+ Optional[int],
250
+ typer.Option(
251
+ help="The limit to use when fetching data from the source",
252
+ envvar=["SQL_LIMIT", "OMNILOAD_SQL_LIMIT"],
253
+ ),
254
+ ] = None, # type: ignore
255
+ sql_exclude_columns: Annotated[
256
+ Optional[list[str]],
257
+ typer.Option(
258
+ help="The columns to exclude from the source table",
259
+ envvar=["SQL_EXCLUDE_COLUMNS", "OMNILOAD_SQL_EXCLUDE_COLUMNS"],
260
+ ),
261
+ ] = [], # type: ignore
262
+ columns: Annotated[
263
+ Optional[list[str]],
264
+ typer.Option(
265
+ help="The column types to be used for the destination table in the format of 'column_name:column_type'",
266
+ envvar=["OMNILOAD_COLUMNS"],
267
+ ),
268
+ ] = None, # type: ignore
269
+ yield_limit: Annotated[
270
+ Optional[int],
271
+ typer.Option(
272
+ help="Limit the number of pages yielded from the source",
273
+ envvar=["YIELD_LIMIT", "OMNILOAD_YIELD_LIMIT"],
274
+ ),
275
+ ] = None, # type: ignore
276
+ staging_bucket: Annotated[
277
+ Optional[str],
278
+ typer.Option(
279
+ help="The staging bucket to be used for the ingestion, must be prefixed with 'gs://' or 's3://'",
280
+ envvar=["STAGING_BUCKET", "OMNILOAD_STAGING_BUCKET"],
281
+ ),
282
+ ] = None, # type: ignore
283
+ mask: Annotated[
284
+ Optional[list[str]],
285
+ typer.Option(
286
+ help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
287
+ envvar=["MASK", "OMNILOAD_MASK"],
288
+ ),
289
+ ] = [], # type: ignore
290
+ ):
291
+ import hashlib
292
+ import tempfile
293
+ import time
294
+ from datetime import datetime
295
+
296
+ import dlt
297
+ import humanize
298
+ import typer
299
+ from dlt.common.pipeline import LoadInfo
300
+ from dlt.common.runtime.collector import Collector, LogCollector
301
+ from dlt.common.schema.typing import TColumnSchema
302
+ from dlt.pipeline.exceptions import PipelineStepFailed
303
+
304
+ import omniload.src.partition as partition
305
+ import omniload.src.resource as resource
306
+ from omniload.src.collector.spinner import SpinnerCollector
307
+ from omniload.src.destinations import AthenaDestination, ClickhouseDestination
308
+ from omniload.src.factory import SourceDestinationFactory
309
+ from omniload.src.filters import (
310
+ cast_set_to_list,
311
+ cast_spanner_types,
312
+ create_masking_filter,
313
+ handle_mysql_empty_dates,
314
+ )
315
+ from omniload.src.sources import MongoDbSource
316
+
317
def report_errors(run_info: LoadInfo):
    """Print the failed jobs of a load and exit with status 1.

    Walks the load packages in order. The first package that contains failed
    jobs has each job id and failure message printed, after which
    ``typer.Exit(1)`` is raised. Returns normally when no package has failed jobs.
    """
    for package in run_info.load_packages:
        failed_jobs = package.jobs["failed_jobs"]
        if failed_jobs:
            print()
            print("[bold red]Failed jobs:[/bold red]")
            print()
            for job in failed_jobs:
                print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
                print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")

            # stop at the first package with failures
            raise typer.Exit(1)
331
+
332
def validate_source_dest_tables(
    source_table: str, dest_table: str
) -> tuple[str, str]:
    """Return the ``(source_table, dest_table)`` pair, defaulting the destination.

    When ``dest_table`` is given, both names are returned untouched. When it is
    missing, the source table must have exactly two dot-separated parts
    (``schema.table``) and is reused as the destination; otherwise an error is
    printed and ``typer.Abort`` is raised.
    """
    if dest_table:
        return (source_table, dest_table)

    # exactly one dot <=> exactly two parts when splitting on "."
    if source_table.count(".") != 1:
        print(
            "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
        )
        raise typer.Abort()

    print()
    print(
        "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
    )
    return (source_table, source_table)
348
+
349
def validate_loader_file_format(
    dlt_dest, loader_file_format: Optional[LoaderFileFormat]
):
    """Abort when the requested loader file format is not offered by the destination.

    Does nothing when no format was requested. Otherwise the format's value
    must appear in ``dlt_dest.capabilities().supported_loader_file_formats``;
    if it does not, the available formats are printed and ``typer.Abort`` is
    raised.
    """
    if not loader_file_format:
        # no explicit choice, nothing to validate
        return

    supported_formats = dlt_dest.capabilities().supported_loader_file_formats
    if loader_file_format.value in supported_formats:
        return

    print(
        f"[red]Loader file format {loader_file_format.value} is not supported by the destination, available formats: {dlt_dest.capabilities().supported_loader_file_formats}.[/red]"
    )
    raise typer.Abort()
361
+
362
def parse_columns(columns: list[str]) -> dict:
    """Parse ``column_name:column_type`` definitions into a name -> type mapping.

    Each item of ``columns`` may itself hold several comma-separated
    definitions. Whitespace around names and types is ignored, and empty
    fragments (e.g. from a trailing comma) are skipped.

    Args:
        columns: raw values of the ``columns`` option.

    Returns:
        A dict mapping each column name to its type: either one of dlt's
        ``TDataType`` values or the custom type ``"bigdecimal"``.

    Raises:
        typer.Abort: when a definition is not of the form
            ``column_name:column_type`` or names an unsupported type. An
            explanatory message is printed first.
    """
    from typing import cast, get_args

    from dlt.common.data_types import TDataType

    possible_types = get_args(TDataType)
    custom_types = ("bigdecimal",)

    types: dict[str, TDataType | str] = {}
    for column in columns:
        for candidate in column.split(","):
            candidate = candidate.strip()
            if not candidate:
                # tolerate trailing or doubled commas instead of crashing on them
                continue

            # partition never raises, unlike unpacking the result of split(":"),
            # so malformed definitions reach the friendly error below
            column_name, separator, column_type = candidate.partition(":")
            column_name = column_name.strip()
            column_type = column_type.strip()
            if not separator or not column_name or not column_type:
                print(
                    f"[red]Invalid column definition '{candidate}', expected the format 'column_name:column_type'.[/red]"
                )
                raise typer.Abort()

            if (
                column_type not in possible_types
                and column_type not in custom_types
            ):
                print(
                    f"[red]Column type '{column_type}' is not supported, supported types: {possible_types + custom_types}.[/red]"
                )
                raise typer.Abort()

            types[column_name] = (
                cast(TDataType, column_type)
                if column_type in possible_types
                else column_type
            )
    return types
388
+
389
+ clean_sql_exclude_columns = []
390
+ if sql_exclude_columns:
391
+ for col in sql_exclude_columns:
392
+ for possible_col in col.split(","):
393
+ clean_sql_exclude_columns.append(possible_col.strip())
394
+ sql_exclude_columns = clean_sql_exclude_columns
395
+
396
+ dlt.config["data_writer.buffer_max_items"] = page_size
397
+ dlt.config["data_writer.file_max_items"] = loader_file_size
398
+ dlt.config["extract.workers"] = extract_parallelism
399
+ dlt.config["extract.max_parallel_items"] = extract_parallelism
400
+ dlt.config["load.raise_on_max_retries"] = 15
401
+ if schema_naming != SchemaNaming.default:
402
+ dlt.config["schema.naming"] = schema_naming.value
403
+
404
+ try:
405
+ (source_table, dest_table) = validate_source_dest_tables(
406
+ source_table, dest_table
407
+ )
408
+
409
+ factory = SourceDestinationFactory(source_uri, dest_uri)
410
+
411
+ source = factory.get_source()
412
+ destination = factory.get_destination()
413
+
414
+ column_hints: dict[str, TColumnSchema] = {}
415
+ original_incremental_strategy = incremental_strategy
416
+
417
+ column_types = parse_columns(columns) if columns else None
418
+ if column_types:
419
+ for column_name, column_type in column_types.items():
420
+ if column_type == "bigdecimal":
421
+ column_hints[column_name] = {
422
+ "data_type": "decimal",
423
+ "precision": 76,
424
+ "scale": 38,
425
+ }
426
+ else:
427
+ column_hints[column_name] = {"data_type": column_type}
428
+
429
+ merge_key = None
430
+ if incremental_strategy == IncrementalStrategy.delete_insert:
431
+ merge_key = incremental_key
432
+ incremental_strategy = IncrementalStrategy.merge
433
+ if incremental_key:
434
+ if incremental_key not in column_hints:
435
+ column_hints[incremental_key] = {}
436
+
437
+ column_hints[incremental_key]["merge_key"] = True
438
+
439
+ m = hashlib.sha256()
440
+ m.update(dest_table.encode("utf-8"))
441
+
442
+ progressInstance: Collector = SpinnerCollector()
443
+ if progress == Progress.log:
444
+ progressInstance = LogCollector()
445
+
446
+ is_pipelines_dir_temp = False
447
+ if pipelines_dir is None:
448
+ pipelines_dir = tempfile.mkdtemp()
449
+ is_pipelines_dir_temp = True
450
+
451
+ dlt_dest = destination.dlt_dest(
452
+ uri=dest_uri, dest_table=dest_table, staging_bucket=staging_bucket
453
+ )
454
+ validate_loader_file_format(dlt_dest, loader_file_format)
455
+
456
+ if partition_by:
457
+ if partition_by not in column_hints:
458
+ column_hints[partition_by] = {}
459
+
460
+ column_hints[partition_by]["partition"] = True
461
+
462
+ if cluster_by:
463
+ if cluster_by not in column_hints:
464
+ column_hints[cluster_by] = {}
465
+
466
+ column_hints[cluster_by]["cluster"] = True
467
+
468
+ if primary_key:
469
+ for key in primary_key:
470
+ if key not in column_hints:
471
+ column_hints[key] = {}
472
+
473
+ column_hints[key]["primary_key"] = True
474
+
475
+ pipeline = dlt.pipeline( # type: ignore
476
+ pipeline_name=m.hexdigest(),
477
+ destination=dlt_dest,
478
+ progress=progressInstance,
479
+ pipelines_dir=pipelines_dir,
480
+ refresh="drop_resources" if full_refresh else None,
481
+ )
482
+
483
+ if source.handles_incrementality():
484
+ incremental_strategy = IncrementalStrategy.none
485
+ incremental_key = None
486
+
487
+ incremental_strategy_text = (
488
+ incremental_strategy.value
489
+ if incremental_strategy.value != IncrementalStrategy.none
490
+ else "Platform-specific"
491
+ )
492
+
493
+ source_table_print = source_table.split(":")[0]
494
+
495
+ print()
496
+ print("[bold green]Initiated the pipeline with the following:[/bold green]")
497
+ print(
498
+ f"[bold yellow] Source:[/bold yellow] {factory.source_scheme} / {source_table_print}"
499
+ )
500
+ print(
501
+ f"[bold yellow] Destination:[/bold yellow] {factory.destination_scheme} / {dest_table}"
502
+ )
503
+ print(
504
+ f"[bold yellow] Incremental Strategy:[/bold yellow] {incremental_strategy_text}"
505
+ )
506
+ print(
507
+ f"[bold yellow] Incremental Key:[/bold yellow] {incremental_key if incremental_key else 'None'}"
508
+ )
509
+ print(
510
+ f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
511
+ )
512
+ print(f"[bold yellow] Pipeline ID:[/bold yellow] {m.hexdigest()}")
513
+ print()
514
+
515
+ if not yes:
516
+ continuePipeline = typer.confirm("Are you sure you would like to continue?")
517
+ if not continuePipeline:
518
+ raise typer.Abort()
519
+
520
+ print()
521
+ print("[bold green]Starting the ingestion...[/bold green]")
522
+
523
+ if factory.source_scheme == "sqlite":
524
+ source_table = "main." + source_table.split(".")[-1]
525
+
526
+ if (
527
+ incremental_key
528
+ and incremental_key in column_hints
529
+ and "data_type" in column_hints[incremental_key]
530
+ and column_hints[incremental_key]["data_type"] == "date"
531
+ ):
532
+ # By default, omniload treats the start and end dates as datetime objects. While this worked fine for many cases, if the
533
+ # incremental field is a date, the start and end dates cannot be compared to the incremental field, and the ingestion would fail.
534
+ # In order to eliminate this, we have introduced a new option to omniload, --columns, which allows the user to specify the column types for the destination table.
535
+ # This way, omniload will know the data type of the incremental field, and will be able to convert the start and end dates to the correct data type before running the ingestion.
536
+ if interval_start:
537
+ interval_start = interval_start.date() # type: ignore
538
+ if interval_end:
539
+ interval_end = interval_end.date() # type: ignore
540
+
541
+ if factory.source_scheme.startswith("spanner"):
542
+ # we tend to use the 'pyarrow' backend in general, however, it has issues with JSON objects, so we override it to 'sqlalchemy' for Spanner.
543
+ if sql_backend.value == SqlBackend.default:
544
+ sql_backend = SqlBackend.sqlalchemy
545
+
546
+ # this allows us to identify the cases where the user does not have a preference, so that for some sources we can override it.
547
+ if sql_backend == SqlBackend.default:
548
+ sql_backend = SqlBackend.pyarrow
549
+
550
+ dlt_source = source.dlt_source(
551
+ uri=source_uri,
552
+ table=source_table,
553
+ incremental_key=incremental_key,
554
+ merge_key=merge_key,
555
+ interval_start=interval_start,
556
+ interval_end=interval_end,
557
+ sql_backend=sql_backend.value,
558
+ page_size=page_size,
559
+ sql_reflection_level=sql_reflection_level.value,
560
+ sql_limit=sql_limit,
561
+ sql_exclude_columns=sql_exclude_columns,
562
+ extract_parallelism=extract_parallelism,
563
+ column_types=column_types,
564
+ )
565
+
566
+ resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
567
+ if factory.source_scheme.startswith("mysql"):
568
+ resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
569
+
570
+ if factory.source_scheme.startswith("spanner"):
571
+ resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
572
+
573
+ if factory.source_scheme.startswith(
574
+ "mmap"
575
+ ) and factory.destination_scheme.startswith("clickhouse"):
576
+ # https://github.com/dlt-hub/dlt/issues/2248
577
+ # TODO(turtledev): only apply for write dispositions that actually cause an exception.
578
+ # TODO(turtledev): make batch size configurable
579
+ import omniload.src.arrow as arrow
580
+
581
+ resource.for_each(dlt_source, lambda x: x.add_map(arrow.as_list))
582
+
583
+ if mask:
584
+ masking_filter = create_masking_filter(mask)
585
+ resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
586
+
587
+ if yield_limit:
588
+ resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
589
+
590
+ if isinstance(source, MongoDbSource):
591
+ from omniload.src.resource import TypeHintMap
592
+
593
+ resource.for_each(
594
+ dlt_source, lambda x: x.add_map(TypeHintMap().type_hint_map)
595
+ )
596
+
597
+ def col_h(x):
598
+ if column_hints:
599
+ x.apply_hints(columns=column_hints)
600
+
601
+ resource.for_each(dlt_source, col_h)
602
+
603
+ if isinstance(destination, AthenaDestination) and partition_by:
604
+ partition.apply_athena_hints(dlt_source, partition_by, column_hints)
605
+
606
+ if isinstance(destination, ClickhouseDestination):
607
+ from dlt.destinations.adapters import clickhouse_adapter
608
+
609
+ settings = ClickhouseDestination.engine_settings(dest_uri)
610
+ engine_type = ClickhouseDestination.engine_type(dest_uri)
611
+
612
+ def apply_clickhouse_adapter(x):
613
+ kwargs = {"settings": settings}
614
+ if engine_type:
615
+ kwargs["table_engine_type"] = engine_type
616
+ clickhouse_adapter(x, **kwargs)
617
+
618
+ resource.for_each(
619
+ dlt_source,
620
+ apply_clickhouse_adapter,
621
+ )
622
+
623
+ if original_incremental_strategy == IncrementalStrategy.delete_insert:
624
+
625
+ def set_primary_key(x):
626
+ x.incremental.primary_key = ()
627
+
628
+ resource.for_each(dlt_source, set_primary_key)
629
+
630
+ if (
631
+ factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
632
+ and loader_file_format is None
633
+ ):
634
+ loader_file_format = LoaderFileFormat.parquet
635
+
636
+ # if the source is a JSON returning source, we cannot use Parquet loader for BigQuery
637
+ if (
638
+ factory.destination_scheme == "bigquery"
639
+ and factory.source_scheme in JSON_RETURNING_SOURCES
640
+ ):
641
+ loader_file_format = None
642
+
643
+ write_disposition = None
644
+ if incremental_strategy != IncrementalStrategy.none:
645
+ write_disposition = incremental_strategy.value
646
+
647
+ if factory.source_scheme == "influxdb":
648
+ if primary_key:
649
+ write_disposition = "merge"
650
+
651
+ start_time = datetime.now()
652
+
653
+ def run_pipeline():
654
+ return pipeline.run(
655
+ dlt_source,
656
+ **destination.dlt_run_params(
657
+ uri=dest_uri,
658
+ table=dest_table,
659
+ staging_bucket=staging_bucket,
660
+ ),
661
+ write_disposition=write_disposition, # type: ignore
662
+ primary_key=(
663
+ primary_key if primary_key and len(primary_key) > 0 else None
664
+ ), # type: ignore
665
+ loader_file_format=(
666
+ loader_file_format.value if loader_file_format is not None else None # type: ignore
667
+ ), # type: ignore
668
+ )
669
+
670
+ # Databricks concurrency error patterns that are safe to retry
671
+ DATABRICKS_RETRYABLE_ERRORS = [
672
+ "SCHEMA_ALREADY_EXISTS",
673
+ "DELTA_METADATA_CHANGED",
674
+ "MetadataChangedException",
675
+ ]
676
+
677
+ def is_databricks_retryable_error(exception: Exception) -> bool:
678
+ if factory.destination_scheme != "databricks":
679
+ return False
680
+ error_str = str(exception)
681
+ return any(pattern in error_str for pattern in DATABRICKS_RETRYABLE_ERRORS)
682
+
683
+ max_retries = 3
684
+ for attempt in range(max_retries):
685
+ try:
686
+ run_info: LoadInfo = run_pipeline()
687
+ break
688
+ except PipelineStepFailed as e:
689
+ if is_databricks_retryable_error(e) and attempt < max_retries - 1:
690
+ delay = (attempt + 1) * 2 # 2s, 4s backoff
691
+ print(
692
+ f"[yellow]Databricks concurrency error, retrying in {delay}s (attempt {attempt + 1}/{max_retries})...[/yellow]"
693
+ )
694
+ time.sleep(delay)
695
+ continue
696
+ raise
697
+
698
+ report_errors(run_info)
699
+
700
+ destination.post_load()
701
+
702
+ end_time = datetime.now()
703
+ elapsedHuman = ""
704
+ elapsed = end_time - start_time
705
+ elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
706
+
707
+ if is_pipelines_dir_temp:
708
+ import shutil
709
+
710
+ shutil.rmtree(pipelines_dir)
711
+
712
+ print(
713
+ f"[bold green]Successfully finished loading data from '{factory.source_scheme}' to '{factory.destination_scheme}' {elapsedHuman} [/bold green]"
714
+ )
715
+ print()
716
+
717
+ except Exception:
718
+ raise
719
+
720
+
721
@app.command()
def example_uris():
    """Print example connection URIs for a handful of sources and destinations.

    For every listed database the command prints a blank line, a line with the
    database name and a URI template, and a line linking to the documentation
    of the corresponding driver/dialect. A short intro and outro sentence are
    emitted through ``typer.echo``.

    The strings carry ``[style]...[/style]`` markup tags; ``print`` is resolved
    from the enclosing module (presumably a markup-aware print such as rich's
    -- confirm against the module imports).
    """
    # (name + URI template line, documentation link line), in display order.
    # NOTE: file-based SQLAlchemy URLs (DuckDB, SQLite) need three slashes
    # ("scheme:///path"); with only two, the first path segment would be
    # parsed as a host name.
    examples = (
        (
            "[bold green]Postgres:[/bold green] [white]postgres://user:password@host:port/dbname?sslmode=require [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#postgresql[/white dim]",
        ),
        (
            "[bold green]BigQuery:[/bold green] [white]bigquery://project-id?credentials_path=/path/to/credentials.json&location=US [/white]",
            "[white dim]└── https://github.com/googleapis/python-bigquery-sqlalchemy?tab=readme-ov-file#connection-string-parameters[/white dim]",
        ),
        (
            "[bold green]Snowflake:[/bold green] [white]snowflake://user:password@account/dbname?warehouse=COMPUTE_WH [/white]",
            "[white dim]└── https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#connection-parameters[/white dim]",
        ),
        (
            "[bold green]Redshift:[/bold green] [white]redshift://user:password@host:port/dbname?sslmode=require [/white]",
            "[white dim]└── https://aws.amazon.com/blogs/big-data/use-the-amazon-redshift-sqlalchemy-dialect-to-interact-with-amazon-redshift/[/white dim]",
        ),
        (
            "[bold green]Databricks:[/bold green] [white]databricks://token:<access_token>@<server_hostname>?http_path=<http_path>&catalog=<catalog>&schema=<schema>[/white]",
            "[white dim]└── https://docs.databricks.com/en/dev-tools/sqlalchemy.html[/white dim]",
        ),
        (
            "[bold green]Microsoft SQL Server:[/bold green] [white]mssql://user:password@host:port/dbname?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#microsoft-sql-server[/white dim]",
        ),
        (
            "[bold green]MySQL:[/bold green] [white]mysql://user:password@host:port/dbname [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#mysql[/white dim]",
        ),
        (
            "[bold green]DuckDB:[/bold green] [white]duckdb:///path/to/database [/white]",
            "[white dim]└── https://github.com/Mause/duckdb_engine[/white dim]",
        ),
        (
            "[bold green]SQLite:[/bold green] [white]sqlite:///path/to/database [/white]",
            "[white dim]└── https://docs.sqlalchemy.org/en/20/core/engines.html#sqlite[/white dim]",
        ),
    )

    print()
    typer.echo(
        "Following are some example URI formats for supported sources and destinations:"
    )

    # Each entry is preceded by a blank line, exactly as in the unrolled form.
    for uri_line, docs_line in examples:
        print()
        print(uri_line)
        print(docs_line)

    print()
    typer.echo(
        "These are all coming from SQLAlchemy's URI format, so they should be familiar to most users."
    )
796
+
797
+
798
@app.command()
def version():
    """Print the package version, prefixed with ``v``."""
    # Imported inside the command so the version module is only loaded when
    # this command actually runs.
    from omniload.src.version import __version__ as installed_version  # type: ignore

    banner = "v{}".format(installed_version)
    print(banner)
803
+
804
+
805
def main():
    """Program entry point: hand control to the module-level ``app`` CLI object."""
    app()
807
+
808
+
809
# Start the CLI only when this file is executed directly as a script;
# importing the module must not trigger a run.
if __name__ == "__main__":
    main()