ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/conftest.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ import tempfile
+ from concurrent.futures import ThreadPoolExecutor
+
+ import pytest
+ from main_test import DESTINATIONS, SOURCES  # type: ignore
+
+
+ def pytest_configure(config):
+     if is_master(config):
+         config.shared_directory = tempfile.mkdtemp()
+
+
+ def pytest_configure_node(node):
+     """xdist hook"""
+     node.workerinput["shared_directory"] = node.config.shared_directory
+
+
+ @pytest.fixture(scope="session")
+ def shared_directory(request):
+     if is_master(request.config):
+         return request.config.shared_directory
+     else:
+         return request.config.workerinput["shared_directory"]
+
+
+ def is_master(config):
+     """True if the code running the given pytest.config object is running in a xdist master
+     node or not running xdist at all.
+     """
+     return not hasattr(config, "workerinput")
+
+
+ def start_containers(config):
+     if hasattr(config, "workerinput"):
+         return
+
+     unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+     for container in unique_containers:
+         container.container_lock_dir = config.shared_directory
+
+     with ThreadPoolExecutor() as executor:
+         for container in unique_containers:
+             executor.submit(container.start_fully)
+         # futures = [
+         #     executor.submit(container.start_fully) for container in unique_containers
+         # ]
+         # # Wait for all futures to complete
+         # for future in futures:
+         #     future.result()
+
+
+ def stop_containers(config):
+     if hasattr(config, "workerinput"):
+         return
+
+     should_manage_containers = os.environ.get("PYTEST_XDIST_WORKER", "gw0") == "gw0"
+     if not should_manage_containers:
+         return
+
+     unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+
+     for container in unique_containers:
+         container.stop_fully()
+
+
+ def pytest_sessionstart(session):
+     start_containers(session.config)
+
+
+ def pytest_sessionfinish(session, exitstatus):
+     stop_containers(session.config)
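
Note: the conftest above wires pytest-xdist so the master node creates one shared temporary directory and every worker reads it back from workerinput. A minimal, hypothetical test file (not part of this package) that consumes the session-scoped shared_directory fixture could look like this:

    # test_shared_directory_example.py -- illustrative only
    import os


    def test_workers_share_one_directory(shared_directory):
        # every xdist worker resolves the same directory created by the master node
        marker = os.path.join(shared_directory, "seen.txt")
        with open(marker, "a") as handle:
            handle.write(os.environ.get("PYTEST_XDIST_WORKER", "master") + "\n")
        assert os.path.isdir(shared_directory)

Running it with pytest -n 2 exercises the pytest_configure_node hook and the container start/stop hooks defined above.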
ingestr/main.py CHANGED
@@ -1,16 +1,22 @@
+ import warnings
  from datetime import datetime
  from enum import Enum
  from typing import Optional

  import typer
- from dlt.common.runtime.collector import Collector
  from rich.console import Console
- from rich.status import Status
  from typing_extensions import Annotated

- from ingestr.src.filters import cast_set_to_list
  from ingestr.src.telemetry.event import track

+ try:
+     from duckdb_engine import DuckDBEngineWarning
+
+     warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+ except ImportError:
+     # duckdb-engine not installed
+     pass
+
  app = typer.Typer(
      name="ingestr",
      help="ingestr is the CLI tool to ingest data from one source to another",
@@ -32,56 +38,18 @@ DATE_FORMATS = [

  # https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
  PARQUET_SUPPORTED_DESTINATIONS = [
-     "athena" "bigquery",
+     "athenabigquery",
      "duckdb",
      "snowflake",
      "databricks",
      "synapse",
+     "s3",
  ]

  # these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
  JSON_RETURNING_SOURCES = ["notion"]


- class SpinnerCollector(Collector):
-     status: Status
-     current_step: str
-     started: bool
-
-     def __init__(self) -> None:
-         self.status = Status("Ingesting data...", spinner="dots")
-         self.started = False
-
-     def update(
-         self,
-         name: str,
-         inc: int = 1,
-         total: Optional[int] = None,
-         message: Optional[str] = None,  # type: ignore
-         label: str = "",
-         **kwargs,
-     ) -> None:
-         self.status.update(self.current_step)
-
-     def _start(self, step: str) -> None:
-         self.current_step = self.__step_to_label(step)
-         self.status.start()
-
-     def __step_to_label(self, step: str) -> str:
-         verb = step.split(" ")[0].lower()
-         if verb.startswith("normalize"):
-             return "Normalizing the data"
-         elif verb.startswith("load"):
-             return "Loading the data to the destination"
-         elif verb.startswith("extract"):
-             return "Extracting the data from the source"
-
-         return f"{verb.capitalize()} the data"
-
-     def _stop(self) -> None:
-         self.status.stop()
-
-
  class IncrementalStrategy(str, Enum):
      create_replace = "replace"
      append = "append"
@@ -99,6 +67,7 @@ class LoaderFileFormat(str, Enum):


  class SqlBackend(str, Enum):
+     default = "default"
      sqlalchemy = "sqlalchemy"
      pyarrow = "pyarrow"
      connectorx = "connectorx"
@@ -124,40 +93,44 @@ class SqlReflectionLevel(str, Enum):
  def ingest(
      source_uri: Annotated[
          str,
-         typer.Option(help="The URI of the [green]source[/green]", envvar="SOURCE_URI"),
+         typer.Option(
+             help="The URI of the [green]source[/green]",
+             envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"],
+         ),
      ],  # type: ignore
      dest_uri: Annotated[
          str,
          typer.Option(
-             help="The URI of the [cyan]destination[/cyan]", envvar="DESTINATION_URI"
+             help="The URI of the [cyan]destination[/cyan]",
+             envvar=["DESTINATION_URI", "INGESTR_DESTINATION_URI"],
          ),
      ],  # type: ignore
      source_table: Annotated[
          str,
          typer.Option(
              help="The table name in the [green]source[/green] to fetch",
-             envvar="SOURCE_TABLE",
+             envvar=["SOURCE_TABLE", "INGESTR_SOURCE_TABLE"],
          ),
      ],  # type: ignore
      dest_table: Annotated[
          str,
          typer.Option(
              help="The table in the [cyan]destination[/cyan] to save the data into",
-             envvar="DESTINATION_TABLE",
+             envvar=["DESTINATION_TABLE", "INGESTR_DESTINATION_TABLE"],
          ),
      ] = None,  # type: ignore
      incremental_key: Annotated[
          Optional[str],
          typer.Option(
              help="The incremental key from the table to be used for incremental strategies",
-             envvar="INCREMENTAL_KEY",
+             envvar=["INCREMENTAL_KEY", "INGESTR_INCREMENTAL_KEY"],
          ),
      ] = None,  # type: ignore
      incremental_strategy: Annotated[
          IncrementalStrategy,
          typer.Option(
              help="The incremental strategy to use",
-             envvar="INCREMENTAL_STRATEGY",
+             envvar=["INCREMENTAL_STRATEGY", "INGESTR_INCREMENTAL_STRATEGY"],
          ),
      ] = IncrementalStrategy.create_replace,  # type: ignore
      interval_start: Annotated[
@@ -165,7 +138,7 @@ def ingest(
          typer.Option(
              help="The start of the interval the incremental key will cover",
              formats=DATE_FORMATS,
-             envvar="INTERVAL_START",
+             envvar=["INTERVAL_START", "INGESTR_INTERVAL_START"],
          ),
      ] = None,  # type: ignore
      interval_end: Annotated[
@@ -173,128 +146,149 @@ def ingest(
          typer.Option(
              help="The end of the interval the incremental key will cover",
              formats=DATE_FORMATS,
-             envvar="INTERVAL_END",
+             envvar=["INTERVAL_END", "INGESTR_INTERVAL_END"],
          ),
      ] = None,  # type: ignore
      primary_key: Annotated[
          Optional[list[str]],
          typer.Option(
              help="The key that will be used to deduplicate the resulting table",
-             envvar="PRIMARY_KEY",
+             envvar=["PRIMARY_KEY", "INGESTR_PRIMARY_KEY"],
          ),
      ] = None,  # type: ignore
      partition_by: Annotated[
          Optional[str],
          typer.Option(
              help="The partition key to be used for partitioning the destination table",
-             envvar="PARTITION_BY",
+             envvar=["PARTITION_BY", "INGESTR_PARTITION_BY"],
          ),
      ] = None,  # type: ignore
      cluster_by: Annotated[
          Optional[str],
          typer.Option(
              help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
-             envvar="CLUSTER_BY",
+             envvar=["CLUSTER_BY", "INGESTR_CLUSTER_BY"],
          ),
      ] = None,  # type: ignore
      yes: Annotated[
          Optional[bool],
          typer.Option(
              help="Skip the confirmation prompt and ingest right away",
-             envvar="SKIP_CONFIRMATION",
+             envvar=["SKIP_CONFIRMATION", "INGESTR_SKIP_CONFIRMATION"],
          ),
      ] = False,  # type: ignore
      full_refresh: Annotated[
          bool,
          typer.Option(
              help="Ignore the state and refresh the destination table completely",
-             envvar="FULL_REFRESH",
+             envvar=["FULL_REFRESH", "INGESTR_FULL_REFRESH"],
          ),
      ] = False,  # type: ignore
      progress: Annotated[
          Progress,
          typer.Option(
              help="The progress display type, must be one of 'interactive', 'log'",
-             envvar="PROGRESS",
+             envvar=["PROGRESS", "INGESTR_PROGRESS"],
          ),
      ] = Progress.interactive,  # type: ignore
      sql_backend: Annotated[
          SqlBackend,
          typer.Option(
              help="The SQL backend to use",
-             envvar="SQL_BACKEND",
+             envvar=["SQL_BACKEND", "INGESTR_SQL_BACKEND"],
          ),
-     ] = SqlBackend.pyarrow,  # type: ignore
+     ] = SqlBackend.default,  # type: ignore
      loader_file_format: Annotated[
          Optional[LoaderFileFormat],
          typer.Option(
              help="The file format to use when loading data",
-             envvar="LOADER_FILE_FORMAT",
+             envvar=["LOADER_FILE_FORMAT", "INGESTR_LOADER_FILE_FORMAT"],
          ),
      ] = None,  # type: ignore
      page_size: Annotated[
          Optional[int],
          typer.Option(
              help="The page size to be used when fetching data from SQL sources",
-             envvar="PAGE_SIZE",
+             envvar=["PAGE_SIZE", "INGESTR_PAGE_SIZE"],
          ),
      ] = 50000,  # type: ignore
      loader_file_size: Annotated[
          Optional[int],
          typer.Option(
              help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
-             envvar="LOADER_FILE_SIZE",
+             envvar=["LOADER_FILE_SIZE", "INGESTR_LOADER_FILE_SIZE"],
          ),
      ] = 100000,  # type: ignore
      schema_naming: Annotated[
          SchemaNaming,
          typer.Option(
              help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
-             envvar="SCHEMA_NAMING",
+             envvar=["SCHEMA_NAMING", "INGESTR_SCHEMA_NAMING"],
          ),
      ] = SchemaNaming.default,  # type: ignore
      pipelines_dir: Annotated[
          Optional[str],
          typer.Option(
              help="The path to store dlt-related pipeline metadata. By default, ingestr will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
-             envvar="PIPELINES_DIR",
+             envvar=["PIPELINES_DIR", "INGESTR_PIPELINES_DIR"],
          ),
      ] = None,  # type: ignore
      extract_parallelism: Annotated[
          Optional[int],
          typer.Option(
              help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
-             envvar="EXTRACT_PARALLELISM",
+             envvar=["EXTRACT_PARALLELISM", "INGESTR_EXTRACT_PARALLELISM"],
          ),
      ] = 5,  # type: ignore
      sql_reflection_level: Annotated[
          SqlReflectionLevel,
          typer.Option(
              help="The reflection level to use when reflecting the table schema from the source",
-             envvar="SQL_REFLECTION_LEVEL",
+             envvar=["SQL_REFLECTION_LEVEL", "INGESTR_SQL_REFLECTION_LEVEL"],
          ),
      ] = SqlReflectionLevel.full,  # type: ignore
      sql_limit: Annotated[
          Optional[int],
          typer.Option(
              help="The limit to use when fetching data from the source",
-             envvar="SQL_LIMIT",
+             envvar=["SQL_LIMIT", "INGESTR_SQL_LIMIT"],
          ),
      ] = None,  # type: ignore
      sql_exclude_columns: Annotated[
          Optional[list[str]],
          typer.Option(
              help="The columns to exclude from the source table",
-             envvar="SQL_EXCLUDE_COLUMNS",
+             envvar=["SQL_EXCLUDE_COLUMNS", "INGESTR_SQL_EXCLUDE_COLUMNS"],
          ),
      ] = [],  # type: ignore
      columns: Annotated[
          Optional[list[str]],
          typer.Option(
              help="The column types to be used for the destination table in the format of 'column_name:column_type'",
-             envvar="COLUMNS",
+             envvar=["INGESTR_COLUMNS"],
+         ),
+     ] = None,  # type: ignore
+     yield_limit: Annotated[
+         Optional[int],
+         typer.Option(
+             help="Limit the number of pages yielded from the source",
+             envvar=["YIELD_LIMIT", "INGESTR_YIELD_LIMIT"],
+         ),
+     ] = None,  # type: ignore
+     staging_bucket: Annotated[
+         Optional[str],
+         typer.Option(
+             help="The staging bucket to be used for the ingestion, must be prefixed with 'gs://' or 's3://'",
+             envvar=["STAGING_BUCKET", "INGESTR_STAGING_BUCKET"],
          ),
      ] = None,  # type: ignore
+     mask: Annotated[
+         Optional[list[str]],
+         typer.Option(
+             help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
+             envvar=["MASK", "INGESTR_MASK"],
+         ),
+     ] = [],  # type: ignore
  ):
      import hashlib
      import tempfile
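
The recurring change in this hunk is that every option now passes a list to envvar, so the legacy variable name and an INGESTR_-prefixed one are both honored; typer (via click) checks the names in order and uses the first one that is set. A minimal, standalone sketch of the same mechanism, not ingestr code:

    # envvar_demo.py -- illustrative sketch of the envvar-list pattern above
    import typer
    from typing_extensions import Annotated

    app = typer.Typer()


    @app.command()
    def show(
        source_uri: Annotated[
            str,
            typer.Option(envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"]),
        ],
    ):
        # prints the value regardless of which of the two variables was set
        typer.echo(source_uri)


    if __name__ == "__main__":
        app()

Setting INGESTR_SOURCE_URI behaves the same as setting SOURCE_URI or passing --source-uri.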
@@ -303,14 +297,22 @@ def ingest(
      import dlt
      import humanize
      import typer
-     from dlt.common.data_types import TDataType
-     from dlt.common.destination import Destination
      from dlt.common.pipeline import LoadInfo
      from dlt.common.runtime.collector import Collector, LogCollector
      from dlt.common.schema.typing import TColumnSchema

+     import ingestr.src.partition as partition
+     import ingestr.src.resource as resource
+     from ingestr.src.collector.spinner import SpinnerCollector
+     from ingestr.src.destinations import AthenaDestination
      from ingestr.src.factory import SourceDestinationFactory
-     from ingestr.src.telemetry.event import track
+     from ingestr.src.filters import (
+         cast_set_to_list,
+         cast_spanner_types,
+         create_masking_filter,
+         handle_mysql_empty_dates,
+     )
+     from ingestr.src.sources import MongoDbSource

      def report_errors(run_info: LoadInfo):
          for load_package in run_info.load_packages:
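
create_masking_filter, imported here, backs the new --mask option ('column:algorithm[:param]'); ingestr/src/masking.py itself is not part of this excerpt. A hedged, hypothetical sketch of a masking callback of that shape — the parsing and the sha256 choice are assumptions, not the package's actual implementation:

    # hypothetical sketch of a column-masking add_map callback
    import hashlib
    from typing import Any, Dict, List


    def build_masking_map(specs: List[str]):
        # specs look like ["email:hash", "ssn:hash"] in this sketch
        masked_columns = {spec.split(":", 1)[0] for spec in specs}

        def mask_row(row: Dict[str, Any]) -> Dict[str, Any]:
            for column in masked_columns:
                if row.get(column) is not None:
                    row[column] = hashlib.sha256(str(row[column]).encode()).hexdigest()
            return row

        return mask_row

main.py later attaches the real filter to every resource with add_map, in the same way this sketch would be attached.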
@@ -345,7 +347,7 @@ def ingest(
          return (source_table, dest_table)

      def validate_loader_file_format(
-         dlt_dest: Destination, loader_file_format: Optional[LoaderFileFormat]
+         dlt_dest, loader_file_format: Optional[LoaderFileFormat]
      ):
          if (
              loader_file_format
@@ -357,17 +359,11 @@ def ingest(
              )
              raise typer.Abort()

-     def run_on_resource(source, executable):
-         if hasattr(source, "selected_resources") and source.selected_resources:
-             resource_names = list(source.selected_resources.keys())
-             for res in resource_names:
-                 executable(source.resources[res])
-         else:
-             executable(source)
-
-     def parse_columns(columns: list[str]) -> dict[str, TDataType]:
+     def parse_columns(columns: list[str]) -> dict:
          from typing import cast, get_args

+         from dlt.common.data_types import TDataType
+
          possible_types = get_args(TDataType)

          types: dict[str, TDataType] = {}
@@ -400,6 +396,7 @@ def ingest(
      dlt.config["data_writer.file_max_items"] = loader_file_size
      dlt.config["extract.workers"] = extract_parallelism
      dlt.config["extract.max_parallel_items"] = extract_parallelism
+     dlt.config["load.raise_on_max_retries"] = 15
      if schema_naming != SchemaNaming.default:
          dlt.config["schema.naming"] = schema_naming.value

@@ -451,7 +448,9 @@ def ingest(
          pipelines_dir = tempfile.mkdtemp()
          is_pipelines_dir_temp = True

-     dlt_dest = destination.dlt_dest(uri=dest_uri, dest_table=dest_table)
+     dlt_dest = destination.dlt_dest(
+         uri=dest_uri, dest_table=dest_table, staging_bucket=staging_bucket
+     )
      validate_loader_file_format(dlt_dest, loader_file_format)

      if partition_by:
@@ -473,7 +472,7 @@ def ingest(

          column_hints[key]["primary_key"] = True

-     pipeline = dlt.pipeline(
+     pipeline = dlt.pipeline(  # type: ignore
          pipeline_name=m.hexdigest(),
          destination=dlt_dest,
          progress=progressInstance,
@@ -510,6 +509,7 @@ def ingest(
      print(
          f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
      )
+     print(f"[bold yellow] Pipeline ID:[/bold yellow] {m.hexdigest()}")
      print()

      if not yes:
@@ -539,6 +539,15 @@ def ingest(
      if interval_end:
          interval_end = interval_end.date()  # type: ignore

+     if factory.source_scheme.startswith("spanner"):
+         # we tend to use the 'pyarrow' backend in general, however, it has issues with JSON objects, so we override it to 'sqlalchemy' for Spanner.
+         if sql_backend.value == SqlBackend.default:
+             sql_backend = SqlBackend.sqlalchemy
+
+     # this allows us to identify the cases where the user does not have a preference, so that for some sources we can override it.
+     if sql_backend == SqlBackend.default:
+         sql_backend = SqlBackend.pyarrow
+
      dlt_source = source.dlt_source(
          uri=source_uri,
          table=source_table,
@@ -551,22 +560,55 @@ def ingest(
          sql_reflection_level=sql_reflection_level.value,
          sql_limit=sql_limit,
          sql_exclude_columns=sql_exclude_columns,
+         extract_parallelism=extract_parallelism,
      )

-     run_on_resource(dlt_source, lambda x: x.add_map(cast_set_to_list))
+     resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
+     if factory.source_scheme.startswith("mysql"):
+         resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
+
+     if factory.source_scheme.startswith("spanner"):
+         resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
+
+     if factory.source_scheme.startswith(
+         "mmap"
+     ) and factory.destination_scheme.startswith("clickhouse"):
+         # https://github.com/dlt-hub/dlt/issues/2248
+         # TODO(turtledev): only apply for write dispositions that actually cause an exception.
+         # TODO(turtledev): make batch size configurable
+         import ingestr.src.arrow as arrow
+
+         resource.for_each(dlt_source, lambda x: x.add_map(arrow.as_list))
+
+     if mask:
+         masking_filter = create_masking_filter(mask)
+         resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
+
+     if yield_limit:
+         resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
+
+     if isinstance(source, MongoDbSource):
+         from ingestr.src.resource import TypeHintMap
+
+         resource.for_each(
+             dlt_source, lambda x: x.add_map(TypeHintMap().type_hint_map)
+         )

      def col_h(x):
          if column_hints:
              x.apply_hints(columns=column_hints)

-     run_on_resource(dlt_source, col_h)
+     resource.for_each(dlt_source, col_h)
+
+     if isinstance(destination, AthenaDestination) and partition_by:
+         partition.apply_athena_hints(dlt_source, partition_by, column_hints)

      if original_incremental_strategy == IncrementalStrategy.delete_insert:

          def set_primary_key(x):
              x.incremental.primary_key = ()

-         run_on_resource(dlt_source, set_primary_key)
+         resource.for_each(dlt_source, set_primary_key)

      if (
          factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
@@ -585,6 +627,10 @@ def ingest(
      if incremental_strategy != IncrementalStrategy.none:
          write_disposition = incremental_strategy.value

+     if factory.source_scheme == "influxdb":
+         if primary_key:
+             write_disposition = "merge"
+
      start_time = datetime.now()

      run_info: LoadInfo = pipeline.run(
@@ -592,6 +638,7 @@ def ingest(
          **destination.dlt_run_params(
              uri=dest_uri,
              table=dest_table,
+             staging_bucket=staging_bucket,
          ),
          write_disposition=write_disposition,  # type: ignore
          primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
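
Every former run_on_resource call above now goes through resource.for_each from the new ingestr/src/resource.py, which is not shown in this diff. Judging from the removed run_on_resource helper, a sketch of what for_each presumably does — the body is inferred, the real module may differ:

    # inferred sketch of ingestr.src.resource.for_each, based on the removed
    # run_on_resource helper; not copied from the new module
    def for_each(source, executable):
        # apply the callable to every selected resource of a dlt source,
        # or to the object itself when it exposes no selected resources
        if hasattr(source, "selected_resources") and source.selected_resources:
            for name in list(source.selected_resources.keys()):
                executable(source.resources[name])
        else:
            executable(source)

This is what lets main.py attach add_map, add_limit, and hint callbacks uniformly, e.g. resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list)).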
ingestr/src/adjust/__init__.py CHANGED
@@ -46,7 +46,7 @@ def adjust_source(
      filters: Optional[dict] = None,
  ) -> Sequence[DltResource]:
      @dlt.resource(write_disposition="merge", merge_key="day")
-     def campaigns():
+     def campaigns() -> DltResource:
          adjust_api = AdjustAPI(api_key=api_key)
          yield from adjust_api.fetch_report_data(
              start_date=start_date,
@@ -57,12 +57,12 @@ def adjust_source(
          )

      @dlt.resource(write_disposition="replace", primary_key="id")
-     def events():
+     def events() -> DltResource:
          adjust_api = AdjustAPI(api_key=api_key)
          yield adjust_api.fetch_events()

      @dlt.resource(write_disposition="merge", merge_key="day")
-     def creatives():
+     def creatives() -> DltResource:
          adjust_api = AdjustAPI(api_key=api_key)
          yield from adjust_api.fetch_report_data(
              start_date=start_date,
@@ -95,7 +95,7 @@ def adjust_source(
          primary_key=dimensions,
          columns=type_hints,
      )
-     def custom():
+     def custom() -> DltResource:
          adjust_api = AdjustAPI(api_key=api_key)
          yield from adjust_api.fetch_report_data(
              start_date=start_date,
ingestr/src/adjust/adjust_helpers.py CHANGED
@@ -36,7 +36,7 @@ class AdjustAPI:
      def __init__(self, api_key):
          self.api_key = api_key
          self.request_client = Client(
-             request_timeout=8.0,
+             request_timeout=1000,  # Adjust support recommends 1000 seconds of read timeout.
              raise_for_status=False,
              retry_condition=retry_on_limit,
              request_max_attempts=12,
@@ -82,7 +82,9 @@ class AdjustAPI:
              items = result.get("rows", [])
              yield items
          else:
-             raise HTTPError(f"Request failed with status code: {response.status_code}")
+             raise HTTPError(
+                 f"Request failed with status code: {response.status_code}, {response.text}."
+             )

      def fetch_events(self):
          headers = {"Authorization": f"Bearer {self.api_key}"}
@@ -93,7 +95,9 @@ class AdjustAPI:
              result = response.json()
              yield result
          else:
-             raise HTTPError(f"Request failed with status code: {response.status_code}")
+             raise HTTPError(
+                 f"Request failed with status code: {response.status_code}, {response.text}."
+             )


  def parse_filters(filters_raw: str) -> dict:
ingestr/src/airtable/__init__.py CHANGED
@@ -9,7 +9,7 @@ import pyairtable
  from dlt.sources import DltResource


- @dlt.source
+ @dlt.source(max_table_nesting=1)
  def airtable_source(
      base_id: str = dlt.config.value,
      table_names: Optional[List[str]] = dlt.config.value,
@@ -50,12 +50,13 @@ def airtable_resource(
          It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
          table (Dict[str, Any]): Metadata about an airtable, does not contain the actual records
      """
+
      primary_key_id = table["primaryFieldId"]
      primary_key_field = [
          field for field in table["fields"] if field["id"] == primary_key_id
      ][0]
      table_name: str = table["name"]
-     primary_key: List[str] = [primary_key_field["name"]]
+     primary_key: List[str] = [f"fields__{primary_key_field['name']}".lower()]
      air_table = api.table(base_id, table["id"])

      # Table.iterate() supports rich customization options, such as chunk size, fields, cell format, timezone, locale, and view