omniload 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218) hide show
  1. omniload/conftest.py +72 -0
  2. omniload/main.py +810 -0
  3. omniload/src/.gitignore +10 -0
  4. omniload/src/adjust/__init__.py +108 -0
  5. omniload/src/adjust/adjust_helpers.py +122 -0
  6. omniload/src/airtable/__init__.py +84 -0
  7. omniload/src/allium/__init__.py +128 -0
  8. omniload/src/anthropic/__init__.py +277 -0
  9. omniload/src/anthropic/helpers.py +525 -0
  10. omniload/src/applovin/__init__.py +316 -0
  11. omniload/src/applovin_max/__init__.py +117 -0
  12. omniload/src/appsflyer/__init__.py +325 -0
  13. omniload/src/appsflyer/client.py +110 -0
  14. omniload/src/appstore/__init__.py +142 -0
  15. omniload/src/appstore/client.py +126 -0
  16. omniload/src/appstore/errors.py +15 -0
  17. omniload/src/appstore/models.py +117 -0
  18. omniload/src/appstore/resources.py +179 -0
  19. omniload/src/arrow/__init__.py +81 -0
  20. omniload/src/asana_source/__init__.py +281 -0
  21. omniload/src/asana_source/helpers.py +30 -0
  22. omniload/src/asana_source/settings.py +158 -0
  23. omniload/src/attio/__init__.py +102 -0
  24. omniload/src/attio/helpers.py +65 -0
  25. omniload/src/blob.py +95 -0
  26. omniload/src/bruin/__init__.py +76 -0
  27. omniload/src/chess/__init__.py +180 -0
  28. omniload/src/chess/helpers.py +35 -0
  29. omniload/src/chess/settings.py +18 -0
  30. omniload/src/clickup/__init__.py +85 -0
  31. omniload/src/clickup/helpers.py +47 -0
  32. omniload/src/collector/spinner.py +43 -0
  33. omniload/src/couchbase_source/__init__.py +118 -0
  34. omniload/src/couchbase_source/helpers.py +135 -0
  35. omniload/src/cursor/__init__.py +83 -0
  36. omniload/src/cursor/helpers.py +188 -0
  37. omniload/src/customer_io/__init__.py +486 -0
  38. omniload/src/customer_io/helpers.py +530 -0
  39. omniload/src/destinations.py +982 -0
  40. omniload/src/docebo/__init__.py +589 -0
  41. omniload/src/docebo/client.py +435 -0
  42. omniload/src/docebo/helpers.py +97 -0
  43. omniload/src/dune/__init__.py +104 -0
  44. omniload/src/dune/helpers.py +108 -0
  45. omniload/src/dynamodb/__init__.py +86 -0
  46. omniload/src/elasticsearch/__init__.py +80 -0
  47. omniload/src/elasticsearch/helpers.py +141 -0
  48. omniload/src/errors.py +26 -0
  49. omniload/src/facebook_ads/__init__.py +403 -0
  50. omniload/src/facebook_ads/exceptions.py +19 -0
  51. omniload/src/facebook_ads/helpers.py +296 -0
  52. omniload/src/facebook_ads/settings.py +224 -0
  53. omniload/src/facebook_ads/utils.py +53 -0
  54. omniload/src/factory.py +305 -0
  55. omniload/src/filesystem/__init__.py +133 -0
  56. omniload/src/filesystem/helpers.py +114 -0
  57. omniload/src/filesystem/readers.py +187 -0
  58. omniload/src/filters.py +62 -0
  59. omniload/src/fireflies/__init__.py +151 -0
  60. omniload/src/fireflies/helpers.py +753 -0
  61. omniload/src/fluxx/__init__.py +10013 -0
  62. omniload/src/fluxx/helpers.py +233 -0
  63. omniload/src/frankfurter/__init__.py +157 -0
  64. omniload/src/frankfurter/helpers.py +48 -0
  65. omniload/src/freshdesk/__init__.py +103 -0
  66. omniload/src/freshdesk/freshdesk_client.py +151 -0
  67. omniload/src/freshdesk/settings.py +23 -0
  68. omniload/src/fundraiseup/__init__.py +95 -0
  69. omniload/src/fundraiseup/client.py +81 -0
  70. omniload/src/github/__init__.py +202 -0
  71. omniload/src/github/helpers.py +207 -0
  72. omniload/src/github/queries.py +129 -0
  73. omniload/src/github/settings.py +24 -0
  74. omniload/src/google_ads/__init__.py +198 -0
  75. omniload/src/google_ads/field.py +17 -0
  76. omniload/src/google_ads/metrics.py +254 -0
  77. omniload/src/google_ads/predicates.py +37 -0
  78. omniload/src/google_ads/reports.py +411 -0
  79. omniload/src/google_ads/test_google_ads.py +184 -0
  80. omniload/src/google_analytics/__init__.py +144 -0
  81. omniload/src/google_analytics/helpers.py +312 -0
  82. omniload/src/google_sheets/README.md +95 -0
  83. omniload/src/google_sheets/__init__.py +166 -0
  84. omniload/src/google_sheets/helpers/__init__.py +15 -0
  85. omniload/src/google_sheets/helpers/api_calls.py +160 -0
  86. omniload/src/google_sheets/helpers/data_processing.py +316 -0
  87. omniload/src/gorgias/__init__.py +595 -0
  88. omniload/src/gorgias/helpers.py +166 -0
  89. omniload/src/hostaway/__init__.py +302 -0
  90. omniload/src/hostaway/client.py +288 -0
  91. omniload/src/http/__init__.py +38 -0
  92. omniload/src/http/readers.py +146 -0
  93. omniload/src/http_client.py +24 -0
  94. omniload/src/hubspot/__init__.py +800 -0
  95. omniload/src/hubspot/helpers.py +417 -0
  96. omniload/src/hubspot/settings.py +329 -0
  97. omniload/src/indeed/__init__.py +153 -0
  98. omniload/src/indeed/helpers.py +228 -0
  99. omniload/src/influxdb/__init__.py +46 -0
  100. omniload/src/influxdb/client.py +34 -0
  101. omniload/src/intercom/__init__.py +142 -0
  102. omniload/src/intercom/helpers.py +674 -0
  103. omniload/src/intercom/settings.py +279 -0
  104. omniload/src/isoc_pulse/__init__.py +159 -0
  105. omniload/src/jira_source/__init__.py +377 -0
  106. omniload/src/jira_source/helpers.py +510 -0
  107. omniload/src/jira_source/settings.py +184 -0
  108. omniload/src/kafka/__init__.py +120 -0
  109. omniload/src/kafka/helpers.py +241 -0
  110. omniload/src/kinesis/__init__.py +153 -0
  111. omniload/src/kinesis/helpers.py +96 -0
  112. omniload/src/klaviyo/__init__.py +237 -0
  113. omniload/src/klaviyo/client.py +212 -0
  114. omniload/src/klaviyo/helpers.py +19 -0
  115. omniload/src/linear/__init__.py +634 -0
  116. omniload/src/linear/helpers.py +111 -0
  117. omniload/src/linkedin_ads/__init__.py +266 -0
  118. omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
  119. omniload/src/linkedin_ads/helpers.py +246 -0
  120. omniload/src/loader.py +69 -0
  121. omniload/src/mailchimp/__init__.py +126 -0
  122. omniload/src/mailchimp/helpers.py +226 -0
  123. omniload/src/mailchimp/settings.py +164 -0
  124. omniload/src/masking.py +344 -0
  125. omniload/src/mixpanel/__init__.py +62 -0
  126. omniload/src/mixpanel/client.py +104 -0
  127. omniload/src/monday/__init__.py +246 -0
  128. omniload/src/monday/helpers.py +392 -0
  129. omniload/src/monday/settings.py +325 -0
  130. omniload/src/mongodb/__init__.py +281 -0
  131. omniload/src/mongodb/helpers.py +975 -0
  132. omniload/src/notion/__init__.py +69 -0
  133. omniload/src/notion/helpers/__init__.py +14 -0
  134. omniload/src/notion/helpers/client.py +178 -0
  135. omniload/src/notion/helpers/database.py +92 -0
  136. omniload/src/notion/settings.py +17 -0
  137. omniload/src/partition.py +32 -0
  138. omniload/src/personio/__init__.py +345 -0
  139. omniload/src/personio/helpers.py +100 -0
  140. omniload/src/phantombuster/__init__.py +65 -0
  141. omniload/src/phantombuster/client.py +87 -0
  142. omniload/src/pinterest/__init__.py +82 -0
  143. omniload/src/pipedrive/__init__.py +212 -0
  144. omniload/src/pipedrive/helpers/__init__.py +37 -0
  145. omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
  146. omniload/src/pipedrive/helpers/pages.py +129 -0
  147. omniload/src/pipedrive/settings.py +41 -0
  148. omniload/src/pipedrive/typing.py +17 -0
  149. omniload/src/plusvibeai/__init__.py +335 -0
  150. omniload/src/plusvibeai/helpers.py +544 -0
  151. omniload/src/plusvibeai/settings.py +252 -0
  152. omniload/src/primer/__init__.py +45 -0
  153. omniload/src/primer/helpers.py +79 -0
  154. omniload/src/quickbooks/__init__.py +117 -0
  155. omniload/src/reddit_ads/__init__.py +183 -0
  156. omniload/src/reddit_ads/helpers.py +232 -0
  157. omniload/src/resource.py +40 -0
  158. omniload/src/revenuecat/__init__.py +83 -0
  159. omniload/src/revenuecat/helpers.py +237 -0
  160. omniload/src/salesforce/__init__.py +170 -0
  161. omniload/src/salesforce/helpers.py +78 -0
  162. omniload/src/shopify/__init__.py +1953 -0
  163. omniload/src/shopify/exceptions.py +17 -0
  164. omniload/src/shopify/helpers.py +202 -0
  165. omniload/src/shopify/settings.py +19 -0
  166. omniload/src/slack/__init__.py +290 -0
  167. omniload/src/slack/helpers.py +218 -0
  168. omniload/src/slack/settings.py +36 -0
  169. omniload/src/smartsheets/__init__.py +82 -0
  170. omniload/src/snapchat_ads/__init__.py +455 -0
  171. omniload/src/snapchat_ads/client.py +72 -0
  172. omniload/src/snapchat_ads/helpers.py +630 -0
  173. omniload/src/snapchat_ads/settings.py +130 -0
  174. omniload/src/socrata_source/__init__.py +83 -0
  175. omniload/src/socrata_source/helpers.py +85 -0
  176. omniload/src/socrata_source/settings.py +8 -0
  177. omniload/src/solidgate/__init__.py +219 -0
  178. omniload/src/solidgate/helpers.py +154 -0
  179. omniload/src/sources.py +5408 -0
  180. omniload/src/sql_database/__init__.py +0 -0
  181. omniload/src/sql_database/callbacks.py +66 -0
  182. omniload/src/stripe_analytics/__init__.py +183 -0
  183. omniload/src/stripe_analytics/helpers.py +386 -0
  184. omniload/src/stripe_analytics/settings.py +80 -0
  185. omniload/src/table_definition.py +15 -0
  186. omniload/src/testdata/fakebqcredentials.json +14 -0
  187. omniload/src/tiktok_ads/__init__.py +150 -0
  188. omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
  189. omniload/src/time.py +11 -0
  190. omniload/src/trustpilot/__init__.py +48 -0
  191. omniload/src/trustpilot/client.py +48 -0
  192. omniload/src/version.py +6 -0
  193. omniload/src/wise/__init__.py +68 -0
  194. omniload/src/wise/client.py +63 -0
  195. omniload/src/zendesk/__init__.py +480 -0
  196. omniload/src/zendesk/helpers/__init__.py +39 -0
  197. omniload/src/zendesk/helpers/api_helpers.py +119 -0
  198. omniload/src/zendesk/helpers/credentials.py +68 -0
  199. omniload/src/zendesk/helpers/talk_api.py +132 -0
  200. omniload/src/zendesk/settings.py +71 -0
  201. omniload/src/zoom/__init__.py +99 -0
  202. omniload/src/zoom/helpers.py +102 -0
  203. omniload/testdata/.gitignore +2 -0
  204. omniload/testdata/create_replace.csv +21 -0
  205. omniload/testdata/delete_insert_expected.csv +6 -0
  206. omniload/testdata/delete_insert_part1.csv +5 -0
  207. omniload/testdata/delete_insert_part2.csv +6 -0
  208. omniload/testdata/merge_expected.csv +5 -0
  209. omniload/testdata/merge_part1.csv +4 -0
  210. omniload/testdata/merge_part2.csv +5 -0
  211. omniload/tests/unit/test_smartsheets.py +133 -0
  212. omniload-0.0.0.dev0.dist-info/METADATA +439 -0
  213. omniload-0.0.0.dev0.dist-info/RECORD +218 -0
  214. omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
  215. omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
  216. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
  217. omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
  218. omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
@@ -0,0 +1,975 @@
1
+ # Copyright 2022-2025 ScaleVector
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Mongo database source helpers and destination utilities"""
16
+
17
+ import re
18
+ from itertools import islice
19
+ from typing import (
20
+ TYPE_CHECKING,
21
+ Any,
22
+ Dict,
23
+ Iterable,
24
+ Iterator,
25
+ List,
26
+ Mapping,
27
+ Optional,
28
+ Tuple,
29
+ Union,
30
+ )
31
+
32
+ import dlt
33
+ from bson.decimal128 import Decimal128
34
+ from bson.objectid import ObjectId
35
+ from bson.regex import Regex
36
+ from bson.timestamp import Timestamp
37
+ from dlt.common import logger
38
+ from dlt.common.configuration.specs import BaseConfiguration, configspec
39
+ from dlt.common.data_writers import TDataItemFormat
40
+ from dlt.common.time import ensure_pendulum_datetime
41
+ from dlt.common.typing import TDataItem
42
+ from dlt.common.utils import map_nested_in_place
43
+ from pendulum import _datetime
44
+ from pymongo import ASCENDING, DESCENDING, MongoClient
45
+ from pymongo.collection import Collection
46
+ from pymongo.cursor import Cursor
47
+ from pymongo.helpers_shared import _fields_list_to_dict
48
+
49
# Type aliases for the pymongo objects used in this module.
# At runtime they collapse to ``Any``; the subscripted pymongo generics are
# only evaluated when a static type checker analyses the module.
if not TYPE_CHECKING:
    TMongoClient = Any
    TCollection = Any
    TCursor = Any
else:
    TMongoClient = MongoClient[Any]
    TCollection = Collection[Any]
    TCursor = Cursor[Any]
57
+
58
# Probe for the optional `pymongoarrow` dependency and record the outcome
# in a module-level flag.
try:
    import pymongoarrow  # type: ignore
except ImportError:
    PYMONGOARROW_AVAILABLE = False
else:
    PYMONGOARROW_AVAILABLE = True
64
+
65
+
66
class CollectionLoader:
    """Sequential loader that reads a MongoDB collection with ``find``.

    Documents are fetched in chunks of ``chunk_size``. When a dlt
    ``incremental`` is supplied, the loader derives an additional filter and
    a sort order from it.
    """

    def __init__(
        self,
        client: TMongoClient,
        collection: TCollection,
        chunk_size: int,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> None:
        """Store the connection objects and the incremental cursor state.

        Args:
            client (TMongoClient): The Mongo client the collection belongs to.
            collection (TCollection): The collection to read from.
            chunk_size (int): The number of documents yielded per chunk.
            incremental (Optional[dlt.sources.incremental[Any]]): The
                incremental configuration, if any.
        """
        self.client = client
        self.collection = collection
        self.incremental = incremental
        self.chunk_size = chunk_size

        if incremental:
            self.cursor_field = incremental.cursor_path
            self.last_value = incremental.last_value
        else:
            # `cursor_field` is the attribute every other method reads, so it
            # must exist even when no incremental is configured.
            self.cursor_field = None
            # Legacy attribute name, kept so existing readers do not break.
            self.cursor_column = None
            self.last_value = None

    @property
    def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
        """Build the sort specification implied by the incremental.

        Returns:
            List[Optional[Tuple[str, int]]]: A single ``(field, direction)``
            pair, or an empty list when there is no incremental, no last
            value, or no matching ``row_order``/``last_value_func`` pair.
        """
        if not self.incremental or not self.last_value:
            return []

        if (
            self.incremental.row_order == "asc"
            and self.incremental.last_value_func is max
        ) or (
            self.incremental.row_order == "desc"
            and self.incremental.last_value_func is min
        ):
            return [(self.cursor_field, ASCENDING)]

        elif (
            self.incremental.row_order == "asc"
            and self.incremental.last_value_func is min
        ) or (
            self.incremental.row_order == "desc"
            and self.incremental.last_value_func is max
        ):
            return [(self.cursor_field, DESCENDING)]

        return []

    @property
    def _filter_op(self) -> Dict[str, Any]:
        """Build a filtering operator.

        Includes a field and the filtering condition for it.

        Returns:
            Dict[str, Any]: A dictionary with the filter operator. A new
            dictionary is built on every access, so callers may mutate it.
        """
        if not (self.incremental and self.last_value):
            return {}

        filt = {}
        if self.incremental.last_value_func is max:
            filt = {self.cursor_field: {"$gte": self.last_value}}
            if self.incremental.end_value:
                filt[self.cursor_field]["$lt"] = self.incremental.end_value

        elif self.incremental.last_value_func is min:
            filt = {self.cursor_field: {"$lte": self.last_value}}
            if self.incremental.end_value:
                filt[self.cursor_field]["$gt"] = self.incremental.end_value

        return filt

    def _primary_key_fields(self) -> List[str]:
        """Return the incremental's primary key as a list of field names.

        The key may be unset, a single name or a sequence of names; anything
        that is not a plain string name is ignored.
        """
        if not self.incremental:
            return []

        primary_key = self.incremental.primary_key
        if isinstance(primary_key, str):
            return [primary_key]
        if isinstance(primary_key, (list, tuple)):
            return [name for name in primary_key if isinstance(name, str)]
        return []

    def _projection_op(
        self, projection: Optional[Union[Mapping[str, Any], Iterable[str]]]
    ) -> Optional[Dict[str, Any]]:
        """Build a projection operator.

        Args:
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): A tuple of fields to include or a dict specifying fields to include or exclude.
                The incremental `primary_key` needs to be handled differently for inclusion
                and exclusion projections.

        Returns:
            Optional[Dict[str, Any]]: A dictionary with the projection operator,
            or None when no projection was requested.
        """
        if projection is None:
            return None

        projection_dict = dict(_fields_list_to_dict(projection, "projection"))

        if self.incremental:
            primary_key_fields = self._primary_key_fields()
            # this is an inclusion projection
            if any(v == 1 for v in projection_dict.values()):
                # ensure primary_key is included; each key field becomes its
                # own projection entry
                for field in primary_key_fields:
                    projection_dict[field] = 1
            # this is an exclusion projection
            else:
                for field in primary_key_fields:
                    # ensure primary_key isn't excluded
                    if field in projection_dict:
                        projection_dict.pop(field)
                        logger.warning(
                            f"Primary key `{field}` was removed from exclusion projection"
                        )

        return projection_dict

    def _limit(self, cursor: Cursor, limit: Optional[int] = None) -> TCursor:  # type: ignore
        """Apply a limit to the cursor, if needed.

        Args:
            cursor (Cursor): The cursor to apply the limit.
            limit (Optional[int]): The number of documents to load. ``0`` and
                ``None`` mean no limit; the absolute value is used otherwise.

        Returns:
            Cursor: The cursor with the limit applied (if given).
        """
        if limit not in (0, None):
            if self.incremental is None or self.incremental.last_value_func is None:
                logger.warning(
                    "Using limit without ordering - results may be inconsistent."
                )

            cursor = cursor.limit(abs(limit))

        return cursor

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Construct the query and load the documents from the collection.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        filter_op = self._filter_op
        _raise_if_intersection(filter_op, filter_)
        filter_op.update(filter_)

        projection_op = self._projection_op(projection)

        cursor = self.collection.find(filter=filter_op, projection=projection_op)
        if self._sort_op:
            cursor = cursor.sort(self._sort_op)

        cursor = self._limit(cursor, limit)

        # Pull at most `chunk_size` documents per iteration until the cursor
        # is exhausted.
        while docs_slice := list(islice(cursor, self.chunk_size)):
            res = map_nested_in_place(convert_mongo_objs, docs_slice)
            # A dict-valued `_id` in the first document triggers a json
            # column hint for the whole chunk.
            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
                yield dlt.mark.with_hints(
                    res,
                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
                )
            else:
                yield res
230
+
231
+
232
class CollectionLoaderParallel(CollectionLoader):
    """Loader that splits the collection into skip/limit batches.

    Each batch is handed to a deferred ``_run_batch`` call.
    """

    def _get_document_count(self) -> int:
        """Count the documents matching the incremental filter."""
        incremental_filter = self._filter_op
        return self.collection.count_documents(filter=incremental_filter)

    def _create_batches(self, limit: Optional[int] = None) -> List[Dict[str, int]]:
        """Describe the batches to load as ``skip``/``limit`` pairs.

        Args:
            limit (Optional[int]): Upper bound on the total number of
                documents; its absolute value is used when given.

        Returns:
            List[Dict[str, int]]: One dictionary per batch, in offset order.
        """
        total = self._get_document_count()
        if limit:
            total = min(total, abs(limit))

        step = self.chunk_size
        # The final batch is capped at whatever remains after the offset.
        return [
            {"skip": offset, "limit": min(step, total - offset)}
            for offset in range(0, total, step)
        ]

    def _get_cursor(
        self,
        filter_: Dict[str, Any],
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> TCursor:
        """Get a reading cursor for the collection.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Returns:
            Cursor: The cursor for the collection.
        """
        query = self._filter_op
        _raise_if_intersection(query, filter_)
        query.update(filter_)

        found = self.collection.find(
            filter=query, projection=self._projection_op(projection)
        )
        ordering = self._sort_op
        return found.sort(ordering) if ordering else found

    @dlt.defer
    def _run_batch(self, cursor: TCursor, batch: Dict[str, int]) -> TDataItem:
        """Load one batch of documents from a clone of the shared cursor.

        Args:
            cursor (TCursor): The template cursor; it is cloned before use.
            batch (Dict[str, int]): The ``skip`` and ``limit`` of the batch.

        Returns:
            TDataItem: The converted documents of the batch.
        """
        window = cursor.clone().skip(batch["skip"]).limit(batch["limit"])
        return [map_nested_in_place(convert_mongo_objs, doc) for doc in window]

    def _get_all_batches(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load all documents from the collection in parallel batches.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The maximum number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        planned = self._create_batches(limit=limit)
        template = self._get_cursor(filter_=filter_, projection=projection)

        for spec in planned:
            yield self._run_batch(cursor=template, batch=spec)

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load documents from the collection in parallel.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        yield from self._get_all_batches(
            limit=limit, filter_=filter_, projection=projection
        )
328
+
329
+
330
class CollectionArrowLoader(CollectionLoader):
    """
    Mongo DB collection loader, which uses
    Apache Arrow for data processing.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[Any]:
        """
        Load documents from the collection in Apache Arrow format.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[Any]: An iterator of the loaded documents.
        """
        from pymongoarrow.context import PyMongoArrowContext  # type: ignore
        from pymongoarrow.lib import process_bson_stream  # type: ignore

        # The merged filter is still built so the overlap check runs.
        merged_filter = self._filter_op
        _raise_if_intersection(merged_filter, filter_)
        merged_filter.update(filter_)

        # NOTE the merged filter isn't passed: the query below is sent with
        # the caller's `filter_` only.
        raw_batches = self.collection.find_raw_batches(
            filter_,
            batch_size=self.chunk_size,
            projection=self._projection_op(projection),
        )
        ordering = self._sort_op
        if ordering:
            raw_batches = raw_batches.sort(ordering)  # type: ignore

        raw_batches = self._limit(raw_batches, limit)  # type: ignore

        arrow_context = PyMongoArrowContext.from_schema(
            schema=pymongoarrow_schema, codec_options=self.collection.codec_options
        )
        # NOTE(review): one table is emitted per raw batch; nesting was
        # reconstructed from a flattened listing - confirm against upstream.
        for raw_batch in raw_batches:
            process_bson_stream(raw_batch, arrow_context)
            yield convert_arrow_columns(arrow_context.finish())
380
+
381
+
382
class CollectionArrowLoaderParallel(CollectionLoaderParallel):
    """
    Mongo DB collection parallel loader, which uses
    Apache Arrow for data processing.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Load documents from the collection in parallel.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        all_batches = self._get_all_batches(
            limit=limit,
            filter_=filter_,
            projection=projection,
            pymongoarrow_schema=pymongoarrow_schema,
        )
        yield from all_batches

    def _get_all_batches(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Load all documents from the collection in parallel batches.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The maximum number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        planned = self._create_batches(limit=limit)
        template = self._get_cursor(filter_=filter_, projection=projection)
        for spec in planned:
            yield self._run_batch(
                cursor=template,
                batch=spec,
                pymongoarrow_schema=pymongoarrow_schema,
            )

    def _get_cursor(
        self,
        filter_: Dict[str, Any],
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> TCursor:
        """Get a reading cursor for the collection.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Returns:
            Cursor: The cursor for the collection.
        """
        query = self._filter_op
        _raise_if_intersection(query, filter_)
        query.update(filter_)

        raw_batches = self.collection.find_raw_batches(
            filter=query,
            batch_size=self.chunk_size,
            projection=self._projection_op(projection),
        )
        ordering = self._sort_op
        if ordering:
            raw_batches = raw_batches.sort(ordering)  # type: ignore

        return raw_batches

    @dlt.defer
    def _run_batch(
        self,
        cursor: TCursor,
        batch: Dict[str, int],
        pymongoarrow_schema: Any = None,
    ) -> TDataItem:
        """Convert one skip/limit window of raw BSON batches to Arrow tables.

        Args:
            cursor (TCursor): The template cursor; it is cloned before use.
            batch (Dict[str, int]): The ``skip`` and ``limit`` of the batch.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            TDataItem: The converted Arrow table of each raw chunk.
        """
        from pymongoarrow.context import PyMongoArrowContext
        from pymongoarrow.lib import process_bson_stream

        window = cursor.clone()

        arrow_context = PyMongoArrowContext.from_schema(
            schema=pymongoarrow_schema, codec_options=self.collection.codec_options
        )
        # NOTE(review): one table is emitted per raw chunk; nesting was
        # reconstructed from a flattened listing - confirm against upstream.
        for raw_chunk in window.skip(batch["skip"]).limit(batch["limit"]):
            process_bson_stream(raw_chunk, arrow_context)
            yield convert_arrow_columns(arrow_context.finish())
487
+
488
+
489
class CollectionAggregationLoader(CollectionLoader):
    """
    MongoDB collection loader that uses aggregation pipelines instead of find queries.
    """

    def __init__(
        self,
        client: TMongoClient,
        collection: TCollection,
        chunk_size: int,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> None:
        """Initialise the base loader with no custom pipeline set."""
        super().__init__(client, collection, chunk_size, incremental)
        self.custom_query: Optional[List[Dict[str, Any]]] = None

    def set_custom_query(self, query: List[Dict[str, Any]]):
        """Set the custom aggregation pipeline query"""
        self.custom_query = query

    @staticmethod
    def _convert_chunk(rows: List[Any]) -> TDataItem:
        """Convert a chunk of documents and attach the `_id` json hint if needed."""
        converted = map_nested_in_place(convert_mongo_objs, rows)
        has_nested_id = (
            len(converted) > 0
            and "_id" in converted[0]
            and isinstance(converted[0]["_id"], dict)
        )
        if not has_nested_id:
            return converted
        return dlt.mark.with_hints(
            converted,
            dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
        )

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load documents using aggregation pipeline"""
        if not self.custom_query:
            # Without a custom pipeline, behave like the find-based parent.
            yield from super().load_documents(filter_, limit, projection)
            return

        # Incremental filtering is assumed to be handled by the custom query
        # itself through its interval placeholders (:interval_start,
        # :interval_end), so no incremental stage is added here.
        # A caller-supplied filter becomes a leading `$match` stage; the
        # custom query is copied, never mutated.
        leading = [{"$match": filter_}] if filter_ else []
        pipeline = leading + list(self.custom_query)

        # Only a positive limit adds a trailing `$limit` stage.
        if limit and limit > 0:
            pipeline.append({"$limit": limit})

        # maxTimeMS bounds the server-side run time (30 seconds).
        cursor = self.collection.aggregate(
            pipeline,
            allowDiskUse=True,
            batchSize=min(self.chunk_size, 101),
            maxTimeMS=30000,  # 30 second timeout
        )

        pending = []
        try:
            for doc in cursor:
                pending.append(doc)
                if len(pending) < self.chunk_size:
                    continue
                yield self._convert_chunk(pending)
                pending = []

            # Flush whatever did not fill a complete chunk.
            if pending:
                yield self._convert_chunk(pending)
        finally:
            cursor.close()
576
+
577
+
578
class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
    """
    MongoDB collection parallel loader that uses aggregation pipelines.
    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
    Falls back to sequential loading.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Warn about the fallback, then delegate to the sequential loader.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): Forwarded unchanged to the parent loader.

        Yields:
            Iterator[TDataItem]: The chunks produced by the parent loader.
        """
        logger.warning(
            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
        )
        sequential = super().load_documents(filter_, limit, projection)
        yield from sequential
596
+
597
+
598
class CollectionAggregationArrowLoader(CollectionAggregationLoader):
    """
    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[Any]:
        """Load documents with the aggregation loader and convert each chunk to Arrow.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): Forwarded unchanged to the parent loader.
            pymongoarrow_schema (Any): Accepted for signature parity; not read by this method.

        Yields:
            Iterator[Any]: An Arrow table per non-empty chunk, or the plain
            chunk when an ImportError is raised during conversion.
        """
        logger.warning(
            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
        )

        # Chunks come from the object-based loader and are converted afterwards.
        for chunk in super().load_documents(filter_, limit, projection):
            if not chunk:
                # Empty chunks are dropped.
                continue
            try:
                from dlt.common.libs.pyarrow import pyarrow

                # Build the table from the list of dicts.
                yield convert_arrow_columns(pyarrow.Table.from_pylist(chunk))
            except ImportError:
                logger.warning(
                    "PyArrow not available, falling back to object format"
                )
                yield chunk
629
+
630
+
631
class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
    """
    Arrow aggregation loader used when parallel loading is requested.

    Parallel loading is not available for aggregation pipelines, so this
    loader logs a warning and delegates to the sequential Arrow loader.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Warn that parallelism is unavailable, then load sequentially.

        Args:
            filter_: Filter forwarded unchanged to the parent loader.
            limit: Document limit forwarded unchanged to the parent loader.
            projection: Projection forwarded unchanged to the parent loader.
            pymongoarrow_schema: Schema forwarded unchanged to the parent loader.

        Yields:
            Every item produced by the parent `load_documents`.
        """
        fallback_notice = "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
        logger.warning(fallback_notice)

        sequential_batches = super().load_documents(
            filter_, limit, projection, pymongoarrow_schema
        )
        yield from sequential_batches
651
+
652
+
653
def collection_documents(
    client: TMongoClient,
    collection: TCollection,
    filter_: Dict[str, Any],
    projection: Union[Dict[str, Any], List[str]],
    pymongoarrow_schema: "pymongoarrow.schema.Schema",
    incremental: Optional[dlt.sources.incremental[Any]] = None,
    parallel: bool = False,
    limit: Optional[int] = None,
    chunk_size: Optional[int] = 10000,
    data_item_format: Optional[TDataItemFormat] = "object",
    custom_query: Optional[List[Dict[str, Any]]] = None,
) -> Iterator[TDataItem]:
    """
    Yield the documents of a single MongoDB collection.

    A loader class is picked from three switches (custom aggregation query,
    parallel loading, Arrow output), instantiated, and its `load_documents`
    generator is forwarded to the caller.

    Args:
        client (MongoClient): The PyMongo client `pymongo.MongoClient` instance.
        collection (Collection): The collection `pymongo.collection.Collection` to load.
        filter_ (Dict[str, Any]): The filter to apply to the collection.
        projection (Union[Dict[str, Any], List[str]]): The projection to select fields
            when loading the collection. Supported inputs:
                include (list) - ["year", "title"]
                include (dict) - {"year": True, "title": True}
                exclude (dict) - {"released": False, "runtime": False}
            Note: Can't mix include and exclude statements '{"title": True, "released": False}`
        pymongoarrow_schema (pymongoarrow.schema.Schema): The mapping of field types to convert BSON to Arrow.
        incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
        parallel (bool): Option to enable parallel loading for the collection. Default is False.
        limit (Optional[int]): The maximum number of documents to load.
        chunk_size (Optional[int]): The number of documents to load in each batch.
        data_item_format (Optional[TDataItemFormat]): The data format to use for loading.
            Supported formats:
                object - Python objects (dicts, lists).
                arrow - Apache Arrow tables.
            Falls back to "object" (with a warning) when "arrow" is requested
            but `PYMONGOARROW_AVAILABLE` is false.
        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

    Yields:
        TDataItem: The items produced by the selected loader's `load_documents`.
    """
    if data_item_format == "arrow" and not PYMONGOARROW_AVAILABLE:
        dlt.common.logger.warning(
            "'pymongoarrow' is not installed; falling back to standard MongoDB CollectionLoader."
        )
        data_item_format = "object"

    use_arrow = data_item_format == "arrow"

    if not use_arrow and pymongoarrow_schema:
        dlt.common.logger.warning(
            "Received value for `pymongoarrow_schema`, but `data_item_format=='object'` "
            "Use `data_item_format=='arrow'` to enforce schema."
        )

    if use_arrow and pymongoarrow_schema and projection:
        dlt.common.logger.warning(
            "Received values for both `pymongoarrow_schema` and `projection`. Since both "
            "create a projection to select fields, `projection` will be ignored."
        )

    # Loader lookup keyed by (has custom query, parallel requested, arrow output).
    # A custom query always routes to the aggregation-based loaders.
    loader_classes = {
        (True, True, True): CollectionAggregationArrowLoaderParallel,
        (True, True, False): CollectionAggregationLoaderParallel,
        (True, False, True): CollectionAggregationArrowLoader,
        (True, False, False): CollectionAggregationLoader,
        (False, True, True): CollectionArrowLoaderParallel,
        (False, True, False): CollectionLoaderParallel,
        (False, False, True): CollectionArrowLoader,
        (False, False, False): CollectionLoader,
    }
    loader_cls = loader_classes[(bool(custom_query), bool(parallel), use_arrow)]

    loader = loader_cls(
        client, collection, incremental=incremental, chunk_size=chunk_size
    )

    # Only loaders exposing `set_custom_query` receive the pipeline.
    if custom_query and hasattr(loader, "set_custom_query"):
        loader.set_custom_query(custom_query)

    # Arrow loaders take the extra `pymongoarrow_schema` argument; the object
    # loaders are called without it.
    arrow_loader_types = (
        CollectionArrowLoader,
        CollectionArrowLoaderParallel,
        CollectionAggregationArrowLoader,
        CollectionAggregationArrowLoaderParallel,
    )
    if isinstance(loader, arrow_loader_types):
        yield from loader.load_documents(
            limit=limit,
            filter_=filter_,
            projection=projection,
            pymongoarrow_schema=pymongoarrow_schema,
        )
    else:
        yield from loader.load_documents(
            limit=limit, filter_=filter_, projection=projection
        )
764
+
765
+
766
def convert_mongo_objs(value: Any) -> Any:
    """Convert a single MongoDB value to its dlt counterpart (Python loaders).

    `ObjectId` and `Decimal128` become strings, datetimes and BSON timestamps
    are passed through `ensure_pendulum_datetime`, a `Regex` becomes its
    compiled pattern string, and any other value is returned unchanged.

    Notes:
        The method `ObjectId.__str__()` creates a hexstring using `binascii.hexlify(__id).decode()`

    """
    if isinstance(value, (ObjectId, Decimal128)):
        return str(value)

    if isinstance(value, _datetime.datetime):
        return ensure_pendulum_datetime(value)

    if isinstance(value, Regex):
        compiled = value.try_compile()
        return compiled.pattern

    if isinstance(value, Timestamp):
        # A BSON timestamp is turned into a datetime before normalisation.
        return ensure_pendulum_datetime(value.as_datetime())

    return value
784
+
785
+
786
def convert_arrow_columns(table: Any) -> Any:
    """Rewrite MongoDB-specific Arrow columns of `table` as plain Arrow types.

    ObjectId and Decimal128 columns become string columns via `str()`;
    binary columns become `pyarrow.binary()` and code columns become
    `pyarrow.string()` via `as_py()`. All other columns are left untouched.

    Notes:
        Calling str() matches the `convert_mongo_objs()` used in non-arrow code.
        Pymongoarrow converts ObjectId to `fixed_size_binary[12]`, which can't be
        converted to a string as a vectorized operation because it contains ASCII characters.

        Instead, you need to loop over values using:
        ```python
        pyarrow.array([v.as_buffer().hex() for v in object_id_array], type=pyarrow.string())
        # pymongoarrow simplifies this by allowing this syntax
        [str(v) for v in object_id_array]
        ```

    Args:
        table (pyarrow.lib.Table): The table to convert.

    Returns:
        pyarrow.lib.Table: The table with the columns converted.
    """
    from dlt.common.libs.pyarrow import pyarrow
    from pymongoarrow.types import (  # type: ignore
        _is_binary,
        _is_code,
        _is_decimal128,
        _is_objectid,
    )

    for position, column in enumerate(table.schema):
        source_type = column.type

        if _is_objectid(source_type) or _is_decimal128(source_type):
            target_type = pyarrow.string()
            converted = [str(cell) for cell in table[column.name]]
        elif _is_binary(source_type):
            target_type = pyarrow.binary()
            converted = [cell.as_py() for cell in table[column.name]]
        elif _is_code(source_type):
            target_type = pyarrow.string()
            converted = [cell.as_py() for cell in table[column.name]]
        else:
            # Column needs no conversion.
            continue

        # Replace the column in place, keeping its name and position.
        table = table.set_column(
            position,
            pyarrow.field(column.name, target_type),
            pyarrow.array(converted, type=target_type),
        )

    return table
838
+
839
+
840
def client_from_credentials(connection_url: str) -> TMongoClient:
    """Build a `MongoClient` for `connection_url`.

    The client is created with `uuidRepresentation="standard"` and
    `tz_aware=True`.

    Args:
        connection_url: Connection string passed to `MongoClient`.

    Returns:
        The newly constructed client.
    """
    return MongoClient(
        connection_url,
        uuidRepresentation="standard",
        tz_aware=True,
    )
845
+
846
+
847
+ def _raise_if_intersection(filter1: Dict[str, Any], filter2: Dict[str, Any]) -> None:
848
+ """
849
+ Raise an exception, if the given filters'
850
+ fields are intersecting.
851
+
852
+ Args:
853
+ filter1 (Dict[str, Any]): The first filter.
854
+ filter2 (Dict[str, Any]): The second filter.
855
+ """
856
+ field_inter = filter1.keys() & filter2.keys()
857
+ for field in field_inter:
858
+ if filter1[field].keys() & filter2[field].keys():
859
+ str_repr = str({field: filter1[field]})
860
+ raise ValueError(
861
+ (
862
+ f"Filtering operator {str_repr} is already used by the "
863
+ "incremental and can't be used in the filter."
864
+ )
865
+ )
866
+
867
+
868
# Configuration spec for a MongoDB collection; declares a single optional
# `incremental` setting on top of `BaseConfiguration`.
@configspec
class MongoDbCollectionConfiguration(BaseConfiguration):
    # Optional incremental-loading configuration; defaults to None.
    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
871
+
872
+
873
# Configuration spec for a MongoDB collection resource.
# NOTE(review): fields defaulting to `dlt.secrets.value` / `dlt.config.value`
# are presumably injected by dlt's configuration resolution — confirm.
@configspec
class MongoDbCollectionResourceConfiguration(BaseConfiguration):
    # Connection string, typed as a secret value.
    connection_url: dlt.TSecretValue = dlt.secrets.value
    # Database name; optional.
    database: Optional[str] = dlt.config.value
    # Name of the collection.
    collection: str = dlt.config.value
    # Optional incremental-loading configuration; defaults to None.
    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
    # Write disposition; optional.
    write_disposition: Optional[str] = dlt.config.value
    # Whether parallel loading is requested; defaults to False.
    parallel: Optional[bool] = False
    # Field projection, either a mapping or an iterable of field names; optional.
    projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value
882
+
883
+
884
def _bindata_to_extended_json(match: "re.Match[str]") -> str:
    """Render one `BinData(subtype, "base64")` match as an Extended JSON `$binary`."""
    # The shell writes the subtype as a decimal integer, while Extended JSON
    # expects a hex string: BinData(128, ...) must become "subType": "80".
    # Subtypes 0-9 read the same in both bases.
    subtype_hex = format(int(match.group(1)), "x")
    return '{"$binary": {"base64": "%s", "subType": "%s"}}' % (
        match.group(2),
        subtype_hex,
    )


# Ordered (pattern, replacement) rules. Order matters for NumberLong/NumberInt:
# the quoted form must be rewritten before the unquoted catch-all form.
_SHELL_TO_EXTENDED_JSON_RULES = (
    # ISODate("2010-01-01T00:00:00.000+0000") -> {"$date": "..."}
    (re.compile(r'ISODate\("([^"]+)"\)'), r'{"$date": "\1"}'),
    # ObjectId("...") -> {"$oid": "..."}
    (re.compile(r'ObjectId\("([^"]+)"\)'), r'{"$oid": "\1"}'),
    # NumberLong("123") or NumberLong(123) -> {"$numberLong": "123"}
    (re.compile(r'NumberLong\("([^"]+)"\)'), r'{"$numberLong": "\1"}'),
    (re.compile(r"NumberLong\(([^)]+)\)"), r'{"$numberLong": "\1"}'),
    # NumberInt("123") or NumberInt(123) -> {"$numberInt": "123"}
    (re.compile(r'NumberInt\("([^"]+)"\)'), r'{"$numberInt": "\1"}'),
    (re.compile(r"NumberInt\(([^)]+)\)"), r'{"$numberInt": "\1"}'),
    # NumberDecimal("...") -> {"$numberDecimal": "..."}
    (re.compile(r'NumberDecimal\("([^"]+)"\)'), r'{"$numberDecimal": "\1"}'),
    # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
    (
        re.compile(r"Timestamp\((\d+),\s*(\d+)\)"),
        r'{"$timestamp": {"t": \1, "i": \2}}',
    ),
    # BinData(0, "...") -> {"$binary": {"base64": "...", "subType": "0"}}
    (re.compile(r'BinData\((\d+),\s*"([^"]+)"\)'), _bindata_to_extended_json),
    # MinKey() -> {"$minKey": 1}
    (re.compile(r"MinKey\(\)"), r'{"$minKey": 1}'),
    # MaxKey() -> {"$maxKey": 1}
    (re.compile(r"MaxKey\(\)"), r'{"$maxKey": 1}'),
    # UUID("...") -> {"$uuid": "..."}
    (re.compile(r'UUID\("([^"]+)"\)'), r'{"$uuid": "\1"}'),
    # DBRef("collection", "id") -> {"$ref": "collection", "$id": "id"}
    (
        re.compile(r'DBRef\("([^"]+)",\s*"([^"]+)"\)'),
        r'{"$ref": "\1", "$id": "\2"}',
    ),
    # Code("...") -> {"$code": "..."}
    (re.compile(r'Code\("([^"]+)"\)'), r'{"$code": "\1"}'),
)


def convert_mongo_shell_to_extended_json(query_string: str) -> str:
    """
    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.

    This function handles common MongoDB shell constructs like ISODate, ObjectId,
    NumberLong, NumberInt, NumberDecimal, Timestamp, BinData, MinKey, MaxKey, UUID,
    DBRef and Code, and converts them to their Extended JSON equivalents
    that can be parsed by bson.json_util. Text that matches none of the
    constructs is returned unchanged.

    The `BinData` subtype is written in decimal in the shell and is emitted as
    a hex string here.

    Args:
        query_string: A string containing MongoDB shell syntax

    Returns:
        A string with MongoDB Extended JSON v2 format

    Examples:
        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
        '{"$date": "2010-01-01T00:00:00.000Z"}'

        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
        '{"$oid": "507f1f77bcf86cd799439011"}'

        >>> convert_mongo_shell_to_extended_json('BinData(128, "AQID")')
        '{"$binary": {"base64": "AQID", "subType": "80"}}'
    """
    converted = query_string
    for pattern, replacement in _SHELL_TO_EXTENDED_JSON_RULES:
        converted = pattern.sub(replacement, converted)
    return converted
959
+
960
+
961
# Module-level source name constant.
# NOTE(review): presumably read by the surrounding framework to identify this
# source as "mongodb" — confirm where it is consumed.
__source_name__ = "mongodb"
962
+
963
+
964
+ # MongoDB destination helper functions
965
def process_file_items(file_path: str) -> list[dict]:
    """Process items from a file path (JSONL format).

    Each non-blank line is parsed as one JSON document. Blank and
    whitespace-only lines are skipped. Documents are returned as parsed, so
    all fields, including DLT metadata, are kept.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed documents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import json

    # Decode as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        stripped_lines = (raw_line.strip() for raw_line in jsonl_file)
        return [json.loads(text) for text in stripped_lines if text]