ingestr 0.13.2__py3-none-any.whl → 0.14.104__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. ingestr/conftest.py +72 -0
  2. ingestr/main.py +134 -87
  3. ingestr/src/adjust/__init__.py +4 -4
  4. ingestr/src/adjust/adjust_helpers.py +7 -3
  5. ingestr/src/airtable/__init__.py +3 -2
  6. ingestr/src/allium/__init__.py +128 -0
  7. ingestr/src/anthropic/__init__.py +277 -0
  8. ingestr/src/anthropic/helpers.py +525 -0
  9. ingestr/src/applovin/__init__.py +262 -0
  10. ingestr/src/applovin_max/__init__.py +117 -0
  11. ingestr/src/appsflyer/__init__.py +325 -0
  12. ingestr/src/appsflyer/client.py +49 -45
  13. ingestr/src/appstore/__init__.py +1 -0
  14. ingestr/src/arrow/__init__.py +9 -1
  15. ingestr/src/asana_source/__init__.py +1 -1
  16. ingestr/src/attio/__init__.py +102 -0
  17. ingestr/src/attio/helpers.py +65 -0
  18. ingestr/src/blob.py +38 -11
  19. ingestr/src/buildinfo.py +1 -0
  20. ingestr/src/chess/__init__.py +1 -1
  21. ingestr/src/clickup/__init__.py +85 -0
  22. ingestr/src/clickup/helpers.py +47 -0
  23. ingestr/src/collector/spinner.py +43 -0
  24. ingestr/src/couchbase_source/__init__.py +118 -0
  25. ingestr/src/couchbase_source/helpers.py +135 -0
  26. ingestr/src/cursor/__init__.py +83 -0
  27. ingestr/src/cursor/helpers.py +188 -0
  28. ingestr/src/destinations.py +520 -33
  29. ingestr/src/docebo/__init__.py +589 -0
  30. ingestr/src/docebo/client.py +435 -0
  31. ingestr/src/docebo/helpers.py +97 -0
  32. ingestr/src/elasticsearch/__init__.py +80 -0
  33. ingestr/src/elasticsearch/helpers.py +138 -0
  34. ingestr/src/errors.py +8 -0
  35. ingestr/src/facebook_ads/__init__.py +47 -28
  36. ingestr/src/facebook_ads/helpers.py +59 -37
  37. ingestr/src/facebook_ads/settings.py +2 -0
  38. ingestr/src/facebook_ads/utils.py +39 -0
  39. ingestr/src/factory.py +116 -2
  40. ingestr/src/filesystem/__init__.py +8 -3
  41. ingestr/src/filters.py +46 -3
  42. ingestr/src/fluxx/__init__.py +9906 -0
  43. ingestr/src/fluxx/helpers.py +209 -0
  44. ingestr/src/frankfurter/__init__.py +157 -0
  45. ingestr/src/frankfurter/helpers.py +48 -0
  46. ingestr/src/freshdesk/__init__.py +89 -0
  47. ingestr/src/freshdesk/freshdesk_client.py +137 -0
  48. ingestr/src/freshdesk/settings.py +9 -0
  49. ingestr/src/fundraiseup/__init__.py +95 -0
  50. ingestr/src/fundraiseup/client.py +81 -0
  51. ingestr/src/github/__init__.py +41 -6
  52. ingestr/src/github/helpers.py +5 -5
  53. ingestr/src/google_analytics/__init__.py +22 -4
  54. ingestr/src/google_analytics/helpers.py +124 -6
  55. ingestr/src/google_sheets/__init__.py +4 -4
  56. ingestr/src/google_sheets/helpers/data_processing.py +2 -2
  57. ingestr/src/hostaway/__init__.py +302 -0
  58. ingestr/src/hostaway/client.py +288 -0
  59. ingestr/src/http/__init__.py +35 -0
  60. ingestr/src/http/readers.py +114 -0
  61. ingestr/src/http_client.py +24 -0
  62. ingestr/src/hubspot/__init__.py +66 -23
  63. ingestr/src/hubspot/helpers.py +52 -22
  64. ingestr/src/hubspot/settings.py +14 -7
  65. ingestr/src/influxdb/__init__.py +46 -0
  66. ingestr/src/influxdb/client.py +34 -0
  67. ingestr/src/intercom/__init__.py +142 -0
  68. ingestr/src/intercom/helpers.py +674 -0
  69. ingestr/src/intercom/settings.py +279 -0
  70. ingestr/src/isoc_pulse/__init__.py +159 -0
  71. ingestr/src/jira_source/__init__.py +340 -0
  72. ingestr/src/jira_source/helpers.py +439 -0
  73. ingestr/src/jira_source/settings.py +170 -0
  74. ingestr/src/kafka/__init__.py +4 -1
  75. ingestr/src/kinesis/__init__.py +139 -0
  76. ingestr/src/kinesis/helpers.py +82 -0
  77. ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
  78. ingestr/src/linear/__init__.py +634 -0
  79. ingestr/src/linear/helpers.py +111 -0
  80. ingestr/src/linkedin_ads/helpers.py +0 -1
  81. ingestr/src/loader.py +69 -0
  82. ingestr/src/mailchimp/__init__.py +126 -0
  83. ingestr/src/mailchimp/helpers.py +226 -0
  84. ingestr/src/mailchimp/settings.py +164 -0
  85. ingestr/src/masking.py +344 -0
  86. ingestr/src/mixpanel/__init__.py +62 -0
  87. ingestr/src/mixpanel/client.py +99 -0
  88. ingestr/src/monday/__init__.py +246 -0
  89. ingestr/src/monday/helpers.py +392 -0
  90. ingestr/src/monday/settings.py +328 -0
  91. ingestr/src/mongodb/__init__.py +72 -8
  92. ingestr/src/mongodb/helpers.py +915 -38
  93. ingestr/src/partition.py +32 -0
  94. ingestr/src/personio/__init__.py +331 -0
  95. ingestr/src/personio/helpers.py +86 -0
  96. ingestr/src/phantombuster/__init__.py +65 -0
  97. ingestr/src/phantombuster/client.py +87 -0
  98. ingestr/src/pinterest/__init__.py +82 -0
  99. ingestr/src/pipedrive/__init__.py +198 -0
  100. ingestr/src/pipedrive/helpers/__init__.py +23 -0
  101. ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
  102. ingestr/src/pipedrive/helpers/pages.py +115 -0
  103. ingestr/src/pipedrive/settings.py +27 -0
  104. ingestr/src/pipedrive/typing.py +3 -0
  105. ingestr/src/plusvibeai/__init__.py +335 -0
  106. ingestr/src/plusvibeai/helpers.py +544 -0
  107. ingestr/src/plusvibeai/settings.py +252 -0
  108. ingestr/src/quickbooks/__init__.py +117 -0
  109. ingestr/src/resource.py +40 -0
  110. ingestr/src/revenuecat/__init__.py +83 -0
  111. ingestr/src/revenuecat/helpers.py +237 -0
  112. ingestr/src/salesforce/__init__.py +156 -0
  113. ingestr/src/salesforce/helpers.py +64 -0
  114. ingestr/src/shopify/__init__.py +1 -17
  115. ingestr/src/smartsheets/__init__.py +82 -0
  116. ingestr/src/snapchat_ads/__init__.py +489 -0
  117. ingestr/src/snapchat_ads/client.py +72 -0
  118. ingestr/src/snapchat_ads/helpers.py +535 -0
  119. ingestr/src/socrata_source/__init__.py +83 -0
  120. ingestr/src/socrata_source/helpers.py +85 -0
  121. ingestr/src/socrata_source/settings.py +8 -0
  122. ingestr/src/solidgate/__init__.py +219 -0
  123. ingestr/src/solidgate/helpers.py +154 -0
  124. ingestr/src/sources.py +3132 -212
  125. ingestr/src/stripe_analytics/__init__.py +49 -21
  126. ingestr/src/stripe_analytics/helpers.py +286 -1
  127. ingestr/src/stripe_analytics/settings.py +62 -10
  128. ingestr/src/telemetry/event.py +10 -9
  129. ingestr/src/tiktok_ads/__init__.py +12 -6
  130. ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
  131. ingestr/src/trustpilot/__init__.py +48 -0
  132. ingestr/src/trustpilot/client.py +48 -0
  133. ingestr/src/version.py +6 -1
  134. ingestr/src/wise/__init__.py +68 -0
  135. ingestr/src/wise/client.py +63 -0
  136. ingestr/src/zoom/__init__.py +99 -0
  137. ingestr/src/zoom/helpers.py +102 -0
  138. ingestr/tests/unit/test_smartsheets.py +133 -0
  139. ingestr-0.14.104.dist-info/METADATA +563 -0
  140. ingestr-0.14.104.dist-info/RECORD +203 -0
  141. ingestr/src/appsflyer/_init_.py +0 -24
  142. ingestr-0.13.2.dist-info/METADATA +0 -302
  143. ingestr-0.13.2.dist-info/RECORD +0 -107
  144. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
  145. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
  146. {ingestr-0.13.2.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,12 +1,29 @@
-"""Mongo database source helpers"""
+"""Mongo database source helpers and destination utilities"""
 
+import re
 from itertools import islice
-from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Tuple
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import dlt
 from bson.decimal128 import Decimal128
 from bson.objectid import ObjectId
+from bson.regex import Regex
+from bson.timestamp import Timestamp
+from dlt.common import logger
 from dlt.common.configuration.specs import BaseConfiguration, configspec
+from dlt.common.data_writers import TDataItemFormat
+from dlt.common.schema import TTableSchema
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem
 from dlt.common.utils import map_nested_in_place
@@ -14,17 +31,23 @@ from pendulum import _datetime
 from pymongo import ASCENDING, DESCENDING, MongoClient
 from pymongo.collection import Collection
 from pymongo.cursor import Cursor
+from pymongo.helpers_shared import _fields_list_to_dict
 
 if TYPE_CHECKING:
     TMongoClient = MongoClient[Any]
-    TCollection = Collection[Any]  # type: ignore
+    TCollection = Collection[Any]
     TCursor = Cursor[Any]
 else:
     TMongoClient = Any
     TCollection = Any
     TCursor = Any
 
-CHUNK_SIZE = 10000
+try:
+    import pymongoarrow  # type: ignore
+
+    PYMONGOARROW_AVAILABLE = True
+except ImportError:
+    PYMONGOARROW_AVAILABLE = False
 
 
 class CollectionLoader:
@@ -32,11 +55,14 @@ class CollectionLoader:
         self,
         client: TMongoClient,
         collection: TCollection,
+        chunk_size: int,
         incremental: Optional[dlt.sources.incremental[Any]] = None,
     ) -> None:
         self.client = client
         self.collection = collection
         self.incremental = incremental
+        self.chunk_size = chunk_size
+
         if incremental:
             self.cursor_field = incremental.cursor_path
             self.last_value = incremental.last_value
@@ -45,45 +71,193 @@ class CollectionLoader:
             self.last_value = None
 
     @property
-    def _filter_op(self) -> Dict[str, Any]:
+    def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
         if not self.incremental or not self.last_value:
-            return {}
-        if self.incremental.last_value_func is max:
-            return {self.cursor_field: {"$gte": self.last_value}}
-        elif self.incremental.last_value_func is min:
-            return {self.cursor_field: {"$lt": self.last_value}}
-        return {}
+            return []
 
-    def load_documents(self) -> Iterator[TDataItem]:
-        cursor = self.collection.find(self._filter_op)
-        while docs_slice := list(islice(cursor, CHUNK_SIZE)):
-            yield map_nested_in_place(convert_mongo_objs, docs_slice)
+        if (
+            self.incremental.row_order == "asc"
+            and self.incremental.last_value_func is max
+        ) or (
+            self.incremental.row_order == "desc"
+            and self.incremental.last_value_func is min
+        ):
+            return [(self.cursor_field, ASCENDING)]
+
+        elif (
+            self.incremental.row_order == "asc"
+            and self.incremental.last_value_func is min
+        ) or (
+            self.incremental.row_order == "desc"
+            and self.incremental.last_value_func is max
+        ):
+            return [(self.cursor_field, DESCENDING)]
 
+        return []
 
-class CollectionLoaderParallell(CollectionLoader):
     @property
-    def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
-        if not self.incremental or not self.last_value:
-            return []
+    def _filter_op(self) -> Dict[str, Any]:
+        """Build a filtering operator.
+
+        Includes a field and the filtering condition for it.
+
+        Returns:
+            Dict[str, Any]: A dictionary with the filter operator.
+        """
+        if not (self.incremental and self.last_value):
+            return {}
+
+        filt = {}
         if self.incremental.last_value_func is max:
-            return [(self.cursor_field, ASCENDING)]
+            filt = {self.cursor_field: {"$gte": self.last_value}}
+            if self.incremental.end_value:
+                filt[self.cursor_field]["$lt"] = self.incremental.end_value
+
         elif self.incremental.last_value_func is min:
-            return [(self.cursor_field, DESCENDING)]
-        return []
+            filt = {self.cursor_field: {"$lte": self.last_value}}
+            if self.incremental.end_value:
+                filt[self.cursor_field]["$gt"] = self.incremental.end_value
+
+        return filt
+
+    def _projection_op(
+        self, projection: Optional[Union[Mapping[str, Any], Iterable[str]]]
+    ) -> Optional[Dict[str, Any]]:
+        """Build a projection operator.
+
+        Args:
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): A tuple of fields to include or a dict specifying fields to include or exclude.
+                The incremental `primary_key` needs to be handled differently for inclusion
+                and exclusion projections.
+
+        Returns:
+            Tuple[str, ...] | Dict[str, Any]: A tuple or dictionary with the projection operator.
+        """
+        if projection is None:
+            return None
+
+        projection_dict = dict(_fields_list_to_dict(projection, "projection"))
 
+        if self.incremental:
+            # this is an inclusion projection
+            if any(v == 1 for v in projection_dict.values()):
+                # ensure primary_key is included
+                projection_dict.update(m={self.incremental.primary_key: 1})
+            # this is an exclusion projection
+            else:
+                try:
+                    # ensure primary_key isn't excluded
+                    projection_dict.pop(self.incremental.primary_key)  # type: ignore
+                except KeyError:
+                    pass  # primary_key was properly not included in exclusion projection
+                else:
+                    dlt.common.logger.warn(
+                        f"Primary key `{self.incremental.primary_key}` was removed from exclusion projection"
+                    )
+
+        return projection_dict
+
+    def _limit(self, cursor: Cursor, limit: Optional[int] = None) -> TCursor:  # type: ignore
+        """Apply a limit to the cursor, if needed.
+
+        Args:
+            cursor (Cursor): The cursor to apply the limit.
+            limit (Optional[int]): The number of documents to load.
+
+        Returns:
+            Cursor: The cursor with the limit applied (if given).
+        """
+        if limit not in (0, None):
+            if self.incremental is None or self.incremental.last_value_func is None:
+                logger.warning(
+                    "Using limit without ordering - results may be inconsistent."
+                )
+
+            cursor = cursor.limit(abs(limit))
+
+        return cursor
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Construct the query and load the documents from the collection.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+
+        Yields:
+            Iterator[TDataItem]: An iterator of the loaded documents.
+        """
+        filter_op = self._filter_op
+        _raise_if_intersection(filter_op, filter_)
+        filter_op.update(filter_)
+
+        projection_op = self._projection_op(projection)
+
+        cursor = self.collection.find(filter=filter_op, projection=projection_op)
+        if self._sort_op:
+            cursor = cursor.sort(self._sort_op)
+
+        cursor = self._limit(cursor, limit)
+
+        while docs_slice := list(islice(cursor, self.chunk_size)):
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
+
+
+class CollectionLoaderParallel(CollectionLoader):
     def _get_document_count(self) -> int:
         return self.collection.count_documents(filter=self._filter_op)
 
-    def _create_batches(self) -> List[Dict[str, int]]:
+    def _create_batches(self, limit: Optional[int] = None) -> List[Dict[str, int]]:
         doc_count = self._get_document_count()
-        return [
-            dict(skip=sk, limit=CHUNK_SIZE) for sk in range(0, doc_count, CHUNK_SIZE)
-        ]
+        if limit:
+            doc_count = min(doc_count, abs(limit))
+
+        batches = []
+        left_to_load = doc_count
+
+        for sk in range(0, doc_count, self.chunk_size):
+            batches.append(dict(skip=sk, limit=min(self.chunk_size, left_to_load)))
+            left_to_load -= self.chunk_size
+
+        return batches
 
-    def _get_cursor(self) -> TCursor:
-        cursor = self.collection.find(filter=self._filter_op)
+    def _get_cursor(
+        self,
+        filter_: Dict[str, Any],
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> TCursor:
+        """Get a reading cursor for the collection.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+
+        Returns:
+            Cursor: The cursor for the collection.
+        """
+        filter_op = self._filter_op
+        _raise_if_intersection(filter_op, filter_)
+        filter_op.update(filter_)
+
+        projection_op = self._projection_op(projection)
+
+        cursor = self.collection.find(filter=filter_op, projection=projection_op)
         if self._sort_op:
             cursor = cursor.sort(self._sort_op)
+
         return cursor
 
     @dlt.defer
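
The reworked `_filter_op` above now folds `end_value` into the Mongo filter so a backfill window stays bounded. A minimal standalone sketch of the operator it produces for a `max`-based incremental; the `updated_at` field and the dates are made up for illustration and are not part of the package:

```python
from datetime import datetime, timezone

# Hypothetical incremental window: last_value_func is max, so the loader asks
# for documents at or after the last seen value and, when an end_value is set,
# strictly before the end of the window.
cursor_field = "updated_at"  # made-up cursor field
last_value = datetime(2024, 1, 1, tzinfo=timezone.utc)
end_value = datetime(2024, 2, 1, tzinfo=timezone.utc)

filter_op = {cursor_field: {"$gte": last_value}}
if end_value is not None:
    filter_op[cursor_field]["$lt"] = end_value

print(filter_op)
# {'updated_at': {'$gte': datetime(2024, 1, 1, ...), '$lt': datetime(2024, 2, 1, ...)}}
```
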
@@ -93,25 +267,388 @@ class CollectionLoaderParallell(CollectionLoader):
         data = []
         for document in cursor.skip(batch["skip"]).limit(batch["limit"]):
             data.append(map_nested_in_place(convert_mongo_objs, document))
+
         return data
 
-    def _get_all_batches(self) -> Iterator[TDataItem]:
-        batches = self._create_batches()
-        cursor = self._get_cursor()
+    def _get_all_batches(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load all documents from the collection in parallel batches.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The maximum number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+
+        Yields:
+            Iterator[TDataItem]: An iterator of the loaded documents.
+        """
+        batches = self._create_batches(limit=limit)
+        cursor = self._get_cursor(filter_=filter_, projection=projection)
 
         for batch in batches:
             yield self._run_batch(cursor=cursor, batch=batch)
 
-    def load_documents(self) -> Iterator[TDataItem]:
-        for document in self._get_all_batches():
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents from the collection in parallel.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+
+        Yields:
+            Iterator[TDataItem]: An iterator of the loaded documents.
+        """
+        for document in self._get_all_batches(
+            limit=limit, filter_=filter_, projection=projection
+        ):
             yield document
 
 
+class CollectionArrowLoader(CollectionLoader):
+    """
+    Mongo DB collection loader, which uses
+    Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """
+        Load documents from the collection in Apache Arrow format.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.
+
+        Yields:
+            Iterator[Any]: An iterator of the loaded documents.
+        """
+        from pymongoarrow.context import PyMongoArrowContext  # type: ignore
+        from pymongoarrow.lib import process_bson_stream  # type: ignore
+
+        filter_op = self._filter_op
+        _raise_if_intersection(filter_op, filter_)
+        filter_op.update(filter_)
+
+        projection_op = self._projection_op(projection)
+
+        # NOTE the `filter_op` isn't passed
+        cursor = self.collection.find_raw_batches(
+            filter_, batch_size=self.chunk_size, projection=projection_op
+        )
+        if self._sort_op:
+            cursor = cursor.sort(self._sort_op)  # type: ignore
+
+        cursor = self._limit(cursor, limit)  # type: ignore
+
+        context = PyMongoArrowContext.from_schema(
+            schema=pymongoarrow_schema, codec_options=self.collection.codec_options
+        )
+        for batch in cursor:
+            process_bson_stream(batch, context)
+        table = context.finish()
+        yield convert_arrow_columns(table)
+
+
+class CollectionArrowLoaderParallel(CollectionLoaderParallel):
+    """
+    Mongo DB collection parallel loader, which uses
+    Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents from the collection in parallel.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.
+
+        Yields:
+            Iterator[TDataItem]: An iterator of the loaded documents.
+        """
+        yield from self._get_all_batches(
+            limit=limit,
+            filter_=filter_,
+            projection=projection,
+            pymongoarrow_schema=pymongoarrow_schema,
+        )
+
+    def _get_all_batches(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load all documents from the collection in parallel batches.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            limit (Optional[int]): The maximum number of documents to load.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.
+
+        Yields:
+            Iterator[TDataItem]: An iterator of the loaded documents.
+        """
+        batches = self._create_batches(limit=limit)
+        cursor = self._get_cursor(filter_=filter_, projection=projection)
+        for batch in batches:
+            yield self._run_batch(
+                cursor=cursor,
+                batch=batch,
+                pymongoarrow_schema=pymongoarrow_schema,
+            )
+
+    def _get_cursor(
+        self,
+        filter_: Dict[str, Any],
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> TCursor:
+        """Get a reading cursor for the collection.
+
+        Args:
+            filter_ (Dict[str, Any]): The filter to apply to the collection.
+            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+
+        Returns:
+            Cursor: The cursor for the collection.
+        """
+        filter_op = self._filter_op
+        _raise_if_intersection(filter_op, filter_)
+        filter_op.update(filter_)
+
+        projection_op = self._projection_op(projection)
+
+        cursor = self.collection.find_raw_batches(
+            filter=filter_op, batch_size=self.chunk_size, projection=projection_op
+        )
+        if self._sort_op:
+            cursor = cursor.sort(self._sort_op)  # type: ignore
+
+        return cursor
+
+    @dlt.defer
+    def _run_batch(
+        self,
+        cursor: TCursor,
+        batch: Dict[str, int],
+        pymongoarrow_schema: Any = None,
+    ) -> TDataItem:
+        from pymongoarrow.context import PyMongoArrowContext
+        from pymongoarrow.lib import process_bson_stream
+
+        cursor = cursor.clone()
+
+        context = PyMongoArrowContext.from_schema(
+            schema=pymongoarrow_schema, codec_options=self.collection.codec_options
+        )
+        for chunk in cursor.skip(batch["skip"]).limit(batch["limit"]):
+            process_bson_stream(chunk, context)
+        table = context.finish()
+        yield convert_arrow_columns(table)
+
+
+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        # Add maxTimeMS to prevent hanging
+        cursor = self.collection.aggregate(
+            pipeline,
+            allowDiskUse=True,
+            batchSize=min(self.chunk_size, 101),
+            maxTimeMS=30000,  # 30 second timeout
+        )
+
+        docs_buffer = []
+        try:
+            for doc in cursor:
+                docs_buffer.append(doc)
+
+                if len(docs_buffer) >= self.chunk_size:
+                    res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                    if (
+                        len(res) > 0
+                        and "_id" in res[0]
+                        and isinstance(res[0]["_id"], dict)
+                    ):
+                        yield dlt.mark.with_hints(
+                            res,
+                            dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                        )
+                    else:
+                        yield res
+                    docs_buffer = []
+
+            # Yield any remaining documents
+            if docs_buffer:
+                res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                    yield dlt.mark.with_hints(
+                        res,
+                        dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                    )
+                else:
+                    yield res
+        finally:
+            cursor.close()
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
+    filter_: Dict[str, Any],
+    projection: Union[Dict[str, Any], List[str]],
+    pymongoarrow_schema: "pymongoarrow.schema.Schema",
     incremental: Optional[dlt.sources.incremental[Any]] = None,
     parallel: bool = False,
+    limit: Optional[int] = None,
+    chunk_size: Optional[int] = 10000,
+    data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -120,27 +657,173 @@ def collection_documents(
     Args:
         client (MongoClient): The PyMongo client `pymongo.MongoClient` instance.
         collection (Collection): The collection `pymongo.collection.Collection` to load.
+        filter_ (Dict[str, Any]): The filter to apply to the collection.
+        projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
+            when loading the collection. Supported inputs:
+                include (list) - ["year", "title"]
+                include (dict) - {"year": True, "title": True}
+                exclude (dict) - {"released": False, "runtime": False}
+            Note: Can't mix include and exclude statements '{"title": True, "released": False}`
+        pymongoarrow_schema (pymongoarrow.schema.Schema): The mapping of field types to convert BSON to Arrow.
         incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
         parallel (bool): Option to enable parallel loading for the collection. Default is False.
+        limit (Optional[int]): The maximum number of documents to load.
+        chunk_size (Optional[int]): The number of documents to load in each batch.
+        data_item_format (Optional[TDataItemFormat]): The data format to use for loading.
+            Supported formats:
+                object - Python objects (dicts, lists).
+                arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
     """
-    LoaderClass = CollectionLoaderParallell if parallel else CollectionLoader
+    if data_item_format == "arrow" and not PYMONGOARROW_AVAILABLE:
+        dlt.common.logger.warn(
+            "'pymongoarrow' is not installed; falling back to standard MongoDB CollectionLoader."
+        )
+        data_item_format = "object"
+
+    if data_item_format != "arrow" and pymongoarrow_schema:
+        dlt.common.logger.warn(
+            "Received value for `pymongoarrow_schema`, but `data_item_format=='object'` "
+            "Use `data_item_format=='arrow'` to enforce schema."
+        )
 
-    loader = LoaderClass(client, collection, incremental=incremental)
-    for data in loader.load_documents():
-        yield data
+    if data_item_format == "arrow" and pymongoarrow_schema and projection:
+        dlt.common.logger.warn(
+            "Received values for both `pymongoarrow_schema` and `projection`. Since both "
+            "create a projection to select fields, `projection` will be ignored."
+        )
+
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
+        else:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
+    else:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
+        else:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore
+
+    loader = LoaderClass(
+        client, collection, incremental=incremental, chunk_size=chunk_size
+    )
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
+        yield from loader.load_documents(
+            limit=limit,
+            filter_=filter_,
+            projection=projection,
+            pymongoarrow_schema=pymongoarrow_schema,
+        )
+    else:
+        yield from loader.load_documents(
+            limit=limit, filter_=filter_, projection=projection
+        )
 
 
 def convert_mongo_objs(value: Any) -> Any:
+    """MongoDB to dlt type conversion when using Python loaders.
+
+    Notes:
+        The method `ObjectId.__str__()` creates a hexstring using `binascii.hexlify(__id).decode()`
+
+    """
     if isinstance(value, (ObjectId, Decimal128)):
         return str(value)
     if isinstance(value, _datetime.datetime):
         return ensure_pendulum_datetime(value)
+    if isinstance(value, Regex):
+        return value.try_compile().pattern
+    if isinstance(value, Timestamp):
+        date = value.as_datetime()
+        return ensure_pendulum_datetime(date)
+
     return value
 
 
+def convert_arrow_columns(table: Any) -> Any:
+    """Convert the given table columns to Python types.
+
+    Notes:
+        Calling str() matches the `convert_mongo_objs()` used in non-arrow code.
+        Pymongoarrow converts ObjectId to `fixed_size_binary[12]`, which can't be
+        converted to a string as a vectorized operation because it contains ASCII characters.
+
+        Instead, you need to loop over values using:
+        ```python
+        pyarrow.array([v.as_buffer().hex() for v in object_id_array], type=pyarrow.string())
+        # pymongoarrow simplifies this by allowing this syntax
+        [str(v) for v in object_id_array]
+        ```
+
+    Args:
+        table (pyarrow.lib.Table): The table to convert.
+
+    Returns:
+        pyarrow.lib.Table: The table with the columns converted.
+    """
+    from dlt.common.libs.pyarrow import pyarrow
+    from pymongoarrow.types import (  # type: ignore
+        _is_binary,
+        _is_code,
+        _is_decimal128,
+        _is_objectid,
+    )
+
+    for i, field in enumerate(table.schema):
+        if _is_objectid(field.type) or _is_decimal128(field.type):
+            col_values = [str(value) for value in table[field.name]]
+            table = table.set_column(
+                i,
+                pyarrow.field(field.name, pyarrow.string()),
+                pyarrow.array(col_values, type=pyarrow.string()),
+            )
+        else:
+            type_ = None
+            if _is_binary(field.type):
+                type_ = pyarrow.binary()
+            elif _is_code(field.type):
+                type_ = pyarrow.string()
+
+            if type_:
+                col_values = [value.as_py() for value in table[field.name]]
+                table = table.set_column(
+                    i,
+                    pyarrow.field(field.name, type_),
+                    pyarrow.array(col_values, type=type_),
+                )
+    return table
+
+
 def client_from_credentials(connection_url: str) -> TMongoClient:
     client: TMongoClient = MongoClient(
         connection_url, uuidRepresentation="standard", tz_aware=True
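
With the dispatch above, `collection_documents` selects a loader from the `parallel`, `data_item_format`, and `custom_query` flags. A hedged sketch of calling it directly, assuming a locally reachable MongoDB and a made-up `movies` collection; in ingestr itself these arguments are supplied by the source factory, and `incremental` is normally managed by a dlt pipeline rather than passed as `None`:

```python
from ingestr.src.mongodb.helpers import client_from_credentials, collection_documents

# Assumed local instance and invented database/collection names.
client = client_from_credentials("mongodb://localhost:27017")
collection = client["sample_db"]["movies"]

docs = collection_documents(
    client,
    collection,
    filter_={"type": "movie"},     # extra user filter, merged with the incremental filter
    projection=["title", "year"],  # include-style projection
    pymongoarrow_schema=None,      # only consulted when data_item_format == "arrow"
    incremental=None,
    parallel=False,
    limit=500,
    chunk_size=1000,
    data_item_format="object",
)

for batch in docs:
    # each yielded batch is a list of plain dicts, at most chunk_size documents long
    print(len(batch))
```
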
@@ -148,6 +831,27 @@ def client_from_credentials(connection_url: str) -> TMongoClient:
     return client
 
 
+def _raise_if_intersection(filter1: Dict[str, Any], filter2: Dict[str, Any]) -> None:
+    """
+    Raise an exception, if the given filters'
+    fields are intersecting.
+
+    Args:
+        filter1 (Dict[str, Any]): The first filter.
+        filter2 (Dict[str, Any]): The second filter.
+    """
+    field_inter = filter1.keys() & filter2.keys()
+    for field in field_inter:
+        if filter1[field].keys() & filter2[field].keys():
+            str_repr = str({field: filter1[field]})
+            raise ValueError(
+                (
+                    f"Filtering operator {str_repr} is already used by the "
+                    "incremental and can't be used in the filter."
+                )
+            )
+
+
 @configspec
 class MongoDbCollectionConfiguration(BaseConfiguration):
     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
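
The new `_raise_if_intersection` guard keeps a user-supplied filter from silently overriding the incremental cursor condition. A small sketch of the behaviour, assuming the module import path from the file list above and invented field values:

```python
from ingestr.src.mongodb.helpers import _raise_if_intersection

incremental_filter = {"updated_at": {"$gte": "2024-01-01"}}

# A filter on a different field (or a different operator) passes silently.
_raise_if_intersection(incremental_filter, {"status": {"$eq": "active"}})

# Reusing the same operator on the cursor field raises ValueError.
try:
    _raise_if_intersection(incremental_filter, {"updated_at": {"$gte": "2023-06-01"}})
except ValueError as exc:
    print(exc)  # Filtering operator {'updated_at': ...} is already used by the incremental ...
```
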
@@ -155,12 +859,185 @@ class MongoDbCollectionConfiguration(BaseConfiguration):
 
 @configspec
 class MongoDbCollectionResourceConfiguration(BaseConfiguration):
-    connection_url: str = dlt.config.value
+    connection_url: dlt.TSecretValue = dlt.secrets.value
     database: Optional[str] = dlt.config.value
     collection: str = dlt.config.value
     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
     write_disposition: Optional[str] = dlt.config.value
     parallel: Optional[bool] = False
+    projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value
+
+
+def convert_mongo_shell_to_extended_json(query_string: str) -> str:
+    """
+    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.
+
+    This function handles common MongoDB shell constructs like ISODate, ObjectId,
+    NumberLong, NumberDecimal, etc. and converts them to their Extended JSON equivalents
+    that can be parsed by bson.json_util.
+
+    Args:
+        query_string: A string containing MongoDB shell syntax
+
+    Returns:
+        A string with MongoDB Extended JSON v2 format
+
+    Examples:
+        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
+        '{"$date": "2010-01-01T00:00:00.000Z"}'
+
+        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
+        '{"$oid": "507f1f77bcf86cd799439011"}'
+    """
+    converted = query_string
+
+    # Convert ISODate("...") to {"$date": "..."}
+    # Pattern matches ISODate("2010-01-01T00:00:00.000+0000") or similar
+    converted = re.sub(r'ISODate\("([^"]+)"\)', r'{"$date": "\1"}', converted)
+
+    # Convert ObjectId("...") to {"$oid": "..."}
+    converted = re.sub(r'ObjectId\("([^"]+)"\)', r'{"$oid": "\1"}', converted)
+
+    # Convert NumberLong(...) to {"$numberLong": "..."}
+    # Note: NumberLong can have quotes or not: NumberLong(123) or NumberLong("123")
+    converted = re.sub(r'NumberLong\("([^"]+)"\)', r'{"$numberLong": "\1"}', converted)
+    converted = re.sub(r"NumberLong\(([^)]+)\)", r'{"$numberLong": "\1"}', converted)
+
+    # Convert NumberInt(...) to {"$numberInt": "..."}
+    converted = re.sub(r'NumberInt\("([^"]+)"\)', r'{"$numberInt": "\1"}', converted)
+    converted = re.sub(r"NumberInt\(([^)]+)\)", r'{"$numberInt": "\1"}', converted)
+
+    # Convert NumberDecimal("...") to {"$numberDecimal": "..."}
+    converted = re.sub(
+        r'NumberDecimal\("([^"]+)"\)', r'{"$numberDecimal": "\1"}', converted
+    )
+
+    # Convert Timestamp(..., ...) to {"$timestamp": {"t": ..., "i": ...}}
+    # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
+    converted = re.sub(
+        r"Timestamp\((\d+),\s*(\d+)\)", r'{"$timestamp": {"t": \1, "i": \2}}', converted
+    )
+
+    # Convert BinData(..., "...") to {"$binary": {"base64": "...", "subType": "..."}}
+    converted = re.sub(
+        r'BinData\((\d+),\s*"([^"]+)"\)',
+        r'{"$binary": {"base64": "\2", "subType": "\1"}}',
+        converted,
+    )
+
+    # Convert MinKey() to {"$minKey": 1}
+    converted = re.sub(r"MinKey\(\)", r'{"$minKey": 1}', converted)
+
+    # Convert MaxKey() to {"$maxKey": 1}
+    converted = re.sub(r"MaxKey\(\)", r'{"$maxKey": 1}', converted)
+
+    # Convert UUID("...") to {"$uuid": "..."}
+    converted = re.sub(r'UUID\("([^"]+)"\)', r'{"$uuid": "\1"}', converted)
+
+    # Convert DBRef("collection", "id") to {"$ref": "collection", "$id": "id"}
+    converted = re.sub(
+        r'DBRef\("([^"]+)",\s*"([^"]+)"\)', r'{"$ref": "\1", "$id": "\2"}', converted
+    )
+
+    # Convert Code("...") to {"$code": "..."}
+    converted = re.sub(r'Code\("([^"]+)"\)', r'{"$code": "\1"}', converted)
+
+    return converted
 
 
 __source_name__ = "mongodb"
+
+
+# MongoDB destination helper functions
+def process_file_items(file_path: str) -> list[dict]:
+    """Process items from a file path (JSONL format)."""
+    import json
+
+    documents = []
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                documents.append(doc)  # Include all fields including DLT metadata
+    return documents
+
+
+def mongodb_insert(uri: str):
+    """Creates a dlt.destination for inserting data into a MongoDB collection.
+
+    Args:
+        uri (str): MongoDB connection URI including database.
+
+    Returns:
+        dlt.destination: A DLT destination object configured for MongoDB.
+    """
+    from urllib.parse import urlparse
+
+    parsed_uri = urlparse(uri)
+
+    # Handle both mongodb:// and mongodb+srv:// schemes
+    if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
+        # For modern connection strings (MongoDB Atlas), use the URI as-is
+        connection_string = uri
+        # Extract database from path or use default
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+    else:
+        # Legacy handling for backwards compatibility
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 27017
+        username = parsed_uri.username
+        password = parsed_uri.password
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+
+        # Build connection string
+        if username and password:
+            connection_string = f"mongodb://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"mongodb://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+    state = {"first_batch": True}
+
+    def destination(items: TDataItem, table: TTableSchema) -> None:
+        import pyarrow
+        from pymongo import MongoClient
+
+        # Extract database name from connection string
+        # Get collection name from table metadata
+        collection_name = table["name"]
+
+        # Connect to MongoDB
+        with MongoClient(connection_string) as client:
+            db = client[database]
+            collection = db[collection_name]
+
+            # Process and insert documents
+            if isinstance(items, str):
+                documents = process_file_items(items)
+            elif isinstance(items, pyarrow.RecordBatch):
+                documents = [item for item in items.to_pylist()]
+            else:
+                documents = [item for item in items if isinstance(item, dict)]
+
+            if state["first_batch"] and documents:
+                collection.delete_many({})
+                state["first_batch"] = False
+
+            if documents:
+                collection.insert_many(documents)  # Insert all new data
+
+    return dlt.destination(
+        destination,
+        name="mongodb",
+        loader_file_format="typed-jsonl",
+        batch_size=1000,
+        naming_convention="snake_case",
+        loader_parallelism_strategy="sequential",
+    )
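
A hedged usage sketch for `convert_mongo_shell_to_extended_json`: paired with `bson.json_util.loads`, it turns a mongosh-style query string into a real filter document that the loaders above can consume. The query string below is invented for illustration:

```python
from bson import json_util

from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json

shell_query = (
    '{"created_at": {"$gte": ISODate("2024-01-01T00:00:00.000Z")}, '
    '"_id": ObjectId("507f1f77bcf86cd799439011")}'
)

# Shell constructs are rewritten to Extended JSON v2...
extended_json = convert_mongo_shell_to_extended_json(shell_query)

# ...which bson can parse into native ObjectId/datetime values.
query = json_util.loads(extended_json)
print(type(query["_id"]), type(query["created_at"]["$gte"]))
```
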