omniload 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omniload/conftest.py +72 -0
- omniload/main.py +810 -0
- omniload/src/.gitignore +10 -0
- omniload/src/adjust/__init__.py +108 -0
- omniload/src/adjust/adjust_helpers.py +122 -0
- omniload/src/airtable/__init__.py +84 -0
- omniload/src/allium/__init__.py +128 -0
- omniload/src/anthropic/__init__.py +277 -0
- omniload/src/anthropic/helpers.py +525 -0
- omniload/src/applovin/__init__.py +316 -0
- omniload/src/applovin_max/__init__.py +117 -0
- omniload/src/appsflyer/__init__.py +325 -0
- omniload/src/appsflyer/client.py +110 -0
- omniload/src/appstore/__init__.py +142 -0
- omniload/src/appstore/client.py +126 -0
- omniload/src/appstore/errors.py +15 -0
- omniload/src/appstore/models.py +117 -0
- omniload/src/appstore/resources.py +179 -0
- omniload/src/arrow/__init__.py +81 -0
- omniload/src/asana_source/__init__.py +281 -0
- omniload/src/asana_source/helpers.py +30 -0
- omniload/src/asana_source/settings.py +158 -0
- omniload/src/attio/__init__.py +102 -0
- omniload/src/attio/helpers.py +65 -0
- omniload/src/blob.py +95 -0
- omniload/src/bruin/__init__.py +76 -0
- omniload/src/chess/__init__.py +180 -0
- omniload/src/chess/helpers.py +35 -0
- omniload/src/chess/settings.py +18 -0
- omniload/src/clickup/__init__.py +85 -0
- omniload/src/clickup/helpers.py +47 -0
- omniload/src/collector/spinner.py +43 -0
- omniload/src/couchbase_source/__init__.py +118 -0
- omniload/src/couchbase_source/helpers.py +135 -0
- omniload/src/cursor/__init__.py +83 -0
- omniload/src/cursor/helpers.py +188 -0
- omniload/src/customer_io/__init__.py +486 -0
- omniload/src/customer_io/helpers.py +530 -0
- omniload/src/destinations.py +982 -0
- omniload/src/docebo/__init__.py +589 -0
- omniload/src/docebo/client.py +435 -0
- omniload/src/docebo/helpers.py +97 -0
- omniload/src/dune/__init__.py +104 -0
- omniload/src/dune/helpers.py +108 -0
- omniload/src/dynamodb/__init__.py +86 -0
- omniload/src/elasticsearch/__init__.py +80 -0
- omniload/src/elasticsearch/helpers.py +141 -0
- omniload/src/errors.py +26 -0
- omniload/src/facebook_ads/__init__.py +403 -0
- omniload/src/facebook_ads/exceptions.py +19 -0
- omniload/src/facebook_ads/helpers.py +296 -0
- omniload/src/facebook_ads/settings.py +224 -0
- omniload/src/facebook_ads/utils.py +53 -0
- omniload/src/factory.py +305 -0
- omniload/src/filesystem/__init__.py +133 -0
- omniload/src/filesystem/helpers.py +114 -0
- omniload/src/filesystem/readers.py +187 -0
- omniload/src/filters.py +62 -0
- omniload/src/fireflies/__init__.py +151 -0
- omniload/src/fireflies/helpers.py +753 -0
- omniload/src/fluxx/__init__.py +10013 -0
- omniload/src/fluxx/helpers.py +233 -0
- omniload/src/frankfurter/__init__.py +157 -0
- omniload/src/frankfurter/helpers.py +48 -0
- omniload/src/freshdesk/__init__.py +103 -0
- omniload/src/freshdesk/freshdesk_client.py +151 -0
- omniload/src/freshdesk/settings.py +23 -0
- omniload/src/fundraiseup/__init__.py +95 -0
- omniload/src/fundraiseup/client.py +81 -0
- omniload/src/github/__init__.py +202 -0
- omniload/src/github/helpers.py +207 -0
- omniload/src/github/queries.py +129 -0
- omniload/src/github/settings.py +24 -0
- omniload/src/google_ads/__init__.py +198 -0
- omniload/src/google_ads/field.py +17 -0
- omniload/src/google_ads/metrics.py +254 -0
- omniload/src/google_ads/predicates.py +37 -0
- omniload/src/google_ads/reports.py +411 -0
- omniload/src/google_ads/test_google_ads.py +184 -0
- omniload/src/google_analytics/__init__.py +144 -0
- omniload/src/google_analytics/helpers.py +312 -0
- omniload/src/google_sheets/README.md +95 -0
- omniload/src/google_sheets/__init__.py +166 -0
- omniload/src/google_sheets/helpers/__init__.py +15 -0
- omniload/src/google_sheets/helpers/api_calls.py +160 -0
- omniload/src/google_sheets/helpers/data_processing.py +316 -0
- omniload/src/gorgias/__init__.py +595 -0
- omniload/src/gorgias/helpers.py +166 -0
- omniload/src/hostaway/__init__.py +302 -0
- omniload/src/hostaway/client.py +288 -0
- omniload/src/http/__init__.py +38 -0
- omniload/src/http/readers.py +146 -0
- omniload/src/http_client.py +24 -0
- omniload/src/hubspot/__init__.py +800 -0
- omniload/src/hubspot/helpers.py +417 -0
- omniload/src/hubspot/settings.py +329 -0
- omniload/src/indeed/__init__.py +153 -0
- omniload/src/indeed/helpers.py +228 -0
- omniload/src/influxdb/__init__.py +46 -0
- omniload/src/influxdb/client.py +34 -0
- omniload/src/intercom/__init__.py +142 -0
- omniload/src/intercom/helpers.py +674 -0
- omniload/src/intercom/settings.py +279 -0
- omniload/src/isoc_pulse/__init__.py +159 -0
- omniload/src/jira_source/__init__.py +377 -0
- omniload/src/jira_source/helpers.py +510 -0
- omniload/src/jira_source/settings.py +184 -0
- omniload/src/kafka/__init__.py +120 -0
- omniload/src/kafka/helpers.py +241 -0
- omniload/src/kinesis/__init__.py +153 -0
- omniload/src/kinesis/helpers.py +96 -0
- omniload/src/klaviyo/__init__.py +237 -0
- omniload/src/klaviyo/client.py +212 -0
- omniload/src/klaviyo/helpers.py +19 -0
- omniload/src/linear/__init__.py +634 -0
- omniload/src/linear/helpers.py +111 -0
- omniload/src/linkedin_ads/__init__.py +266 -0
- omniload/src/linkedin_ads/dimension_time_enum.py +17 -0
- omniload/src/linkedin_ads/helpers.py +246 -0
- omniload/src/loader.py +69 -0
- omniload/src/mailchimp/__init__.py +126 -0
- omniload/src/mailchimp/helpers.py +226 -0
- omniload/src/mailchimp/settings.py +164 -0
- omniload/src/masking.py +344 -0
- omniload/src/mixpanel/__init__.py +62 -0
- omniload/src/mixpanel/client.py +104 -0
- omniload/src/monday/__init__.py +246 -0
- omniload/src/monday/helpers.py +392 -0
- omniload/src/monday/settings.py +325 -0
- omniload/src/mongodb/__init__.py +281 -0
- omniload/src/mongodb/helpers.py +975 -0
- omniload/src/notion/__init__.py +69 -0
- omniload/src/notion/helpers/__init__.py +14 -0
- omniload/src/notion/helpers/client.py +178 -0
- omniload/src/notion/helpers/database.py +92 -0
- omniload/src/notion/settings.py +17 -0
- omniload/src/partition.py +32 -0
- omniload/src/personio/__init__.py +345 -0
- omniload/src/personio/helpers.py +100 -0
- omniload/src/phantombuster/__init__.py +65 -0
- omniload/src/phantombuster/client.py +87 -0
- omniload/src/pinterest/__init__.py +82 -0
- omniload/src/pipedrive/__init__.py +212 -0
- omniload/src/pipedrive/helpers/__init__.py +37 -0
- omniload/src/pipedrive/helpers/custom_fields_munger.py +116 -0
- omniload/src/pipedrive/helpers/pages.py +129 -0
- omniload/src/pipedrive/settings.py +41 -0
- omniload/src/pipedrive/typing.py +17 -0
- omniload/src/plusvibeai/__init__.py +335 -0
- omniload/src/plusvibeai/helpers.py +544 -0
- omniload/src/plusvibeai/settings.py +252 -0
- omniload/src/primer/__init__.py +45 -0
- omniload/src/primer/helpers.py +79 -0
- omniload/src/quickbooks/__init__.py +117 -0
- omniload/src/reddit_ads/__init__.py +183 -0
- omniload/src/reddit_ads/helpers.py +232 -0
- omniload/src/resource.py +40 -0
- omniload/src/revenuecat/__init__.py +83 -0
- omniload/src/revenuecat/helpers.py +237 -0
- omniload/src/salesforce/__init__.py +170 -0
- omniload/src/salesforce/helpers.py +78 -0
- omniload/src/shopify/__init__.py +1953 -0
- omniload/src/shopify/exceptions.py +17 -0
- omniload/src/shopify/helpers.py +202 -0
- omniload/src/shopify/settings.py +19 -0
- omniload/src/slack/__init__.py +290 -0
- omniload/src/slack/helpers.py +218 -0
- omniload/src/slack/settings.py +36 -0
- omniload/src/smartsheets/__init__.py +82 -0
- omniload/src/snapchat_ads/__init__.py +455 -0
- omniload/src/snapchat_ads/client.py +72 -0
- omniload/src/snapchat_ads/helpers.py +630 -0
- omniload/src/snapchat_ads/settings.py +130 -0
- omniload/src/socrata_source/__init__.py +83 -0
- omniload/src/socrata_source/helpers.py +85 -0
- omniload/src/socrata_source/settings.py +8 -0
- omniload/src/solidgate/__init__.py +219 -0
- omniload/src/solidgate/helpers.py +154 -0
- omniload/src/sources.py +5408 -0
- omniload/src/sql_database/__init__.py +0 -0
- omniload/src/sql_database/callbacks.py +66 -0
- omniload/src/stripe_analytics/__init__.py +183 -0
- omniload/src/stripe_analytics/helpers.py +386 -0
- omniload/src/stripe_analytics/settings.py +80 -0
- omniload/src/table_definition.py +15 -0
- omniload/src/testdata/fakebqcredentials.json +14 -0
- omniload/src/tiktok_ads/__init__.py +150 -0
- omniload/src/tiktok_ads/tiktok_helpers.py +130 -0
- omniload/src/time.py +11 -0
- omniload/src/trustpilot/__init__.py +48 -0
- omniload/src/trustpilot/client.py +48 -0
- omniload/src/version.py +6 -0
- omniload/src/wise/__init__.py +68 -0
- omniload/src/wise/client.py +63 -0
- omniload/src/zendesk/__init__.py +480 -0
- omniload/src/zendesk/helpers/__init__.py +39 -0
- omniload/src/zendesk/helpers/api_helpers.py +119 -0
- omniload/src/zendesk/helpers/credentials.py +68 -0
- omniload/src/zendesk/helpers/talk_api.py +132 -0
- omniload/src/zendesk/settings.py +71 -0
- omniload/src/zoom/__init__.py +99 -0
- omniload/src/zoom/helpers.py +102 -0
- omniload/testdata/.gitignore +2 -0
- omniload/testdata/create_replace.csv +21 -0
- omniload/testdata/delete_insert_expected.csv +6 -0
- omniload/testdata/delete_insert_part1.csv +5 -0
- omniload/testdata/delete_insert_part2.csv +6 -0
- omniload/testdata/merge_expected.csv +5 -0
- omniload/testdata/merge_part1.csv +4 -0
- omniload/testdata/merge_part2.csv +5 -0
- omniload/tests/unit/test_smartsheets.py +133 -0
- omniload-0.0.0.dev0.dist-info/METADATA +439 -0
- omniload-0.0.0.dev0.dist-info/RECORD +218 -0
- omniload-0.0.0.dev0.dist-info/WHEEL +4 -0
- omniload-0.0.0.dev0.dist-info/entry_points.txt +2 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.Apache-2.0 +201 -0
- omniload-0.0.0.dev0.dist-info/licenses/LICENSE.md +21 -0
- omniload-0.0.0.dev0.dist-info/licenses/NOTICE +35 -0
|
@@ -0,0 +1,975 @@
|
|
|
1
|
+
# Copyright 2022-2025 ScaleVector
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""Mongo database source helpers and destination utilities"""
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from itertools import islice
|
|
19
|
+
from typing import (
|
|
20
|
+
TYPE_CHECKING,
|
|
21
|
+
Any,
|
|
22
|
+
Dict,
|
|
23
|
+
Iterable,
|
|
24
|
+
Iterator,
|
|
25
|
+
List,
|
|
26
|
+
Mapping,
|
|
27
|
+
Optional,
|
|
28
|
+
Tuple,
|
|
29
|
+
Union,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
import dlt
|
|
33
|
+
from bson.decimal128 import Decimal128
|
|
34
|
+
from bson.objectid import ObjectId
|
|
35
|
+
from bson.regex import Regex
|
|
36
|
+
from bson.timestamp import Timestamp
|
|
37
|
+
from dlt.common import logger
|
|
38
|
+
from dlt.common.configuration.specs import BaseConfiguration, configspec
|
|
39
|
+
from dlt.common.data_writers import TDataItemFormat
|
|
40
|
+
from dlt.common.time import ensure_pendulum_datetime
|
|
41
|
+
from dlt.common.typing import TDataItem
|
|
42
|
+
from dlt.common.utils import map_nested_in_place
|
|
43
|
+
from pendulum import _datetime
|
|
44
|
+
from pymongo import ASCENDING, DESCENDING, MongoClient
|
|
45
|
+
from pymongo.collection import Collection
|
|
46
|
+
from pymongo.cursor import Cursor
|
|
47
|
+
from pymongo.helpers_shared import _fields_list_to_dict
|
|
48
|
+
|
|
49
|
+
if TYPE_CHECKING:
|
|
50
|
+
TMongoClient = MongoClient[Any]
|
|
51
|
+
TCollection = Collection[Any]
|
|
52
|
+
TCursor = Cursor[Any]
|
|
53
|
+
else:
|
|
54
|
+
TMongoClient = Any
|
|
55
|
+
TCollection = Any
|
|
56
|
+
TCursor = Any
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
import pymongoarrow # type: ignore
|
|
60
|
+
|
|
61
|
+
PYMONGOARROW_AVAILABLE = True
|
|
62
|
+
except ImportError:
|
|
63
|
+
PYMONGOARROW_AVAILABLE = False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class CollectionLoader:
    """Sequential loader that reads a MongoDB collection chunk by chunk.

    The filter, sort, projection and limit of the `find` query are derived
    from the optional incremental configuration and the arguments passed to
    `load_documents`.
    """

    def __init__(
        self,
        client: TMongoClient,
        collection: TCollection,
        chunk_size: int,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> None:
        """Store the connection objects and the incremental cursor state.

        Args:
            client (TMongoClient): The client the collection belongs to.
            collection (TCollection): The collection to read from.
            chunk_size (int): The maximum number of documents per yielded chunk.
            incremental (Optional[dlt.sources.incremental[Any]]): The incremental
                configuration. Without it no cursor filter or sort is built.
        """
        self.client = client
        self.collection = collection
        self.incremental = incremental
        self.chunk_size = chunk_size

        if incremental:
            self.cursor_field = incremental.cursor_path
            self.last_value = incremental.last_value
        else:
            # define `cursor_field` on every instance, not only on incremental ones
            self.cursor_field = None
            # name that used to be assigned in this branch; kept for external readers
            self.cursor_column = None
            self.last_value = None

    @property
    def _sort_op(self) -> List[Optional[Tuple[str, int]]]:
        """Build a sorting operator for the cursor field.

        Returns:
            List[Optional[Tuple[str, int]]]: A single `(cursor_field, direction)` pair,
                or an empty list when there is no incremental, no last value or
                no matching `row_order` / `last_value_func` combination.
        """
        if not self.incremental or not self.last_value:
            return []

        if (
            self.incremental.row_order == "asc"
            and self.incremental.last_value_func is max
        ) or (
            self.incremental.row_order == "desc"
            and self.incremental.last_value_func is min
        ):
            return [(self.cursor_field, ASCENDING)]

        elif (
            self.incremental.row_order == "asc"
            and self.incremental.last_value_func is min
        ) or (
            self.incremental.row_order == "desc"
            and self.incremental.last_value_func is max
        ):
            return [(self.cursor_field, DESCENDING)]

        return []

    @property
    def _filter_op(self) -> Dict[str, Any]:
        """Build a filtering operator.

        Includes a field and the filtering condition for it. A new dictionary
        is built on every access, so callers may mutate the result.

        Returns:
            Dict[str, Any]: A dictionary with the filter operator.
        """
        if not (self.incremental and self.last_value):
            return {}

        filt = {}
        if self.incremental.last_value_func is max:
            filt = {self.cursor_field: {"$gte": self.last_value}}
            if self.incremental.end_value:
                filt[self.cursor_field]["$lt"] = self.incremental.end_value

        elif self.incremental.last_value_func is min:
            filt = {self.cursor_field: {"$lte": self.last_value}}
            if self.incremental.end_value:
                filt[self.cursor_field]["$gt"] = self.incremental.end_value

        return filt

    def _projection_op(
        self, projection: Optional[Union[Mapping[str, Any], Iterable[str]]]
    ) -> Optional[Dict[str, Any]]:
        """Build a projection operator.

        Args:
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): A tuple of fields to include or a dict specifying fields to include or exclude.
                The incremental `primary_key` needs to be handled differently for inclusion
                and exclusion projections.

        Returns:
            Optional[Dict[str, Any]]: A dictionary with the projection operator,
                or `None` when no projection was requested.
        """
        if projection is None:
            return None

        projection_dict = dict(_fields_list_to_dict(projection, "projection"))
        if not self.incremental:
            return projection_dict

        primary_key = self.incremental.primary_key
        if isinstance(primary_key, str):
            key_fields = [primary_key]
        elif isinstance(primary_key, (list, tuple)):
            key_fields = list(primary_key)
        else:
            # no usable field name (e.g. `None`): leave the projection untouched
            key_fields = []

        if any(v == 1 for v in projection_dict.values()):
            # this is an inclusion projection: ensure primary_key is included
            for field in key_fields:
                projection_dict[field] = 1
        else:
            # this is an exclusion projection: ensure primary_key isn't excluded
            for field in key_fields:
                if field in projection_dict:
                    del projection_dict[field]
                    logger.warning(
                        f"Primary key `{field}` was removed from exclusion projection"
                    )

        return projection_dict

    def _limit(self, cursor: Cursor, limit: Optional[int] = None) -> TCursor:  # type: ignore
        """Apply a limit to the cursor, if needed.

        Args:
            cursor (Cursor): The cursor to apply the limit.
            limit (Optional[int]): The number of documents to load. `0` and `None`
                mean "no limit"; the absolute value is used otherwise.

        Returns:
            Cursor: The cursor with the limit applied (if given).
        """
        if limit not in (0, None):
            if self.incremental is None or self.incremental.last_value_func is None:
                logger.warning(
                    "Using limit without ordering - results may be inconsistent."
                )

            cursor = cursor.limit(abs(limit))

        return cursor

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Construct the query and load the documents from the collection.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        filter_op = self._filter_op
        _raise_if_intersection(filter_op, filter_)
        filter_op.update(filter_)

        projection_op = self._projection_op(projection)

        cursor = self.collection.find(filter=filter_op, projection=projection_op)
        sort_op = self._sort_op
        if sort_op:
            cursor = cursor.sort(sort_op)

        cursor = self._limit(cursor, limit)

        while docs_slice := list(islice(cursor, self.chunk_size)):
            res = map_nested_in_place(convert_mongo_objs, docs_slice)
            # only the first document of the chunk is inspected for a dict-valued `_id`
            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
                yield dlt.mark.with_hints(
                    res,
                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
                )
            else:
                yield res
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class CollectionLoaderParallel(CollectionLoader):
    """Collection loader that splits the read into skip/limit batches.

    Every batch is handed to `_run_batch`, which is wrapped with `dlt.defer`.
    """

    def _get_document_count(self, filter_: Optional[Dict[str, Any]] = None) -> int:
        """Count the documents the reading cursor is going to match.

        Args:
            filter_ (Optional[Dict[str, Any]]): The additional filter that will be
                applied to the collection. It is merged into the incremental
                filter the same way `_get_cursor` does, so the count is not
                larger than the number of documents the cursor returns.

        Returns:
            int: The number of matching documents.
        """
        filter_op = self._filter_op
        if filter_:
            _raise_if_intersection(filter_op, filter_)
            filter_op.update(filter_)
        return self.collection.count_documents(filter=filter_op)

    def _create_batches(
        self, limit: Optional[int] = None, filter_: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, int]]:
        """Plan the `skip` / `limit` windows that cover the documents to load.

        Args:
            limit (Optional[int]): The maximum number of documents to load.
                The absolute value is used; `0` and `None` mean "no limit".
            filter_ (Optional[Dict[str, Any]]): The additional filter used to
                count the documents.

        Returns:
            List[Dict[str, int]]: One `{"skip": ..., "limit": ...}` entry per batch.
        """
        doc_count = self._get_document_count(filter_)
        if limit:
            doc_count = min(doc_count, abs(limit))

        # the last window is shortened to the number of documents still left
        return [
            dict(skip=sk, limit=min(self.chunk_size, doc_count - sk))
            for sk in range(0, doc_count, self.chunk_size)
        ]

    def _get_cursor(
        self,
        filter_: Dict[str, Any],
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> TCursor:
        """Get a reading cursor for the collection.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Returns:
            Cursor: The cursor for the collection.
        """
        filter_op = self._filter_op
        _raise_if_intersection(filter_op, filter_)
        filter_op.update(filter_)

        projection_op = self._projection_op(projection)

        cursor = self.collection.find(filter=filter_op, projection=projection_op)
        sort_op = self._sort_op
        if sort_op:
            cursor = cursor.sort(sort_op)

        return cursor

    @dlt.defer
    def _run_batch(self, cursor: TCursor, batch: Dict[str, int]) -> TDataItem:
        """Load the documents of a single batch.

        Args:
            cursor (TCursor): The template cursor; it is cloned, so the
                original cursor is never advanced.
            batch (Dict[str, int]): The `skip` and `limit` of the batch.

        Returns:
            TDataItem: The list of converted documents of the batch.
        """
        batch_cursor = cursor.clone().skip(batch["skip"]).limit(batch["limit"])
        return [
            map_nested_in_place(convert_mongo_objs, document)
            for document in batch_cursor
        ]

    def _get_all_batches(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load all documents from the collection in parallel batches.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The maximum number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        batches = self._create_batches(limit=limit, filter_=filter_)
        cursor = self._get_cursor(filter_=filter_, projection=projection)

        for batch in batches:
            yield self._run_batch(cursor=cursor, batch=batch)

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load documents from the collection in parallel.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        yield from self._get_all_batches(
            limit=limit, filter_=filter_, projection=projection
        )
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
class CollectionArrowLoader(CollectionLoader):
    """
    Collection loader that reads raw BSON batches from Mongo DB
    and hands them over as Apache Arrow data.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[Any]:
        """
        Read the collection as raw batches and yield one converted Arrow table per batch.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[Any]: An iterator of the loaded documents.
        """
        from pymongoarrow.lib import process_bson_stream  # type: ignore
        from pymongoarrow.context import PyMongoArrowContext  # type: ignore

        # the merged filter is built (and checked for clashes) but, as in the
        # original code, it is not what gets sent to `find_raw_batches`
        merged_filter = self._filter_op
        _raise_if_intersection(merged_filter, filter_)
        merged_filter.update(filter_)

        selected_fields = self._projection_op(projection)

        # NOTE the merged filter isn't passed, only the caller's `filter_`
        raw_cursor = self.collection.find_raw_batches(
            filter_,
            batch_size=self.chunk_size,
            projection=selected_fields,
        )

        ordering = self._sort_op
        if ordering:
            raw_cursor = raw_cursor.sort(ordering)  # type: ignore
        raw_cursor = self._limit(raw_cursor, limit)  # type: ignore

        # a single context is shared by all batches and finished after each one
        arrow_context = PyMongoArrowContext.from_schema(
            schema=pymongoarrow_schema,
            codec_options=self.collection.codec_options,
        )
        for raw_batch in raw_cursor:
            process_bson_stream(raw_batch, arrow_context)
            yield convert_arrow_columns(arrow_context.finish())
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class CollectionArrowLoaderParallel(CollectionLoaderParallel):
    """
    Batch-wise Mongo DB collection loader that converts
    raw BSON batches with Apache Arrow.
    """

    def _get_cursor(
        self,
        filter_: Dict[str, Any],
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> TCursor:
        """Create the raw-batch cursor used as the template for every batch.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.

        Returns:
            Cursor: The cursor for the collection.
        """
        merged_filter = self._filter_op
        _raise_if_intersection(merged_filter, filter_)
        merged_filter.update(filter_)

        selected_fields = self._projection_op(projection)

        raw_cursor = self.collection.find_raw_batches(
            filter=merged_filter,
            batch_size=self.chunk_size,
            projection=selected_fields,
        )
        ordering = self._sort_op
        if ordering:
            raw_cursor = raw_cursor.sort(ordering)  # type: ignore

        return raw_cursor

    @dlt.defer
    def _run_batch(
        self,
        cursor: TCursor,
        batch: Dict[str, int],
        pymongoarrow_schema: Any = None,
    ) -> TDataItem:
        """Read one skip/limit window and yield its converted Arrow tables.

        NOTE(review): this is a generator wrapped with `dlt.defer`; confirm
        against the dlt documentation how deferred generators are evaluated.

        Args:
            cursor (TCursor): The template cursor; a clone of it is consumed.
            batch (Dict[str, int]): The `skip` and `limit` of the batch.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            TDataItem: One converted table per raw chunk of the batch.
        """
        from pymongoarrow.lib import process_bson_stream
        from pymongoarrow.context import PyMongoArrowContext

        batch_cursor = cursor.clone()

        arrow_context = PyMongoArrowContext.from_schema(
            schema=pymongoarrow_schema,
            codec_options=self.collection.codec_options,
        )
        window = batch_cursor.skip(batch["skip"]).limit(batch["limit"])
        for raw_chunk in window:
            process_bson_stream(raw_chunk, arrow_context)
            yield convert_arrow_columns(arrow_context.finish())

    def _get_all_batches(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Plan the batches and yield one deferred `_run_batch` call per batch.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The maximum number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        planned = self._create_batches(limit=limit)
        template_cursor = self._get_cursor(filter_=filter_, projection=projection)
        for window in planned:
            yield self._run_batch(
                cursor=template_cursor,
                batch=window,
                pymongoarrow_schema=pymongoarrow_schema,
            )

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Load documents from the collection batch by batch.

        Args:
            filter_ (Dict[str, Any]): The filter to apply to the collection.
            limit (Optional[int]): The number of documents to load.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): The projection to select fields to create the Cursor.
            pymongoarrow_schema (Any): The mapping of field types to convert BSON to Arrow.

        Yields:
            Iterator[TDataItem]: An iterator of the loaded documents.
        """
        all_batches = self._get_all_batches(
            limit=limit,
            filter_=filter_,
            projection=projection,
            pymongoarrow_schema=pymongoarrow_schema,
        )
        for item in all_batches:
            yield item
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
class CollectionAggregationLoader(CollectionLoader):
    """
    MongoDB collection loader that uses aggregation pipelines instead of find queries.

    Without a custom query it behaves exactly like `CollectionLoader`.
    """

    def __init__(
        self,
        client: TMongoClient,
        collection: TCollection,
        chunk_size: int,
        incremental: Optional[dlt.sources.incremental[Any]] = None,
    ) -> None:
        """Initialise the base loader and start without a custom query.

        Args:
            client (TMongoClient): The client the collection belongs to.
            collection (TCollection): The collection to read from.
            chunk_size (int): The maximum number of documents per yielded chunk.
            incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
        """
        super().__init__(client, collection, chunk_size, incremental)
        self.custom_query: Optional[List[Dict[str, Any]]] = None

    def set_custom_query(self, query: List[Dict[str, Any]]) -> None:
        """Set the custom aggregation pipeline query"""
        self.custom_query = query

    @staticmethod
    def _finalize_chunk(docs: List[Any]) -> TDataItem:
        """Convert the Mongo objects of a chunk and attach the `_id` hint if needed."""
        res = map_nested_in_place(convert_mongo_objs, docs)
        # only the first document of the chunk is inspected for a dict-valued `_id`
        if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
            return dlt.mark.with_hints(
                res,
                dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
            )
        return res

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Load documents using aggregation pipeline

        Args:
            filter_ (Dict[str, Any]): The filter to apply; it is prepended to the
                pipeline as a `$match` stage.
            limit (Optional[int]): The number of documents to load. `0` and `None`
                mean "no limit"; the absolute value is used otherwise, as in the
                find-based loaders.
            projection (Optional[Union[Mapping[str, Any], Iterable[str]]]): Only used
                by the fallback to the parent loader; it is not applied to a
                custom pipeline.

        Yields:
            Iterator[TDataItem]: Lists of at most `chunk_size` converted documents.
        """
        if not self.custom_query:
            # Fallback to parent method if no custom query
            yield from super().load_documents(filter_, limit, projection)
            return

        # Work on a copy so the stored query is never modified
        pipeline = list(self.custom_query)

        # For custom queries, we assume incremental filtering is already handled
        # via interval placeholders (:interval_start, :interval_end) in the query itself.
        # We don't add additional incremental filtering to avoid conflicts.

        # Add additional filter if provided
        if filter_:
            pipeline.insert(0, {"$match": filter_})

        # Add limit if specified
        if limit not in (0, None):
            pipeline.append({"$limit": abs(limit)})

        # Add maxTimeMS to prevent hanging
        cursor = self.collection.aggregate(
            pipeline,
            allowDiskUse=True,
            batchSize=min(self.chunk_size, 101),
            maxTimeMS=30000,  # 30 second timeout
        )

        docs_buffer: List[Any] = []
        try:
            for doc in cursor:
                docs_buffer.append(doc)
                if len(docs_buffer) >= self.chunk_size:
                    yield self._finalize_chunk(docs_buffer)
                    docs_buffer = []

            # Yield any remaining documents
            if docs_buffer:
                yield self._finalize_chunk(docs_buffer)
        finally:
            # release the server-side cursor even if the consumer stops early
            cursor.close()
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
    """
    Parallel-flavoured variant of the MongoDB aggregation pipeline loader.

    Parallel loading is not supported for aggregation pipelines, so this class
    only logs a warning and delegates to the sequential parent implementation.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
    ) -> Iterator[TDataItem]:
        """Log the sequential fallback, then stream the parent loader's batches.

        Args:
            filter_: Filter forwarded unchanged to the parent loader.
            limit: Optional document limit forwarded unchanged to the parent loader.
            projection: Projection forwarded unchanged to the parent loader.

        Yields:
            Whatever the parent ``load_documents`` yields, in the same order.
        """
        logger.warning(
            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
        )
        sequential_batches = super().load_documents(filter_, limit, projection)
        yield from sequential_batches
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
class CollectionAggregationArrowLoader(CollectionAggregationLoader):
    """
    MongoDB aggregation pipeline loader that emits Apache Arrow tables.

    Documents are first loaded by the parent loader as Python objects and each
    non-empty batch is then converted into an Arrow table.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[Any]:
        """Load documents through the aggregation pipeline and convert them to Arrow.

        Args:
            filter_: Filter forwarded to the parent loader.
            limit: Optional document limit forwarded to the parent loader.
            projection: Projection forwarded to the parent loader.
            pymongoarrow_schema: Accepted for signature compatibility with the
                other Arrow loaders; this method never reads it.

        Yields:
            One converted Arrow table per non-empty batch, or the raw batch when
            an ``ImportError`` is raised during the conversion.
        """
        logger.warning(
            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
        )

        for rows in super().load_documents(filter_, limit, projection):
            if not rows:
                # Empty batches are dropped rather than converted.
                continue

            try:
                from dlt.common.libs.pyarrow import pyarrow

                arrow_table = pyarrow.Table.from_pylist(rows)
                yield convert_arrow_columns(arrow_table)
            except ImportError:
                logger.warning(
                    "PyArrow not available, falling back to object format"
                )
                yield rows
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
    """
    Parallel-flavoured variant of the Arrow aggregation loader.

    Parallel loading is not supported for aggregation pipelines, so this class
    logs a warning and reuses the sequential Arrow loader.
    """

    def load_documents(
        self,
        filter_: Dict[str, Any],
        limit: Optional[int] = None,
        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
        pymongoarrow_schema: Any = None,
    ) -> Iterator[TDataItem]:
        """Log the sequential fallback, then stream the parent Arrow loader's output.

        Args:
            filter_: Filter forwarded unchanged to the parent loader.
            limit: Optional document limit forwarded unchanged to the parent loader.
            projection: Projection forwarded unchanged to the parent loader.
            pymongoarrow_schema: Schema forwarded unchanged to the parent loader.

        Yields:
            Whatever the parent ``load_documents`` yields, in the same order.
        """
        logger.warning(
            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
        )
        yield from super().load_documents(
            filter_=filter_,
            limit=limit,
            projection=projection,
            pymongoarrow_schema=pymongoarrow_schema,
        )
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def collection_documents(
    client: TMongoClient,
    collection: TCollection,
    filter_: Dict[str, Any],
    projection: Union[Dict[str, Any], List[str]],
    pymongoarrow_schema: "pymongoarrow.schema.Schema",
    incremental: Optional[dlt.sources.incremental[Any]] = None,
    parallel: bool = False,
    limit: Optional[int] = None,
    chunk_size: Optional[int] = 10000,
    data_item_format: Optional[TDataItemFormat] = "object",
    custom_query: Optional[List[Dict[str, Any]]] = None,
) -> Iterator[TDataItem]:
    """
    Stream the documents of one MongoDB collection through the matching loader.

    The loader class is chosen from three switches: whether a custom
    aggregation pipeline was given, whether parallel loading was requested and
    whether Arrow output was requested.

    Args:
        client (MongoClient): The PyMongo client `pymongo.MongoClient` instance.
        collection (Collection): The collection `pymongo.collection.Collection` to load.
        filter_ (Dict[str, Any]): The filter to apply to the collection.
        projection (Union[Dict[str, Any], List[str]]): Fields to select when loading
            the collection. Supported inputs:
                include (list) - ["year", "title"]
                include (dict) - {"year": True, "title": True}
                exclude (dict) - {"released": False, "runtime": False}
            Note: include and exclude statements can't be mixed, e.g.
            `{"title": True, "released": False}`.
        pymongoarrow_schema (pymongoarrow.schema.Schema): The mapping of field types
            used to convert BSON to Arrow. Only passed on to Arrow loaders.
        incremental (Optional[dlt.sources.incremental[Any]]): The incremental configuration.
        parallel (bool): Option to enable parallel loading for the collection. Default is False.
        limit (Optional[int]): The maximum number of documents to load.
        chunk_size (Optional[int]): The number of documents to load in each batch.
        data_item_format (Optional[TDataItemFormat]): The data format to use for loading.
            Supported formats:
                object - Python objects (dicts, lists).
                arrow - Apache Arrow tables.
            Falls back to "object" when "arrow" is requested but
            `pymongoarrow` is not available.
        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation
            pipeline to execute instead of find().

    Yields:
        TDataItem: The items produced by the selected loader's `load_documents`.
    """
    if data_item_format == "arrow" and not PYMONGOARROW_AVAILABLE:
        dlt.common.logger.warn(
            "'pymongoarrow' is not installed; falling back to standard MongoDB CollectionLoader."
        )
        data_item_format = "object"

    wants_arrow = data_item_format == "arrow"

    if not wants_arrow and pymongoarrow_schema:
        dlt.common.logger.warn(
            "Received value for `pymongoarrow_schema`, but `data_item_format=='object'` "
            "Use `data_item_format=='arrow'` to enforce schema."
        )

    if wants_arrow and pymongoarrow_schema and projection:
        dlt.common.logger.warn(
            "Received values for both `pymongoarrow_schema` and `projection`. Since both "
            "create a projection to select fields, `projection` will be ignored."
        )

    # Keyed by (custom query given, parallel requested, arrow output requested).
    loader_by_mode = {
        (True, True, True): CollectionAggregationArrowLoaderParallel,
        (True, True, False): CollectionAggregationLoaderParallel,
        (True, False, True): CollectionAggregationArrowLoader,
        (True, False, False): CollectionAggregationLoader,
        (False, True, True): CollectionArrowLoaderParallel,
        (False, True, False): CollectionLoaderParallel,
        (False, False, True): CollectionArrowLoader,
        (False, False, False): CollectionLoader,
    }
    loader_cls = loader_by_mode[(bool(custom_query), bool(parallel), wants_arrow)]
    loader = loader_cls(
        client, collection, incremental=incremental, chunk_size=chunk_size
    )

    # Only the aggregation loaders are expected to expose `set_custom_query`.
    if custom_query and hasattr(loader, "set_custom_query"):
        loader.set_custom_query(custom_query)

    arrow_loader_types = (
        CollectionArrowLoader,
        CollectionArrowLoaderParallel,
        CollectionAggregationArrowLoader,
        CollectionAggregationArrowLoaderParallel,
    )
    load_kwargs: Dict[str, Any] = {
        "limit": limit,
        "filter_": filter_,
        "projection": projection,
    }
    # Arrow loaders additionally accept the BSON-to-Arrow schema.
    if isinstance(loader, arrow_loader_types):
        load_kwargs["pymongoarrow_schema"] = pymongoarrow_schema

    yield from loader.load_documents(**load_kwargs)
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def convert_mongo_objs(value: Any) -> Any:
    """Convert a single MongoDB/BSON value to a dlt-friendly Python value.

    Used by the Python (non-Arrow) loaders. Values of any type not listed
    below are returned unchanged.

    Conversions:
        ObjectId, Decimal128 -> ``str(value)``
        datetime             -> ``ensure_pendulum_datetime(value)``
        Regex                -> pattern string of the compiled expression
        Timestamp            -> ``ensure_pendulum_datetime(value.as_datetime())``

    Notes:
        The method `ObjectId.__str__()` creates a hexstring using
        `binascii.hexlify(__id).decode()`
    """
    if isinstance(value, (ObjectId, Decimal128)):
        converted = str(value)
    elif isinstance(value, _datetime.datetime):
        converted = ensure_pendulum_datetime(value)
    elif isinstance(value, Regex):
        converted = value.try_compile().pattern
    elif isinstance(value, Timestamp):
        converted = ensure_pendulum_datetime(value.as_datetime())
    else:
        converted = value

    return converted
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def convert_arrow_columns(table: Any) -> Any:
    """Rewrite MongoDB-specific Arrow columns of ``table`` as plain Arrow types.

    ObjectId and Decimal128 columns become string columns built with ``str()``;
    binary columns become ``pyarrow.binary()`` and code columns become
    ``pyarrow.string()``, both built from each value's ``as_py()``. All other
    columns are left untouched.

    Notes:
        Calling str() matches the `convert_mongo_objs()` used in non-arrow code.
        Pymongoarrow converts ObjectId to `fixed_size_binary[12]`, which can't be
        converted to a string as a vectorized operation because it contains ASCII
        characters.

        Instead, you need to loop over values using:
        ```python
        pyarrow.array([v.as_buffer().hex() for v in object_id_array], type=pyarrow.string())
        # pymongoarrow simplifies this by allowing this syntax
        [str(v) for v in object_id_array]
        ```

    Args:
        table (pyarrow.lib.Table): The table to convert.

    Returns:
        pyarrow.lib.Table: The table with the columns converted.
    """
    from dlt.common.libs.pyarrow import pyarrow
    from pymongoarrow.types import (  # type: ignore
        _is_binary,
        _is_code,
        _is_decimal128,
        _is_objectid,
    )

    for position, column in enumerate(table.schema):
        column_type = column.type

        if _is_objectid(column_type) or _is_decimal128(column_type):
            target_type = pyarrow.string()
            python_values = [str(cell) for cell in table[column.name]]
        elif _is_binary(column_type):
            target_type = pyarrow.binary()
            python_values = [cell.as_py() for cell in table[column.name]]
        elif _is_code(column_type):
            target_type = pyarrow.string()
            python_values = [cell.as_py() for cell in table[column.name]]
        else:
            # Not a MongoDB-specific type: keep the column as it is.
            continue

        # set_column returns a new table, so rebind it for the next iteration.
        table = table.set_column(
            position,
            pyarrow.field(column.name, target_type),
            pyarrow.array(python_values, type=target_type),
        )

    return table
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
def client_from_credentials(connection_url: str) -> TMongoClient:
    """Build a ``MongoClient`` for the given connection URL.

    The client is created with ``uuidRepresentation="standard"`` and
    ``tz_aware=True``.

    Args:
        connection_url: MongoDB connection string passed to ``MongoClient``.

    Returns:
        The newly created client.
    """
    return MongoClient(
        connection_url,
        tz_aware=True,
        uuidRepresentation="standard",
    )
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
def _raise_if_intersection(filter1: Dict[str, Any], filter2: Dict[str, Any]) -> None:
    """
    Raise an exception if both filters apply the same operator to the same field.

    A field condition is normally a mapping of operators, e.g. `{"$gt": 1}`.
    A bare value such as `{"status": "active"}` is MongoDB shorthand for an
    equality match, so it is treated as the single operator `$eq` instead of
    failing with an `AttributeError` on `.keys()`.

    Args:
        filter1 (Dict[str, Any]): The first filter.
        filter2 (Dict[str, Any]): The second filter.

    Raises:
        ValueError: If a field present in both filters uses at least one
            operator in both of them.
    """

    def _operators(condition: Any) -> Any:
        # Anything exposing keys() is taken as an operator mapping, exactly as
        # before; everything else is an implicit equality condition.
        keys = getattr(condition, "keys", None)
        if callable(keys):
            return keys()
        return {"$eq"}

    for field in filter1.keys() & filter2.keys():
        if _operators(filter1[field]) & _operators(filter2[field]):
            str_repr = str({field: filter1[field]})
            raise ValueError(
                (
                    f"Filtering operator {str_repr} is already used by the "
                    "incremental and can't be used in the filter."
                )
            )
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
@configspec
class MongoDbCollectionConfiguration(BaseConfiguration):
    # Configuration spec holding a single optional setting.
    # incremental: optional dlt incremental definition; None when not provided.
    incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg]
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
@configspec
class MongoDbCollectionResourceConfiguration(BaseConfiguration):
    # Configuration spec for a single MongoDB collection resource.
    # NOTE(review): fields defaulting to `dlt.secrets.value` / `dlt.config.value`
    # are presumably filled in by dlt's configuration injection - confirm against
    # the dlt documentation.

    # MongoDB connection string, typed as a secret value.
    connection_url: dlt.TSecretValue = dlt.secrets.value
    # Optional database name.
    database: Optional[str] = dlt.config.value
    # Name of the collection.
    collection: str = dlt.config.value
    # Optional dlt incremental definition; None when not provided.
    incremental: Optional[dlt.sources.incremental] = None # type: ignore[type-arg]
    # Optional write disposition.
    write_disposition: Optional[str] = dlt.config.value
    # Parallel-loading switch; off by default.
    parallel: Optional[bool] = False
    # Optional field projection, given as a mapping or an iterable of field names.
    projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
def convert_mongo_shell_to_extended_json(query_string: str) -> str:
    """
    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.

    This function handles common MongoDB shell constructs like ISODate, ObjectId,
    NumberLong, NumberInt, NumberDecimal, Timestamp, BinData, MinKey, MaxKey, UUID,
    DBRef and Code and converts them to their Extended JSON equivalents that can
    be parsed by bson.json_util. Text that matches none of the constructs is
    returned unchanged.

    A JavaScript `new` keyword directly in front of a constructor
    (`new ISODate("...")`) is consumed together with the constructor, so it does
    not leak into the JSON output. The BinData subtype is written as a hex string,
    as Extended JSON v2 requires, so `BinData(128, ...)` yields `"subType": "80"`.

    Args:
        query_string: A string containing MongoDB shell syntax

    Returns:
        A string with MongoDB Extended JSON v2 format

    Examples:
        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
        '{"$date": "2010-01-01T00:00:00.000Z"}'

        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
        '{"$oid": "507f1f77bcf86cd799439011"}'

        >>> convert_mongo_shell_to_extended_json('BinData(128, "AAAA")')
        '{"$binary": {"base64": "AAAA", "subType": "80"}}'
    """
    # Optional `new` keyword; \b keeps words that merely end in "new" intact.
    optional_new = r"(?:\bnew\s+)?"

    def _binary_to_extended_json(match: "re.Match[str]") -> str:
        # The shell takes the subtype as a decimal integer, while Extended JSON
        # expects it as a hex string (identical for subtypes 0-9).
        subtype_hex = format(int(match.group(1)), "x")
        return '{"$binary": {"base64": "%s", "subType": "%s"}}' % (
            match.group(2),
            subtype_hex,
        )

    # Applied in order. The quoted NumberLong/NumberInt forms must run before the
    # unquoted ones, otherwise the quotes would end up inside the value.
    rewrites = (
        # ISODate("2010-01-01T00:00:00.000+0000") -> {"$date": "..."}
        (r'ISODate\("([^"]+)"\)', r'{"$date": "\1"}'),
        # ObjectId("...") -> {"$oid": "..."}
        (r'ObjectId\("([^"]+)"\)', r'{"$oid": "\1"}'),
        # NumberLong("123") / NumberLong(123) -> {"$numberLong": "123"}
        (r'NumberLong\("([^"]+)"\)', r'{"$numberLong": "\1"}'),
        (r"NumberLong\(([^)]+)\)", r'{"$numberLong": "\1"}'),
        # NumberInt("123") / NumberInt(123) -> {"$numberInt": "123"}
        (r'NumberInt\("([^"]+)"\)', r'{"$numberInt": "\1"}'),
        (r"NumberInt\(([^)]+)\)", r'{"$numberInt": "\1"}'),
        # NumberDecimal("...") -> {"$numberDecimal": "..."}
        (r'NumberDecimal\("([^"]+)"\)', r'{"$numberDecimal": "\1"}'),
        # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
        (r"Timestamp\((\d+),\s*(\d+)\)", r'{"$timestamp": {"t": \1, "i": \2}}'),
        # BinData(4, "...") -> {"$binary": {"base64": "...", "subType": "4"}}
        (r'BinData\((\d+),\s*"([^"]+)"\)', _binary_to_extended_json),
        # MinKey() -> {"$minKey": 1}
        (r"MinKey\(\)", r'{"$minKey": 1}'),
        # MaxKey() -> {"$maxKey": 1}
        (r"MaxKey\(\)", r'{"$maxKey": 1}'),
        # UUID("...") -> {"$uuid": "..."}
        (r'UUID\("([^"]+)"\)', r'{"$uuid": "\1"}'),
        # DBRef("collection", "id") -> {"$ref": "collection", "$id": "id"}
        (r'DBRef\("([^"]+)",\s*"([^"]+)"\)', r'{"$ref": "\1", "$id": "\2"}'),
        # Code("...") -> {"$code": "..."}
        (r'Code\("([^"]+)"\)', r'{"$code": "\1"}'),
    )

    converted = query_string
    for pattern, replacement in rewrites:
        converted = re.sub(optional_new + pattern, replacement, converted)

    return converted
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
# Module-level source name constant.
# NOTE(review): its consumers are not visible in this part of the file - confirm usage.
__source_name__ = "mongodb"
|
|
962
|
+
|
|
963
|
+
|
|
964
|
+
# MongoDB destination helper functions
|
|
965
|
+
def process_file_items(file_path: str) -> list[dict]:
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped. Every other
    line is parsed with ``json.loads`` and kept whole, so all fields (including
    DLT metadata) are preserved.

    The file is read as UTF-8 rather than with the platform's default encoding,
    so non-ASCII content is decoded the same way on every system.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed records in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import json

    documents = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            if record:
                documents.append(json.loads(record))
    return documents
|