ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- ingestr/main.py +22 -3
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +169 -1
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +26 -23
- ingestr/src/facebook_ads/helpers.py +47 -1
- ingestr/src/factory.py +48 -0
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +9 -0
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -163
- ingestr/src/frankfurter/helpers.py +3 -3
- ingestr/src/freshdesk/__init__.py +25 -8
- ingestr/src/freshdesk/freshdesk_client.py +40 -5
- ingestr/src/fundraiseup/__init__.py +49 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +6 -4
- ingestr/src/google_analytics/__init__.py +1 -1
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/hubspot/__init__.py +6 -12
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/klaviyo/__init__.py +5 -5
- ingestr/src/linear/__init__.py +553 -116
- ingestr/src/linear/helpers.py +77 -38
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +5 -2
- ingestr/src/mongodb/helpers.py +384 -10
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -1
- ingestr/src/smartsheets/__init__.py +33 -5
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/sources.py +1418 -54
- ingestr/src/stripe_analytics/__init__.py +2 -19
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/tests/unit/test_smartsheets.py +6 -9
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/mongodb/__init__.py
CHANGED
@@ -101,12 +101,13 @@ def mongodb_collection(
     write_disposition: Optional[str] = dlt.config.value,
     parallel: Optional[bool] = False,
     limit: Optional[int] = None,
-    chunk_size: Optional[int] =
+    chunk_size: Optional[int] = 1000,
     data_item_format: Optional[TDataItemFormat] = "object",
     filter_: Optional[Dict[str, Any]] = None,
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
     pymongoarrow_schema: Optional[Any] = None,
-
+    custom_query: Optional[List[Dict[str, Any]]] = None,
+) -> DltResource:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.

@@ -132,6 +133,7 @@ def mongodb_collection(
         exclude (dict) - {"released": False, "runtime": False}
         Note: Can't mix include and exclude statements '{"title": True, "released": False}`
         pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.

@@ -161,4 +163,5 @@ def mongodb_collection(
         filter_=filter_ or {},
         projection=projection,
         pymongoarrow_schema=pymongoarrow_schema,
+        custom_query=custom_query,
     )
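Taken together, these hunks add a custom_query parameter to mongodb_collection (threaded through to collection_documents in helpers.py below) and lower the default chunk_size to 1000. A minimal usage sketch follows; only custom_query and chunk_size come from this diff, while the connection arguments and the pipeline stages are hypothetical placeholders.

    # Illustrative only: connection parameter names and values are assumed,
    # not taken from this diff.
    from ingestr.src.mongodb import mongodb_collection

    orders = mongodb_collection(
        connection_url="mongodb://localhost:27017",  # assumed parameter name
        database="shop",                             # assumed
        collection="orders",                         # assumed
        custom_query=[                               # new in this release
            {"$match": {"status": "paid"}},
            {"$project": {"status": 1, "total": 1, "updated_at": 1}},
        ],
        chunk_size=1000,
    )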
ingestr/src/mongodb/helpers.py
CHANGED
@@ -1,5 +1,6 @@
-"""Mongo database source helpers"""
+"""Mongo database source helpers and destination utilities"""

+import re
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -22,6 +23,7 @@ from bson.timestamp import Timestamp
 from dlt.common import logger
 from dlt.common.configuration.specs import BaseConfiguration, configspec
 from dlt.common.data_writers import TDataItemFormat
+from dlt.common.schema import TTableSchema
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem
 from dlt.common.utils import map_nested_in_place
@@ -204,7 +206,14 @@ class CollectionLoader:
         cursor = self._limit(cursor, limit)

         while docs_slice := list(islice(cursor, self.chunk_size)):
-
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res


 class CollectionLoaderParallel(CollectionLoader):
@@ -464,6 +473,170 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
         yield convert_arrow_columns(table)


+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        # Add maxTimeMS to prevent hanging
+        cursor = self.collection.aggregate(
+            pipeline,
+            allowDiskUse=True,
+            batchSize=min(self.chunk_size, 101),
+            maxTimeMS=30000,  # 30 second timeout
+        )
+
+        docs_buffer = []
+        try:
+            for doc in cursor:
+                docs_buffer.append(doc)
+
+                if len(docs_buffer) >= self.chunk_size:
+                    res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                    if (
+                        len(res) > 0
+                        and "_id" in res[0]
+                        and isinstance(res[0]["_id"], dict)
+                    ):
+                        yield dlt.mark.with_hints(
+                            res,
+                            dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                        )
+                    else:
+                        yield res
+                    docs_buffer = []
+
+            # Yield any remaining documents
+            if docs_buffer:
+                res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                    yield dlt.mark.with_hints(
+                        res,
+                        dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                    )
+                else:
+                    yield res
+        finally:
+            cursor.close()
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
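The comments added in CollectionAggregationLoader state that, for custom queries, incremental filtering is expected to live in the pipeline itself via :interval_start / :interval_end placeholders rather than being injected by the loader. A sketch of what such a pipeline might look like is shown below; the placeholder substitution happens outside this file (it is not part of this diff), so the exact syntax is illustrative only.

    # Illustrative pipeline: the loader adds no incremental $match of its own when
    # a custom query is given, so the time window is expressed via placeholders
    # that are assumed to be substituted upstream before the pipeline runs.
    pipeline = [
        {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
        {"$sort": {"updated_at": 1}},
    ]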
@@ -475,6 +648,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -499,6 +673,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -521,21 +696,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )

-
-
-
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if
-
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore

     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
@@ -666,4 +868,176 @@ class MongoDbCollectionResourceConfiguration(BaseConfiguration):
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value


+def convert_mongo_shell_to_extended_json(query_string: str) -> str:
+    """
+    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.
+
+    This function handles common MongoDB shell constructs like ISODate, ObjectId,
+    NumberLong, NumberDecimal, etc. and converts them to their Extended JSON equivalents
+    that can be parsed by bson.json_util.
+
+    Args:
+        query_string: A string containing MongoDB shell syntax
+
+    Returns:
+        A string with MongoDB Extended JSON v2 format
+
+    Examples:
+        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
+        '{"$date": "2010-01-01T00:00:00.000Z"}'
+
+        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
+        '{"$oid": "507f1f77bcf86cd799439011"}'
+    """
+    converted = query_string
+
+    # Convert ISODate("...") to {"$date": "..."}
+    # Pattern matches ISODate("2010-01-01T00:00:00.000+0000") or similar
+    converted = re.sub(r'ISODate\("([^"]+)"\)', r'{"$date": "\1"}', converted)
+
+    # Convert ObjectId("...") to {"$oid": "..."}
+    converted = re.sub(r'ObjectId\("([^"]+)"\)', r'{"$oid": "\1"}', converted)
+
+    # Convert NumberLong(...) to {"$numberLong": "..."}
+    # Note: NumberLong can have quotes or not: NumberLong(123) or NumberLong("123")
+    converted = re.sub(r'NumberLong\("([^"]+)"\)', r'{"$numberLong": "\1"}', converted)
+    converted = re.sub(r"NumberLong\(([^)]+)\)", r'{"$numberLong": "\1"}', converted)
+
+    # Convert NumberInt(...) to {"$numberInt": "..."}
+    converted = re.sub(r'NumberInt\("([^"]+)"\)', r'{"$numberInt": "\1"}', converted)
+    converted = re.sub(r"NumberInt\(([^)]+)\)", r'{"$numberInt": "\1"}', converted)
+
+    # Convert NumberDecimal("...") to {"$numberDecimal": "..."}
+    converted = re.sub(
+        r'NumberDecimal\("([^"]+)"\)', r'{"$numberDecimal": "\1"}', converted
+    )
+
+    # Convert Timestamp(..., ...) to {"$timestamp": {"t": ..., "i": ...}}
+    # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
+    converted = re.sub(
+        r"Timestamp\((\d+),\s*(\d+)\)", r'{"$timestamp": {"t": \1, "i": \2}}', converted
+    )
+
+    # Convert BinData(..., "...") to {"$binary": {"base64": "...", "subType": "..."}}
+    converted = re.sub(
+        r'BinData\((\d+),\s*"([^"]+)"\)',
+        r'{"$binary": {"base64": "\2", "subType": "\1"}}',
+        converted,
+    )
+
+    # Convert MinKey() to {"$minKey": 1}
+    converted = re.sub(r"MinKey\(\)", r'{"$minKey": 1}', converted)
+
+    # Convert MaxKey() to {"$maxKey": 1}
+    converted = re.sub(r"MaxKey\(\)", r'{"$maxKey": 1}', converted)
+
+    # Convert UUID("...") to {"$uuid": "..."}
+    converted = re.sub(r'UUID\("([^"]+)"\)', r'{"$uuid": "\1"}', converted)
+
+    # Convert DBRef("collection", "id") to {"$ref": "collection", "$id": "id"}
+    converted = re.sub(
+        r'DBRef\("([^"]+)",\s*"([^"]+)"\)', r'{"$ref": "\1", "$id": "\2"}', converted
+    )
+
+    # Convert Code("...") to {"$code": "..."}
+    converted = re.sub(r'Code\("([^"]+)"\)', r'{"$code": "\1"}', converted)
+
+    return converted
+
+
 __source_name__ = "mongodb"
+
+
+# MongoDB destination helper functions
+def process_file_items(file_path: str) -> list[dict]:
+    """Process items from a file path (JSONL format)."""
+    import json
+
+    documents = []
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                documents.append(doc)  # Include all fields including DLT metadata
+    return documents
+
+
+def mongodb_insert(uri: str):
+    """Creates a dlt.destination for inserting data into a MongoDB collection.
+
+    Args:
+        uri (str): MongoDB connection URI including database.
+
+    Returns:
+        dlt.destination: A DLT destination object configured for MongoDB.
+    """
+    from urllib.parse import urlparse
+
+    parsed_uri = urlparse(uri)
+
+    # Handle both mongodb:// and mongodb+srv:// schemes
+    if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
+        # For modern connection strings (MongoDB Atlas), use the URI as-is
+        connection_string = uri
+        # Extract database from path or use default
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+    else:
+        # Legacy handling for backwards compatibility
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 27017
+        username = parsed_uri.username
+        password = parsed_uri.password
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+
+        # Build connection string
+        if username and password:
+            connection_string = f"mongodb://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"mongodb://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+    state = {"first_batch": True}
+
+    def destination(items: TDataItem, table: TTableSchema) -> None:
+        import pyarrow
+        from pymongo import MongoClient
+
+        # Extract database name from connection string
+        # Get collection name from table metadata
+        collection_name = table["name"]
+
+        # Connect to MongoDB
+        with MongoClient(connection_string) as client:
+            db = client[database]
+            collection = db[collection_name]
+
+            # Process and insert documents
+            if isinstance(items, str):
+                documents = process_file_items(items)
+            elif isinstance(items, pyarrow.RecordBatch):
+                documents = [item for item in items.to_pylist()]
+            else:
+                documents = [item for item in items if isinstance(item, dict)]
+
+            if state["first_batch"] and documents:
+                collection.delete_many({})
+                state["first_batch"] = False
+
+            if documents:
+                collection.insert_many(documents)  # Insert all new data
+
+    return dlt.destination(
+        destination,
+        name="mongodb",
+        loader_file_format="typed-jsonl",
+        batch_size=1000,
+        naming_convention="snake_case",
+        loader_parallelism_strategy="sequential",
+    )
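convert_mongo_shell_to_extended_json only rewrites the text of a query; the docstring points at bson.json_util as the intended parser for the result. A small sketch of that round trip (the query value is made up):

    from bson import json_util

    shell_query = '{"created_at": {"$gte": ISODate("2010-01-01T00:00:00.000Z")}}'
    extended = convert_mongo_shell_to_extended_json(shell_query)
    # extended == '{"created_at": {"$gte": {"$date": "2010-01-01T00:00:00.000Z"}}}'
    query = json_util.loads(extended)  # plain dict with a real datetime value

Note also that the new mongodb_insert destination is a replace-style loader: on the first batch it issues delete_many({}) against the target collection before calling insert_many, so repeated runs overwrite rather than append.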