ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ingestr might be problematic.
Files changed (79)
  1. ingestr/main.py +22 -3
  2. ingestr/src/adjust/__init__.py +4 -4
  3. ingestr/src/allium/__init__.py +128 -0
  4. ingestr/src/anthropic/__init__.py +277 -0
  5. ingestr/src/anthropic/helpers.py +525 -0
  6. ingestr/src/appstore/__init__.py +1 -0
  7. ingestr/src/asana_source/__init__.py +1 -1
  8. ingestr/src/buildinfo.py +1 -1
  9. ingestr/src/chess/__init__.py +1 -1
  10. ingestr/src/couchbase_source/__init__.py +118 -0
  11. ingestr/src/couchbase_source/helpers.py +135 -0
  12. ingestr/src/cursor/__init__.py +83 -0
  13. ingestr/src/cursor/helpers.py +188 -0
  14. ingestr/src/destinations.py +169 -1
  15. ingestr/src/docebo/__init__.py +589 -0
  16. ingestr/src/docebo/client.py +435 -0
  17. ingestr/src/docebo/helpers.py +97 -0
  18. ingestr/src/elasticsearch/helpers.py +138 -0
  19. ingestr/src/errors.py +8 -0
  20. ingestr/src/facebook_ads/__init__.py +26 -23
  21. ingestr/src/facebook_ads/helpers.py +47 -1
  22. ingestr/src/factory.py +48 -0
  23. ingestr/src/filesystem/__init__.py +8 -3
  24. ingestr/src/filters.py +9 -0
  25. ingestr/src/fluxx/__init__.py +9906 -0
  26. ingestr/src/fluxx/helpers.py +209 -0
  27. ingestr/src/frankfurter/__init__.py +157 -163
  28. ingestr/src/frankfurter/helpers.py +3 -3
  29. ingestr/src/freshdesk/__init__.py +25 -8
  30. ingestr/src/freshdesk/freshdesk_client.py +40 -5
  31. ingestr/src/fundraiseup/__init__.py +49 -0
  32. ingestr/src/fundraiseup/client.py +81 -0
  33. ingestr/src/github/__init__.py +6 -4
  34. ingestr/src/google_analytics/__init__.py +1 -1
  35. ingestr/src/hostaway/__init__.py +302 -0
  36. ingestr/src/hostaway/client.py +288 -0
  37. ingestr/src/http/__init__.py +35 -0
  38. ingestr/src/http/readers.py +114 -0
  39. ingestr/src/hubspot/__init__.py +6 -12
  40. ingestr/src/influxdb/__init__.py +1 -0
  41. ingestr/src/intercom/__init__.py +142 -0
  42. ingestr/src/intercom/helpers.py +674 -0
  43. ingestr/src/intercom/settings.py +279 -0
  44. ingestr/src/jira_source/__init__.py +340 -0
  45. ingestr/src/jira_source/helpers.py +439 -0
  46. ingestr/src/jira_source/settings.py +170 -0
  47. ingestr/src/klaviyo/__init__.py +5 -5
  48. ingestr/src/linear/__init__.py +553 -116
  49. ingestr/src/linear/helpers.py +77 -38
  50. ingestr/src/mailchimp/__init__.py +126 -0
  51. ingestr/src/mailchimp/helpers.py +226 -0
  52. ingestr/src/mailchimp/settings.py +164 -0
  53. ingestr/src/masking.py +344 -0
  54. ingestr/src/monday/__init__.py +246 -0
  55. ingestr/src/monday/helpers.py +392 -0
  56. ingestr/src/monday/settings.py +328 -0
  57. ingestr/src/mongodb/__init__.py +5 -2
  58. ingestr/src/mongodb/helpers.py +384 -10
  59. ingestr/src/plusvibeai/__init__.py +335 -0
  60. ingestr/src/plusvibeai/helpers.py +544 -0
  61. ingestr/src/plusvibeai/settings.py +252 -0
  62. ingestr/src/revenuecat/__init__.py +83 -0
  63. ingestr/src/revenuecat/helpers.py +237 -0
  64. ingestr/src/salesforce/__init__.py +15 -8
  65. ingestr/src/shopify/__init__.py +1 -1
  66. ingestr/src/smartsheets/__init__.py +33 -5
  67. ingestr/src/socrata_source/__init__.py +83 -0
  68. ingestr/src/socrata_source/helpers.py +85 -0
  69. ingestr/src/socrata_source/settings.py +8 -0
  70. ingestr/src/sources.py +1418 -54
  71. ingestr/src/stripe_analytics/__init__.py +2 -19
  72. ingestr/src/wise/__init__.py +68 -0
  73. ingestr/src/wise/client.py +63 -0
  74. ingestr/tests/unit/test_smartsheets.py +6 -9
  75. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
  76. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
  77. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
  78. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
  79. {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/mongodb/__init__.py

@@ -101,12 +101,13 @@ def mongodb_collection(
     write_disposition: Optional[str] = dlt.config.value,
     parallel: Optional[bool] = False,
     limit: Optional[int] = None,
-    chunk_size: Optional[int] = 10000,
+    chunk_size: Optional[int] = 1000,
     data_item_format: Optional[TDataItemFormat] = "object",
     filter_: Optional[Dict[str, Any]] = None,
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
     pymongoarrow_schema: Optional[Any] = None,
-) -> Any:
+    custom_query: Optional[List[Dict[str, Any]]] = None,
+) -> DltResource:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.
 
@@ -132,6 +133,7 @@ def mongodb_collection(
         exclude (dict) - {"released": False, "runtime": False}
         Note: Can't mix include and exclude statements '{"title": True, "released": False}`
         pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@ def mongodb_collection(
         filter_=filter_ or {},
         projection=projection,
         pymongoarrow_schema=pymongoarrow_schema,
+        custom_query=custom_query,
     )
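For orientation, a minimal sketch of how the new custom_query parameter might be used from Python. Only chunk_size and custom_query appear in this hunk; the connection parameters (connection_url, database, collection) follow the usual dlt MongoDB source naming and are assumed here.

```python
import dlt
from ingestr.src.mongodb import mongodb_collection

# connection_url/database/collection are assumed names, not shown in this hunk
orders = mongodb_collection(
    connection_url="mongodb://localhost:27017",
    database="shop",
    collection="orders",
    chunk_size=1000,  # new default in this release
    custom_query=[  # aggregation pipeline used instead of find()
        {"$match": {"status": "shipped"}},
        {"$project": {"status": 1, "total": 1, "created_at": 1}},
    ],
)

pipeline = dlt.pipeline(
    pipeline_name="mongo_orders", destination="duckdb", dataset_name="raw"
)
pipeline.run(orders, table_name="orders")
```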
ingestr/src/mongodb/helpers.py

@@ -1,5 +1,6 @@
-"""Mongo database source helpers"""
+"""Mongo database source helpers and destination utilities"""
 
+import re
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -22,6 +23,7 @@ from bson.timestamp import Timestamp
 from dlt.common import logger
 from dlt.common.configuration.specs import BaseConfiguration, configspec
 from dlt.common.data_writers import TDataItemFormat
+from dlt.common.schema import TTableSchema
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem
 from dlt.common.utils import map_nested_in_place
@@ -204,7 +206,14 @@ class CollectionLoader:
         cursor = self._limit(cursor, limit)
 
         while docs_slice := list(islice(cursor, self.chunk_size)):
-            yield map_nested_in_place(convert_mongo_objs, docs_slice)
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
 
 
 class CollectionLoaderParallel(CollectionLoader):
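A standalone sketch of what the new hinting branch does: when the documents in a batch have dict-valued (compound) _id keys, the batch is yielded together with a column hint so the destination keeps _id as a single JSON column rather than flattening it. The resource below is illustrative and not part of the diff.

```python
import dlt

@dlt.resource(name="orders")
def orders():
    batch = [
        {"_id": {"tenant": "acme", "seq": 1}, "total": 42},
        {"_id": {"tenant": "acme", "seq": 2}, "total": 17},
    ]
    # Compound _id values are dicts; hinting the column as JSON keeps it in a
    # single column at the destination instead of being flattened.
    yield dlt.mark.with_hints(
        batch,
        dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
    )

if __name__ == "__main__":
    dlt.pipeline(destination="duckdb").run(orders())
```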
@@ -464,6 +473,170 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
             yield convert_arrow_columns(table)
 
 
+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        # Add maxTimeMS to prevent hanging
+        cursor = self.collection.aggregate(
+            pipeline,
+            allowDiskUse=True,
+            batchSize=min(self.chunk_size, 101),
+            maxTimeMS=30000,  # 30 second timeout
+        )
+
+        docs_buffer = []
+        try:
+            for doc in cursor:
+                docs_buffer.append(doc)
+
+                if len(docs_buffer) >= self.chunk_size:
+                    res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                    if (
+                        len(res) > 0
+                        and "_id" in res[0]
+                        and isinstance(res[0]["_id"], dict)
+                    ):
+                        yield dlt.mark.with_hints(
+                            res,
+                            dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                        )
+                    else:
+                        yield res
+                    docs_buffer = []
+
+            # Yield any remaining documents
+            if docs_buffer:
+                res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                    yield dlt.mark.with_hints(
+                        res,
+                        dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                    )
+                else:
+                    yield res
+        finally:
+            cursor.close()
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
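For reference, the aggregate() call issued by the new CollectionAggregationLoader is plain PyMongo. A standalone sketch follows; the URI, database, collection and pipeline are illustrative placeholders, not values from the diff.

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
collection = client["shop"]["orders"]

pipeline = [
    {"$match": {"status": "shipped"}},  # a provided filter_ is prepended as a $match stage
    {"$group": {"_id": "$customer_id", "total": {"$sum": "$amount"}}},
    {"$limit": 100},  # a positive limit is appended as $limit
]

cursor = collection.aggregate(
    pipeline,
    allowDiskUse=True,   # let large stages spill to disk
    batchSize=101,       # the loader uses min(chunk_size, 101)
    maxTimeMS=30_000,    # 30 second server-side timeout
)
try:
    for doc in cursor:
        print(doc["_id"], doc["total"])
finally:
    cursor.close()
```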
@@ -475,6 +648,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -499,6 +673,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -521,21 +696,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )
 
-    if parallel:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoaderParallel
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoaderParallel  # type: ignore
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoader  # type: ignore
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoader  # type: ignore
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore
 
     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-    if isinstance(loader, (CollectionArrowLoader, CollectionArrowLoaderParallel)):
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
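A minimal illustration of the duck-typed wiring above: the pipeline is only attached when the chosen loader class actually exposes set_custom_query, so the plain find()-based loaders are left untouched. The class names below are simplified stand-ins, not the real loaders.

```python
# Simplified stand-ins for the find()-based and aggregation-based loaders.
class FindLoader:
    pass

class AggregationLoader:
    def __init__(self):
        self.custom_query = None

    def set_custom_query(self, query):
        self.custom_query = query

custom_query = [{"$match": {"status": "shipped"}}]
loader = AggregationLoader() if custom_query else FindLoader()

# Same guard as in collection_documents: only call set_custom_query if present.
if custom_query and hasattr(loader, "set_custom_query"):
    loader.set_custom_query(custom_query)
```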
@@ -666,4 +868,176 @@ class MongoDbCollectionResourceConfiguration(BaseConfiguration):
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value
 
 
+def convert_mongo_shell_to_extended_json(query_string: str) -> str:
+    """
+    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.
+
+    This function handles common MongoDB shell constructs like ISODate, ObjectId,
+    NumberLong, NumberDecimal, etc. and converts them to their Extended JSON equivalents
+    that can be parsed by bson.json_util.
+
+    Args:
+        query_string: A string containing MongoDB shell syntax
+
+    Returns:
+        A string with MongoDB Extended JSON v2 format
+
+    Examples:
+        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
+        '{"$date": "2010-01-01T00:00:00.000Z"}'
+
+        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
+        '{"$oid": "507f1f77bcf86cd799439011"}'
+    """
+    converted = query_string
+
+    # Convert ISODate("...") to {"$date": "..."}
+    # Pattern matches ISODate("2010-01-01T00:00:00.000+0000") or similar
+    converted = re.sub(r'ISODate\("([^"]+)"\)', r'{"$date": "\1"}', converted)
+
+    # Convert ObjectId("...") to {"$oid": "..."}
+    converted = re.sub(r'ObjectId\("([^"]+)"\)', r'{"$oid": "\1"}', converted)
+
+    # Convert NumberLong(...) to {"$numberLong": "..."}
+    # Note: NumberLong can have quotes or not: NumberLong(123) or NumberLong("123")
+    converted = re.sub(r'NumberLong\("([^"]+)"\)', r'{"$numberLong": "\1"}', converted)
+    converted = re.sub(r"NumberLong\(([^)]+)\)", r'{"$numberLong": "\1"}', converted)
+
+    # Convert NumberInt(...) to {"$numberInt": "..."}
+    converted = re.sub(r'NumberInt\("([^"]+)"\)', r'{"$numberInt": "\1"}', converted)
+    converted = re.sub(r"NumberInt\(([^)]+)\)", r'{"$numberInt": "\1"}', converted)
+
+    # Convert NumberDecimal("...") to {"$numberDecimal": "..."}
+    converted = re.sub(
+        r'NumberDecimal\("([^"]+)"\)', r'{"$numberDecimal": "\1"}', converted
+    )
+
+    # Convert Timestamp(..., ...) to {"$timestamp": {"t": ..., "i": ...}}
+    # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
+    converted = re.sub(
+        r"Timestamp\((\d+),\s*(\d+)\)", r'{"$timestamp": {"t": \1, "i": \2}}', converted
+    )
+
+    # Convert BinData(..., "...") to {"$binary": {"base64": "...", "subType": "..."}}
+    converted = re.sub(
+        r'BinData\((\d+),\s*"([^"]+)"\)',
+        r'{"$binary": {"base64": "\2", "subType": "\1"}}',
+        converted,
+    )
+
+    # Convert MinKey() to {"$minKey": 1}
+    converted = re.sub(r"MinKey\(\)", r'{"$minKey": 1}', converted)
+
+    # Convert MaxKey() to {"$maxKey": 1}
+    converted = re.sub(r"MaxKey\(\)", r'{"$maxKey": 1}', converted)
+
+    # Convert UUID("...") to {"$uuid": "..."}
+    converted = re.sub(r'UUID\("([^"]+)"\)', r'{"$uuid": "\1"}', converted)
+
+    # Convert DBRef("collection", "id") to {"$ref": "collection", "$id": "id"}
+    converted = re.sub(
+        r'DBRef\("([^"]+)",\s*"([^"]+)"\)', r'{"$ref": "\1", "$id": "\2"}', converted
+    )
+
+    # Convert Code("...") to {"$code": "..."}
+    converted = re.sub(r'Code\("([^"]+)"\)', r'{"$code": "\1"}', converted)
+
+    return converted
+
+
 __source_name__ = "mongodb"
+
+
+# MongoDB destination helper functions
+def process_file_items(file_path: str) -> list[dict]:
+    """Process items from a file path (JSONL format)."""
+    import json
+
+    documents = []
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                documents.append(doc)  # Include all fields including DLT metadata
+    return documents
+
+
+def mongodb_insert(uri: str):
+    """Creates a dlt.destination for inserting data into a MongoDB collection.
+
+    Args:
+        uri (str): MongoDB connection URI including database.
+
+    Returns:
+        dlt.destination: A DLT destination object configured for MongoDB.
+    """
+    from urllib.parse import urlparse
+
+    parsed_uri = urlparse(uri)
+
+    # Handle both mongodb:// and mongodb+srv:// schemes
+    if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
+        # For modern connection strings (MongoDB Atlas), use the URI as-is
+        connection_string = uri
+        # Extract database from path or use default
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+    else:
+        # Legacy handling for backwards compatibility
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 27017
+        username = parsed_uri.username
+        password = parsed_uri.password
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+
+        # Build connection string
+        if username and password:
+            connection_string = f"mongodb://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"mongodb://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+    state = {"first_batch": True}
+
+    def destination(items: TDataItem, table: TTableSchema) -> None:
+        import pyarrow
+        from pymongo import MongoClient
+
+        # Get collection name from table metadata
+        collection_name = table["name"]
+
+        # Connect to MongoDB
+        with MongoClient(connection_string) as client:
+            db = client[database]
+            collection = db[collection_name]
+
+            # Process and insert documents
+            if isinstance(items, str):
+                documents = process_file_items(items)
+            elif isinstance(items, pyarrow.RecordBatch):
+                documents = [item for item in items.to_pylist()]
+            else:
+                documents = [item for item in items if isinstance(item, dict)]
+
+            if state["first_batch"] and documents:
+                collection.delete_many({})
+                state["first_batch"] = False
+
+            if documents:
+                collection.insert_many(documents)  # Insert all new data
+
+    return dlt.destination(
+        destination,
+        name="mongodb",
+        loader_file_format="typed-jsonl",
+        batch_size=1000,
+        naming_convention="snake_case",
+        loader_parallelism_strategy="sequential",
+    )
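A hedged sketch of how the new shell-syntax converter can be combined with bson.json_util to turn a shell-style pipeline string into the list of dicts that custom_query expects; how ingestr's CLI wires this up is not shown in this diff, so the snippet is illustrative only.

```python
from bson import json_util

from ingestr.src.mongodb.helpers import convert_mongo_shell_to_extended_json

# A shell-style pipeline as a user might write it; ISODate() is not valid JSON.
raw = '[{"$match": {"created_at": {"$gte": ISODate("2024-01-01T00:00:00.000Z")}}}]'

extended = convert_mongo_shell_to_extended_json(raw)
# -> '[{"$match": {"created_at": {"$gte": {"$date": "2024-01-01T00:00:00.000Z"}}}}]'

# bson.json_util understands Extended JSON, so this yields a list of dicts with
# a real datetime value, suitable for passing as custom_query.
pipeline = json_util.loads(extended)
```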