ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of ingestr might be problematic.

@@ -1,6 +1,5 @@
 """Mongo database source helpers"""
 
-import json
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -209,7 +208,7 @@ class CollectionLoader:
             if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
                 yield dlt.mark.with_hints(
                     res,
-                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"} }),
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
                 )
             else:
                 yield res
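
For context on the hunk above: MongoDB allows _id to be a document rather than a scalar (a composite key), and such values can only be loaded as a json column. A minimal sketch of the case the hint targets, with hypothetical field names:

    # A batch whose first document uses a composite _id, as the check above detects
    res = [{"_id": {"tenant": "acme", "seq": 42}, "total": 9.5}]

    # isinstance(res[0]["_id"], dict) is True, so the batch is yielded with a
    # column hint mapping _id to dlt's "json" data type instead of a scalar column
    assert len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict)

The change itself only removes a stray space inside the hint dict; behavior is unchanged.
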
@@ -472,6 +471,145 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
                 yield convert_arrow_columns(table)
 
 
+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        print("pipeline", pipeline)
+        # Execute aggregation
+        cursor = self.collection.aggregate(pipeline, allowDiskUse=True)
+
+        # Process results in chunks
+        while docs_slice := list(islice(cursor, self.chunk_size)):
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            print("res", res)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
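
To make the pipeline assembly in CollectionAggregationLoader.load_documents concrete, here is a minimal standalone sketch of what reaches collection.aggregate. The stages, field names, and values are hypothetical, and the interval placeholders are shown as already substituted:

    # Hypothetical inputs; custom_query as a caller might supply it
    custom_query = [
        {"$match": {"updated_at": {"$gte": "2024-01-01", "$lt": "2024-02-01"}}},
        {"$project": {"_id": 1, "status": 1, "updated_at": 1}},
    ]
    filter_ = {"status": "active"}
    limit = 1000

    pipeline = list(custom_query)                # copy, so the caller's list is not mutated
    if filter_:
        pipeline.insert(0, {"$match": filter_})  # the extra filter runs before the custom stages
    if limit and limit > 0:
        pipeline.append({"$limit": limit})       # the limit is applied last

    # pipeline now holds four stages, in order:
    # $match (filter_), $match (custom), $project, $limit

The dict-to-Arrow conversion in CollectionAggregationArrowLoader likewise reduces to pyarrow.Table.from_pylist, which infers a schema from the dicts. A rough sketch, assuming pyarrow is installed and using hypothetical sample data:

    import pyarrow

    batch = [
        {"_id": "a1", "qty": 2},
        {"_id": "b2", "qty": 5},
    ]
    table = pyarrow.Table.from_pylist(batch)  # schema inferred from keys and value types
    print(table.column_names)                 # ['_id', 'qty']
    print(table.num_rows)                     # 2
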
@@ -483,6 +621,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -507,6 +646,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find().
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
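
Per the new docstring entry, custom_query is an ordinary MongoDB aggregation pipeline: a list of stage documents. A hypothetical value using the interval placeholders mentioned in the loader comments (where those placeholders get substituted is not shown in this diff):

    custom_query = [
        {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
        {"$sort": {"updated_at": 1}},
    ]
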
@@ -529,21 +669,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )
 
-    if parallel:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoaderParallel
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoaderParallel  # type: ignore
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoader  # type: ignore
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoader  # type: ignore
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore
+
     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-    if isinstance(loader, (CollectionArrowLoader, CollectionArrowLoaderParallel)):
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
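
Putting the selection logic together: a non-empty custom_query routes to the four aggregation loaders, otherwise the original find()-based loaders are used, with parallel and data_item_format picking the variant in both branches. A sketch of how the aggregation path might be invoked; the connection details are hypothetical and any parameters not visible in this diff are assumptions:

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # hypothetical server
    collection = client["shop"]["orders"]              # hypothetical collection

    # A custom_query selects CollectionAggregationLoader via the branch above
    for batch in collection_documents(
        client,
        collection,
        parallel=False,
        data_item_format="object",
        custom_query=[{"$match": {"status": "active"}}],
    ):
        print(len(batch))  # each batch is a list of up to chunk_size documents
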