ingestr 0.13.78__py3-none-any.whl → 0.13.80__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of ingestr might be problematic.

@@ -1,6 +1,5 @@
 """Mongo database source helpers"""
 
-import json
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -209,7 +208,7 @@ class CollectionLoader:
             if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
                 yield dlt.mark.with_hints(
                     res,
-                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"} }),
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
                 )
             else:
                 yield res
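
For context on the hunk above: MongoDB allows _id to be a document rather than a scalar (a composite key), and such values can only be loaded as a json column. A minimal sketch of the case the hint targets, with hypothetical field names:

    # A batch whose first document uses a composite _id, as the check above detects
    res = [{"_id": {"tenant": "acme", "seq": 42}, "total": 9.5}]

    # isinstance(res[0]["_id"], dict) is True, so the batch is yielded with a
    # column hint mapping _id to dlt's "json" data type instead of a scalar column
    assert len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict)

The change itself only removes a stray space inside the hint dict; behavior is unchanged.
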
@@ -472,6 +471,145 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
                 yield convert_arrow_columns(table)
 
 
+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        print("pipeline", pipeline)
+        # Execute aggregation
+        cursor = self.collection.aggregate(pipeline, allowDiskUse=True)
+
+        # Process results in chunks
+        while docs_slice := list(islice(cursor, self.chunk_size)):
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            print("res", res)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
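
To make the pipeline assembly in CollectionAggregationLoader.load_documents concrete, here is a minimal standalone sketch of what reaches collection.aggregate. The stages, field names, and values are hypothetical, and the interval placeholders are shown as already substituted:

    # Hypothetical inputs; custom_query as a caller might supply it
    custom_query = [
        {"$match": {"updated_at": {"$gte": "2024-01-01", "$lt": "2024-02-01"}}},
        {"$project": {"_id": 1, "status": 1, "updated_at": 1}},
    ]
    filter_ = {"status": "active"}
    limit = 1000

    pipeline = list(custom_query)                # copy, so the caller's list is not mutated
    if filter_:
        pipeline.insert(0, {"$match": filter_})  # the extra filter runs before the custom stages
    if limit and limit > 0:
        pipeline.append({"$limit": limit})       # the limit is applied last

    # pipeline now holds four stages, in order:
    # $match (filter_), $match (custom), $project, $limit

The dict-to-Arrow conversion in CollectionAggregationArrowLoader likewise reduces to pyarrow.Table.from_pylist, which infers a schema from the dicts. A rough sketch, assuming pyarrow is installed and using hypothetical sample data:

    import pyarrow

    batch = [
        {"_id": "a1", "qty": 2},
        {"_id": "b2", "qty": 5},
    ]
    table = pyarrow.Table.from_pylist(batch)  # schema inferred from keys and value types
    print(table.column_names)                 # ['_id', 'qty']
    print(table.num_rows)                     # 2
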
@@ -483,6 +621,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -507,6 +646,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find().
 
     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
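
Per the new docstring entry, custom_query is an ordinary MongoDB aggregation pipeline: a list of stage documents. A hypothetical value using the interval placeholders mentioned in the loader comments (where those placeholders get substituted is not shown in this diff):

    custom_query = [
        {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
        {"$sort": {"updated_at": 1}},
    ]
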
@@ -529,21 +669,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )
 
-    if parallel:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoaderParallel
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoaderParallel  # type: ignore
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if data_item_format == "arrow":
-            LoaderClass = CollectionArrowLoader  # type: ignore
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-            LoaderClass = CollectionLoader  # type: ignore
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore
+
     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-    if isinstance(loader, (CollectionArrowLoader, CollectionArrowLoaderParallel)):
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
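
Putting the selection logic together: a non-empty custom_query routes to the four aggregation loaders, otherwise the original find()-based loaders are used, with parallel and data_item_format picking the variant in both branches. A sketch of how the aggregation path might be invoked; the connection details are hypothetical and any parameters not visible in this diff are assumptions:

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # hypothetical server
    collection = client["shop"]["orders"]              # hypothetical collection

    # A custom_query selects CollectionAggregationLoader via the branch above
    for batch in collection_documents(
        client,
        collection,
        parallel=False,
        data_item_format="object",
        custom_query=[{"$match": {"status": "active"}}],
    ):
        print(len(batch))  # each batch is a list of up to chunk_size documents
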