ingestr 0.13.75__py3-none-any.whl → 0.14.98__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ingestr might be problematic.
- ingestr/main.py +22 -3
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +169 -1
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +26 -23
- ingestr/src/facebook_ads/helpers.py +47 -1
- ingestr/src/factory.py +48 -0
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +9 -0
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -163
- ingestr/src/frankfurter/helpers.py +3 -3
- ingestr/src/freshdesk/__init__.py +25 -8
- ingestr/src/freshdesk/freshdesk_client.py +40 -5
- ingestr/src/fundraiseup/__init__.py +49 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +6 -4
- ingestr/src/google_analytics/__init__.py +1 -1
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/hubspot/__init__.py +6 -12
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/klaviyo/__init__.py +5 -5
- ingestr/src/linear/__init__.py +553 -116
- ingestr/src/linear/helpers.py +77 -38
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +5 -2
- ingestr/src/mongodb/helpers.py +384 -10
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -1
- ingestr/src/smartsheets/__init__.py +33 -5
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/sources.py +1418 -54
- ingestr/src/stripe_analytics/__init__.py +2 -19
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/tests/unit/test_smartsheets.py +6 -9
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/METADATA +24 -12
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/RECORD +79 -37
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/WHEEL +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.75.dist-info → ingestr-0.14.98.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/mongodb/__init__.py
CHANGED
@@ -101,12 +101,13 @@ def mongodb_collection(
     write_disposition: Optional[str] = dlt.config.value,
     parallel: Optional[bool] = False,
     limit: Optional[int] = None,
-    chunk_size: Optional[int] =
+    chunk_size: Optional[int] = 1000,
     data_item_format: Optional[TDataItemFormat] = "object",
     filter_: Optional[Dict[str, Any]] = None,
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
     pymongoarrow_schema: Optional[Any] = None,
-
+    custom_query: Optional[List[Dict[str, Any]]] = None,
+) -> DltResource:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.

@@ -132,6 +133,7 @@ def mongodb_collection(
         exclude (dict) - {"released": False, "runtime": False}
         Note: Can't mix include and exclude statements '{"title": True, "released": False}`
         pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.

@@ -161,4 +163,5 @@ def mongodb_collection(
         filter_=filter_ or {},
         projection=projection,
         pymongoarrow_schema=pymongoarrow_schema,
+        custom_query=custom_query,
     )
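Taken together, these hunks add a custom_query parameter to mongodb_collection (threaded through to collection_documents in helpers.py below) and lower the default chunk_size to 1000. A minimal usage sketch follows; only custom_query and chunk_size come from this diff, while the connection arguments and the pipeline stages are hypothetical placeholders.

    # Illustrative only: connection parameter names and values are assumed,
    # not taken from this diff.
    from ingestr.src.mongodb import mongodb_collection

    orders = mongodb_collection(
        connection_url="mongodb://localhost:27017",  # assumed parameter name
        database="shop",                             # assumed
        collection="orders",                         # assumed
        custom_query=[                               # new in this release
            {"$match": {"status": "paid"}},
            {"$project": {"status": 1, "total": 1, "updated_at": 1}},
        ],
        chunk_size=1000,
    )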
ingestr/src/mongodb/helpers.py
CHANGED
@@ -1,5 +1,6 @@
-"""Mongo database source helpers"""
+"""Mongo database source helpers and destination utilities"""

+import re
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -22,6 +23,7 @@ from bson.timestamp import Timestamp
 from dlt.common import logger
 from dlt.common.configuration.specs import BaseConfiguration, configspec
 from dlt.common.data_writers import TDataItemFormat
+from dlt.common.schema import TTableSchema
 from dlt.common.time import ensure_pendulum_datetime
 from dlt.common.typing import TDataItem
 from dlt.common.utils import map_nested_in_place
@@ -204,7 +206,14 @@ class CollectionLoader:
         cursor = self._limit(cursor, limit)

         while docs_slice := list(islice(cursor, self.chunk_size)):
-
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res


 class CollectionLoaderParallel(CollectionLoader):
@@ -464,6 +473,170 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
         yield convert_arrow_columns(table)


+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        # Add maxTimeMS to prevent hanging
+        cursor = self.collection.aggregate(
+            pipeline,
+            allowDiskUse=True,
+            batchSize=min(self.chunk_size, 101),
+            maxTimeMS=30000,  # 30 second timeout
+        )
+
+        docs_buffer = []
+        try:
+            for doc in cursor:
+                docs_buffer.append(doc)
+
+                if len(docs_buffer) >= self.chunk_size:
+                    res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                    if (
+                        len(res) > 0
+                        and "_id" in res[0]
+                        and isinstance(res[0]["_id"], dict)
+                    ):
+                        yield dlt.mark.with_hints(
+                            res,
+                            dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                        )
+                    else:
+                        yield res
+                    docs_buffer = []
+
+            # Yield any remaining documents
+            if docs_buffer:
+                res = map_nested_in_place(convert_mongo_objs, docs_buffer)
+                if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                    yield dlt.mark.with_hints(
+                        res,
+                        dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                    )
+                else:
+                    yield res
+        finally:
+            cursor.close()
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
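The comments added in CollectionAggregationLoader state that, for custom queries, incremental filtering is expected to live in the pipeline itself via :interval_start / :interval_end placeholders rather than being injected by the loader. A sketch of what such a pipeline might look like is shown below; the placeholder substitution happens outside this file (it is not part of this diff), so the exact syntax is illustrative only.

    # Illustrative pipeline: the loader adds no incremental $match of its own when
    # a custom query is given, so the time window is expressed via placeholders
    # that are assumed to be substituted upstream before the pipeline runs.
    pipeline = [
        {"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},
        {"$sort": {"updated_at": 1}},
    ]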
@@ -475,6 +648,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -499,6 +673,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -521,21 +696,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )

-
-
-
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if
-
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore

     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
@@ -666,4 +868,176 @@ class MongoDbCollectionResourceConfiguration(BaseConfiguration):
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value


+def convert_mongo_shell_to_extended_json(query_string: str) -> str:
+    """
+    Convert MongoDB shell syntax to MongoDB Extended JSON v2 format.
+
+    This function handles common MongoDB shell constructs like ISODate, ObjectId,
+    NumberLong, NumberDecimal, etc. and converts them to their Extended JSON equivalents
+    that can be parsed by bson.json_util.
+
+    Args:
+        query_string: A string containing MongoDB shell syntax
+
+    Returns:
+        A string with MongoDB Extended JSON v2 format
+
+    Examples:
+        >>> convert_mongo_shell_to_extended_json('ISODate("2010-01-01T00:00:00.000Z")')
+        '{"$date": "2010-01-01T00:00:00.000Z"}'
+
+        >>> convert_mongo_shell_to_extended_json('ObjectId("507f1f77bcf86cd799439011")')
+        '{"$oid": "507f1f77bcf86cd799439011"}'
+    """
+    converted = query_string
+
+    # Convert ISODate("...") to {"$date": "..."}
+    # Pattern matches ISODate("2010-01-01T00:00:00.000+0000") or similar
+    converted = re.sub(r'ISODate\("([^"]+)"\)', r'{"$date": "\1"}', converted)
+
+    # Convert ObjectId("...") to {"$oid": "..."}
+    converted = re.sub(r'ObjectId\("([^"]+)"\)', r'{"$oid": "\1"}', converted)
+
+    # Convert NumberLong(...) to {"$numberLong": "..."}
+    # Note: NumberLong can have quotes or not: NumberLong(123) or NumberLong("123")
+    converted = re.sub(r'NumberLong\("([^"]+)"\)', r'{"$numberLong": "\1"}', converted)
+    converted = re.sub(r"NumberLong\(([^)]+)\)", r'{"$numberLong": "\1"}', converted)
+
+    # Convert NumberInt(...) to {"$numberInt": "..."}
+    converted = re.sub(r'NumberInt\("([^"]+)"\)', r'{"$numberInt": "\1"}', converted)
+    converted = re.sub(r"NumberInt\(([^)]+)\)", r'{"$numberInt": "\1"}', converted)
+
+    # Convert NumberDecimal("...") to {"$numberDecimal": "..."}
+    converted = re.sub(
+        r'NumberDecimal\("([^"]+)"\)', r'{"$numberDecimal": "\1"}', converted
+    )
+
+    # Convert Timestamp(..., ...) to {"$timestamp": {"t": ..., "i": ...}}
+    # Timestamp(1234567890, 1) -> {"$timestamp": {"t": 1234567890, "i": 1}}
+    converted = re.sub(
+        r"Timestamp\((\d+),\s*(\d+)\)", r'{"$timestamp": {"t": \1, "i": \2}}', converted
+    )
+
+    # Convert BinData(..., "...") to {"$binary": {"base64": "...", "subType": "..."}}
+    converted = re.sub(
+        r'BinData\((\d+),\s*"([^"]+)"\)',
+        r'{"$binary": {"base64": "\2", "subType": "\1"}}',
+        converted,
+    )
+
+    # Convert MinKey() to {"$minKey": 1}
+    converted = re.sub(r"MinKey\(\)", r'{"$minKey": 1}', converted)
+
+    # Convert MaxKey() to {"$maxKey": 1}
+    converted = re.sub(r"MaxKey\(\)", r'{"$maxKey": 1}', converted)
+
+    # Convert UUID("...") to {"$uuid": "..."}
+    converted = re.sub(r'UUID\("([^"]+)"\)', r'{"$uuid": "\1"}', converted)
+
+    # Convert DBRef("collection", "id") to {"$ref": "collection", "$id": "id"}
+    converted = re.sub(
+        r'DBRef\("([^"]+)",\s*"([^"]+)"\)', r'{"$ref": "\1", "$id": "\2"}', converted
+    )
+
+    # Convert Code("...") to {"$code": "..."}
+    converted = re.sub(r'Code\("([^"]+)"\)', r'{"$code": "\1"}', converted)
+
+    return converted
+
+
 __source_name__ = "mongodb"
+
+
+# MongoDB destination helper functions
+def process_file_items(file_path: str) -> list[dict]:
+    """Process items from a file path (JSONL format)."""
+    import json
+
+    documents = []
+    with open(file_path, "r") as f:
+        for line in f:
+            if line.strip():
+                doc = json.loads(line.strip())
+                documents.append(doc)  # Include all fields including DLT metadata
+    return documents
+
+
+def mongodb_insert(uri: str):
+    """Creates a dlt.destination for inserting data into a MongoDB collection.
+
+    Args:
+        uri (str): MongoDB connection URI including database.
+
+    Returns:
+        dlt.destination: A DLT destination object configured for MongoDB.
+    """
+    from urllib.parse import urlparse
+
+    parsed_uri = urlparse(uri)
+
+    # Handle both mongodb:// and mongodb+srv:// schemes
+    if uri.startswith("mongodb+srv://") or uri.startswith("mongodb://"):
+        # For modern connection strings (MongoDB Atlas), use the URI as-is
+        connection_string = uri
+        # Extract database from path or use default
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+    else:
+        # Legacy handling for backwards compatibility
+        host = parsed_uri.hostname or "localhost"
+        port = parsed_uri.port or 27017
+        username = parsed_uri.username
+        password = parsed_uri.password
+        database = (
+            parsed_uri.path.lstrip("/") if parsed_uri.path.lstrip("/") else "ingestr_db"
+        )
+
+        # Build connection string
+        if username and password:
+            connection_string = f"mongodb://{username}:{password}@{host}:{port}"
+        else:
+            connection_string = f"mongodb://{host}:{port}"
+
+        # Add query parameters if any
+        if parsed_uri.query:
+            connection_string += f"?{parsed_uri.query}"
+
+    state = {"first_batch": True}
+
+    def destination(items: TDataItem, table: TTableSchema) -> None:
+        import pyarrow
+        from pymongo import MongoClient
+
+        # Extract database name from connection string
+        # Get collection name from table metadata
+        collection_name = table["name"]
+
+        # Connect to MongoDB
+        with MongoClient(connection_string) as client:
+            db = client[database]
+            collection = db[collection_name]
+
+            # Process and insert documents
+            if isinstance(items, str):
+                documents = process_file_items(items)
+            elif isinstance(items, pyarrow.RecordBatch):
+                documents = [item for item in items.to_pylist()]
+            else:
+                documents = [item for item in items if isinstance(item, dict)]
+
+            if state["first_batch"] and documents:
+                collection.delete_many({})
+                state["first_batch"] = False
+
+            if documents:
+                collection.insert_many(documents)  # Insert all new data
+
+    return dlt.destination(
+        destination,
+        name="mongodb",
+        loader_file_format="typed-jsonl",
+        batch_size=1000,
+        naming_convention="snake_case",
+        loader_parallelism_strategy="sequential",
+    )
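convert_mongo_shell_to_extended_json only rewrites the text of a query; the docstring points at bson.json_util as the intended parser for the result. A small sketch of that round trip (the query value is made up):

    from bson import json_util

    shell_query = '{"created_at": {"$gte": ISODate("2010-01-01T00:00:00.000Z")}}'
    extended = convert_mongo_shell_to_extended_json(shell_query)
    # extended == '{"created_at": {"$gte": {"$date": "2010-01-01T00:00:00.000Z"}}}'
    query = json_util.loads(extended)  # plain dict with a real datetime value

Note also that the new mongodb_insert destination is a replace-style loader: on the first batch it issues delete_many({}) against the target collection before calling insert_many, so repeated runs overwrite rather than append.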