ingestr 0.13.77__py3-none-any.whl → 0.13.79__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- ingestr/main.py +10 -3
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/destinations.py +18 -0
- ingestr/src/facebook_ads/__init__.py +0 -1
- ingestr/src/factory.py +5 -0
- ingestr/src/freshdesk/__init__.py +23 -8
- ingestr/src/freshdesk/freshdesk_client.py +16 -5
- ingestr/src/github/__init__.py +5 -3
- ingestr/src/github/helpers.py +1 -0
- ingestr/src/influxdb/__init__.py +1 -0
- ingestr/src/mongodb/__init__.py +3 -0
- ingestr/src/mongodb/helpers.py +184 -9
- ingestr/src/sources.py +203 -24
- {ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/METADATA +6 -1
- {ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/RECORD +18 -18
- {ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/WHEEL +0 -0
- {ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/licenses/LICENSE.md +0 -0
ingestr/main.py
CHANGED
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional
@@ -8,6 +9,14 @@ from typing_extensions import Annotated

 from ingestr.src.telemetry.event import track

+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -506,7 +515,6 @@ def ingest(

     if factory.source_scheme == "sqlite":
         source_table = "main." + source_table.split(".")[-1]
-

     if (
         incremental_key
@@ -600,10 +608,9 @@ def ingest(
     if factory.source_scheme == "influxdb":
         if primary_key:
             write_disposition = "merge"
-

     start_time = datetime.now()
-
+
     run_info: LoadInfo = pipeline.run(
         dlt_source,
         **destination.dlt_run_params(
ingestr/src/buildinfo.py
CHANGED
@@ -1 +1 @@
-version = "v0.13.77"
+version = "v0.13.79"
ingestr/src/destinations.py
CHANGED
@@ -147,6 +147,24 @@ class DuckDBDestination(GenericSqlDestination):
         return dlt.destinations.duckdb(uri, **kwargs)


+class MotherduckDestination(GenericSqlDestination):
+    def dlt_dest(self, uri: str, **kwargs):
+        from urllib.parse import parse_qs, urlparse
+
+        parsed = urlparse(uri)
+        query = parse_qs(parsed.query)
+        token = query.get("token", [None])[0]
+        from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials
+
+        creds = {
+            "password": token,
+        }
+        if parsed.path.lstrip("/"):
+            creds["database"] = parsed.path.lstrip("/")
+
+        return dlt.destinations.motherduck(MotherDuckCredentials(creds), **kwargs)
+
+
 def handle_datetimeoffset(dto_value: bytes) -> datetime.datetime:
     # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794
     tup = struct.unpack(
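For illustration, a minimal standalone sketch of the URI handling the new MotherduckDestination performs; the URI shape, database name, and token below are assumptions for this example, not values taken from the release.

    from urllib.parse import parse_qs, urlparse

    uri = "motherduck:///my_db?token=example-token"  # hypothetical destination URI
    parsed = urlparse(uri)
    token = parse_qs(parsed.query).get("token", [None])[0]

    creds = {"password": token}
    if parsed.path.lstrip("/"):
        creds["database"] = parsed.path.lstrip("/")

    # creds is then wrapped in MotherDuckCredentials and handed to dlt.destinations.motherduck
    print(creds)  # {'password': 'example-token', 'database': 'my_db'}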
ingestr/src/factory.py
CHANGED
@@ -12,6 +12,7 @@ from ingestr.src.destinations import (
     DatabricksDestination,
     DuckDBDestination,
     GCSDestination,
+    MotherduckDestination,
     MsSQLDestination,
     MySqlDestination,
     PostgresDestination,
@@ -85,6 +86,8 @@ SQL_SOURCE_SCHEMES = [
     "mysql",
     "mysql+pymysql",
     "mysql+mysqlconnector",
+    "md",
+    "motherduck",
     "postgres",
     "postgresql",
     "postgresql+psycopg2",
@@ -195,6 +198,8 @@ class SourceDestinationFactory:
         "cratedb": CrateDBDestination,
         "databricks": DatabricksDestination,
         "duckdb": DuckDBDestination,
+        "motherduck": MotherduckDestination,
+        "md": MotherduckDestination,
         "mssql": MsSQLDestination,
         "postgres": PostgresDestination,
         "postgresql": PostgresDestination,
ingestr/src/freshdesk/__init__.py
CHANGED
@@ -4,6 +4,8 @@ etc. to the database"""
 from typing import Any, Dict, Generator, Iterable, List, Optional

 import dlt
+import pendulum
+from dlt.common.time import ensure_pendulum_datetime
 from dlt.sources import DltResource

 from .freshdesk_client import FreshdeskClient
@@ -12,10 +14,12 @@ from .settings import DEFAULT_ENDPOINTS

 @dlt.source()
 def freshdesk_source(
-
+    domain: str,
+    api_secret_key: str,
+    start_date: pendulum.DateTime,
+    end_date: Optional[pendulum.DateTime] = None,
     per_page: int = 100,
-
-    api_secret_key: str = dlt.secrets.value,
+    endpoints: Optional[List[str]] = None,
 ) -> Iterable[DltResource]:
     """
     Retrieves data from specified Freshdesk API endpoints.
@@ -39,7 +43,11 @@ def freshdesk_source(
     def incremental_resource(
         endpoint: str,
         updated_at: Optional[Any] = dlt.sources.incremental(
-            "updated_at",
+            "updated_at",
+            initial_value=start_date.isoformat(),
+            end_value=end_date.isoformat() if end_date else None,
+            range_start="closed",
+            range_end="closed",
         ),
     ) -> Generator[Dict[Any, Any], Any, None]:
         """
@@ -48,15 +56,22 @@ def freshdesk_source(
         to ensure incremental loading.
         """

-
-
-
+        if updated_at.last_value is not None:
+            start_date = ensure_pendulum_datetime(updated_at.last_value)
+        else:
+            start_date = start_date
+
+        if updated_at.end_value is not None:
+            end_date = ensure_pendulum_datetime(updated_at.end_value)
+        else:
+            end_date = pendulum.now(tz="UTC")

         # Use the FreshdeskClient instance to fetch paginated responses
         yield from freshdesk.paginated_response(
             endpoint=endpoint,
             per_page=per_page,
-
+            start_date=start_date,
+            end_date=end_date,
         )

     # Set default endpoints if not provided
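As a minimal sketch (dates invented) of how the resource above resolves its effective window: a cursor value persisted by dlt from a previous run takes precedence over the configured start_date, and a missing end_date falls back to the current UTC time.

    import pendulum
    from dlt.common.time import ensure_pendulum_datetime

    configured_start = pendulum.datetime(2022, 1, 1, tz="UTC")  # hypothetical start_date
    last_value = "2024-05-01T00:00:00+00:00"                    # hypothetical stored cursor
    end_value = None                                            # no end_date configured

    start_date = ensure_pendulum_datetime(last_value) if last_value is not None else configured_start
    end_date = ensure_pendulum_datetime(end_value) if end_value is not None else pendulum.now(tz="UTC")

    assert start_date > configured_start
    assert end_date > start_date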
ingestr/src/freshdesk/freshdesk_client.py
CHANGED
@@ -2,8 +2,9 @@

 import logging
 import time
-from typing import Any, Dict, Iterable
+from typing import Any, Dict, Iterable

+import pendulum
 from dlt.common.typing import TDataItem
 from dlt.sources.helpers import requests

@@ -67,7 +68,8 @@ class FreshdeskClient:
         self,
         endpoint: str,
         per_page: int,
-
+        start_date: pendulum.DateTime,
+        end_date: pendulum.DateTime,
     ) -> Iterable[TDataItem]:
         """
         Fetches a paginated response from a specified endpoint.
@@ -88,8 +90,8 @@ class FreshdeskClient:
                 param_key = (
                     "updated_since" if endpoint == "tickets" else "_updated_since"
                 )
-
-
+
+                params[param_key] = start_date.to_iso8601_string()

             # Handle requests with rate-limiting
             # A maximum of 300 pages (30000 tickets) will be returned.
@@ -98,5 +100,14 @@ class FreshdeskClient:

             if not data:
                 break  # Stop if no data or max page limit reached
-
+
+            filtered_data = [
+                item
+                for item in data
+                if "updated_at" in item
+                and pendulum.parse(item["updated_at"]) <= end_date
+            ]
+            if not filtered_data:
+                break
+            yield filtered_data
             page += 1
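To make the new client-side cutoff concrete, a small sketch with invented items: each page is filtered down to records whose updated_at falls inside the requested window, and pagination stops once nothing on the page qualifies.

    import pendulum

    end_date = pendulum.datetime(2024, 6, 1, tz="UTC")
    data = [
        {"id": 1, "updated_at": "2024-05-30T10:00:00Z"},
        {"id": 2, "updated_at": "2024-06-02T10:00:00Z"},  # newer than the window, dropped
    ]

    filtered_data = [
        item
        for item in data
        if "updated_at" in item and pendulum.parse(item["updated_at"]) <= end_date
    ]
    print([item["id"] for item in filtered_data])  # [1]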
ingestr/src/github/__init__.py
CHANGED
@@ -91,7 +91,9 @@ def github_repo_events(
     """

     # use naming function in table name to generate separate tables for each event
-    @dlt.resource(
+    @dlt.resource(
+        primary_key="id", table_name=lambda i: i["type"], write_disposition="merge"
+    )
     def repo_events(
         last_created_at: dlt.sources.incremental[str] = dlt.sources.incremental(
             "created_at",
@@ -105,7 +107,7 @@ def github_repo_events(
         repos_path = (
             f"/repos/{urllib.parse.quote(owner)}/{urllib.parse.quote(name)}/events"
         )
-
+
         # Get the date range from the incremental state
         start_filter = pendulum.parse(
             last_created_at.last_value or last_created_at.initial_value
@@ -115,7 +117,7 @@ def github_repo_events(
             if last_created_at.end_value
             else pendulum.now()
         )
-
+
         for page in get_rest_pages(access_token, repos_path + "?per_page=100"):
             # Filter events by date range
             filtered_events = []
ingestr/src/github/helpers.py
CHANGED
ingestr/src/influxdb/__init__.py
CHANGED
ingestr/src/mongodb/__init__.py
CHANGED
@@ -106,6 +106,7 @@ def mongodb_collection(
     filter_: Optional[Dict[str, Any]] = None,
     projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = dlt.config.value,
     pymongoarrow_schema: Optional[Any] = None,
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Any:
     """
     A DLT source which loads a collection from a mongo database using PyMongo.
@@ -132,6 +133,7 @@ def mongodb_collection(
             exclude (dict) - {"released": False, "runtime": False}
             Note: Can't mix include and exclude statements '{"title": True, "released": False}`
         pymongoarrow_schema (pymongoarrow.schema.Schema): Mapping of expected field types to convert BSON to Arrow
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -161,4 +163,5 @@ def mongodb_collection(
         filter_=filter_ or {},
         projection=projection,
         pymongoarrow_schema=pymongoarrow_schema,
+        custom_query=custom_query,
     )
ingestr/src/mongodb/helpers.py
CHANGED
@@ -204,7 +204,14 @@ class CollectionLoader:
         cursor = self._limit(cursor, limit)

         while docs_slice := list(islice(cursor, self.chunk_size)):
-
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res


 class CollectionLoaderParallel(CollectionLoader):
@@ -464,6 +471,145 @@ class CollectionArrowLoaderParallel(CollectionLoaderParallel):
                 yield convert_arrow_columns(table)


+class CollectionAggregationLoader(CollectionLoader):
+    """
+    MongoDB collection loader that uses aggregation pipelines instead of find queries.
+    """
+
+    def __init__(
+        self,
+        client: TMongoClient,
+        collection: TCollection,
+        chunk_size: int,
+        incremental: Optional[dlt.sources.incremental[Any]] = None,
+    ) -> None:
+        super().__init__(client, collection, chunk_size, incremental)
+        self.custom_query: Optional[List[Dict[str, Any]]] = None
+
+    def set_custom_query(self, query: List[Dict[str, Any]]):
+        """Set the custom aggregation pipeline query"""
+        self.custom_query = query
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline"""
+        if not self.custom_query:
+            # Fallback to parent method if no custom query
+            yield from super().load_documents(filter_, limit, projection)
+            return
+
+        # Build aggregation pipeline
+        pipeline = list(self.custom_query)  # Copy the query
+
+        # For custom queries, we assume incremental filtering is already handled
+        # via interval placeholders (:interval_start, :interval_end) in the query itself.
+        # We don't add additional incremental filtering to avoid conflicts.
+
+        # Add additional filter if provided
+        if filter_:
+            filter_match = {"$match": filter_}
+            pipeline.insert(0, filter_match)
+
+        # Add limit if specified
+        if limit and limit > 0:
+            pipeline.append({"$limit": limit})
+
+        print("pipeline", pipeline)
+        # Execute aggregation
+        cursor = self.collection.aggregate(pipeline, allowDiskUse=True)
+
+        # Process results in chunks
+        while docs_slice := list(islice(cursor, self.chunk_size)):
+            res = map_nested_in_place(convert_mongo_objs, docs_slice)
+            print("res", res)
+            if len(res) > 0 and "_id" in res[0] and isinstance(res[0]["_id"], dict):
+                yield dlt.mark.with_hints(
+                    res,
+                    dlt.mark.make_hints(columns={"_id": {"data_type": "json"}}),
+                )
+            else:
+                yield res
+
+
+class CollectionAggregationLoaderParallel(CollectionAggregationLoader):
+    """
+    MongoDB collection parallel loader that uses aggregation pipelines.
+    Note: Parallel loading is not supported for aggregation pipelines due to cursor limitations.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(filter_, limit, projection)
+
+
+class CollectionAggregationArrowLoader(CollectionAggregationLoader):
+    """
+    MongoDB collection aggregation loader that uses Apache Arrow for data processing.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[Any]:
+        """Load documents using aggregation pipeline with Arrow format"""
+        logger.warning(
+            "Arrow format is not directly supported for MongoDB aggregation pipelines. Converting to Arrow after loading."
+        )
+
+        # Load documents normally and convert to arrow format
+        for batch in super().load_documents(filter_, limit, projection):
+            if batch:  # Only process non-empty batches
+                try:
+                    from dlt.common.libs.pyarrow import pyarrow
+
+                    # Convert dict batch to arrow table
+                    table = pyarrow.Table.from_pylist(batch)
+                    yield convert_arrow_columns(table)
+                except ImportError:
+                    logger.warning(
+                        "PyArrow not available, falling back to object format"
+                    )
+                    yield batch
+
+
+class CollectionAggregationArrowLoaderParallel(CollectionAggregationArrowLoader):
+    """
+    MongoDB collection parallel aggregation loader with Arrow support.
+    Falls back to sequential loading.
+    """
+
+    def load_documents(
+        self,
+        filter_: Dict[str, Any],
+        limit: Optional[int] = None,
+        projection: Optional[Union[Mapping[str, Any], Iterable[str]]] = None,
+        pymongoarrow_schema: Any = None,
+    ) -> Iterator[TDataItem]:
+        """Load documents using aggregation pipeline with Arrow format (sequential only)"""
+        logger.warning(
+            "Parallel loading is not supported for MongoDB aggregation pipelines. Using sequential loading."
+        )
+        yield from super().load_documents(
+            filter_, limit, projection, pymongoarrow_schema
+        )
+
+
 def collection_documents(
     client: TMongoClient,
     collection: TCollection,
@@ -475,6 +621,7 @@ def collection_documents(
     limit: Optional[int] = None,
     chunk_size: Optional[int] = 10000,
     data_item_format: Optional[TDataItemFormat] = "object",
+    custom_query: Optional[List[Dict[str, Any]]] = None,
 ) -> Iterator[TDataItem]:
     """
     A DLT source which loads data from a Mongo database using PyMongo.
@@ -499,6 +646,7 @@ def collection_documents(
             Supported formats:
                 object - Python objects (dicts, lists).
                 arrow - Apache Arrow tables.
+        custom_query (Optional[List[Dict[str, Any]]]): Custom MongoDB aggregation pipeline to execute instead of find()

     Returns:
         Iterable[DltResource]: A list of DLT resources for each collection to be loaded.
@@ -521,21 +669,48 @@ def collection_documents(
             "create a projection to select fields, `projection` will be ignored."
         )

-
-
-
+    # If custom query is provided, use aggregation loaders
+    if custom_query:
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoaderParallel
+            else:
+                LoaderClass = CollectionAggregationLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionAggregationArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionAggregationLoader  # type: ignore
     else:
-        if
-
+        if parallel:
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoaderParallel
+            else:
+                LoaderClass = CollectionLoaderParallel  # type: ignore
         else:
-
+            if data_item_format == "arrow":
+                LoaderClass = CollectionArrowLoader  # type: ignore
+            else:
+                LoaderClass = CollectionLoader  # type: ignore

     loader = LoaderClass(
         client, collection, incremental=incremental, chunk_size=chunk_size
     )
-
+
+    # Set custom query if provided
+    if custom_query and hasattr(loader, "set_custom_query"):
+        loader.set_custom_query(custom_query)
+
+    # Load documents based on loader type
+    if isinstance(
+        loader,
+        (
+            CollectionArrowLoader,
+            CollectionArrowLoaderParallel,
+            CollectionAggregationArrowLoader,
+            CollectionAggregationArrowLoaderParallel,
+        ),
+    ):
         yield from loader.load_documents(
             limit=limit,
             filter_=filter_,
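To make the control flow above concrete, a small sketch (collection filter, pipeline, and limit are invented) of how CollectionAggregationLoader assembles the final pipeline before calling collection.aggregate: the caller's filter_ is prepended as a $match stage and a positive limit is appended as $limit.

    custom_query = [  # hypothetical user-supplied aggregation pipeline
        {"$match": {"status": "active"}},
        {"$project": {"status": 1, "updated_at": 1}},
    ]
    filter_ = {"tenant_id": 42}  # hypothetical extra filter
    limit = 1000

    pipeline = list(custom_query)  # copy so the original query is untouched
    if filter_:
        pipeline.insert(0, {"$match": filter_})
    if limit and limit > 0:
        pipeline.append({"$limit": limit})

    # pipeline is now:
    # [{'$match': {'tenant_id': 42}},
    #  {'$match': {'status': 'active'}},
    #  {'$project': {'status': 1, 'updated_at': 1}},
    #  {'$limit': 1000}]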
ingestr/src/sources.py
CHANGED
@@ -73,6 +73,20 @@ class SqlSource:

         engine_adapter_callback = None

+        if uri.startswith("md://") or uri.startswith("motherduck://"):
+            parsed_uri = urlparse(uri)
+            query_params = parse_qs(parsed_uri.query)
+            # Convert md:// URI to duckdb:///md: format
+            if parsed_uri.path:
+                db_path = parsed_uri.path
+            else:
+                db_path = ""
+
+            token = query_params.get("token", [""])[0]
+            if not token:
+                raise ValueError("Token is required for MotherDuck connection")
+            uri = f"duckdb:///md:{db_path}?motherduck_token={token}"
+
         if uri.startswith("mysql://"):
             uri = uri.replace("mysql://", "mysql+pymysql://")

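A minimal sketch (URI and token invented) of the rewrite above; note that the path component of the source URI is carried verbatim into the md: database name, and the exact URI shape used here is an assumption rather than documented usage.

    from urllib.parse import parse_qs, urlparse

    uri = "md:///my_db?token=example-token"  # hypothetical MotherDuck source URI
    parsed_uri = urlparse(uri)
    query_params = parse_qs(parsed_uri.query)

    db_path = parsed_uri.path if parsed_uri.path else ""
    token = query_params.get("token", [""])[0]
    if not token:
        raise ValueError("Token is required for MotherDuck connection")

    print(f"duckdb:///md:{db_path}?motherduck_token={token}")
    # duckdb:///md:/my_db?motherduck_token=example-token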
@@ -409,31 +423,181 @@ class MongoDbSource:
         return False

     def dlt_source(self, uri: str, table: str, **kwargs):
-
+        # Check if this is a custom query format (collection:query)
+        if ":" in table:
+            collection_name, query_json = table.split(":", 1)

-
-
-
-            end_value = kwargs.get("interval_end")
+            # Parse and validate the query
+            try:
+                import json

-
-
-
-
-
-
+                query = json.loads(query_json)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid JSON query format: {e}")
+
+            # Validate that it's a list for aggregation pipeline
+            if not isinstance(query, list):
+                raise ValueError(
+                    "Query must be a JSON array representing a MongoDB aggregation pipeline"
+                )
+
+            # Check for incremental load requirements
+            incremental = None
+            if kwargs.get("incremental_key"):
+                start_value = kwargs.get("interval_start")
+                end_value = kwargs.get("interval_end")
+
+                # Validate that incremental key is present in the pipeline
+                incremental_key = kwargs.get("incremental_key")
+                self._validate_incremental_query(query, str(incremental_key))
+
+                incremental = dlt_incremental(
+                    str(incremental_key),
+                    initial_value=start_value,
+                    end_value=end_value,
+                )
+
+            # Substitute interval parameters in the query
+            query = self._substitute_interval_params(query, kwargs)
+
+            # Parse collection name to get database and collection
+            if "." in collection_name:
+                # Handle database.collection format
+                table_fields = table_string_to_dataclass(collection_name)
+                database = table_fields.dataset
+                collection = table_fields.table
+            else:
+                # Single collection name, use default database
+                database = None
+                collection = collection_name
+
+            table_instance = self.table_builder(
+                connection_url=uri,
+                database=database,
+                collection=collection,
+                parallel=False,
+                incremental=incremental,
+                custom_query=query,
+            )
+            table_instance.max_table_nesting = 1
+            return table_instance
+        else:
+            # Default behavior for simple collection names
+            table_fields = table_string_to_dataclass(table)
+
+            incremental = None
+            if kwargs.get("incremental_key"):
+                start_value = kwargs.get("interval_start")
+                end_value = kwargs.get("interval_end")
+
+                incremental = dlt_incremental(
+                    kwargs.get("incremental_key", ""),
+                    initial_value=start_value,
+                    end_value=end_value,
+                )
+
+            table_instance = self.table_builder(
+                connection_url=uri,
+                database=table_fields.dataset,
+                collection=table_fields.table,
+                parallel=False,
+                incremental=incremental,
+            )
+            table_instance.max_table_nesting = 1
+
+            return table_instance
+
+    def _validate_incremental_query(self, query: list, incremental_key: str):
+        """Validate that incremental key is projected in the aggregation pipeline"""
+        # Check if there's a $project stage and if incremental_key is included
+        has_project = False
+        incremental_key_projected = False
+
+        for stage in query:
+            if "$project" in stage:
+                has_project = True
+                project_stage = stage["$project"]
+                if isinstance(project_stage, dict):
+                    # Check if incremental_key is explicitly included
+                    if incremental_key in project_stage:
+                        if project_stage[incremental_key] not in [0, False]:
+                            incremental_key_projected = True
+                    # If there are only inclusions (1 or True values) and incremental_key is not included
+                    elif any(v in [1, True] for v in project_stage.values()):
+                        # This is an inclusion projection, incremental_key must be explicitly included
+                        incremental_key_projected = False
+                    # If there are only exclusions (0 or False values) and incremental_key is not excluded
+                    elif all(
+                        v in [0, False]
+                        for v in project_stage.values()
+                        if v in [0, False, 1, True]
+                    ):
+                        # This is an exclusion projection, incremental_key is included by default
+                        if incremental_key not in project_stage:
+                            incremental_key_projected = True
+                        else:
+                            incremental_key_projected = project_stage[
+                                incremental_key
+                            ] not in [0, False]
+                    else:
+                        # Mixed or unclear projection, assume incremental_key needs to be explicit
+                        incremental_key_projected = False
+
+        # If there's a $project stage but incremental_key is not projected, raise error
+        if has_project and not incremental_key_projected:
+            raise ValueError(
+                f"Incremental key '{incremental_key}' must be included in the projected fields of the aggregation pipeline"
             )

-
-
-
-            collection=table_fields.table,
-            parallel=True,
-            incremental=incremental,
-        )
-        table_instance.max_table_nesting = 1
+    def _substitute_interval_params(self, query: list, kwargs: dict):
+        """Substitute :interval_start and :interval_end placeholders with actual datetime values"""
+        from dlt.common.time import ensure_pendulum_datetime

-
+        # Get interval values and convert them to datetime objects
+        interval_start = kwargs.get("interval_start")
+        interval_end = kwargs.get("interval_end")
+
+        # Convert string dates to datetime objects if needed
+        if interval_start is not None:
+            if isinstance(interval_start, str):
+                pendulum_dt = ensure_pendulum_datetime(interval_start)
+                interval_start = (
+                    pendulum_dt.to_datetime()
+                    if hasattr(pendulum_dt, "to_datetime")
+                    else pendulum_dt
+                )
+            elif hasattr(interval_start, "to_datetime"):
+                interval_start = interval_start.to_datetime()
+
+        if interval_end is not None:
+            if isinstance(interval_end, str):
+                pendulum_dt = ensure_pendulum_datetime(interval_end)
+                interval_end = (
+                    pendulum_dt.to_datetime()
+                    if hasattr(pendulum_dt, "to_datetime")
+                    else pendulum_dt
+                )
+            elif hasattr(interval_end, "to_datetime"):
+                interval_end = interval_end.to_datetime()
+
+        # Deep copy the query and replace placeholders with actual datetime objects
+        def replace_placeholders(obj):
+            if isinstance(obj, dict):
+                result = {}
+                for key, value in obj.items():
+                    if value == ":interval_start" and interval_start is not None:
+                        result[key] = interval_start
+                    elif value == ":interval_end" and interval_end is not None:
+                        result[key] = interval_end
+                    else:
+                        result[key] = replace_placeholders(value)
+                return result
+            elif isinstance(obj, list):
+                return [replace_placeholders(item) for item in obj]
+            else:
+                return obj
+
+        return replace_placeholders(query)


 class LocalCsvSource:
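For illustration, a hypothetical source table value in the new collection:pipeline format accepted by the branch above; the collection name, fields, and pipeline are invented, and :interval_start / :interval_end are the placeholders that _substitute_interval_params later replaces.

    import json

    table = (
        'mydb.events:[{"$match": {"updated_at": {"$gte": ":interval_start", "$lt": ":interval_end"}}},'
        ' {"$project": {"updated_at": 1, "status": 1}}]'
    )

    collection_name, query_json = table.split(":", 1)
    pipeline = json.loads(query_json)

    print(collection_name)             # mydb.events
    print(isinstance(pipeline, list))  # True, as required by the validation above
    print(pipeline[1]["$project"])     # {'updated_at': 1, 'status': 1}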
@@ -961,7 +1125,7 @@ class SlackSource:

 class HubspotSource:
     def handles_incrementality(self) -> bool:
-        return
+        return False

     # hubspot://?api_key=<api_key>
     def dlt_source(self, uri: str, table: str, **kwargs):
@@ -2528,6 +2692,18 @@ class FreshdeskSource:
         if api_key is None:
             raise MissingValueError("api_key", "Freshdesk")

+        start_date = kwargs.get("interval_start")
+        if start_date is not None:
+            start_date = ensure_pendulum_datetime(start_date).in_tz("UTC")
+        else:
+            start_date = ensure_pendulum_datetime("2022-01-01T00:00:00Z")
+
+        end_date = kwargs.get("interval_end")
+        if end_date is not None:
+            end_date = ensure_pendulum_datetime(end_date).in_tz("UTC")
+        else:
+            end_date = None
+
         if table not in [
             "agents",
             "companies",
@@ -2541,7 +2717,10 @@ class FreshdeskSource:
         from ingestr.src.freshdesk import freshdesk_source

         return freshdesk_source(
-            api_secret_key=api_key[0],
+            api_secret_key=api_key[0],
+            domain=domain,
+            start_date=start_date,
+            end_date=end_date,
         ).with_resources(table)

@@ -2684,7 +2863,7 @@ class ElasticsearchSource:

 class AttioSource:
     def handles_incrementality(self) -> bool:
-        return
+        return False

     def dlt_source(self, uri: str, table: str, **kwargs):
         parsed_uri = urlparse(uri)
@@ -3056,7 +3235,7 @@ class InfluxDBSource:

         secure = params.get("secure", ["true"])[0].lower() != "false"
         scheme = "https" if secure else "http"
-
+
         if port:
             host_url = f"{scheme}://{host}:{port}"
         else:
{ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestr
-Version: 0.13.77
+Version: 0.13.79
 Summary: ingestr is a command-line application that ingests data from various sources and stores them in any database.
 Project-URL: Homepage, https://github.com/bruin-data/ingestr
 Project-URL: Issues, https://github.com/bruin-data/ingestr/issues
@@ -355,6 +355,11 @@ Pull requests are welcome. However, please open an issue first to discuss what y
     <td>✅</td>
     <td>❌</td>
   </tr>
+  <tr>
+    <td>MotherDuck</td>
+    <td>✅</td>
+    <td>✅</td>
+  </tr>
   <tr>
     <td>MySQL</td>
     <td>✅</td>
{ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 ingestr/conftest.py,sha256=OE2yxeTCosS9CUFVuqNypm-2ftYvVBeeq7egm3878cI,1981
-ingestr/main.py,sha256=
+ingestr/main.py,sha256=qoWHNcHh0-xVnyQxbQ-SKuTxPb1RNV3ENkCpqO7CLrk,26694
 ingestr/src/.gitignore,sha256=8cX1AZTSI0TcdZFGTmS_oyBjpfCzhOEt0DdAo2dFIY8,203
 ingestr/src/blob.py,sha256=UUWMjHUuoR9xP1XZQ6UANQmnMVyDx3d0X4-2FQC271I,2138
-ingestr/src/buildinfo.py,sha256=
-ingestr/src/destinations.py,sha256=
+ingestr/src/buildinfo.py,sha256=yE0cfxWae8TNJJLYcRmNexeK769vtdz_-vJGzcROgwE,21
+ingestr/src/destinations.py,sha256=M2Yni6wiWcrvZ8EPJemidqxN156l0rehgCc7xuil7mo,22840
 ingestr/src/errors.py,sha256=Ufs4_DfE77_E3vnA1fOQdi6cmuLVNm7_SbFLkL1XPGk,686
-ingestr/src/factory.py,sha256=
+ingestr/src/factory.py,sha256=rF5Ry4o4t8KulSPBtrd7ZKCI_0TH1DAetG0zs9H7oik,6792
 ingestr/src/filters.py,sha256=LLecXe9QkLFkFLUZ92OXNdcANr1a8edDxrflc2ko_KA,1452
 ingestr/src/http_client.py,sha256=bxqsk6nJNXCo-79gW04B53DQO-yr25vaSsqP0AKtjx4,732
 ingestr/src/loader.py,sha256=9NaWAyfkXdqAZSS-N72Iwo36Lbx4PyqIfaaH1dNdkFs,1712
 ingestr/src/partition.py,sha256=BrIP6wFJvyR7Nus_3ElnfxknUXeCipK_E_bB8kZowfc,969
 ingestr/src/resource.py,sha256=ZqmZxFQVGlF8rFPhBiUB08HES0yoTj8sZ--jKfaaVps,1164
-ingestr/src/sources.py,sha256=
+ingestr/src/sources.py,sha256=qZz35cdO-nO9CZsdOJ8Ni56wclNfbGQuGj4nsoHpFxE,115678
 ingestr/src/table_definition.py,sha256=REbAbqdlmUMUuRh8nEQRreWjPVOQ5ZcfqGkScKdCrmk,390
 ingestr/src/time.py,sha256=H_Fk2J4ShXyUM-EMY7MqCLZQhlnZMZvO952bmZPc4yE,254
 ingestr/src/version.py,sha256=J_2xgZ0mKlvuHcjdKCx2nlioneLH0I47JiU_Slr_Nwc,189
@@ -41,7 +41,7 @@ ingestr/src/clickup/helpers.py,sha256=RzDKMUAHccuDhocIQ2ToBXfCERo8CBJqA3t-IPltBC
 ingestr/src/collector/spinner.py,sha256=_ZUqF5MI43hVIULdjF5s5mrAZbhEFXaiWirQmrv3Yk4,1201
 ingestr/src/dynamodb/__init__.py,sha256=swhxkeYBbJ35jn1IghCtvYWT2BM33KynVCh_oR4z28A,2264
 ingestr/src/elasticsearch/__init__.py,sha256=m-q93HgUmTwGDUwHOjHawstWL06TC3WIX3H05szybrY,2556
-ingestr/src/facebook_ads/__init__.py,sha256=
+ingestr/src/facebook_ads/__init__.py,sha256=_9929DYzcq5iLt-l3DmJ4VBZwmoEwgyPZbPstH0ySmI,9725
 ingestr/src/facebook_ads/exceptions.py,sha256=4Nlbc0Mv3i5g-9AoyT-n1PIa8IDi3VCTfEAzholx4Wc,115
 ingestr/src/facebook_ads/helpers.py,sha256=NshS21can1xhRKQzg_o-c6qSxWoC3NnE3FwgJxUnygE,8239
 ingestr/src/facebook_ads/settings.py,sha256=Bsic8RcmH-NfEZ7r_NGospTCmwISK9XaMT5y2NZirtg,4938
@@ -51,11 +51,11 @@ ingestr/src/filesystem/helpers.py,sha256=bg0muSHZr3hMa8H4jN2-LGWzI-SUoKlQNiWJ74-
 ingestr/src/filesystem/readers.py,sha256=a0fKkaRpnAOGsXI3EBNYZa7x6tlmAOsgRzb883StY30,3987
 ingestr/src/frankfurter/__init__.py,sha256=oVi4BiOxPRyckEVrBNunyMAHulPyMgyGRwBbhn-Xz6M,4987
 ingestr/src/frankfurter/helpers.py,sha256=SyrkRTDqvKdQxRHTV5kcSeVG3FEnaK5zxHyNyqtumZ0,1445
-ingestr/src/freshdesk/__init__.py,sha256=
-ingestr/src/freshdesk/freshdesk_client.py,sha256=
+ingestr/src/freshdesk/__init__.py,sha256=ukyorgCNsW_snzsYBDsr3Q0WB8f-to9Fk0enqHHFQlk,3087
+ingestr/src/freshdesk/freshdesk_client.py,sha256=1nFf0K4MQ0KZbWwk4xSbYHaykVqmPLfN39miOFDpWVc,4385
 ingestr/src/freshdesk/settings.py,sha256=0Wr_OMnUZcTlry7BmALssLxD2yh686JW4moLNv12Jnw,409
-ingestr/src/github/__init__.py,sha256=
-ingestr/src/github/helpers.py,sha256=
+ingestr/src/github/__init__.py,sha256=C7b5j6CrxmTItS4tyDa3OYzdAw5c__xboOtoEJYe3wQ,7217
+ingestr/src/github/helpers.py,sha256=rpv_3HzuOl4PQ-FUeA66pev-pgze9SaE8RUHIPYfZ_A,6759
 ingestr/src/github/queries.py,sha256=W34C02jUEdjFmOE7f7u9xvYyBNDMfVZAu0JIRZI2mkU,2302
 ingestr/src/github/settings.py,sha256=N5ahWrDIQ_4IWV9i-hTXxyYduqY9Ym2BTwqsWxcDdJ8,258
 ingestr/src/google_ads/__init__.py,sha256=bH0TtnRWcOUESezpvoA7VEUHAq_0ITGQeX4GGVBfl1I,3725
@@ -75,7 +75,7 @@ ingestr/src/gorgias/helpers.py,sha256=DamuijnvhGY9hysQO4txrVMf4izkGbh5qfBKImdOIN
 ingestr/src/hubspot/__init__.py,sha256=wqHefhc_YRI5dNFCcpvH-UUilNThE49sbGouSBiHYsw,11776
 ingestr/src/hubspot/helpers.py,sha256=k2b-lhxqBNKHoOSHoHegFSsk8xxjjGA0I04V0XyX2b4,7883
 ingestr/src/hubspot/settings.py,sha256=i73MkSiJfRLMFLfiJgYdhp-rhymHTfoqFzZ4uOJdFJM,2456
-ingestr/src/influxdb/__init__.py,sha256=
+ingestr/src/influxdb/__init__.py,sha256=cYsGnDPNHRTe9pp14ogDQgPTCI9TOdyJm1MaNuQLHdk,1290
 ingestr/src/influxdb/client.py,sha256=hCxSNREAWWEvvAV3RQbKaWp2-e_7EE8xmVRjTwLFEFo,1230
 ingestr/src/isoc_pulse/__init__.py,sha256=9b4eN4faatpiwTuRNPuYcEt1hEFDEjua9XhfakUigBk,4648
 ingestr/src/kafka/__init__.py,sha256=QUHsGmdv5_E-3z0GDHXvbk39puwuGDBsyYSDhvbA89E,3595
@@ -92,8 +92,8 @@ ingestr/src/linkedin_ads/dimension_time_enum.py,sha256=EmHRdkFyTAfo4chGjThrwqffW
 ingestr/src/linkedin_ads/helpers.py,sha256=eUWudRVlXl4kqIhfXQ1eVsUpZwJn7UFqKSpnbLfxzds,4498
 ingestr/src/mixpanel/__init__.py,sha256=s1QtqMP0BTGW6YtdCabJFWj7lEn7KujzELwGpBOQgfs,1796
 ingestr/src/mixpanel/client.py,sha256=c_reouegOVYBOwHLfgYFwpmkba0Sxro1Zkml07NCYf0,3602
-ingestr/src/mongodb/__init__.py,sha256=
-ingestr/src/mongodb/helpers.py,sha256=
+ingestr/src/mongodb/__init__.py,sha256=5KNdR2mxJoHSOU1pt-FIJNg9HT4aHPwl6mI31xPBQLA,7487
+ingestr/src/mongodb/helpers.py,sha256=VMGKkSN6FIQ4l-4TUqoc-Ou7r52_zPXuLF33ZN23B_I,30881
 ingestr/src/notion/__init__.py,sha256=36wUui8finbc85ObkRMq8boMraXMUehdABN_AMe_hzA,1834
 ingestr/src/notion/settings.py,sha256=MwQVZViJtnvOegfjXYc_pJ50oUYgSRPgwqu7TvpeMOA,82
 ingestr/src/notion/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -151,8 +151,8 @@ ingestr/testdata/merge_expected.csv,sha256=DReHqWGnQMsf2PBv_Q2pfjsgvikYFnf1zYcQZ
 ingestr/testdata/merge_part1.csv,sha256=Pw8Z9IDKcNU0qQHx1z6BUf4rF_-SxKGFOvymCt4OY9I,185
 ingestr/testdata/merge_part2.csv,sha256=T_GiWxA81SN63_tMOIuemcvboEFeAmbKc7xRXvL9esw,287
 ingestr/tests/unit/test_smartsheets.py,sha256=eiC2CCO4iNJcuN36ONvqmEDryCA1bA1REpayHpu42lk,5058
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
-ingestr-0.13.
+ingestr-0.13.79.dist-info/METADATA,sha256=5dl0NFB3Ach1_lFtE4xOJpud_chn_w0qvepZnnMjRzo,15182
+ingestr-0.13.79.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ingestr-0.13.79.dist-info/entry_points.txt,sha256=oPJy0KBnPWYjDtP1k8qwAihcTLHSZokSQvRAw_wtfJM,46
+ingestr-0.13.79.dist-info/licenses/LICENSE.md,sha256=cW8wIhn8HFE-KLStDF9jHQ1O_ARWP3kTpk_-eOccL24,1075
+ingestr-0.13.79.dist-info/RECORD,,
{ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/WHEEL
File without changes
{ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/entry_points.txt
File without changes
{ingestr-0.13.77.dist-info → ingestr-0.13.79.dist-info}/licenses/LICENSE.md
File without changes