ingestr 0.13.13__py3-none-any.whl → 0.14.104__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestr/conftest.py +72 -0
- ingestr/main.py +134 -87
- ingestr/src/adjust/__init__.py +4 -4
- ingestr/src/adjust/adjust_helpers.py +7 -3
- ingestr/src/airtable/__init__.py +3 -2
- ingestr/src/allium/__init__.py +128 -0
- ingestr/src/anthropic/__init__.py +277 -0
- ingestr/src/anthropic/helpers.py +525 -0
- ingestr/src/applovin_max/__init__.py +6 -4
- ingestr/src/appsflyer/__init__.py +325 -0
- ingestr/src/appsflyer/client.py +49 -45
- ingestr/src/appstore/__init__.py +1 -0
- ingestr/src/arrow/__init__.py +9 -1
- ingestr/src/asana_source/__init__.py +1 -1
- ingestr/src/attio/__init__.py +102 -0
- ingestr/src/attio/helpers.py +65 -0
- ingestr/src/blob.py +37 -10
- ingestr/src/buildinfo.py +1 -1
- ingestr/src/chess/__init__.py +1 -1
- ingestr/src/clickup/__init__.py +85 -0
- ingestr/src/clickup/helpers.py +47 -0
- ingestr/src/collector/spinner.py +43 -0
- ingestr/src/couchbase_source/__init__.py +118 -0
- ingestr/src/couchbase_source/helpers.py +135 -0
- ingestr/src/cursor/__init__.py +83 -0
- ingestr/src/cursor/helpers.py +188 -0
- ingestr/src/destinations.py +508 -27
- ingestr/src/docebo/__init__.py +589 -0
- ingestr/src/docebo/client.py +435 -0
- ingestr/src/docebo/helpers.py +97 -0
- ingestr/src/elasticsearch/__init__.py +80 -0
- ingestr/src/elasticsearch/helpers.py +138 -0
- ingestr/src/errors.py +8 -0
- ingestr/src/facebook_ads/__init__.py +47 -28
- ingestr/src/facebook_ads/helpers.py +59 -37
- ingestr/src/facebook_ads/settings.py +2 -0
- ingestr/src/facebook_ads/utils.py +39 -0
- ingestr/src/factory.py +107 -2
- ingestr/src/filesystem/__init__.py +8 -3
- ingestr/src/filters.py +46 -3
- ingestr/src/fluxx/__init__.py +9906 -0
- ingestr/src/fluxx/helpers.py +209 -0
- ingestr/src/frankfurter/__init__.py +157 -0
- ingestr/src/frankfurter/helpers.py +48 -0
- ingestr/src/freshdesk/__init__.py +89 -0
- ingestr/src/freshdesk/freshdesk_client.py +137 -0
- ingestr/src/freshdesk/settings.py +9 -0
- ingestr/src/fundraiseup/__init__.py +95 -0
- ingestr/src/fundraiseup/client.py +81 -0
- ingestr/src/github/__init__.py +41 -6
- ingestr/src/github/helpers.py +5 -5
- ingestr/src/google_analytics/__init__.py +22 -4
- ingestr/src/google_analytics/helpers.py +124 -6
- ingestr/src/google_sheets/__init__.py +4 -4
- ingestr/src/google_sheets/helpers/data_processing.py +2 -2
- ingestr/src/hostaway/__init__.py +302 -0
- ingestr/src/hostaway/client.py +288 -0
- ingestr/src/http/__init__.py +35 -0
- ingestr/src/http/readers.py +114 -0
- ingestr/src/http_client.py +24 -0
- ingestr/src/hubspot/__init__.py +66 -23
- ingestr/src/hubspot/helpers.py +52 -22
- ingestr/src/hubspot/settings.py +14 -7
- ingestr/src/influxdb/__init__.py +46 -0
- ingestr/src/influxdb/client.py +34 -0
- ingestr/src/intercom/__init__.py +142 -0
- ingestr/src/intercom/helpers.py +674 -0
- ingestr/src/intercom/settings.py +279 -0
- ingestr/src/isoc_pulse/__init__.py +159 -0
- ingestr/src/jira_source/__init__.py +340 -0
- ingestr/src/jira_source/helpers.py +439 -0
- ingestr/src/jira_source/settings.py +170 -0
- ingestr/src/kafka/__init__.py +4 -1
- ingestr/src/kinesis/__init__.py +139 -0
- ingestr/src/kinesis/helpers.py +82 -0
- ingestr/src/klaviyo/{_init_.py → __init__.py} +5 -6
- ingestr/src/linear/__init__.py +634 -0
- ingestr/src/linear/helpers.py +111 -0
- ingestr/src/linkedin_ads/helpers.py +0 -1
- ingestr/src/mailchimp/__init__.py +126 -0
- ingestr/src/mailchimp/helpers.py +226 -0
- ingestr/src/mailchimp/settings.py +164 -0
- ingestr/src/masking.py +344 -0
- ingestr/src/mixpanel/__init__.py +62 -0
- ingestr/src/mixpanel/client.py +99 -0
- ingestr/src/monday/__init__.py +246 -0
- ingestr/src/monday/helpers.py +392 -0
- ingestr/src/monday/settings.py +328 -0
- ingestr/src/mongodb/__init__.py +72 -8
- ingestr/src/mongodb/helpers.py +915 -38
- ingestr/src/partition.py +32 -0
- ingestr/src/phantombuster/__init__.py +65 -0
- ingestr/src/phantombuster/client.py +87 -0
- ingestr/src/pinterest/__init__.py +82 -0
- ingestr/src/pipedrive/__init__.py +198 -0
- ingestr/src/pipedrive/helpers/__init__.py +23 -0
- ingestr/src/pipedrive/helpers/custom_fields_munger.py +102 -0
- ingestr/src/pipedrive/helpers/pages.py +115 -0
- ingestr/src/pipedrive/settings.py +27 -0
- ingestr/src/pipedrive/typing.py +3 -0
- ingestr/src/plusvibeai/__init__.py +335 -0
- ingestr/src/plusvibeai/helpers.py +544 -0
- ingestr/src/plusvibeai/settings.py +252 -0
- ingestr/src/quickbooks/__init__.py +117 -0
- ingestr/src/resource.py +40 -0
- ingestr/src/revenuecat/__init__.py +83 -0
- ingestr/src/revenuecat/helpers.py +237 -0
- ingestr/src/salesforce/__init__.py +15 -8
- ingestr/src/shopify/__init__.py +1 -17
- ingestr/src/smartsheets/__init__.py +82 -0
- ingestr/src/snapchat_ads/__init__.py +489 -0
- ingestr/src/snapchat_ads/client.py +72 -0
- ingestr/src/snapchat_ads/helpers.py +535 -0
- ingestr/src/socrata_source/__init__.py +83 -0
- ingestr/src/socrata_source/helpers.py +85 -0
- ingestr/src/socrata_source/settings.py +8 -0
- ingestr/src/solidgate/__init__.py +219 -0
- ingestr/src/solidgate/helpers.py +154 -0
- ingestr/src/sources.py +2933 -245
- ingestr/src/stripe_analytics/__init__.py +49 -21
- ingestr/src/stripe_analytics/helpers.py +286 -1
- ingestr/src/stripe_analytics/settings.py +62 -10
- ingestr/src/telemetry/event.py +10 -9
- ingestr/src/tiktok_ads/__init__.py +12 -6
- ingestr/src/tiktok_ads/tiktok_helpers.py +0 -1
- ingestr/src/trustpilot/__init__.py +48 -0
- ingestr/src/trustpilot/client.py +48 -0
- ingestr/src/wise/__init__.py +68 -0
- ingestr/src/wise/client.py +63 -0
- ingestr/src/zoom/__init__.py +99 -0
- ingestr/src/zoom/helpers.py +102 -0
- ingestr/tests/unit/test_smartsheets.py +133 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/METADATA +229 -19
- ingestr-0.14.104.dist-info/RECORD +203 -0
- ingestr/src/appsflyer/_init_.py +0 -24
- ingestr-0.13.13.dist-info/RECORD +0 -115
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/WHEEL +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/entry_points.txt +0 -0
- {ingestr-0.13.13.dist-info → ingestr-0.14.104.dist-info}/licenses/LICENSE.md +0 -0
ingestr/conftest.py
ADDED
@@ -0,0 +1,72 @@
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+from main_test import DESTINATIONS, SOURCES  # type: ignore
+
+
+def pytest_configure(config):
+    if is_master(config):
+        config.shared_directory = tempfile.mkdtemp()
+
+
+def pytest_configure_node(node):
+    """xdist hook"""
+    node.workerinput["shared_directory"] = node.config.shared_directory
+
+
+@pytest.fixture(scope="session")
+def shared_directory(request):
+    if is_master(request.config):
+        return request.config.shared_directory
+    else:
+        return request.config.workerinput["shared_directory"]
+
+
+def is_master(config):
+    """True if the code running the given pytest.config object is running in a xdist master
+    node or not running xdist at all.
+    """
+    return not hasattr(config, "workerinput")
+
+
+def start_containers(config):
+    if hasattr(config, "workerinput"):
+        return
+
+    unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+    for container in unique_containers:
+        container.container_lock_dir = config.shared_directory
+
+    with ThreadPoolExecutor() as executor:
+        for container in unique_containers:
+            executor.submit(container.start_fully)
+        # futures = [
+        #     executor.submit(container.start_fully) for container in unique_containers
+        # ]
+        # # Wait for all futures to complete
+        # for future in futures:
+        #     future.result()
+
+
+def stop_containers(config):
+    if hasattr(config, "workerinput"):
+        return
+
+    should_manage_containers = os.environ.get("PYTEST_XDIST_WORKER", "gw0") == "gw0"
+    if not should_manage_containers:
+        return
+
+    unique_containers = set(SOURCES.values()) | set(DESTINATIONS.values())
+
+    for container in unique_containers:
+        container.stop_fully()
+
+
+def pytest_sessionstart(session):
+    start_containers(session.config)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    stop_containers(session.config)
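The new conftest.py coordinates container-backed test dependencies across pytest-xdist workers: only the coordinating process (the one without a `workerinput` attribute) creates the shared lock directory and starts or stops containers, and workers receive that directory through `workerinput`. The container objects themselves come from `main_test.SOURCES` and `main_test.DESTINATIONS`, which are not part of this diff; the sketch below is a hypothetical minimal interface such an object would need to expose for this conftest to work. The class name and the docker invocation are assumptions, not ingestr's actual test helpers.

# Hypothetical sketch of the container interface conftest.py relies on.
# Only container_lock_dir, start_fully, and stop_fully appear in the diff;
# everything else here is assumed.
import subprocess
from typing import Optional


class ContainerFixture:
    def __init__(self, image: str, name: str) -> None:
        self.image = image
        self.name = name
        self.container_lock_dir: Optional[str] = None  # set by start_containers()

    def start_fully(self) -> None:
        # The real helpers would also wait for readiness and use the shared
        # lock directory to avoid double-starts across xdist workers.
        subprocess.run(
            ["docker", "run", "-d", "--rm", "--name", self.name, self.image],
            check=True,
        )

    def stop_fully(self) -> None:
        subprocess.run(["docker", "stop", self.name], check=False)
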
ingestr/main.py
CHANGED
@@ -1,16 +1,22 @@
+import warnings
 from datetime import datetime
 from enum import Enum
 from typing import Optional

 import typer
-from dlt.common.runtime.collector import Collector
 from rich.console import Console
-from rich.status import Status
 from typing_extensions import Annotated

-from ingestr.src.filters import cast_set_to_list
 from ingestr.src.telemetry.event import track

+try:
+    from duckdb_engine import DuckDBEngineWarning
+
+    warnings.filterwarnings("ignore", category=DuckDBEngineWarning)
+except ImportError:
+    # duckdb-engine not installed
+    pass
+
 app = typer.Typer(
     name="ingestr",
     help="ingestr is the CLI tool to ingest data from one source to another",
@@ -32,56 +38,18 @@ DATE_FORMATS = [

 # https://dlthub.com/docs/dlt-ecosystem/file-formats/parquet#supported-destinations
 PARQUET_SUPPORTED_DESTINATIONS = [
-    "bigquery",
+    "athena",
     "duckdb",
     "snowflake",
     "databricks",
     "synapse",
+    "s3",
 ]

 # these sources would return a JSON for sure, which means they cannot be used with Parquet loader for BigQuery
 JSON_RETURNING_SOURCES = ["notion"]


-class SpinnerCollector(Collector):
-    status: Status
-    current_step: str
-    started: bool
-
-    def __init__(self) -> None:
-        self.status = Status("Ingesting data...", spinner="dots")
-        self.started = False
-
-    def update(
-        self,
-        name: str,
-        inc: int = 1,
-        total: Optional[int] = None,
-        message: Optional[str] = None,  # type: ignore
-        label: str = "",
-        **kwargs,
-    ) -> None:
-        self.status.update(self.current_step)
-
-    def _start(self, step: str) -> None:
-        self.current_step = self.__step_to_label(step)
-        self.status.start()
-
-    def __step_to_label(self, step: str) -> str:
-        verb = step.split(" ")[0].lower()
-        if verb.startswith("normalize"):
-            return "Normalizing the data"
-        elif verb.startswith("load"):
-            return "Loading the data to the destination"
-        elif verb.startswith("extract"):
-            return "Extracting the data from the source"
-
-        return f"{verb.capitalize()} the data"
-
-    def _stop(self) -> None:
-        self.status.stop()
-
-
 class IncrementalStrategy(str, Enum):
     create_replace = "replace"
     append = "append"
@@ -99,6 +67,7 @@ class LoaderFileFormat(str, Enum):


 class SqlBackend(str, Enum):
+    default = "default"
     sqlalchemy = "sqlalchemy"
     pyarrow = "pyarrow"
     connectorx = "connectorx"
@@ -124,40 +93,44 @@ class SqlReflectionLevel(str, Enum):
 def ingest(
     source_uri: Annotated[
         str,
-        typer.Option(
+        typer.Option(
+            help="The URI of the [green]source[/green]",
+            envvar=["SOURCE_URI", "INGESTR_SOURCE_URI"],
+        ),
     ],  # type: ignore
     dest_uri: Annotated[
         str,
         typer.Option(
-            help="The URI of the [cyan]destination[/cyan]",
+            help="The URI of the [cyan]destination[/cyan]",
+            envvar=["DESTINATION_URI", "INGESTR_DESTINATION_URI"],
         ),
     ],  # type: ignore
     source_table: Annotated[
         str,
         typer.Option(
             help="The table name in the [green]source[/green] to fetch",
-            envvar="SOURCE_TABLE",
+            envvar=["SOURCE_TABLE", "INGESTR_SOURCE_TABLE"],
         ),
     ],  # type: ignore
     dest_table: Annotated[
         str,
         typer.Option(
             help="The table in the [cyan]destination[/cyan] to save the data into",
-            envvar="DESTINATION_TABLE",
+            envvar=["DESTINATION_TABLE", "INGESTR_DESTINATION_TABLE"],
         ),
     ] = None,  # type: ignore
     incremental_key: Annotated[
         Optional[str],
         typer.Option(
             help="The incremental key from the table to be used for incremental strategies",
-            envvar="INCREMENTAL_KEY",
+            envvar=["INCREMENTAL_KEY", "INGESTR_INCREMENTAL_KEY"],
         ),
     ] = None,  # type: ignore
     incremental_strategy: Annotated[
         IncrementalStrategy,
         typer.Option(
             help="The incremental strategy to use",
-            envvar="INCREMENTAL_STRATEGY",
+            envvar=["INCREMENTAL_STRATEGY", "INGESTR_INCREMENTAL_STRATEGY"],
         ),
     ] = IncrementalStrategy.create_replace,  # type: ignore
     interval_start: Annotated[
@@ -165,7 +138,7 @@ def ingest(
         typer.Option(
             help="The start of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_START",
+            envvar=["INTERVAL_START", "INGESTR_INTERVAL_START"],
         ),
     ] = None,  # type: ignore
     interval_end: Annotated[
@@ -173,128 +146,149 @@ def ingest(
         typer.Option(
             help="The end of the interval the incremental key will cover",
             formats=DATE_FORMATS,
-            envvar="INTERVAL_END",
+            envvar=["INTERVAL_END", "INGESTR_INTERVAL_END"],
         ),
     ] = None,  # type: ignore
     primary_key: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The key that will be used to deduplicate the resulting table",
-            envvar="PRIMARY_KEY",
+            envvar=["PRIMARY_KEY", "INGESTR_PRIMARY_KEY"],
         ),
     ] = None,  # type: ignore
     partition_by: Annotated[
         Optional[str],
         typer.Option(
             help="The partition key to be used for partitioning the destination table",
-            envvar="PARTITION_BY",
+            envvar=["PARTITION_BY", "INGESTR_PARTITION_BY"],
         ),
     ] = None,  # type: ignore
     cluster_by: Annotated[
         Optional[str],
         typer.Option(
             help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
-            envvar="CLUSTER_BY",
+            envvar=["CLUSTER_BY", "INGESTR_CLUSTER_BY"],
         ),
     ] = None,  # type: ignore
     yes: Annotated[
         Optional[bool],
         typer.Option(
             help="Skip the confirmation prompt and ingest right away",
-            envvar="SKIP_CONFIRMATION",
+            envvar=["SKIP_CONFIRMATION", "INGESTR_SKIP_CONFIRMATION"],
        ),
     ] = False,  # type: ignore
     full_refresh: Annotated[
         bool,
         typer.Option(
             help="Ignore the state and refresh the destination table completely",
-            envvar="FULL_REFRESH",
+            envvar=["FULL_REFRESH", "INGESTR_FULL_REFRESH"],
         ),
     ] = False,  # type: ignore
     progress: Annotated[
         Progress,
         typer.Option(
             help="The progress display type, must be one of 'interactive', 'log'",
-            envvar="PROGRESS",
+            envvar=["PROGRESS", "INGESTR_PROGRESS"],
         ),
     ] = Progress.interactive,  # type: ignore
     sql_backend: Annotated[
         SqlBackend,
         typer.Option(
             help="The SQL backend to use",
-            envvar="SQL_BACKEND",
+            envvar=["SQL_BACKEND", "INGESTR_SQL_BACKEND"],
         ),
-    ] = SqlBackend.
+    ] = SqlBackend.default,  # type: ignore
     loader_file_format: Annotated[
         Optional[LoaderFileFormat],
         typer.Option(
             help="The file format to use when loading data",
-            envvar="LOADER_FILE_FORMAT",
+            envvar=["LOADER_FILE_FORMAT", "INGESTR_LOADER_FILE_FORMAT"],
         ),
     ] = None,  # type: ignore
     page_size: Annotated[
         Optional[int],
         typer.Option(
             help="The page size to be used when fetching data from SQL sources",
-            envvar="PAGE_SIZE",
+            envvar=["PAGE_SIZE", "INGESTR_PAGE_SIZE"],
         ),
     ] = 50000,  # type: ignore
     loader_file_size: Annotated[
         Optional[int],
         typer.Option(
             help="The file size to be used by the loader to split the data into multiple files. This can be set independent of the page size, since page size is used for fetching the data from the sources whereas this is used for the processing/loading part.",
-            envvar="LOADER_FILE_SIZE",
+            envvar=["LOADER_FILE_SIZE", "INGESTR_LOADER_FILE_SIZE"],
         ),
     ] = 100000,  # type: ignore
     schema_naming: Annotated[
         SchemaNaming,
         typer.Option(
             help="The naming convention to use when moving the tables from source to destination. The default behavior is explained here: https://dlthub.com/docs/general-usage/schema#naming-convention",
-            envvar="SCHEMA_NAMING",
+            envvar=["SCHEMA_NAMING", "INGESTR_SCHEMA_NAMING"],
         ),
     ] = SchemaNaming.default,  # type: ignore
     pipelines_dir: Annotated[
         Optional[str],
         typer.Option(
             help="The path to store dlt-related pipeline metadata. By default, ingestr will create a temporary directory and delete it after the execution is done in order to make retries stateless.",
-            envvar="PIPELINES_DIR",
+            envvar=["PIPELINES_DIR", "INGESTR_PIPELINES_DIR"],
         ),
     ] = None,  # type: ignore
     extract_parallelism: Annotated[
         Optional[int],
         typer.Option(
             help="The number of parallel jobs to run for extracting data from the source, only applicable for certain sources",
-            envvar="EXTRACT_PARALLELISM",
+            envvar=["EXTRACT_PARALLELISM", "INGESTR_EXTRACT_PARALLELISM"],
         ),
     ] = 5,  # type: ignore
     sql_reflection_level: Annotated[
         SqlReflectionLevel,
         typer.Option(
             help="The reflection level to use when reflecting the table schema from the source",
-            envvar="SQL_REFLECTION_LEVEL",
+            envvar=["SQL_REFLECTION_LEVEL", "INGESTR_SQL_REFLECTION_LEVEL"],
         ),
     ] = SqlReflectionLevel.full,  # type: ignore
     sql_limit: Annotated[
         Optional[int],
         typer.Option(
             help="The limit to use when fetching data from the source",
-            envvar="SQL_LIMIT",
+            envvar=["SQL_LIMIT", "INGESTR_SQL_LIMIT"],
         ),
     ] = None,  # type: ignore
     sql_exclude_columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The columns to exclude from the source table",
-            envvar="SQL_EXCLUDE_COLUMNS",
+            envvar=["SQL_EXCLUDE_COLUMNS", "INGESTR_SQL_EXCLUDE_COLUMNS"],
         ),
     ] = [],  # type: ignore
     columns: Annotated[
         Optional[list[str]],
         typer.Option(
             help="The column types to be used for the destination table in the format of 'column_name:column_type'",
-            envvar="
+            envvar=["INGESTR_COLUMNS"],
+        ),
+    ] = None,  # type: ignore
+    yield_limit: Annotated[
+        Optional[int],
+        typer.Option(
+            help="Limit the number of pages yielded from the source",
+            envvar=["YIELD_LIMIT", "INGESTR_YIELD_LIMIT"],
+        ),
+    ] = None,  # type: ignore
+    staging_bucket: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The staging bucket to be used for the ingestion, must be prefixed with 'gs://' or 's3://'",
+            envvar=["STAGING_BUCKET", "INGESTR_STAGING_BUCKET"],
         ),
     ] = None,  # type: ignore
+    mask: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="Column masking configuration in format 'column:algorithm[:param]'. Can be specified multiple times.",
+            envvar=["MASK", "INGESTR_MASK"],
+        ),
+    ] = [],  # type: ignore
 ):
     import hashlib
     import tempfile
@@ -303,14 +297,22 @@ def ingest(
     import dlt
     import humanize
     import typer
-    from dlt.common.data_types import TDataType
-    from dlt.common.destination import Destination
     from dlt.common.pipeline import LoadInfo
     from dlt.common.runtime.collector import Collector, LogCollector
     from dlt.common.schema.typing import TColumnSchema

+    import ingestr.src.partition as partition
+    import ingestr.src.resource as resource
+    from ingestr.src.collector.spinner import SpinnerCollector
+    from ingestr.src.destinations import AthenaDestination
     from ingestr.src.factory import SourceDestinationFactory
-    from ingestr.src.
+    from ingestr.src.filters import (
+        cast_set_to_list,
+        cast_spanner_types,
+        create_masking_filter,
+        handle_mysql_empty_dates,
+    )
+    from ingestr.src.sources import MongoDbSource

     def report_errors(run_info: LoadInfo):
         for load_package in run_info.load_packages:
@@ -345,7 +347,7 @@ def ingest(
         return (source_table, dest_table)

     def validate_loader_file_format(
-        dlt_dest
+        dlt_dest, loader_file_format: Optional[LoaderFileFormat]
     ):
         if (
             loader_file_format
@@ -357,17 +359,11 @@ def ingest(
         )
         raise typer.Abort()

-    def 
-        if hasattr(source, "selected_resources") and source.selected_resources:
-            resource_names = list(source.selected_resources.keys())
-            for res in resource_names:
-                executable(source.resources[res])
-        else:
-            executable(source)
-
-    def parse_columns(columns: list[str]) -> dict[str, TDataType]:
+    def parse_columns(columns: list[str]) -> dict:
         from typing import cast, get_args

+        from dlt.common.data_types import TDataType
+
         possible_types = get_args(TDataType)

         types: dict[str, TDataType] = {}
@@ -400,6 +396,7 @@ def ingest(
     dlt.config["data_writer.file_max_items"] = loader_file_size
     dlt.config["extract.workers"] = extract_parallelism
     dlt.config["extract.max_parallel_items"] = extract_parallelism
+    dlt.config["load.raise_on_max_retries"] = 15
     if schema_naming != SchemaNaming.default:
         dlt.config["schema.naming"] = schema_naming.value

@@ -451,7 +448,9 @@ def ingest(
         pipelines_dir = tempfile.mkdtemp()
         is_pipelines_dir_temp = True

-    dlt_dest = destination.dlt_dest(
+    dlt_dest = destination.dlt_dest(
+        uri=dest_uri, dest_table=dest_table, staging_bucket=staging_bucket
+    )
     validate_loader_file_format(dlt_dest, loader_file_format)

     if partition_by:
@@ -473,7 +472,7 @@ def ingest(

         column_hints[key]["primary_key"] = True

-    pipeline = dlt.pipeline(
+    pipeline = dlt.pipeline(  # type: ignore
         pipeline_name=m.hexdigest(),
         destination=dlt_dest,
         progress=progressInstance,
@@ -510,6 +509,7 @@ def ingest(
     print(
         f"[bold yellow] Primary Key:[/bold yellow] {primary_key if primary_key else 'None'}"
     )
+    print(f"[bold yellow] Pipeline ID:[/bold yellow] {m.hexdigest()}")
     print()

     if not yes:
@@ -539,6 +539,15 @@ def ingest(
     if interval_end:
         interval_end = interval_end.date()  # type: ignore

+    if factory.source_scheme.startswith("spanner"):
+        # we tend to use the 'pyarrow' backend in general, however, it has issues with JSON objects, so we override it to 'sqlalchemy' for Spanner.
+        if sql_backend.value == SqlBackend.default:
+            sql_backend = SqlBackend.sqlalchemy
+
+    # this allows us to identify the cases where the user does not have a preference, so that for some sources we can override it.
+    if sql_backend == SqlBackend.default:
+        sql_backend = SqlBackend.pyarrow
+
     dlt_source = source.dlt_source(
         uri=source_uri,
         table=source_table,
@@ -551,22 +560,55 @@ def ingest(
         sql_reflection_level=sql_reflection_level.value,
         sql_limit=sql_limit,
         sql_exclude_columns=sql_exclude_columns,
+        extract_parallelism=extract_parallelism,
     )

-
+    resource.for_each(dlt_source, lambda x: x.add_map(cast_set_to_list))
+    if factory.source_scheme.startswith("mysql"):
+        resource.for_each(dlt_source, lambda x: x.add_map(handle_mysql_empty_dates))
+
+    if factory.source_scheme.startswith("spanner"):
+        resource.for_each(dlt_source, lambda x: x.add_map(cast_spanner_types))
+
+    if factory.source_scheme.startswith(
+        "mmap"
+    ) and factory.destination_scheme.startswith("clickhouse"):
+        # https://github.com/dlt-hub/dlt/issues/2248
+        # TODO(turtledev): only apply for write dispositions that actually cause an exception.
+        # TODO(turtledev): make batch size configurable
+        import ingestr.src.arrow as arrow
+
+        resource.for_each(dlt_source, lambda x: x.add_map(arrow.as_list))
+
+    if mask:
+        masking_filter = create_masking_filter(mask)
+        resource.for_each(dlt_source, lambda x: x.add_map(masking_filter))
+
+    if yield_limit:
+        resource.for_each(dlt_source, lambda x: x.add_limit(yield_limit))
+
+    if isinstance(source, MongoDbSource):
+        from ingestr.src.resource import TypeHintMap
+
+        resource.for_each(
+            dlt_source, lambda x: x.add_map(TypeHintMap().type_hint_map)
+        )

     def col_h(x):
         if column_hints:
             x.apply_hints(columns=column_hints)

-
+    resource.for_each(dlt_source, col_h)
+
+    if isinstance(destination, AthenaDestination) and partition_by:
+        partition.apply_athena_hints(dlt_source, partition_by, column_hints)

     if original_incremental_strategy == IncrementalStrategy.delete_insert:

         def set_primary_key(x):
             x.incremental.primary_key = ()

-
+        resource.for_each(dlt_source, set_primary_key)

     if (
         factory.destination_scheme in PARQUET_SUPPORTED_DESTINATIONS
@@ -585,6 +627,10 @@ def ingest(
     if incremental_strategy != IncrementalStrategy.none:
         write_disposition = incremental_strategy.value

+    if factory.source_scheme == "influxdb":
+        if primary_key:
+            write_disposition = "merge"
+
     start_time = datetime.now()

     run_info: LoadInfo = pipeline.run(
@@ -592,6 +638,7 @@ def ingest(
         **destination.dlt_run_params(
             uri=dest_uri,
             table=dest_table,
+            staging_bucket=staging_bucket,
         ),
         write_disposition=write_disposition,  # type: ignore
         primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
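Several of the new code paths above apply per-resource hooks through `resource.for_each(dlt_source, ...)` from the new `ingestr/src/resource.py` module, replacing the helper that used to be inlined in `ingest()`. Reconstructed from that removed inline helper, a minimal sketch of such a `for_each` looks like the following; the actual `resource.py` is not shown in this excerpt and may differ.

# Minimal sketch of a for_each helper, based on the inline helper this diff
# removes from main.py; the real ingestr/src/resource.py may add more.
from typing import Any, Callable


def for_each(source: Any, executable: Callable[[Any], Any]) -> None:
    # dlt sources expose their selected resources; standalone resources do not.
    if hasattr(source, "selected_resources") and source.selected_resources:
        for name in list(source.selected_resources.keys()):
            executable(source.resources[name])
    else:
        executable(source)
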
ingestr/src/adjust/__init__.py
CHANGED
@@ -46,7 +46,7 @@ def adjust_source(
     filters: Optional[dict] = None,
 ) -> Sequence[DltResource]:
     @dlt.resource(write_disposition="merge", merge_key="day")
-    def campaigns():
+    def campaigns() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -57,12 +57,12 @@ def adjust_source(
         )

     @dlt.resource(write_disposition="replace", primary_key="id")
-    def events():
+    def events() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield adjust_api.fetch_events()

     @dlt.resource(write_disposition="merge", merge_key="day")
-    def creatives():
+    def creatives() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
@@ -95,7 +95,7 @@ def adjust_source(
         primary_key=dimensions,
         columns=type_hints,
     )
-    def custom():
+    def custom() -> DltResource:
         adjust_api = AdjustAPI(api_key=api_key)
         yield from adjust_api.fetch_report_data(
             start_date=start_date,
ingestr/src/adjust/adjust_helpers.py
CHANGED

@@ -36,7 +36,7 @@ class AdjustAPI:
     def __init__(self, api_key):
         self.api_key = api_key
         self.request_client = Client(
-            request_timeout=
+            request_timeout=1000,  # Adjust support recommends 1000 seconds of read timeout.
             raise_for_status=False,
             retry_condition=retry_on_limit,
             request_max_attempts=12,
@@ -82,7 +82,9 @@ class AdjustAPI:
             items = result.get("rows", [])
             yield items
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )

     def fetch_events(self):
         headers = {"Authorization": f"Bearer {self.api_key}"}
@@ -93,7 +95,9 @@ class AdjustAPI:
             result = response.json()
             yield result
         else:
-            raise HTTPError(
+            raise HTTPError(
+                f"Request failed with status code: {response.status_code}, {response.text}."
+            )


 def parse_filters(filters_raw: str) -> dict:
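The `Client(...)` call above configures dlt's HTTP requests helper to retry calls for which the `retry_condition` predicate returns True, up to `request_max_attempts` times, with a long read timeout. The `retry_on_limit` predicate is defined elsewhere in the adjust source and is not part of this excerpt, so its body in the sketch below is an assumption; only the `Client` keyword arguments are taken from the diff.

# Sketch only: the real retry_on_limit lives in the adjust source package and
# may inspect the response differently; this shows the general shape.
from typing import Optional

from dlt.sources.helpers.requests import Client
from requests import Response


def retry_on_limit(
    response: Optional[Response], exception: Optional[BaseException]
) -> bool:
    # Retry when the Adjust API signals rate limiting (assumed: HTTP 429).
    return response is not None and response.status_code == 429


request_client = Client(
    request_timeout=1000,  # long read timeout, per the comment in the diff
    raise_for_status=False,
    retry_condition=retry_on_limit,
    request_max_attempts=12,
)
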
ingestr/src/airtable/__init__.py
CHANGED
@@ -9,7 +9,7 @@ import pyairtable
 from dlt.sources import DltResource


-@dlt.source
+@dlt.source(max_table_nesting=1)
 def airtable_source(
     base_id: str = dlt.config.value,
     table_names: Optional[List[str]] = dlt.config.value,
@@ -50,12 +50,13 @@ def airtable_resource(
         It starts with "app". See https://support.airtable.com/docs/finding-airtable-ids
         table (Dict[str, Any]): Metadata about an airtable, does not contain the actual records
     """
+
     primary_key_id = table["primaryFieldId"]
     primary_key_field = [
         field for field in table["fields"] if field["id"] == primary_key_id
     ][0]
     table_name: str = table["name"]
-    primary_key: List[str] = [primary_key_field[
+    primary_key: List[str] = [f"fields__{primary_key_field['name']}".lower()]
     air_table = api.table(base_id, table["id"])

     # Table.iterate() supports rich customization options, such as chunk size, fields, cell format, timezone, locale, and view