ingestr 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff shows the changes between the two package versions as published to their public registry. It is provided for informational purposes only.


ingestr/main.py CHANGED
@@ -1,19 +1,14 @@
-import hashlib
-import tempfile
 from datetime import datetime
 from enum import Enum
 from typing import Optional
 
-import dlt
-import humanize
 import typer
-from dlt.common.pipeline import LoadInfo
-from dlt.common.runtime.collector import Collector, LogCollector
+from dlt.common.runtime.collector import Collector
 from rich.console import Console
 from rich.status import Status
 from typing_extensions import Annotated
 
-from ingestr.src.factory import SourceDestinationFactory
+from ingestr.src.filters import cast_set_to_list
 from ingestr.src.telemetry.event import track
 
 app = typer.Typer(
@@ -118,6 +113,12 @@ class SchemaNaming(str, Enum):
     direct = "direct"
 
 
+class SqlReflectionLevel(str, Enum):
+    minimal = "minimal"
+    full = "full"
+    full_with_precision = "full_with_precision"
+
+
 @app.command()
 def ingest(
     source_uri: Annotated[
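The new `SqlReflectionLevel` enum appears to mirror the reflection levels of dlt's `sql_database` source: `minimal` reflects only column names and nullability, `full` adds data types, and `full_with_precision` also keeps precision/scale. A small sketch of how the CLI string resolves into the enum, assuming it stays importable from `ingestr.main` as defined above:

```python
# Sketch: the value given via --sql-reflection-level or SQL_REFLECTION_LEVEL
# is parsed into the str-based enum and passed downstream as its .value.
from ingestr.main import SqlReflectionLevel  # assumed module-level definition, per the hunk

level = SqlReflectionLevel("full_with_precision")
assert level is SqlReflectionLevel.full_with_precision
assert level.value == "full_with_precision"
```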
@@ -181,6 +182,20 @@ def ingest(
             envvar="PRIMARY_KEY",
         ),
     ] = None,  # type: ignore
+    partition_by: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The partition key to be used for partitioning the destination table",
+            envvar="PARTITION_BY",
+        ),
+    ] = None,  # type: ignore
+    cluster_by: Annotated[
+        Optional[str],
+        typer.Option(
+            help="The clustering key to be used for clustering the destination table, not every destination supports clustering.",
+            envvar="CLUSTER_BY",
+        ),
+    ] = None,  # type: ignore
     yes: Annotated[
         Optional[bool],
         typer.Option(
@@ -251,7 +266,88 @@ def ingest(
             envvar="EXTRACT_PARALLELISM",
         ),
     ] = 5,  # type: ignore
+    sql_reflection_level: Annotated[
+        SqlReflectionLevel,
+        typer.Option(
+            help="The reflection level to use when reflecting the table schema from the source",
+            envvar="SQL_REFLECTION_LEVEL",
+        ),
+    ] = SqlReflectionLevel.full,  # type: ignore
+    sql_limit: Annotated[
+        Optional[int],
+        typer.Option(
+            help="The limit to use when fetching data from the source",
+            envvar="SQL_LIMIT",
+        ),
+    ] = None,  # type: ignore
+    sql_exclude_columns: Annotated[
+        Optional[list[str]],
+        typer.Option(
+            help="The columns to exclude from the source table",
+            envvar="SQL_EXCLUDE_COLUMNS",
+        ),
+    ] = [],  # type: ignore
 ):
+    import hashlib
+    import tempfile
+    from datetime import datetime
+
+    import dlt
+    import humanize
+    import typer
+    from dlt.common.destination import Destination
+    from dlt.common.pipeline import LoadInfo
+    from dlt.common.runtime.collector import Collector, LogCollector
+    from dlt.common.schema.typing import TColumnSchema
+
+    from ingestr.src.factory import SourceDestinationFactory
+    from ingestr.src.telemetry.event import track
+
+    def report_errors(run_info: LoadInfo):
+        for load_package in run_info.load_packages:
+            failed_jobs = load_package.jobs["failed_jobs"]
+            if len(failed_jobs) == 0:
+                continue
+
+            print()
+            print("[bold red]Failed jobs:[/bold red]")
+            print()
+            for job in failed_jobs:
+                print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
+                print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")
+
+            raise typer.Exit(1)
+
+    def validate_source_dest_tables(
+        source_table: str, dest_table: str
+    ) -> tuple[str, str]:
+        if not dest_table:
+            if len(source_table.split(".")) != 2:
+                print(
+                    "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
+                )
+                raise typer.Abort()
+
+            print()
+            print(
+                "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
+            )
+            dest_table = source_table
+        return (source_table, dest_table)
+
+    def validate_loader_file_format(
+        dlt_dest: Destination, loader_file_format: Optional[LoaderFileFormat]
+    ):
+        if (
+            loader_file_format
+            and loader_file_format.value
+            not in dlt_dest.capabilities().supported_loader_file_formats
+        ):
+            print(
+                f"[red]Loader file format {loader_file_format.value} is not supported by the destination.[/red]"
+            )
+            raise typer.Abort()
+
     track(
         "command_triggered",
         {
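The heavy imports (`dlt`, `humanize`, the source/destination factory) now happen inside `ingest()` rather than at module import time, and two validators are pulled out of the main flow. The loader-file-format check relies on the destination's declared capabilities; a minimal sketch of that check outside ingestr, assuming a local DuckDB destination (the format name is only illustrative):

```python
# Sketch: what validate_loader_file_format verifies, shown against a standalone
# dlt destination. "parquet" is an example format, not an ingestr default.
import dlt

dest = dlt.destinations.duckdb("example.duckdb")
supported = dest.capabilities().supported_loader_file_formats
if "parquet" not in supported:
    raise SystemExit(f"parquet not supported here; destination accepts: {supported}")
```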
@@ -259,6 +355,13 @@ def ingest(
         },
     )
 
+    clean_sql_exclude_columns = []
+    if sql_exclude_columns:
+        for col in sql_exclude_columns:
+            for possible_col in col.split(","):
+                clean_sql_exclude_columns.append(possible_col.strip())
+    sql_exclude_columns = clean_sql_exclude_columns
+
     dlt.config["data_writer.buffer_max_items"] = page_size
     dlt.config["data_writer.file_max_items"] = loader_file_size
     dlt.config["extract.workers"] = extract_parallelism
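`--sql-exclude-columns` can be given multiple times and/or as a comma-separated list; the block above flattens both forms into a single list of trimmed column names. For example (column names are made up):

```python
# Sketch of the normalization above: repeated flags and comma-separated values
# collapse into one flat list.
sql_exclude_columns = ["password_hash, ssn", "internal_notes"]

clean = []
for col in sql_exclude_columns:
    for possible_col in col.split(","):
        clean.append(possible_col.strip())

print(clean)  # ['password_hash', 'ssn', 'internal_notes']
```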
@@ -267,29 +370,23 @@ def ingest(
     dlt.config["schema.naming"] = schema_naming.value
 
     try:
-        if not dest_table:
-            if len(source_table.split(".")) != 2:
-                print(
-                    "[red]Table name must be in the format schema.table for source table when dest-table is not given.[/red]"
-                )
-                raise typer.Abort()
-
-            print()
-            print(
-                "[yellow]Destination table is not given, defaulting to the source table.[/yellow]"
-            )
-            dest_table = source_table
+        (source_table, dest_table) = validate_source_dest_tables(
+            source_table, dest_table
+        )
 
         factory = SourceDestinationFactory(source_uri, dest_uri)
         source = factory.get_source()
         destination = factory.get_destination()
 
+        column_hints: dict[str, TColumnSchema] = {}
         original_incremental_strategy = incremental_strategy
 
         merge_key = None
         if incremental_strategy == IncrementalStrategy.delete_insert:
             merge_key = incremental_key
             incremental_strategy = IncrementalStrategy.merge
+            if incremental_key:
+                column_hints[incremental_key] = {"merge_key": True}
 
         m = hashlib.sha256()
         m.update(dest_table.encode("utf-8"))
@@ -303,11 +400,31 @@ def ingest(
             pipelines_dir = tempfile.mkdtemp()
             is_pipelines_dir_temp = True
 
+        dlt_dest = destination.dlt_dest(uri=dest_uri)
+        validate_loader_file_format(dlt_dest, loader_file_format)
+
+        if partition_by:
+            if partition_by not in column_hints:
+                column_hints[partition_by] = {}
+
+            column_hints[partition_by]["partition"] = True
+
+        if cluster_by:
+            if cluster_by not in column_hints:
+                column_hints[cluster_by] = {}
+
+            column_hints[cluster_by]["cluster"] = True
+
+        if primary_key:
+            for key in primary_key:
+                if key not in column_hints:
+                    column_hints[key] = {}
+
+                column_hints[key]["primary_key"] = True
+
         pipeline = dlt.pipeline(
             pipeline_name=m.hexdigest(),
-            destination=destination.dlt_dest(
-                uri=dest_uri,
-            ),
+            destination=dlt_dest,
             progress=progressInstance,
             pipelines_dir=pipelines_dir,
             refresh="drop_resources" if full_refresh else None,
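The new `--partition-by` and `--cluster-by` options, together with `--primary-key`, are folded into a single `column_hints` mapping that is later handed to `pipeline.run(columns=...)` (see the last hunk of this file). A sketch of what the mapping looks like for hypothetical columns:

```python
# Sketch: column_hints produced for
#   --partition-by updated_at --cluster-by country --primary-key id
# (the column names are hypothetical).
from dlt.common.schema.typing import TColumnSchema

column_hints: dict[str, TColumnSchema] = {
    "updated_at": {"partition": True},
    "country": {"cluster": True},
    "id": {"primary_key": True},
}
```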
@@ -365,8 +482,18 @@ def ingest(
             interval_end=interval_end,
             sql_backend=sql_backend.value,
             page_size=page_size,
+            sql_reflection_level=sql_reflection_level.value,
+            sql_limit=sql_limit,
+            sql_exclude_columns=sql_exclude_columns,
         )
 
+        if hasattr(dlt_source, "selected_resources") and dlt_source.selected_resources:
+            resource_names = list(dlt_source.selected_resources.keys())
+            for res in resource_names:
+                dlt_source.resources[res].add_map(cast_set_to_list)
+        else:
+            dlt_source.add_map(cast_set_to_list)
+
         if original_incremental_strategy == IncrementalStrategy.delete_insert:
             dlt_source.incremental.primary_key = ()
 
@@ -397,32 +524,21 @@ def ingest(
             ),
             write_disposition=write_disposition,  # type: ignore
             primary_key=(primary_key if primary_key and len(primary_key) > 0 else None),  # type: ignore
-            loader_file_format=loader_file_format.value
-            if loader_file_format is not None
-            else None,  # type: ignore
+            loader_file_format=(
+                loader_file_format.value if loader_file_format is not None else None  # type: ignore
+            ),  # type: ignore
+            columns=column_hints,
         )
 
-        for load_package in run_info.load_packages:
-            failed_jobs = load_package.jobs["failed_jobs"]
-            if len(failed_jobs) > 0:
-                print()
-                print("[bold red]Failed jobs:[/bold red]")
-                print()
-                for job in failed_jobs:
-                    print(f"[bold red] {job.job_file_info.job_id()}[/bold red]")
-                    print(f" [bold yellow]Error:[/bold yellow] {job.failed_message}")
-
-                raise typer.Exit(1)
+        report_errors(run_info)
 
         destination.post_load()
 
         end_time = datetime.now()
         elapsedHuman = ""
-        if run_info.started_at:
-            elapsed = end_time - start_time
-            elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
+        elapsed = end_time - start_time
+        elapsedHuman = f"in {humanize.precisedelta(elapsed)}"
 
-        # remove the pipelines_dir folder if it was created by ingestr
         if is_pipelines_dir_temp:
             import shutil
 
@@ -82,7 +82,7 @@ def adjust_source(
         type_hints[metric] = KNOWN_TYPE_HINTS[metric]
 
     @dlt.resource(
-        write_disposition={"disposition": "merge", "strategy": "delete+insert"},
+        write_disposition={"disposition": "merge", "strategy": "delete-insert"},
         merge_key=merge_key,
         primary_key=dimensions,
         columns=type_hints,
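dlt spells this merge strategy with a hyphen, so `delete+insert` is corrected to `delete-insert`. A minimal standalone resource using the corrected spelling (the resource name and rows are made up for illustration):

```python
# Sketch: a dlt resource with the corrected merge strategy name.
import dlt

@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "delete-insert"},
    primary_key="id",
)
def example_rows():
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]
```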
ingestr/src/filters.py ADDED
@@ -0,0 +1,21 @@
+from dlt.common.libs.sql_alchemy import Table
+
+
+def cast_set_to_list(row):
+    # this handles just the sqlalchemy backend for now
+    if isinstance(row, dict):
+        for key in row.keys():
+            if isinstance(row[key], set):
+                row[key] = list(row[key])
+    return row
+
+
+def table_adapter_exclude_columns(cols: list[str]):
+    print("given cols", cols)
+
+    def excluder(table: Table):
+        cols_to_remove = [col for col in table._columns if col.name in cols]  # type: ignore
+        for col in cols_to_remove:
+            table._columns.remove(col)  # type: ignore
+
+    return excluder
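`cast_set_to_list` is attached to every resource via `add_map` (see main.py above) because Python sets, which SQLAlchemy can yield for columns such as MySQL `SET`, are not JSON serializable. `table_adapter_exclude_columns` returns a callback that strips columns from a reflected SQLAlchemy `Table` before extraction. A short sketch of both, with made-up table and column names:

```python
# Sketch: exercising the two helpers directly. Table/column names are illustrative.
import json

import sqlalchemy as sa

from ingestr.src.filters import cast_set_to_list, table_adapter_exclude_columns

# Sets become lists so the row can be serialized for loading.
row = {"id": 1, "tags": {"red", "blue"}}
print(json.dumps(cast_set_to_list(row)))  # e.g. {"id": 1, "tags": ["blue", "red"]}

# Columns named in the exclude list are dropped from the reflected table.
meta = sa.MetaData()
users = sa.Table(
    "users",
    meta,
    sa.Column("id", sa.Integer),
    sa.Column("email", sa.String),
    sa.Column("password_hash", sa.String),
)
table_adapter_exclude_columns(["password_hash"])(users)
print([c.name for c in users.columns])  # expected: ['id', 'email']
```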
@@ -99,12 +99,12 @@ def gorgias_source(
                 "description": "When the user was last updated.",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information associated with the user.",
             },
             "data": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Additional data associated with the user.",
             },
@@ -185,17 +185,17 @@ def gorgias_source(
                 "description": "Indicates if the ticket was created by an agent",
             },
             "customer": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "The customer linked to the ticket.",
             },
             "assignee_user": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "User assigned to the ticket",
             },
             "assignee_team": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Team assigned to the ticket",
             },
@@ -210,17 +210,17 @@ def gorgias_source(
                 "description": "Excerpt of the ticket",
             },
             "integrations": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "Integration information related to the ticket",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information related to the ticket",
             },
             "tags": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "Tags associated with the ticket",
             },
@@ -354,7 +354,7 @@ def gorgias_source(
                 "description": "How the message has been received, or sent from Gorgias.",
             },
             "sender": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": False,
                 "description": "The person who sent the message. It can be a user or a customer.",
             },
@@ -364,7 +364,7 @@ def gorgias_source(
                 "description": "ID of the integration that either received or sent the message.",
             },
             "intents": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "",
             },
@@ -379,7 +379,7 @@ def gorgias_source(
                 "description": "Whether the message was sent by your company to a customer, or the opposite.",
             },
             "receiver": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "The primary receiver of the message. It can be a user or a customer. Optional when the source type is 'internal-note'.",
             },
@@ -414,27 +414,27 @@ def gorgias_source(
                 "description": "",
             },
             "headers": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Headers of the message",
             },
             "attachments": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of files attached to the message.",
             },
             "actions": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of actions performed on the message.",
             },
             "macros": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "A list of macros",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Message metadata",
             },
@@ -526,7 +526,7 @@ def gorgias_source(
                 "description": "ID of the customer linked to the survey.",
             },
             "meta": {
-                "data_type": "complex",
+                "data_type": "json",
                 "nullable": True,
                 "description": "Meta information associated with the survey.",
             },
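The Gorgias column hints switch from the deprecated `complex` data type to `json`, the name dlt now uses for nested/JSON columns. In isolation, such a hint looks like this (the column name is illustrative):

```python
# Sketch: a dlt column hint using the renamed "json" data type.
columns = {
    "meta": {
        "data_type": "json",
        "nullable": True,
        "description": "Arbitrary nested metadata.",
    },
}
```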