ingestr 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

ingestr/src/sql_database/__init__.py CHANGED
@@ -1,28 +1,121 @@
  """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""

- from typing import Any, Optional, Union
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union

  import dlt
+ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
  from dlt.sources import DltResource
- from dlt.sources.credentials import ConnectionStringCredentials
  from sqlalchemy import MetaData, Table
  from sqlalchemy.engine import Engine

  from .helpers import (
+     SqlDatabaseTableConfiguration,
+     SqlTableResourceConfiguration,
+     TableBackend,
      engine_from_credentials,
      get_primary_key,
      table_rows,
  )
+ from .override import IngestrConnectionStringCredentials
  from .schema_types import table_to_columns


+ @dlt.source
+ def sql_database(
+     credentials: Union[
+         IngestrConnectionStringCredentials, Engine, str
+     ] = dlt.secrets.value,
+     schema: Optional[str] = dlt.config.value,
+     metadata: Optional[MetaData] = None,
+     table_names: Optional[List[str]] = dlt.config.value,
+     chunk_size: int = 50000,
+     backend: TableBackend = "sqlalchemy",
+     detect_precision_hints: Optional[bool] = dlt.config.value,
+     defer_table_reflect: Optional[bool] = dlt.config.value,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
+ ) -> Iterable[DltResource]:
+     """
+     A dlt source which loads data from an SQL database using SQLAlchemy.
+     Resources are automatically created for each table in the schema or from the given list of tables.
+
+     Args:
+         credentials (Union[IngestrConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
+         schema (Optional[str]): Name of the database schema to load (if different from default).
+         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
+         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
+         chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+         backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
+         detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+             This is disabled by default.
+         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
+             Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
+         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+     Returns:
+         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
+     """
+
+     # set up alchemy engine
+     engine = engine_from_credentials(credentials)
+     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
+     metadata = metadata or MetaData(schema=schema)
+
+     # use provided tables or all tables
+     if table_names:
+         tables = [
+             Table(name, metadata, autoload_with=None if defer_table_reflect else engine)
+             for name in table_names
+         ]
+     else:
+         if defer_table_reflect:
+             raise ValueError("You must pass table names to defer table reflection")
+         metadata.reflect(bind=engine)
+         tables = list(metadata.tables.values())
+
+     for table in tables:
+         if table_adapter_callback and not defer_table_reflect:
+             table_adapter_callback(table)
+         yield dlt.resource(
+             table_rows,
+             name=table.name,
+             primary_key=get_primary_key(table),
+             spec=SqlDatabaseTableConfiguration,
+             columns=table_to_columns(table, detect_precision_hints),
+         )(
+             engine,
+             table,
+             chunk_size,
+             backend,
+             detect_precision_hints=detect_precision_hints,
+             defer_table_reflect=defer_table_reflect,
+             table_adapter_callback=table_adapter_callback,
+             backend_kwargs=backend_kwargs,
+         )
+
+
+ @dlt.sources.config.with_config(
+     sections=("sources", "sql_database"),
+     spec=SqlTableResourceConfiguration,
+     sections_merge_style=ConfigSectionContext.resource_merge_style,
+ )
  def sql_table(
-     credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
+     credentials: Union[
+         IngestrConnectionStringCredentials, Engine, str
+     ] = dlt.secrets.value,
      table: str = dlt.config.value,
      schema: Optional[str] = dlt.config.value,
      metadata: Optional[MetaData] = None,
      incremental: Optional[dlt.sources.incremental[Any]] = None,
+     chunk_size: int = 1000,
+     backend: TableBackend = "sqlalchemy",
      detect_precision_hints: Optional[bool] = dlt.config.value,
+     defer_table_reflect: Optional[bool] = dlt.config.value,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
      merge_key: Optional[str] = None,
  ) -> DltResource:
      """
@@ -35,26 +128,45 @@ def sql_table(
          metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
          incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
              E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
-         write_disposition (str): Write disposition of the resource.
+         chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+         backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
          detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
              This is disabled by default.
+         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
+             on dlt 0.4.4 and later
+         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.

      Returns:
          DltResource: The dlt resource for loading data from the SQL database table.
      """
-     if not isinstance(credentials, Engine):
-         engine = engine_from_credentials(credentials)
-     else:
-         engine = credentials
-     engine.execution_options(stream_results=True)
+     engine = engine_from_credentials(credentials)
+     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
      metadata = metadata or MetaData(schema=schema)

-     table_obj = Table(table, metadata, autoload_with=engine)
+     table_obj = Table(
+         table, metadata, autoload_with=None if defer_table_reflect else engine
+     )
+     if table_adapter_callback and not defer_table_reflect:
+         table_adapter_callback(table_obj)

      return dlt.resource(
          table_rows,
          name=table_obj.name,
          primary_key=get_primary_key(table_obj),
-         columns=table_to_columns(table_obj) if detect_precision_hints else None,  # type: ignore
-         merge_key=merge_key,  # type: ignore
-     )(engine, table_obj, incremental=incremental)
+         columns=table_to_columns(table_obj, detect_precision_hints),
+         merge_key=merge_key,
+     )(
+         engine,
+         table_obj,
+         chunk_size,
+         backend,
+         incremental=incremental,
+         detect_precision_hints=detect_precision_hints,
+         defer_table_reflect=defer_table_reflect,
+         table_adapter_callback=table_adapter_callback,
+         backend_kwargs=backend_kwargs,
+     )
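
The rewritten `sql_table` (and the new `sql_database` source) now accepts `chunk_size`, `backend`, `defer_table_reflect`, `table_adapter_callback` and `backend_kwargs` on top of the previous arguments. A minimal sketch of what a caller might look like after this release, assuming the source is importable as `ingestr.src.sql_database`; the pipeline name, destination, connection string and table name below are placeholders, not part of the package:

import dlt

from ingestr.src.sql_database import sql_table

# Placeholder pipeline settings -- replace with your own.
pipeline = dlt.pipeline(
    pipeline_name="sql_to_duckdb",
    destination="duckdb",
    dataset_name="raw",
)

# Load one table incrementally, yielding pyarrow record batches of 10k rows
# (requires pyarrow to be installed for the "pyarrow" backend).
events = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # placeholder DSN
    table="events",
    schema="public",
    chunk_size=10_000,
    backend="pyarrow",
    incremental=dlt.sources.incremental("updated_at"),
)

if __name__ == "__main__":
    print(pipeline.run(events))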
ingestr/src/sql_database/helpers.py CHANGED
@@ -3,33 +3,49 @@
  import operator
  from typing import (
      Any,
+     Callable,
+     Dict,
      Iterator,
      List,
+     Literal,
      Optional,
      Union,
  )

  import dlt
  from dlt.common.configuration.specs import BaseConfiguration, configspec
+ from dlt.common.exceptions import MissingDependencyException
+ from dlt.common.schema import TTableSchemaColumns
  from dlt.common.typing import TDataItem
  from dlt.sources.credentials import ConnectionStringCredentials
  from sqlalchemy import Table, create_engine
  from sqlalchemy.engine import Engine
- from sqlalchemy.sql import Select

- from .settings import DEFAULT_CHUNK_SIZE
+ from ingestr.src.sql_database.override import IngestrConnectionStringCredentials
+
+ from .schema_types import (
+     SelectAny,
+     row_tuples_to_arrow,
+     table_to_columns,
+ )
+
+ TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]


  class TableLoader:
      def __init__(
          self,
          engine: Engine,
+         backend: TableBackend,
          table: Table,
+         columns: TTableSchemaColumns,
          chunk_size: int = 1000,
          incremental: Optional[dlt.sources.incremental[Any]] = None,
      ) -> None:
          self.engine = engine
+         self.backend = backend
          self.table = table
+         self.columns = columns
          self.chunk_size = chunk_size
          self.incremental = incremental
          if incremental:
@@ -40,60 +56,152 @@ class TableLoader:
                  f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'"
              ) from e
              self.last_value = incremental.last_value
+             self.end_value = incremental.end_value
+             self.row_order = getattr(self.incremental, "row_order", None)
          else:
-             self.cursor_column = None  # type: ignore
+             self.cursor_column = None
              self.last_value = None
+             self.end_value = None
+             self.row_order = None

-     def make_query(self) -> Select[Any]:  # type: ignore
+     def make_query(self) -> SelectAny:
          table = self.table
          query = table.select()
          if not self.incremental:
              return query
          last_value_func = self.incremental.last_value_func
+
+         # generate where
          if (
              last_value_func is max
          ):  # Query ordered and filtered according to last_value function
-             order_by = self.cursor_column.asc()
              filter_op = operator.ge
+             filter_op_end = operator.lt
          elif last_value_func is min:
-             order_by = self.cursor_column.desc()
              filter_op = operator.le
+             filter_op_end = operator.gt
          else:  # Custom last_value, load everything and let incremental handle filtering
              return query
-         query = query.order_by(order_by)
-         if self.last_value is None:
-             return query
-         return query.where(filter_op(self.cursor_column, self.last_value))

-     def load_rows(self) -> Iterator[List[TDataItem]]:
+         if self.last_value is not None:
+             query = query.where(filter_op(self.cursor_column, self.last_value))
+             if self.end_value is not None:
+                 query = query.where(filter_op_end(self.cursor_column, self.end_value))
+
+         # generate order by from declared row order
+         order_by = None
+         if self.row_order == "asc":
+             order_by = self.cursor_column.asc()
+         elif self.row_order == "desc":
+             order_by = self.cursor_column.desc()
+         if order_by is not None:
+             query = query.order_by(order_by)
+
+         return query
+
+     def load_rows(self, backend_kwargs: Dict[str, Any] = None) -> Iterator[TDataItem]:
+         # make copy of kwargs
+         backend_kwargs = dict(backend_kwargs or {})
          query = self.make_query()
+         if self.backend == "connectorx":
+             yield from self._load_rows_connectorx(query, backend_kwargs)
+         else:
+             yield from self._load_rows(query, backend_kwargs)
+
+     def _load_rows(self, query: SelectAny, backend_kwargs: Dict[str, Any]) -> TDataItem:
          with self.engine.connect() as conn:
              result = conn.execution_options(yield_per=self.chunk_size).execute(query)
+             # NOTE: cursor returns not normalized column names! may be quite useful in case of Oracle dialect
+             # that normalizes columns
+             # columns = [c[0] for c in result.cursor.description]
+             columns = list(result.keys())
              for partition in result.partitions(size=self.chunk_size):
-                 yield [dict(row._mapping) for row in partition]
+                 if self.backend == "sqlalchemy":
+                     yield [dict(row._mapping) for row in partition]
+                 elif self.backend == "pandas":
+                     from dlt.common.libs.pandas_sql import _wrap_result
+
+                     yield _wrap_result(
+                         partition,
+                         columns,
+                         **{"dtype_backend": "pyarrow", **backend_kwargs},
+                     )
+                 elif self.backend == "pyarrow":
+                     yield row_tuples_to_arrow(
+                         partition, self.columns, tz=backend_kwargs.get("tz")
+                     )
+
+     def _load_rows_connectorx(
+         self, query: SelectAny, backend_kwargs: Dict[str, Any]
+     ) -> Iterator[TDataItem]:
+         try:
+             import connectorx as cx  # type: ignore
+         except ImportError:
+             raise MissingDependencyException(
+                 "Connector X table backend", ["connectorx"]
+             )
+
+         # default settings
+         backend_kwargs = {
+             "return_type": "arrow2",
+             "protocol": "binary",
+             **backend_kwargs,
+         }
+         conn = backend_kwargs.pop(
+             "conn",
+             self.engine.url._replace(
+                 drivername=self.engine.url.get_backend_name()
+             ).render_as_string(hide_password=False),
+         )
+         df = cx.read_sql(
+             conn,
+             str(query.compile(self.engine, compile_kwargs={"literal_binds": True})),
+             **backend_kwargs,
+         )
+         yield df


  def table_rows(
      engine: Engine,
      table: Table,
-     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_size: int,
+     backend: TableBackend,
      incremental: Optional[dlt.sources.incremental[Any]] = None,
+     detect_precision_hints: bool = False,
+     defer_table_reflect: bool = False,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
  ) -> Iterator[TDataItem]:
-     """
-     A DLT source which loads data from an SQL database using SQLAlchemy.
-     Resources are automatically created for each table in the schema or from the given list of tables.
+     columns: TTableSchemaColumns = None
+     if defer_table_reflect:
+         table = Table(
+             table.name, table.metadata, autoload_with=engine, extend_existing=True
+         )
+         if table_adapter_callback:
+             table_adapter_callback(table)
+         columns = table_to_columns(table, detect_precision_hints)

-     Args:
-         credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
-         schema (Optional[str]): Name of the database schema to load (if different from default).
-         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
-         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
+         # set the primary_key in the incremental
+         if incremental and incremental.primary_key is None:
+             primary_key = get_primary_key(table)
+             if primary_key is not None:
+                 incremental.primary_key = primary_key
+         # yield empty record to set hints
+         yield dlt.mark.with_hints(
+             [],
+             dlt.mark.make_hints(
+                 primary_key=get_primary_key(table),
+                 columns=columns,
+             ),
+         )
+     else:
+         # table was already reflected
+         columns = table_to_columns(table, detect_precision_hints)

-     Returns:
-         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
-     """
-     loader = TableLoader(engine, table, incremental=incremental, chunk_size=chunk_size)
-     yield from loader.load_rows()
+     loader = TableLoader(
+         engine, backend, table, columns, incremental=incremental, chunk_size=chunk_size
+     )
+     yield from loader.load_rows(backend_kwargs)


  def engine_from_credentials(
@@ -107,7 +215,31 @@ def engine_from_credentials(


  def get_primary_key(table: Table) -> List[str]:
-     return [c.name for c in table.primary_key]
+     """Create primary key or return None if no key defined"""
+     primary_key = [c.name for c in table.primary_key]
+     return primary_key if len(primary_key) > 0 else None
+
+
+ def unwrap_json_connector_x(field: str) -> TDataItem:
+     """Creates a transform function to be added with `add_map` that will unwrap JSON columns
+     ingested via connectorx. Such columns are additionally quoted and translate SQL NULL to json "null"
+     """
+     import pyarrow as pa
+     import pyarrow.compute as pc
+
+     def _unwrap(table: TDataItem) -> TDataItem:
+         col_index = table.column_names.index(field)
+         # remove quotes
+         column = pc.replace_substring_regex(table[field], '"(.*)"', "\\1")
+         # convert json null to null
+         column = pc.replace_with_mask(
+             column,
+             pc.equal(column, "null").combine_chunks(),
+             pa.scalar(None, pa.large_string()),
+         )
+         return table.set_column(col_index, table.schema.field(col_index), column)
+
+     return _unwrap


  @configspec
@@ -117,10 +249,10 @@ class SqlDatabaseTableConfiguration(BaseConfiguration):

  @configspec
  class SqlTableResourceConfiguration(BaseConfiguration):
-     credentials: ConnectionStringCredentials
-     table: str
+     credentials: IngestrConnectionStringCredentials = None
+     table: str = None
      incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-     schema: Optional[str]
+     schema: Optional[str] = None


  __source_name__ = "sql_database"
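
The new `unwrap_json_connector_x` helper is documented as a transform to attach with `add_map` when the connectorx backend returns JSON columns as quoted strings with SQL NULL rendered as "null". A hedged sketch of wiring it up; the import path follows the package layout above, and the DSN, table name and `payload` column are illustrative:

from ingestr.src.sql_database import sql_table
from ingestr.src.sql_database.helpers import unwrap_json_connector_x

# connectorx yields arrow tables; the transform strips the extra quoting from a
# JSON column and restores real nulls on each yielded table.
orders = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # placeholder DSN
    table="orders",  # placeholder table
    backend="connectorx",
)
orders.add_map(unwrap_json_connector_x("payload"))  # "payload" is a hypothetical JSON column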
ingestr/src/sql_database/override.py ADDED
@@ -0,0 +1,9 @@
+ from typing import Optional
+
+ from dlt.common.configuration.specs.base_configuration import configspec
+ from dlt.sources.credentials import ConnectionStringCredentials
+
+
+ @configspec(init=False)
+ class IngestrConnectionStringCredentials(ConnectionStringCredentials):
+     username: Optional[str] = None
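
The override only relaxes `ConnectionStringCredentials` by giving `username` a `None` default, so connection strings that carry no user (file-based DSNs such as SQLite or DuckDB paths) can still be used as credentials. A small sketch, assuming no-argument construction and `parse_native_representation` behave as in the dlt base class:

from ingestr.src.sql_database.override import IngestrConnectionStringCredentials

# A file-based DSN has no username; the overridden spec tolerates that because
# `username` now defaults to None instead of being a required field.
creds = IngestrConnectionStringCredentials()
creds.parse_native_representation("sqlite:///local.db")  # placeholder DSN
print(creds.drivername, creds.database)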
ingestr/src/sql_database/schema_types.py CHANGED
@@ -1,54 +1,162 @@
- from typing import Any, Optional
+ from typing import TYPE_CHECKING, Any, Optional, Sequence, Type

+ from dlt.common import logger
+ from dlt.common.configuration import with_config
+ from dlt.common.destination import DestinationCapabilitiesContext
  from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns
  from sqlalchemy import Column, Table
- from sqlalchemy.sql import sqltypes
+ from sqlalchemy.engine import Row
+ from sqlalchemy.sql import Select, sqltypes
+ from typing_extensions import TypeAlias

+ # optionally create generics with any so they can be imported by dlt importer
+ if TYPE_CHECKING:
+     SelectAny: TypeAlias = Select[Any]
+     ColumnAny: TypeAlias = Column[Any]
+     RowAny: TypeAlias = Row[Any]
+ else:
+     SelectAny: TypeAlias = Type[Any]
+     ColumnAny: TypeAlias = Type[Any]
+     RowAny: TypeAlias = Type[Any]

- def sqla_col_to_column_schema(sql_col: Column[Any]) -> Optional[TColumnSchema]:
+
+ def sqla_col_to_column_schema(
+     sql_col: ColumnAny, add_precision: bool = False
+ ) -> Optional[TColumnSchema]:
      """Infer dlt schema column type from an sqlalchemy type.

-     Precision and scale is inferred from that types that support it,
-     such as numeric, varchar, int, bigint
+     If `add_precision` is set, precision and scale is inferred from that types that support it,
+     such as numeric, varchar, int, bigint. Numeric (decimal) types have always precision added.
      """
      sql_t = sql_col.type
-     col = None
+     col: TColumnSchema = {
+         "name": sql_col.name,
+         "data_type": None,  # set that later
+         "nullable": sql_col.nullable,
+     }

-     if isinstance(sql_t, sqltypes.BigInteger):
-         col = dict(name=sql_col.name, data_type="bigint", precision=64)
-     elif isinstance(sql_t, sqltypes.SmallInteger):
-         col = dict(name=sql_col.name, data_type="bigint", precision=16)
+     if isinstance(sql_t, sqltypes.SmallInteger):
+         col["data_type"] = "bigint"
+         if add_precision:
+             col["precision"] = 32
      elif isinstance(sql_t, sqltypes.Integer):
-         col = dict(name=sql_col.name, data_type="bigint", precision=32)
-     elif isinstance(sql_t, sqltypes.Numeric) and not isinstance(sql_t, sqltypes.Float):
-         col = dict(
-             name=sql_col.name,
-             data_type="decimal",
-             precision=sql_t.precision,
-             scale=sql_t.scale,
-         )
+         col["data_type"] = "bigint"
+     elif isinstance(sql_t, sqltypes.Numeric):
+         # dlt column type depends on the data returned by the sql alchemy dialect
+         # and not on the metadata reflected in the database. all Numeric types
+         # that are returned as floats will assume "double" type
+         # and returned as decimals will assume "decimal" type
+         if sql_t.asdecimal is False:
+             col["data_type"] = "double"
+         else:
+             col["data_type"] = "decimal"
+             if sql_t.precision is not None:
+                 col["precision"] = sql_t.precision
+                 # must have a precision for any meaningful scale
+                 if sql_t.scale is not None:
+                     col["scale"] = sql_t.scale
+                 elif sql_t.decimal_return_scale is not None:
+                     col["scale"] = sql_t.decimal_return_scale
      elif isinstance(sql_t, sqltypes.String):
-         col = dict(name=sql_col.name, data_type="text", precision=sql_t.length)
+         col["data_type"] = "text"
+         if add_precision and sql_t.length:
+             col["precision"] = sql_t.length
      elif isinstance(sql_t, sqltypes._Binary):
-         col = dict(name=sql_col.name, data_type="binary", precision=sql_t.length)
+         col["data_type"] = "binary"
+         if add_precision and sql_t.length:
+             col["precision"] = sql_t.length
      elif isinstance(sql_t, sqltypes.DateTime):
-         col = dict(name=sql_col.name, data_type="timestamp")
+         col["data_type"] = "timestamp"
      elif isinstance(sql_t, sqltypes.Date):
-         col = dict(name=sql_col.name, data_type="date")
+         col["data_type"] = "date"
      elif isinstance(sql_t, sqltypes.Time):
-         col = dict(name=sql_col.name, data_type="time")
+         col["data_type"] = "time"
+     elif isinstance(sql_t, sqltypes.JSON):
+         col["data_type"] = "complex"
+     elif isinstance(sql_t, sqltypes.Boolean):
+         col["data_type"] = "bool"
+     else:
+         logger.warning(
+             f"A column with name {sql_col.name} contains unknown data type {sql_t} which cannot be mapped to `dlt` data type. When using sqlalchemy backend such data will be passed to the normalizer. In case of `pyarrow` backend such data will be ignored. In case of other backends, the behavior is backend-specific."
+         )
+         col = None
      if col:
          return {key: value for key, value in col.items() if value is not None}  # type: ignore[return-value]
      return None


- def table_to_columns(table: Table) -> TTableSchemaColumns:
+ def table_to_columns(table: Table, add_precision: bool = False) -> TTableSchemaColumns:
      """Convert an sqlalchemy table to a dlt table schema.

-     Only columns types supporting precision/scale are included in result.
+     Adds precision to columns when `add_precision` is set.
      """
      return {
-         col["name"]: col  # type: ignore
-         for col in (sqla_col_to_column_schema(c) for c in table.columns)
+         col["name"]: col
+         for col in (sqla_col_to_column_schema(c, add_precision) for c in table.columns)
          if col is not None
      }
+
+
+ @with_config
+ def columns_to_arrow(
+     columns_schema: TTableSchemaColumns,
+     caps: DestinationCapabilitiesContext = None,
+     tz: str = "UTC",
+ ) -> Any:
+     """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
+     is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
+     Otherwise generic capabilities are used
+     """
+     from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+     from dlt.common.libs.pyarrow import get_py_arrow_datatype
+     from dlt.common.libs.pyarrow import pyarrow as pa
+
+     return pa.schema(
+         [
+             pa.field(
+                 name,
+                 get_py_arrow_datatype(
+                     schema_item,
+                     caps or DestinationCapabilitiesContext.generic_capabilities(),
+                     tz,
+                 ),
+                 nullable=schema_item.get("nullable", True),
+             )
+             for name, schema_item in columns_schema.items()
+         ]
+     )
+
+
+ def row_tuples_to_arrow(
+     rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
+ ) -> Any:
+     import numpy as np
+     from dlt.common.libs.pyarrow import pyarrow as pa
+
+     arrow_schema = columns_to_arrow(columns, tz=tz)
+
+     try:
+         from pandas._libs import lib
+
+         pivoted_rows = lib.to_object_array_tuples(rows).T  # type: ignore[attr-defined]
+     except ImportError:
+         logger.info(
+             "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
+         )
+         pivoted_rows = np.asarray(rows, dtype="object", order="k").T  # type: ignore[call-overload]
+
+     columnar = {
+         col: dat.ravel()
+         for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
+     }
+     for idx in range(0, len(arrow_schema.names)):
+         field = arrow_schema.field(idx)
+         py_type = type(rows[0][idx])
+         # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
+         if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
+             logger.warning(
+                 f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
+             )
+             float_array = pa.array(columnar[field.name], type=pa.float64())
+             columnar[field.name] = float_array.cast(field.type, safe=False)
+     return pa.Table.from_pydict(columnar, schema=arrow_schema)
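
The reworked `sqla_col_to_column_schema` / `table_to_columns` pair now emits a dlt column schema (name, data type, nullability, and optional precision hints) for every mappable column instead of only precision-bearing ones. A quick sketch of inspecting that mapping outside a pipeline; the SQLite URL and `events` table are placeholders:

import sqlalchemy as sa

from ingestr.src.sql_database.schema_types import table_to_columns

# Reflect a table and convert it to a dlt column schema; add_precision=True
# also copies length/precision/scale hints where the SQL type carries them.
engine = sa.create_engine("sqlite:///local.db")  # placeholder database
metadata = sa.MetaData()
table = sa.Table("events", metadata, autoload_with=engine)  # placeholder table

columns = table_to_columns(table, add_precision=True)
for name, schema_item in columns.items():
    print(name, schema_item["data_type"], schema_item.get("precision"))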
ingestr/src/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.5"
+ __version__ = "0.3.0"
Two binary files changed (contents not shown).