ingestr 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

ingestr/src/sql_database/__init__.py CHANGED
@@ -1,28 +1,121 @@
  """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""

- from typing import Any, Optional, Union
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union

  import dlt
+ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
  from dlt.sources import DltResource
- from dlt.sources.credentials import ConnectionStringCredentials
  from sqlalchemy import MetaData, Table
  from sqlalchemy.engine import Engine

  from .helpers import (
+     SqlDatabaseTableConfiguration,
+     SqlTableResourceConfiguration,
+     TableBackend,
      engine_from_credentials,
      get_primary_key,
      table_rows,
  )
+ from .override import IngestrConnectionStringCredentials
  from .schema_types import table_to_columns


+ @dlt.source
+ def sql_database(
+     credentials: Union[
+         IngestrConnectionStringCredentials, Engine, str
+     ] = dlt.secrets.value,
+     schema: Optional[str] = dlt.config.value,
+     metadata: Optional[MetaData] = None,
+     table_names: Optional[List[str]] = dlt.config.value,
+     chunk_size: int = 50000,
+     backend: TableBackend = "sqlalchemy",
+     detect_precision_hints: Optional[bool] = dlt.config.value,
+     defer_table_reflect: Optional[bool] = dlt.config.value,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
+ ) -> Iterable[DltResource]:
+     """
+     A dlt source which loads data from an SQL database using SQLAlchemy.
+     Resources are automatically created for each table in the schema or from the given list of tables.
+
+     Args:
+         credentials (Union[IngestrConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
+         schema (Optional[str]): Name of the database schema to load (if different from default).
+         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
+         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
+         chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+         backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
+         detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+             This is disabled by default.
+         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
+             Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
+         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+     Returns:
+         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
+     """
+
+     # set up alchemy engine
+     engine = engine_from_credentials(credentials)
+     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
+     metadata = metadata or MetaData(schema=schema)
+
+     # use provided tables or all tables
+     if table_names:
+         tables = [
+             Table(name, metadata, autoload_with=None if defer_table_reflect else engine)
+             for name in table_names
+         ]
+     else:
+         if defer_table_reflect:
+             raise ValueError("You must pass table names to defer table reflection")
+         metadata.reflect(bind=engine)
+         tables = list(metadata.tables.values())
+
+     for table in tables:
+         if table_adapter_callback and not defer_table_reflect:
+             table_adapter_callback(table)
+         yield dlt.resource(
+             table_rows,
+             name=table.name,
+             primary_key=get_primary_key(table),
+             spec=SqlDatabaseTableConfiguration,
+             columns=table_to_columns(table, detect_precision_hints),
+         )(
+             engine,
+             table,
+             chunk_size,
+             backend,
+             detect_precision_hints=detect_precision_hints,
+             defer_table_reflect=defer_table_reflect,
+             table_adapter_callback=table_adapter_callback,
+             backend_kwargs=backend_kwargs,
+         )
+
+
+ @dlt.sources.config.with_config(
+     sections=("sources", "sql_database"),
+     spec=SqlTableResourceConfiguration,
+     sections_merge_style=ConfigSectionContext.resource_merge_style,
+ )
  def sql_table(
-     credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
+     credentials: Union[
+         IngestrConnectionStringCredentials, Engine, str
+     ] = dlt.secrets.value,
      table: str = dlt.config.value,
      schema: Optional[str] = dlt.config.value,
      metadata: Optional[MetaData] = None,
      incremental: Optional[dlt.sources.incremental[Any]] = None,
+     chunk_size: int = 1000,
+     backend: TableBackend = "sqlalchemy",
      detect_precision_hints: Optional[bool] = dlt.config.value,
+     defer_table_reflect: Optional[bool] = dlt.config.value,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
      merge_key: Optional[str] = None,
  ) -> DltResource:
      """
@@ -35,26 +128,45 @@ def sql_table(
          metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
          incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
              E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
-         write_disposition (str): Write disposition of the resource.
+         chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+         backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+             "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+             "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+             "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
          detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
              This is disabled by default.
+         defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
+             on dlt 0.4.4 and later
+         table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+         backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.

      Returns:
          DltResource: The dlt resource for loading data from the SQL database table.
      """
-     if not isinstance(credentials, Engine):
-         engine = engine_from_credentials(credentials)
-     else:
-         engine = credentials
-     engine.execution_options(stream_results=True)
+     engine = engine_from_credentials(credentials)
+     engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
      metadata = metadata or MetaData(schema=schema)

-     table_obj = Table(table, metadata, autoload_with=engine)
+     table_obj = Table(
+         table, metadata, autoload_with=None if defer_table_reflect else engine
+     )
+     if table_adapter_callback and not defer_table_reflect:
+         table_adapter_callback(table_obj)

      return dlt.resource(
          table_rows,
          name=table_obj.name,
          primary_key=get_primary_key(table_obj),
-         columns=table_to_columns(table_obj) if detect_precision_hints else None,  # type: ignore
-         merge_key=merge_key,  # type: ignore
-     )(engine, table_obj, incremental=incremental)
+         columns=table_to_columns(table_obj, detect_precision_hints),
+         merge_key=merge_key,
+     )(
+         engine,
+         table_obj,
+         chunk_size,
+         backend,
+         incremental=incremental,
+         detect_precision_hints=detect_precision_hints,
+         defer_table_reflect=defer_table_reflect,
+         table_adapter_callback=table_adapter_callback,
+         backend_kwargs=backend_kwargs,
+     )
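
The rewritten `sql_table` (and the new `sql_database` source) now accepts `chunk_size`, `backend`, `defer_table_reflect`, `table_adapter_callback` and `backend_kwargs` on top of the previous arguments. A minimal sketch of what a caller might look like after this release, assuming the source is importable as `ingestr.src.sql_database`; the pipeline name, destination, connection string and table name below are placeholders, not part of the package:

import dlt

from ingestr.src.sql_database import sql_table

# Placeholder pipeline settings -- replace with your own.
pipeline = dlt.pipeline(
    pipeline_name="sql_to_duckdb",
    destination="duckdb",
    dataset_name="raw",
)

# Load one table incrementally, yielding pyarrow record batches of 10k rows
# (requires pyarrow to be installed for the "pyarrow" backend).
events = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # placeholder DSN
    table="events",
    schema="public",
    chunk_size=10_000,
    backend="pyarrow",
    incremental=dlt.sources.incremental("updated_at"),
)

if __name__ == "__main__":
    print(pipeline.run(events))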
ingestr/src/sql_database/helpers.py CHANGED
@@ -3,33 +3,49 @@
  import operator
  from typing import (
      Any,
+     Callable,
+     Dict,
      Iterator,
      List,
+     Literal,
      Optional,
      Union,
  )

  import dlt
  from dlt.common.configuration.specs import BaseConfiguration, configspec
+ from dlt.common.exceptions import MissingDependencyException
+ from dlt.common.schema import TTableSchemaColumns
  from dlt.common.typing import TDataItem
  from dlt.sources.credentials import ConnectionStringCredentials
  from sqlalchemy import Table, create_engine
  from sqlalchemy.engine import Engine
- from sqlalchemy.sql import Select

- from .settings import DEFAULT_CHUNK_SIZE
+ from ingestr.src.sql_database.override import IngestrConnectionStringCredentials
+
+ from .schema_types import (
+     SelectAny,
+     row_tuples_to_arrow,
+     table_to_columns,
+ )
+
+ TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]


  class TableLoader:
      def __init__(
          self,
          engine: Engine,
+         backend: TableBackend,
          table: Table,
+         columns: TTableSchemaColumns,
          chunk_size: int = 1000,
          incremental: Optional[dlt.sources.incremental[Any]] = None,
      ) -> None:
          self.engine = engine
+         self.backend = backend
          self.table = table
+         self.columns = columns
          self.chunk_size = chunk_size
          self.incremental = incremental
          if incremental:
@@ -40,60 +56,152 @@ class TableLoader:
                  f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'"
              ) from e
              self.last_value = incremental.last_value
+             self.end_value = incremental.end_value
+             self.row_order = getattr(self.incremental, "row_order", None)
          else:
-             self.cursor_column = None  # type: ignore
+             self.cursor_column = None
              self.last_value = None
+             self.end_value = None
+             self.row_order = None

-     def make_query(self) -> Select[Any]:  # type: ignore
+     def make_query(self) -> SelectAny:
          table = self.table
          query = table.select()
          if not self.incremental:
              return query
          last_value_func = self.incremental.last_value_func
+
+         # generate where
          if (
              last_value_func is max
          ):  # Query ordered and filtered according to last_value function
-             order_by = self.cursor_column.asc()
              filter_op = operator.ge
+             filter_op_end = operator.lt
          elif last_value_func is min:
-             order_by = self.cursor_column.desc()
              filter_op = operator.le
+             filter_op_end = operator.gt
          else:  # Custom last_value, load everything and let incremental handle filtering
              return query
-         query = query.order_by(order_by)
-         if self.last_value is None:
-             return query
-         return query.where(filter_op(self.cursor_column, self.last_value))

-     def load_rows(self) -> Iterator[List[TDataItem]]:
+         if self.last_value is not None:
+             query = query.where(filter_op(self.cursor_column, self.last_value))
+             if self.end_value is not None:
+                 query = query.where(filter_op_end(self.cursor_column, self.end_value))
+
+         # generate order by from declared row order
+         order_by = None
+         if self.row_order == "asc":
+             order_by = self.cursor_column.asc()
+         elif self.row_order == "desc":
+             order_by = self.cursor_column.desc()
+         if order_by is not None:
+             query = query.order_by(order_by)
+
+         return query
+
+     def load_rows(self, backend_kwargs: Dict[str, Any] = None) -> Iterator[TDataItem]:
+         # make copy of kwargs
+         backend_kwargs = dict(backend_kwargs or {})
          query = self.make_query()
+         if self.backend == "connectorx":
+             yield from self._load_rows_connectorx(query, backend_kwargs)
+         else:
+             yield from self._load_rows(query, backend_kwargs)
+
+     def _load_rows(self, query: SelectAny, backend_kwargs: Dict[str, Any]) -> TDataItem:
          with self.engine.connect() as conn:
              result = conn.execution_options(yield_per=self.chunk_size).execute(query)
+             # NOTE: cursor returns not normalized column names! may be quite useful in case of Oracle dialect
+             # that normalizes columns
+             # columns = [c[0] for c in result.cursor.description]
+             columns = list(result.keys())
              for partition in result.partitions(size=self.chunk_size):
-                 yield [dict(row._mapping) for row in partition]
+                 if self.backend == "sqlalchemy":
+                     yield [dict(row._mapping) for row in partition]
+                 elif self.backend == "pandas":
+                     from dlt.common.libs.pandas_sql import _wrap_result
+
+                     yield _wrap_result(
+                         partition,
+                         columns,
+                         **{"dtype_backend": "pyarrow", **backend_kwargs},
+                     )
+                 elif self.backend == "pyarrow":
+                     yield row_tuples_to_arrow(
+                         partition, self.columns, tz=backend_kwargs.get("tz")
+                     )
+
+     def _load_rows_connectorx(
+         self, query: SelectAny, backend_kwargs: Dict[str, Any]
+     ) -> Iterator[TDataItem]:
+         try:
+             import connectorx as cx  # type: ignore
+         except ImportError:
+             raise MissingDependencyException(
+                 "Connector X table backend", ["connectorx"]
+             )
+
+         # default settings
+         backend_kwargs = {
+             "return_type": "arrow2",
+             "protocol": "binary",
+             **backend_kwargs,
+         }
+         conn = backend_kwargs.pop(
+             "conn",
+             self.engine.url._replace(
+                 drivername=self.engine.url.get_backend_name()
+             ).render_as_string(hide_password=False),
+         )
+         df = cx.read_sql(
+             conn,
+             str(query.compile(self.engine, compile_kwargs={"literal_binds": True})),
+             **backend_kwargs,
+         )
+         yield df


  def table_rows(
      engine: Engine,
      table: Table,
-     chunk_size: int = DEFAULT_CHUNK_SIZE,
+     chunk_size: int,
+     backend: TableBackend,
      incremental: Optional[dlt.sources.incremental[Any]] = None,
+     detect_precision_hints: bool = False,
+     defer_table_reflect: bool = False,
+     table_adapter_callback: Callable[[Table], None] = None,
+     backend_kwargs: Dict[str, Any] = None,
  ) -> Iterator[TDataItem]:
-     """
-     A DLT source which loads data from an SQL database using SQLAlchemy.
-     Resources are automatically created for each table in the schema or from the given list of tables.
+     columns: TTableSchemaColumns = None
+     if defer_table_reflect:
+         table = Table(
+             table.name, table.metadata, autoload_with=engine, extend_existing=True
+         )
+         if table_adapter_callback:
+             table_adapter_callback(table)
+         columns = table_to_columns(table, detect_precision_hints)

-     Args:
-         credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
-         schema (Optional[str]): Name of the database schema to load (if different from default).
-         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
-         table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
+         # set the primary_key in the incremental
+         if incremental and incremental.primary_key is None:
+             primary_key = get_primary_key(table)
+             if primary_key is not None:
+                 incremental.primary_key = primary_key
+         # yield empty record to set hints
+         yield dlt.mark.with_hints(
+             [],
+             dlt.mark.make_hints(
+                 primary_key=get_primary_key(table),
+                 columns=columns,
+             ),
+         )
+     else:
+         # table was already reflected
+         columns = table_to_columns(table, detect_precision_hints)

-     Returns:
-         Iterable[DltResource]: A list of DLT resources for each table to be loaded.
-     """
-     loader = TableLoader(engine, table, incremental=incremental, chunk_size=chunk_size)
-     yield from loader.load_rows()
+     loader = TableLoader(
+         engine, backend, table, columns, incremental=incremental, chunk_size=chunk_size
+     )
+     yield from loader.load_rows(backend_kwargs)


  def engine_from_credentials(
@@ -107,7 +215,31 @@ def engine_from_credentials(


  def get_primary_key(table: Table) -> List[str]:
-     return [c.name for c in table.primary_key]
+     """Create primary key or return None if no key defined"""
+     primary_key = [c.name for c in table.primary_key]
+     return primary_key if len(primary_key) > 0 else None
+
+
+ def unwrap_json_connector_x(field: str) -> TDataItem:
+     """Creates a transform function to be added with `add_map` that will unwrap JSON columns
+     ingested via connectorx. Such columns are additionally quoted and translate SQL NULL to json "null"
+     """
+     import pyarrow as pa
+     import pyarrow.compute as pc
+
+     def _unwrap(table: TDataItem) -> TDataItem:
+         col_index = table.column_names.index(field)
+         # remove quotes
+         column = pc.replace_substring_regex(table[field], '"(.*)"', "\\1")
+         # convert json null to null
+         column = pc.replace_with_mask(
+             column,
+             pc.equal(column, "null").combine_chunks(),
+             pa.scalar(None, pa.large_string()),
+         )
+         return table.set_column(col_index, table.schema.field(col_index), column)
+
+     return _unwrap


  @configspec
@@ -117,10 +249,10 @@ class SqlDatabaseTableConfiguration(BaseConfiguration):

  @configspec
  class SqlTableResourceConfiguration(BaseConfiguration):
-     credentials: ConnectionStringCredentials
-     table: str
+     credentials: IngestrConnectionStringCredentials = None
+     table: str = None
      incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-     schema: Optional[str]
+     schema: Optional[str] = None


  __source_name__ = "sql_database"
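
The new `unwrap_json_connector_x` helper is documented as a transform to attach with `add_map` when the connectorx backend returns JSON columns as quoted strings with SQL NULL rendered as "null". A hedged sketch of wiring it up; the import path follows the package layout above, and the DSN, table name and `payload` column are illustrative:

from ingestr.src.sql_database import sql_table
from ingestr.src.sql_database.helpers import unwrap_json_connector_x

# connectorx yields arrow tables; the transform strips the extra quoting from a
# JSON column and restores real nulls on each yielded table.
orders = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # placeholder DSN
    table="orders",  # placeholder table
    backend="connectorx",
)
orders.add_map(unwrap_json_connector_x("payload"))  # "payload" is a hypothetical JSON column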
ingestr/src/sql_database/override.py ADDED
@@ -0,0 +1,9 @@
+ from typing import Optional
+
+ from dlt.common.configuration.specs.base_configuration import configspec
+ from dlt.sources.credentials import ConnectionStringCredentials
+
+
+ @configspec(init=False)
+ class IngestrConnectionStringCredentials(ConnectionStringCredentials):
+     username: Optional[str] = None
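
The override only relaxes `ConnectionStringCredentials` by giving `username` a `None` default, so connection strings that carry no user (file-based DSNs such as SQLite or DuckDB paths) can still be used as credentials. A small sketch, assuming no-argument construction and `parse_native_representation` behave as in the dlt base class:

from ingestr.src.sql_database.override import IngestrConnectionStringCredentials

# A file-based DSN has no username; the overridden spec tolerates that because
# `username` now defaults to None instead of being a required field.
creds = IngestrConnectionStringCredentials()
creds.parse_native_representation("sqlite:///local.db")  # placeholder DSN
print(creds.drivername, creds.database)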
ingestr/src/sql_database/schema_types.py CHANGED
@@ -1,54 +1,162 @@
- from typing import Any, Optional
+ from typing import TYPE_CHECKING, Any, Optional, Sequence, Type

+ from dlt.common import logger
+ from dlt.common.configuration import with_config
+ from dlt.common.destination import DestinationCapabilitiesContext
  from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns
  from sqlalchemy import Column, Table
- from sqlalchemy.sql import sqltypes
+ from sqlalchemy.engine import Row
+ from sqlalchemy.sql import Select, sqltypes
+ from typing_extensions import TypeAlias

+ # optionally create generics with any so they can be imported by dlt importer
+ if TYPE_CHECKING:
+     SelectAny: TypeAlias = Select[Any]
+     ColumnAny: TypeAlias = Column[Any]
+     RowAny: TypeAlias = Row[Any]
+ else:
+     SelectAny: TypeAlias = Type[Any]
+     ColumnAny: TypeAlias = Type[Any]
+     RowAny: TypeAlias = Type[Any]

- def sqla_col_to_column_schema(sql_col: Column[Any]) -> Optional[TColumnSchema]:
+
+ def sqla_col_to_column_schema(
+     sql_col: ColumnAny, add_precision: bool = False
+ ) -> Optional[TColumnSchema]:
      """Infer dlt schema column type from an sqlalchemy type.

-     Precision and scale is inferred from that types that support it,
-     such as numeric, varchar, int, bigint
+     If `add_precision` is set, precision and scale is inferred from that types that support it,
+     such as numeric, varchar, int, bigint. Numeric (decimal) types have always precision added.
      """
      sql_t = sql_col.type
-     col = None
+     col: TColumnSchema = {
+         "name": sql_col.name,
+         "data_type": None,  # set that later
+         "nullable": sql_col.nullable,
+     }

-     if isinstance(sql_t, sqltypes.BigInteger):
-         col = dict(name=sql_col.name, data_type="bigint", precision=64)
-     elif isinstance(sql_t, sqltypes.SmallInteger):
-         col = dict(name=sql_col.name, data_type="bigint", precision=16)
+     if isinstance(sql_t, sqltypes.SmallInteger):
+         col["data_type"] = "bigint"
+         if add_precision:
+             col["precision"] = 32
      elif isinstance(sql_t, sqltypes.Integer):
-         col = dict(name=sql_col.name, data_type="bigint", precision=32)
-     elif isinstance(sql_t, sqltypes.Numeric) and not isinstance(sql_t, sqltypes.Float):
-         col = dict(
-             name=sql_col.name,
-             data_type="decimal",
-             precision=sql_t.precision,
-             scale=sql_t.scale,
-         )
+         col["data_type"] = "bigint"
+     elif isinstance(sql_t, sqltypes.Numeric):
+         # dlt column type depends on the data returned by the sql alchemy dialect
+         # and not on the metadata reflected in the database. all Numeric types
+         # that are returned as floats will assume "double" type
+         # and returned as decimals will assume "decimal" type
+         if sql_t.asdecimal is False:
+             col["data_type"] = "double"
+         else:
+             col["data_type"] = "decimal"
+             if sql_t.precision is not None:
+                 col["precision"] = sql_t.precision
+                 # must have a precision for any meaningful scale
+                 if sql_t.scale is not None:
+                     col["scale"] = sql_t.scale
+                 elif sql_t.decimal_return_scale is not None:
+                     col["scale"] = sql_t.decimal_return_scale
      elif isinstance(sql_t, sqltypes.String):
-         col = dict(name=sql_col.name, data_type="text", precision=sql_t.length)
+         col["data_type"] = "text"
+         if add_precision and sql_t.length:
+             col["precision"] = sql_t.length
      elif isinstance(sql_t, sqltypes._Binary):
-         col = dict(name=sql_col.name, data_type="binary", precision=sql_t.length)
+         col["data_type"] = "binary"
+         if add_precision and sql_t.length:
+             col["precision"] = sql_t.length
      elif isinstance(sql_t, sqltypes.DateTime):
-         col = dict(name=sql_col.name, data_type="timestamp")
+         col["data_type"] = "timestamp"
      elif isinstance(sql_t, sqltypes.Date):
-         col = dict(name=sql_col.name, data_type="date")
+         col["data_type"] = "date"
      elif isinstance(sql_t, sqltypes.Time):
-         col = dict(name=sql_col.name, data_type="time")
+         col["data_type"] = "time"
+     elif isinstance(sql_t, sqltypes.JSON):
+         col["data_type"] = "complex"
+     elif isinstance(sql_t, sqltypes.Boolean):
+         col["data_type"] = "bool"
+     else:
+         logger.warning(
+             f"A column with name {sql_col.name} contains unknown data type {sql_t} which cannot be mapped to `dlt` data type. When using sqlalchemy backend such data will be passed to the normalizer. In case of `pyarrow` backend such data will be ignored. In case of other backends, the behavior is backend-specific."
+         )
+         col = None
      if col:
          return {key: value for key, value in col.items() if value is not None}  # type: ignore[return-value]
      return None


- def table_to_columns(table: Table) -> TTableSchemaColumns:
+ def table_to_columns(table: Table, add_precision: bool = False) -> TTableSchemaColumns:
      """Convert an sqlalchemy table to a dlt table schema.

-     Only columns types supporting precision/scale are included in result.
+     Adds precision to columns when `add_precision` is set.
      """
      return {
-         col["name"]: col  # type: ignore
-         for col in (sqla_col_to_column_schema(c) for c in table.columns)
+         col["name"]: col
+         for col in (sqla_col_to_column_schema(c, add_precision) for c in table.columns)
          if col is not None
      }
+
+
+ @with_config
+ def columns_to_arrow(
+     columns_schema: TTableSchemaColumns,
+     caps: DestinationCapabilitiesContext = None,
+     tz: str = "UTC",
+ ) -> Any:
+     """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
+     is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
+     Otherwise generic capabilities are used
+     """
+     from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+     from dlt.common.libs.pyarrow import get_py_arrow_datatype
+     from dlt.common.libs.pyarrow import pyarrow as pa
+
+     return pa.schema(
+         [
+             pa.field(
+                 name,
+                 get_py_arrow_datatype(
+                     schema_item,
+                     caps or DestinationCapabilitiesContext.generic_capabilities(),
+                     tz,
+                 ),
+                 nullable=schema_item.get("nullable", True),
+             )
+             for name, schema_item in columns_schema.items()
+         ]
+     )
+
+
+ def row_tuples_to_arrow(
+     rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
+ ) -> Any:
+     import numpy as np
+     from dlt.common.libs.pyarrow import pyarrow as pa
+
+     arrow_schema = columns_to_arrow(columns, tz=tz)
+
+     try:
+         from pandas._libs import lib
+
+         pivoted_rows = lib.to_object_array_tuples(rows).T  # type: ignore[attr-defined]
+     except ImportError:
+         logger.info(
+             "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
+         )
+         pivoted_rows = np.asarray(rows, dtype="object", order="k").T  # type: ignore[call-overload]
+
+     columnar = {
+         col: dat.ravel()
+         for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
+     }
+     for idx in range(0, len(arrow_schema.names)):
+         field = arrow_schema.field(idx)
+         py_type = type(rows[0][idx])
+         # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
+         if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
+             logger.warning(
+                 f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
+             )
+             float_array = pa.array(columnar[field.name], type=pa.float64())
+             columnar[field.name] = float_array.cast(field.type, safe=False)
+     return pa.Table.from_pydict(columnar, schema=arrow_schema)
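
The reworked `sqla_col_to_column_schema` / `table_to_columns` pair now emits a dlt column schema (name, data type, nullability, and optional precision hints) for every mappable column instead of only precision-bearing ones. A quick sketch of inspecting that mapping outside a pipeline; the SQLite URL and `events` table are placeholders:

import sqlalchemy as sa

from ingestr.src.sql_database.schema_types import table_to_columns

# Reflect a table and convert it to a dlt column schema; add_precision=True
# also copies length/precision/scale hints where the SQL type carries them.
engine = sa.create_engine("sqlite:///local.db")  # placeholder database
metadata = sa.MetaData()
table = sa.Table("events", metadata, autoload_with=engine)  # placeholder table

columns = table_to_columns(table, add_precision=True)
for name, schema_item in columns.items():
    print(name, schema_item["data_type"], schema_item.get("precision"))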
ingestr/src/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.5"
+ __version__ = "0.3.0"
Two binary files changed (contents not shown).