ingestr 0.9.5__py3-none-any.whl → 0.10.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ingestr might be problematic.

@@ -1,282 +0,0 @@
-"""SQL database source helpers"""
-
-import operator
-import warnings
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterator,
-    Literal,
-    Optional,
-    Union,
-)
-
-import dlt
-from dlt.common.configuration.specs import BaseConfiguration, configspec
-from dlt.common.exceptions import MissingDependencyException
-from dlt.common.schema import TTableSchemaColumns
-from dlt.common.typing import TDataItem, TSortOrder
-from sqlalchemy import create_engine
-from sqlalchemy.engine import Engine
-from sqlalchemy.exc import CompileError
-
-from .arrow_helpers import row_tuples_to_arrow
-from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
-from .schema_types import (
-    ReflectionLevel,
-    SelectAny,
-    Table,
-    TTypeAdapter,
-    get_primary_key,
-    table_to_columns,
-)
-
-TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
-
-
-class TableLoader:
-    def __init__(
-        self,
-        engine: Engine,
-        backend: TableBackend,
-        table: Table,
-        columns: TTableSchemaColumns,
-        chunk_size: int = 1000,
-        incremental: Optional[dlt.sources.incremental[Any]] = None,
-    ) -> None:
-        self.engine = engine
-        self.backend = backend
-        self.table = table
-        self.columns = columns
-        self.chunk_size = chunk_size
-        self.incremental = incremental
-        if incremental:
-            try:
-                self.cursor_column = table.c[incremental.cursor_path]
-            except KeyError as e:
-                raise KeyError(
-                    f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'"
-                ) from e
-            self.last_value = incremental.last_value
-            self.end_value = incremental.end_value
-            self.row_order: TSortOrder = self.incremental.row_order
-        else:
-            self.cursor_column = None
-            self.last_value = None
-            self.end_value = None
-            self.row_order = None
-
-    def make_query(self) -> SelectAny:
-        table = self.table
-        query = table.select()
-        if not self.incremental:
-            return query
-
-        last_value_func = self.incremental.last_value_func
-
-        # generate where
-        if (
-            last_value_func is max
-        ):  # query ordered and filtered according to the last_value function
-            filter_op = operator.ge
-            filter_op_end = operator.lt
-        elif last_value_func is min:
-            filter_op = operator.le
-            filter_op_end = operator.gt
-        else:  # custom last_value: load everything and let incremental handle filtering
-            return query
-
-        if self.last_value is not None:
-            query = query.where(filter_op(self.cursor_column, self.last_value))
-            if self.end_value is not None:
-                query = query.where(filter_op_end(self.cursor_column, self.end_value))
-
-        # generate order by from declared row order
-        order_by = None
-        if (self.row_order == "asc" and last_value_func is max) or (
-            self.row_order == "desc" and last_value_func is min
-        ):
-            order_by = self.cursor_column.asc()
-        elif (self.row_order == "asc" and last_value_func is min) or (
-            self.row_order == "desc" and last_value_func is max
-        ):
-            order_by = self.cursor_column.desc()
-        if order_by is not None:
-            query = query.order_by(order_by)
-
-        return query
-
-    def load_rows(self, backend_kwargs: Dict[str, Any] = None) -> Iterator[TDataItem]:
-        # make a copy of the kwargs
-        backend_kwargs = dict(backend_kwargs or {})
-        query = self.make_query()
-        if self.backend == "connectorx":
-            yield from self._load_rows_connectorx(query, backend_kwargs)
-        else:
-            yield from self._load_rows(query, backend_kwargs)
-
-    def _load_rows(self, query: SelectAny, backend_kwargs: Dict[str, Any]) -> TDataItem:
-        with self.engine.connect() as conn:
-            result = conn.execution_options(yield_per=self.chunk_size).execute(query)
-            # NOTE: the cursor returns non-normalized column names, which may be
-            # useful with dialects such as Oracle that normalize column names
-            # columns = [c[0] for c in result.cursor.description]
-            columns = list(result.keys())
-            for partition in result.partitions(size=self.chunk_size):
-                if self.backend == "sqlalchemy":
-                    yield [dict(row._mapping) for row in partition]
-                elif self.backend == "pandas":
-                    from dlt.common.libs.pandas_sql import _wrap_result
-
-                    df = _wrap_result(
-                        partition,
-                        columns,
-                        **{"dtype_backend": "pyarrow", **backend_kwargs},
-                    )
-                    yield df
-                elif self.backend == "pyarrow":
-                    yield row_tuples_to_arrow(
-                        partition, self.columns, tz=backend_kwargs.get("tz", "UTC")
-                    )
-
-    def _load_rows_connectorx(
-        self, query: SelectAny, backend_kwargs: Dict[str, Any]
-    ) -> Iterator[TDataItem]:
-        try:
-            import connectorx as cx  # type: ignore
-        except ImportError:
-            raise MissingDependencyException(
-                "Connector X table backend", ["connectorx"]
-            )
-
-        # default settings
-        backend_kwargs = {
-            "return_type": "arrow2",
-            "protocol": "binary",
-            **backend_kwargs,
-        }
-        conn = backend_kwargs.pop(
-            "conn",
-            self.engine.url._replace(
-                drivername=self.engine.url.get_backend_name()
-            ).render_as_string(hide_password=False),
-        )
-        try:
-            query_str = str(
-                query.compile(self.engine, compile_kwargs={"literal_binds": True})
-            )
-        except CompileError as ex:
-            raise NotImplementedError(
-                f"Query for table {self.table.name} could not be compiled to a string to execute on ConnectorX. If you are on SQLAlchemy 1.4.x, the exception is caused by literals that cannot be rendered; upgrade to 2.x: {str(ex)}"
-            ) from ex
-        df = cx.read_sql(conn, query_str, **backend_kwargs)
-        yield df
-
-
-def table_rows(
-    engine: Engine,
-    table: Table,
-    chunk_size: int,
-    backend: TableBackend,
-    incremental: Optional[dlt.sources.incremental[Any]] = None,
-    defer_table_reflect: bool = False,
-    table_adapter_callback: Callable[[Table], None] = None,
-    reflection_level: ReflectionLevel = "minimal",
-    backend_kwargs: Dict[str, Any] = None,
-    type_adapter_callback: Optional[TTypeAdapter] = None,
-) -> Iterator[TDataItem]:
-    columns: TTableSchemaColumns = None
-    if defer_table_reflect:
-        table = Table(
-            table.name, table.metadata, autoload_with=engine, extend_existing=True
-        )
-        if table_adapter_callback:
-            table_adapter_callback(table)
-        columns = table_to_columns(table, reflection_level, type_adapter_callback)
-
-        # set the primary_key in the incremental
-        if incremental and incremental.primary_key is None:
-            primary_key = get_primary_key(table)
-            if primary_key is not None:
-                incremental.primary_key = primary_key
-        # yield an empty record to set hints
-        yield dlt.mark.with_hints(
-            [],
-            dlt.mark.make_hints(
-                primary_key=get_primary_key(table),
-                columns=columns,
-            ),
-        )
-    else:
-        # table was already reflected
-        columns = table_to_columns(table, reflection_level, type_adapter_callback)
-
-    loader = TableLoader(
-        engine, backend, table, columns, incremental=incremental, chunk_size=chunk_size
-    )
-    yield from loader.load_rows(backend_kwargs)
-
-
-def engine_from_credentials(
-    credentials: Union[ConnectionStringCredentials, Engine, str], **backend_kwargs: Any
-) -> Engine:
-    if isinstance(credentials, Engine):
-        return credentials
-    if isinstance(credentials, ConnectionStringCredentials):
-        credentials = credentials.to_native_representation()
-    return create_engine(credentials, **backend_kwargs)
-
-
-def unwrap_json_connector_x(field: str) -> TDataItem:
-    """Creates a transform function, to be added with `add_map`, that unwraps JSON columns
-    ingested via connectorx. Such columns are additionally quoted, and SQL NULL is rendered as the JSON string "null".
-    """
-    import pyarrow as pa
-    import pyarrow.compute as pc
-
-    def _unwrap(table: TDataItem) -> TDataItem:
-        col_index = table.column_names.index(field)
-        # remove quotes
-        column = pc.replace_substring_regex(table[field], '"(.*)"', "\\1")
-        # convert json null to null
-        column = pc.replace_with_mask(
-            column,
-            pc.equal(column, "null").combine_chunks(),
-            pa.scalar(None, pa.large_string()),
-        )
-        return table.set_column(col_index, table.schema.field(col_index), column)
-
-    return _unwrap
-
-
-def _detect_precision_hints_deprecated(value: Optional[bool]) -> None:
-    if value is None:
-        return
-
-    msg = "`detect_precision_hints` argument is deprecated and will be removed in a future release. "
-    if value:
-        msg += "Use `reflection_level='full_with_precision'` which has the same effect instead."
-
-    warnings.warn(
-        msg,
-        DeprecationWarning,
-    )
-
-
-@configspec
-class SqlDatabaseTableConfiguration(BaseConfiguration):
-    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-
-
-@configspec
-class SqlTableResourceConfiguration(BaseConfiguration):
-    credentials: Union[ConnectionStringCredentials, Engine, str] = None
-    table: str = None
-    schema: Optional[str] = None
-    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-    chunk_size: int = 50000
-    backend: TableBackend = "sqlalchemy"
-    detect_precision_hints: Optional[bool] = None
-    defer_table_reflect: Optional[bool] = False
-    reflection_level: Optional[ReflectionLevel] = "full"
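
For context on the removed helper: TableLoader.make_query turns the incremental cursor state into a WHERE clause plus an optional ORDER BY. Below is a minimal, self-contained sketch of that logic in plain SQLAlchemy; the events table, the cursor values, and the in-memory SQLite engine are illustrative stand-ins, not part of the package.

import operator

import sqlalchemy as sa

# Illustrative schema: an integer "id" plays the incremental cursor column.
engine = sa.create_engine("sqlite://")
metadata = sa.MetaData()
events = sa.Table("events", metadata, sa.Column("id", sa.Integer, primary_key=True))
metadata.create_all(engine)

# Stand-ins for incremental state when last_value_func is max:
last_value, end_value = 100, 200
filter_op, filter_op_end = operator.ge, operator.lt

query = events.select()
query = query.where(filter_op(events.c.id, last_value))     # id >= last_value
query = query.where(filter_op_end(events.c.id, end_value))  # id < end_value
query = query.order_by(events.c.id.asc())                   # row_order == "asc"

# Rendering with literal_binds mirrors the connectorx code path above.
print(query.compile(engine, compile_kwargs={"literal_binds": True}))
# SELECT events.id FROM events
# WHERE events.id >= 100 AND events.id < 200 ORDER BY events.id ASC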
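
The pyarrow calls inside unwrap_json_connector_x can likewise be exercised on their own. A minimal sketch, assuming pyarrow is installed; the sample values are made up.

import pyarrow as pa
import pyarrow.compute as pc

# connectorx returns JSON columns quoted, and SQL NULL as the string "null".
column = pa.chunked_array([pa.array(['"{"a": 1}"', "null"], pa.large_string())])

# Strip the outer quotes, exactly as _unwrap does.
column = pc.replace_substring_regex(column, '"(.*)"', "\\1")
# Turn the literal string "null" into a real NULL.
column = pc.replace_with_mask(
    column,
    pc.equal(column, "null").combine_chunks(),
    pa.scalar(None, pa.large_string()),
)
print(column.to_pylist())  # ['{"a": 1}', None]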
@@ -1,10 +0,0 @@
-from typing import Optional
-
-from dlt.common.configuration.specs.base_configuration import configspec
-from dlt.sources.credentials import ConnectionStringCredentials
-
-
-@configspec(init=False)
-class IngestrConnectionStringCredentials(ConnectionStringCredentials):
-    username: Optional[str] = None
-    database: Optional[str] = None
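
This override exists because the base dlt ConnectionStringCredentials treats username and database as required, while URIs for engines such as SQLite or DuckDB may omit them. A minimal sketch of the behavior it enables; the class is re-declared here so the snippet is self-contained, the connection string is made up, and exact resolution behavior depends on the dlt version.

from typing import Optional

from dlt.common.configuration.specs.base_configuration import configspec
from dlt.sources.credentials import ConnectionStringCredentials

# Mirrors the override above for illustration.
@configspec(init=False)
class RelaxedCredentials(ConnectionStringCredentials):
    username: Optional[str] = None
    database: Optional[str] = None

creds = RelaxedCredentials()
creds.parse_native_representation("duckdb:///local.duckdb")
print(creds.drivername, creds.username)  # duckdb None
print(creds.to_native_representation())  # duckdb:///local.duckdb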
@@ -1,139 +0,0 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    List,
-    Literal,
-    Optional,
-    Type,
-    Union,
-)
-
-from dlt.common import logger
-from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns
-from sqlalchemy import Column, Table
-from sqlalchemy.engine import Row
-from sqlalchemy.sql import Select, sqltypes
-from sqlalchemy.sql.sqltypes import TypeEngine
-from typing_extensions import TypeAlias
-
-ReflectionLevel = Literal["minimal", "full", "full_with_precision"]
-
-
-# optionally create generics with Any so they can be imported by the dlt importer
-if TYPE_CHECKING:
-    SelectAny: TypeAlias = Select[Any]
-    ColumnAny: TypeAlias = Column[Any]
-    RowAny: TypeAlias = Row[Any]
-    TypeEngineAny = TypeEngine[Any]
-else:
-    SelectAny: TypeAlias = Type[Any]
-    ColumnAny: TypeAlias = Type[Any]
-    RowAny: TypeAlias = Type[Any]
-    TypeEngineAny = Type[Any]
-
-
-TTypeAdapter = Callable[
-    [TypeEngineAny], Optional[Union[TypeEngineAny, Type[TypeEngineAny]]]
-]
-
-
-def sqla_col_to_column_schema(
-    sql_col: ColumnAny,
-    reflection_level: ReflectionLevel,
-    type_adapter_callback: Optional[TTypeAdapter] = None,
-) -> Optional[TColumnSchema]:
-    """Infer a dlt schema column type from an sqlalchemy type.
-
-    With `reflection_level="full_with_precision"`, precision and scale are inferred from
-    types that support them, such as numeric, varchar, int, bigint. Numeric (decimal) types always have precision added.
-    """
-    col: TColumnSchema = {
-        "name": sql_col.name,
-        "nullable": sql_col.nullable,
-    }
-    if reflection_level == "minimal":
-        return col
-
-    sql_t = sql_col.type
-
-    if type_adapter_callback:
-        sql_t = type_adapter_callback(sql_t)  # type: ignore[assignment]
-        # check whether an sqla type class rather than an instance was returned
-        if sql_t is not None and isinstance(sql_t, type):
-            sql_t = sql_t()
-
-    if sql_t is None:
-        # column ignored by the callback
-        return col
-
-    add_precision = reflection_level == "full_with_precision"
-
-    if isinstance(sql_t, sqltypes.SmallInteger):
-        col["data_type"] = "bigint"
-        if add_precision:
-            col["precision"] = 32
-    elif isinstance(sql_t, sqltypes.Integer):
-        col["data_type"] = "bigint"
-    elif isinstance(sql_t, sqltypes.Numeric):
-        # the dlt column type depends on the data returned by the sqlalchemy dialect,
-        # not on the metadata reflected from the database: Numeric types returned
-        # as floats assume the "double" type, those returned
-        # as decimals assume the "decimal" type
-        if sql_t.asdecimal is False:
-            col["data_type"] = "double"
-        else:
-            col["data_type"] = "decimal"
-            if sql_t.precision is not None:
-                col["precision"] = sql_t.precision
-                # must have a precision for any meaningful scale
-                if sql_t.scale is not None:
-                    col["scale"] = sql_t.scale
-                elif sql_t.decimal_return_scale is not None:
-                    col["scale"] = sql_t.decimal_return_scale
-    elif isinstance(sql_t, sqltypes.String):
-        col["data_type"] = "text"
-        if add_precision and sql_t.length:
-            col["precision"] = sql_t.length
-    elif isinstance(sql_t, sqltypes._Binary):
-        col["data_type"] = "binary"
-        if add_precision and sql_t.length:
-            col["precision"] = sql_t.length
-    elif isinstance(sql_t, sqltypes.DateTime):
-        col["data_type"] = "timestamp"
-    elif isinstance(sql_t, sqltypes.Date):
-        col["data_type"] = "date"
-    elif isinstance(sql_t, sqltypes.Time):
-        col["data_type"] = "time"
-    elif isinstance(sql_t, sqltypes.JSON):
-        col["data_type"] = "complex"
-    elif isinstance(sql_t, sqltypes.Boolean):
-        col["data_type"] = "bool"
-    else:
-        logger.warning(
-            f"A column with name {sql_col.name} contains unknown data type {sql_t} which cannot be mapped to a `dlt` data type. When using the sqlalchemy backend, such data will be passed to the normalizer. With the `pyarrow` and `pandas` backends, data types are detected from numpy ndarrays. With other backends, the behavior is backend-specific."
-        )
-
-    return {key: value for key, value in col.items() if value is not None}  # type: ignore[return-value]
-
-
-def get_primary_key(table: Table) -> Optional[List[str]]:
-    """Create a primary key or return None if no key is defined"""
-    primary_key = [c.name for c in table.primary_key]
-    return primary_key if len(primary_key) > 0 else None
-
-
-def table_to_columns(
-    table: Table,
-    reflection_level: ReflectionLevel = "full",
-    type_conversion_fallback: Optional[TTypeAdapter] = None,
-) -> TTableSchemaColumns:
-    """Convert an sqlalchemy table to a dlt table schema."""
-    return {
-        col["name"]: col
-        for col in (
-            sqla_col_to_column_schema(c, reflection_level, type_conversion_fallback)
-            for c in table.columns
-        )
-        if col is not None
-    }
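
TTypeAdapter above is the hook for coercing SQLAlchemy types the mapper does not recognize. A minimal sketch, assuming a Postgres UUID column should be loaded as text; the callback and table names are illustrative. Returning a type class (rather than an instance) is allowed because sqla_col_to_column_schema instantiates it, as the code above shows.

import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import UUID

def uuid_as_text(sql_type):
    """Illustrative TTypeAdapter: map Postgres UUID columns to text."""
    if isinstance(sql_type, UUID):
        return sa.Text  # a class is fine: the helper instantiates it
    return sql_type  # leave every other type untouched

# Hypothetical column to exercise the mapping:
meta = sa.MetaData()
docs = sa.Table("docs", meta, sa.Column("doc_id", UUID(), nullable=False))

# sqla_col_to_column_schema(docs.c.doc_id, "full", uuid_as_text) would yield:
# {"name": "doc_id", "nullable": False, "data_type": "text"}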