ingestr 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr might be problematic.
- ingestr/main.py +156 -40
- ingestr/src/adjust/__init__.py +1 -1
- ingestr/src/filters.py +21 -0
- ingestr/src/gorgias/__init__.py +17 -17
- ingestr/src/shopify/__init__.py +42 -42
- ingestr/src/slack/__init__.py +2 -2
- ingestr/src/sources.py +33 -6
- ingestr/src/version.py +1 -1
- ingestr/src/zendesk/__init__.py +2 -2
- {ingestr-0.9.5.dist-info → ingestr-0.10.0.dist-info}/METADATA +18 -18
- {ingestr-0.9.5.dist-info → ingestr-0.10.0.dist-info}/RECORD +14 -18
- ingestr/src/sql_database/__init__.py +0 -206
- ingestr/src/sql_database/arrow_helpers.py +0 -139
- ingestr/src/sql_database/helpers.py +0 -282
- ingestr/src/sql_database/override.py +0 -10
- ingestr/src/sql_database/schema_types.py +0 -139
- {ingestr-0.9.5.dist-info → ingestr-0.10.0.dist-info}/WHEEL +0 -0
- {ingestr-0.9.5.dist-info → ingestr-0.10.0.dist-info}/entry_points.txt +0 -0
- {ingestr-0.9.5.dist-info → ingestr-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sql_database/__init__.py
@@ -1,206 +0,0 @@
-"""Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""
-
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
-
-import dlt
-from dlt.sources import DltResource
-from sqlalchemy import MetaData, Table
-from sqlalchemy.engine import Engine
-
-from .helpers import (
-    SqlDatabaseTableConfiguration,
-    SqlTableResourceConfiguration,
-    TableBackend,
-    _detect_precision_hints_deprecated,
-    engine_from_credentials,
-    table_rows,
-)
-from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
-from .schema_types import (
-    ReflectionLevel,
-    TTypeAdapter,
-    get_primary_key,
-    table_to_columns,
-)
-
-
-@dlt.source
-def sql_database(
-    credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value,
-    schema: Optional[str] = dlt.config.value,
-    metadata: Optional[MetaData] = None,
-    table_names: Optional[List[str]] = dlt.config.value,
-    chunk_size: int = 50000,
-    backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] = False,
-    reflection_level: Optional[ReflectionLevel] = "full",
-    defer_table_reflect: Optional[bool] = None,
-    table_adapter_callback: Callable[[Table], None] = None,
-    backend_kwargs: Dict[str, Any] = None,
-    include_views: bool = False,
-    type_adapter_callback: Optional[TTypeAdapter] = None,
-) -> Iterable[DltResource]:
-    """
-    A dlt source which loads data from an SQL database using SQLAlchemy.
-    Resources are automatically created for each table in the schema or from the given list of tables.
-
-    Args:
-        credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
-        schema (Optional[str]): Name of the database schema to load (if different from default).
-        metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
-        table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
-        chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
-        backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
-            "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
-            "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
-            "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
-            This is disabled by default.
-        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
-            "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
-            "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
-            "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
-        defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
-            Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
-        table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
-        backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
-        include_views (bool): Reflect views as well as tables. Note view names included in `table_names` are always included regardless of this setting.
-        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
-            Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
-    Returns:
-
-        Iterable[DltResource]: A list of DLT resources for each table to be loaded.
-    """
-    # detect precision hints is deprecated
-    _detect_precision_hints_deprecated(detect_precision_hints)
-
-    if detect_precision_hints:
-        reflection_level = "full_with_precision"
-    else:
-        reflection_level = reflection_level or "minimal"
-
-    # set up alchemy engine
-    engine = engine_from_credentials(credentials)
-    engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
-    metadata = metadata or MetaData(schema=schema)
-
-    # use provided tables or all tables
-    if table_names:
-        tables = [
-            Table(name, metadata, autoload_with=None if defer_table_reflect else engine)
-            for name in table_names
-        ]
-    else:
-        if defer_table_reflect:
-            raise ValueError("You must pass table names to defer table reflection")
-        metadata.reflect(bind=engine, views=include_views)
-        tables = list(metadata.tables.values())
-
-    for table in tables:
-        if table_adapter_callback and not defer_table_reflect:
-            table_adapter_callback(table)
-
-        yield dlt.resource(
-            table_rows,
-            name=table.name,
-            primary_key=get_primary_key(table),
-            spec=SqlDatabaseTableConfiguration,
-            columns=table_to_columns(table, reflection_level, type_adapter_callback),
-        )(
-            engine,
-            table,
-            chunk_size,
-            backend,
-            reflection_level=reflection_level,
-            defer_table_reflect=defer_table_reflect,
-            table_adapter_callback=table_adapter_callback,
-            backend_kwargs=backend_kwargs,
-            type_adapter_callback=type_adapter_callback,
-        )
-
-
-@dlt.resource(
-    name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration
-)
-def sql_table(
-    credentials: Union[ConnectionStringCredentials, Engine, str] = str,
-    table: str = dlt.config.value,
-    schema: Optional[str] = dlt.config.value,
-    metadata: Optional[MetaData] = None,
-    incremental: Optional[dlt.sources.incremental[Any]] = None,
-    chunk_size: int = 50000,
-    backend: TableBackend = "sqlalchemy",
-    detect_precision_hints: Optional[bool] = None,
-    reflection_level: Optional[ReflectionLevel] = "full_with_precision",
-    defer_table_reflect: Optional[bool] = None,
-    table_adapter_callback: Callable[[Table], None] = None,
-    backend_kwargs: Dict[str, Any] = None,
-    type_adapter_callback: Optional[TTypeAdapter] = None,
-    merge_key: Optional[str] = None,
-) -> DltResource:
-    """
-    A dlt resource which loads data from an SQL database table using SQLAlchemy.
-
-    Args:
-        credentials (Union[ConnectionStringCredentials, Engine, str]): Database credentials or an `Engine` instance representing the database connection.
-        table (str): Name of the table or view to load.
-        schema (Optional[str]): Optional name of the schema the table belongs to.
-        metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
-        incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
-            E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
-        chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
-        backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
-            "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
-            "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
-            "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
-        reflection_level: (ReflectionLevel): Specifies how much information should be reflected from the source database schema.
-            "minimal": Only table names, nullability and primary keys are reflected. Data types are inferred from the data.
-            "full": Data types will be reflected on top of "minimal". `dlt` will coerce the data into reflected types if necessary. This is the default option.
-            "full_with_precision": Sets precision and scale on supported data types (ie. decimal, text, binary). Creates big and regular integer types.
-        detect_precision_hints (bool): Deprecated. Use `reflection_level`. Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
-            This is disabled by default.
-        defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
-            on dlt 0.4.4 and later
-        table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
-        backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
-        type_adapter_callback(Optional[Callable]): Callable to override type inference when reflecting columns.
-            Argument is a single sqlalchemy data type (`TypeEngine` instance) and it should return another sqlalchemy data type, or `None` (type will be inferred from data)
-
-    Returns:
-        DltResource: The dlt resource for loading data from the SQL database table.
-    """
-    _detect_precision_hints_deprecated(detect_precision_hints)
-
-    if detect_precision_hints:
-        reflection_level = "full_with_precision"
-    else:
-        reflection_level = reflection_level or "minimal"
-
-    engine = engine_from_credentials(credentials)
-    engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
-    metadata = metadata or MetaData(schema=schema)
-
-    table_obj = Table(
-        table, metadata, autoload_with=None if defer_table_reflect else engine
-    )
-    if table_adapter_callback and not defer_table_reflect:
-        table_adapter_callback(table_obj)
-
-    return dlt.resource(
-        table_rows,
-        name=table_obj.name,
-        primary_key=get_primary_key(table_obj),
-        columns=table_to_columns(table_obj, reflection_level, type_adapter_callback),
-        merge_key=merge_key,
-    )(
-        engine,
-        table_obj,
-        chunk_size,
-        backend,
-        incremental=incremental,
-        reflection_level=reflection_level,
-        defer_table_reflect=defer_table_reflect,
-        table_adapter_callback=table_adapter_callback,
-        backend_kwargs=backend_kwargs,
-        type_adapter_callback=type_adapter_callback,
-    )
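For context on what was removed: `sql_table` above was a standalone dlt resource, so it could be called directly and handed to a pipeline. A minimal sketch of that usage, assuming ingestr 0.9.5 is installed; the pipeline name, connection string and table/column names below are placeholders rather than values from the package, and the incremental cursor mirrors the docstring example.

import dlt
import pendulum

from ingestr.src.sql_database import sql_table  # import path as it existed in 0.9.5

# Placeholder pipeline and connection details, for illustration only.
pipeline = dlt.pipeline(
    pipeline_name="sql_to_duckdb", destination="duckdb", dataset_name="raw"
)

# Load a single table incrementally on "updated_at", as in the docstring example above.
orders = sql_table(
    credentials="postgresql://user:password@localhost:5432/shop",
    table="orders",
    incremental=dlt.sources.incremental("updated_at", pendulum.parse("2022-01-01T00:00:00Z")),
    backend="pyarrow",
)

print(pipeline.run(orders))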
ingestr/src/sql_database/arrow_helpers.py
@@ -1,139 +0,0 @@
-from typing import Any, Optional, Sequence
-
-from dlt.common import logger
-from dlt.common.configuration import with_config
-from dlt.common.destination import DestinationCapabilitiesContext
-from dlt.common.json import custom_encode, map_nested_in_place
-from dlt.common.schema.typing import TTableSchemaColumns
-
-from .schema_types import RowAny
-
-
-@with_config
-def columns_to_arrow(
-    columns_schema: TTableSchemaColumns,
-    caps: DestinationCapabilitiesContext = None,
-    tz: str = "UTC",
-) -> Any:
-    """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
-    is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
-    Otherwise generic capabilities are used
-    """
-    from dlt.common.destination.capabilities import DestinationCapabilitiesContext
-    from dlt.common.libs.pyarrow import get_py_arrow_datatype
-    from dlt.common.libs.pyarrow import pyarrow as pa
-
-    return pa.schema(
-        [
-            pa.field(
-                name,
-                get_py_arrow_datatype(
-                    schema_item,
-                    caps or DestinationCapabilitiesContext.generic_capabilities(),
-                    tz,
-                ),
-                nullable=schema_item.get("nullable", True),
-            )
-            for name, schema_item in columns_schema.items()
-            if schema_item.get("data_type") is not None
-        ]
-    )
-
-
-def row_tuples_to_arrow(
-    rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
-) -> Any:
-    """Converts the rows to an arrow table using the columns schema.
-    Columns missing `data_type` will be inferred from the row data.
-    Columns with object types not supported by arrow are excluded from the resulting table.
-    """
-    import numpy as np
-    from dlt.common.libs.pyarrow import pyarrow as pa
-
-    try:
-        from pandas._libs import lib
-
-        pivoted_rows = lib.to_object_array_tuples(rows).T  # type: ignore[attr-defined]
-    except ImportError:
-        logger.info(
-            "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
-        )
-        pivoted_rows = np.asarray(rows, dtype="object", order="k").T  # type: ignore[call-overload]
-
-    columnar = {
-        col: dat.ravel()
-        for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
-    }
-    columnar_known_types = {
-        col["name"]: columnar[col["name"]]
-        for col in columns.values()
-        if col.get("data_type") is not None
-    }
-    columnar_unknown_types = {
-        col["name"]: columnar[col["name"]]
-        for col in columns.values()
-        if col.get("data_type") is None
-    }
-
-    arrow_schema = columns_to_arrow(columns, tz=tz)
-
-    for idx in range(0, len(arrow_schema.names)):
-        field = arrow_schema.field(idx)
-        py_type = type(rows[0][idx])
-        # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
-        if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
-            logger.warning(
-                f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
-            )
-            float_array = pa.array(columnar_known_types[field.name], type=pa.float64())
-            columnar_known_types[field.name] = float_array.cast(field.type, safe=False)
-
-    # If there are unknown type columns, first create a table to infer their types
-    if columnar_unknown_types:
-        new_schema_fields = []
-        for key in list(columnar_unknown_types):
-            arrow_col: Optional[pa.Array] = None
-            try:
-                arrow_col = pa.array(columnar_unknown_types[key])
-                if pa.types.is_null(arrow_col.type):
-                    logger.warning(
-                        f"Column {key} contains only NULL values and data type could not be inferred. This column is removed from a arrow table"
-                    )
-                    continue
-
-            except pa.ArrowInvalid as e:
-                # Try coercing types not supported by arrow to a json friendly format
-                # E.g. dataclasses -> dict, UUID -> str
-                try:
-                    arrow_col = pa.array(
-                        map_nested_in_place(
-                            custom_encode, list(columnar_unknown_types[key])
-                        )
-                    )
-                    logger.warning(
-                        f"Column {key} contains a data type which is not supported by pyarrow and got converted into {arrow_col.type}. This slows down arrow table generation."
-                    )
-                except (pa.ArrowInvalid, TypeError):
-                    logger.warning(
-                        f"Column {key} contains a data type which is not supported by pyarrow. This column will be ignored. Error: {e}"
-                    )
-            if arrow_col is not None:
-                columnar_known_types[key] = arrow_col
-                new_schema_fields.append(
-                    pa.field(
-                        key,
-                        arrow_col.type,
-                        nullable=columns[key]["nullable"],
-                    )
-                )
-
-        # New schema
-        column_order = {name: idx for idx, name in enumerate(columns)}
-        arrow_schema = pa.schema(
-            sorted(
-                list(arrow_schema) + new_schema_fields,
-                key=lambda x: column_order[x.name],
-            )
-        )
-
-    return pa.Table.from_pydict(columnar_known_types, schema=arrow_schema)
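The two helpers above back the "pyarrow" table backend: `columns_to_arrow` turns a dlt column schema into an arrow schema, and `row_tuples_to_arrow` builds an arrow table from raw row tuples, inferring types for columns without a declared `data_type`. A rough sketch of a direct call, with made-up rows and column hints (assumes ingestr 0.9.5 plus dlt and pyarrow installed):

from ingestr.src.sql_database.arrow_helpers import row_tuples_to_arrow  # 0.9.5 path

# Hypothetical rows and dlt column hints: "id" has a declared data_type,
# "name" does not, so its arrow type is inferred from the values.
rows = [(1, "alice"), (2, "bob")]
columns = {
    "id": {"name": "id", "data_type": "bigint", "nullable": False},
    "name": {"name": "name", "nullable": True},
}
arrow_table = row_tuples_to_arrow(rows, columns, tz="UTC")
print(arrow_table.schema)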
ingestr/src/sql_database/helpers.py
@@ -1,282 +0,0 @@
-"""SQL database source helpers"""
-
-import operator
-import warnings
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Iterator,
-    Literal,
-    Optional,
-    Union,
-)
-
-import dlt
-from dlt.common.configuration.specs import BaseConfiguration, configspec
-from dlt.common.exceptions import MissingDependencyException
-from dlt.common.schema import TTableSchemaColumns
-from dlt.common.typing import TDataItem, TSortOrder
-from sqlalchemy import create_engine
-from sqlalchemy.engine import Engine
-from sqlalchemy.exc import CompileError
-
-from .arrow_helpers import row_tuples_to_arrow
-from .override import IngestrConnectionStringCredentials as ConnectionStringCredentials
-from .schema_types import (
-    ReflectionLevel,
-    SelectAny,
-    Table,
-    TTypeAdapter,
-    get_primary_key,
-    table_to_columns,
-)
-
-TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
-
-
-class TableLoader:
-    def __init__(
-        self,
-        engine: Engine,
-        backend: TableBackend,
-        table: Table,
-        columns: TTableSchemaColumns,
-        chunk_size: int = 1000,
-        incremental: Optional[dlt.sources.incremental[Any]] = None,
-    ) -> None:
-        self.engine = engine
-        self.backend = backend
-        self.table = table
-        self.columns = columns
-        self.chunk_size = chunk_size
-        self.incremental = incremental
-        if incremental:
-            try:
-                self.cursor_column = table.c[incremental.cursor_path]
-            except KeyError as e:
-                raise KeyError(
-                    f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'"
-                ) from e
-            self.last_value = incremental.last_value
-            self.end_value = incremental.end_value
-            self.row_order: TSortOrder = self.incremental.row_order
-        else:
-            self.cursor_column = None
-            self.last_value = None
-            self.end_value = None
-            self.row_order = None
-
-    def make_query(self) -> SelectAny:
-        table = self.table
-        query = table.select()
-        if not self.incremental:
-            return query
-
-        last_value_func = self.incremental.last_value_func
-
-        # generate where
-        if (
-            last_value_func is max
-        ):  # Query ordered and filtered according to last_value function
-            filter_op = operator.ge
-            filter_op_end = operator.lt
-        elif last_value_func is min:
-            filter_op = operator.le
-            filter_op_end = operator.gt
-        else:  # Custom last_value, load everything and let incremental handle filtering
-            return query
-
-        if self.last_value is not None:
-            query = query.where(filter_op(self.cursor_column, self.last_value))
-            if self.end_value is not None:
-                query = query.where(filter_op_end(self.cursor_column, self.end_value))
-
-        # generate order by from declared row order
-        order_by = None
-        if (self.row_order == "asc" and last_value_func is max) or (
-            self.row_order == "desc" and last_value_func is min
-        ):
-            order_by = self.cursor_column.asc()
-        elif (self.row_order == "asc" and last_value_func is min) or (
-            self.row_order == "desc" and last_value_func is max
-        ):
-            order_by = self.cursor_column.desc()
-        if order_by is not None:
-            query = query.order_by(order_by)
-
-        return query
-
-    def load_rows(self, backend_kwargs: Dict[str, Any] = None) -> Iterator[TDataItem]:
-        # make copy of kwargs
-        backend_kwargs = dict(backend_kwargs or {})
-        query = self.make_query()
-        if self.backend == "connectorx":
-            yield from self._load_rows_connectorx(query, backend_kwargs)
-        else:
-            yield from self._load_rows(query, backend_kwargs)
-
-    def _load_rows(self, query: SelectAny, backend_kwargs: Dict[str, Any]) -> TDataItem:
-        with self.engine.connect() as conn:
-            result = conn.execution_options(yield_per=self.chunk_size).execute(query)
-            # NOTE: cursor returns not normalized column names! may be quite useful in case of Oracle dialect
-            # that normalizes columns
-            # columns = [c[0] for c in result.cursor.description]
-            columns = list(result.keys())
-            for partition in result.partitions(size=self.chunk_size):
-                if self.backend == "sqlalchemy":
-                    yield [dict(row._mapping) for row in partition]
-                elif self.backend == "pandas":
-                    from dlt.common.libs.pandas_sql import _wrap_result
-
-                    df = _wrap_result(
-                        partition,
-                        columns,
-                        **{"dtype_backend": "pyarrow", **backend_kwargs},
-                    )
-                    yield df
-                elif self.backend == "pyarrow":
-                    yield row_tuples_to_arrow(
-                        partition, self.columns, tz=backend_kwargs.get("tz", "UTC")
-                    )
-
-    def _load_rows_connectorx(
-        self, query: SelectAny, backend_kwargs: Dict[str, Any]
-    ) -> Iterator[TDataItem]:
-        try:
-            import connectorx as cx  # type: ignore
-        except ImportError:
-            raise MissingDependencyException(
-                "Connector X table backend", ["connectorx"]
-            )
-
-        # default settings
-        backend_kwargs = {
-            "return_type": "arrow2",
-            "protocol": "binary",
-            **backend_kwargs,
-        }
-        conn = backend_kwargs.pop(
-            "conn",
-            self.engine.url._replace(
-                drivername=self.engine.url.get_backend_name()
-            ).render_as_string(hide_password=False),
-        )
-        try:
-            query_str = str(
-                query.compile(self.engine, compile_kwargs={"literal_binds": True})
-            )
-        except CompileError as ex:
-            raise NotImplementedError(
-                f"Query for table {self.table.name} could not be compiled to string to execute it on ConnectorX. If you are on SQLAlchemy 1.4.x the causing exception is due to literals that cannot be rendered, upgrade to 2.x: {str(ex)}"
-            ) from ex
-        df = cx.read_sql(conn, query_str, **backend_kwargs)
-        yield df
-
-
-def table_rows(
-    engine: Engine,
-    table: Table,
-    chunk_size: int,
-    backend: TableBackend,
-    incremental: Optional[dlt.sources.incremental[Any]] = None,
-    defer_table_reflect: bool = False,
-    table_adapter_callback: Callable[[Table], None] = None,
-    reflection_level: ReflectionLevel = "minimal",
-    backend_kwargs: Dict[str, Any] = None,
-    type_adapter_callback: Optional[TTypeAdapter] = None,
-) -> Iterator[TDataItem]:
-    columns: TTableSchemaColumns = None
-    if defer_table_reflect:
-        table = Table(
-            table.name, table.metadata, autoload_with=engine, extend_existing=True
-        )
-        if table_adapter_callback:
-            table_adapter_callback(table)
-        columns = table_to_columns(table, reflection_level, type_adapter_callback)
-
-        # set the primary_key in the incremental
-        if incremental and incremental.primary_key is None:
-            primary_key = get_primary_key(table)
-            if primary_key is not None:
-                incremental.primary_key = primary_key
-        # yield empty record to set hints
-        yield dlt.mark.with_hints(
-            [],
-            dlt.mark.make_hints(
-                primary_key=get_primary_key(table),
-                columns=columns,
-            ),
-        )
-    else:
-        # table was already reflected
-        columns = table_to_columns(table, reflection_level, type_adapter_callback)
-
-    loader = TableLoader(
-        engine, backend, table, columns, incremental=incremental, chunk_size=chunk_size
-    )
-    yield from loader.load_rows(backend_kwargs)
-
-
-def engine_from_credentials(
-    credentials: Union[ConnectionStringCredentials, Engine, str], **backend_kwargs: Any
-) -> Engine:
-    if isinstance(credentials, Engine):
-        return credentials
-    if isinstance(credentials, ConnectionStringCredentials):
-        credentials = credentials.to_native_representation()
-    return create_engine(credentials, **backend_kwargs)
-
-
-def unwrap_json_connector_x(field: str) -> TDataItem:
-    """Creates a transform function to be added with `add_map` that will unwrap JSON columns
-    ingested via connectorx. Such columns are additionally quoted and translate SQL NULL to json "null"
-    """
-    import pyarrow as pa
-    import pyarrow.compute as pc
-
-    def _unwrap(table: TDataItem) -> TDataItem:
-        col_index = table.column_names.index(field)
-        # remove quotes
-        column = pc.replace_substring_regex(table[field], '"(.*)"', "\\1")
-        # convert json null to null
-        column = pc.replace_with_mask(
-            column,
-            pc.equal(column, "null").combine_chunks(),
-            pa.scalar(None, pa.large_string()),
-        )
-        return table.set_column(col_index, table.schema.field(col_index), column)
-
-    return _unwrap
-
-
-def _detect_precision_hints_deprecated(value: Optional[bool]) -> None:
-    if value is None:
-        return
-
-    msg = "`detect_precision_hints` argument is deprecated and will be removed in a future release. "
-    if value:
-        msg += "Use `reflection_level='full_with_precision'` which has the same effect instead."
-
-    warnings.warn(
-        msg,
-        DeprecationWarning,
-    )
-
-
-@configspec
-class SqlDatabaseTableConfiguration(BaseConfiguration):
-    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-
-
-@configspec
-class SqlTableResourceConfiguration(BaseConfiguration):
-    credentials: Union[ConnectionStringCredentials, Engine, str] = None
-    table: str = None
-    schema: Optional[str] = None
-    incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-    chunk_size: int = 50000
-    backend: TableBackend = "sqlalchemy"
-    detect_precision_hints: Optional[bool] = None
-    defer_table_reflect: Optional[bool] = False
-    reflection_level: Optional[ReflectionLevel] = "full"
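Per its docstring, `unwrap_json_connector_x` returns a transform meant to be attached with `add_map` so that JSON columns loaded through the connectorx backend are unquoted and json "null" values become real NULLs. A hedged sketch of that wiring against the 0.9.5 module layout; the connection string, table and column names are placeholders:

from ingestr.src.sql_database import sql_table  # 0.9.5 paths
from ingestr.src.sql_database.helpers import unwrap_json_connector_x

events = sql_table(
    credentials="postgresql://user:password@localhost:5432/shop",
    table="events",
    backend="connectorx",
)
# Unwrap the JSON "payload" column produced by connectorx before loading.
events.add_map(unwrap_json_connector_x("payload"))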
ingestr/src/sql_database/override.py
@@ -1,10 +0,0 @@
-from typing import Optional
-
-from dlt.common.configuration.specs.base_configuration import configspec
-from dlt.sources.credentials import ConnectionStringCredentials
-
-
-@configspec(init=False)
-class IngestrConnectionStringCredentials(ConnectionStringCredentials):
-    username: Optional[str] = None
-    database: Optional[str] = None