ingestr 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ingestr might be problematic.
- ingestr/main.py +32 -16
- ingestr/main_test.py +18 -17
- ingestr/src/destinations.py +2 -1
- ingestr/src/factory.py +3 -1
- ingestr/src/mongodb/__init__.py +1 -1
- ingestr/src/mongodb/helpers.py +5 -5
- ingestr/src/notion/__init__.py +55 -0
- ingestr/src/notion/helpers/__init__.py +0 -0
- ingestr/src/notion/helpers/client.py +164 -0
- ingestr/src/notion/helpers/database.py +78 -0
- ingestr/src/notion/settings.py +3 -0
- ingestr/src/sources.py +24 -0
- ingestr/src/sql_database/__init__.py +125 -13
- ingestr/src/sql_database/helpers.py +162 -30
- ingestr/src/sql_database/override.py +9 -0
- ingestr/src/sql_database/schema_types.py +135 -27
- ingestr/src/version.py +1 -1
- ingestr/testdata/test_append.db +0 -0
- ingestr/testdata/test_create_replace.db +0 -0
- ingestr/testdata/test_delete_insert_with_timerange.db +0 -0
- ingestr/testdata/test_delete_insert_without_primary_key.db +0 -0
- ingestr/testdata/test_merge_with_primary_key.db +0 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/METADATA +89 -25
- ingestr-0.3.0.dist-info/RECORD +33 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/WHEEL +1 -1
- ingestr/src/sql_database/settings.py +0 -3
- ingestr-0.2.5.dist-info/RECORD +0 -27
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestr-0.2.5.dist-info → ingestr-0.3.0.dist-info}/licenses/LICENSE.md +0 -0
ingestr/src/sql_database/__init__.py
CHANGED

```diff
@@ -1,28 +1,121 @@
 """Source that loads tables form any SQLAlchemy supported database, supports batching requests and incremental loads."""
 
-from typing import Any, Optional, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import dlt
+from dlt.common.configuration.specs.config_section_context import ConfigSectionContext
 from dlt.sources import DltResource
-from dlt.sources.credentials import ConnectionStringCredentials
 from sqlalchemy import MetaData, Table
 from sqlalchemy.engine import Engine
 
 from .helpers import (
+    SqlDatabaseTableConfiguration,
+    SqlTableResourceConfiguration,
+    TableBackend,
     engine_from_credentials,
     get_primary_key,
     table_rows,
 )
+from .override import IngestrConnectionStringCredentials
 from .schema_types import table_to_columns
 
 
+@dlt.source
+def sql_database(
+    credentials: Union[
+        IngestrConnectionStringCredentials, Engine, str
+    ] = dlt.secrets.value,
+    schema: Optional[str] = dlt.config.value,
+    metadata: Optional[MetaData] = None,
+    table_names: Optional[List[str]] = dlt.config.value,
+    chunk_size: int = 50000,
+    backend: TableBackend = "sqlalchemy",
+    detect_precision_hints: Optional[bool] = dlt.config.value,
+    defer_table_reflect: Optional[bool] = dlt.config.value,
+    table_adapter_callback: Callable[[Table], None] = None,
+    backend_kwargs: Dict[str, Any] = None,
+) -> Iterable[DltResource]:
+    """
+    A dlt source which loads data from an SQL database using SQLAlchemy.
+    Resources are automatically created for each table in the schema or from the given list of tables.
+
+    Args:
+        credentials (Union[IngestrConnectionStringCredentials, Engine, str]): Database credentials or an `sqlalchemy.Engine` instance.
+        schema (Optional[str]): Name of the database schema to load (if different from default).
+        metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. `schema` argument is ignored when this is used.
+        table_names (Optional[List[str]]): A list of table names to load. By default, all tables in the schema are loaded.
+        chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+        backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+            "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+            "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+            "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
+        detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
+            This is disabled by default.
+        defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Requires table_names to be explicitly passed.
+            Enable this option when running on Airflow. Available on dlt 0.4.4 and later.
+        table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+        backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
+    Returns:
+        Iterable[DltResource]: A list of DLT resources for each table to be loaded.
+    """
+
+    # set up alchemy engine
+    engine = engine_from_credentials(credentials)
+    engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
+    metadata = metadata or MetaData(schema=schema)
+
+    # use provided tables or all tables
+    if table_names:
+        tables = [
+            Table(name, metadata, autoload_with=None if defer_table_reflect else engine)
+            for name in table_names
+        ]
+    else:
+        if defer_table_reflect:
+            raise ValueError("You must pass table names to defer table reflection")
+        metadata.reflect(bind=engine)
+        tables = list(metadata.tables.values())
+
+    for table in tables:
+        if table_adapter_callback and not defer_table_reflect:
+            table_adapter_callback(table)
+        yield dlt.resource(
+            table_rows,
+            name=table.name,
+            primary_key=get_primary_key(table),
+            spec=SqlDatabaseTableConfiguration,
+            columns=table_to_columns(table, detect_precision_hints),
+        )(
+            engine,
+            table,
+            chunk_size,
+            backend,
+            detect_precision_hints=detect_precision_hints,
+            defer_table_reflect=defer_table_reflect,
+            table_adapter_callback=table_adapter_callback,
+            backend_kwargs=backend_kwargs,
+        )
+
+
+@dlt.sources.config.with_config(
+    sections=("sources", "sql_database"),
+    spec=SqlTableResourceConfiguration,
+    sections_merge_style=ConfigSectionContext.resource_merge_style,
+)
 def sql_table(
-    credentials: Union[
+    credentials: Union[
+        IngestrConnectionStringCredentials, Engine, str
+    ] = dlt.secrets.value,
     table: str = dlt.config.value,
     schema: Optional[str] = dlt.config.value,
     metadata: Optional[MetaData] = None,
     incremental: Optional[dlt.sources.incremental[Any]] = None,
+    chunk_size: int = 1000,
+    backend: TableBackend = "sqlalchemy",
     detect_precision_hints: Optional[bool] = dlt.config.value,
+    defer_table_reflect: Optional[bool] = dlt.config.value,
+    table_adapter_callback: Callable[[Table], None] = None,
+    backend_kwargs: Dict[str, Any] = None,
     merge_key: Optional[str] = None,
 ) -> DltResource:
     """
@@ -35,26 +128,45 @@ def sql_table(
         metadata (Optional[MetaData]): Optional `sqlalchemy.MetaData` instance. If provided, the `schema` argument is ignored.
         incremental (Optional[dlt.sources.incremental[Any]]): Option to enable incremental loading for the table.
             E.g., `incremental=dlt.sources.incremental('updated_at', pendulum.parse('2022-01-01T00:00:00Z'))`
-
+        chunk_size (int): Number of rows yielded in one batch. SQL Alchemy will create additional internal rows buffer twice the chunk size.
+        backend (TableBackend): Type of backend to generate table data. One of: "sqlalchemy", "pyarrow", "pandas" and "connectorx".
+            "sqlalchemy" yields batches as lists of Python dictionaries, "pyarrow" and "connectorx" yield batches as arrow tables, "pandas" yields panda frames.
+            "sqlalchemy" is the default and does not require additional dependencies, "pyarrow" creates stable destination schemas with correct data types,
+            "connectorx" is typically the fastest but ignores the "chunk_size" so you must deal with large tables yourself.
         detect_precision_hints (bool): Set column precision and scale hints for supported data types in the target schema based on the columns in the source tables.
             This is disabled by default.
+        defer_table_reflect (bool): Will connect and reflect table schema only when yielding data. Enable this option when running on Airflow. Available
+            on dlt 0.4.4 and later
+        table_adapter_callback: (Callable): Receives each reflected table. May be used to modify the list of columns that will be selected.
+        backend_kwargs (**kwargs): kwargs passed to table backend ie. "conn" is used to pass specialized connection string to connectorx.
 
     Returns:
         DltResource: The dlt resource for loading data from the SQL database table.
     """
-
-
-    else:
-        engine = credentials
-    engine.execution_options(stream_results=True)
+    engine = engine_from_credentials(credentials)
+    engine.execution_options(stream_results=True, max_row_buffer=2 * chunk_size)
     metadata = metadata or MetaData(schema=schema)
 
-    table_obj = Table(
+    table_obj = Table(
+        table, metadata, autoload_with=None if defer_table_reflect else engine
+    )
+    if table_adapter_callback and not defer_table_reflect:
+        table_adapter_callback(table_obj)
 
     return dlt.resource(
         table_rows,
         name=table_obj.name,
         primary_key=get_primary_key(table_obj),
-        columns=table_to_columns(table_obj
-        merge_key=merge_key,
-    )(
+        columns=table_to_columns(table_obj, detect_precision_hints),
+        merge_key=merge_key,
+    )(
+        engine,
+        table_obj,
+        chunk_size,
+        backend,
+        incremental=incremental,
+        detect_precision_hints=detect_precision_hints,
+        defer_table_reflect=defer_table_reflect,
+        table_adapter_callback=table_adapter_callback,
+        backend_kwargs=backend_kwargs,
+    )
```
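For context, here is a minimal sketch (not part of the diff) of how the new `sql_table` parameters added above (`backend`, `chunk_size`, `detect_precision_hints`, `incremental`) might be used in a dlt pipeline. The import path follows this package's layout (`ingestr/src/sql_database/`); the connection string, table name, and DuckDB destination are hypothetical.

```python
import dlt

# assumes the wheel's package layout shown in the file list above
from ingestr.src.sql_database import sql_table

# build a resource for a single table, using the new pyarrow backend and
# incremental loading on a cursor column (both described in the docstring above)
events = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # hypothetical DSN
    table="events",  # hypothetical table
    chunk_size=10_000,
    backend="pyarrow",
    detect_precision_hints=True,
    incremental=dlt.sources.incremental("updated_at"),
)

pipeline = dlt.pipeline(
    pipeline_name="sql_to_duckdb",
    destination="duckdb",
    dataset_name="raw",
)
print(pipeline.run(events))
```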
ingestr/src/sql_database/helpers.py
CHANGED

```diff
@@ -3,33 +3,49 @@
 import operator
 from typing import (
     Any,
+    Callable,
+    Dict,
     Iterator,
     List,
+    Literal,
     Optional,
     Union,
 )
 
 import dlt
 from dlt.common.configuration.specs import BaseConfiguration, configspec
+from dlt.common.exceptions import MissingDependencyException
+from dlt.common.schema import TTableSchemaColumns
 from dlt.common.typing import TDataItem
 from dlt.sources.credentials import ConnectionStringCredentials
 from sqlalchemy import Table, create_engine
 from sqlalchemy.engine import Engine
-from sqlalchemy.sql import Select
 
-from .
+from ingestr.src.sql_database.override import IngestrConnectionStringCredentials
+
+from .schema_types import (
+    SelectAny,
+    row_tuples_to_arrow,
+    table_to_columns,
+)
+
+TableBackend = Literal["sqlalchemy", "pyarrow", "pandas", "connectorx"]
 
 
 class TableLoader:
     def __init__(
         self,
         engine: Engine,
+        backend: TableBackend,
         table: Table,
+        columns: TTableSchemaColumns,
         chunk_size: int = 1000,
         incremental: Optional[dlt.sources.incremental[Any]] = None,
     ) -> None:
         self.engine = engine
+        self.backend = backend
         self.table = table
+        self.columns = columns
         self.chunk_size = chunk_size
         self.incremental = incremental
         if incremental:
@@ -40,60 +56,152 @@ class TableLoader:
                     f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'"
                 ) from e
             self.last_value = incremental.last_value
+            self.end_value = incremental.end_value
+            self.row_order = getattr(self.incremental, "row_order", None)
         else:
-            self.cursor_column = None
+            self.cursor_column = None
             self.last_value = None
+            self.end_value = None
+            self.row_order = None
 
-    def make_query(self) ->
+    def make_query(self) -> SelectAny:
         table = self.table
         query = table.select()
         if not self.incremental:
             return query
         last_value_func = self.incremental.last_value_func
+
+        # generate where
         if (
             last_value_func is max
         ):  # Query ordered and filtered according to last_value function
-            order_by = self.cursor_column.asc()
             filter_op = operator.ge
+            filter_op_end = operator.lt
         elif last_value_func is min:
-            order_by = self.cursor_column.desc()
             filter_op = operator.le
+            filter_op_end = operator.gt
         else:  # Custom last_value, load everything and let incremental handle filtering
             return query
-        query = query.order_by(order_by)
-        if self.last_value is None:
-            return query
-        return query.where(filter_op(self.cursor_column, self.last_value))
 
-
+        if self.last_value is not None:
+            query = query.where(filter_op(self.cursor_column, self.last_value))
+        if self.end_value is not None:
+            query = query.where(filter_op_end(self.cursor_column, self.end_value))
+
+        # generate order by from declared row order
+        order_by = None
+        if self.row_order == "asc":
+            order_by = self.cursor_column.asc()
+        elif self.row_order == "desc":
+            order_by = self.cursor_column.desc()
+        if order_by is not None:
+            query = query.order_by(order_by)
+
+        return query
+
+    def load_rows(self, backend_kwargs: Dict[str, Any] = None) -> Iterator[TDataItem]:
+        # make copy of kwargs
+        backend_kwargs = dict(backend_kwargs or {})
         query = self.make_query()
+        if self.backend == "connectorx":
+            yield from self._load_rows_connectorx(query, backend_kwargs)
+        else:
+            yield from self._load_rows(query, backend_kwargs)
+
+    def _load_rows(self, query: SelectAny, backend_kwargs: Dict[str, Any]) -> TDataItem:
         with self.engine.connect() as conn:
             result = conn.execution_options(yield_per=self.chunk_size).execute(query)
+            # NOTE: cursor returns not normalized column names! may be quite useful in case of Oracle dialect
+            # that normalizes columns
+            # columns = [c[0] for c in result.cursor.description]
+            columns = list(result.keys())
             for partition in result.partitions(size=self.chunk_size):
-
+                if self.backend == "sqlalchemy":
+                    yield [dict(row._mapping) for row in partition]
+                elif self.backend == "pandas":
+                    from dlt.common.libs.pandas_sql import _wrap_result
+
+                    yield _wrap_result(
+                        partition,
+                        columns,
+                        **{"dtype_backend": "pyarrow", **backend_kwargs},
+                    )
+                elif self.backend == "pyarrow":
+                    yield row_tuples_to_arrow(
+                        partition, self.columns, tz=backend_kwargs.get("tz")
+                    )
+
+    def _load_rows_connectorx(
+        self, query: SelectAny, backend_kwargs: Dict[str, Any]
+    ) -> Iterator[TDataItem]:
+        try:
+            import connectorx as cx  # type: ignore
+        except ImportError:
+            raise MissingDependencyException(
+                "Connector X table backend", ["connectorx"]
+            )
+
+        # default settings
+        backend_kwargs = {
+            "return_type": "arrow2",
+            "protocol": "binary",
+            **backend_kwargs,
+        }
+        conn = backend_kwargs.pop(
+            "conn",
+            self.engine.url._replace(
+                drivername=self.engine.url.get_backend_name()
+            ).render_as_string(hide_password=False),
+        )
+        df = cx.read_sql(
+            conn,
+            str(query.compile(self.engine, compile_kwargs={"literal_binds": True})),
+            **backend_kwargs,
+        )
+        yield df
 
 
 def table_rows(
     engine: Engine,
     table: Table,
-    chunk_size: int
+    chunk_size: int,
+    backend: TableBackend,
     incremental: Optional[dlt.sources.incremental[Any]] = None,
+    detect_precision_hints: bool = False,
+    defer_table_reflect: bool = False,
+    table_adapter_callback: Callable[[Table], None] = None,
+    backend_kwargs: Dict[str, Any] = None,
 ) -> Iterator[TDataItem]:
-
-
-
+    columns: TTableSchemaColumns = None
+    if defer_table_reflect:
+        table = Table(
+            table.name, table.metadata, autoload_with=engine, extend_existing=True
+        )
+        if table_adapter_callback:
+            table_adapter_callback(table)
+        columns = table_to_columns(table, detect_precision_hints)
 
-
-
-
-
-
+        # set the primary_key in the incremental
+        if incremental and incremental.primary_key is None:
+            primary_key = get_primary_key(table)
+            if primary_key is not None:
+                incremental.primary_key = primary_key
+        # yield empty record to set hints
+        yield dlt.mark.with_hints(
+            [],
+            dlt.mark.make_hints(
+                primary_key=get_primary_key(table),
+                columns=columns,
+            ),
+        )
+    else:
+        # table was already reflected
+        columns = table_to_columns(table, detect_precision_hints)
 
-
-
-
-
-    yield from loader.load_rows()
+    loader = TableLoader(
+        engine, backend, table, columns, incremental=incremental, chunk_size=chunk_size
+    )
+    yield from loader.load_rows(backend_kwargs)
 
 
 def engine_from_credentials(
@@ -107,7 +215,31 @@ def engine_from_credentials(
 
 
 def get_primary_key(table: Table) -> List[str]:
-    return
+    """Create primary key or return None if no key defined"""
+    primary_key = [c.name for c in table.primary_key]
+    return primary_key if len(primary_key) > 0 else None
+
+
+def unwrap_json_connector_x(field: str) -> TDataItem:
+    """Creates a transform function to be added with `add_map` that will unwrap JSON columns
+    ingested via connectorx. Such columns are additionally quoted and translate SQL NULL to json "null"
+    """
+    import pyarrow as pa
+    import pyarrow.compute as pc
+
+    def _unwrap(table: TDataItem) -> TDataItem:
+        col_index = table.column_names.index(field)
+        # remove quotes
+        column = pc.replace_substring_regex(table[field], '"(.*)"', "\\1")
+        # convert json null to null
+        column = pc.replace_with_mask(
+            column,
+            pc.equal(column, "null").combine_chunks(),
+            pa.scalar(None, pa.large_string()),
+        )
+        return table.set_column(col_index, table.schema.field(col_index), column)
+
+    return _unwrap
 
 
 @configspec
@@ -117,10 +249,10 @@ class SqlDatabaseTableConfiguration(BaseConfiguration):
 
 @configspec
 class SqlTableResourceConfiguration(BaseConfiguration):
-    credentials:
-    table: str
+    credentials: IngestrConnectionStringCredentials = None
+    table: str = None
     incremental: Optional[dlt.sources.incremental] = None  # type: ignore[type-arg]
-    schema: Optional[str]
+    schema: Optional[str] = None
 
 
 __source_name__ = "sql_database"
```
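The new connectorx backend and the `unwrap_json_connector_x` helper above suggest a usage pattern like the following. This is a hedged sketch, not code shipped in the package: the table name, JSON column name, and connection string are hypothetical, and `add_map` is dlt's standard per-item transform hook that the helper's docstring refers to.

```python
import dlt

# assumed import paths, following the package layout in the file list
from ingestr.src.sql_database import sql_table
from ingestr.src.sql_database.helpers import unwrap_json_connector_x

payments = sql_table(
    credentials="postgresql://user:password@localhost:5432/mydb",  # hypothetical DSN
    table="payments",  # hypothetical table with a JSON column named "payload"
    backend="connectorx",
)

# connectorx returns JSON columns as quoted text with literal "null" strings;
# the helper unwraps them on each arrow table yielded by the resource
payments.add_map(unwrap_json_connector_x("payload"))

dlt.pipeline(
    pipeline_name="cx_example", destination="duckdb", dataset_name="raw"
).run(payments)
```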
ingestr/src/sql_database/override.py
ADDED

```diff
@@ -0,0 +1,9 @@
+from typing import Optional
+
+from dlt.common.configuration.specs.base_configuration import configspec
+from dlt.sources.credentials import ConnectionStringCredentials
+
+
+@configspec(init=False)
+class IngestrConnectionStringCredentials(ConnectionStringCredentials):
+    username: Optional[str] = None
```
ingestr/src/sql_database/schema_types.py
CHANGED

```diff
@@ -1,54 +1,162 @@
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 
+from dlt.common import logger
+from dlt.common.configuration import with_config
+from dlt.common.destination import DestinationCapabilitiesContext
 from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns
 from sqlalchemy import Column, Table
-from sqlalchemy.
+from sqlalchemy.engine import Row
+from sqlalchemy.sql import Select, sqltypes
+from typing_extensions import TypeAlias
 
+# optionally create generics with any so they can be imported by dlt importer
+if TYPE_CHECKING:
+    SelectAny: TypeAlias = Select[Any]
+    ColumnAny: TypeAlias = Column[Any]
+    RowAny: TypeAlias = Row[Any]
+else:
+    SelectAny: TypeAlias = Type[Any]
+    ColumnAny: TypeAlias = Type[Any]
+    RowAny: TypeAlias = Type[Any]
 
-
+
+def sqla_col_to_column_schema(
+    sql_col: ColumnAny, add_precision: bool = False
+) -> Optional[TColumnSchema]:
     """Infer dlt schema column type from an sqlalchemy type.
 
-
-    such as numeric, varchar, int, bigint
+    If `add_precision` is set, precision and scale is inferred from that types that support it,
+    such as numeric, varchar, int, bigint. Numeric (decimal) types have always precision added.
     """
     sql_t = sql_col.type
-    col =
+    col: TColumnSchema = {
+        "name": sql_col.name,
+        "data_type": None,  # set that later
+        "nullable": sql_col.nullable,
+    }
 
-    if isinstance(sql_t, sqltypes.
-        col =
-
-
+    if isinstance(sql_t, sqltypes.SmallInteger):
+        col["data_type"] = "bigint"
+        if add_precision:
+            col["precision"] = 32
     elif isinstance(sql_t, sqltypes.Integer):
-        col =
-    elif isinstance(sql_t, sqltypes.Numeric)
-
-
-
-
-
-
+        col["data_type"] = "bigint"
+    elif isinstance(sql_t, sqltypes.Numeric):
+        # dlt column type depends on the data returned by the sql alchemy dialect
+        # and not on the metadata reflected in the database. all Numeric types
+        # that are returned as floats will assume "double" type
+        # and returned as decimals will assume "decimal" type
+        if sql_t.asdecimal is False:
+            col["data_type"] = "double"
+        else:
+            col["data_type"] = "decimal"
+            if sql_t.precision is not None:
+                col["precision"] = sql_t.precision
+                # must have a precision for any meaningful scale
+                if sql_t.scale is not None:
+                    col["scale"] = sql_t.scale
+                elif sql_t.decimal_return_scale is not None:
+                    col["scale"] = sql_t.decimal_return_scale
     elif isinstance(sql_t, sqltypes.String):
-        col =
+        col["data_type"] = "text"
+        if add_precision and sql_t.length:
+            col["precision"] = sql_t.length
     elif isinstance(sql_t, sqltypes._Binary):
-        col =
+        col["data_type"] = "binary"
+        if add_precision and sql_t.length:
+            col["precision"] = sql_t.length
     elif isinstance(sql_t, sqltypes.DateTime):
-        col =
+        col["data_type"] = "timestamp"
     elif isinstance(sql_t, sqltypes.Date):
-        col =
+        col["data_type"] = "date"
     elif isinstance(sql_t, sqltypes.Time):
-        col =
+        col["data_type"] = "time"
+    elif isinstance(sql_t, sqltypes.JSON):
+        col["data_type"] = "complex"
+    elif isinstance(sql_t, sqltypes.Boolean):
+        col["data_type"] = "bool"
+    else:
+        logger.warning(
+            f"A column with name {sql_col.name} contains unknown data type {sql_t} which cannot be mapped to `dlt` data type. When using sqlalchemy backend such data will be passed to the normalizer. In case of `pyarrow` backend such data will be ignored. In case of other backends, the behavior is backend-specific."
+        )
+        col = None
     if col:
         return {key: value for key, value in col.items() if value is not None}  # type: ignore[return-value]
     return None
 
 
-def table_to_columns(table: Table) -> TTableSchemaColumns:
+def table_to_columns(table: Table, add_precision: bool = False) -> TTableSchemaColumns:
     """Convert an sqlalchemy table to a dlt table schema.
 
-
+    Adds precision to columns when `add_precision` is set.
     """
     return {
-        col["name"]: col
-        for col in (sqla_col_to_column_schema(c) for c in table.columns)
+        col["name"]: col
+        for col in (sqla_col_to_column_schema(c, add_precision) for c in table.columns)
         if col is not None
     }
+
+
+@with_config
+def columns_to_arrow(
+    columns_schema: TTableSchemaColumns,
+    caps: DestinationCapabilitiesContext = None,
+    tz: str = "UTC",
+) -> Any:
+    """Converts `column_schema` to arrow schema using `caps` and `tz`. `caps` are injected from the container - which
+    is always the case if run within the pipeline. This will generate arrow schema compatible with the destination.
+    Otherwise generic capabilities are used
+    """
+    from dlt.common.destination.capabilities import DestinationCapabilitiesContext
+    from dlt.common.libs.pyarrow import get_py_arrow_datatype
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    return pa.schema(
+        [
+            pa.field(
+                name,
+                get_py_arrow_datatype(
+                    schema_item,
+                    caps or DestinationCapabilitiesContext.generic_capabilities(),
+                    tz,
+                ),
+                nullable=schema_item.get("nullable", True),
+            )
+            for name, schema_item in columns_schema.items()
+        ]
+    )
+
+
+def row_tuples_to_arrow(
+    rows: Sequence[RowAny], columns: TTableSchemaColumns, tz: str
+) -> Any:
+    import numpy as np
+    from dlt.common.libs.pyarrow import pyarrow as pa
+
+    arrow_schema = columns_to_arrow(columns, tz=tz)
+
+    try:
+        from pandas._libs import lib
+
+        pivoted_rows = lib.to_object_array_tuples(rows).T  # type: ignore[attr-defined]
+    except ImportError:
+        logger.info(
+            "Pandas not installed, reverting to numpy.asarray to create a table which is slower"
+        )
+        pivoted_rows = np.asarray(rows, dtype="object", order="k").T  # type: ignore[call-overload]
+
+    columnar = {
+        col: dat.ravel()
+        for col, dat in zip(columns, np.vsplit(pivoted_rows, len(columns)))
+    }
+    for idx in range(0, len(arrow_schema.names)):
+        field = arrow_schema.field(idx)
+        py_type = type(rows[0][idx])
+        # cast double / float ndarrays to decimals if type mismatch, looks like decimals and floats are often mixed up in dialects
+        if pa.types.is_decimal(field.type) and issubclass(py_type, (str, float)):
+            logger.warning(
+                f"Field {field.name} was reflected as decimal type, but rows contains {py_type.__name__}. Additional cast is required which may slow down arrow table generation."
+            )
+            float_array = pa.array(columnar[field.name], type=pa.float64())
+            columnar[field.name] = float_array.cast(field.type, safe=False)
+    return pa.Table.from_pydict(columnar, schema=arrow_schema)
```
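To illustrate the mapping implemented in `sqla_col_to_column_schema` and `table_to_columns` above, here is a small, hedged sketch (not part of the package) that converts a hypothetical SQLAlchemy table definition into a dlt column schema with precision hints enabled.

```python
import sqlalchemy as sa

# assumed import path, following the package layout in the file list
from ingestr.src.sql_database.schema_types import table_to_columns

metadata = sa.MetaData()
users = sa.Table(
    "users",
    metadata,
    sa.Column("id", sa.BigInteger, primary_key=True),
    sa.Column("name", sa.String(64)),
    sa.Column("balance", sa.Numeric(10, 2)),
    sa.Column("created_at", sa.DateTime),
)

# Per the mapping above: id -> bigint, name -> text (precision 64),
# balance -> decimal (precision 10, scale 2), created_at -> timestamp.
print(table_to_columns(users, add_precision=True))
```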
ingestr/src/version.py
CHANGED

```diff
@@ -1 +1 @@
-__version__ = "0.2.5"
+__version__ = "0.3.0"
```
Binary test databases under ingestr/testdata/ (test_append.db, test_create_replace.db, test_delete_insert_with_timerange.db, test_delete_insert_without_primary_key.db, test_merge_with_primary_key.db) changed; binary files, no diff shown.