meerschaum 3.0.0rc1-py3-none-any.whl → 3.0.0rc2-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- meerschaum/_internal/arguments/_parser.py +2 -1
- meerschaum/_internal/docs/index.py +49 -2
- meerschaum/_internal/static.py +8 -24
- meerschaum/actions/verify.py +5 -8
- meerschaum/api/__init__.py +2 -1
- meerschaum/api/dash/__init__.py +0 -2
- meerschaum/api/dash/callbacks/dashboard.py +1 -1
- meerschaum/api/dash/tokens.py +2 -2
- meerschaum/api/routes/_pipes.py +47 -37
- meerschaum/config/_default.py +11 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +9 -8
- meerschaum/connectors/api/_pipes.py +2 -18
- meerschaum/connectors/api/_tokens.py +2 -2
- meerschaum/connectors/instance/_tokens.py +4 -4
- meerschaum/connectors/sql/_create_engine.py +3 -14
- meerschaum/connectors/sql/_pipes.py +118 -163
- meerschaum/connectors/sql/_sql.py +38 -20
- meerschaum/connectors/valkey/_pipes.py +44 -16
- meerschaum/core/Pipe/__init__.py +28 -5
- meerschaum/core/Pipe/_attributes.py +270 -46
- meerschaum/core/Pipe/_data.py +55 -17
- meerschaum/core/Pipe/_dtypes.py +19 -4
- meerschaum/core/Pipe/_edit.py +2 -0
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +90 -160
- meerschaum/core/Pipe/_verify.py +3 -3
- meerschaum/core/Token/_Token.py +3 -4
- meerschaum/utils/dataframe.py +379 -68
- meerschaum/utils/debug.py +15 -15
- meerschaum/utils/dtypes/__init__.py +388 -22
- meerschaum/utils/dtypes/sql.py +326 -30
- meerschaum/utils/misc.py +9 -68
- meerschaum/utils/packages/__init__.py +7 -21
- meerschaum/utils/packages/_packages.py +7 -2
- meerschaum/utils/schedule.py +1 -1
- meerschaum/utils/sql.py +7 -7
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/METADATA +5 -17
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/RECORD +45 -44
- meerschaum-3.0.0rc2.dist-info/licenses/NOTICE +2 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/WHEEL +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/entry_points.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/top_level.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_create_engine.py

```diff
@@ -31,7 +31,6 @@ install_flavor_drivers = {
     'mssql': ['pyodbc'],
     'oracle': ['oracledb'],
 }
-require_patching_flavors = {'cockroachdb': [('sqlalchemy-cockroachdb', 'sqlalchemy_cockroachdb')]}
 
 flavor_dialects = {
     'cockroachdb': (
```
```diff
@@ -63,19 +62,6 @@ def create_engine(
     )
     if self.flavor == 'mssql':
         _init_mssql_sqlalchemy()
-    if self.flavor in require_patching_flavors:
-        from meerschaum.utils.packages import determine_version, _monkey_patch_get_distribution
-        import pathlib
-        for install_name, import_name in require_patching_flavors[self.flavor]:
-            pkg = attempt_import(
-                import_name,
-                debug=debug,
-                lazy=False,
-                warn=False
-            )
-            _monkey_patch_get_distribution(
-                install_name, determine_version(pathlib.Path(pkg.__file__), venv='mrsm')
-            )
 
     ### supplement missing values with defaults (e.g. port number)
     for a, value in flavor_configs[self.flavor]['defaults'].items():
```
```diff
@@ -189,6 +175,9 @@ def _init_mssql_sqlalchemy():
         lazy=False,
         warn=False,
     )
+    if pyodbc is None:
+        raise EnvironmentError("Cannot import pyodbc. Is the MSSQL driver installed?")
+
     pyodbc.pooling = False
 
     MSDialect_pyodbc = sqlalchemy_dialects_mssql_pyodbc.MSDialect_pyodbc
```
meerschaum/connectors/sql/_pipes.py

```diff
@@ -25,7 +25,6 @@ def register_pipe(
     Register a new pipe.
     A pipe's attributes must be set before registering.
     """
-    from meerschaum.utils.debug import dprint
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.sql import json_flavors
 
```
```diff
@@ -170,7 +169,6 @@ def fetch_pipes_keys(
     debug: bool, default False
         Verbosity toggle.
     """
-    from meerschaum.utils.debug import dprint
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.misc import separate_negation_values
     from meerschaum.utils.sql import OMIT_NULLSFIRST_FLAVORS, table_exists
```
```diff
@@ -338,7 +336,6 @@ def create_indices(
     """
     Create a pipe's indices.
     """
-    from meerschaum.utils.debug import dprint
     if debug:
         dprint(f"Creating indices for {pipe}...")
 
```
```diff
@@ -392,7 +389,6 @@ def drop_indices(
     """
     Drop a pipe's indices.
     """
-    from meerschaum.utils.debug import dprint
     if debug:
         dprint(f"Dropping indices for {pipe}...")
 
```
```diff
@@ -1008,6 +1004,8 @@ def get_pipe_data(
     limit: Optional[int] = None,
     begin_add_minutes: int = 0,
     end_add_minutes: int = 0,
+    chunksize: Optional[int] = -1,
+    as_iterator: bool = False,
     debug: bool = False,
     **kw: Any
 ) -> Union[pd.DataFrame, None]:
```
```diff
@@ -1044,14 +1042,17 @@ def get_pipe_data(
         If specified, limit the number of rows retrieved to this value.
 
     begin_add_minutes: int, default 0
-        The number of minutes to add to the `begin` datetime (i.e. `DATEADD`.
+        The number of minutes to add to the `begin` datetime (i.e. `DATEADD`).
 
     end_add_minutes: int, default 0
-        The number of minutes to add to the `end` datetime (i.e. `DATEADD`.
+        The number of minutes to add to the `end` datetime (i.e. `DATEADD`).
 
     chunksize: Optional[int], default -1
         The size of dataframe chunks to load into memory.
 
+    as_iterator: bool, default False
+        If `True`, return the chunks iterator directly.
+
     debug: bool, default False
         Verbosity toggle.
 
```
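The new `chunksize` and `as_iterator` parameters let callers stream a pipe's rows rather than materializing the full result. A minimal usage sketch (the pipe keys and instance name here are hypothetical):

```python
import meerschaum as mrsm

pipe = mrsm.Pipe('plugin:demo', 'weather', instance='sql:main')  # hypothetical keys
conn = pipe.instance_connector

# Default behavior: chunks are read lazily, then concatenated into one DataFrame.
df = conn.get_pipe_data(pipe)

# With as_iterator=True, the chunk generator is returned directly,
# so large tables can be processed without holding every row in memory.
for chunk in conn.get_pipe_data(pipe, chunksize=10_000, as_iterator=True):
    print(len(chunk))
```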
```diff
@@ -1060,43 +1061,58 @@ def get_pipe_data(
     A `pd.DataFrame` of the pipe's data.
 
     """
-    import json
-    from meerschaum.utils.misc import parse_df_datetimes, to_pandas_dtype
+    import functools
     from meerschaum.utils.packages import import_pandas
-    from meerschaum.utils.dtypes import (
-        attempt_cast_to_numeric,
-        attempt_cast_to_uuid,
-        attempt_cast_to_bytes,
-        attempt_cast_to_geometry,
-        are_dtypes_equal,
-    )
+    from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal
     from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
     pd = import_pandas()
     is_dask = 'dask' in pd.__name__
 
     cols_types = pipe.get_columns_types(debug=debug) if pipe.enforce else {}
+    pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug) if pipe.enforce else {}
+
+    remote_pandas_types = {
+        col: to_pandas_dtype(get_pd_type_from_db_type(typ))
+        for col, typ in cols_types.items()
+    }
+    remote_dt_cols_types = {
+        col: typ
+        for col, typ in remote_pandas_types.items()
+        if are_dtypes_equal(typ, 'datetime')
+    }
+    remote_dt_tz_aware_cols_types = {
+        col: typ
+        for col, typ in remote_dt_cols_types.items()
+        if ',' in typ or typ == 'datetime'
+    }
+    remote_dt_tz_naive_cols_types = {
+        col: typ
+        for col, typ in remote_dt_cols_types.items()
+        if col not in remote_dt_tz_aware_cols_types
+    }
+
+    configured_pandas_types = {
+        col: to_pandas_dtype(typ)
+        for col, typ in pipe_dtypes.items()
+    }
+    configured_lower_precision_dt_cols_types = {
+        col: typ
+        for col, typ in pipe_dtypes.items()
+        if (
+            are_dtypes_equal('datetime', typ)
+            and '[' in typ
+            and 'ns' not in typ
+        )
+
+    }
+
     dtypes = {
-        **{
-            col: to_pandas_dtype(get_pd_type_from_db_type(typ))
-            for col, typ in cols_types.items()
-        },
-        **{
-            p_col: to_pandas_dtype(p_typ)
-            for p_col, p_typ in pipe.dtypes.items()
-        },
+        **remote_pandas_types,
+        **configured_pandas_types,
+        **remote_dt_tz_aware_cols_types,
+        **remote_dt_tz_naive_cols_types,
+        **configured_lower_precision_dt_cols_types
     } if pipe.enforce else {}
-    if dtypes:
-        if self.flavor == 'sqlite':
-            if not pipe.columns.get('datetime', None):
-                _dt = pipe.guess_datetime()
-            else:
-                _dt = pipe.get_columns('datetime')
-
-            if _dt:
-                dt_type = dtypes.get(_dt, 'object').lower()
-                if 'datetime' not in dt_type:
-                    if 'int' not in dt_type:
-                        dtypes[_dt] = 'datetime64[ns, UTC]'
 
     existing_cols = cols_types.keys()
     select_columns = (
```
```diff
@@ -1113,13 +1129,20 @@ def get_pipe_data(
             and col not in (omit_columns or [])
         ]
     ) if pipe.enforce else select_columns
+
     if select_columns:
         dtypes = {col: typ for col, typ in dtypes.items() if col in select_columns}
+
     dtypes = {
-        col: to_pandas_dtype(typ)
+        col: typ
         for col, typ in dtypes.items()
-        if col in select_columns and col not in (omit_columns or [])
+        if col in (select_columns or [col]) and col not in (omit_columns or [])
     } if pipe.enforce else {}
+
+    if debug:
+        dprint(f"[{self}] `read()` dtypes:")
+        mrsm.pprint(dtypes)
+
     query = self.get_pipe_data_query(
         pipe,
         select_columns=select_columns,
```
```diff
@@ -1135,91 +1158,25 @@ def get_pipe_data(
         **kw
     )
 
+    read_kwargs = {}
     if is_dask:
         index_col = pipe.columns.get('datetime', None)
-        kw['index_col'] = index_col
-
-    numeric_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ.startswith('numeric') and col in dtypes
-    ]
-    uuid_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ == 'uuid' and col in dtypes
-    ]
-    bytes_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ == 'bytes' and col in dtypes
-    ]
-    geometry_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ.startswith('geometry') and col in dtypes
-    ]
-
-    kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
+        read_kwargs['index_col'] = index_col
 
-    df = self.read(
+    chunks = self.read(
         query,
+        chunksize=chunksize,
+        as_iterator=True,
+        coerce_float=False,
         dtype=dtypes,
         debug=debug,
-        **kw
+        **read_kwargs
     )
-    for col in numeric_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_numeric)
 
-    for col in uuid_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_uuid)
+    if as_iterator:
+        return chunks
 
-    for col in bytes_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_bytes)
-
-    for col in geometry_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_geometry)
-
-    if self.flavor == 'sqlite':
-        ignore_dt_cols = [
-            col
-            for col, dtype in pipe.dtypes.items()
-            if not are_dtypes_equal(str(dtype), 'datetime')
-        ]
-        ### NOTE: We have to consume the iterator here to ensure that datetimes are parsed correctly
-        df = (
-            parse_df_datetimes(
-                df,
-                ignore_cols=ignore_dt_cols,
-                chunksize=kw.get('chunksize', None),
-                strip_timezone=(pipe.tzinfo is None),
-                debug=debug,
-            ) if isinstance(df, pd.DataFrame) else (
-                [
-                    parse_df_datetimes(
-                        c,
-                        ignore_cols=ignore_dt_cols,
-                        chunksize=kw.get('chunksize', None),
-                        strip_timezone=(pipe.tzinfo is None),
-                        debug=debug,
-                    )
-                    for c in df
-                ]
-            )
-        )
-        for col, typ in dtypes.items():
-            if typ != 'json':
-                continue
-            df[col] = df[col].apply(lambda x: json.loads(x) if x is not None else x)
-    return df
+    return pd.concat(chunks)
 
 
 def get_pipe_data_query(
```
```diff
@@ -1552,13 +1509,7 @@ def create_pipe_table_from_df(
     """
     Create a pipe's table from its configured dtypes and an incoming dataframe.
     """
-    from meerschaum.utils.dataframe import (
-        get_json_cols,
-        get_numeric_cols,
-        get_uuid_cols,
-        get_datetime_cols,
-        get_bytes_cols,
-    )
+    from meerschaum.utils.dataframe import get_special_cols
     from meerschaum.utils.sql import (
         get_create_table_queries,
         sql_item_name,
```
```diff
@@ -1587,30 +1538,7 @@ def create_pipe_table_from_df(
             for col_ix, col in pipe.columns.items()
             if col and col_ix != 'primary'
         },
-        **{
-            col: 'uuid'
-            for col in get_uuid_cols(df)
-        },
-        **{
-            col: 'json'
-            for col in get_json_cols(df)
-        },
-        **{
-            col: 'numeric'
-            for col in get_numeric_cols(df)
-        },
-        **{
-            col: 'bytes'
-            for col in get_bytes_cols(df)
-        },
-        **{
-            col: 'datetime64[ns, UTC]'
-            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
-        },
-        **{
-            col: 'datetime64[ns]'
-            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
-        },
+        **get_special_cols(df),
         **pipe.dtypes
     }
     autoincrement = (
```
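Several call sites in this release swap per-dtype helpers for a single `meerschaum.utils.dataframe.get_special_cols()`. Judging from the comprehensions it replaces above, it returns one mapping of "special" columns to their Meerschaum dtype strings; a sketch of the apparent equivalent behavior (the real implementation lives in `meerschaum/utils/dataframe.py`, +379 -68 in this release):

```python
from meerschaum.utils.dataframe import (
    get_json_cols,
    get_numeric_cols,
    get_uuid_cols,
    get_bytes_cols,
    get_datetime_cols,
)

def get_special_cols_sketch(df) -> dict:
    """Map 'special' dataframe columns to Meerschaum dtype strings."""
    return {
        **{col: 'uuid' for col in get_uuid_cols(df)},
        **{col: 'json' for col in get_json_cols(df)},
        **{col: 'numeric' for col in get_numeric_cols(df)},
        **{col: 'bytes' for col in get_bytes_cols(df)},
        **{
            col: 'datetime64[ns, UTC]'
            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
        },
        **{
            col: 'datetime64[ns]'
            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
        },
    }
```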
```diff
@@ -1762,18 +1690,16 @@ def sync_pipe(
             _ = pipe.__dict__.pop('_columns_types', None)
             if not self.exec_queries(alter_cols_queries, debug=debug):
                 warn(f"Failed to alter columns for {pipe}.")
-            else:
-                _ = pipe.infer_dtypes(persist=True)
 
     ### NOTE: Oracle SQL < 23c (2023) and SQLite does not support booleans,
     ### so infer bools and persist them to `dtypes`.
     if self.flavor in ('oracle', 'sqlite', 'mysql', 'mariadb'):
-        pipe_dtypes = pipe.dtypes
+        pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
         new_bool_cols = {
             col: 'bool[pyarrow]'
             for col, typ in df.dtypes.items()
             if col not in pipe_dtypes
-
+            and are_dtypes_equal(str(typ), 'bool')
         }
         pipe_dtypes.update(new_bool_cols)
         pipe.dtypes = pipe_dtypes
```
```diff
@@ -2788,7 +2714,6 @@ def pipe_exists(
         debug=debug,
     )
     if debug:
-        from meerschaum.utils.debug import dprint
         dprint(f"{pipe} " + ('exists.' if exists else 'does not exist.'))
     return exists
 
```
```diff
@@ -3125,11 +3050,17 @@ def get_pipe_columns_types(
         debug=debug,
     )
 
+    if debug:
+        dprint(f"Fetching columns_types for {pipe} with via SQLAlchemy table.")
+
     table_columns = {}
     try:
         pipe_table = self.get_pipe_table(pipe, debug=debug)
         if pipe_table is None:
             return {}
+        if debug:
+            dprint(f"Found columns:")
+            mrsm.pprint(dict(pipe_table.columns))
         for col in pipe_table.columns:
             table_columns[str(col.name)] = str(col.type)
     except Exception as e:
```
```diff
@@ -3321,10 +3252,9 @@ def get_alter_columns_queries(
     -------
     A list of the `ALTER TABLE` SQL query or queries to be executed on the provided connector.
     """
-    if not pipe.exists(debug=debug):
+    if not pipe.exists(debug=debug) or pipe.static:
         return []
-
-    return
+
     from meerschaum.utils.sql import (
         sql_item_name,
         get_table_cols_types,
```
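The added `or pipe.static` short-circuit means static pipes never generate `ALTER TABLE` statements. A hedged sketch of how a pipe might opt in, assuming the `static` flag is set through the pipe's parameters:

```python
import meerschaum as mrsm

# Hypothetical pipe whose table schema is managed outside of Meerschaum:
pipe = mrsm.Pipe(
    'sql:source', 'immutable_table',
    instance='sql:main',
    parameters={'static': True},  # assumption: `pipe.static` reflects this flag
)

# With this change, get_alter_columns_queries() returns [] for the pipe,
# so syncing frames with new or retyped columns won't emit ALTER TABLE.
```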
```diff
@@ -3370,7 +3300,8 @@ def get_alter_columns_queries(
             debug=debug,
         ).items()
     }
-
+    pipe_dtypes = pipe.dtypes
+    pipe_bool_cols = [col for col, typ in pipe_dtypes.items() if are_dtypes_equal(str(typ), 'bool')]
     pd_db_df_aliases = {
         'int': 'bool',
         'float': 'bool',
```
```diff
@@ -3378,7 +3309,10 @@ def get_alter_columns_queries(
         'guid': 'object',
     }
     if self.flavor == 'oracle':
-        pd_db_df_aliases['int'] = 'numeric'
+        pd_db_df_aliases.update({
+            'int': 'numeric',
+            'date': 'datetime',
+        })
 
     altered_cols = {
         col: (db_cols_types.get(col, 'object'), typ)
```
```diff
@@ -3387,6 +3321,10 @@ def get_alter_columns_queries(
         and not are_dtypes_equal(db_cols_types.get(col, 'object'), 'string')
     }
 
+    if debug and altered_cols:
+        dprint(f"Columns to be altered:")
+        mrsm.pprint(altered_cols)
+
     ### NOTE: Sometimes bools are coerced into ints or floats.
     altered_cols_to_ignore = set()
     for col, (db_typ, df_typ) in altered_cols.items():
```
```diff
@@ -3413,13 +3351,20 @@ def get_alter_columns_queries(
         if db_is_bool_compatible and df_is_bool_compatible:
             altered_cols_to_ignore.add(bool_col)
 
+    if debug and altered_cols_to_ignore:
+        dprint(f"Ignoring the following altered columns (false positives).")
+        mrsm.pprint(altered_cols_to_ignore)
+
     for col in altered_cols_to_ignore:
         _ = altered_cols.pop(col, None)
+
     if not altered_cols:
         return []
 
     if numeric_cols:
-        pipe.dtypes.update({col: 'numeric' for col in numeric_cols})
+        explicit_pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
+        explicit_pipe_dtypes.update({col: 'numeric' for col in numeric_cols})
+        pipe.dtypes = explicit_pipe_dtypes
         if not pipe.temporary:
             edit_success, edit_msg = pipe.edit(debug=debug)
             if not edit_success:
```
```diff
@@ -3428,7 +3373,7 @@ def get_alter_columns_queries(
                     + f"{edit_msg}"
                 )
         else:
-            numeric_cols.extend([col for col, typ in pipe.dtypes.items() if typ.startswith('numeric')])
+            numeric_cols.extend([col for col, typ in pipe_dtypes.items() if typ.startswith('numeric')])
 
     numeric_type = get_db_type_from_pd_type('numeric', self.flavor, as_sqlalchemy=False)
     text_type = get_db_type_from_pd_type('str', self.flavor, as_sqlalchemy=False)
```
```diff
@@ -3636,20 +3581,18 @@ def get_to_sql_dtype(
     >>> get_to_sql_dtype(pipe, df)
     {'a': <class 'sqlalchemy.sql.sqltypes.JSON'>}
     """
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import get_special_cols
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     df_dtypes = {
         col: str(typ)
         for col, typ in df.dtypes.items()
     }
-    json_cols = get_json_cols(df)
-    numeric_cols = get_numeric_cols(df)
-    uuid_cols = get_uuid_cols(df)
-    df_dtypes.update({col: 'json' for col in json_cols})
-    df_dtypes.update({col: 'numeric' for col in numeric_cols})
-    df_dtypes.update({col: 'uuid' for col in uuid_cols})
+    special_cols = get_special_cols(df)
+    df_dtypes.update(special_cols)
+
     if update_dtypes:
         df_dtypes.update(pipe.dtypes)
+
     return {
         col: get_db_type_from_pd_type(typ, self.flavor, as_sqlalchemy=True)
         for col, typ in df_dtypes.items()
```
```diff
@@ -3920,3 +3863,15 @@ def get_temporary_target(
         + transact_id
         + ((separator + label) if label else '')
     )
+
+
+def _enforce_pipe_dtypes_chunks_hook(
+    pipe: mrsm.Pipe,
+    chunk_df: 'pd.DataFrame',
+    debug: bool = False,
+    **kwargs
+) -> 'pd.DataFrame':
+    """
+    Enforce a pipe's dtypes on each chunk.
+    """
+    return pipe.enforce_dtypes(chunk_df, debug=debug)
```
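The new module-level helper is shaped like a `chunk_hook` for `SQLConnector.read()` (each chunk is passed as the first positional argument, along with `workers`, `chunksize`, and `debug` keywords). A hedged sketch of how it might be wired up; the `functools.partial` binding and the query are assumptions:

```python
import functools
import meerschaum as mrsm

pipe = mrsm.Pipe('plugin:demo', 'weather', instance='sql:main')  # hypothetical keys
conn = pipe.instance_connector

# Bind the pipe so the helper matches the chunk_hook(chunk, **kwargs) shape.
enforce_hook = functools.partial(_enforce_pipe_dtypes_chunks_hook, pipe)

results = conn.read(
    f"SELECT * FROM {pipe.target}",
    chunk_hook=enforce_hook,
    chunksize=10_000,
    as_hook_results=True,  # collect the hook's return values per chunk
)
```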
meerschaum/connectors/sql/_sql.py

```diff
@@ -131,23 +131,28 @@ def read(
     """
     if chunks is not None and chunks <= 0:
         return []
+
     from meerschaum.utils.sql import sql_item_name, truncate_item_name
     from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
     from meerschaum.utils.dtypes.sql import TIMEZONE_NAIVE_FLAVORS
     from meerschaum.utils.packages import attempt_import, import_pandas
     from meerschaum.utils.pool import get_pool
     from meerschaum.utils.dataframe import chunksize_to_npartitions, get_numeric_cols
+    from meerschaum.utils.misc import filter_arguments
     import warnings
     import traceback
     from decimal import Decimal
+
     pd = import_pandas()
     dd = None
+
     is_dask = 'dask' in pd.__name__
     pandas = attempt_import('pandas')
     is_dask = dd is not None
     npartitions = chunksize_to_npartitions(chunksize)
     if is_dask:
         chunksize = None
+
     schema = schema or self.schema
     utc_dt_cols = [
         col
```
```diff
@@ -158,7 +163,7 @@ def read(
     if dtype and utc_dt_cols and self.flavor in TIMEZONE_NAIVE_FLAVORS:
         dtype = dtype.copy()
         for col in utc_dt_cols:
-            dtype[col] = 'datetime64[ns]'
+            dtype[col] = 'datetime64[us]'
 
     pool = get_pool(workers=workers)
     sqlalchemy = attempt_import("sqlalchemy", lazy=False)
```
```diff
@@ -222,26 +227,33 @@ def read(
         else format_sql_query_for_dask(str_query)
     )
 
+    def _get_chunk_args_kwargs(_chunk):
+        return filter_arguments(
+            chunk_hook,
+            _chunk,
+            workers=workers,
+            chunksize=chunksize,
+            debug=debug,
+            **kw
+        )
+
     chunk_list = []
     chunk_hook_results = []
     def _process_chunk(_chunk, _retry_on_failure: bool = True):
         if self.flavor in TIMEZONE_NAIVE_FLAVORS:
             for col in utc_dt_cols:
-                _chunk[col] = coerce_timezone(_chunk[col], strip_utc=True)
+                _chunk[col] = coerce_timezone(_chunk[col], strip_utc=False)
         if not as_hook_results:
             chunk_list.append(_chunk)
+
         if chunk_hook is None:
             return None
 
+        chunk_args, chunk_kwargs = _get_chunk_args_kwargs(_chunk)
+
         result = None
         try:
-            result = chunk_hook(
-                _chunk,
-                workers=workers,
-                chunksize=chunksize,
-                debug=debug,
-                **kw
-            )
+            result = chunk_hook(*chunk_args, **chunk_kwargs)
         except Exception:
             result = False, traceback.format_exc()
             from meerschaum.utils.formatting import get_console
```
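`filter_arguments` (imported from `meerschaum.utils.misc` above) lets `read()` call arbitrary hooks without passing keywords their signatures can't accept. This is a rough sketch of the idea, not the actual implementation:

```python
import inspect
from typing import Any, Callable, Dict, Tuple

def filter_arguments_sketch(
    func: Callable[..., Any],
    *args: Any,
    **kwargs: Any,
) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
    """Return (args, kwargs) with keywords `func` cannot accept removed."""
    sig = inspect.signature(func)
    # If the callable declares **kwargs, everything may pass through.
    accepts_var_kw = any(
        param.kind == inspect.Parameter.VAR_KEYWORD
        for param in sig.parameters.values()
    )
    if accepts_var_kw:
        return args, kwargs
    return args, {key: val for key, val in kwargs.items() if key in sig.parameters}
```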
```diff
@@ -292,8 +304,16 @@ def read(
                 self.engine,
                 **read_sql_query_kwargs
             )
+
         to_return = (
-            chunk_generator
+            (
+                chunk_generator
+                if not (as_hook_results or chunksize is None)
+                else (
+                    _process_chunk(_chunk)
+                    for _chunk in chunk_generator
+                )
+            )
             if as_iterator or chunksize is None
             else (
                 list(pool.imap(_process_chunk, chunk_generator))
```
```diff
@@ -339,9 +359,8 @@ def read(
         try:
             for chunk in chunk_generator:
                 if chunk_hook is not None:
-                    chunk_hook_results.append(
-                        chunk_hook(chunk, workers=workers, chunksize=chunksize, debug=debug, **kw)
-                    )
+                    chunk_args, chunk_kwargs = _get_chunk_args_kwargs(chunk)
+                    chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
                 chunk_list.append(chunk)
                 read_chunks += 1
                 if chunks is not None and read_chunks >= chunks:
```
```diff
@@ -356,9 +375,8 @@ def read(
         try:
             for chunk in chunk_generator:
                 if chunk_hook is not None:
-                    chunk_hook_results.append(
-                        chunk_hook(chunk, workers=workers, chunksize=chunksize, debug=debug, **kw)
-                    )
+                    chunk_args, chunk_kwargs = _get_chunk_args_kwargs(chunk)
+                    chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
                 chunk_list.append(chunk)
                 read_chunks += 1
                 if chunks is not None and read_chunks >= chunks:
```
```diff
@@ -389,9 +407,8 @@ def read(
     ### call the hook on any missed chunks.
     if chunk_hook is not None and len(chunk_list) > len(chunk_hook_results):
         for c in chunk_list[len(chunk_hook_results):]:
-            chunk_hook_results.append(
-                chunk_hook(c, workers=workers, chunksize=chunksize, debug=debug, **kw)
-            )
+            chunk_args, chunk_kwargs = _get_chunk_args_kwargs(c)
+            chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
 
     ### chunksize is not None so must iterate
     if debug:
```
```diff
@@ -784,6 +801,7 @@ def to_sql(
     from meerschaum.utils.warnings import error, warn
     import warnings
     import functools
+    import traceback
 
     if name is None:
         error(f"Name must not be `None` to insert data into {self}.")
```
```diff
@@ -1057,7 +1075,7 @@ def to_sql(
     except Exception as e:
         if not silent:
             warn(str(e))
-        success, msg = False, str(e)
+        success, msg = False, traceback.format_exc()
 
     end = time.perf_counter()
     if success:
```