meerschaum 3.0.0rc1__py3-none-any.whl → 3.0.0rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +2 -1
- meerschaum/_internal/docs/index.py +49 -2
- meerschaum/_internal/shell/Shell.py +5 -4
- meerschaum/_internal/static.py +8 -24
- meerschaum/actions/bootstrap.py +1 -1
- meerschaum/actions/edit.py +6 -3
- meerschaum/actions/start.py +1 -1
- meerschaum/actions/verify.py +5 -8
- meerschaum/api/__init__.py +2 -1
- meerschaum/api/dash/__init__.py +0 -2
- meerschaum/api/dash/callbacks/__init__.py +1 -0
- meerschaum/api/dash/callbacks/dashboard.py +20 -19
- meerschaum/api/dash/callbacks/jobs.py +11 -5
- meerschaum/api/dash/callbacks/pipes.py +106 -5
- meerschaum/api/dash/callbacks/settings/__init__.py +0 -1
- meerschaum/api/dash/callbacks/{settings/tokens.py → tokens.py} +1 -1
- meerschaum/api/dash/jobs.py +1 -1
- meerschaum/api/dash/pages/__init__.py +2 -1
- meerschaum/api/dash/pages/{job.py → jobs.py} +10 -7
- meerschaum/api/dash/pages/pipes.py +4 -3
- meerschaum/api/dash/pages/settings/__init__.py +0 -1
- meerschaum/api/dash/pages/{settings/tokens.py → tokens.py} +6 -8
- meerschaum/api/dash/pipes.py +131 -0
- meerschaum/api/dash/tokens.py +28 -31
- meerschaum/api/routes/_pipes.py +47 -37
- meerschaum/config/_default.py +13 -2
- meerschaum/config/_paths.py +1 -0
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +9 -8
- meerschaum/connectors/api/_pipes.py +2 -18
- meerschaum/connectors/api/_tokens.py +2 -2
- meerschaum/connectors/instance/_tokens.py +10 -6
- meerschaum/connectors/sql/_SQLConnector.py +14 -0
- meerschaum/connectors/sql/_create_engine.py +3 -14
- meerschaum/connectors/sql/_pipes.py +175 -185
- meerschaum/connectors/sql/_sql.py +38 -20
- meerschaum/connectors/sql/tables/__init__.py +237 -122
- meerschaum/connectors/valkey/_pipes.py +44 -16
- meerschaum/core/Pipe/__init__.py +28 -5
- meerschaum/core/Pipe/_attributes.py +273 -46
- meerschaum/core/Pipe/_data.py +55 -17
- meerschaum/core/Pipe/_dtypes.py +19 -4
- meerschaum/core/Pipe/_edit.py +2 -0
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +90 -160
- meerschaum/core/Pipe/_verify.py +3 -3
- meerschaum/core/Token/_Token.py +4 -5
- meerschaum/plugins/bootstrap.py +508 -3
- meerschaum/utils/_get_pipes.py +1 -1
- meerschaum/utils/dataframe.py +385 -68
- meerschaum/utils/debug.py +15 -15
- meerschaum/utils/dtypes/__init__.py +387 -22
- meerschaum/utils/dtypes/sql.py +327 -31
- meerschaum/utils/misc.py +9 -68
- meerschaum/utils/packages/__init__.py +7 -21
- meerschaum/utils/packages/_packages.py +7 -2
- meerschaum/utils/schedule.py +1 -1
- meerschaum/utils/sql.py +8 -8
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/METADATA +5 -17
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/RECORD +66 -65
- meerschaum-3.0.0rc3.dist-info/licenses/NOTICE +2 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/WHEEL +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/entry_points.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/licenses/LICENSE +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/top_level.txt +0 -0
- {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc3.dist-info}/zip-safe +0 -0
@@ -25,7 +25,6 @@ def register_pipe(
|
|
25
25
|
Register a new pipe.
|
26
26
|
A pipe's attributes must be set before registering.
|
27
27
|
"""
|
28
|
-
from meerschaum.utils.debug import dprint
|
29
28
|
from meerschaum.utils.packages import attempt_import
|
30
29
|
from meerschaum.utils.sql import json_flavors
|
31
30
|
|
@@ -148,7 +147,7 @@ def fetch_pipes_keys(
|
|
148
147
|
tags: Optional[List[str]] = None,
|
149
148
|
params: Optional[Dict[str, Any]] = None,
|
150
149
|
debug: bool = False
|
151
|
-
) ->
|
150
|
+
) -> List[Tuple[str, str, Optional[str]]]:
|
152
151
|
"""
|
153
152
|
Return a list of tuples corresponding to the parameters provided.
|
154
153
|
|
@@ -163,17 +162,27 @@ def fetch_pipes_keys(
|
|
163
162
|
location_keys: Optional[List[str]], default None
|
164
163
|
List of location_keys to search by.
|
165
164
|
|
165
|
+
tags: Optional[List[str]], default None
|
166
|
+
List of pipes to search by.
|
167
|
+
|
166
168
|
params: Optional[Dict[str, Any]], default None
|
167
169
|
Dictionary of additional parameters to search by.
|
168
170
|
E.g. `--params pipe_id:1`
|
169
171
|
|
170
172
|
debug: bool, default False
|
171
173
|
Verbosity toggle.
|
174
|
+
|
175
|
+
Returns
|
176
|
+
-------
|
177
|
+
A list of tuples of pipes' keys (connector_keys, metric_key, location_key).
|
172
178
|
"""
|
173
|
-
from meerschaum.utils.debug import dprint
|
174
179
|
from meerschaum.utils.packages import attempt_import
|
175
180
|
from meerschaum.utils.misc import separate_negation_values
|
176
|
-
from meerschaum.utils.sql import
|
181
|
+
from meerschaum.utils.sql import (
|
182
|
+
OMIT_NULLSFIRST_FLAVORS,
|
183
|
+
table_exists,
|
184
|
+
json_flavors,
|
185
|
+
)
|
177
186
|
from meerschaum._internal.static import STATIC_CONFIG
|
178
187
|
import json
|
179
188
|
from copy import deepcopy
|
@@ -261,25 +270,49 @@ def fetch_pipes_keys(
|
|
261
270
|
in_ex_tag_groups = [separate_negation_values(tag_group) for tag_group in tag_groups]
|
262
271
|
|
263
272
|
ors, nands = [], []
|
264
|
-
|
265
|
-
|
266
|
-
for
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
273
|
+
if self.flavor in json_flavors:
|
274
|
+
from sqlalchemy.dialects import postgresql
|
275
|
+
for _in_tags, _ex_tags in in_ex_tag_groups:
|
276
|
+
if _in_tags:
|
277
|
+
ors.append(
|
278
|
+
sqlalchemy.and_(
|
279
|
+
pipes_tbl.c['parameters'].cast(postgresql.JSONB).has_key('tags'),
|
280
|
+
pipes_tbl.c['parameters']['tags'].cast(
|
281
|
+
postgresql.JSONB
|
282
|
+
).contains(_in_tags)
|
283
|
+
)
|
284
|
+
)
|
285
|
+
for xt in _ex_tags:
|
286
|
+
nands.append(
|
287
|
+
sqlalchemy.not_(
|
288
|
+
sqlalchemy.and_(
|
289
|
+
pipes_tbl.c['parameters'].cast(postgresql.JSONB).has_key('tags'),
|
290
|
+
pipes_tbl.c['parameters']['tags'].cast(
|
291
|
+
postgresql.JSONB
|
292
|
+
).contains([xt])
|
293
|
+
)
|
294
|
+
)
|
295
|
+
)
|
296
|
+
else:
|
297
|
+
for _in_tags, _ex_tags in in_ex_tag_groups:
|
298
|
+
sub_ands = []
|
299
|
+
for nt in _in_tags:
|
300
|
+
sub_ands.append(
|
301
|
+
sqlalchemy.cast(
|
302
|
+
pipes_tbl.c['parameters'],
|
303
|
+
sqlalchemy.String,
|
304
|
+
).like(f'%"tags":%"{nt}"%')
|
305
|
+
)
|
306
|
+
if sub_ands:
|
307
|
+
ors.append(sqlalchemy.and_(*sub_ands))
|
308
|
+
|
309
|
+
for xt in _ex_tags:
|
310
|
+
nands.append(
|
311
|
+
sqlalchemy.cast(
|
312
|
+
pipes_tbl.c['parameters'],
|
313
|
+
sqlalchemy.String,
|
314
|
+
).not_like(f'%"tags":%"{xt}"%')
|
315
|
+
)
|
283
316
|
|
284
317
|
q = q.where(sqlalchemy.and_(*nands)) if nands else q
|
285
318
|
q = q.where(sqlalchemy.or_(*ors)) if ors else q
|
@@ -294,7 +327,7 @@ def fetch_pipes_keys(
|
|
294
327
|
|
295
328
|
### execute the query and return a list of tuples
|
296
329
|
if debug:
|
297
|
-
dprint(q
|
330
|
+
dprint(q)
|
298
331
|
try:
|
299
332
|
rows = (
|
300
333
|
self.execute(q).fetchall()
|
@@ -338,7 +371,6 @@ def create_indices(
|
|
338
371
|
"""
|
339
372
|
Create a pipe's indices.
|
340
373
|
"""
|
341
|
-
from meerschaum.utils.debug import dprint
|
342
374
|
if debug:
|
343
375
|
dprint(f"Creating indices for {pipe}...")
|
344
376
|
|
@@ -392,7 +424,6 @@ def drop_indices(
|
|
392
424
|
"""
|
393
425
|
Drop a pipe's indices.
|
394
426
|
"""
|
395
|
-
from meerschaum.utils.debug import dprint
|
396
427
|
if debug:
|
397
428
|
dprint(f"Dropping indices for {pipe}...")
|
398
429
|
|
@@ -1008,6 +1039,8 @@ def get_pipe_data(
|
|
1008
1039
|
limit: Optional[int] = None,
|
1009
1040
|
begin_add_minutes: int = 0,
|
1010
1041
|
end_add_minutes: int = 0,
|
1042
|
+
chunksize: Optional[int] = -1,
|
1043
|
+
as_iterator: bool = False,
|
1011
1044
|
debug: bool = False,
|
1012
1045
|
**kw: Any
|
1013
1046
|
) -> Union[pd.DataFrame, None]:
|
@@ -1044,14 +1077,17 @@ def get_pipe_data(
|
|
1044
1077
|
If specified, limit the number of rows retrieved to this value.
|
1045
1078
|
|
1046
1079
|
begin_add_minutes: int, default 0
|
1047
|
-
The number of minutes to add to the `begin` datetime (i.e. `DATEADD
|
1080
|
+
The number of minutes to add to the `begin` datetime (i.e. `DATEADD`).
|
1048
1081
|
|
1049
1082
|
end_add_minutes: int, default 0
|
1050
|
-
The number of minutes to add to the `end` datetime (i.e. `DATEADD
|
1083
|
+
The number of minutes to add to the `end` datetime (i.e. `DATEADD`).
|
1051
1084
|
|
1052
1085
|
chunksize: Optional[int], default -1
|
1053
1086
|
The size of dataframe chunks to load into memory.
|
1054
1087
|
|
1088
|
+
as_iterator: bool, default False
|
1089
|
+
If `True`, return the chunks iterator directly.
|
1090
|
+
|
1055
1091
|
debug: bool, default False
|
1056
1092
|
Verbosity toggle.
|
1057
1093
|
|
@@ -1060,43 +1096,58 @@ def get_pipe_data(
|
|
1060
1096
|
A `pd.DataFrame` of the pipe's data.
|
1061
1097
|
|
1062
1098
|
"""
|
1063
|
-
import
|
1064
|
-
from meerschaum.utils.misc import parse_df_datetimes, to_pandas_dtype
|
1099
|
+
import functools
|
1065
1100
|
from meerschaum.utils.packages import import_pandas
|
1066
|
-
from meerschaum.utils.dtypes import
|
1067
|
-
attempt_cast_to_numeric,
|
1068
|
-
attempt_cast_to_uuid,
|
1069
|
-
attempt_cast_to_bytes,
|
1070
|
-
attempt_cast_to_geometry,
|
1071
|
-
are_dtypes_equal,
|
1072
|
-
)
|
1101
|
+
from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal
|
1073
1102
|
from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
|
1074
1103
|
pd = import_pandas()
|
1075
1104
|
is_dask = 'dask' in pd.__name__
|
1076
1105
|
|
1077
1106
|
cols_types = pipe.get_columns_types(debug=debug) if pipe.enforce else {}
|
1107
|
+
pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug) if pipe.enforce else {}
|
1108
|
+
|
1109
|
+
remote_pandas_types = {
|
1110
|
+
col: to_pandas_dtype(get_pd_type_from_db_type(typ))
|
1111
|
+
for col, typ in cols_types.items()
|
1112
|
+
}
|
1113
|
+
remote_dt_cols_types = {
|
1114
|
+
col: typ
|
1115
|
+
for col, typ in remote_pandas_types.items()
|
1116
|
+
if are_dtypes_equal(typ, 'datetime')
|
1117
|
+
}
|
1118
|
+
remote_dt_tz_aware_cols_types = {
|
1119
|
+
col: typ
|
1120
|
+
for col, typ in remote_dt_cols_types.items()
|
1121
|
+
if ',' in typ or typ == 'datetime'
|
1122
|
+
}
|
1123
|
+
remote_dt_tz_naive_cols_types = {
|
1124
|
+
col: typ
|
1125
|
+
for col, typ in remote_dt_cols_types.items()
|
1126
|
+
if col not in remote_dt_tz_aware_cols_types
|
1127
|
+
}
|
1128
|
+
|
1129
|
+
configured_pandas_types = {
|
1130
|
+
col: to_pandas_dtype(typ)
|
1131
|
+
for col, typ in pipe_dtypes.items()
|
1132
|
+
}
|
1133
|
+
configured_lower_precision_dt_cols_types = {
|
1134
|
+
col: typ
|
1135
|
+
for col, typ in pipe_dtypes.items()
|
1136
|
+
if (
|
1137
|
+
are_dtypes_equal('datetime', typ)
|
1138
|
+
and '[' in typ
|
1139
|
+
and 'ns' not in typ
|
1140
|
+
)
|
1141
|
+
|
1142
|
+
}
|
1143
|
+
|
1078
1144
|
dtypes = {
|
1079
|
-
**
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
**
|
1084
|
-
p_col: to_pandas_dtype(p_typ)
|
1085
|
-
for p_col, p_typ in pipe.dtypes.items()
|
1086
|
-
},
|
1145
|
+
**remote_pandas_types,
|
1146
|
+
**configured_pandas_types,
|
1147
|
+
**remote_dt_tz_aware_cols_types,
|
1148
|
+
**remote_dt_tz_naive_cols_types,
|
1149
|
+
**configured_lower_precision_dt_cols_types
|
1087
1150
|
} if pipe.enforce else {}
|
1088
|
-
if dtypes:
|
1089
|
-
if self.flavor == 'sqlite':
|
1090
|
-
if not pipe.columns.get('datetime', None):
|
1091
|
-
_dt = pipe.guess_datetime()
|
1092
|
-
else:
|
1093
|
-
_dt = pipe.get_columns('datetime')
|
1094
|
-
|
1095
|
-
if _dt:
|
1096
|
-
dt_type = dtypes.get(_dt, 'object').lower()
|
1097
|
-
if 'datetime' not in dt_type:
|
1098
|
-
if 'int' not in dt_type:
|
1099
|
-
dtypes[_dt] = 'datetime64[ns, UTC]'
|
1100
1151
|
|
1101
1152
|
existing_cols = cols_types.keys()
|
1102
1153
|
select_columns = (
|
@@ -1113,13 +1164,20 @@ def get_pipe_data(
|
|
1113
1164
|
and col not in (omit_columns or [])
|
1114
1165
|
]
|
1115
1166
|
) if pipe.enforce else select_columns
|
1167
|
+
|
1116
1168
|
if select_columns:
|
1117
1169
|
dtypes = {col: typ for col, typ in dtypes.items() if col in select_columns}
|
1170
|
+
|
1118
1171
|
dtypes = {
|
1119
|
-
col:
|
1172
|
+
col: typ
|
1120
1173
|
for col, typ in dtypes.items()
|
1121
|
-
if col in select_columns and col not in (omit_columns or [])
|
1174
|
+
if col in (select_columns or [col]) and col not in (omit_columns or [])
|
1122
1175
|
} if pipe.enforce else {}
|
1176
|
+
|
1177
|
+
if debug:
|
1178
|
+
dprint(f"[{self}] `read()` dtypes:")
|
1179
|
+
mrsm.pprint(dtypes)
|
1180
|
+
|
1123
1181
|
query = self.get_pipe_data_query(
|
1124
1182
|
pipe,
|
1125
1183
|
select_columns=select_columns,
|
@@ -1135,91 +1193,25 @@ def get_pipe_data(
|
|
1135
1193
|
**kw
|
1136
1194
|
)
|
1137
1195
|
|
1196
|
+
read_kwargs = {}
|
1138
1197
|
if is_dask:
|
1139
1198
|
index_col = pipe.columns.get('datetime', None)
|
1140
|
-
|
1199
|
+
read_kwargs['index_col'] = index_col
|
1141
1200
|
|
1142
|
-
|
1143
|
-
col
|
1144
|
-
for col, typ in pipe.dtypes.items()
|
1145
|
-
if typ.startswith('numeric') and col in dtypes
|
1146
|
-
]
|
1147
|
-
uuid_columns = [
|
1148
|
-
col
|
1149
|
-
for col, typ in pipe.dtypes.items()
|
1150
|
-
if typ == 'uuid' and col in dtypes
|
1151
|
-
]
|
1152
|
-
bytes_columns = [
|
1153
|
-
col
|
1154
|
-
for col, typ in pipe.dtypes.items()
|
1155
|
-
if typ == 'bytes' and col in dtypes
|
1156
|
-
]
|
1157
|
-
geometry_columns = [
|
1158
|
-
col
|
1159
|
-
for col, typ in pipe.dtypes.items()
|
1160
|
-
if typ.startswith('geometry') and col in dtypes
|
1161
|
-
]
|
1162
|
-
|
1163
|
-
kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
|
1164
|
-
|
1165
|
-
df = self.read(
|
1201
|
+
chunks = self.read(
|
1166
1202
|
query,
|
1203
|
+
chunksize=chunksize,
|
1204
|
+
as_iterator=True,
|
1205
|
+
coerce_float=False,
|
1167
1206
|
dtype=dtypes,
|
1168
1207
|
debug=debug,
|
1169
|
-
**
|
1208
|
+
**read_kwargs
|
1170
1209
|
)
|
1171
|
-
for col in numeric_columns:
|
1172
|
-
if col not in df.columns:
|
1173
|
-
continue
|
1174
|
-
df[col] = df[col].apply(attempt_cast_to_numeric)
|
1175
1210
|
|
1176
|
-
|
1177
|
-
|
1178
|
-
continue
|
1179
|
-
df[col] = df[col].apply(attempt_cast_to_uuid)
|
1180
|
-
|
1181
|
-
for col in bytes_columns:
|
1182
|
-
if col not in df.columns:
|
1183
|
-
continue
|
1184
|
-
df[col] = df[col].apply(attempt_cast_to_bytes)
|
1211
|
+
if as_iterator:
|
1212
|
+
return chunks
|
1185
1213
|
|
1186
|
-
|
1187
|
-
if col not in df.columns:
|
1188
|
-
continue
|
1189
|
-
df[col] = df[col].apply(attempt_cast_to_geometry)
|
1190
|
-
|
1191
|
-
if self.flavor == 'sqlite':
|
1192
|
-
ignore_dt_cols = [
|
1193
|
-
col
|
1194
|
-
for col, dtype in pipe.dtypes.items()
|
1195
|
-
if not are_dtypes_equal(str(dtype), 'datetime')
|
1196
|
-
]
|
1197
|
-
### NOTE: We have to consume the iterator here to ensure that datetimes are parsed correctly
|
1198
|
-
df = (
|
1199
|
-
parse_df_datetimes(
|
1200
|
-
df,
|
1201
|
-
ignore_cols=ignore_dt_cols,
|
1202
|
-
chunksize=kw.get('chunksize', None),
|
1203
|
-
strip_timezone=(pipe.tzinfo is None),
|
1204
|
-
debug=debug,
|
1205
|
-
) if isinstance(df, pd.DataFrame) else (
|
1206
|
-
[
|
1207
|
-
parse_df_datetimes(
|
1208
|
-
c,
|
1209
|
-
ignore_cols=ignore_dt_cols,
|
1210
|
-
chunksize=kw.get('chunksize', None),
|
1211
|
-
strip_timezone=(pipe.tzinfo is None),
|
1212
|
-
debug=debug,
|
1213
|
-
)
|
1214
|
-
for c in df
|
1215
|
-
]
|
1216
|
-
)
|
1217
|
-
)
|
1218
|
-
for col, typ in dtypes.items():
|
1219
|
-
if typ != 'json':
|
1220
|
-
continue
|
1221
|
-
df[col] = df[col].apply(lambda x: json.loads(x) if x is not None else x)
|
1222
|
-
return df
|
1214
|
+
return pd.concat(chunks)
|
1223
1215
|
|
1224
1216
|
|
1225
1217
|
def get_pipe_data_query(
|
@@ -1552,13 +1544,7 @@ def create_pipe_table_from_df(
|
|
1552
1544
|
"""
|
1553
1545
|
Create a pipe's table from its configured dtypes and an incoming dataframe.
|
1554
1546
|
"""
|
1555
|
-
from meerschaum.utils.dataframe import
|
1556
|
-
get_json_cols,
|
1557
|
-
get_numeric_cols,
|
1558
|
-
get_uuid_cols,
|
1559
|
-
get_datetime_cols,
|
1560
|
-
get_bytes_cols,
|
1561
|
-
)
|
1547
|
+
from meerschaum.utils.dataframe import get_special_cols
|
1562
1548
|
from meerschaum.utils.sql import (
|
1563
1549
|
get_create_table_queries,
|
1564
1550
|
sql_item_name,
|
@@ -1587,30 +1573,7 @@ def create_pipe_table_from_df(
|
|
1587
1573
|
for col_ix, col in pipe.columns.items()
|
1588
1574
|
if col and col_ix != 'primary'
|
1589
1575
|
},
|
1590
|
-
**
|
1591
|
-
col: 'uuid'
|
1592
|
-
for col in get_uuid_cols(df)
|
1593
|
-
},
|
1594
|
-
**{
|
1595
|
-
col: 'json'
|
1596
|
-
for col in get_json_cols(df)
|
1597
|
-
},
|
1598
|
-
**{
|
1599
|
-
col: 'numeric'
|
1600
|
-
for col in get_numeric_cols(df)
|
1601
|
-
},
|
1602
|
-
**{
|
1603
|
-
col: 'bytes'
|
1604
|
-
for col in get_bytes_cols(df)
|
1605
|
-
},
|
1606
|
-
**{
|
1607
|
-
col: 'datetime64[ns, UTC]'
|
1608
|
-
for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
|
1609
|
-
},
|
1610
|
-
**{
|
1611
|
-
col: 'datetime64[ns]'
|
1612
|
-
for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
|
1613
|
-
},
|
1576
|
+
**get_special_cols(df),
|
1614
1577
|
**pipe.dtypes
|
1615
1578
|
}
|
1616
1579
|
autoincrement = (
|
@@ -1762,18 +1725,16 @@ def sync_pipe(
|
|
1762
1725
|
_ = pipe.__dict__.pop('_columns_types', None)
|
1763
1726
|
if not self.exec_queries(alter_cols_queries, debug=debug):
|
1764
1727
|
warn(f"Failed to alter columns for {pipe}.")
|
1765
|
-
else:
|
1766
|
-
_ = pipe.infer_dtypes(persist=True)
|
1767
1728
|
|
1768
1729
|
### NOTE: Oracle SQL < 23c (2023) and SQLite does not support booleans,
|
1769
1730
|
### so infer bools and persist them to `dtypes`.
|
1770
1731
|
if self.flavor in ('oracle', 'sqlite', 'mysql', 'mariadb'):
|
1771
|
-
pipe_dtypes = pipe.
|
1732
|
+
pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
|
1772
1733
|
new_bool_cols = {
|
1773
1734
|
col: 'bool[pyarrow]'
|
1774
1735
|
for col, typ in df.dtypes.items()
|
1775
1736
|
if col not in pipe_dtypes
|
1776
|
-
|
1737
|
+
and are_dtypes_equal(str(typ), 'bool')
|
1777
1738
|
}
|
1778
1739
|
pipe_dtypes.update(new_bool_cols)
|
1779
1740
|
pipe.dtypes = pipe_dtypes
|
@@ -2788,7 +2749,6 @@ def pipe_exists(
|
|
2788
2749
|
debug=debug,
|
2789
2750
|
)
|
2790
2751
|
if debug:
|
2791
|
-
from meerschaum.utils.debug import dprint
|
2792
2752
|
dprint(f"{pipe} " + ('exists.' if exists else 'does not exist.'))
|
2793
2753
|
return exists
|
2794
2754
|
|
@@ -3125,11 +3085,17 @@ def get_pipe_columns_types(
|
|
3125
3085
|
debug=debug,
|
3126
3086
|
)
|
3127
3087
|
|
3088
|
+
if debug:
|
3089
|
+
dprint(f"Fetching columns_types for {pipe} with via SQLAlchemy table.")
|
3090
|
+
|
3128
3091
|
table_columns = {}
|
3129
3092
|
try:
|
3130
3093
|
pipe_table = self.get_pipe_table(pipe, debug=debug)
|
3131
3094
|
if pipe_table is None:
|
3132
3095
|
return {}
|
3096
|
+
if debug:
|
3097
|
+
dprint(f"Found columns:")
|
3098
|
+
mrsm.pprint(dict(pipe_table.columns))
|
3133
3099
|
for col in pipe_table.columns:
|
3134
3100
|
table_columns[str(col.name)] = str(col.type)
|
3135
3101
|
except Exception as e:
|
@@ -3321,10 +3287,9 @@ def get_alter_columns_queries(
|
|
3321
3287
|
-------
|
3322
3288
|
A list of the `ALTER TABLE` SQL query or queries to be executed on the provided connector.
|
3323
3289
|
"""
|
3324
|
-
if not pipe.exists(debug=debug):
|
3290
|
+
if not pipe.exists(debug=debug) or pipe.static:
|
3325
3291
|
return []
|
3326
|
-
|
3327
|
-
return
|
3292
|
+
|
3328
3293
|
from meerschaum.utils.sql import (
|
3329
3294
|
sql_item_name,
|
3330
3295
|
get_table_cols_types,
|
@@ -3370,7 +3335,8 @@ def get_alter_columns_queries(
|
|
3370
3335
|
debug=debug,
|
3371
3336
|
).items()
|
3372
3337
|
}
|
3373
|
-
|
3338
|
+
pipe_dtypes = pipe.dtypes
|
3339
|
+
pipe_bool_cols = [col for col, typ in pipe_dtypes.items() if are_dtypes_equal(str(typ), 'bool')]
|
3374
3340
|
pd_db_df_aliases = {
|
3375
3341
|
'int': 'bool',
|
3376
3342
|
'float': 'bool',
|
@@ -3378,7 +3344,10 @@ def get_alter_columns_queries(
|
|
3378
3344
|
'guid': 'object',
|
3379
3345
|
}
|
3380
3346
|
if self.flavor == 'oracle':
|
3381
|
-
pd_db_df_aliases
|
3347
|
+
pd_db_df_aliases.update({
|
3348
|
+
'int': 'numeric',
|
3349
|
+
'date': 'datetime',
|
3350
|
+
})
|
3382
3351
|
|
3383
3352
|
altered_cols = {
|
3384
3353
|
col: (db_cols_types.get(col, 'object'), typ)
|
@@ -3387,6 +3356,10 @@ def get_alter_columns_queries(
|
|
3387
3356
|
and not are_dtypes_equal(db_cols_types.get(col, 'object'), 'string')
|
3388
3357
|
}
|
3389
3358
|
|
3359
|
+
if debug and altered_cols:
|
3360
|
+
dprint(f"Columns to be altered:")
|
3361
|
+
mrsm.pprint(altered_cols)
|
3362
|
+
|
3390
3363
|
### NOTE: Sometimes bools are coerced into ints or floats.
|
3391
3364
|
altered_cols_to_ignore = set()
|
3392
3365
|
for col, (db_typ, df_typ) in altered_cols.items():
|
@@ -3413,13 +3386,20 @@ def get_alter_columns_queries(
|
|
3413
3386
|
if db_is_bool_compatible and df_is_bool_compatible:
|
3414
3387
|
altered_cols_to_ignore.add(bool_col)
|
3415
3388
|
|
3389
|
+
if debug and altered_cols_to_ignore:
|
3390
|
+
dprint(f"Ignoring the following altered columns (false positives).")
|
3391
|
+
mrsm.pprint(altered_cols_to_ignore)
|
3392
|
+
|
3416
3393
|
for col in altered_cols_to_ignore:
|
3417
3394
|
_ = altered_cols.pop(col, None)
|
3395
|
+
|
3418
3396
|
if not altered_cols:
|
3419
3397
|
return []
|
3420
3398
|
|
3421
3399
|
if numeric_cols:
|
3422
|
-
pipe.
|
3400
|
+
explicit_pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
|
3401
|
+
explicit_pipe_dtypes.update({col: 'numeric' for col in numeric_cols})
|
3402
|
+
pipe.dtypes = explicit_pipe_dtypes
|
3423
3403
|
if not pipe.temporary:
|
3424
3404
|
edit_success, edit_msg = pipe.edit(debug=debug)
|
3425
3405
|
if not edit_success:
|
@@ -3428,7 +3408,7 @@ def get_alter_columns_queries(
|
|
3428
3408
|
+ f"{edit_msg}"
|
3429
3409
|
)
|
3430
3410
|
else:
|
3431
|
-
numeric_cols.extend([col for col, typ in
|
3411
|
+
numeric_cols.extend([col for col, typ in pipe_dtypes.items() if typ.startswith('numeric')])
|
3432
3412
|
|
3433
3413
|
numeric_type = get_db_type_from_pd_type('numeric', self.flavor, as_sqlalchemy=False)
|
3434
3414
|
text_type = get_db_type_from_pd_type('str', self.flavor, as_sqlalchemy=False)
|
@@ -3636,20 +3616,18 @@ def get_to_sql_dtype(
|
|
3636
3616
|
>>> get_to_sql_dtype(pipe, df)
|
3637
3617
|
{'a': <class 'sqlalchemy.sql.sqltypes.JSON'>}
|
3638
3618
|
"""
|
3639
|
-
from meerschaum.utils.dataframe import
|
3619
|
+
from meerschaum.utils.dataframe import get_special_cols
|
3640
3620
|
from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
|
3641
3621
|
df_dtypes = {
|
3642
3622
|
col: str(typ)
|
3643
3623
|
for col, typ in df.dtypes.items()
|
3644
3624
|
}
|
3645
|
-
|
3646
|
-
|
3647
|
-
|
3648
|
-
df_dtypes.update({col: 'json' for col in json_cols})
|
3649
|
-
df_dtypes.update({col: 'numeric' for col in numeric_cols})
|
3650
|
-
df_dtypes.update({col: 'uuid' for col in uuid_cols})
|
3625
|
+
special_cols = get_special_cols(df)
|
3626
|
+
df_dtypes.update(special_cols)
|
3627
|
+
|
3651
3628
|
if update_dtypes:
|
3652
3629
|
df_dtypes.update(pipe.dtypes)
|
3630
|
+
|
3653
3631
|
return {
|
3654
3632
|
col: get_db_type_from_pd_type(typ, self.flavor, as_sqlalchemy=True)
|
3655
3633
|
for col, typ in df_dtypes.items()
|
@@ -3920,3 +3898,15 @@ def get_temporary_target(
|
|
3920
3898
|
+ transact_id
|
3921
3899
|
+ ((separator + label) if label else '')
|
3922
3900
|
)
|
3901
|
+
|
3902
|
+
|
3903
|
+
def _enforce_pipe_dtypes_chunks_hook(
|
3904
|
+
pipe: mrsm.Pipe,
|
3905
|
+
chunk_df: 'pd.DataFrame',
|
3906
|
+
debug: bool = False,
|
3907
|
+
**kwargs
|
3908
|
+
) -> 'pd.DataFrame':
|
3909
|
+
"""
|
3910
|
+
Enforce a pipe's dtypes on each chunk.
|
3911
|
+
"""
|
3912
|
+
return pipe.enforce_dtypes(chunk_df, debug=debug)
|