meerschaum 3.0.0rc1__py3-none-any.whl → 3.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. meerschaum/_internal/arguments/_parser.py +2 -1
  2. meerschaum/_internal/docs/index.py +49 -2
  3. meerschaum/_internal/static.py +8 -24
  4. meerschaum/actions/verify.py +5 -8
  5. meerschaum/api/__init__.py +2 -1
  6. meerschaum/api/dash/__init__.py +0 -2
  7. meerschaum/api/dash/callbacks/dashboard.py +1 -1
  8. meerschaum/api/dash/tokens.py +2 -2
  9. meerschaum/api/routes/_pipes.py +47 -37
  10. meerschaum/config/_default.py +11 -1
  11. meerschaum/config/_version.py +1 -1
  12. meerschaum/config/stack/__init__.py +9 -8
  13. meerschaum/connectors/api/_pipes.py +2 -18
  14. meerschaum/connectors/api/_tokens.py +2 -2
  15. meerschaum/connectors/instance/_tokens.py +4 -4
  16. meerschaum/connectors/sql/_create_engine.py +3 -14
  17. meerschaum/connectors/sql/_pipes.py +118 -163
  18. meerschaum/connectors/sql/_sql.py +38 -20
  19. meerschaum/connectors/valkey/_pipes.py +44 -16
  20. meerschaum/core/Pipe/__init__.py +28 -5
  21. meerschaum/core/Pipe/_attributes.py +270 -46
  22. meerschaum/core/Pipe/_data.py +55 -17
  23. meerschaum/core/Pipe/_dtypes.py +19 -4
  24. meerschaum/core/Pipe/_edit.py +2 -0
  25. meerschaum/core/Pipe/_fetch.py +1 -1
  26. meerschaum/core/Pipe/_sync.py +90 -160
  27. meerschaum/core/Pipe/_verify.py +3 -3
  28. meerschaum/core/Token/_Token.py +3 -4
  29. meerschaum/utils/dataframe.py +379 -68
  30. meerschaum/utils/debug.py +15 -15
  31. meerschaum/utils/dtypes/__init__.py +388 -22
  32. meerschaum/utils/dtypes/sql.py +326 -30
  33. meerschaum/utils/misc.py +9 -68
  34. meerschaum/utils/packages/__init__.py +7 -21
  35. meerschaum/utils/packages/_packages.py +7 -2
  36. meerschaum/utils/schedule.py +1 -1
  37. meerschaum/utils/sql.py +7 -7
  38. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/METADATA +5 -17
  39. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/RECORD +45 -44
  40. meerschaum-3.0.0rc2.dist-info/licenses/NOTICE +2 -0
  41. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/WHEEL +0 -0
  42. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/entry_points.txt +0 -0
  43. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  44. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/top_level.txt +0 -0
  45. {meerschaum-3.0.0rc1.dist-info → meerschaum-3.0.0rc2.dist-info}/zip-safe +0 -0
meerschaum/connectors/sql/_create_engine.py
@@ -31,7 +31,6 @@ install_flavor_drivers = {
     'mssql': ['pyodbc'],
     'oracle': ['oracledb'],
 }
-require_patching_flavors = {'cockroachdb': [('sqlalchemy-cockroachdb', 'sqlalchemy_cockroachdb')]}
 
 flavor_dialects = {
     'cockroachdb': (
@@ -63,19 +62,6 @@ def create_engine(
     )
     if self.flavor == 'mssql':
         _init_mssql_sqlalchemy()
-    if self.flavor in require_patching_flavors:
-        from meerschaum.utils.packages import determine_version, _monkey_patch_get_distribution
-        import pathlib
-        for install_name, import_name in require_patching_flavors[self.flavor]:
-            pkg = attempt_import(
-                import_name,
-                debug=debug,
-                lazy=False,
-                warn=False
-            )
-            _monkey_patch_get_distribution(
-                install_name, determine_version(pathlib.Path(pkg.__file__), venv='mrsm')
-            )
 
     ### supplement missing values with defaults (e.g. port number)
     for a, value in flavor_configs[self.flavor]['defaults'].items():
@@ -189,6 +175,9 @@ def _init_mssql_sqlalchemy():
         lazy=False,
         warn=False,
     )
+    if pyodbc is None:
+        raise EnvironmentError("Cannot import pyodbc. Is the MSSQL driver installed?")
+
     pyodbc.pooling = False
 
     MSDialect_pyodbc = sqlalchemy_dialects_mssql_pyodbc.MSDialect_pyodbc
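The guard above exists because `attempt_import(..., warn=False)` returns `None` for a missing package instead of raising, so the very next statement (`pyodbc.pooling = False`) would otherwise fail with an opaque `AttributeError`. A minimal sketch of the failure mode, assuming only that `attempt_import` behaves as it does elsewhere in this diff:

    from meerschaum.utils.packages import attempt_import

    # warn=False suppresses the import warning, so a missing driver
    # surfaces here as None rather than as an exception.
    pyodbc = attempt_import('pyodbc', lazy=False, warn=False)
    if pyodbc is None:
        raise EnvironmentError("Cannot import pyodbc. Is the MSSQL driver installed?")
    pyodbc.pooling = False  # safe: pyodbc is a real module here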
meerschaum/connectors/sql/_pipes.py
@@ -25,7 +25,6 @@ def register_pipe(
     Register a new pipe.
     A pipe's attributes must be set before registering.
     """
-    from meerschaum.utils.debug import dprint
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.sql import json_flavors
 
@@ -170,7 +169,6 @@ def fetch_pipes_keys(
     debug: bool, default False
         Verbosity toggle.
     """
-    from meerschaum.utils.debug import dprint
     from meerschaum.utils.packages import attempt_import
     from meerschaum.utils.misc import separate_negation_values
     from meerschaum.utils.sql import OMIT_NULLSFIRST_FLAVORS, table_exists
@@ -338,7 +336,6 @@ def create_indices(
     """
     Create a pipe's indices.
     """
-    from meerschaum.utils.debug import dprint
     if debug:
         dprint(f"Creating indices for {pipe}...")
 
@@ -392,7 +389,6 @@ def drop_indices(
     """
     Drop a pipe's indices.
     """
-    from meerschaum.utils.debug import dprint
     if debug:
         dprint(f"Dropping indices for {pipe}...")
 
@@ -1008,6 +1004,8 @@ def get_pipe_data(
     limit: Optional[int] = None,
     begin_add_minutes: int = 0,
     end_add_minutes: int = 0,
+    chunksize: Optional[int] = -1,
+    as_iterator: bool = False,
     debug: bool = False,
     **kw: Any
 ) -> Union[pd.DataFrame, None]:
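A usage sketch for the two new parameters, assuming `Pipe.get_data()` forwards them to the instance connector's `get_pipe_data()` (the corresponding `meerschaum/core/Pipe/_data.py` changes are not shown here):

    import meerschaum as mrsm

    pipe = mrsm.Pipe('plugin:demo', 'data', instance='sql:local')  # hypothetical keys

    # as_iterator=True yields DataFrame chunks lazily, bounding memory
    # by roughly `chunksize` rows per chunk instead of the full table.
    for chunk_df in pipe.get_data(chunksize=10_000, as_iterator=True):
        print(len(chunk_df))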
@@ -1044,14 +1042,17 @@ def get_pipe_data(
         If specified, limit the number of rows retrieved to this value.
 
     begin_add_minutes: int, default 0
-        The number of minutes to add to the `begin` datetime (i.e. `DATEADD`.
+        The number of minutes to add to the `begin` datetime (i.e. `DATEADD`).
 
     end_add_minutes: int, default 0
-        The number of minutes to add to the `end` datetime (i.e. `DATEADD`.
+        The number of minutes to add to the `end` datetime (i.e. `DATEADD`).
 
     chunksize: Optional[int], default -1
         The size of dataframe chunks to load into memory.
 
+    as_iterator: bool, default False
+        If `True`, return the chunks iterator directly.
+
     debug: bool, default False
         Verbosity toggle.
 
@@ -1060,43 +1061,58 @@ def get_pipe_data(
         A `pd.DataFrame` of the pipe's data.
 
     """
-    import json
-    from meerschaum.utils.misc import parse_df_datetimes, to_pandas_dtype
+    import functools
     from meerschaum.utils.packages import import_pandas
-    from meerschaum.utils.dtypes import (
-        attempt_cast_to_numeric,
-        attempt_cast_to_uuid,
-        attempt_cast_to_bytes,
-        attempt_cast_to_geometry,
-        are_dtypes_equal,
-    )
+    from meerschaum.utils.dtypes import to_pandas_dtype, are_dtypes_equal
     from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
     pd = import_pandas()
     is_dask = 'dask' in pd.__name__
 
     cols_types = pipe.get_columns_types(debug=debug) if pipe.enforce else {}
+    pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug) if pipe.enforce else {}
+
+    remote_pandas_types = {
+        col: to_pandas_dtype(get_pd_type_from_db_type(typ))
+        for col, typ in cols_types.items()
+    }
+    remote_dt_cols_types = {
+        col: typ
+        for col, typ in remote_pandas_types.items()
+        if are_dtypes_equal(typ, 'datetime')
+    }
+    remote_dt_tz_aware_cols_types = {
+        col: typ
+        for col, typ in remote_dt_cols_types.items()
+        if ',' in typ or typ == 'datetime'
+    }
+    remote_dt_tz_naive_cols_types = {
+        col: typ
+        for col, typ in remote_dt_cols_types.items()
+        if col not in remote_dt_tz_aware_cols_types
+    }
+
+    configured_pandas_types = {
+        col: to_pandas_dtype(typ)
+        for col, typ in pipe_dtypes.items()
+    }
+    configured_lower_precision_dt_cols_types = {
+        col: typ
+        for col, typ in pipe_dtypes.items()
+        if (
+            are_dtypes_equal('datetime', typ)
+            and '[' in typ
+            and 'ns' not in typ
+        )
+
+    }
+
     dtypes = {
-        **{
-            col: get_pd_type_from_db_type(typ)
-            for col, typ in cols_types.items()
-        },
-        **{
-            p_col: to_pandas_dtype(p_typ)
-            for p_col, p_typ in pipe.dtypes.items()
-        },
+        **remote_pandas_types,
+        **configured_pandas_types,
+        **remote_dt_tz_aware_cols_types,
+        **remote_dt_tz_naive_cols_types,
+        **configured_lower_precision_dt_cols_types
     } if pipe.enforce else {}
-    if dtypes:
-        if self.flavor == 'sqlite':
-            if not pipe.columns.get('datetime', None):
-                _dt = pipe.guess_datetime()
-            else:
-                _dt = pipe.get_columns('datetime')
-
-            if _dt:
-                dt_type = dtypes.get(_dt, 'object').lower()
-                if 'datetime' not in dt_type:
-                    if 'int' not in dt_type:
-                        dtypes[_dt] = 'datetime64[ns, UTC]'
 
     existing_cols = cols_types.keys()
     select_columns = (
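The timezone split above leans on the string form of pandas datetime dtypes: tz-aware dtypes render with the zone after a comma (e.g. `datetime64[ns, UTC]`), while naive ones are plain (e.g. `datetime64[ns]`), which is what the `',' in typ` test detects. A standalone illustration of the same heuristic:

    import pandas as pd

    aware = pd.Series(pd.to_datetime(['2024-01-01'], utc=True))
    naive = pd.Series(pd.to_datetime(['2024-01-01']))

    assert ',' in str(aware.dtype)      # 'datetime64[ns, UTC]'
    assert ',' not in str(naive.dtype)  # 'datetime64[ns]'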
@@ -1113,13 +1129,20 @@ def get_pipe_data(
             and col not in (omit_columns or [])
         ]
     ) if pipe.enforce else select_columns
+
     if select_columns:
         dtypes = {col: typ for col, typ in dtypes.items() if col in select_columns}
+
     dtypes = {
-        col: to_pandas_dtype(typ)
+        col: typ
         for col, typ in dtypes.items()
-        if col in select_columns and col not in (omit_columns or [])
+        if col in (select_columns or [col]) and col not in (omit_columns or [])
     } if pipe.enforce else {}
+
+    if debug:
+        dprint(f"[{self}] `read()` dtypes:")
+        mrsm.pprint(dtypes)
+
     query = self.get_pipe_data_query(
         pipe,
         select_columns=select_columns,
@@ -1135,91 +1158,25 @@ def get_pipe_data(
         **kw
     )
 
+    read_kwargs = {}
     if is_dask:
         index_col = pipe.columns.get('datetime', None)
-        kw['index_col'] = index_col
-
-    numeric_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ.startswith('numeric') and col in dtypes
-    ]
-    uuid_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ == 'uuid' and col in dtypes
-    ]
-    bytes_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ == 'bytes' and col in dtypes
-    ]
-    geometry_columns = [
-        col
-        for col, typ in pipe.dtypes.items()
-        if typ.startswith('geometry') and col in dtypes
-    ]
-
-    kw['coerce_float'] = kw.get('coerce_float', (len(numeric_columns) == 0))
+        read_kwargs['index_col'] = index_col
 
-    df = self.read(
+    chunks = self.read(
         query,
+        chunksize=chunksize,
+        as_iterator=True,
+        coerce_float=False,
         dtype=dtypes,
         debug=debug,
-        **kw
+        **read_kwargs
     )
-    for col in numeric_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_numeric)
 
-    for col in uuid_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_uuid)
+    if as_iterator:
+        return chunks
 
-    for col in bytes_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_bytes)
-
-    for col in geometry_columns:
-        if col not in df.columns:
-            continue
-        df[col] = df[col].apply(attempt_cast_to_geometry)
-
-    if self.flavor == 'sqlite':
-        ignore_dt_cols = [
-            col
-            for col, dtype in pipe.dtypes.items()
-            if not are_dtypes_equal(str(dtype), 'datetime')
-        ]
-        ### NOTE: We have to consume the iterator here to ensure that datetimes are parsed correctly
-        df = (
-            parse_df_datetimes(
-                df,
-                ignore_cols=ignore_dt_cols,
-                chunksize=kw.get('chunksize', None),
-                strip_timezone=(pipe.tzinfo is None),
-                debug=debug,
-            ) if isinstance(df, pd.DataFrame) else (
-                [
-                    parse_df_datetimes(
-                        c,
-                        ignore_cols=ignore_dt_cols,
-                        chunksize=kw.get('chunksize', None),
-                        strip_timezone=(pipe.tzinfo is None),
-                        debug=debug,
-                    )
-                    for c in df
-                ]
-            )
-        )
-    for col, typ in dtypes.items():
-        if typ != 'json':
-            continue
-        df[col] = df[col].apply(lambda x: json.loads(x) if x is not None else x)
-    return df
+    return pd.concat(chunks)
 
 
 def get_pipe_data_query(
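The rewrite collapses the per-type casting loops into a single path: `read()` is always asked for a chunk iterator, and `get_pipe_data()` either hands that iterator back (`as_iterator=True`) or concatenates it into one DataFrame. A minimal sketch of the shape, with illustrative names only:

    import pandas as pd

    def collect(chunks, as_iterator=False):
        # `chunks` stands in for the generator returned by `self.read(..., as_iterator=True)`.
        if as_iterator:
            return chunks           # caller consumes lazily
        return pd.concat(chunks)    # materialize a single DataFrame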
@@ -1552,13 +1509,7 @@ def create_pipe_table_from_df(
     """
     Create a pipe's table from its configured dtypes and an incoming dataframe.
     """
-    from meerschaum.utils.dataframe import (
-        get_json_cols,
-        get_numeric_cols,
-        get_uuid_cols,
-        get_datetime_cols,
-        get_bytes_cols,
-    )
+    from meerschaum.utils.dataframe import get_special_cols
     from meerschaum.utils.sql import (
         get_create_table_queries,
         sql_item_name,
@@ -1587,30 +1538,7 @@ def create_pipe_table_from_df(
             for col_ix, col in pipe.columns.items()
             if col and col_ix != 'primary'
         },
-        **{
-            col: 'uuid'
-            for col in get_uuid_cols(df)
-        },
-        **{
-            col: 'json'
-            for col in get_json_cols(df)
-        },
-        **{
-            col: 'numeric'
-            for col in get_numeric_cols(df)
-        },
-        **{
-            col: 'bytes'
-            for col in get_bytes_cols(df)
-        },
-        **{
-            col: 'datetime64[ns, UTC]'
-            for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
-        },
-        **{
-            col: 'datetime64[ns]'
-            for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
-        },
+        **get_special_cols(df),
         **pipe.dtypes
     }
     autoincrement = (
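Judging from the six comprehensions it replaces here and in `get_to_sql_dtype()` below, `get_special_cols(df)` bundles the per-type detectors into one mapping. An approximation of its output, inferred from the removed code rather than from the actual implementation:

    from meerschaum.utils.dataframe import (
        get_uuid_cols, get_json_cols, get_numeric_cols,
        get_bytes_cols, get_datetime_cols,
    )

    def get_special_cols_sketch(df):
        # Map each "special" column to the Meerschaum dtype it should persist as.
        return {
            **{col: 'uuid' for col in get_uuid_cols(df)},
            **{col: 'json' for col in get_json_cols(df)},
            **{col: 'numeric' for col in get_numeric_cols(df)},
            **{col: 'bytes' for col in get_bytes_cols(df)},
            **{
                col: 'datetime64[ns, UTC]'
                for col in get_datetime_cols(df, timezone_aware=True, timezone_naive=False)
            },
            **{
                col: 'datetime64[ns]'
                for col in get_datetime_cols(df, timezone_aware=False, timezone_naive=True)
            },
        }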
@@ -1762,18 +1690,16 @@ def sync_pipe(
     _ = pipe.__dict__.pop('_columns_types', None)
     if not self.exec_queries(alter_cols_queries, debug=debug):
         warn(f"Failed to alter columns for {pipe}.")
-    else:
-        _ = pipe.infer_dtypes(persist=True)
 
     ### NOTE: Oracle SQL < 23c (2023) and SQLite does not support booleans,
     ### so infer bools and persist them to `dtypes`.
     if self.flavor in ('oracle', 'sqlite', 'mysql', 'mariadb'):
-        pipe_dtypes = pipe.dtypes
+        pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
         new_bool_cols = {
             col: 'bool[pyarrow]'
             for col, typ in df.dtypes.items()
             if col not in pipe_dtypes
-            and are_dtypes_equal(str(typ), 'bool')
+                and are_dtypes_equal(str(typ), 'bool')
         }
         pipe_dtypes.update(new_bool_cols)
         pipe.dtypes = pipe_dtypes
@@ -2788,7 +2714,6 @@ def pipe_exists(
         debug=debug,
     )
     if debug:
-        from meerschaum.utils.debug import dprint
         dprint(f"{pipe} " + ('exists.' if exists else 'does not exist.'))
     return exists
 
@@ -3125,11 +3050,17 @@ def get_pipe_columns_types(
         debug=debug,
     )
 
+    if debug:
+        dprint(f"Fetching columns_types for {pipe} with via SQLAlchemy table.")
+
     table_columns = {}
     try:
         pipe_table = self.get_pipe_table(pipe, debug=debug)
         if pipe_table is None:
             return {}
+        if debug:
+            dprint(f"Found columns:")
+            mrsm.pprint(dict(pipe_table.columns))
         for col in pipe_table.columns:
             table_columns[str(col.name)] = str(col.type)
     except Exception as e:
@@ -3321,10 +3252,9 @@ def get_alter_columns_queries(
     -------
     A list of the `ALTER TABLE` SQL query or queries to be executed on the provided connector.
     """
-    if not pipe.exists(debug=debug):
+    if not pipe.exists(debug=debug) or pipe.static:
         return []
-    if pipe.static:
-        return
+
     from meerschaum.utils.sql import (
         sql_item_name,
         get_table_cols_types,
@@ -3370,7 +3300,8 @@ def get_alter_columns_queries(
             debug=debug,
         ).items()
     }
-    pipe_bool_cols = [col for col, typ in pipe.dtypes.items() if are_dtypes_equal(str(typ), 'bool')]
+    pipe_dtypes = pipe.dtypes
+    pipe_bool_cols = [col for col, typ in pipe_dtypes.items() if are_dtypes_equal(str(typ), 'bool')]
     pd_db_df_aliases = {
         'int': 'bool',
         'float': 'bool',
@@ -3378,7 +3309,10 @@ def get_alter_columns_queries(
         'guid': 'object',
     }
     if self.flavor == 'oracle':
-        pd_db_df_aliases['int'] = 'numeric'
+        pd_db_df_aliases.update({
+            'int': 'numeric',
+            'date': 'datetime',
+        })
 
     altered_cols = {
         col: (db_cols_types.get(col, 'object'), typ)
@@ -3387,6 +3321,10 @@ def get_alter_columns_queries(
         and not are_dtypes_equal(db_cols_types.get(col, 'object'), 'string')
     }
 
+    if debug and altered_cols:
+        dprint(f"Columns to be altered:")
+        mrsm.pprint(altered_cols)
+
     ### NOTE: Sometimes bools are coerced into ints or floats.
     altered_cols_to_ignore = set()
     for col, (db_typ, df_typ) in altered_cols.items():
@@ -3413,13 +3351,20 @@ def get_alter_columns_queries(
         if db_is_bool_compatible and df_is_bool_compatible:
             altered_cols_to_ignore.add(bool_col)
 
+    if debug and altered_cols_to_ignore:
+        dprint(f"Ignoring the following altered columns (false positives).")
+        mrsm.pprint(altered_cols_to_ignore)
+
     for col in altered_cols_to_ignore:
         _ = altered_cols.pop(col, None)
+
     if not altered_cols:
         return []
 
     if numeric_cols:
-        pipe.dtypes.update({col: 'numeric' for col in numeric_cols})
+        explicit_pipe_dtypes = pipe.get_dtypes(infer=False, debug=debug)
+        explicit_pipe_dtypes.update({col: 'numeric' for col in numeric_cols})
+        pipe.dtypes = explicit_pipe_dtypes
         if not pipe.temporary:
             edit_success, edit_msg = pipe.edit(debug=debug)
             if not edit_success:
@@ -3428,7 +3373,7 @@ def get_alter_columns_queries(
                     + f"{edit_msg}"
                 )
     else:
-        numeric_cols.extend([col for col, typ in pipe.dtypes.items() if typ.startswith('numeric')])
+        numeric_cols.extend([col for col, typ in pipe_dtypes.items() if typ.startswith('numeric')])
 
     numeric_type = get_db_type_from_pd_type('numeric', self.flavor, as_sqlalchemy=False)
     text_type = get_db_type_from_pd_type('str', self.flavor, as_sqlalchemy=False)
@@ -3636,20 +3581,18 @@ def get_to_sql_dtype(
         >>> get_to_sql_dtype(pipe, df)
         {'a': <class 'sqlalchemy.sql.sqltypes.JSON'>}
     """
-    from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
+    from meerschaum.utils.dataframe import get_special_cols
     from meerschaum.utils.dtypes.sql import get_db_type_from_pd_type
     df_dtypes = {
         col: str(typ)
        for col, typ in df.dtypes.items()
     }
-    json_cols = get_json_cols(df)
-    numeric_cols = get_numeric_cols(df)
-    uuid_cols = get_uuid_cols(df)
-    df_dtypes.update({col: 'json' for col in json_cols})
-    df_dtypes.update({col: 'numeric' for col in numeric_cols})
-    df_dtypes.update({col: 'uuid' for col in uuid_cols})
+    special_cols = get_special_cols(df)
+    df_dtypes.update(special_cols)
+
     if update_dtypes:
         df_dtypes.update(pipe.dtypes)
+
     return {
         col: get_db_type_from_pd_type(typ, self.flavor, as_sqlalchemy=True)
         for col, typ in df_dtypes.items()
@@ -3920,3 +3863,15 @@ def get_temporary_target(
         + transact_id
         + ((separator + label) if label else '')
     )
+
+
+def _enforce_pipe_dtypes_chunks_hook(
+    pipe: mrsm.Pipe,
+    chunk_df: 'pd.DataFrame',
+    debug: bool = False,
+    **kwargs
+) -> 'pd.DataFrame':
+    """
+    Enforce a pipe's dtypes on each chunk.
+    """
+    return pipe.enforce_dtypes(chunk_df, debug=debug)
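The new helper matches the `chunk_hook(chunk_df, **kwargs)` calling convention used by `SQLConnector.read()` (see the `_sql.py` hunks below). A hedged sketch of how it could be wired up; the actual call site is not part of this diff:

    import functools

    # Bind the pipe so the hook has the (chunk_df, **kwargs) shape read() expects.
    enforce_hook = functools.partial(_enforce_pipe_dtypes_chunks_hook, pipe)

    chunks = conn.read(
        query,
        chunk_hook=enforce_hook,
        chunksize=10_000,
    )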
meerschaum/connectors/sql/_sql.py
@@ -131,23 +131,28 @@ def read(
     """
     if chunks is not None and chunks <= 0:
         return []
+
     from meerschaum.utils.sql import sql_item_name, truncate_item_name
     from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
     from meerschaum.utils.dtypes.sql import TIMEZONE_NAIVE_FLAVORS
     from meerschaum.utils.packages import attempt_import, import_pandas
     from meerschaum.utils.pool import get_pool
     from meerschaum.utils.dataframe import chunksize_to_npartitions, get_numeric_cols
+    from meerschaum.utils.misc import filter_arguments
     import warnings
     import traceback
     from decimal import Decimal
+
     pd = import_pandas()
     dd = None
+
     is_dask = 'dask' in pd.__name__
     pandas = attempt_import('pandas')
     is_dask = dd is not None
     npartitions = chunksize_to_npartitions(chunksize)
     if is_dask:
         chunksize = None
+
     schema = schema or self.schema
     utc_dt_cols = [
         col
@@ -158,7 +163,7 @@ def read(
     if dtype and utc_dt_cols and self.flavor in TIMEZONE_NAIVE_FLAVORS:
         dtype = dtype.copy()
         for col in utc_dt_cols:
-            dtype[col] = 'datetime64[ns]'
+            dtype[col] = 'datetime64[us]'
 
     pool = get_pool(workers=workers)
     sqlalchemy = attempt_import("sqlalchemy", lazy=False)
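The `datetime64[ns]` → `datetime64[us]` switch relies on pandas 2.x treating non-nanosecond resolutions as first-class dtypes, which avoids clamping values to the representable nanosecond range. A quick illustration:

    import pandas as pd  # requires pandas >= 2.0

    s = pd.Series(['2024-01-01 12:34:56.789']).astype('datetime64[us]')
    print(s.dtype)  # datetime64[us] -- microsecond resolution, tz-naive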
@@ -222,26 +227,33 @@ def read(
         else format_sql_query_for_dask(str_query)
     )
 
+    def _get_chunk_args_kwargs(_chunk):
+        return filter_arguments(
+            chunk_hook,
+            _chunk,
+            workers=workers,
+            chunksize=chunksize,
+            debug=debug,
+            **kw
+        )
+
     chunk_list = []
     chunk_hook_results = []
     def _process_chunk(_chunk, _retry_on_failure: bool = True):
         if self.flavor in TIMEZONE_NAIVE_FLAVORS:
             for col in utc_dt_cols:
-                _chunk[col] = coerce_timezone(_chunk[col], strip_timezone=False)
+                _chunk[col] = coerce_timezone(_chunk[col], strip_utc=False)
         if not as_hook_results:
             chunk_list.append(_chunk)
+
         if chunk_hook is None:
             return None
 
+        chunk_args, chunk_kwargs = _get_chunk_args_kwargs(_chunk)
+
         result = None
         try:
-            result = chunk_hook(
-                _chunk,
-                workers=workers,
-                chunksize=chunksize,
-                debug=debug,
-                **kw
-            )
+            result = chunk_hook(*chunk_args, **chunk_kwargs)
         except Exception:
             result = False, traceback.format_exc()
             from meerschaum.utils.formatting import get_console
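Routing every hook invocation through `filter_arguments` means a `chunk_hook` no longer has to accept every keyword `read()` passes along. A sketch of the presumed behavior, assuming `filter_arguments` drops keywords the callable's signature cannot bind:

    import inspect

    def filter_arguments_sketch(func, *args, **kwargs):
        # Keep only the kwargs the function can accept (unless it takes **kwargs).
        params = inspect.signature(func).parameters
        if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()):
            return args, kwargs
        return args, {k: v for k, v in kwargs.items() if k in params}

    def narrow_hook(chunk_df, debug=False):
        return True, f"Processed {len(chunk_df)} rows."

    args, kwargs = filter_arguments_sketch(
        narrow_hook, [1, 2, 3], workers=4, chunksize=1000, debug=True,
    )
    # kwargs == {'debug': True}; 'workers' and 'chunksize' were filtered out.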
@@ -292,8 +304,16 @@ def read(
         self.engine,
         **read_sql_query_kwargs
     )
+
     to_return = (
-        chunk_generator
+        (
+            chunk_generator
+            if not (as_hook_results or chunksize is None)
+            else (
+                _process_chunk(_chunk)
+                for _chunk in chunk_generator
+            )
+        )
         if as_iterator or chunksize is None
         else (
             list(pool.imap(_process_chunk, chunk_generator))
@@ -339,9 +359,8 @@ def read(
     try:
         for chunk in chunk_generator:
             if chunk_hook is not None:
-                chunk_hook_results.append(
-                    chunk_hook(chunk, chunksize=chunksize, debug=debug, **kw)
-                )
+                chunk_args, chunk_kwargs = _get_chunk_args_kwargs(chunk)
+                chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
             chunk_list.append(chunk)
             read_chunks += 1
             if chunks is not None and read_chunks >= chunks:
@@ -356,9 +375,8 @@ def read(
     try:
         for chunk in chunk_generator:
             if chunk_hook is not None:
-                chunk_hook_results.append(
-                    chunk_hook(chunk, chunksize=chunksize, debug=debug, **kw)
-                )
+                chunk_args, chunk_kwargs = _get_chunk_args_kwargs(chunk)
+                chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
             chunk_list.append(chunk)
             read_chunks += 1
             if chunks is not None and read_chunks >= chunks:
@@ -389,9 +407,8 @@ def read(
     ### call the hook on any missed chunks.
     if chunk_hook is not None and len(chunk_list) > len(chunk_hook_results):
         for c in chunk_list[len(chunk_hook_results):]:
-            chunk_hook_results.append(
-                chunk_hook(c, chunksize=chunksize, debug=debug, **kw)
-            )
+            chunk_args, chunk_kwargs = _get_chunk_args_kwargs(c)
+            chunk_hook_results.append(chunk_hook(*chunk_args, **chunk_kwargs))
 
     ### chunksize is not None so must iterate
     if debug:
@@ -784,6 +801,7 @@ def to_sql(
     from meerschaum.utils.warnings import error, warn
     import warnings
     import functools
+    import traceback
 
     if name is None:
         error(f"Name must not be `None` to insert data into {self}.")
@@ -1057,7 +1075,7 @@ def to_sql(
     except Exception as e:
         if not silent:
             warn(str(e))
-        success, msg = False, str(e)
+        success, msg = False, traceback.format_exc()
 
     end = time.perf_counter()
     if success:
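Returning `traceback.format_exc()` instead of `str(e)` keeps the full stack trace in the failure message, which matters when the exception's message alone is uninformative. A small comparison:

    import traceback

    try:
        {}['missing']
    except Exception as e:
        print(str(e))                  # just: 'missing'
        print(traceback.format_exc())  # full 'Traceback (most recent call last): ...'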