meerschaum 2.6.16__py3-none-any.whl → 2.7.0rc1__py3-none-any.whl
- meerschaum/_internal/arguments/_parse_arguments.py +1 -1
- meerschaum/actions/delete.py +65 -69
- meerschaum/actions/edit.py +22 -2
- meerschaum/actions/install.py +1 -2
- meerschaum/actions/sync.py +2 -3
- meerschaum/config/_default.py +1 -1
- meerschaum/config/_paths.py +2 -1
- meerschaum/config/_version.py +1 -1
- meerschaum/connectors/api/_pipes.py +4 -3
- meerschaum/connectors/sql/_create_engine.py +3 -3
- meerschaum/connectors/sql/_pipes.py +84 -38
- meerschaum/connectors/sql/_sql.py +6 -1
- meerschaum/connectors/valkey/_pipes.py +12 -1
- meerschaum/core/Pipe/__init__.py +23 -13
- meerschaum/core/Pipe/_attributes.py +19 -0
- meerschaum/core/Pipe/_dtypes.py +1 -1
- meerschaum/core/Pipe/_sync.py +61 -21
- meerschaum/core/Pipe/_verify.py +8 -7
- meerschaum/jobs/_Job.py +2 -1
- meerschaum/plugins/_Plugin.py +11 -14
- meerschaum/utils/daemon/Daemon.py +20 -13
- meerschaum/utils/dataframe.py +175 -13
- meerschaum/utils/dtypes/__init__.py +103 -14
- meerschaum/utils/dtypes/sql.py +26 -0
- meerschaum/utils/misc.py +8 -8
- meerschaum/utils/packages/_packages.py +1 -1
- meerschaum/utils/schedule.py +8 -3
- meerschaum/utils/sql.py +70 -47
- meerschaum/utils/venv/_Venv.py +4 -4
- meerschaum/utils/venv/__init__.py +33 -13
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/METADATA +2 -2
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/RECORD +38 -38
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/LICENSE +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/NOTICE +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/WHEEL +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.6.16.dist-info → meerschaum-2.7.0rc1.dist-info}/zip-safe +0 -0
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -7,9 +7,10 @@ Verify the contents of a pipe by resyncing its interval.
 """
 
 from datetime import datetime, timedelta
-
+
+import meerschaum as mrsm
+from meerschaum.utils.typing import SuccessTuple, Any, Optional, Union, Tuple, Dict
 from meerschaum.utils.warnings import warn, info
-from meerschaum.utils.debug import dprint
 
 
 def verify(
@@ -94,9 +95,6 @@ def verify(
         else 1
     )
 
-    sync_less_than_begin = not bounded and begin is None
-    sync_greater_than_end = not bounded and end is None
-
     cannot_determine_bounds = not self.exists(debug=debug)
 
     if cannot_determine_bounds:
@@ -164,7 +162,7 @@ def verify(
     )
 
     info(
-        f"Syncing {len(chunk_bounds)} chunk" + ('s' if len(chunk_bounds) != 1 else '')
+        f"Verifying {self}:\n    Syncing {len(chunk_bounds)} chunk" + ('s' if len(chunk_bounds) != 1 else '')
         + f" ({'un' if not bounded else ''}bounded)"
         + f" of size '{interval_str(chunk_interval)}'"
         + f" between '{begin_to_print}' and '{end_to_print}'."
@@ -187,7 +185,7 @@ def verify(
             return chunk_begin_and_end, bounds_success_tuples[chunk_begin_and_end]
 
         chunk_begin, chunk_end = chunk_begin_and_end
-
+        chunk_success, chunk_msg = self.sync(
             begin=chunk_begin,
             end=chunk_end,
             params=params,
@@ -195,6 +193,9 @@ def verify(
             debug=debug,
             **kwargs
         )
+        chunk_msg = chunk_msg.strip()
+        mrsm.pprint((chunk_success, chunk_msg))
+        return chunk_begin_and_end, (chunk_success, chunk_msg)
 
     ### If we have more than one chunk, attempt to sync the first one and return if its fails.
     if len(chunk_bounds) > 1:
meerschaum/jobs/_Job.py
CHANGED
@@ -873,7 +873,7 @@ class Job:
         """
         from meerschaum._internal.arguments import compress_pipeline_sysargs
         sysargs = compress_pipeline_sysargs(self.sysargs)
-        return shlex.join(sysargs).replace(' + ', '\n+ ')
+        return shlex.join(sysargs).replace(' + ', '\n+ ').replace(' : ', '\n: ').lstrip().rstrip()
 
     @property
     def _externally_managed_file(self) -> pathlib.Path:
@@ -915,6 +915,7 @@ class Job:
             'PYTHONUNBUFFERED': '1',
             'LINES': str(get_config('jobs', 'terminal', 'lines')),
             'COLUMNS': str(get_config('jobs', 'terminal', 'columns')),
+            STATIC_CONFIG['environment']['noninteractive']: 'true',
         }
         self._env = {**default_env, **_env}
         return self._env
meerschaum/plugins/_Plugin.py
CHANGED
@@ -255,11 +255,11 @@ class Plugin:
 
 
     def install(
-
-
-
-
-
+        self,
+        skip_deps: bool = False,
+        force: bool = False,
+        debug: bool = False,
+    ) -> SuccessTuple:
         """
         Extract a plugin's tar archive to the plugins directory.
 
@@ -359,7 +359,7 @@ class Plugin:
             is_same_version = new_version and old_version and (
                 packaging_version.parse(old_version) == packaging_version.parse(new_version)
             )
-        except Exception
+        except Exception:
             is_new_version, is_same_version = True, False
 
         ### Determine where to permanently store the new plugin.
@@ -404,7 +404,7 @@ class Plugin:
             dprint(f"Moving '{src_file}' to '{dst_dir}'...")
         try:
             shutil.move(src_file, dst_dir)
-        except Exception
+        except Exception:
             success, msg = False, (
                 f"Failed to install plugin '{self}': " +
                 f"Could not move file '{src_file}' to '{dst_dir}'"
@@ -817,10 +817,10 @@ class Plugin:
 
 
     def install_dependencies(
-
-
-
-
+        self,
+        force: bool = False,
+        debug: bool = False,
+    ) -> bool:
         """
         If specified, install dependencies.
 
@@ -841,12 +841,9 @@ class Plugin:
         Returns
         -------
         A bool indicating success.
-
         """
         from meerschaum.utils.packages import pip_install, venv_contains_package
-        from meerschaum.utils.debug import dprint
         from meerschaum.utils.warnings import warn, info
-        from meerschaum.connectors.parse import parse_repo_keys
         _deps = self.get_dependencies(debug=debug)
         if not _deps and self.requirements_file_path is None:
             return True
meerschaum/utils/daemon/Daemon.py
CHANGED
@@ -432,7 +432,7 @@ class Daemon:
             + "allow_dirty_run=True)"
         )
         env = dict(os.environ)
-        env['
+        env[STATIC_CONFIG['environment']['noninteractive']] = 'true'
         _launch_success_bool = venv_exec(_launch_daemon_code, debug=debug, venv=None, env=env)
         msg = (
             "Success"
@@ -465,6 +465,7 @@ class Daemon:
             self._write_stop_file('kill')
             return True, "Process has already stopped."
 
+        psutil = attempt_import('psutil')
         process = self.process
         try:
             process.terminate()
@@ -473,10 +474,16 @@ class Daemon:
         except Exception as e:
             return False, f"Failed to kill job {self} with exception: {e}"
 
+        try:
+            if process.status():
+                return False, "Failed to stop daemon '{self}' ({process})."
+        except psutil.NoSuchProcess:
+            pass
+
         if self.pid_path.exists():
             try:
                 self.pid_path.unlink()
-            except Exception
+            except Exception:
                 pass
 
         self._write_stop_file('kill')
@@ -534,7 +541,7 @@ class Daemon:
         if not timeout:
             try:
                 success = self.process.status() == 'stopped'
-            except psutil.NoSuchProcess
+            except psutil.NoSuchProcess:
                 success = True
             msg = "Success" if success else f"Failed to suspend daemon '{self.daemon_id}'."
             if success:
@@ -677,11 +684,11 @@ class Daemon:
         raise SystemExit(0)
 
     def _send_signal(
-
-
-
-
-
+        self,
+        signal_to_send,
+        timeout: Union[float, int, None] = None,
+        check_timeout_interval: Union[float, int, None] = None,
+    ) -> SuccessTuple:
         """Send a signal to the daemon process.
 
         Parameters
@@ -709,7 +716,7 @@ class Daemon:
             )
 
             os.kill(pid, signal_to_send)
-        except Exception
+        except Exception:
             return False, f"Failed to send signal {signal_to_send}:\n{traceback.format_exc()}"
 
         timeout = self.get_timeout_seconds(timeout)
@@ -745,7 +752,7 @@ class Daemon:
         if _already_exists and not allow_dirty_run:
             error(
                 f"Daemon '{self.daemon_id}' already exists. " +
-
+                "To allow this daemon to run, do one of the following:\n"
                 + " - Execute `daemon.cleanup()`.\n"
                 + f" - Delete the directory '{self.path}'.\n"
                 + " - Pass `allow_dirty_run=True` to `daemon.run()`.\n",
@@ -764,7 +771,7 @@ class Daemon:
         if '_process' not in self.__dict__ or self.__dict__['_process'].pid != int(pid):
             try:
                 self._process = psutil.Process(int(pid))
-            except Exception
+            except Exception:
                 if self.pid_path.exists():
                     self.pid_path.unlink()
                 return None
@@ -788,7 +795,7 @@ class Daemon:
         if self.pid_path.exists():
             try:
                 self.pid_path.unlink()
-            except Exception
+            except Exception:
                 pass
         return 'stopped'
 
@@ -1000,7 +1007,7 @@ class Daemon:
         try:
             with open(self.properties_path, 'r', encoding='utf-8') as file:
                 properties = json.load(file)
-        except Exception
+        except Exception:
             properties = {}
 
         return properties
meerschaum/utils/dataframe.py
CHANGED
@@ -139,7 +139,6 @@ def filter_unseen_df(
     import functools
     import traceback
     from decimal import Decimal
-    from uuid import UUID
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.packages import import_pandas, attempt_import
     from meerschaum.utils.dtypes import (
@@ -147,6 +146,7 @@ def filter_unseen_df(
         are_dtypes_equal,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone,
     )
     pd = import_pandas(debug=debug)
@@ -333,6 +333,11 @@ def filter_unseen_df(
     old_uuid_cols = get_uuid_cols(old_df)
     new_uuid_cols = get_uuid_cols(new_df)
     uuid_cols = set(new_uuid_cols + old_uuid_cols)
+
+    old_bytes_cols = get_bytes_cols(old_df)
+    new_bytes_cols = get_bytes_cols(new_df)
+    bytes_cols = set(new_bytes_cols + old_bytes_cols)
+
     joined_df = merge(
         new_df.infer_objects(copy=False).fillna(NA),
         old_df.infer_objects(copy=False).fillna(NA),
@@ -368,6 +373,14 @@ def filter_unseen_df(
         except Exception:
             warn(f"Unable to parse numeric column '{uuid_col}':\n{traceback.format_exc()}")
 
+    for bytes_col in bytes_cols:
+        if bytes_col not in delta_df.columns:
+            continue
+        try:
+            delta_df[bytes_col] = delta_df[bytes_col].apply(attempt_cast_to_bytes)
+        except Exception:
+            warn(f"Unable to parse bytes column '{bytes_col}':\n{traceback.format_exc()}")
+
     return delta_df
 
 
@@ -429,6 +442,7 @@ def parse_df_datetimes(
     from meerschaum.utils.debug import dprint
     from meerschaum.utils.warnings import warn
     from meerschaum.utils.misc import items_str
+    from meerschaum.utils.dtypes import to_datetime
     import traceback
     pd = import_pandas()
     pandas = attempt_import('pandas')
@@ -494,7 +508,7 @@ def parse_df_datetimes(
 
     if len(cols_to_inspect) == 0:
         if debug:
-            dprint(
+            dprint("All columns are ignored, skipping datetime detection...")
         return df.fillna(pandas.NA)
 
     ### apply regex to columns to determine which are ISO datetimes
@@ -515,14 +529,10 @@ def parse_df_datetimes(
 
     try:
         if not using_dask:
-            df[datetime_cols] = df[datetime_cols].apply(
-                pd.to_datetime,
-                utc=True,
-                format='ISO8601',
-            )
+            df[datetime_cols] = df[datetime_cols].apply(to_datetime)
         else:
             df[datetime_cols] = df[datetime_cols].apply(
-
+                to_datetime,
                 utc=True,
                 axis=1,
                 meta={
@@ -665,7 +675,7 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
 
     Returns
     -------
-    A list of columns to treat as
+    A list of columns to treat as UUIDs.
     """
     if df is None:
         return []
@@ -692,6 +702,135 @@ def get_uuid_cols(df: 'pd.DataFrame') -> List[str]:
     ]
 
 
+def get_datetime_cols(
+    df: 'pd.DataFrame',
+    timezone_aware: bool = True,
+    timezone_naive: bool = True,
+) -> List[str]:
+    """
+    Get the columns which contain `datetime` or `Timestamp` objects from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain `datetime` or `Timestamp` objects.
+
+    timezone_aware: bool, default True
+        If `True`, include timezone-aware datetime columns.
+
+    timezone_naive: bool, default True
+        If `True`, include timezone-naive datetime columns.
+
+    Returns
+    -------
+    A list of columns to treat as datetimes.
+    """
+    if not timezone_aware and not timezone_naive:
+        raise ValueError("`timezone_aware` and `timezone_naive` cannot both be `False`.")
+
+    if df is None:
+        return []
+
+    from datetime import datetime
+    from meerschaum.utils.dtypes import are_dtypes_equal
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    known_dt_cols = [
+        col
+        for col, typ in df.dtypes.items()
+        if are_dtypes_equal('datetime', str(typ))
+    ]
+
+    if len(df) == 0:
+        return known_dt_cols
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+        if col not in known_dt_cols
+    }
+    pydt_cols = [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
+            isinstance(df.loc[ix][col], datetime)
+        )
+    ]
+    dt_cols_set = set(known_dt_cols + pydt_cols)
+    all_dt_cols = [
+        col
+        for col in df.columns
+        if col in dt_cols_set
+    ]
+    if timezone_aware and timezone_naive:
+        return all_dt_cols
+
+    known_timezone_aware_dt_cols = [
+        col
+        for col in known_dt_cols
+        if getattr(df[col], 'tz', None) is not None
+    ]
+    timezone_aware_pydt_cols = [
+        col
+        for col in pydt_cols
+        if df.loc[cols_indices[col]][col].tzinfo is not None
+    ]
+    timezone_aware_dt_cols_set = set(known_timezone_aware_dt_cols + timezone_aware_pydt_cols)
+    if timezone_aware:
+        return [
+            col
+            for col in all_dt_cols
+            if col in timezone_aware_pydt_cols
+        ]
+
+    return [
+        col
+        for col in all_dt_cols
+        if col not in timezone_aware_dt_cols_set
+    ]
+
+
+def get_bytes_cols(df: 'pd.DataFrame') -> List[str]:
+    """
+    Get the columns which contain bytes strings from a Pandas DataFrame.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame which may contain bytes strings.
+
+    Returns
+    -------
+    A list of columns to treat as bytes.
+    """
+    if df is None:
+        return []
+    is_dask = 'dask' in df.__module__
+    if is_dask:
+        df = get_first_valid_dask_partition(df)
+
+    if len(df) == 0:
+        return []
+
+    cols_indices = {
+        col: df[col].first_valid_index()
+        for col in df.columns
+    }
+    return [
+        col
+        for col, ix in cols_indices.items()
+        if (
+            ix is not None
+            and
            isinstance(df.loc[ix][col], bytes)
+        )
+    ]
+
+
 def enforce_dtypes(
     df: 'pd.DataFrame',
     dtypes: Dict[str, str],
@@ -743,6 +882,7 @@ def enforce_dtypes(
         is_dtype_numeric,
         attempt_cast_to_numeric,
         attempt_cast_to_uuid,
+        attempt_cast_to_bytes,
         coerce_timezone as _coerce_timezone,
     )
     pandas = mrsm.attempt_import('pandas')
@@ -773,6 +913,11 @@ def enforce_dtypes(
         for col, typ in dtypes.items()
         if typ == 'uuid'
     ]
+    bytes_cols = [
+        col
+        for col, typ in dtypes.items()
+        if typ == 'bytes'
+    ]
     datetime_cols = [
         col
         for col, typ in dtypes.items()
@@ -826,6 +971,17 @@ def enforce_dtypes(
                 if debug:
                     dprint(f"Unable to parse column '{col}' as UUID:\n{e}")
 
+    if bytes_cols:
+        if debug:
+            dprint(f"Checking for bytes: {bytes_cols}")
+        for col in bytes_cols:
+            if col in df.columns:
+                try:
+                    df[col] = df[col].apply(attempt_cast_to_bytes)
+                except Exception as e:
+                    if debug:
+                        dprint(f"Unable to parse column '{col}' as bytes:\n{e}")
+
     if datetime_cols and coerce_timezone:
         if debug:
             dprint(f"Checking for datetime conversion: {datetime_cols}")
@@ -931,6 +1087,8 @@ def get_datetime_bound_from_df(
     -------
     The minimum or maximum datetime value in the dataframe, or `None`.
     """
+    from meerschaum.utils.dtypes import to_datetime, value_is_null
+
     if df is None:
         return None
     if not datetime_column:
@@ -982,9 +1140,9 @@ def get_datetime_bound_from_df(
             dt_val = dt_val.compute()
 
         return (
-
+            to_datetime(dt_val, as_pydatetime=True)
             if are_dtypes_equal(str(type(dt_val)), 'datetime')
-            else (dt_val if
+            else (dt_val if not value_is_null(dt_val) else None)
         )
 
     return None
@@ -1127,7 +1285,7 @@ def get_first_valid_dask_partition(ddf: 'dask.dataframe.DataFrame') -> Union['pd.DataFrame', None]:
     for partition in ddf.partitions:
         try:
             pdf = partition.compute()
-        except Exception
+        except Exception:
             continue
         if len(pdf) > 0:
             return pdf
@@ -1408,12 +1566,16 @@ def to_json(
     A JSON string.
     """
     from meerschaum.utils.packages import import_pandas
+    from meerschaum.utils.dtypes import serialize_bytes
    pd = import_pandas()
    uuid_cols = get_uuid_cols(df)
-
+    bytes_cols = get_bytes_cols(df)
+    if safe_copy and bool(uuid_cols or bytes_cols):
        df = df.copy()
    for col in uuid_cols:
        df[col] = df[col].astype(str)
+    for col in bytes_cols:
+        df[col] = df[col].apply(serialize_bytes)
    return df.infer_objects(copy=False).fillna(pd.NA).to_json(
        date_format=date_format,
        date_unit=date_unit,