meerschaum 2.5.0__py3-none-any.whl → 2.6.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- meerschaum/_internal/arguments/_parser.py +6 -1
- meerschaum/_internal/entry.py +16 -5
- meerschaum/actions/edit.py +6 -6
- meerschaum/actions/sql.py +12 -11
- meerschaum/api/dash/pipes.py +95 -13
- meerschaum/api/routes/_webterm.py +1 -0
- meerschaum/config/_edit.py +46 -19
- meerschaum/config/_read_config.py +20 -9
- meerschaum/config/_version.py +1 -1
- meerschaum/config/stack/__init__.py +1 -1
- meerschaum/connectors/sql/_pipes.py +80 -24
- meerschaum/connectors/sql/_sql.py +29 -10
- meerschaum/connectors/valkey/_pipes.py +1 -1
- meerschaum/core/Pipe/__init__.py +8 -9
- meerschaum/core/Pipe/_attributes.py +33 -11
- meerschaum/core/Pipe/_data.py +26 -7
- meerschaum/core/Pipe/_dtypes.py +4 -4
- meerschaum/core/Pipe/_fetch.py +1 -1
- meerschaum/core/Pipe/_sync.py +16 -4
- meerschaum/core/Pipe/_verify.py +1 -1
- meerschaum/utils/dataframe.py +58 -31
- meerschaum/utils/dtypes/__init__.py +16 -5
- meerschaum/utils/dtypes/sql.py +58 -28
- meerschaum/utils/misc.py +49 -16
- meerschaum/utils/packages/_packages.py +2 -1
- meerschaum/utils/schedule.py +7 -5
- meerschaum/utils/sql.py +224 -40
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/METADATA +5 -3
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/RECORD +35 -35
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/WHEEL +1 -1
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/LICENSE +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/NOTICE +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/entry_points.txt +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/top_level.txt +0 -0
- {meerschaum-2.5.0.dist-info → meerschaum-2.6.0.dev1.dist-info}/zip-safe +0 -0
@@ -404,7 +404,7 @@ def get_create_index_queries(
|
|
404
404
|
indices = pipe.indices
|
405
405
|
|
406
406
|
_datetime = pipe.get_columns('datetime', error=False)
|
407
|
-
_datetime_type = pipe.dtypes.get(_datetime, 'datetime64[ns]')
|
407
|
+
_datetime_type = pipe.dtypes.get(_datetime, 'datetime64[ns, UTC]')
|
408
408
|
_datetime_name = (
|
409
409
|
sql_item_name(_datetime, self.flavor, None)
|
410
410
|
if _datetime is not None else None
|
@@ -738,7 +738,7 @@ def get_pipe_data(
|
|
738
738
|
dt_type = dtypes.get(_dt, 'object').lower()
|
739
739
|
if 'datetime' not in dt_type:
|
740
740
|
if 'int' not in dt_type:
|
741
|
-
dtypes[_dt] = 'datetime64[ns]'
|
741
|
+
dtypes[_dt] = 'datetime64[ns, UTC]'
|
742
742
|
existing_cols = pipe.get_columns_types(debug=debug)
|
743
743
|
select_columns = (
|
744
744
|
[
|
@@ -1197,7 +1197,12 @@ def sync_pipe(
|
|
1197
1197
|
A `SuccessTuple` of success (`bool`) and message (`str`).
|
1198
1198
|
"""
|
1199
1199
|
from meerschaum.utils.packages import import_pandas
|
1200
|
-
from meerschaum.utils.sql import
|
1200
|
+
from meerschaum.utils.sql import (
|
1201
|
+
get_update_queries,
|
1202
|
+
sql_item_name,
|
1203
|
+
update_queries,
|
1204
|
+
get_create_table_queries,
|
1205
|
+
)
|
1201
1206
|
from meerschaum.utils.misc import generate_password
|
1202
1207
|
from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols
|
1203
1208
|
from meerschaum.utils.dtypes import are_dtypes_equal
|
@@ -1232,7 +1237,6 @@ def sync_pipe(
|
|
1232
1237
|
|
1233
1238
|
### if table does not exist, create it with indices
|
1234
1239
|
is_new = False
|
1235
|
-
add_cols_query = None
|
1236
1240
|
if not pipe.exists(debug=debug):
|
1237
1241
|
check_existing = False
|
1238
1242
|
is_new = True
|
@@ -1252,9 +1256,7 @@ def sync_pipe(
|
|
1252
1256
|
|
1253
1257
|
### NOTE: Oracle SQL < 23c (2023) and SQLite does not support booleans,
|
1254
1258
|
### so infer bools and persist them to `dtypes`.
|
1255
|
-
|
1256
|
-
### to avoid merge issues.
|
1257
|
-
if self.flavor in ('oracle', 'sqlite', 'mssql', 'mysql', 'mariadb'):
|
1259
|
+
if self.flavor in ('oracle', 'sqlite', 'mysql', 'mariadb'):
|
1258
1260
|
pipe_dtypes = pipe.dtypes
|
1259
1261
|
new_bool_cols = {
|
1260
1262
|
col: 'bool[pyarrow]'
|
@@ -1309,7 +1311,60 @@ def sync_pipe(
|
|
1309
1311
|
'schema': self.get_pipe_schema(pipe),
|
1310
1312
|
})
|
1311
1313
|
|
1314
|
+
primary_key = pipe.columns.get('primary', None)
|
1315
|
+
new_dtypes = {
|
1316
|
+
**{
|
1317
|
+
col: str(typ)
|
1318
|
+
for col, typ in unseen_df.dtypes.items()
|
1319
|
+
},
|
1320
|
+
**{
|
1321
|
+
col: 'int'
|
1322
|
+
for col_ix, col in pipe.columns.items()
|
1323
|
+
if col_ix != 'primary'
|
1324
|
+
},
|
1325
|
+
**pipe.dtypes
|
1326
|
+
} if is_new else {}
|
1327
|
+
autoincrement = (
|
1328
|
+
pipe.parameters.get('autoincrement', False)
|
1329
|
+
or (is_new and primary_key and primary_key not in new_dtypes)
|
1330
|
+
)
|
1331
|
+
if autoincrement and autoincrement not in pipe.parameters:
|
1332
|
+
pipe.parameters['autoincrement'] = autoincrement
|
1333
|
+
edit_success, edit_msg = pipe.edit(debug=debug)
|
1334
|
+
if not edit_success:
|
1335
|
+
return edit_success, edit_msg
|
1336
|
+
|
1337
|
+
if autoincrement and primary_key and primary_key not in df.columns:
|
1338
|
+
if unseen_df is not None and primary_key in unseen_df.columns:
|
1339
|
+
del unseen_df[primary_key]
|
1340
|
+
if update_df is not None and primary_key in update_df.columns:
|
1341
|
+
del update_df[primary_key]
|
1342
|
+
if delta_df is not None and primary_key in delta_df.columns:
|
1343
|
+
del delta_df[primary_key]
|
1344
|
+
|
1345
|
+
if is_new:
|
1346
|
+
if autoincrement:
|
1347
|
+
_ = new_dtypes.pop(primary_key, None)
|
1348
|
+
|
1349
|
+
### TODO: see if this can be removed
|
1350
|
+
if 'datetime' in pipe.columns and self.flavor == 'timescaledb':
|
1351
|
+
primary_key = None
|
1352
|
+
|
1353
|
+
create_table_queries = get_create_table_queries(
|
1354
|
+
new_dtypes,
|
1355
|
+
pipe.target,
|
1356
|
+
self.flavor,
|
1357
|
+
schema=self.get_pipe_schema(pipe),
|
1358
|
+
primary_key=primary_key,
|
1359
|
+
)
|
1360
|
+
create_success = all(
|
1361
|
+
self.exec_queries(create_table_queries, break_on_error=True, rollback=True, debug=debug)
|
1362
|
+
)
|
1363
|
+
if not create_success:
|
1364
|
+
warn(f"Failed to create '{pipe.target}'. Continuing...")
|
1365
|
+
|
1312
1366
|
stats = self.to_sql(unseen_df, **unseen_kw)
|
1367
|
+
|
1313
1368
|
if is_new:
|
1314
1369
|
if not self.create_indices(pipe, debug=debug):
|
1315
1370
|
warn(f"Failed to create indices for {pipe}. Continuing...")
|
@@ -1358,7 +1413,7 @@ def sync_pipe(
|
|
1358
1413
|
]
|
1359
1414
|
update_queries = get_update_queries(
|
1360
1415
|
pipe.target,
|
1361
|
-
temp_target,
|
1416
|
+
temp_target,
|
1362
1417
|
self,
|
1363
1418
|
join_cols,
|
1364
1419
|
upsert=upsert,
|
@@ -1960,7 +2015,7 @@ def get_sync_time(
|
|
1960
2015
|
table = sql_item_name(pipe.target, self.flavor, self.get_pipe_schema(pipe))
|
1961
2016
|
|
1962
2017
|
dt_col = pipe.columns.get('datetime', None)
|
1963
|
-
dt_type = pipe.dtypes.get(dt_col, 'datetime64[ns]')
|
2018
|
+
dt_type = pipe.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
1964
2019
|
if not dt_col:
|
1965
2020
|
_dt = pipe.guess_datetime()
|
1966
2021
|
dt = sql_item_name(_dt, self.flavor, None) if _dt else None
|
@@ -2366,7 +2421,7 @@ def get_pipe_columns_types(
|
|
2366
2421
|
----------
|
2367
2422
|
pipe: mrsm.Pipe:
|
2368
2423
|
The pipe to get the columns for.
|
2369
|
-
|
2424
|
+
|
2370
2425
|
Returns
|
2371
2426
|
-------
|
2372
2427
|
A dictionary of columns names (`str`) and types (`str`).
|
@@ -2381,17 +2436,18 @@ def get_pipe_columns_types(
|
|
2381
2436
|
}
|
2382
2437
|
>>>
|
2383
2438
|
"""
|
2439
|
+
from meerschaum.utils.sql import get_table_cols_types
|
2384
2440
|
if not pipe.exists(debug=debug):
|
2385
2441
|
return {}
|
2386
2442
|
|
2387
|
-
if self.flavor
|
2388
|
-
|
2389
|
-
|
2390
|
-
|
2391
|
-
|
2392
|
-
|
2393
|
-
|
2394
|
-
|
2443
|
+
# if self.flavor not in ('oracle', 'mysql', 'mariadb'):
|
2444
|
+
return get_table_cols_types(
|
2445
|
+
pipe.target,
|
2446
|
+
self,
|
2447
|
+
flavor=self.flavor,
|
2448
|
+
schema=self.get_pipe_schema(pipe),
|
2449
|
+
debug=debug,
|
2450
|
+
)
|
2395
2451
|
|
2396
2452
|
table_columns = {}
|
2397
2453
|
try:
|
@@ -2823,11 +2879,11 @@ def get_alter_columns_queries(
|
|
2823
2879
|
|
2824
2880
|
|
2825
2881
|
def get_to_sql_dtype(
|
2826
|
-
|
2827
|
-
|
2828
|
-
|
2829
|
-
|
2830
|
-
|
2882
|
+
self,
|
2883
|
+
pipe: 'mrsm.Pipe',
|
2884
|
+
df: 'pd.DataFrame',
|
2885
|
+
update_dtypes: bool = True,
|
2886
|
+
) -> Dict[str, 'sqlalchemy.sql.visitors.TraversibleType']:
|
2831
2887
|
"""
|
2832
2888
|
Given a pipe and DataFrame, return the `dtype` dictionary for `to_sql()`.
|
2833
2889
|
|
@@ -2947,7 +3003,7 @@ def deduplicate_pipe(
|
|
2947
3003
|
duplicates_cte_name = sql_item_name('dups', self.flavor, None)
|
2948
3004
|
duplicate_row_number_name = sql_item_name('dup_row_num', self.flavor, None)
|
2949
3005
|
previous_row_number_name = sql_item_name('prev_row_num', self.flavor, None)
|
2950
|
-
|
3006
|
+
|
2951
3007
|
index_list_str = (
|
2952
3008
|
sql_item_name(dt_col, self.flavor, None)
|
2953
3009
|
if dt_col
|
@@ -17,8 +17,8 @@ from meerschaum.utils.warnings import warn
|
|
17
17
|
### database flavors that can use bulk insert
|
18
18
|
_bulk_flavors = {'postgresql', 'timescaledb', 'citus'}
|
19
19
|
### flavors that do not support chunks
|
20
|
-
_disallow_chunks_flavors = [
|
21
|
-
_max_chunks_flavors = {'sqlite': 1000
|
20
|
+
_disallow_chunks_flavors = []
|
21
|
+
_max_chunks_flavors = {'sqlite': 1000}
|
22
22
|
SKIP_READ_TRANSACTION_FLAVORS: list[str] = ['mssql']
|
23
23
|
|
24
24
|
|
@@ -123,7 +123,8 @@ def read(
|
|
123
123
|
if chunks is not None and chunks <= 0:
|
124
124
|
return []
|
125
125
|
from meerschaum.utils.sql import sql_item_name, truncate_item_name
|
126
|
-
from meerschaum.utils.dtypes
|
126
|
+
from meerschaum.utils.dtypes import are_dtypes_equal, coerce_timezone
|
127
|
+
from meerschaum.utils.dtypes.sql import NUMERIC_PRECISION_FLAVORS, TIMEZONE_NAIVE_FLAVORS
|
127
128
|
from meerschaum.utils.packages import attempt_import, import_pandas
|
128
129
|
from meerschaum.utils.pool import get_pool
|
129
130
|
from meerschaum.utils.dataframe import chunksize_to_npartitions, get_numeric_cols
|
@@ -139,6 +140,16 @@ def read(
|
|
139
140
|
if is_dask:
|
140
141
|
chunksize = None
|
141
142
|
schema = schema or self.schema
|
143
|
+
utc_dt_cols = [
|
144
|
+
col
|
145
|
+
for col, typ in dtype.items()
|
146
|
+
if are_dtypes_equal(typ, 'datetime') and 'utc' in typ.lower()
|
147
|
+
] if dtype else []
|
148
|
+
|
149
|
+
if dtype and utc_dt_cols and self.flavor in TIMEZONE_NAIVE_FLAVORS:
|
150
|
+
dtype = dtype.copy()
|
151
|
+
for col in utc_dt_cols:
|
152
|
+
dtype[col] = 'datetime64[ns]'
|
142
153
|
|
143
154
|
pool = get_pool(workers=workers)
|
144
155
|
sqlalchemy = attempt_import("sqlalchemy")
|
@@ -162,7 +173,6 @@ def read(
|
|
162
173
|
)
|
163
174
|
chunksize = _max_chunks_flavors[self.flavor]
|
164
175
|
|
165
|
-
### NOTE: A bug in duckdb_engine does not allow for chunks.
|
166
176
|
if chunksize is not None and self.flavor in _disallow_chunks_flavors:
|
167
177
|
chunksize = None
|
168
178
|
|
@@ -206,6 +216,9 @@ def read(
|
|
206
216
|
chunk_list = []
|
207
217
|
chunk_hook_results = []
|
208
218
|
def _process_chunk(_chunk, _retry_on_failure: bool = True):
|
219
|
+
if self.flavor in TIMEZONE_NAIVE_FLAVORS:
|
220
|
+
for col in utc_dt_cols:
|
221
|
+
_chunk[col] = coerce_timezone(_chunk[col], strip_timezone=False)
|
209
222
|
if not as_hook_results:
|
210
223
|
chunk_list.append(_chunk)
|
211
224
|
if chunk_hook is None:
|
@@ -765,7 +778,7 @@ def to_sql(
|
|
765
778
|
DROP_IF_EXISTS_FLAVORS,
|
766
779
|
)
|
767
780
|
from meerschaum.utils.dataframe import get_json_cols, get_numeric_cols, get_uuid_cols
|
768
|
-
from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal
|
781
|
+
from meerschaum.utils.dtypes import are_dtypes_equal, quantize_decimal, coerce_timezone
|
769
782
|
from meerschaum.utils.dtypes.sql import (
|
770
783
|
NUMERIC_PRECISION_FLAVORS,
|
771
784
|
PD_TO_SQLALCHEMY_DTYPES_FLAVORS,
|
@@ -848,7 +861,6 @@ def to_sql(
|
|
848
861
|
if not success:
|
849
862
|
warn(f"Unable to drop {name}")
|
850
863
|
|
851
|
-
|
852
864
|
### Enforce NVARCHAR(2000) as text instead of CLOB.
|
853
865
|
dtype = to_sql_kw.get('dtype', {})
|
854
866
|
for col, typ in df.dtypes.items():
|
@@ -858,11 +870,18 @@ def to_sql(
|
|
858
870
|
dtype[col] = sqlalchemy.types.INTEGER
|
859
871
|
to_sql_kw['dtype'] = dtype
|
860
872
|
elif self.flavor == 'mssql':
|
873
|
+
pass
|
874
|
+
### TODO clean this up
|
875
|
+
# dtype = to_sql_kw.get('dtype', {})
|
876
|
+
# for col, typ in df.dtypes.items():
|
877
|
+
# if are_dtypes_equal(str(typ), 'bool'):
|
878
|
+
# dtype[col] = sqlalchemy.types.INTEGER
|
879
|
+
# to_sql_kw['dtype'] = dtype
|
880
|
+
elif self.flavor == 'duckdb':
|
861
881
|
dtype = to_sql_kw.get('dtype', {})
|
862
|
-
for col, typ in df.dtypes.items()
|
863
|
-
|
864
|
-
|
865
|
-
to_sql_kw['dtype'] = dtype
|
882
|
+
dt_cols = [col for col, typ in df.dtypes.items() if are_dtypes_equal(str(typ), 'datetime')]
|
883
|
+
for col in dt_cols:
|
884
|
+
df[col] = coerce_timezone(df[col], strip_utc=False)
|
866
885
|
|
867
886
|
### Check for JSON columns.
|
868
887
|
if self.flavor not in json_flavors:
|
@@ -706,7 +706,7 @@ def get_sync_time(
|
|
706
706
|
"""
|
707
707
|
from meerschaum.utils.dtypes import are_dtypes_equal
|
708
708
|
dt_col = pipe.columns.get('datetime', None)
|
709
|
-
dt_typ = pipe.dtypes.get(dt_col, 'datetime64[ns]')
|
709
|
+
dt_typ = pipe.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
710
710
|
if not dt_col:
|
711
711
|
return None
|
712
712
|
|
meerschaum/core/Pipe/__init__.py
CHANGED
@@ -153,6 +153,7 @@ class Pipe:
|
|
153
153
|
dtypes: Optional[Dict[str, str]] = None,
|
154
154
|
instance: Optional[Union[str, InstanceConnector]] = None,
|
155
155
|
temporary: bool = False,
|
156
|
+
upsert: Optional[bool] = None,
|
156
157
|
mrsm_instance: Optional[Union[str, InstanceConnector]] = None,
|
157
158
|
cache: bool = False,
|
158
159
|
debug: bool = False,
|
@@ -201,6 +202,9 @@ class Pipe:
|
|
201
202
|
instance: Optional[Union[str, InstanceConnector]], default None
|
202
203
|
Alias for `mrsm_instance`. If `mrsm_instance` is supplied, this value is ignored.
|
203
204
|
|
205
|
+
upsert: Optional[bool], default None
|
206
|
+
If `True`, set `upsert` to `True` in the parameters.
|
207
|
+
|
204
208
|
temporary: bool, default False
|
205
209
|
If `True`, prevent instance tables (pipes, users, plugins) from being created.
|
206
210
|
|
@@ -268,7 +272,7 @@ class Pipe:
|
|
268
272
|
or indexes
|
269
273
|
or self._attributes.get('parameters', {}).get('indices', None)
|
270
274
|
or self._attributes.get('parameters', {}).get('indexes', None)
|
271
|
-
)
|
275
|
+
)
|
272
276
|
if isinstance(indices, dict):
|
273
277
|
indices_key = (
|
274
278
|
'indexes'
|
@@ -292,6 +296,9 @@ class Pipe:
|
|
292
296
|
elif dtypes is not None:
|
293
297
|
warn(f"The provided dtypes are of invalid type '{type(dtypes)}'.")
|
294
298
|
|
299
|
+
if isinstance(upsert, bool):
|
300
|
+
self._attributes['parameters']['upsert'] = upsert
|
301
|
+
|
295
302
|
### NOTE: The parameters dictionary is {} by default.
|
296
303
|
### A Pipe may be registered without parameters, then edited,
|
297
304
|
### or a Pipe may be registered with parameters set in-memory first.
|
@@ -308,7 +315,6 @@ class Pipe:
|
|
308
315
|
|
309
316
|
self._cache = cache and get_config('system', 'experimental', 'cache')
|
310
317
|
|
311
|
-
|
312
318
|
@property
|
313
319
|
def meta(self):
|
314
320
|
"""
|
@@ -321,7 +327,6 @@ class Pipe:
|
|
321
327
|
'instance': self.instance_keys,
|
322
328
|
}
|
323
329
|
|
324
|
-
|
325
330
|
def keys(self) -> List[str]:
|
326
331
|
"""
|
327
332
|
Return the ordered keys for this pipe.
|
@@ -332,7 +337,6 @@ class Pipe:
|
|
332
337
|
if key != 'instance'
|
333
338
|
}
|
334
339
|
|
335
|
-
|
336
340
|
@property
|
337
341
|
def instance_connector(self) -> Union[InstanceConnector, None]:
|
338
342
|
"""
|
@@ -369,7 +373,6 @@ class Pipe:
|
|
369
373
|
return None
|
370
374
|
return self._connector
|
371
375
|
|
372
|
-
|
373
376
|
@property
|
374
377
|
def cache_connector(self) -> Union[meerschaum.connectors.sql.SQLConnector, None]:
|
375
378
|
"""
|
@@ -391,7 +394,6 @@ class Pipe:
|
|
391
394
|
|
392
395
|
return self._cache_connector
|
393
396
|
|
394
|
-
|
395
397
|
@property
|
396
398
|
def cache_pipe(self) -> Union['meerschaum.Pipe', None]:
|
397
399
|
"""
|
@@ -433,11 +435,9 @@ class Pipe:
|
|
433
435
|
|
434
436
|
return self._cache_pipe
|
435
437
|
|
436
|
-
|
437
438
|
def __str__(self, ansi: bool=False):
|
438
439
|
return pipe_repr(self, ansi=ansi)
|
439
440
|
|
440
|
-
|
441
441
|
def __eq__(self, other):
|
442
442
|
try:
|
443
443
|
return (
|
@@ -489,7 +489,6 @@ class Pipe:
|
|
489
489
|
"""
|
490
490
|
self.__init__(**_state)
|
491
491
|
|
492
|
-
|
493
492
|
def __getitem__(self, key: str) -> Any:
|
494
493
|
"""
|
495
494
|
Index the pipe's attributes.
|
@@ -103,10 +103,25 @@ def indices(self) -> Union[Dict[str, Union[str, List[str]]], None]:
|
|
103
103
|
if indices_key not in self.parameters:
|
104
104
|
self.parameters[indices_key] = {}
|
105
105
|
_indices = self.parameters[indices_key]
|
106
|
+
_columns = self.columns
|
107
|
+
dt_col = _columns.get('datetime', None)
|
106
108
|
if not isinstance(_indices, dict):
|
107
109
|
_indices = {}
|
108
110
|
self.parameters[indices_key] = _indices
|
109
|
-
|
111
|
+
unique_cols = (
|
112
|
+
[dt_col]
|
113
|
+
if dt_col
|
114
|
+
else []
|
115
|
+
) + [
|
116
|
+
col
|
117
|
+
for col_ix, col in _columns.items()
|
118
|
+
if col_ix != 'datetime'
|
119
|
+
]
|
120
|
+
return {
|
121
|
+
**({'unique': unique_cols} if len(unique_cols) > 1 else {}),
|
122
|
+
**_columns,
|
123
|
+
**_indices
|
124
|
+
}
|
110
125
|
|
111
126
|
|
112
127
|
@property
|
@@ -196,7 +211,7 @@ def get_columns(self, *args: str, error: bool = False) -> Union[str, Tuple[str]]
|
|
196
211
|
----------
|
197
212
|
*args: str
|
198
213
|
The column names to be retrieved.
|
199
|
-
|
214
|
+
|
200
215
|
error: bool, default False
|
201
216
|
If `True`, raise an `Exception` if the specified column is not defined.
|
202
217
|
|
@@ -509,15 +524,22 @@ def get_indices(self) -> Dict[str, str]:
|
|
509
524
|
if cols
|
510
525
|
}
|
511
526
|
_index_names = {
|
512
|
-
ix: (
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
location_key=self.location_key,
|
519
|
-
)
|
527
|
+
ix: _index_template.format(
|
528
|
+
target=_target,
|
529
|
+
column_names=column_names,
|
530
|
+
connector_keys=self.connector_keys,
|
531
|
+
metric_key=self.connector_key,
|
532
|
+
location_key=self.location_key,
|
520
533
|
)
|
521
534
|
for ix, column_names in _column_names.items()
|
522
535
|
}
|
523
|
-
|
536
|
+
### NOTE: Skip any duplicate indices.
|
537
|
+
seen_index_names = {}
|
538
|
+
for ix, index_name in _index_names.items():
|
539
|
+
if index_name in seen_index_names:
|
540
|
+
continue
|
541
|
+
seen_index_names[index_name] = ix
|
542
|
+
return {
|
543
|
+
ix: index_name
|
544
|
+
for index_name, ix in seen_index_names.items()
|
545
|
+
}
|
meerschaum/core/Pipe/_data.py
CHANGED
@@ -23,8 +23,8 @@ def get_data(
|
|
23
23
|
self,
|
24
24
|
select_columns: Optional[List[str]] = None,
|
25
25
|
omit_columns: Optional[List[str]] = None,
|
26
|
-
begin: Union[datetime, int, None] = None,
|
27
|
-
end: Union[datetime, int, None] = None,
|
26
|
+
begin: Union[datetime, int, str, None] = None,
|
27
|
+
end: Union[datetime, int, str, None] = None,
|
28
28
|
params: Optional[Dict[str, Any]] = None,
|
29
29
|
as_iterator: bool = False,
|
30
30
|
as_chunks: bool = False,
|
@@ -48,12 +48,12 @@ def get_data(
|
|
48
48
|
omit_columns: Optional[List[str]], default None
|
49
49
|
If provided, remove these columns from the selection.
|
50
50
|
|
51
|
-
begin: Union[datetime, int, None], default None
|
51
|
+
begin: Union[datetime, int, str, None], default None
|
52
52
|
Lower bound datetime to begin searching for data (inclusive).
|
53
53
|
Translates to a `WHERE` clause like `WHERE datetime >= begin`.
|
54
54
|
Defaults to `None`.
|
55
55
|
|
56
|
-
end: Union[datetime, int, None], default None
|
56
|
+
end: Union[datetime, int, str, None], default None
|
57
57
|
Upper bound datetime to stop searching for data (inclusive).
|
58
58
|
Translates to a `WHERE` clause like `WHERE datetime < end`.
|
59
59
|
Defaults to `None`.
|
@@ -105,11 +105,12 @@ def get_data(
|
|
105
105
|
from meerschaum.utils.venv import Venv
|
106
106
|
from meerschaum.connectors import get_connector_plugin
|
107
107
|
from meerschaum.utils.misc import iterate_chunks, items_str
|
108
|
-
from meerschaum.utils.dtypes import to_pandas_dtype
|
108
|
+
from meerschaum.utils.dtypes import to_pandas_dtype, coerce_timezone
|
109
109
|
from meerschaum.utils.dataframe import add_missing_cols_to_df, df_is_chunk_generator
|
110
110
|
from meerschaum.utils.packages import attempt_import
|
111
111
|
dd = attempt_import('dask.dataframe') if as_dask else None
|
112
112
|
dask = attempt_import('dask') if as_dask else None
|
113
|
+
dateutil_parser = attempt_import('dateutil.parser')
|
113
114
|
|
114
115
|
if select_columns == '*':
|
115
116
|
select_columns = None
|
@@ -120,11 +121,29 @@ def get_data(
|
|
120
121
|
omit_columns = [omit_columns]
|
121
122
|
|
122
123
|
as_iterator = as_iterator or as_chunks
|
124
|
+
dt_col = self.columns.get('datetime', None)
|
125
|
+
dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
126
|
+
dt_is_utc = 'utc' in dt_typ.lower()
|
127
|
+
if isinstance(begin, str):
|
128
|
+
try:
|
129
|
+
begin = dateutil_parser.parse(begin)
|
130
|
+
except Exception as e:
|
131
|
+
warn(f"Failed to parse '{begin}' as datetime:\n{e}")
|
132
|
+
begin = None
|
133
|
+
if isinstance(end, str):
|
134
|
+
try:
|
135
|
+
end = dateutil_parser.parse(end)
|
136
|
+
except Exception as e:
|
137
|
+
warn(f"Failed to parse '{end}' as datetime:\n{e}")
|
138
|
+
end = None
|
139
|
+
if isinstance(begin, datetime):
|
140
|
+
begin = coerce_timezone(begin, strip_utc=(not dt_is_utc))
|
141
|
+
if isinstance(end, datetime):
|
142
|
+
end = coerce_timezone(end, strip_utc=(not dt_is_utc))
|
123
143
|
|
124
144
|
def _sort_df(_df):
|
125
145
|
if df_is_chunk_generator(_df):
|
126
146
|
return _df
|
127
|
-
dt_col = self.columns.get('datetime', None)
|
128
147
|
indices = [] if dt_col not in _df.columns else [dt_col]
|
129
148
|
non_dt_cols = [
|
130
149
|
col
|
@@ -607,7 +626,7 @@ def get_chunk_interval(
|
|
607
626
|
if dt_col is None:
|
608
627
|
return timedelta(minutes=chunk_minutes)
|
609
628
|
|
610
|
-
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
|
629
|
+
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
611
630
|
if 'int' in dt_dtype.lower():
|
612
631
|
return chunk_minutes
|
613
632
|
return timedelta(minutes=chunk_minutes)
|
meerschaum/core/Pipe/_dtypes.py
CHANGED
@@ -101,18 +101,18 @@ def infer_dtypes(self, persist: bool = False, debug: bool = False) -> Dict[str,
|
|
101
101
|
dt_col = self.columns.get('datetime', None)
|
102
102
|
if dt_col:
|
103
103
|
if not self.parameters.get('dtypes', {}).get(dt_col, None):
|
104
|
-
dtypes[dt_col] = 'datetime64[ns]'
|
104
|
+
dtypes[dt_col] = 'datetime64[ns, UTC]'
|
105
105
|
return dtypes
|
106
106
|
|
107
|
-
from meerschaum.utils.sql import
|
108
|
-
from meerschaum.utils.
|
107
|
+
from meerschaum.utils.dtypes.sql import get_pd_type_from_db_type
|
108
|
+
from meerschaum.utils.dtypes import to_pandas_dtype
|
109
109
|
columns_types = self.get_columns_types(debug=debug)
|
110
110
|
|
111
111
|
### NOTE: get_columns_types() may return either the types as
|
112
112
|
### PostgreSQL- or Pandas-style.
|
113
113
|
dtypes = {
|
114
114
|
c: (
|
115
|
-
|
115
|
+
get_pd_type_from_db_type(t, allow_custom_dtypes=True)
|
116
116
|
if str(t).isupper()
|
117
117
|
else to_pandas_dtype(t)
|
118
118
|
)
|
meerschaum/core/Pipe/_fetch.py
CHANGED
@@ -125,7 +125,7 @@ def get_backtrack_interval(
|
|
125
125
|
if dt_col is None:
|
126
126
|
return backtrack_interval
|
127
127
|
|
128
|
-
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns]')
|
128
|
+
dt_dtype = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
129
129
|
if 'int' in dt_dtype.lower():
|
130
130
|
return backtrack_minutes
|
131
131
|
|
meerschaum/core/Pipe/_sync.py
CHANGED
@@ -624,6 +624,18 @@ def filter_existing(
|
|
624
624
|
merge = pd.merge
|
625
625
|
NA = pd.NA
|
626
626
|
|
627
|
+
primary_key = self.columns.get('primary', None)
|
628
|
+
autoincrement = self.parameters.get('autoincrement', False)
|
629
|
+
pipe_columns = self.columns.copy()
|
630
|
+
|
631
|
+
if primary_key and autoincrement and df is not None and primary_key in df.columns:
|
632
|
+
if safe_copy:
|
633
|
+
df = df.copy()
|
634
|
+
safe_copy = False
|
635
|
+
if df[primary_key].isnull().all():
|
636
|
+
del df[primary_key]
|
637
|
+
_ = self.columns.pop(primary_key, None)
|
638
|
+
|
627
639
|
def get_empty_df():
|
628
640
|
empty_df = pd.DataFrame([])
|
629
641
|
dtypes = dict(df.dtypes) if df is not None else {}
|
@@ -643,8 +655,8 @@ def filter_existing(
|
|
643
655
|
|
644
656
|
### begin is the oldest data in the new dataframe
|
645
657
|
begin, end = None, None
|
646
|
-
dt_col =
|
647
|
-
dt_type = self.dtypes.get(dt_col, 'datetime64[ns]') if dt_col else None
|
658
|
+
dt_col = pipe_columns.get('datetime', None)
|
659
|
+
dt_type = self.dtypes.get(dt_col, 'datetime64[ns, UTC]') if dt_col else None
|
648
660
|
try:
|
649
661
|
min_dt_val = df[dt_col].min(skipna=True) if dt_col else None
|
650
662
|
if is_dask and min_dt_val is not None:
|
@@ -713,7 +725,7 @@ def filter_existing(
|
|
713
725
|
|
714
726
|
unique_index_vals = {
|
715
727
|
col: df[col].unique()
|
716
|
-
for col in
|
728
|
+
for col in pipe_columns
|
717
729
|
if col in df.columns and col != dt_col
|
718
730
|
} if not date_bound_only else {}
|
719
731
|
filter_params_index_limit = get_config('pipes', 'sync', 'filter_params_index_limit')
|
@@ -749,7 +761,7 @@ def filter_existing(
|
|
749
761
|
|
750
762
|
### Separate new rows from changed ones.
|
751
763
|
on_cols = [
|
752
|
-
col for col_key, col in
|
764
|
+
col for col_key, col in pipe_columns.items()
|
753
765
|
if (
|
754
766
|
col
|
755
767
|
and
|
meerschaum/core/Pipe/_verify.py
CHANGED
@@ -394,7 +394,7 @@ def get_bound_interval(self, debug: bool = False) -> Union[timedelta, int, None]
|
|
394
394
|
if not dt_col:
|
395
395
|
return bound_time_value
|
396
396
|
|
397
|
-
dt_typ = self.dtypes.get(dt_col, 'datetime64[ns]')
|
397
|
+
dt_typ = self.dtypes.get(dt_col, 'datetime64[ns, UTC]')
|
398
398
|
if 'int' in dt_typ.lower():
|
399
399
|
return int(bound_time_value)
|
400
400
|
|