mainsequence-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mainsequence/__init__.py +0 -0
- mainsequence/__main__.py +9 -0
- mainsequence/cli/__init__.py +1 -0
- mainsequence/cli/api.py +157 -0
- mainsequence/cli/cli.py +442 -0
- mainsequence/cli/config.py +78 -0
- mainsequence/cli/ssh_utils.py +126 -0
- mainsequence/client/__init__.py +17 -0
- mainsequence/client/base.py +431 -0
- mainsequence/client/data_sources_interfaces/__init__.py +0 -0
- mainsequence/client/data_sources_interfaces/duckdb.py +1468 -0
- mainsequence/client/data_sources_interfaces/timescale.py +479 -0
- mainsequence/client/models_helpers.py +113 -0
- mainsequence/client/models_report_studio.py +412 -0
- mainsequence/client/models_tdag.py +2276 -0
- mainsequence/client/models_vam.py +1983 -0
- mainsequence/client/utils.py +387 -0
- mainsequence/dashboards/__init__.py +0 -0
- mainsequence/dashboards/streamlit/__init__.py +0 -0
- mainsequence/dashboards/streamlit/assets/config.toml +12 -0
- mainsequence/dashboards/streamlit/assets/favicon.png +0 -0
- mainsequence/dashboards/streamlit/assets/logo.png +0 -0
- mainsequence/dashboards/streamlit/core/__init__.py +0 -0
- mainsequence/dashboards/streamlit/core/theme.py +212 -0
- mainsequence/dashboards/streamlit/pages/__init__.py +0 -0
- mainsequence/dashboards/streamlit/scaffold.py +220 -0
- mainsequence/instrumentation/__init__.py +7 -0
- mainsequence/instrumentation/utils.py +101 -0
- mainsequence/instruments/__init__.py +1 -0
- mainsequence/instruments/data_interface/__init__.py +10 -0
- mainsequence/instruments/data_interface/data_interface.py +361 -0
- mainsequence/instruments/instruments/__init__.py +3 -0
- mainsequence/instruments/instruments/base_instrument.py +85 -0
- mainsequence/instruments/instruments/bond.py +447 -0
- mainsequence/instruments/instruments/european_option.py +74 -0
- mainsequence/instruments/instruments/interest_rate_swap.py +217 -0
- mainsequence/instruments/instruments/json_codec.py +585 -0
- mainsequence/instruments/instruments/knockout_fx_option.py +146 -0
- mainsequence/instruments/instruments/position.py +475 -0
- mainsequence/instruments/instruments/ql_fields.py +239 -0
- mainsequence/instruments/instruments/vanilla_fx_option.py +107 -0
- mainsequence/instruments/pricing_models/__init__.py +0 -0
- mainsequence/instruments/pricing_models/black_scholes.py +49 -0
- mainsequence/instruments/pricing_models/bond_pricer.py +182 -0
- mainsequence/instruments/pricing_models/fx_option_pricer.py +90 -0
- mainsequence/instruments/pricing_models/indices.py +350 -0
- mainsequence/instruments/pricing_models/knockout_fx_pricer.py +209 -0
- mainsequence/instruments/pricing_models/swap_pricer.py +502 -0
- mainsequence/instruments/settings.py +175 -0
- mainsequence/instruments/utils.py +29 -0
- mainsequence/logconf.py +284 -0
- mainsequence/reportbuilder/__init__.py +0 -0
- mainsequence/reportbuilder/__main__.py +0 -0
- mainsequence/reportbuilder/examples/ms_template_report.py +706 -0
- mainsequence/reportbuilder/model.py +713 -0
- mainsequence/reportbuilder/slide_templates.py +532 -0
- mainsequence/tdag/__init__.py +8 -0
- mainsequence/tdag/__main__.py +0 -0
- mainsequence/tdag/config.py +129 -0
- mainsequence/tdag/data_nodes/__init__.py +12 -0
- mainsequence/tdag/data_nodes/build_operations.py +751 -0
- mainsequence/tdag/data_nodes/data_nodes.py +1292 -0
- mainsequence/tdag/data_nodes/persist_managers.py +812 -0
- mainsequence/tdag/data_nodes/run_operations.py +543 -0
- mainsequence/tdag/data_nodes/utils.py +24 -0
- mainsequence/tdag/future_registry.py +25 -0
- mainsequence/tdag/utils.py +40 -0
- mainsequence/virtualfundbuilder/__init__.py +45 -0
- mainsequence/virtualfundbuilder/__main__.py +235 -0
- mainsequence/virtualfundbuilder/agent_interface.py +77 -0
- mainsequence/virtualfundbuilder/config_handling.py +86 -0
- mainsequence/virtualfundbuilder/contrib/__init__.py +0 -0
- mainsequence/virtualfundbuilder/contrib/apps/__init__.py +8 -0
- mainsequence/virtualfundbuilder/contrib/apps/etf_replicator_app.py +164 -0
- mainsequence/virtualfundbuilder/contrib/apps/generate_report.py +292 -0
- mainsequence/virtualfundbuilder/contrib/apps/load_external_portfolio.py +107 -0
- mainsequence/virtualfundbuilder/contrib/apps/news_app.py +437 -0
- mainsequence/virtualfundbuilder/contrib/apps/portfolio_report_app.py +91 -0
- mainsequence/virtualfundbuilder/contrib/apps/portfolio_table.py +95 -0
- mainsequence/virtualfundbuilder/contrib/apps/run_named_portfolio.py +45 -0
- mainsequence/virtualfundbuilder/contrib/apps/run_portfolio.py +40 -0
- mainsequence/virtualfundbuilder/contrib/apps/templates/base.html +147 -0
- mainsequence/virtualfundbuilder/contrib/apps/templates/report.html +77 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/__init__.py +5 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/external_weights.py +61 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/intraday_trend.py +149 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/market_cap.py +310 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/mock_signal.py +78 -0
- mainsequence/virtualfundbuilder/contrib/data_nodes/portfolio_replicator.py +269 -0
- mainsequence/virtualfundbuilder/contrib/prices/__init__.py +1 -0
- mainsequence/virtualfundbuilder/contrib/prices/data_nodes.py +810 -0
- mainsequence/virtualfundbuilder/contrib/prices/utils.py +11 -0
- mainsequence/virtualfundbuilder/contrib/rebalance_strategies/__init__.py +1 -0
- mainsequence/virtualfundbuilder/contrib/rebalance_strategies/rebalance_strategies.py +313 -0
- mainsequence/virtualfundbuilder/data_nodes.py +637 -0
- mainsequence/virtualfundbuilder/enums.py +23 -0
- mainsequence/virtualfundbuilder/models.py +282 -0
- mainsequence/virtualfundbuilder/notebook_handling.py +42 -0
- mainsequence/virtualfundbuilder/portfolio_interface.py +272 -0
- mainsequence/virtualfundbuilder/resource_factory/__init__.py +0 -0
- mainsequence/virtualfundbuilder/resource_factory/app_factory.py +170 -0
- mainsequence/virtualfundbuilder/resource_factory/base_factory.py +238 -0
- mainsequence/virtualfundbuilder/resource_factory/rebalance_factory.py +101 -0
- mainsequence/virtualfundbuilder/resource_factory/signal_factory.py +183 -0
- mainsequence/virtualfundbuilder/utils.py +381 -0
- mainsequence-2.0.0.dist-info/METADATA +105 -0
- mainsequence-2.0.0.dist-info/RECORD +110 -0
- mainsequence-2.0.0.dist-info/WHEEL +5 -0
- mainsequence-2.0.0.dist-info/licenses/LICENSE +40 -0
- mainsequence-2.0.0.dist-info/top_level.txt +1 -0
mainsequence/client/data_sources_interfaces/duckdb.py
@@ -0,0 +1,1468 @@
from __future__ import annotations

import os
from typing import Optional, Literal, List, Dict, TypedDict, Tuple, Any
import os, pyarrow.fs as pafs

import duckdb, pandas as pd
from pathlib import Path
import datetime
from mainsequence.logconf import logger
import pyarrow as pa
import pyarrow.parquet as pq
from ..utils import DataFrequency, UniqueIdentifierRangeMap
import uuid
from pyarrow import fs

def get_logger():
    global logger

    # If the logger doesn't have any handlers, create it using the custom function
    logger.bind(sub_application="duck_db_interface")
    return logger

logger = get_logger()

def _list_parquet_files(fs, dir_path: str) -> list[str]:
    infos = fs.get_file_info(pafs.FileSelector(dir_path, recursive=False))
    return [i.path for i in infos
            if i.type == pafs.FileType.File and i.path.endswith(".parquet")]


class DuckDBInterface:
    """
    Persist/serve (time_index, unique_identifier, …) DataFrames in a DuckDB file.
    """

    def __init__(self, db_path: Optional[str | Path] = None):
        """
        Initializes the interface with the path to the DuckDB database file.

        Args:
            db_path (Optional[str | Path]): Path to the database file.
                Defaults to the value of the DUCKDB_PATH
                environment variable or 'analytics.duckdb'
                in the current directory if the variable is not set.
        """
        from mainsequence.tdag.config import TDAG_DATA_PATH
        # ── choose default & normalise to string ───────────────────────────
        default_path = os.getenv(
            "DUCKDB_PATH",
            os.path.join(f"{TDAG_DATA_PATH}", "duck_db"),
        )
        db_uri = str(db_path or default_path).rstrip("/")

        # ── FileSystem abstraction (works for local & S3) ──────────────────
        self._fs, self._object_path = fs.FileSystem.from_uri(db_uri)

        # ── DuckDB connection ──────────────────────────────────────────────
        # • local  → store meta-data in a .duckdb file under db_uri
        # • remote → in-memory DB; still works because all user data
        #            lives in Parquet on the object store
        if db_uri.startswith("s3://") or db_uri.startswith("gs://"):
            self.con = duckdb.connect(":memory:")
            # duckdb needs the httpfs extension for S3
            self.con.execute("INSTALL httpfs;")
            self.con.execute("LOAD httpfs;")
        else:
            meta_file = Path(db_uri) / "duck_meta.duckdb"
            meta_file.parent.mkdir(parents=True, exist_ok=True)
            self.con = duckdb.connect(str(meta_file))

        # ── sane defaults ──────────────────────────────────────────────────
        self.con.execute("PRAGMA threads = 4")
        self.con.execute("PRAGMA enable_object_cache = true")
        self.con.execute("SET TIMEZONE = 'UTC';")

        self.db_path = db_uri  # keep the fully-qualified URI

    def launch_gui(self, host='localhost', port=4213, timeout=0.5):
        import duckdb
        import socket

        def ui_is_running(host, port, timeout):
            """Returns True if something is listening on host:port."""
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(timeout)
                try:
                    s.connect((host, port))
                    return True
                except (ConnectionRefusedError, socket.timeout):
                    return False

        # 1. Connect to your database
        conn = duckdb.connect(self.db_path)

        # 2. Decide whether to start the UI
        url = f"http://{host}:{port}"
        if not ui_is_running(host, port, timeout):
            # (first-time only) install and load the UI extension
            # conn.execute("INSTALL ui;")
            # conn.execute("LOAD ui;")
            # spin up the HTTP server and open your browser
            conn.execute("CALL start_ui();")
            print(f"DuckDB Explorer launched at {url}")
        else:
            print(f"DuckDB Explorer is already running at {url}")
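
A minimal construction sketch for the class above; the local path, and the bucket URI in the comment, are illustrative and not taken from the package:

# Sketch: point the interface at a local directory of Parquet partitions.
from mainsequence.client.data_sources_interfaces.duckdb import DuckDBInterface

db = DuckDBInterface(db_path="/tmp/duck_db")   # an "s3://bucket/prefix" URI switches to the httpfs path
print(db.list_tables())                        # views/tables registered in the metadata DB
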
    # ──────────────────────────────────────────────────────────────────────────────
    # Public API
    # ──────────────────────────────────────────────────────────────────────────────

    def time_index_minima(
            self,
            table: str,
            ids: Optional[List[str]] = None,
    ) -> Tuple[Optional[pd.Timestamp], Dict[Any, Optional[pd.Timestamp]]]:
        """
        Compute the minimum time_index over the entire dataset AND the minimum per unique_identifier.

        Returns:
            (global_min, per_id_dict)

            global_min: pd.Timestamp (UTC) or None if table is empty / all-NULL
            per_id_dict: {uid: pd.Timestamp (UTC) or None} for each distinct uid (after optional filtering)

        Fast path:
            Uses a single scan with GROUPING SETS ((), (unique_identifier)), reading only
            (unique_identifier, time_index). DuckDB will push projection to Parquet and parallelize.

        Fallback:
            Runs two simple queries (global MIN + per-id MIN) if GROUPING SETS isn't supported
            in your DuckDB build.

        Args:
            table: logical name (your view name); if the view is missing, we scan the Parquet
                   directly under {self.db_path}/{table}/**/*.parquet with hive_partitioning.
            ids: optional list; if provided, restricts to those unique_identifiers only.
        """
        import duckdb
        import pandas as pd
        from typing import Any, Dict, Optional, Tuple, List

        def qident(name: str) -> str:
            return '"' + str(name).replace('"', '""') + '"'

        qtbl = qident(table)
        qid = qident("unique_identifier")
        qts = qident("time_index")

        # --- Choose fastest reliable source relation ---
        # Prefer scanning the view if it exists (it normalizes schema); otherwise scan Parquet directly.
        try:
            use_view = bool(self.table_exists(table))
        except Exception:
            use_view = False

        file_glob = f"{self.db_path}/{table}/**/*.parquet"
        src_rel = (
            qtbl
            if use_view
            else f"parquet_scan('{file_glob}', hive_partitioning=TRUE, union_by_name=TRUE)"
        )

        # Optional filter to reduce the output cardinality if the caller only cares about some ids
        params: List[Any] = []
        where_clause = ""
        if ids:
            placeholders = ", ".join("?" for _ in ids)
            where_clause = f"WHERE {qid} IN ({placeholders})"
            params.extend(list(ids))

        # --- Single-pass: GROUPING SETS (grand total + per-id) ---
        sql_one_pass = f"""
            WITH src AS (
                SELECT {qid} AS uid, {qts} AS ts
                FROM {src_rel}
                {where_clause}
            )
            SELECT
                uid,
                MIN(ts) AS min_val,
                GROUPING(uid) AS is_total_row
            FROM src
            GROUP BY GROUPING SETS ((), (uid));
        """

        try:
            rows = self.con.execute(sql_one_pass, params).fetchall()

            global_min_raw: Optional[Any] = None
            per_id_raw: Dict[Any, Optional[Any]] = {}

            for uid, min_val, is_total in rows:
                if is_total:
                    global_min_raw = min_val  # grand total row
                else:
                    per_id_raw[uid] = min_val

            # Normalize to tz-aware pandas Timestamps (UTC) for consistency with your interface
            to_ts = lambda v: pd.to_datetime(v, utc=True) if v is not None else None
            global_min = to_ts(global_min_raw)
            per_id = {uid: to_ts(v) for uid, v in per_id_raw.items()}
            return global_min, per_id

        except duckdb.Error as e:
            # --- Fallback: two straightforward queries (still reads only needed columns) ---
            logger.info(f"time_index_minima: GROUPING SETS path failed; falling back. Reason: {e}")

            sql_global = f"""
                SELECT MIN(ts)
                FROM (
                    SELECT {qts} AS ts
                    FROM {src_rel}
                    {where_clause}
                )
            """
            sql_per_id = f"""
                SELECT uid, MIN(ts) AS min_val
                FROM (
                    SELECT {qid} AS uid, {qts} AS ts
                    FROM {src_rel}
                    {where_clause}
                )
                GROUP BY uid
            """

            global_min_raw = self.con.execute(sql_global, params).fetchone()[0]
            pairs = self.con.execute(sql_per_id, params).fetchall()

            to_ts = lambda v: pd.to_datetime(v, utc=True) if v is not None else None
            global_min = to_ts(global_min_raw)
            per_id = {uid: to_ts(min_val) for uid, min_val in pairs}
            return global_min, per_id
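
A usage sketch for time_index_minima, assuming a table previously written by upsert; the table and identifier names are illustrative:

global_min, per_id = db.time_index_minima("prices", ids=["AAPL", "MSFT"])
# global_min -> earliest time_index (UTC) across the filtered ids, or None
# per_id     -> {"AAPL": Timestamp(...), "MSFT": Timestamp(...)}
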
    def remove_columns(self, table: str, columns: List[str]) -> Dict[str, Any]:
        """
        Forcefully drop the given columns from the dataset backing `table`.

        Behavior:
          • Rebuilds *every* partition directory (year=/month=/[day=]) into one new Parquet file.
          • Drops the requested columns that exist in that partition (others are ignored).
          • Always deletes the old Parquet fragments after the new file is written.
          • Always refreshes the view to reflect the new schema.

        Notes:
          • Protected keys to keep storage model consistent:
            {'time_index','unique_identifier','year','month','day'} are not dropped.
          • If a requested column doesn't exist in some partitions, those partitions are still rebuilt.
          • Destructive and idempotent.
        """
        import uuid
        import duckdb

        def qident(name: str) -> str:
            return '"' + str(name).replace('"', '""') + '"'

        requested = list(dict.fromkeys(columns or []))
        protected = {"time_index", "unique_identifier", "year", "month", "day"}

        # Discover unified schema to know which requested columns actually exist
        file_glob = f"{self.db_path}/{table}/**/*.parquet"
        try:
            desc_rows = self.con.execute(
                f"DESCRIBE SELECT * FROM read_parquet('{file_glob}', "
                f"union_by_name=TRUE, hive_partitioning=TRUE)"
            ).fetchall()
            present_cols = {r[0] for r in desc_rows}
        except duckdb.Error as e:
            logger.error(f"remove_columns: cannot scan files for '{table}': {e}")
            try:
                self._ensure_view(table)
            except Exception as ev:
                logger.warning(f"remove_columns: _ensure_view failed after scan error: {ev}")
            return {"dropped": [], "skipped": requested, "partitions_rebuilt": 0, "files_deleted": 0}

        to_drop_global = [c for c in requested if c in present_cols and c not in protected]
        skipped_global = [c for c in requested if c not in present_cols or c in protected]

        # Enumerate all partition directories that currently contain Parquet files
        selector = fs.FileSelector(f"{self.db_path}/{table}", recursive=True)
        infos = self._fs.get_file_info(selector)
        part_dirs = sorted({
            info.path.rpartition("/")[0]
            for info in infos
            if info.type == fs.FileType.File and info.path.endswith(".parquet")
        })

        if not part_dirs:
            logger.info(f"remove_columns: table '{table}' has no Parquet files.")
            try:
                self._ensure_view(table)
            except Exception as ev:
                logger.warning(f"remove_columns: _ensure_view failed on empty table: {ev}")
            return {"dropped": to_drop_global, "skipped": skipped_global,
                    "partitions_rebuilt": 0, "files_deleted": 0}

        partitions_rebuilt = 0
        files_deleted = 0

        try:
            for part_path in part_dirs:
                # 1) Partition-local schema WITHOUT filename helper (stable "real" columns)
                try:
                    part_desc = self.con.execute(
                        f"DESCRIBE SELECT * FROM parquet_scan('{part_path}/*.parquet', "
                        f" hive_partitioning=TRUE, union_by_name=TRUE)"
                    ).fetchall()
                    # Preserve order returned by DESCRIBE for deterministic output
                    part_cols_ordered = [r[0] for r in part_desc]
                    part_cols_set = set(part_cols_ordered)
                except duckdb.Error as e:
                    logger.warning(f"remove_columns: skipping partition due to scan error at {part_path}: {e}")
                    continue

                to_drop_here = [c for c in to_drop_global if c in part_cols_set]

                # 2) Columns to keep (explicit projection → safest)
                keep_cols = [c for c in part_cols_ordered if c not in to_drop_here]
                if not keep_cols:
                    # Should not happen due to 'protected', but guard anyway
                    logger.warning(f"remove_columns: nothing to write after drops in {part_path}; skipping")
                    continue
                keep_csv = ", ".join(qident(c) for c in keep_cols)

                # 3) Detect the actual helper file-path column name added by filename=TRUE
                #    by comparing with/without filename=TRUE.
                try:
                    fname_desc = self.con.execute(
                        f"DESCRIBE SELECT * FROM parquet_scan('{part_path}/*.parquet', "
                        f" hive_partitioning=TRUE, union_by_name=TRUE, filename=TRUE)"
                    ).fetchall()
                    cols_with_fname = {r[0] for r in fname_desc}
                    added_by_filename = cols_with_fname - part_cols_set  # usually {'filename'} or {'file_name', ...}
                    file_col = next(iter(added_by_filename), None)
                except duckdb.Error:
                    file_col = None

                # 4) Decide ordering key for recency; fall back to time_index if helper missing
                order_key = qident(file_col) if file_col else "time_index"

                # 5) Rebuild partition with explicit projection + window de-dup
                tmp_file = f"{part_path}/rebuild-{uuid.uuid4().hex}.parquet"
                copy_sql = f"""
                    COPY (
                        SELECT {keep_csv}
                        FROM (
                            SELECT {keep_csv},
                                   ROW_NUMBER() OVER (
                                       PARTITION BY time_index, unique_identifier
                                       ORDER BY {order_key} DESC
                                   ) AS rn
                            FROM parquet_scan('{part_path}/*.parquet',
                                              hive_partitioning=TRUE,
                                              union_by_name=TRUE,
                                              filename=TRUE)
                        )
                        WHERE rn = 1
                    )
                    TO '{tmp_file}'
                    (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000)
                """
                try:
                    self.con.execute(copy_sql)
                except duckdb.Error as e:
                    logger.error(f"remove_columns: COPY failed for partition {part_path}: {e}")
                    raise

                # 6) Delete all old fragments, keep only the new file
                try:
                    current_infos = self._fs.get_file_info(fs.FileSelector(part_path))
                    for fi in current_infos:
                        if fi.type == fs.FileType.File and fi.path.endswith(".parquet") and fi.path != tmp_file:
                            self._fs.delete_file(fi.path)
                            files_deleted += 1
                except Exception as cleanup_e:
                    logger.warning(f"remove_columns: cleanup failed in {part_path}: {cleanup_e}")

                partitions_rebuilt += 1

        finally:
            # Ensure logical schema matches physical files
            try:
                self._ensure_view(table)
            except Exception as ev:
                logger.warning(f"remove_columns: _ensure_view failed after rebuild: {ev}")

        return {
            "dropped": to_drop_global,
            "skipped": skipped_global,
            "partitions_rebuilt": partitions_rebuilt,
            "files_deleted": files_deleted,
        }
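
Sketch of the destructive column drop described above; the column name is illustrative, and the returned counts depend on the partition layout:

result = db.remove_columns("prices", ["stale_feature"])
# e.g. {"dropped": ["stale_feature"], "skipped": [], "partitions_rebuilt": 12, "files_deleted": 12}
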
    def upsert(self, df: pd.DataFrame, table: str,
               data_frequency: DataFrequency = DataFrequency.one_m) -> None:
        """
        Idempotently writes a DataFrame into *table* using (time_index, unique_identifier) PK.
        Extra columns are added to the table automatically.
        """
        import os
        import uuid
        import datetime
        from pyarrow import fs  # used for cleanup listing

        if df.empty:
            logger.warning(f"Attempted to upsert an empty DataFrame to table '{table}'. Skipping.")
            return

        # —— basic hygiene ——--------------------------------------------------
        df = df.copy()
        df["time_index"] = pd.to_datetime(df["time_index"], utc=True)
        if "unique_identifier" not in df.columns:
            df["unique_identifier"] = ""  # degenerate PK for daily data
        df["unique_identifier"] = df["unique_identifier"].astype(str)  # ADDED: harden as text

        # —— derive partition columns ——---------------------------------------
        partitions = self._partition_keys(df["time_index"], data_frequency=data_frequency)
        for col, values in partitions.items():
            df[col] = values
        part_cols = list(partitions)

        logger.debug(f"Starting upsert of {len(df)} rows into table '{table}' in {self.db_path}")

        # —— de-duplication inside *this* DataFrame ——--------------------------
        df = df.drop_duplicates(subset=["time_index", "unique_identifier"], keep="last")

        # ── Write each partition safely ─────────────────────────────────
        for keys, sub in df.groupby(part_cols, sort=False):
            part_path = self._partition_path(dict(zip(part_cols, keys)), table=table)
            self._fs.create_dir(part_path, recursive=True)

            # Register incoming batch as a DuckDB relation
            self.con.register("incoming_sub", sub)

            # Detect presence of existing files and time-range overlap (cheap)
            has_existing = False
            time_overlap = False
            try:
                row = self.con.execute(
                    f"""
                    SELECT min(time_index) AS mn, max(time_index) AS mx
                    FROM parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE)
                    """
                ).fetchone()
                if row and row[0] is not None:
                    has_existing = True
                    mn = pd.to_datetime(row[0], utc=True)
                    mx = pd.to_datetime(row[1], utc=True)
                    smin = sub["time_index"].min()
                    smax = sub["time_index"].max()
                    time_overlap = not (smax < mn or smin > mx)
            except Exception:
                has_existing = False

            # Exact PK overlap check (only if time windows overlap)
            overlap_exists = False
            if has_existing and time_overlap:
                overlap_exists = bool(self.con.execute(
                    f"""
                    SELECT EXISTS (
                        SELECT 1
                        FROM incoming_sub i
                        JOIN parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE) e
                          ON e.time_index = i.time_index
                         AND e.unique_identifier = i.unique_identifier
                        LIMIT 1
                    );
                    """
                ).fetchone()[0])

            # -------------------- Append path (no PK collision) --------------------
            if not has_existing or not time_overlap or not overlap_exists:
                try:
                    ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
                    final_name = f"part-{ts}-{uuid.uuid4().hex}.parquet"
                    tmp_name = final_name + ".tmp"
                    tmp_path = f"{part_path}/{tmp_name}"
                    final_path = f"{part_path}/{final_name}"

                    if has_existing:
                        # Keep dedup exact: write only rows not already present
                        anti_join_select = f"""
                            SELECT i.*
                            FROM incoming_sub i
                            WHERE NOT EXISTS (
                                SELECT 1
                                FROM parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE) e
                                WHERE e.time_index = i.time_index
                                  AND e.unique_identifier = i.unique_identifier
                            )
                            ORDER BY i.unique_identifier, i.time_index
                        """
                        n_new = self.con.execute(
                            f"SELECT COUNT(*) FROM ({anti_join_select})"
                        ).fetchone()[0]
                        if n_new == 0:
                            continue
                        self.con.execute(
                            f"""
                            COPY ({anti_join_select})
                            TO '{tmp_path}'
                            (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
                            """
                        )
                    else:
                        # No existing files → safe to copy all incoming rows
                        self.con.execute(
                            f"""
                            COPY (
                                SELECT i.*
                                FROM incoming_sub i
                                ORDER BY i.unique_identifier, i.time_index
                            )
                            TO '{tmp_path}'
                            (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
                            """
                        )

                    # Atomic move into place
                    self._fs.move(tmp_path, final_path)
                except Exception as e:
                    logger.exception(f"Append path failed for partition {keys}: {e}")
                    raise
                continue

            # -------------------- Rewrite path (true upsert) -----------------------
            try:
                # Discover existing/incoming schemas and build COALESCE projection
                desc_rows = self.con.execute(
                    f"""
                    DESCRIBE SELECT * FROM parquet_scan(
                        '{part_path}/*.parquet',
                        hive_partitioning=TRUE,
                        union_by_name=TRUE
                    )
                    """
                ).fetchall()
                existing_cols = [r[0] for r in desc_rows]

                # ADDED: also look at incoming schema so we can build a typed e-select
                incoming_desc = self.con.execute("DESCRIBE SELECT * FROM incoming_sub").fetchall()  # ADDED
                incoming_cols = [r[0] for r in incoming_desc]

                all_cols = list(dict.fromkeys(incoming_cols + existing_cols))  # deterministic order

                def qident(name: str) -> str:
                    return '"' + str(name).replace('"', '""') + '"'

                inc_set, ex_set = set(incoming_cols), set(existing_cols)

                # CHANGED: Build merged projection with explicit BIGINT casts for partition cols
                select_exprs = []
                for c in all_cols:
                    qc = qident(c)
                    if c in inc_set and c in ex_set:
                        if c in part_cols:
                            select_exprs.append(
                                f"COALESCE(CAST(i.{qc} AS BIGINT), CAST(e.{qc} AS BIGINT)) AS {qc}")  # CHANGED
                        else:
                            select_exprs.append(f"COALESCE(i.{qc}, e.{qc}) AS {qc}")
                    elif c in inc_set:
                        if c in part_cols:
                            select_exprs.append(f"CAST(i.{qc} AS BIGINT) AS {qc}")  # CHANGED
                        else:
                            select_exprs.append(f"i.{qc} AS {qc}")
                    else:  # only in existing
                        if c in part_cols:
                            select_exprs.append(f"CAST(e.{qc} AS BIGINT) AS {qc}")  # CHANGED
                        else:
                            select_exprs.append(f"e.{qc} AS {qc}")
                select_list = ", ".join(select_exprs)

                # ADDED: Build a type-aligned projection of existing rows for the anti-join side
                #        (so UNION ALL BY NAME sees identical types, esp. for partitions)
                e_select_exprs = []
                for c in all_cols:
                    qc = qident(c)
                    if c in ex_set:
                        if c in part_cols:
                            e_select_exprs.append(f"CAST(e.{qc} AS BIGINT) AS {qc}")  # ADDED
                        else:
                            e_select_exprs.append(f"e.{qc} AS {qc}")
                    else:
                        # Column exists only in incoming; let it be NULL here
                        if c in part_cols:
                            e_select_exprs.append(f"CAST(NULL AS BIGINT) AS {qc}")  # ADDED
                        else:
                            e_select_exprs.append(f"NULL AS {qc}")  # ADDED
                e_select_list = ", ".join(e_select_exprs)

                ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
                final_name = f"part-{ts}-{uuid.uuid4().hex}.parquet"
                tmp_name = final_name + ".tmp"
                tmp_path = f"{part_path}/{tmp_name}"
                final_path = f"{part_path}/{final_name}"

                merge_sql = f"""
                    COPY (
                        WITH existing AS (
                            SELECT * FROM parquet_scan(
                                '{part_path}/*.parquet',
                                hive_partitioning=TRUE,
                                union_by_name=TRUE
                            )
                        ),
                        merged_incoming AS (
                            SELECT {select_list}
                            FROM incoming_sub i
                            LEFT JOIN existing e
                              ON e.time_index = i.time_index
                             AND e.unique_identifier = i.unique_identifier
                        )
                        SELECT *
                        FROM (
                            -- rows with incoming (coalesced over existing)
                            SELECT * FROM merged_incoming
                            UNION ALL BY NAME  -- CHANGED: correct syntax order
                            -- keep existing rows that do not collide on PK
                            SELECT {e_select_list}  -- REPLACED: used to be 'SELECT e.*'
                            FROM existing e
                            ANTI JOIN incoming_sub i
                              ON e.time_index = i.time_index
                             AND e.unique_identifier = i.unique_identifier
                        )
                        ORDER BY unique_identifier, time_index
                    )
                    TO '{tmp_path}'
                    (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
                """
                try:
                    self.con.execute(merge_sql)
                except Exception as e:
                    raise e
                self._fs.move(tmp_path, final_path)

                # Cleanup old parquet fragments, keep only the new file
                try:
                    for fi in self._fs.get_file_info(fs.FileSelector(part_path)):
                        if (
                            fi.type == fs.FileType.File
                            and fi.path.endswith(".parquet")
                            and os.path.basename(fi.path) != final_name
                        ):
                            self._fs.delete_file(fi.path)
                except Exception as cleanup_e:
                    logger.warning(f"Cleanup old parquet files failed in {part_path}: {cleanup_e}")

            except Exception as e:
                logger.exception(f"Rewrite path failed for partition {keys}: {e}")
                raise

        # ── Refresh view ────────────────────────────────────────────────
        self._ensure_view(table=table)
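
A minimal upsert sketch following the docstring above: the frame must carry a `time_index` column and, optionally, `unique_identifier`; the table name and values are illustrative:

import pandas as pd

frame = pd.DataFrame({
    "time_index": pd.date_range("2024-01-01", periods=3, freq="1min", tz="UTC"),
    "unique_identifier": ["AAPL", "AAPL", "AAPL"],
    "close": [187.1, 187.3, 187.2],
})
db.upsert(frame, table="prices")   # idempotent on (time_index, unique_identifier)
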
    def table_exists(self, table):
        table_exists_result = self.con.execute("""
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema='main' AND table_name = ?
            UNION ALL
            SELECT COUNT(*)
            FROM information_schema.views
            WHERE table_schema='main' AND table_name = ?
        """, [table, table]).fetchone()[0] > 0

        if table_exists_result is None:
            logger.warning(f"Table '{table}' does not exist in {self.db_path}. Returning empty DataFrame.")
            return pd.DataFrame()
        return table_exists_result
    def constrain_read(
            self,
            table: str,
            *,
            start: Optional[datetime.datetime] = None,
            end: Optional[datetime.datetime] = None,
            ids: Optional[List[str]] = None,
            unique_identifier_range_map: Optional[Dict[str, Dict[str, Any]]] = None,
            max_rows: Optional[int] = None,
            now: Optional[datetime.datetime] = None,
    ) -> Tuple[
        Optional[datetime.datetime],              # adjusted_start
        Optional[datetime.datetime],              # adjusted_end
        Optional[Dict[str, Dict[str, Any]]],      # adjusted_unique_identifier_range_map
        Dict[str, Any]                            # diagnostics
    ]:
        """
        Constrain a prospective read so that the estimated number of rows does not exceed *max_rows*.
        Estimation uses Parquet row-group metadata (min/max on time_index + num_rows), i.e. it does not
        scan full data and does not depend on bar frequency.

        Inputs are the same "shape" as DuckDBInterface.read(...). The function returns adjusted
        (start, end, unique_identifier_range_map) that you can pass to read(...), plus diagnostics.

        Row cap source:
          • max_rows argument if provided
          • else env MAX_READ_ROWS or TDAG_MAX_READ_ROWS
          • else default 10_000_000

        Behavior:
          • Computes an overall effective [start, end] across inputs (range_map, start/end).
          • Reads Parquet footers under {self.db_path}/{table}/**/*.parquet to gather row-group
            (min_time, max_time, num_rows).
          • If the estimated rows for [start, end] <= max_rows, returns inputs unchanged.
          • Otherwise finds the latest limit_dt so that estimated rows in [start, limit_dt] ~= max_rows,
            and tightens:
              - global 'end' → min(end, limit_dt) if plain start/end was used
              - per-uid 'end_date' in unique_identifier_range_map → min(existing, limit_dt)

        Returns:
            adjusted_start, adjusted_end, adjusted_unique_identifier_range_map, diagnostics
        """
        import os
        import re
        import math
        import numpy as np
        import pandas as pd
        import pyarrow.parquet as pq
        from pyarrow import fs as pa_fs
        from calendar import monthrange

        # --- helpers -------------------------------------------------------------

        def _to_utc_ts(dt: Optional[datetime.datetime]) -> Optional[pd.Timestamp]:
            if dt is None:
                return None
            ts = pd.to_datetime(dt, utc=True)
            # normalize to UTC tz-aware pandas Timestamp
            if not isinstance(ts, pd.Timestamp):
                ts = pd.Timestamp(ts, tz="UTC")
            elif ts.tz is None:
                ts = ts.tz_localize("UTC")
            else:
                ts = ts.tz_convert("UTC")
            return ts

        def _effective_start_from_range_map(rmap: Dict[str, Dict[str, Any]]) -> Optional[pd.Timestamp]:
            starts = []
            for v in rmap.values():
                s = v.get("start_date")
                if s is None:
                    continue
                ts = _to_utc_ts(s)
                if ts is None:
                    continue
                # operand '>' means open interval → start just after s (epsilon)
                if v.get("start_date_operand") == ">":
                    ts = ts + pd.Timedelta(nanoseconds=1)
                starts.append(ts)
            return min(starts) if starts else None

        def _effective_end_from_range_map(rmap: Dict[str, Dict[str, Any]]) -> Optional[pd.Timestamp]:
            ends = []
            for v in rmap.values():
                e = v.get("end_date")
                if e is not None:
                    ends.append(_to_utc_ts(e))
            return max(ends) if ends else None

        def _parse_part_bounds_from_path(path: str) -> Tuple[Optional[pd.Timestamp], Optional[pd.Timestamp]]:
            """
            Infer inclusive [partition_start, partition_end] purely from path components
            like .../year=2024/month=07[/day=03]/file.parquet.
            Returns (p_start, p_end) as UTC tz-aware Timestamps, or (None, None) if not parsable.
            """
            m_year = re.search(r"/year=(\d{4})(/|$)", path)
            m_month = re.search(r"/month=(\d{2})(/|$)", path)
            m_day = re.search(r"/day=(\d{2})(/|$)", path)
            if not (m_year and m_month):
                return None, None
            y = int(m_year.group(1))
            m = int(m_month.group(1))
            if m_day:
                d = int(m_day.group(1))
                start = pd.Timestamp(datetime.datetime(y, m, d, 0, 0, 0, tzinfo=datetime.timezone.utc))
                end = start + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
                return start, end
            # month granularity
            last_day = monthrange(y, m)[1]
            start = pd.Timestamp(datetime.datetime(y, m, 1, 0, 0, 0, tzinfo=datetime.timezone.utc))
            end = pd.Timestamp(datetime.datetime(y, m, last_day, 23, 59, 59, tzinfo=datetime.timezone.utc)) \
                  + pd.Timedelta(seconds=0.999999999)  # inclusive
            return start, end

        def _collect_row_groups_meta(file_path: str) -> Tuple[
                List[Tuple[pd.Timestamp, pd.Timestamp, int]], Optional[str]]:
            """
            Return list of (rg_min, rg_max, rg_rows) for 'time_index' from Parquet footer.
            If stats are missing, returns empty list and a reason.
            """
            try:
                pf = pq.ParquetFile(file_path, filesystem=self._fs)
            except Exception as e:
                return [], f"open_error:{e}"

            # Find Arrow time unit (ns/us/ms/s)
            try:
                arr_schema = pf.schema_arrow
                tfield = arr_schema.field("time_index")
                ttype = tfield.type  # pyarrow.TimestampType
                unit = getattr(ttype, "unit", "ns")
            except Exception:
                # If schema isn't Arrow-resolvable, fall back to 'ns'
                unit = "ns"

            rg_list: List[Tuple[pd.Timestamp, pd.Timestamp, int]] = []
            try:
                meta = pf.metadata
                nrg = meta.num_row_groups
                # Find column index by name (robust against nested/flat)
                col_idx = None
                # Try direct mapping via schema_arrow index first
                try:
                    idx = arr_schema.get_field_index("time_index")
                    if idx != -1:
                        col_idx = idx
                except Exception:
                    col_idx = None
                for i in range(nrg):
                    rg = meta.row_group(i)
                    # choose the column chunk for time_index
                    col = None
                    if col_idx is not None and col_idx < rg.num_columns:
                        col = rg.column(col_idx)
                        # Verify we picked the right column name; if not, search by path
                        try:
                            name = str(col.path_in_schema)
                            if name.split(".")[-1] != "time_index":
                                col = None
                            # else ok
                        except Exception:
                            # fall back to search
                            col = None
                    if col is None:
                        for j in range(rg.num_columns):
                            cj = rg.column(j)
                            try:
                                name = str(cj.path_in_schema)
                            except Exception:
                                name = ""
                            if name.split(".")[-1] == "time_index":
                                col = cj
                                break
                    if col is None:
                        # can't find time_index column → skip this row group
                        continue
                    stats = getattr(col, "statistics", None)
                    if not stats or not getattr(stats, "has_min_max", False):
                        continue
                    vmin, vmax = stats.min, stats.max

                    # Convert min/max to UTC Timestamps
                    def conv(v):
                        if v is None:
                            return None
                        # numeric epoch in unit
                        if isinstance(v, (int, np.integer)):
                            return pd.to_datetime(int(v), unit=unit, utc=True)
                        if isinstance(v, float):
                            return pd.to_datetime(int(v), unit=unit, utc=True)
                        # already datetime-like
                        try:
                            return pd.to_datetime(v, utc=True)
                        except Exception:
                            return None

                    tmin = conv(vmin)
                    tmax = conv(vmax)
                    if tmin is None or tmax is None:
                        continue
                    # ensure ordering
                    if tmax < tmin:
                        tmin, tmax = tmax, tmin
                    rg_list.append((tmin, tmax, rg.num_rows))
            except Exception as e:
                return [], f"meta_error:{e}"
            return rg_list, None

        def _rows_estimate_until(T: pd.Timestamp, rgs: List[Tuple[pd.Timestamp, pd.Timestamp, int]],
                                 start_ts: pd.Timestamp) -> int:
            """
            Estimate rows in [start_ts, T] by assuming uniform distribution within each row-group
            between its (min_time, max_time). Uses only metadata.
            """
            if T < start_ts:
                return 0
            total = 0.0
            Ts = T.value
            Ss = start_ts.value
            for (mn, mx, rows) in rgs:
                a = max(Ss, mn.value)
                b = min(Ts, mx.value)
                if b <= a:
                    continue
                denom = (mx.value - mn.value)
                if denom <= 0:
                    # degenerate: all timestamps equal; include whole group if it intersects
                    total += float(rows)
                else:
                    frac = (b - a) / denom
                    total += frac * float(rows)
            return int(total)

        # --- resolve cap + effective time window -------------------------------

        if max_rows is None:
            env_val = os.getenv("MAX_READ_ROWS") or os.getenv("TDAG_MAX_READ_ROWS")
            try:
                max_rows = int(str(env_val).replace(",", "_")) if env_val is not None else 10_000_000
            except Exception:
                max_rows = 10_000_000
        if now is None:
            now = datetime.datetime.now(datetime.timezone.utc)

        # Normalize inputs
        start_ts = _to_utc_ts(start)
        end_ts = _to_utc_ts(end)
        uirm = None if unique_identifier_range_map is None else {
            k: dict(v) for k, v in unique_identifier_range_map.items()
        }

        # If ids are given without a range map, create a simple one from start/end
        if ids and (uirm is None):
            uirm = {
                uid: {
                    "start_date": start_ts.to_pydatetime() if start_ts is not None else None,
                    "start_date_operand": ">=",
                    "end_date": (end_ts or _to_utc_ts(now)).to_pydatetime() if (end_ts or now) is not None else None,
                    "end_date_operand": "<=",
                } for uid in ids
            }

        # Compute global window from inputs
        if uirm:
            eff_start = _effective_start_from_range_map(uirm)
            eff_end_in_map = _effective_end_from_range_map(uirm)
        else:
            eff_start = start_ts
            eff_end_in_map = None

        eff_end = end_ts or eff_end_in_map or _to_utc_ts(now)

        # If start still unknown, derive from metadata minima later
        # If end still unknown, use now
        if eff_end is None:
            eff_end = _to_utc_ts(now)

        # --- collect candidate files via partition pruning ---------------------

        base = f"{self.db_path}/{table}"
        selector = pa_fs.FileSelector(base, recursive=True)
        files = [
            info.path
            for info in self._fs.get_file_info(selector)
            if info.type == pa_fs.FileType.File and info.path.endswith(".parquet")
        ]

        # Gather row-group metadata for relevant files
        row_groups: List[Tuple[pd.Timestamp, pd.Timestamp, int]] = []
        files_considered = 0
        files_skipped_part = 0
        files_meta_errors = 0

        # Use partition bounds to prune before opening footers
        for path in files:
            p_start, p_end = _parse_part_bounds_from_path(path)
            # If we don't know start yet, we must not prune too aggressively; include all and tighten later
            if eff_start is not None and p_start is not None and p_end is not None:
                if p_end < eff_start or p_start > eff_end:
                    files_skipped_part += 1
                    continue
            rgs, err = _collect_row_groups_meta(path)
            if err is not None:
                files_meta_errors += 1
                # Fall back: if partition range overlaps, take whole file as one group
                if p_start is not None and p_end is not None:
                    # To avoid undercounting, include the file-level partition range as a single group
                    # with the file's row count from footer if available; otherwise skip.
                    try:
                        pf = pq.ParquetFile(path, filesystem=self._fs)
                        nrows_file = pf.metadata.num_rows
                        row_groups.append((p_start or eff_start or _to_utc_ts(now), p_end or eff_end, nrows_file))
                        files_considered += 1
                    except Exception:
                        pass
                continue
            if rgs:
                row_groups.extend(rgs)
                files_considered += 1

        if not row_groups:
            # No metadata found; nothing to constrain
            diagnostics = {
                "reason": "no_row_groups_found",
                "max_rows": max_rows,
                "files_considered": files_considered,
                "files_skipped_partition": files_skipped_part,
                "files_meta_errors": files_meta_errors,
            }
            return start_ts, end_ts or _to_utc_ts(now), uirm, diagnostics

        # If start still None, derive earliest available tmin from metadata
        if eff_start is None:
            eff_start = min(mn for (mn, mx, _) in row_groups)
            # Clamp if caller passed a later explicit start
            if start_ts is not None:
                eff_start = max(eff_start, start_ts)

        # Filter row-groups that can intersect [eff_start, eff_end]
        row_groups = [rg for rg in row_groups if not (rg[1] < eff_start or rg[0] > eff_end)]
        if not row_groups:
            diagnostics = {
                "reason": "no_groups_in_window",
                "window": [str(eff_start), str(eff_end)],
                "max_rows": max_rows
            }
            return eff_start, eff_end, uirm, diagnostics

        # --- estimate rows & binary search limit_dt ----------------------------

        # Quick check: rows at eff_end
        est_rows_at_end = _rows_estimate_until(eff_end, row_groups, eff_start)
        if est_rows_at_end <= max_rows:
            diagnostics = {
                "limited": False,
                "estimated_rows": est_rows_at_end,
                "max_rows": max_rows,
                "limit_dt": None,
                "files_considered": files_considered,
                "files_skipped_partition": files_skipped_part,
                "files_meta_errors": files_meta_errors,
                "row_groups_considered": len(row_groups),
                "mode": "row_group_metadata",
            }
            # Nothing to change
            return start_ts or eff_start, end_ts or eff_end, uirm, diagnostics

        # Binary search for latest T in [eff_start, eff_end] with est_rows <= max_rows
        lo = eff_start.value
        hi = eff_end.value
        for _ in range(64):
            mid = (lo + hi) // 2
            mid_ts = pd.Timestamp(mid, tz="UTC")
            est = _rows_estimate_until(mid_ts, row_groups, eff_start)
            if est > max_rows:
                hi = mid
            else:
                lo = mid
        limit_dt = pd.Timestamp(lo, tz="UTC")
        est_at_limit = _rows_estimate_until(limit_dt, row_groups, eff_start)

        # --- produce adjusted outputs ------------------------------------------

        adjusted_start = start_ts or eff_start
        adjusted_end = min(end_ts or eff_end, limit_dt)

        adjusted_uirm = None
        if uirm is not None:
            adjusted_uirm = {}
            for uid, info in uirm.items():
                new_info = dict(info)
                # tighten end_date by limit_dt
                cur_end = info.get("end_date")
                cur_end_ts = _to_utc_ts(cur_end) if cur_end is not None else eff_end
                tight_end = min(cur_end_ts, limit_dt)
                new_info["end_date"] = tight_end.to_pydatetime()
                # preserve or default operand
                if "end_date_operand" not in new_info:
                    new_info["end_date_operand"] = "<="
                adjusted_uirm[uid] = new_info

        diagnostics = {
            "limited": True,
            "limit_dt": str(limit_dt),
            "estimated_rows_at_limit": est_at_limit,
            "estimated_rows_full": est_rows_at_end,
            "max_rows": max_rows,
            "files_considered": files_considered,
            "files_skipped_partition": files_skipped_part,
            "files_meta_errors": files_meta_errors,
            "row_groups_considered": len(row_groups),
            "mode": "row_group_metadata",
            "effective_window_before": [str(eff_start), str(eff_end)],
            "effective_window_after": [str(adjusted_start), str(adjusted_end)],
        }

        return adjusted_start, adjusted_end, adjusted_uirm or uirm, diagnostics
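
Sketch of capping a large read before executing it, following the docstring above; the date and row cap are illustrative:

import datetime

start, end, range_map, diag = db.constrain_read(
    "prices",
    start=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
    max_rows=1_000_000,
)
df = db.read("prices", start=start, end=end)   # range_map can be passed instead for per-id windows
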
|
1091
|
+
def read(
|
1092
|
+
self,
|
1093
|
+
table: str,data_frequency:DataFrequency=DataFrequency.one_m,
|
1094
|
+
*,
|
1095
|
+
start: Optional[datetime.datetime] = None,
|
1096
|
+
end: Optional[datetime.datetime] = None,
|
1097
|
+
great_or_equal: bool = True, # Changed back to boolean
|
1098
|
+
less_or_equal: bool = True, # Changed back to boolean
|
1099
|
+
ids: Optional[List[str]] = None,
|
1100
|
+
columns: Optional[List[str]] = None,
|
1101
|
+
unique_identifier_range_map: Optional[UniqueIdentifierRangeMap] = None,
|
1102
|
+
column_range_descriptor: Optional[Dict[str,UniqueIdentifierRangeMap]] = None
|
1103
|
+
) -> pd.DataFrame:
|
1104
|
+
"""
|
1105
|
+
Reads data from the specified table, with optional filtering.
|
1106
|
+
Handles missing tables by returning an empty DataFrame.
|
1107
|
+
|
1108
|
+
Args:
|
1109
|
+
table (str): The name of the table to read from.
|
1110
|
+
start (Optional[datetime.datetime]): Minimum time_index filter.
|
1111
|
+
end (Optional[datetime.datetime]): Maximum time_index filter.
|
1112
|
+
great_or_equal (bool): If True, use >= for start date comparison. Defaults to True.
|
1113
|
+
less_or_equal (bool): If True, use <= for end date comparison. Defaults to True.
|
1114
|
+
ids (Optional[List[str]]): List of specific unique_identifiers to include.
|
1115
|
+
columns (Optional[List[str]]): Specific columns to select. Reads all if None.
|
1116
|
+
unique_identifier_range_map (Optional[UniqueIdentifierRangeMap]):
|
1117
|
+
A map where keys are unique_identifiers and values are dicts specifying
|
1118
|
+
date ranges (start_date, end_date, start_date_operand, end_date_operand)
|
1119
|
+
for that identifier. Mutually exclusive with 'ids'.
|
1120
|
+
|
1121
|
+
Returns:
|
1122
|
+
pd.DataFrame: The queried data, or an empty DataFrame if the table doesn't exist.
|
1123
|
+
|
1124
|
+
Raises:
|
1125
|
+
ValueError: If both `ids` and `unique_identifier_range_map` are provided.
|
1126
|
+
"""
|
1127
|
+
# Map boolean flags to operator strings internally
|
1128
|
+
start_operator = '>=' if great_or_equal else '>'
|
1129
|
+
end_operator = '<=' if less_or_equal else '<'
|
1130
|
+
|
1131
|
+
if ids is not None and unique_identifier_range_map is not None:
|
1132
|
+
raise ValueError("Cannot provide both 'ids' and 'unique_identifier_range_map'.")
|
1133
|
+
|
1134
|
+
logger.debug(
|
1135
|
+
f"Duck DB: Reading from table '{table}' with filters: start={start}, end={end}, "
|
1136
|
+
f"ids={ids is not None}, columns={columns}, range_map={unique_identifier_range_map is not None}"
|
1137
|
+
)
|
1138
|
+
|
1139
|
+
|
1140
|
+
if columns is not None:
|
1141
|
+
table_exists_result = self.table_exists(table)
|
1142
|
+
df_cols = self.con.execute(f"SELECT * FROM {table} AS _q LIMIT 0").fetch_df()
|
1143
|
+
if any([c not in df_cols.columns for c in columns ]):
|
1144
|
+
logger.warning(f"not all Columns '{columns}' are not present in table '{table}'. returning an empty DF")
|
1145
|
+
return pd.DataFrame()
|
1146
|
+
|
1147
|
+
cols_select = "*"
|
1148
|
+
if columns:
|
1149
|
+
required_cols = {"time_index", "unique_identifier"}
|
1150
|
+
select_set = set(columns) | required_cols
|
1151
|
+
cols_select = ", ".join(f'"{c}"' for c in select_set)
|
1152
|
+
|
1153
|
+
sql_parts = [f'SELECT {cols_select} FROM "{table}"']
|
1154
|
+
params = []
|
1155
|
+
where_clauses = []
|
1156
|
+
|
1157
|
+
# --- Build WHERE clauses ---
|
1158
|
+
if start is not None:
|
1159
|
+
where_clauses.append(f"time_index {start_operator} ?")
|
1160
|
+
params.append(start.replace(tzinfo=None) if start.tzinfo else start)
|
1161
|
+
if end is not None:
|
1162
|
+
where_clauses.append(f"time_index {end_operator} ?")
|
1163
|
+
params.append(end.replace(tzinfo=None) if end.tzinfo else end)
|
1164
|
+
if ids:
|
1165
|
+
if not isinstance(ids, list): ids = list(ids)
|
1166
|
+
if ids:
|
1167
|
+
placeholders = ", ".join("?" for _ in ids)
|
1168
|
+
where_clauses.append(f"unique_identifier IN ({placeholders})")
|
1169
|
+
params.extend(ids)
|
1170
|
+
if unique_identifier_range_map:
|
1171
|
+
range_conditions = []
|
1172
|
+
for uid, date_info in unique_identifier_range_map.items():
|
1173
|
+
uid_conditions = ["unique_identifier = ?"]
|
1174
|
+
range_params = [uid]
|
1175
|
+
# Use operands from map if present, otherwise default to >= and <=
|
1176
|
+
s_op = date_info.get('start_date_operand', '>=')
|
1177
|
+
e_op = date_info.get('end_date_operand', '<=')
|
1178
|
+
if date_info.get('start_date'):
|
1179
|
+
uid_conditions.append(f"time_index {s_op} ?")
|
1180
|
+
s_date = date_info['start_date']
|
1181
|
+
range_params.append(s_date.replace(tzinfo=None) if s_date.tzinfo else s_date)
|
1182
|
+
if date_info.get('end_date'):
|
1183
|
+
uid_conditions.append(f"time_index {e_op} ?")
|
1184
|
+
e_date = date_info['end_date']
|
1185
|
+
range_params.append(e_date.replace(tzinfo=None) if e_date.tzinfo else e_date)
|
1186
|
+
range_conditions.append(f"({' AND '.join(uid_conditions)})")
|
1187
|
+
params.extend(range_params)
|
1188
|
+
if range_conditions:
|
1189
|
+
where_clauses.append(f"({' OR '.join(range_conditions)})")
|
1190
|
+
|
1191
|
+
if where_clauses: sql_parts.append("WHERE " + " AND ".join(where_clauses))
|
1192
|
+
sql_parts.append("ORDER BY time_index")
|
1193
|
+
query = " ".join(sql_parts)
|
1194
|
+
logger.debug(f"Executing read query: {query} with params: {params}")

        try:
            table_exists_result = self.table_exists(table)

            df = self.con.execute(query, params).fetch_df()

            if not df.empty:
                schema = self.con.execute(f'PRAGMA table_info("{table}")').fetchall()
                type_map = {
                    name: self._duck_to_pandas(duck_type, data_frequency=data_frequency)
                    for cid, name, duck_type, notnull, default, pk in schema
                    if name in df.columns
                }
                for col, target_type in type_map.items():
                    try:
                        if target_type == "datetime64[ns, UTC]":
                            arr = df[col].values
                            arr_ns = arr.astype("datetime64[ns]")
                            df[col] = pd.Series(
                                pd.DatetimeIndex(arr_ns, tz="UTC"),
                                index=df.index,
                                name=col,
                            )
                        elif target_type == "datetime64[ns]":
                            df[col] = pd.to_datetime(df[col], errors='coerce')
                        else:
                            # Nullable pandas dtypes (Int64, boolean, string) and plain NumPy
                            # dtypes are handled the same way.
                            df[col] = df[col].astype(target_type, errors='ignore')
                    except Exception as type_e:
                        logger.warning(f"Could not coerce column '{col}' to type '{target_type}': {type_e}")

                logger.debug(f"Read {len(df)} rows from table '{table}'.")
                return df

            return pd.DataFrame()

        except duckdb.CatalogException as e:
            logger.warning(f"CatalogException for table '{table}': {e}. Returning empty DataFrame.")
            return pd.DataFrame()
        except duckdb.Error as e:
            logger.error(f"Failed to read data from table '{table}': {e}")
            raise
        except Exception as e:
            logger.exception(f"An unexpected error occurred during read from table '{table}': {e}")
            raise
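
    # Example of the resulting dtype contract after the coercion step above (illustrative
    # column names, assuming the write path uses _pandas_to_duck below): a table written from
    # a frame with a tz-aware time_index, a string unique_identifier and a float close column
    # is read back as
    #   time_index           datetime64[ns, UTC]
    #   unique_identifier    string  (pandas nullable StringDtype)
    #   close                float64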

    def drop_table(self, table: str) -> None:
        """
        Drops the specified table and corresponding view from the database.

        Args:
            table (str): The name of the table/view to drop.
        """
        logger.debug(f"Attempting to drop table and view '{table}' from {self.db_path}")
        try:
            # Drop the view first (if it exists)
            self.con.execute(f'DROP VIEW IF EXISTS "{table}"')
            logger.debug(f"Dropped view '{table}' (if it existed).")

            # Then drop the table (if it exists)
            self.con.execute(f'DROP TABLE IF EXISTS "{table}"')
            logger.debug(f"Dropped table '{table}' (if it existed).")

        except duckdb.Error as e:
            logger.error(f"Failed to drop table/view '{table}': {e}")
            raise
        except Exception as e:
            logger.exception(f"An unexpected error occurred while dropping table/view '{table}': {e}")
            raise

    def list_tables(self) -> List[str]:
        """
        Returns names of all tables and views in the main schema.
        """
        try:
            rows = self.con.execute("SHOW TABLES").fetchall()
            return [r[0] for r in rows]
        except duckdb.Error as e:
            logger.error(f"Error listing tables/views in {self.db_path}: {e}")
            return []
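
    # Usage sketch (illustration only; `store` stands for an instance of this interface and
    # the table name is an example):
    #
    #   for name in store.list_tables():          # all tables and views in the main schema
    #       logger.info("table/view: %s", name)
    #   store.drop_table("prices_1d")             # removes both the view and any backing table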

    # ──────────────────────────────────────────────────────────────────────────
    # Private helpers
    # ──────────────────────────────────────────────────────────────────────────

    def _ensure_view(self, table: str) -> None:
        """
        CREATE OR REPLACE a view named `table` that:
          * reads all Parquet files under self.db_path/table/**
          * hides the partition columns (year, month, day)
          * locks column dtypes with explicit CASTs
        The schema is derived by unifying the schemas of all partitions.
        """
        partition_cols = {"year", "month", "day"}

        def qident(name: str) -> str:
            """Safely quote an identifier for SQL."""
            return '"' + str(name).replace('"', '""') + '"'

        file_glob = f"{self.db_path}/{table}/**/*.parquet"

        # A single, robust way to read the data: union_by_name=True reconciles
        # schema differences across files.
        read_clause = f"read_parquet('{file_glob}', union_by_name = True, hive_partitioning = TRUE)"

        try:
            # Use the same read_clause for schema discovery so that columns from
            # all partitions are picked up.
            desc_rows = self.con.execute(f"DESCRIBE SELECT * FROM {read_clause}").fetchall()
        except duckdb.Error as e:
            # No files yet, or the glob failed: skip (keeps the existing view, if any)
            logger.warning(f"_ensure_view: cannot scan files for '{table}': {e}")
            return

        # Build the CAST list, dropping partition columns
        cols = [(r[0], r[1]) for r in desc_rows if r and r[0] not in partition_cols]
        if not cols:
            logger.warning(f"_ensure_view: no non-partition columns for '{table}'. Skipping view refresh.")
            return

        # Build the list of columns with explicit CASTs to enforce types
        select_exprs = [f"CAST({qident(name)} AS {coltype}) AS {qident(name)}"
                        for name, coltype in cols]
        select_list = ",\n ".join(select_exprs)

        ddl = f"""
            CREATE OR REPLACE VIEW {qident(table)} AS
            SELECT
                {select_list}
            FROM {read_clause}
        """

        self._execute_transaction(ddl)
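
        # Illustrative shape of the generated DDL (table, column names and types are examples):
        #
        #   CREATE OR REPLACE VIEW "prices_1d" AS
        #   SELECT
        #       CAST("time_index" AS TIMESTAMP WITH TIME ZONE) AS "time_index",
        #       CAST("unique_identifier" AS VARCHAR) AS "unique_identifier",
        #       CAST("close" AS DOUBLE) AS "close"
        #   FROM read_parquet('<db_path>/prices_1d/**/*.parquet', union_by_name = True, hive_partitioning = TRUE)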

    def _partition_path(self, keys: dict, table: str) -> str:
        parts = [f"{k}={int(v):02d}" if k != "year" else f"{k}={int(v):04d}"
                 for k, v in keys.items()]
        return f"{self.db_path}/{table}/" + "/".join(parts)

    def _partition_keys(self, ts: pd.Series, data_frequency: DataFrequency) -> dict:
        """Return a dict of partition column → Series."""
        keys = {"year": ts.dt.year.astype(str), "month": ts.dt.month.astype(str)}
        if data_frequency == "minute":
            keys["day"] = ts.dt.day.astype(str)
        return keys
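
    # Resulting Hive-style layout under db_path (dates are examples; the day level exists only
    # when data_frequency == "minute"):
    #   <db_path>/<table>/year=2024/month=07/day=05/*.parquet    # minute data
    #   <db_path>/<table>/year=2024/month=07/*.parquet           # coarser frequencies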

    def _execute_transaction(self, sql: str) -> None:
        """
        Run a single SQL statement inside a BEGIN/COMMIT block,
        rolling back on any failure.
        """
        try:
            self.con.execute("BEGIN TRANSACTION;")
            self.con.execute(sql)
            self.con.execute("COMMIT;")
        except Exception:
            # best-effort rollback (if inside a failed transaction)
            try:
                self.con.execute("ROLLBACK;")
            except Exception:
                pass
            raise

    @staticmethod
    def _pandas_to_duck(dtype) -> str:
        """
        Minimal pandas dtype → DuckDB type mapping. Extend as needed.
        """
        if (pd.api.types.is_datetime64_any_dtype(dtype)
                or pd.api.types.is_datetime64tz_dtype(dtype)):
            return "TIMESTAMPTZ"
        if pd.api.types.is_integer_dtype(dtype):
            return "BIGINT"
        if pd.api.types.is_float_dtype(dtype):
            return "DOUBLE"
        if pd.api.types.is_bool_dtype(dtype):
            return "BOOLEAN"
        return "VARCHAR"

    @staticmethod
    def _duck_to_pandas(duck_type: str, data_frequency: DataFrequency):
        """
        Minimal DuckDB → pandas dtype mapping.
        Returns the dtype object where possible so that
        `df.astype({...})` gets pandas' nullable dtypes.
        Extend as needed.
        """
        dt = duck_type.upper()

        # --- datetimes ------------------------------------------------------
        if dt in ("TIMESTAMPTZ", "TIMESTAMP WITH TIME ZONE"):
            # keep the UTC tz-awareness
            return "datetime64[ns, UTC]"

        if dt in ("TIMESTAMP", "DATETIME"):
            # tz-naive timestamps stay naive; no UTC suffix here
            return "datetime64[ns]"
        if dt == "DATE":
            return "datetime64[ns]"  # pandas treats it as midnight

        # --- integers -------------------------------------------------------
        if dt in ("TINYINT", "SMALLINT", "INTEGER", "INT", "BIGINT"):
            return pd.Int64Dtype()  # nullable 64-bit int

        # --- floats / numerics ----------------------------------------------
        if dt in ("REAL", "FLOAT", "DOUBLE", "DECIMAL"):
            return "float64"

        # --- booleans --------------------------------------------------------
        if dt == "BOOLEAN":
            return pd.BooleanDtype()  # nullable boolean

        # --- everything else -------------------------------------------------
        return pd.StringDtype()  # pandas' native nullable string
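
    # Examples of the two mappings (illustrative values):
    #   _pandas_to_duck(np.dtype("float64"))            -> "DOUBLE"
    #   _pandas_to_duck(pd.DatetimeTZDtype(tz="UTC"))   -> "TIMESTAMPTZ"
    #   _duck_to_pandas("BIGINT", data_frequency)       -> pd.Int64Dtype()   (nullable)
    #   _duck_to_pandas("TIMESTAMPTZ", data_frequency)  -> "datetime64[ns, UTC]"
    # Note the asymmetry: plain TIMESTAMP columns come back tz-naive ("datetime64[ns]").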

    # ─────────────────────────────────────────────────────────────────────── #
    #  3. OVERNIGHT DEDUP & COMPACTION                                        #
    # ─────────────────────────────────────────────────────────────────────── #
    def overnight_dedup(self, table: str, date: Optional[datetime.date] = None) -> None:
        """
        Keep only the newest row per (time_index, unique_identifier) in each
        partition and coalesce small files into a single Parquet file.

        Run this once a day during low-traffic hours.
        """
        # --- select partitions to touch -------------------------------------
        base = f"{self.db_path}/{table}"
        selector = fs.FileSelector(base, recursive=True)
        dirs = {info.path.rpartition("/")[0] for info in self._fs.get_file_info(selector)
                if info.type == fs.FileType.File
                and info.path.endswith(".parquet")}

        if date:
            y, m, d = date.year, date.month, date.day
            # Day-level filtering only applies to day-partitioned (minute-frequency) layouts;
            # partitions without a day= segment are matched on year/month alone.
            dirs = {p for p in dirs if
                    f"year={y:04d}" in p and f"month={m:02d}" in p
                    and ("day=" not in p or f"day={d:02d}" in p)}

        for part_path in sorted(dirs):
            tmp_file = f"{part_path}/compact-{uuid.uuid4().hex}.parquet"

            # DuckDB SQL: window-deduplicate & write in one shot.
            # filename=true exposes each row's source file in a 'filename' column, which ranks
            # duplicates (newest file wins); the helper columns are dropped before writing.
            copy_sql = f"""
                COPY (
                    SELECT * EXCLUDE (rn, filename)
                    FROM (
                        SELECT *,
                               ROW_NUMBER() OVER (
                                   PARTITION BY time_index, unique_identifier
                                   ORDER BY filename DESC
                               ) AS rn
                        FROM parquet_scan('{part_path}/*.parquet',
                                          hive_partitioning=TRUE,
                                          filename=true)
                    )
                    WHERE rn = 1
                )
                TO '{tmp_file}'
                (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000)
            """
            self.con.execute(copy_sql)

            # remove old fragments & leave only the compacted file
            for info in self._fs.get_file_info(fs.FileSelector(part_path)):
                if info.type == fs.FileType.File and info.path != tmp_file:
                    self._fs.delete_file(info.path)

            # Optionally rename to a deterministic name; here we just keep tmp_file
            logger.info(f"Compacted + de-duplicated partition {part_path}")
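
    # Scheduling sketch (illustration only; `store` stands for an instance of this interface):
    # run the compaction nightly, e.g. from a cron-driven maintenance script:
    #
    #   for name in store.list_tables():
    #       store.overnight_dedup(name)                                   # all partitions
    #   # or restrict the pass to yesterday's partitions of one table:
    #   store.overnight_dedup("prices_1m",
    #                         date=datetime.date.today() - datetime.timedelta(days=1))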