mainsequence-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. mainsequence/__init__.py +0 -0
  2. mainsequence/__main__.py +9 -0
  3. mainsequence/cli/__init__.py +1 -0
  4. mainsequence/cli/api.py +157 -0
  5. mainsequence/cli/cli.py +442 -0
  6. mainsequence/cli/config.py +78 -0
  7. mainsequence/cli/ssh_utils.py +126 -0
  8. mainsequence/client/__init__.py +17 -0
  9. mainsequence/client/base.py +431 -0
  10. mainsequence/client/data_sources_interfaces/__init__.py +0 -0
  11. mainsequence/client/data_sources_interfaces/duckdb.py +1468 -0
  12. mainsequence/client/data_sources_interfaces/timescale.py +479 -0
  13. mainsequence/client/models_helpers.py +113 -0
  14. mainsequence/client/models_report_studio.py +412 -0
  15. mainsequence/client/models_tdag.py +2276 -0
  16. mainsequence/client/models_vam.py +1983 -0
  17. mainsequence/client/utils.py +387 -0
  18. mainsequence/dashboards/__init__.py +0 -0
  19. mainsequence/dashboards/streamlit/__init__.py +0 -0
  20. mainsequence/dashboards/streamlit/assets/config.toml +12 -0
  21. mainsequence/dashboards/streamlit/assets/favicon.png +0 -0
  22. mainsequence/dashboards/streamlit/assets/logo.png +0 -0
  23. mainsequence/dashboards/streamlit/core/__init__.py +0 -0
  24. mainsequence/dashboards/streamlit/core/theme.py +212 -0
  25. mainsequence/dashboards/streamlit/pages/__init__.py +0 -0
  26. mainsequence/dashboards/streamlit/scaffold.py +220 -0
  27. mainsequence/instrumentation/__init__.py +7 -0
  28. mainsequence/instrumentation/utils.py +101 -0
  29. mainsequence/instruments/__init__.py +1 -0
  30. mainsequence/instruments/data_interface/__init__.py +10 -0
  31. mainsequence/instruments/data_interface/data_interface.py +361 -0
  32. mainsequence/instruments/instruments/__init__.py +3 -0
  33. mainsequence/instruments/instruments/base_instrument.py +85 -0
  34. mainsequence/instruments/instruments/bond.py +447 -0
  35. mainsequence/instruments/instruments/european_option.py +74 -0
  36. mainsequence/instruments/instruments/interest_rate_swap.py +217 -0
  37. mainsequence/instruments/instruments/json_codec.py +585 -0
  38. mainsequence/instruments/instruments/knockout_fx_option.py +146 -0
  39. mainsequence/instruments/instruments/position.py +475 -0
  40. mainsequence/instruments/instruments/ql_fields.py +239 -0
  41. mainsequence/instruments/instruments/vanilla_fx_option.py +107 -0
  42. mainsequence/instruments/pricing_models/__init__.py +0 -0
  43. mainsequence/instruments/pricing_models/black_scholes.py +49 -0
  44. mainsequence/instruments/pricing_models/bond_pricer.py +182 -0
  45. mainsequence/instruments/pricing_models/fx_option_pricer.py +90 -0
  46. mainsequence/instruments/pricing_models/indices.py +350 -0
  47. mainsequence/instruments/pricing_models/knockout_fx_pricer.py +209 -0
  48. mainsequence/instruments/pricing_models/swap_pricer.py +502 -0
  49. mainsequence/instruments/settings.py +175 -0
  50. mainsequence/instruments/utils.py +29 -0
  51. mainsequence/logconf.py +284 -0
  52. mainsequence/reportbuilder/__init__.py +0 -0
  53. mainsequence/reportbuilder/__main__.py +0 -0
  54. mainsequence/reportbuilder/examples/ms_template_report.py +706 -0
  55. mainsequence/reportbuilder/model.py +713 -0
  56. mainsequence/reportbuilder/slide_templates.py +532 -0
  57. mainsequence/tdag/__init__.py +8 -0
  58. mainsequence/tdag/__main__.py +0 -0
  59. mainsequence/tdag/config.py +129 -0
  60. mainsequence/tdag/data_nodes/__init__.py +12 -0
  61. mainsequence/tdag/data_nodes/build_operations.py +751 -0
  62. mainsequence/tdag/data_nodes/data_nodes.py +1292 -0
  63. mainsequence/tdag/data_nodes/persist_managers.py +812 -0
  64. mainsequence/tdag/data_nodes/run_operations.py +543 -0
  65. mainsequence/tdag/data_nodes/utils.py +24 -0
  66. mainsequence/tdag/future_registry.py +25 -0
  67. mainsequence/tdag/utils.py +40 -0
  68. mainsequence/virtualfundbuilder/__init__.py +45 -0
  69. mainsequence/virtualfundbuilder/__main__.py +235 -0
  70. mainsequence/virtualfundbuilder/agent_interface.py +77 -0
  71. mainsequence/virtualfundbuilder/config_handling.py +86 -0
  72. mainsequence/virtualfundbuilder/contrib/__init__.py +0 -0
  73. mainsequence/virtualfundbuilder/contrib/apps/__init__.py +8 -0
  74. mainsequence/virtualfundbuilder/contrib/apps/etf_replicator_app.py +164 -0
  75. mainsequence/virtualfundbuilder/contrib/apps/generate_report.py +292 -0
  76. mainsequence/virtualfundbuilder/contrib/apps/load_external_portfolio.py +107 -0
  77. mainsequence/virtualfundbuilder/contrib/apps/news_app.py +437 -0
  78. mainsequence/virtualfundbuilder/contrib/apps/portfolio_report_app.py +91 -0
  79. mainsequence/virtualfundbuilder/contrib/apps/portfolio_table.py +95 -0
  80. mainsequence/virtualfundbuilder/contrib/apps/run_named_portfolio.py +45 -0
  81. mainsequence/virtualfundbuilder/contrib/apps/run_portfolio.py +40 -0
  82. mainsequence/virtualfundbuilder/contrib/apps/templates/base.html +147 -0
  83. mainsequence/virtualfundbuilder/contrib/apps/templates/report.html +77 -0
  84. mainsequence/virtualfundbuilder/contrib/data_nodes/__init__.py +5 -0
  85. mainsequence/virtualfundbuilder/contrib/data_nodes/external_weights.py +61 -0
  86. mainsequence/virtualfundbuilder/contrib/data_nodes/intraday_trend.py +149 -0
  87. mainsequence/virtualfundbuilder/contrib/data_nodes/market_cap.py +310 -0
  88. mainsequence/virtualfundbuilder/contrib/data_nodes/mock_signal.py +78 -0
  89. mainsequence/virtualfundbuilder/contrib/data_nodes/portfolio_replicator.py +269 -0
  90. mainsequence/virtualfundbuilder/contrib/prices/__init__.py +1 -0
  91. mainsequence/virtualfundbuilder/contrib/prices/data_nodes.py +810 -0
  92. mainsequence/virtualfundbuilder/contrib/prices/utils.py +11 -0
  93. mainsequence/virtualfundbuilder/contrib/rebalance_strategies/__init__.py +1 -0
  94. mainsequence/virtualfundbuilder/contrib/rebalance_strategies/rebalance_strategies.py +313 -0
  95. mainsequence/virtualfundbuilder/data_nodes.py +637 -0
  96. mainsequence/virtualfundbuilder/enums.py +23 -0
  97. mainsequence/virtualfundbuilder/models.py +282 -0
  98. mainsequence/virtualfundbuilder/notebook_handling.py +42 -0
  99. mainsequence/virtualfundbuilder/portfolio_interface.py +272 -0
  100. mainsequence/virtualfundbuilder/resource_factory/__init__.py +0 -0
  101. mainsequence/virtualfundbuilder/resource_factory/app_factory.py +170 -0
  102. mainsequence/virtualfundbuilder/resource_factory/base_factory.py +238 -0
  103. mainsequence/virtualfundbuilder/resource_factory/rebalance_factory.py +101 -0
  104. mainsequence/virtualfundbuilder/resource_factory/signal_factory.py +183 -0
  105. mainsequence/virtualfundbuilder/utils.py +381 -0
  106. mainsequence-2.0.0.dist-info/METADATA +105 -0
  107. mainsequence-2.0.0.dist-info/RECORD +110 -0
  108. mainsequence-2.0.0.dist-info/WHEEL +5 -0
  109. mainsequence-2.0.0.dist-info/licenses/LICENSE +40 -0
  110. mainsequence-2.0.0.dist-info/top_level.txt +1 -0
mainsequence/client/data_sources_interfaces/duckdb.py
@@ -0,0 +1,1468 @@
1
+ from __future__ import annotations
2
+
3
+ import os
+ from typing import Optional, Literal, List, Dict, TypedDict, Tuple, Any
+ import pyarrow.fs as pafs
+
+ import duckdb
+ import pandas as pd
8
+ from pathlib import Path
9
+ import datetime
10
+ from mainsequence.logconf import logger
11
+ import pyarrow as pa
12
+ import pyarrow.parquet as pq
13
+ from ..utils import DataFrequency,UniqueIdentifierRangeMap
14
+ import uuid
15
+ from pyarrow import fs
16
+
17
+ def get_logger():
+ global logger
+
+ # bind() returns a bound copy, so re-assign it to attach the sub_application field
+ logger = logger.bind(sub_application="duck_db_interface")
+ return logger
+
+ logger = get_logger()
25
+
26
+ def _list_parquet_files(filesystem, dir_path: str) -> list[str]:
+ infos = filesystem.get_file_info(pafs.FileSelector(dir_path, recursive=False))
+ return [i.path for i in infos
+ if i.type == pafs.FileType.File and i.path.endswith(".parquet")]
30
+
31
+
32
+
33
+
34
+
35
+ class DuckDBInterface:
36
+ """
37
+ Persist/serve (time_index, unique_identifier, …) DataFrames as Hive-partitioned Parquet files queried through DuckDB views.
38
+ """
39
+
40
+ def __init__(self, db_path: Optional[str | Path] = None):
41
+ """
42
+ Initializes the interface with the path to the DuckDB database file.
43
+
44
+ Args:
45
+ db_path (Optional[str | Path]): Base path (or URI) of the database directory.
+ Defaults to the DUCKDB_PATH environment variable,
+ or to '<TDAG_DATA_PATH>/duck_db' if the variable is not set.
49
+ """
50
+ from mainsequence.tdag.config import TDAG_DATA_PATH
51
+ # ── choose default & normalise to string ───────────────────────────
52
+ default_path = os.getenv(
53
+ "DUCKDB_PATH",
54
+ os.path.join(f"{TDAG_DATA_PATH}", "duck_db"),
55
+ )
56
+ db_uri = str(db_path or default_path).rstrip("/")
57
+
58
+ # ── FileSystem abstraction (works for local & S3) ──────────────────
59
+ self._fs, self._object_path = fs.FileSystem.from_uri(db_uri)
60
+
61
+ # ── DuckDB connection ──────────────────────────────────────────────
62
+ # • local → store meta‑data in a .duckdb file under db_uri
63
+ # • remote → in‑memory DB; still works because all user data
64
+ # lives in Parquet on the object store
65
+ if db_uri.startswith("s3://") or db_uri.startswith("gs://"):
66
+ self.con = duckdb.connect(":memory:")
67
+ # duckdb needs the httpfs extension for S3
68
+ self.con.execute("INSTALL httpfs;")
69
+ self.con.execute("LOAD httpfs;")
70
+ else:
71
+ meta_file = Path(db_uri) / "duck_meta.duckdb"
72
+ meta_file.parent.mkdir(parents=True, exist_ok=True)
73
+ self.con = duckdb.connect(str(meta_file))
74
+
75
+ # ── sane defaults ──────────────────────────────────────────────────
76
+ self.con.execute("PRAGMA threads = 4")
77
+ self.con.execute("PRAGMA enable_object_cache = true")
78
+ self.con.execute("SET TIMEZONE = 'UTC';")
79
+
80
+ self.db_path = db_uri # keep the fully‑qualified URI
81
+
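A minimal construction sketch for the class above; the paths and bucket name are illustrative, not package defaults:

    from mainsequence.client.data_sources_interfaces.duckdb import DuckDBInterface

    # Local directory: metadata goes to <path>/duck_meta.duckdb, data to Parquet partitions.
    db = DuckDBInterface("/tmp/analytics_duck_db")

    # Object store: DuckDB runs in-memory and loads the httpfs extension.
    # db = DuckDBInterface("s3://my-bucket/analytics")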
82
+ def launch_gui(self, host='localhost', port=4213, timeout=0.5):
83
+ import duckdb
84
+ import socket
85
+
86
+ def ui_is_running(host, port, timeout):
87
+ """Returns True if something is listening on host:port."""
88
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
89
+ s.settimeout(timeout)
90
+ try:
91
+ s.connect((host, port))
92
+ return True
93
+ except (ConnectionRefusedError, socket.timeout):
94
+ return False
95
+
96
+ # 1. Reuse the interface's connection (self.db_path is a directory/URI, not a .duckdb file)
+ conn = self.con
98
+
99
+ # 2. Decide whether to start the UI
100
+ url = f"http://{host}:{port}"
101
+ if not ui_is_running(host, port, timeout):
102
+ # (first‐time only) install and load the UI extension
103
+ # conn.execute("INSTALL ui;")
104
+ # conn.execute("LOAD ui;")
105
+ # spin up the HTTP server and open your browser
106
+ conn.execute("CALL start_ui();")
107
+ print(f"DuckDB Explorer launched at {url}")
108
+ else:
109
+ print(f"DuckDB Explorer is already running at {url}")
110
+
111
+ # ──────────────────────────────────────────────────────────────────────────────
112
+ # Public API
113
+ # ──────────────────────────────────────────────────────────────────────────────
114
+
115
+ def time_index_minima(
116
+ self,
117
+ table: str,
118
+ ids: Optional[List[str]] = None,
119
+ ) -> Tuple[Optional[pd.Timestamp], Dict[Any, Optional[pd.Timestamp]]]:
120
+ """
121
+ Compute the minimum time_index over the entire dataset AND the minimum per unique_identifier.
122
+
123
+ Returns:
124
+ (global_min, per_id_dict)
125
+
126
+ global_min: pd.Timestamp (UTC) or None if table is empty / all-NULL
127
+ per_id_dict: {uid: pd.Timestamp (UTC) or None} for each distinct uid (after optional filtering)
128
+
129
+ Fast path:
130
+ Uses a single scan with GROUPING SETS ((), (unique_identifier)), reading only
131
+ (unique_identifier, time_index). DuckDB will push projection to Parquet and parallelize.
132
+
133
+ Fallback:
134
+ Runs two simple queries (global MIN + per-id MIN) if GROUPING SETS isn't supported
135
+ in your DuckDB build.
136
+
137
+ Args:
138
+ table: logical name (your view name); if the view is missing, we scan the Parquet
139
+ directly under {self.db_path}/{table}/**/*.parquet with hive_partitioning.
140
+ ids: optional list; if provided, restricts to those unique_identifiers only.
141
+ """
142
+ import duckdb
143
+ import pandas as pd
144
+ from typing import Any, Dict, Optional, Tuple, List
145
+
146
+ def qident(name: str) -> str:
147
+ return '"' + str(name).replace('"', '""') + '"'
148
+
149
+ qtbl = qident(table)
150
+ qid = qident("unique_identifier")
151
+ qts = qident("time_index")
152
+
153
+ # --- Choose fastest reliable source relation ---
154
+ # Prefer scanning the view if it exists (it normalizes schema); otherwise scan Parquet directly.
155
+ try:
156
+ use_view = bool(self.table_exists(table))
157
+ except Exception:
158
+ use_view = False
159
+
160
+ file_glob = f"{self.db_path}/{table}/**/*.parquet"
161
+ src_rel = (
162
+ qtbl
163
+ if use_view
164
+ else f"parquet_scan('{file_glob}', hive_partitioning=TRUE, union_by_name=TRUE)"
165
+ )
166
+
167
+ # Optional filter to reduce the output cardinality if the caller only cares about some ids
168
+ params: List[Any] = []
169
+ where_clause = ""
170
+ if ids:
171
+ placeholders = ", ".join("?" for _ in ids)
172
+ where_clause = f"WHERE {qid} IN ({placeholders})"
173
+ params.extend(list(ids))
174
+
175
+ # --- Single-pass: GROUPING SETS (grand total + per-id) ---
176
+ sql_one_pass = f"""
177
+ WITH src AS (
178
+ SELECT {qid} AS uid, {qts} AS ts
179
+ FROM {src_rel}
180
+ {where_clause}
181
+ )
182
+ SELECT
183
+ uid,
184
+ MIN(ts) AS min_val,
185
+ GROUPING(uid) AS is_total_row
186
+ FROM src
187
+ GROUP BY GROUPING SETS ((), (uid));
188
+ """
189
+
190
+ try:
191
+ rows = self.con.execute(sql_one_pass, params).fetchall()
192
+
193
+ global_min_raw: Optional[Any] = None
194
+ per_id_raw: Dict[Any, Optional[Any]] = {}
195
+
196
+ for uid, min_val, is_total in rows:
197
+ if is_total:
198
+ global_min_raw = min_val # grand total row
199
+ else:
200
+ per_id_raw[uid] = min_val
201
+
202
+ # Normalize to tz-aware pandas Timestamps (UTC) for consistency with your interface
203
+ to_ts = lambda v: pd.to_datetime(v, utc=True) if v is not None else None
204
+ global_min = to_ts(global_min_raw)
205
+ per_id = {uid: to_ts(v) for uid, v in per_id_raw.items()}
206
+ return global_min, per_id
207
+
208
+ except duckdb.Error as e:
209
+ # --- Fallback: two straightforward queries (still reads only needed columns) ---
210
+ logger.info(f"time_index_minima: GROUPING SETS path failed; falling back. Reason: {e}")
211
+
212
+ sql_global = f"""
213
+ SELECT MIN(ts)
214
+ FROM (
215
+ SELECT {qts} AS ts
216
+ FROM {src_rel}
217
+ {where_clause}
218
+ )
219
+ """
220
+ sql_per_id = f"""
221
+ SELECT uid, MIN(ts) AS min_val
222
+ FROM (
223
+ SELECT {qid} AS uid, {qts} AS ts
224
+ FROM {src_rel}
225
+ {where_clause}
226
+ )
227
+ GROUP BY uid
228
+ """
229
+
230
+ global_min_raw = self.con.execute(sql_global, params).fetchone()[0]
231
+ pairs = self.con.execute(sql_per_id, params).fetchall()
232
+
233
+ to_ts = lambda v: pd.to_datetime(v, utc=True) if v is not None else None
234
+ global_min = to_ts(global_min_raw)
235
+ per_id = {uid: to_ts(min_val) for uid, min_val in pairs}
236
+ return global_min, per_id
237
+
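A usage sketch for the method above, reusing a DuckDBInterface instance `db`; the table name and identifiers are hypothetical:

    global_min, per_id = db.time_index_minima("prices", ids=["AAPL", "MSFT"])
    # global_min -> earliest time_index across the table (UTC Timestamp) or None
    # per_id     -> {"AAPL": Timestamp(...), "MSFT": Timestamp(...)}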
238
+ def remove_columns(self, table: str, columns: List[str]) -> Dict[str, Any]:
239
+ """
240
+ Forcefully drop the given columns from the dataset backing `table`.
241
+
242
+ Behavior:
243
+ • Rebuilds *every* partition directory (year=/month=/[day=]) into one new Parquet file.
244
+ • Drops the requested columns that exist in that partition (others are ignored).
245
+ • Always deletes the old Parquet fragments after the new file is written.
246
+ • Always refreshes the view to reflect the new schema.
247
+
248
+ Notes:
249
+ • Protected keys to keep storage model consistent:
250
+ {'time_index','unique_identifier','year','month','day'} are not dropped.
251
+ • If a requested column doesn’t exist in some partitions, those partitions are still rebuilt.
252
+ • Destructive and idempotent.
253
+ """
254
+ import uuid
255
+ import duckdb
256
+
257
+ def qident(name: str) -> str:
258
+ return '"' + str(name).replace('"', '""') + '"'
259
+
260
+ requested = list(dict.fromkeys(columns or []))
261
+ protected = {"time_index", "unique_identifier", "year", "month", "day"}
262
+
263
+ # Discover unified schema to know which requested columns actually exist
264
+ file_glob = f"{self.db_path}/{table}/**/*.parquet"
265
+ try:
266
+ desc_rows = self.con.execute(
267
+ f"DESCRIBE SELECT * FROM read_parquet('{file_glob}', "
268
+ f"union_by_name=TRUE, hive_partitioning=TRUE)"
269
+ ).fetchall()
270
+ present_cols = {r[0] for r in desc_rows}
271
+ except duckdb.Error as e:
272
+ logger.error(f"remove_columns: cannot scan files for '{table}': {e}")
273
+ try:
274
+ self._ensure_view(table)
275
+ except Exception as ev:
276
+ logger.warning(f"remove_columns: _ensure_view failed after scan error: {ev}")
277
+ return {"dropped": [], "skipped": requested, "partitions_rebuilt": 0, "files_deleted": 0}
278
+
279
+ to_drop_global = [c for c in requested if c in present_cols and c not in protected]
280
+ skipped_global = [c for c in requested if c not in present_cols or c in protected]
281
+
282
+ # Enumerate all partition directories that currently contain Parquet files
283
+ selector = fs.FileSelector(f"{self.db_path}/{table}", recursive=True)
284
+ infos = self._fs.get_file_info(selector)
285
+ part_dirs = sorted({
286
+ info.path.rpartition("/")[0]
287
+ for info in infos
288
+ if info.type == fs.FileType.File and info.path.endswith(".parquet")
289
+ })
290
+
291
+ if not part_dirs:
292
+ logger.info(f"remove_columns: table '{table}' has no Parquet files.")
293
+ try:
294
+ self._ensure_view(table)
295
+ except Exception as ev:
296
+ logger.warning(f"remove_columns: _ensure_view failed on empty table: {ev}")
297
+ return {"dropped": to_drop_global, "skipped": skipped_global,
298
+ "partitions_rebuilt": 0, "files_deleted": 0}
299
+
300
+ partitions_rebuilt = 0
301
+ files_deleted = 0
302
+
303
+ try:
304
+ for part_path in part_dirs:
305
+ # 1) Partition-local schema WITHOUT filename helper (stable "real" columns)
306
+ try:
307
+ part_desc = self.con.execute(
308
+ f"DESCRIBE SELECT * FROM parquet_scan('{part_path}/*.parquet', "
309
+ f" hive_partitioning=TRUE, union_by_name=TRUE)"
310
+ ).fetchall()
311
+ # Preserve order returned by DESCRIBE for deterministic output
312
+ part_cols_ordered = [r[0] for r in part_desc]
313
+ part_cols_set = set(part_cols_ordered)
314
+ except duckdb.Error as e:
315
+ logger.warning(f"remove_columns: skipping partition due to scan error at {part_path}: {e}")
316
+ continue
317
+
318
+ to_drop_here = [c for c in to_drop_global if c in part_cols_set]
319
+
320
+ # 2) Columns to keep (explicit projection → safest)
321
+ keep_cols = [c for c in part_cols_ordered if c not in to_drop_here]
322
+ if not keep_cols:
323
+ # Should not happen due to 'protected', but guard anyway
324
+ logger.warning(f"remove_columns: nothing to write after drops in {part_path}; skipping")
325
+ continue
326
+ keep_csv = ", ".join(qident(c) for c in keep_cols)
327
+
328
+ # 3) Detect the actual helper file-path column name added by filename=TRUE
329
+ # by comparing with/without filename=TRUE.
330
+ try:
331
+ fname_desc = self.con.execute(
332
+ f"DESCRIBE SELECT * FROM parquet_scan('{part_path}/*.parquet', "
333
+ f" hive_partitioning=TRUE, union_by_name=TRUE, filename=TRUE)"
334
+ ).fetchall()
335
+ cols_with_fname = {r[0] for r in fname_desc}
336
+ added_by_filename = cols_with_fname - part_cols_set # usually {'filename'} or {'file_name', ...}
337
+ file_col = next(iter(added_by_filename), None)
338
+ except duckdb.Error:
339
+ file_col = None
340
+
341
+ # 4) Decide ordering key for recency; fall back to time_index if helper missing
342
+ order_key = qident(file_col) if file_col else "time_index"
343
+
344
+ # 5) Rebuild partition with explicit projection + window de-dup
345
+ tmp_file = f"{part_path}/rebuild-{uuid.uuid4().hex}.parquet"
346
+ copy_sql = f"""
347
+ COPY (
348
+ SELECT {keep_csv}
349
+ FROM (
350
+ SELECT {keep_csv},
351
+ ROW_NUMBER() OVER (
352
+ PARTITION BY time_index, unique_identifier
353
+ ORDER BY {order_key} DESC
354
+ ) AS rn
355
+ FROM parquet_scan('{part_path}/*.parquet',
356
+ hive_partitioning=TRUE,
357
+ union_by_name=TRUE,
358
+ filename=TRUE)
359
+ )
360
+ WHERE rn = 1
361
+ )
362
+ TO '{tmp_file}'
363
+ (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000)
364
+ """
365
+ try:
366
+ self.con.execute(copy_sql)
367
+ except duckdb.Error as e:
368
+ logger.error(f"remove_columns: COPY failed for partition {part_path}: {e}")
369
+ raise
370
+
371
+ # 6) Delete all old fragments, keep only the new file
372
+ try:
373
+ current_infos = self._fs.get_file_info(fs.FileSelector(part_path))
374
+ for fi in current_infos:
375
+ if fi.type == fs.FileType.File and fi.path.endswith(".parquet") and fi.path != tmp_file:
376
+ self._fs.delete_file(fi.path)
377
+ files_deleted += 1
378
+ except Exception as cleanup_e:
379
+ logger.warning(f"remove_columns: cleanup failed in {part_path}: {cleanup_e}")
380
+
381
+ partitions_rebuilt += 1
382
+
383
+ finally:
384
+ # Ensure logical schema matches physical files
385
+ try:
386
+ self._ensure_view(table)
387
+ except Exception as ev:
388
+ logger.warning(f"remove_columns: _ensure_view failed after rebuild: {ev}")
389
+
390
+ return {
391
+ "dropped": to_drop_global,
392
+ "skipped": skipped_global,
393
+ "partitions_rebuilt": partitions_rebuilt,
394
+ "files_deleted": files_deleted,
395
+ }
396
+
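A hedged example of the call above; the column names and counts are hypothetical:

    result = db.remove_columns("prices", ["stale_feature", "unique_identifier"])
    # 'unique_identifier' is protected, so it lands in "skipped"; the report looks like
    # {"dropped": ["stale_feature"], "skipped": ["unique_identifier"],
    #  "partitions_rebuilt": 12, "files_deleted": 30}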
397
+ def upsert(self, df: pd.DataFrame, table: str,
398
+ data_frequency: DataFrequency = DataFrequency.one_m) -> None:
399
+ """
400
+ Idempotently writes a DataFrame into *table* using (time_index, unique_identifier) PK.
401
+ Extra columns are added to the table automatically.
402
+ """
403
+ import os
404
+ import uuid
405
+ import datetime
406
+ from pyarrow import fs # used for cleanup listing
407
+
408
+ if df.empty:
409
+ logger.warning(f"Attempted to upsert an empty DataFrame to table '{table}'. Skipping.")
410
+ return
411
+
412
+ # —— basic hygiene ——--------------------------------------------------
413
+ df = df.copy()
414
+ df["time_index"] = pd.to_datetime(df["time_index"], utc=True)
415
+ if "unique_identifier" not in df.columns:
416
+ df["unique_identifier"] = "" # degenerate PK for daily data
417
+ df["unique_identifier"] = df["unique_identifier"].astype(str) # ADDED: harden as text
418
+
419
+ # —— derive partition columns ——---------------------------------------
420
+ partitions = self._partition_keys(df["time_index"], data_frequency=data_frequency)
421
+ for col, values in partitions.items():
422
+ df[col] = values
423
+ part_cols = list(partitions)
424
+
425
+ logger.debug(f"Starting upsert of {len(df)} rows into table '{table}' in {self.db_path}")
426
+
427
+ # —— de‑duplication inside *this* DataFrame ——--------------------------
428
+ df = df.drop_duplicates(subset=["time_index", "unique_identifier"], keep="last")
429
+
430
+ # ── Write each partition safely ─────────────────────────────────
431
+ for keys, sub in df.groupby(part_cols, sort=False):
432
+ part_path = self._partition_path(dict(zip(part_cols, keys)), table=table)
433
+ self._fs.create_dir(part_path, recursive=True)
434
+
435
+ # Register incoming batch as a DuckDB relation
436
+ self.con.register("incoming_sub", sub)
437
+
438
+ # Detect presence of existing files and time-range overlap (cheap)
439
+ has_existing = False
440
+ time_overlap = False
441
+ try:
442
+ row = self.con.execute(
443
+ f"""
444
+ SELECT min(time_index) AS mn, max(time_index) AS mx
445
+ FROM parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE)
446
+ """
447
+ ).fetchone()
448
+ if row and row[0] is not None:
449
+ has_existing = True
450
+ mn = pd.to_datetime(row[0], utc=True)
451
+ mx = pd.to_datetime(row[1], utc=True)
452
+ smin = sub["time_index"].min()
453
+ smax = sub["time_index"].max()
454
+ time_overlap = not (smax < mn or smin > mx)
455
+ except Exception:
456
+ has_existing = False
457
+
458
+ # Exact PK overlap check (only if time windows overlap)
459
+ overlap_exists = False
460
+ if has_existing and time_overlap:
461
+ overlap_exists = bool(self.con.execute(
462
+ f"""
463
+ SELECT EXISTS (
464
+ SELECT 1
465
+ FROM incoming_sub i
466
+ JOIN parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE) e
467
+ ON e.time_index = i.time_index
468
+ AND e.unique_identifier = i.unique_identifier
469
+ LIMIT 1
470
+ );
471
+ """
472
+ ).fetchone()[0])
473
+
474
+ # -------------------- Append path (no PK collision) --------------------
475
+ if not has_existing or not time_overlap or not overlap_exists:
476
+ try:
477
+ ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
478
+ final_name = f"part-{ts}-{uuid.uuid4().hex}.parquet"
479
+ tmp_name = final_name + ".tmp"
480
+ tmp_path = f"{part_path}/{tmp_name}"
481
+ final_path = f"{part_path}/{final_name}"
482
+
483
+ if has_existing:
484
+ # Keep dedup exact: write only rows not already present
485
+ anti_join_select = f"""
486
+ SELECT i.*
487
+ FROM incoming_sub i
488
+ WHERE NOT EXISTS (
489
+ SELECT 1
490
+ FROM parquet_scan('{part_path}/*.parquet', hive_partitioning=TRUE) e
491
+ WHERE e.time_index = i.time_index
492
+ AND e.unique_identifier = i.unique_identifier
493
+ )
494
+ ORDER BY i.unique_identifier, i.time_index
495
+ """
496
+ n_new = self.con.execute(
497
+ f"SELECT COUNT(*) FROM ({anti_join_select})"
498
+ ).fetchone()[0]
499
+ if n_new == 0:
500
+ continue
501
+ self.con.execute(
502
+ f"""
503
+ COPY ({anti_join_select})
504
+ TO '{tmp_path}'
505
+ (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
506
+ """
507
+ )
508
+ else:
509
+ # No existing files → safe to copy all incoming rows
510
+ self.con.execute(
511
+ f"""
512
+ COPY (
513
+ SELECT i.*
514
+ FROM incoming_sub i
515
+ ORDER BY i.unique_identifier, i.time_index
516
+ )
517
+ TO '{tmp_path}'
518
+ (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
519
+ """
520
+ )
521
+
522
+ # Atomic move into place
523
+ self._fs.move(tmp_path, final_path)
524
+ except Exception as e:
525
+ logger.exception(f"Append path failed for partition {keys}: {e}")
526
+ raise
527
+ continue
528
+
529
+ # -------------------- Rewrite path (true upsert) -----------------------
530
+ try:
531
+ # Discover existing/incoming schemas and build COALESCE projection
532
+ desc_rows = self.con.execute(
533
+ f"""
534
+ DESCRIBE SELECT * FROM parquet_scan(
535
+ '{part_path}/*.parquet',
536
+ hive_partitioning=TRUE,
537
+ union_by_name=TRUE
538
+ )
539
+ """
540
+ ).fetchall()
541
+ existing_cols = [r[0] for r in desc_rows]
542
+
543
+ # ADDED: also look at incoming schema so we can build a typed e-select
544
+ incoming_desc = self.con.execute("DESCRIBE SELECT * FROM incoming_sub").fetchall() # ADDED
545
+ incoming_cols = [r[0] for r in incoming_desc]
546
+
547
+ all_cols = list(dict.fromkeys(incoming_cols + existing_cols)) # deterministic order
548
+
549
+ def qident(name: str) -> str:
550
+ return '"' + str(name).replace('"', '""') + '"'
551
+
552
+ inc_set, ex_set = set(incoming_cols), set(existing_cols)
553
+
554
+ # CHANGED: Build merged projection with explicit BIGINT casts for partition cols
555
+ select_exprs = []
556
+ for c in all_cols:
557
+ qc = qident(c)
558
+ if c in inc_set and c in ex_set:
559
+ if c in part_cols:
560
+ select_exprs.append(
561
+ f"COALESCE(CAST(i.{qc} AS BIGINT), CAST(e.{qc} AS BIGINT)) AS {qc}") # CHANGED
562
+ else:
563
+ select_exprs.append(f"COALESCE(i.{qc}, e.{qc}) AS {qc}")
564
+ elif c in inc_set:
565
+ if c in part_cols:
566
+ select_exprs.append(f"CAST(i.{qc} AS BIGINT) AS {qc}") # CHANGED
567
+ else:
568
+ select_exprs.append(f"i.{qc} AS {qc}")
569
+ else: # only in existing
570
+ if c in part_cols:
571
+ select_exprs.append(f"CAST(e.{qc} AS BIGINT) AS {qc}") # CHANGED
572
+ else:
573
+ select_exprs.append(f"e.{qc} AS {qc}")
574
+ select_list = ", ".join(select_exprs)
575
+
576
+ # ADDED: Build a type-aligned projection of existing rows for the anti-join side
577
+ # (so UNION ALL BY NAME sees identical types, esp. for partitions)
578
+ e_select_exprs = []
579
+ for c in all_cols:
580
+ qc = qident(c)
581
+ if c in ex_set:
582
+ if c in part_cols:
583
+ e_select_exprs.append(f"CAST(e.{qc} AS BIGINT) AS {qc}") # ADDED
584
+ else:
585
+ e_select_exprs.append(f"e.{qc} AS {qc}")
586
+ else:
587
+ # Column exists only in incoming; let it be NULL here
588
+ if c in part_cols:
589
+ e_select_exprs.append(f"CAST(NULL AS BIGINT) AS {qc}") # ADDED
590
+ else:
591
+ e_select_exprs.append(f"NULL AS {qc}") # ADDED
592
+ e_select_list = ", ".join(e_select_exprs)
593
+
594
+ ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
595
+ final_name = f"part-{ts}-{uuid.uuid4().hex}.parquet"
596
+ tmp_name = final_name + ".tmp"
597
+ tmp_path = f"{part_path}/{tmp_name}"
598
+ final_path = f"{part_path}/{final_name}"
599
+
600
+ merge_sql = f"""
601
+ COPY (
602
+ WITH existing AS (
603
+ SELECT * FROM parquet_scan(
604
+ '{part_path}/*.parquet',
605
+ hive_partitioning=TRUE,
606
+ union_by_name=TRUE
607
+ )
608
+ ),
609
+ merged_incoming AS (
610
+ SELECT {select_list}
611
+ FROM incoming_sub i
612
+ LEFT JOIN existing e
613
+ ON e.time_index = i.time_index
614
+ AND e.unique_identifier = i.unique_identifier
615
+ )
616
+ SELECT *
617
+ FROM (
618
+ -- rows with incoming (coalesced over existing)
619
+ SELECT * FROM merged_incoming
620
+ UNION ALL BY NAME -- CHANGED: correct syntax order
621
+ -- keep existing rows that do not collide on PK
622
+ SELECT {e_select_list} -- REPLACED: used to be 'SELECT e.*'
623
+ FROM existing e
624
+ ANTI JOIN incoming_sub i
625
+ ON e.time_index = i.time_index
626
+ AND e.unique_identifier = i.unique_identifier
627
+ )
628
+ ORDER BY unique_identifier, time_index
629
+ )
630
+ TO '{tmp_path}'
631
+ (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000);
632
+ """
633
+ self.con.execute(merge_sql)
637
+ self._fs.move(tmp_path, final_path)
638
+
639
+ # Cleanup old parquet fragments, keep only the new file
640
+ try:
641
+ for fi in self._fs.get_file_info(fs.FileSelector(part_path)):
642
+ if (
643
+ fi.type == fs.FileType.File
644
+ and fi.path.endswith(".parquet")
645
+ and os.path.basename(fi.path) != final_name
646
+ ):
647
+ self._fs.delete_file(fi.path)
648
+ except Exception as cleanup_e:
649
+ logger.warning(f"Cleanup old parquet files failed in {part_path}: {cleanup_e}")
650
+
651
+ except Exception as e:
652
+ logger.exception(f"Rewrite path failed for partition {keys}: {e}")
653
+ raise
654
+
655
+ # ── Refresh view ────────────────────────────────────────────────
656
+ self._ensure_view(table=table)
657
+
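A minimal upsert sketch, assuming a toy DataFrame (the "close" column and tickers are illustrative):

    import pandas as pd

    df = pd.DataFrame({
        "time_index": pd.to_datetime(["2024-07-01 09:30", "2024-07-01 09:31"], utc=True),
        "unique_identifier": ["AAPL", "AAPL"],
        "close": [191.2, 191.4],
    })
    db.upsert(df, table="prices")   # first call writes a new Parquet fragment
    db.upsert(df, table="prices")   # replay is a no-op thanks to the (time_index, unique_identifier) check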
658
+ def table_exists(self, table) -> bool:
+ """Return True if a table or view named `table` exists in the main schema."""
+ count = self.con.execute("""
+ SELECT
+ (SELECT COUNT(*) FROM information_schema.tables
+ WHERE table_schema='main' AND table_name = ?) +
+ (SELECT COUNT(*) FROM information_schema.views
+ WHERE table_schema='main' AND table_name = ?)
+ """, [table, table]).fetchone()[0]
+ return count > 0
673
+
674
+ def constrain_read(
675
+ self,
676
+ table: str,
677
+ *,
678
+ start: Optional[datetime.datetime] = None,
679
+ end: Optional[datetime.datetime] = None,
680
+ ids: Optional[List[str]] = None,
681
+ unique_identifier_range_map: Optional[Dict[str, Dict[str, Any]]] = None,
682
+ max_rows: Optional[int] = None,
683
+ now: Optional[datetime.datetime] = None,
684
+ ) -> Tuple[
685
+ Optional[datetime.datetime], # adjusted_start
686
+ Optional[datetime.datetime], # adjusted_end
687
+ Optional[Dict[str, Dict[str, Any]]], # adjusted_unique_identifier_range_map
688
+ Dict[str, Any] # diagnostics
689
+ ]:
690
+ """
691
+ Constrain a prospective read so that the estimated number of rows does not exceed *max_rows*.
692
+ Estimation uses Parquet row-group metadata (min/max on time_index + num_rows), i.e. it does not
693
+ scan full data and does not depend on bar frequency.
694
+
695
+ Inputs are the same "shape" as DuckDBInterface.read(...). The function returns adjusted
696
+ (start, end, unique_identifier_range_map) that you can pass to read(...), plus diagnostics.
697
+
698
+ Row cap source:
699
+ • max_rows argument if provided
700
+ • else env MAX_READ_ROWS or TDAG_MAX_READ_ROWS
701
+ • else default 10_000_000
702
+
703
+ Behavior:
704
+ • Computes an overall effective [start, end] across inputs (range_map, start/end).
705
+ • Reads Parquet footers under {self.db_path}/{table}/**/*.parquet to gather row-group
706
+ (min_time, max_time, num_rows).
707
+ • If the estimated rows for [start, end] <= max_rows, returns inputs unchanged.
708
+ • Otherwise finds the latest limit_dt so that estimated rows in [start, limit_dt] ~= max_rows,
709
+ and tightens:
710
+ - global 'end' → min(end, limit_dt) if plain start/end was used
711
+ - per-uid 'end_date' in unique_identifier_range_map → min(existing, limit_dt)
712
+
713
+ Returns:
714
+ adjusted_start, adjusted_end, adjusted_unique_identifier_range_map, diagnostics
715
+ """
716
+ import os
717
+ import re
718
+ import math
719
+ import numpy as np
720
+ import pandas as pd
721
+ import pyarrow.parquet as pq
722
+ from pyarrow import fs as pa_fs
723
+ from calendar import monthrange
724
+
725
+ # --- helpers -------------------------------------------------------------
726
+
727
+ def _to_utc_ts(dt: Optional[datetime.datetime]) -> Optional[pd.Timestamp]:
728
+ if dt is None:
729
+ return None
730
+ ts = pd.to_datetime(dt, utc=True)
731
+ # normalize to UTC tz-aware pandas Timestamp
732
+ if not isinstance(ts, pd.Timestamp):
733
+ ts = pd.Timestamp(ts, tz="UTC")
734
+ elif ts.tz is None:
735
+ ts = ts.tz_localize("UTC")
736
+ else:
737
+ ts = ts.tz_convert("UTC")
738
+ return ts
739
+
740
+ def _effective_start_from_range_map(rmap: Dict[str, Dict[str, Any]]) -> Optional[pd.Timestamp]:
741
+ starts = []
742
+ for v in rmap.values():
743
+ s = v.get("start_date")
744
+ if s is None:
745
+ continue
746
+ ts = _to_utc_ts(s)
747
+ if ts is None:
748
+ continue
749
+ # operand '>' means open interval → start just after s (epsilon)
750
+ if v.get("start_date_operand") == ">":
751
+ ts = ts + pd.Timedelta(nanoseconds=1)
752
+ starts.append(ts)
753
+ return min(starts) if starts else None
754
+
755
+ def _effective_end_from_range_map(rmap: Dict[str, Dict[str, Any]]) -> Optional[pd.Timestamp]:
756
+ ends = []
757
+ for v in rmap.values():
758
+ e = v.get("end_date")
759
+ if e is not None:
760
+ ends.append(_to_utc_ts(e))
761
+ return max(ends) if ends else None
762
+
763
+ def _parse_part_bounds_from_path(path: str) -> Tuple[Optional[pd.Timestamp], Optional[pd.Timestamp]]:
764
+ """
765
+ Infer inclusive [partition_start, partition_end] purely from path components
766
+ like .../year=2024/month=07[/day=03]/file.parquet.
767
+ Returns (p_start, p_end) as UTC tz-aware Timestamps, or (None, None) if not parsable.
768
+ """
769
+ m_year = re.search(r"/year=(\d{4})(/|$)", path)
770
+ m_month = re.search(r"/month=(\d{2})(/|$)", path)
771
+ m_day = re.search(r"/day=(\d{2})(/|$)", path)
772
+ if not (m_year and m_month):
773
+ return None, None
774
+ y = int(m_year.group(1))
775
+ m = int(m_month.group(1))
776
+ if m_day:
777
+ d = int(m_day.group(1))
778
+ start = pd.Timestamp(datetime.datetime(y, m, d, 0, 0, 0, tzinfo=datetime.timezone.utc))
779
+ end = start + pd.Timedelta(days=1) - pd.Timedelta(nanoseconds=1)
780
+ return start, end
781
+ # month granularity
782
+ last_day = monthrange(y, m)[1]
783
+ start = pd.Timestamp(datetime.datetime(y, m, 1, 0, 0, 0, tzinfo=datetime.timezone.utc))
784
+ end = pd.Timestamp(datetime.datetime(y, m, last_day, 23, 59, 59, tzinfo=datetime.timezone.utc)) \
785
+ + pd.Timedelta(seconds=0.999999999) # inclusive
786
+ return start, end
787
+
788
+ def _collect_row_groups_meta(file_path: str) -> Tuple[
789
+ List[Tuple[pd.Timestamp, pd.Timestamp, int]], Optional[str]]:
790
+ """
791
+ Return list of (rg_min, rg_max, rg_rows) for 'time_index' from Parquet footer.
792
+ If stats are missing, returns empty list and a reason.
793
+ """
794
+ try:
795
+ pf = pq.ParquetFile(file_path, filesystem=self._fs)
796
+ except Exception as e:
797
+ return [], f"open_error:{e}"
798
+
799
+ # Find Arrow time unit (ns/us/ms/s)
800
+ try:
801
+ arr_schema = pf.schema_arrow
802
+ tfield = arr_schema.field("time_index")
803
+ ttype = tfield.type # pyarrow.TimestampType
804
+ unit = getattr(ttype, "unit", "ns")
805
+ except Exception:
806
+ # If schema isn't Arrow-resolvable, fall back to 'ns'
807
+ unit = "ns"
808
+
809
+ rg_list: List[Tuple[pd.Timestamp, pd.Timestamp, int]] = []
810
+ try:
811
+ meta = pf.metadata
812
+ nrg = meta.num_row_groups
813
+ # Find column index by name (robust against nested/flat)
814
+ col_idx = None
815
+ # Try direct mapping via schema_arrow index first
816
+ try:
817
+ idx = arr_schema.get_field_index("time_index")
818
+ if idx != -1:
819
+ col_idx = idx
820
+ except Exception:
821
+ col_idx = None
822
+ for i in range(nrg):
823
+ rg = meta.row_group(i)
824
+ # choose the column chunk for time_index
825
+ col = None
826
+ if col_idx is not None and col_idx < rg.num_columns:
827
+ col = rg.column(col_idx)
828
+ # Verify we picked the right column name; if not, search by path
829
+ try:
830
+ name = str(col.path_in_schema)
831
+ if name.split(".")[-1] != "time_index":
832
+ col = None
833
+ # else ok
834
+ except Exception:
835
+ # fall back to search
836
+ col = None
837
+ if col is None:
838
+ for j in range(rg.num_columns):
839
+ cj = rg.column(j)
840
+ try:
841
+ name = str(cj.path_in_schema)
842
+ except Exception:
843
+ name = ""
844
+ if name.split(".")[-1] == "time_index":
845
+ col = cj
846
+ break
847
+ if col is None:
848
+ # can't find time_index column → skip this row group
849
+ continue
850
+ stats = getattr(col, "statistics", None)
851
+ if not stats or not getattr(stats, "has_min_max", False):
852
+ continue
853
+ vmin, vmax = stats.min, stats.max
854
+
855
+ # Convert min/max to UTC Timestamps
856
+ def conv(v):
857
+ if v is None:
858
+ return None
859
+ # numeric epoch in unit
860
+ if isinstance(v, (int, np.integer)):
861
+ return pd.to_datetime(int(v), unit=unit, utc=True)
862
+ if isinstance(v, float):
863
+ return pd.to_datetime(int(v), unit=unit, utc=True)
864
+ # already datetime-like
865
+ try:
866
+ return pd.to_datetime(v, utc=True)
867
+ except Exception:
868
+ return None
869
+
870
+ tmin = conv(vmin)
871
+ tmax = conv(vmax)
872
+ if tmin is None or tmax is None:
873
+ continue
874
+ # ensure ordering
875
+ if tmax < tmin:
876
+ tmin, tmax = tmax, tmin
877
+ rg_list.append((tmin, tmax, rg.num_rows))
878
+ except Exception as e:
879
+ return [], f"meta_error:{e}"
880
+ return rg_list, None
881
+
882
+ def _rows_estimate_until(T: pd.Timestamp, rgs: List[Tuple[pd.Timestamp, pd.Timestamp, int]],
883
+ start_ts: pd.Timestamp) -> int:
884
+ """
885
+ Estimate rows in [start_ts, T] by assuming uniform distribution within each row-group
886
+ between its (min_time, max_time). Uses only metadata.
887
+ """
888
+ if T < start_ts:
889
+ return 0
890
+ total = 0.0
891
+ Ts = T.value
892
+ Ss = start_ts.value
893
+ for (mn, mx, rows) in rgs:
894
+ a = max(Ss, mn.value)
895
+ b = min(Ts, mx.value)
896
+ if b <= a:
897
+ continue
898
+ denom = (mx.value - mn.value)
899
+ if denom <= 0:
900
+ # degenerate: all timestamps equal; include whole group if it intersects
901
+ total += float(rows)
902
+ else:
903
+ frac = (b - a) / denom
904
+ total += frac * float(rows)
905
+ return int(total)
906
+
907
+ # --- resolve cap + effective time window -------------------------------
908
+
909
+ if max_rows is None:
910
+ env_val = os.getenv("MAX_READ_ROWS") or os.getenv("TDAG_MAX_READ_ROWS")
911
+ try:
912
+ max_rows = int(str(env_val).replace(",", "_")) if env_val is not None else 10_000_000
913
+ except Exception:
914
+ max_rows = 10_000_000
915
+ if now is None:
916
+ now = datetime.datetime.now(datetime.timezone.utc)
917
+
918
+ # Normalize inputs
919
+ start_ts = _to_utc_ts(start)
920
+ end_ts = _to_utc_ts(end)
921
+ uirm = None if unique_identifier_range_map is None else {k: dict(v) for k, v in
922
+ unique_identifier_range_map.items()}
923
+
924
+ # If ids are given without a range map, create a simple one from start/end
925
+ if ids and (uirm is None):
926
+ uirm = {
927
+ uid: {
928
+ "start_date": start_ts.to_pydatetime() if start_ts is not None else None,
929
+ "start_date_operand": ">=",
930
+ "end_date": (end_ts or _to_utc_ts(now)).to_pydatetime() if (end_ts or now) is not None else None,
931
+ "end_date_operand": "<=",
932
+ } for uid in ids
933
+ }
934
+
935
+ # Compute global window from inputs
936
+ if uirm:
937
+ eff_start = _effective_start_from_range_map(uirm)
938
+ eff_end_in_map = _effective_end_from_range_map(uirm)
939
+ else:
940
+ eff_start = start_ts
941
+ eff_end_in_map = None
942
+
943
+ eff_end = end_ts or eff_end_in_map or _to_utc_ts(now)
944
+
945
+ # If start still unknown, derive from metadata minima later
946
+ # If end still unknown, use now
947
+ if eff_end is None:
948
+ eff_end = _to_utc_ts(now)
949
+
950
+ # --- collect candidate files via partition pruning ---------------------
951
+
952
+ base = f"{self.db_path}/{table}"
953
+ selector = pa_fs.FileSelector(base, recursive=True)
954
+ files = [
955
+ info.path
956
+ for info in self._fs.get_file_info(selector)
957
+ if info.type == pa_fs.FileType.File and info.path.endswith(".parquet")
958
+ ]
959
+
960
+ # Gather row-group metadata for relevant files
961
+ row_groups: List[Tuple[pd.Timestamp, pd.Timestamp, int]] = []
962
+ files_considered = 0
963
+ files_skipped_part = 0
964
+ files_meta_errors = 0
965
+
966
+ # Use partition bounds to prune before opening footers
967
+ for path in files:
968
+ p_start, p_end = _parse_part_bounds_from_path(path)
969
+ # If we don't know start yet, we must not prune too aggressively; include all and tighten later
970
+ if eff_start is not None and p_start is not None and p_end is not None:
971
+ if p_end < eff_start or p_start > eff_end:
972
+ files_skipped_part += 1
973
+ continue
974
+ rgs, err = _collect_row_groups_meta(path)
975
+ if err is not None:
976
+ files_meta_errors += 1
977
+ # Fall back: if partition range overlaps, take whole file as one group
978
+ if p_start is not None and p_end is not None:
979
+ # To avoid undercounting, include the file-level partition range as a single group
980
+ # with the file's row count from footer if available; otherwise skip.
981
+ try:
982
+ pf = pq.ParquetFile(path, filesystem=self._fs)
983
+ nrows_file = pf.metadata.num_rows
984
+ row_groups.append((p_start or eff_start or _to_utc_ts(now), p_end or eff_end, nrows_file))
985
+ files_considered += 1
986
+ except Exception:
987
+ pass
988
+ continue
989
+ if rgs:
990
+ row_groups.extend(rgs)
991
+ files_considered += 1
992
+
993
+ if not row_groups:
994
+ # No metadata found; nothing to constrain
995
+ diagnostics = {
996
+ "reason": "no_row_groups_found",
997
+ "max_rows": max_rows,
998
+ "files_considered": files_considered,
999
+ "files_skipped_partition": files_skipped_part,
1000
+ "files_meta_errors": files_meta_errors,
1001
+ }
1002
+ return start_ts, end_ts or _to_utc_ts(now), uirm, diagnostics
1003
+
1004
+ # If start still None, derive earliest available tmin from metadata
1005
+ if eff_start is None:
1006
+ eff_start = min(mn for (mn, mx, _) in row_groups)
1007
+ # Clamp if caller passed a later explicit start
1008
+ if start_ts is not None:
1009
+ eff_start = max(eff_start, start_ts)
1010
+
1011
+ # Filter row-groups that can intersect [eff_start, eff_end]
1012
+ row_groups = [rg for rg in row_groups if not (rg[1] < eff_start or rg[0] > eff_end)]
1013
+ if not row_groups:
1014
+ diagnostics = {
1015
+ "reason": "no_groups_in_window",
1016
+ "window": [str(eff_start), str(eff_end)],
1017
+ "max_rows": max_rows
1018
+ }
1019
+ return eff_start, eff_end, uirm, diagnostics
1020
+
1021
+ # --- estimate rows & binary search limit_dt ----------------------------
1022
+
1023
+ # Quick check: rows at eff_end
1024
+ est_rows_at_end = _rows_estimate_until(eff_end, row_groups, eff_start)
1025
+ if est_rows_at_end <= max_rows:
1026
+ diagnostics = {
1027
+ "limited": False,
1028
+ "estimated_rows": est_rows_at_end,
1029
+ "max_rows": max_rows,
1030
+ "limit_dt": None,
1031
+ "files_considered": files_considered,
1032
+ "files_skipped_partition": files_skipped_part,
1033
+ "files_meta_errors": files_meta_errors,
1034
+ "row_groups_considered": len(row_groups),
1035
+ "mode": "row_group_metadata",
1036
+ }
1037
+ # Nothing to change
1038
+ return start_ts or eff_start, end_ts or eff_end, uirm, diagnostics
1039
+
1040
+ # Binary search for latest T in [eff_start, eff_end] with est_rows <= max_rows
1041
+ lo = eff_start.value
1042
+ hi = eff_end.value
1043
+ for _ in range(64):
1044
+ mid = (lo + hi) // 2
1045
+ mid_ts = pd.Timestamp(mid, tz="UTC")
1046
+ est = _rows_estimate_until(mid_ts, row_groups, eff_start)
1047
+ if est > max_rows:
1048
+ hi = mid
1049
+ else:
1050
+ lo = mid
1051
+ limit_dt = pd.Timestamp(lo, tz="UTC")
1052
+ est_at_limit = _rows_estimate_until(limit_dt, row_groups, eff_start)
1053
+
1054
+ # --- produce adjusted outputs ------------------------------------------
1055
+
1056
+ adjusted_start = start_ts or eff_start
1057
+ adjusted_end = min(end_ts or eff_end, limit_dt)
1058
+
1059
+ adjusted_uirm = None
1060
+ if uirm is not None:
1061
+ adjusted_uirm = {}
1062
+ for uid, info in uirm.items():
1063
+ new_info = dict(info)
1064
+ # tighten end_date by limit_dt
1065
+ cur_end = info.get("end_date")
1066
+ cur_end_ts = _to_utc_ts(cur_end) if cur_end is not None else eff_end
1067
+ tight_end = min(cur_end_ts, limit_dt)
1068
+ new_info["end_date"] = tight_end.to_pydatetime()
1069
+ # preserve or default operand
1070
+ if "end_date_operand" not in new_info:
1071
+ new_info["end_date_operand"] = "<="
1072
+ adjusted_uirm[uid] = new_info
1073
+
1074
+ diagnostics = {
1075
+ "limited": True,
1076
+ "limit_dt": str(limit_dt),
1077
+ "estimated_rows_at_limit": est_at_limit,
1078
+ "estimated_rows_full": est_rows_at_end,
1079
+ "max_rows": max_rows,
1080
+ "files_considered": files_considered,
1081
+ "files_skipped_partition": files_skipped_part,
1082
+ "files_meta_errors": files_meta_errors,
1083
+ "row_groups_considered": len(row_groups),
1084
+ "mode": "row_group_metadata",
1085
+ "effective_window_before": [str(eff_start), str(eff_end)],
1086
+ "effective_window_after": [str(adjusted_start), str(adjusted_end)],
1087
+ }
1088
+
1089
+ return adjusted_start, adjusted_end, adjusted_uirm or uirm, diagnostics
1090
+
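A sketch of capping a read with the method above (the 1_000_000 cap, table name and identifier are example values, not defaults):

    import datetime

    start = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
    adj_start, adj_end, adj_map, diag = db.constrain_read(
        "prices", start=start, ids=["AAPL"], max_rows=1_000_000
    )
    df = db.read("prices", start=adj_start, end=adj_end,
                 unique_identifier_range_map=adj_map)
    # diag["limited"] indicates whether the end of the window was pulled in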
1091
+ def read(
1092
+ self,
1093
+ table: str,
+ data_frequency: DataFrequency = DataFrequency.one_m,
1094
+ *,
1095
+ start: Optional[datetime.datetime] = None,
1096
+ end: Optional[datetime.datetime] = None,
1097
+ great_or_equal: bool = True, # Changed back to boolean
1098
+ less_or_equal: bool = True, # Changed back to boolean
1099
+ ids: Optional[List[str]] = None,
1100
+ columns: Optional[List[str]] = None,
1101
+ unique_identifier_range_map: Optional[UniqueIdentifierRangeMap] = None,
1102
+ column_range_descriptor: Optional[Dict[str,UniqueIdentifierRangeMap]] = None
1103
+ ) -> pd.DataFrame:
1104
+ """
1105
+ Reads data from the specified table, with optional filtering.
1106
+ Handles missing tables by returning an empty DataFrame.
1107
+
1108
+ Args:
1109
+ table (str): The name of the table to read from.
1110
+ start (Optional[datetime.datetime]): Minimum time_index filter.
1111
+ end (Optional[datetime.datetime]): Maximum time_index filter.
1112
+ great_or_equal (bool): If True, use >= for start date comparison. Defaults to True.
1113
+ less_or_equal (bool): If True, use <= for end date comparison. Defaults to True.
1114
+ ids (Optional[List[str]]): List of specific unique_identifiers to include.
1115
+ columns (Optional[List[str]]): Specific columns to select. Reads all if None.
1116
+ unique_identifier_range_map (Optional[UniqueIdentifierRangeMap]):
1117
+ A map where keys are unique_identifiers and values are dicts specifying
1118
+ date ranges (start_date, end_date, start_date_operand, end_date_operand)
1119
+ for that identifier. Mutually exclusive with 'ids'.
1120
+
1121
+ Returns:
1122
+ pd.DataFrame: The queried data, or an empty DataFrame if the table doesn't exist.
1123
+
1124
+ Raises:
1125
+ ValueError: If both `ids` and `unique_identifier_range_map` are provided.
1126
+ """
1127
+ # Map boolean flags to operator strings internally
1128
+ start_operator = '>=' if great_or_equal else '>'
1129
+ end_operator = '<=' if less_or_equal else '<'
1130
+
1131
+ if ids is not None and unique_identifier_range_map is not None:
1132
+ raise ValueError("Cannot provide both 'ids' and 'unique_identifier_range_map'.")
1133
+
1134
+ logger.debug(
1135
+ f"Duck DB: Reading from table '{table}' with filters: start={start}, end={end}, "
1136
+ f"ids={ids is not None}, columns={columns}, range_map={unique_identifier_range_map is not None}"
1137
+ )
1138
+
1139
+
1140
+ if columns is not None:
+ if not self.table_exists(table):
+ logger.warning(f"Table '{table}' does not exist in {self.db_path}. Returning empty DataFrame.")
+ return pd.DataFrame()
+ df_cols = self.con.execute(f'SELECT * FROM "{table}" AS _q LIMIT 0').fetch_df()
+ if any(c not in df_cols.columns for c in columns):
+ logger.warning(f"Not all requested columns {columns} are present in table '{table}'. Returning an empty DataFrame.")
+ return pd.DataFrame()
1146
+
1147
+ cols_select = "*"
1148
+ if columns:
1149
+ required_cols = {"time_index", "unique_identifier"}
1150
+ select_set = set(columns) | required_cols
1151
+ cols_select = ", ".join(f'"{c}"' for c in select_set)
1152
+
1153
+ sql_parts = [f'SELECT {cols_select} FROM "{table}"']
1154
+ params = []
1155
+ where_clauses = []
1156
+
1157
+ # --- Build WHERE clauses ---
1158
+ if start is not None:
1159
+ where_clauses.append(f"time_index {start_operator} ?")
1160
+ params.append(start.replace(tzinfo=None) if start.tzinfo else start)
1161
+ if end is not None:
1162
+ where_clauses.append(f"time_index {end_operator} ?")
1163
+ params.append(end.replace(tzinfo=None) if end.tzinfo else end)
1164
+ if ids:
1165
+ if not isinstance(ids, list): ids = list(ids)
1166
+ if ids:
1167
+ placeholders = ", ".join("?" for _ in ids)
1168
+ where_clauses.append(f"unique_identifier IN ({placeholders})")
1169
+ params.extend(ids)
1170
+ if unique_identifier_range_map:
1171
+ range_conditions = []
1172
+ for uid, date_info in unique_identifier_range_map.items():
1173
+ uid_conditions = ["unique_identifier = ?"]
1174
+ range_params = [uid]
1175
+ # Use operands from map if present, otherwise default to >= and <=
1176
+ s_op = date_info.get('start_date_operand', '>=')
1177
+ e_op = date_info.get('end_date_operand', '<=')
1178
+ if date_info.get('start_date'):
1179
+ uid_conditions.append(f"time_index {s_op} ?")
1180
+ s_date = date_info['start_date']
1181
+ range_params.append(s_date.replace(tzinfo=None) if s_date.tzinfo else s_date)
1182
+ if date_info.get('end_date'):
1183
+ uid_conditions.append(f"time_index {e_op} ?")
1184
+ e_date = date_info['end_date']
1185
+ range_params.append(e_date.replace(tzinfo=None) if e_date.tzinfo else e_date)
1186
+ range_conditions.append(f"({' AND '.join(uid_conditions)})")
1187
+ params.extend(range_params)
1188
+ if range_conditions:
1189
+ where_clauses.append(f"({' OR '.join(range_conditions)})")
1190
+
1191
+ if where_clauses: sql_parts.append("WHERE " + " AND ".join(where_clauses))
1192
+ sql_parts.append("ORDER BY time_index")
1193
+ query = " ".join(sql_parts)
1194
+ logger.debug(f"Executing read query: {query} with params: {params}")
1195
+
1196
+ try:
+ if not self.table_exists(table):
+ logger.warning(f"Table '{table}' does not exist in {self.db_path}. Returning empty DataFrame.")
+ return pd.DataFrame()
+
1199
+ df = self.con.execute(query, params).fetch_df()
1200
+
1201
+ if not df.empty:
1202
+ schema = self.con.execute(f'PRAGMA table_info("{table}")').fetchall()
1203
+ type_map = {
1204
+ name: self._duck_to_pandas(duck_type, data_frequency=data_frequency)
1205
+ for cid, name, duck_type, notnull, default, pk in schema
1206
+ if name in df.columns
1207
+ }
1208
+ for col, target_type in type_map.items():
1209
+ try:
1210
+ if target_type == "datetime64[ns, UTC]":
1211
+ arr =df[col].values
1212
+ arr_ns = arr.astype("datetime64[ns]")
1213
+ df[col] =pd.Series(
1214
+ pd.DatetimeIndex(arr_ns, tz="UTC"),
1215
+ index=df.index,
1216
+ name=col,
1217
+ )
1218
+ elif target_type == "datetime64[ns]":
1219
+ df[col] = pd.to_datetime(df[col], errors='coerce')
1220
+ else:
+ # nullable pandas dtypes and plain numpy dtypes take the same astype call
+ df[col] = df[col].astype(target_type, errors='ignore')
1225
+ except Exception as type_e:
1226
+ logger.warning(f"Could not coerce column '{col}' to type '{target_type}': {type_e}")
1227
+
1228
+ logger.debug(f"Read {len(df)} rows from table '{table}'.")
1229
+ return df
1230
+
1231
+ return pd.DataFrame()
1232
+
1233
+ except duckdb.CatalogException as e:
1234
+ logger.warning(f"CatalogException for table '{table}': {e}. Returning empty DataFrame.")
1235
+ return pd.DataFrame()
1236
+ except duckdb.Error as e:
1237
+ logger.error(f"Failed to read data from table '{table}': {e}")
1238
+ raise
1239
+ except Exception as e:
1240
+ logger.exception(f"An unexpected error occurred during read from table '{table}': {e}")
1241
+ raise
1242
+
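Reading with a per-identifier date window via the method above (identifier, dates and column are illustrative):

    import datetime

    rng = {
        "AAPL": {
            "start_date": datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
            "start_date_operand": ">=",
            "end_date": datetime.datetime(2024, 6, 30, tzinfo=datetime.timezone.utc),
            "end_date_operand": "<=",
        }
    }
    df = db.read("prices", columns=["close"], unique_identifier_range_map=rng)
    # time_index and unique_identifier are always added to the projection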
1243
+ def drop_table(self, table: str) -> None:
1244
+ """
1245
+ Drops the specified table and corresponding view from the database.
1246
+
1247
+ Args:
1248
+ table (str): The name of the table/view to drop.
1249
+ """
1250
+ logger.debug(f"Attempting to drop table and view '{table}' from {self.db_path}")
1251
+ try:
1252
+ # Drop the view first (if it exists)
1253
+ self.con.execute(f'DROP VIEW IF EXISTS "{table}"')
1254
+ logger.debug(f"Dropped view '{table}' (if it existed).")
1255
+
1256
+ # Then drop the table (if it exists)
1257
+ self.con.execute(f'DROP TABLE IF EXISTS "{table}"')
1258
+ logger.debug(f"Dropped table '{table}' (if it existed).")
1259
+
1260
+ except duckdb.Error as e:
1261
+ logger.error(f"Failed to drop table/view '{table}': {e}")
1262
+ raise
1263
+ except Exception as e:
1264
+ logger.exception(f"An unexpected error occurred while dropping table/view '{table}': {e}")
1265
+ raise
1266
+
1267
+ def list_tables(self) -> List[str]:
1268
+ """
1269
+ Returns names of all tables and views in the main schema.
1270
+ """
1271
+ try:
1272
+ rows = self.con.execute("SHOW TABLES").fetchall()
1273
+ return [r[0] for r in rows]
1274
+ except duckdb.Error as e:
1275
+ logger.error(f"Error listing tables/views in {self.db_path}: {e}")
1276
+ return []
1277
+
1278
+
1279
+ # ──────────────────────────────────────────────────────────────────────────────
1280
+ # Private helpers
1281
+ # ──────────────────────────────────────────────────────────────────────────────
1282
+
1283
+ def _ensure_view(self, table: str) -> None:
1284
+ """
1285
+ CREATE OR REPLACE a view named `table` that:
1286
+ * reads all Parquet under self.db_path/table/**
1287
+ * hides partition columns (year, month, day)
1288
+ * locks column dtypes by explicit CASTs
1289
+ Schema is derived by unifying schemas across all partitions.
1290
+ """
1291
+ partition_cols = {"year", "month", "day"}
1292
+
1293
+ def qident(name: str) -> str:
1294
+ """Helper to safely quote identifiers for SQL."""
1295
+ return '"' + str(name).replace('"', '""') + '"'
1296
+
1297
+ file_glob = f"{self.db_path}/{table}/**/*.parquet"
1298
+
1299
+ # ✅ Key Change 1: Define a single, robust way to read the data.
1300
+ # This uses union_by_name=True to handle schema differences across files.
1301
+ read_clause = f"read_parquet('{file_glob}', union_by_name = True, hive_partitioning = TRUE)"
1302
+
1303
+ try:
1304
+ # ✅ Key Change 2: Use the robust read_clause for schema discovery.
1305
+ # This now correctly gets all columns from all partitions.
1306
+ desc_rows = self.con.execute(f"DESCRIBE SELECT * FROM {read_clause}").fetchall()
1307
+ except duckdb.Error as e:
1308
+ # No files yet or glob fails — skip (keeps existing view if any)
1309
+ logger.warning(f"_ensure_view: cannot scan files for '{table}': {e}")
1310
+ return
1311
+
1312
+ # Build CAST list, dropping partition columns
1313
+ cols = [(r[0], r[1]) for r in desc_rows if r and r[0] not in partition_cols]
1314
+ if not cols:
1315
+ logger.warning(f"_ensure_view: no non-partition columns for '{table}'. Skipping view refresh.")
1316
+ return
1317
+
1318
+ # Build the list of columns with explicit CASTs to enforce types
1319
+ select_exprs = [f"CAST({qident(name)} AS {coltype}) AS {qident(name)}"
1320
+ for name, coltype in cols]
1321
+ select_list = ",\n ".join(select_exprs)
1322
+
1323
+ # ✅ Key Change 3: Fix the DDL to be syntactically correct and use the robust read_clause.
1324
+ ddl = f"""
1325
+ CREATE OR REPLACE VIEW {qident(table)} AS
1326
+ SELECT
1327
+ {select_list}
1328
+ FROM {read_clause}
1329
+ """
1330
+
1331
+ self._execute_transaction(ddl)
1332
+
1333
+ def _partition_path(self, keys: dict,table:str) -> str:
1334
+ parts = [f"{k}={int(v):02d}" if k != "year" else f"{k}={int(v):04d}"
1335
+ for k, v in keys.items()]
1336
+ return f"{self.db_path}/{table}/" + "/".join(parts)
1337
+
1338
+ def _partition_keys(self, ts: pd.Series,data_frequency:DataFrequency) -> dict:
1339
+ """Return a dict of partition column → Series."""
1340
+ keys = {"year": ts.dt.year.astype(str), "month": ts.dt.month.astype(str)}
1341
+ if data_frequency == "minute":
1342
+ keys["day"] = ts.dt.day.astype(str)
1343
+ return keys
1344
+
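For illustration, the two helpers above turn a timestamp into a Hive-style directory; the table name is hypothetical:

    import pandas as pd
    from mainsequence.client.utils import DataFrequency  # target of the relative "..utils" import above

    ts = pd.Series(pd.to_datetime(["2024-07-03 10:00"], utc=True))
    keys = db._partition_keys(ts, data_frequency=DataFrequency.one_m)
    path = db._partition_path({k: v.iloc[0] for k, v in keys.items()}, table="prices")
    # -> "<db_path>/prices/year=2024/month=07"; a day=DD level is appended for minute-partitioned data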
1345
+ def _execute_transaction(self, sql: str) -> None:
1346
+ """
1347
+ Run a single-statement SQL in a BEGIN/COMMIT block,
1348
+ rolling back on any failure.
1349
+ """
1350
+ try:
1351
+ self.con.execute("BEGIN TRANSACTION;")
1352
+ self.con.execute(sql)
1353
+ self.con.execute("COMMIT;")
1354
+ except Exception:
1355
+ # best-effort rollback (if inside a failed transaction)
1356
+ try:
1357
+ self.con.execute("ROLLBACK;")
1358
+ except Exception:
1359
+ pass
1360
+ raise
1361
+ @staticmethod
1362
+ def _pandas_to_duck(dtype) -> str:
1363
+ """
1364
+ Minimal dtype → DuckDB mapping. Extend as needed.
1365
+ """
1366
+ if (pd.api.types.is_datetime64_any_dtype(dtype)
1367
+ or pd.api.types.is_datetime64tz_dtype(dtype)):
1368
+ return "TIMESTAMPTZ"
1369
+ if pd.api.types.is_integer_dtype(dtype):
1370
+ return "BIGINT"
1371
+ if pd.api.types.is_float_dtype(dtype):
1372
+ return "DOUBLE"
1373
+ if pd.api.types.is_bool_dtype(dtype):
1374
+ return "BOOLEAN"
1375
+ return "VARCHAR"
1376
+
1377
+ @staticmethod
1378
+ def _duck_to_pandas(duck_type: str,data_frequency:DataFrequency):
1379
+ """
1380
+ Minimal DuckDB → pandas dtype mapping.
1381
+ Returns the dtype object (preferred) so that
1382
+ `df.astype({...})` gets pandas’ nullable dtypes.
1383
+ Extend as needed.
1384
+ """
1385
+ dt = duck_type.upper()
1386
+
1387
+ # --- datetimes ------------------------------------------------------
1388
+ if dt in ("TIMESTAMPTZ", "TIMESTAMP WITH TIME ZONE"):
1389
+ # keep the UTC tz-awareness
1390
+ return "datetime64[ns, UTC]"
1391
+
1392
+
1393
+ if dt in ("TIMESTAMP", "DATETIME",):
1394
+ # keep timezone if present; duckdb returns tz‑aware objects already,
1395
+ # so no explicit 'UTC' suffix is needed here.
1396
+ return "datetime64[ns]"
1397
+ if dt == "DATE":
1398
+ return "datetime64[ns]" # pandas treats it as midnight
1399
+
1400
+ # --- integers -------------------------------------------------------
1401
+ if dt in ("TINYINT", "SMALLINT", "INTEGER", "INT", "BIGINT"):
1402
+ return pd.Int64Dtype() # nullable 64‑bit int
1403
+
1404
+ # --- floats / numerics ---------------------------------------------
1405
+ if dt in ("REAL", "FLOAT", "DOUBLE", "DECIMAL"):
1406
+ return "float64"
1407
+
1408
+ # --- booleans -------------------------------------------------------
1409
+ if dt == "BOOLEAN":
1410
+ return pd.BooleanDtype() # nullable boolean
1411
+
1412
+ # --- everything else ------------------------------------------------
1413
+ return pd.StringDtype() # pandas' native nullable string
1414
+
1415
+ # ─────────────────────────────────────────────────────────────────────── #
1416
+ # 3. OVERNIGHT DEDUP & COMPACTION #
1417
+ # ─────────────────────────────────────────────────────────────────────── #
1418
+ def overnight_dedup(self, table: str, date: Optional[datetime.date] = None,
+ data_frequency: DataFrequency = DataFrequency.one_m) -> None:
1419
+ """
1420
+ Keep only the newest row per (time_index, unique_identifier)
1421
+ for each partition, coalesce small files into one Parquet file.
1422
+
1423
+ Run this once a day during low‑traffic hours.
1424
+ """
1425
+ # --- select partitions to touch ------------------------------------
1426
+ base = f"{self.db_path}/{table}"
1427
+ selector = fs.FileSelector(base, recursive=True)
1428
+ dirs = {info.path.rpartition("/")[0] for info in self._fs.get_file_info(selector)
1429
+ if info.type == fs.FileType.File
1430
+ and info.path.endswith(".parquet")}
1431
+
1432
+ if date:
1433
+ y, m, d = date.year, date.month, date.day
1434
+ dirs = {p for p in dirs if
1435
+ f"year={y:04d}" in p and f"month={m:02d}" in p
1436
+ and (data_frequency != "minute" or f"day={d:02d}" in p)}
1437
+
1438
+ for part_path in sorted(dirs):
1439
+ tmp_file = f"{part_path}/compact-{uuid.uuid4().hex}.parquet"
1440
+
1441
+ # DuckDB SQL: window‑deduplicate & write in one shot
1442
+ copy_sql = f"""
1443
+ COPY (
1444
+ SELECT *
1445
+ FROM (
1446
+ SELECT *,
1447
+ ROW_NUMBER() OVER (
1448
+ PARTITION BY time_index, unique_identifier
1449
+ ORDER BY filename DESC
+ ) AS rn
+ FROM parquet_scan('{part_path}/*.parquet',
+ hive_partitioning=TRUE,
+ filename=true) -- exposes the 'filename' column
1454
+ )
1455
+ WHERE rn = 1
1456
+ )
1457
+ TO '{tmp_file}'
1458
+ (FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 512000)
1459
+ """
1460
+ self.con.execute(copy_sql)
1461
+
1462
+ # remove old fragments & leave only the compacted file
1463
+ for info in self._fs.get_file_info(fs.FileSelector(part_path)):
1464
+ if info.type == fs.FileType.File and info.path != tmp_file:
1465
+ self._fs.delete_file(info.path)
1466
+
1467
+ # Optionally rename to a deterministic name; here we just keep tmp_file
1468
+ logger.info(f"Compacted + de‑duplicated partition {part_path}")