maxc-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxc_cli/__init__.py +5 -0
- maxc_cli/__main__.py +6 -0
- maxc_cli/app.py +3406 -0
- maxc_cli/audit.py +18 -0
- maxc_cli/auth_providers.py +471 -0
- maxc_cli/backend/__init__.py +8 -0
- maxc_cli/backend/auth.py +144 -0
- maxc_cli/backend/data.py +87 -0
- maxc_cli/backend/job.py +304 -0
- maxc_cli/backend/meta.py +312 -0
- maxc_cli/backend/odps.py +130 -0
- maxc_cli/backend/query.py +148 -0
- maxc_cli/cache.py +662 -0
- maxc_cli/cli.py +1274 -0
- maxc_cli/config.py +406 -0
- maxc_cli/exceptions.py +99 -0
- maxc_cli/helpers.py +964 -0
- maxc_cli/models.py +533 -0
- maxc_cli/output.py +75 -0
- maxc_cli/store.py +123 -0
- maxc_cli/utils.py +136 -0
- maxc_cli-0.1.0.dist-info/METADATA +220 -0
- maxc_cli-0.1.0.dist-info/RECORD +26 -0
- maxc_cli-0.1.0.dist-info/WHEEL +5 -0
- maxc_cli-0.1.0.dist-info/entry_points.txt +2 -0
- maxc_cli-0.1.0.dist-info/top_level.txt +1 -0
maxc_cli/cache.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
"""SQLite-based local cache for query sessions and metadata."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import sqlite3
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Generator
|
|
10
|
+
|
|
11
|
+
from .exceptions import ValidationError
|
|
12
|
+
from .utils import now_utc_iso
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Sentinel distinguishing "no default supplied" from an explicit ``None``.
_UNSET = object()


def _safe_json_loads(text, default=_UNSET):
    """Decode *text* as JSON without ever raising.

    Args:
        text: The JSON document to decode. Falsy values (``None``, ``""``)
            are treated as "nothing stored".
        default: Value handed back when *text* is falsy or cannot be decoded.
            When omitted, a fresh empty list is used.

    Returns:
        The decoded object, or the fallback value described above.
    """
    fallback = [] if default is _UNSET else default
    if not text:
        return fallback
    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        # Malformed JSON or a non-string payload: degrade to the fallback.
        return fallback
    return parsed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LocalCache:
    """Lightweight SQLite cache for query sessions and metadata.

    A single ``cache.db`` file inside *cache_dir* stores query sessions,
    table metadata, AI-generated semantic metadata, an FTS5 keyword index and
    cache-build progress records.

    Every public method opens its own short-lived connection through
    :meth:`_connect`, which commits on success, rolls back on
    ``sqlite3.Error`` and re-raises such errors as ``ValidationError``.
    """

    _INIT_RETRIES = 5

    # Columns selected by the full table_metadata readers.
    _TABLE_COLUMNS = (
        "table_name, schema_name, description, columns_json, partitions_json, "
        "row_count, size_bytes, owner, updated_at"
    )

    # Columns selected by the table_semantic readers that list several tables.
    _SEMANTIC_COLUMNS = (
        "table_name, schema_name, semantic_desc, use_cases, sample_questions, "
        "column_semantics_json, relations_json, stats_json, generated_at, generated_by"
    )

    # Columns selected by the cache_build_status readers.
    _BUILD_COLUMNS = (
        "project, build_id, status, total_tables, processed_tables, failed_tables, "
        "started_at, completed_at, error_message"
    )

    # NOTE: table_fts deliberately stores its content (no ``content=''``).
    # A contentless FTS5 table returns NULL for every column, which made the
    # table_name / schema_name / project values unreadable in fts_search.
    _SCHEMA_SQL = """
        CREATE TABLE IF NOT EXISTS query_sessions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            job_id TEXT NOT NULL,
            project TEXT NOT NULL,
            sql TEXT,
            created_at TEXT NOT NULL
        );
        CREATE INDEX IF NOT EXISTS idx_sessions_job_id ON query_sessions(job_id);
        CREATE INDEX IF NOT EXISTS idx_sessions_created ON query_sessions(created_at);

        CREATE TABLE IF NOT EXISTS table_metadata (
            project TEXT NOT NULL,
            schema_name TEXT NOT NULL DEFAULT 'default',
            table_name TEXT NOT NULL,
            description TEXT,
            columns_json TEXT NOT NULL,
            partitions_json TEXT,
            row_count INTEGER,
            size_bytes INTEGER,
            owner TEXT,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (project, schema_name, table_name)
        );
        CREATE INDEX IF NOT EXISTS idx_table_meta_project ON table_metadata(project);
        CREATE INDEX IF NOT EXISTS idx_table_meta_project_schema ON table_metadata(project, schema_name);
        CREATE INDEX IF NOT EXISTS idx_table_meta_table_name ON table_metadata(table_name);
        CREATE INDEX IF NOT EXISTS idx_table_meta_updated ON table_metadata(updated_at DESC);

        -- AI-generated semantic metadata for NL2SQL
        CREATE TABLE IF NOT EXISTS table_semantic (
            project TEXT NOT NULL,
            schema_name TEXT NOT NULL DEFAULT 'default',
            table_name TEXT NOT NULL,
            semantic_desc TEXT,
            use_cases TEXT,
            sample_questions TEXT,
            column_semantics_json TEXT,

            -- Relations and statistics
            relations_json TEXT,
            stats_json TEXT,

            -- Metadata
            embedding BLOB,
            generated_at TEXT NOT NULL,
            generated_by TEXT DEFAULT 'agent',
            version INTEGER DEFAULT 1,

            PRIMARY KEY (project, schema_name, table_name)
        );
        CREATE INDEX IF NOT EXISTS idx_semantic_project ON table_semantic(project);
        CREATE INDEX IF NOT EXISTS idx_semantic_project_schema ON table_semantic(project, schema_name);

        -- FTS5 full-text index for keyword search
        CREATE VIRTUAL TABLE IF NOT EXISTS table_fts USING fts5(
            project,
            table_name,
            schema_name,
            description,
            column_names,
            column_comments,
            semantic_desc,
            use_cases,
            tokenize='unicode61'
        );

        -- Cache build status tracking
        CREATE TABLE IF NOT EXISTS cache_build_status (
            project TEXT NOT NULL,
            build_id TEXT NOT NULL,
            status TEXT NOT NULL,
            total_tables INTEGER DEFAULT 0,
            processed_tables INTEGER DEFAULT 0,
            failed_tables INTEGER DEFAULT 0,
            started_at TEXT NOT NULL,
            completed_at TEXT,
            error_message TEXT,
            PRIMARY KEY (project, build_id)
        );
        CREATE INDEX IF NOT EXISTS idx_build_status_project ON cache_build_status(project);
        CREATE INDEX IF NOT EXISTS idx_build_status_started ON cache_build_status(started_at DESC);
    """

    def __init__(self, cache_dir: 'Path'):
        """Open (creating if necessary) the cache database under *cache_dir*.

        Args:
            cache_dir: Directory that holds ``cache.db``. It is created,
                including parents, when missing.

        Raises:
            ValidationError: If the directory cannot be created or the
                database cannot be initialized.
        """
        # Path() keeps Path inputs unchanged and additionally accepts str.
        self.db_path = Path(cache_dir) / "cache.db"
        try:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            raise ValidationError(
                f"Local cache directory is unavailable: {self.db_path.parent}",
                suggestion="Set `HOME` or `cache_dir` to a writable location before using cache-backed commands.",
            ) from exc
        self._init_db()

    def _init_db(self) -> 'None':
        """Create the schema, retrying while another process holds the lock.

        Up to ``_INIT_RETRIES`` attempts are made, sleeping a little longer
        after each lock error. Any other error, or a lock error on the last
        attempt, is re-raised as ``ValidationError``.
        """
        for attempt in range(self._INIT_RETRIES):
            try:
                with self._connect() as conn:
                    # Prefer WAL mode, but fall back to the current journal
                    # mode if another process is initializing the database.
                    # The raw sqlite3 error is caught here because translation
                    # to ValidationError only happens once the error leaves
                    # the _connect() context.
                    try:
                        conn.execute("PRAGMA journal_mode=WAL")
                    except sqlite3.OperationalError as exc:
                        if not self._is_lock_error(str(exc)):
                            raise
                    # Databases created by earlier versions carry a
                    # contentless FTS index whose columns cannot be read
                    # back; replace it and re-index from the stored metadata.
                    legacy_fts = self._has_contentless_fts(conn)
                    if legacy_fts:
                        conn.execute("DROP TABLE IF EXISTS table_fts")
                    conn.executescript(self._SCHEMA_SQL)
                    if legacy_fts:
                        self._rebuild_fts(conn)
                return
            except ValidationError as exc:
                if self._is_lock_error(exc.message) and attempt < self._INIT_RETRIES - 1:
                    time.sleep(0.05 * (attempt + 1))
                    continue
                raise

    @staticmethod
    def _has_contentless_fts(conn: 'sqlite3.Connection') -> 'bool':
        """Return True when ``table_fts`` exists and was declared with ``content=''``."""
        row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'table_fts'"
        ).fetchone()
        if row is None or not row[0]:
            return False
        # Compare with whitespace removed so formatting differences do not matter.
        return "content=''" in "".join(row[0].split())

    def _rebuild_fts(self, conn: 'sqlite3.Connection') -> 'None':
        """Re-index every table that has both semantic and cached metadata."""
        rows = conn.execute(
            """
            SELECT s.project, s.schema_name, s.table_name, s.semantic_desc, s.use_cases,
                   m.description, m.columns_json
            FROM table_semantic AS s
            JOIN table_metadata AS m
              ON m.project = s.project
             AND m.schema_name = s.schema_name
             AND m.table_name = s.table_name
            """
        ).fetchall()
        for row in rows:
            self._index_table(
                conn,
                row["project"],
                row["schema_name"],
                row["table_name"],
                row["description"],
                _safe_json_loads(row["columns_json"]),
                row["semantic_desc"],
                _safe_json_loads(row["use_cases"]),
            )

    @staticmethod
    def _index_table(
        conn: 'sqlite3.Connection',
        project: 'str',
        schema_name: 'str',
        table_name: 'str',
        description: 'str | None',
        columns: 'Any',
        semantic_desc: 'str | None',
        use_cases: 'Any',
    ) -> 'None':
        """Replace the FTS entry of one table.

        FTS5 tables have no unique key, so the previous entry is deleted
        explicitly; otherwise every save would add a duplicate search hit.
        Missing or ``None`` column names/comments are indexed as empty text.
        """
        column_dicts = [c for c in columns if isinstance(c, dict)] if isinstance(columns, list) else []
        col_names = " ".join(str(c.get("name") or "") for c in column_dicts)
        col_comments = " ".join(str(c.get("comment") or "") for c in column_dicts)
        if isinstance(use_cases, (list, tuple)):
            use_case_text = " ".join(str(item) for item in use_cases)
        else:
            use_case_text = str(use_cases) if use_cases else ""
        conn.execute(
            "DELETE FROM table_fts WHERE project = ? AND schema_name = ? AND table_name = ?",
            (project, schema_name, table_name),
        )
        conn.execute(
            "INSERT INTO table_fts(project, table_name, schema_name, description, column_names, column_comments, semantic_desc, use_cases) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                project,
                table_name,
                schema_name,
                description or "",
                col_names,
                col_comments,
                semantic_desc or "",
                use_case_text,
            ),
        )

    @contextmanager
    def _connect(self) -> 'Generator[sqlite3.Connection, None, None]':
        """Yield a connection that commits on success and always closes.

        ``sqlite3.Error`` raised while connecting, inside the ``with`` body or
        during commit is re-raised as ``ValidationError``; in the latter two
        cases the transaction is rolled back first. Rows are returned as
        ``sqlite3.Row`` so columns can be read by name.
        """
        # Increased timeout to 30 seconds to prevent lock contention in concurrent scenarios
        try:
            conn = sqlite3.connect(str(self.db_path), timeout=30.0)
        except sqlite3.Error as exc:
            raise self._translate_sqlite_error(exc) from exc
        conn.row_factory = sqlite3.Row
        try:
            yield conn
            conn.commit()
        except sqlite3.Error as exc:
            conn.rollback()
            raise self._translate_sqlite_error(exc) from exc
        finally:
            conn.close()

    def _translate_sqlite_error(self, exc: 'sqlite3.Error') -> 'ValidationError':
        """Build the ``ValidationError`` that corresponds to a SQLite failure.

        Lock contention and an unopenable database file get dedicated
        messages; everything else is reported generically.
        """
        message = str(exc)
        if self._is_lock_error(message):
            return ValidationError(
                f"Local cache is busy: {message}",
                suggestion="Retry the command in a moment, or avoid starting multiple maxc processes against the same cache at once.",
            )
        if "unable to open database file" in message.lower():
            return ValidationError(
                f"Local cache database is unavailable: {self.db_path}",
                suggestion="Set `HOME` or `cache_dir` to a writable location before using cache-backed commands.",
            )
        return ValidationError(
            f"Local cache error: {message}",
            suggestion="Check the cache path and local SQLite state before retrying.",
        )

    @staticmethod
    def _is_lock_error(message: 'str') -> 'bool':
        """Return True when *message* reports a locked database or table."""
        lowered = message.lower()
        return "database is locked" in lowered or "database table is locked" in lowered

    # ========== Query Sessions ==========

    def create_session(
        self,
        job_id: 'str',
        project: 'str',
        sql: 'str | None' = None,
    ) -> 'int':
        """Create a new query session, return session_id."""
        with self._connect() as conn:
            cursor = conn.execute(
                """
                INSERT INTO query_sessions (job_id, project, sql, created_at)
                VALUES (?, ?, ?, ?)
                """,
                (job_id, project, sql, now_utc_iso()),
            )
            return cursor.lastrowid  # type: ignore

    def get_session(self, session_id: 'int') -> 'dict[str, Any] | None':
        """Get session by id, or ``None`` when no such session exists."""
        with self._connect() as conn:
            row = conn.execute(
                "SELECT id, job_id, project, sql, created_at FROM query_sessions WHERE id = ?",
                (session_id,),
            ).fetchone()
        return dict(row) if row else None

    def find_session_by_job_id(self, job_id: 'str') -> 'dict[str, Any] | None':
        """Find the most recently created session for *job_id* (for deduplication)."""
        with self._connect() as conn:
            row = conn.execute(
                "SELECT id, job_id, project, sql, created_at FROM query_sessions WHERE job_id = ? ORDER BY id DESC LIMIT 1",
                (job_id,),
            ).fetchone()
        return dict(row) if row else None

    def cleanup_old_sessions(self, keep_hours: 'int' = 24) -> 'int':
        """Remove sessions older than keep_hours. Returns count deleted."""
        with self._connect() as conn:
            cursor = conn.execute(
                """
                DELETE FROM query_sessions
                WHERE datetime(created_at) < datetime('now', ?)
                """,
                (f"-{keep_hours} hours",),
            )
            return cursor.rowcount

    # ========== Table Metadata Cache ==========

    @staticmethod
    def _table_row_to_dict(row: 'sqlite3.Row') -> 'dict[str, Any]':
        """Convert a full table_metadata row into the public dict shape."""
        return {
            "table_name": row["table_name"],
            "schema_name": row["schema_name"],
            "description": row["description"],
            "columns": _safe_json_loads(row["columns_json"]),
            "partitions": _safe_json_loads(row["partitions_json"]),
            "row_count": row["row_count"],
            "size_bytes": row["size_bytes"],
            "owner": row["owner"],
            "updated_at": row["updated_at"],
        }

    def cache_table(
        self,
        project: 'str',
        table_name: 'str',
        description: 'str | None',
        columns: 'list[dict[str, Any]]',
        partitions: 'list[str] | None' = None,
        row_count: 'int | None' = None,
        size_bytes: 'int | None' = None,
        owner: 'str | None' = None,
        schema_name: 'str' = "default",
    ) -> 'None':
        """Cache table metadata, replacing any entry with the same key.

        The key is ``(project, schema_name, table_name)``. *columns* is stored
        as JSON; an empty or missing *partitions* list is stored as NULL.
        """
        with self._connect() as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO table_metadata
                (project, schema_name, table_name, description, columns_json, partitions_json, row_count, size_bytes, owner, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    project,
                    schema_name,
                    table_name,
                    description,
                    json.dumps(columns, ensure_ascii=False),
                    json.dumps(partitions, ensure_ascii=False) if partitions else None,
                    row_count,
                    size_bytes,
                    owner,
                    now_utc_iso(),
                ),
            )

    def get_cached_table(self, project: 'str', table_name: 'str', schema_name: 'str' = "default") -> 'dict[str, Any] | None':
        """Get cached table metadata, or ``None`` when the table is not cached."""
        with self._connect() as conn:
            row = conn.execute(
                f"""
                SELECT {self._TABLE_COLUMNS}
                FROM table_metadata WHERE project = ? AND schema_name = ? AND table_name = ?
                """,
                (project, schema_name, table_name),
            ).fetchone()
        return self._table_row_to_dict(row) if row else None

    def get_all_cached_tables(
        self, project: 'str', schema_name: 'str | None' = None
    ) -> 'list[dict[str, Any]]':
        """Get all cached tables for a project, optionally filtered by schema.

        Results are ordered by schema name, then table name.
        """
        where = "project = ?"
        params: 'list[Any]' = [project]
        if schema_name:
            where += " AND schema_name = ?"
            params.append(schema_name)
        with self._connect() as conn:
            rows = conn.execute(
                f"""
                SELECT {self._TABLE_COLUMNS}
                FROM table_metadata WHERE {where}
                ORDER BY schema_name, table_name
                """,
                params,
            ).fetchall()
        return [self._table_row_to_dict(row) for row in rows]

    def get_cache_stats(self, project: 'str', schema_name: 'str | None' = None) -> 'dict[str, Any]':
        """Get cache statistics.

        Returns:
            A dict with ``table_count`` plus the ``oldest`` and ``newest``
            ``updated_at`` values (both ``None`` when nothing is cached).
        """
        where = "project = ?"
        params: 'list[Any]' = [project]
        if schema_name:
            where += " AND schema_name = ?"
            params.append(schema_name)
        with self._connect() as conn:
            row = conn.execute(
                f"""
                SELECT COUNT(*) as count, MIN(updated_at) as oldest, MAX(updated_at) as newest
                FROM table_metadata WHERE {where}
                """,
                params,
            ).fetchone()
        return {
            "table_count": row["count"] if row else 0,
            "oldest": row["oldest"] if row else None,
            "newest": row["newest"] if row else None,
        }

    def get_schemas(self, project: 'str') -> 'list[str]':
        """Get all schema names that have cached tables for a project, sorted."""
        with self._connect() as conn:
            rows = conn.execute(
                """
                SELECT DISTINCT schema_name FROM table_metadata WHERE project = ? ORDER BY schema_name
                """,
                (project,),
            ).fetchall()
        return [row["schema_name"] for row in rows]

    def get_tables_by_name(self, project: 'str', table_name: 'str') -> 'list[dict[str, Any]]':
        """Get all tables with the given name across different schemas.

        The returned dicts are a reduced view: they carry no ``size_bytes``
        or ``owner`` keys.
        """
        with self._connect() as conn:
            rows = conn.execute(
                """
                SELECT schema_name, description, columns_json, partitions_json, row_count, updated_at
                FROM table_metadata WHERE project = ? AND table_name = ?
                """,
                (project, table_name),
            ).fetchall()
        return [
            {
                "schema_name": row["schema_name"],
                "table_name": table_name,
                "description": row["description"],
                "columns": _safe_json_loads(row["columns_json"]),
                "partitions": _safe_json_loads(row["partitions_json"]),
                "row_count": row["row_count"],
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]

    def clear_table_cache(self, project: 'str | None' = None, schema_name: 'str | None' = None) -> 'int':
        """Clear table metadata cache. If project is None, clear all.

        NOTE: *schema_name* is only honoured together with *project*; calling
        with a schema but no project still clears every cached table. Only
        ``table_metadata`` is touched; semantic metadata and the FTS index
        are left as they are.

        Returns:
            The number of rows deleted.
        """
        with self._connect() as conn:
            if project and schema_name:
                cursor = conn.execute(
                    "DELETE FROM table_metadata WHERE project = ? AND schema_name = ?",
                    (project, schema_name),
                )
            elif project:
                cursor = conn.execute(
                    "DELETE FROM table_metadata WHERE project = ?",
                    (project,),
                )
            else:
                cursor = conn.execute("DELETE FROM table_metadata")
            return cursor.rowcount

    # ========== Semantic Metadata (for NL2SQL) ==========

    @staticmethod
    def _semantic_fields(row: 'sqlite3.Row') -> 'dict[str, Any]':
        """Decode the semantic columns shared by the single and bulk readers."""
        return {
            "semantic_desc": row["semantic_desc"],
            "use_cases": _safe_json_loads(row["use_cases"]),
            "sample_questions": _safe_json_loads(row["sample_questions"]),
            "column_semantics": _safe_json_loads(row["column_semantics_json"]),
            "relations": _safe_json_loads(row["relations_json"]),
            "stats": _safe_json_loads(row["stats_json"], default=None),
            "generated_at": row["generated_at"],
            "generated_by": row["generated_by"],
        }

    def save_semantic(
        self,
        project: 'str',
        table_name: 'str',
        semantic_desc: 'str',
        use_cases: 'list[str]',
        sample_questions: 'list[str]',
        column_semantics: 'list[dict[str, Any]]',
        schema_name: 'str' = "default",
        relations: 'list[dict[str, Any]] | None' = None,
        stats: 'dict[str, Any] | None' = None,
        embedding: 'bytes | None' = None,
        generated_by: 'str' = "agent",
    ) -> 'None':
        """Save AI-generated semantic metadata for NL2SQL.

        The semantic row replaces any previous one for the same
        ``(project, schema_name, table_name)``. When table metadata for that
        key is cached, the table's FTS entry is refreshed in the same
        transaction; otherwise the FTS index is left untouched.
        """
        with self._connect() as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO table_semantic
                (project, schema_name, table_name, semantic_desc, use_cases, sample_questions,
                 column_semantics_json, relations_json, stats_json, embedding, generated_at, generated_by)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    project,
                    schema_name,
                    table_name,
                    semantic_desc,
                    json.dumps(use_cases, ensure_ascii=False),
                    json.dumps(sample_questions, ensure_ascii=False),
                    json.dumps(column_semantics, ensure_ascii=False),
                    json.dumps(relations, ensure_ascii=False) if relations else None,
                    json.dumps(stats, ensure_ascii=False) if stats else None,
                    embedding,
                    now_utc_iso(),
                    generated_by,
                ),
            )
            # Update FTS index. The metadata is read on this connection
            # rather than through get_cached_table() so no second connection
            # is opened while the write transaction is still pending.
            meta = conn.execute(
                "SELECT description, columns_json FROM table_metadata WHERE project = ? AND schema_name = ? AND table_name = ?",
                (project, schema_name, table_name),
            ).fetchone()
            if meta is not None:
                self._index_table(
                    conn,
                    project,
                    schema_name,
                    table_name,
                    meta["description"],
                    _safe_json_loads(meta["columns_json"]),
                    semantic_desc,
                    use_cases,
                )

    def get_semantic(self, project: 'str', table_name: 'str', schema_name: 'str' = "default") -> 'dict[str, Any] | None':
        """Get semantic metadata for a table, or ``None`` when none is stored."""
        with self._connect() as conn:
            row = conn.execute(
                """
                SELECT semantic_desc, use_cases, sample_questions, column_semantics_json,
                       relations_json, stats_json, generated_at, generated_by
                FROM table_semantic WHERE project = ? AND schema_name = ? AND table_name = ?
                """,
                (project, schema_name, table_name),
            ).fetchone()
        if row is None:
            return None
        return {"schema_name": schema_name, **self._semantic_fields(row)}

    def fts_search(self, query: 'str', limit: 'int' = 20, project: 'str | None' = None) -> 'list[dict[str, Any]]':
        """Full-text search across all indexed tables.

        Args:
            query: An FTS5 MATCH expression. A blank query yields no results.
            limit: Maximum number of hits to return.
            project: When given, only hits from this project are returned.

        Returns:
            Hits ordered best-first (ascending ``bm25`` score), each with
            ``table_name``, ``schema_name``, ``snippet`` and ``score``.

        Raises:
            ValidationError: If SQLite rejects the query, e.g. because of
                invalid FTS5 syntax.
        """
        if not query or not query.strip():
            return []
        # Column index -1 lets FTS5 pick the column the snippet is taken from.
        sql = (
            "SELECT table_name, schema_name, "
            "snippet(table_fts, -1, '<b>', '</b>', '...', 32) as match_snippet, "
            "bm25(table_fts) as score "
            "FROM table_fts WHERE table_fts MATCH ?"
        )
        params: 'list[Any]' = [query]
        if project:
            sql += " AND project = ?"
            params.append(project)
        sql += " ORDER BY score LIMIT ?"
        params.append(limit)
        with self._connect() as conn:
            rows = conn.execute(sql, params).fetchall()
        return [
            {
                "table_name": row["table_name"],
                "schema_name": row["schema_name"],
                "snippet": row["match_snippet"],
                "score": row["score"],
            }
            for row in rows
        ]

    def get_all_semantics(
        self, project: 'str', schema_name: 'str | None' = None
    ) -> 'list[dict[str, Any]]':
        """Get all semantic metadata for a project, optionally filtered by schema."""
        where = "project = ?"
        params: 'list[Any]' = [project]
        if schema_name:
            where += " AND schema_name = ?"
            params.append(schema_name)
        with self._connect() as conn:
            rows = conn.execute(
                f"""
                SELECT {self._SEMANTIC_COLUMNS}
                FROM table_semantic WHERE {where}
                """,
                params,
            ).fetchall()
        return [
            {
                "table_name": row["table_name"],
                "schema_name": row["schema_name"],
                **self._semantic_fields(row),
            }
            for row in rows
        ]

    # ========== Cache Build Status Tracking ==========

    @staticmethod
    def _with_progress(row: 'sqlite3.Row') -> 'dict[str, Any]':
        """Convert a build-status row to a dict and add ``progress_percent``.

        NULL counters are treated as 0 so a partially filled row cannot
        raise; a build without a positive table total reports 0 percent.
        """
        result = dict(row)
        total = result["total_tables"] or 0
        processed = result["processed_tables"] or 0
        if total > 0:
            result["progress_percent"] = int((processed / total) * 100)
        else:
            result["progress_percent"] = 0
        return result

    def start_build(self, project: 'str', build_id: 'str', total_tables: 'int') -> 'None':
        """Start a cache build process by recording it with status ``running``."""
        with self._connect() as conn:
            conn.execute(
                """
                INSERT INTO cache_build_status
                (project, build_id, status, total_tables, processed_tables, failed_tables, started_at)
                VALUES (?, ?, 'running', ?, 0, 0, ?)
                """,
                (project, build_id, total_tables, now_utc_iso()),
            )

    def update_build_progress(
        self, project: 'str', build_id: 'str', processed: 'int', failed: 'int'
    ) -> 'None':
        """Update cache build progress; builds that are not running are left unchanged."""
        with self._connect() as conn:
            conn.execute(
                """
                UPDATE cache_build_status
                SET processed_tables = ?, failed_tables = ?
                WHERE project = ? AND build_id = ? AND status = 'running'
                """,
                (processed, failed, project, build_id),
            )

    def complete_build(self, project: 'str', build_id: 'str', error_message: 'str | None' = None) -> 'None':
        """Mark cache build as finished.

        The status becomes ``failed`` (and the message is stored) when a
        non-empty *error_message* is given, ``completed`` otherwise.
        """
        with self._connect() as conn:
            if error_message:
                conn.execute(
                    """
                    UPDATE cache_build_status
                    SET status = 'failed', completed_at = ?, error_message = ?
                    WHERE project = ? AND build_id = ?
                    """,
                    (now_utc_iso(), error_message, project, build_id),
                )
            else:
                conn.execute(
                    """
                    UPDATE cache_build_status
                    SET status = 'completed', completed_at = ?
                    WHERE project = ? AND build_id = ?
                    """,
                    (now_utc_iso(), project, build_id),
                )

    def get_build_status(self, project: 'str', build_id: 'str | None' = None) -> 'dict[str, Any] | None':
        """Get cache build status. If build_id is None, get the latest build.

        Returns:
            The build row as a dict with an added ``progress_percent`` key,
            or ``None`` when no matching build exists.
        """
        with self._connect() as conn:
            if build_id:
                row = conn.execute(
                    f"""
                    SELECT {self._BUILD_COLUMNS}
                    FROM cache_build_status WHERE project = ? AND build_id = ?
                    """,
                    (project, build_id),
                ).fetchone()
            else:
                row = conn.execute(
                    f"""
                    SELECT {self._BUILD_COLUMNS}
                    FROM cache_build_status WHERE project = ?
                    ORDER BY started_at DESC LIMIT 1
                    """,
                    (project,),
                ).fetchone()
        return self._with_progress(row) if row else None

    def get_recent_builds(self, project: 'str', limit: 'int' = 10) -> 'list[dict[str, Any]]':
        """Get recent build history for a project, newest first."""
        with self._connect() as conn:
            rows = conn.execute(
                f"""
                SELECT {self._BUILD_COLUMNS}
                FROM cache_build_status WHERE project = ?
                ORDER BY started_at DESC LIMIT ?
                """,
                (project, limit),
            ).fetchall()
        return [self._with_progress(row) for row in rows]
|