maxc-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxc_cli/__init__.py +5 -0
- maxc_cli/__main__.py +6 -0
- maxc_cli/app.py +3406 -0
- maxc_cli/audit.py +18 -0
- maxc_cli/auth_providers.py +471 -0
- maxc_cli/backend/__init__.py +8 -0
- maxc_cli/backend/auth.py +144 -0
- maxc_cli/backend/data.py +87 -0
- maxc_cli/backend/job.py +304 -0
- maxc_cli/backend/meta.py +312 -0
- maxc_cli/backend/odps.py +130 -0
- maxc_cli/backend/query.py +148 -0
- maxc_cli/cache.py +662 -0
- maxc_cli/cli.py +1274 -0
- maxc_cli/config.py +406 -0
- maxc_cli/exceptions.py +99 -0
- maxc_cli/helpers.py +964 -0
- maxc_cli/models.py +533 -0
- maxc_cli/output.py +75 -0
- maxc_cli/store.py +123 -0
- maxc_cli/utils.py +136 -0
- maxc_cli-0.1.0.dist-info/METADATA +220 -0
- maxc_cli-0.1.0.dist-info/RECORD +26 -0
- maxc_cli-0.1.0.dist-info/WHEEL +5 -0
- maxc_cli-0.1.0.dist-info/entry_points.txt +2 -0
- maxc_cli-0.1.0.dist-info/top_level.txt +1 -0
maxc_cli/app.py
ADDED
|
@@ -0,0 +1,3406 @@
|
|
|
1
|
+
|
|
2
|
+
from datetime import datetime, timezone
|
|
3
|
+
import getpass
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import monotonic
|
|
8
|
+
from typing import Any, Callable
|
|
9
|
+
|
|
10
|
+
from .audit import AuditLogger
|
|
11
|
+
from .auth_providers import (
|
|
12
|
+
build_auth_options,
|
|
13
|
+
build_ncs_auth_config,
|
|
14
|
+
list_ncs_accounts,
|
|
15
|
+
resolve_auth_connection,
|
|
16
|
+
)
|
|
17
|
+
from .backend import OdpsBackend
|
|
18
|
+
from .cache import LocalCache
|
|
19
|
+
from .config import (
|
|
20
|
+
AuthConfig,
|
|
21
|
+
TableDefinition,
|
|
22
|
+
default_global_config_path,
|
|
23
|
+
load_config,
|
|
24
|
+
load_config_mapping,
|
|
25
|
+
persist_login_config,
|
|
26
|
+
save_config_mapping,
|
|
27
|
+
session_override_path,
|
|
28
|
+
)
|
|
29
|
+
from .exceptions import (
|
|
30
|
+
BackendConnectionError,
|
|
31
|
+
CostLimitExceededError,
|
|
32
|
+
ErrorPayload,
|
|
33
|
+
FeatureUnavailableError,
|
|
34
|
+
JobTimeoutError,
|
|
35
|
+
MaxCError,
|
|
36
|
+
PermissionDeniedError,
|
|
37
|
+
ValidationError,
|
|
38
|
+
)
|
|
39
|
+
from .helpers import (
|
|
40
|
+
build_odps_identity_payload,
|
|
41
|
+
build_task_summary,
|
|
42
|
+
classify_failure_reason,
|
|
43
|
+
load_odps_env,
|
|
44
|
+
mask_access_id,
|
|
45
|
+
missing_odps_settings,
|
|
46
|
+
parse_time_value,
|
|
47
|
+
)
|
|
48
|
+
from .models import AgentHints, Envelope, JobInfo, QueryResult
|
|
49
|
+
from .store import JobStore
|
|
50
|
+
from .utils import decode_cursor, detect_operation, encode_cursor, now_utc_iso
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MaxCApp:
|
|
54
|
+
def __init__(
    self,
    *,
    cwd: 'Path',
    config_path: 'Path | None' = None,
    load_backend: 'bool' = True,
) -> 'None':
    """Load configuration and set up the lazily created collaborators.

    Args:
        cwd: Working directory, passed to ``load_config``.
        config_path: Optional explicit config file, passed to ``load_config``.
        load_backend: When false no ``OdpsBackend`` is constructed and
            ``backend`` stays ``None``.
    """
    self.cwd = cwd
    self.config = load_config(cwd, config_path)

    if load_backend:
        self.backend = OdpsBackend(self.config)
    else:
        self.backend = None

    # Remote job handling is only enabled when a (truthy) backend advertises it.
    if self.backend:
        self.remote_jobs = getattr(self.backend, "supports_remote_jobs", False)
    else:
        self.remote_jobs = False

    # These collaborators start unset; the accessors below create them on demand.
    self.jobs: 'JobStore | None' = None
    self._audit: 'AuditLogger | None' = None

    # An explicitly configured audit log wins over the default under state_dir.
    configured_audit_log = self.config.agent.audit_log
    if configured_audit_log:
        self._audit_path = configured_audit_log
    else:
        self._audit_path = self.config.state_dir / "audit.log"

    self._cache: 'LocalCache | None' = None
|
|
69
|
+
|
|
70
|
+
@property
|
|
71
|
+
def cache(self) -> 'LocalCache':
|
|
72
|
+
if self._cache is None:
|
|
73
|
+
self._cache = LocalCache(self.config.cache_dir)
|
|
74
|
+
return self._cache
|
|
75
|
+
|
|
76
|
+
def _ensure_job_store(self) -> 'JobStore':
|
|
77
|
+
if self.remote_jobs:
|
|
78
|
+
raise FeatureUnavailableError("Local job storage is not initialized for the current backend.")
|
|
79
|
+
if self.jobs is None:
|
|
80
|
+
self.jobs = JobStore(self.config.state_dir)
|
|
81
|
+
return self.jobs
|
|
82
|
+
|
|
83
|
+
def _whoami_validation_failed_envelope(
    self,
    *,
    settings: 'dict[str, str | None]',
    auth_type: 'str',
    identity_source: 'str',
    warnings: 'list[str]',
) -> 'Envelope':
    """Build the ``auth.whoami`` envelope for credentials that failed validation.

    The identity payload is built without a client and flagged as configured
    but not authenticated, with ``validation_status="failed"``. The envelope
    status itself is ``"success"``; the failure is carried in the payload.

    Args:
        settings: Flat auth settings; ``token_expires_at`` is read from it.
        auth_type: Auth type forwarded to the identity payload builder.
        identity_source: Identity source forwarded to the payload builder.
        warnings: Extra warnings appended after the builder's own warnings.

    Returns:
        An envelope whose hints point at ``auth.login`` / ``auth.login-ncs``.
    """
    config = self.config

    identity, identity_warnings = build_odps_identity_payload(
        client=None,
        settings=settings,
        allowed_operations=config.allowed_operations,
        identity_source=identity_source,
        auth_type=auth_type,
        token_expires_at=settings.get("token_expires_at"),
        project=config.default_project,
        owner_display_name=None,
        authenticated=False,
        configured=True,
        validation_status="failed",
    )
    # Tell the caller which login routes are available.
    identity["auth_options"] = build_auth_options(default_global_config_path())

    envelope_metadata = {
        "project": config.default_project,
        "config_sources": [str(source) for source in config.sources],
    }
    hints = AgentHints(
        next_actions=["auth.login", "auth.login-ncs"],
        warnings=identity_warnings + warnings,
    )
    return Envelope(
        command="auth.whoami",
        status="success",
        data=identity,
        metadata=envelope_metadata,
        agent_hints=hints,
    )
|
|
118
|
+
|
|
119
|
+
def _auth_settings_from_config(self, auth: 'AuthConfig') -> 'dict[str, str | None]':
|
|
120
|
+
return {
|
|
121
|
+
"provider": auth.provider,
|
|
122
|
+
"access_id": auth.access_id,
|
|
123
|
+
"secret_access_key": auth.secret_access_key,
|
|
124
|
+
"security_token": auth.security_token,
|
|
125
|
+
"token_expires_at": auth.token_expires_at,
|
|
126
|
+
"project": auth.project,
|
|
127
|
+
"endpoint": auth.endpoint,
|
|
128
|
+
"region_name": auth.region_name,
|
|
129
|
+
"tunnel_endpoint": auth.tunnel_endpoint,
|
|
130
|
+
"ncs_process_command": auth.ncs.process_command,
|
|
131
|
+
"ncs_account_type": auth.ncs.account_type,
|
|
132
|
+
"ncs_employee_id": auth.ncs.employee_id,
|
|
133
|
+
"ncs_account_name": auth.ncs.account_name,
|
|
134
|
+
"ncs_app_name": auth.ncs.app_name,
|
|
135
|
+
"ncs_process_timeout": str(auth.ncs.process_timeout) if auth.ncs.process_timeout else None,
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
def _clear_session_override(self, warnings: 'list[str]') -> 'None':
    """Remove session_override.yaml so stale project/schema don't shadow new auth config.

    Args:
        warnings: Mutated in place; receives a notice only when an override
            file was actually removed.
    """
    try:
        # Unlink directly instead of exists() + unlink(): the file may vanish
        # between the two calls, which would otherwise raise.
        session_override_path().unlink()
    except FileNotFoundError:
        # Nothing to clear, so there is nothing to warn about.
        return
    warnings.append(
        "Session override (project/schema) was cleared because auth config changed. "
        "Run `session set` if you need to re-apply a project override."
    )
|
|
147
|
+
|
|
148
|
+
def _validate_auth_config_shape(self, auth: 'AuthConfig') -> 'None':
    """Check that ``auth`` carries every setting its provider requires.

    The provider name is normalised (stripped, lower-cased). When it is not
    one of ``access_key`` / ``sts_token`` / ``ncs`` it is inferred from the
    settings: an ncs process command selects ``ncs``, otherwise a security
    token selects ``sts_token``, otherwise ``access_key``.

    Raises:
        ValidationError: If ``missing_odps_settings`` reports missing fields;
            the message and suggestion depend on the provider.
    """
    settings = self._auth_settings_from_config(auth)

    provider = (auth.provider or "").strip().lower()
    if provider not in {"access_key", "sts_token", "ncs"}:
        if settings.get("ncs_process_command"):
            provider = "ncs"
        elif settings.get("security_token"):
            provider = "sts_token"
        else:
            provider = "access_key"

    missing = missing_odps_settings(settings, auth_type=provider)
    if missing:
        if provider == "ncs":
            message = f"ncs authentication is missing required fields: {', '.join(missing)}."
            suggestion = "Provide project, endpoint, and ncs account configuration before using the ncs provider."
        elif provider == "sts_token":
            message = f"STS authentication is missing required fields: {', '.join(missing)}."
            suggestion = "Provide access_id, secret_access_key, security_token, project, and endpoint."
        else:
            message = f"MaxCompute connection settings are incomplete: {', '.join(missing)}."
            suggestion = "Run `maxc auth login` or set the required environment variables."
        raise ValidationError(message, suggestion=suggestion)
|
|
172
|
+
|
|
173
|
+
def _cache_age_seconds(self, updated_at: 'str | None') -> 'int | None':
|
|
174
|
+
if not updated_at:
|
|
175
|
+
return None
|
|
176
|
+
parsed = parse_time_value(updated_at)
|
|
177
|
+
if parsed is None:
|
|
178
|
+
return None
|
|
179
|
+
return max(int((datetime.now(timezone.utc) - parsed).total_seconds()), 0)
|
|
180
|
+
|
|
181
|
+
def _cache_metadata(
|
|
182
|
+
self,
|
|
183
|
+
*,
|
|
184
|
+
project: 'str',
|
|
185
|
+
source: 'str',
|
|
186
|
+
query_time_ms: 'int | None' = None,
|
|
187
|
+
schema_name: 'str | None' = None,
|
|
188
|
+
) -> 'dict[str, Any]':
|
|
189
|
+
cache_stats = self.cache.get_cache_stats(project, schema_name)
|
|
190
|
+
cache_age_seconds = self._cache_age_seconds(cache_stats.get("newest"))
|
|
191
|
+
metadata: 'dict[str, Any]' = {
|
|
192
|
+
"project": project,
|
|
193
|
+
"source": source,
|
|
194
|
+
"cache_available": cache_stats["table_count"] > 0,
|
|
195
|
+
"cache_age_seconds": cache_age_seconds,
|
|
196
|
+
"cache_stale": bool(cache_age_seconds is not None and cache_age_seconds > 3600),
|
|
197
|
+
"refresh_command": "cache build",
|
|
198
|
+
}
|
|
199
|
+
if query_time_ms is not None:
|
|
200
|
+
metadata["query_time_ms"] = query_time_ms
|
|
201
|
+
return metadata
|
|
202
|
+
|
|
203
|
+
def query(
    self,
    *,
    command: 'str',
    sql: 'str',
    project: 'str | None' = None,
    max_rows: 'int' = 100,
    cursor: 'str | None' = None,
    dry_run: 'bool' = False,
    wait: 'int' = 10,
    cost_check: 'float | None' = None,
    idempotency_key: 'str | None' = None,
    retry_on: 'list[str] | None' = None,
    max_retries: 'int' = 0,
) -> 'Envelope':
    """Run ``sql`` and wrap the outcome in an envelope.

    Three paths are taken, in this order:

    1. The cursor carries a session id and the backend runs jobs remotely:
       the page is read from the job recorded in the cached session, without
       re-executing the SQL. If the session is unknown or has no job id,
       execution falls through to the next path.
    2. The backend runs jobs remotely and this is not a dry run: the job is
       submitted and then waited on for up to ``wait`` seconds. The result is
       a ``pending`` envelope (``wait == 0`` or timeout), a ``failure``
       envelope, or the fetched rows.
    3. Otherwise the query goes through ``_execute_query``.

    Args:
        command: Command name recorded in the envelope and the log.
        sql: SQL text to run.
        project: Target project; defaults to ``config.default_project``.
        max_rows: Page size; must be greater than 0.
        cursor: Pagination cursor; cannot be combined with ``dry_run``.
        dry_run: Forwarded to ``_execute_query``; skips the remote job path.
        wait: Seconds to wait for a remote job; ``0`` returns immediately.
        cost_check: Cost limit forwarded to submission / execution.
        idempotency_key: Forwarded to remote submission and echoed in metadata.
        retry_on: Retry conditions; only applied on the non-remote path.
        max_retries: Retry budget; only applied on the non-remote path.

    Raises:
        ValidationError: If ``max_rows <= 0`` or ``cursor`` is combined with
            ``dry_run``.
    """
    if max_rows <= 0:
        raise ValidationError("`--max-rows` and `--page-size` must be greater than 0.")
    if cursor and dry_run:
        raise ValidationError("Do not combine `--cursor` with `--dry-run`.")

    target_project = project or self.config.default_project
    offset, session_id = decode_cursor(cursor)

    def _finish(envelope: 'Envelope') -> 'Envelope':
        # Shared tail: echo the idempotency key (if any), log, and return.
        if idempotency_key:
            envelope.metadata["idempotency_key"] = idempotency_key
        self.log(command, envelope.status, envelope.metadata)
        return envelope

    # If the cursor carries a session_id, look up the job_id in the cache and
    # read its result directly instead of re-executing the SQL.
    if session_id is not None and self.remote_jobs:
        session = self.cache.get_session(session_id)
        if session and session.get("job_id"):
            result = self.backend.fetch_job_result(
                session["job_id"],
                project=session.get("project") or target_project,
                max_rows=max_rows,
                offset=offset,
            )
            envelope = self._build_query_envelope(
                command=command,
                result=result,
                dry_run=False,
                session_id=session_id,
            )
            # This path does not echo the idempotency key, so it logs directly.
            self.log(command, envelope.status, envelope.metadata)
            return envelope

    # Remote branch — always submit, then poll up to `wait` seconds
    if self.remote_jobs and not dry_run:
        job = self._submit_remote_job(
            sql=sql,
            project=target_project,
            cost_check=cost_check,
            idempotency_key=idempotency_key,
        )
        retry_warnings = []
        if retry_on:
            retry_warnings = ["`--retry-on` and `--max-retries` are not applied on the remote job path; the job runs as submitted."]

        def _pending(wait_seconds: 'int', insight: 'str') -> 'Envelope':
            # Envelope for a job that was submitted but has not finished yet.
            return Envelope(
                command=command,
                status="pending",
                data={"job_id": job.job_id},
                metadata={
                    "job_id": job.job_id,
                    "project": job.project,
                    "submitted_at": job.submitted_at,
                    "logview": job.logview,
                    "wait_seconds": wait_seconds,
                    "sql_executed": sql,
                },
                agent_hints=AgentHints(
                    next_actions=["job.wait", "job.status"],
                    warnings=(job.warnings or []) + retry_warnings,
                    insights=[insight],
                ),
            )

        if wait == 0:
            # Return pending envelope immediately, no polling
            return _finish(
                _pending(
                    0,
                    "Use `job wait <job_id>` to wait for completion, then `job result <job_id>` to fetch rows.",
                )
            )

        # Poll
        try:
            job_info = self.backend.wait_job(
                job.job_id,
                project=target_project,
                timeout=wait,
                poll_interval=1,
            )
        except JobTimeoutError:
            return _finish(
                _pending(
                    wait,
                    f"Query promoted to async after {wait}s. "
                    "Use `job wait <job_id> --timeout <N>` to wait for completion, "
                    "then `job result <job_id>` to fetch rows.",
                )
            )
        except BackendConnectionError as exc:
            return _finish(
                Envelope(
                    command=command,
                    status="failure",
                    data=None,
                    error=exc.to_payload(),
                    metadata={
                        "job_id": job.job_id,
                        "project": target_project,
                        "sql_executed": sql,
                    },
                    agent_hints=AgentHints(
                        next_actions=["job.status"],
                    ),
                )
            )

        # Job ended — check outcome
        if job_info.status == "failure":
            return _finish(
                Envelope(
                    command=command,
                    status="failure",
                    data={"job_id": job_info.job_id},
                    metadata={
                        "job_id": job_info.job_id,
                        "project": job_info.project,
                        "submitted_at": job_info.submitted_at,
                        "logview": job_info.logview,
                        "sql_executed": sql,
                    },
                    agent_hints=AgentHints(
                        next_actions=["job.diagnose", "job.status"],
                        warnings=job_info.warnings or [],
                    ),
                )
            )

        # status == "success" — fetch rows
        try:
            result = self.backend.fetch_job_result(
                job_info.job_id,
                project=target_project,
                max_rows=max_rows,
                offset=offset,
            )
        except Exception as exc:
            # Deliberately broad: the job itself succeeded, so any fetch
            # problem is reported as a failure envelope pointing at
            # `job.result`. Keep project errors as they are so their specific
            # payload is not flattened into a generic one.
            fetch_err = exc if isinstance(exc, MaxCError) else MaxCError(str(exc))
            return _finish(
                Envelope(
                    command=command,
                    status="failure",
                    data=None,
                    error=fetch_err.to_payload(),
                    metadata={
                        "job_id": job_info.job_id,
                        "project": target_project,
                        "sql_executed": sql,
                    },
                    agent_hints=AgentHints(
                        next_actions=["job.result"],
                    ),
                )
            )

        envelope = self._build_query_envelope(
            command=command,
            result=result,
            dry_run=False,
        )
        envelope.metadata.update({
            "job_id": job_info.job_id,
            "submitted_at": job_info.submitted_at,
            "logview": job_info.logview,
        })
        return _finish(envelope)

    result = self._execute_query(
        sql=sql,
        project=target_project,
        max_rows=max_rows,
        offset=offset,
        dry_run=dry_run,
        cost_check=cost_check,
        retry_on=retry_on or [],
        max_retries=max_retries,
        strict_cost_check=True,
    )

    envelope = self._build_query_envelope(
        command=command,
        result=result,
        dry_run=dry_run,
    )
    return _finish(envelope)
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def query_cost(
|
|
424
|
+
self,
|
|
425
|
+
*,
|
|
426
|
+
sql: 'str',
|
|
427
|
+
project: 'str | None' = None,
|
|
428
|
+
command: 'str' = "query.cost",
|
|
429
|
+
) -> 'Envelope':
|
|
430
|
+
target_project = project or self.config.default_project
|
|
431
|
+
analysis = self._analyze_query(
|
|
432
|
+
sql=sql,
|
|
433
|
+
project=target_project,
|
|
434
|
+
explain=False,
|
|
435
|
+
)
|
|
436
|
+
envelope = self._build_analysis_envelope(
|
|
437
|
+
command=command,
|
|
438
|
+
sql=sql,
|
|
439
|
+
analysis=analysis,
|
|
440
|
+
)
|
|
441
|
+
self.log(command, envelope.status, envelope.metadata)
|
|
442
|
+
return envelope
|
|
443
|
+
|
|
444
|
+
def query_explain(
|
|
445
|
+
self,
|
|
446
|
+
*,
|
|
447
|
+
sql: 'str',
|
|
448
|
+
project: 'str | None' = None,
|
|
449
|
+
command: 'str' = "query.explain",
|
|
450
|
+
) -> 'Envelope':
|
|
451
|
+
target_project = project or self.config.default_project
|
|
452
|
+
analysis = self._analyze_query(
|
|
453
|
+
sql=sql,
|
|
454
|
+
project=target_project,
|
|
455
|
+
explain=True,
|
|
456
|
+
)
|
|
457
|
+
envelope = self._build_analysis_envelope(
|
|
458
|
+
command=command,
|
|
459
|
+
sql=sql,
|
|
460
|
+
analysis=analysis,
|
|
461
|
+
)
|
|
462
|
+
self.log(command, envelope.status, envelope.metadata)
|
|
463
|
+
return envelope
|
|
464
|
+
|
|
465
|
+
def submit_job(
|
|
466
|
+
self,
|
|
467
|
+
*,
|
|
468
|
+
sql: 'str',
|
|
469
|
+
project: 'str | None' = None,
|
|
470
|
+
max_rows: 'int' = 100,
|
|
471
|
+
cost_check: 'float | None' = None,
|
|
472
|
+
idempotency_key: 'str | None' = None,
|
|
473
|
+
) -> 'Envelope':
|
|
474
|
+
return self.query(
|
|
475
|
+
command="job.submit",
|
|
476
|
+
sql=sql,
|
|
477
|
+
project=project,
|
|
478
|
+
max_rows=max_rows,
|
|
479
|
+
wait=0,
|
|
480
|
+
cost_check=cost_check,
|
|
481
|
+
idempotency_key=idempotency_key,
|
|
482
|
+
)
|
|
483
|
+
|
|
484
|
+
def job_status(self, job_id: 'str') -> 'Envelope':
|
|
485
|
+
if self.remote_jobs:
|
|
486
|
+
info = self.backend.get_job(job_id, project=self.config.default_project)
|
|
487
|
+
envelope = self._job_info_envelope("job.status", info)
|
|
488
|
+
self.log("job.status", envelope.status, envelope.metadata)
|
|
489
|
+
return envelope
|
|
490
|
+
|
|
491
|
+
jobs = self._ensure_job_store()
|
|
492
|
+
job = jobs.get_job(job_id)
|
|
493
|
+
info = self._local_job_info(job)
|
|
494
|
+
envelope = self._job_info_envelope("job.status", info)
|
|
495
|
+
self.log("job.status", envelope.status, envelope.metadata)
|
|
496
|
+
return envelope
|
|
497
|
+
|
|
498
|
+
def job_wait(self, job_id: 'str', *, timeout: 'int | None' = None) -> 'tuple[Envelope, list[dict[str, Any]]]':
    """Wait for ``job_id`` and return its envelope together with a list of events.

    Remote jobs: the job is read once before waiting, then ``backend.wait_job``
    is called. A timeout yields a ``pending`` envelope and a lost backend
    connection a ``failure`` envelope, both with no events. A job that ends in
    any status other than ``"success"`` yields its info envelope plus
    ``started`` / ``failed`` events. A successful job has up to 100 rows
    fetched and its metadata enriched from the final job info.

    Local jobs: the stored job is marked as successful and its stored result
    is returned.

    Args:
        job_id: Identifier of the job to wait for.
        timeout: Forwarded to ``backend.wait_job`` on the remote path; unused
            on the local path.

    Returns:
        ``(envelope, events)``; ``events`` is empty for the pending and
        connection-failure cases.
    """
    # TODO: waiting currently blocks silently until the job succeeds; the intent
    # is to emit one ND-JSON job-status line every few seconds (3s?) instead.
    if self.remote_jobs:
        # Snapshot taken before waiting; used for the "started" event and for
        # metadata when the wait times out.
        before = self.backend.get_job(job_id, project=self.config.default_project)
        try:
            after = self.backend.wait_job(job_id, project=self.config.default_project, timeout=timeout)
        except JobTimeoutError:
            # Still running: report pending and let the caller wait again.
            envelope = Envelope(
                command="job.wait",
                status="pending",
                data={"job_id": job_id},
                metadata={
                    "job_id": job_id,
                    "project": self.config.default_project,
                    "submitted_at": before.submitted_at,
                    "logview": before.logview,
                    "wait_seconds": timeout,
                },
                agent_hints=AgentHints(
                    next_actions=["job.wait", "job.status"],
                    insights=[
                        f"Job still running after {timeout}s. Use `job wait {job_id} --timeout <N>` to continue waiting, then `job result {job_id}` to fetch rows."
                    ],
                ),
            )
            self.log("job.wait", envelope.status, envelope.metadata)
            return envelope, []
        except BackendConnectionError as exc:
            # The job state is unknown here; the hint sends the caller to job.status.
            envelope = Envelope(
                command="job.wait",
                status="failure",
                data=None,
                error=ErrorPayload(
                    code="BACKEND_CONNECTION_ERROR",
                    message=str(exc),
                    recoverable=True,
                    suggestion=getattr(exc, "suggestion", None),
                ),
                metadata={
                    "job_id": job_id,
                    "project": self.config.default_project,
                },
                agent_hints=AgentHints(
                    next_actions=["job.status"],
                    insights=[
                        f"Lost contact with backend while waiting for job {job_id}. Run `job status {job_id}` to check if it completed."
                    ],
                ),
            )
            self.log("job.wait", envelope.status, envelope.metadata)
            return envelope, []
        if after.status != "success":
            # Any non-success end state is reported with a "failed" event.
            envelope = self._job_info_envelope("job.wait", after)
            events = [
                {"type": "started", "ts": before.submitted_at or now_utc_iso(), "job_id": before.job_id},
                {
                    "type": "failed",
                    "ts": after.completed_at or now_utc_iso(),
                    "job_id": after.job_id,
                    "reason": after.failure_reason,
                    "retryable": after.retryable,
                },
            ]
            self.log("job.wait", envelope.status, envelope.metadata)
            return envelope, events
        # NOTE(review): unlike `query`, a fetch error here is not caught and the
        # page size is fixed at 100 — confirm this is intended.
        result = self.backend.fetch_job_result(
            job_id,
            project=self.config.default_project,
            max_rows=100,
        )
        envelope = self._build_query_envelope(
            command="job.wait",
            result=result,
            dry_run=False,
        )
        envelope.metadata.update(
            {
                "job_id": job_id,
                "submitted_at": after.submitted_at,
                "completed_at": after.completed_at,
                "logview": after.logview,
                "stage": after.stage,
                "retryable": after.retryable,
                "failure_reason": after.failure_reason,
                "task_summary": after.task_summary,
            }
        )
        events = self._remote_job_events(before, after, result)
        self.log("job.wait", envelope.status, envelope.metadata)
        return envelope, events

    jobs = self._ensure_job_store()
    job = jobs.get_job(job_id)
    # Events are derived from the job as stored, before it is updated below.
    events = self._job_events(job)
    # NOTE(review): the job is marked successful regardless of its current
    # status (including one set by `cancel_job`) — confirm this is intended
    # for the local store.
    final_job = jobs.update_job(
        job_id,
        status="success",
        progress=100,
        started_at=job.get("started_at") or now_utc_iso(),
        completed_at=now_utc_iso(),
    )
    # presumably every stored job carries a "result" entry; a missing one
    # would raise KeyError here — verify against JobStore.
    stored = final_job["result"]
    info = self._local_job_info(final_job)
    envelope = Envelope(
        command="job.wait",
        status="success",
        data=stored["data"],
        metadata={
            **stored["metadata"],
            "job_id": job_id,
            "submitted_at": final_job["submitted_at"],
            "completed_at": final_job["completed_at"],
            "stage": info.stage,
            "retryable": info.retryable,
            "failure_reason": info.failure_reason,
            "logview": info.logview,
            "task_summary": info.task_summary,
        },
        agent_hints=AgentHints(
            next_actions=["job.result", "meta.describe"],
            warnings=stored.get("agent_hints", {}).get("warnings", []),
        ),
    )
    self.log("job.wait", envelope.status, envelope.metadata)
    return envelope, events
|
|
623
|
+
|
|
624
|
+
def job_result(self, job_id: 'str', *, max_rows: 'int' = 100, cursor: 'str | None' = None) -> 'Envelope':
    """Return one page of rows for a finished job.

    Remote jobs are read through ``backend.fetch_job_result``; local jobs are
    paged out of the result stored in the job store. A job whose status is not
    ``"success"`` yields its info envelope instead of rows.

    Args:
        job_id: Identifier of the job.
        max_rows: Page size; must be greater than 0.
        cursor: Pagination cursor; only its offset is used.

    Raises:
        ValidationError: If ``max_rows <= 0`` (same rule and message as
            ``query``).
    """
    # Same guard as `query`: a non-positive page size can never make progress.
    if max_rows <= 0:
        raise ValidationError("`--max-rows` and `--page-size` must be greater than 0.")

    if self.remote_jobs:
        info = self.backend.get_job(job_id, project=self.config.default_project)
        if info.status != "success":
            envelope = self._job_info_envelope("job.result", info)
            self.log("job.result", envelope.status, envelope.metadata)
            return envelope
        offset, _ = decode_cursor(cursor)
        result = self.backend.fetch_job_result(
            job_id,
            project=self.config.default_project,
            max_rows=max_rows,
            offset=offset,
        )
        envelope = self._build_query_envelope(
            command="job.result",
            result=result,
            dry_run=False,
        )
        envelope.metadata.update(
            {
                "job_id": job_id,
                "submitted_at": info.submitted_at,
                "completed_at": info.completed_at,
                "logview": info.logview,
            }
        )
        self.log("job.result", envelope.status, envelope.metadata)
        return envelope

    jobs = self._ensure_job_store()
    job = jobs.get_job(job_id)
    if job["status"] != "success":
        info = self._local_job_info(job)
        envelope = self._job_info_envelope("job.result", info)
        self.log("job.result", envelope.status, envelope.metadata)
        return envelope

    stored = job["result"]
    info = self._local_job_info(job)
    all_rows = stored["data"].get("rows", [])
    schema = stored["data"].get("schema", [])
    total_rows = stored["data"].get("total_rows", len(all_rows))

    offset, _ = decode_cursor(cursor)  # session_id ignored for local jobs
    page_rows = all_rows[offset:offset + max_rows]
    returned_rows = len(page_rows)
    # An empty page means the stored rows are exhausted even if `total_rows`
    # claims more; reporting has_more there would hand back a cursor that
    # never advances.
    has_more = returned_rows > 0 and (offset + returned_rows) < total_rows
    next_cursor = encode_cursor(offset + returned_rows) if has_more else None

    envelope = Envelope(
        command="job.result",
        status="success",
        data={
            "rows": page_rows,
            "schema": schema,
            "total_rows": total_rows,
            "returned_rows": returned_rows,
            "has_more": has_more,
            "next_cursor": next_cursor,
        },
        metadata={
            **stored["metadata"],
            "job_id": job_id,
            "submitted_at": job["submitted_at"],
            "completed_at": job.get("completed_at", job["updated_at"]),
            "stage": info.stage,
            "retryable": info.retryable,
            "failure_reason": info.failure_reason,
            "logview": info.logview,
            "task_summary": info.task_summary,
        },
        agent_hints=AgentHints(
            next_actions=["meta.describe"],
            warnings=stored.get("agent_hints", {}).get("warnings", []),
        ),
    )
    self.log("job.result", envelope.status, envelope.metadata)
    return envelope
|
|
703
|
+
|
|
704
|
+
def cancel_job(self, job_id: 'str') -> 'Envelope':
    """Cancel ``job_id`` and return a ``job.cancel`` envelope.

    Remote jobs are cancelled through the backend and the envelope takes the
    status the backend reports. Local jobs are marked as failed and cancelled
    in the job store.

    Raises:
        ValidationError: If a local job already has status ``"success"``.
    """
    if self.remote_jobs:
        info = self.backend.cancel_job(job_id, project=self.config.default_project)
        envelope = Envelope(
            command="job.cancel",
            status=info.status,
            data={"job_id": job_id, "cancelled": True},
            metadata={
                "project": info.project,
                "updated_at": info.updated_at,
                "logview": info.logview,
            },
            agent_hints=AgentHints(
                next_actions=["job.status"],
                # Normalise a missing warnings value to an empty list, as the
                # other remote-job envelopes in this class do.
                warnings=info.warnings or [],
            ),
        )
        self.log("job.cancel", envelope.status, envelope.metadata)
        return envelope

    jobs = self._ensure_job_store()
    job = jobs.get_job(job_id)
    if job["status"] == "success":
        raise ValidationError("The job is already complete and cannot be cancelled.")
    # A cancelled local job is recorded with status "failure" plus a
    # `cancelled` flag.
    updated = jobs.update_job(job_id, status="failure", progress=0, cancelled=True)
    envelope = Envelope(
        command="job.cancel",
        status="failure",
        data={"job_id": job_id, "cancelled": True},
        metadata={"project": updated["project"], "updated_at": updated["updated_at"]},
        agent_hints=AgentHints(next_actions=["job.submit"]),
    )
    self.log("job.cancel", envelope.status, envelope.metadata)
    return envelope
|
|
738
|
+
|
|
739
|
+
def job_diagnose(self, job_id: 'str') -> 'Envelope':
    """Return a ``job.diagnose`` envelope for ``job_id``.

    Remote jobs return the backend's diagnosis payload as-is. Local jobs get
    a payload assembled from the stored job info plus the category and summary
    produced by ``classify_failure_reason``; task statuses and results are
    always empty for local jobs.
    """
    if self.remote_jobs:
        data = self.backend.diagnose_job(job_id, project=self.config.default_project)
        project = self.config.default_project
    else:
        record = self._ensure_job_store().get_job(job_id)
        info = self._local_job_info(record)
        diagnosis = classify_failure_reason(info.failure_reason)
        data = {
            "job_id": info.job_id,
            "status": info.status,
            "stage": info.stage,
            "retryable": info.retryable,
            "failure_reason": info.failure_reason,
            "diagnosis_category": diagnosis["category"],
            "diagnosis_summary": diagnosis["summary"],
            "logview": info.logview,
            "task_summary": info.task_summary,
            "task_statuses": [],
            "task_results": {},
        }
        project = info.project

    envelope = Envelope(
        command="job.diagnose",
        status="success",
        data=data,
        metadata={"project": project},
        agent_hints=AgentHints(next_actions=["job.status", "job.result"]),
    )
    self.log("job.diagnose", envelope.status, envelope.metadata)
    return envelope
|
|
777
|
+
|
|
778
|
+
def list_jobs(self, *, limit: 'int' = 20) -> 'Envelope':
    """Return a ``job.list`` envelope with at most ``limit`` jobs.

    With ``self.remote_jobs`` set, jobs come from
    ``self.backend.list_jobs`` (which receives ``limit``); otherwise they
    come from the local job store and the list is sliced to ``limit``.
    """
    fields = ("job_id", "status", "progress", "project", "submitted_at")

    if self.remote_jobs:
        remote = self.backend.list_jobs(project=self.config.default_project, limit=limit)
        # Remote entries expose the fields as attributes.
        rows = [{name: getattr(entry, name) for name in fields} for entry in remote]
        metadata = {"backend": "odps", "project": self.config.default_project}
    else:
        store = self._ensure_job_store()
        # Local entries are mappings keyed by field name.
        rows = [{name: entry[name] for name in fields} for entry in store.list_jobs()[:limit]]
        metadata = {"state_file": str(store.path)}

    envelope = Envelope(
        command="job.list",
        status="success",
        data={"jobs": rows, "total": len(rows)},
        metadata=metadata,
        agent_hints=AgentHints(next_actions=["job.status", "job.wait"]),
    )
    self.log("job.list", envelope.status, envelope.metadata)
    return envelope
|
|
822
|
+
|
|
823
|
+
def meta_list_tables(self) -> 'Envelope':
    """List the tables held in the local metadata cache.

    Only the cache is consulted. When it has no entries for the default
    project, a ``cache_miss`` envelope pointing at ``cache.build`` is
    returned instead of a table list.
    """
    started = monotonic()
    cached_tables = self.cache.get_all_cached_tables(self.config.default_project)

    if not cached_tables:
        # Nothing cached: guide the caller to build the cache first.
        # NOTE(review): this early return does not call self.log, unlike
        # the success path below - confirm that is intended.
        return Envelope(
            command="meta.list-tables",
            status="cache_miss",
            data={"tables": [], "total": 0},
            metadata=self._cache_metadata(
                project=self.config.default_project,
                source="none",
                query_time_ms=int((monotonic() - started) * 1000),
            ),
            agent_hints=AgentHints(
                next_actions=["cache.build"],
                insights=["No metadata cache found for this project. Building the cache first will significantly improve performance."],
                warnings=["Cache miss: Run `maxc cache build` to populate the metadata cache."],
            ),
        )

    def partition_label(column):
        # Cached partition columns may be dicts or plain values.
        return column.get("name") if isinstance(column, dict) else str(column)

    source = "cache"
    rows = []
    for entry in cached_tables:
        rows.append(
            {
                "table_name": entry.get("table_name"),
                "table_type": entry.get("table_type", "TABLE"),
                "size_bytes": entry.get("size_bytes"),
                "owner": entry.get("owner"),
                "description": entry.get("description"),
                "partition_columns": [
                    partition_label(column)
                    for column in entry.get("partition_columns", [])
                ],
            }
        )

    metadata = self._cache_metadata(
        project=self.config.default_project,
        source=source,
        query_time_ms=int((monotonic() - started) * 1000),
    )
    envelope = Envelope(
        command="meta.list-tables",
        status="success",
        data={"tables": rows, "total": len(rows)},
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["meta.describe", "data.sample"],
            insights=[f"Table list served from {source}."],
        ),
    )
    self.log("meta.list-tables", envelope.status, envelope.metadata)
    return envelope
|
|
883
|
+
|
|
884
|
+
def meta_describe(self, table_name: 'str', full: 'bool' = False) -> 'Envelope':
    """Describe ``table_name`` and attach any cached semantic metadata.

    Args:
        table_name: Table to describe.
        full: Passed through to ``self._table_payload``; when false and
            the payload reports ``has_more_columns``, a warning about the
            ``--full`` flag is added.

    Returns:
        A ``meta.describe`` envelope. ``metadata["source"]`` is ``"cache"``
        when the schema came from the local cache and ``"live"`` when the
        backend was the only source; ``query_time_ms`` is only filled in
        for the live case.

    Raises:
        Whatever ``self.backend.describe_table`` raises when there is no
        cache entry. On the cached path backend failures are swallowed and
        surfaced as a warning instead.
    """
    started = monotonic()
    warnings = []

    # Try to get from cache first
    cached_table = self.cache.get_cached_table(
        self.config.default_project,
        table_name,
        schema_name=self.config.default_schema or "default"
    )

    if cached_table:
        # Use cached metadata for schema, fetch sample rows from API
        from .config import TableDefinition, TableColumn

        # Build TableDefinition from cache
        columns = [
            TableColumn(name=c["name"], type=c["type"], comment=c.get("comment", ""))
            for c in cached_table.get("columns", [])
        ]
        # NOTE(review): cached "partitions" entries are used as partition
        # column names with a fixed "string" type - confirm against what
        # the cache build stores under that key.
        partition_columns = [
            TableColumn(name=p, type="string", comment="")
            for p in cached_table.get("partitions", [])
        ]

        table = TableDefinition(
            name=table_name,
            description=cached_table.get("description", ""),
            columns=columns,
            sample_rows=[],  # Will fetch from API if needed
            partitions=[],  # Will fetch from API if needed
            partition_columns=partition_columns,
            owner=cached_table.get("owner"),
            size_bytes=cached_table.get("size_bytes"),
            table_type="TABLE",
        )
        source = "cache"

        # The cached path still makes one backend call to enrich the
        # result (description, owner, size, sample rows, partitions).
        try:
            api_table = self.backend.describe_table(table_name)
            # Update with API data (API has priority over cache for these fields)
            table.description = api_table.description or table.description
            table.owner = api_table.owner or table.owner
            table.size_bytes = api_table.size_bytes or table.size_bytes
            table.created_at = api_table.created_at
            table.updated_at = api_table.updated_at
            table.table_type = api_table.table_type or table.table_type
            table.sample_rows = api_table.sample_rows
            table.partitions = api_table.partitions
        except Exception:
            # Deliberate best effort: if the API fails, still return the
            # cached schema and tell the caller it is cache-only.
            warnings.append("Backend API unavailable, showing cached schema only")
    else:
        # Fall back to live API
        table = self.backend.describe_table(table_name)
        source = "live"

    # Get semantic metadata from cache
    semantic = self.cache.get_semantic(
        project=self.config.default_project,
        table_name=table_name,
        schema_name=self.config.default_schema or "default",
    )

    if not semantic:
        warnings.append(
            "Missing semantic metadata. Agent should generate it using its own LLM and save with: maxc meta semantic set"
        )

    payload = self._table_payload(table, full=full)

    # Add hint about --full flag in summary mode. The "first 10" wording is
    # fixed text; it is not derived from the payload.
    if not full and payload.get("has_more_columns"):
        warnings.append(
            f"Showing first 10 columns only. Use --full to see all {payload['column_count']} columns."
        )

    # Add semantic information to the payload
    payload["semantic"] = semantic

    envelope = Envelope(
        command="meta.describe",
        status="success",
        data=payload,
        metadata={
            "project": self.config.default_project,
            "source": source,
            "query_time_ms": int((monotonic() - started) * 1000) if source == "live" else None,
        },
        agent_hints=AgentHints(
            next_actions=["data.sample", "data.profile", "query"],
            warnings=warnings,
        ),
    )
    self.log("meta.describe", envelope.status, envelope.metadata)
    return envelope
|
|
983
|
+
|
|
984
|
+
def meta_search(self, keyword: 'str') -> 'Envelope':
    """Search tables by ``keyword``.

    The local metadata cache is searched when it has entries for the
    default project; otherwise the query goes to
    ``self.backend.search_tables`` and a warning recommends building the
    cache. ``query_time_ms`` is only reported for the live path.
    """
    started = monotonic()
    cached_tables = self.cache.get_all_cached_tables(self.config.default_project)

    if cached_tables:
        source = "cache"
        matches = self._search_in_cache(keyword, cached_tables)
        hint_warnings = []
    else:
        matches = self.backend.search_tables(keyword)
        source = "live"
        hint_warnings = ["No metadata cache was used. Run `maxc cache build` to speed up future lookups."]

    data = {"keyword": keyword, "matches": matches, "total": len(matches)}
    elapsed_ms = None
    if source == "live":
        elapsed_ms = int((monotonic() - started) * 1000)

    envelope = Envelope(
        command="meta.search",
        status="success",
        data=data,
        metadata=self._cache_metadata(
            project=self.config.default_project,
            source=source,
            query_time_ms=elapsed_ms,
        ),
        agent_hints=AgentHints(
            next_actions=["meta.describe", "data.sample"],
            warnings=hint_warnings,
        ),
    )
    self.log("meta.search", envelope.status, envelope.metadata)
    return envelope
|
|
1009
|
+
|
|
1010
|
+
def meta_search_columns(self, keyword: 'str') -> 'Envelope':
    """Search columns by ``keyword``.

    The local metadata cache is searched when it has entries for the
    default project; otherwise the query goes to
    ``self.backend.search_columns`` and a warning recommends building the
    cache. ``query_time_ms`` is only reported for the live path.
    """
    started = monotonic()
    cached_tables = self.cache.get_all_cached_tables(self.config.default_project)

    if cached_tables:
        source = "cache"
        matches = self._search_columns_in_cache(keyword, cached_tables)
        hint_warnings = []
    else:
        matches = self.backend.search_columns(keyword)
        source = "live"
        hint_warnings = ["No metadata cache was used. Run `maxc cache build` to speed up future lookups."]

    data = {"keyword": keyword, "matches": matches, "total": len(matches)}
    elapsed_ms = None
    if source == "live":
        elapsed_ms = int((monotonic() - started) * 1000)

    envelope = Envelope(
        command="meta.search-columns",
        status="success",
        data=data,
        metadata=self._cache_metadata(
            project=self.config.default_project,
            source=source,
            query_time_ms=elapsed_ms,
        ),
        agent_hints=AgentHints(
            next_actions=["meta.describe", "query"],
            warnings=hint_warnings,
        ),
    )
    self.log("meta.search-columns", envelope.status, envelope.metadata)
    return envelope
|
|
1035
|
+
|
|
1036
|
+
def _search_in_cache(
|
|
1037
|
+
self, keyword: 'str', cached_tables: 'list[dict]'
|
|
1038
|
+
) -> 'list[dict]':
|
|
1039
|
+
"""Search tables in cache."""
|
|
1040
|
+
tokens = [t.lower() for t in keyword.split() if t.strip()] or [keyword.lower()]
|
|
1041
|
+
matches = []
|
|
1042
|
+
for table in cached_tables:
|
|
1043
|
+
score = 0
|
|
1044
|
+
matched_columns = []
|
|
1045
|
+
searchable = f"{table['table_name']} {table.get('description', '')}".lower()
|
|
1046
|
+
for token in tokens:
|
|
1047
|
+
if token in searchable:
|
|
1048
|
+
score += 5
|
|
1049
|
+
for col in table.get("columns", []):
|
|
1050
|
+
text = f"{col.get('name', '')} {col.get('comment', '')}".lower()
|
|
1051
|
+
if token in text:
|
|
1052
|
+
score += 2
|
|
1053
|
+
matched_columns.append(col["name"])
|
|
1054
|
+
if score:
|
|
1055
|
+
matches.append({
|
|
1056
|
+
"table_name": table["table_name"],
|
|
1057
|
+
"description": table.get("description"),
|
|
1058
|
+
"score": score,
|
|
1059
|
+
"matched_columns": list(set(matched_columns))[:5],
|
|
1060
|
+
})
|
|
1061
|
+
return sorted(matches, key=lambda x: -x["score"])[:20]
|
|
1062
|
+
|
|
1063
|
+
def _search_columns_in_cache(
|
|
1064
|
+
self, keyword: 'str', cached_tables: 'list[dict]'
|
|
1065
|
+
) -> 'list[dict]':
|
|
1066
|
+
"""Search columns in cache."""
|
|
1067
|
+
tokens = [t.lower() for t in keyword.split() if t.strip()] or [keyword.lower()]
|
|
1068
|
+
matches = []
|
|
1069
|
+
for table in cached_tables:
|
|
1070
|
+
for col in table.get("columns", []):
|
|
1071
|
+
text = f"{col.get('name', '')} {col.get('comment', '')}".lower()
|
|
1072
|
+
score = sum(2 for token in tokens if token in text)
|
|
1073
|
+
if score:
|
|
1074
|
+
matches.append({
|
|
1075
|
+
"table_name": table["table_name"],
|
|
1076
|
+
"column_name": col["name"],
|
|
1077
|
+
"column_type": col.get("type"),
|
|
1078
|
+
"column_comment": col.get("comment"),
|
|
1079
|
+
"score": score,
|
|
1080
|
+
})
|
|
1081
|
+
return sorted(matches, key=lambda x: -x["score"])[:50]
|
|
1082
|
+
|
|
1083
|
+
# ========== Semantic Metadata Methods ==========
|
|
1084
|
+
|
|
1085
|
+
def semantic_set(
    self,
    table_name: 'str',
    semantic_desc: 'str | None' = None,
    use_cases: 'list[str] | None' = None,
    sample_questions: 'list[str] | None' = None,
    column_semantics: 'list[dict[str, Any]] | None' = None,
    relations: 'list[dict[str, Any]] | None' = None,
    stats: 'dict[str, Any] | None' = None,
) -> 'Envelope':
    """Store caller-supplied semantic metadata for ``table_name``.

    Missing description/list arguments are saved as ``""`` / ``[]``;
    ``relations`` and ``stats`` are forwarded unchanged. Any exception
    raised while saving or building the success envelope is converted
    into a ``failure`` envelope with code ``SEMANTIC_SET_ERROR``.
    """
    command = "meta.semantic.set"
    try:
        self.cache.save_semantic(
            project=self.config.default_project,
            table_name=table_name,
            semantic_desc=semantic_desc or "",
            use_cases=use_cases or [],
            sample_questions=sample_questions or [],
            column_semantics=column_semantics or [],
            schema_name=self.config.default_schema or "default",
            relations=relations,
            stats=stats,
        )

        summary = {
            "action": "set_semantic",
            "table_name": table_name,
            "has_description": bool(semantic_desc),
            "use_cases_count": len(use_cases or []),
            "sample_questions_count": len(sample_questions or []),
            "column_semantics_count": len(column_semantics or []),
        }
        envelope = Envelope(
            command=command,
            status="success",
            data=summary,
            metadata={
                "project": self.config.default_project,
                "schema": self.config.default_schema or "default",
            },
            agent_hints=AgentHints(
                next_actions=[f"meta.describe {table_name} --json"],
                insights=["Semantic metadata has been saved to local cache."],
            ),
        )
    except Exception as exc:
        # Report the failure in-band rather than raising to the caller.
        failure = ErrorPayload(
            code="SEMANTIC_SET_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command=command,
            status="failure",
            data=None,
            metadata={
                "project": self.config.default_project,
            },
            error=failure,
            agent_hints=AgentHints(
                next_actions=["meta.semantic.set"],
            ),
        )

    self.log("meta.semantic.set", envelope.status, envelope.metadata)
    return envelope
|
|
1150
|
+
|
|
1151
|
+
def semantic_get(
    self,
    table_name: 'str',
) -> 'Envelope':
    """Fetch the cached semantic metadata for ``table_name``.

    A missing entry is still a ``success`` envelope, with
    ``data["semantic"]`` set to ``None`` and a warning suggesting
    ``maxc meta semantic set``. Exceptions are converted into a
    ``failure`` envelope with code ``SEMANTIC_GET_ERROR``.
    """
    command = "meta.semantic.get"
    try:
        semantic = self.cache.get_semantic(
            project=self.config.default_project,
            table_name=table_name,
            schema_name=self.config.default_schema or "default",
        )

        if semantic:
            found = semantic
            hints = AgentHints(
                next_actions=["meta.describe", "meta.semantic.set"],
            )
        else:
            # Any falsy lookup result is reported as "no metadata".
            found = None
            hints = AgentHints(
                warnings=["No semantic metadata found. Use `maxc meta semantic set` to add metadata."],
                next_actions=[f"meta semantic set {table_name}"],
            )

        envelope = Envelope(
            command=command,
            status="success",
            data={
                "table_name": table_name,
                "semantic": found,
            },
            metadata={
                "project": self.config.default_project,
                "schema": self.config.default_schema or "default",
            },
            agent_hints=hints,
        )
    except Exception as exc:
        # Report the failure in-band rather than raising to the caller.
        failure = ErrorPayload(
            code="SEMANTIC_GET_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command=command,
            status="failure",
            data=None,
            metadata={
                "project": self.config.default_project,
            },
            error=failure,
            agent_hints=AgentHints(
                next_actions=["meta.semantic.get"],
            ),
        )

    self.log("meta.semantic.get", envelope.status, envelope.metadata)
    return envelope
|
|
1217
|
+
|
|
1218
|
+
def semantic_list_missing(
    self,
) -> 'Envelope':
    """Report cached tables that have no semantic metadata.

    Tables are matched by ``table_name`` only. At most 50 missing tables
    are listed in ``data["tables"]``, while the counts cover all of them.
    Exceptions are converted into a ``failure`` envelope with code
    ``SEMANTIC_LIST_MISSING_ERROR``.
    """
    command = "meta.semantic.list-missing"
    try:
        cached = self.cache.get_all_cached_tables(
            project=self.config.default_project
        )
        annotated = self.cache.get_all_semantics(
            project=self.config.default_project
        )
        annotated_names = {entry["table_name"] for entry in annotated}

        # Cached tables whose name has no semantic record.
        missing = []
        for entry in cached:
            if entry["table_name"] not in annotated_names:
                missing.append(entry)

        listed = []
        for entry in missing[:50]:  # Limit to 50 tables
            listed.append(
                {
                    "table_name": entry["table_name"],
                    "schema_name": entry["schema_name"],
                    "description": entry.get("description", ""),
                    "column_count": len(entry.get("columns", [])),
                }
            )

        if missing:
            follow_up = [f"meta semantic set {missing[0]['table_name']}"]
        else:
            follow_up = []

        envelope = Envelope(
            command=command,
            status="success",
            data={
                "total_cached_tables": len(cached),
                "with_semantic": len(annotated),
                "missing_semantic": len(missing),
                "tables": listed,
            },
            metadata={
                "project": self.config.default_project,
            },
            agent_hints=AgentHints(
                insights=[
                    f"{len(missing)} tables lack semantic metadata.",
                    "Run `maxc meta semantic set <table>` to add metadata for each table."
                ],
                next_actions=follow_up,
            ),
        )
    except Exception as exc:
        # Report the failure in-band rather than raising to the caller.
        failure = ErrorPayload(
            code="SEMANTIC_LIST_MISSING_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command=command,
            status="failure",
            data=None,
            metadata={
                "project": self.config.default_project,
            },
            error=failure,
            agent_hints=AgentHints(
                next_actions=["meta.semantic.list-missing"],
            ),
        )

    self.log("meta.semantic.list-missing", envelope.status, envelope.metadata)
    return envelope
|
|
1291
|
+
|
|
1292
|
+
def meta_latest_partition(self, table_name: 'str') -> 'Envelope':
    """Wrap ``self.backend.latest_partition_info`` in an envelope.

    The suggested next actions depend on whether the payload reports
    ``has_partitions``.
    """
    payload, warnings = self.backend.latest_partition_info(table_name)

    if payload.get("has_partitions"):
        next_actions = ["meta.freshness", "data.sample", "query"]
    else:
        # Unpartitioned (or unknown): point at schema and sample instead.
        next_actions = ["meta.describe", "data.sample"]

    hints = AgentHints(next_actions=next_actions, warnings=warnings)
    envelope = Envelope(
        command="meta.latest-partition",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.latest-partition", envelope.status, envelope.metadata)
    return envelope
|
|
1306
|
+
|
|
1307
|
+
def meta_freshness(self, table_name: 'str') -> 'Envelope':
    """Wrap ``self.backend.freshness_info`` in an envelope.

    When the payload's ``freshness_status`` is ``"stale"``, ``job.submit``
    is placed first in the suggested next actions.
    """
    payload, warnings = self.backend.freshness_info(table_name)

    is_stale = payload.get("freshness_status") == "stale"
    next_actions = ["job.submit"] if is_stale else []
    next_actions += ["meta.latest-partition", "data.sample", "query"]

    hints = AgentHints(next_actions=next_actions, warnings=warnings)
    envelope = Envelope(
        command="meta.freshness",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.freshness", envelope.status, envelope.metadata)
    return envelope
|
|
1321
|
+
|
|
1322
|
+
def meta_lineage(self, table_name: 'str') -> 'Envelope':
    """Wrap ``self.backend.lineage_info`` in a ``meta.lineage`` envelope."""
    lineage, backend_warnings = self.backend.lineage_info(table_name)
    hints = AgentHints(next_actions=["meta.describe"], warnings=backend_warnings)
    envelope = Envelope(
        command="meta.lineage",
        status="success",
        data=lineage,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.lineage", envelope.status, envelope.metadata)
    return envelope
|
|
1333
|
+
|
|
1334
|
+
def cache_build(
    self,
    *,
    project: 'str | None' = None,
    max_workers: 'int' = 8,
    schema_name: 'str | None' = None,
    async_mode: 'bool' = False,
    progress_callback: 'Callable[[dict[str, Any]], None] | None' = None,
) -> 'Envelope':
    """Build metadata cache for all tables returned by the backend.

    Args:
        project: Project the cache entries and build record are stored
            under; defaults to ``self.config.default_project``. The table
            listing call itself takes no project argument.
        max_workers: Number of concurrent workers used by the build.
        schema_name: Reported in the envelope (``scope``/``schema_name``)
            and in progress events. It does NOT filter the table list:
            every listed table is built regardless of this value.
        async_mode: If True, start the build on a daemon thread and
            return immediately with a ``running`` envelope carrying the
            ``build_id``. ``progress_callback`` only receives the listing
            events in this mode; it is not passed to the background build.
        progress_callback: Optional callback for build progress events.

    Returns:
        The ``running`` envelope in async mode, otherwise the envelope
        produced by ``self._build_cache_sync``.
    """
    import uuid

    target_project = project or self.config.default_project
    if progress_callback is not None:
        progress_callback(
            {
                "type": "listing_start",
                "project": target_project,
                "schema_name": schema_name,
            }
        )

    # NOTE(review): the original had `if schema_name:` / `else:` branches
    # that both assigned the full listing, so no schema filtering ever
    # happened. The dead branch is collapsed here; real filtering needs to
    # know how listed tables expose their schema, which is not visible in
    # this method.
    tables = self.backend.list_tables()

    if progress_callback is not None:
        progress_callback(
            {
                "type": "listing_complete",
                "project": target_project,
                "schema_name": schema_name,
                "total_tables": len(tables),
            }
        )

    # Short identifier: the first 8 characters of a random UUID.
    build_id = str(uuid.uuid4())[:8]

    if async_mode:
        # The build record is created here, so the background build is
        # told not to initialise it again (last positional arg = False).
        self.cache.start_build(target_project, build_id, len(tables))
        envelope = Envelope(
            command="cache.build",
            status="running",
            data={
                "action": "build",
                "build_id": build_id,
                "mode": "async",
                "scope": "schema" if schema_name else "project",
                "schema_name": schema_name,
                "tables_scanned": len(tables),
                "total_tables": len(tables),
                "cache_location": str(self.cache.db_path),
                "message": "Cache build started. Use `cache build-status` to track progress.",
            },
            metadata={"project": target_project, "build_id": build_id},
            agent_hints=AgentHints(
                next_actions=["cache.build-status"],
                insights=["The metadata cache build is running in the background."],
            ),
        )
        import threading
        thread = threading.Thread(
            target=self._build_cache_background,
            args=(target_project, build_id, tables, max_workers, schema_name, False),
            daemon=True,
        )
        thread.start()
        self.log("cache.build", "running", envelope.metadata)
        return envelope

    return self._build_cache_sync(
        target_project, build_id, tables, max_workers, schema_name, progress_callback=progress_callback
    )
|
|
1417
|
+
|
|
1418
|
+
def _build_cache_sync(
    self,
    project: 'str',
    build_id: 'str',
    tables: 'list',
    max_workers: 'int',
    schema_name: 'str | None' = None,
    initialize_status: 'bool' = True,
    progress_callback: 'Callable[[dict[str, Any]], None] | None' = None,
) -> 'Envelope':
    """Describe every table and store it in the cache, tracking progress.

    Each table is described through ``self.backend.describe_table`` on a
    thread pool of ``max_workers`` workers and written with
    ``self.cache.cache_table``. Per-table failures are collected rather
    than raised. The envelope status is ``"success"`` when nothing failed
    and ``"partial"`` otherwise.

    ``initialize_status`` controls whether the build record is created
    here. ``progress_callback`` receives ``build_start``, ``progress``
    and ``completed`` events.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import threading

    started = monotonic()
    total = len(tables)
    cached_count = 0
    created_count = 0
    updated_count = 0
    errors: 'list[str]' = []
    lock = threading.Lock()

    def event(kind: 'str', **extra: 'Any') -> 'dict[str, Any]':
        # Common header shared by every progress event.
        return {"type": kind, "project": project, "schema_name": schema_name, **extra}

    if initialize_status:
        self.cache.start_build(project, build_id, total)
    if progress_callback is not None:
        progress_callback(event("build_start", build_id=build_id, total_tables=total))

    def fetch_and_cache(
        table_name: 'str',
        table_schema: 'str' = "default",
    ) -> 'tuple[str, str | None]':
        # Returns (outcome, error); never raises, so one bad table cannot
        # abort the whole build.
        try:
            described = self.backend.describe_table(table_name)
            previous = self.cache.get_cached_table(project, described.name, table_schema)
            column_rows = []
            for column in described.columns:
                column_rows.append(
                    {"name": column.name, "type": column.type, "comment": column.comment}
                )
            self.cache.cache_table(
                project=project,
                table_name=described.name,
                description=described.description,
                columns=column_rows,
                partitions=described.partitions,
                schema_name=table_schema,
            )
            return ("updated" if previous else "created"), None
        except Exception as exc:
            return "error", f"{table_name}: {exc}"

    # NOTE(review): workers are submitted with the table name only, so
    # every entry is stored under the "default" schema - confirm intended.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(fetch_and_cache, item.name): item.name for item in tables}
        for finished in as_completed(pending):
            outcome, failure = finished.result()
            with lock:
                if failure:
                    errors.append(failure)
                else:
                    cached_count += 1
                    if outcome == "updated":
                        updated_count += 1
                    else:
                        created_count += 1
                self.cache.update_build_progress(
                    project, build_id, cached_count, len(errors)
                )
                if progress_callback is not None:
                    progress_callback(
                        event(
                            "progress",
                            build_id=build_id,
                            cached_tables=cached_count,
                            failed_tables=len(errors),
                            total_tables=total,
                        )
                    )

    if errors:
        self.cache.complete_build(project, build_id, error_message=f"{len(errors)} errors")
    else:
        self.cache.complete_build(project, build_id)

    stats = self.cache.get_cache_stats(project)
    elapsed_ms = int((monotonic() - started) * 1000)
    envelope = Envelope(
        command="cache.build",
        status="partial" if errors else "success",
        data={
            "action": "build",
            "build_id": build_id,
            "mode": "sync",
            "scope": "schema" if schema_name else "project",
            "schema_name": schema_name,
            "tables_scanned": total,
            "cache_entries_created": created_count,
            "cache_entries_updated": updated_count,
            "cached_tables": cached_count,
            "total_tables": total,
            "tables_failed": len(errors),
            "elapsed_ms": elapsed_ms,
            "cache_location": str(self.cache.db_path),
            # Only the first ten error messages are reported.
            "errors": errors[:10],
            "stats": stats,
        },
        metadata={"project": project, "build_id": build_id},
        agent_hints=AgentHints(
            next_actions=["meta.search", "meta.search-columns"],
            warnings=[f"Failed to cache {len(errors)} table(s)."] if errors else [],
        ),
    )
    if progress_callback is not None:
        progress_callback(
            event(
                "completed",
                build_id=build_id,
                cached_tables=cached_count,
                failed_tables=len(errors),
                total_tables=total,
                status=envelope.status,
            )
        )
    self.log("cache.build", envelope.status, envelope.metadata)
    return envelope
|
|
1552
|
+
|
|
1553
|
+
def _build_cache_background(
|
|
1554
|
+
self,
|
|
1555
|
+
project: 'str',
|
|
1556
|
+
build_id: 'str',
|
|
1557
|
+
tables: 'list',
|
|
1558
|
+
max_workers: 'int',
|
|
1559
|
+
schema_name: 'str | None' = None,
|
|
1560
|
+
initialize_status: 'bool' = False,
|
|
1561
|
+
) -> 'None':
|
|
1562
|
+
"""Background cache build (async mode)."""
|
|
1563
|
+
try:
|
|
1564
|
+
self._build_cache_sync(
|
|
1565
|
+
project,
|
|
1566
|
+
build_id,
|
|
1567
|
+
tables,
|
|
1568
|
+
max_workers,
|
|
1569
|
+
schema_name,
|
|
1570
|
+
initialize_status=initialize_status,
|
|
1571
|
+
)
|
|
1572
|
+
except Exception as exc:
|
|
1573
|
+
self.cache.complete_build(project, build_id, error_message=str(exc))
|
|
1574
|
+
|
|
1575
|
+
def cache_build_status(
    self, *, project: 'str | None' = None, build_id: 'str | None' = None
) -> 'Envelope':
    """Report the status of a cache build.

    Looks the build up through ``self.cache.get_build_status`` for the given
    project (or the configured default project). A missing record yields a
    ``not_found`` envelope; otherwise the status record is returned as the
    envelope data, with a progress insight while the build is running.
    """
    target_project = project or self.config.default_project
    status = self.cache.get_build_status(target_project, build_id)

    if not status:
        return Envelope(
            command="cache.build-status",
            status="not_found",
            data={"message": "No cache build record was found."},
            metadata={"project": target_project},
            agent_hints=AgentHints(next_actions=["cache.build"]),
        )

    # A finished build (either outcome) suggests rebuilding; anything else
    # suggests polling the status again.
    if status["status"] in ["failed", "completed"]:
        follow_ups = ["cache.build"]
    else:
        follow_ups = ["cache.build-status"]

    # Progress fields are only read while the build is running.
    if status["status"] == "running":
        progress_notes = [
            f"Build progress: {status['progress_percent']}% ({status['processed_tables']}/{status['total_tables']})"
        ]
    else:
        progress_notes = []

    return Envelope(
        command="cache.build-status",
        status="success",
        data=status,
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=follow_ups, insights=progress_notes),
    )
|
|
1610
|
+
|
|
1611
|
+
def cache_status(self, *, project: 'str | None' = None, schema_name: 'str | None' = None) -> 'Envelope':
    """Summarize the metadata cache for a project.

    Combines ``self.cache.get_cache_stats`` (optionally narrowed to one
    schema) with the list from ``self.cache.get_schemas``. Suggests building
    the cache when the stats report zero tables.
    """
    target_project = project or self.config.default_project
    stats = self.cache.get_cache_stats(target_project, schema_name)
    schemas = self.cache.get_schemas(target_project)

    payload = dict(stats)
    payload["schemas"] = schemas

    if stats["table_count"] == 0:
        follow_ups = ["cache.build"]
    else:
        follow_ups = ["meta.search"]

    return Envelope(
        command="cache.status",
        status="success",
        data=payload,
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=follow_ups),
    )
|
|
1630
|
+
|
|
1631
|
+
def cache_clear(self, *, project: 'str | None' = None, schema_name: 'str | None' = None) -> 'Envelope':
    """Clear the metadata cache for a project.

    Calls ``self.cache.clear_table_cache`` for the given project (or the
    configured default project) and reports its return value as
    ``deleted_tables``.
    """
    target_project = project or self.config.default_project
    removed = self.cache.clear_table_cache(target_project, schema_name)
    hints = AgentHints(next_actions=["cache.build"])
    return Envelope(
        command="cache.clear",
        status="success",
        data={"deleted_tables": removed},
        metadata={"project": target_project},
        agent_hints=hints,
    )
|
|
1643
|
+
|
|
1644
|
+
def cache_save_semantic(
    self,
    *,
    table_name: 'str',
    semantic_desc: 'str',
    use_cases: 'list[str]',
    sample_questions: 'list[str]',
    column_semantics: 'list[dict]',
    project: 'str | None' = None,
    schema_name: 'str' = "default",
) -> 'Envelope':
    """Persist AI-generated semantic metadata for a table (NL2SQL support).

    Forwards every field to ``self.cache.save_semantic`` and returns an
    envelope that echoes the description plus the counts of the supplied
    use cases, sample questions and column semantics.
    """
    target_project = project or self.config.default_project

    self.cache.save_semantic(
        project=target_project,
        table_name=table_name,
        semantic_desc=semantic_desc,
        use_cases=use_cases,
        sample_questions=sample_questions,
        column_semantics=column_semantics,
        schema_name=schema_name,
    )

    summary = {
        "table_name": table_name,
        "schema_name": schema_name,
        "semantic_desc": semantic_desc,
        "use_cases_count": len(use_cases),
        "sample_questions_count": len(sample_questions),
        "column_semantics_count": len(column_semantics),
    }
    hints = AgentHints(
        next_actions=["meta.search", "cache.get-semantic"],
        insights=["Semantic metadata has been saved and can now support NL2SQL workflows."],
    )
    return Envelope(
        command="cache.save-semantic",
        status="success",
        data=summary,
        metadata={"project": target_project},
        agent_hints=hints,
    )
|
|
1684
|
+
|
|
1685
|
+
def cache_get_semantic(
    self, *, table_name: 'str', project: 'str | None' = None, schema_name: 'str' = "default"
) -> 'Envelope':
    """Fetch stored semantic metadata for a table.

    Returns a ``success`` envelope whose data merges the table identity with
    the record from ``self.cache.get_semantic``, or a ``not_found`` envelope
    with a warning when no record exists.
    """
    target_project = project or self.config.default_project
    semantic = self.cache.get_semantic(target_project, table_name, schema_name)
    identity = {"table_name": table_name, "schema_name": schema_name}

    if not semantic:
        return Envelope(
            command="cache.get-semantic",
            status="not_found",
            data=identity,
            metadata={"project": target_project},
            agent_hints=AgentHints(
                next_actions=["cache.save-semantic"],
                warnings=["No semantic metadata was found. Run the semantic-index skill first."],
            ),
        )

    # Keys from the stored record are applied after the identity keys, so a
    # stored key of the same name takes precedence.
    return Envelope(
        command="cache.get-semantic",
        status="success",
        data={**identity, **semantic},
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=["query"]),
    )
|
|
1711
|
+
|
|
1712
|
+
def meta_partitions(self, table_name: 'str') -> 'Envelope':
    """Return the partitions of a table as reported by the backend.

    Describes the table through ``self.backend.describe_table`` and exposes
    its ``name`` and ``partitions`` attributes; the command is also logged.
    """
    described = self.backend.describe_table(table_name)
    payload = {"table_name": described.name, "partitions": described.partitions}
    envelope = Envelope(
        command="meta.partitions",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=AgentHints(next_actions=["query"]),
    )
    self.log("meta.partitions", envelope.status, envelope.metadata)
    return envelope
|
|
1723
|
+
|
|
1724
|
+
def meta_list_projects(self) -> 'Envelope':
    """List all projects owned by the current user.

    The list comes from ``self.backend.list_projects``; the envelope carries
    it together with its length and guidance on selecting a project.
    """
    projects = self.backend.list_projects()
    project_count = len(projects)

    guidance = [
        f"Discovered {project_count} project(s) owned by the current identity.",
        "Choose a project, then run `maxc session set --project <project_name> --json` before listing schemas or tables.",
    ]
    envelope = Envelope(
        command="meta.list-projects",
        status="success",
        data={"projects": projects, "total": project_count},
        metadata={"backend": "odps"},
        agent_hints=AgentHints(
            next_actions=["session.set", "meta.list-schemas"],
            insights=guidance,
        ),
    )
    self.log("meta.list-projects", envelope.status, envelope.metadata)
    return envelope
|
|
1742
|
+
|
|
1743
|
+
def meta_list_schemas(self, *, project: 'str | None' = None) -> 'Envelope':
    """List all schemas in a project.

    Queries ``self.backend.list_schemas`` for the given project (or the
    configured default project) and keeps only each entry's ``name``. A
    warning is attached when nothing is returned.
    """
    target_project = project or self.config.default_project

    rows = []
    for entry in self.backend.list_schemas(project=target_project):
        rows.append({"name": entry["name"]})

    if rows:
        notices = []
    else:
        notices = ["No schemas were returned. Schema namespaces may not be enabled for this project."]

    envelope = Envelope(
        command="meta.list-schemas",
        status="success",
        data={"schemas": rows, "total": len(rows)},
        metadata={"project": target_project},
        agent_hints=AgentHints(
            next_actions=["meta.list-tables", "meta.search"],
            warnings=notices,
        ),
    )
    self.log("meta.list-schemas", envelope.status, envelope.metadata)
    return envelope
|
|
1760
|
+
|
|
1761
|
+
def session_set(self, project: 'str | None' = None, schema: 'str | None' = None) -> 'Envelope':
    """Set default project and/or schema.

    Saves to session override file (~/.maxc/session_override.yaml) which has
    highest priority (above environment variables).

    Args:
        project: New default project. When a backend is active the project
            is checked with ``get_project_info`` before anything is saved.
        schema: New default schema. An empty string clears a previously
            saved schema; ``None`` leaves the schema untouched.

    Returns:
        A ``session.set`` envelope with the effective project/schema, the
        override file path and the list of applied changes.

    Raises:
        ValidationError: If the backend cannot access ``project``.
    """
    from .config import session_override_path, save_config_mapping, _load_yaml_file

    override_path = session_override_path()
    override = _load_yaml_file(override_path)

    changes: 'list[str]' = []
    warnings: 'list[str]' = []

    if project:
        if self.backend is not None:
            try:
                self.backend.get_project_info(project)
            except Exception as exc:
                raise ValidationError(
                    f"Unable to access project `{project}`: {exc}",
                    suggestion="Verify the project name and that the current identity has access.",
                ) from exc
        else:
            warnings.append(
                "Project override was saved without remote validation because no authenticated backend session is active."
            )
        override["project"] = project
        changes.append(f"project set to `{project}`")
        # Warn if session override project differs from the project saved in auth config
        if self.config.auth.project and project != self.config.auth.project:
            warnings.append(
                f"Session project override (`{project}`) differs from the project saved in auth config "
                f"(`{self.config.auth.project}`). Operations will use `{project}`, but credentials "
                f"were configured for `{self.config.auth.project}`. Run `auth whoami` to verify access."
            )

    if schema:
        override["schema"] = schema
        changes.append(f"schema set to `{schema}`")
    elif schema is not None:  # explicitly cleared with --schema=""
        if "schema" in override:
            del override["schema"]
            changes.append("schema cleared")

    save_config_mapping(override_path, override)

    # Update current config in memory
    if project:
        self.config.default_project = project
        # Update backend project if available
        if self.backend is not None:
            self.backend.project = project
    if schema:
        self.config.default_schema = schema
    elif schema is not None:
        self.config.default_schema = None

    # With nothing to report, the joined change list is empty; use a distinct
    # message instead of emitting "Session override set: ." to the caller.
    if changes:
        summary = f"Session override set: {', '.join(changes)}. Overrides environment variables."
    else:
        summary = "No session override changes were applied."

    envelope = Envelope(
        command="session.set",
        status="success",
        data={
            "project": self.config.default_project,
            "schema": self.config.default_schema,
            "override_path": str(override_path),
            "changes": changes,
        },
        metadata={},
        agent_hints=AgentHints(
            next_actions=["meta.list-tables", "meta.list-schemas", "session.show"],
            insights=[summary],
            warnings=warnings,
        ),
    )
    self.log("session.set", envelope.status, {"changes": changes})
    return envelope
|
|
1837
|
+
|
|
1838
|
+
def session_unset(self) -> 'Envelope':
    """Clear session override, reverting to environment variables and config file.

    Deletes the session override file if it exists. ``cleared`` in the
    returned data is ``["project", "schema"]`` when a file was removed and
    an empty list when there was nothing to remove.
    """
    from .config import session_override_path

    override_path = session_override_path()

    # Attempt the removal directly rather than checking existence first: the
    # file can disappear between an exists() check and unlink(), which would
    # otherwise surface as an unhandled FileNotFoundError.
    try:
        override_path.unlink()
    except FileNotFoundError:
        cleared = []
    else:
        cleared = ["project", "schema"]

    envelope = Envelope(
        command="session.unset",
        status="success",
        data={
            "cleared": cleared,
            "override_path": str(override_path),
        },
        metadata={},
        agent_hints=AgentHints(
            next_actions=["session.show", "session.set"],
            insights=["Session override cleared. Configuration will use environment variables and config file."],
        ),
    )
    self.log("session.unset", envelope.status, {})
    return envelope
|
|
1864
|
+
|
|
1865
|
+
def session_show(self) -> 'Envelope':
    """Show current session settings with source information.

    Reports the effective project and schema from ``self.config`` and labels
    where each came from: the session override file, an environment variable
    (project only: ``MAXCOMPUTE_PROJECT`` or ``ODPS_PROJECT``), or the config
    file. When a backend is active, project info is fetched and stringified;
    a fetch failure becomes a warning rather than an error.
    """
    import os

    from .config import session_override_path, default_global_config_path, _load_yaml_file

    override_path = session_override_path()
    config_path = default_global_config_path()
    override = _load_yaml_file(override_path)

    # Project source: override file wins, then environment, then config file.
    env_project = os.environ.get("MAXCOMPUTE_PROJECT") or os.environ.get("ODPS_PROJECT")
    project_source = "config_file"
    if override.get("project"):
        project_source = "session_override"
    elif env_project:
        project_source = "environment"

    # Schema source: only the override file and config file are considered.
    schema_source = "session_override" if override.get("schema") else "config_file"

    project_info = None
    notices = []
    if self.backend is not None:
        try:
            raw_info = self.backend.get_project_info(self.config.default_project)
            project_info = {
                key: (None if value is None else str(value))
                for key, value in raw_info.items()
            }
        except Exception:
            notices.append("Could not fetch project info from backend")

    payload = {
        "project": {
            "value": self.config.default_project,
            "source": project_source,
        },
        "schema": {
            "value": self.config.default_schema,
            "source": schema_source,
        },
        "override_path": str(override_path),
        "config_path": str(config_path) if config_path.exists() else None,
        "project_info": project_info,
        "config_sources": [str(source) for source in self.config.sources],
    }
    hints = AgentHints(
        next_actions=["session.set", "session.unset", "meta.list-tables"],
        warnings=notices,
        insights=[
            f"Project `{self.config.default_project}` from {project_source}",
            f"Schema `{self.config.default_schema or 'default'}` from {schema_source}",
        ],
    )
    envelope = Envelope(
        command="session.show",
        status="success",
        data=payload,
        metadata={},
        agent_hints=hints,
    )
    self.log("session.show", envelope.status, {})
    return envelope
|
|
1928
|
+
|
|
1929
|
+
def data_sample(
    self,
    table_name: 'str',
    rows: 'int' = 5,
    *,
    partition: 'str | None' = None,
    columns: 'list[str] | None' = None,
) -> 'Envelope':
    """Fetch a small sample of rows from a table.

    Rejects non-positive ``rows`` with a ``ValidationError``, then delegates
    to ``self.backend.sample_table``. The envelope data carries the sampled
    rows plus the schema, applied partition and selected columns reported by
    the backend; the metadata echoes what was requested.
    """
    if rows <= 0:
        raise ValidationError("`--rows` must be greater than 0.")

    table, sample_rows, sample_info = self.backend.sample_table(
        table_name,
        rows,
        partition=partition,
        columns=columns,
    )

    result = {
        "table_name": table.name,
        "rows": sample_rows,
        "returned_rows": len(sample_rows),
        "schema": sample_info["schema"],
        "applied_partition": sample_info["applied_partition"],
        "selected_columns": sample_info["selected_columns"],
    }
    request_echo = {
        "project": self.config.default_project,
        "requested_rows": rows,
        "requested_partition": partition,
        "requested_columns": columns or [],
    }
    envelope = Envelope(
        command="data.sample",
        status="success",
        data=result,
        metadata=request_echo,
        agent_hints=AgentHints(next_actions=["data.profile", "query"]),
    )
    self.log("data.sample", envelope.status, envelope.metadata)
    return envelope
|
|
1966
|
+
|
|
1967
|
+
def data_profile(self, table_name: 'str', *, partition: 'str | None' = None) -> 'Envelope':
    """Profile a table through the backend.

    The result of ``self.backend.profile_table`` is returned unchanged as
    the envelope data; the requested partition is echoed in the metadata.
    """
    profile = self.backend.profile_table(table_name, partition=partition)
    request_echo = {"project": self.config.default_project, "requested_partition": partition}
    envelope = Envelope(
        command="data.profile",
        status="success",
        data=profile,
        metadata=request_echo,
        agent_hints=AgentHints(next_actions=["query"]),
    )
    self.log("data.profile", envelope.status, envelope.metadata)
    return envelope
|
|
1978
|
+
|
|
1979
|
+
def auth_login(
    self,
    *,
    access_id: 'str | None' = None,
    secret_access_key: 'str | None' = None,
    security_token: 'str | None' = None,
    project: 'str | None' = None,
    endpoint: 'str | None' = None,
    region_name: 'str | None' = None,
    tunnel_endpoint: 'str | None' = None,
    from_env: 'bool' = False,
    no_validate: 'bool' = False,
    target_config_path: 'Path | None' = None,
) -> 'Envelope':
    """Resolve access-key/STS credentials, save them, and report the result.

    Each auth field is resolved through ``self._resolve_login_value`` from
    the explicit argument, the environment settings and the auth section of
    the existing config file. The provider becomes ``sts_token`` when a
    security token is present and ``access_key`` otherwise. With
    ``no_validate`` only the shape of the config is checked before saving;
    otherwise the connection is resolved before saving and validated
    afterwards. The session override is cleared via
    ``self._clear_session_override`` after the config is persisted.
    """
    target_path = target_config_path or default_global_config_path()
    existing_payload = load_config_mapping(target_path) if target_path.exists() else {}
    existing_auth = AuthConfig.from_mapping(existing_payload.get("auth", {}) or {})
    env_settings = load_odps_env()

    # (field name, explicit value, prompt label, required, secret), listed in
    # the order the fields are resolved.
    field_specs = (
        ("access_id", access_id, "Access Key ID", True, False),
        ("secret_access_key", secret_access_key, "Access Key Secret", True, True),
        ("security_token", security_token, "STS Security Token (optional)", False, True),
        ("project", project, "MaxCompute Project", True, False),
        ("endpoint", endpoint, "MaxCompute Endpoint", True, False),
        ("region_name", region_name, "MaxCompute Region (optional)", False, False),
        ("tunnel_endpoint", tunnel_endpoint, "MaxCompute Tunnel Endpoint (optional)", False, False),
    )
    resolved_fields = {}
    for field_name, explicit_value, label, is_required, is_secret in field_specs:
        resolved_fields[field_name] = self._resolve_login_value(
            provided=explicit_value,
            env_value=env_settings.get(field_name),
            existing_value=getattr(existing_auth, field_name),
            prompt=label,
            required=is_required,
            secret=is_secret,
            use_env=from_env,
        )
    resolved_auth = AuthConfig(**resolved_fields)
    resolved_auth.provider = "sts_token" if resolved_auth.security_token else "access_key"

    if no_validate:
        self._validate_auth_config_shape(resolved_auth)
    else:
        resolve_auth_connection(self.config, auth_override=resolved_auth)

    persist_login_config(
        target_path,
        auth=resolved_auth,
    )

    warnings: 'list[str]' = []
    self._clear_session_override(warnings)

    env_fields_to_check = ("access_id", "secret_access_key", "security_token", "project", "endpoint")
    if from_env:
        warnings.append(
            "Credentials were imported from environment variables and saved to config."
        )
    elif any(env_settings.get(name) for name in env_fields_to_check):
        warnings.append(
            "MaxCompute-related environment variables are set in the current shell; they may override the config you just saved."
        )

    if not no_validate:
        payload, validate_warnings = self._validate_auth_config(resolved_auth)
        payload["saved"] = True
        payload["validated"] = True
        warnings.extend(validate_warnings)
    else:
        masked_principal = mask_access_id(resolved_auth.access_id)
        payload = {
            "authenticated": None,
            "configured": True,
            "validation_status": "configuration_only",
            "backend": "odps",
            "auth_type": resolved_auth.provider,
            "identity_source": "config_file",
            "principal_display": masked_principal,
            "principal_masked": masked_principal,
            "project": resolved_auth.project,
            "region": resolved_auth.region_name,
            "endpoint": resolved_auth.endpoint,
            "project_owner": None,
            "allowed_operations": self.config.allowed_operations,
            "saved": True,
            "validated": False,
        }
        if resolved_auth.security_token:
            payload["token_expires_at"] = resolved_auth.token_expires_at
        warnings.append("Authentication settings were saved without remote validation.")

    envelope = Envelope(
        command="auth.login",
        status="success",
        data=payload,
        metadata={
            "config_path": str(target_path),
            "written_fields": sorted(resolved_auth.to_mapping().keys()),
            "auth_storage": "config_file",
        },
        agent_hints=AgentHints(
            next_actions=["auth.whoami", "meta.list-tables"],
            warnings=warnings,
        ),
    )
    self.log("auth.login", envelope.status, envelope.metadata)
    return envelope
|
|
2131
|
+
|
|
2132
|
+
def auth_login_ncs(
    self,
    *,
    account_type: 'str | None' = None,
    employee_id: 'str | None' = None,
    account_name: 'str | None' = None,
    app_name: 'str | None' = None,
    project: 'str | None' = None,
    endpoint: 'str | None' = None,
    region_name: 'str | None' = None,
    tunnel_endpoint: 'str | None' = None,
    interactive: 'bool' = False,
    list_accounts_mode: 'bool' = False,
    no_validate: 'bool' = False,
    target_config_path: 'Path | None' = None,
) -> 'Envelope':
    """Configure ncs-based authentication, or list available ncs accounts.

    In ``list_accounts_mode`` the result of ``list_ncs_accounts`` is
    returned and nothing is saved. Otherwise missing values are prompted for
    (when ``interactive``) or taken from the auth section of the existing
    config file, an ncs auth config is built and persisted, and the session
    override is cleared via ``self._clear_session_override``. With
    ``no_validate`` only the shape of the config is checked before saving.

    Raises:
        ValidationError: If no account type can be determined.
    """
    target_path = target_config_path or default_global_config_path()
    existing_payload = load_config_mapping(target_path) if target_path.exists() else {}
    existing_auth = AuthConfig.from_mapping(existing_payload.get("auth", {}) or {})

    if list_accounts_mode:
        normalized_type = account_type or existing_auth.ncs.account_type or "user"
        listing = Envelope(
            command="auth.login-ncs",
            status="success",
            data=list_ncs_accounts(normalized_type),
            metadata={"config_path": str(target_path)},
            agent_hints=AgentHints(next_actions=["auth.login-ncs"]),
        )
        self.log("auth.login-ncs", listing.status, listing.metadata)
        return listing

    if interactive:
        # Prompt only for values that were not supplied; the previously
        # saved value is offered as the default for each prompt.
        if not account_type:
            account_type = self._prompt_text(
                "ncs account type (user/account/app)",
                default=existing_auth.ncs.account_type,
            )
        normalized_type = (account_type or "").strip().lower()
        if normalized_type == "user":
            if not employee_id:
                employee_id = self._prompt_text(
                    "Employee ID", default=existing_auth.ncs.employee_id
                )
        elif normalized_type == "account":
            if not account_name:
                account_name = self._prompt_text(
                    "Account name", default=existing_auth.ncs.account_name
                )
        elif normalized_type == "app":
            if not app_name:
                app_name = self._prompt_text(
                    "App name", default=existing_auth.ncs.app_name
                )
        if not project:
            project = self._prompt_text(
                "MaxCompute Project", default=existing_auth.project
            )
        if not endpoint:
            endpoint = self._prompt_text(
                "MaxCompute Endpoint", default=existing_auth.endpoint
            )
        if not region_name:
            region_name = self._prompt_text(
                "MaxCompute Region (optional)",
                required=False,
                default=existing_auth.region_name,
            )
        if not tunnel_endpoint:
            tunnel_endpoint = self._prompt_text(
                "MaxCompute Tunnel Endpoint (optional)",
                required=False,
                default=existing_auth.tunnel_endpoint,
            )

    normalized_type = (account_type or existing_auth.ncs.account_type or "").strip().lower() or None
    if not normalized_type:
        raise ValidationError(
            "account_type is required for ncs authentication.",
            suggestion=(
                "Specify --account-type user, account, or app. "
                "Run `auth login-ncs --list-accounts --account-type user` to list available accounts."
            ),
        )

    ncs_config = build_ncs_auth_config(
        account_type=normalized_type,
        employee_id=employee_id or existing_auth.ncs.employee_id,
        account_name=account_name or existing_auth.ncs.account_name,
        app_name=app_name or existing_auth.ncs.app_name,
        process_timeout=existing_auth.ncs.process_timeout,
    )
    resolved_auth = AuthConfig(
        provider="ncs",
        project=project or existing_auth.project,
        endpoint=endpoint or existing_auth.endpoint,
        region_name=region_name or existing_auth.region_name,
        tunnel_endpoint=tunnel_endpoint or existing_auth.tunnel_endpoint,
        ncs=ncs_config,
    )

    if no_validate:
        self._validate_auth_config_shape(resolved_auth)
    else:
        resolve_auth_connection(self.config, auth_override=resolved_auth)

    persist_login_config(
        target_path,
        auth=resolved_auth,
    )

    warnings: 'list[str]' = []
    self._clear_session_override(warnings)

    env_settings = load_odps_env()
    overriding_env_fields = []
    for name in ("project", "endpoint"):
        if env_settings.get(name):
            overriding_env_fields.append(name)
    if overriding_env_fields:
        warnings.append(
            f"Environment variable(s) for {', '.join(overriding_env_fields)} are set and will override "
            f"the values you just saved at runtime. Unset them or they will take precedence over this ncs config."
        )

    if not no_validate:
        payload, validate_warnings = self._validate_auth_config(resolved_auth)
        payload["saved"] = True
        payload["validated"] = True
        warnings.extend(validate_warnings)
    else:
        payload = {
            "authenticated": None,
            "configured": True,
            "validation_status": "configuration_only",
            "backend": "odps",
            "auth_type": "ncs",
            "identity_source": "config_file",
            "principal_display": None,
            "principal_masked": None,
            "project": resolved_auth.project,
            "region": resolved_auth.region_name,
            "endpoint": resolved_auth.endpoint,
            "project_owner": None,
            "allowed_operations": self.config.allowed_operations,
            "saved": True,
            "validated": False,
            "ncs": {
                "account_type": resolved_auth.ncs.account_type,
                "process_command": resolved_auth.ncs.process_command,
            },
        }
        warnings.append("ncs authentication settings were saved without remote validation.")

    envelope = Envelope(
        command="auth.login-ncs",
        status="success",
        data=payload,
        metadata={
            "config_path": str(target_path),
            "written_fields": sorted(resolved_auth.to_mapping().keys()),
            "auth_storage": "config_file",
        },
        agent_hints=AgentHints(
            next_actions=["auth.whoami", "meta.list-tables"],
            warnings=warnings,
        ),
    )
    self.log("auth.login-ncs", envelope.status, envelope.metadata)
    return envelope
|
|
2292
|
+
|
|
2293
|
+
def auth_whoami(self) -> 'Envelope':
    """Describe the identity behind the currently configured credentials.

    When no backend is attached yet, one is built from ``self.config``; a
    ``ValidationError`` raised while doing so produces the "unauthenticated"
    envelope. A ``MaxCError`` raised by ``backend.whoami_info`` produces the
    "validation failed" envelope. Every outcome is written to the audit log
    under the ``auth.whoami`` command before being returned.
    """

    def ignored_env_notice(names):
        # Explains why environment variables present in the process were not used.
        return (
            f"{len(names)} environment variable(s) are set but ignored because an explicit "
            f"auth provider is configured ({', '.join(names)}). "
            f"To use environment variables, run `auth login --from-env` or unset the auth provider in config."
        )

    def finish(result):
        # Audit-log the outcome and hand it back to the caller.
        self.log("auth.whoami", result.status, result.metadata)
        return result

    if self.backend is None:
        try:
            self.backend = OdpsBackend(self.config)
            self.remote_jobs = getattr(self.backend, "supports_remote_jobs", False)
        except ValidationError as exc:
            return finish(self._unauthenticated_whoami_envelope(warnings=[exc.message]))

    try:
        payload, warnings = self.backend.whoami_info(project=self.config.default_project)
    except MaxCError as exc:
        resolved_auth = getattr(self.backend, "resolved_auth", None)
        ignored = getattr(resolved_auth, "suppressed_env_vars", [])
        failure_warnings = [exc.message]
        if ignored:
            failure_warnings.append(ignored_env_notice(ignored))
        return finish(
            self._whoami_validation_failed_envelope(
                settings=getattr(self.backend, "settings", {}),
                auth_type=getattr(resolved_auth, "auth_type", "access_key"),
                identity_source=getattr(resolved_auth, "identity_source", "unknown"),
                warnings=failure_warnings,
            )
        )

    ignored = getattr(getattr(self.backend, "resolved_auth", None), "suppressed_env_vars", [])
    if ignored:
        # Copy before extending so the backend's own warning list is left untouched.
        warnings = list(warnings) + [ignored_env_notice(ignored)]

    hints = AgentHints(
        next_actions=["auth.can-i", "meta.list-tables"],
        warnings=warnings,
    )
    return finish(
        Envelope(
            command="auth.whoami",
            status="success",
            data=payload,
            metadata={
                "project": self.config.default_project,
                "config_sources": [str(p) for p in self.config.sources],
            },
            agent_hints=hints,
        )
    )
|
|
2345
|
+
|
|
2346
|
+
def auth_can_i(
    self,
    *,
    table_name: 'str',
    operation: 'str',
    project: 'str | None' = None,
) -> 'Envelope':
    """Ask the backend whether ``operation`` is permitted on ``table_name``.

    ``project`` falls back to the configured default project when omitted
    or empty. The suggested follow-up actions depend on the ``allowed``
    entry of the backend's payload. The result is audit-logged as
    ``auth.can-i``.
    """
    effective_project = project or self.config.default_project
    payload, warnings = self.backend.can_i_info(
        table_name=table_name,
        operation=operation,
        project=effective_project,
    )

    if payload.get("allowed"):
        follow_ups = ["query.cost", "query.explain"]
    else:
        follow_ups = ["auth.whoami", "meta.describe"]

    result = Envelope(
        command="auth.can-i",
        status="success",
        data=payload,
        metadata={"project": effective_project},
        agent_hints=AgentHints(next_actions=follow_ups, warnings=warnings),
    )
    self.log("auth.can-i", result.status, result.metadata)
    return result
|
|
2369
|
+
|
|
2370
|
+
def _validate_auth_config(
    self,
    auth: 'AuthConfig',
) -> 'tuple[dict[str, Any], list[str]]':
    """Resolve ``auth`` into a client and build the identity payload for it.

    The connection is resolved with ``auth`` overriding the auth section of
    ``self.config``. The project owner is looked up on a best-effort basis:
    any exception raised while fetching the project leaves the owner as
    ``None`` instead of failing the validation.

    Returns whatever ``build_odps_identity_payload`` returns (annotated as a
    ``(payload, warnings)`` pair).
    """
    connection = resolve_auth_connection(self.config, auth_override=auth)
    odps_client = connection.create_client()

    try:
        owner = getattr(odps_client.get_project(connection.project), "owner", None)
    except Exception:
        # Owner lookup is optional; swallow the failure and report no owner.
        owner = None

    return build_odps_identity_payload(
        client=odps_client,
        settings=connection.settings,
        allowed_operations=self.config.allowed_operations,
        identity_source=connection.identity_source,
        auth_type=connection.auth_type,
        token_expires_at=connection.token_expires_at,
        project=connection.project,
        owner_display_name=owner,
    )
|
|
2394
|
+
|
|
2395
|
+
def _unauthenticated_whoami_envelope(
    self,
    *,
    warnings: 'list[str] | None' = None,
) -> 'Envelope':
    """Build the ``auth.whoami`` envelope used when no credentials are usable.

    The payload marks the session as neither authenticated nor configured
    and lists the available login options. ``warnings`` replaces the default
    "no credentials" warning when it is a non-empty list.
    """
    config = self.config
    effective_warnings = warnings or ["No active MaxCompute credentials are configured."]

    data = {
        "authenticated": False,
        "configured": False,
        "validation_status": "missing_configuration",
        "backend": "odps",
        "auth_type": None,
        "identity_source": "unknown",
        "principal_display": None,
        "principal_masked": None,
        # Empty project/region values are normalised to None.
        "project": config.default_project or None,
        "region": config.default_region or None,
        "endpoint": config.auth.endpoint,
        "project_owner": None,
        "allowed_operations": config.allowed_operations,
        "auth_options": build_auth_options(default_global_config_path()),
    }
    meta = {
        "project": config.default_project,
        "config_sources": [str(p) for p in config.sources],
    }
    hints = AgentHints(
        next_actions=["auth.login", "auth.login-ncs"],
        warnings=effective_warnings,
    )
    return Envelope(
        command="auth.whoami",
        status="success",
        data=data,
        metadata=meta,
        agent_hints=hints,
    )
|
|
2429
|
+
|
|
2430
|
+
def _resolve_login_value(
|
|
2431
|
+
self,
|
|
2432
|
+
*,
|
|
2433
|
+
provided: 'str | None',
|
|
2434
|
+
env_value: 'str | None',
|
|
2435
|
+
existing_value: 'str | None',
|
|
2436
|
+
prompt: 'str',
|
|
2437
|
+
required: 'bool',
|
|
2438
|
+
secret: 'bool',
|
|
2439
|
+
use_env: 'bool',
|
|
2440
|
+
) -> 'str | None':
|
|
2441
|
+
if provided is not None and provided.strip():
|
|
2442
|
+
return provided.strip()
|
|
2443
|
+
if use_env and env_value:
|
|
2444
|
+
return env_value.strip()
|
|
2445
|
+
if use_env and required and not env_value:
|
|
2446
|
+
raise ValidationError(
|
|
2447
|
+
f"--from-env was specified but the environment variable for '{prompt}' is not set.",
|
|
2448
|
+
suggestion="Set the required environment variables (ALIBABA_CLOUD_ACCESS_KEY_ID, "
|
|
2449
|
+
"ALIBABA_CLOUD_ACCESS_KEY_SECRET, MAXCOMPUTE_PROJECT, MAXCOMPUTE_ENDPOINT) "
|
|
2450
|
+
"or provide the values as CLI flags.",
|
|
2451
|
+
)
|
|
2452
|
+
if existing_value:
|
|
2453
|
+
return existing_value.strip()
|
|
2454
|
+
if not required:
|
|
2455
|
+
return None
|
|
2456
|
+
if not sys.stdin.isatty():
|
|
2457
|
+
return None
|
|
2458
|
+
|
|
2459
|
+
if secret:
|
|
2460
|
+
value = getpass.getpass(f"{prompt}: ").strip()
|
|
2461
|
+
else:
|
|
2462
|
+
value = input(f"{prompt}: ").strip()
|
|
2463
|
+
return value or None
|
|
2464
|
+
|
|
2465
|
+
def _prompt_text(
|
|
2466
|
+
self,
|
|
2467
|
+
prompt: 'str',
|
|
2468
|
+
*,
|
|
2469
|
+
required: 'bool' = True,
|
|
2470
|
+
default: 'str | None' = None,
|
|
2471
|
+
) -> 'str | None':
|
|
2472
|
+
if not sys.stdin.isatty():
|
|
2473
|
+
return default
|
|
2474
|
+
display_prompt = f"{prompt} [current: {default}]" if default else prompt
|
|
2475
|
+
value = input(f"{display_prompt}: ").strip()
|
|
2476
|
+
if value:
|
|
2477
|
+
return value
|
|
2478
|
+
if default:
|
|
2479
|
+
return default
|
|
2480
|
+
if required:
|
|
2481
|
+
raise ValidationError(f"{prompt} is required.")
|
|
2482
|
+
return None
|
|
2483
|
+
|
|
2484
|
+
def schema_diff(self, left_table: 'str', right_table: 'str') -> 'Envelope':
    """Compare the schemas of two tables and wrap the diff in an envelope.

    Both tables are described through the backend (left first) and the
    resulting definitions are diffed by ``build_schema_diff_payload``. The
    first suggested action is ``query.explain`` when the payload reports the
    schemas as compatible, otherwise ``meta.describe``. The result is
    audit-logged as ``diff.schema``.
    """
    left_definition = self.backend.describe_table(left_table)
    right_definition = self.backend.describe_table(right_table)
    diff = build_schema_diff_payload(left_definition, right_definition)

    lead_action = "query.explain" if diff.get("compatible") else "meta.describe"
    result = Envelope(
        command="diff.schema",
        status="success",
        data=diff,
        metadata={"project": self.config.default_project},
        agent_hints=AgentHints(next_actions=[lead_action, "auth.can-i"]),
    )
    self.log("diff.schema", result.status, result.metadata)
    return result
|
|
2500
|
+
|
|
2501
|
+
def partition_diff(self, left_table: 'str', right_table: 'str') -> 'Envelope':
    """Compare the partitions of two tables and wrap the diff in an envelope.

    Both tables are described through the backend (left first) and handed to
    ``build_partition_diff_payload``. The result is audit-logged as
    ``diff.partition``.
    """
    left_definition = self.backend.describe_table(left_table)
    right_definition = self.backend.describe_table(right_table)

    hints = AgentHints(next_actions=["meta.partitions", "meta.latest-partition"])
    result = Envelope(
        command="diff.partition",
        status="success",
        data=build_partition_diff_payload(left_definition, right_definition),
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("diff.partition", result.status, result.metadata)
    return result
|
|
2514
|
+
|
|
2515
|
+
def data_diff(
    self,
    left_table: 'str',
    right_table: 'str',
    *,
    keys: 'list[str]',
    columns: 'list[str] | None' = None,
    rows: 'int' = 100,
    partition: 'str | None' = None,
    left_partition: 'str | None' = None,
    right_partition: 'str | None' = None,
) -> 'Envelope':
    """Compare sampled rows of two tables, matched on ``keys``.

    Up to ``rows`` rows are sampled from each table (left first), restricted
    to the key columns plus the columns selected for comparison. A
    side-specific partition (``left_partition`` / ``right_partition``) takes
    precedence over the shared ``partition``. The result is audit-logged as
    ``diff.data``.

    Raises:
        ValidationError: ``rows`` is not positive, or ``keys`` is empty.
    """
    if rows <= 0:
        raise ValidationError("`--rows` must be greater than 0.")
    if not keys:
        raise ValidationError("`--keys` requires at least one column.")

    left_definition = self.backend.describe_table(left_table)
    right_definition = self.backend.describe_table(right_table)
    compared = resolve_data_diff_columns(
        left_definition,
        right_definition,
        keys=keys,
        requested_columns=columns,
    )
    # Keys come first; duplicates between keys and compared columns are dropped.
    projection = dedupe_preserve_order(list(keys) + list(compared))

    left_part = left_partition or partition
    right_part = right_partition or partition

    _left_head, left_sample, _left_tail = self.backend.sample_table(
        left_table,
        rows,
        partition=left_part,
        columns=projection,
    )
    _right_head, right_sample, _right_tail = self.backend.sample_table(
        right_table,
        rows,
        partition=right_part,
        columns=projection,
    )

    diff, diff_warnings = build_data_diff_payload(
        left=left_definition,
        right=right_definition,
        left_rows=left_sample,
        right_rows=right_sample,
        keys=keys,
        compared_columns=compared,
        rows_limit=rows,
        left_partition=left_part,
        right_partition=right_part,
    )

    request_summary = {
        "project": self.config.default_project,
        "requested_rows": rows,
        "requested_columns": columns or [],
        "requested_keys": keys,
        "requested_partition": partition,
        "left_requested_partition": left_part,
        "right_requested_partition": right_part,
    }
    result = Envelope(
        command="diff.data",
        status="success",
        data=diff,
        metadata=request_summary,
        agent_hints=AgentHints(
            next_actions=["data.profile", "diff.schema", "meta.describe"],
            warnings=diff_warnings,
        ),
    )
    self.log("diff.data", result.status, result.metadata)
    return result
|
|
2587
|
+
|
|
2588
|
+
def agent_context(self) -> 'Envelope':
    """Summarise the configured project for an agent, without listing tables.

    The payload is built purely from ``self.config``; no backend call is
    made. ``job_mode`` is ``"remote"`` when remote jobs are enabled,
    ``"local"`` when a backend exists without remote jobs, and ``"unknown"``
    when no backend has been created. The result is audit-logged as
    ``agent.context``.
    """
    config = self.config

    if self.remote_jobs:
        job_mode = "remote"
    elif self.backend is not None:
        job_mode = "local"
    else:
        job_mode = "unknown"

    context = {
        "project": config.default_project,
        "region": config.default_region,
        "backend": "odps",
        "project_context": config.project_context,
        "allowed_operations": config.allowed_operations,
        "cost_threshold_cu": config.cost_threshold_cu,
        "sensitive_columns": config.sensitive_columns,
    }
    meta = {
        "config_sources": [str(path) for path in config.sources],
        "state_dir": str(config.state_dir),
        "job_mode": job_mode,
    }
    hints = AgentHints(
        next_actions=["meta.search", "meta.list-tables"],
        insights=["Use `meta list-tables` to enumerate tables and `meta search` to locate relevant datasets."],
    )

    result = Envelope(
        command="agent.context",
        status="success",
        data=context,
        metadata=meta,
        agent_hints=hints,
    )
    self.log("agent.context", result.status, result.metadata)
    return result
|
|
2614
|
+
|
|
2615
|
+
def feature_unavailable(self, command: 'str', message: 'str') -> 'Envelope':
    """Signal that a requested feature is not implemented.

    Always raises; the ``Envelope`` return annotation is never satisfied.
    ``command`` is accepted but not used by this method.

    Raises:
        FeatureUnavailableError: always, carrying ``message``.
    """
    help_hint = "Run `maxc --help` to inspect the currently supported commands."
    raise FeatureUnavailableError(message, suggestion=help_hint)
|
|
2620
|
+
|
|
2621
|
+
def log(
    self,
    command: 'str',
    status: 'str',
    metadata: 'dict[str, Any] | None' = None,
    *,
    error: 'dict[str, Any] | None' = None,
) -> 'None':
    """Append one record to the audit log, creating the logger on first use.

    The record holds ``command``, ``status``, ``metadata`` (an empty dict
    when omitted or empty) and ``error``. An ``OSError`` from creating or
    writing the audit log is ignored so that an unavailable audit path never
    fails the command itself.
    """
    record = {
        "command": command,
        "status": status,
        "metadata": metadata or {},
        "error": error,
    }
    try:
        if self._audit is None:
            self._audit = AuditLogger(self._audit_path)
        self._audit.log(record)
    except OSError:
        # Auditing is best-effort by design.
        pass
|
|
2643
|
+
|
|
2644
|
+
def _submit_remote_job(
|
|
2645
|
+
self,
|
|
2646
|
+
*,
|
|
2647
|
+
sql: 'str',
|
|
2648
|
+
project: 'str',
|
|
2649
|
+
cost_check: 'float | None',
|
|
2650
|
+
idempotency_key: 'str | None',
|
|
2651
|
+
) -> 'JobInfo':
|
|
2652
|
+
if cost_check is not None:
|
|
2653
|
+
raise FeatureUnavailableError(
|
|
2654
|
+
"The real MaxCompute backend does not yet support CU-based `--cost-check` validation.",
|
|
2655
|
+
suggestion="Run `--dry-run` first to inspect SQLCost metadata, or remove `--cost-check`.",
|
|
2656
|
+
)
|
|
2657
|
+
return self.backend.submit_query(
|
|
2658
|
+
sql,
|
|
2659
|
+
project=project,
|
|
2660
|
+
idempotency_key=idempotency_key,
|
|
2661
|
+
)
|
|
2662
|
+
|
|
2663
|
+
def _execute_query(
|
|
2664
|
+
self,
|
|
2665
|
+
*,
|
|
2666
|
+
sql: 'str',
|
|
2667
|
+
project: 'str',
|
|
2668
|
+
max_rows: 'int',
|
|
2669
|
+
offset: 'int',
|
|
2670
|
+
dry_run: 'bool',
|
|
2671
|
+
cost_check: 'float | None',
|
|
2672
|
+
retry_on: 'list[str]',
|
|
2673
|
+
max_retries: 'int',
|
|
2674
|
+
strict_cost_check: 'bool',
|
|
2675
|
+
timeout: 'int | None' = None,
|
|
2676
|
+
) -> 'QueryResult':
|
|
2677
|
+
if sql.startswith("@natural"):
|
|
2678
|
+
raise FeatureUnavailableError(
|
|
2679
|
+
"`@natural` is a roadmap feature and is not available in the current MVP.",
|
|
2680
|
+
suggestion="Use `maxc meta search` or `maxc meta describe` to inspect tables, then submit plain SQL.",
|
|
2681
|
+
)
|
|
2682
|
+
|
|
2683
|
+
attempts = 0
|
|
2684
|
+
while True:
|
|
2685
|
+
try:
|
|
2686
|
+
if cost_check is not None and strict_cost_check and not self.backend.supports_cost_check:
|
|
2687
|
+
raise FeatureUnavailableError(
|
|
2688
|
+
"The current backend does not provide CU-based cost validation.",
|
|
2689
|
+
suggestion="Remove `--cost-check`, or use `--dry-run` to inspect SQLCost metadata.",
|
|
2690
|
+
)
|
|
2691
|
+
|
|
2692
|
+
result = self.backend.execute_query(
|
|
2693
|
+
sql,
|
|
2694
|
+
project=project,
|
|
2695
|
+
max_rows=max_rows,
|
|
2696
|
+
dry_run=dry_run,
|
|
2697
|
+
offset=offset,
|
|
2698
|
+
timeout=timeout,
|
|
2699
|
+
)
|
|
2700
|
+
return result
|
|
2701
|
+
except MaxCError as exc:
|
|
2702
|
+
attempts += 1
|
|
2703
|
+
can_retry = (
|
|
2704
|
+
attempts <= max_retries
|
|
2705
|
+
and exc.recoverable
|
|
2706
|
+
and exc.error_code in retry_on
|
|
2707
|
+
)
|
|
2708
|
+
if not can_retry:
|
|
2709
|
+
raise
|
|
2710
|
+
|
|
2711
|
+
def _analyze_query(
|
|
2712
|
+
self,
|
|
2713
|
+
*,
|
|
2714
|
+
sql: 'str',
|
|
2715
|
+
project: 'str',
|
|
2716
|
+
explain: 'bool',
|
|
2717
|
+
) -> 'dict[str, Any]':
|
|
2718
|
+
if sql.startswith("@natural"):
|
|
2719
|
+
raise FeatureUnavailableError(
|
|
2720
|
+
"`@natural` is a roadmap feature and is not available in the current MVP.",
|
|
2721
|
+
suggestion="Use `maxc meta search` or `maxc meta describe` to inspect tables, then submit plain SQL.",
|
|
2722
|
+
)
|
|
2723
|
+
if explain:
|
|
2724
|
+
return self.backend.explain_query(sql, project=project)
|
|
2725
|
+
return self.backend.estimate_query_cost(sql, project=project)
|
|
2726
|
+
|
|
2727
|
+
def _build_query_envelope(
    self,
    *,
    command: 'str',
    result: 'QueryResult',
    dry_run: 'bool',
    session_id: 'int | None' = None,
) -> 'Envelope':
    """Wrap a ``QueryResult`` in a success envelope with paging information.

    A ``next_cursor`` is produced only when the result reports more rows and
    at least one row was returned. With a job id on a remote-job backend the
    cursor is tied to a cache session (created here unless ``session_id`` is
    given); otherwise it carries only the next offset.

    ``result.extra_metadata`` is merged into the metadata last, so its keys
    override the standard ones on collision.
    """
    hints_actions = ["meta.describe"] if result.tables_used else []
    if result.has_more:
        hints_actions.append("query.paginate")

    hints_insights = []
    if dry_run:
        hints_actions.append("job.submit")
        hints_insights.append("Dry-run returned estimated cost and SQLCost metadata so you can decide whether to continue.")
    elif not result.rows:
        hints_insights.append("The result set is empty. Check filters, partitions, and table selection.")

    # When there is a job_id and more rows remain, create or reuse a session
    # and generate a short cursor.
    cursor = None
    if result.has_more and result.returned_rows > 0:
        following_offset = result.extra_metadata.get("current_offset", 0) + result.returned_rows
        if result.job_id and self.remote_jobs:
            # Remote backend: build the short cursor from the session id.
            if session_id is None:
                session_id = self.cache.create_session(
                    job_id=result.job_id,
                    project=result.project,
                    sql=result.sql_executed,
                )
            cursor = encode_cursor(following_offset, session_id=session_id)
        else:
            # Mock backend: the cursor only contains the offset.
            cursor = encode_cursor(following_offset)

    meta = {
        "project": result.project,
        "elapsed_ms": result.elapsed_ms,
        "bytes_scanned": result.bytes_scanned,
        "sql_executed": result.sql_executed,
        "tables_used": result.tables_used,
    }
    # Optional fields are included only when they carry a truthy value.
    for optional_field in ("job_id", "submitted_at", "completed_at"):
        field_value = getattr(result, optional_field)
        if field_value:
            meta[optional_field] = field_value
    meta.update(result.extra_metadata)

    body = {
        "rows": result.rows,
        "schema": result.schema,
        "total_rows": result.total_rows,
        "returned_rows": result.returned_rows,
        "has_more": result.has_more,
        "next_cursor": cursor,
    }
    return Envelope(
        command=command,
        status="success",
        data=body,
        metadata=meta,
        agent_hints=AgentHints(
            next_actions=hints_actions,
            warnings=result.warnings,
            insights=hints_insights,
        ),
    )
|
|
2796
|
+
|
|
2797
|
+
def _build_analysis_envelope(
    self,
    *,
    command: 'str',
    sql: 'str',
    analysis: 'dict[str, Any]',
) -> 'Envelope':
    """Wrap a cost/explain ``analysis`` dict in a success envelope.

    Suggested actions always include ``query``; ``query.explain`` is put
    first for the ``query.cost`` command, and ``meta.describe`` is appended
    when the analysis lists tables. ``sql`` is recorded with trailing
    semicolons removed.
    """
    follow_ups = []
    if command == "query.cost":
        follow_ups.append("query.explain")
    follow_ups.append("query")
    if analysis.get("tables_used"):
        follow_ups.append("meta.describe")

    notes = []
    if analysis.get("estimated_input_size_bytes") == 0:
        notes.append("The estimated scan input is 0 bytes. This is often a constant query or a plan that avoids scanning data.")

    meta = {
        "project": analysis.get("project"),
        "sql_executed": sql.rstrip(";"),
    }
    elapsed = analysis.get("elapsed_ms")
    if elapsed is not None:
        meta["elapsed_ms"] = elapsed

    return Envelope(
        command=command,
        status="success",
        data=analysis,
        metadata=meta,
        agent_hints=AgentHints(
            next_actions=follow_ups,
            # Copied so the hints do not alias the list held by ``analysis``.
            warnings=list(analysis.get("warnings", [])),
            insights=notes,
        ),
    )
|
|
2832
|
+
|
|
2833
|
+
def _job_info_envelope(self, command: 'str', info: 'JobInfo') -> 'Envelope':
    """Render a ``JobInfo`` as an envelope whose status mirrors the job status.

    Suggested actions: diagnose/status for a failed job, wait/result for a
    pending or running job, and result alone for anything else.
    """
    if info.status == "failure":
        follow_ups = ["job.diagnose", "job.status"]
    elif info.status in {"pending", "running"}:
        follow_ups = ["job.wait", "job.result"]
    else:
        follow_ups = ["job.result"]

    details = {
        "job_id": info.job_id,
        "status": info.status,
        "progress": info.progress,
        "stage": info.stage,
        "retryable": info.retryable,
        "failure_reason": info.failure_reason,
        "logview": info.logview,
        "task_summary": info.task_summary,
        "sql": info.sql,
    }
    meta = {
        "project": info.project,
        "submitted_at": info.submitted_at,
        "updated_at": info.updated_at,
        "completed_at": info.completed_at,
        "logview": info.logview,
        "error_message": info.error_message,
    }
    return Envelope(
        command=command,
        status=info.status,
        data=details,
        metadata=meta,
        agent_hints=AgentHints(
            next_actions=follow_ups,
            warnings=info.warnings,
        ),
    )
|
|
2864
|
+
|
|
2865
|
+
def _local_job_info(self, job: 'dict[str, Any]') -> 'JobInfo':
    """Convert a locally stored job record into a ``JobInfo``.

    ``job`` must contain ``status``, ``job_id``, ``project`` and
    ``progress``; ``sql``, the timestamps and ``cancelled`` are optional.

    The stage is derived from the status: ``pending`` -> ``queue``,
    ``running`` -> ``running``, ``success`` -> ``completed``, anything else
    -> ``failed``. (Previously ``running`` fell through to ``failed``, which
    mislabelled a job that other methods in this class treat as still in
    flight.)

    A failure reason is only available for cancelled jobs, and
    ``retryable`` is only populated when the status is ``failure``.
    """
    status = job["status"]
    stage_by_status = {
        "pending": "queue",
        "running": "running",
        "success": "completed",
    }
    stage = stage_by_status.get(status, "failed")

    failure_reason = "The job was cancelled." if job.get("cancelled") else None
    diagnosis = classify_failure_reason(failure_reason)
    task_summary = build_task_summary(job.get("sql"))

    return JobInfo(
        job_id=job["job_id"],
        status=status,
        project=job["project"],
        progress=job["progress"],
        stage=stage,
        retryable=diagnosis["retryable"] if status == "failure" else None,
        failure_reason=failure_reason,
        task_summary=task_summary,
        sql=job.get("sql"),
        submitted_at=job.get("submitted_at"),
        updated_at=job.get("updated_at"),
        completed_at=job.get("completed_at"),
        # Local jobs have no logview link.
        logview=None,
    )
|
|
2886
|
+
|
|
2887
|
+
def _query_result_payload(self, result: 'QueryResult') -> 'dict[str, Any]':
|
|
2888
|
+
envelope = self._build_query_envelope(
|
|
2889
|
+
command="query",
|
|
2890
|
+
result=result,
|
|
2891
|
+
dry_run=False,
|
|
2892
|
+
)
|
|
2893
|
+
return envelope.to_dict(normalize=False)
|
|
2894
|
+
|
|
2895
|
+
def _job_events(self, job: 'dict[str, Any]') -> 'list[dict[str, Any]]':
    """Produce the event stream reported when waiting on a local job.

    A job already in ``success`` yields a single ``completed`` event. Any
    other non-cancelled job yields a fixed sequence: ``started``, three
    ``progress`` events (20% queue, 60% scan, 90% finalize) and
    ``completed``. Every event is stamped with the current UTC time.

    Raises:
        ValidationError: the job is not successful and is flagged as
            cancelled.
    """

    def completed_event():
        # The row count is read from the stored result envelope of the job.
        return {
            "type": "completed",
            "ts": now_utc_iso(),
            "job_id": job["job_id"],
            "rows": job["result"]["data"]["returned_rows"],
        }

    if job["status"] == "success":
        return [completed_event()]
    if job.get("cancelled"):
        raise ValidationError("The job was cancelled and can no longer be waited on.")

    timeline = [{"type": "started", "ts": now_utc_iso(), "job_id": job["job_id"]}]
    for percent, stage in ((20, "queue"), (60, "scan"), (90, "finalize")):
        timeline.append(
            {
                "type": "progress",
                "ts": now_utc_iso(),
                "job_id": job["job_id"],
                "percent": percent,
                "stage": stage,
            }
        )
    timeline.append(completed_event())
    return timeline
|
|
2938
|
+
|
|
2939
|
+
def _remote_job_events(
|
|
2940
|
+
self,
|
|
2941
|
+
before: 'JobInfo',
|
|
2942
|
+
after: 'JobInfo',
|
|
2943
|
+
result: 'QueryResult',
|
|
2944
|
+
) -> 'list[dict[str, Any]]':
|
|
2945
|
+
events = [{"type": "started", "ts": before.submitted_at or now_utc_iso(), "job_id": before.job_id}]
|
|
2946
|
+
if before.status in {"pending", "running"}:
|
|
2947
|
+
events.append(
|
|
2948
|
+
{
|
|
2949
|
+
"type": "progress",
|
|
2950
|
+
"ts": now_utc_iso(),
|
|
2951
|
+
"job_id": before.job_id,
|
|
2952
|
+
"percent": before.progress or 50,
|
|
2953
|
+
"stage": before.status,
|
|
2954
|
+
}
|
|
2955
|
+
)
|
|
2956
|
+
events.append(
|
|
2957
|
+
{
|
|
2958
|
+
"type": "completed",
|
|
2959
|
+
"ts": after.completed_at or now_utc_iso(),
|
|
2960
|
+
"job_id": after.job_id,
|
|
2961
|
+
"rows": result.returned_rows,
|
|
2962
|
+
}
|
|
2963
|
+
)
|
|
2964
|
+
return events
|
|
2965
|
+
|
|
2966
|
+
def _table_payload(self, table: 'TableDefinition', full: 'bool' = False) -> 'dict[str, Any]':
|
|
2967
|
+
# Calculate size in MB
|
|
2968
|
+
size_mb = (table.size_bytes / (1024 * 1024)) if table.size_bytes else 0
|
|
2969
|
+
|
|
2970
|
+
# Identify primary key with better heuristics:
|
|
2971
|
+
# 1. Look for explicit primary key indicators: *_id (but not ending with _sk), pk_*, id
|
|
2972
|
+
# 2. Exclude foreign key patterns and common FK suffixes
|
|
2973
|
+
foreign_key_suffixes = ('_date_sk', '_time_sk', '_dim_sk', '_demo_sk', '_hdemo_sk',
|
|
2974
|
+
'_cdemo_sk', '_addr_sk', '_promo_sk', '_item_sk', '_customer_sk',
|
|
2975
|
+
'_store_sk', '_bill_sk', '_ship_sk', '_reason_sk')
|
|
2976
|
+
primary_key = None
|
|
2977
|
+
|
|
2978
|
+
# First pass: look for actual primary keys (not ending with _sk)
|
|
2979
|
+
for col in table.columns:
|
|
2980
|
+
col_lower = col.name.lower()
|
|
2981
|
+
# Primary key candidates: ends with _id but not _sk, or explicitly named 'id'/'pk_*'
|
|
2982
|
+
if (col_lower.endswith('_id') and not col_lower.endswith('_sk')) or \
|
|
2983
|
+
col_lower == 'id' or col_lower.startswith('pk_'):
|
|
2984
|
+
primary_key = col.name
|
|
2985
|
+
break
|
|
2986
|
+
|
|
2987
|
+
# Second pass: if no clear PK, check for ticket numbers or other business keys
|
|
2988
|
+
if not primary_key:
|
|
2989
|
+
for col in table.columns:
|
|
2990
|
+
col_lower = col.name.lower()
|
|
2991
|
+
if 'ticket_number' in col_lower or 'order_number' in col_lower:
|
|
2992
|
+
primary_key = col.name
|
|
2993
|
+
break
|
|
2994
|
+
|
|
2995
|
+
payload = {
|
|
2996
|
+
"table_name": table.name,
|
|
2997
|
+
"description": table.description,
|
|
2998
|
+
"row_count": None, # Not available from TableDefinition
|
|
2999
|
+
"size_mb": round(size_mb, 2),
|
|
3000
|
+
"column_count": len(table.columns),
|
|
3001
|
+
"table_type": table.table_type,
|
|
3002
|
+
"owner": table.owner,
|
|
3003
|
+
"created_at": table.created_at,
|
|
3004
|
+
"updated_at": table.updated_at,
|
|
3005
|
+
}
|
|
3006
|
+
|
|
3007
|
+
if full:
|
|
3008
|
+
# Full mode: return all columns
|
|
3009
|
+
payload["schema"] = [
|
|
3010
|
+
{"name": column.name, "type": column.type, "comment": column.comment}
|
|
3011
|
+
for column in table.columns
|
|
3012
|
+
]
|
|
3013
|
+
payload["has_more_columns"] = False
|
|
3014
|
+
# Full mode: include complete sample rows
|
|
3015
|
+
payload["sample_preview"] = table.sample_rows[:2]
|
|
3016
|
+
else:
|
|
3017
|
+
# Summary mode: return first 10 columns only
|
|
3018
|
+
display_columns = table.columns[:10]
|
|
3019
|
+
payload["schema"] = [
|
|
3020
|
+
{"name": column.name, "type": column.type, "comment": column.comment}
|
|
3021
|
+
for column in display_columns
|
|
3022
|
+
]
|
|
3023
|
+
payload["has_more_columns"] = len(table.columns) > 10
|
|
3024
|
+
payload["remaining_columns"] = max(0, len(table.columns) - 10)
|
|
3025
|
+
|
|
3026
|
+
# Summary mode: no sample preview (keep it lightweight)
|
|
3027
|
+
payload["sample_preview"] = []
|
|
3028
|
+
|
|
3029
|
+
# Add primary key info (None if no clear primary key found)
|
|
3030
|
+
payload["primary_key"] = primary_key
|
|
3031
|
+
|
|
3032
|
+
# Add partition columns
|
|
3033
|
+
payload["partition_columns"] = [
|
|
3034
|
+
{"name": column.name, "type": column.type, "comment": column.comment}
|
|
3035
|
+
for column in table.partition_columns
|
|
3036
|
+
]
|
|
3037
|
+
|
|
3038
|
+
# Add other metadata
|
|
3039
|
+
payload["partitions"] = table.partitions
|
|
3040
|
+
payload["lineage"] = {
|
|
3041
|
+
"upstream_tables": table.upstream_tables,
|
|
3042
|
+
"downstream_tables": table.downstream_tables,
|
|
3043
|
+
}
|
|
3044
|
+
payload["extra_metadata"] = table.extra_metadata
|
|
3045
|
+
|
|
3046
|
+
return payload
|
|
3047
|
+
|
|
3048
|
+
|
|
3049
|
+
def build_schema_diff_payload(left: 'TableDefinition', right: 'TableDefinition') -> 'dict[str, Any]':
    """Build the schema-diff report for two table definitions.

    Regular columns and partition columns are diffed separately through
    ``compare_columns``; an added partition column is counted as breaking.
    A differing ``table_type`` is recorded as one more breaking change.

    Returns:
        A dict holding both table names, a ``compatible`` flag (true only
        when no breaking change was collected), per-bucket counts under
        ``summary``, the merged breaking / non-breaking change lists, the
        per-scope buckets and the ``table_type`` comparison.
    """
    column_diff = compare_columns(left.columns, right.columns, scope="columns")
    partition_diff = compare_columns(
        left.partition_columns,
        right.partition_columns,
        scope="partition_columns",
        added_is_breaking=True,
    )

    # Column-level findings come first, partition-level findings follow.
    breaking = [*column_diff["breaking_changes"], *partition_diff["breaking_changes"]]
    non_breaking = [
        *column_diff["non_breaking_changes"],
        *partition_diff["non_breaking_changes"],
    ]

    type_differs = left.table_type != right.table_type
    if type_differs:
        breaking.append(
            {
                "kind": "table_type_changed",
                "scope": "table_attributes",
                "field": "table_type",
                "left": left.table_type,
                "right": right.table_type,
            }
        )

    buckets = ("added", "removed", "changed", "unchanged")
    summary = {
        "added_columns": len(column_diff["added"]),
        "removed_columns": len(column_diff["removed"]),
        "changed_columns": len(column_diff["changed"]),
        "unchanged_columns": len(column_diff["unchanged"]),
        "added_partition_columns": len(partition_diff["added"]),
        "removed_partition_columns": len(partition_diff["removed"]),
        "changed_partition_columns": len(partition_diff["changed"]),
        "breaking_changes": len(breaking),
        "non_breaking_changes": len(non_breaking),
    }
    return {
        "left_table": left.name,
        "right_table": right.name,
        "compatible": not breaking,
        "summary": summary,
        "breaking_changes": breaking,
        "non_breaking_changes": non_breaking,
        "columns": {bucket: column_diff[bucket] for bucket in buckets},
        "partition_columns": {bucket: partition_diff[bucket] for bucket in buckets},
        "table_attributes": {
            "table_type": {
                "left": left.table_type,
                "right": right.table_type,
                "changed": type_differs,
            }
        },
    }
|
|
3112
|
+
|
|
3113
|
+
|
|
3114
|
+
def build_partition_diff_payload(left: 'TableDefinition', right: 'TableDefinition') -> 'dict[str, Any]':
    """Build the partition-diff report for two table definitions.

    The ``partitions`` of both tables are compared as sets. The report lists,
    in sorted order, the partitions found only on the left, only on the
    right, and on both sides. ``compatible`` is true only when neither side
    has a partition the other lacks.
    """
    left_set = set(left.partitions)
    right_set = set(right.partitions)

    only_left = sorted(left_set.difference(right_set))
    only_right = sorted(right_set.difference(left_set))
    shared = sorted(left_set.intersection(right_set))

    # The *_partition_count entries count the raw sequences, not the sets.
    summary = {
        "left_partition_count": len(left.partitions),
        "right_partition_count": len(right.partitions),
        "left_only": len(only_left),
        "right_only": len(only_right),
        "common": len(shared),
    }
    return {
        "left_table": left.name,
        "right_table": right.name,
        "compatible": not (only_left or only_right),
        "summary": summary,
        "left_only": only_left,
        "right_only": only_right,
        "common": shared,
    }
|
|
3135
|
+
|
|
3136
|
+
|
|
3137
|
+
def build_data_diff_payload(
    *,
    left: 'TableDefinition',
    right: 'TableDefinition',
    left_rows: 'list[dict[str, Any]]',
    right_rows: 'list[dict[str, Any]]',
    keys: 'list[str]',
    compared_columns: 'list[str]',
    rows_limit: 'int',
    left_partition: 'str | None',
    right_partition: 'str | None',
) -> 'tuple[dict[str, Any], list[str]]':
    """Compare two row snapshots aligned on ``keys``.

    Rows from each side are indexed by their key tuple with
    ``index_rows_by_key``. Keys present on both sides are compared column by
    column over ``compared_columns`` after ``normalize_diff_value`` has been
    applied to each value.

    Args:
        left: Left table definition; only ``left.name`` is read here.
        right: Right table definition; only ``right.name`` is read here.
        left_rows: Snapshot rows read from the left table.
        right_rows: Snapshot rows read from the right table.
        keys: Column names that form the alignment key.
        compared_columns: Non-key columns whose values are compared.
        rows_limit: Per-side row cap; only used in the coverage warning text.
        left_partition: Echoed into the payload unchanged.
        right_partition: Echoed into the payload unchanged.

    Returns:
        ``(payload, warnings)``: the diff report and a list of warning
        strings. The coverage warning is always present; a second warning is
        added when ``compared_columns`` is empty.

    Raises:
        Whatever ``index_rows_by_key`` raises for either side (in this file:
        ``ValidationError`` when a snapshot contains duplicate keys).
    """
    left_by_key = index_rows_by_key(left_rows, keys=keys, table_name=left.name)
    right_by_key = index_rows_by_key(right_rows, keys=keys, table_name=right.name)
    left_keys = set(left_by_key)
    right_keys = set(right_by_key)
    # Key tuples can hold None (NULL key value or missing column) next to real
    # values; plain sorted() raises TypeError on such a mix, so use the
    # tolerant helper.
    common_keys = _sorted_diff_keys(left_keys & right_keys)
    left_only_keys = _sorted_diff_keys(left_keys - right_keys)
    right_only_keys = _sorted_diff_keys(right_keys - left_keys)

    mismatched_rows: 'list[dict[str, Any]]' = []
    for key in common_keys:
        left_row = left_by_key[key]
        right_row = right_by_key[key]
        differing_columns = [
            column
            for column in compared_columns
            if normalize_diff_value(left_row.get(column)) != normalize_diff_value(right_row.get(column))
        ]
        if differing_columns:
            mismatched_rows.append(
                {
                    "key": key_to_payload(keys, key),
                    "differing_columns": differing_columns,
                    "left_row": left_row,
                    "right_row": right_row,
                }
            )

    warnings = [
        (
            f"Data diff currently compares at most {rows_limit} read-only snapshot row(s) per side. "
            "Increase `--rows` and narrow the partition scope if you need higher coverage."
        )
    ]
    if not compared_columns:
        warnings.append("Only key presence was compared. Non-key column values were not compared.")

    payload = {
        "left_table": left.name,
        "right_table": right.name,
        "comparison_mode": "keyed_snapshot",
        "compatible": not left_only_keys and not right_only_keys and not mismatched_rows,
        "keys": keys,
        "compared_columns": compared_columns,
        "left_partition": left_partition,
        "right_partition": right_partition,
        "left_sampled_rows": len(left_rows),
        "right_sampled_rows": len(right_rows),
        "summary": {
            "matched_keys": len(common_keys),
            "mismatched_rows": len(mismatched_rows),
            "left_only_keys": len(left_only_keys),
            "right_only_keys": len(right_only_keys),
            "left_sampled_rows": len(left_rows),
            "right_sampled_rows": len(right_rows),
        },
        "left_only_keys": [key_to_payload(keys, key) for key in left_only_keys],
        "right_only_keys": [key_to_payload(keys, key) for key in right_only_keys],
        "mismatched_rows": mismatched_rows,
    }
    return payload, warnings


def _sorted_diff_keys(key_tuples: 'set[tuple[Any, ...]]') -> 'list[tuple[Any, ...]]':
    """Sort key tuples, tolerating None and mixed value types.

    The natural ordering is tried first, so results are unchanged whenever
    the values are mutually orderable. Only if that raises ``TypeError``
    (for example ``None`` next to ``str``) does the fallback apply: ``None``
    first, then values grouped by type name, then by value within a type.
    """
    try:
        return sorted(key_tuples)
    except TypeError:
        def sort_token(key: 'tuple[Any, ...]') -> 'tuple[Any, ...]':
            # The first two items of each triple decide the order across
            # types, so the raw value is only compared within one type.
            return tuple(
                (0, "", "") if value is None else (1, type(value).__name__, value)
                for value in key
            )

        return sorted(key_tuples, key=sort_token)
|
|
3209
|
+
|
|
3210
|
+
|
|
3211
|
+
def compare_columns(
    left_columns: 'list[Any]',
    right_columns: 'list[Any]',
    *,
    scope: 'str',
    added_is_breaking: 'bool' = False,
) -> 'dict[str, Any]':
    """Diff two column lists, matching columns by their ``name``.

    A differing ``type`` is a breaking change and a differing ``comment`` a
    non-breaking one. A column missing on the right is breaking. A column
    only on the right is breaking only when ``added_is_breaking`` is true.

    Args:
        left_columns: Columns of the left table (objects exposing ``name``,
            ``type`` and ``comment``).
        right_columns: Columns of the right table.
        scope: Label copied into every change record.
        added_is_breaking: Whether an added column counts as breaking.

    Returns:
        A dict with the ``added``, ``removed``, ``changed`` and ``unchanged``
        buckets plus the ``breaking_changes`` and ``non_breaking_changes``
        record lists.
    """
    lhs = {col.name: col for col in left_columns}
    rhs = {col.name: col for col in right_columns}

    def change_record(kind: 'str', name: 'str', before: 'Any', after: 'Any') -> 'dict[str, Any]':
        # Every change record shares this key layout.
        return {
            "kind": kind,
            "scope": scope,
            "column": name,
            "left": before,
            "right": after,
        }

    added = [column_payload(rhs[name]) for name in sorted(rhs.keys() - lhs.keys())]
    removed = [column_payload(lhs[name]) for name in sorted(lhs.keys() - rhs.keys())]
    unchanged: 'list[str]' = []
    changed: 'list[dict[str, Any]]' = []
    breaking: 'list[dict[str, Any]]' = []
    non_breaking: 'list[dict[str, Any]]' = []

    for name in sorted(lhs.keys() & rhs.keys()):
        before, after = lhs[name], rhs[name]
        deltas: 'dict[str, dict[str, Any]]' = {}
        if before.type != after.type:
            deltas["type"] = {"left": before.type, "right": after.type}
            breaking.append(
                change_record("column_type_changed", name, before.type, after.type)
            )
        if before.comment != after.comment:
            deltas["comment"] = {"left": before.comment, "right": after.comment}
            non_breaking.append(
                change_record("column_comment_changed", name, before.comment, after.comment)
            )
        if not deltas:
            unchanged.append(name)
            continue
        changed.append(
            {
                "name": name,
                "left": column_payload(before),
                "right": column_payload(after),
                "changes": deltas,
            }
        )

    # Removals are recorded after the per-column changes, additions last.
    breaking.extend(
        change_record("column_removed", col["name"], col, None) for col in removed
    )
    added_bucket = breaking if added_is_breaking else non_breaking
    added_bucket.extend(
        change_record("column_added", col["name"], None, col) for col in added
    )

    return {
        "added": added,
        "removed": removed,
        "changed": changed,
        "unchanged": unchanged,
        "breaking_changes": breaking,
        "non_breaking_changes": non_breaking,
    }
|
|
3298
|
+
|
|
3299
|
+
|
|
3300
|
+
def column_payload(column: 'Any') -> 'dict[str, Any]':
    """Return a column's ``name``, ``type`` and ``comment`` as a plain dict."""
    return {
        attribute: getattr(column, attribute)
        for attribute in ("name", "type", "comment")
    }
|
|
3306
|
+
|
|
3307
|
+
|
|
3308
|
+
def resolve_data_diff_columns(
    left: 'TableDefinition',
    right: 'TableDefinition',
    *,
    keys: 'list[str]',
    requested_columns: 'list[str] | None',
) -> 'list[str]':
    """Work out which non-key columns a data diff should compare.

    Every key must exist on both tables (regular or partition columns). When
    ``requested_columns`` is given, those columns (deduplicated, key columns
    excluded) must also exist on both tables and are returned in request
    order. Otherwise every non-key column shared by both tables is returned
    in the left table's column order.

    Raises:
        ValidationError: A key or a requested column is missing from at
            least one of the two tables.
    """
    left_names = all_table_column_names(left)
    right_names = all_table_column_names(right)
    unique_keys = dedupe_preserve_order(keys)

    def on_both_sides(name: 'str') -> 'bool':
        return name in left_names and name in right_names

    absent_keys = [key for key in unique_keys if not on_both_sides(key)]
    if absent_keys:
        raise ValidationError(
            f"Data diff key columns are incomplete across both tables: {', '.join(absent_keys)}",
            suggestion="Run `maxc meta describe` or `maxc diff schema` to inspect the available columns.",
        )

    if not requested_columns:
        # Nothing requested: compare every shared non-key column.
        return [
            name
            for name in left_names
            if name in right_names and name not in unique_keys
        ]

    selected = [
        name
        for name in dedupe_preserve_order(requested_columns)
        if name not in unique_keys
    ]
    absent_columns = [name for name in selected if not on_both_sides(name)]
    if absent_columns:
        raise ValidationError(
            f"Data diff comparison columns are incomplete across both tables: {', '.join(absent_columns)}",
            suggestion="Run `maxc meta describe` or `maxc diff schema` to inspect the available columns.",
        )
    return selected
|
|
3352
|
+
|
|
3353
|
+
|
|
3354
|
+
def all_table_column_names(table: 'TableDefinition') -> 'list[str]':
    """List the table's column names: regular columns, then partition columns."""
    names: 'list[str]' = []
    for group in (table.columns, table.partition_columns):
        names.extend(column.name for column in group)
    return names
|
|
3356
|
+
|
|
3357
|
+
|
|
3358
|
+
def dedupe_preserve_order(values: 'list[str]') -> 'list[str]':
    """Return ``values`` without duplicates, keeping first-seen order.

    A dict keeps insertion order and holds each key once, so this runs in
    linear time instead of scanning the result list for every element.
    Values must be hashable (they are column/key names here).

    Returns:
        A new list; the input is not modified.
    """
    return list(dict.fromkeys(values))
|
|
3364
|
+
|
|
3365
|
+
|
|
3366
|
+
def index_rows_by_key(
    rows: 'list[dict[str, Any]]',
    *,
    keys: 'list[str]',
    table_name: 'str',
) -> 'dict[tuple[Any, ...], dict[str, Any]]':
    """Index snapshot rows by the tuple of their normalized key values.

    Args:
        rows: Snapshot rows, each a mapping of column name to value.
        keys: Column names that make up the key, in order.
        table_name: Name used in the error message.

    Returns:
        A mapping from key tuple to the first row carrying that key.

    Raises:
        ValidationError: Two or more rows share a key; up to three of the
            offending keys are quoted in the message.
    """
    rows_by_key: 'dict[tuple[Any, ...], dict[str, Any]]' = {}
    collisions: 'list[dict[str, Any]]' = []

    for row in rows:
        row_key = tuple(normalize_diff_value(row.get(name)) for name in keys)
        if row_key not in rows_by_key:
            rows_by_key[row_key] = row
        else:
            # Keep scanning so every repeated key is collected before failing.
            collisions.append(key_to_payload(keys, row_key))

    if not collisions:
        return rows_by_key

    examples = ", ".join(map(str, collisions[:3]))
    raise ValidationError(
        f"Data diff found duplicate keys in the `{table_name}` snapshot and cannot align rows reliably: {examples}",
        suggestion="Use more selective `--keys`, or narrow the partition scope before retrying.",
    )
|
|
3387
|
+
|
|
3388
|
+
|
|
3389
|
+
def key_to_payload(columns: 'list[str]', values: 'tuple[Any, ...]') -> 'dict[str, Any]':
    """Pair key column names with their values, position by position."""
    return dict(zip(columns, values))
|
|
3394
|
+
|
|
3395
|
+
|
|
3396
|
+
def normalize_diff_value(value: 'Any') -> 'Any':
    """Return ``value.isoformat()`` when available, else ``value`` unchanged.

    Any object exposing an ``isoformat`` attribute is converted by calling it
    with no arguments. If that call raises ``TypeError`` the original value
    is returned as is.
    """
    to_iso = getattr(value, "isoformat", None)
    if to_iso is None:
        return value
    try:
        return to_iso()
    except TypeError:
        return value
|
|
3403
|
+
|
|
3404
|
+
|
|
3405
|
+
def read_stdin() -> 'str':
    """Read standard input to end-of-file and return it as one string."""
    stdin_stream = sys.stdin
    return stdin_stream.read()
|