maxc-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
maxc_cli/app.py ADDED
@@ -0,0 +1,3406 @@
1
+
2
+ from datetime import datetime, timezone
3
+ import getpass
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+ from time import monotonic
8
+ from typing import Any, Callable
9
+
10
+ from .audit import AuditLogger
11
+ from .auth_providers import (
12
+ build_auth_options,
13
+ build_ncs_auth_config,
14
+ list_ncs_accounts,
15
+ resolve_auth_connection,
16
+ )
17
+ from .backend import OdpsBackend
18
+ from .cache import LocalCache
19
+ from .config import (
20
+ AuthConfig,
21
+ TableDefinition,
22
+ default_global_config_path,
23
+ load_config,
24
+ load_config_mapping,
25
+ persist_login_config,
26
+ save_config_mapping,
27
+ session_override_path,
28
+ )
29
+ from .exceptions import (
30
+ BackendConnectionError,
31
+ CostLimitExceededError,
32
+ ErrorPayload,
33
+ FeatureUnavailableError,
34
+ JobTimeoutError,
35
+ MaxCError,
36
+ PermissionDeniedError,
37
+ ValidationError,
38
+ )
39
+ from .helpers import (
40
+ build_odps_identity_payload,
41
+ build_task_summary,
42
+ classify_failure_reason,
43
+ load_odps_env,
44
+ mask_access_id,
45
+ missing_odps_settings,
46
+ parse_time_value,
47
+ )
48
+ from .models import AgentHints, Envelope, JobInfo, QueryResult
49
+ from .store import JobStore
50
+ from .utils import decode_cursor, detect_operation, encode_cursor, now_utc_iso
51
+
52
+
53
+ class MaxCApp:
54
def __init__(
    self,
    *,
    cwd: 'Path',
    config_path: 'Path | None' = None,
    load_backend: 'bool' = True,
) -> 'None':
    """Load configuration and set up lazily-created collaborators.

    Args:
        cwd: Working directory handed to ``load_config``.
        config_path: Optional explicit config path handed to ``load_config``.
        load_backend: When false, no ``OdpsBackend`` is created and
            ``self.backend`` stays ``None``.
    """
    self.cwd = cwd
    self.config = load_config(cwd, config_path)

    if load_backend:
        self.backend = OdpsBackend(self.config)
    else:
        self.backend = None

    # Remote job support is opt-in: it needs a backend that advertises it.
    if self.backend:
        self.remote_jobs = getattr(self.backend, "supports_remote_jobs", False)
    else:
        self.remote_jobs = False

    # The job store, audit logger and cache start unset here.
    self.jobs: 'JobStore | None' = None
    self._audit: 'AuditLogger | None' = None
    configured_audit_log = self.config.agent.audit_log
    if configured_audit_log:
        self._audit_path = configured_audit_log
    else:
        self._audit_path = self.config.state_dir / "audit.log"
    self._cache: 'LocalCache | None' = None
69
+
70
+ @property
71
+ def cache(self) -> 'LocalCache':
72
+ if self._cache is None:
73
+ self._cache = LocalCache(self.config.cache_dir)
74
+ return self._cache
75
+
76
+ def _ensure_job_store(self) -> 'JobStore':
77
+ if self.remote_jobs:
78
+ raise FeatureUnavailableError("Local job storage is not initialized for the current backend.")
79
+ if self.jobs is None:
80
+ self.jobs = JobStore(self.config.state_dir)
81
+ return self.jobs
82
+
83
def _whoami_validation_failed_envelope(
    self,
    *,
    settings: 'dict[str, str | None]',
    auth_type: 'str',
    identity_source: 'str',
    warnings: 'list[str]',
) -> 'Envelope':
    """Build the ``auth.whoami`` envelope for configured-but-unvalidated credentials.

    The identity payload is built without a client and flagged as
    ``authenticated=False`` / ``validation_status="failed"``. The envelope
    itself still reports ``status="success"`` and points the caller at the
    login commands.

    Args:
        settings: Flattened auth settings; ``token_expires_at`` is read from it.
        auth_type: Auth type forwarded to the identity payload builder.
        identity_source: Identity source forwarded to the payload builder.
        warnings: Extra warnings appended after the builder's own warnings.
    """
    default_project = self.config.default_project

    identity_kwargs = {
        "client": None,
        "settings": settings,
        "allowed_operations": self.config.allowed_operations,
        "identity_source": identity_source,
        "auth_type": auth_type,
        "token_expires_at": settings.get("token_expires_at"),
        "project": default_project,
        "owner_display_name": None,
        "authenticated": False,
        "configured": True,
        "validation_status": "failed",
    }
    payload, base_warnings = build_odps_identity_payload(**identity_kwargs)
    payload["auth_options"] = build_auth_options(default_global_config_path())

    metadata = {
        "project": self.config.default_project,
        "config_sources": [str(source_path) for source_path in self.config.sources],
    }
    hints = AgentHints(
        next_actions=["auth.login", "auth.login-ncs"],
        warnings=base_warnings + warnings,
    )
    return Envelope(
        command="auth.whoami",
        status="success",
        data=payload,
        metadata=metadata,
        agent_hints=hints,
    )
118
+
119
+ def _auth_settings_from_config(self, auth: 'AuthConfig') -> 'dict[str, str | None]':
120
+ return {
121
+ "provider": auth.provider,
122
+ "access_id": auth.access_id,
123
+ "secret_access_key": auth.secret_access_key,
124
+ "security_token": auth.security_token,
125
+ "token_expires_at": auth.token_expires_at,
126
+ "project": auth.project,
127
+ "endpoint": auth.endpoint,
128
+ "region_name": auth.region_name,
129
+ "tunnel_endpoint": auth.tunnel_endpoint,
130
+ "ncs_process_command": auth.ncs.process_command,
131
+ "ncs_account_type": auth.ncs.account_type,
132
+ "ncs_employee_id": auth.ncs.employee_id,
133
+ "ncs_account_name": auth.ncs.account_name,
134
+ "ncs_app_name": auth.ncs.app_name,
135
+ "ncs_process_timeout": str(auth.ncs.process_timeout) if auth.ncs.process_timeout else None,
136
+ }
137
+
138
def _clear_session_override(self, warnings: 'list[str]') -> 'None':
    """Remove session_override.yaml so stale project/schema don't shadow new auth config.

    Args:
        warnings: Mutable list; a notice is appended only when an override
            file was actually removed.
    """
    override_path = session_override_path()
    try:
        # Attempt the delete directly instead of checking exists() first, so
        # a file removed in between cannot raise.
        override_path.unlink()
    except FileNotFoundError:
        # Nothing to clear, so there is nothing to warn about.
        return
    warnings.append(
        "Session override (project/schema) was cleared because auth config changed. "
        "Run `session set` if you need to re-apply a project override."
    )
147
+
148
def _validate_auth_config_shape(self, auth: 'AuthConfig') -> 'None':
    """Raise if the auth config lacks fields required by its provider.

    When ``auth.provider`` is not one of ``access_key`` / ``sts_token`` /
    ``ncs``, the provider is inferred: an ncs process command implies ``ncs``,
    otherwise a security token implies ``sts_token``, otherwise ``access_key``.

    Raises:
        ValidationError: If ``missing_odps_settings`` reports missing fields.
    """
    settings = self._auth_settings_from_config(auth)

    provider = (auth.provider or "").strip().lower()
    if provider not in {"access_key", "sts_token", "ncs"}:
        if settings.get("ncs_process_command"):
            provider = "ncs"
        elif settings.get("security_token"):
            provider = "sts_token"
        else:
            provider = "access_key"

    missing = missing_odps_settings(settings, auth_type=provider)
    if not missing:
        return

    missing_fields = ", ".join(missing)
    if provider == "ncs":
        message = f"ncs authentication is missing required fields: {missing_fields}."
        suggestion = "Provide project, endpoint, and ncs account configuration before using the ncs provider."
    elif provider == "sts_token":
        message = f"STS authentication is missing required fields: {missing_fields}."
        suggestion = "Provide access_id, secret_access_key, security_token, project, and endpoint."
    else:
        message = f"MaxCompute connection settings are incomplete: {missing_fields}."
        suggestion = "Run `maxc auth login` or set the required environment variables."
    raise ValidationError(message, suggestion=suggestion)
172
+
173
+ def _cache_age_seconds(self, updated_at: 'str | None') -> 'int | None':
174
+ if not updated_at:
175
+ return None
176
+ parsed = parse_time_value(updated_at)
177
+ if parsed is None:
178
+ return None
179
+ return max(int((datetime.now(timezone.utc) - parsed).total_seconds()), 0)
180
+
181
+ def _cache_metadata(
182
+ self,
183
+ *,
184
+ project: 'str',
185
+ source: 'str',
186
+ query_time_ms: 'int | None' = None,
187
+ schema_name: 'str | None' = None,
188
+ ) -> 'dict[str, Any]':
189
+ cache_stats = self.cache.get_cache_stats(project, schema_name)
190
+ cache_age_seconds = self._cache_age_seconds(cache_stats.get("newest"))
191
+ metadata: 'dict[str, Any]' = {
192
+ "project": project,
193
+ "source": source,
194
+ "cache_available": cache_stats["table_count"] > 0,
195
+ "cache_age_seconds": cache_age_seconds,
196
+ "cache_stale": bool(cache_age_seconds is not None and cache_age_seconds > 3600),
197
+ "refresh_command": "cache build",
198
+ }
199
+ if query_time_ms is not None:
200
+ metadata["query_time_ms"] = query_time_ms
201
+ return metadata
202
+
203
def query(
    self,
    *,
    command: 'str',
    sql: 'str',
    project: 'str | None' = None,
    max_rows: 'int' = 100,
    cursor: 'str | None' = None,
    dry_run: 'bool' = False,
    wait: 'int' = 10,
    cost_check: 'float | None' = None,
    idempotency_key: 'str | None' = None,
    retry_on: 'list[str] | None' = None,
    max_retries: 'int' = 0,
) -> 'Envelope':
    """Run ``sql`` and return rows, a pending notice, or a failure envelope.

    Three paths exist:

    * Cursor continuation: when the cursor carries a session id whose cached
      session has a job id, rows are read from that job without re-running
      the SQL.
    * Remote jobs (backend supports them and not ``dry_run``): the job is
      submitted, then polled for up to ``wait`` seconds. ``wait == 0`` or a
      timeout yields a ``pending`` envelope.
    * Otherwise the query goes through ``_execute_query``.

    Args:
        command: Command name recorded in the envelope and the audit log.
        sql: SQL text to run.
        project: Target project; defaults to the configured default project.
        max_rows: Page size; must be greater than 0.
        cursor: Pagination cursor from a previous result.
        dry_run: Forwarded to ``_execute_query``; cannot be combined with
            ``cursor``.
        wait: Seconds to poll a remote job before returning ``pending``.
        cost_check: Cost limit forwarded to submission/execution.
        idempotency_key: Echoed in envelope metadata when provided.
        retry_on: Retry conditions; only applied on the non-remote path.
        max_retries: Retry count; only applied on the non-remote path.

    Raises:
        ValidationError: For a non-positive ``max_rows`` or ``cursor``
            combined with ``dry_run``.
    """
    if max_rows <= 0:
        raise ValidationError("`--max-rows` and `--page-size` must be greater than 0.")
    if cursor and dry_run:
        raise ValidationError("Do not combine `--cursor` with `--dry-run`.")

    target_project = project or self.config.default_project
    offset, session_id = decode_cursor(cursor)

    def finish(envelope, *, tag_idempotency=True):
        # Shared tail of every return path: tag, audit-log, return.
        if tag_idempotency and idempotency_key:
            envelope.metadata["idempotency_key"] = idempotency_key
        self.log(command, envelope.status, envelope.metadata)
        return envelope

    # If the cursor carries a session_id, look the job_id up in the cache and
    # read its result directly instead of re-executing the SQL.
    if session_id is not None and self.remote_jobs:
        session = self.cache.get_session(session_id)
        if session and session.get("job_id"):
            result = self.backend.fetch_job_result(
                session["job_id"],
                project=session.get("project") or target_project,
                max_rows=max_rows,
                offset=offset,
            )
            envelope = self._build_query_envelope(
                command=command,
                result=result,
                dry_run=False,
                session_id=session_id,
            )
            # This path has never echoed the idempotency key; keep it so.
            return finish(envelope, tag_idempotency=False)

    # Remote branch — always submit, then poll up to `wait` seconds
    if self.remote_jobs and not dry_run:
        job = self._submit_remote_job(
            sql=sql,
            project=target_project,
            cost_check=cost_check,
            idempotency_key=idempotency_key,
        )
        retry_warnings = []
        if retry_on:
            retry_warnings = ["`--retry-on` and `--max-retries` are not applied on the remote job path; the job runs as submitted."]

        def pending(waited, insights):
            # Envelope for a job that is still running when we stop waiting.
            return Envelope(
                command=command,
                status="pending",
                data={"job_id": job.job_id},
                metadata={
                    "job_id": job.job_id,
                    "project": job.project,
                    "submitted_at": job.submitted_at,
                    "logview": job.logview,
                    "wait_seconds": waited,
                    "sql_executed": sql,
                },
                agent_hints=AgentHints(
                    next_actions=["job.wait", "job.status"],
                    warnings=(job.warnings or []) + retry_warnings,
                    insights=insights,
                ),
            )

        def errored(job_id, error, next_action):
            # Envelope for a failure that happened on our side of the job.
            return Envelope(
                command=command,
                status="failure",
                data=None,
                error=error,
                metadata={
                    "job_id": job_id,
                    "project": target_project,
                    "sql_executed": sql,
                },
                agent_hints=AgentHints(
                    next_actions=[next_action],
                ),
            )

        if wait == 0:
            # Return pending envelope immediately, no polling
            return finish(pending(
                0,
                ["Use `job wait <job_id>` to wait for completion, then `job result <job_id>` to fetch rows."],
            ))

        # Poll
        try:
            job_info = self.backend.wait_job(
                job.job_id,
                project=target_project,
                timeout=wait,
                poll_interval=1,
            )
        except JobTimeoutError:
            return finish(pending(
                wait,
                [
                    f"Query promoted to async after {wait}s. "
                    "Use `job wait <job_id> --timeout <N>` to wait for completion, "
                    "then `job result <job_id>` to fetch rows."
                ],
            ))
        except BackendConnectionError as exc:
            return finish(errored(job.job_id, exc.to_payload(), "job.status"))

        # Job ended — check outcome
        if job_info.status == "failure":
            envelope = Envelope(
                command=command,
                status="failure",
                data={"job_id": job_info.job_id},
                metadata={
                    "job_id": job_info.job_id,
                    "project": job_info.project,
                    "submitted_at": job_info.submitted_at,
                    "logview": job_info.logview,
                    "sql_executed": sql,
                },
                agent_hints=AgentHints(
                    next_actions=["job.diagnose", "job.status"],
                    warnings=job_info.warnings or [],
                ),
            )
            return finish(envelope)

        # status == "success" — fetch rows
        try:
            result = self.backend.fetch_job_result(
                job_info.job_id,
                project=target_project,
                max_rows=max_rows,
                offset=offset,
            )
        except Exception as exc:
            # The job itself succeeded, so report the fetch problem and point
            # at `job result`. Keep a MaxCError's own payload rather than
            # flattening it into a generic error.
            fetch_err = exc if isinstance(exc, MaxCError) else MaxCError(str(exc))
            return finish(errored(job_info.job_id, fetch_err.to_payload(), "job.result"))

        envelope = self._build_query_envelope(
            command=command,
            result=result,
            dry_run=False,
        )
        envelope.metadata.update({
            "job_id": job_info.job_id,
            "submitted_at": job_info.submitted_at,
            "logview": job_info.logview,
        })
        return finish(envelope)

    result = self._execute_query(
        sql=sql,
        project=target_project,
        max_rows=max_rows,
        offset=offset,
        dry_run=dry_run,
        cost_check=cost_check,
        retry_on=retry_on or [],
        max_retries=max_retries,
        strict_cost_check=True,
    )
    envelope = self._build_query_envelope(
        command=command,
        result=result,
        dry_run=dry_run,
    )
    return finish(envelope)
421
+
422
+
423
+ def query_cost(
424
+ self,
425
+ *,
426
+ sql: 'str',
427
+ project: 'str | None' = None,
428
+ command: 'str' = "query.cost",
429
+ ) -> 'Envelope':
430
+ target_project = project or self.config.default_project
431
+ analysis = self._analyze_query(
432
+ sql=sql,
433
+ project=target_project,
434
+ explain=False,
435
+ )
436
+ envelope = self._build_analysis_envelope(
437
+ command=command,
438
+ sql=sql,
439
+ analysis=analysis,
440
+ )
441
+ self.log(command, envelope.status, envelope.metadata)
442
+ return envelope
443
+
444
+ def query_explain(
445
+ self,
446
+ *,
447
+ sql: 'str',
448
+ project: 'str | None' = None,
449
+ command: 'str' = "query.explain",
450
+ ) -> 'Envelope':
451
+ target_project = project or self.config.default_project
452
+ analysis = self._analyze_query(
453
+ sql=sql,
454
+ project=target_project,
455
+ explain=True,
456
+ )
457
+ envelope = self._build_analysis_envelope(
458
+ command=command,
459
+ sql=sql,
460
+ analysis=analysis,
461
+ )
462
+ self.log(command, envelope.status, envelope.metadata)
463
+ return envelope
464
+
465
+ def submit_job(
466
+ self,
467
+ *,
468
+ sql: 'str',
469
+ project: 'str | None' = None,
470
+ max_rows: 'int' = 100,
471
+ cost_check: 'float | None' = None,
472
+ idempotency_key: 'str | None' = None,
473
+ ) -> 'Envelope':
474
+ return self.query(
475
+ command="job.submit",
476
+ sql=sql,
477
+ project=project,
478
+ max_rows=max_rows,
479
+ wait=0,
480
+ cost_check=cost_check,
481
+ idempotency_key=idempotency_key,
482
+ )
483
+
484
+ def job_status(self, job_id: 'str') -> 'Envelope':
485
+ if self.remote_jobs:
486
+ info = self.backend.get_job(job_id, project=self.config.default_project)
487
+ envelope = self._job_info_envelope("job.status", info)
488
+ self.log("job.status", envelope.status, envelope.metadata)
489
+ return envelope
490
+
491
+ jobs = self._ensure_job_store()
492
+ job = jobs.get_job(job_id)
493
+ info = self._local_job_info(job)
494
+ envelope = self._job_info_envelope("job.status", info)
495
+ self.log("job.status", envelope.status, envelope.metadata)
496
+ return envelope
497
+
498
+ def job_wait(self, job_id: 'str', *, timeout: 'int | None' = None) -> 'tuple[Envelope, list[dict[str, Any]]]':
499
+ # TODO:目前等待作业结束,是直接静默的 wait 知道 Success,我希望是每若干秒(3s?)打印一条作业状态的 ND JSON
500
+ if self.remote_jobs:
501
+ before = self.backend.get_job(job_id, project=self.config.default_project)
502
+ try:
503
+ after = self.backend.wait_job(job_id, project=self.config.default_project, timeout=timeout)
504
+ except JobTimeoutError:
505
+ envelope = Envelope(
506
+ command="job.wait",
507
+ status="pending",
508
+ data={"job_id": job_id},
509
+ metadata={
510
+ "job_id": job_id,
511
+ "project": self.config.default_project,
512
+ "submitted_at": before.submitted_at,
513
+ "logview": before.logview,
514
+ "wait_seconds": timeout,
515
+ },
516
+ agent_hints=AgentHints(
517
+ next_actions=["job.wait", "job.status"],
518
+ insights=[
519
+ f"Job still running after {timeout}s. Use `job wait {job_id} --timeout <N>` to continue waiting, then `job result {job_id}` to fetch rows."
520
+ ],
521
+ ),
522
+ )
523
+ self.log("job.wait", envelope.status, envelope.metadata)
524
+ return envelope, []
525
+ except BackendConnectionError as exc:
526
+ envelope = Envelope(
527
+ command="job.wait",
528
+ status="failure",
529
+ data=None,
530
+ error=ErrorPayload(
531
+ code="BACKEND_CONNECTION_ERROR",
532
+ message=str(exc),
533
+ recoverable=True,
534
+ suggestion=getattr(exc, "suggestion", None),
535
+ ),
536
+ metadata={
537
+ "job_id": job_id,
538
+ "project": self.config.default_project,
539
+ },
540
+ agent_hints=AgentHints(
541
+ next_actions=["job.status"],
542
+ insights=[
543
+ f"Lost contact with backend while waiting for job {job_id}. Run `job status {job_id}` to check if it completed."
544
+ ],
545
+ ),
546
+ )
547
+ self.log("job.wait", envelope.status, envelope.metadata)
548
+ return envelope, []
549
+ if after.status != "success":
550
+ envelope = self._job_info_envelope("job.wait", after)
551
+ events = [
552
+ {"type": "started", "ts": before.submitted_at or now_utc_iso(), "job_id": before.job_id},
553
+ {
554
+ "type": "failed",
555
+ "ts": after.completed_at or now_utc_iso(),
556
+ "job_id": after.job_id,
557
+ "reason": after.failure_reason,
558
+ "retryable": after.retryable,
559
+ },
560
+ ]
561
+ self.log("job.wait", envelope.status, envelope.metadata)
562
+ return envelope, events
563
+ result = self.backend.fetch_job_result(
564
+ job_id,
565
+ project=self.config.default_project,
566
+ max_rows=100,
567
+ )
568
+ envelope = self._build_query_envelope(
569
+ command="job.wait",
570
+ result=result,
571
+ dry_run=False,
572
+ )
573
+ envelope.metadata.update(
574
+ {
575
+ "job_id": job_id,
576
+ "submitted_at": after.submitted_at,
577
+ "completed_at": after.completed_at,
578
+ "logview": after.logview,
579
+ "stage": after.stage,
580
+ "retryable": after.retryable,
581
+ "failure_reason": after.failure_reason,
582
+ "task_summary": after.task_summary,
583
+ }
584
+ )
585
+ events = self._remote_job_events(before, after, result)
586
+ self.log("job.wait", envelope.status, envelope.metadata)
587
+ return envelope, events
588
+
589
+ jobs = self._ensure_job_store()
590
+ job = jobs.get_job(job_id)
591
+ events = self._job_events(job)
592
+ final_job = jobs.update_job(
593
+ job_id,
594
+ status="success",
595
+ progress=100,
596
+ started_at=job.get("started_at") or now_utc_iso(),
597
+ completed_at=now_utc_iso(),
598
+ )
599
+ stored = final_job["result"]
600
+ info = self._local_job_info(final_job)
601
+ envelope = Envelope(
602
+ command="job.wait",
603
+ status="success",
604
+ data=stored["data"],
605
+ metadata={
606
+ **stored["metadata"],
607
+ "job_id": job_id,
608
+ "submitted_at": final_job["submitted_at"],
609
+ "completed_at": final_job["completed_at"],
610
+ "stage": info.stage,
611
+ "retryable": info.retryable,
612
+ "failure_reason": info.failure_reason,
613
+ "logview": info.logview,
614
+ "task_summary": info.task_summary,
615
+ },
616
+ agent_hints=AgentHints(
617
+ next_actions=["job.result", "meta.describe"],
618
+ warnings=stored.get("agent_hints", {}).get("warnings", []),
619
+ ),
620
+ )
621
+ self.log("job.wait", envelope.status, envelope.metadata)
622
+ return envelope, events
623
+
624
def job_result(self, job_id: 'str', *, max_rows: 'int' = 100, cursor: 'str | None' = None) -> 'Envelope':
    """Return one page of rows for a finished job.

    If the job's status is not ``success``, its info envelope is returned
    instead of rows. Remote jobs are paged by the backend; local jobs are
    paged by slicing the stored rows, with ``next_cursor`` encoding the next
    offset while more rows remain.

    Args:
        job_id: Identifier of the job whose rows are wanted.
        max_rows: Page size; must be greater than 0.
        cursor: Pagination cursor; only its offset is used here.

    Raises:
        ValidationError: If ``max_rows`` is not positive. A zero page size
            would return an empty page whose ``next_cursor`` never advances.
    """
    if max_rows <= 0:
        # Same rule and wording as `query`, so both commands agree.
        raise ValidationError("`--max-rows` and `--page-size` must be greater than 0.")

    if self.remote_jobs:
        info = self.backend.get_job(job_id, project=self.config.default_project)
        if info.status != "success":
            envelope = self._job_info_envelope("job.result", info)
            self.log("job.result", envelope.status, envelope.metadata)
            return envelope
        offset, _ = decode_cursor(cursor)
        result = self.backend.fetch_job_result(
            job_id,
            project=self.config.default_project,
            max_rows=max_rows,
            offset=offset,
        )
        envelope = self._build_query_envelope(
            command="job.result",
            result=result,
            dry_run=False,
        )
        envelope.metadata.update(
            {
                "job_id": job_id,
                "submitted_at": info.submitted_at,
                "completed_at": info.completed_at,
                "logview": info.logview,
            }
        )
        self.log("job.result", envelope.status, envelope.metadata)
        return envelope

    jobs = self._ensure_job_store()
    job = jobs.get_job(job_id)
    info = self._local_job_info(job)
    if job["status"] != "success":
        envelope = self._job_info_envelope("job.result", info)
        self.log("job.result", envelope.status, envelope.metadata)
        return envelope

    stored = job["result"]
    stored_data = stored["data"]
    all_rows = stored_data.get("rows", [])
    schema = stored_data.get("schema", [])
    total_rows = stored_data.get("total_rows", len(all_rows))

    offset, _ = decode_cursor(cursor)  # session_id ignored for local jobs
    page_rows = all_rows[offset:offset + max_rows]
    returned_rows = len(page_rows)
    next_offset = offset + returned_rows
    has_more = next_offset < total_rows
    next_cursor = encode_cursor(next_offset) if has_more else None

    envelope = Envelope(
        command="job.result",
        status="success",
        data={
            "rows": page_rows,
            "schema": schema,
            "total_rows": total_rows,
            "returned_rows": returned_rows,
            "has_more": has_more,
            "next_cursor": next_cursor,
        },
        metadata={
            **stored["metadata"],
            "job_id": job_id,
            "submitted_at": job["submitted_at"],
            "completed_at": job.get("completed_at", job["updated_at"]),
            "stage": info.stage,
            "retryable": info.retryable,
            "failure_reason": info.failure_reason,
            "logview": info.logview,
            "task_summary": info.task_summary,
        },
        agent_hints=AgentHints(
            next_actions=["meta.describe"],
            warnings=stored.get("agent_hints", {}).get("warnings", []),
        ),
    )
    self.log("job.result", envelope.status, envelope.metadata)
    return envelope
703
+
704
def cancel_job(self, job_id: 'str') -> 'Envelope':
    """Cancel ``job_id`` and return a ``job.cancel`` envelope.

    Remote jobs are cancelled through the backend and the envelope carries
    the status the backend reports. Local jobs are recorded as
    ``status="failure"`` with ``cancelled=True``.

    Raises:
        ValidationError: If a local job has already completed successfully.
    """
    cancelled_payload = {"job_id": job_id, "cancelled": True}

    if not self.remote_jobs:
        store = self._ensure_job_store()
        current = store.get_job(job_id)
        if current["status"] == "success":
            raise ValidationError("The job is already complete and cannot be cancelled.")
        updated = store.update_job(job_id, status="failure", progress=0, cancelled=True)
        envelope = Envelope(
            command="job.cancel",
            status="failure",
            data=cancelled_payload,
            metadata={"project": updated["project"], "updated_at": updated["updated_at"]},
            agent_hints=AgentHints(next_actions=["job.submit"]),
        )
        self.log("job.cancel", envelope.status, envelope.metadata)
        return envelope

    info = self.backend.cancel_job(job_id, project=self.config.default_project)
    remote_metadata = {
        "project": info.project,
        "updated_at": info.updated_at,
        "logview": info.logview,
    }
    hints = AgentHints(
        next_actions=["job.status"],
        warnings=info.warnings,
    )
    envelope = Envelope(
        command="job.cancel",
        status=info.status,
        data=cancelled_payload,
        metadata=remote_metadata,
        agent_hints=hints,
    )
    self.log("job.cancel", envelope.status, envelope.metadata)
    return envelope
738
+
739
def job_diagnose(self, job_id: 'str') -> 'Envelope':
    """Return diagnostic details for ``job_id`` as a ``job.diagnose`` envelope.

    Remote jobs return whatever ``backend.diagnose_job`` produces. Local jobs
    are described from the stored job info plus a classification of its
    failure reason, with empty per-task details.
    """
    hints = AgentHints(next_actions=["job.status", "job.result"])

    if self.remote_jobs:
        data = self.backend.diagnose_job(job_id, project=self.config.default_project)
        metadata = {"project": self.config.default_project}
    else:
        store = self._ensure_job_store()
        info = self._local_job_info(store.get_job(job_id))
        diagnosis = classify_failure_reason(info.failure_reason)
        data = {
            "job_id": info.job_id,
            "status": info.status,
            "stage": info.stage,
            "retryable": info.retryable,
            "failure_reason": info.failure_reason,
            "diagnosis_category": diagnosis["category"],
            "diagnosis_summary": diagnosis["summary"],
            "logview": info.logview,
            "task_summary": info.task_summary,
            "task_statuses": [],
            "task_results": {},
        }
        metadata = {"project": info.project}

    envelope = Envelope(
        command="job.diagnose",
        status="success",
        data=data,
        metadata=metadata,
        agent_hints=hints,
    )
    self.log("job.diagnose", envelope.status, envelope.metadata)
    return envelope
777
+
778
def list_jobs(self, *, limit: 'int' = 20) -> 'Envelope':
    """List up to ``limit`` jobs as a ``job.list`` envelope.

    Remote jobs are listed by the backend for the default project. Local
    jobs come from the job store, truncated to the first ``limit`` entries.
    """
    summary_fields = ("job_id", "status", "progress", "project", "submitted_at")

    if self.remote_jobs:
        remote_items = self.backend.list_jobs(project=self.config.default_project, limit=limit)
        rows = []
        for item in remote_items:
            rows.append({field: getattr(item, field) for field in summary_fields})
        metadata = {"backend": "odps", "project": self.config.default_project}
    else:
        store = self._ensure_job_store()
        rows = []
        for item in store.list_jobs()[:limit]:
            rows.append({field: item[field] for field in summary_fields})
        metadata = {"state_file": str(store.path)}

    envelope = Envelope(
        command="job.list",
        status="success",
        data={"jobs": rows, "total": len(rows)},
        metadata=metadata,
        agent_hints=AgentHints(next_actions=["job.status", "job.wait"]),
    )
    self.log("job.list", envelope.status, envelope.metadata)
    return envelope
822
+
823
def meta_list_tables(self) -> 'Envelope':
    """List the default project's tables from the local metadata cache.

    This command never queries the backend. With an empty cache it returns a
    ``cache_miss`` envelope that directs the caller to build the cache.
    Both outcomes are written to the audit log.
    """
    started = monotonic()
    project = self.config.default_project

    # Try to get from cache first
    cached_tables = self.cache.get_all_cached_tables(project)

    if not cached_tables:
        # No cache - return guidance to build cache first
        envelope = Envelope(
            command="meta.list-tables",
            status="cache_miss",
            data={"tables": [], "total": 0},
            metadata=self._cache_metadata(
                project=project,
                source="none",
                query_time_ms=int((monotonic() - started) * 1000),
            ),
            agent_hints=AgentHints(
                next_actions=["cache.build"],
                insights=["No metadata cache found for this project. Building the cache first will significantly improve performance."],
                warnings=["Cache miss: Run `maxc cache build` to populate the metadata cache."],
            ),
        )
        # Log cache misses like every other command outcome.
        self.log("meta.list-tables", envelope.status, envelope.metadata)
        return envelope

    # Use cached data (returns list of dicts)
    source = "cache"
    rows = [
        {
            "table_name": table.get("table_name"),
            "table_type": table.get("table_type", "TABLE"),
            "size_bytes": table.get("size_bytes"),
            "owner": table.get("owner"),
            "description": table.get("description"),
            # Partition columns may be cached as dicts or as plain names.
            "partition_columns": [
                c.get("name") if isinstance(c, dict) else str(c)
                for c in table.get("partition_columns", [])
            ],
        }
        for table in cached_tables
    ]

    metadata = self._cache_metadata(
        project=project,
        source=source,
        query_time_ms=int((monotonic() - started) * 1000),
    )
    envelope = Envelope(
        command="meta.list-tables",
        status="success",
        data={"tables": rows, "total": len(rows)},
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["meta.describe", "data.sample"],
            insights=[f"Table list served from {source}."],
        ),
    )
    self.log("meta.list-tables", envelope.status, envelope.metadata)
    return envelope
883
+
884
def meta_describe(self, table_name: 'str', full: 'bool' = False) -> 'Envelope':
    """Describe ``table_name`` as a ``meta.describe`` envelope.

    When the table is in the local cache, its cached columns form the schema
    and the backend is asked only for enrichment (description, owner, size,
    timestamps, type, sample rows, partitions). If that backend call fails,
    the cached schema is still returned with a warning. Without a cache
    entry the table is described live by the backend.

    Cached semantic metadata is attached under ``payload["semantic"]``; a
    warning is added when none exists.

    Args:
        table_name: Name of the table to describe.
        full: Forwarded to ``_table_payload``; when false and the payload
            reports more columns, a hint about ``--full`` is added.
    """
    started = monotonic()
    project = self.config.default_project
    schema_name = self.config.default_schema or "default"
    warnings = []

    # Try to get from cache first
    cached_table = self.cache.get_cached_table(
        project,
        table_name,
        schema_name=schema_name,
    )

    if cached_table:
        # TableDefinition is already imported at module level.
        from .config import TableColumn

        # Build TableDefinition from cache
        columns = [
            TableColumn(name=c["name"], type=c["type"], comment=c.get("comment", ""))
            for c in cached_table.get("columns", [])
        ]
        # NOTE(review): assumes the cached "partitions" entries are plain
        # column names; confirm against the cache writer.
        partition_columns = [
            TableColumn(name=p, type="string", comment="")
            for p in cached_table.get("partitions", [])
        ]

        table = TableDefinition(
            name=table_name,
            description=cached_table.get("description", ""),
            columns=columns,
            sample_rows=[],  # Filled from the API below when available
            partitions=[],  # Filled from the API below when available
            partition_columns=partition_columns,
            owner=cached_table.get("owner"),
            size_bytes=cached_table.get("size_bytes"),
            # Use the cached type when present, as meta_list_tables does.
            table_type=cached_table.get("table_type") or "TABLE",
        )
        source = "cache"

        # Enrich from the API (description, owner, size, sample rows, partitions)
        try:
            api_table = self.backend.describe_table(table_name)
            # API values win over cached ones wherever the API provides them.
            table.description = api_table.description or table.description
            table.owner = api_table.owner or table.owner
            table.size_bytes = api_table.size_bytes or table.size_bytes
            table.created_at = api_table.created_at
            table.updated_at = api_table.updated_at
            table.table_type = api_table.table_type or table.table_type
            table.sample_rows = api_table.sample_rows
            table.partitions = api_table.partitions
        except Exception:
            # Deliberate best-effort: if the API fails, still return the cached schema.
            warnings.append("Backend API unavailable, showing cached schema only")
    else:
        # Fall back to live API
        table = self.backend.describe_table(table_name)
        source = "live"

    # Get semantic metadata from cache
    semantic = self.cache.get_semantic(
        project=project,
        table_name=table_name,
        schema_name=schema_name,
    )
    if not semantic:
        warnings.append(
            "Missing semantic metadata. Agent should generate it using its own LLM and save with: maxc meta semantic set"
        )

    payload = self._table_payload(table, full=full)

    # Add hint about --full flag in summary mode
    if not full and payload.get("has_more_columns"):
        warnings.append(
            f"Showing first 10 columns only. Use --full to see all {payload['column_count']} columns."
        )

    # Add semantic information to the payload
    payload["semantic"] = semantic

    envelope = Envelope(
        command="meta.describe",
        status="success",
        data=payload,
        metadata={
            "project": project,
            "source": source,
            # Timing is only reported for the live path.
            "query_time_ms": int((monotonic() - started) * 1000) if source == "live" else None,
        },
        agent_hints=AgentHints(
            next_actions=["data.sample", "data.profile", "query"],
            warnings=warnings,
        ),
    )
    self.log("meta.describe", envelope.status, envelope.metadata)
    return envelope
983
+
984
def meta_search(self, keyword: 'str') -> 'Envelope':
    """Search tables by keyword, using the local metadata cache when it has entries.

    If the cache returns tables for the default project, the search runs
    locally through ``_search_in_cache``; otherwise it is delegated to
    ``self.backend.search_tables``.

    Args:
        keyword: Search text.

    Returns:
        A ``meta.search`` envelope whose data holds the keyword, the matches
        and the number of matches.
    """
    t0 = monotonic()
    project = self.config.default_project
    cached_tables = self.cache.get_all_cached_tables(project)

    if cached_tables:
        source = "cache"
        matches = self._search_in_cache(keyword, cached_tables)
        hint_warnings = []
    else:
        source = "live"
        matches = self.backend.search_tables(keyword)
        hint_warnings = [
            "No metadata cache was used. Run `maxc cache build` to speed up future lookups."
        ]

    result = {"keyword": keyword, "matches": matches, "total": len(matches)}
    # Elapsed time is only reported for live lookups.
    elapsed_ms = int((monotonic() - t0) * 1000) if source == "live" else None
    metadata = self._cache_metadata(
        project=project,
        source=source,
        query_time_ms=elapsed_ms,
    )
    hints = AgentHints(
        next_actions=["meta.describe", "data.sample"],
        warnings=hint_warnings,
    )
    envelope = Envelope(
        command="meta.search",
        status="success",
        data=result,
        metadata=metadata,
        agent_hints=hints,
    )
    self.log("meta.search", envelope.status, envelope.metadata)
    return envelope
1009
+
1010
def meta_search_columns(self, keyword: 'str') -> 'Envelope':
    """Search columns by keyword, using the local metadata cache when it has entries.

    If the cache returns tables for the default project, the search runs
    locally through ``_search_columns_in_cache``; otherwise it is delegated
    to ``self.backend.search_columns``.

    Args:
        keyword: Search text.

    Returns:
        A ``meta.search-columns`` envelope whose data holds the keyword, the
        matches and the number of matches.
    """
    t0 = monotonic()
    project = self.config.default_project
    cached_tables = self.cache.get_all_cached_tables(project)

    if cached_tables:
        source = "cache"
        matches = self._search_columns_in_cache(keyword, cached_tables)
        hint_warnings = []
    else:
        source = "live"
        matches = self.backend.search_columns(keyword)
        hint_warnings = [
            "No metadata cache was used. Run `maxc cache build` to speed up future lookups."
        ]

    result = {"keyword": keyword, "matches": matches, "total": len(matches)}
    # Elapsed time is only reported for live lookups.
    elapsed_ms = int((monotonic() - t0) * 1000) if source == "live" else None
    metadata = self._cache_metadata(
        project=project,
        source=source,
        query_time_ms=elapsed_ms,
    )
    hints = AgentHints(
        next_actions=["meta.describe", "query"],
        warnings=hint_warnings,
    )
    envelope = Envelope(
        command="meta.search-columns",
        status="success",
        data=result,
        metadata=metadata,
        agent_hints=hints,
    )
    self.log("meta.search-columns", envelope.status, envelope.metadata)
    return envelope
1035
+
1036
def _search_in_cache(
    self, keyword: 'str', cached_tables: 'list[dict]'
) -> 'list[dict]':
    """Rank cached tables against a keyword.

    The keyword is split on whitespace into lowercase tokens. For every
    token, a hit in the table name/description adds 5 points and a hit in
    a column name/comment adds 2 points per matching column.

    Args:
        keyword: Search text; matched case-insensitively as substrings.
        cached_tables: Cached table records. Each must have ``table_name``;
            ``description`` and ``columns`` (dicts with ``name``/``comment``)
            are optional and may be ``None``.

    Returns:
        Up to 20 match dicts (``table_name``, ``description``, ``score``,
        ``matched_columns``) sorted by descending score. ``matched_columns``
        holds at most 5 distinct column names in first-match order.
    """
    # str.split() without arguments never yields blank tokens; fall back to
    # the raw keyword when it contains no tokens at all.
    tokens = [part.lower() for part in keyword.split()] or [keyword.lower()]

    results = []
    for table in cached_tables:
        # `or ''` keeps a None description/comment from being rendered as the
        # literal text "None", which would falsely match the token "none".
        table_text = f"{table['table_name']} {table.get('description') or ''}".lower()

        # Build each column's searchable text once instead of once per token.
        columns = []
        for col in table.get("columns") or []:
            name = col.get("name") or ""
            columns.append((name, f"{name} {col.get('comment') or ''}".lower()))

        score = 0
        hits = []
        for token in tokens:
            if token in table_text:
                score += 5
            for name, col_text in columns:
                if token in col_text:
                    score += 2
                    if name:
                        hits.append(name)

        if score:
            results.append({
                "table_name": table["table_name"],
                "description": table.get("description"),
                "score": score,
                # dict.fromkeys de-duplicates while keeping first-match order,
                # so the reported columns are deterministic across runs.
                "matched_columns": list(dict.fromkeys(hits))[:5],
            })
    return sorted(results, key=lambda item: -item["score"])[:20]
1062
+
1063
def _search_columns_in_cache(
    self, keyword: 'str', cached_tables: 'list[dict]'
) -> 'list[dict]':
    """Rank cached columns against a keyword.

    The keyword is split on whitespace into lowercase tokens; each token
    found in a column's name/comment text adds 2 points to that column.

    Args:
        keyword: Search text; matched case-insensitively as substrings.
        cached_tables: Cached table records. Each must have ``table_name``;
            ``columns`` (dicts with ``name``/``type``/``comment``) is optional
            and individual fields may be ``None``.

    Returns:
        Up to 50 match dicts (``table_name``, ``column_name``,
        ``column_type``, ``column_comment``, ``score``) sorted by descending
        score; ties keep their cache order.
    """
    # str.split() without arguments never yields blank tokens; fall back to
    # the raw keyword when it contains no tokens at all.
    tokens = [part.lower() for part in keyword.split()] or [keyword.lower()]

    results = []
    for table in cached_tables:
        for col in table.get("columns") or []:
            # `or ''` keeps a None name/comment from being rendered as the
            # literal text "None", which would falsely match the token "none".
            haystack = f"{col.get('name') or ''} {col.get('comment') or ''}".lower()
            score = 2 * sum(1 for token in tokens if token in haystack)
            if not score:
                continue
            results.append({
                "table_name": table["table_name"],
                # .get keeps this consistent with the lookup above and avoids
                # a KeyError for a column record without a name.
                "column_name": col.get("name"),
                "column_type": col.get("type"),
                "column_comment": col.get("comment"),
                "score": score,
            })
    return sorted(results, key=lambda item: -item["score"])[:50]
1082
+
1083
+ # ========== Semantic Metadata Methods ==========
1084
+
1085
def semantic_set(
    self,
    table_name: 'str',
    semantic_desc: 'str | None' = None,
    use_cases: 'list[str] | None' = None,
    sample_questions: 'list[str] | None' = None,
    column_semantics: 'list[dict[str, Any]] | None' = None,
    relations: 'list[dict[str, Any]] | None' = None,
    stats: 'dict[str, Any] | None' = None,
) -> 'Envelope':
    """Store agent-supplied semantic metadata for a table in the local cache.

    Missing description/list arguments are saved as an empty string or
    empty list; ``relations`` and ``stats`` are forwarded unchanged.

    Returns:
        A ``meta.semantic.set`` envelope: ``success`` with counts of what
        was saved, or ``failure`` carrying the exception text if saving
        (or building the success envelope) raised.
    """
    project = self.config.default_project
    try:
        schema = self.config.default_schema or "default"
        self.cache.save_semantic(
            project=project,
            table_name=table_name,
            semantic_desc=semantic_desc or "",
            use_cases=use_cases or [],
            sample_questions=sample_questions or [],
            column_semantics=column_semantics or [],
            schema_name=schema,
            relations=relations,
            stats=stats,
        )

        summary = {
            "action": "set_semantic",
            "table_name": table_name,
            "has_description": bool(semantic_desc),
            "use_cases_count": len(use_cases or ()),
            "sample_questions_count": len(sample_questions or ()),
            "column_semantics_count": len(column_semantics or ()),
        }
        envelope = Envelope(
            command="meta.semantic.set",
            status="success",
            data=summary,
            metadata={"project": project, "schema": schema},
            agent_hints=AgentHints(
                next_actions=[f"meta.describe {table_name} --json"],
                insights=["Semantic metadata has been saved to local cache."],
            ),
        )
    except Exception as exc:
        # Any failure is reported through the envelope rather than raised.
        failure = ErrorPayload(
            code="SEMANTIC_SET_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command="meta.semantic.set",
            status="failure",
            data=None,
            metadata={"project": project},
            error=failure,
            agent_hints=AgentHints(next_actions=["meta.semantic.set"]),
        )

    self.log("meta.semantic.set", envelope.status, envelope.metadata)
    return envelope
1150
+
1151
def semantic_get(
    self,
    table_name: 'str',
) -> 'Envelope':
    """Look up the cached semantic metadata of a table.

    Returns:
        A ``meta.semantic.get`` envelope. It is ``success`` both when
        metadata exists and when it does not (``semantic`` is then ``None``
        and a warning is attached); it is ``failure`` when the lookup raised.
    """
    project = self.config.default_project
    try:
        schema = self.config.default_schema or "default"
        semantic = self.cache.get_semantic(
            project=project,
            table_name=table_name,
            schema_name=schema,
        )

        if semantic:
            hints = AgentHints(
                next_actions=["meta.describe", "meta.semantic.set"],
            )
        else:
            hints = AgentHints(
                warnings=["No semantic metadata found. Use `maxc meta semantic set` to add metadata."],
                next_actions=[f"meta semantic set {table_name}"],
            )

        envelope = Envelope(
            command="meta.semantic.get",
            status="success",
            # A falsy lookup result is always reported as None.
            data={"table_name": table_name, "semantic": semantic or None},
            metadata={"project": project, "schema": schema},
            agent_hints=hints,
        )
    except Exception as exc:
        # Any failure is reported through the envelope rather than raised.
        failure = ErrorPayload(
            code="SEMANTIC_GET_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command="meta.semantic.get",
            status="failure",
            data=None,
            metadata={"project": project},
            error=failure,
            agent_hints=AgentHints(next_actions=["meta.semantic.get"]),
        )

    self.log("meta.semantic.get", envelope.status, envelope.metadata)
    return envelope
1217
+
1218
def semantic_list_missing(
    self,
) -> 'Envelope':
    """Report cached tables that have no semantic metadata.

    Tables are compared by ``table_name`` only. At most 50 missing tables
    are listed in the payload, while the counts cover all of them.

    Returns:
        A ``meta.semantic.list-missing`` envelope: ``success`` with counts
        and the table preview, or ``failure`` if a cache call raised.
    """
    project = self.config.default_project
    try:
        cached = self.cache.get_all_cached_tables(project=project)
        annotated = self.cache.get_all_semantics(project=project)

        annotated_names = {row["table_name"] for row in annotated}
        missing = [tbl for tbl in cached if tbl["table_name"] not in annotated_names]

        # Cap the listing so the payload stays small.
        preview = []
        for tbl in missing[:50]:
            preview.append(
                {
                    "table_name": tbl["table_name"],
                    "schema_name": tbl["schema_name"],
                    "description": tbl.get("description", ""),
                    "column_count": len(tbl.get("columns", [])),
                }
            )

        if missing:
            follow_ups = [f"meta semantic set {missing[0]['table_name']}"]
        else:
            follow_ups = []

        envelope = Envelope(
            command="meta.semantic.list-missing",
            status="success",
            data={
                "total_cached_tables": len(cached),
                "with_semantic": len(annotated),
                "missing_semantic": len(missing),
                "tables": preview,
            },
            metadata={"project": project},
            agent_hints=AgentHints(
                insights=[
                    f"{len(missing)} tables lack semantic metadata.",
                    "Run `maxc meta semantic set <table>` to add metadata for each table.",
                ],
                next_actions=follow_ups,
            ),
        )
    except Exception as exc:
        # Any failure is reported through the envelope rather than raised.
        failure = ErrorPayload(
            code="SEMANTIC_LIST_MISSING_ERROR",
            message=str(exc),
            recoverable=False,
            suggestion="Check the error message and try again.",
        )
        envelope = Envelope(
            command="meta.semantic.list-missing",
            status="failure",
            data=None,
            metadata={"project": project},
            error=failure,
            agent_hints=AgentHints(next_actions=["meta.semantic.list-missing"]),
        )

    self.log("meta.semantic.list-missing", envelope.status, envelope.metadata)
    return envelope
1291
+
1292
def meta_latest_partition(self, table_name: 'str') -> 'Envelope':
    """Return the backend's latest-partition information for a table.

    The suggested next actions depend on whether the payload reports
    ``has_partitions``.
    """
    payload, warnings = self.backend.latest_partition_info(table_name)

    if payload.get("has_partitions"):
        follow_ups = ["meta.freshness", "data.sample", "query"]
    else:
        follow_ups = ["meta.describe", "data.sample"]

    hints = AgentHints(next_actions=follow_ups, warnings=warnings)
    envelope = Envelope(
        command="meta.latest-partition",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.latest-partition", envelope.status, envelope.metadata)
    return envelope
1306
+
1307
def meta_freshness(self, table_name: 'str') -> 'Envelope':
    """Return the backend's freshness information for a table.

    When the payload's ``freshness_status`` is ``"stale"``, ``job.submit``
    is placed first in the suggested next actions.
    """
    payload, warnings = self.backend.freshness_info(table_name)

    follow_ups = ["meta.latest-partition", "data.sample", "query"]
    if payload.get("freshness_status") == "stale":
        follow_ups = ["job.submit", *follow_ups]

    hints = AgentHints(next_actions=follow_ups, warnings=warnings)
    envelope = Envelope(
        command="meta.freshness",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.freshness", envelope.status, envelope.metadata)
    return envelope
1321
+
1322
def meta_lineage(self, table_name: 'str') -> 'Envelope':
    """Return the backend's lineage information for a table."""
    lineage, notes = self.backend.lineage_info(table_name)
    hints = AgentHints(next_actions=["meta.describe"], warnings=notes)
    envelope = Envelope(
        command="meta.lineage",
        status="success",
        data=lineage,
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("meta.lineage", envelope.status, envelope.metadata)
    return envelope
1333
+
1334
def cache_build(
    self,
    *,
    project: 'str | None' = None,
    max_workers: 'int' = 8,
    schema_name: 'str | None' = None,
    async_mode: 'bool' = False,
    progress_callback: 'Callable[[dict[str, Any]], None] | None' = None,
) -> 'Envelope':
    """Build the metadata cache for the tables listed by the backend.

    Args:
        project: Project name recorded for the build; defaults to the
            configured default project.
        max_workers: Number of concurrent workers.
        schema_name: Schema label reported in the result and progress
            events. It does not currently filter the table listing.
        async_mode: If True, start the build in a daemon thread and return
            immediately with a ``running`` envelope carrying the build_id.
        progress_callback: Optional callback for build progress events. It
            receives the listing events in both modes and the build events
            in synchronous mode only.

    Returns:
        The ``cache.build`` envelope: ``running`` in async mode, otherwise
        the result of the synchronous build.
    """
    import threading
    import uuid

    target_project = project or self.config.default_project
    if progress_callback is not None:
        progress_callback(
            {
                "type": "listing_start",
                "project": target_project,
                "schema_name": schema_name,
            }
        )

    # NOTE(review): the previous code branched on `schema_name` but assigned
    # the full listing in both branches, so no schema filtering ever happened.
    # The dead branch is removed; behaviour is unchanged. The listing also
    # takes no project argument, so it presumably uses the backend's current
    # project rather than `target_project` — confirm against OdpsBackend.
    tables = self.backend.list_tables()

    if progress_callback is not None:
        progress_callback(
            {
                "type": "listing_complete",
                "project": target_project,
                "schema_name": schema_name,
                "total_tables": len(tables),
            }
        )

    build_id = str(uuid.uuid4())[:8]

    if not async_mode:
        return self._build_cache_sync(
            target_project, build_id, tables, max_workers, schema_name, progress_callback=progress_callback
        )

    # Register the build before the thread starts so a status query issued
    # right after this call already finds a record.
    self.cache.start_build(target_project, build_id, len(tables))
    envelope = Envelope(
        command="cache.build",
        status="running",
        data={
            "action": "build",
            "build_id": build_id,
            "mode": "async",
            "scope": "schema" if schema_name else "project",
            "schema_name": schema_name,
            "tables_scanned": len(tables),
            "total_tables": len(tables),
            "cache_location": str(self.cache.db_path),
            "message": "Cache build started. Use `cache build-status` to track progress.",
        },
        metadata={"project": target_project, "build_id": build_id},
        agent_hints=AgentHints(
            next_actions=["cache.build-status"],
            insights=["The metadata cache build is running in the background."],
        ),
    )
    # The final False is initialize_status: the build record was created above.
    thread = threading.Thread(
        target=self._build_cache_background,
        args=(target_project, build_id, tables, max_workers, schema_name, False),
        daemon=True,
    )
    thread.start()
    self.log("cache.build", "running", envelope.metadata)
    return envelope
1417
+
1418
def _build_cache_sync(
    self,
    project: 'str',
    build_id: 'str',
    tables: 'list',
    max_workers: 'int',
    schema_name: 'str | None' = None,
    initialize_status: 'bool' = True,
    progress_callback: 'Callable[[dict[str, Any]], None] | None' = None,
) -> 'Envelope':
    """Describe every table through the backend and store it in the cache.

    Tables are described concurrently in a thread pool; results are
    tallied and progress is recorded in the calling thread as each task
    completes. Per-table failures are collected instead of raised.

    Args:
        project: Project the cache entries are stored under.
        build_id: Identifier of this build in the cache's build records.
        tables: Objects exposing a ``name`` attribute.
        max_workers: Thread pool size.
        schema_name: Schema label echoed in events and the result payload.
        initialize_status: Whether to create the build record first.
        progress_callback: Optional receiver of ``build_start``,
            ``progress`` and ``completed`` events.

    Returns:
        A ``cache.build`` envelope with status ``success`` or, if any table
        failed, ``partial``.
    """
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    t0 = monotonic()
    total = len(tables)
    cached_count = 0
    tally = {"created": 0, "updated": 0}
    failures: 'list[str]' = []
    guard = threading.Lock()

    def notify(kind, **extra):
        # Every event shares the same leading keys; extras follow in call order.
        if progress_callback is None:
            return
        progress_callback(
            {
                "type": kind,
                "project": project,
                "schema_name": schema_name,
                "build_id": build_id,
                **extra,
            }
        )

    if initialize_status:
        self.cache.start_build(project, build_id, total)
    notify("build_start", total_tables=total)

    def fetch_and_cache(
        table_name: 'str',
        table_schema: 'str' = "default",
    ) -> 'tuple[str, str | None]':
        # Returns (outcome, error); error is None unless the table failed.
        try:
            described = self.backend.describe_table(table_name)
            already_cached = self.cache.get_cached_table(project, described.name, table_schema)
            column_rows = [
                {"name": col.name, "type": col.type, "comment": col.comment}
                for col in described.columns
            ]
            self.cache.cache_table(
                project=project,
                table_name=described.name,
                description=described.description,
                columns=column_rows,
                partitions=described.partitions,
                schema_name=table_schema,
            )
        except Exception as exc:
            return "error", f"{table_name}: {exc}"
        return ("updated" if already_cached else "created"), None

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(fetch_and_cache, tbl.name): tbl.name for tbl in tables}
        for finished in as_completed(pending):
            outcome, failure = finished.result()
            with guard:
                if failure:
                    failures.append(failure)
                else:
                    cached_count += 1
                    bucket = "updated" if outcome == "updated" else "created"
                    tally[bucket] += 1
                self.cache.update_build_progress(
                    project, build_id, cached_count, len(failures)
                )
            notify(
                "progress",
                cached_tables=cached_count,
                failed_tables=len(failures),
                total_tables=total,
            )

    if failures:
        self.cache.complete_build(project, build_id, error_message=f"{len(failures)} errors")
    else:
        self.cache.complete_build(project, build_id)

    stats = self.cache.get_cache_stats(project)
    elapsed_ms = int((monotonic() - t0) * 1000)

    if failures:
        final_status = "partial"
        hint_warnings = [f"Failed to cache {len(failures)} table(s)."]
    else:
        final_status = "success"
        hint_warnings = []

    envelope = Envelope(
        command="cache.build",
        status=final_status,
        data={
            "action": "build",
            "build_id": build_id,
            "mode": "sync",
            "scope": "schema" if schema_name else "project",
            "schema_name": schema_name,
            "tables_scanned": total,
            "cache_entries_created": tally["created"],
            "cache_entries_updated": tally["updated"],
            "cached_tables": cached_count,
            "total_tables": total,
            "tables_failed": len(failures),
            "elapsed_ms": elapsed_ms,
            "cache_location": str(self.cache.db_path),
            # Only the first ten error messages are returned.
            "errors": failures[:10],
            "stats": stats,
        },
        metadata={"project": project, "build_id": build_id},
        agent_hints=AgentHints(
            next_actions=["meta.search", "meta.search-columns"],
            warnings=hint_warnings,
        ),
    )
    notify(
        "completed",
        cached_tables=cached_count,
        failed_tables=len(failures),
        total_tables=total,
        status=envelope.status,
    )
    self.log("cache.build", envelope.status, envelope.metadata)
    return envelope
1552
+
1553
def _build_cache_background(
    self,
    project: 'str',
    build_id: 'str',
    tables: 'list',
    max_workers: 'int',
    schema_name: 'str | None' = None,
    initialize_status: 'bool' = False,
) -> 'None':
    """Run the synchronous build and record any exception as a failed build.

    The envelope returned by ``_build_cache_sync`` is discarded. If the
    build raises, the exception text is stored through
    ``self.cache.complete_build`` instead of propagating.
    """
    try:
        self._build_cache_sync(
            project,
            build_id,
            tables,
            max_workers,
            schema_name,
            initialize_status=initialize_status,
        )
    except Exception as failure:
        reason = str(failure)
        self.cache.complete_build(project, build_id, error_message=reason)
1574
+
1575
def cache_build_status(
    self, *, project: 'str | None' = None, build_id: 'str | None' = None
) -> 'Envelope':
    """Report the cache's build record for a project.

    Returns:
        A ``cache.build-status`` envelope: ``success`` with the stored
        status record, or ``not_found`` when the cache returns nothing.
    """
    target_project = project or self.config.default_project
    status = self.cache.get_build_status(target_project, build_id)

    if not status:
        return Envelope(
            command="cache.build-status",
            status="not_found",
            data={"message": "No cache build record was found."},
            metadata={"project": target_project},
            agent_hints=AgentHints(next_actions=["cache.build"]),
        )

    state = status["status"]
    # A finished build suggests starting a new one; otherwise keep polling.
    if state in ("failed", "completed"):
        follow_ups = ["cache.build"]
    else:
        follow_ups = ["cache.build-status"]

    insights = []
    if state == "running":
        insights.append(
            f"Build progress: {status['progress_percent']}% ({status['processed_tables']}/{status['total_tables']})"
        )

    return Envelope(
        command="cache.build-status",
        status="success",
        data=status,
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=follow_ups, insights=insights),
    )
1610
+
1611
def cache_status(self, *, project: 'str | None' = None, schema_name: 'str | None' = None) -> 'Envelope':
    """Report cache statistics plus the cached schemas of a project.

    The suggested next action is ``cache.build`` when the statistics show
    ``table_count`` of 0, otherwise ``meta.search``.
    """
    target_project = project or self.config.default_project
    stats = self.cache.get_cache_stats(target_project, schema_name)
    schemas = self.cache.get_schemas(target_project)

    payload = dict(stats)
    payload["schemas"] = schemas

    if stats["table_count"] == 0:
        follow_ups = ["cache.build"]
    else:
        follow_ups = ["meta.search"]

    return Envelope(
        command="cache.status",
        status="success",
        data=payload,
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=follow_ups),
    )
1630
+
1631
def cache_clear(self, *, project: 'str | None' = None, schema_name: 'str | None' = None) -> 'Envelope':
    """Clear the cached table metadata and report what the cache returned.

    The value returned by ``self.cache.clear_table_cache`` is exposed as
    ``deleted_tables``.
    """
    target_project = project or self.config.default_project
    removed = self.cache.clear_table_cache(target_project, schema_name)
    return Envelope(
        command="cache.clear",
        status="success",
        data={"deleted_tables": removed},
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=["cache.build"]),
    )
1643
+
1644
def cache_save_semantic(
    self,
    *,
    table_name: 'str',
    semantic_desc: 'str',
    use_cases: 'list[str]',
    sample_questions: 'list[str]',
    column_semantics: 'list[dict]',
    project: 'str | None' = None,
    schema_name: 'str' = "default",
) -> 'Envelope':
    """Save semantic metadata for a table into the local cache.

    Exceptions raised by the cache are not caught here.

    Returns:
        A ``cache.save-semantic`` envelope echoing the description and the
        sizes of the saved lists.
    """
    target_project = project or self.config.default_project
    self.cache.save_semantic(
        project=target_project,
        table_name=table_name,
        semantic_desc=semantic_desc,
        use_cases=use_cases,
        sample_questions=sample_questions,
        column_semantics=column_semantics,
        schema_name=schema_name,
    )

    summary = {
        "table_name": table_name,
        "schema_name": schema_name,
        "semantic_desc": semantic_desc,
        "use_cases_count": len(use_cases),
        "sample_questions_count": len(sample_questions),
        "column_semantics_count": len(column_semantics),
    }
    hints = AgentHints(
        next_actions=["meta.search", "cache.get-semantic"],
        insights=["Semantic metadata has been saved and can now support NL2SQL workflows."],
    )
    return Envelope(
        command="cache.save-semantic",
        status="success",
        data=summary,
        metadata={"project": target_project},
        agent_hints=hints,
    )
1684
+
1685
def cache_get_semantic(
    self, *, table_name: 'str', project: 'str | None' = None, schema_name: 'str' = "default"
) -> 'Envelope':
    """Fetch the cached semantic metadata of a table.

    Returns:
        A ``cache.get-semantic`` envelope: ``success`` with the semantic
        fields merged after ``table_name``/``schema_name``, or ``not_found``
        when the cache has nothing for the table.
    """
    target_project = project or self.config.default_project
    semantic = self.cache.get_semantic(target_project, table_name, schema_name)
    identity = {"table_name": table_name, "schema_name": schema_name}

    if not semantic:
        return Envelope(
            command="cache.get-semantic",
            status="not_found",
            data=identity,
            metadata={"project": target_project},
            agent_hints=AgentHints(
                next_actions=["cache.save-semantic"],
                warnings=["No semantic metadata was found. Run the semantic-index skill first."],
            ),
        )

    # Semantic fields are merged last, so they win on a key collision.
    return Envelope(
        command="cache.get-semantic",
        status="success",
        data={**identity, **semantic},
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=["query"]),
    )
1711
+
1712
def meta_partitions(self, table_name: 'str') -> 'Envelope':
    """Return the partitions reported by the backend's table description."""
    described = self.backend.describe_table(table_name)
    payload = {"table_name": described.name, "partitions": described.partitions}
    envelope = Envelope(
        command="meta.partitions",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=AgentHints(next_actions=["query"]),
    )
    self.log("meta.partitions", envelope.status, envelope.metadata)
    return envelope
1723
+
1724
def meta_list_projects(self) -> 'Envelope':
    """List the projects returned by the backend for the current identity."""
    projects = self.backend.list_projects()
    count = len(projects)
    hints = AgentHints(
        next_actions=["session.set", "meta.list-schemas"],
        insights=[
            f"Discovered {count} project(s) owned by the current identity.",
            "Choose a project, then run `maxc session set --project <project_name> --json` before listing schemas or tables.",
        ],
    )
    envelope = Envelope(
        command="meta.list-projects",
        status="success",
        data={"projects": projects, "total": count},
        metadata={"backend": "odps"},
        agent_hints=hints,
    )
    self.log("meta.list-projects", envelope.status, envelope.metadata)
    return envelope
1742
+
1743
def meta_list_schemas(self, *, project: 'str | None' = None) -> 'Envelope':
    """List the schema names of a project.

    Only the ``name`` of each schema returned by the backend is kept. An
    empty result adds a warning that schema namespaces may be disabled.
    """
    target_project = project or self.config.default_project

    rows = []
    for schema in self.backend.list_schemas(project=target_project):
        rows.append({"name": schema["name"]})

    if rows:
        hint_warnings = []
    else:
        hint_warnings = [
            "No schemas were returned. Schema namespaces may not be enabled for this project."
        ]

    envelope = Envelope(
        command="meta.list-schemas",
        status="success",
        data={"schemas": rows, "total": len(rows)},
        metadata={"project": target_project},
        agent_hints=AgentHints(
            next_actions=["meta.list-tables", "meta.search"],
            warnings=hint_warnings,
        ),
    )
    self.log("meta.list-schemas", envelope.status, envelope.metadata)
    return envelope
1760
+
1761
def session_set(self, project: 'str | None' = None, schema: 'str | None' = None) -> 'Envelope':
    """Set default project and/or schema.

    Saves to session override file (~/.maxc/session_override.yaml) which has
    highest priority (above environment variables).

    Args:
        project: New default project. When a backend is loaded, access is
            verified through ``get_project_info`` before anything is saved.
        schema: New default schema. An empty string clears the schema;
            ``None`` leaves it untouched.

    Returns:
        A ``session.set`` envelope with the effective project/schema, the
        override file path and the list of applied changes.

    Raises:
        ValidationError: If the backend cannot access ``project``.
    """
    from .config import session_override_path, save_config_mapping, _load_yaml_file

    override_path = session_override_path()
    override = _load_yaml_file(override_path)

    changes = []
    warnings: 'list[str]' = []

    if project:
        if self.backend is not None:
            try:
                self.backend.get_project_info(project)
            except Exception as exc:
                raise ValidationError(
                    f"Unable to access project `{project}`: {exc}",
                    suggestion="Verify the project name and that the current identity has access.",
                ) from exc
        else:
            warnings.append(
                "Project override was saved without remote validation because no authenticated backend session is active."
            )
        override["project"] = project
        changes.append(f"project set to `{project}`")
        # Warn if session override project differs from the project saved in auth config
        if self.config.auth.project and project != self.config.auth.project:
            warnings.append(
                f"Session project override (`{project}`) differs from the project saved in auth config "
                f"(`{self.config.auth.project}`). Operations will use `{project}`, but credentials "
                f"were configured for `{self.config.auth.project}`. Run `auth whoami` to verify access."
            )

    if schema:
        override["schema"] = schema
        changes.append(f"schema set to `{schema}`")
    elif schema is not None:  # explicitly cleared with --schema=""
        # Only report a change when an override was actually removed.
        if "schema" in override:
            del override["schema"]
            changes.append("schema cleared")

    save_config_mapping(override_path, override)

    # Update current config in memory
    if project:
        self.config.default_project = project
        # Update backend project if available
        if self.backend is not None:
            self.backend.project = project
    if schema:
        self.config.default_schema = schema
    elif schema is not None:
        self.config.default_schema = None

    # Joining an empty change list used to yield "Session override set: .";
    # state plainly that nothing changed instead.
    if changes:
        insight = f"Session override set: {', '.join(changes)}. Overrides environment variables."
    else:
        insight = "No session override changes were applied."

    envelope = Envelope(
        command="session.set",
        status="success",
        data={
            "project": self.config.default_project,
            "schema": self.config.default_schema,
            "override_path": str(override_path),
            "changes": changes,
        },
        metadata={},
        agent_hints=AgentHints(
            next_actions=["meta.list-tables", "meta.list-schemas", "session.show"],
            insights=[insight],
            warnings=warnings,
        ),
    )
    self.log("session.set", envelope.status, {"changes": changes})
    return envelope
1837
+
1838
def session_unset(self) -> 'Envelope':
    """Clear session override, reverting to environment variables and config file.

    The override file is deleted if present. The in-memory configuration
    of this instance is not modified.

    Returns:
        A ``session.unset`` envelope; ``cleared`` lists ``project`` and
        ``schema`` when a file was removed and is empty otherwise.
    """
    from .config import session_override_path

    override_path = session_override_path()

    # Delete first and treat "already gone" as nothing to clear. This avoids
    # the check-then-delete race where another process removes the file
    # between exists() and unlink().
    try:
        override_path.unlink()
    except FileNotFoundError:
        cleared = []
    else:
        cleared = ["project", "schema"]

    envelope = Envelope(
        command="session.unset",
        status="success",
        data={
            "cleared": cleared,
            "override_path": str(override_path),
        },
        metadata={},
        agent_hints=AgentHints(
            next_actions=["session.show", "session.set"],
            insights=["Session override cleared. Configuration will use environment variables and config file."],
        ),
    )
    self.log("session.unset", envelope.status, {})
    return envelope
1864
+
1865
def session_show(self) -> 'Envelope':
    """Describe the effective project/schema and where each value comes from.

    The project source is ``session_override`` when the override file sets
    a project, else ``environment`` when ``MAXCOMPUTE_PROJECT`` or
    ``ODPS_PROJECT`` is set, else ``config_file``. The schema source is
    ``session_override`` or ``config_file``. Project info is fetched from
    the backend when one is loaded; a fetch failure becomes a warning.
    """
    from .config import session_override_path, default_global_config_path, _load_yaml_file
    import os

    override_path = session_override_path()
    config_path = default_global_config_path()
    override = _load_yaml_file(override_path)

    env_project = os.environ.get("MAXCOMPUTE_PROJECT") or os.environ.get("ODPS_PROJECT")
    if override.get("project"):
        project_source = "session_override"
    else:
        project_source = "environment" if env_project else "config_file"

    schema_source = "session_override" if override.get("schema") else "config_file"

    project_info = None
    hint_warnings = []
    if self.backend is not None:
        try:
            raw_info = self.backend.get_project_info(self.config.default_project)
            # Values are stringified, with None kept as None.
            project_info = {
                key: (None if value is None else str(value))
                for key, value in raw_info.items()
            }
        except Exception:
            hint_warnings = ["Could not fetch project info from backend"]

    payload = {
        "project": {
            "value": self.config.default_project,
            "source": project_source,
        },
        "schema": {
            "value": self.config.default_schema,
            "source": schema_source,
        },
        "override_path": str(override_path),
        "config_path": str(config_path) if config_path.exists() else None,
        "project_info": project_info,
        "config_sources": [str(p) for p in self.config.sources],
    }
    hints = AgentHints(
        next_actions=["session.set", "session.unset", "meta.list-tables"],
        warnings=hint_warnings,
        insights=[
            f"Project `{self.config.default_project}` from {project_source}",
            f"Schema `{self.config.default_schema or 'default'}` from {schema_source}",
        ],
    )
    envelope = Envelope(
        command="session.show",
        status="success",
        data=payload,
        metadata={},
        agent_hints=hints,
    )
    self.log("session.show", envelope.status, {})
    return envelope
1928
+
1929
def data_sample(
    self,
    table_name: 'str',
    rows: 'int' = 5,
    *,
    partition: 'str | None' = None,
    columns: 'list[str] | None' = None,
) -> 'Envelope':
    """Fetch a small sample of rows from a table via the backend.

    Args:
        table_name: Table to sample.
        rows: Number of rows requested; must be greater than 0.
        partition: Optional partition passed through to the backend.
        columns: Optional column list passed through to the backend.

    Returns:
        A ``data.sample`` success envelope with the sampled rows and the
        schema/partition/column details reported by the backend.

    Raises:
        ValidationError: If ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValidationError("`--rows` must be greater than 0.")

    table, sample_rows, sample_info = self.backend.sample_table(
        table_name, rows, partition=partition, columns=columns
    )

    data = {
        "table_name": table.name,
        "rows": sample_rows,
        "returned_rows": len(sample_rows),
        "schema": sample_info["schema"],
        "applied_partition": sample_info["applied_partition"],
        "selected_columns": sample_info["selected_columns"],
    }
    # Metadata echoes the request as given, not what the backend applied.
    metadata = {
        "project": self.config.default_project,
        "requested_rows": rows,
        "requested_partition": partition,
        "requested_columns": columns or [],
    }
    envelope = Envelope(
        command="data.sample",
        status="success",
        data=data,
        metadata=metadata,
        agent_hints=AgentHints(next_actions=["data.profile", "query"]),
    )
    self.log("data.sample", envelope.status, envelope.metadata)
    return envelope
1966
+
1967
def data_profile(self, table_name: 'str', *, partition: 'str | None' = None) -> 'Envelope':
    """Return the backend's profile of a table as a ``data.profile`` envelope.

    Args:
        table_name: Table to profile.
        partition: Optional partition forwarded to the backend.
    """
    metadata = {
        "project": self.config.default_project,
        "requested_partition": partition,
    }
    envelope = Envelope(
        command="data.profile",
        status="success",
        data=self.backend.profile_table(table_name, partition=partition),
        metadata=metadata,
        agent_hints=AgentHints(next_actions=["query"]),
    )
    self.log("data.profile", envelope.status, envelope.metadata)
    return envelope
1978
+
1979
def auth_login(
    self,
    *,
    access_id: 'str | None' = None,
    secret_access_key: 'str | None' = None,
    security_token: 'str | None' = None,
    project: 'str | None' = None,
    endpoint: 'str | None' = None,
    region_name: 'str | None' = None,
    tunnel_endpoint: 'str | None' = None,
    from_env: 'bool' = False,
    no_validate: 'bool' = False,
    target_config_path: 'Path | None' = None,
) -> 'Envelope':
    """Resolve access-key / STS credentials, save them, and report the identity.

    Each field is resolved through ``_resolve_login_value`` from, in turn,
    the explicit argument, the environment (only with ``from_env``), the
    existing config at the target path, and finally an interactive prompt.
    The provider becomes ``sts_token`` when a security token is present and
    ``access_key`` otherwise.

    Args:
        access_id, secret_access_key, security_token, project, endpoint,
        region_name, tunnel_endpoint: Explicit values for the auth fields.
        from_env: Take values from the environment settings.
        no_validate: Only check the shape of the config instead of
            resolving a connection and building the identity payload.
        target_config_path: Config file to write; defaults to the global
            config path.

    Returns:
        An ``auth.login`` success envelope.
    """
    target_path = target_config_path or default_global_config_path()
    existing_payload = load_config_mapping(target_path) if target_path.exists() else {}
    existing_auth = AuthConfig.from_mapping(existing_payload.get("auth", {}) or {})
    env_settings = load_odps_env()

    # (field name, explicit value, prompt label, required, secret).
    # The order of this table is the order in which prompts may appear.
    field_table = (
        ("access_id", access_id, "Access Key ID", True, False),
        ("secret_access_key", secret_access_key, "Access Key Secret", True, True),
        ("security_token", security_token, "STS Security Token (optional)", False, True),
        ("project", project, "MaxCompute Project", True, False),
        ("endpoint", endpoint, "MaxCompute Endpoint", True, False),
        ("region_name", region_name, "MaxCompute Region (optional)", False, False),
        ("tunnel_endpoint", tunnel_endpoint, "MaxCompute Tunnel Endpoint (optional)", False, False),
    )
    resolved_fields = {}
    for field_name, explicit, label, is_required, is_secret in field_table:
        resolved_fields[field_name] = self._resolve_login_value(
            provided=explicit,
            env_value=env_settings.get(field_name),
            existing_value=getattr(existing_auth, field_name),
            prompt=label,
            required=is_required,
            secret=is_secret,
            use_env=from_env,
        )
    resolved_auth = AuthConfig(**resolved_fields)

    if resolved_auth.security_token:
        resolved_auth.provider = "sts_token"
    else:
        resolved_auth.provider = "access_key"

    if no_validate:
        self._validate_auth_config_shape(resolved_auth)
    else:
        resolve_auth_connection(self.config, auth_override=resolved_auth)

    persist_login_config(target_path, auth=resolved_auth)

    warnings: 'list[str]' = []
    self._clear_session_override(warnings)

    credential_env_names = ("access_id", "secret_access_key", "security_token", "project", "endpoint")
    if from_env:
        warnings.append(
            "Credentials were imported from environment variables and saved to config."
        )
    elif any(env_settings.get(name) for name in credential_env_names):
        warnings.append(
            "MaxCompute-related environment variables are set in the current shell; they may override the config you just saved."
        )

    if not no_validate:
        payload, validate_warnings = self._validate_auth_config(resolved_auth)
        payload["saved"] = True
        payload["validated"] = True
        warnings.extend(validate_warnings)
    else:
        # Configuration-only payload: nothing was confirmed remotely.
        payload = {
            "authenticated": None,
            "configured": True,
            "validation_status": "configuration_only",
            "backend": "odps",
            "auth_type": resolved_auth.provider,
            "identity_source": "config_file",
            "principal_display": mask_access_id(resolved_auth.access_id),
            "principal_masked": mask_access_id(resolved_auth.access_id),
            "project": resolved_auth.project,
            "region": resolved_auth.region_name,
            "endpoint": resolved_auth.endpoint,
            "project_owner": None,
            "allowed_operations": self.config.allowed_operations,
            "saved": True,
            "validated": False,
        }
        if resolved_auth.security_token:
            payload["token_expires_at"] = resolved_auth.token_expires_at
        warnings.append("Authentication settings were saved without remote validation.")

    metadata = {
        "config_path": str(target_path),
        "written_fields": sorted(resolved_auth.to_mapping().keys()),
        "auth_storage": "config_file",
    }
    envelope = Envelope(
        command="auth.login",
        status="success",
        data=payload,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["auth.whoami", "meta.list-tables"],
            warnings=warnings,
        ),
    )
    self.log("auth.login", envelope.status, envelope.metadata)
    return envelope
2131
+
2132
def auth_login_ncs(
    self,
    *,
    account_type: 'str | None' = None,
    employee_id: 'str | None' = None,
    account_name: 'str | None' = None,
    app_name: 'str | None' = None,
    project: 'str | None' = None,
    endpoint: 'str | None' = None,
    region_name: 'str | None' = None,
    tunnel_endpoint: 'str | None' = None,
    interactive: 'bool' = False,
    list_accounts_mode: 'bool' = False,
    no_validate: 'bool' = False,
    target_config_path: 'Path | None' = None,
) -> 'Envelope':
    """Configure ncs-based authentication, or list the available ncs accounts.

    Explicit arguments win over values already stored in the config at the
    target path. With ``interactive`` set, missing values are requested
    through ``_prompt_text`` using the stored values as defaults.

    Args:
        account_type: ncs account type (the prompt offers user/account/app).
        employee_id, account_name, app_name: Identifier for the respective
            account type.
        project, endpoint, region_name, tunnel_endpoint: Connection fields.
        interactive: Prompt for values that were not supplied.
        list_accounts_mode: Only list accounts for the account type and
            return, without writing any configuration.
        no_validate: Only check the shape of the config instead of
            resolving a connection and building the identity payload.
        target_config_path: Config file to write; defaults to the global
            config path.

    Returns:
        An ``auth.login-ncs`` success envelope.

    Raises:
        ValidationError: If no account type can be determined.
    """
    target_path = target_config_path or default_global_config_path()
    existing_payload = load_config_mapping(target_path) if target_path.exists() else {}
    existing_auth = AuthConfig.from_mapping(existing_payload.get("auth", {}) or {})

    if list_accounts_mode:
        # Normalize exactly like the login path below so that values such as
        # "User" or " user " behave the same for listing and for login.
        normalized_type = (
            (account_type or existing_auth.ncs.account_type or "").strip().lower() or "user"
        )
        payload = list_ncs_accounts(normalized_type)
        envelope = Envelope(
            command="auth.login-ncs",
            status="success",
            data=payload,
            metadata={"config_path": str(target_path)},
            agent_hints=AgentHints(next_actions=["auth.login-ncs"]),
        )
        self.log("auth.login-ncs", envelope.status, envelope.metadata)
        return envelope

    if interactive:
        account_type = account_type or self._prompt_text(
            "ncs account type (user/account/app)",
            default=existing_auth.ncs.account_type,
        )
        normalized_type = (account_type or "").strip().lower()
        # Only the identifier that matches the chosen account type is requested.
        if normalized_type == "user":
            employee_id = employee_id or self._prompt_text(
                "Employee ID", default=existing_auth.ncs.employee_id
            )
        elif normalized_type == "account":
            account_name = account_name or self._prompt_text(
                "Account name", default=existing_auth.ncs.account_name
            )
        elif normalized_type == "app":
            app_name = app_name or self._prompt_text(
                "App name", default=existing_auth.ncs.app_name
            )
        project = project or self._prompt_text(
            "MaxCompute Project", default=existing_auth.project
        )
        endpoint = endpoint or self._prompt_text(
            "MaxCompute Endpoint", default=existing_auth.endpoint
        )
        region_name = region_name or self._prompt_text(
            "MaxCompute Region (optional)",
            required=False,
            default=existing_auth.region_name,
        )
        tunnel_endpoint = tunnel_endpoint or self._prompt_text(
            "MaxCompute Tunnel Endpoint (optional)",
            required=False,
            default=existing_auth.tunnel_endpoint,
        )

    normalized_type = (account_type or existing_auth.ncs.account_type or "").strip().lower() or None
    if not normalized_type:
        raise ValidationError(
            "account_type is required for ncs authentication.",
            suggestion=(
                "Specify --account-type user, account, or app. "
                "Run `auth login-ncs --list-accounts --account-type user` to list available accounts."
            ),
        )
    ncs_config = build_ncs_auth_config(
        account_type=normalized_type,
        employee_id=employee_id or existing_auth.ncs.employee_id,
        account_name=account_name or existing_auth.ncs.account_name,
        app_name=app_name or existing_auth.ncs.app_name,
        process_timeout=existing_auth.ncs.process_timeout,
    )
    resolved_auth = AuthConfig(
        provider="ncs",
        project=project or existing_auth.project,
        endpoint=endpoint or existing_auth.endpoint,
        region_name=region_name or existing_auth.region_name,
        tunnel_endpoint=tunnel_endpoint or existing_auth.tunnel_endpoint,
        ncs=ncs_config,
    )
    if no_validate:
        self._validate_auth_config_shape(resolved_auth)
    else:
        resolve_auth_connection(self.config, auth_override=resolved_auth)

    persist_login_config(
        target_path,
        auth=resolved_auth,
    )

    warnings: 'list[str]' = []
    self._clear_session_override(warnings)
    env_settings = load_odps_env()
    overriding_env_fields = [
        name for name in ("project", "endpoint")
        if env_settings.get(name)
    ]
    if overriding_env_fields:
        warnings.append(
            f"Environment variable(s) for {', '.join(overriding_env_fields)} are set and will override "
            f"the values you just saved at runtime. Unset them or they will take precedence over this ncs config."
        )

    if no_validate:
        # Configuration-only payload: nothing was confirmed remotely.
        payload = {
            "authenticated": None,
            "configured": True,
            "validation_status": "configuration_only",
            "backend": "odps",
            "auth_type": "ncs",
            "identity_source": "config_file",
            "principal_display": None,
            "principal_masked": None,
            "project": resolved_auth.project,
            "region": resolved_auth.region_name,
            "endpoint": resolved_auth.endpoint,
            "project_owner": None,
            "allowed_operations": self.config.allowed_operations,
            "saved": True,
            "validated": False,
            "ncs": {
                "account_type": resolved_auth.ncs.account_type,
                "process_command": resolved_auth.ncs.process_command,
            },
        }
        warnings.append("ncs authentication settings were saved without remote validation.")
    else:
        payload, validate_warnings = self._validate_auth_config(resolved_auth)
        payload["saved"] = True
        payload["validated"] = True
        warnings.extend(validate_warnings)

    envelope = Envelope(
        command="auth.login-ncs",
        status="success",
        data=payload,
        metadata={
            "config_path": str(target_path),
            "written_fields": sorted(resolved_auth.to_mapping().keys()),
            "auth_storage": "config_file",
        },
        agent_hints=AgentHints(
            next_actions=["auth.whoami", "meta.list-tables"],
            warnings=warnings,
        ),
    )
    self.log("auth.login-ncs", envelope.status, envelope.metadata)
    return envelope
2292
+
2293
def auth_whoami(self) -> 'Envelope':
    """Describe the identity the current configuration authenticates as.

    A backend is created on demand when none is attached. Three outcomes
    are possible, all returned as envelopes rather than raised:

    * creating the backend fails with ``ValidationError`` -> the
      unauthenticated envelope;
    * ``whoami_info`` fails with ``MaxCError`` -> the validation-failed
      envelope;
    * otherwise -> a success envelope with the backend's payload.

    When the backend reports suppressed environment variables, a warning
    naming them is added in the last two cases.
    """

    def _suppressed_env_warnings():
        # Zero or one warning about env vars the resolved auth is ignoring.
        ignored = getattr(getattr(self.backend, "resolved_auth", None), "suppressed_env_vars", [])
        if not ignored:
            return []
        return [
            f"{len(ignored)} environment variable(s) are set but ignored because an explicit "
            f"auth provider is configured ({', '.join(ignored)}). "
            f"To use environment variables, run `auth login --from-env` or unset the auth provider in config."
        ]

    def _log_and_return(result):
        self.log("auth.whoami", result.status, result.metadata)
        return result

    if self.backend is None:
        try:
            self.backend = OdpsBackend(self.config)
            self.remote_jobs = getattr(self.backend, "supports_remote_jobs", False)
        except ValidationError as exc:
            return _log_and_return(
                self._unauthenticated_whoami_envelope(warnings=[exc.message])
            )

    try:
        payload, warnings = self.backend.whoami_info(project=self.config.default_project)
    except MaxCError as exc:
        failure_warnings = [exc.message]
        failure_warnings.extend(_suppressed_env_warnings())
        resolved = getattr(self.backend, "resolved_auth", None)
        return _log_and_return(
            self._whoami_validation_failed_envelope(
                settings=getattr(self.backend, "settings", {}),
                auth_type=getattr(resolved, "auth_type", "access_key"),
                identity_source=getattr(resolved, "identity_source", "unknown"),
                warnings=failure_warnings,
            )
        )

    env_warnings = _suppressed_env_warnings()
    if env_warnings:
        warnings = list(warnings) + env_warnings

    return _log_and_return(
        Envelope(
            command="auth.whoami",
            status="success",
            data=payload,
            metadata={
                "project": self.config.default_project,
                "config_sources": [str(p) for p in self.config.sources],
            },
            agent_hints=AgentHints(
                next_actions=["auth.can-i", "meta.list-tables"],
                warnings=warnings,
            ),
        )
    )
2345
+
2346
def auth_can_i(
    self,
    *,
    table_name: 'str',
    operation: 'str',
    project: 'str | None' = None,
) -> 'Envelope':
    """Ask the backend whether an operation on a table is permitted.

    Args:
        table_name: Table the check applies to.
        operation: Operation to check.
        project: Project to check in; defaults to the configured project.

    Returns:
        An ``auth.can-i`` success envelope. The suggested next actions
        depend on the payload's ``allowed`` entry.
    """
    target_project = project or self.config.default_project
    payload, warnings = self.backend.can_i_info(
        table_name=table_name, operation=operation, project=target_project
    )

    if payload.get("allowed"):
        next_actions = ["query.cost", "query.explain"]
    else:
        next_actions = ["auth.whoami", "meta.describe"]

    envelope = Envelope(
        command="auth.can-i",
        status="success",
        data=payload,
        metadata={"project": target_project},
        agent_hints=AgentHints(next_actions=next_actions, warnings=warnings),
    )
    self.log("auth.can-i", envelope.status, envelope.metadata)
    return envelope
2369
+
2370
def _validate_auth_config(
    self,
    auth: 'AuthConfig',
) -> 'tuple[dict[str, Any], list[str]]':
    """Resolve ``auth`` into a client and build its identity payload.

    The project owner is looked up on a best-effort basis: any failure
    while fetching the project leaves the owner as ``None``.

    Returns:
        Whatever ``build_odps_identity_payload`` returns for the resolved
        connection.
    """
    resolved = resolve_auth_connection(self.config, auth_override=auth)
    client = resolved.create_client()

    try:
        owner_display_name = getattr(client.get_project(resolved.project), "owner", None)
    except Exception:
        # Owner is optional information; never fail validation over it.
        owner_display_name = None

    identity_kwargs = {
        "client": client,
        "settings": resolved.settings,
        "allowed_operations": self.config.allowed_operations,
        "identity_source": resolved.identity_source,
        "auth_type": resolved.auth_type,
        "token_expires_at": resolved.token_expires_at,
        "project": resolved.project,
        "owner_display_name": owner_display_name,
    }
    return build_odps_identity_payload(**identity_kwargs)
2394
+
2395
def _unauthenticated_whoami_envelope(
    self,
    *,
    warnings: 'list[str] | None' = None,
) -> 'Envelope':
    """Build the ``auth.whoami`` envelope used when no credentials are set up.

    Args:
        warnings: Warnings to surface; a generic "no credentials" warning
            is used when this is empty or ``None``.
    """
    config = self.config
    payload = {
        "authenticated": False,
        "configured": False,
        "validation_status": "missing_configuration",
        "backend": "odps",
        "auth_type": None,
        "identity_source": "unknown",
        "principal_display": None,
        "principal_masked": None,
        "project": config.default_project or None,
        "region": config.default_region or None,
        "endpoint": config.auth.endpoint,
        "project_owner": None,
        "allowed_operations": config.allowed_operations,
        "auth_options": build_auth_options(default_global_config_path()),
    }
    metadata = {
        "project": config.default_project,
        "config_sources": [str(p) for p in config.sources],
    }
    hint_warnings = warnings if warnings else ["No active MaxCompute credentials are configured."]
    return Envelope(
        command="auth.whoami",
        status="success",
        data=payload,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["auth.login", "auth.login-ncs"],
            warnings=hint_warnings,
        ),
    )
2429
+
2430
+ def _resolve_login_value(
2431
+ self,
2432
+ *,
2433
+ provided: 'str | None',
2434
+ env_value: 'str | None',
2435
+ existing_value: 'str | None',
2436
+ prompt: 'str',
2437
+ required: 'bool',
2438
+ secret: 'bool',
2439
+ use_env: 'bool',
2440
+ ) -> 'str | None':
2441
+ if provided is not None and provided.strip():
2442
+ return provided.strip()
2443
+ if use_env and env_value:
2444
+ return env_value.strip()
2445
+ if use_env and required and not env_value:
2446
+ raise ValidationError(
2447
+ f"--from-env was specified but the environment variable for '{prompt}' is not set.",
2448
+ suggestion="Set the required environment variables (ALIBABA_CLOUD_ACCESS_KEY_ID, "
2449
+ "ALIBABA_CLOUD_ACCESS_KEY_SECRET, MAXCOMPUTE_PROJECT, MAXCOMPUTE_ENDPOINT) "
2450
+ "or provide the values as CLI flags.",
2451
+ )
2452
+ if existing_value:
2453
+ return existing_value.strip()
2454
+ if not required:
2455
+ return None
2456
+ if not sys.stdin.isatty():
2457
+ return None
2458
+
2459
+ if secret:
2460
+ value = getpass.getpass(f"{prompt}: ").strip()
2461
+ else:
2462
+ value = input(f"{prompt}: ").strip()
2463
+ return value or None
2464
+
2465
+ def _prompt_text(
2466
+ self,
2467
+ prompt: 'str',
2468
+ *,
2469
+ required: 'bool' = True,
2470
+ default: 'str | None' = None,
2471
+ ) -> 'str | None':
2472
+ if not sys.stdin.isatty():
2473
+ return default
2474
+ display_prompt = f"{prompt} [current: {default}]" if default else prompt
2475
+ value = input(f"{display_prompt}: ").strip()
2476
+ if value:
2477
+ return value
2478
+ if default:
2479
+ return default
2480
+ if required:
2481
+ raise ValidationError(f"{prompt} is required.")
2482
+ return None
2483
+
2484
def schema_diff(self, left_table: 'str', right_table: 'str') -> 'Envelope':
    """Compare the schemas of two tables and return a ``diff.schema`` envelope.

    Both tables are described through the backend and handed to
    ``build_schema_diff_payload``. The first suggested next action depends
    on the payload's ``compatible`` entry.
    """
    left = self.backend.describe_table(left_table)
    right = self.backend.describe_table(right_table)
    payload = build_schema_diff_payload(left, right)

    lead_action = "query.explain" if payload.get("compatible") else "meta.describe"
    envelope = Envelope(
        command="diff.schema",
        status="success",
        data=payload,
        metadata={"project": self.config.default_project},
        agent_hints=AgentHints(next_actions=[lead_action, "auth.can-i"]),
    )
    self.log("diff.schema", envelope.status, envelope.metadata)
    return envelope
2500
+
2501
def partition_diff(self, left_table: 'str', right_table: 'str') -> 'Envelope':
    """Compare the partitions of two tables and return a ``diff.partition`` envelope.

    Both tables are described through the backend and handed to
    ``build_partition_diff_payload``.
    """
    left_description = self.backend.describe_table(left_table)
    right_description = self.backend.describe_table(right_table)

    hints = AgentHints(next_actions=["meta.partitions", "meta.latest-partition"])
    envelope = Envelope(
        command="diff.partition",
        status="success",
        data=build_partition_diff_payload(left_description, right_description),
        metadata={"project": self.config.default_project},
        agent_hints=hints,
    )
    self.log("diff.partition", envelope.status, envelope.metadata)
    return envelope
2514
+
2515
def data_diff(
    self,
    left_table: 'str',
    right_table: 'str',
    *,
    keys: 'list[str]',
    columns: 'list[str] | None' = None,
    rows: 'int' = 100,
    partition: 'str | None' = None,
    left_partition: 'str | None' = None,
    right_partition: 'str | None' = None,
) -> 'Envelope':
    """Compare sampled rows of two tables, matched on key columns.

    Args:
        left_table, right_table: Tables to compare.
        keys: Key columns; at least one is required.
        columns: Columns to compare, passed to ``resolve_data_diff_columns``.
        rows: Sample size requested from each table; must be positive.
        partition: Partition applied to both sides unless a side-specific
            partition is given.
        left_partition, right_partition: Side-specific partitions.

    Returns:
        A ``diff.data`` success envelope.

    Raises:
        ValidationError: If ``rows`` is not positive or ``keys`` is empty.
    """
    if rows <= 0:
        raise ValidationError("`--rows` must be greater than 0.")
    if not keys:
        raise ValidationError("`--keys` requires at least one column.")

    left = self.backend.describe_table(left_table)
    right = self.backend.describe_table(right_table)
    compared_columns = resolve_data_diff_columns(
        left, right, keys=keys, requested_columns=columns
    )
    fetch_columns = dedupe_preserve_order([*keys, *compared_columns])

    # A side-specific partition wins over the shared one.
    resolved_left_partition = left_partition or partition
    resolved_right_partition = right_partition or partition

    def _sample(table: 'str', side_partition: 'str | None'):
        # Only the sampled rows of the backend's 3-tuple are needed here.
        _, sampled, _ = self.backend.sample_table(
            table, rows, partition=side_partition, columns=fetch_columns
        )
        return sampled

    left_rows = _sample(left_table, resolved_left_partition)
    right_rows = _sample(right_table, resolved_right_partition)

    payload, warnings = build_data_diff_payload(
        left=left,
        right=right,
        left_rows=left_rows,
        right_rows=right_rows,
        keys=keys,
        compared_columns=compared_columns,
        rows_limit=rows,
        left_partition=resolved_left_partition,
        right_partition=resolved_right_partition,
    )

    metadata = {
        "project": self.config.default_project,
        "requested_rows": rows,
        "requested_columns": columns or [],
        "requested_keys": keys,
        "requested_partition": partition,
        "left_requested_partition": resolved_left_partition,
        "right_requested_partition": resolved_right_partition,
    }
    envelope = Envelope(
        command="diff.data",
        status="success",
        data=payload,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["data.profile", "diff.schema", "meta.describe"],
            warnings=warnings,
        ),
    )
    self.log("diff.data", envelope.status, envelope.metadata)
    return envelope
2587
+
2588
def agent_context(self) -> 'Envelope':
    """Return project-level context for agents without enumerating tables.

    Tables are deliberately not listed because that is too slow on large
    projects. ``job_mode`` in the metadata is ``remote`` when remote jobs
    are enabled, ``local`` when a backend exists without them, and
    ``unknown`` when there is no backend.
    """
    config = self.config

    if self.remote_jobs:
        job_mode = "remote"
    elif self.backend is not None:
        job_mode = "local"
    else:
        job_mode = "unknown"

    data = {
        "project": config.default_project,
        "region": config.default_region,
        "backend": "odps",
        "project_context": config.project_context,
        "allowed_operations": config.allowed_operations,
        "cost_threshold_cu": config.cost_threshold_cu,
        "sensitive_columns": config.sensitive_columns,
    }
    metadata = {
        "config_sources": [str(source) for source in config.sources],
        "state_dir": str(config.state_dir),
        "job_mode": job_mode,
    }
    envelope = Envelope(
        command="agent.context",
        status="success",
        data=data,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=["meta.search", "meta.list-tables"],
            insights=["Use `meta list-tables` to enumerate tables and `meta search` to locate relevant datasets."],
        ),
    )
    self.log("agent.context", envelope.status, envelope.metadata)
    return envelope
2614
+
2615
def feature_unavailable(self, command: 'str', message: 'str') -> 'Envelope':
    """Always raise ``FeatureUnavailableError`` carrying ``message``.

    ``command`` is accepted but not used by this method.
    """
    help_hint = "Run `maxc --help` to inspect the currently supported commands."
    raise FeatureUnavailableError(message, suggestion=help_hint)
2620
+
2621
def log(
    self,
    command: 'str',
    status: 'str',
    metadata: 'dict[str, Any] | None' = None,
    *,
    error: 'dict[str, Any] | None' = None,
) -> 'None':
    """Write one audit record, creating the audit logger on first use.

    Args:
        command: Command name to record.
        status: Outcome to record.
        metadata: Extra context; recorded as ``{}`` when empty or ``None``.
        error: Optional error details.

    ``OSError`` raised while creating the logger or writing the record is
    swallowed, so a command never fails only because the audit path is
    unavailable.
    """
    try:
        audit = self._audit
        if audit is None:
            audit = self._audit = AuditLogger(self._audit_path)
        record = {
            "command": command,
            "status": status,
            "metadata": metadata or {},
            "error": error,
        }
        audit.log(record)
    except OSError:
        # Auditing is best-effort.
        return
2643
+
2644
+ def _submit_remote_job(
2645
+ self,
2646
+ *,
2647
+ sql: 'str',
2648
+ project: 'str',
2649
+ cost_check: 'float | None',
2650
+ idempotency_key: 'str | None',
2651
+ ) -> 'JobInfo':
2652
+ if cost_check is not None:
2653
+ raise FeatureUnavailableError(
2654
+ "The real MaxCompute backend does not yet support CU-based `--cost-check` validation.",
2655
+ suggestion="Run `--dry-run` first to inspect SQLCost metadata, or remove `--cost-check`.",
2656
+ )
2657
+ return self.backend.submit_query(
2658
+ sql,
2659
+ project=project,
2660
+ idempotency_key=idempotency_key,
2661
+ )
2662
+
2663
+ def _execute_query(
2664
+ self,
2665
+ *,
2666
+ sql: 'str',
2667
+ project: 'str',
2668
+ max_rows: 'int',
2669
+ offset: 'int',
2670
+ dry_run: 'bool',
2671
+ cost_check: 'float | None',
2672
+ retry_on: 'list[str]',
2673
+ max_retries: 'int',
2674
+ strict_cost_check: 'bool',
2675
+ timeout: 'int | None' = None,
2676
+ ) -> 'QueryResult':
2677
+ if sql.startswith("@natural"):
2678
+ raise FeatureUnavailableError(
2679
+ "`@natural` is a roadmap feature and is not available in the current MVP.",
2680
+ suggestion="Use `maxc meta search` or `maxc meta describe` to inspect tables, then submit plain SQL.",
2681
+ )
2682
+
2683
+ attempts = 0
2684
+ while True:
2685
+ try:
2686
+ if cost_check is not None and strict_cost_check and not self.backend.supports_cost_check:
2687
+ raise FeatureUnavailableError(
2688
+ "The current backend does not provide CU-based cost validation.",
2689
+ suggestion="Remove `--cost-check`, or use `--dry-run` to inspect SQLCost metadata.",
2690
+ )
2691
+
2692
+ result = self.backend.execute_query(
2693
+ sql,
2694
+ project=project,
2695
+ max_rows=max_rows,
2696
+ dry_run=dry_run,
2697
+ offset=offset,
2698
+ timeout=timeout,
2699
+ )
2700
+ return result
2701
+ except MaxCError as exc:
2702
+ attempts += 1
2703
+ can_retry = (
2704
+ attempts <= max_retries
2705
+ and exc.recoverable
2706
+ and exc.error_code in retry_on
2707
+ )
2708
+ if not can_retry:
2709
+ raise
2710
+
2711
+ def _analyze_query(
2712
+ self,
2713
+ *,
2714
+ sql: 'str',
2715
+ project: 'str',
2716
+ explain: 'bool',
2717
+ ) -> 'dict[str, Any]':
2718
+ if sql.startswith("@natural"):
2719
+ raise FeatureUnavailableError(
2720
+ "`@natural` is a roadmap feature and is not available in the current MVP.",
2721
+ suggestion="Use `maxc meta search` or `maxc meta describe` to inspect tables, then submit plain SQL.",
2722
+ )
2723
+ if explain:
2724
+ return self.backend.explain_query(sql, project=project)
2725
+ return self.backend.estimate_query_cost(sql, project=project)
2726
+
2727
def _build_query_envelope(
    self,
    *,
    command: 'str',
    result: 'QueryResult',
    dry_run: 'bool',
    session_id: 'int | None' = None,
) -> 'Envelope':
    """Wrap a query result in a success envelope with hints and a cursor.

    Args:
        command: Command name stored on the envelope.
        result: Query result to expose.
        dry_run: Whether the query was a dry run; affects hints only.
        session_id: Existing pagination session to reuse, if any.

    A ``next_cursor`` is produced only when the result has more rows and
    returned at least one. Entries from ``result.extra_metadata`` are
    merged into the metadata last, so they override the standard keys.
    """
    next_actions = []
    if result.tables_used:
        next_actions.append("meta.describe")
    if result.has_more:
        next_actions.append("query.paginate")

    insights = []
    if dry_run:
        next_actions.append("job.submit")
        insights.append("Dry-run returned estimated cost and SQLCost metadata so you can decide whether to continue.")
    elif not result.rows:
        insights.append("The result set is empty. Check filters, partitions, and table selection.")

    # With a job id and more rows pending, create or reuse a session so the
    # cursor can stay short.
    next_cursor = None
    if result.has_more and result.returned_rows > 0:
        next_offset = result.extra_metadata.get("current_offset", 0) + result.returned_rows
        cursor_options = {}
        if result.job_id and self.remote_jobs:
            # Remote backend: the cursor carries the session id.
            if session_id is None:
                session_id = self.cache.create_session(
                    job_id=result.job_id,
                    project=result.project,
                    sql=result.sql_executed,
                )
            cursor_options["session_id"] = session_id
        # Otherwise (mock backend) the cursor holds only the offset.
        next_cursor = encode_cursor(next_offset, **cursor_options)

    metadata = {
        "project": result.project,
        "elapsed_ms": result.elapsed_ms,
        "bytes_scanned": result.bytes_scanned,
        "sql_executed": result.sql_executed,
        "tables_used": result.tables_used,
    }
    # Optional fields are included only when they carry a value.
    for optional_field in ("job_id", "submitted_at", "completed_at"):
        field_value = getattr(result, optional_field)
        if field_value:
            metadata[optional_field] = field_value
    metadata.update(result.extra_metadata)

    data = {
        "rows": result.rows,
        "schema": result.schema,
        "total_rows": result.total_rows,
        "returned_rows": result.returned_rows,
        "has_more": result.has_more,
        "next_cursor": next_cursor,
    }
    return Envelope(
        command=command,
        status="success",
        data=data,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=next_actions,
            warnings=result.warnings,
            insights=insights,
        ),
    )
2796
+
2797
def _build_analysis_envelope(
    self,
    *,
    command: 'str',
    sql: 'str',
    analysis: 'dict[str, Any]',
) -> 'Envelope':
    """Wrap an explain / cost analysis dict in a success ``Envelope``.

    Args:
        command: Command name; ``"query.cost"`` puts ``query.explain`` first
            in the suggested next actions.
        sql: SQL text; trailing semicolons are stripped for the metadata.
        analysis: Analysis mapping, returned unchanged as the envelope data.

    Returns:
        An ``Envelope`` whose status is ``"success"``.
    """
    suggested_actions = ["query.explain"] if command == "query.cost" else []
    suggested_actions.append("query")
    if analysis.get("tables_used"):
        suggested_actions.append("meta.describe")

    notes = []
    if analysis.get("estimated_input_size_bytes") == 0:
        notes.append("The estimated scan input is 0 bytes. This is often a constant query or a plan that avoids scanning data.")

    metadata = {
        "project": analysis.get("project"),
        "sql_executed": sql.rstrip(";"),
    }
    elapsed = analysis.get("elapsed_ms")
    if elapsed is not None:
        metadata["elapsed_ms"] = elapsed

    hints = AgentHints(
        next_actions=suggested_actions,
        # Copy so the envelope does not alias the list stored in ``analysis``.
        warnings=list(analysis.get("warnings", [])),
        insights=notes,
    )
    return Envelope(
        command=command,
        status="success",
        data=analysis,
        metadata=metadata,
        agent_hints=hints,
    )
2832
+
2833
def _job_info_envelope(self, command: 'str', info: 'JobInfo') -> 'Envelope':
    """Render a ``JobInfo`` as an ``Envelope`` whose status mirrors the job status.

    Suggested next actions depend on the job status: failed jobs point at
    diagnosis, pending/running jobs at waiting, everything else at the result.
    """
    if info.status == "failure":
        suggested_actions = ["job.diagnose", "job.status"]
    elif info.status in {"pending", "running"}:
        suggested_actions = ["job.wait", "job.result"]
    else:
        suggested_actions = ["job.result"]

    data = {
        "job_id": info.job_id,
        "status": info.status,
        "progress": info.progress,
        "stage": info.stage,
        "retryable": info.retryable,
        "failure_reason": info.failure_reason,
        "logview": info.logview,
        "task_summary": info.task_summary,
        "sql": info.sql,
    }
    metadata = {
        "project": info.project,
        "submitted_at": info.submitted_at,
        "updated_at": info.updated_at,
        "completed_at": info.completed_at,
        "logview": info.logview,
        "error_message": info.error_message,
    }
    return Envelope(
        command=command,
        status=info.status,
        data=data,
        metadata=metadata,
        agent_hints=AgentHints(
            next_actions=suggested_actions,
            warnings=info.warnings,
        ),
    )
2864
+
2865
def _local_job_info(self, job: 'dict[str, Any]') -> 'JobInfo':
    """Build a ``JobInfo`` from a locally stored job record.

    The stage is derived from the status (``pending`` -> ``queue``,
    ``success`` -> ``completed``, anything else -> ``failed``). A failure
    reason is only set for cancelled jobs, and ``retryable`` is only filled
    in when the status is ``"failure"``.
    """
    job_status = job["status"]
    if job_status == "pending":
        job_stage = "queue"
    elif job_status == "success":
        job_stage = "completed"
    else:
        job_stage = "failed"

    reason = None
    if job.get("cancelled"):
        reason = "The job was cancelled."
    diagnosis = classify_failure_reason(reason)
    summary = build_task_summary(job.get("sql"))

    can_retry = None
    if job_status == "failure":
        can_retry = diagnosis["retryable"]

    return JobInfo(
        job_id=job["job_id"],
        status=job_status,
        project=job["project"],
        progress=job["progress"],
        stage=job_stage,
        retryable=can_retry,
        failure_reason=reason,
        task_summary=summary,
        sql=job.get("sql"),
        submitted_at=job.get("submitted_at"),
        updated_at=job.get("updated_at"),
        completed_at=job.get("completed_at"),
        logview=None,
    )
2886
+
2887
+ def _query_result_payload(self, result: 'QueryResult') -> 'dict[str, Any]':
2888
+ envelope = self._build_query_envelope(
2889
+ command="query",
2890
+ result=result,
2891
+ dry_run=False,
2892
+ )
2893
+ return envelope.to_dict(normalize=False)
2894
+
2895
def _job_events(self, job: 'dict[str, Any]') -> 'list[dict[str, Any]]':
    """Produce the event timeline for a locally stored job.

    A job already in ``success`` yields a single ``completed`` event. Any
    other non-cancelled job yields ``started``, three fixed ``progress``
    steps and a final ``completed`` event.

    Raises:
        ValidationError: If the job is not successful and is flagged as cancelled.
    """

    def completed_event() -> 'dict[str, Any]':
        return {
            "type": "completed",
            "ts": now_utc_iso(),
            "job_id": job["job_id"],
            "rows": job["result"]["data"]["returned_rows"],
        }

    if job["status"] == "success":
        return [completed_event()]
    if job.get("cancelled"):
        raise ValidationError("The job was cancelled and can no longer be waited on.")

    timeline = [{"type": "started", "ts": now_utc_iso(), "job_id": job["job_id"]}]
    for percent, stage in ((20, "queue"), (60, "scan"), (90, "finalize")):
        timeline.append(
            {
                "type": "progress",
                "ts": now_utc_iso(),
                "job_id": job["job_id"],
                "percent": percent,
                "stage": stage,
            }
        )
    timeline.append(completed_event())
    return timeline
2938
+
2939
+ def _remote_job_events(
2940
+ self,
2941
+ before: 'JobInfo',
2942
+ after: 'JobInfo',
2943
+ result: 'QueryResult',
2944
+ ) -> 'list[dict[str, Any]]':
2945
+ events = [{"type": "started", "ts": before.submitted_at or now_utc_iso(), "job_id": before.job_id}]
2946
+ if before.status in {"pending", "running"}:
2947
+ events.append(
2948
+ {
2949
+ "type": "progress",
2950
+ "ts": now_utc_iso(),
2951
+ "job_id": before.job_id,
2952
+ "percent": before.progress or 50,
2953
+ "stage": before.status,
2954
+ }
2955
+ )
2956
+ events.append(
2957
+ {
2958
+ "type": "completed",
2959
+ "ts": after.completed_at or now_utc_iso(),
2960
+ "job_id": after.job_id,
2961
+ "rows": result.returned_rows,
2962
+ }
2963
+ )
2964
+ return events
2965
+
2966
+ def _table_payload(self, table: 'TableDefinition', full: 'bool' = False) -> 'dict[str, Any]':
2967
+ # Calculate size in MB
2968
+ size_mb = (table.size_bytes / (1024 * 1024)) if table.size_bytes else 0
2969
+
2970
+ # Identify primary key with better heuristics:
2971
+ # 1. Look for explicit primary key indicators: *_id (but not ending with _sk), pk_*, id
2972
+ # 2. Exclude foreign key patterns and common FK suffixes
2973
+ foreign_key_suffixes = ('_date_sk', '_time_sk', '_dim_sk', '_demo_sk', '_hdemo_sk',
2974
+ '_cdemo_sk', '_addr_sk', '_promo_sk', '_item_sk', '_customer_sk',
2975
+ '_store_sk', '_bill_sk', '_ship_sk', '_reason_sk')
2976
+ primary_key = None
2977
+
2978
+ # First pass: look for actual primary keys (not ending with _sk)
2979
+ for col in table.columns:
2980
+ col_lower = col.name.lower()
2981
+ # Primary key candidates: ends with _id but not _sk, or explicitly named 'id'/'pk_*'
2982
+ if (col_lower.endswith('_id') and not col_lower.endswith('_sk')) or \
2983
+ col_lower == 'id' or col_lower.startswith('pk_'):
2984
+ primary_key = col.name
2985
+ break
2986
+
2987
+ # Second pass: if no clear PK, check for ticket numbers or other business keys
2988
+ if not primary_key:
2989
+ for col in table.columns:
2990
+ col_lower = col.name.lower()
2991
+ if 'ticket_number' in col_lower or 'order_number' in col_lower:
2992
+ primary_key = col.name
2993
+ break
2994
+
2995
+ payload = {
2996
+ "table_name": table.name,
2997
+ "description": table.description,
2998
+ "row_count": None, # Not available from TableDefinition
2999
+ "size_mb": round(size_mb, 2),
3000
+ "column_count": len(table.columns),
3001
+ "table_type": table.table_type,
3002
+ "owner": table.owner,
3003
+ "created_at": table.created_at,
3004
+ "updated_at": table.updated_at,
3005
+ }
3006
+
3007
+ if full:
3008
+ # Full mode: return all columns
3009
+ payload["schema"] = [
3010
+ {"name": column.name, "type": column.type, "comment": column.comment}
3011
+ for column in table.columns
3012
+ ]
3013
+ payload["has_more_columns"] = False
3014
+ # Full mode: include complete sample rows
3015
+ payload["sample_preview"] = table.sample_rows[:2]
3016
+ else:
3017
+ # Summary mode: return first 10 columns only
3018
+ display_columns = table.columns[:10]
3019
+ payload["schema"] = [
3020
+ {"name": column.name, "type": column.type, "comment": column.comment}
3021
+ for column in display_columns
3022
+ ]
3023
+ payload["has_more_columns"] = len(table.columns) > 10
3024
+ payload["remaining_columns"] = max(0, len(table.columns) - 10)
3025
+
3026
+ # Summary mode: no sample preview (keep it lightweight)
3027
+ payload["sample_preview"] = []
3028
+
3029
+ # Add primary key info (None if no clear primary key found)
3030
+ payload["primary_key"] = primary_key
3031
+
3032
+ # Add partition columns
3033
+ payload["partition_columns"] = [
3034
+ {"name": column.name, "type": column.type, "comment": column.comment}
3035
+ for column in table.partition_columns
3036
+ ]
3037
+
3038
+ # Add other metadata
3039
+ payload["partitions"] = table.partitions
3040
+ payload["lineage"] = {
3041
+ "upstream_tables": table.upstream_tables,
3042
+ "downstream_tables": table.downstream_tables,
3043
+ }
3044
+ payload["extra_metadata"] = table.extra_metadata
3045
+
3046
+ return payload
3047
+
3048
+
3049
def build_schema_diff_payload(left: 'TableDefinition', right: 'TableDefinition') -> 'dict[str, Any]':
    """Compare the schemas of two tables and report the differences.

    Regular columns and partition columns are compared separately; an added
    partition column counts as breaking. A change of ``table_type`` is also
    recorded as a breaking change.

    Returns:
        A dict with per-scope differences, flat lists of breaking and
        non-breaking changes, summary counts and a ``compatible`` flag that is
        true only when there are no breaking changes.
    """
    column_diff = compare_columns(left.columns, right.columns, scope="columns")
    partition_diff = compare_columns(
        left.partition_columns,
        right.partition_columns,
        scope="partition_columns",
        added_is_breaking=True,
    )

    breaking = [*column_diff["breaking_changes"], *partition_diff["breaking_changes"]]
    non_breaking = [*column_diff["non_breaking_changes"], *partition_diff["non_breaking_changes"]]

    type_differs = left.table_type != right.table_type
    if type_differs:
        breaking.append(
            {
                "kind": "table_type_changed",
                "scope": "table_attributes",
                "field": "table_type",
                "left": left.table_type,
                "right": right.table_type,
            }
        )

    buckets = ("added", "removed", "changed", "unchanged")
    summary = {
        "added_columns": len(column_diff["added"]),
        "removed_columns": len(column_diff["removed"]),
        "changed_columns": len(column_diff["changed"]),
        "unchanged_columns": len(column_diff["unchanged"]),
        "added_partition_columns": len(partition_diff["added"]),
        "removed_partition_columns": len(partition_diff["removed"]),
        "changed_partition_columns": len(partition_diff["changed"]),
        "breaking_changes": len(breaking),
        "non_breaking_changes": len(non_breaking),
    }

    return {
        "left_table": left.name,
        "right_table": right.name,
        "compatible": not breaking,
        "summary": summary,
        "breaking_changes": breaking,
        "non_breaking_changes": non_breaking,
        "columns": {bucket: column_diff[bucket] for bucket in buckets},
        "partition_columns": {bucket: partition_diff[bucket] for bucket in buckets},
        "table_attributes": {
            "table_type": {
                "left": left.table_type,
                "right": right.table_type,
                "changed": type_differs,
            }
        },
    }
3112
+
3113
+
3114
def build_partition_diff_payload(left: 'TableDefinition', right: 'TableDefinition') -> 'dict[str, Any]':
    """Compare the partition lists of two tables.

    Returns:
        A dict listing partitions found only on the left, only on the right
        and on both sides (each sorted), with summary counts. ``compatible``
        is true only when neither side has a partition the other lacks.
    """
    left_set = set(left.partitions)
    right_set = set(right.partitions)

    only_left = sorted(left_set.difference(right_set))
    only_right = sorted(right_set.difference(left_set))
    shared = sorted(left_set.intersection(right_set))

    summary = {
        # Counts come from the raw lists, so repeated entries are counted as given.
        "left_partition_count": len(left.partitions),
        "right_partition_count": len(right.partitions),
        "left_only": len(only_left),
        "right_only": len(only_right),
        "common": len(shared),
    }
    return {
        "left_table": left.name,
        "right_table": right.name,
        "compatible": not (only_left or only_right),
        "summary": summary,
        "left_only": only_left,
        "right_only": only_right,
        "common": shared,
    }
3135
+
3136
+
3137
def _sorted_diff_keys(keys: 'set[tuple[Any, ...]]') -> 'list[tuple[Any, ...]]':
    """Return key tuples in a deterministic order, tolerating NULLs and mixed types."""
    try:
        return sorted(keys)
    except TypeError:
        # Natural ordering fails when a key component is None on one row and a
        # value on another (or types are mixed). Fall back to ordering by each
        # component's type name and repr so the diff can still be produced.
        return sorted(
            keys,
            key=lambda key: tuple((type(part).__name__, repr(part)) for part in key),
        )


def build_data_diff_payload(
    *,
    left: 'TableDefinition',
    right: 'TableDefinition',
    left_rows: 'list[dict[str, Any]]',
    right_rows: 'list[dict[str, Any]]',
    keys: 'list[str]',
    compared_columns: 'list[str]',
    rows_limit: 'int',
    left_partition: 'str | None',
    right_partition: 'str | None',
) -> 'tuple[dict[str, Any], list[str]]':
    """Compare two row snapshots aligned by key columns.

    Args:
        left: Left table definition (its name is used in the payload and errors).
        right: Right table definition.
        left_rows: Sampled rows from the left table.
        right_rows: Sampled rows from the right table.
        keys: Column names that identify a row.
        compared_columns: Non-key columns whose normalized values are compared.
        rows_limit: Per-side sample limit, mentioned in the coverage warning.
        left_partition: Partition scope of the left sample, echoed in the payload.
        right_partition: Partition scope of the right sample, echoed in the payload.

    Returns:
        ``(payload, warnings)``. The payload lists keys present on one side
        only and rows whose compared columns differ; ``compatible`` is true
        only when there are no such keys or rows.

    Raises:
        ValidationError: Propagated from ``index_rows_by_key`` when a snapshot
            contains duplicate keys.
    """
    left_by_key = index_rows_by_key(left_rows, keys=keys, table_name=left.name)
    right_by_key = index_rows_by_key(right_rows, keys=keys, table_name=right.name)
    left_keys = set(left_by_key)
    right_keys = set(right_by_key)
    # Sorting must not crash on NULL or mixed-type key values.
    common_keys = _sorted_diff_keys(left_keys & right_keys)
    left_only_keys = _sorted_diff_keys(left_keys - right_keys)
    right_only_keys = _sorted_diff_keys(right_keys - left_keys)

    mismatched_rows: 'list[dict[str, Any]]' = []
    for key in common_keys:
        left_row = left_by_key[key]
        right_row = right_by_key[key]
        differing_columns = [
            column
            for column in compared_columns
            if normalize_diff_value(left_row.get(column)) != normalize_diff_value(right_row.get(column))
        ]
        if differing_columns:
            mismatched_rows.append(
                {
                    "key": key_to_payload(keys, key),
                    "differing_columns": differing_columns,
                    "left_row": left_row,
                    "right_row": right_row,
                }
            )

    warnings = [
        (
            f"Data diff currently compares at most {rows_limit} read-only snapshot row(s) per side. "
            "Increase `--rows` and narrow the partition scope if you need higher coverage."
        )
    ]
    if not compared_columns:
        warnings.append("Only key presence was compared. Non-key column values were not compared.")

    payload = {
        "left_table": left.name,
        "right_table": right.name,
        "comparison_mode": "keyed_snapshot",
        "compatible": not left_only_keys and not right_only_keys and not mismatched_rows,
        "keys": keys,
        "compared_columns": compared_columns,
        "left_partition": left_partition,
        "right_partition": right_partition,
        "left_sampled_rows": len(left_rows),
        "right_sampled_rows": len(right_rows),
        "summary": {
            "matched_keys": len(common_keys),
            "mismatched_rows": len(mismatched_rows),
            "left_only_keys": len(left_only_keys),
            "right_only_keys": len(right_only_keys),
            "left_sampled_rows": len(left_rows),
            "right_sampled_rows": len(right_rows),
        },
        "left_only_keys": [key_to_payload(keys, key) for key in left_only_keys],
        "right_only_keys": [key_to_payload(keys, key) for key in right_only_keys],
        "mismatched_rows": mismatched_rows,
    }
    return payload, warnings
3209
+
3210
+
3211
def compare_columns(
    left_columns: 'list[Any]',
    right_columns: 'list[Any]',
    *,
    scope: 'str',
    added_is_breaking: 'bool' = False,
) -> 'dict[str, Any]':
    """Diff two column lists by column name.

    Args:
        left_columns: Columns of the left table (objects with ``name``,
            ``type`` and ``comment`` attributes).
        right_columns: Columns of the right table.
        scope: Label stored on every change record.
        added_is_breaking: Whether columns present only on the right count as
            breaking rather than non-breaking changes.

    Returns:
        A dict with ``added``, ``removed``, ``changed`` and ``unchanged``
        columns plus ``breaking_changes`` and ``non_breaking_changes``. Type
        changes and removals are breaking; comment changes are non-breaking.
    """
    left_by_name = {column.name: column for column in left_columns}
    right_by_name = {column.name: column for column in right_columns}

    added = [
        column_payload(right_by_name[name])
        for name in sorted(right_by_name.keys() - left_by_name.keys())
    ]
    removed = [
        column_payload(left_by_name[name])
        for name in sorted(left_by_name.keys() - right_by_name.keys())
    ]
    unchanged: 'list[str]' = []
    changed: 'list[dict[str, Any]]' = []
    breaking_changes: 'list[dict[str, Any]]' = []
    non_breaking_changes: 'list[dict[str, Any]]' = []

    for name in sorted(left_by_name.keys() & right_by_name.keys()):
        before, after = left_by_name[name], right_by_name[name]
        delta: 'dict[str, dict[str, Any]]' = {}

        if before.type != after.type:
            delta["type"] = {"left": before.type, "right": after.type}
            breaking_changes.append(
                {
                    "kind": "column_type_changed",
                    "scope": scope,
                    "column": name,
                    "left": before.type,
                    "right": after.type,
                }
            )
        if before.comment != after.comment:
            delta["comment"] = {"left": before.comment, "right": after.comment}
            non_breaking_changes.append(
                {
                    "kind": "column_comment_changed",
                    "scope": scope,
                    "column": name,
                    "left": before.comment,
                    "right": after.comment,
                }
            )

        if not delta:
            unchanged.append(name)
            continue
        changed.append(
            {
                "name": name,
                "left": column_payload(before),
                "right": column_payload(after),
                "changes": delta,
            }
        )

    # Removals are recorded after in-place changes, additions last.
    breaking_changes.extend(
        {
            "kind": "column_removed",
            "scope": scope,
            "column": column["name"],
            "left": column,
            "right": None,
        }
        for column in removed
    )
    additions_bucket = breaking_changes if added_is_breaking else non_breaking_changes
    additions_bucket.extend(
        {
            "kind": "column_added",
            "scope": scope,
            "column": column["name"],
            "left": None,
            "right": column,
        }
        for column in added
    )

    return {
        "added": added,
        "removed": removed,
        "changed": changed,
        "unchanged": unchanged,
        "breaking_changes": breaking_changes,
        "non_breaking_changes": non_breaking_changes,
    }
3298
+
3299
+
3300
def column_payload(column: 'Any') -> 'dict[str, Any]':
    """Return the ``name``, ``type`` and ``comment`` of a column as a dict."""
    return {field: getattr(column, field) for field in ("name", "type", "comment")}
3306
+
3307
+
3308
def resolve_data_diff_columns(
    left: 'TableDefinition',
    right: 'TableDefinition',
    *,
    keys: 'list[str]',
    requested_columns: 'list[str] | None',
) -> 'list[str]':
    """Decide which non-key columns a data diff should compare.

    Args:
        left: Left table definition.
        right: Right table definition.
        keys: Key column names; each must exist in both tables.
        requested_columns: Explicit columns to compare. When empty or
            ``None``, every non-key column found in both tables is used,
            in left-table order.

    Returns:
        The column names to compare, with key columns excluded.

    Raises:
        ValidationError: If a key or a requested column is missing from
            either table.
    """
    left_columns = all_table_column_names(left)
    right_columns = all_table_column_names(right)
    key_columns = dedupe_preserve_order(keys)

    def absent_somewhere(names: 'list[str]') -> 'list[str]':
        return [name for name in names if not (name in left_columns and name in right_columns)]

    unknown_keys = absent_somewhere(key_columns)
    if unknown_keys:
        raise ValidationError(
            f"Data diff key columns are incomplete across both tables: {', '.join(unknown_keys)}",
            suggestion="Run `maxc meta describe` or `maxc diff schema` to inspect the available columns.",
        )

    if not requested_columns:
        return [
            name
            for name in left_columns
            if name in right_columns and name not in key_columns
        ]

    selected = [
        name
        for name in dedupe_preserve_order(requested_columns)
        if name not in key_columns
    ]
    unknown_columns = absent_somewhere(selected)
    if unknown_columns:
        raise ValidationError(
            f"Data diff comparison columns are incomplete across both tables: {', '.join(unknown_columns)}",
            suggestion="Run `maxc meta describe` or `maxc diff schema` to inspect the available columns.",
        )
    return selected
3352
+
3353
+
3354
def all_table_column_names(table: 'TableDefinition') -> 'list[str]':
    """Return the names of the table's regular columns followed by its partition columns."""
    every_column = list(table.columns)
    every_column.extend(table.partition_columns)
    return [column.name for column in every_column]
3356
+
3357
+
3358
def dedupe_preserve_order(values: 'list[str]') -> 'list[str]':
    """Return ``values`` without duplicates, keeping the first occurrence of each.

    Args:
        values: Strings to de-duplicate; the input list is not modified.

    Returns:
        A new list in original order with later duplicates dropped.
    """
    # dict keys are unique and keep insertion order, giving a linear-time
    # dedupe instead of a list scan per element.
    return list(dict.fromkeys(values))
3364
+
3365
+
3366
def index_rows_by_key(
    rows: 'list[dict[str, Any]]',
    *,
    keys: 'list[str]',
    table_name: 'str',
) -> 'dict[tuple[Any, ...], dict[str, Any]]':
    """Index rows by the tuple of their normalized key-column values.

    Args:
        rows: Snapshot rows to index.
        keys: Column names forming the key; a missing column contributes ``None``.
        table_name: Table name used in the error message.

    Returns:
        A mapping from key tuple to row.

    Raises:
        ValidationError: If two or more rows share a key; up to three
            offending keys are quoted in the message.
    """
    rows_by_key: 'dict[tuple[Any, ...], dict[str, Any]]' = {}
    collisions: 'list[dict[str, Any]]' = []

    for row in rows:
        row_key = tuple(normalize_diff_value(row.get(column)) for column in keys)
        if row_key not in rows_by_key:
            rows_by_key[row_key] = row
        else:
            collisions.append(key_to_payload(keys, row_key))

    if not collisions:
        return rows_by_key

    examples = ", ".join(map(str, collisions[:3]))
    raise ValidationError(
        f"Data diff found duplicate keys in the `{table_name}` snapshot and cannot align rows reliably: {examples}",
        suggestion="Use more selective `--keys`, or narrow the partition scope before retrying.",
    )
3387
+
3388
+
3389
def key_to_payload(columns: 'list[str]', values: 'tuple[Any, ...]') -> 'dict[str, Any]':
    """Pair key column names with their values, stopping at the shorter sequence."""
    return dict(zip(columns, values))
3394
+
3395
+
3396
def normalize_diff_value(value: 'Any') -> 'Any':
    """Return ``value.isoformat()`` when available, otherwise ``value`` unchanged.

    Objects whose ``isoformat`` call raises ``TypeError`` are returned as-is.
    """
    if not hasattr(value, "isoformat"):
        return value
    try:
        rendered = value.isoformat()
    except TypeError:
        return value
    return rendered
3403
+
3404
+
3405
def read_stdin() -> 'str':
    """Read standard input to end-of-file and return it as one string."""
    stream = sys.stdin
    return stream.read()