maxc_cli-0.1.0-py3-none-any.whl

maxc_cli/helpers.py ADDED
@@ -0,0 +1,964 @@
+ """Helper functions for MaxCompute backend operations."""
+
+ from collections import Counter
+ from datetime import date, datetime, time, timezone
+ from decimal import Decimal
+ from difflib import get_close_matches
+ import os
+ import re
+ from typing import Any, Iterable
+
+ from .config import TableDefinition
+ from .exceptions import (
+     BackendConnectionError,
+     FeatureUnavailableError,
+     MaxCError,
+     NotFoundError,
+     PermissionDeniedError,
+     SqlError,
+     ValidationError,
+ )
+ from .utils import (
+     detect_operation,
+     extract_table_names,
+     normalize_sql,
+     now_utc_iso,
+     parse_select_projection,
+ )
+
+ try:
+     from odps.errors import NoPermission as OdpsNoPermission
+     from odps.errors import NoSuchObject as OdpsNoSuchObject
+     from odps.errors import ODPSError
+ except Exception:  # pragma: no cover
+     OdpsNoPermission = Exception
+     OdpsNoSuchObject = Exception
+     ODPSError = Exception
+
+
+ # Constants
+ ODPS_ENV_ALIASES: 'dict[str, tuple[str, ...]]' = {
+     "access_id": (
+         "ALIBABA_CLOUD_ACCESS_KEY_ID",
+         "ODPS_ACCESS_ID",
+         "ODPS_STS_ACCESS_KEY_ID",
+         "ACCESS_KEY_ID",
+     ),
+     "secret_access_key": (
+         "ALIBABA_CLOUD_ACCESS_KEY_SECRET",
+         "ODPS_ACCESS_KEY",
+         "ODPS_ACCESS_KEY_SECRET",
+         "ODPS_STS_ACCESS_KEY_SECRET",
+         "ACCESS_KEY_SECRET",
+     ),
+     "security_token": (
+         "ALIBABA_CLOUD_SECURITY_TOKEN",
+         "ODPS_STS_TOKEN",
+         "SECURITY_TOKEN",
+     ),
+     "project": ("MAXCOMPUTE_PROJECT", "ODPS_PROJECT"),
+     "endpoint": ("MAXCOMPUTE_ENDPOINT", "ODPS_ENDPOINT", "odps_endpoint"),
+     "region_name": ("MAXCOMPUTE_REGION", "ALIBABA_CLOUD_REGION"),
+     "tunnel_endpoint": ("MAXCOMPUTE_TUNNEL_ENDPOINT", "ODPS_TUNNEL_ENDPOINT"),
+ }
+
+ PARTITION_PATH_RE = re.compile(r"/(?=[A-Za-z_][\w]*\s*=)")
+ ISO_TIMEZONE_COLON_RE = re.compile(r"([+-]\d{2}):(\d{2})$")
+ FRESHNESS_FRESH_HOURS = 36
+ FRESHNESS_STALE_HOURS = 72
+
+
+ def load_odps_env() -> 'dict[str, str | None]':
+     """Load ODPS settings from environment variables."""
+     values: 'dict[str, str | None]' = {}
+     for field, aliases in ODPS_ENV_ALIASES.items():
+         value = None
+         for alias in aliases:
+             candidate = os.environ.get(alias)
+             if candidate:
+                 value = candidate
+                 break
+         values[field] = value
+     return values
+
+
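+ # --- Editor's note: illustrative usage sketch, not part of the released wheel. ---
+ # Aliases are checked in declaration order, so the first variable that is set wins.
+ # Assuming only these (hypothetical) variables are exported:
+ # >>> os.environ["MAXCOMPUTE_PROJECT"] = "primary_project"
+ # >>> os.environ["ODPS_PROJECT"] = "demo_project"
+ # >>> load_odps_env()["project"]
+ # 'primary_project'   # MAXCOMPUTE_PROJECT is listed before ODPS_PROJECT
+
+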
+ def validate_login_settings(
+     *,
+     settings: 'dict[str, str | None]',
+     allowed_operations: 'list[str]',
+ ) -> 'tuple[dict[str, Any], list[str]]':
+     """Validate login settings by attempting to create an ODPS client.
+
+     Args:
+         settings: ODPS connection settings (access_id, secret_access_key, project, endpoint, etc.)
+         allowed_operations: List of allowed operations for this connection.
+
+     Returns:
+         Tuple of (identity_payload, warnings).
+     """
+     try:
+         from odps import ODPS
+         from odps.accounts import StsAccount
+     except ImportError:
+         raise FeatureUnavailableError("pyodps is not installed in the current environment.")
+
+     auth_type = "sts_token" if settings.get("security_token") else "access_key"
+     missing = missing_odps_settings(settings, auth_type=auth_type)
+     if missing:
+         raise ValidationError(
+             f"auth login is missing required fields: {', '.join(missing)}.",
+             suggestion="Provide the missing values via flags, --from-env, or interactive prompts.",
+         )
+
+     try:
+         if auth_type == "sts_token":
+             account = StsAccount(
+                 settings["access_id"],
+                 settings["secret_access_key"],
+                 settings["security_token"],
+             )
+             client = ODPS(
+                 account,
+                 project=settings["project"],
+                 endpoint=settings["endpoint"],
+                 region_name=settings.get("region_name") or None,
+                 tunnel_endpoint=settings.get("tunnel_endpoint") or None,
+             )
+         else:
+             client = ODPS(
+                 access_id=settings["access_id"],
+                 secret_access_key=settings["secret_access_key"],
+                 project=settings["project"],
+                 endpoint=settings["endpoint"],
+                 region_name=settings.get("region_name") or None,
+                 tunnel_endpoint=settings.get("tunnel_endpoint") or None,
+             )
+     except Exception as exc:
+         raise translate_odps_error(exc) from exc
+
+     return build_odps_identity_payload(
+         client=client,
+         settings=settings,
+         allowed_operations=allowed_operations,
+         identity_source="config_file",
+         auth_type=auth_type,
+         token_expires_at=settings.get("token_expires_at"),
+     )
+
+
+ def resolve_odps_settings(
+     config,
+     auth_override=None,
+ ) -> 'tuple[dict[str, str | None], dict[str, str], list[str]]':
+     """Resolve ODPS settings from config and environment variables.
+
+     Returns (settings, sources, suppressed_env_vars).
+
+     suppressed_env_vars lists the setting fields whose environment values
+     were present but ignored because an explicit auth provider is configured
+     in the config file. When a provider is explicitly configured, env vars do
+     not override any auth settings; use ``auth login --from-env`` or
+     ``session set`` instead.
+     """
+     auth = auth_override or config.auth
+     values = auth.to_mapping()
+     ncs = values.get("ncs") if isinstance(values.get("ncs"), dict) else {}
+     settings: 'dict[str, str | None]' = {
+         "provider": values.get("provider"),
+         "access_id": values.get("access_id"),
+         "secret_access_key": values.get("secret_access_key"),
+         "security_token": values.get("security_token"),
+         "token_expires_at": values.get("token_expires_at"),
+         "project": values.get("project"),
+         "endpoint": values.get("endpoint"),
+         "region_name": values.get("region_name"),
+         "tunnel_endpoint": values.get("tunnel_endpoint"),
+         "ncs_process_command": ncs.get("process_command"),
+         "ncs_account_type": ncs.get("account_type"),
+         "ncs_employee_id": ncs.get("employee_id"),
+         "ncs_account_name": ncs.get("account_name"),
+         "ncs_app_name": ncs.get("app_name"),
+         "ncs_process_timeout": str(ncs["process_timeout"]) if ncs.get("process_timeout") is not None else None,
+     }
+     sources: 'dict[str, str]' = {
+         key: "config_file" if value else "unset"
+         for key, value in settings.items()
+     }
+
+     env_settings = load_odps_env()
+     suppressed_env_vars: 'list[str]' = []
+
+     if sources.get("provider") == "config_file":
+         # User explicitly configured an auth provider via `auth login` or
+         # `auth login-ncs`. Env vars must not silently override the saved
+         # config; only env-var-based auth (Path B) is supposed to use them.
+         # Collect which env vars are set so callers can surface a warning.
+         for field, env_value in env_settings.items():
+             if env_value:
+                 suppressed_env_vars.append(field)
+     else:
+         for field, env_value in env_settings.items():
+             if env_value:
+                 settings[field] = env_value
+                 sources[field] = "environment"
+
+     return settings, sources, suppressed_env_vars
+
+
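+ # --- Editor's note: illustrative precedence sketch, not part of the released wheel. ---
+ # With a provider saved in the config file, environment values are reported
+ # rather than applied (hypothetical `config` with ODPS_PROJECT exported):
+ # >>> settings, sources, suppressed = resolve_odps_settings(config)
+ # >>> sources["project"], suppressed
+ # ('config_file', ['project'])   # the env value was seen but suppressed
+
+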
+ def odps_identity_source(sources: 'dict[str, str]') -> 'str':
+     required_sources = {
+         value for key, value in sources.items()
+         if key in {"access_id", "secret_access_key", "security_token", "project", "endpoint"} and value != "unset"
+     }
+     if required_sources == {"environment"}:
+         return "environment"
+     if required_sources == {"config_file"}:
+         return "config_file"
+     if required_sources:
+         return "mixed"
+     return "unknown"
+
+
+ def missing_odps_settings(
+     settings: 'dict[str, str | None]',
+     *,
+     auth_type: 'str' = "access_key",
+ ) -> 'list[str]':
+     required = ["access_id", "secret_access_key", "project", "endpoint"]
+     if auth_type == "sts_token":
+         required.append("security_token")
+     if auth_type == "ncs":
+         missing = [f for f in ("project", "endpoint") if not settings.get(f)]
+         # ncs_process_command can be derived from account_type + identifier,
+         # so only require it when no derivable fields are present either.
+         has_process_command = bool(settings.get("ncs_process_command"))
+         has_account_fields = bool(
+             settings.get("ncs_account_type") and (
+                 settings.get("ncs_employee_id")
+                 or settings.get("ncs_account_name")
+                 or settings.get("ncs_app_name")
+             )
+         )
+         if not has_process_command and not has_account_fields:
+             missing.append("ncs_process_command")
+         return missing
+     return [name for name in required if not settings.get(name)]
+
+
+ def build_odps_identity_payload(
+     *,
+     client: 'Any | None',
+     settings: 'dict[str, str | None]',
+     allowed_operations: 'list[str]',
+     identity_source: 'str',
+     auth_type: 'str' = "access_key",
+     token_expires_at: 'str | None' = None,
+     project: 'str | None' = None,
+     owner_display_name: 'str | None' = None,
+     authenticated: 'bool' = True,
+     configured: 'bool' = True,
+     validation_status: 'str' = "verified",
+ ) -> 'tuple[dict[str, Any], list[str]]':
+     target_project = project or settings.get("project")
+     access_id = (
+         getattr(getattr(client, "account", None), "access_id", None)
+         if client is not None
+         else None
+     ) or settings.get("access_id")
+     masked_access_id = mask_access_id(access_id) if access_id else None
+
+     # Prefer the owner display name when available from the whoami API.
+     # Fall back to a masked access key id when the display name is unavailable.
+     principal_display = owner_display_name or masked_access_id
+
+     warnings: 'list[str]' = []
+     if not owner_display_name and access_id:
+         warnings.append("Principal display name was unavailable; falling back to a masked access key id.")
+     if identity_source == "mixed":
+         warnings.append("Connection settings came from both environment variables and config files; environment values take precedence.")
+
+     payload = {
+         "authenticated": authenticated,
+         "configured": configured,
+         "validation_status": validation_status,
+         "backend": "odps",
+         "auth_type": auth_type,
+         "identity_source": identity_source,
+         "principal_display": principal_display,
+         "principal_masked": masked_access_id,
+         "project": target_project,
+         "region": settings.get("region_name"),
+         "endpoint": settings.get("endpoint"),
+         "project_owner": owner_display_name,
+         "allowed_operations": allowed_operations,
+     }
+     if token_expires_at:
+         payload["token_expires_at"] = token_expires_at
+
+     return payload, warnings
+
+
+ # Partition helpers
+ def build_latest_partition_info(
+     table: 'TableDefinition',
+     *,
+     source: 'str',
+     partitions: 'list[str] | None' = None,
+     latest_partition_override: 'str | None' = None,
+     visible_partition_count: 'int | None' = None,
+ ) -> 'tuple[dict[str, Any], list[str]]':
+     warnings: 'list[str]' = []
+     partition_columns = [column.name for column in table.partition_columns]
+     has_partitions = bool(partition_columns)
+     candidates = list(partitions) if partitions is not None else list(table.partitions)
+     latest_partition = latest_partition_override or select_latest_partition(
+         candidates,
+         partition_columns,
+     )
+
+     if not has_partitions:
+         warnings.append("The table is not partitioned, so `latest_partition` is `null`.")
+     elif not latest_partition:
+         warnings.append("No visible partitions were returned, so `latest_partition` is `null`.")
+
+     partition_values = normalize_partition_values(
+         parse_partition_spec(latest_partition) if latest_partition else {},
+         partition_columns,
+     )
+     if latest_partition and not partition_values:
+         warnings.append("A latest partition was returned, but its partition spec could not be parsed reliably.")
+
+     payload = {
+         "table_name": table.name,
+         "has_partitions": has_partitions,
+         "partition_columns": partition_columns,
+         "latest_partition": latest_partition,
+         "latest_partition_values": partition_values,
+         "reference_time": partition_reference_time(partition_values, partition_columns),
+         "latest_partition_source": source,
+         "visible_partition_count": visible_partition_count if visible_partition_count is not None else len(candidates),
+         "observed_at": now_utc_iso(),
+     }
+     return payload, warnings
+
+
+ def build_freshness_info(
+     table: 'TableDefinition',
+     latest_partition_payload: 'dict[str, Any]',
+     *,
+     warnings: 'list[str] | None' = None,
+ ) -> 'tuple[dict[str, Any], list[str]]':
+     payload_warnings = list(warnings or [])
+     observed_at = now_utc_iso()
+     observed_dt = parse_time_value(observed_at)
+     partition_time = latest_partition_payload.get("reference_time")
+     latest_partition = latest_partition_payload.get("latest_partition")
+     freshness_source = "latest_partition"
+     reference_time = partition_time
+
+     if reference_time is None and table.updated_at:
+         freshness_source = "table_updated_at"
+         reference_time = normalize_time_value(table.updated_at)
+         if latest_partition_payload.get("has_partitions"):
+             payload_warnings.append("The latest partition could not be converted to a timestamp, so freshness fell back to `updated_at`.")
+         else:
+             payload_warnings.append("The table is not partitioned, so freshness was inferred from `updated_at`.")
+     elif reference_time is None:
+         freshness_source = "unknown"
+         if latest_partition:
+             payload_warnings.append("A latest partition exists, but its values do not form a parseable timestamp.")
+         else:
+             payload_warnings.append("No parseable partition timestamp or `updated_at` value was available, so freshness is `unknown`.")
+
+     age_seconds: 'int | None' = None
+     age_hours: 'float | None' = None
+     age_days: 'float | None' = None
+     freshness_status = "unknown"
+     reference_dt = parse_time_value(reference_time) if reference_time else None
+
+     if observed_dt and reference_dt:
+         delta_seconds = int((observed_dt - reference_dt).total_seconds())
+         if delta_seconds < 0:
+             payload_warnings.append("The reference time is in the future, so freshness age was clamped to 0 seconds.")
+             delta_seconds = 0
+         age_seconds = delta_seconds
+         age_hours = round(delta_seconds / 3600, 2)
+         age_days = round(delta_seconds / 86400, 2)
+         if age_hours <= FRESHNESS_FRESH_HOURS:
+             freshness_status = "fresh"
+         elif age_hours <= FRESHNESS_STALE_HOURS:
+             freshness_status = "lagging"
+         else:
+             freshness_status = "stale"
+
+     payload = {
+         "table_name": table.name,
+         "freshness_source": freshness_source,
+         "freshness_status": freshness_status,
+         "reference_time": reference_time,
+         "observed_at": observed_at,
+         "age_seconds": age_seconds,
+         "age_hours": age_hours,
+         "age_days": age_days,
+         "latest_partition": latest_partition,
+         "latest_partition_values": latest_partition_payload.get("latest_partition_values", {}),
+         "partition_columns": latest_partition_payload.get("partition_columns", []),
+         "latest_partition_source": latest_partition_payload.get("latest_partition_source"),
+         "visible_partition_count": latest_partition_payload.get("visible_partition_count"),
+         "status_thresholds": {
+             "fresh_hours": FRESHNESS_FRESH_HOURS,
+             "stale_hours": FRESHNESS_STALE_HOURS,
+         },
+     }
+     return payload, payload_warnings
+
+
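+ # --- Editor's note: illustrative threshold walk-through, not part of the released wheel. ---
+ # With the shipped thresholds (fresh <= 36h, lagging <= 72h, otherwise stale):
+ # a reference_time 40 hours before observed_at yields age_hours == 40.0,
+ # which is > 36 and <= 72, so freshness_status == "lagging".
+
+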
+ def select_latest_partition(partitions: 'list[str]', partition_columns: 'list[str]') -> 'str | None':
+     if not partitions:
+         return None
+     return max(
+         partitions,
+         key=lambda item: partition_sort_key(item, partition_columns),
+     )
+
+
+ def partition_sort_key(spec: 'str', partition_columns: 'list[str]') -> 'tuple[Any, ...]':
+     values = normalize_partition_values(parse_partition_spec(spec), partition_columns)
+     ordered_keys = partition_columns or list(values)
+     if not ordered_keys:
+         return (spec,)
+     return tuple(sortable_partition_value(values.get(key)) for key in ordered_keys)
+
+
+ def parse_partition_spec(spec: 'str | None') -> 'dict[str, str]':
+     if not spec:
+         return {}
+     normalized = PARTITION_PATH_RE.sub(",", spec.strip())
+     values: 'dict[str, str]' = {}
+     for item in normalized.split(","):
+         segment = item.strip()
+         if not segment or "=" not in segment:
+             continue
+         key, value = segment.split("=", 1)
+         values[key.strip()] = value.strip().strip("'").strip('"')
+     return values
+
+
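+ # --- Editor's note: illustrative parse sketch, not part of the released wheel. ---
+ # Comma-style and path-style specs normalize to the same mapping:
+ # >>> parse_partition_spec("ds='2026-03-20',hh='09'")
+ # {'ds': '2026-03-20', 'hh': '09'}
+ # >>> parse_partition_spec("ds=2026-03-20/hh=09")
+ # {'ds': '2026-03-20', 'hh': '09'}
+
+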
+ def normalize_partition_values(
+     values: 'dict[str, str]',
+     partition_columns: 'list[str]',
+ ) -> 'dict[str, str]':
+     ordered: 'dict[str, str]' = {}
+     for key in partition_columns:
+         if key in values:
+             ordered[key] = values[key]
+     for key, value in values.items():
+         if key not in ordered:
+             ordered[key] = value
+     return ordered
+
+
+ def partition_reference_time(
+     partition_values: 'dict[str, str]',
+     partition_columns: 'list[str]',
+ ) -> 'str | None':
+     if not partition_values:
+         return None
+
+     ordered_keys = partition_columns or list(partition_values)
+     base_key = next((key for key in ordered_keys if key in partition_values), None)
+     if base_key is not None:
+         base_date = parse_date_value(partition_values.get(base_key))
+         if base_date is not None:
+             hour = parse_partition_component(partition_values, {"hh", "hour", "hr"})
+             minute = parse_partition_component(partition_values, {"mi", "minute", "mm"})
+             second = parse_partition_component(partition_values, {"ss", "second"})
+             if hour is not None or minute is not None or second is not None:
+                 return datetime(
+                     base_date.year,
+                     base_date.month,
+                     base_date.day,
+                     hour or 0,
+                     minute or 0,
+                     second or 0,
+                     tzinfo=timezone.utc,
+                 ).isoformat()
+
+     for key in reversed(ordered_keys):
+         candidate = normalize_time_value(partition_values.get(key))
+         if candidate is not None:
+             return candidate
+     for value in partition_values.values():
+         candidate = normalize_time_value(value)
+         if candidate is not None:
+             return candidate
+     return None
+
+
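+ # --- Editor's note: illustrative sketch, not part of the released wheel. ---
+ # A date-typed base column plus an hour component collapses into one UTC timestamp:
+ # >>> partition_reference_time({"ds": "2026-03-20", "hh": "09"}, ["ds", "hh"])
+ # '2026-03-20T09:00:00+00:00'
+
+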
+ def parse_partition_component(values: 'dict[str, str]', names: 'set[str]') -> 'int | None':
+     for key, value in values.items():
+         if key.lower() not in names:
+             continue
+         text = str(value).strip()
+         if not re.fullmatch(r"\d{1,2}", text):
+             continue
+         return int(text)
+     return None
+
+
+ def sortable_partition_value(value: 'Any') -> 'tuple[int, Any]':
+     if value is None:
+         return (0, "")
+     text = str(value).strip().strip("'").strip('"')
+     if re.fullmatch(r"-?\d+", text):
+         return (1, int(text))
+     if re.fullmatch(r"-?\d+\.\d+", text):
+         return (2, float(text))
+     normalized_time = normalize_time_value(text)
+     if normalized_time is not None:
+         dt = parse_time_value(normalized_time)
+         if dt is not None:
+             return (3, int(dt.timestamp()))
+     return (4, text.lower())
+
+
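+ # --- Editor's note: illustrative ranking sketch, not part of the released wheel. ---
+ # The leading rank keeps mixed value kinds comparable inside max()/sorted():
+ # >>> sortable_partition_value("20260320")
+ # (1, 20260320)        # pure digits sort as integers
+ # >>> sortable_partition_value("2026-03-20")[0]
+ # 3                    # parseable timestamps sort by epoch seconds
+ # >>> sortable_partition_value("adhoc")
+ # (4, 'adhoc')         # everything else sorts lexicographically
+
+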
+ def parse_date_value(value: 'Any') -> 'date | None':
+     text = str(value).strip().strip("'").strip('"')
+     for fmt in ("%Y-%m-%d", "%Y%m%d"):
+         try:
+             return datetime.strptime(text, fmt).date()
+         except ValueError:
+             continue
+     return None
+
+
+ def normalize_time_value(value: 'Any') -> 'str | None':
+     if value is None:
+         return None
+     text = str(value).strip().strip("'").strip('"')
+     if not text:
+         return None
+     parsed = parse_time_value(text)
+     return parsed.isoformat() if parsed is not None else None
+
+
+ def parse_time_value(value: 'Any') -> 'datetime | None':
+     if value is None:
+         return None
+     text = str(value).strip().strip("'").strip('"')
+     if not text:
+         return None
+
+     candidate = text.replace("Z", "+00:00")
+     fromisoformat = getattr(datetime, "fromisoformat", None)
+     parsed = None
+     if fromisoformat is not None:
+         try:
+             parsed = fromisoformat(candidate)
+         except ValueError:
+             parsed = None
+     if parsed is not None:
+         if parsed.tzinfo is None:
+             parsed = parsed.replace(tzinfo=timezone.utc)
+         return parsed
+
+     normalized_candidate = ISO_TIMEZONE_COLON_RE.sub(r"\1\2", candidate)
+     for fmt in (
+         "%Y-%m-%dT%H:%M:%S.%f%z",
+         "%Y-%m-%dT%H:%M:%S%z",
+         "%Y-%m-%d %H:%M:%S.%f%z",
+         "%Y-%m-%d %H:%M:%S%z",
+         "%Y-%m-%dT%H:%M:%S.%f",
+         "%Y-%m-%dT%H:%M:%S",
+         "%Y%m%d%H%M%S",
+         "%Y%m%d%H",
+         "%Y-%m-%d %H:%M:%S",
+         "%Y-%m-%d",
+         "%Y%m%d",
+     ):
+         try:
+             source = normalized_candidate if "%z" in fmt else text
+             parsed = datetime.strptime(source, fmt)
+         except ValueError:
+             continue
+         if parsed.tzinfo is None:
+             parsed = parsed.replace(tzinfo=timezone.utc)
+         return parsed
+     return None
+
+
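+ # --- Editor's note: illustrative sketch, not part of the released wheel. ---
+ # Naive inputs are assumed to be UTC; "Z" suffixes are rewritten so that
+ # datetime.fromisoformat() (which rejects "Z" before Python 3.11) can parse them:
+ # >>> parse_time_value("2026-03-20T09:00:00Z")
+ # datetime.datetime(2026, 3, 20, 9, 0, tzinfo=datetime.timezone.utc)
+ # >>> parse_time_value("20260320").isoformat()
+ # '2026-03-20T00:00:00+00:00'
+
+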
+ def partition_spec_text(partition: 'Any') -> 'str | None':
+     if partition is None:
+         return None
+     spec = getattr(partition, "partition_spec", None)
+     target = spec if spec is not None else partition
+     text = str(target).strip()
+     return text or None
+
+
+ # String helpers
+ def mask_access_id(value: 'str | None') -> 'str | None':
+     if not value:
+         return None
+     if len(value) <= 8:
+         return "*" * len(value)
+     return f"{value[:4]}***{value[-4:]}"
+
+
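+ # --- Editor's note: illustrative masking sketch with a made-up key, not part of the released wheel. ---
+ # >>> mask_access_id("LTAI5tFakeExample123")
+ # 'LTAI***e123'
+ # >>> mask_access_id("short")
+ # '*****'
+
+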
+ def quote_table_name(table_name: 'str') -> 'str':
+     if not re.fullmatch(r"[A-Za-z_][\w]*(\.[A-Za-z_][\w]*)*", table_name):
+         raise ValidationError(
+             f"Invalid table name: {table_name}",
+             suggestion="Use a plain table name or `project.table` form instead of passing a SQL fragment.",
+         )
+     return ".".join(f"`{part}`" for part in table_name.split("."))
+
+
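+ # --- Editor's note: illustrative sketch, not part of the released wheel. ---
+ # Each dotted part is backtick-quoted separately, and anything that is not a
+ # plain identifier path is rejected before it can reach a SQL string:
+ # >>> quote_table_name("demo_project.events")
+ # '`demo_project`.`events`'
+ # >>> quote_table_name("events; DROP TABLE x")   # raises ValidationError
+
+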
+ def sql_string_literal(value: 'str') -> 'str':
+     return "'" + value.replace("'", "''") + "'"
+
+
+ # Task and job helpers
+ def build_task_summary(
+     sql: 'str | None',
+     *,
+     task_names: 'list[str] | None' = None,
+     task_types: 'dict[str, str] | None' = None,
+ ) -> 'dict[str, Any]':
+     normalized_sql = (sql or "").strip().rstrip(";")
+     return {
+         "operation": detect_operation(normalized_sql) if normalized_sql else "UNKNOWN",
+         "tables_used": extract_table_names(normalized_sql) if normalized_sql else [],
+         "task_names": task_names or [],
+         "task_types": task_types or {},
+         "sql_excerpt": normalized_sql[:200] if normalized_sql else None,
+     }
+
+
+ def classify_failure_reason(reason: 'str | None') -> 'dict[str, Any]':
+     text = (reason or "").strip()
+     if not text:
+         return {
+             "category": "not_failed",
+             "retryable": None,
+             "summary": "No diagnosable failure reason is currently available.",
+         }
+     lowered = text.lower()
+     if any(token in lowered for token in ("semantic analysis exception", "parse exception", "syntax")):
+         return {
+             "category": "sql_semantic_error",
+             "retryable": False,
+             "summary": "SQL semantic or syntax error.",
+         }
+     if any(token in lowered for token in ("permission", "access denied", "no privilege")):
+         return {
+             "category": "permission_error",
+             "retryable": False,
+             "summary": "Permission was denied or access was rejected.",
+         }
+     # "取消" is "cancel" in Chinese-language backend messages.
+     if any(token in lowered for token in ("cancel", "取消")):
+         return {
+             "category": "client_cancelled",
+             "retryable": False,
+             "summary": "The job was cancelled explicitly.",
+         }
+     if any(token in lowered for token in ("quota", "resource", "timeout", "timed out", "memory")):
+         return {
+             "category": "resource_or_timeout_error",
+             "retryable": True,
+             "summary": "Resource, quota, or timeout issue; retrying may help.",
+         }
+     return {
+         "category": "unknown_failure",
+         "retryable": False,
+         "summary": "The failure reason could not be classified.",
+     }
+
+
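+ # --- Editor's note: illustrative classification sketch with made-up messages, not part of the released wheel. ---
+ # >>> classify_failure_reason("Parse exception - invalid token near SELEC")["category"]
+ # 'sql_semantic_error'
+ # >>> classify_failure_reason("Quota exceeded for computing resources")["retryable"]
+ # True
+
+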
+ # Sample and profile helpers
+ def resolve_sample_request(
+     table: 'TableDefinition',
+     *,
+     partition: 'str | None',
+     columns: 'list[str] | None',
+     strict_partition_check: 'bool',
+ ) -> 'tuple[list[str], str | None, dict[str, str]]':
+     column_lookup = {
+         column.name: column
+         for column in [*table.columns, *table.partition_columns]
+     }
+     selected_columns = columns or [column.name for column in table.columns]
+     missing_columns = [column for column in selected_columns if column not in column_lookup]
+     if missing_columns:
+         raise ValidationError(
+             f"Unknown column(s): {', '.join(missing_columns)}",
+             suggestion="Run `maxc meta describe` to inspect the available schema.",
+         )
+
+     applied_partition = None
+     partition_values: 'dict[str, str]' = {}
+     if partition:
+         partition_column_names = [column.name for column in table.partition_columns]
+         if not partition_column_names:
+             raise ValidationError(
+                 f"Table `{table.name}` is not partitioned, so `--partition` cannot be used.",
+                 suggestion="Run `maxc meta describe` or `maxc meta latest-partition` first.",
+             )
+         partition_values = normalize_partition_values(
+             parse_partition_spec(partition),
+             partition_column_names,
+         )
+         if not partition_values:
+             raise ValidationError(
+                 "`--partition` could not be parsed.",
+                 suggestion="Example: `--partition ds=2026-03-20`",
+             )
+         if set(partition_values) != set(partition_column_names):
+             raise ValidationError(
+                 f"`--partition` must provide all partition columns: {', '.join(partition_column_names)}.",
+                 suggestion=f"Example: `--partition {canonical_partition_spec(dict.fromkeys(partition_column_names, '...'))}`",
+             )
+         applied_partition = canonical_partition_spec(
+             {name: partition_values[name] for name in partition_column_names}
+         )
+
+         if strict_partition_check and table.partitions:
+             normalized_partitions = {
+                 canonical_partition_spec(normalize_partition_values(parse_partition_spec(item), partition_column_names))
+                 for item in table.partitions
+             }
+             if applied_partition not in normalized_partitions:
+                 raise ValidationError(
+                     f"Partition does not exist: {applied_partition}",
+                     suggestion="Run `maxc meta latest-partition` or `maxc meta partitions` first.",
+                 )
+
+     deduped_columns: 'list[str]' = []
+     for column in selected_columns:
+         if column not in deduped_columns:
+             deduped_columns.append(column)
+     return deduped_columns, applied_partition, partition_values
+
+
+ def canonical_partition_spec(values: 'dict[str, str]') -> 'str':
+     return ",".join(f"{key}={value}" for key, value in values.items())
+
+
+ def build_sample_schema(table: 'TableDefinition', selected_columns: 'list[str]') -> 'list[dict[str, Any]]':
+     column_lookup = {
+         column.name: column
+         for column in [*table.columns, *table.partition_columns]
+     }
+     return [
+         {
+             "name": column_lookup[name].name,
+             "type": column_lookup[name].type,
+             "comment": column_lookup[name].comment,
+         }
+         for name in selected_columns
+     ]
+
+
+ def project_sample_rows(
+     rows: 'list[dict[str, Any]]',
+     selected_columns: 'list[str]',
+     partition_values: 'dict[str, str]',
+ ) -> 'list[dict[str, Any]]':
+     projected_rows: 'list[dict[str, Any]]' = []
+     for row in rows:
+         projected: 'dict[str, Any]' = {}
+         for column in selected_columns:
+             if column in row:
+                 projected[column] = row[column]
+             else:
+                 projected[column] = partition_values.get(column)
+         projected_rows.append(projected)
+     return projected_rows
+
+
+ def top_values(values: 'list[Any]', limit: 'int' = 5) -> 'list[dict[str, Any]]':
+     counter = Counter(str(value) for value in values)
+     return [
+         {"value": value, "count": count}
+         for value, count in counter.most_common(limit)
+     ]
+
+
+ def build_profile(
+     table: 'TableDefinition',
+     sample_rows: 'list[dict[str, Any]]',
+     *,
+     applied_partition: 'str | None' = None,
+ ) -> 'dict[str, Any]':
+     profiles = []
+     for column in table.columns:
+         values = [row.get(column.name) for row in sample_rows]
+         non_null = [value for value in values if value is not None]
+         profile: 'dict[str, Any]' = {
+             "name": column.name,
+             "type": column.type,
+             "comment": column.comment,
+             "null_count_in_sample": len(values) - len(non_null),
+             "null_ratio_in_sample": round((len(values) - len(non_null)) / len(values), 4) if values else None,
+             "sample_size": len(values),
+             "sample_values": non_null[:3],
+             "distinct_count_in_sample": len({str(item) for item in non_null}),
+             "top_values_in_sample": top_values(non_null),
+         }
+         if non_null and all(isinstance(item, (int, float, Decimal)) for item in non_null):
+             numeric = [float(item) for item in non_null]
+             profile["min"] = min(numeric)
+             profile["max"] = max(numeric)
+         profiles.append(profile)
+     return {
+         "table_name": table.name,
+         "partition_count": len(table.partitions),
+         "applied_partition": applied_partition,
+         "sampled_rows": len(sample_rows),
+         "columns": profiles,
+     }
+
+
+ def build_query_outline(sql: 'str') -> 'dict[str, Any]':
+     return {
+         "operation": detect_operation(sql),
+         "normalized_sql": normalize_sql(sql),
+         "tables_used": extract_table_names(sql),
+         "projected_columns": parse_select_projection(sql),
+     }
+
+
+ def translate_odps_error(exc: 'Exception', context: 'str' = "") -> 'MaxCError':
+     """Translate ODPS errors into structured CLI errors."""
+     message = str(exc)
+
+     project_name = _extract_resource_name(message, "projects")
+     table_name = _extract_resource_name(message, "tables")
+     schema_name = _extract_resource_name(message, "schemas")
+
+     if isinstance(exc, ModuleNotFoundError) and exc.name == "pandas":
+         return FeatureUnavailableError(
+             "pandas is required to read MaxCompute result sets that contain TIMESTAMP-like types.",
+             suggestion="Install pandas, or limit sampling / profiling to columns that do not require pandas.",
+         )
+
+     if isinstance(exc, OdpsNoPermission):
+         return _build_permission_error(message, context, project_name, table_name, schema_name)
+
+     if isinstance(exc, OdpsNoSuchObject):
+         return NotFoundError(
+             message,
+             suggestion="Run `maxc meta list-tables` or `maxc meta search` to verify the object exists.",
+         )
+
+     if isinstance(exc, ODPSError):
+         lowered = message.lower()
+         if "permission" in lowered or "access denied" in lowered or "nopermission" in lowered:
+             return _build_permission_error(message, context, project_name, table_name, schema_name)
+         if "parse exception" in lowered or "semantic analysis exception" in lowered:
+             return SqlError(message)
+         if "connection" in lowered or "failed to resolve" in lowered:
+             return BackendConnectionError(
+                 message,
+                 suggestion="Check network connectivity, the configured endpoint, and environment variables.",
+             )
+         return SqlError(message)
+
+     return BackendConnectionError(
+         message,
+         suggestion="Check MaxCompute network connectivity and environment configuration.",
+     )
+
+
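+ # --- Editor's note: illustrative mapping sketch, not part of the released wheel. ---
+ # Unrecognized exception types degrade to a connection-flavored error, so callers
+ # always receive a MaxCError subclass they can render uniformly:
+ # >>> type(translate_odps_error(RuntimeError("connection refused"))).__name__
+ # 'BackendConnectionError'
+
+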
+ def _extract_resource_name(message: 'str', resource_type: 'str') -> 'str | None':
+     """Extract a resource name from an ODPS error message."""
+     patterns = [
+         rf"\{{acs:odps:[^:]*:{resource_type}/([^}}\s]+)\}}",
+         rf"{resource_type}/([^}}\s/]+)",
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, message, re.IGNORECASE)
+         if match:
+             return match.group(1)
+     return None
+
+
+ def _build_permission_error(
+     message: 'str',
+     context: 'str',
+     project_name: 'str | None',
+     table_name: 'str | None',
+     schema_name: 'str | None',
+ ) -> 'PermissionDeniedError':
+     """Build permission-denied errors with more precise suggestions."""
+     if context == "list_projects" and project_name:
+         return PermissionDeniedError(
+             f"Failed to list projects: missing Read permission on project {project_name}.",
+             suggestion=f"Verify that your account has `odps:Read` on project {project_name}, or contact the project owner.",
+         )
+
+     if context == "list_schemas":
+         target = project_name or "the current project"
+         return PermissionDeniedError(
+             f"Failed to list schemas: missing Read permission on project {target}.",
+             suggestion=f"Verify that your account has `odps:Read` on project {target}.",
+         )
+
+     if context == "get_project_info" and project_name:
+         return PermissionDeniedError(
+             f"Failed to read project info: missing Read permission on project {project_name}.",
+             suggestion=f"Verify that your account has `odps:Read` on project {project_name}.",
+         )
+
+     if table_name:
+         return PermissionDeniedError(
+             f"Operation failed: missing required permission on table {table_name}.",
+             suggestion=f"Verify that your account has the required permission on table {table_name}, or contact the project owner.",
+         )
+
+     if project_name:
+         return PermissionDeniedError(
+             f"Operation failed: missing required permission on project {project_name}.",
+             suggestion=f"Verify that your account has the required permission on project {project_name}, or contact the project owner.",
+         )
+
+     return PermissionDeniedError(message)
+
+
+ # Data conversion helpers
+ def record_to_dict(columns: 'list[str]', values: 'Iterable[Any]') -> 'dict[str, Any]':
+     return {
+         column: json_safe(value)
+         for column, value in zip(columns, values)
+     }
+
+
+ def json_safe(value: 'Any') -> 'Any':
+     if isinstance(value, Decimal):
+         return str(value)
+     # datetime must be checked before date because datetime subclasses date.
+     if isinstance(value, datetime):
+         if value.tzinfo is None:
+             value = value.replace(tzinfo=timezone.utc)
+         return value.isoformat()
+     if isinstance(value, date):
+         return value.isoformat()
+     if isinstance(value, time):
+         return value.isoformat()
+     if isinstance(value, bytes):
+         try:
+             return value.decode("utf-8")
+         except Exception:
+             return value.hex()
+     return value
+
+
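+ # --- Editor's note: illustrative conversion sketch, not part of the released wheel. ---
+ # >>> record_to_dict(["amount", "ts"], [Decimal("1.50"), datetime(2026, 3, 20, 9, 0)])
+ # {'amount': '1.50', 'ts': '2026-03-20T09:00:00+00:00'}
+
+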
+ def _dt_to_iso(value: 'datetime | None') -> 'str | None':
+     if value is None:
+         return None
+     if value.tzinfo is None:
+         return value.replace(tzinfo=timezone.utc).isoformat()
+     return value.isoformat()
+
+
+ def _duration_ms(start: 'datetime | None', end: 'datetime | None') -> 'int':
+     if start is None or end is None:
+         return 0
+     return int((end - start).total_seconds() * 1000)