coderouter-cli 2.3.0a4__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderouter/cli.py CHANGED
@@ -293,6 +293,18 @@ def _build_parser() -> argparse.ArgumentParser:
293
293
  metavar="N",
294
294
  help="Use only the last N entries (applied after --since and --provider filters).",
295
295
  )
296
+ # P1-6: --suggest-rules — statistical analysis → routing rule proposals.
297
+ replay.add_argument(
298
+ "--suggest-rules",
299
+ action="store_true",
300
+ help=(
301
+ "P1-6: analyse the request journal and print actionable routing "
302
+ "rule suggestions as copy-paste YAML snippets. Suggestions cover "
303
+ "provider reordering by cost, prompt_cache enablement, drift "
304
+ "detection configuration, and goal profile creation. "
305
+ "Can be combined with --since / --limit to scope the analysis window."
306
+ ),
307
+ )
296
308
 
297
309
  return parser
298
310
 
@@ -684,6 +696,25 @@ def _run_replay(args: argparse.Namespace) -> int:
684
696
  print("replay: no matching entries found.")
685
697
  return 0
686
698
 
699
+ if getattr(args, "suggest_rules", False):
700
+ # P1-6: statistical rule suggestion mode.
701
+ # Always compute a full window summary (ignores --compare / --provider).
702
+ from coderouter.state.replay import summarize_window as _sw
703
+ from coderouter.state.suggest_rules import format_suggestions, suggest_rules
704
+
705
+ # Re-read without provider filter so we see all providers.
706
+ all_entries = read_request_log(log_path, since=args.since)
707
+ if args.limit is not None and args.limit > 0:
708
+ all_entries = all_entries[-args.limit:]
709
+ full_summary = _sw(all_entries)
710
+ suggestions = suggest_rules(full_summary)
711
+ print(f"Request journal: {len(all_entries)} entries analysed")
712
+ print(f" Window: {full_summary.first_ts} → {full_summary.last_ts}")
713
+ print(f" Providers: {', '.join(sorted(full_summary.providers))}")
714
+ print()
715
+ print(format_suggestions(suggestions))
716
+ return 0
717
+
687
718
  if args.compare:
688
719
  provider_a, provider_b = args.compare
689
720
  comparison = compare_providers(entries, provider_a, provider_b)
@@ -658,6 +658,28 @@ class FallbackChain(BaseModel):
658
658
  ),
659
659
  )
660
660
 
661
+ # --- P1-5: goal_mode — tighter drift thresholds for /goal sessions -------
662
+ #
663
+ # When True, the drift detector automatically switches to the
664
+ # ``THRESHOLDS_GOAL`` preset regardless of ``drift_detection_sensitivity``,
665
+ # and lowers ``min_window_fill`` to 4 so stall detection fires faster.
666
+ #
667
+ # Intended for profiles routed by the ``/goal`` meta-command where
668
+ # the agent is expected to make steady forward progress. Repetition and
669
+ # length collapse are much more meaningful signals in that context than
670
+ # in a general-purpose chat session.
671
+ goal_mode: bool = Field(
672
+ default=False,
673
+ description=(
674
+ "P1-5: when True, automatically applies the ``goal`` drift "
675
+ "threshold preset (stricter thresholds, lower ``min_window_fill`` "
676
+ "of 4) for this profile. Overrides ``drift_detection_sensitivity`` "
677
+ "when drift_detection_action is not ``off``. Designed for "
678
+ "agent/goal sessions where forward-progress stalls are more "
679
+ "actionable than in ad-hoc chat."
680
+ ),
681
+ )
682
+
661
683
  # --- v2.0-H (L6): Mid-stream partial stitching --------------------------
662
684
  # * ``off`` — discard partial content on mid-stream failure (legacy).
663
685
  # * ``surface`` — return partial content as a truncated-but-valid response.
@@ -852,6 +874,130 @@ class AutoRouterConfig(BaseModel):
852
874
  )
853
875
 
854
876
 
877
class LauncherBackendConfig(BaseModel):
    """Binary-path override for a single Launcher backend.

    Leaving ``binary`` unset means the Launcher is documented to use the
    backend's default executable name (``llama-server`` for llama.cpp,
    ``python`` for vllm), resolved through ``$PATH``.  Typical reasons to
    set it explicitly:

    - llama.cpp was compiled from source
      (e.g. ``~/llama.cpp/build/bin/llama-server``)
    - vllm is installed inside a virtualenv (e.g. ``~/.venv/bin/python``)
    - several builds are installed side by side and one must be pinned

    This model stores the path verbatim; ``~`` and environment-variable
    expansion is documented to happen at launch time, not here.

    Example::

        backends:
          llama.cpp:
            binary: ~/llama.cpp/build/bin/llama-server
          vllm:
            binary: ~/.venv/bin/python
    """

    # Unknown keys in the YAML block are rejected rather than ignored.
    model_config = ConfigDict(extra="forbid")

    binary: str | None = Field(
        default=None,
        description=(
            "Absolute or ``~``-relative path to the backend executable. llama.cpp "
            "default: ``llama-server`` (PATH). vllm default: ``python`` (PATH). "
            "Expanded at launch time."
        ),
    )
911
+
912
+
913
class LauncherOptionProfile(BaseModel):
    """A named preset of command-line options for one launcher backend.

    ``args`` is a mapping from CLI flag to value with these documented
    conventions:

    - ``True``  — emit the flag on its own (e.g. ``--no-mmap``)
    - ``False`` — leave the flag out completely
    - anything else — stringified and emitted as a ``--flag value`` pair

    Example::

        name: "GPU speed first"
        args:
          "-ngl": 99
          "--ctx-size": 4096
          "--no-mmap": false
    """

    # Unknown keys in the YAML block are rejected rather than ignored.
    model_config = ConfigDict(extra="forbid")

    name: str = Field(
        ...,
        description="Display name shown in the Launcher UI dropdown.",
    )
    args: dict[str, str | int | float | bool] = Field(
        default_factory=dict,
        description=(
            "CLI flag → value mapping. bool True = flag only (no value). "
            "bool False = omit flag. All other types are stringified and "
            "passed as '--flag value'."
        ),
    )
942
+
943
+
944
class LauncherConfig(BaseModel):
    """Schema for the ``launcher:`` section of providers.yaml.

    Configures the Launcher UI served at ``/launcher``: which directories
    are scanned for models, which executable each backend uses, and which
    named option presets are offered per backend.

    Example::

        launcher:
          model_dirs:
            - ~/models
            - /data/gguf
          option_profiles:
            llama.cpp:
              - name: "GPU speed first"
                args:
                  "-ngl": 99
                  "--ctx-size": 4096
            vllm:
              - name: "Standard"
                args:
                  "--dtype": "auto"
                  "--max-model-len": 4096
    """

    # Unknown keys in the YAML block are rejected rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # Stored verbatim; expansion and existence checks are left to the scanner.
    model_dirs: list[str] = Field(
        default_factory=list,
        description=(
            "Directories to scan for model files (.gguf, .safetensors, .bin, "
            ".pt, .ggml). Paths are expanded (~ and env vars) at scan time, "
            "not at load. Non-existent paths are silently skipped."
        ),
    )
    # Backend name -> executable override.
    backends: dict[str, LauncherBackendConfig] = Field(
        default_factory=dict,
        description=(
            "Per-backend binary path overrides. Keys are backend names "
            "('llama.cpp', 'vllm'). When a key is absent, the default "
            "executable is used ('llama-server' / 'python') and resolved via "
            "PATH. Useful when running a from-source build or a venv-specific "
            "binary."
        ),
    )
    # Backend name -> ordered list of presets (list order is preserved).
    option_profiles: dict[str, list[LauncherOptionProfile]] = Field(
        default_factory=dict,
        description=(
            "Named option presets per backend. Keys should be backend names: "
            "'llama.cpp', 'vllm'. Each key maps to an ordered list of named "
            "presets. A free-form 'extra args' field is always available in "
            "the UI for one-off overrides without touching this config."
        ),
    )
999
+
1000
+
855
1001
  class PluginsConfig(BaseModel):
856
1002
  """The ``plugins:`` block in providers.yaml (v2.3.0).
857
1003
 
@@ -1060,6 +1206,17 @@ class CodeRouterConfig(BaseModel):
1060
1206
  "plugins (zero-cost, backward-compatible default)."
1061
1207
  ),
1062
1208
  )
1209
+ launcher: LauncherConfig | None = Field(
1210
+ default=None,
1211
+ description=(
1212
+ "Launcher configuration for the /launcher UI. "
1213
+ "Defines model_dirs to scan and option_profiles per backend "
1214
+ "('llama.cpp', 'vllm'). "
1215
+ "Unset (None) = Launcher UI shows empty model list and no profiles. "
1216
+ "The Launcher UI itself is always available at /launcher "
1217
+ "regardless of this setting."
1218
+ ),
1219
+ )
1063
1220
 
1064
1221
  @model_validator(mode="after")
1065
1222
  def _check_default_profile_exists(self) -> CodeRouterConfig:
@@ -12,6 +12,8 @@ to hit:
12
12
  * :mod:`coderouter.guards.self_healing` — v2.0-J auto-exclude +
13
13
  restart + recovery probe
14
14
  * :mod:`coderouter.guards.continuous_probe` — v2.0-I background probing
15
+ * :mod:`coderouter.guards._fingerprint` — P1-4 response fingerprinting
16
+ for goal_progress_stall signal
15
17
 
16
18
  Each guard is a pure-functional / single-class module that the engine
17
19
  consults at the appropriate dispatch point. Guards never block the
@@ -0,0 +1,125 @@
1
+ """Response fingerprinting for goal_progress_stall detection (P1-4).
2
+
3
+ A "fingerprint" is a compact, order-independent signature of the *content*
4
+ of an assistant response — independent of surface variation (filler phrases,
5
+ minor rewordings). Two responses with the same fingerprint are considered
6
+ semantically repetitive for stall-detection purposes.
7
+
8
+ Algorithm
9
+ ---------
10
+ 1. Normalise: NFKC, lowercase, strip punctuation and digits, collapse whitespace.
11
+ 2. Extract the N most-frequent content words (excluding a small stop-list and tokens of one or two characters).
12
+ 3. Sort alphabetically, join with '|', SHA-256 → 12-hex prefix.
13
+
14
+ The 12-hex prefix gives 2^48 (about 281 trillion) distinct values — the probability
15
+ of an accidental collision within any 20-response window is negligible (< 1 in 10^12).
16
+
17
+ Why top-N content words instead of full hash?
18
+ ----------------------------------------------
19
+ A verbatim hash would fail to catch "I cannot do X. Let me try Y" vs
20
+ "Let me try Y as I cannot do X" — same stall, different hash. By
21
+ extracting the dominant vocabulary we get useful fuzzy equality without
22
+ the overhead of embedding models.
23
+
24
+ Usage
25
+ -----
26
+ from coderouter.guards._fingerprint import fingerprint_response
27
+
28
+ fp = fingerprint_response(response_text)
29
+ obs = ResponseObservation(..., response_fingerprint=fp)
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import hashlib
35
+ import re
36
+ import unicodedata
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Stop-word list (English + common LLM filler)
40
+ # ---------------------------------------------------------------------------
41
+
42
_STOP_WORDS: frozenset[str] = frozenset(
    {
        # English function words
        "a", "an", "the", "and", "or", "but", "if", "in", "on", "at", "to",
        "for", "of", "with", "by", "from", "as", "is", "it", "its", "be",
        "was", "are", "were", "been", "has", "have", "had", "do", "does",
        "did", "will", "would", "could", "should", "may", "might", "shall",
        "this", "that", "these", "those", "i", "you", "he", "she", "we",
        "they", "me", "him", "her", "us", "them", "my", "your", "his",
        "their", "our", "what", "which", "who", "how", "when", "where",
        "why", "not", "no", "so", "up", "out", "into", "about", "than",
        "then", "there", "here", "also", "just", "can", "get", "all",
        # Common LLM assistant filler
        "certainly", "sure", "absolutely", "great", "happy", "help",
        "please", "let", "know", "feel", "free", "answer", "question",
        "response", "following", "based", "provide", "using",
    }
)

# ---------------------------------------------------------------------------
# Number of top content words to include in the fingerprint
# ---------------------------------------------------------------------------

_TOP_N: int = 12

# Anything that is neither a word character nor whitespace, plus every digit.
# One pass replaces these with spaces; ``str.split()`` then collapses the
# resulting whitespace runs, so no separate "collapse whitespace" step is needed.
_NOISE_RE = re.compile(r"[^\w\s]|\d")

# Responses with fewer distinct content words than this are not fingerprinted.
_MIN_DISTINCT_WORDS: int = 3


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def fingerprint_response(text: str, *, top_n: int = _TOP_N) -> str:
    """Return a 12-hex fingerprint string for *text*.

    The fingerprint is built from the ``top_n`` most frequent content words
    (ties broken alphabetically), sorted alphabetically, joined with ``|``
    and hashed with SHA-256; the first 12 hex digits are returned.  Word
    order, punctuation, digits and letter case therefore do not affect the
    result.

    Parameters
    ----------
    text:
        Raw assistant response text (plain text, not JSON).
    top_n:
        Number of most-frequent content words to include in the signature.
        Defaults to ``_TOP_N`` (12).  Lower values are more fuzzy; higher
        values are more precise.  Values below 1 would leave nothing to
        hash (or silently drop words), so they yield ``""``.

    Returns
    -------
    A 12-character lowercase hexadecimal string, e.g. ``"a3f7b2c091de"``.
    Returns ``""`` ("not fingerprinted") when *text* is empty or
    whitespace-only, when it contains fewer than three distinct content
    words, or when ``top_n`` is less than 1.
    """
    # An empty word selection would hash to the same digest for every
    # response and register as constant repetition downstream, so treat a
    # non-positive top_n like any other "cannot fingerprint" case.
    if top_n < 1 or not text or not text.strip():
        return ""

    # 1. Unicode normalisation + lowercase, then blank out punctuation/digits.
    cleaned = _NOISE_RE.sub(" ", unicodedata.normalize("NFKC", text).lower())

    # 2. Count content words: skip stop words and tokens of 1-2 characters.
    freq: dict[str, int] = {}
    for word in cleaned.split():
        if len(word) > 2 and word not in _STOP_WORDS:
            freq[word] = freq.get(word, 0) + 1

    # Single-word or near-empty responses (test stubs, error codes, bare
    # ACKs) would produce the same fingerprint every time and falsely
    # inflate the repetition rate, so they are left un-fingerprinted.
    if len(freq) < _MIN_DISTINCT_WORDS:
        return ""

    # 3. Most frequent first; alphabetical tie-break keeps the cut deterministic.
    top_words = sorted(freq, key=lambda w: (-freq[w], w))[:top_n]

    # 4. Alphabetical order makes the signature independent of word order.
    signature = "|".join(sorted(top_words))
    return hashlib.sha256(signature.encode()).hexdigest()[:12]
@@ -34,6 +34,10 @@ Signals
34
34
  * ``stop_anomaly_rate`` — fraction of responses with unexpected stop_reason
35
35
  (not "end_turn" / "tool_use" / "max_tokens")
36
36
  * ``error_rate`` — fraction of attempts that ended in failure
37
+ * ``goal_progress_stall`` (P1-4) — fraction of fingerprinted responses
38
+ whose fingerprint matches a previously-seen fingerprint in the window,
39
+ indicating the model is repeating itself without making progress.
40
+ Only evaluated when at least three observations in the window carry a non-empty ``response_fingerprint``.
37
41
 
38
42
  Thresholds are bundled as :class:`DriftThresholds` with four presets
39
43
  (``low`` / ``normal`` / ``high`` sensitivity, plus ``goal`` for goal-mode profiles).
@@ -71,6 +75,15 @@ class ResponseObservation:
71
75
  is_error: bool = False
72
76
  """True if the attempt ended in provider-failed / provider-failed-midstream."""
73
77
  stream: bool = False
78
+ response_fingerprint: str | None = None
79
+ """P1-4: compact content fingerprint of the response text.
80
+
81
+ When set, used by the ``goal_progress_stall`` signal to detect
82
+ repetition: the same fingerprint appearing multiple times in the
83
+ window indicates the model is not making progress. Computed by
84
+ :func:`coderouter.guards._fingerprint.fingerprint_response`.
85
+ Pass ``None`` (default) to opt-out — the signal is silently skipped.
86
+ """
74
87
 
75
88
 
76
89
  # ---------------------------------------------------------------------------
@@ -100,6 +113,12 @@ class DriftThresholds:
100
113
  length_collapse_ratio: float = 0.5
101
114
  """If recent half median is < 50% of earlier half median → collapse."""
102
115
 
116
+ # P1-4: repetition/stall threshold
117
+ repetition_rate_threshold: float = 0.4
118
+ """P1-4: fraction of fingerprinted responses whose fingerprint has
119
+ appeared before in the window. Above this rate → goal_progress_stall
120
+ signal fires (mild). Default 0.4 = 2 out of 5 responses are repeats."""
121
+
103
122
  # Minimum observations before detection fires
104
123
  min_window_fill: int = 6
105
124
  """Don't trigger until at least this many observations in the window."""
@@ -112,6 +131,7 @@ THRESHOLDS_LOW = DriftThresholds(
112
131
  tool_silence_rate=0.8,
113
132
  stop_anomaly_rate=0.6,
114
133
  error_rate=0.4,
134
+ repetition_rate_threshold=0.6,
115
135
  min_window_fill=10,
116
136
  )
117
137
 
@@ -123,6 +143,19 @@ THRESHOLDS_HIGH = DriftThresholds(
123
143
  tool_silence_rate=0.5,
124
144
  stop_anomaly_rate=0.3,
125
145
  error_rate=0.15,
146
+ repetition_rate_threshold=0.25,
147
+ min_window_fill=4,
148
+ )
149
+
150
+ # P1-5: goal-mode preset — tighter thresholds + lower min_window_fill.
151
+ # Applied automatically when the profile has goal_mode=True.
152
+ THRESHOLDS_GOAL = DriftThresholds(
153
+ empty_response_rate=0.2,
154
+ length_collapse_ratio=0.6,
155
+ tool_silence_rate=0.5,
156
+ stop_anomaly_rate=0.3,
157
+ error_rate=0.15,
158
+ repetition_rate_threshold=0.2,
126
159
  min_window_fill=4,
127
160
  )
128
161
 
@@ -130,6 +163,7 @@ SENSITIVITY_PRESETS: dict[str, DriftThresholds] = {
130
163
  "low": THRESHOLDS_LOW,
131
164
  "normal": THRESHOLDS_NORMAL,
132
165
  "high": THRESHOLDS_HIGH,
166
+ "goal": THRESHOLDS_GOAL,
133
167
  }
134
168
 
135
169
 
@@ -244,6 +278,27 @@ def detect_drift(
244
278
  if error_rate > thresholds.error_rate:
245
279
  mild_flags.append(f"error_rate={error_rate:.2f}")
246
280
 
281
+ # --- Signal 6: Goal progress stall (P1-4) ---
282
+ # Only active when at least some observations have a fingerprint.
283
+ # Computes: how many fingerprinted responses repeat a fingerprint
284
+ # already seen earlier in the window. High repetition → stall.
285
+ fingerprinted = [
286
+ obs for obs in window if obs.response_fingerprint # excludes None and ""
287
+ ]
288
+ if len(fingerprinted) >= 3:
289
+ seen: set[str] = set()
290
+ repeat_count = 0
291
+ for obs in fingerprinted:
292
+ fp = obs.response_fingerprint # guaranteed non-empty by filter above
293
+ if fp in seen:
294
+ repeat_count += 1
295
+ else:
296
+ seen.add(fp)
297
+ repetition_rate = repeat_count / len(fingerprinted)
298
+ signals["goal_progress_stall"] = round(repetition_rate, 3)
299
+ if repetition_rate > thresholds.repetition_rate_threshold:
300
+ mild_flags.append(f"goal_progress_stall={repetition_rate:.2f}")
301
+
247
302
  # --- Severity synthesis ---
248
303
  if severe_flags:
249
304
  severity: Literal["none", "mild", "severe"] = "severe"
coderouter/ingress/app.py CHANGED
@@ -13,6 +13,7 @@ from coderouter import __version__
13
13
  from coderouter.config import load_config
14
14
  from coderouter.ingress.anthropic_routes import router as anthropic_router
15
15
  from coderouter.ingress.dashboard_routes import router as dashboard_router
16
+ from coderouter.ingress.launcher_routes import router as launcher_router
16
17
  from coderouter.ingress.metrics_routes import router as metrics_router
17
18
  from coderouter.ingress.openai_routes import router as openai_router
18
19
  from coderouter.logging import configure_logging, get_logger
@@ -178,6 +179,12 @@ def create_app(config_path: str | None = None) -> FastAPI:
178
179
  with contextlib.suppress(Exception):
179
180
  await probe_task
180
181
 
182
+ # Launcher: stop child llama.cpp / vllm processes so they don't orphan.
183
+ from coderouter.ingress.launcher_routes import shutdown_launcher
184
+
185
+ with contextlib.suppress(Exception):
186
+ await shutdown_launcher(app)
187
+
181
188
  # v2.0-J: graceful shutdown of recovery probe tasks.
182
189
  with contextlib.suppress(Exception):
183
190
  await engine.shutdown_recovery_probes()
@@ -259,6 +266,10 @@ def create_app(config_path: str | None = None) -> FastAPI:
259
266
  # Same root-level mount as /metrics.json — the dashboard is a UI
260
267
  # concern and doesn't belong under the /v1 API surface.
261
268
  app.include_router(dashboard_router, tags=["dashboard"])
269
+ # Launcher UI + process management API.
270
+ # /launcher → single-page HTML UI
271
+ # /api/launcher/* → model scan, process start/stop/logs
272
+ app.include_router(launcher_router, tags=["launcher"])
262
273
 
263
274
  return app
264
275
 
@@ -76,6 +76,7 @@ _DASHBOARD_HTML = r"""<!doctype html>
76
76
  <header class="border-b border-slate-800 px-6 py-3">
77
77
  <div class="max-w-7xl mx-auto flex flex-wrap items-center gap-x-6 gap-y-2 text-sm">
78
78
  <span class="text-lg font-semibold tracking-tight">CodeRouter</span>
79
+ <a href="/launcher" class="text-slate-400 hover:text-slate-200 transition-colors text-sm">Launcher</a>
79
80
  <span class="text-slate-400">profile: <span data-bind="profile" class="text-slate-100 font-mono">—</span></span>
80
81
  <span class="text-slate-400">uptime: <span data-bind="uptime" class="text-slate-100 font-mono tabnum">—</span></span>
81
82
  <span class="text-slate-400">requests: <span data-bind="requests_total" class="text-slate-100 font-mono tabnum">0</span></span>