arkaos 2.22.0 → 2.22.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/VERSION +1 -1
  2. package/core/cognition/__pycache__/auto_documentor.cpython-313.pyc +0 -0
  3. package/core/cognition/auto_documentor.py +127 -12
  4. package/core/jobs/__pycache__/auto_doc_worker.cpython-313.pyc +0 -0
  5. package/core/jobs/auto_doc_worker.py +5 -3
  6. package/core/runtime/__pycache__/__init__.cpython-313.pyc +0 -0
  7. package/core/runtime/__pycache__/base.cpython-313.pyc +0 -0
  8. package/core/runtime/__pycache__/claude_code.cpython-313.pyc +0 -0
  9. package/core/runtime/__pycache__/codex_cli.cpython-313.pyc +0 -0
  10. package/core/runtime/__pycache__/cursor.cpython-313.pyc +0 -0
  11. package/core/runtime/__pycache__/gemini_cli.cpython-313.pyc +0 -0
  12. package/core/runtime/__pycache__/llm_cost_telemetry.cpython-313.pyc +0 -0
  13. package/core/runtime/__pycache__/llm_cost_telemetry_cli.cpython-313.pyc +0 -0
  14. package/core/runtime/__pycache__/llm_provider.cpython-313.pyc +0 -0
  15. package/core/runtime/__pycache__/pricing.cpython-313.pyc +0 -0
  16. package/core/runtime/claude_code.py +22 -16
  17. package/core/runtime/codex_cli.py +23 -6
  18. package/core/runtime/gemini_cli.py +135 -11
  19. package/core/runtime/llm_provider.py +14 -9
  20. package/core/shared/__init__.py +6 -0
  21. package/core/shared/__pycache__/__init__.cpython-313.pyc +0 -0
  22. package/core/shared/__pycache__/safe_session_id.cpython-313.pyc +0 -0
  23. package/core/shared/safe_session_id.py +41 -0
  24. package/core/synapse/__pycache__/kb_cache.cpython-313.pyc +0 -0
  25. package/core/synapse/__pycache__/layers.cpython-313.pyc +0 -0
  26. package/core/synapse/kb_cache.py +7 -6
  27. package/core/synapse/layers.py +7 -0
  28. package/core/workflow/__pycache__/flow_enforcer.cpython-313.pyc +0 -0
  29. package/core/workflow/__pycache__/marker_cache.cpython-313.pyc +0 -0
  30. package/core/workflow/__pycache__/research_gate.cpython-313.pyc +0 -0
  31. package/core/workflow/flow_enforcer.py +6 -14
  32. package/core/workflow/marker_cache.py +6 -8
  33. package/core/workflow/research_gate.py +11 -9
  34. package/package.json +1 -1
  35. package/pyproject.toml +1 -1
package/VERSION CHANGED
@@ -1 +1 @@
1
- 2.22.0
1
+ 2.22.1
@@ -21,16 +21,28 @@ from __future__ import annotations
21
21
 
22
22
  import json
23
23
  import re
24
+ from contextlib import contextmanager
24
25
  from dataclasses import dataclass, field
26
+ from datetime import datetime, timezone
25
27
  from pathlib import Path
26
28
  from typing import Iterable
27
29
 
28
30
  from core.obsidian import cataloger as _cataloger
29
31
  from core.obsidian import relator as _relator
30
32
  from core.obsidian.writer import ObsidianWriter
33
+ from core.shared import safe_session_id as _safe_session_id_module
31
34
 
35
+ try:
36
+ import fcntl # POSIX only
37
+ _HAS_FLOCK = True
38
+ except ImportError:
39
+ _HAS_FLOCK = False
32
40
 
33
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
41
+
42
+ # Re-export for backward compatibility with any external importers.
43
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
44
+
45
+ AUTO_DOC_TELEMETRY_PATH = Path.home() / ".arkaos" / "telemetry" / "auto_doc.jsonl"
34
46
 
35
47
  _URL_RE = re.compile(r"https?://[^\s\)\]\"']+")
36
48
  _FILE_PATH_RE = re.compile(r"(?:^|[\s`'])(/[A-Za-z0-9_./\-]+\.[A-Za-z0-9]+)")
@@ -58,8 +70,8 @@ _SYSTEM_PROMPT = (
58
70
  "(150-300 words) summarising the session. Structure: short intro, "
59
71
  "then markdown sections for Key Facts, Decisions, and Sources. "
60
72
  "Preserve every URL and file path verbatim. Use Obsidian wikilinks "
61
- "([[Topic]]) for reusable concepts. No preamble, no sign-off, no "
62
- "meta commentary about the model or prompt. Output only markdown."
73
+ "([[Topic]]) for reusable concepts. Do not include preamble, sign-off, "
74
+ "or meta commentary about the model or prompt. Output only markdown."
63
75
  )
64
76
 
65
77
 
@@ -329,18 +341,52 @@ def _build_synthesis_prompt(learning: Learning) -> str:
329
341
  return "\n".join(lines)
330
342
 
331
343
 
344
+ def _extract_key_facts(learning: Learning, limit: int = 5) -> list[str]:
345
+ """Pull 3-5 bullet candidates from the learning content.
346
+
347
+ Used by the template fallback so both the LLM and template paths
348
+ produce a ``## Key Facts`` section in the same order as
349
+ ``_SYSTEM_PROMPT`` requires.
350
+ """
351
+ text = (learning.content or "").strip()
352
+ if not text:
353
+ return []
354
+ paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
355
+ bullets: list[str] = []
356
+ for para in paragraphs:
357
+ for raw in para.splitlines():
358
+ line = raw.strip().lstrip("-*• ").strip()
359
+ # Skip markdown section headings; they aren't facts.
360
+ if not line or line.startswith("#"):
361
+ continue
362
+ if line.startswith(">") or line.startswith("`"):
363
+ continue
364
+ if len(line) < 8:
365
+ continue
366
+ bullets.append(line[:240])
367
+ if len(bullets) >= limit:
368
+ return bullets
369
+ if len(bullets) >= limit:
370
+ break
371
+ return bullets
372
+
373
+
332
374
  def _template_synthesize(learning: Learning) -> str:
333
- parts = [f"# {learning.topic}", ""]
375
+ # Section order mirrors _SYSTEM_PROMPT: Key Facts → Decisions →
376
+ # Sources. Keeping both synthesis paths aligned means downstream
377
+ # consumers (MOC generation, relator) never branch on provider.
378
+ parts: list[str] = [f"# {learning.topic}", ""]
334
379
  parts.append(f"> {_AUTO_DOC_SUFFIX}.")
335
380
  parts.append("")
336
381
  if learning.content.strip():
337
382
  parts.append(learning.content.strip())
338
383
  parts.append("")
339
- if learning.sources:
340
- parts.append("## Sources")
384
+ key_facts = _extract_key_facts(learning)
385
+ if key_facts:
386
+ parts.append("## Key Facts")
341
387
  parts.append("")
342
- for src in learning.sources[:20]:
343
- parts.append(f"- {src}")
388
+ for fact in key_facts:
389
+ parts.append(f"- {fact}")
344
390
  parts.append("")
345
391
  if learning.decisions:
346
392
  parts.append("## Decisions")
@@ -348,6 +394,12 @@ def _template_synthesize(learning: Learning) -> str:
348
394
  for dec in learning.decisions[:10]:
349
395
  parts.append(f"- {dec}")
350
396
  parts.append("")
397
+ if learning.sources:
398
+ parts.append("## Sources")
399
+ parts.append("")
400
+ for src in learning.sources[:20]:
401
+ parts.append(f"- {src}")
402
+ parts.append("")
351
403
  return "\n".join(parts).rstrip() + "\n"
352
404
 
353
405
 
@@ -391,13 +443,78 @@ def _document_one(
391
443
  meta.setdefault("auto_documented", True)
392
444
  try:
393
445
  plan = _cataloger.plan(body, meta)
394
- except ValueError:
446
+ except ValueError as exc:
447
+ _log_auto_doc_event(
448
+ session_id=session_id,
449
+ event="classification-failed",
450
+ topic=learning.topic,
451
+ reason=str(exc),
452
+ )
453
+ return None
454
+ if plan is None:
455
+ _log_auto_doc_event(
456
+ session_id=session_id,
457
+ event="succeeded-empty",
458
+ topic=learning.topic,
459
+ reason="cataloger returned no plan",
460
+ )
395
461
  return None
396
462
  note_path = _cataloger.execute(plan, body, writer)
397
463
  _relate_note(note_path, body, vault_path, plan)
464
+ _log_auto_doc_event(
465
+ session_id=session_id,
466
+ event="succeeded-wrote-note",
467
+ topic=learning.topic,
468
+ reason=str(note_path),
469
+ )
398
470
  return note_path
399
471
 
400
472
 
473
+ @contextmanager
474
+ def _locked_append(path: Path):
475
+ """Append to ``path`` under an exclusive advisory lock (POSIX flock).
476
+
477
+ Mirrors the pattern in ``core/workflow/flow_enforcer._locked_append``
478
+ — see that module for the platform-fallback rationale.
479
+ """
480
+ path.parent.mkdir(parents=True, exist_ok=True)
481
+ fh = path.open("a", encoding="utf-8")
482
+ try:
483
+ if _HAS_FLOCK:
484
+ fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
485
+ yield fh
486
+ finally:
487
+ if _HAS_FLOCK:
488
+ try:
489
+ fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
490
+ except OSError:
491
+ pass
492
+ fh.close()
493
+
494
+
495
+ def _log_auto_doc_event(
496
+ *,
497
+ session_id: str,
498
+ event: str,
499
+ topic: str,
500
+ reason: str,
501
+ ) -> None:
502
+ """Append a structured auto-doc telemetry entry, degrade silently."""
503
+ entry = {
504
+ "ts": datetime.now(timezone.utc).isoformat(),
505
+ "session_id": session_id,
506
+ "event": event,
507
+ "topic": topic[:120],
508
+ "reason": reason[:240],
509
+ }
510
+ try:
511
+ with _locked_append(AUTO_DOC_TELEMETRY_PATH) as fh:
512
+ fh.write(json.dumps(entry) + "\n")
513
+ except OSError:
514
+ # Telemetry failures must never break the doc job.
515
+ return
516
+
517
+
401
518
  def _relate_note(note_path: Path, body: str, vault_path: Path, plan) -> None:
402
519
  try:
403
520
  related = _relator.find_related(
@@ -428,6 +545,4 @@ def _append_related_block(note_path: Path, related) -> None:
428
545
 
429
546
 
430
547
  def _safe_session_id(session_id: str) -> bool:
431
- if not isinstance(session_id, str) or not session_id:
432
- return False
433
- return bool(SAFE_SESSION_ID_RE.match(session_id))
548
+ return _safe_session_id_module.safe_session_id(session_id) is not None
@@ -28,7 +28,6 @@ from __future__ import annotations
28
28
  import argparse
29
29
  import json
30
30
  import os
31
- import re
32
31
  import sys
33
32
  import time
34
33
  import uuid
@@ -36,9 +35,12 @@ from datetime import datetime, timezone
36
35
  from pathlib import Path
37
36
  from typing import Optional
38
37
 
38
+ from core.shared import safe_session_id as _safe_session_id_module
39
+
39
40
 
40
41
  MAX_ATTEMPTS = 3
41
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
42
+ # Re-export for backward compatibility with any external importers.
43
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
42
44
  _QUEUE_SUBDIRS = ("pending", "processing", "completed", "failed")
43
45
 
44
46
 
@@ -77,7 +79,7 @@ def enqueue_job(
77
79
  """Write a pending job file. Returns the job id."""
78
80
  root = queue_root or _queue_root()
79
81
  _ensure_queue(root)
80
- safe = session_id if SAFE_SESSION_ID_RE.match(session_id or "") else "unknown"
82
+ safe = _safe_session_id_module.safe_session_id(session_id or "") or "unknown"
81
83
  job_id = f"{int(time.time())}-{uuid.uuid4().hex[:12]}"
82
84
  payload = {
83
85
  "job_id": job_id,
@@ -120,7 +120,7 @@ class ClaudeCodeAdapter(RuntimeAdapter):
120
120
  max_tokens: int = 2000,
121
121
  system: str = "",
122
122
  ) -> "LLMResponse":
123
- from core.runtime.llm_provider import LLMResponse, LLMUnavailable
123
+ from core.runtime.llm_provider import LLMUnavailable
124
124
 
125
125
  binary = shutil.which("claude")
126
126
  if binary is None:
@@ -131,27 +131,28 @@ class ClaudeCodeAdapter(RuntimeAdapter):
131
131
  cmd = [binary, "-p", prompt, "--output-format", "json"]
132
132
  if system:
133
133
  cmd.extend(["--append-system-prompt", system])
134
- try:
135
- proc = subprocess.run(
136
- cmd,
137
- capture_output=True,
138
- text=True,
139
- timeout=60,
140
- check=False,
141
- )
142
- except subprocess.TimeoutExpired as exc:
143
- raise LLMUnavailable("claude CLI timed out after 60s") from exc
144
- except OSError as exc:
145
- raise LLMUnavailable(f"claude CLI subprocess failed: {exc}") from exc
146
-
134
+ proc = _run_claude_cli(cmd)
147
135
  if proc.returncode != 0:
148
136
  raise LLMUnavailable(
149
137
  f"claude CLI exited {proc.returncode}: {proc.stderr.strip()[:200]}"
150
138
  )
151
- return _parse_claude_json(proc.stdout)
139
+ return _parse_claude_cli_output(proc.stdout)
140
+
141
+
142
+ def _run_claude_cli(cmd: list[str]) -> subprocess.CompletedProcess:
143
+ from core.runtime.llm_provider import LLMUnavailable
144
+
145
+ try:
146
+ return subprocess.run(
147
+ cmd, capture_output=True, text=True, timeout=60, check=False
148
+ )
149
+ except subprocess.TimeoutExpired as exc:
150
+ raise LLMUnavailable("claude CLI timed out after 60s") from exc
151
+ except OSError as exc:
152
+ raise LLMUnavailable(f"claude CLI subprocess failed: {exc}") from exc
152
153
 
153
154
 
154
- def _parse_claude_json(stdout: str) -> "LLMResponse":
155
+ def _parse_claude_cli_output(stdout: str) -> "LLMResponse":
155
156
  from core.runtime.llm_provider import LLMResponse
156
157
 
157
158
  payload = json.loads(stdout) if stdout.strip() else {}
@@ -170,3 +171,8 @@ def _parse_claude_json(stdout: str) -> "LLMResponse":
170
171
  cached_tokens=cache_read,
171
172
  model=model,
172
173
  )
174
+
175
+
176
+ # Backward compatibility alias — tests and external importers that used
177
+ # the old helper name continue to work without modification.
178
+ _parse_claude_json = _parse_claude_cli_output
@@ -94,11 +94,28 @@ class CodexCliAdapter(RuntimeAdapter):
94
94
  "codex CLI not found on PATH — install Codex CLI to "
95
95
  "enable headless completion."
96
96
  )
97
- # TODO(llm-agnostic): Verify Codex CLI headless invocation
98
- # syntax (`codex exec "<prompt>"` was the working hypothesis
99
- # but has not been confirmed for the current release). Until
100
- # then, refuse rather than guess. Tracked in Task #12 report.
97
+ # TODO(llm-agnostic): Implement real headless completion.
98
+ #
99
+ # Status as of 2026-04-20: Codex CLI is NOT installed on the
100
+ # development machine, so actual invocation syntax could not
101
+ # be verified. Until a local install is available, refuse
102
+ # rather than ship guessed arguments.
103
+ #
104
+ # Verification checklist for whoever picks this up:
105
+ # 1. Install: npm install -g @openai/codex-cli
106
+ # 2. Discover: codex --help (confirm non-interactive flag)
107
+ # 3. Pattern: likely `codex exec "<prompt>"` or
108
+ # `codex --prompt "<prompt>" --format json`
109
+ # 4. Wire the subprocess call (mirror the Gemini adapter —
110
+ # list-form args, 60s timeout, stderr clipped, JSON parse
111
+ # with plain-text fallback, token estimate on miss).
112
+ #
113
+ # SubagentProvider cleanly falls back to anthropic-direct or
114
+ # stub when this raises, so the chain keeps working.
101
115
  raise NotImplementedError(
102
- "Codex CLI headless completion not yet wired — verify CLI "
103
- "syntax before enabling. See core/runtime/codex_cli.py TODO."
116
+ "Codex CLI headless mode requires local `codex` CLI. "
117
+ "Install: `npm install -g @openai/codex-cli` (verified 2026-04-20). "
118
+ "Verify syntax: `codex --help`. "
119
+ "See TODO(llm-agnostic) in this file. "
120
+ "SubagentProvider will cleanly fall back to anthropic-direct or stub."
104
121
  )
@@ -1,9 +1,26 @@
1
1
  """Gemini CLI runtime adapter.
2
2
 
3
3
  Google's Gemini CLI. Uses GEMINI.md for instructions and activate_skill for skills.
4
+
5
+ Headless invocation reference (verified against
6
+ https://github.com/google-gemini/gemini-cli docs — Context7 query
7
+ on 2026-04-20):
8
+
9
+ gemini -p "<prompt>" --output-format json
10
+
11
+ The JSON payload contains a ``response`` key (the model's text) and a
12
+ ``stats`` block with ``totalTokenCount`` / token counts. On failure the
13
+ payload includes an ``error`` block with diagnostic details. If JSON
14
+ parsing fails we fall back to treating stdout as raw text and estimate
15
+ tokens via a ``len(text) // 4`` heuristic — better than losing cost
16
+ telemetry entirely.
4
17
  """
5
18
 
19
+ from __future__ import annotations
20
+
21
+ import json
6
22
  import shutil
23
+ import subprocess
7
24
  from pathlib import Path
8
25
  from os.path import expanduser
9
26
  from typing import TYPE_CHECKING
@@ -14,6 +31,11 @@ if TYPE_CHECKING:
14
31
  from core.runtime.llm_provider import LLMResponse
15
32
 
16
33
 
34
+ _TIMEOUT_SECONDS = 60
35
+ _TOKEN_ESTIMATE_DIVISOR = 4 # Rough chars-per-token heuristic.
36
+ _STDERR_CLIP = 200
37
+
38
+
17
39
  class GeminiCliAdapter(RuntimeAdapter):
18
40
  """Adapter for Google's Gemini CLI."""
19
41
 
@@ -73,10 +95,7 @@ class GeminiCliAdapter(RuntimeAdapter):
73
95
  raise NotImplementedError("Use Gemini CLI's native content search")
74
96
 
75
97
  def headless_supported(self) -> bool:
76
- # Gemini CLI headless invocation syntax is not verified for the
77
- # current release. Returning False lets SubagentProvider fall
78
- # back gracefully rather than shell out blindly.
79
- return False
98
+ return shutil.which("gemini") is not None
80
99
 
81
100
  def headless_complete(
82
101
  self,
@@ -85,17 +104,122 @@ class GeminiCliAdapter(RuntimeAdapter):
85
104
  max_tokens: int = 2000,
86
105
  system: str = "",
87
106
  ) -> "LLMResponse":
107
+ from core.runtime.llm_provider import LLMUnavailable
108
+
88
109
  binary = shutil.which("gemini")
89
110
  if binary is None:
90
111
  raise NotImplementedError(
91
112
  "gemini CLI not found on PATH — install Gemini CLI to "
92
113
  "enable headless completion."
93
114
  )
94
- # TODO(llm-agnostic): Verify Gemini CLI's headless invocation
95
- # (`gemini -p "<prompt>"` was the working hypothesis). Until
96
- # confirmed for the shipped CLI version, refuse rather than
97
- # guess. Tracked in Task #12 report.
98
- raise NotImplementedError(
99
- "Gemini CLI headless completion not yet wired — verify CLI "
100
- "syntax before enabling. See core/runtime/gemini_cli.py TODO."
115
+ effective_prompt = _merge_system_prompt(prompt, system)
116
+ cmd = [binary, "-p", effective_prompt, "--output-format", "json"]
117
+ proc = _run_gemini_cli(cmd)
118
+ if proc.returncode != 0:
119
+ stderr_tail = proc.stderr.strip()[:_STDERR_CLIP]
120
+ raise LLMUnavailable(
121
+ f"gemini CLI exited {proc.returncode}: {stderr_tail}"
122
+ )
123
+ return _parse_gemini_cli_output(proc.stdout)
124
+
125
+
126
+ def _merge_system_prompt(prompt: str, system: str) -> str:
127
+ # Gemini CLI's -p flag accepts a single prompt; prepend the system
128
+ # text when provided so downstream behaviour matches Claude Code.
129
+ if not system:
130
+ return prompt
131
+ return f"{system}\n\n---\n\n{prompt}"
132
+
133
+
134
+ def _run_gemini_cli(cmd: list[str]) -> subprocess.CompletedProcess:
135
+ from core.runtime.llm_provider import LLMUnavailable
136
+
137
+ try:
138
+ return subprocess.run(
139
+ cmd,
140
+ capture_output=True,
141
+ text=True,
142
+ timeout=_TIMEOUT_SECONDS,
143
+ check=False,
144
+ )
145
+ except subprocess.TimeoutExpired as exc:
146
+ raise LLMUnavailable(
147
+ f"gemini CLI timed out after {_TIMEOUT_SECONDS}s"
148
+ ) from exc
149
+ except OSError as exc:
150
+ raise LLMUnavailable(f"gemini CLI subprocess failed: {exc}") from exc
151
+
152
+
153
+ def _parse_gemini_cli_output(stdout: str) -> "LLMResponse":
154
+ from core.runtime.llm_provider import LLMResponse
155
+
156
+ stripped = stdout.strip()
157
+ if not stripped:
158
+ return LLMResponse(
159
+ text="", tokens_in=0, tokens_out=0, cached_tokens=0, model=""
160
+ )
161
+ payload = _safe_loads(stripped)
162
+ if payload is None:
163
+ # Non-JSON fallback: treat stdout as raw text, estimate tokens.
164
+ return _response_from_plain_text(stripped)
165
+ return _response_from_json_payload(payload)
166
+
167
+
168
+ def _safe_loads(text: str) -> dict | None:
169
+ try:
170
+ data = json.loads(text)
171
+ except (json.JSONDecodeError, ValueError):
172
+ return None
173
+ return data if isinstance(data, dict) else None
174
+
175
+
176
+ def _response_from_plain_text(text: str) -> "LLMResponse":
177
+ from core.runtime.llm_provider import LLMResponse
178
+
179
+ estimate = max(1, len(text) // _TOKEN_ESTIMATE_DIVISOR)
180
+ return LLMResponse(
181
+ text=text,
182
+ tokens_in=0,
183
+ tokens_out=estimate,
184
+ cached_tokens=0,
185
+ model="",
186
+ )
187
+
188
+
189
+ def _response_from_json_payload(payload: dict) -> "LLMResponse":
190
+ from core.runtime.llm_provider import LLMResponse, LLMUnavailable
191
+
192
+ error = payload.get("error")
193
+ if isinstance(error, dict) and error:
194
+ message = str(error.get("message") or error).strip()[:_STDERR_CLIP]
195
+ raise LLMUnavailable(f"gemini CLI returned error: {message}")
196
+
197
+ text = str(payload.get("response") or payload.get("result") or "")
198
+ tokens_in, tokens_out = _extract_token_counts(payload, text)
199
+ model = str(payload.get("model") or "")
200
+ return LLMResponse(
201
+ text=text,
202
+ tokens_in=tokens_in,
203
+ tokens_out=tokens_out,
204
+ cached_tokens=0,
205
+ model=model,
206
+ )
207
+
208
+
209
+ def _extract_token_counts(payload: dict, text: str) -> tuple[int, int]:
210
+ stats = payload.get("stats") or payload.get("usageMetadata") or {}
211
+ if isinstance(stats, dict):
212
+ tokens_in = int(stats.get("promptTokenCount") or stats.get("input_tokens") or 0)
213
+ tokens_out = int(
214
+ stats.get("candidatesTokenCount")
215
+ or stats.get("output_tokens")
216
+ or 0
101
217
  )
218
+ # Fall back to the rolled-up total when per-side counts are absent.
219
+ if tokens_in == 0 and tokens_out == 0:
220
+ total = int(stats.get("totalTokenCount") or 0)
221
+ if total > 0:
222
+ return 0, total
223
+ return tokens_in, tokens_out
224
+ # No stats block at all — estimate output from text length.
225
+ return 0, max(1, len(text) // _TOKEN_ESTIMATE_DIVISOR)
@@ -199,6 +199,19 @@ class AnthropicDirectProvider:
199
199
  }
200
200
  ]
201
201
 
202
+ def _build_anthropic_payload(
203
+ self, prompt: str, system: str, max_tokens: int, model: str
204
+ ) -> dict[str, object]:
205
+ payload: dict[str, object] = {
206
+ "model": model,
207
+ "max_tokens": max_tokens,
208
+ "messages": [{"role": "user", "content": prompt}],
209
+ }
210
+ system_blocks = self._build_system_blocks(system)
211
+ if system_blocks:
212
+ payload["system"] = system_blocks
213
+ return payload
214
+
202
215
  def complete(
203
216
  self,
204
217
  prompt: str,
@@ -213,15 +226,7 @@ class AnthropicDirectProvider:
213
226
  "cannot select a model."
214
227
  )
215
228
  client = self._build_client()
216
- payload: dict[str, object] = {
217
- "model": model,
218
- "max_tokens": max_tokens,
219
- "messages": [{"role": "user", "content": prompt}],
220
- }
221
- system_blocks = self._build_system_blocks(system)
222
- if system_blocks:
223
- payload["system"] = system_blocks
224
-
229
+ payload = self._build_anthropic_payload(prompt, system, max_tokens, model)
225
230
  try:
226
231
  raw = client.messages.create(**payload) # type: ignore[attr-defined]
227
232
  except Exception as exc: # noqa: BLE001
@@ -0,0 +1,6 @@
1
+ """Cross-cutting primitives shared by multiple ArkaOS core packages.
2
+
3
+ Keep this package lean — only primitives that two or more sibling
4
+ packages already duplicate belong here. It is NOT a dumping ground for
5
+ utilities; each addition must delete a duplicate elsewhere.
6
+ """
@@ -0,0 +1,41 @@
1
+ """Shared session-id allowlist — path-traversal / injection guard.
2
+
3
+ A session id is considered safe iff it matches ``[A-Za-z0-9._-]{1,128}``.
4
+ Any other character (``/``, ``\\``, whitespace, control char, unicode,
5
+ NUL, ``..``) rejects — callers MUST treat ``None`` as "do not use this
6
+ id for any filesystem or shell path".
7
+
8
+ Why this lives here: the exact same regex + helper was duplicated in 6
9
+ modules (flow_enforcer, marker_cache, research_gate, kb_cache,
10
+ auto_documentor, auto_doc_worker). A single source of truth prevents
11
+ drift — if the allowlist ever tightens, it tightens everywhere.
12
+
13
+ Historic aliases remain at each call site as module-level re-exports
14
+ so external importers that did ``from core.workflow.flow_enforcer
15
+ import SAFE_SESSION_ID_RE`` continue to work.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+
22
+
23
+ SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
24
+
25
+
26
+ def safe_session_id(session_id: str) -> str | None:
27
+ """Validate ``session_id`` against the strict allowlist.
28
+
29
+ Returns the id unchanged when safe, or ``None`` when it contains
30
+ path separators, ``..`` traversal fragments, whitespace, unicode,
31
+ NUL bytes, or any character outside ``[A-Za-z0-9._-]``. Length is
32
+ capped at 128 characters to prevent pathological filesystem paths.
33
+
34
+ Callers MUST treat ``None`` as reject — never construct a path or
35
+ shell argument from the raw input when this returns ``None``.
36
+ """
37
+ if not session_id or not isinstance(session_id, str):
38
+ return None
39
+ if not SAFE_SESSION_ID_RE.match(session_id):
40
+ return None
41
+ return session_id
@@ -23,15 +23,17 @@ Turn-scoped marker (record_obsidian_query / read_obsidian_query):
23
23
  import hashlib
24
24
  import json
25
25
  import os
26
- import re
27
26
  import threading
28
27
  import time
29
28
  import uuid
30
29
  from pathlib import Path
31
30
  from typing import Any, Optional
32
31
 
32
+ from core.shared import safe_session_id as _safe_session_id_module
33
33
 
34
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
34
+
35
+ # Re-export for backward compatibility with any external importers.
36
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
35
37
  KB_QUERY_MARKER_DIR = Path("/tmp/arkaos-kb-query")
36
38
  _MAX_QUERIES_PER_TURN = 32
37
39
  _MAX_QUERY_LEN = 512
@@ -428,11 +430,10 @@ def _kb_query_dir() -> Path:
428
430
 
429
431
 
430
432
  def _kb_query_path(session_id: str) -> Optional[Path]:
431
- if not session_id or not isinstance(session_id, str):
432
- return None
433
- if not SAFE_SESSION_ID_RE.match(session_id):
433
+ safe = _safe_session_id_module.safe_session_id(session_id)
434
+ if safe is None:
434
435
  return None
435
- return _kb_query_dir() / f"{session_id}.json"
436
+ return _kb_query_dir() / f"{safe}.json"
436
437
 
437
438
 
438
439
  def record_obsidian_query(session_id: str, query: str, hit_count: int = 0) -> None:
@@ -795,6 +795,11 @@ class SessionContextLayer(Layer):
795
795
  _WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:\|[^\]]+)?\]\]")
796
796
  _FRONTMATTER_RE = re.compile(r"^---\n.*?\n---\n", re.DOTALL)
797
797
  _KB_CONFIG_PATH = Path.home() / ".arkaos" / "config.json"
798
+ # Cap fallback-note scanning to avoid O(vault size) blow-ups on large
799
+ # Obsidian vaults. The cap is above any realistic top-N retrieval need
800
+ # (Jaccard ranks the top few notes; scanning 2000 sorted-by-name first
801
+ # is plenty — see `_load_fallback_notes`) while still bounding worst-case latency.
802
+ _MAX_FALLBACK_NOTES = 2000
798
803
  _KB_STOPWORDS: frozenset[str] = frozenset({
799
804
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
800
805
  "with", "by", "from", "as", "is", "was", "are", "were", "be", "been", "being",
@@ -929,6 +934,8 @@ def _load_fallback_notes(vault_path: Optional[Path]) -> list[dict]:
929
934
  return []
930
935
  notes: list[dict] = []
931
936
  for md in sorted(vault_path.rglob("*.md")):
937
+ if len(notes) >= _MAX_FALLBACK_NOTES:
938
+ break
932
939
  try:
933
940
  raw = md.read_text(encoding="utf-8")
934
941
  except (OSError, UnicodeDecodeError):
@@ -22,6 +22,7 @@ from dataclasses import asdict, dataclass
22
22
  from datetime import datetime, timezone
23
23
  from pathlib import Path
24
24
 
25
+ from core.shared import safe_session_id as _safe_session_id_module
25
26
  from core.workflow import marker_cache
26
27
 
27
28
  try:
@@ -58,7 +59,11 @@ GATED_TOOLS: frozenset[str] = frozenset({"Write", "Edit", "MultiEdit"})
58
59
  ROUTING_RE = re.compile(r"\[arka:routing\]\s*[\w-]+\s*->\s*\w+", re.IGNORECASE)
59
60
  TRIVIAL_RE = re.compile(r"\[arka:trivial\]\s*\S+", re.IGNORECASE)
60
61
  PHASE_RE = re.compile(r"\[arka:phase:\d+\]", re.IGNORECASE)
61
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
62
+
63
+ # Re-export for backward compatibility with any external importers that
64
+ # relied on the module-level symbols before the core.shared extraction.
65
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
66
+ _safe_session_id = _safe_session_id_module.safe_session_id
62
67
 
63
68
  ASSISTANT_WINDOW = 6
64
69
  CONFIG_PATH = Path.home() / ".arkaos" / "config.json"
@@ -67,19 +72,6 @@ TELEMETRY_PATH = Path.home() / ".arkaos" / "telemetry" / "enforcement.jsonl"
67
72
  FLOW_REQUIRED_DIR = Path("/tmp/arkaos-wf-required")
68
73
 
69
74
 
70
- def _safe_session_id(session_id: str) -> str | None:
71
- """Validate session_id against a strict allowlist (prevents path traversal).
72
-
73
- Returns the id if safe, or None if it contains path separators, dots-dots,
74
- or characters outside `[A-Za-z0-9._-]`. Callers MUST treat None as reject.
75
- """
76
- if not session_id or not isinstance(session_id, str):
77
- return None
78
- if not SAFE_SESSION_ID_RE.match(session_id):
79
- return None
80
- return session_id
81
-
82
-
83
75
  @dataclass
84
76
  class Decision:
85
77
  """Outcome of enforcement evaluation for a single tool call."""
@@ -13,13 +13,15 @@ ADR compliance (docs/adr/2026-04-17-binding-flow-enforcement.md):
13
13
 
14
14
  import json
15
15
  import os
16
- import re
17
16
  import threading
18
17
  import time
19
18
  import uuid
20
19
  from dataclasses import dataclass
21
20
  from pathlib import Path
22
21
 
22
+ from core.shared import safe_session_id as _safe_session_id_module
23
+
24
+
23
25
  def _resolve_cache_dir() -> Path:
24
26
  override = os.environ.get("ARKA_MARKER_CACHE_DIR", "").strip()
25
27
  if override:
@@ -28,7 +30,8 @@ def _resolve_cache_dir() -> Path:
28
30
 
29
31
 
30
32
  MARKER_CACHE_DIR = _resolve_cache_dir()
31
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
33
+ # Re-export for backward compatibility with any external importers.
34
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
32
35
  VALID_MARKER_TYPES: frozenset[str] = frozenset({"routing", "trivial", "phase"})
33
36
  _MAX_LABEL_LEN = 64
34
37
 
@@ -51,12 +54,7 @@ class MarkerRecord:
51
54
  }
52
55
 
53
56
 
54
- def _safe_session_id(session_id: str) -> str | None:
55
- if not session_id or not isinstance(session_id, str):
56
- return None
57
- if not SAFE_SESSION_ID_RE.match(session_id):
58
- return None
59
- return session_id
57
+ _safe_session_id = _safe_session_id_module.safe_session_id
60
58
 
61
59
 
62
60
  def _cache_path(session_id: str) -> Path | None:
@@ -27,6 +27,7 @@ from dataclasses import asdict, dataclass, field
27
27
  from datetime import datetime, timezone
28
28
  from pathlib import Path
29
29
 
30
+ from core.shared import safe_session_id as _safe_session_id_module
30
31
  from core.synapse import kb_cache
31
32
 
32
33
  try:
@@ -48,7 +49,8 @@ RESEARCH_EXTERNAL_TOOLS: frozenset[str] = frozenset({
48
49
  "mcp__firecrawl__firecrawl_extract",
49
50
  })
50
51
 
51
- SAFE_SESSION_ID_RE = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
52
+ # Re-export for backward compatibility with any external importers.
53
+ SAFE_SESSION_ID_RE = _safe_session_id_module.SAFE_SESSION_ID_RE
52
54
  CONFIG_PATH = Path.home() / ".arkaos" / "config.json"
53
55
  BYPASS_AUDIT_PATH = Path.home() / ".arkaos" / "audit" / "kb_first_bypass.log"
54
56
  TELEMETRY_PATH = Path.home() / ".arkaos" / "telemetry" / "kb_first.jsonl"
@@ -100,12 +102,7 @@ def _locked_append(path: Path):
100
102
  fh.close()
101
103
 
102
104
 
103
- def _safe_session_id(session_id: str) -> str | None:
104
- if not session_id or not isinstance(session_id, str):
105
- return None
106
- if not SAFE_SESSION_ID_RE.match(session_id):
107
- return None
108
- return session_id
105
+ _safe_session_id = _safe_session_id_module.safe_session_id
109
106
 
110
107
 
111
108
  def _feature_flag_on() -> bool:
@@ -163,9 +160,14 @@ def _mark_violation(session_id: str, tool: str) -> None:
163
160
  return
164
161
  path.parent.mkdir(parents=True, exist_ok=True)
165
162
  entry = json.dumps({"tool": tool, "ts": datetime.now(timezone.utc).isoformat()})
163
+ # Race contract: two concurrent tool calls on the same session may
164
+ # both observe "no prior violation" and both emit the first-violation
165
+ # nudge. This is intentional — a nudge is cheap and both calls were
166
+ # genuinely first-ish. Deny is reserved for the SECOND violation
167
+ # after the first marker is on disk, which is what a plain
168
+ # ``write_text`` (non-exclusive, last-writer-wins) gives us. Tested
169
+ # by ``test_concurrent_violation_markers_race_safe``.
166
170
  try:
167
- # O_CREAT|O_EXCL would be stricter, but we want idempotent writes
168
- # from a concurrent race — last writer wins, both see "first".
169
171
  path.write_text(entry, encoding="utf-8")
170
172
  except OSError:
171
173
  pass
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "arkaos",
3
- "version": "2.22.0",
3
+ "version": "2.22.1",
4
4
  "description": "The Operating System for AI Agent Teams",
5
5
  "type": "module",
6
6
  "bin": {
package/pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "arkaos-core"
3
- version = "2.22.0"
3
+ version = "2.22.1"
4
4
  description = "Core engine for ArkaOS — The Operating System for AI Agent Teams"
5
5
  readme = "README.md"
6
6
  license = {text = "MIT"}