akernel-runtime 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. akernel_runtime-0.1.0.dist-info/METADATA +270 -0
  2. akernel_runtime-0.1.0.dist-info/RECORD +40 -0
  3. akernel_runtime-0.1.0.dist-info/WHEEL +5 -0
  4. akernel_runtime-0.1.0.dist-info/entry_points.txt +2 -0
  5. akernel_runtime-0.1.0.dist-info/licenses/LICENSE +201 -0
  6. akernel_runtime-0.1.0.dist-info/licenses/NOTICE +4 -0
  7. akernel_runtime-0.1.0.dist-info/top_level.txt +1 -0
  8. context_kernel/__init__.py +4 -0
  9. context_kernel/__main__.py +5 -0
  10. context_kernel/agent_reports.py +188 -0
  11. context_kernel/benchmarks.py +493 -0
  12. context_kernel/budget.py +72 -0
  13. context_kernel/cli.py +2953 -0
  14. context_kernel/context.py +161 -0
  15. context_kernel/evals.py +347 -0
  16. context_kernel/global_memory.py +126 -0
  17. context_kernel/loop.py +1617 -0
  18. context_kernel/marketplace.py +194 -0
  19. context_kernel/marketplace_data/skills/context_budget.json +27 -0
  20. context_kernel/marketplace_data/skills/context_compaction.json +27 -0
  21. context_kernel/marketplace_data/skills/edit_file.json +27 -0
  22. context_kernel/marketplace_data/skills/index.json +66 -0
  23. context_kernel/marketplace_data/skills/long_task_planning.json +27 -0
  24. context_kernel/marketplace_data/skills/multi_file_bugfix.json +28 -0
  25. context_kernel/memory.py +515 -0
  26. context_kernel/models.py +144 -0
  27. context_kernel/planner.py +155 -0
  28. context_kernel/policy.py +271 -0
  29. context_kernel/project.py +317 -0
  30. context_kernel/providers.py +1264 -0
  31. context_kernel/report_costs.py +375 -0
  32. context_kernel/runner.py +78 -0
  33. context_kernel/skills.py +318 -0
  34. context_kernel/state_writer.py +108 -0
  35. context_kernel/storage.py +171 -0
  36. context_kernel/tasks.py +549 -0
  37. context_kernel/text.py +42 -0
  38. context_kernel/tokenizer.py +22 -0
  39. context_kernel/tools.py +544 -0
  40. context_kernel/verifier.py +77 -0
context_kernel/tasks.py ADDED
@@ -0,0 +1,549 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+ from uuid import uuid4
5
+
6
+ from .memory import MemoryStore
7
+ from .models import utc_now
8
+ from .storage import Workspace
9
+ from .tokenizer import estimate_tokens
10
+
11
+
12
# Statuses accepted for a whole task (see TaskStore.list / TaskStore.set_status).
TASK_STATUSES = {"active", "blocked", "completed"}
# Statuses accepted for a single plan milestone (see TaskStore.checkpoint).
MILESTONE_STATUSES = {"pending", "active", "completed", "blocked", "skipped"}


class TaskStore:
    """Store task records as JSON files in the workspace's tasks directory.

    Each task is a plain dict written to ``<task id>.json``. Methods that
    modify a task load the dict, edit it, and write it back through
    :meth:`save`, which also refreshes ``updated_at``.
    """

    def __init__(self, workspace: Workspace):
        self.workspace = workspace
        self.workspace.tasks_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _new_step(
        kind: str,
        note: str,
        refs: dict[str, Any] | None = None,
        *,
        created_at: Any = None,
    ) -> dict[str, Any]:
        """Build one step record; ``created_at`` defaults to ``utc_now()``."""
        return {
            "id": uuid4().hex[:8],
            "created_at": created_at or utc_now(),
            "kind": kind,
            "note": note,
            "refs": refs or {},
        }

    @staticmethod
    def _tail(items: Any, count: int) -> Any:
        """Return the last ``count`` items, or an empty list when ``count <= 0``.

        A bare ``items[-count:]`` returns *everything* for ``count == 0``
        because ``-0`` is ``0``; this helper makes "zero" mean "none".
        """
        if count <= 0:
            return []
        return items[-count:]

    @staticmethod
    def _step_view(step: dict[str, Any]) -> dict[str, Any]:
        """Reduce a step record to the fields shown in summaries and briefs."""
        return {
            "kind": step.get("kind"),
            "note": step.get("note"),
            "created_at": step.get("created_at"),
        }

    def start(self, title: str, goal: str | None = None, *, with_plan: bool = False) -> dict[str, Any]:
        """Create, save, and return a new active task.

        ``goal`` falls back to ``title``. With ``with_plan`` a structured
        plan is attached and a ``plan`` step is recorded.
        """
        now = utc_now()
        objective = goal or title
        task = {
            "id": uuid4().hex[:12],
            "title": compact(title),
            "goal": compact(objective, limit=1000),
            "status": "active",
            "created_at": now,
            "updated_at": now,
            "completed_at": None,
            "steps": [self._new_step("start", compact(objective), created_at=now)],
            "refs": {"run_traces": [], "tool_traces": [], "memories": []},
        }
        if with_plan:
            task["plan"] = build_task_plan(task["title"], task["goal"])
            task["steps"].append(
                self._new_step("plan", "Created structured long-task plan.", created_at=now)
            )
        self.save(task)
        return task

    def get(self, task_id: str) -> dict[str, Any]:
        """Load a task by id.

        Raises:
            KeyError: no ``<task_id>.json`` exists in the tasks directory.
        """
        path = self.workspace.tasks_dir / f"{task_id}.json"
        if not path.exists():
            raise KeyError(f"Unknown task: {task_id}")
        return Workspace.read_json(path)

    def save(self, task: dict[str, Any]) -> None:
        """Stamp ``updated_at`` on ``task`` (in place) and write it to disk."""
        task["updated_at"] = utc_now()
        Workspace.write_json(self.workspace.tasks_dir / f"{task['id']}.json", task)

    def list(self, status: str | None = None) -> list[dict[str, Any]]:
        """Return all tasks, most recently updated first, optionally filtered.

        Raises:
            ValueError: ``status`` is given but is not in ``TASK_STATUSES``.
        """
        if status and status not in TASK_STATUSES:
            raise ValueError(f"Unsupported task status: {status}")
        tasks = [Workspace.read_json(path) for path in sorted(self.workspace.tasks_dir.glob("*.json"))]
        if status:
            tasks = [task for task in tasks if task.get("status") == status]
        return sorted(tasks, key=lambda task: task.get("updated_at", ""), reverse=True)

    def summary(self, task_id: str, max_steps: int = 5) -> dict[str, Any]:
        """Return a compact overview: identity, latest steps, ref counts, plan."""
        task = self.get(task_id)
        steps = self._tail(task.get("steps", []), max_steps)
        refs = task.get("refs", {})
        return {
            "id": task["id"],
            "title": task["title"],
            "goal": task["goal"],
            "status": task["status"],
            "latest_steps": [self._step_view(step) for step in steps],
            "refs": {
                "run_traces": len(refs.get("run_traces", [])),
                "tool_traces": len(refs.get("tool_traces", [])),
                "memories": len(refs.get("memories", [])),
            },
            "plan": summarize_task_plan(task),
        }

    def brief(self, task_id: str, max_steps: int = 6, max_refs: int = 5) -> dict[str, Any]:
        """Return a resume brief with recent steps and resolved linked refs.

        The result also carries ``estimated_tokens``, computed over the
        brief itself before that key is added.
        """
        task = self.get(task_id)
        steps = self._tail(task.get("steps", []), max_steps)
        memories = self._memory_refs(task, max_refs)
        run_traces = self._run_trace_refs(task, max_refs)
        tool_traces = self._tool_trace_refs(task, max_refs)
        brief = {
            "task": {
                "id": task["id"],
                "title": task["title"],
                "goal": task["goal"],
                "status": task["status"],
                "created_at": task["created_at"],
                "updated_at": task["updated_at"],
            },
            "recent_steps": [self._step_view(step) for step in steps],
            "linked_memory": memories,
            "linked_run_traces": run_traces,
            "linked_tool_traces": tool_traces,
            "plan": brief_task_plan(task),
            "resume_instructions": [
                "Use this task brief as the active working state.",
                "Do not replay full chat history unless this brief is insufficient.",
                "Prefer attaching new run/tool traces back to this task.",
                "If a structured plan exists, continue from its active milestone and update checkpoints when progress changes.",
            ],
        }
        brief["estimated_tokens"] = estimate_tokens(brief)
        return brief

    def plan(self, task_id: str, *, goal: str | None = None, force: bool = False) -> dict[str, Any]:
        """Attach a structured plan to a task and return the task.

        An existing plan is kept (and nothing is saved) unless ``force`` is
        set. A non-empty ``goal`` replaces the task goal before planning.

        Raises:
            ValueError: the task is completed.
        """
        task = self.get(task_id)
        ensure_not_completed(task)
        if task.get("plan") and not force:
            return task
        if goal:
            task["goal"] = compact(goal, limit=1000)
        task["plan"] = build_task_plan(task["title"], task["goal"])
        task["steps"].append(self._new_step("plan", "Created structured long-task plan."))
        self.save(task)
        return task

    def next_checkpoint(self, task_id: str) -> dict[str, Any]:
        """Describe the milestone to resume from; nothing is saved.

        When the task has no stored plan, a fresh plan is built only for
        this answer.
        """
        task = self.get(task_id)
        plan = task.get("plan") or build_task_plan(task["title"], task["goal"])
        milestone = active_milestone(plan)
        return {
            "task_id": task["id"],
            "task_status": task["status"],
            "milestone": milestone,
            "resume_prompt": checkpoint_resume_prompt(task, milestone),
            "plan_progress": plan_progress(plan),
        }

    def checkpoint(
        self,
        task_id: str,
        note: str,
        *,
        milestone_id: str | None = None,
        status: str | None = None,
    ) -> dict[str, Any]:
        """Record a checkpoint step and optionally update a milestone status.

        With a plan present, the milestone is the one named by
        ``milestone_id`` or else the active one. Completing it activates
        the next pending milestone.

        Raises:
            ValueError: the task is completed, or ``status`` is not in
                ``MILESTONE_STATUSES``.
            KeyError: ``milestone_id`` does not match any milestone.
        """
        task = self.get(task_id)
        ensure_not_completed(task)
        if status and status not in MILESTONE_STATUSES:
            raise ValueError(f"Unsupported milestone status: {status}")
        plan = task.get("plan")
        selected: dict[str, Any] | None = None
        if plan:
            selected = find_milestone(plan, milestone_id) if milestone_id else active_milestone(plan)
            if selected and status:
                selected["status"] = status
                selected["updated_at"] = utc_now()
            if selected and status == "completed":
                activate_next_milestone(plan, selected["id"])
        refs = {"milestone": selected["id"]} if selected else {}
        task["steps"].append(self._new_step("checkpoint", compact(note, limit=1000), refs))
        if plan:
            plan["updated_at"] = utc_now()
        self.save(task)
        return task

    def step(self, task_id: str, note: str, *, kind: str = "note", refs: dict[str, Any] | None = None) -> dict[str, Any]:
        """Append a free-form step to a task and return the task.

        Raises:
            ValueError: the task is completed.
        """
        task = self.get(task_id)
        ensure_not_completed(task)
        task["steps"].append(self._new_step(kind, compact(note, limit=1000), refs))
        self.save(task)
        return task

    def attach(self, task_id: str, ref_kind: str, ref_id: str) -> dict[str, Any]:
        """Link a run trace, tool trace, or memory id to a task.

        The id is added to its bucket once; an ``attach`` step is recorded
        on every call.

        Raises:
            ValueError: the task is completed, or ``ref_kind`` is unknown.
        """
        task = self.get(task_id)
        ensure_not_completed(task)
        bucket = normalize_ref_kind(ref_kind)
        # Tolerate task files that lack "refs" or this bucket instead of
        # failing with KeyError.
        bucket_refs = task.setdefault("refs", {}).setdefault(bucket, [])
        if ref_id not in bucket_refs:
            bucket_refs.append(ref_id)
        task["steps"].append(
            self._new_step("attach", f"Attached {ref_label(bucket)}: {ref_id}", {bucket: [ref_id]})
        )
        self.save(task)
        return task

    def set_status(self, task_id: str, status: str, note: str | None = None) -> dict[str, Any]:
        """Set the task status, optionally recording a step named after it.

        Raises:
            ValueError: ``status`` is not in ``TASK_STATUSES``.
        """
        if status not in TASK_STATUSES:
            raise ValueError(f"Unsupported task status: {status}")
        task = self.get(task_id)
        task["status"] = status
        # Keep completed_at consistent with the status: a reopened task must
        # not keep a stale completion timestamp.
        task["completed_at"] = utc_now() if status == "completed" else None
        if note:
            task["steps"].append(self._new_step(status, compact(note, limit=1000)))
        self.save(task)
        return task

    def _memory_refs(self, task: dict[str, Any], limit: int) -> list[dict[str, Any]]:
        """Resolve the last ``limit`` linked memory ids into short records.

        Ids the memory store does not know are reported as
        ``{"id": ..., "missing": True}``.
        """
        memory = MemoryStore(self.workspace)
        records: list[dict[str, Any]] = []
        for record_id in self._tail(task.get("refs", {}).get("memories", []), limit):
            try:
                record = memory.get(record_id, include_archived=True)
            except KeyError:
                records.append({"id": record_id, "missing": True})
                continue
            records.append(
                {
                    "id": record.id,
                    "kind": record.kind,
                    "text": compact(record.text, limit=500),
                    "tags": record.tags,
                    "archived": bool(record.archived_at),
                }
            )
        return records

    def _run_trace_refs(self, task: dict[str, Any], limit: int) -> list[dict[str, Any]]:
        """Resolve the last ``limit`` linked run-trace ids into short records.

        Ids with no trace file are reported as ``{"id": ..., "missing": True}``.
        """
        traces: list[dict[str, Any]] = []
        for trace_id in self._tail(task.get("refs", {}).get("run_traces", []), limit):
            path = self.workspace.traces_dir / f"{trace_id}.json"
            if not path.exists():
                traces.append({"id": trace_id, "missing": True})
                continue
            trace = Workspace.read_json(path)
            traces.append(
                {
                    "id": trace.get("id", trace_id),
                    "created_at": trace.get("created_at"),
                    "provider": trace.get("provider"),
                    "model": trace.get("model"),
                    "request": compact(trace.get("request", ""), limit=300),
                    "response_text": compact(trace.get("response", {}).get("text", ""), limit=500),
                    "verifier_ok": trace.get("verifier", {}).get("ok"),
                    "tokens": trace.get("response", {}).get("total_tokens"),
                }
            )
        return traces

    def _tool_trace_refs(self, task: dict[str, Any], limit: int) -> list[dict[str, Any]]:
        """Resolve the last ``limit`` linked tool-trace ids into short records.

        Ids with no trace file are reported as ``{"id": ..., "missing": True}``.
        """
        traces: list[dict[str, Any]] = []
        for trace_id in self._tail(task.get("refs", {}).get("tool_traces", []), limit):
            path = self.workspace.tool_traces_dir / f"{trace_id}.json"
            if not path.exists():
                traces.append({"id": trace_id, "missing": True})
                continue
            trace = Workspace.read_json(path)
            traces.append(
                {
                    "id": trace.get("id", trace_id),
                    "created_at": trace.get("created_at"),
                    "tool": trace.get("tool"),
                    "ok": trace.get("ok"),
                    "blocked": trace.get("blocked"),
                    "subject": trace.get("policy", {}).get("subject"),
                    "output_summary": summarize_tool_output(trace),
                    "error": trace.get("error"),
                }
            )
        return traces
319
+
320
+
321
def normalize_ref_kind(ref_kind: str) -> str:
    """Map a user-supplied ref kind onto its task ``refs`` bucket name.

    Matching ignores surrounding whitespace and case, and treats ``-`` as
    ``_``. Singular, short, and plural spellings are all accepted.

    Raises:
        ValueError: the kind is not a run, tool, or memory spelling.
    """
    key = ref_kind.strip().lower().replace("-", "_")
    spellings = {
        "run_traces": ("run", "run_trace", "run_traces"),
        "tool_traces": ("tool", "tool_trace", "tool_traces"),
        "memories": ("memory", "memories"),
    }
    for bucket, accepted in spellings.items():
        if key in accepted:
            return bucket
    raise ValueError("ref kind must be one of: run, tool, memory")
336
+
337
+
338
def ref_label(bucket: str) -> str:
    """Return the singular label for a refs bucket, or the bucket unchanged."""
    singular = {
        "run_traces": "run_trace",
        "tool_traces": "tool_trace",
        "memories": "memory",
    }
    try:
        return singular[bucket]
    except KeyError:
        # Unknown buckets are passed through as-is.
        return bucket
344
+
345
+
346
def ensure_not_completed(task: dict[str, Any]) -> None:
    """Guard for mutating operations: reject tasks whose status is completed.

    Raises:
        ValueError: ``task["status"]`` equals ``"completed"``.
    """
    if task.get("status") != "completed":
        return
    raise ValueError(f"Task is completed and cannot be modified: {task['id']}")
349
+
350
+
351
def build_task_plan(title: str, goal: str, *, max_milestones: int = 5) -> dict[str, Any]:
    """Build a fresh structured plan from a fixed five-phase template.

    The plan objective is ``goal`` (or ``title`` when the goal is empty),
    compacted to 1000 characters. Only the first ``max_milestones`` phases
    are kept; the first kept milestone starts ``active`` and the rest
    ``pending``. All timestamps in the plan share one ``utc_now()`` value.
    """
    created = utc_now()
    objective = compact(goal or title, limit=1000)
    # (id, title, objective, acceptance criteria) for each template phase.
    blueprint = [
        (
            "M1",
            "Investigate scope",
            "Collect first-hand evidence, constraints, existing behavior, and non-goals before editing.",
            ["Relevant files, commands, and risks are identified.", "No large context dump is required to resume."],
        ),
        (
            "M2",
            "Design bounded change",
            "Choose the smallest safe implementation path, split risky work, and define verification commands.",
            ["Implementation scope is explicit.", "Rollback or recovery path is known before edits."],
        ),
        (
            "M3",
            "Implement slice",
            "Make the next cohesive code or documentation change while preserving unrelated work.",
            ["Changed files match the planned slice.", "Task checkpoint records what changed and why."],
        ),
        (
            "M4",
            "Verify behavior",
            "Run targeted tests first, then broader checks appropriate for the project profile.",
            ["Verification commands and results are attached or summarized.", "New failures are classified before continuing."],
        ),
        (
            "M5",
            "Document and handoff",
            "Update user-facing notes, summarize residual risk, and prepare commit or follow-up work.",
            ["Docs or changelog reflect the behavior.", "Next action is clear without replaying chat history."],
        ),
    ]
    milestones: list[dict[str, Any]] = []
    for phase_id, phase_title, phase_objective, criteria in blueprint[:max_milestones]:
        milestones.append(
            {
                "id": phase_id,
                "title": phase_title,
                "objective": phase_objective,
                # Only the first milestone added starts out active.
                "status": "pending" if milestones else "active",
                "acceptance": criteria,
                "updated_at": created,
            }
        )
    return {
        "version": 1,
        "created_at": created,
        "updated_at": created,
        "objective": objective,
        "milestones": milestones,
        "completion_policy": [
            "All non-skipped milestones are completed or explicitly blocked with a reason.",
            "Latest verification evidence is linked through task refs or checkpoint notes.",
            "Final response reports completed work, tests, and residual risks.",
        ],
    }
409
+
410
+
411
def summarize_task_plan(task: dict[str, Any]) -> dict[str, Any] | None:
    """Return plan progress counts and the active milestone, or ``None``.

    ``None`` is returned when the task has no plan (missing or empty).
    """
    stored_plan = task.get("plan")
    if stored_plan:
        progress = plan_progress(stored_plan)
        current = active_milestone(stored_plan)
        return {"progress": progress, "active": current}
    return None
419
+
420
+
421
def brief_task_plan(task: dict[str, Any]) -> dict[str, Any] | None:
    """Return a condensed view of the task plan for a resume brief.

    Includes the objective (compacted to 500 characters), progress counts,
    the active milestone, an id/title/status row per milestone, and the
    completion policy. Returns ``None`` when the task has no plan.
    """
    stored_plan = task.get("plan")
    if not stored_plan:
        return None
    entries = stored_plan.get("milestones", [])
    # Resolve the active milestone first: active_milestone may promote a
    # pending milestone, and the rows below must show that status.
    current = active_milestone(stored_plan)
    objective = compact(stored_plan.get("objective", ""), limit=500)
    progress = plan_progress(stored_plan)
    rows = []
    for entry in entries:
        rows.append(
            {
                "id": entry.get("id"),
                "title": entry.get("title"),
                "status": entry.get("status"),
            }
        )
    return {
        "objective": objective,
        "progress": progress,
        "active_milestone": current,
        "milestones": rows,
        "completion_policy": stored_plan.get("completion_policy", []),
    }
441
+
442
+
443
def plan_progress(plan: dict[str, Any]) -> dict[str, int]:
    """Count milestones in total and by completed/blocked/skipped status."""
    statuses = [entry.get("status") for entry in plan.get("milestones", [])]
    return {
        "total": len(statuses),
        "completed": statuses.count("completed"),
        "blocked": statuses.count("blocked"),
        "skipped": statuses.count("skipped"),
    }
450
+
451
+
452
def active_milestone(plan: dict[str, Any]) -> dict[str, Any] | None:
    """Return the milestone to work on next.

    Preference order: the first ``active`` milestone; otherwise the first
    ``pending`` one, which is promoted to ``active`` in place (with a new
    ``updated_at``); otherwise the last milestone; ``None`` when the plan
    has no milestones.
    """
    entries = plan.get("milestones", [])
    current = next((entry for entry in entries if entry.get("status") == "active"), None)
    if current is not None:
        return current
    upcoming = next((entry for entry in entries if entry.get("status") == "pending"), None)
    if upcoming is not None:
        # Side effect: the caller's plan dict is mutated here.
        upcoming["status"] = "active"
        upcoming["updated_at"] = utc_now()
        return upcoming
    if entries:
        return entries[-1]
    return None
463
+
464
+
465
def find_milestone(plan: dict[str, Any], milestone_id: str | None) -> dict[str, Any] | None:
    """Look up a milestone by id, ignoring case and surrounding whitespace.

    Returns ``None`` when ``milestone_id`` is empty or ``None``.

    Raises:
        KeyError: a non-empty id matches no milestone in the plan.
    """
    if not milestone_id:
        return None
    wanted = milestone_id.strip().casefold()
    found = next(
        (
            entry
            for entry in plan.get("milestones", [])
            if str(entry.get("id", "")).casefold() == wanted
        ),
        None,
    )
    if found is None:
        raise KeyError(f"Unknown milestone: {milestone_id}")
    return found
473
+
474
+
475
def activate_next_milestone(plan: dict[str, Any], completed_id: str) -> None:
    """Promote the first pending milestone that follows ``completed_id``.

    Only milestones after the first one whose id equals ``completed_id``
    are considered. The promoted milestone is changed in place to
    ``active`` with a new ``updated_at``. Nothing happens when the id is
    not found or no later milestone is pending.
    """
    entries = plan.get("milestones", [])
    position = next(
        (index for index, entry in enumerate(entries) if entry.get("id") == completed_id),
        None,
    )
    if position is None:
        return
    for candidate in entries[position + 1 :]:
        if candidate.get("status") == "pending":
            candidate["status"] = "active"
            candidate["updated_at"] = utc_now()
            break
486
+
487
+
488
def checkpoint_resume_prompt(task: dict[str, Any], milestone: dict[str, Any] | None) -> str:
    """Build the one-line prompt used to resume a task.

    With a milestone, the prompt names it and quotes its objective plus at
    most the first two acceptance criteria; without one (``None`` or an
    empty dict) it falls back to the task goal.
    """
    if milestone:
        criteria = milestone.get("acceptance", [])[:2]
        acceptance = "; ".join(criteria)
        heading = f"Continue task {task['id']} from {milestone.get('id')} {milestone.get('title')}: "
        details = f"{milestone.get('objective')} Acceptance: {acceptance}"
        return heading + details
    return f"Continue task {task['id']}: {task['goal']}"
496
+
497
+
498
def compact(text: str, limit: int = 300) -> str:
    """Collapse whitespace in ``text`` and cap the result at ``limit`` chars.

    Every run of whitespace becomes a single space and the ends are
    trimmed. Text longer than ``limit`` is cut and suffixed with ``...``
    so the result never exceeds ``limit`` characters.

    Args:
        text: Value to normalize; non-strings are passed through ``str()``.
        limit: Maximum length of the returned string.

    Returns:
        The normalized, possibly truncated, string.
    """
    normalized = " ".join(str(text).split())
    if len(normalized) <= limit:
        return normalized
    if limit < 3:
        # No room for the "..." marker: ``limit - 3`` would be negative and
        # slice from the wrong end, producing output longer than ``limit``.
        return normalized[: max(limit, 0)]
    return normalized[: limit - 3].rstrip() + "..."
503
+
504
+
505
def summarize_tool_output(trace: dict[str, Any]) -> str:
    """Return a short, single-line summary of a tool trace's outcome.

    Blocked traces report the policy subject and error. Traces with an
    error and no output report the error. Known tools (``read_file``,
    ``run_command``, ``patch_file``, ``batch_patch``, ``write_file``,
    ``delete_file``) get a tool-specific summary; anything else falls back
    to the compacted string form of the output.

    ``output`` and ``policy`` values that are stored as ``None`` or as a
    non-dict are treated as empty for field lookups instead of raising.
    """
    raw_output = trace.get("output", {})
    # dict.get returns a stored None (or any other type) rather than the
    # default, so guard before calling .get on these values.
    output = raw_output if isinstance(raw_output, dict) else {}
    policy = trace.get("policy")
    if not isinstance(policy, dict):
        policy = {}
    tool = trace.get("tool")

    if trace.get("blocked"):
        subject_value = policy.get("subject")
        subject = compact("" if subject_value is None else str(subject_value), limit=200)
        error_value = trace.get("error")
        # A stored null error must not be rendered as the text "None".
        error = compact(str("blocked by policy" if error_value is None else error_value), limit=240)
        return f"subject={subject}; error={error}" if subject else error
    if trace.get("error") and not raw_output:
        return compact(str(trace.get("error", "")), limit=240)
    if tool == "read_file":
        content = compact(str(output.get("content", "")), limit=1000)
        suffix = " (truncated)" if output.get("truncated") else ""
        return content + suffix if content else "read completed"
    if tool == "run_command":
        command = compact(str(output.get("command", "")), limit=160)
        stdout = compact(str(output.get("stdout", "")), limit=500)
        stderr = compact(str(output.get("stderr", "")), limit=300)
        exit_code = output.get("exit_code")
        parts = [f"command={command}" if command else ""]
        if exit_code is not None:
            parts.append(f"exit_code={exit_code}")
        if stdout:
            parts.append(f"stdout={stdout}")
        if stderr:
            parts.append(f"stderr={stderr}")
        return "; ".join(part for part in parts if part) or "command completed"
    if tool == "patch_file":
        path = compact(str(output.get("path", "")), limit=200)
        delta = output.get("delta_chars")
        return f"path={path}; delta_chars={delta}" if path else "patch completed"
    if tool == "batch_patch":
        applied_count = output.get("applied_count")
        rolled_back = output.get("rolled_back")
        results = output.get("results", [])
        edit_count = len(results) if isinstance(results, list) else 0
        return f"applied_count={applied_count}; rolled_back={rolled_back}; edits={edit_count}"
    if tool == "write_file":
        path = compact(str(output.get("path", "")), limit=200)
        chars = output.get("written_chars")
        return f"path={path}; written_chars={chars}" if path else "write completed"
    if tool == "delete_file":
        path = compact(str(output.get("path", "")), limit=200)
        size = output.get("deleted_bytes")
        return f"path={path}; deleted_bytes={size}" if path else "delete completed"
    # Unknown tool: summarize whatever was stored, unchanged.
    return compact(str(raw_output), limit=500)
context_kernel/text.py ADDED
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
# One term is a run of ASCII word characters or a single character in the
# U+4E00..U+9FFF range.
_TERM = re.compile(r"[A-Za-z0-9_]+|[\u4e00-\u9fff]")
# Lower-cased terms that are dropped from every term set.
_STOPWORDS = {
    "a",
    "an",
    "and",
    "for",
    "in",
    "of",
    "or",
    "the",
    "to",
    "use",
    "when",
    "with",
    "baseline",
    "document",
    "documentation",
    "file",
    "files",
    "increase",
    "increases",
    "unrelated",
    "request",
    "task",
}


def terms(text: str) -> set[str]:
    """Return the set of lower-cased terms in ``text``, minus stopwords."""
    found: set[str] = set()
    for hit in _TERM.finditer(text):
        candidate = hit.group(0).lower().strip()
        if not candidate or candidate in _STOPWORDS:
            continue
        found.add(candidate)
    return found
39
+
40
+
41
def matched_terms(left: str, right: str) -> list[str]:
    """Return the terms shared by both texts, sorted ascending."""
    shared = terms(left) & terms(right)
    return sorted(shared)
context_kernel/tokenizer.py ADDED
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from typing import Any
6
+
7
+
8
_ASCII_WORD = re.compile(r"[A-Za-z0-9_]+")
_CJK_CHAR = re.compile(r"[\u4e00-\u9fff]")
# Any character that is neither whitespace, an ASCII word character, nor in
# the U+4E00..U+9FFF range.
_PUNCTUATION = re.compile(r"[^\sA-Za-z0-9_\u4e00-\u9fff]")


def estimate_tokens(value: Any) -> int:
    """Small deterministic token estimate for budgeting and comparisons.

    Non-string values are first serialized with ``json.dumps`` (sorted
    keys, non-ASCII kept). The estimate is one per ASCII word, one per
    character in U+4E00..U+9FFF, plus a quarter of the punctuation count
    (at least one); the result is never below 1.
    """
    text = value if isinstance(value, str) else json.dumps(value, ensure_ascii=False, sort_keys=True)

    word_count = sum(1 for _ in _ASCII_WORD.finditer(text))
    cjk_count = sum(1 for _ in _CJK_CHAR.finditer(text))
    punctuation_count = sum(1 for _ in _PUNCTUATION.finditer(text))

    punctuation_share = max(1, punctuation_count // 4)
    return max(1, word_count + cjk_count + punctuation_share)
22
+