crowd-control 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crowd_control/__init__.py +3 -0
- crowd_control/cli.py +431 -0
- crowd_control/config.py +119 -0
- crowd_control/default_config.toml +31 -0
- crowd_control/embed/__init__.py +1 -0
- crowd_control/embed/base.py +65 -0
- crowd_control/embed/ollama.py +74 -0
- crowd_control/embed/openai.py +69 -0
- crowd_control/embed/voyage.py +67 -0
- crowd_control/formatting.py +89 -0
- crowd_control/hooks.py +147 -0
- crowd_control/ingest/__init__.py +1 -0
- crowd_control/ingest/distiller.py +441 -0
- crowd_control/ingest/parser.py +383 -0
- crowd_control/ingest/pipeline.py +120 -0
- crowd_control/logging_config.py +69 -0
- crowd_control/py.typed +0 -0
- crowd_control/retrieve/__init__.py +88 -0
- crowd_control/retrieve/rank.py +170 -0
- crowd_control/retrieve/search.py +122 -0
- crowd_control/server.py +392 -0
- crowd_control/setup.py +177 -0
- crowd_control/storage/__init__.py +1 -0
- crowd_control/storage/db.py +337 -0
- crowd_control/storage/models.py +163 -0
- crowd_control/worker.py +132 -0
- crowd_control-0.0.1.dist-info/METADATA +150 -0
- crowd_control-0.0.1.dist-info/RECORD +31 -0
- crowd_control-0.0.1.dist-info/WHEEL +4 -0
- crowd_control-0.0.1.dist-info/entry_points.txt +2 -0
- crowd_control-0.0.1.dist-info/licenses/LICENSE +21 -0
crowd_control/cli.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from crowd_control.config import ConfigError, CrowdControlConfig, load_config
|
|
11
|
+
from crowd_control.ingest.parser import find_sessions, parse_session_file
|
|
12
|
+
from crowd_control.storage.models import TextBlock
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load_config_safe() -> tuple[CrowdControlConfig, ConfigError | None]:
    """Load the user config without raising on a malformed config file.

    Returns:
        A ``(config, error)`` pair. ``error`` is ``None`` when loading
        succeeded. Otherwise ``config`` is a default ``CrowdControlConfig``
        and ``error`` is the ``ConfigError`` that interactive commands
        should report.
    """
    try:
        return load_config(), None
    except ConfigError as e:
        # Fall back to defaults so non-interactive commands can still run.
        return CrowdControlConfig(), e
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@click.group()
@click.version_option(package_name="crowd-control")
@click.option("--verbose", "-v", is_flag=True, help="Show debug output on stderr.")
@click.pass_context
def main(ctx, verbose):
    """Crowd Control — learnings retention system for Claude Code."""
    # ctx.obj is the dict shared with every subcommand.
    shared = ctx.ensure_object(dict)
    shared["verbose"] = verbose

    # A config load failure is stored, not raised, so each subcommand can
    # decide whether it is fatal.
    loaded_config, load_error = _load_config_safe()
    shared["config"] = loaded_config
    shared["config_error"] = load_error

    from crowd_control.logging_config import configure_logging

    configure_logging(loaded_config, interactive=True, verbose=verbose)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _get_config_or_exit(ctx) -> CrowdControlConfig:
    """Return the config stored on the click context, or exit with status 1.

    Intended for interactive commands: if a config error was recorded on the
    context, it is printed to stderr and the process exits.
    """
    load_error = ctx.obj.get("config_error")
    if load_error is None:
        return ctx.obj["config"]

    click.echo(str(load_error), err=True)
    sys.exit(1)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@main.command()
@click.pass_context
def status(ctx):
    """Show system status and database stats."""
    cfg = _get_config_or_exit(ctx)
    try:
        from crowd_control.storage.db import LearningStore

        learning_total = LearningStore(cfg.db_path).count()
        click.echo(f"Database: {cfg.db_path}")
        click.echo(f"Learnings: {learning_total}")
        click.echo(f"Embedding: {cfg.embedding.provider}/{cfg.embedding.model}")
    except Exception as e:
        # Any failure opening or querying the store is reported as a status
        # line rather than as a command failure.
        click.echo(f"Database not initialized: {e}")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@main.group()
def hook():
    """Hook handlers (called by Claude Code, not directly by users)."""
    # Pure command group: subcommands are registered via @hook.command.
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@hook.command(name="session-end")
@click.pass_context
def hook_session_end(ctx):
    """Handle SessionEnd hook event from Claude Code."""
    # Hooks must always exit 0 to avoid blocking Claude Code, so every
    # failure below is swallowed and only logged at debug level.
    try:
        cfg = ctx.obj["config"]

        from crowd_control.logging_config import configure_logging

        # Reconfigure logging for non-interactive use.
        configure_logging(cfg, interactive=False)

        from crowd_control.hooks import handle_session_end_hook

        try:
            payload = sys.stdin.read()
            # Empty or whitespace-only stdin is treated as an empty event.
            event = {}
            if payload.strip():
                event = json.loads(payload)
        except json.JSONDecodeError:
            click.echo("Invalid JSON on stdin", err=True)
            return

        outcome = handle_session_end_hook(event, cfg)

        if outcome.skipped_reason:
            click.echo(f"Skipped: {outcome.skipped_reason}", err=True)
    except Exception as e:
        logger.debug("hook session-end failed: %s", e)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@main.command()
@click.pass_context
def worker(ctx):
    """Process queued ingestion jobs."""
    # Uses whatever config main() stored, without checking for a load error.
    cfg = ctx.obj["config"]

    from crowd_control.logging_config import configure_logging

    configure_logging(cfg, interactive=False)

    from crowd_control.worker import process_queue

    process_queue(cfg)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@main.command()
@click.option("--project", "project_scope", is_flag=True, help="Configure for current project.")
@click.pass_context
def setup(ctx, project_scope):
    """Configure hooks and MCP server in Claude Code."""
    from crowd_control.setup import run_setup

    cfg = _get_config_or_exit(ctx)
    outcome = run_setup(cfg, project_scope=project_scope)

    # Any reported issue makes setup fail: list them all, then exit 1.
    problems = outcome.issues
    if problems:
        for problem in problems:
            click.echo(f" ! {problem}", err=True)
        sys.exit(1)

    click.echo(f"Crowd Control configured successfully ({outcome.scope_label}).")
    click.echo()
    click.echo(f"MCP server: {outcome.mcp_path} (crowd-control serve)")
    click.echo("Hook:")
    click.echo(" SessionEnd -> queues ingestion + spawns background worker")
    click.echo()
    click.echo(f"Storage: {outcome.storage_dir}")
    click.echo(f"Embedding: {outcome.embedding_label}")
    click.echo()

    # Static explanatory text, printed verbatim.
    for line in (
        "Everything is automatic. When you end a session, learnings are",
        "extracted in the background. The agent uses search_learnings to",
        "find relevant insights during sessions.",
    ):
        click.echo(line)
    click.echo()

    for line in (
        "Manual commands:",
        ' crowd-control search "query" # Search from terminal',
        " crowd-control worker # Retry failed ingestions",
        " crowd-control status # Database stats",
    ):
        click.echo(line)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
@main.command()
@click.argument("path", required=False)
@click.option("--dry-run", is_flag=True, help="Parse and show structure without storing.")
@click.option(
    "--concurrency",
    default=8,
    type=int,
    show_default=True,
    help="Max parallel distillation requests.",
)
@click.pass_context
def ingest(ctx, path, dry_run, concurrency):
    """Ingest a session transcript."""
    session_path = _resolve_session_path(path)
    if session_path is None:
        # _resolve_session_path has already printed the reason.
        sys.exit(1)

    if dry_run:
        # Dry run only parses and prints; the config is never consulted.
        try:
            parsed = parse_session_file(session_path)
        except Exception as e:
            click.echo(f"Error parsing {session_path}: {e}", err=True)
            sys.exit(1)
        _print_dry_run(parsed)
        return

    cfg = _get_config_or_exit(ctx)

    def _report_progress(stage: str, completed: int, total: int) -> None:
        # Print the stage header once, when the first item completes.
        if completed == 1:
            click.echo(f"{stage.capitalize()} {total} segments ({concurrency} workers)...")
        click.echo(f" Completed {completed}/{total}")

    try:
        from crowd_control.ingest.pipeline import ingest_session

        summary = ingest_session(
            session_path,
            cfg,
            max_workers=concurrency,
            progress_callback=_report_progress,
        )
    except Exception as e:
        click.echo(f"Ingestion failed: {e}", err=True)
        sys.exit(1)

    click.echo(f"\nIngested session {summary.session_id}:")
    click.echo(f" Segments processed: {summary.segments_processed}")
    click.echo(f" Learnings distilled: {summary.learnings_distilled}")
    click.echo(f" Learnings stored: {summary.learnings_stored}")
    click.echo(f" Duplicates skipped: {summary.learnings_deduplicated}")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
@main.command(name="list")
@click.option("--project", default=None, help="Filter by project path.")
@click.option("--category", default=None, help="Filter by category.")
@click.option("--limit", default=50, type=int, show_default=True)
@click.pass_context
def list_cmd(ctx, project, category, limit):
    """List stored learnings."""
    cfg = _get_config_or_exit(ctx)
    try:
        from crowd_control.storage.db import LearningStore

        rows = LearningStore(cfg.db_path).list_learnings(
            project=project, category=category, limit=limit
        )
    except Exception as e:
        click.echo(f"Database not available: {e}", err=True)
        sys.exit(1)

    if not rows:
        click.echo("No learnings found.")
        return

    # Each row is indexed by key: category, confidence, text.
    for position, row in enumerate(rows, start=1):
        click.echo(f" [{position}] ({row['category']}) [confidence={row['confidence']:.2f}]")
        click.echo(f" {row['text']}")
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
@main.command()
@click.argument("query")
@click.option("--limit", default=None, type=int, help="Override max results.")
@click.option("--project", default=None, help="Filter by project path.")
@click.option("--category", default=None, help="Filter by category.")
@click.pass_context
def search(ctx, query, limit, project, category):
    """Search learnings for a query."""
    import dataclasses

    cfg = _get_config_or_exit(ctx)

    # --limit overrides max_results on a copy of the retrieval settings.
    retrieval_settings = cfg.retrieval
    if limit is not None:
        retrieval_settings = dataclasses.replace(retrieval_settings, max_results=limit)

    try:
        from crowd_control.embed.base import create_embedder

        embedder = create_embedder(cfg.embedding)
    except Exception as e:
        click.echo(f"Embedding provider error: {e}", err=True)
        click.echo(
            f"Is your embedding provider ({cfg.embedding.provider}) running?",
            err=True,
        )
        sys.exit(1)

    try:
        from crowd_control.storage.db import LearningStore

        store = LearningStore(cfg.db_path)
    except ValueError as e:
        # This specific message is reported as "no database yet".
        if "vector_dimensions is required" not in str(e):
            click.echo(f"Database error: {e}", err=True)
        else:
            click.echo(
                "No learnings database found. Run `crowd-control ingest` first.",
                err=True,
            )
        sys.exit(1)

    from crowd_control.retrieve import retrieve_learnings

    # An explicit --project wins; otherwise fall back to the working directory.
    active_project = project or _detect_project()

    found = retrieve_learnings(
        query=query,
        store=store,
        embedder=embedder,
        retrieval_config=retrieval_settings,
        scope=cfg.knowledge.scope,
        current_project=active_project,
        category=category,
    )

    _print_search_results(found, query)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
@main.command()
@click.option("--output", "-o", default=None, type=click.Path(), help="Output file path.")
@click.option("--project", default=None, help="Filter by project path.")
@click.option("--category", default=None, help="Filter by category.")
@click.pass_context
def export(ctx, output, project, category):
    """Export learnings as JSON."""
    cfg = _get_config_or_exit(ctx)

    try:
        from crowd_control.storage.db import LearningStore

        store = LearningStore(cfg.db_path)
    except Exception as e:
        click.echo(f"Database not available: {e}", err=True)
        sys.exit(1)

    learnings = store.export_learnings(project=project, category=category)
    total = len(learnings)

    # default=str stringifies any value json cannot serialize natively.
    json_output = json.dumps(
        {
            "version": "1",
            "exported_at": datetime.now(UTC).isoformat(),
            "count": total,
            "learnings": learnings,
        },
        indent=2,
        default=str,
    )

    # The summary line always goes to stderr; the JSON goes to the file or stdout.
    if not output:
        click.echo(json_output)
        click.echo(f"Exported {total} learnings", err=True)
    else:
        Path(output).write_text(json_output + "\n")
        click.echo(f"Exported {total} learnings to {output}", err=True)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
@main.command()
def serve():
    """Run the MCP server (stdio transport)."""
    # Imported lazily, like the other subcommands' dependencies.
    from crowd_control.server import run_server as start_server

    start_server()
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _print_search_results(result, query: str) -> None:
    """Echo a retrieval result as a ranked listing followed by a summary line."""
    from crowd_control.formatting import extract_display_learnings

    click.echo(f'Searching for: "{query}"')
    click.echo()

    if not result.ranked:
        click.echo("No matching learnings found.")
        return

    shown = extract_display_learnings(result)

    for entry in shown:
        # Non-positive ages are displayed as "0s".
        if entry.age_days > 0:
            age_str = f"{entry.age_days}d"
        else:
            age_str = "0s"
        click.echo(f" [{entry.rank}] (score={entry.score:.2f}) [{entry.category}]")
        click.echo(f" {entry.text}")
        click.echo(f" project={entry.project} retrieved={entry.active_count}x age={age_str}")
        click.echo()

    shown_count = len(shown)
    noun = "result" if shown_count == 1 else "results"
    click.echo(f"{shown_count} {noun} (searched {result.total_learnings} learnings)")
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _print_dry_run(session) -> None:
    """Echo a parsed session's metadata and per-segment summary."""
    # Count messages that ended up inside segments.
    in_segments = 0
    for segment in session.segments:
        in_segments += len(segment.messages)

    click.echo(f"Session: {session.session_id}")
    click.echo(f"Project: {session.project_path}")
    click.echo(f"Branch: {session.git_branch or '(none)'}")
    click.echo(f"Model: {session.model or '(unknown)'}")
    click.echo(f"Period: {_fmt_time(session.start_time)} → {_fmt_time(session.end_time)}")
    click.echo(f"Messages: {session.message_count} parsed, {in_segments} in segments")
    click.echo()

    if not session.segments:
        click.echo("No conversation segments found.")
        return

    click.echo(f"Segments ({len(session.segments)}):")
    for number, segment in enumerate(session.segments, start=1):
        if segment.tool_names:
            tools = ", ".join(segment.tool_names)
        else:
            tools = "(none)"
        click.echo(
            f" [{number}] {_fmt_hms(segment.start_time)} — {_fmt_hms(segment.end_time)}"
            f" ({len(segment.messages)} messages, tools: {tools})"
        )
        # Show the opening user text, when the segment has one.
        preview = _get_user_preview(segment)
        if preview:
            click.echo(f' User: "{preview}"')
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _resolve_session_path(path: str | None) -> Path | None:
    """Turn the optional CLI path argument into a session file path.

    With a path, it is expanded and resolved and must exist. Without one,
    the first entry returned by ``find_sessions()`` is used. On failure a
    message is printed to stderr and ``None`` is returned.
    """
    if not path:
        candidates = find_sessions()
        if candidates:
            return candidates[0]
        click.echo("No session files found for the current project.", err=True)
        return None

    target = Path(path).expanduser().resolve()
    if target.exists():
        return target

    click.echo(f"File not found: {target}", err=True)
    return None
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _get_user_preview(seg) -> str:
    """Return a one-line preview of the first user text in a segment.

    Scans ``seg.messages`` in order for a non-meta message whose role value
    is ``"user"`` and returns the first non-empty ``TextBlock`` text found.

    Returns:
        The text with all whitespace runs (newlines, carriage returns, tabs)
        collapsed to single spaces, truncated to 70 characters with a
        trailing ``"..."`` when longer. ``""`` if no such text exists.
    """
    for msg in seg.messages:
        if msg.role.value != "user" or msg.is_meta:
            continue
        for block in msg.content:
            if not isinstance(block, TextBlock):
                continue
            # Collapse every whitespace run so the preview stays on one line.
            text = " ".join(block.text.split())
            if not text:
                # A blank block must not hide real text in a later block.
                continue
            if len(text) > 70:
                return text[:67] + "..."
            return text
    return ""
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _fmt_time(dt) -> str:
|
|
416
|
+
try:
|
|
417
|
+
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
418
|
+
except (ValueError, AttributeError):
|
|
419
|
+
return "(unknown)"
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def _fmt_hms(dt) -> str:
    """Format the time-of-day part of ``dt`` as ``HH:MM:SS``.

    Returns ``"??:??:??"`` when ``dt`` has no usable ``strftime`` (for
    example ``None``) or formatting raises ``ValueError``.
    """
    placeholder = "??:??:??"
    try:
        formatted = dt.strftime("%H:%M:%S")
    except (AttributeError, ValueError):
        return placeholder
    return formatted
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _detect_project() -> str:
    """Return the process's current working directory as the project path."""
    working_dir = os.getcwd()
    return working_dir
|
crowd_control/config.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Configuration loading and defaults."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import dataclasses
|
|
6
|
+
import logging
|
|
7
|
+
import tomllib
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
_DEFAULT_CONFIG_PATH = Path("~/.crowd-control/config.toml")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ConfigError(Exception):
    """Configuration could not be loaded, e.g. the TOML file is invalid."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
|
|
21
|
+
class EmbeddingConfig:
|
|
22
|
+
provider: str = "ollama"
|
|
23
|
+
model: str = "nomic-embed-text"
|
|
24
|
+
api_key_env: str | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class DistillationConfig:
    """Immutable settings for distilling learnings from a session."""

    # Model identifier used for distillation.
    model: str = "haiku"
    # Upper bound on learnings per session, as the field name states.
    max_learnings_per_session: int = 20
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class RetrievalConfig:
    """Immutable retrieval and ranking settings, validated on construction.

    Raises:
        ValueError: if ``recency_half_life_days`` is not positive or
            ``hotness_weight`` is outside ``[0.0, 1.0]``.
    """

    max_results: int = 15
    max_tokens: int = 4000
    min_similarity: float = 0.3
    recency_half_life_days: float = 7.0
    hotness_weight: float = 0.2
    project_boost: float = 1.5

    def __post_init__(self):
        half_life = self.recency_half_life_days
        if half_life <= 0:
            raise ValueError("recency_half_life_days must be positive")

        weight = self.hotness_weight
        within_unit_interval = 0.0 <= weight <= 1.0
        if not within_unit_interval:
            raise ValueError("hotness_weight must be between 0.0 and 1.0")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(frozen=True)
class IngestionConfig:
    """Immutable settings for the ingestion pipeline."""

    # Whether ingestion is automatic, as the field name states.
    auto_ingest: bool = True
    batch_size: int = 5
    # Similarity threshold used for de-duplication.
    dedup_threshold: float = 0.95
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass(frozen=True)
class KnowledgeConfig:
    """Immutable setting for how learnings are scoped."""

    # Scope identifier; the value is not validated here.
    scope: str = "project"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass(frozen=True)
class CrowdControlConfig:
    """Top-level immutable configuration: general fields plus one object per section."""

    # General settings.
    storage_dir: str = "~/.crowd-control"
    log_level: str = "off"

    # Per-section settings; each defaults to that section's own defaults.
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    distillation: DistillationConfig = field(default_factory=DistillationConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    ingestion: IngestionConfig = field(default_factory=IngestionConfig)
    knowledge: KnowledgeConfig = field(default_factory=KnowledgeConfig)

    @property
    def db_path(self) -> str:
        """Path of the ``db`` entry under the user-expanded storage directory."""
        storage_root = Path(self.storage_dir).expanduser()
        return str(storage_root.joinpath("db"))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# TOML section name -> dataclass used to build that section's settings.
_SECTION_MAP: dict[str, type] = dict(
    (
        ("embedding", EmbeddingConfig),
        ("distillation", DistillationConfig),
        ("retrieval", RetrievalConfig),
        ("ingestion", IngestionConfig),
        ("knowledge", KnowledgeConfig),
    )
)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def load_config(config_path: Path | None = None) -> CrowdControlConfig:
    """Load config from a TOML file, falling back to defaults for missing keys.

    Args:
        config_path: File to read. When ``None``, ``~/.crowd-control/config.toml``
            is used.

    Returns:
        The parsed configuration. All defaults are returned when the file
        does not exist. Keys that are not fields of a section's dataclass
        are ignored.

    Raises:
        ConfigError: if the file cannot be read, is not valid UTF-8 TOML,
            has a section that is not a table, or holds a value that the
            section's dataclass rejects.
    """
    path = (config_path or _DEFAULT_CONFIG_PATH).expanduser()

    if not path.exists():
        return CrowdControlConfig()

    try:
        with open(path, "rb") as f:
            raw = tomllib.load(f)
    except (tomllib.TOMLDecodeError, UnicodeDecodeError) as e:
        raise ConfigError(f"Invalid TOML in {path}: {e}") from e
    except OSError as e:
        # Unreadable file (permissions, directory, ...) is a config problem,
        # not an unhandled crash.
        raise ConfigError(f"Cannot read {path}: {e}") from e

    # Extract top-level (general) fields
    general = raw.get("general", {})
    if not isinstance(general, dict):
        raise ConfigError(f"Invalid [general] section in {path}: expected a table")
    top_kwargs: dict = {
        key: general[key] for key in ("storage_dir", "log_level") if key in general
    }

    # Build each section dataclass from its TOML section
    for section_name, cls in _SECTION_MAP.items():
        section_data = raw.get(section_name, {})
        if not section_data:
            continue
        if not isinstance(section_data, dict):
            raise ConfigError(f"Invalid [{section_name}] section in {path}: expected a table")
        valid_fields = {f.name for f in dataclasses.fields(cls)}
        filtered = {k: v for k, v in section_data.items() if k in valid_fields}
        try:
            top_kwargs[section_name] = cls(**filtered)
        except (TypeError, ValueError) as e:
            # Dataclass validation (e.g. __post_init__ range checks) failed.
            raise ConfigError(f"Invalid [{section_name}] section in {path}: {e}") from e

    return CrowdControlConfig(**top_kwargs)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[general]
|
|
2
|
+
storage_dir = "~/.crowd-control"
|
|
3
|
+
log_level = "off"
|
|
4
|
+
|
|
5
|
+
[knowledge]
|
|
6
|
+
# "project" = learnings scoped to source project, retrieval filtered by current project
|
|
7
|
+
# "shared" = all learnings in one pool, retrieval searches everything
|
|
8
|
+
# "mixed" = distiller classifies each learning as project-specific or universal (v0.2+)
|
|
9
|
+
scope = "project"
|
|
10
|
+
|
|
11
|
+
[embedding]
|
|
12
|
+
provider = "ollama"
|
|
13
|
+
model = "nomic-embed-text"
|
|
14
|
+
# api_key_env = "VOYAGE_API_KEY" # env var name, only needed for API providers
|
|
15
|
+
|
|
16
|
+
[distillation]
|
|
17
|
+
model = "haiku" # Claude Code model alias (passed to claude -p --model)
|
|
18
|
+
max_learnings_per_session = 20
|
|
19
|
+
|
|
20
|
+
[retrieval]
|
|
21
|
+
max_results = 15
|
|
22
|
+
max_tokens = 4000
|
|
23
|
+
min_similarity = 0.3
|
|
24
|
+
recency_half_life_days = 7 # Exponential decay half-life in days
|
|
25
|
+
hotness_weight = 0.2 # Blend: 0.0 = pure semantic, 1.0 = pure hotness
|
|
26
|
+
project_boost = 1.5
|
|
27
|
+
|
|
28
|
+
[ingestion]
|
|
29
|
+
auto_ingest = true
|
|
30
|
+
batch_size = 5
|
|
31
|
+
dedup_threshold = 0.95 # Cosine similarity threshold for near-duplicate rejection
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Embedding providers."""
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Embedder protocol and factory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
from crowd_control.config import EmbeddingConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EmbeddingError(Exception):
    """Embedding failed, e.g. because of a connection or API error."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Embedder(Protocol):
    """Structural interface every embedding backend provides."""

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Turn a batch of texts into embedding vectors.

        Args:
            texts: Non-empty list of strings to embed.

        Returns:
            One vector per input text, in a list of the same length as
            ``texts``; every vector has ``self.dimensions`` entries.
        """
        ...

    @property
    def dimensions(self) -> int:
        """Number of components in each embedding vector."""
        ...

    @property
    def max_input_chars(self) -> int:
        """Largest number of characters allowed in a single input text."""
        ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def create_embedder(config: EmbeddingConfig) -> Embedder:
    """Build the embedder selected by ``config.provider``.

    Raises:
        EmbeddingError: for an unknown provider, when the provider's module
            cannot be imported, or when construction raises ``ValueError``.
    """
    provider = config.provider
    try:
        # Provider modules are imported lazily, only for the one selected.
        if provider == "ollama":
            from crowd_control.embed.ollama import OllamaEmbedder

            return OllamaEmbedder(model=config.model)

        if provider == "voyage":
            from crowd_control.embed.voyage import VoyageEmbedder

            return VoyageEmbedder(model=config.model, api_key_env=config.api_key_env)

        if provider == "openai":
            from crowd_control.embed.openai import OpenAIEmbedder

            return OpenAIEmbedder(model=config.model, api_key_env=config.api_key_env)

        raise EmbeddingError(f"Unknown embedding provider: {config.provider}")
    except ImportError as e:
        # Provider name -> name used in the install hint.
        extras_by_provider = {"ollama": "ollama", "voyage": "voyage", "openai": "openai"}
        package_hint = extras_by_provider.get(config.provider, config.provider)
        raise EmbeddingError(
            f"{config.provider} package not installed. "
            f"Run: pip install crowd-control[{package_hint}]"
        ) from e
    except ValueError as e:
        raise EmbeddingError(str(e)) from e
|