raglite-chromadb 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
raglite/raglite_cli.py ADDED
@@ -0,0 +1,953 @@
+ #!/usr/bin/env python3
+ from __future__ import annotations
+
+ import argparse
+ import fnmatch
+ import hashlib
+ import json
+ import os
+ import sys
+ import time
+ import urllib.request
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Iterable
+
+ # NOTE: extract deps (bs4/pypdf) are only needed for `condense`.
+ # We import extract_file lazily inside the condense path so `index/query` can run without them.
+
+ DEFAULT_OLLAMA_URL = "http://127.0.0.1:11434"
+ DEFAULT_OLLAMA_MODEL = "llama3.2:3b"
+ DEFAULT_GATEWAY_URL = "http://127.0.0.1:18789"
+
+
+ @dataclass
+ class Prompts:
+     outline: str
+     execution_notes: str
+     tool_summary: str
+
+
+ def build_prompts(*, token_cap_hint: str, outline_max_tokens: int, exec_max_tokens: int, summary_max_tokens: int) -> Prompts:
+     # Stage A: loss-minimized outline. Keep very explicit, ban prose.
+     outline = f"""You are converting documentation into a LOSS-MINIMIZED, low-fluff OUTLINE for an executor AI.
+
+ Return ONLY markdown.
+
+ FORMAT:
+ - Use headings to preserve structure.
+ - Under each heading, only use bullet lists.
+ - Prefer exact names, symbols, commands, function names, parameters, file paths.
+
+ For each section, prioritize these bullets (when present):
+ - Definitions / key terms
+ - Interfaces (commands/APIs/classes/functions) + important fields/flags
+ - Procedures (step lists)
+ - Constraints/assumptions
+ - Failure modes / gotchas
+ - Examples (short)
+
+ RULES:
+ - No marketing prose. No table of contents. No change log.
+ - Keep as much factual content as possible.
+ - Target <= {outline_max_tokens} output tokens.
+
+ SOURCE (extracted text) below:
+ ---
+ """
+
+     # Stage B1: execution notes (from outline if enabled)
+     execution_notes = f"""You are an expert at converting documentation into EXECUTION-RELEVANT notes for an AI agent that can run tools (CLI commands, HTTP calls, scripts, functions).
+
+ Return ONLY markdown.
+
+ OUTPUT FORMAT (Markdown):
+ - Title
+ - What this tool/service is
+ - When to use
+ - Inputs (required/optional)
+ - Outputs
+ - Preconditions / assumptions
+ - Golden path (numbered steps)
+ - Verification checks
+ - Common errors + fixes
+ - Safety/rollback notes
+
+ RULES:
+ - Be concise and operational; no marketing.
+ - Prefer concrete commands, flags, endpoints, example payloads.
+ - Keep within {token_cap_hint}. Target <= {exec_max_tokens} output tokens.
+
+ SOURCE (extracted text) below:
+ ---
+ """
+
+     # Stage B2: tool index entry (template)
+     tool_summary = f"""You are an expert at writing ULTRA-CONDENSED, AI-readable TOOL INDEX entries.
+
+ Return ONLY markdown, and ONLY the filled-in TEMPLATE below.
+
+ TEMPLATE (replace the angle-bracket placeholders; keep headings verbatim):
+
+ # <TOOL_NAME>
+
+ **Purpose:** <ONE_SENTENCE>
+
+ **Capabilities:**
+ - <BULLET>
+ - <BULLET>
+ - <BULLET>
+
+ **Requires:**
+ - <BULLET_OR_Unknown>
+
+ **Entrypoints:**
+ - <BULLET_OR_Unknown>
+
+ **Limits / footguns:**
+ - <BULLET>
+
+ RULES:
+ - No table of contents, no prose summary, no history, no change log.
+ - Prefer symbols, backticks, and short bullets over sentences.
+ - Rewriting commands is OK, but reference the real command/name/path when the doc provides it.
+ - If the doc describes multiple components, choose the primary "tool" and mention others only as bullets.
+ - If a dependency/entrypoint is not explicitly in the source, write `Unknown`.
+ - Target <= {summary_max_tokens} output tokens.
+
+ SOURCE (extracted text) below:
+ ---
+ """
+
+     return Prompts(outline=outline, execution_notes=execution_notes, tool_summary=tool_summary)
+
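+ # Usage sketch (hypothetical values): the prompts are plain prefixes that
+ # callers concatenate with the extracted text, e.g.
+ #   prompts = build_prompts(token_cap_hint="~1200 tokens max",
+ #                           outline_max_tokens=5000,
+ #                           exec_max_tokens=1200, summary_max_tokens=350)
+ #   full_prompt = prompts.outline + extracted_text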
+
+ def iter_input_files(root: Path) -> Iterable[Path]:
+     if root.is_file():
+         yield root
+         return
+
+     exts = {".pdf", ".txt", ".md", ".html", ".htm"}
+     for p in root.rglob("*"):
+         if p.is_file() and p.suffix.lower() in exts:
+             yield p
+
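+ # e.g. iter_input_files(Path("./docs")) yields every .pdf/.txt/.md/.html/.htm
+ # file under ./docs recursively; passing a single file path yields just that file.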
+
+ def post_json(url: str, data: dict, timeout: int = 120, headers: dict[str, str] | None = None) -> dict:
+     body = json.dumps(data).encode("utf-8")
+     h = {"Content-Type": "application/json"}
+     if headers:
+         h.update(headers)
+     req = urllib.request.Request(url, data=body, headers=h)
+     with urllib.request.urlopen(req, timeout=timeout) as resp:
+         return json.loads(resp.read().decode("utf-8"))
+
+
+ def sha256_text(s: str) -> str:
+     return hashlib.sha256(s.encode("utf-8", errors="ignore")).hexdigest()
+
+
+ def ollama_generate(*, ollama_url: str, model: str, prompt: str, num_predict: int) -> str:
+     data = post_json(
+         f"{ollama_url}/api/generate",
+         {
+             "model": model,
+             "prompt": prompt,
+             "stream": False,
+             "options": {
+                 "num_predict": num_predict,
+                 "temperature": 0.2,
+             },
+         },
+         timeout=600,
+     )
+     return (data.get("response") or "").strip()
+
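+ # With "stream": False, Ollama's /api/generate returns a single JSON object
+ # whose "response" field carries the whole completion, roughly:
+ #   {"model": "llama3.2:3b", "response": "...", "done": true, ...}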
+
+ def openclaw_generate(*, gateway_url: str, gateway_token: str, agent_id: str, prompt: str, max_output_tokens: int) -> str:
+     data = post_json(
+         f"{gateway_url}/v1/responses",
+         {
+             "model": "openclaw",
+             "input": prompt,
+             "max_output_tokens": max_output_tokens,
+         },
+         timeout=600,
+         headers={
+             "Authorization": f"Bearer {gateway_token}",
+             "x-openclaw-agent-id": agent_id,
+         },
+     )
+
+     out_parts: list[str] = []
+     for item in data.get("output") or []:
+         if item.get("type") != "message":
+             continue
+         for part in item.get("content") or []:
+             if part.get("type") == "output_text" and part.get("text"):
+                 out_parts.append(part["text"])
+
+     return "\n".join(out_parts).strip()
+
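+ # The gateway is assumed to return an OpenAI Responses-style payload, which is
+ # what the parsing above expects:
+ #   {"output": [{"type": "message",
+ #                "content": [{"type": "output_text", "text": "..."}]}]}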
+
+ def generate_with_retries(*, engine: str, retries: int, sleep_s: float, prompt: str, max_tokens: int, ollama_url: str, ollama_model: str, gateway_url: str, gateway_token: str, agent_id: str) -> str:
+     last_err: Exception | None = None
+     for attempt in range(retries + 1):
+         try:
+             if engine == "ollama":
+                 return ollama_generate(ollama_url=ollama_url, model=ollama_model, prompt=prompt, num_predict=max_tokens)
+             return openclaw_generate(gateway_url=gateway_url, gateway_token=gateway_token, agent_id=agent_id, prompt=prompt, max_output_tokens=max_tokens)
+         except Exception as e:
+             last_err = e
+             if attempt < retries:
+                 time.sleep(sleep_s * (attempt + 1))
+                 continue
+             raise
+     raise RuntimeError(str(last_err))
+
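+ # Backoff is linear: with the CLI defaults (retries=2, sleep_s=1.5), failed
+ # attempts wait 1.5s, then 3.0s, before the final attempt raises.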
+
+ def rel_output_path(input_path: Path, input_root: Path, out_root: Path, suffix: str) -> Path:
+     if input_root.is_file():
+         rel = input_path.name
+     else:
+         rel = str(input_path.relative_to(input_root))
+
+     base = Path(rel)
+     out_rel = base.with_suffix("")
+     return out_root / out_rel.parent / (out_rel.name + suffix)
+
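+ # Worked example (hypothetical paths): for input_root ./docs, input file
+ # ./docs/guides/setup.pdf, out_root ./out and suffix ".tool-summary.md",
+ # the result is ./out/guides/setup.tool-summary.md.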
+
+ def matches_any(path_str: str, patterns: list[str]) -> bool:
+     return any(fnmatch.fnmatch(path_str, pat) for pat in patterns)
+
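+ # e.g. matches_any("guides/setup.pdf", ["guides/*", "*.md"]) -> True
+ # (fnmatch patterns match the whole relative path, and "*" crosses "/").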
+
+ def validate_tool_summary(md: str) -> list[str]:
+     # Minimal structural validation (keeps us from silently accepting garbage).
+     problems: list[str] = []
+     required = [
+         "# ",
+         "**Purpose:**",
+         "**Capabilities:**",
+         "**Requires:**",
+         "**Entrypoints:**",
+         "**Limits / footguns:**",
+     ]
+     for r in required:
+         if r not in md:
+             problems.append(f"missing:{r}")
+
+     # placeholder leakage
+     if "<TOOL_NAME>" in md or "<ONE_SENTENCE>" in md or "<BULLET" in md:
+         problems.append("contains_placeholders")
+
+     # too short usually means the generation failed or was truncated into junk
+     if len(md.strip()) < 120:
+         problems.append("too_short")
+
+     return problems
+
+
+ def validate_outline(md: str) -> list[str]:
+     problems: list[str] = []
+     if len(md.strip()) < 400:
+         problems.append("too_short")
+     # Require at least one heading to avoid useless blobs
+     if "#" not in md:
+         problems.append("missing_headings")
+     # Often indicates the model refused / answered generically
+     if md.strip().lower().startswith("i can") or md.strip().lower().startswith("sorry"):
+         problems.append("refusal_or_meta")
+     return problems
+
+
+ def validate_execution_notes(md: str) -> list[str]:
+     problems: list[str] = []
+     if not md.lstrip().startswith("#"):
+         problems.append("missing_title")
+     if "Golden path" not in md and "golden path" not in md:
+         problems.append("missing_golden_path")
+     if len(md.strip()) < 300:
+         problems.append("too_short")
+     return problems
+
+
+ def estimate_tokens(text: str) -> int:
+     """Very rough token estimate (good enough for sizing nodes)."""
+     return max(1, int(len(text) / 4))
+
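+ # e.g. a 2,000-char section estimates to 500 tokens (4 chars per token).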
+
+ def shard_outline_to_nodes(
+     *,
+     outline_md: str,
+     nodes_dir: Path,
+     node_min_tokens: int = 200,
+     node_max_tokens: int = 600,
+     node_max_chars: int = 1200,
+ ) -> list[Path]:
+     """Split outline into small, embed-friendly topic nodes.
+
+     Strategy:
+     - Split by H2 headings (##)
+     - If a section is too large, further split into multiple parts.
+     - If the final part is too small (< node_min_tokens), merge it into the previous part.
+     """
+     nodes_dir.mkdir(parents=True, exist_ok=True)
+
+     if node_min_tokens < 1 or node_max_tokens < 1 or node_max_chars < 200:
+         raise ValueError("node_min_tokens/node_max_tokens must be positive and node_max_chars must be >= 200")
+     if node_min_tokens > node_max_tokens:
+         raise ValueError("node_min_tokens must be <= node_max_tokens")
+
+     lines = outline_md.splitlines()
+     sections: list[tuple[str, list[str]]] = []
+     cur_title = "overview"
+     cur_lines: list[str] = []
+
+     def flush():
+         nonlocal cur_title, cur_lines
+         if cur_lines:
+             sections.append((cur_title, cur_lines))
+         cur_lines = []
+
+     for line in lines:
+         if line.startswith("## "):
+             flush()
+             cur_title = line[3:].strip() or "section"
+             cur_lines.append(line)
+         else:
+             cur_lines.append(line)
+     flush()
+
+     def slugify(title: str, idx: int) -> str:
+         slug = (
+             title.lower()
+             .replace("/", " ")
+             .replace("\\", " ")
+             .replace(":", " ")
+             .replace(" ", " ")
+             .strip()
+         )
+         slug = "-".join([p for p in slug.split() if p])[:60] or f"section-{idx}"
+         return slug
+
+     out_paths: list[Path] = []
+     out_i = 1
+
+     for sec_i, (title, sec_lines) in enumerate(sections, start=1):
+         slug = slugify(title, sec_i)
+
+         sec_text = "\n".join(sec_lines).strip() + "\n"
+         if estimate_tokens(sec_text) <= node_max_tokens and len(sec_text) <= node_max_chars:
+             p = nodes_dir / f"{out_i:02d}-{slug}.md"
+             p.write_text(sec_text, encoding="utf-8")
+             out_paths.append(p)
+             out_i += 1
+             continue
+
+         heading = sec_lines[0] if sec_lines and sec_lines[0].startswith("## ") else f"## {title}"
+         body = sec_lines[1:] if sec_lines and sec_lines[0].startswith("## ") else sec_lines
+
+         parts: list[str] = []
+         cur: list[str] = [heading]
+
+         for line in body:
+             candidate = "\n".join(cur + [line]).strip() + "\n"
+             if (estimate_tokens(candidate) > node_max_tokens or len(candidate) > node_max_chars) and len(cur) > 1:
+                 parts.append("\n".join(cur).strip() + "\n")
+                 cur = [heading, line]
+             else:
+                 cur.append(line)
+
+         if len(cur) > 1:
+             parts.append("\n".join(cur).strip() + "\n")
+
+         # Merge trailing tiny part into previous part.
+         if len(parts) >= 2 and estimate_tokens(parts[-1]) < node_min_tokens:
+             parts[-2] = parts[-2].rstrip() + "\n" + parts[-1]
+             parts = parts[:-1]
+
+         for part_i, text in enumerate(parts, start=1):
+             # Cap to ensure embed-friendly nodes.
+             if len(text) > node_max_chars:
+                 text = text[:node_max_chars] + "\n\n[TRUNCATED]\n"
+
+             suffix = f"--p{part_i:02d}" if len(parts) > 1 else ""
+             p = nodes_dir / f"{out_i:02d}-{slug}{suffix}.md"
+             p.write_text(text, encoding="utf-8")
+             out_paths.append(p)
+             out_i += 1
+
+     return out_paths
+
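+ # Worked example with the defaults (hypothetical section names): an H2 section
+ # of ~2,400 chars exceeds node_max_chars=1200, so it is re-split under a
+ # repeated "## ..." heading into parts such as 03-install--p01.md and
+ # 04-install--p02.md; a trailing part under ~200 estimated tokens is merged
+ # back into the previous part instead of becoming its own node.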
+
+ def write_doc_index(*, doc_index_path: Path, source_path: Path, tool_summary: Path, execution_notes: Path, outline_path: Path | None, node_paths: list[Path]) -> None:
+     doc_index_path.parent.mkdir(parents=True, exist_ok=True)
+
+     rel = lambda p: p.name if p.parent == doc_index_path.parent else str(p.relative_to(doc_index_path.parent))
+
+     lines: list[str] = []
+     lines.append(f"# {source_path.stem}")
+     lines.append("")
+     lines.append("## Outputs")
+     lines.append(f"- Tool summary: [{tool_summary.name}]({rel(tool_summary)})")
+     lines.append(f"- Execution notes: [{execution_notes.name}]({rel(execution_notes)})")
+     if outline_path:
+         lines.append(f"- Outline: [{outline_path.name}]({rel(outline_path)})")
+     if node_paths:
+         lines.append("")
+         lines.append("## Nodes")
+         for p in node_paths:
+             lines.append(f"- [{p.name}]({rel(p)})")
+     lines.append("")
+     lines.append("## Source")
+     lines.append(f"- `{source_path}`")
+     lines.append("")
+
+     doc_index_path.write_text("\n".join(lines), encoding="utf-8")
+
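+ # The per-doc index is plain markdown along these lines (hypothetical names):
+ #   # setup
+ #   ## Outputs
+ #   - Tool summary: [setup.tool-summary.md](setup.tool-summary.md)
+ #   ## Nodes
+ #   - [01-overview.md](setup/nodes/01-overview.md)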
+
+ def write_root_index(*, out_root: Path, input_root: Path, doc_indices: list[Path]) -> None:
+     """Human-browsable top index that links to per-doc indices."""
+     index_path = out_root / "index.md"
+     lines: list[str] = []
+     lines.append("# RAGLite Index")
+     lines.append("")
+     lines.append(f"- Source: `{input_root}`")
+     lines.append(f"- Generated: {time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime())}")
+     lines.append("")
+
+     if not doc_indices:
+         lines.append("(No per-doc indices were generated. Run with `--nodes`.)")
+         index_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+         return
+
+     lines.append("## Documents")
+     for p in sorted(doc_indices):
+         rel = str(p.relative_to(out_root))
+         lines.append(f"- [{p.stem}]({rel})")
+
+     index_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+ def load_cache(cache_path: Path) -> dict:
+     try:
+         return json.loads(cache_path.read_text(encoding="utf-8"))
+     except Exception:
+         return {"files": {}}
+
+
+ def save_cache(cache_path: Path, cache: dict) -> None:
+     cache_path.parent.mkdir(parents=True, exist_ok=True)
+     cache_path.write_text(json.dumps(cache, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+
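+ # Cache shape, as written by the condense loop in main() (values illustrative):
+ #   {"files": {"guides/setup.pdf": {"hash": "<sha256>", "engine": "ollama",
+ #                                   "ollama_model": "llama3.2:3b",
+ #                                   "outline": false, "nodes": false,
+ #                                   "ts": 1700000000.0}}}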
+
+ def cli() -> None:
+     """Console-script entrypoint."""
+     raise SystemExit(main())
+
+
+ def main() -> int:
+     ap = argparse.ArgumentParser(description="RAGLite CLI")
+     sub = ap.add_subparsers(dest="cmd", required=True)
+
+     # --- Condense ---
+     ap_c = sub.add_parser("condense", help="Extract + distill docs into markdown artifacts")
+     ap_c.add_argument("input", help="File or directory to process")
+     ap_c.add_argument("--out", default="./raglite_out", help="Output directory")
+
+     ap_c.add_argument("--engine", choices=["ollama", "openclaw"], default="ollama", help="Generation engine")
+
+     ap_c.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL)
+     ap_c.add_argument("--ollama-model", default=DEFAULT_OLLAMA_MODEL)
+
+     ap_c.add_argument("--gateway-url", default=DEFAULT_GATEWAY_URL)
+     ap_c.add_argument("--gateway-token", default=os.environ.get("OPENCLAW_GATEWAY_TOKEN", ""))
+     ap_c.add_argument("--gateway-agent-id", default="main")
+
+     ap_c.add_argument("--exec-max-tokens", type=int, default=1200)
+     ap_c.add_argument("--summary-max-tokens", type=int, default=350)
+     ap_c.add_argument("--outline", action="store_true", help="Generate a loss-minimized outline first and summarize from it")
+     ap_c.add_argument("--outline-max-tokens", type=int, default=5000)
+     ap_c.add_argument("--nodes", action="store_true", help="Write outline shards + per-doc index linking nodes")
+     ap_c.add_argument("--node-min-tokens", type=int, default=200, help="Minimum target size for a node (approx tokens). Small trailing parts are merged.")
+     ap_c.add_argument("--node-max-tokens", type=int, default=600, help="Maximum target size for a node (approx tokens)")
+     ap_c.add_argument("--node-max-chars", type=int, default=1200, help="Hard cap for node size (chars) to keep nodes embed-friendly")
+
+     ap_c.add_argument("--max-chars", type=int, default=180_000, help="Max extracted chars per file")
+     ap_c.add_argument("--sleep-ms", type=int, default=0, help="Sleep between files (throttle)")
+
+     ap_c.add_argument("--skip-existing", action="store_true")
+     ap_c.add_argument("--only", choices=["all", "tool-summary", "execution-notes"], default="all")
+
+     ap_c.add_argument("--include", action="append", default=[], help="Glob(s) to include (default: all)")
+     ap_c.add_argument("--exclude", action="append", default=[], help="Glob(s) to exclude")
+     ap_c.add_argument("--max-files", type=int, default=0, help="Process at most N files (0 = no limit)")
+
+     ap_c.add_argument("--retries", type=int, default=2)
+     ap_c.add_argument("--retry-sleep", type=float, default=1.5)
+     ap_c.add_argument("--max-errors", type=int, default=10)
+     ap_c.add_argument("--fail-fast", action="store_true")
+     ap_c.add_argument("--validate", action="store_true", help="Validate outputs; retry once if invalid")
+
+     # --- Index ---
+     ap_i = sub.add_parser("index", help="Embed + store a distilled directory into Chroma")
+     ap_i.add_argument("distilled", help="Distilled output directory (from condense)")
+     ap_i.add_argument("--collection", required=True, help="Chroma collection name")
+     ap_i.add_argument("--chroma-url", default="http://127.0.0.1:8100")
+     ap_i.add_argument("--ollama-url", default="http://127.0.0.1:11434")
+     ap_i.add_argument("--embed-model", default="nomic-embed-text")
+     ap_i.add_argument("--embed-max-chars", type=int, default=800, help="Max chars passed into embedding model")
+     ap_i.add_argument("--sleep-ms", type=int, default=0)
+     ap_i.add_argument("--include-outlines", action="store_true", help="Also index *.outline.md (default: skip)")
+     ap_i.add_argument(
+         "--include-kinds",
+         default="",
+         help="Comma-separated kinds to include (filters default set). Kinds: node,tool-summary,execution-notes,index,root-index,outline,md",
+     )
+     ap_i.add_argument(
+         "--exclude-kinds",
+         default="",
+         help="Comma-separated kinds to exclude. Kinds: node,tool-summary,execution-notes,index,root-index,outline,md",
+     )
+     ap_i.add_argument("--skip-indexed", action="store_true", help="Skip chunks already indexed (via .raglite/index_cache.json)")
+
+     # --- Run (condense + index) ---
+     ap_r = sub.add_parser("run", help="One-command pipeline: condense then index into a single Chroma collection")
+     ap_r.add_argument("input", help="File or directory to process")
+     ap_r.add_argument("--out", default="./raglite_out", help="Output directory")
+     ap_r.add_argument("--collection", required=True, help="Chroma collection name")
+     ap_r.add_argument("--chroma-url", default="http://127.0.0.1:8100")
+     ap_r.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL)
+     ap_r.add_argument("--embed-model", default="nomic-embed-text")
+     ap_r.add_argument("--embed-max-chars", type=int, default=800)
+     ap_r.add_argument(
+         "--skip-indexed",
+         action="store_true",
+         help="Skip chunks already indexed (via .raglite/index_cache.json)",
+     )
+     ap_r.add_argument("--include-outlines", action="store_true")
+     ap_r.add_argument("--include-kinds", default="")
+     ap_r.add_argument("--exclude-kinds", default="")
+
+     # Condense options (mirrored from the condense subcommand)
+     ap_r.add_argument("--engine", choices=["ollama", "openclaw"], default="ollama")
+     ap_r.add_argument("--ollama-model", default=DEFAULT_OLLAMA_MODEL)
+     ap_r.add_argument("--gateway-url", default=DEFAULT_GATEWAY_URL)
+     ap_r.add_argument("--gateway-token", default=os.environ.get("OPENCLAW_GATEWAY_TOKEN", ""))
+     ap_r.add_argument("--gateway-agent-id", default="main")
+     ap_r.add_argument("--exec-max-tokens", type=int, default=1200)
+     ap_r.add_argument("--summary-max-tokens", type=int, default=350)
+     ap_r.add_argument("--outline", action="store_true")
+     ap_r.add_argument("--outline-max-tokens", type=int, default=5000)
+     ap_r.add_argument("--nodes", action="store_true")
+     ap_r.add_argument("--node-min-tokens", type=int, default=200)
+     ap_r.add_argument("--node-max-tokens", type=int, default=600)
+     ap_r.add_argument("--node-max-chars", type=int, default=1200)
+     ap_r.add_argument("--max-chars", type=int, default=180_000)
+     ap_r.add_argument("--sleep-ms", type=int, default=0)
+     ap_r.add_argument("--skip-existing", action="store_true")
+     ap_r.add_argument("--only", choices=["all", "tool-summary", "execution-notes"], default="all")
+     ap_r.add_argument("--include", action="append", default=[])
+     ap_r.add_argument("--exclude", action="append", default=[])
+     ap_r.add_argument("--max-files", type=int, default=0)
+     ap_r.add_argument("--retries", type=int, default=2)
+     ap_r.add_argument("--retry-sleep", type=float, default=1.5)
+     ap_r.add_argument("--max-errors", type=int, default=10)
+     ap_r.add_argument("--fail-fast", action="store_true")
+     ap_r.add_argument("--validate", action="store_true")
+
+     # --- Query ---
+     ap_q = sub.add_parser("query", help="Hybrid search (vector + keyword) over an indexed distilled directory")
+     ap_q.add_argument("distilled", help="Distilled output directory")
+     ap_q.add_argument("--collection", required=True)
+     ap_q.add_argument("--chroma-url", default="http://127.0.0.1:8100")
+     ap_q.add_argument("--ollama-url", default="http://127.0.0.1:11434")
+     ap_q.add_argument("--embed-model", default="nomic-embed-text")
+     ap_q.add_argument("--embed-max-chars", type=int, default=800, help="Max chars passed into embedding model")
+     ap_q.add_argument("--top-k", type=int, default=10)
+     ap_q.add_argument("--keyword-top-k", type=int, default=10)
+     ap_q.add_argument("query", help="Search query")
+
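+     # Example invocations (assuming the console script is installed as
+     # `raglite`, with local Ollama and Chroma on their default ports):
+     #   raglite condense ./docs --out ./raglite_out --outline --nodes --validate
+     #   raglite index ./raglite_out --collection mydocs --skip-indexed
+     #   raglite query ./raglite_out --collection mydocs "rotate the gateway token"
+     #   raglite run ./docs --out ./raglite_out --collection mydocs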
+     args = ap.parse_args()
+
+     # --- Index command ---
+     if args.cmd == "index":
+         try:
+             from .vector_index import index_distilled_dir
+         except ImportError: # pragma: no cover
+             from raglite.vector_index import index_distilled_dir
+
+         def _parse_csv_set(s: str) -> set[str] | None:
+             parts = [p.strip() for p in (s or "").split(",") if p.strip()]
+             return set(parts) if parts else None
+
+         distilled_root = Path(args.distilled).expanduser().resolve()
+         index_cache_path = (distilled_root / ".raglite" / "index_cache.json") if args.skip_indexed else None
+
+         res = index_distilled_dir(
+             distilled_root=distilled_root,
+             chroma_url=args.chroma_url,
+             collection=args.collection,
+             ollama_url=args.ollama_url,
+             embed_model=args.embed_model,
+             embed_max_chars=args.embed_max_chars,
+             sleep_ms=args.sleep_ms,
+             include_outlines=bool(args.include_outlines),
+             include_kinds=_parse_csv_set(args.include_kinds),
+             exclude_kinds=_parse_csv_set(args.exclude_kinds),
+             index_cache_path=index_cache_path,
+             skip_indexed=bool(args.skip_indexed),
+         )
+         print(json.dumps(res, indent=2))
+         return 0
+
+     # --- Query command ---
+     if args.cmd == "query":
+         try:
+             from .vector_index import query_distilled
+         except ImportError: # pragma: no cover
+             from raglite.vector_index import query_distilled
+
+         res = query_distilled(
+             query=args.query,
+             distilled_root=Path(args.distilled).expanduser().resolve(),
+             chroma_url=args.chroma_url,
+             collection=args.collection,
+             ollama_url=args.ollama_url,
+             embed_model=args.embed_model,
+             embed_max_chars=args.embed_max_chars,
+             top_k=args.top_k,
+             keyword_top_k=args.keyword_top_k,
+         )
+         print(json.dumps(res, indent=2))
+         return 0
+
+     # --- Condense / Run command ---
+     if args.cmd == "run":
+         # 1) condense into --out; after condense finishes, we index the output.
+         run_condense_args = args  # kept for the indexing step below
+         args.cmd = "condense"  # type: ignore[attr-defined]
+         should_index_after = True
+     else:
+         should_index_after = False
+
+
+     assert args.cmd == "condense"
+
+     input_root = Path(args.input).expanduser().resolve()
+     out_root = Path(args.out).expanduser().resolve()
+     out_root.mkdir(parents=True, exist_ok=True)
+
+     meta_dir = out_root / ".raglite"
+     cache_path = meta_dir / "cache.json"
+     errors_log = meta_dir / "errors.log"
+     run_stats_path = meta_dir / "run_stats.json"
+
+     cache = load_cache(cache_path)
+
+     gateway_token = args.gateway_token
+     if args.engine == "openclaw" and not gateway_token:
+         try:
+             cfg_path = Path("~/.openclaw/openclaw.json").expanduser()
+             cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
+             gateway_token = str(cfg.get("gateway", {}).get("auth", {}).get("token", ""))
+         except Exception:
+             gateway_token = ""
+
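+     # The fallback above assumes ~/.openclaw/openclaw.json stores the token at
+     # gateway.auth.token, i.e. roughly {"gateway": {"auth": {"token": "..."}}}.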
+     if args.engine == "openclaw" and not gateway_token:
+         print("ERROR: --gateway-token (or OPENCLAW_GATEWAY_TOKEN) is required for engine=openclaw", file=sys.stderr)
+         return 2
+
+     prompts = build_prompts(
+         token_cap_hint=f"~{args.exec_max_tokens} tokens max",
+         outline_max_tokens=args.outline_max_tokens,
+         exec_max_tokens=args.exec_max_tokens,
+         summary_max_tokens=args.summary_max_tokens,
+     )
+
+     all_files = list(iter_input_files(input_root))
+     if not all_files:
+         print("No matching files found.", file=sys.stderr)
+         return 2
+
+     # include/exclude filtering is based on path relative to input root
+     files: list[Path] = []
+     for p in all_files:
+         rel = p.name if input_root.is_file() else str(p.relative_to(input_root))
+         if args.include and not matches_any(rel, args.include):
+             continue
+         if args.exclude and matches_any(rel, args.exclude):
+             continue
+         files.append(p)
+
+     total = len(files)
+     if args.max_files and args.max_files > 0:
+         files = files[: args.max_files]
+
+     start = time.time()
+     stats = {
+         "engine": args.engine,
+         "ollama_model": args.ollama_model,
+         "outline": bool(args.outline),
+         "nodes": bool(args.nodes),
+         "input": str(input_root),
+         "out": str(out_root),
+         "total_candidates": total,
+         "processed": 0,
+         "ok": 0,
+         "skipped": 0,
+         "errors": 0,
+         "startedAt": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+     }
+
+     def log_error(msg: str) -> None:
+         errors_log.parent.mkdir(parents=True, exist_ok=True)
+         with errors_log.open("a", encoding="utf-8", errors="ignore") as f:
+             f.write(msg + "\n")
+
+     doc_indices: list[Path] = []
+
+     for idx, p in enumerate(files, start=1):
+         stats["processed"] += 1
+         rel = p.name if input_root.is_file() else str(p.relative_to(input_root))
+
+         try:
+             try:
+                 from .extract import extract_file
+             except ImportError: # pragma: no cover
+                 from raglite.extract import extract_file
+
+             extracted = extract_file(p)
+             text = extracted.text
+             if len(text) > args.max_chars:
+                 text = text[: args.max_chars] + "\n\n[TRUNCATED]"
+
+             text_hash = sha256_text(text)
+
+             exec_out = rel_output_path(p, input_root, out_root, ".execution-notes.md")
+             sum_out = rel_output_path(p, input_root, out_root, ".tool-summary.md")
+             outline_out = rel_output_path(p, input_root, out_root, ".outline.md")
+             base = rel_output_path(p, input_root, out_root, "")  # per-doc base path
+             nodes_dir = base / "nodes"
+             doc_index = base.parent / (base.name + ".index.md")
+
+             exec_out.parent.mkdir(parents=True, exist_ok=True)
+
+             cached = cache.get("files", {}).get(rel)
+             if args.skip_existing and exec_out.exists() and sum_out.exists():
+                 stats["skipped"] += 1
+                 continue
+             if cached and cached.get("hash") == text_hash and exec_out.exists() and sum_out.exists() and args.skip_existing:
+                 stats["skipped"] += 1
+                 continue
+
+             print(f"[{idx}/{len(files)}] {rel}", flush=True)
+
+             source_for_stage_b = text
+
+             node_paths: list[Path] = []
+             if args.outline or args.nodes:
+                 outline_md = generate_with_retries(
+                     engine=args.engine,
+                     retries=args.retries,
+                     sleep_s=args.retry_sleep,
+                     prompt=prompts.outline + text,
+                     max_tokens=args.outline_max_tokens,
+                     ollama_url=args.ollama_url,
+                     ollama_model=args.ollama_model,
+                     gateway_url=args.gateway_url,
+                     gateway_token=gateway_token,
+                     agent_id=args.gateway_agent_id,
+                 )
+
+                 if args.validate:
+                     probs = validate_outline(outline_md)
+                     if probs:
+                         outline_md = generate_with_retries(
+                             engine=args.engine,
+                             retries=0,
+                             sleep_s=args.retry_sleep,
+                             prompt=prompts.outline + text,
+                             max_tokens=args.outline_max_tokens,
+                             ollama_url=args.ollama_url,
+                             ollama_model=args.ollama_model,
+                             gateway_url=args.gateway_url,
+                             gateway_token=gateway_token,
+                             agent_id=args.gateway_agent_id,
+                         )
+
+                 outline_out.write_text(outline_md + "\n", encoding="utf-8")
+                 source_for_stage_b = outline_md
+
+                 if args.nodes:
+                     node_paths = shard_outline_to_nodes(
+                         outline_md=outline_md,
+                         nodes_dir=nodes_dir,
+                         node_min_tokens=args.node_min_tokens,
+                         node_max_tokens=args.node_max_tokens,
+                         node_max_chars=args.node_max_chars,
+                     )
+
+             exec_md = ""
+             sum_md = ""
+
+             if args.only in ("all", "execution-notes"):
+                 exec_md = generate_with_retries(
+                     engine=args.engine,
+                     retries=args.retries,
+                     sleep_s=args.retry_sleep,
+                     prompt=prompts.execution_notes + source_for_stage_b,
+                     max_tokens=args.exec_max_tokens,
+                     ollama_url=args.ollama_url,
+                     ollama_model=args.ollama_model,
+                     gateway_url=args.gateway_url,
+                     gateway_token=gateway_token,
+                     agent_id=args.gateway_agent_id,
+                 )
+
+             if args.only in ("all", "tool-summary"):
+                 sum_md = generate_with_retries(
+                     engine=args.engine,
+                     retries=args.retries,
+                     sleep_s=args.retry_sleep,
+                     prompt=prompts.tool_summary + source_for_stage_b,
+                     max_tokens=args.summary_max_tokens,
+                     ollama_url=args.ollama_url,
+                     ollama_model=args.ollama_model,
+                     gateway_url=args.gateway_url,
+                     gateway_token=gateway_token,
+                     agent_id=args.gateway_agent_id,
+                 )
+
+             # Validation + one extra retry if requested
+             if args.validate:
+                 if sum_md:
+                     probs = validate_tool_summary(sum_md)
+                     if probs:
+                         sum_md = generate_with_retries(
+                             engine=args.engine,
+                             retries=0,
+                             sleep_s=args.retry_sleep,
+                             prompt=prompts.tool_summary + source_for_stage_b,
+                             max_tokens=args.summary_max_tokens,
+                             ollama_url=args.ollama_url,
+                             ollama_model=args.ollama_model,
+                             gateway_url=args.gateway_url,
+                             gateway_token=gateway_token,
+                             agent_id=args.gateway_agent_id,
+                         )
+                 if exec_md:
+                     probs = validate_execution_notes(exec_md)
+                     if probs:
+                         exec_md = generate_with_retries(
+                             engine=args.engine,
+                             retries=0,
+                             sleep_s=args.retry_sleep,
+                             prompt=prompts.execution_notes + source_for_stage_b,
+                             max_tokens=args.exec_max_tokens,
+                             ollama_url=args.ollama_url,
+                             ollama_model=args.ollama_model,
+                             gateway_url=args.gateway_url,
+                             gateway_token=gateway_token,
+                             agent_id=args.gateway_agent_id,
+                         )
+
+             if exec_md:
+                 exec_out.write_text(exec_md.strip() + "\n", encoding="utf-8")
+             if sum_md:
+                 sum_out.write_text(sum_md.strip() + "\n", encoding="utf-8")
+
+             if args.nodes:
+                 write_doc_index(
+                     doc_index_path=doc_index,
+                     source_path=p,
+                     tool_summary=sum_out,
+                     execution_notes=exec_out,
+                     outline_path=outline_out if outline_out.exists() else None,
+                     node_paths=node_paths,
+                 )
+                 doc_indices.append(doc_index)
+
+             cache.setdefault("files", {})[rel] = {
+                 "hash": text_hash,
+                 "engine": args.engine,
+                 "ollama_model": args.ollama_model,
+                 "outline": bool(args.outline),
+                 "nodes": bool(args.nodes),
+                 "ts": time.time(),
+             }
+             save_cache(cache_path, cache)
+
+             stats["ok"] += 1
+
+             if args.sleep_ms:
+                 time.sleep(args.sleep_ms / 1000.0)
+
+         except Exception as e:
+             stats["errors"] += 1
+             msg = f"ERROR {rel}: {e!r}"
+             print(msg, file=sys.stderr, flush=True)
+             try:
+                 log_error(msg)
+             except Exception:
+                 pass
+             if args.fail_fast or stats["errors"] >= args.max_errors:
+                 break
+
+     # Root index is generated when --nodes is enabled.
+     if args.nodes:
+         try:
+             write_root_index(out_root=out_root, input_root=input_root, doc_indices=doc_indices)
+         except Exception as e:
+             log_error(f"ERROR write_root_index: {e!r}")
+
+     stats["durationSeconds"] = round(time.time() - start, 2)
+     run_stats_path.parent.mkdir(parents=True, exist_ok=True)
+     run_stats_path.write_text(json.dumps(stats, indent=2) + "\n", encoding="utf-8")
+
+     # If this was `run`, perform indexing as step 2.
+     if should_index_after and stats["errors"] == 0:
+         try:
+             try:
+                 from .vector_index import index_distilled_dir
+             except ImportError: # pragma: no cover
+                 from raglite.vector_index import index_distilled_dir
+
+             def _parse_csv_set(s: str) -> set[str] | None:
+                 parts = [p.strip() for p in (s or "").split(",") if p.strip()]
+                 return set(parts) if parts else None
+
+             index_cache_path = (out_root / ".raglite" / "index_cache.json") if bool(run_condense_args.skip_indexed) else None
+
+             index_res = index_distilled_dir(
+                 distilled_root=out_root,
+                 chroma_url=run_condense_args.chroma_url,
+                 collection=run_condense_args.collection,
+                 ollama_url=run_condense_args.ollama_url,
+                 embed_model=run_condense_args.embed_model,
+                 embed_max_chars=run_condense_args.embed_max_chars,
+                 include_outlines=bool(run_condense_args.include_outlines),
+                 include_kinds=_parse_csv_set(run_condense_args.include_kinds),
+                 exclude_kinds=_parse_csv_set(run_condense_args.exclude_kinds),
+                 index_cache_path=index_cache_path,
+                 skip_indexed=bool(run_condense_args.skip_indexed),
+             )
+             # attach indexing stats to run_stats.json for visibility
+             stats["index"] = index_res
+             run_stats_path.write_text(json.dumps(stats, indent=2) + "\n", encoding="utf-8")
+         except Exception as e:
+             print(f"ERROR: indexing failed: {e!r}", file=sys.stderr)
+             return 1
+     elif should_index_after and stats["errors"] != 0:
+         print("NOTE: skipping indexing because condense reported errors", file=sys.stderr)
+
+     return 0 if stats["errors"] == 0 else 1
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())