agent_cli-0.70.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_cli/__init__.py +5 -0
- agent_cli/__main__.py +6 -0
- agent_cli/_extras.json +14 -0
- agent_cli/_requirements/.gitkeep +0 -0
- agent_cli/_requirements/audio.txt +79 -0
- agent_cli/_requirements/faster-whisper.txt +215 -0
- agent_cli/_requirements/kokoro.txt +425 -0
- agent_cli/_requirements/llm.txt +183 -0
- agent_cli/_requirements/memory.txt +355 -0
- agent_cli/_requirements/mlx-whisper.txt +222 -0
- agent_cli/_requirements/piper.txt +176 -0
- agent_cli/_requirements/rag.txt +402 -0
- agent_cli/_requirements/server.txt +154 -0
- agent_cli/_requirements/speed.txt +77 -0
- agent_cli/_requirements/vad.txt +155 -0
- agent_cli/_requirements/wyoming.txt +71 -0
- agent_cli/_tools.py +368 -0
- agent_cli/agents/__init__.py +23 -0
- agent_cli/agents/_voice_agent_common.py +136 -0
- agent_cli/agents/assistant.py +383 -0
- agent_cli/agents/autocorrect.py +284 -0
- agent_cli/agents/chat.py +496 -0
- agent_cli/agents/memory/__init__.py +31 -0
- agent_cli/agents/memory/add.py +190 -0
- agent_cli/agents/memory/proxy.py +160 -0
- agent_cli/agents/rag_proxy.py +128 -0
- agent_cli/agents/speak.py +209 -0
- agent_cli/agents/transcribe.py +671 -0
- agent_cli/agents/transcribe_daemon.py +499 -0
- agent_cli/agents/voice_edit.py +291 -0
- agent_cli/api.py +22 -0
- agent_cli/cli.py +106 -0
- agent_cli/config.py +503 -0
- agent_cli/config_cmd.py +307 -0
- agent_cli/constants.py +27 -0
- agent_cli/core/__init__.py +1 -0
- agent_cli/core/audio.py +461 -0
- agent_cli/core/audio_format.py +299 -0
- agent_cli/core/chroma.py +88 -0
- agent_cli/core/deps.py +191 -0
- agent_cli/core/openai_proxy.py +139 -0
- agent_cli/core/process.py +195 -0
- agent_cli/core/reranker.py +120 -0
- agent_cli/core/sse.py +87 -0
- agent_cli/core/transcription_logger.py +70 -0
- agent_cli/core/utils.py +526 -0
- agent_cli/core/vad.py +175 -0
- agent_cli/core/watch.py +65 -0
- agent_cli/dev/__init__.py +14 -0
- agent_cli/dev/cli.py +1588 -0
- agent_cli/dev/coding_agents/__init__.py +19 -0
- agent_cli/dev/coding_agents/aider.py +24 -0
- agent_cli/dev/coding_agents/base.py +167 -0
- agent_cli/dev/coding_agents/claude.py +39 -0
- agent_cli/dev/coding_agents/codex.py +24 -0
- agent_cli/dev/coding_agents/continue_dev.py +15 -0
- agent_cli/dev/coding_agents/copilot.py +24 -0
- agent_cli/dev/coding_agents/cursor_agent.py +48 -0
- agent_cli/dev/coding_agents/gemini.py +28 -0
- agent_cli/dev/coding_agents/opencode.py +15 -0
- agent_cli/dev/coding_agents/registry.py +49 -0
- agent_cli/dev/editors/__init__.py +19 -0
- agent_cli/dev/editors/base.py +89 -0
- agent_cli/dev/editors/cursor.py +15 -0
- agent_cli/dev/editors/emacs.py +46 -0
- agent_cli/dev/editors/jetbrains.py +56 -0
- agent_cli/dev/editors/nano.py +31 -0
- agent_cli/dev/editors/neovim.py +33 -0
- agent_cli/dev/editors/registry.py +59 -0
- agent_cli/dev/editors/sublime.py +20 -0
- agent_cli/dev/editors/vim.py +42 -0
- agent_cli/dev/editors/vscode.py +15 -0
- agent_cli/dev/editors/zed.py +20 -0
- agent_cli/dev/project.py +568 -0
- agent_cli/dev/registry.py +52 -0
- agent_cli/dev/skill/SKILL.md +141 -0
- agent_cli/dev/skill/examples.md +571 -0
- agent_cli/dev/terminals/__init__.py +19 -0
- agent_cli/dev/terminals/apple_terminal.py +82 -0
- agent_cli/dev/terminals/base.py +56 -0
- agent_cli/dev/terminals/gnome.py +51 -0
- agent_cli/dev/terminals/iterm2.py +84 -0
- agent_cli/dev/terminals/kitty.py +77 -0
- agent_cli/dev/terminals/registry.py +48 -0
- agent_cli/dev/terminals/tmux.py +58 -0
- agent_cli/dev/terminals/warp.py +132 -0
- agent_cli/dev/terminals/zellij.py +78 -0
- agent_cli/dev/worktree.py +856 -0
- agent_cli/docs_gen.py +417 -0
- agent_cli/example-config.toml +185 -0
- agent_cli/install/__init__.py +5 -0
- agent_cli/install/common.py +89 -0
- agent_cli/install/extras.py +174 -0
- agent_cli/install/hotkeys.py +48 -0
- agent_cli/install/services.py +87 -0
- agent_cli/memory/__init__.py +7 -0
- agent_cli/memory/_files.py +250 -0
- agent_cli/memory/_filters.py +63 -0
- agent_cli/memory/_git.py +157 -0
- agent_cli/memory/_indexer.py +142 -0
- agent_cli/memory/_ingest.py +408 -0
- agent_cli/memory/_persistence.py +182 -0
- agent_cli/memory/_prompt.py +91 -0
- agent_cli/memory/_retrieval.py +294 -0
- agent_cli/memory/_store.py +169 -0
- agent_cli/memory/_streaming.py +44 -0
- agent_cli/memory/_tasks.py +48 -0
- agent_cli/memory/api.py +113 -0
- agent_cli/memory/client.py +272 -0
- agent_cli/memory/engine.py +361 -0
- agent_cli/memory/entities.py +43 -0
- agent_cli/memory/models.py +112 -0
- agent_cli/opts.py +433 -0
- agent_cli/py.typed +0 -0
- agent_cli/rag/__init__.py +3 -0
- agent_cli/rag/_indexer.py +67 -0
- agent_cli/rag/_indexing.py +226 -0
- agent_cli/rag/_prompt.py +30 -0
- agent_cli/rag/_retriever.py +156 -0
- agent_cli/rag/_store.py +48 -0
- agent_cli/rag/_utils.py +218 -0
- agent_cli/rag/api.py +175 -0
- agent_cli/rag/client.py +299 -0
- agent_cli/rag/engine.py +302 -0
- agent_cli/rag/models.py +55 -0
- agent_cli/scripts/.runtime/.gitkeep +0 -0
- agent_cli/scripts/__init__.py +1 -0
- agent_cli/scripts/check_plugin_skill_sync.py +50 -0
- agent_cli/scripts/linux-hotkeys/README.md +63 -0
- agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
- agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
- agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
- agent_cli/scripts/macos-hotkeys/README.md +45 -0
- agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
- agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
- agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
- agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
- agent_cli/scripts/nvidia-asr-server/README.md +99 -0
- agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
- agent_cli/scripts/nvidia-asr-server/server.py +255 -0
- agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
- agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
- agent_cli/scripts/run-openwakeword.sh +11 -0
- agent_cli/scripts/run-piper-windows.ps1 +30 -0
- agent_cli/scripts/run-piper.sh +24 -0
- agent_cli/scripts/run-whisper-linux.sh +40 -0
- agent_cli/scripts/run-whisper-macos.sh +6 -0
- agent_cli/scripts/run-whisper-windows.ps1 +51 -0
- agent_cli/scripts/run-whisper.sh +9 -0
- agent_cli/scripts/run_faster_whisper_server.py +136 -0
- agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
- agent_cli/scripts/setup-linux.sh +108 -0
- agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
- agent_cli/scripts/setup-macos.sh +76 -0
- agent_cli/scripts/setup-windows.ps1 +63 -0
- agent_cli/scripts/start-all-services-windows.ps1 +53 -0
- agent_cli/scripts/start-all-services.sh +178 -0
- agent_cli/scripts/sync_extras.py +138 -0
- agent_cli/server/__init__.py +3 -0
- agent_cli/server/cli.py +721 -0
- agent_cli/server/common.py +222 -0
- agent_cli/server/model_manager.py +288 -0
- agent_cli/server/model_registry.py +225 -0
- agent_cli/server/proxy/__init__.py +3 -0
- agent_cli/server/proxy/api.py +444 -0
- agent_cli/server/streaming.py +67 -0
- agent_cli/server/tts/__init__.py +3 -0
- agent_cli/server/tts/api.py +335 -0
- agent_cli/server/tts/backends/__init__.py +82 -0
- agent_cli/server/tts/backends/base.py +139 -0
- agent_cli/server/tts/backends/kokoro.py +403 -0
- agent_cli/server/tts/backends/piper.py +253 -0
- agent_cli/server/tts/model_manager.py +201 -0
- agent_cli/server/tts/model_registry.py +28 -0
- agent_cli/server/tts/wyoming_handler.py +249 -0
- agent_cli/server/whisper/__init__.py +3 -0
- agent_cli/server/whisper/api.py +413 -0
- agent_cli/server/whisper/backends/__init__.py +89 -0
- agent_cli/server/whisper/backends/base.py +97 -0
- agent_cli/server/whisper/backends/faster_whisper.py +225 -0
- agent_cli/server/whisper/backends/mlx.py +270 -0
- agent_cli/server/whisper/languages.py +116 -0
- agent_cli/server/whisper/model_manager.py +157 -0
- agent_cli/server/whisper/model_registry.py +28 -0
- agent_cli/server/whisper/wyoming_handler.py +203 -0
- agent_cli/services/__init__.py +343 -0
- agent_cli/services/_wyoming_utils.py +64 -0
- agent_cli/services/asr.py +506 -0
- agent_cli/services/llm.py +228 -0
- agent_cli/services/tts.py +450 -0
- agent_cli/services/wake_word.py +142 -0
- agent_cli-0.70.5.dist-info/METADATA +2118 -0
- agent_cli-0.70.5.dist-info/RECORD +196 -0
- agent_cli-0.70.5.dist-info/WHEEL +4 -0
- agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
- agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
agent_cli/rag/_indexing.py
ADDED
@@ -0,0 +1,226 @@
"""RAG Indexing Logic."""

from __future__ import annotations

import concurrent.futures
import datetime
import logging
from typing import TYPE_CHECKING

from agent_cli.rag._store import delete_by_file_path, get_all_metadata, upsert_docs
from agent_cli.rag._utils import chunk_text, get_file_hash, load_document_text, should_ignore_path
from agent_cli.rag.models import DocMetadata

if TYPE_CHECKING:
    from pathlib import Path

    from chromadb import Collection

LOGGER = logging.getLogger(__name__)


def load_hashes_from_metadata(collection: Collection) -> tuple[dict[str, str], dict[str, float]]:
    """Rebuild hash and mtime caches from existing DB.

    Returns:
        Tuple of (file_hashes, file_mtimes) dictionaries.

    """
    metadatas = get_all_metadata(collection)
    file_hashes = {}
    file_mtimes = {}
    for meta in metadatas:
        if meta:
            fp = meta["file_path"]
            file_hashes[fp] = meta["file_hash"]
            file_mtimes[fp] = meta["file_mtime"]
    return file_hashes, file_mtimes


def index_file(
    collection: Collection,
    docs_folder: Path,
    file_path: Path,
    file_hashes: dict[str, str],
    file_mtimes: dict[str, float],
) -> bool:
    """Index or reindex a single file.

    Uses mtime-first checking for performance: only computes hash if mtime changed.

    Returns:
        True if the file was indexed (changed or new), False otherwise.

    """
    if not file_path.exists():
        return False
    LOGGER.info(" 📄 Processing: %s", file_path.name)

    try:
        relative_path = str(file_path.relative_to(docs_folder))
        current_mtime = file_path.stat().st_mtime

        # Fast path: mtime unchanged → skip (no hash computation needed)
        if relative_path in file_mtimes and file_mtimes[relative_path] == current_mtime:
            return False

        # mtime changed or new file: verify with hash
        current_hash = get_file_hash(file_path)

        # Hash unchanged (file was touched but not modified) → update mtime, skip
        if relative_path in file_hashes and file_hashes[relative_path] == current_hash:
            file_mtimes[relative_path] = current_mtime
            return False

        # Remove old chunks first (atomic-ish update)
        remove_file(collection, docs_folder, file_path, file_hashes, file_mtimes)

        # Load and chunk document
        text = load_document_text(file_path)
        chunks = chunk_text(text) if text and text.strip() else []
        if not chunks:
            return False  # Unsupported, empty, or no chunks

        # Index chunks
        ids = []
        documents = []
        metadatas = []

        timestamp = datetime.datetime.now(datetime.UTC).isoformat()

        for i, chunk in enumerate(chunks):
            doc_id = f"{relative_path}:chunk:{i}"
            ids.append(doc_id)
            documents.append(chunk)
            metadatas.append(
                DocMetadata(
                    source=file_path.name,
                    file_path=relative_path,
                    file_type=file_path.suffix,
                    chunk_id=i,
                    total_chunks=len(chunks),
                    indexed_at=timestamp,
                    file_hash=current_hash,
                    file_mtime=current_mtime,
                ),
            )

        # Upsert to ChromaDB in batches to avoid 502s from large payloads
        # Use small batch size (10) to avoid overwhelming embedding servers
        batch_size = 10
        for i in range(0, len(ids), batch_size):
            batch_ids = ids[i : i + batch_size]
            batch_docs = documents[i : i + batch_size]
            batch_meta = metadatas[i : i + batch_size]
            upsert_docs(collection, batch_ids, batch_docs, batch_meta)

        # Update tracking
        file_hashes[relative_path] = current_hash
        file_mtimes[relative_path] = current_mtime

        LOGGER.info(" ✓ Indexed %s: %d chunks", file_path.name, len(chunks))
        return True

    except Exception:
        LOGGER.exception("Failed to index file %s", file_path)
        return False


def remove_file(
    collection: Collection,
    docs_folder: Path,
    file_path: Path,
    file_hashes: dict[str, str],
    file_mtimes: dict[str, float],
) -> bool:
    """Remove all chunks of a file from index.

    Returns:
        True if documents were removed (or at least untracked), False otherwise.

    """
    try:
        relative_path = str(file_path.relative_to(docs_folder))
        delete_by_file_path(collection, relative_path)

        # If it was tracked, we consider it "removed"
        if relative_path in file_hashes:
            LOGGER.info(" ✓ Removed %s from index", file_path.name)
            file_hashes.pop(relative_path, None)
            file_mtimes.pop(relative_path, None)
            return True

        return False
    except Exception:
        LOGGER.exception("Error removing file %s", file_path)
        return False


def initial_index(
    collection: Collection,
    docs_folder: Path,
    file_hashes: dict[str, str],
    file_mtimes: dict[str, float],
) -> None:
    """Index all existing files on startup and remove deleted ones."""
    LOGGER.info("🔍 Scanning existing files...")

    # Snapshot of what's in the DB currently
    paths_in_db = set(file_hashes.keys())
    paths_found_on_disk = set()

    processed_files = []
    removed_files = []

    # Gather all files first, excluding hidden and common development directories
    all_files = [
        p for p in docs_folder.rglob("*") if p.is_file() and not should_ignore_path(p, docs_folder)
    ]

    # 1. Index Existing Files in Parallel
    # Use max_workers=4 to match typical local backend parallelism (e.g. llama-server -np 4)
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Map futures to file paths
        future_to_file = {
            executor.submit(index_file, collection, docs_folder, f, file_hashes, file_mtimes): f
            for f in all_files
        }

        for future in concurrent.futures.as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                # Track that we found this file (regardless of index result)
                rel_path = str(file_path.relative_to(docs_folder))
                paths_found_on_disk.add(rel_path)

                indexed = future.result()
                if indexed:
                    processed_files.append(file_path.name)
            except Exception:
                LOGGER.exception("Error processing %s", file_path.name)

    # 2. Clean up Deleted Files
    # If it's in DB but not found on disk, it was deleted offline.
    paths_to_remove = paths_in_db - paths_found_on_disk

    if paths_to_remove:
        LOGGER.info("🧹 Cleaning up %d deleted files found in index...", len(paths_to_remove))
        for rel_path in paths_to_remove:
            full_path = docs_folder / rel_path
            try:
                if remove_file(collection, docs_folder, full_path, file_hashes, file_mtimes):
                    removed_files.append(rel_path)
            except Exception:
                LOGGER.exception("Error removing stale file %s", rel_path)

    if processed_files:
        LOGGER.info("🆕 Added/Updated: %s", ", ".join(processed_files))

    if removed_files:
        LOGGER.info("🗑️ Removed: %s", ", ".join(removed_files))

    LOGGER.info(
        "✅ Initial scan complete. Indexed/Checked %d files, Removed %d stale files.",
        len(paths_found_on_disk),
        len(removed_files),
    )
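
Illustrative sketch (not from the wheel): the core of index_file is the two-stage change check its docstring describes: compare mtimes first and compute a content hash only when the mtime differs. The needs_reindex helper and its cached_mtime/cached_hash parameters below are hypothetical stand-ins for the file_mtimes/file_hashes caches above.

import hashlib
from pathlib import Path


def needs_reindex(path: Path, cached_mtime: float | None, cached_hash: str | None) -> bool:
    """Mtime-first change check: hash the file only when the mtime no longer matches."""
    mtime = path.stat().st_mtime
    if cached_mtime is not None and mtime == cached_mtime:
        return False  # fast path: untouched file, no hashing needed
    digest = hashlib.md5(path.read_bytes(), usedforsecurity=False).hexdigest()
    return digest != cached_hash  # touched but byte-identical content -> no reindex

An unchanged file therefore costs one stat per scan, and a touched-but-identical file costs one stat plus one hash.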
agent_cli/rag/_prompt.py
ADDED
@@ -0,0 +1,30 @@
"""Centralized prompts for RAG LLM calls."""

RAG_PROMPT_WITH_TOOLS = """
## Retrieved Documentation
The following was automatically retrieved based on the user's query:

<retrieved_documents>
{context}
</retrieved_documents>

## RAG Instructions
- Use the retrieved context ONLY if it's relevant to the question
- If the context is irrelevant, ignore it and answer based on your knowledge
- When using context, cite sources: [Source: filename]
- If snippets are insufficient, call read_full_document(file_path) to get full content
""".strip()

RAG_PROMPT_NO_TOOLS = """
## Retrieved Documentation
The following was automatically retrieved based on the user's query:

<retrieved_documents>
{context}
</retrieved_documents>

## RAG Instructions
- Use the retrieved context ONLY if it's relevant to the question
- If the context is irrelevant, ignore it and answer based on your knowledge
- When using context, cite sources: [Source: filename]
""".strip()
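
Illustrative sketch (not from the wheel): both templates expose a single {context} placeholder, filled with the XML produced by format_context in agent_cli/rag/_retriever.py. The snippet text below is made up; where the rendered block is attached to the final system prompt is handled elsewhere in the package.

from agent_cli.rag._prompt import RAG_PROMPT_NO_TOOLS

# Hypothetical retrieved context; normally produced by format_context().
context = (
    '<document index="1" source="notes.md" path="docs/notes.md" chunk="0" score="0.875">\n'
    "Some retrieved text.\n"
    "</document>"
)
print(RAG_PROMPT_NO_TOOLS.format(context=context))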
agent_cli/rag/_retriever.py
ADDED
@@ -0,0 +1,156 @@
"""RAG Retrieval Logic (Functional)."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from agent_cli.core.reranker import OnnxCrossEncoder, predict_relevance
from agent_cli.rag._store import query_docs
from agent_cli.rag.models import RagSource, RetrievalResult

if TYPE_CHECKING:
    from chromadb import Collection

LOGGER = logging.getLogger(__name__)


def format_context(
    ranked: list[tuple[str, dict, float]],
    source_key: str = "source",
    path_key: str = "file_path",
    chunk_key: str = "chunk_id",
) -> str:
    """Format ranked documents as XML for context injection.

    Args:
        ranked: List of (doc, meta, score) tuples from rerank_and_filter().
        source_key: Metadata key for source name.
        path_key: Metadata key for file path.
        chunk_key: Metadata key for chunk ID.

    Returns:
        XML-formatted context string.

    """
    if not ranked:
        return ""

    context_parts = []
    for i, (doc, meta, score) in enumerate(ranked):
        source = meta.get(source_key, "unknown")
        path = meta.get(path_key, meta.get("doc_id", "unknown"))
        chunk = meta.get(chunk_key, 0)
        context_parts.append(
            f'<document index="{i + 1}" source="{source}" '
            f'path="{path}" chunk="{chunk}" score="{score:.3f}">\n{doc}\n</document>',
        )

    return "\n".join(context_parts)


def rerank_and_filter(
    reranker: OnnxCrossEncoder,
    query: str,
    docs: list[str],
    metas: list[dict],
    top_k: int,
    min_score: float = 0.2,
) -> list[tuple[str, dict, float]]:
    """Rerank documents and filter by minimum score.

    Args:
        reranker: Cross-encoder model for reranking.
        query: Search query string.
        docs: List of document texts.
        metas: List of metadata dicts corresponding to docs.
        top_k: Maximum number of results to return.
        min_score: Minimum relevance score threshold.

    Returns:
        List of (doc, meta, score) tuples, sorted by score descending.

    """
    if not docs:
        return []

    # Rerank
    pairs = [(query, doc) for doc in docs]
    scores = predict_relevance(reranker, pairs)

    # Sort by score descending
    ranked_all = sorted(
        zip(docs, metas, scores, strict=False),
        key=lambda x: x[2],
        reverse=True,
    )

    # Filter by min_score and take top_k
    ranked = [(d, m, s) for d, m, s in ranked_all if s >= min_score][:top_k]

    # Log retrieval quality
    filtered_count = len(ranked_all) - len([x for x in ranked_all if x[2] >= min_score])
    top_score = ranked_all[0][2] if ranked_all else 0.0
    LOGGER.info(
        "Retrieval: query_len=%d, candidates=%d, returned=%d, "
        "top_score=%.3f, min_score=%.3f, filtered=%d",
        len(query),
        len(docs),
        len(ranked),
        top_score,
        min_score,
        filtered_count,
    )

    return ranked


def search_context(
    collection: Collection,
    reranker_model: OnnxCrossEncoder,
    query: str,
    top_k: int = 3,
    min_score: float = 0.2,
) -> RetrievalResult:
    """Retrieve relevant context for a query using hybrid search.

    Args:
        collection: ChromaDB collection to search.
        reranker_model: Cross-encoder model for reranking.
        query: Search query string.
        top_k: Maximum number of results to return.
        min_score: Minimum relevance score threshold. Results below this are filtered out.

    Returns:
        RetrievalResult with context and sources. Empty if no results meet min_score.

    """
    # Initial retrieval - fetch more candidates for reranking
    n_candidates = top_k * 3
    results = query_docs(collection, query, n_results=n_candidates)

    if not results["documents"] or not results["documents"][0]:
        return RetrievalResult(context="", sources=[])

    docs = results["documents"][0]
    metas = results["metadatas"][0]  # type: ignore[index]

    # Rerank and filter
    ranked = rerank_and_filter(reranker_model, query, docs, metas, top_k, min_score)

    if not ranked:
        return RetrievalResult(context="", sources=[])

    # Build context and sources
    context = format_context(ranked)
    sources = [
        RagSource(
            source=meta["source"],
            path=meta["file_path"],
            chunk_id=meta["chunk_id"],
            score=float(score),
        )
        for _, meta, score in ranked
    ]

    return RetrievalResult(context=context, sources=sources)
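
Illustrative sketch (not from the wheel): search_context needs a live ChromaDB collection and a loaded reranker, but format_context can be exercised on its own because it only consumes (doc, meta, score) tuples. The documents, metadata, and scores below are made up.

from agent_cli.rag._retriever import format_context

# Hypothetical reranked results; in the real flow these come from rerank_and_filter().
ranked = [
    (
        "First chunk of the configuration guide.",
        {"source": "config.md", "file_path": "docs/config.md", "chunk_id": 0},
        0.91,
    ),
    (
        "A less relevant chunk from the changelog.",
        {"source": "CHANGELOG.md", "file_path": "CHANGELOG.md", "chunk_id": 3},
        0.34,
    ),
]
print(format_context(ranked))
# Emits one <document index=... source=... path=... chunk=... score="0.910"> block per tuple.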
agent_cli/rag/_store.py
ADDED
@@ -0,0 +1,48 @@
"""ChromaDB functional interface."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any

from agent_cli.core.chroma import delete_where, upsert

if TYPE_CHECKING:
    from collections.abc import Sequence

    from chromadb import Collection

    from agent_cli.rag.models import DocMetadata

LOGGER = logging.getLogger(__name__)


def upsert_docs(
    collection: Collection,
    ids: list[str],
    documents: list[str],
    metadatas: Sequence[DocMetadata],
) -> None:
    """Upsert documents into the collection."""
    upsert(collection, ids=ids, documents=documents, metadatas=metadatas)


def delete_by_file_path(collection: Collection, file_path: str) -> None:
    """Delete all chunks associated with a file path."""
    delete_where(collection, {"file_path": file_path})


def query_docs(collection: Collection, text: str, n_results: int) -> dict[str, Any]:
    """Query the collection."""
    return collection.query(query_texts=[text], n_results=n_results)


def get_all_metadata(collection: Collection) -> list[dict[str, Any]]:
    """Retrieve all metadata from the collection."""
    result = collection.get(include=["metadatas"])
    return result.get("metadatas", []) or []  # type: ignore[return-value]


def count_docs(collection: Collection) -> int:
    """Return total number of documents."""
    return collection.count()
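
Illustrative sketch (not from the wheel): these functions are thin wrappers over agent_cli.core.chroma (delete_where, upsert), which is not shown in this diff. A rough equivalent against a raw chromadb collection, using an in-memory client purely for illustration; the metadata keys mirror the ones DocMetadata uses above.

import chromadb

client = chromadb.Client()  # in-memory client, illustration only
collection = client.get_or_create_collection("rag_docs")

collection.upsert(
    ids=["docs/a.md:chunk:0"],
    documents=["Example chunk text."],
    metadatas=[{"source": "a.md", "file_path": "docs/a.md", "chunk_id": 0}],
)
print(collection.count())                                  # what count_docs() returns
print(collection.get(include=["metadatas"])["metadatas"])  # what get_all_metadata() reads
collection.delete(where={"file_path": "docs/a.md"})        # what delete_by_file_path() does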
agent_cli/rag/_utils.py
ADDED
@@ -0,0 +1,218 @@
"""Utility functions for RAG: Document loading and chunking."""

from __future__ import annotations

import hashlib
import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pathlib import Path

# Configure logging
LOGGER = logging.getLogger(__name__)

# Non-hidden directories to ignore (hidden dirs already caught by startswith(".") check)
DEFAULT_IGNORE_DIRS: frozenset[str] = frozenset(
    {
        "__pycache__",
        "venv",
        "env",
        "htmlcov",
        "node_modules",
        "build",
        "dist",
    },
)

# Non-hidden files to ignore (hidden files already caught by startswith(".") check)
DEFAULT_IGNORE_FILES: frozenset[str] = frozenset(
    {
        "Thumbs.db",
    },
)


def should_ignore_path(path: Path, base_folder: Path) -> bool:
    """Check if a path should be ignored during indexing.

    Ignores:
    - Any path component starting with '.' (hidden files/dirs)
    - Common development directories (__pycache__, node_modules, venv, etc.)
    - .egg-info directories
    - OS metadata files (Thumbs.db)

    Args:
        path: The file path to check.
        base_folder: The base folder for computing relative paths.

    Returns:
        True if the path should be ignored, False otherwise.

    """
    rel_parts = path.relative_to(base_folder).parts

    for part in rel_parts:
        # Hidden files/directories (starting with .)
        if part.startswith("."):
            return True
        # Common ignore directories
        if part in DEFAULT_IGNORE_DIRS:
            return True
        # .egg-info directories
        if part.endswith(".egg-info"):
            return True

    # Check specific file patterns
    return path.name in DEFAULT_IGNORE_FILES


# Files to read as plain text directly (fast path)
TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".json",
    ".py",
    ".js",
    ".ts",
    ".yaml",
    ".yml",
    ".rs",
    ".go",
    ".c",
    ".cpp",
    ".h",
    ".sh",
    ".toml",
    ".rst",
    ".ini",
    ".cfg",
}

# Files to convert using MarkItDown (rich documents)
MARKITDOWN_EXTENSIONS = {
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".html",
    ".htm",
    ".csv",
    ".xml",
}

SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | MARKITDOWN_EXTENSIONS


def load_document_text(file_path: Path) -> str | None:
    """Load text from a file path."""
    suffix = file_path.suffix.lower()

    try:
        if suffix in TEXT_EXTENSIONS:
            return file_path.read_text(errors="ignore")

        if suffix in MARKITDOWN_EXTENSIONS:
            from markitdown import MarkItDown  # noqa: PLC0415

            md = MarkItDown()
            result = md.convert(str(file_path))
            return result.text_content

        return None  # Unsupported
    except Exception:
        LOGGER.exception("Failed to load %s", file_path)
        return None


# Separators ordered by preference (most semantic first)
SEPARATORS = ("\n\n", "\n", ". ", ", ", " ")


def _find_break_point(text: str, start: int, end: int, min_chunk: int) -> int:
    """Find a good break point near end, preferring semantic boundaries.

    Searches backwards from end to find the last occurrence of a separator.
    Only accepts separators that would create a chunk of at least min_chunk size.
    If none qualify, falls back to the best available earlier separator before
    finally splitting at the exact end. Returns the position after the separator
    (so the separator stays with the preceding chunk).
    """
    min_pos = start + min_chunk
    fallback_point = -1
    for sep in SEPARATORS:
        pos = text.rfind(sep, start, end)
        if pos <= start:
            continue
        candidate = pos + len(sep)
        if pos >= min_pos:
            return candidate
        fallback_point = max(fallback_point, candidate)
    if fallback_point != -1:
        return fallback_point
    # No separator found at acceptable position, break at end (character-level split)
    return end


def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> list[str]:
    r"""Split text into chunks, preferring semantic boundaries.

    Strategy:
    1. Slice the original text directly (no split/join, so no char loss)
    2. Find break points at separators: \n\n, \n, ". ", ", ", " "
    3. Fall back to character-level breaks when no separator found
    4. Overlap by starting next chunk earlier in the text

    Args:
        text: The text to chunk.
        chunk_size: Maximum chunk size in characters (default 1200, ~300 words).
        overlap: Overlap between chunks in characters for context continuity.

    Returns:
        List of text chunks.

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size.

    """
    if chunk_size <= 0:
        msg = f"chunk_size must be positive, got {chunk_size}"
        raise ValueError(msg)
    if overlap >= chunk_size:
        msg = f"overlap ({overlap}) must be less than chunk_size ({chunk_size})"
        raise ValueError(msg)

    if not text or not text.strip():
        return []

    text = text.strip()
    if len(text) <= chunk_size:
        return [text]

    # Only accept separators that use at least half the chunk budget
    min_chunk = chunk_size // 2

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        if end >= len(text):
            # Last chunk - take everything remaining
            chunks.append(text[start:])
            break

        # Find a good break point
        break_point = _find_break_point(text, start, end, min_chunk)
        chunks.append(text[start:break_point])

        # Next chunk starts with overlap (but must make progress)
        start = max(start + 1, break_point - overlap)

    return chunks


def get_file_hash(file_path: Path) -> str:
    """Get hash of file content."""
    return hashlib.md5(file_path.read_bytes(), usedforsecurity=False).hexdigest()
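
Illustrative sketch (not from the wheel): a quick way to see the chunking strategy from the chunk_text docstring in action is to run it on a short string with a reduced chunk_size. The 120/20 values below are arbitrary; the real defaults are 1200/200.

from agent_cli.rag._utils import chunk_text

text = "First paragraph about setup.\n\n" + "Another sentence about usage. " * 30
chunks = chunk_text(text, chunk_size=120, overlap=20)
for i, chunk in enumerate(chunks):
    print(i, len(chunk), repr(chunk[:40]))
# Breaks prefer "\n\n", then "\n", ". ", ", ", " ", falling back to a hard character
# split only when no separator fits; consecutive chunks overlap by up to 20 characters.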