agent_cli-0.70.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196)
  1. agent_cli/__init__.py +5 -0
  2. agent_cli/__main__.py +6 -0
  3. agent_cli/_extras.json +14 -0
  4. agent_cli/_requirements/.gitkeep +0 -0
  5. agent_cli/_requirements/audio.txt +79 -0
  6. agent_cli/_requirements/faster-whisper.txt +215 -0
  7. agent_cli/_requirements/kokoro.txt +425 -0
  8. agent_cli/_requirements/llm.txt +183 -0
  9. agent_cli/_requirements/memory.txt +355 -0
  10. agent_cli/_requirements/mlx-whisper.txt +222 -0
  11. agent_cli/_requirements/piper.txt +176 -0
  12. agent_cli/_requirements/rag.txt +402 -0
  13. agent_cli/_requirements/server.txt +154 -0
  14. agent_cli/_requirements/speed.txt +77 -0
  15. agent_cli/_requirements/vad.txt +155 -0
  16. agent_cli/_requirements/wyoming.txt +71 -0
  17. agent_cli/_tools.py +368 -0
  18. agent_cli/agents/__init__.py +23 -0
  19. agent_cli/agents/_voice_agent_common.py +136 -0
  20. agent_cli/agents/assistant.py +383 -0
  21. agent_cli/agents/autocorrect.py +284 -0
  22. agent_cli/agents/chat.py +496 -0
  23. agent_cli/agents/memory/__init__.py +31 -0
  24. agent_cli/agents/memory/add.py +190 -0
  25. agent_cli/agents/memory/proxy.py +160 -0
  26. agent_cli/agents/rag_proxy.py +128 -0
  27. agent_cli/agents/speak.py +209 -0
  28. agent_cli/agents/transcribe.py +671 -0
  29. agent_cli/agents/transcribe_daemon.py +499 -0
  30. agent_cli/agents/voice_edit.py +291 -0
  31. agent_cli/api.py +22 -0
  32. agent_cli/cli.py +106 -0
  33. agent_cli/config.py +503 -0
  34. agent_cli/config_cmd.py +307 -0
  35. agent_cli/constants.py +27 -0
  36. agent_cli/core/__init__.py +1 -0
  37. agent_cli/core/audio.py +461 -0
  38. agent_cli/core/audio_format.py +299 -0
  39. agent_cli/core/chroma.py +88 -0
  40. agent_cli/core/deps.py +191 -0
  41. agent_cli/core/openai_proxy.py +139 -0
  42. agent_cli/core/process.py +195 -0
  43. agent_cli/core/reranker.py +120 -0
  44. agent_cli/core/sse.py +87 -0
  45. agent_cli/core/transcription_logger.py +70 -0
  46. agent_cli/core/utils.py +526 -0
  47. agent_cli/core/vad.py +175 -0
  48. agent_cli/core/watch.py +65 -0
  49. agent_cli/dev/__init__.py +14 -0
  50. agent_cli/dev/cli.py +1588 -0
  51. agent_cli/dev/coding_agents/__init__.py +19 -0
  52. agent_cli/dev/coding_agents/aider.py +24 -0
  53. agent_cli/dev/coding_agents/base.py +167 -0
  54. agent_cli/dev/coding_agents/claude.py +39 -0
  55. agent_cli/dev/coding_agents/codex.py +24 -0
  56. agent_cli/dev/coding_agents/continue_dev.py +15 -0
  57. agent_cli/dev/coding_agents/copilot.py +24 -0
  58. agent_cli/dev/coding_agents/cursor_agent.py +48 -0
  59. agent_cli/dev/coding_agents/gemini.py +28 -0
  60. agent_cli/dev/coding_agents/opencode.py +15 -0
  61. agent_cli/dev/coding_agents/registry.py +49 -0
  62. agent_cli/dev/editors/__init__.py +19 -0
  63. agent_cli/dev/editors/base.py +89 -0
  64. agent_cli/dev/editors/cursor.py +15 -0
  65. agent_cli/dev/editors/emacs.py +46 -0
  66. agent_cli/dev/editors/jetbrains.py +56 -0
  67. agent_cli/dev/editors/nano.py +31 -0
  68. agent_cli/dev/editors/neovim.py +33 -0
  69. agent_cli/dev/editors/registry.py +59 -0
  70. agent_cli/dev/editors/sublime.py +20 -0
  71. agent_cli/dev/editors/vim.py +42 -0
  72. agent_cli/dev/editors/vscode.py +15 -0
  73. agent_cli/dev/editors/zed.py +20 -0
  74. agent_cli/dev/project.py +568 -0
  75. agent_cli/dev/registry.py +52 -0
  76. agent_cli/dev/skill/SKILL.md +141 -0
  77. agent_cli/dev/skill/examples.md +571 -0
  78. agent_cli/dev/terminals/__init__.py +19 -0
  79. agent_cli/dev/terminals/apple_terminal.py +82 -0
  80. agent_cli/dev/terminals/base.py +56 -0
  81. agent_cli/dev/terminals/gnome.py +51 -0
  82. agent_cli/dev/terminals/iterm2.py +84 -0
  83. agent_cli/dev/terminals/kitty.py +77 -0
  84. agent_cli/dev/terminals/registry.py +48 -0
  85. agent_cli/dev/terminals/tmux.py +58 -0
  86. agent_cli/dev/terminals/warp.py +132 -0
  87. agent_cli/dev/terminals/zellij.py +78 -0
  88. agent_cli/dev/worktree.py +856 -0
  89. agent_cli/docs_gen.py +417 -0
  90. agent_cli/example-config.toml +185 -0
  91. agent_cli/install/__init__.py +5 -0
  92. agent_cli/install/common.py +89 -0
  93. agent_cli/install/extras.py +174 -0
  94. agent_cli/install/hotkeys.py +48 -0
  95. agent_cli/install/services.py +87 -0
  96. agent_cli/memory/__init__.py +7 -0
  97. agent_cli/memory/_files.py +250 -0
  98. agent_cli/memory/_filters.py +63 -0
  99. agent_cli/memory/_git.py +157 -0
  100. agent_cli/memory/_indexer.py +142 -0
  101. agent_cli/memory/_ingest.py +408 -0
  102. agent_cli/memory/_persistence.py +182 -0
  103. agent_cli/memory/_prompt.py +91 -0
  104. agent_cli/memory/_retrieval.py +294 -0
  105. agent_cli/memory/_store.py +169 -0
  106. agent_cli/memory/_streaming.py +44 -0
  107. agent_cli/memory/_tasks.py +48 -0
  108. agent_cli/memory/api.py +113 -0
  109. agent_cli/memory/client.py +272 -0
  110. agent_cli/memory/engine.py +361 -0
  111. agent_cli/memory/entities.py +43 -0
  112. agent_cli/memory/models.py +112 -0
  113. agent_cli/opts.py +433 -0
  114. agent_cli/py.typed +0 -0
  115. agent_cli/rag/__init__.py +3 -0
  116. agent_cli/rag/_indexer.py +67 -0
  117. agent_cli/rag/_indexing.py +226 -0
  118. agent_cli/rag/_prompt.py +30 -0
  119. agent_cli/rag/_retriever.py +156 -0
  120. agent_cli/rag/_store.py +48 -0
  121. agent_cli/rag/_utils.py +218 -0
  122. agent_cli/rag/api.py +175 -0
  123. agent_cli/rag/client.py +299 -0
  124. agent_cli/rag/engine.py +302 -0
  125. agent_cli/rag/models.py +55 -0
  126. agent_cli/scripts/.runtime/.gitkeep +0 -0
  127. agent_cli/scripts/__init__.py +1 -0
  128. agent_cli/scripts/check_plugin_skill_sync.py +50 -0
  129. agent_cli/scripts/linux-hotkeys/README.md +63 -0
  130. agent_cli/scripts/linux-hotkeys/toggle-autocorrect.sh +45 -0
  131. agent_cli/scripts/linux-hotkeys/toggle-transcription.sh +58 -0
  132. agent_cli/scripts/linux-hotkeys/toggle-voice-edit.sh +58 -0
  133. agent_cli/scripts/macos-hotkeys/README.md +45 -0
  134. agent_cli/scripts/macos-hotkeys/skhd-config-example +5 -0
  135. agent_cli/scripts/macos-hotkeys/toggle-autocorrect.sh +12 -0
  136. agent_cli/scripts/macos-hotkeys/toggle-transcription.sh +37 -0
  137. agent_cli/scripts/macos-hotkeys/toggle-voice-edit.sh +37 -0
  138. agent_cli/scripts/nvidia-asr-server/README.md +99 -0
  139. agent_cli/scripts/nvidia-asr-server/pyproject.toml +27 -0
  140. agent_cli/scripts/nvidia-asr-server/server.py +255 -0
  141. agent_cli/scripts/nvidia-asr-server/shell.nix +32 -0
  142. agent_cli/scripts/nvidia-asr-server/uv.lock +4654 -0
  143. agent_cli/scripts/run-openwakeword.sh +11 -0
  144. agent_cli/scripts/run-piper-windows.ps1 +30 -0
  145. agent_cli/scripts/run-piper.sh +24 -0
  146. agent_cli/scripts/run-whisper-linux.sh +40 -0
  147. agent_cli/scripts/run-whisper-macos.sh +6 -0
  148. agent_cli/scripts/run-whisper-windows.ps1 +51 -0
  149. agent_cli/scripts/run-whisper.sh +9 -0
  150. agent_cli/scripts/run_faster_whisper_server.py +136 -0
  151. agent_cli/scripts/setup-linux-hotkeys.sh +72 -0
  152. agent_cli/scripts/setup-linux.sh +108 -0
  153. agent_cli/scripts/setup-macos-hotkeys.sh +61 -0
  154. agent_cli/scripts/setup-macos.sh +76 -0
  155. agent_cli/scripts/setup-windows.ps1 +63 -0
  156. agent_cli/scripts/start-all-services-windows.ps1 +53 -0
  157. agent_cli/scripts/start-all-services.sh +178 -0
  158. agent_cli/scripts/sync_extras.py +138 -0
  159. agent_cli/server/__init__.py +3 -0
  160. agent_cli/server/cli.py +721 -0
  161. agent_cli/server/common.py +222 -0
  162. agent_cli/server/model_manager.py +288 -0
  163. agent_cli/server/model_registry.py +225 -0
  164. agent_cli/server/proxy/__init__.py +3 -0
  165. agent_cli/server/proxy/api.py +444 -0
  166. agent_cli/server/streaming.py +67 -0
  167. agent_cli/server/tts/__init__.py +3 -0
  168. agent_cli/server/tts/api.py +335 -0
  169. agent_cli/server/tts/backends/__init__.py +82 -0
  170. agent_cli/server/tts/backends/base.py +139 -0
  171. agent_cli/server/tts/backends/kokoro.py +403 -0
  172. agent_cli/server/tts/backends/piper.py +253 -0
  173. agent_cli/server/tts/model_manager.py +201 -0
  174. agent_cli/server/tts/model_registry.py +28 -0
  175. agent_cli/server/tts/wyoming_handler.py +249 -0
  176. agent_cli/server/whisper/__init__.py +3 -0
  177. agent_cli/server/whisper/api.py +413 -0
  178. agent_cli/server/whisper/backends/__init__.py +89 -0
  179. agent_cli/server/whisper/backends/base.py +97 -0
  180. agent_cli/server/whisper/backends/faster_whisper.py +225 -0
  181. agent_cli/server/whisper/backends/mlx.py +270 -0
  182. agent_cli/server/whisper/languages.py +116 -0
  183. agent_cli/server/whisper/model_manager.py +157 -0
  184. agent_cli/server/whisper/model_registry.py +28 -0
  185. agent_cli/server/whisper/wyoming_handler.py +203 -0
  186. agent_cli/services/__init__.py +343 -0
  187. agent_cli/services/_wyoming_utils.py +64 -0
  188. agent_cli/services/asr.py +506 -0
  189. agent_cli/services/llm.py +228 -0
  190. agent_cli/services/tts.py +450 -0
  191. agent_cli/services/wake_word.py +142 -0
  192. agent_cli-0.70.5.dist-info/METADATA +2118 -0
  193. agent_cli-0.70.5.dist-info/RECORD +196 -0
  194. agent_cli-0.70.5.dist-info/WHEEL +4 -0
  195. agent_cli-0.70.5.dist-info/entry_points.txt +4 -0
  196. agent_cli-0.70.5.dist-info/licenses/LICENSE +21 -0
agent_cli/rag/_indexing.py
@@ -0,0 +1,226 @@
+"""RAG Indexing Logic."""
+
+from __future__ import annotations
+
+import concurrent.futures
+import datetime
+import logging
+from typing import TYPE_CHECKING
+
+from agent_cli.rag._store import delete_by_file_path, get_all_metadata, upsert_docs
+from agent_cli.rag._utils import chunk_text, get_file_hash, load_document_text, should_ignore_path
+from agent_cli.rag.models import DocMetadata
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from chromadb import Collection
+
+LOGGER = logging.getLogger(__name__)
+
+
+def load_hashes_from_metadata(collection: Collection) -> tuple[dict[str, str], dict[str, float]]:
+    """Rebuild hash and mtime caches from existing DB.
+
+    Returns:
+        Tuple of (file_hashes, file_mtimes) dictionaries.
+
+    """
+    metadatas = get_all_metadata(collection)
+    file_hashes = {}
+    file_mtimes = {}
+    for meta in metadatas:
+        if meta:
+            fp = meta["file_path"]
+            file_hashes[fp] = meta["file_hash"]
+            file_mtimes[fp] = meta["file_mtime"]
+    return file_hashes, file_mtimes
+
+
+def index_file(
+    collection: Collection,
+    docs_folder: Path,
+    file_path: Path,
+    file_hashes: dict[str, str],
+    file_mtimes: dict[str, float],
+) -> bool:
+    """Index or reindex a single file.
+
+    Uses mtime-first checking for performance: only computes hash if mtime changed.
+
+    Returns:
+        True if the file was indexed (changed or new), False otherwise.
+
+    """
+    if not file_path.exists():
+        return False
+    LOGGER.info(" 📄 Processing: %s", file_path.name)
+
+    try:
+        relative_path = str(file_path.relative_to(docs_folder))
+        current_mtime = file_path.stat().st_mtime
+
+        # Fast path: mtime unchanged → skip (no hash computation needed)
+        if relative_path in file_mtimes and file_mtimes[relative_path] == current_mtime:
+            return False
+
+        # mtime changed or new file: verify with hash
+        current_hash = get_file_hash(file_path)
+
+        # Hash unchanged (file was touched but not modified) → update mtime, skip
+        if relative_path in file_hashes and file_hashes[relative_path] == current_hash:
+            file_mtimes[relative_path] = current_mtime
+            return False
+
+        # Remove old chunks first (atomic-ish update)
+        remove_file(collection, docs_folder, file_path, file_hashes, file_mtimes)
+
+        # Load and chunk document
+        text = load_document_text(file_path)
+        chunks = chunk_text(text) if text and text.strip() else []
+        if not chunks:
+            return False  # Unsupported, empty, or no chunks
+
+        # Index chunks
+        ids = []
+        documents = []
+        metadatas = []
+
+        timestamp = datetime.datetime.now(datetime.UTC).isoformat()
+
+        for i, chunk in enumerate(chunks):
+            doc_id = f"{relative_path}:chunk:{i}"
+            ids.append(doc_id)
+            documents.append(chunk)
+            metadatas.append(
+                DocMetadata(
+                    source=file_path.name,
+                    file_path=relative_path,
+                    file_type=file_path.suffix,
+                    chunk_id=i,
+                    total_chunks=len(chunks),
+                    indexed_at=timestamp,
+                    file_hash=current_hash,
+                    file_mtime=current_mtime,
+                ),
+            )
+
+        # Upsert to ChromaDB in batches to avoid 502s from large payloads
+        # Use small batch size (10) to avoid overwhelming embedding servers
+        batch_size = 10
+        for i in range(0, len(ids), batch_size):
+            batch_ids = ids[i : i + batch_size]
+            batch_docs = documents[i : i + batch_size]
+            batch_meta = metadatas[i : i + batch_size]
+            upsert_docs(collection, batch_ids, batch_docs, batch_meta)
+
+        # Update tracking
+        file_hashes[relative_path] = current_hash
+        file_mtimes[relative_path] = current_mtime
+
+        LOGGER.info(" ✓ Indexed %s: %d chunks", file_path.name, len(chunks))
+        return True
+
+    except Exception:
+        LOGGER.exception("Failed to index file %s", file_path)
+        return False
+
+
+def remove_file(
+    collection: Collection,
+    docs_folder: Path,
+    file_path: Path,
+    file_hashes: dict[str, str],
+    file_mtimes: dict[str, float],
+) -> bool:
+    """Remove all chunks of a file from index.
+
+    Returns:
+        True if documents were removed (or at least untracked), False otherwise.

+
+    """
+    try:
+        relative_path = str(file_path.relative_to(docs_folder))
+        delete_by_file_path(collection, relative_path)
+
+        # If it was tracked, we consider it "removed"
+        if relative_path in file_hashes:
+            LOGGER.info(" ✓ Removed %s from index", file_path.name)
+            file_hashes.pop(relative_path, None)
+            file_mtimes.pop(relative_path, None)
+            return True
+
+        return False
+    except Exception:
+        LOGGER.exception("Error removing file %s", file_path)
+        return False
+
+
+def initial_index(
+    collection: Collection,
+    docs_folder: Path,
+    file_hashes: dict[str, str],
+    file_mtimes: dict[str, float],
+) -> None:
+    """Index all existing files on startup and remove deleted ones."""
+    LOGGER.info("🔍 Scanning existing files...")
+
+    # Snapshot of what's in the DB currently
+    paths_in_db = set(file_hashes.keys())
+    paths_found_on_disk = set()
+
+    processed_files = []
+    removed_files = []
+
+    # Gather all files first, excluding hidden and common development directories
+    all_files = [
+        p for p in docs_folder.rglob("*") if p.is_file() and not should_ignore_path(p, docs_folder)
+    ]
+
+    # 1. Index Existing Files in Parallel
+    # Use max_workers=4 to match typical local backend parallelism (e.g. llama-server -np 4)
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        # Map futures to file paths
+        future_to_file = {
+            executor.submit(index_file, collection, docs_folder, f, file_hashes, file_mtimes): f
+            for f in all_files
+        }
+
+        for future in concurrent.futures.as_completed(future_to_file):
+            file_path = future_to_file[future]
+            try:
+                # Track that we found this file (regardless of index result)
+                rel_path = str(file_path.relative_to(docs_folder))
+                paths_found_on_disk.add(rel_path)
+
+                indexed = future.result()
+                if indexed:
+                    processed_files.append(file_path.name)
+            except Exception:
+                LOGGER.exception("Error processing %s", file_path.name)
+
+    # 2. Clean up Deleted Files
+    # If it's in DB but not found on disk, it was deleted offline.
+    paths_to_remove = paths_in_db - paths_found_on_disk
+
+    if paths_to_remove:
+        LOGGER.info("🧹 Cleaning up %d deleted files found in index...", len(paths_to_remove))
+        for rel_path in paths_to_remove:
+            full_path = docs_folder / rel_path
+            try:
+                if remove_file(collection, docs_folder, full_path, file_hashes, file_mtimes):
+                    removed_files.append(rel_path)
+            except Exception:
+                LOGGER.exception("Error removing stale file %s", rel_path)
+
+    if processed_files:
+        LOGGER.info("🆕 Added/Updated: %s", ", ".join(processed_files))
+
+    if removed_files:
+        LOGGER.info("🗑️ Removed: %s", ", ".join(removed_files))
+
+    LOGGER.info(
+        "✅ Initial scan complete. Indexed/Checked %d files, Removed %d stale files.",
+        len(paths_found_on_disk),
+        len(removed_files),
+    )
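
For orientation, here is a minimal sketch of driving these entry points. The ChromaDB client setup and the ./docs and ./chroma-db paths are illustrative assumptions, not taken from this diff; only the agent_cli.rag._indexing names are real.

# Hypothetical driver for the indexing functions above (not part of the wheel).
from pathlib import Path

import chromadb

from agent_cli.rag._indexing import initial_index, load_hashes_from_metadata

client = chromadb.PersistentClient(path="./chroma-db")  # assumed storage location
collection = client.get_or_create_collection("docs")

# Rebuild the mtime/hash caches from whatever is already indexed, then
# reconcile the collection with the documents folder on disk.
file_hashes, file_mtimes = load_hashes_from_metadata(collection)
initial_index(collection, Path("./docs"), file_hashes, file_mtimes)
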
agent_cli/rag/_prompt.py
@@ -0,0 +1,30 @@
+"""Centralized prompts for RAG LLM calls."""
+
+RAG_PROMPT_WITH_TOOLS = """
+## Retrieved Documentation
+The following was automatically retrieved based on the user's query:
+
+<retrieved_documents>
+{context}
+</retrieved_documents>
+
+## RAG Instructions
+- Use the retrieved context ONLY if it's relevant to the question
+- If the context is irrelevant, ignore it and answer based on your knowledge
+- When using context, cite sources: [Source: filename]
+- If snippets are insufficient, call read_full_document(file_path) to get full content
+""".strip()
+
+RAG_PROMPT_NO_TOOLS = """
+## Retrieved Documentation
+The following was automatically retrieved based on the user's query:
+
+<retrieved_documents>
+{context}
+</retrieved_documents>
+
+## RAG Instructions
+- Use the retrieved context ONLY if it's relevant to the question
+- If the context is irrelevant, ignore it and answer based on your knowledge
+- When using context, cite sources: [Source: filename]
+""".strip()
agent_cli/rag/_retriever.py
@@ -0,0 +1,156 @@
+"""RAG Retrieval Logic (Functional)."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from agent_cli.core.reranker import OnnxCrossEncoder, predict_relevance
+from agent_cli.rag._store import query_docs
+from agent_cli.rag.models import RagSource, RetrievalResult
+
+if TYPE_CHECKING:
+    from chromadb import Collection
+
+LOGGER = logging.getLogger(__name__)
+
+
+def format_context(
+    ranked: list[tuple[str, dict, float]],
+    source_key: str = "source",
+    path_key: str = "file_path",
+    chunk_key: str = "chunk_id",
+) -> str:
+    """Format ranked documents as XML for context injection.
+
+    Args:
+        ranked: List of (doc, meta, score) tuples from rerank_and_filter().
+        source_key: Metadata key for source name.
+        path_key: Metadata key for file path.
+        chunk_key: Metadata key for chunk ID.
+
+    Returns:
+        XML-formatted context string.
+
+    """
+    if not ranked:
+        return ""
+
+    context_parts = []
+    for i, (doc, meta, score) in enumerate(ranked):
+        source = meta.get(source_key, "unknown")
+        path = meta.get(path_key, meta.get("doc_id", "unknown"))
+        chunk = meta.get(chunk_key, 0)
+        context_parts.append(
+            f'<document index="{i + 1}" source="{source}" '
+            f'path="{path}" chunk="{chunk}" score="{score:.3f}">\n{doc}\n</document>',
+        )
+
+    return "\n".join(context_parts)
+
+
+def rerank_and_filter(
+    reranker: OnnxCrossEncoder,
+    query: str,
+    docs: list[str],
+    metas: list[dict],
+    top_k: int,
+    min_score: float = 0.2,
+) -> list[tuple[str, dict, float]]:
+    """Rerank documents and filter by minimum score.
+
+    Args:
+        reranker: Cross-encoder model for reranking.
+        query: Search query string.
+        docs: List of document texts.
+        metas: List of metadata dicts corresponding to docs.
+        top_k: Maximum number of results to return.
+        min_score: Minimum relevance score threshold.
+
+    Returns:
+        List of (doc, meta, score) tuples, sorted by score descending.
+
+    """
+    if not docs:
+        return []
+
+    # Rerank
+    pairs = [(query, doc) for doc in docs]
+    scores = predict_relevance(reranker, pairs)
+
+    # Sort by score descending
+    ranked_all = sorted(
+        zip(docs, metas, scores, strict=False),
+        key=lambda x: x[2],
+        reverse=True,
+    )
+
+    # Filter by min_score and take top_k
+    ranked = [(d, m, s) for d, m, s in ranked_all if s >= min_score][:top_k]
+
+    # Log retrieval quality
+    filtered_count = len(ranked_all) - len([x for x in ranked_all if x[2] >= min_score])
+    top_score = ranked_all[0][2] if ranked_all else 0.0
+    LOGGER.info(
+        "Retrieval: query_len=%d, candidates=%d, returned=%d, "
+        "top_score=%.3f, min_score=%.3f, filtered=%d",
+        len(query),
+        len(docs),
+        len(ranked),
+        top_score,
+        min_score,
+        filtered_count,
+    )
+
+    return ranked
+
+
+def search_context(
+    collection: Collection,
+    reranker_model: OnnxCrossEncoder,
+    query: str,
+    top_k: int = 3,
+    min_score: float = 0.2,
+) -> RetrievalResult:
+    """Retrieve relevant context for a query using hybrid search.
+
+    Args:
+        collection: ChromaDB collection to search.
+        reranker_model: Cross-encoder model for reranking.
+        query: Search query string.
+        top_k: Maximum number of results to return.
+        min_score: Minimum relevance score threshold. Results below this are filtered out.
+
+    Returns:
+        RetrievalResult with context and sources. Empty if no results meet min_score.
+
+    """
+    # Initial retrieval - fetch more candidates for reranking
+    n_candidates = top_k * 3
+    results = query_docs(collection, query, n_results=n_candidates)
+
+    if not results["documents"] or not results["documents"][0]:
+        return RetrievalResult(context="", sources=[])
+
+    docs = results["documents"][0]
+    metas = results["metadatas"][0]  # type: ignore[index]
+
+    # Rerank and filter
+    ranked = rerank_and_filter(reranker_model, query, docs, metas, top_k, min_score)
+
+    if not ranked:
+        return RetrievalResult(context="", sources=[])
+
+    # Build context and sources
+    context = format_context(ranked)
+    sources = [
+        RagSource(
+            source=meta["source"],
+            path=meta["file_path"],
+            chunk_id=meta["chunk_id"],
+            score=float(score),
+        )
+        for _, meta, score in ranked
+    ]
+
+    return RetrievalResult(context=context, sources=sources)
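
A sketch of the retrieval entry point in use. The collection and reranker objects are assumed to exist already (constructing an OnnxCrossEncoder happens in agent_cli/core/reranker.py, outside this excerpt), and the RagSource/RetrievalResult field names come from agent_cli/rag/models.py in this release.

from agent_cli.rag._retriever import search_context

# Assumes: `collection` is a chromadb Collection, `reranker` an OnnxCrossEncoder.
result = search_context(collection, reranker, "How do I change the TTS voice?", top_k=3)
if not result.sources:
    print("Nothing scored above min_score; answer without RAG context.")
else:
    print(result.context)  # <document ...> blocks ready for prompt injection
    for src in result.sources:
        print(f"{src.path}#chunk{src.chunk_id}: {src.score:.3f}")
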
agent_cli/rag/_store.py
@@ -0,0 +1,48 @@
+"""ChromaDB functional interface."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+from agent_cli.core.chroma import delete_where, upsert
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from chromadb import Collection
+
+    from agent_cli.rag.models import DocMetadata
+
+LOGGER = logging.getLogger(__name__)
+
+
+def upsert_docs(
+    collection: Collection,
+    ids: list[str],
+    documents: list[str],
+    metadatas: Sequence[DocMetadata],
+) -> None:
+    """Upsert documents into the collection."""
+    upsert(collection, ids=ids, documents=documents, metadatas=metadatas)
+
+
+def delete_by_file_path(collection: Collection, file_path: str) -> None:
+    """Delete all chunks associated with a file path."""
+    delete_where(collection, {"file_path": file_path})
+
+
+def query_docs(collection: Collection, text: str, n_results: int) -> dict[str, Any]:
+    """Query the collection."""
+    return collection.query(query_texts=[text], n_results=n_results)
+
+
+def get_all_metadata(collection: Collection) -> list[dict[str, Any]]:
+    """Retrieve all metadata from the collection."""
+    result = collection.get(include=["metadatas"])
+    return result.get("metadatas", []) or []  # type: ignore[return-value]
+
+
+def count_docs(collection: Collection) -> int:
+    """Return total number of documents."""
+    return collection.count()
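
These wrappers are deliberately thin. The sketch below, written directly against the public chromadb API, shows why deleting by file_path removes every chunk of a file: each chunk's metadata carries that key, so one metadata `where` filter matches them all. (The in-memory client is illustrative; delete_where/upsert in agent_cli.core.chroma are assumed to delegate to these calls.)

import chromadb

client = chromadb.EphemeralClient()  # in-memory, for demonstration
collection = client.get_or_create_collection("demo")

collection.upsert(
    ids=["a.md:chunk:0", "a.md:chunk:1"],
    documents=["hello", "world"],
    metadatas=[{"file_path": "a.md"}, {"file_path": "a.md"}],
)
collection.delete(where={"file_path": "a.md"})  # drops both chunks at once
print(collection.count())  # 0
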
agent_cli/rag/_utils.py
@@ -0,0 +1,218 @@
+"""Utility functions for RAG: Document loading and chunking."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+# Configure logging
+LOGGER = logging.getLogger(__name__)
+
+# Non-hidden directories to ignore (hidden dirs already caught by startswith(".") check)
+DEFAULT_IGNORE_DIRS: frozenset[str] = frozenset(
+    {
+        "__pycache__",
+        "venv",
+        "env",
+        "htmlcov",
+        "node_modules",
+        "build",
+        "dist",
+    },
+)
+
+# Non-hidden files to ignore (hidden files already caught by startswith(".") check)
+DEFAULT_IGNORE_FILES: frozenset[str] = frozenset(
+    {
+        "Thumbs.db",
+    },
+)
+
+
+def should_ignore_path(path: Path, base_folder: Path) -> bool:
+    """Check if a path should be ignored during indexing.
+
+    Ignores:
+    - Any path component starting with '.' (hidden files/dirs)
+    - Common development directories (__pycache__, node_modules, venv, etc.)
+    - .egg-info directories
+    - OS metadata files (Thumbs.db)
+
+    Args:
+        path: The file path to check.
+        base_folder: The base folder for computing relative paths.
+
+    Returns:
+        True if the path should be ignored, False otherwise.
+
+    """
+    rel_parts = path.relative_to(base_folder).parts
+
+    for part in rel_parts:
+        # Hidden files/directories (starting with .)
+        if part.startswith("."):
+            return True
+        # Common ignore directories
+        if part in DEFAULT_IGNORE_DIRS:
+            return True
+        # .egg-info directories
+        if part.endswith(".egg-info"):
+            return True
+
+    # Check specific file patterns
+    return path.name in DEFAULT_IGNORE_FILES
+
+
+# Files to read as plain text directly (fast path)
+TEXT_EXTENSIONS = {
+    ".txt",
+    ".md",
+    ".json",
+    ".py",
+    ".js",
+    ".ts",
+    ".yaml",
+    ".yml",
+    ".rs",
+    ".go",
+    ".c",
+    ".cpp",
+    ".h",
+    ".sh",
+    ".toml",
+    ".rst",
+    ".ini",
+    ".cfg",
+}
+
+# Files to convert using MarkItDown (rich documents)
+MARKITDOWN_EXTENSIONS = {
+    ".pdf",
+    ".docx",
+    ".pptx",
+    ".xlsx",
+    ".html",
+    ".htm",
+    ".csv",
+    ".xml",
+}
+
+SUPPORTED_EXTENSIONS = TEXT_EXTENSIONS | MARKITDOWN_EXTENSIONS
+
+
+def load_document_text(file_path: Path) -> str | None:
+    """Load text from a file path."""
+    suffix = file_path.suffix.lower()
+
+    try:
+        if suffix in TEXT_EXTENSIONS:
+            return file_path.read_text(errors="ignore")
+
+        if suffix in MARKITDOWN_EXTENSIONS:
+            from markitdown import MarkItDown  # noqa: PLC0415
+
+            md = MarkItDown()
+            result = md.convert(str(file_path))
+            return result.text_content
+
+        return None  # Unsupported
+    except Exception:
+        LOGGER.exception("Failed to load %s", file_path)
+        return None
+
+
+# Separators ordered by preference (most semantic first)
+SEPARATORS = ("\n\n", "\n", ". ", ", ", " ")
+
+
+def _find_break_point(text: str, start: int, end: int, min_chunk: int) -> int:
+    """Find a good break point near end, preferring semantic boundaries.
+
+    Searches backwards from end to find the last occurrence of a separator.
+    Only accepts separators that would create a chunk of at least min_chunk size.
+    If none qualify, falls back to the best available earlier separator before
+    finally splitting at the exact end. Returns the position after the separator
+    (so the separator stays with the preceding chunk).
+    """
+    min_pos = start + min_chunk
+    fallback_point = -1
+    for sep in SEPARATORS:
+        pos = text.rfind(sep, start, end)
+        if pos <= start:
+            continue
+        candidate = pos + len(sep)
+        if pos >= min_pos:
+            return candidate
+        fallback_point = max(fallback_point, candidate)
+    if fallback_point != -1:
+        return fallback_point
+    # No separator found at acceptable position, break at end (character-level split)
+    return end
+
+
+def chunk_text(text: str, chunk_size: int = 1200, overlap: int = 200) -> list[str]:
+    r"""Split text into chunks, preferring semantic boundaries.
+
+    Strategy:
+    1. Slice the original text directly (no split/join, so no char loss)
+    2. Find break points at separators: \n\n, \n, ". ", ", ", " "
+    3. Fall back to character-level breaks when no separator found
+    4. Overlap by starting next chunk earlier in the text
+
+    Args:
+        text: The text to chunk.
+        chunk_size: Maximum chunk size in characters (default 1200, ~300 words).
+        overlap: Overlap between chunks in characters for context continuity.
+
+    Returns:
+        List of text chunks.
+
+    Raises:
+        ValueError: If chunk_size <= 0 or overlap >= chunk_size.
+
+    """
+    if chunk_size <= 0:
+        msg = f"chunk_size must be positive, got {chunk_size}"
+        raise ValueError(msg)
+    if overlap >= chunk_size:
+        msg = f"overlap ({overlap}) must be less than chunk_size ({chunk_size})"
+        raise ValueError(msg)
+
+    if not text or not text.strip():
+        return []
+
+    text = text.strip()
+    if len(text) <= chunk_size:
+        return [text]
+
+    # Only accept separators that use at least half the chunk budget
+    min_chunk = chunk_size // 2
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        end = start + chunk_size
+
+        if end >= len(text):
+            # Last chunk - take everything remaining
+            chunks.append(text[start:])
+            break
+
+        # Find a good break point
+        break_point = _find_break_point(text, start, end, min_chunk)
+        chunks.append(text[start:break_point])
+
+        # Next chunk starts with overlap (but must make progress)
+        start = max(start + 1, break_point - overlap)
+
+    return chunks
+
+
+def get_file_hash(file_path: Path) -> str:
+    """Get hash of file content."""
+    return hashlib.md5(file_path.read_bytes(), usedforsecurity=False).hexdigest()
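
A quick demonstration of the chunker under small, illustrative parameters (the shipped defaults are 1200/200). Consecutive chunks share roughly `overlap` characters, and breaks land on ". " boundaries whenever one falls in the second half of the window:

from agent_cli.rag._utils import chunk_text

text = ("The quick brown fox jumps over the lazy dog. " * 20).strip()
chunks = chunk_text(text, chunk_size=100, overlap=20)

for c in chunks:
    print(len(c), repr(c[:30]))
# Every chunk is <= 100 chars, ends on a sentence boundary where possible,
# and starts inside the overlap region of the previous chunk.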