codexlr8 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codexlr8/search.py ADDED
@@ -0,0 +1,405 @@
1
+ """Search engine — SQLite FTS5 index with custom ranking for code search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import fnmatch
6
+ import os
7
+ import re
8
+ import sqlite3
9
+ from datetime import datetime, timezone
10
+
11
+ from .config import load_config
12
+ from .meta import META_EXTENSION, read_meta
13
+ from .scanner import scan_project
14
+
15
+ INDEX_DB_NAME = ".codexlr8_index.db"
16
+
17
+
18
def _is_init_file(path: str) -> bool:
    """Return True when *path* names a package ``__init__.py`` file.

    Only the final path component is inspected, so both ``"__init__.py"`` and
    ``"pkg/sub/__init__.py"`` qualify.
    """
    return os.path.basename(path) == "__init__.py"
20
+
21
+
22
def _tokenize(text: str) -> list[str]:
    """Split *text* into lowercase search tokens.

    A token is either an identifier (starts with a letter or underscore,
    followed by letters, digits or underscores) or a run of digits. The text
    is lowercased before matching, so the result never contains uppercase
    ASCII letters.

    Returns an empty list for empty or ``None`` input.
    """
    if not text:
        return []
    # Capture identifiers (letter- or underscore-starting) and standalone numbers.
    tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*|\d+", text.lower())
    # Drop one-character identifiers (single letters and a lone "_"), which
    # are too unspecific to search for; single digits are kept.
    return [t for t in tokens if len(t) > 1 or t.isdigit()]
28
+
29
+
30
def _token_match_ratio(tokens: list[str], text: str) -> float:
    """Return the fraction of *tokens* that occur in *text*.

    Matching is a case-insensitive *substring* test against the lowercased
    text (so ``"log"`` matches ``"login"``). Tokens are compared as given and
    are therefore expected to be lowercase already.

    Returns ``0.0`` when *tokens* is empty, otherwise a value in ``[0, 1]``.
    """
    if not tokens:
        return 0.0
    text_lower = text.lower()
    matched = sum(1 for t in tokens if t in text_lower)
    return matched / len(tokens)
37
+
38
+
39
def _matches_exclude(path: str, excludes: list[str]) -> bool:
    """Return True if *path* matches any glob pattern in *excludes*.

    Each pattern is tried against the full path first and then against the
    basename alone, so ``"tests/*"`` excludes by location while ``"test_*"``
    excludes by file name wherever the file lives. An empty pattern list
    never matches.
    """
    basename = os.path.basename(path)
    return any(
        fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(basename, pattern)
        for pattern in excludes
    )
48
+
49
+
50
class SearchEngine:
    """SQLite FTS5-backed search engine for a codebase.

    The index is a single SQLite database (``INDEX_DB_NAME``) stored in the
    project directory. It holds two tables:

    * ``files`` -- an FTS5 virtual table with the searchable text of each
      file (path, sidecar summary/tags/public_api, raw content).
    * ``file_meta`` -- one bookkeeping row per file (line count, whether a
      sidecar exists, ``__init__.py`` flag, mtime, index timestamp).
    """

    # Schema statements. Both are idempotent (IF NOT EXISTS) so they can be
    # run before every build, full or incremental.
    _FILES_DDL = """
        CREATE VIRTUAL TABLE IF NOT EXISTS files USING fts5(
            path, summary, tags, public_api, content,
            tokenize='porter unicode61'
        )
    """
    _FILE_META_DDL = """
        CREATE TABLE IF NOT EXISTS file_meta (
            path TEXT PRIMARY KEY,
            content_size INTEGER,
            has_meta BOOLEAN,
            is_init BOOLEAN,
            file_mtime REAL,
            index_built_at TEXT
        )
    """

    # Candidate query shared by the AND stage and the OR fallback of search().
    _MATCH_SQL = (
        "SELECT f.path, f.summary, f.tags, f.public_api, f.content, "
        "       m.is_init, rank "
        "FROM files f "
        "JOIN file_meta m ON f.path = m.path "
        "WHERE files MATCH ? "
        "ORDER BY rank "
        "LIMIT ?"
    )

    def __init__(self, project_path: str):
        """Bind the engine to *project_path*; nothing is read or created yet."""
        self.project_path = os.path.abspath(project_path)
        self.db_path = os.path.join(self.project_path, INDEX_DB_NAME)
        self._config = None  # loaded lazily by the ``config`` property

    @property
    def config(self) -> dict:
        """Project configuration, loaded on first access and then cached."""
        if self._config is None:
            self._config = load_config(self.project_path)
        return self._config

    def _get_connection(self) -> sqlite3.Connection:
        """Open the index database (creating the file if it is missing).

        Rows are returned as ``sqlite3.Row`` so columns can be read by name.
        The caller owns the connection and must close it.
        """
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        return conn

    def _ensure_schema(self, conn: sqlite3.Connection) -> None:
        """Create the ``files`` and ``file_meta`` tables if they do not exist."""
        conn.execute(self._FILES_DDL)
        conn.execute(self._FILE_META_DDL)

    def build_index(self, incremental: bool = False,
                    exclude: list[str] | None = None,
                    include: list[str] | None = None) -> int:
        """Build or refresh the search index.

        Args:
            incremental: When True, only re-index changed, new and removed
                files. When False, both tables are dropped and rebuilt.
            exclude: Glob patterns to skip; defaults to the ``exclude`` entry
                of the project config.
            include: Glob patterns to restrict the scan to; defaults to the
                ``include`` entry of the project config.

        Returns:
            Number of files indexed (full build) or mutated (incremental).
        """
        if exclude is None:
            exclude = self.config.get("exclude", [])
        if include is None:
            include = self.config.get("include", [])

        root = self.config.get("root", ".")
        scan_root = os.path.join(self.project_path, root)

        files_data = scan_project(
            scan_root,
            extensions=self.config.get("extensions"),
            ignore_dirs=self.config.get("ignore_dirs"),
            include=include,
            exclude=exclude,
        )

        conn = self._get_connection()
        try:
            if not incremental:
                # A full build starts from an empty schema.
                conn.execute("DROP TABLE IF EXISTS files")
                conn.execute("DROP TABLE IF EXISTS file_meta")
            self._ensure_schema(conn)

            now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

            if incremental:
                count = self._incremental_update(conn, files_data, now)
            else:
                count = self._full_rebuild(conn, files_data, now)

            conn.commit()
        finally:
            # Always release the database handle, even if scanning a file or
            # a statement failed part-way through.
            conn.close()
        return count

    def _full_rebuild(self, conn: sqlite3.Connection, files_data: list[dict], now: str) -> int:
        """Empty both tables and index every entry of *files_data*.

        Returns the number of files indexed.
        """
        conn.execute("DELETE FROM files")
        conn.execute("DELETE FROM file_meta")
        count = 0
        for entry in files_data:
            self._index_file(conn, entry, now)
            count += 1
        return count

    def _incremental_update(self, conn: sqlite3.Connection, files_data: list[dict], now: str) -> int:
        """Bring the index in line with *files_data* with minimal writes.

        Files that are indexed but no longer scanned are deleted; files that
        are new, or whose on-disk mtime is newer than the stored one, are
        re-indexed.

        Returns the number of files removed plus the number (re-)indexed.
        """
        current_files: dict[str, float] = {}
        file_data_map: dict[str, dict] = {}
        for entry in files_data:
            # NOTE(review): assumes entry["path"] is relative to project_path
            # (not to the configured scan root) -- confirm against scanner.
            abspath = os.path.join(self.project_path, entry["path"])
            current_files[entry["path"]] = os.path.getmtime(abspath)
            file_data_map[entry["path"]] = entry

        indexed = conn.execute("SELECT path, file_mtime FROM file_meta").fetchall()
        indexed_map = {row["path"]: row["file_mtime"] for row in indexed}

        count = 0

        for path in set(indexed_map) - set(current_files):
            conn.execute("DELETE FROM files WHERE path = ?", (path,))
            conn.execute("DELETE FROM file_meta WHERE path = ?", (path,))
            count += 1

        # Only the source file's mtime is compared here, so an edit that
        # touches nothing but the metadata sidecar does not trigger a
        # re-index of that file.
        for path, mtime in current_files.items():
            if path not in indexed_map or mtime > indexed_map[path]:
                self._index_file(conn, file_data_map[path], now, replace=True)
                count += 1

        return count

    def _index_file(self, conn: sqlite3.Connection, entry: dict, now: str, replace: bool = False):
        """Write one scanned file into both index tables.

        Args:
            conn: Open index connection; the caller commits.
            entry: Scanner record; ``"path"`` is required, ``"content"``
                defaults to an empty string.
            now: Timestamp string stored as ``index_built_at``.
            replace: Delete any existing ``files`` row for the path first.
                (``file_meta`` is always upserted.)
        """
        path = entry["path"]
        content = entry.get("content", "")
        abspath = os.path.join(self.project_path, path)
        meta = read_meta(abspath + META_EXTENSION) or {}
        mtime = os.path.getmtime(abspath)

        summary = meta.get("summary", "")
        tags = " ".join(meta.get("tags", []))
        public_api = " ".join(meta.get("public_api", []))

        if replace:
            conn.execute("DELETE FROM files WHERE path = ?", (path,))

        conn.execute(
            "INSERT INTO files (path, summary, tags, public_api, content) "
            "VALUES (?, ?, ?, ?, ?)",
            (path, summary, tags, public_api, content),
        )

        # Count lines, including a final line that lacks a trailing newline.
        line_count = content.count('\n')
        if content and not content.endswith('\n'):
            line_count += 1

        # Despite its name, the content_size column stores the line count.
        conn.execute(
            "INSERT OR REPLACE INTO file_meta "
            "(path, content_size, has_meta, is_init, file_mtime, index_built_at) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            (path, line_count, bool(meta), _is_init_file(path), mtime, now),
        )

    def _run_match(self, conn: sqlite3.Connection, fts_query: str, fetch_limit: int) -> list:
        """Run one FTS5 MATCH query and return up to *fetch_limit* rows."""
        return conn.execute(self._MATCH_SQL, (fts_query, fetch_limit)).fetchall()

    def search(self, query: str, limit: int = 10,
               exclude: list[str] | None = None) -> list[dict]:
        """Search the codebase and return ranked results.

        Stages:
            1. AND query -- every query token must match (best precision).
            2. If that finds nothing and there is more than one token, an OR
               query is run instead.
            3. Post-filter: excluded paths are dropped, and for queries of
               four or more tokens a row is kept only if at least 50% of the
               tokens occur (as substrings) in its content, summary or tags.
               Shorter queries are not ratio-filtered.

        Args:
            query: Free-text query; tokenized with ``_tokenize``.
            limit: Maximum number of results returned.
            exclude: Glob patterns to hide from results; defaults to the
                ``exclude`` entry of the project config.

        Returns:
            A list of dicts with keys ``path``, ``line_start``, ``line_end``,
            ``summary``, ``tags``, ``score`` and ``preview``, best first.
            Empty when there is no index or the query has no usable tokens.
        """
        if not os.path.exists(self.db_path):
            return []

        tokens = _tokenize(query)
        if not tokens:
            return []

        if exclude is None:
            exclude = self.config.get("exclude", [])

        # Over-fetch candidates because excludes and the ratio filter are
        # applied afterwards in Python.
        conn = self._get_connection()
        try:
            # Stage 1: try AND (best precision)
            rows = self._run_match(conn, " AND ".join(tokens), limit * 5)
            # Stage 2: fall back to OR if AND found nothing
            if not rows and len(tokens) > 1:
                rows = self._run_match(conn, " OR ".join(tokens), limit * 10)
        finally:
            conn.close()

        # Stage 3: post-filter by token coverage
        min_ratio = 0.5 if len(tokens) >= 4 else 0.0
        results = []
        for row in rows:
            if _matches_exclude(row["path"], exclude):
                continue

            content = row["content"] or ""
            ratio = _token_match_ratio(
                tokens, content + (row["summary"] or "") + (row["tags"] or "")
            )
            if ratio < min_ratio:
                continue

            score = self._compute_score(tokens, dict(row), ratio)
            if row["is_init"]:
                score *= 0.6  # __init__.py files are down-weighted
            results.append({
                "path": row["path"],
                "summary": row["summary"] or None,
                "tags": (row["tags"] or "").split(),
                "score": score,
            })

        results.sort(key=lambda r: r["score"], reverse=True)

        # Previews require reading the file from disk, so they are produced
        # only for the results that are actually returned.
        final = []
        for r in results[:limit]:
            preview, line_range = self._get_preview(r["path"], tokens)
            final.append({
                "path": r["path"],
                "line_start": line_range[0],
                "line_end": line_range[1],
                "summary": r["summary"],
                "tags": r["tags"],
                "score": r["score"],
                "preview": preview,
            })

        return final

    def _compute_score(self, tokens: list[str], row: dict, match_ratio: float = 1.0) -> float:
        """Compute the relevance score of one candidate row.

        Each query token contributes the weight of the best field it is
        found in: ``public_api`` 1.0, ``tags`` 0.8, ``summary`` 0.6, and 0.3
        otherwise (the row was already selected by the FTS5 query). The sum
        is multiplied by *match_ratio* and rounded to four decimals.

        The FTS5 ``rank`` column only orders and limits the candidate set in
        ``search()``; its value does not enter this score. The 0.6x
        ``__init__.py`` penalty is likewise applied in ``search()``.
        """
        public_api = (row.get("public_api") or "").lower()
        summary = (row.get("summary") or "").lower()
        tags = (row.get("tags") or "").lower()

        api_tokens = set(_tokenize(public_api))
        tag_tokens = set(tags.split())
        summary_tokens = set(_tokenize(summary))

        score = 0.0
        for token in tokens:
            if token in api_tokens:
                score += 1.0
            elif token in tag_tokens:
                score += 0.8
            elif token in summary_tokens:
                score += 0.6
            else:
                # Content (or path) match only -- base weight
                score += 0.3

        # Multiply by match ratio: files matching more query terms rank higher
        score *= match_ratio

        return round(score, 4)

    def _get_preview(self, relpath: str, tokens: list[str]) -> tuple[str, tuple[int, int]]:
        """Return a snippet around the line that matches the most tokens.

        The snippet spans from two lines before the best line to seven lines
        after it, clipped to the file. When no line contains any token the
        first line is treated as the best line.

        Returns:
            ``(snippet, (first_line, last_line))`` with 1-based inclusive line
            numbers, or ``("", (0, 0))`` if the file is missing, unreadable
            or empty.
        """
        filepath = os.path.join(self.project_path, relpath)
        if not os.path.exists(filepath):
            return "", (0, 0)

        try:
            # errors="replace" keeps undecodable bytes from raising, so only
            # OS-level failures (permissions, directory, race with delete)
            # remain to be handled.
            with open(filepath, "r", encoding="utf-8", errors="replace") as f:
                lines = f.readlines()
        except OSError:
            return "", (0, 0)

        if not lines:
            return "", (0, 0)

        best_line = 0
        best_matches = 0
        for i, line in enumerate(lines):
            line_lower = line.lower()
            matches = sum(1 for t in tokens if t in line_lower)
            if matches > best_matches:  # strict: the earliest best line wins
                best_matches = matches
                best_line = i

        start = max(0, best_line - 2)
        end = min(len(lines), best_line + 8)
        snippet = "".join(lines[start:end])

        return snippet, (start + 1, end)

    @staticmethod
    def _format_age(seconds: int) -> str:
        """Render an age in whole seconds as ``"Ns ago"``/``m``/``h``/``d``."""
        if seconds < 60:
            return f"{seconds}s ago"
        if seconds < 3600:
            return f"{seconds // 60}m ago"
        if seconds < 86400:
            return f"{seconds // 3600}h ago"
        return f"{seconds // 86400}d ago"

    def status(self) -> dict:
        """Summarize the index: file counts, sidecar coverage, lines and age.

        Returns a dict with keys ``project_path``, ``files_indexed``,
        ``files_with_meta``, ``files_without_meta``, ``total_lines`` and
        ``index_age``. When no index database exists the counts are zero and
        ``index_age`` is ``"No index yet"``.
        """
        result = {
            "project_path": self.project_path,
            "files_indexed": 0,
            "files_with_meta": 0,
            "files_without_meta": 0,
            "total_lines": 0,
            "index_age": "No index yet",
        }

        if not os.path.exists(self.db_path):
            return result

        # Read the modification time before connecting so that opening the
        # database cannot influence the reported age.
        mtime = os.path.getmtime(self.db_path)

        conn = self._get_connection()
        try:
            row = conn.execute("SELECT COUNT(*) as cnt FROM files").fetchone()
            result["files_indexed"] = row["cnt"] if row else 0

            row = conn.execute(
                "SELECT COUNT(*) as cnt FROM file_meta WHERE has_meta = 1"
            ).fetchone()
            result["files_with_meta"] = row["cnt"] if row else 0

            result["files_without_meta"] = (
                result["files_indexed"] - result["files_with_meta"]
            )

            # SUM() yields NULL on an empty table, hence the ``or 0``.
            row = conn.execute("SELECT SUM(content_size) as total FROM file_meta").fetchone()
            result["total_lines"] = row["total"] or 0
        finally:
            conn.close()

        # Use the full elapsed time: timedelta.seconds alone drops whole days
        # and would report a two-day-old index as only hours old.
        age = datetime.now(timezone.utc) - datetime.fromtimestamp(mtime, timezone.utc)
        result["index_age"] = self._format_age(max(0, int(age.total_seconds())))

        return result
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: codexlr8
3
+ Version: 0.0.1
4
+ Summary: A codebase search engine for LLM coding agents
5
+ Author-email: Sadig Akhund <sadigaxund@gmail.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/sadigaxund/codexlr8
8
+ Project-URL: Repository, https://github.com/sadigaxund/codexlr8
9
+ Project-URL: Issues, https://github.com/sadigaxund/codexlr8/issues
10
+ Keywords: code-search,llm,agent,navigation,mcp
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: click>=8.0
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: mcp>=1.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # CodeXLR8
31
+
32
+ [![PyPI version](https://img.shields.io/pypi/v/codexlr8)](https://pypi.org/project/codexlr8/)
33
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org/downloads/)
34
+ [![License](https://img.shields.io/badge/license-Apache%202.0-green)](LICENSE)
35
+ [![CI](https://github.com/sadigaxund/codexlr8/actions/workflows/test.yml/badge.svg)](https://github.com/sadigaxund/codexlr8/actions)
36
+
37
+ A codebase search engine for LLM coding agents. **One query, precise results, no noise.**
38
+
39
+ ## Setup
40
+
41
+ ```bash
42
+ pip install codexlr8
43
+ codexlr8 setup
44
+ ```
45
+
46
+ `setup` auto-detects MCP clients (Claude Code, Cursor) and injects the server config, then walks you through project configuration. After setup, build the search index:
47
+
48
+ ```bash
49
+ codexlr8 index .
50
+ ```
51
+
52
+ Your agents now have `codebase_search` and `codebase_index` tools. Search from the CLI yourself:
53
+
54
+ ```bash
55
+ codexlr8 search . "login auth"
56
+ # 1. auth/session.py:14-27 [score: 1.60]
57
+ # meta: User authentication — login, logout, session management
58
+ # tags: auth, login, session, security
59
+ ```
60
+
61
+ ## How It Works
62
+
63
+ CodeXLR8 indexes your codebase into an SQLite FTS5 database. Optional `.meta.yaml` sidecar files are indexed together with the source files and boost ranking precision:
64
+
65
+ | Layer | Source | Boost |
66
+ |---|---|---|
67
+ | 1 | Raw file content (function names, variables, comments, docstrings) | FTS5 BM25 base |
68
+ | 2 | `.meta.yaml` `summary` + `tags` | 0.6× – 0.8× |
69
+ | 3 | `.meta.yaml` `public_api` | 1.0× (strongest) |
70
+
71
+ Search uses AND semantics (like Google): all query tokens must match. If that returns no results, it falls back to OR. For queries of four or more tokens, results that contain fewer than 50% of the query tokens are dropped.
72
+
73
+ ## .meta.yaml Sidecars
74
+
75
+ Optional YAML files next to source files, created by `codexlr8 init`:
76
+
77
+ ```yaml
78
+ public_api: [login, logout, reset_password]
79
+ summary: "User auth: login, session, password reset"
80
+ tags: [auth, security, session]
81
+ invariants:
82
+ - "db.connect() must be called first"
83
+ ```
84
+
85
+ Files without `.meta.yaml` still get indexed — metadata just produces higher ranking scores.
86
+
87
+ ## Configuration
88
+
89
+ Optional `.codexlr8.yaml` at the project root:
90
+
91
+ ```yaml
92
+ root: "."
93
+ include: [] # scope: only scan these
94
+ exclude: # skip these
95
+ - tests/*
96
+ - test_*
97
+ extensions: # file types to index
98
+ - .py
99
+ - .js
100
+ ignore_dirs: # skip entirely
101
+ - .git
102
+ - __pycache__
103
+ ```
104
+
105
+ All fields have defaults. Use `codexlr8 setup` to create one interactively, or edit by hand.
106
+
107
+ ## Agent Integration
108
+
109
+ Works with **Claude Code, Cursor, Windsurf, Continue.dev** and any MCP-compatible client.
110
+
111
+ `codexlr8 setup` auto-detects installed clients and offers to inject the MCP server config. For manual setup, add this to your client's config:
112
+
113
+ ```json
114
+ {
115
+ "mcpServers": {
116
+ "codexlr8": {
117
+ "command": "uvx",
118
+ "args": ["codexlr8", "mcp-server"]
119
+ }
120
+ }
121
+ }
122
+ ```
123
+
124
+ Tools available to agents:
125
+
126
+ | Tool | Description |
127
+ |---|---|
128
+ | `codebase_search(query, path?, limit?, exclude?)` | Search the codebase, return ranked results |
129
+ | `codebase_index(path?, incremental?, exclude?)` | Build or update the search index |
130
+
131
+ The included agent skill ([SKILL.md](SKILL.md)) teaches agents to search before reading files, maintain `.meta.yaml` sidecars, and keep the index fresh.
132
+
133
+ ## Commands
134
+
135
+ ```
136
+ codexlr8 setup Interactive project + MCP config
137
+ codexlr8 scan <path> List source files and line counts
138
+ codexlr8 init <path> Bootstrap .meta.yaml sidecars
139
+ codexlr8 index <path> Build the search index
140
+ codexlr8 search <path> <q> Search the codebase
141
+ codexlr8 status <path> Show index coverage and age
142
+ codexlr8 install-skill Install agent skill for Claude Code
143
+ codexlr8 mcp-config Print MCP client config JSON
144
+ ```
145
+
146
+ ## Contributing
147
+
148
+ See [AGENTS.md](AGENTS.md) for principles and development guidelines.
149
+
150
+ ## License
151
+
152
+ Apache 2.0. See [LICENSE](LICENSE).
@@ -0,0 +1,13 @@
1
+ codexlr8/__init__.py,sha256=QddVPI3SmCeQq8QyXSCgrO5tBQTnWaigxknfP-iKzao,90
2
+ codexlr8/cli.py,sha256=yJTm_Z0BJU1t4PTTBMjgVScmEy9osF5dYfiTYi2Tb9U,18216
3
+ codexlr8/config.py,sha256=zlsxAnRhbpK-SJ2uN-t5O14mRX5wbzMWWB4Jg0CHPXw,1333
4
+ codexlr8/mcp_server.py,sha256=VqvtUpHMv6XRuRRjU2cuQuT-hnAiGtoYgz3W-bSsnaU,5584
5
+ codexlr8/meta.py,sha256=OyjAqD6OBdaC3gfAGpiuE9QSpHOa2gKS4jPUMXsRSw4,3270
6
+ codexlr8/scanner.py,sha256=AyTO5EtDlOwWXjtnd5_7kMUx22XEP9X4kl0UX4khoro,2882
7
+ codexlr8/search.py,sha256=ZmwkAnB02ZbFUcp1zfp5DUIYOYwLc1ubT-TOrcCL7iI,13851
8
+ codexlr8-0.0.1.dist-info/licenses/LICENSE,sha256=wAtXn9YalS-tNHgydkrIPFDouPpqARf2ObixOWadQUo,11342
9
+ codexlr8-0.0.1.dist-info/METADATA,sha256=7HxYFaju53MLFuLI6_v-vmEg7Nx9WU1DKgekL6yMIuE,4942
10
+ codexlr8-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
11
+ codexlr8-0.0.1.dist-info/entry_points.txt,sha256=hOg94qhE4Emf4OyHUye9m5_KhfF-hcqzOkE5gfXbVrQ,87
12
+ codexlr8-0.0.1.dist-info/top_level.txt,sha256=GxUlzPqgBl_1BIcFTCuzzD_hR3vX6DweiueEksKq0zg,9
13
+ codexlr8-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ codexlr8 = codexlr8.cli:main
3
+ codexlr8-mcp = codexlr8.mcp_server:main