everything-mcp 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
everything_mcp/server.py CHANGED
@@ -1,745 +1,745 @@
1
- """
2
- Everything MCP Server The definitive MCP server for voidtools Everything.
3
-
4
- Provides 5 tools for AI agents to search and analyse files at lightning speed
5
- using voidtools Everything's real-time NTFS index.
6
-
7
- Compatible with: Claude Code, Codex, Gemini, Kimi, Qwen, Cursor, Windsurf,
8
- and any MCP-compatible client using stdio transport.
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import asyncio
14
- import json
15
- import logging
16
- import os
17
- import sys
18
- from contextlib import asynccontextmanager
19
- from datetime import datetime
20
- from pathlib import Path
21
-
22
- from mcp.server.fastmcp import FastMCP
23
- from pydantic import BaseModel, ConfigDict, Field, field_validator
24
-
25
- from everything_mcp.backend import (
26
- FILE_TYPES,
27
- SORT_MAP,
28
- TIME_PERIODS,
29
- EverythingBackend,
30
- build_recent_query,
31
- build_type_query,
32
- human_size,
33
- )
34
- from everything_mcp.config import EverythingConfig
35
-
36
- # ── Logging (stderr required for stdio MCP transport) ──────────────────
37
-
38
- logging.basicConfig(
39
- level=logging.INFO,
40
- format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
41
- stream=sys.stderr,
42
- )
43
- logger = logging.getLogger("everything_mcp")
44
-
45
- # ── Globals (initialised during lifespan) ─────────────────────────────────
46
-
47
- _backend: EverythingBackend | None = None
48
- _config: EverythingConfig | None = None
49
-
50
-
51
- @asynccontextmanager
52
- async def lifespan(server):
53
- """Initialise Everything backend on startup, cleanup on shutdown."""
54
- global _backend, _config
55
-
56
- logger.info("Everything MCP starting auto-detecting Everything installation…")
57
- _config = EverythingConfig.auto_detect()
58
-
59
- if _config.is_valid:
60
- logger.info("Connected: %s (es: %s)", _config.version_info, _config.es_path)
61
- else:
62
- for err in _config.errors:
63
- logger.error(" %s", err)
64
- for warn in _config.warnings:
65
- logger.warning(" %s", warn)
66
-
67
- _backend = EverythingBackend(_config)
68
- try:
69
- yield
70
- finally:
71
- logger.info("Everything MCP shutting down.")
72
-
73
-
74
- # ── Server instance ───────────────────────────────────────────────────────
75
-
76
- mcp = FastMCP("everything_mcp", lifespan=lifespan)
77
-
78
-
79
- def _get_backend() -> EverythingBackend:
80
- """Return the backend or raise with a clear message."""
81
- if _backend is None:
82
- raise RuntimeError("Everything MCP not initialised")
83
- if not _config or not _config.is_valid:
84
- errors = _config.errors if _config else ["Not initialised"]
85
- raise RuntimeError("Everything is not available. " + " ".join(errors))
86
- return _backend
87
-
88
-
89
- # ═══════════════════════════════════════════════════════════════════════════
90
- # Tool 1: everything_search The Workhorse
91
- # ═══════════════════════════════════════════════════════════════════════════
92
-
93
-
94
- class SearchInput(BaseModel):
95
- """Input schema for ``everything_search``."""
96
-
97
- model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
98
-
99
- query: str = Field(
100
- ...,
101
- description=(
102
- "Search query using Everything syntax. Examples: "
103
- "'*.py' (all Python files), "
104
- "'ext:py;js path:C:\\Projects' (Python/JS in Projects), "
105
- "'size:>10mb ext:log' (large logs), "
106
- "'dm:today ext:py' (Python files modified today), "
107
- "'content:TODO ext:py' (files containing TODO requires content indexing), "
108
- "'\"exact phrase\"' (exact filename match), "
109
- "'regex:test_\\d+\\.py$' (regex). "
110
- "Combine with space (AND) or | (OR). Prefix ! to exclude."
111
- ),
112
- min_length=1,
113
- max_length=2000,
114
- )
115
- max_results: int = Field(
116
- default=50,
117
- description="Maximum results to return (1-500)",
118
- ge=1, le=500,
119
- )
120
- sort: str = Field(
121
- default="date-modified-desc",
122
- description=(
123
- "Sort order. Options: "
124
- + ", ".join(sorted(SORT_MAP.keys()))
125
- ),
126
- )
127
-
128
- @field_validator("sort")
129
- @classmethod
130
- def validate_sort(cls, v: str) -> str:
131
- if v not in SORT_MAP:
132
- raise ValueError(f"Invalid sort option '{v}'. Valid: {', '.join(sorted(SORT_MAP.keys()))}")
133
- return v
134
-
135
- match_case: bool = Field(default=False, description="Case-sensitive search")
136
- match_whole_word: bool = Field(default=False, description="Match whole words only")
137
- match_regex: bool = Field(default=False, description="Treat query as regex")
138
- match_path: bool = Field(default=False, description="Match against full path, not just filename")
139
- offset: int = Field(default=0, description="Skip N results (pagination)", ge=0)
140
-
141
-
142
- @mcp.tool(
143
- name="everything_search",
144
- annotations={
145
- "title": "Search Files & Folders",
146
- "readOnlyHint": True,
147
- "destructiveHint": False,
148
- "idempotentHint": True,
149
- "openWorldHint": False,
150
- },
151
- )
152
- async def everything_search(params: SearchInput) -> str:
153
- """Search for files and folders instantly using voidtools Everything.
154
-
155
- Leverages Everything's real-time NTFS index for sub-millisecond search
156
- across all local and mapped drives. Supports wildcards, regex, size/date
157
- filters, extension filters, path restrictions, and content search.
158
- """
159
- try:
160
- backend = _get_backend()
161
- results = await backend.search(
162
- query=params.query,
163
- max_results=params.max_results,
164
- sort=params.sort,
165
- match_case=params.match_case,
166
- match_whole_word=params.match_whole_word,
167
- match_regex=params.match_regex,
168
- match_path=params.match_path,
169
- offset=params.offset,
170
- )
171
- return _format_search_results(results, params.query, params.max_results, params.offset)
172
- except Exception as exc:
173
- return f"Error: {exc}"
174
-
175
-
176
- # ═══════════════════════════════════════════════════════════════════════════
177
- # Tool 2: everything_search_by_type Category Search
178
- # ═══════════════════════════════════════════════════════════════════════════
179
-
180
-
181
- class SearchByTypeInput(BaseModel):
182
- """Input schema for ``everything_search_by_type``."""
183
-
184
- model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")
185
-
186
- file_type: str = Field(
187
- ...,
188
- description="File type category: " + ", ".join(sorted(FILE_TYPES.keys())),
189
- )
190
- query: str = Field(
191
- default="",
192
- description="Additional search filter (e.g. 'config' to narrow results)",
193
- )
194
- path: str = Field(
195
- default="",
196
- description="Restrict search to this directory (e.g. 'C:\\Projects')",
197
- )
198
- max_results: int = Field(default=50, ge=1, le=500)
199
- sort: str = Field(default="date-modified-desc")
200
-
201
- @field_validator("sort")
202
- @classmethod
203
- def validate_sort(cls, v: str) -> str:
204
- if v not in SORT_MAP:
205
- raise ValueError(f"Invalid sort option '{v}'. Valid: {', '.join(sorted(SORT_MAP.keys()))}")
206
- return v
207
-
208
-
209
- @mcp.tool(
210
- name="everything_search_by_type",
211
- annotations={
212
- "title": "Search by File Type Category",
213
- "readOnlyHint": True,
214
- "destructiveHint": False,
215
- "idempotentHint": True,
216
- "openWorldHint": False,
217
- },
218
- )
219
- async def everything_search_by_type(params: SearchByTypeInput) -> str:
220
- """Search for files by type category.
221
-
222
- Categories: audio, video, image, document, code, archive, executable,
223
- font, 3d, data. Each maps to a curated list of file extensions.
224
- """
225
- try:
226
- backend = _get_backend()
227
- query = build_type_query(params.file_type, params.query, params.path)
228
- results = await backend.search(
229
- query=query,
230
- max_results=params.max_results,
231
- sort=params.sort,
232
- )
233
- label = f"type:{params.file_type}" + (f" {params.query}" if params.query else "")
234
- return _format_search_results(results, label, params.max_results)
235
- except Exception as exc:
236
- return f"Error: {exc}"
237
-
238
-
239
- # ═══════════════════════════════════════════════════════════════════════════
240
- # Tool 3: everything_find_recent What Changed?
241
- # ═══════════════════════════════════════════════════════════════════════════
242
-
243
-
244
class FindRecentInput(BaseModel):
    """Input schema for ``everything_find_recent``."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    period: str = Field(
        default="1hour",
        description=(
            "How recent. Options: "
            # TIME_PERIODS is an insertion-ordered dict, so joining it directly
            # preserves the intended ordering. The previous
            # sorted(keys, key=list(keys).index) round-trip was an O(n^2)
            # identity permutation producing the exact same string.
            + ", ".join(TIME_PERIODS)
            + ". Or raw Everything syntax like 'last2hours'."
        ),
    )
    path: str = Field(default="", description="Restrict to this directory path")
    extensions: str = Field(
        default="",
        description="Filter by extensions, e.g. 'py,js,ts' or 'py;js;ts'",
    )
    query: str = Field(default="", description="Additional search filter")
    max_results: int = Field(default=50, ge=1, le=500)
264
-
265
-
266
@mcp.tool(
    name="everything_find_recent",
    annotations={
        "title": "Find Recently Modified Files",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_find_recent(params: FindRecentInput) -> str:
    """Find files modified within a recent time period.

    Ideal for discovering what changed in a project, tracking recent
    downloads, finding today's log files, etc. Sorted newest-first.
    """
    try:
        backend = _get_backend()

        # Compose the time/path/extension filter, then append any free-form term.
        parts = [build_recent_query(params.period, params.path, params.extensions)]
        if params.query:
            parts.append(params.query)

        hits = await backend.search(
            query=" ".join(parts),
            max_results=params.max_results,
            sort="date-modified-desc",
        )
        return _format_search_results(hits, f"recent ({params.period})", params.max_results)
    except Exception as exc:
        return f"Error: {exc}"
297
-
298
-
299
- # ═══════════════════════════════════════════════════════════════════════════
300
- # Tool 4: everything_file_details Deep Inspection
301
- # ═══════════════════════════════════════════════════════════════════════════
302
-
303
-
304
class FileDetailsInput(BaseModel):
    """Parameters accepted by the ``everything_file_details`` tool."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # Between 1 and 20 paths per call keeps response size bounded.
    paths: list[str] = Field(
        ...,
        description="File/folder paths to inspect (1-20)",
        min_length=1,
        max_length=20,
    )
    # 0 disables the content preview entirely.
    preview_lines: int = Field(
        default=0,
        description="Lines of text content to preview (0 = none, max 200)",
        ge=0,
        le=200,
    )
320
-
321
-
322
@mcp.tool(
    name="everything_file_details",
    annotations={
        "title": "Get File Details & Content Preview",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_file_details(params: FileDetailsInput) -> str:
    """Get detailed metadata and optional content preview for specific files.

    Returns: full path, size, dates, type, permissions, hidden status.
    For directories: item count, subdirectories, file listing.
    For text files with preview_lines > 0: first N lines of content.
    """
    # stat()/scandir()/open() are blocking; run them on a worker thread so
    # the asyncio event loop keeps serving other MCP requests.
    return await asyncio.to_thread(_get_file_details_sync, params.paths, params.preview_lines)
345
-
346
-
347
def _get_file_details_sync(paths: list[str], preview_lines: int) -> str:
    """Synchronous implementation of file details gathering.

    One JSON object per path, joined with "---" separators.
    """
    reports = [
        json.dumps(_describe_path(fp, preview_lines), indent=2, ensure_ascii=False)
        for fp in paths
    ]
    return "\n---\n".join(reports)


def _describe_path(filepath: str, preview_lines: int) -> dict:
    """Collect metadata (and an optional text preview) for one path."""
    p = Path(filepath)
    info: dict = {"path": str(p)}

    if not p.exists():
        info["error"] = "File not found"
        return info

    try:
        stat = p.stat()
        info["name"] = p.name
        is_dir = p.is_dir()
        info["type"] = "folder" if is_dir else "file"

        if not is_dir:
            info["size"] = stat.st_size
            info["size_human"] = human_size(stat.st_size)
            info["extension"] = p.suffix.lstrip(".").lower()

        stamp = "%Y-%m-%d %H:%M:%S"
        info["date_modified"] = datetime.fromtimestamp(stat.st_mtime).strftime(stamp)
        info["date_created"] = datetime.fromtimestamp(stat.st_ctime).strftime(stamp)
        info["date_accessed"] = datetime.fromtimestamp(stat.st_atime).strftime(stamp)
        info["read_only"] = not os.access(filepath, os.W_OK)

        # Windows exposes the hidden bit via st_file_attributes (0x2);
        # elsewhere fall back to the Unix dotfile convention.
        file_attrs = getattr(stat, "st_file_attributes", 0)
        info["hidden"] = bool(file_attrs & 0x2) if file_attrs else p.name.startswith(".")

        if is_dir:
            # Bounded directory listing.
            try:
                info.update(_summarize_directory(p))
            except PermissionError:
                info["listing_error"] = "Permission denied"
            except OSError as exc:
                info["listing_error"] = str(exc)
        elif preview_lines > 0:
            # Content preview for text files only.
            preview = _read_preview(p, preview_lines)
            if preview is not None:
                info["preview"] = preview

    except PermissionError:
        info["error"] = "Permission denied"
    except OSError as exc:
        info["error"] = str(exc)

    return info
402
-
403
-
404
- # ═══════════════════════════════════════════════════════════════════════════
405
- # Tool 5: everything_count_stats Quick Analytics
406
- # ═══════════════════════════════════════════════════════════════════════════
407
-
408
-
409
class CountStatsInput(BaseModel):
    """Input schema for ``everything_count_stats``."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    query: str = Field(
        ...,
        description=(
            "Search query to count/measure. Same syntax as everything_search. "
            "Examples: 'ext:py path:C:\\Projects', 'ext:log size:>1mb', '*.tmp'"
        ),
        min_length=1,
        max_length=2000,
    )
    include_size: bool = Field(
        default=True,
        description="Also calculate total size of all matching files",
    )
    breakdown_by_extension: bool = Field(
        default=False,
        # Fixed stale text: the implementation samples up to 500 results
        # (sample_limit in everything_count_stats), not 200.
        description="Break down count and size by file extension (samples top 500 results)",
    )
431
-
432
-
433
@mcp.tool(
    name="everything_count_stats",
    annotations={
        "title": "Count & Size Statistics",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_count_stats(params: CountStatsInput) -> str:
    """Get count and size statistics for files matching a query.

    Fast way to understand the scope of a query without listing every file.
    Optionally breaks down by extension for a high-level overview.
    """
    try:
        backend = _get_backend()
        output: dict = {"query": params.query}

        # Total match count (older es.exe builds may lack the flag).
        try:
            output["total_count"] = await backend.count(params.query)
        except Exception:
            output["count_note"] = "Count not available (es.exe may not support -get-result-count)"

        # Aggregate size of every match.
        if params.include_size:
            try:
                total_size = await backend.get_total_size(params.query)
                if total_size >= 0:
                    output["total_size"] = total_size
                    output["total_size_human"] = human_size(total_size)
            except Exception:
                output["size_note"] = "Total size not available"

        # Per-extension breakdown over a bounded sample of results.
        if params.breakdown_by_extension:
            try:
                sample_limit = 500
                results = await backend.search(
                    params.query,
                    max_results=sample_limit,
                    sort="name",
                )
                breakdown, sampled_files = _extension_breakdown(results)
                output["extension_breakdown"] = breakdown
                output["breakdown_note"] = (
                    f"Based on {sampled_files} sampled files from first {len(results)} "
                    f"results (max sample {sample_limit}); directories excluded."
                )
            except Exception as exc:
                output["breakdown_error"] = str(exc)

        return json.dumps(output, indent=2, ensure_ascii=False)
    except Exception as exc:
        return f"Error: {exc}"


def _extension_breakdown(results) -> tuple[dict, int]:
    """Aggregate per-extension count/size over non-directory results.

    Returns (top-30 extensions by count, number of files sampled).
    """
    per_ext: dict[str, dict] = {}
    sampled_files = 0
    for r in results:
        if r.is_dir:
            continue
        sampled_files += 1
        key = r.extension or "(no extension)"
        bucket = per_ext.setdefault(key, {"count": 0, "total_size": 0})
        bucket["count"] += 1
        if r.size >= 0:  # negative size means "unknown" - don't pollute totals
            bucket["total_size"] += r.size

    ranked = sorted(per_ext.items(), key=lambda kv: kv[1]["count"], reverse=True)
    breakdown = {
        ext: {
            "count": stats["count"],
            "total_size": stats["total_size"],
            "total_size_human": human_size(stats["total_size"]),
        }
        for ext, stats in ranked[:30]
    }
    return breakdown, sampled_files
509
-
510
-
511
- # ═══════════════════════════════════════════════════════════════════════════
512
- # Resource: Health Check
513
- # ═══════════════════════════════════════════════════════════════════════════
514
-
515
-
516
@mcp.resource("everything://status")
async def get_status() -> str:
    """Get the current status of the Everything connection as JSON."""
    if _backend:
        payload = await _backend.health_check()
    else:
        # Lifespan has not run (or failed before constructing the backend).
        payload = {"status": "not initialised"}
    return json.dumps(payload, indent=2)
524
-
525
-
526
- # ═══════════════════════════════════════════════════════════════════════════
527
- # Helpers
528
- # ═══════════════════════════════════════════════════════════════════════════
529
-
530
-
531
def _format_search_results(
    results: list,
    query_label: str,
    max_results: int,
    offset: int = 0,
) -> str:
    """Render search results as a compact, LLM-friendly text listing.

    Each hit becomes one "[DIR]/[FILE] path (size, date)" line; a pagination
    hint is appended when the result set filled the requested page.
    """
    if not results:
        return f"No results found for: {query_label}"

    header = f"Found {len(results)} results for: {query_label}"
    if offset > 0:
        header += f" (offset: {offset})"

    body: list[str] = []
    for item in results:
        # Accept either result objects (with .to_dict()) or plain dicts.
        record = item.to_dict() if hasattr(item, "to_dict") else item
        kind_tag = "[DIR]" if record.get("type", "file") == "folder" else "[FILE]"
        details = [
            value
            for value in (record.get("size_human", ""), record.get("date_modified", ""))
            if value
        ]
        suffix = f" ({', '.join(details)})" if details else ""
        body.append(f"  {kind_tag} {record.get('path', '?')}{suffix}")

    lines = [header, "", *body]
    if len(results) >= max_results:
        lines.append("")
        lines.append(
            f"Showing first {max_results} results. "
            "Use 'offset' to paginate or refine the query."
        )
    return "\n".join(lines)
571
-
572
-
573
- # ── Text file preview ─────────────────────────────────────────────────────
574
-
575
- # Extensions we can safely read as text
576
- _TEXT_EXTENSIONS: frozenset[str] = frozenset({
577
- # Text & docs
578
- "txt", "md", "mdx", "rst", "adoc", "org",
579
- # Python
580
- "py", "pyi", "pyw", "pyx", "pxd",
581
- # JavaScript/TypeScript
582
- "js", "mjs", "cjs", "ts", "mts", "cts", "jsx", "tsx",
583
- # Web frameworks
584
- "vue", "svelte", "astro", "marko",
585
- # C family
586
- "c", "cpp", "cc", "cxx", "h", "hpp", "hxx", "cs", "java", "m", "mm",
587
- # Systems languages
588
- "go", "rs", "rb", "php", "swift", "kt", "kts", "scala", "r", "lua",
589
- # Shell
590
- "sh", "bash", "zsh", "fish", "ps1", "psm1", "psd1", "bat", "cmd",
591
- # Database & query
592
- "sql", "prisma", "graphql", "gql",
593
- # Web
594
- "html", "htm", "css", "scss", "sass", "less", "styl", "pcss",
595
- # Data formats
596
- "json", "jsonc", "json5", "jsonl", "ndjson",
597
- "xml", "xsl", "xslt", "xsd", "svg", "rss", "atom",
598
- "yaml", "yml", "toml", "ini", "cfg", "conf", "env", "properties",
599
- "csv", "tsv", "log",
600
- # Config files (with extensions)
601
- "gitignore", "gitattributes", "gitmodules", "npmrc", "nvmrc", "yarnrc",
602
- "dockerignore", "editorconfig", "eslintrc", "prettierrc", "babelrc",
603
- "stylelintrc", "browserslistrc",
604
- # Build tools
605
- "makefile", "dockerfile", "cmake", "gradle", "sbt", "cabal", "bazel",
606
- # Academic
607
- "tex", "bib", "cls", "sty",
608
- # Hardware
609
- "asm", "s", "v", "sv", "vhd", "vhdl",
610
- # Modern languages
611
- "dart", "zig", "nim", "hx", "odin", "jai", "vlang",
612
- # Functional
613
- "ex", "exs", "erl", "hrl", "hs", "lhs", "ml", "mli", "fs", "fsi", "fsx",
614
- "clj", "cljs", "cljc", "edn", "lisp", "el", "rkt", "scm", "fnl",
615
- # Other
616
- "pro", "pri", "qml", "proto", "thrift", "capnp",
617
- "tf", "hcl", "nix", "dhall", "jsonnet", "cue",
618
- "http", "rest", "lock",
619
- })
620
-
621
- # Filenames (no extension) that are always text
622
- _TEXT_FILENAMES: frozenset[str] = frozenset({
623
- "makefile", "dockerfile", "cmakelists.txt", "rakefile", "gemfile",
624
- "procfile", "vagrantfile", "brewfile", "justfile", "taskfile",
625
- "license", "licence", "readme", "authors", "contributors",
626
- "changelog", "changes", "history", "news", "todo",
627
- })
628
-
629
- _MAX_DIR_SCAN_ITEMS = 10_000
630
- _MAX_SUBDIRECTORY_SAMPLE = 20
631
- _MAX_FILE_SAMPLE = 30
632
- _MAX_PREVIEW_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
633
- _MAX_PREVIEW_CHARS = 50_000
634
-
635
-
636
def _summarize_directory(path: Path) -> dict[str, object]:
    """Return bounded directory metadata without loading all entries in memory.

    Counts up to _MAX_DIR_SCAN_ITEMS entries and keeps small sorted samples
    of subdirectory and file names.
    """
    subdir_sample: list[str] = []
    file_sample: list[str] = []
    seen = 0
    hit_cap = False

    with os.scandir(path) as it:
        for entry in it:
            if seen >= _MAX_DIR_SCAN_ITEMS:
                hit_cap = True
                break
            seen += 1
            try:
                if entry.is_dir(follow_symlinks=False):
                    if len(subdir_sample) < _MAX_SUBDIRECTORY_SAMPLE:
                        subdir_sample.append(entry.name)
                elif entry.is_file(follow_symlinks=False) and len(file_sample) < _MAX_FILE_SAMPLE:
                    file_sample.append(entry.name)
            except OSError:
                # Entry vanished or is unreadable; skip it but keep counting.
                continue

    summary: dict[str, object] = {
        "item_count": seen,
        "subdirectories": sorted(subdir_sample),
        "files_sample": sorted(file_sample),
    }
    if hit_cap:
        summary["note"] = (
            f"Directory scan capped at {_MAX_DIR_SCAN_ITEMS} entries; samples may be incomplete"
        )
    elif seen > (_MAX_SUBDIRECTORY_SAMPLE + _MAX_FILE_SAMPLE):
        summary["note"] = f"Showing first items of {seen} total"
    return summary
670
-
671
-
672
- def _read_preview(path: Path, max_lines: int) -> str | None:
673
- """Read the first *max_lines* lines of a text file.
674
-
675
- Returns ``None`` for binary files or files that can't be read.
676
- """
677
- try:
678
- if path.stat().st_size > _MAX_PREVIEW_FILE_SIZE:
679
- return "(file too large for preview)"
680
- except OSError:
681
- return None
682
-
683
- ext = path.suffix.lstrip(".").lower()
684
- name_lower = path.name.lower()
685
- stem_lower = path.stem.lower()
686
-
687
- is_text = (
688
- ext in _TEXT_EXTENSIONS
689
- or name_lower in _TEXT_FILENAMES
690
- or stem_lower in _TEXT_FILENAMES
691
- or name_lower.startswith(".") # dotfiles are usually text
692
- )
693
-
694
- if not is_text:
695
- # Sniff for binary content
696
- try:
697
- with open(path, "rb") as f:
698
- chunk = f.read(512)
699
- if b"\x00" in chunk:
700
- return None # binary
701
- is_text = True
702
- except (OSError, PermissionError):
703
- return None
704
-
705
- if not is_text:
706
- return None
707
-
708
- # Read lines with encoding fallback
709
- for encoding in ("utf-8", "utf-8-sig", "latin-1"):
710
- try:
711
- with open(path, encoding=encoding) as f:
712
- lines: list[str] = []
713
- total_chars = 0
714
- truncated = False
715
- for _ in range(max_lines):
716
- remaining = _MAX_PREVIEW_CHARS - total_chars
717
- if remaining <= 0:
718
- truncated = True
719
- break
720
-
721
- # Bound each read to avoid huge single-line payloads.
722
- line = f.readline(remaining + 1)
723
- if not line:
724
- break
725
-
726
- if len(line) > remaining:
727
- line = line[:remaining]
728
- truncated = True
729
-
730
- total_chars += len(line)
731
- lines.append(line.rstrip("\n\r"))
732
-
733
- if total_chars >= _MAX_PREVIEW_CHARS:
734
- truncated = True
735
- break
736
-
737
- if truncated:
738
- lines.append("... [preview truncated]")
739
- return "\n".join(lines)
740
- except UnicodeDecodeError:
741
- continue
742
- except (OSError, PermissionError):
743
- return None
744
-
745
- return "(unable to decode file content)"
1
+ """
2
+ Everything MCP Server - The definitive MCP server for voidtools Everything.
3
+
4
+ Provides 5 tools for AI agents to search and analyse files at lightning speed
5
+ using voidtools Everything's real-time NTFS index.
6
+
7
+ Compatible with: Claude Code, Codex, Gemini, Kimi, Qwen, Cursor, Windsurf,
8
+ and any MCP-compatible client using stdio transport.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ import logging
16
+ import os
17
+ import sys
18
+ from contextlib import asynccontextmanager
19
+ from datetime import datetime
20
+ from pathlib import Path
21
+
22
+ from mcp.server.fastmcp import FastMCP
23
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
24
+
25
+ from everything_mcp.backend import (
26
+ FILE_TYPES,
27
+ SORT_MAP,
28
+ TIME_PERIODS,
29
+ EverythingBackend,
30
+ build_recent_query,
31
+ build_type_query,
32
+ human_size,
33
+ )
34
+ from everything_mcp.config import EverythingConfig
35
+
36
# ── Logging (stderr - required for stdio MCP transport) ──────────────────

# stdout carries the MCP stdio protocol, so every log record must go to stderr.
_LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, stream=sys.stderr)
logger = logging.getLogger("everything_mcp")

# ── Globals (initialised during lifespan) ─────────────────────────────────

# Both are populated by lifespan() at server startup; tools access them
# through _get_backend() so failures surface as clear RuntimeErrors.
_backend: EverythingBackend | None = None
_config: EverythingConfig | None = None
49
+
50
+
51
@asynccontextmanager
async def lifespan(server):
    """Initialise Everything backend on startup, cleanup on shutdown.

    Detection failures are logged but do not abort startup: the backend is
    constructed regardless, and _get_backend() reports problems lazily when
    a tool is actually invoked.
    """
    global _backend, _config

    logger.info("Everything MCP starting - auto-detecting Everything installation…")
    _config = EverythingConfig.auto_detect()

    if not _config.is_valid:
        # Surface every detection problem up front; the server still starts.
        for message in _config.errors:
            logger.error("  %s", message)
        for message in _config.warnings:
            logger.warning("  %s", message)
    else:
        logger.info("Connected: %s (es: %s)", _config.version_info, _config.es_path)

    _backend = EverythingBackend(_config)
    try:
        yield
    finally:
        logger.info("Everything MCP shutting down.")
72
+
73
+
74
# ── Server instance ───────────────────────────────────────────────────────

# Single FastMCP app; `lifespan` wires backend auto-detection into
# startup/shutdown of the stdio transport.
mcp = FastMCP(name="everything_mcp", lifespan=lifespan)
77
+
78
+
79
def _get_backend() -> EverythingBackend:
    """Return the initialised backend, or raise with an actionable message.

    Raises:
        RuntimeError: when the lifespan never ran, or when Everything was
            not detected as a valid installation (detection errors are
            folded into the message).
    """
    if _backend is None:
        raise RuntimeError("Everything MCP not initialised")
    if not _config or not _config.is_valid:
        detail = _config.errors if _config else ["Not initialised"]
        raise RuntimeError("Everything is not available. " + " ".join(detail))
    return _backend
87
+
88
+
89
+ # ═══════════════════════════════════════════════════════════════════════════
90
+ # Tool 1: everything_search - The Workhorse
91
+ # ═══════════════════════════════════════════════════════════════════════════
92
+
93
+
94
class SearchInput(BaseModel):
    """Parameters accepted by the ``everything_search`` tool."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # Raw Everything query string; the description teaches the common syntax.
    query: str = Field(
        ...,
        description=(
            "Search query using Everything syntax. Examples: "
            "'*.py' (all Python files), "
            "'ext:py;js path:C:\\Projects' (Python/JS in Projects), "
            "'size:>10mb ext:log' (large logs), "
            "'dm:today ext:py' (Python files modified today), "
            "'content:TODO ext:py' (files containing TODO - requires content indexing), "
            "'\"exact phrase\"' (exact filename match), "
            "'regex:test_\\d+\\.py$' (regex). "
            "Combine with space (AND) or | (OR). Prefix ! to exclude."
        ),
        min_length=1,
        max_length=2000,
    )
    max_results: int = Field(
        default=50,
        description="Maximum results to return (1-500)",
        ge=1,
        le=500,
    )
    sort: str = Field(
        default="date-modified-desc",
        description="Sort order. Options: " + ", ".join(sorted(SORT_MAP)),
    )
    match_case: bool = Field(default=False, description="Case-sensitive search")
    match_whole_word: bool = Field(default=False, description="Match whole words only")
    match_regex: bool = Field(default=False, description="Treat query as regex")
    match_path: bool = Field(default=False, description="Match against full path, not just filename")
    offset: int = Field(default=0, description="Skip N results (pagination)", ge=0)

    @field_validator("sort")
    @classmethod
    def validate_sort(cls, v: str) -> str:
        """Reject sort keys that the backend's SORT_MAP does not know."""
        if v not in SORT_MAP:
            raise ValueError(f"Invalid sort option '{v}'. Valid: {', '.join(sorted(SORT_MAP))}")
        return v
140
+
141
+
142
@mcp.tool(
    name="everything_search",
    annotations={
        "title": "Search Files & Folders",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_search(params: SearchInput) -> str:
    """Search for files and folders instantly using voidtools Everything.

    Leverages Everything's real-time NTFS index for sub-millisecond search
    across all local and mapped drives. Supports wildcards, regex, size/date
    filters, extension filters, path restrictions, and content search.
    """
    try:
        backend = _get_backend()
        hits = await backend.search(
            query=params.query,
            max_results=params.max_results,
            sort=params.sort,
            match_case=params.match_case,
            match_whole_word=params.match_whole_word,
            match_regex=params.match_regex,
            match_path=params.match_path,
            offset=params.offset,
        )
    except Exception as exc:
        # Tools report failures as text so the client never sees a raw traceback.
        return f"Error: {exc}"
    return _format_search_results(hits, params.query, params.max_results, params.offset)
174
+
175
+
176
+ # ═══════════════════════════════════════════════════════════════════════════
177
+ # Tool 2: everything_search_by_type - Category Search
178
+ # ═══════════════════════════════════════════════════════════════════════════
179
+
180
+
181
class SearchByTypeInput(BaseModel):
    """Parameters accepted by the ``everything_search_by_type`` tool."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # Category key resolved against the backend's FILE_TYPES mapping.
    file_type: str = Field(
        ...,
        description="File type category: " + ", ".join(sorted(FILE_TYPES)),
    )
    query: str = Field(
        default="",
        description="Additional search filter (e.g. 'config' to narrow results)",
    )
    path: str = Field(
        default="",
        description="Restrict search to this directory (e.g. 'C:\\Projects')",
    )
    max_results: int = Field(default=50, ge=1, le=500)
    sort: str = Field(default="date-modified-desc")

    @field_validator("sort")
    @classmethod
    def validate_sort(cls, v: str) -> str:
        """Reject sort keys that the backend's SORT_MAP does not know."""
        if v not in SORT_MAP:
            raise ValueError(f"Invalid sort option '{v}'. Valid: {', '.join(sorted(SORT_MAP))}")
        return v
207
+
208
+
209
@mcp.tool(
    name="everything_search_by_type",
    annotations={
        "title": "Search by File Type Category",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_search_by_type(params: SearchByTypeInput) -> str:
    """Search for files by type category.

    Categories: audio, video, image, document, code, archive, executable,
    font, 3d, data. Each maps to a curated list of file extensions.
    """
    try:
        backend = _get_backend()
        # Expand the category into Everything ext:-syntax plus any filters.
        composed = build_type_query(params.file_type, params.query, params.path)
        hits = await backend.search(
            query=composed,
            max_results=params.max_results,
            sort=params.sort,
        )
        # Label the result header with the category (and filter, if any).
        suffix = f" {params.query}" if params.query else ""
        return _format_search_results(hits, f"type:{params.file_type}{suffix}", params.max_results)
    except Exception as exc:
        return f"Error: {exc}"
237
+
238
+
239
+ # ═══════════════════════════════════════════════════════════════════════════
240
+ # Tool 3: everything_find_recent - What Changed?
241
+ # ═══════════════════════════════════════════════════════════════════════════
242
+
243
+
244
class FindRecentInput(BaseModel):
    """Input schema for ``everything_find_recent``."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # Joining the dict keys directly preserves TIME_PERIODS insertion order
    # (guaranteed for dicts since Python 3.7), producing the same text the
    # old ``sorted(..., key=lambda k: list(...).index(k))`` did — that was
    # an O(n^2) identity sort and has been removed.
    period: str = Field(
        default="1hour",
        description=(
            "How recent. Options: "
            + ", ".join(TIME_PERIODS.keys())
            + ". Or raw Everything syntax like 'last2hours'."
        ),
    )
    # Optional directory restriction, e.g. 'C:\\Projects'.
    path: str = Field(default="", description="Restrict to this directory path")
    # Comma- or semicolon-separated extension filter.
    extensions: str = Field(
        default="",
        description="Filter by extensions, e.g. 'py,js,ts' or 'py;js;ts'",
    )
    # Extra search terms appended to the generated recency query.
    query: str = Field(default="", description="Additional search filter")
    max_results: int = Field(default=50, ge=1, le=500)
264
+
265
+
266
@mcp.tool(
    name="everything_find_recent",
    annotations={
        "title": "Find Recently Modified Files",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_find_recent(params: FindRecentInput) -> str:
    """Find files modified within a recent time period.

    Ideal for discovering what changed in a project, tracking recent
    downloads, finding today's log files, etc. Sorted newest-first.
    """
    try:
        backend = _get_backend()

        # Translate the period/path/extensions into Everything syntax,
        # then append any free-form filter terms.
        search_query = build_recent_query(params.period, params.path, params.extensions)
        if params.query:
            search_query = " ".join((search_query, params.query))

        hits = await backend.search(
            query=search_query,
            max_results=params.max_results,
            sort="date-modified-desc",  # newest first, always
        )
        return _format_search_results(hits, f"recent ({params.period})", params.max_results)
    except Exception as exc:
        return f"Error: {exc}"
297
+
298
+
299
+ # ═══════════════════════════════════════════════════════════════════════════
300
+ # Tool 4: everything_file_details - Deep Inspection
301
+ # ═══════════════════════════════════════════════════════════════════════════
302
+
303
+
304
class FileDetailsInput(BaseModel):
    """Input schema for ``everything_file_details``."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    # Between 1 and 20 paths to inspect per call; each gets its own
    # JSON document in the output.
    paths: list[str] = Field(
        ...,
        description="File/folder paths to inspect (1-20)",
        min_length=1,
        max_length=20,
    )
    # 0 disables content preview entirely; capped at 200 lines per file.
    preview_lines: int = Field(
        default=0,
        description="Lines of text content to preview (0 = none, max 200)",
        ge=0, le=200,
    )
320
+
321
+
322
@mcp.tool(
    name="everything_file_details",
    annotations={
        "title": "Get File Details & Content Preview",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_file_details(params: FileDetailsInput) -> str:
    """Get detailed metadata and optional content preview for specific files.

    Returns: full path, size, dates, type, permissions, hidden status.
    For directories: item count, subdirectories, file listing.
    For text files with preview_lines > 0: first N lines of content.
    """
    # stat()/scandir()/open() are blocking calls; hand the work to a worker
    # thread so the asyncio event loop keeps serving other requests.
    return await asyncio.to_thread(
        _get_file_details_sync, params.paths, params.preview_lines
    )
345
+
346
+
347
def _get_file_details_sync(paths: list[str], preview_lines: int) -> str:
    """Synchronous implementation of file details gathering.

    Builds one JSON document per path (joined with ``---`` separators).
    Per-path failures are reported inline in that path's document so a
    single bad path does not abort the whole batch.
    """
    output_parts: list[str] = []

    for filepath in paths:
        p = Path(filepath)
        info: dict = {"path": str(p)}

        if not p.exists():
            info["error"] = "File not found"
            output_parts.append(json.dumps(info, indent=2, ensure_ascii=False))
            continue

        try:
            stat = p.stat()
            info["name"] = p.name
            info["type"] = "folder" if p.is_dir() else "file"

            # Size/extension are only meaningful for files.
            if not p.is_dir():
                info["size"] = stat.st_size
                info["size_human"] = human_size(stat.st_size)
                info["extension"] = p.suffix.lstrip(".").lower()

            info["date_modified"] = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
            # NOTE(review): st_ctime is creation time on Windows but metadata
            # change time on POSIX — "date_created" is accurate on Windows only.
            info["date_created"] = datetime.fromtimestamp(stat.st_ctime).strftime("%Y-%m-%d %H:%M:%S")
            info["date_accessed"] = datetime.fromtimestamp(stat.st_atime).strftime("%Y-%m-%d %H:%M:%S")
            info["read_only"] = not os.access(filepath, os.W_OK)

            # Windows hidden attribute (FILE_ATTRIBUTE_HIDDEN = 0x2) or Unix dotfile.
            # st_file_attributes only exists on Windows, hence the getattr default.
            file_attrs = getattr(stat, "st_file_attributes", 0)
            info["hidden"] = bool(file_attrs & 0x2) if file_attrs else p.name.startswith(".")

            # Directory listing (bounded sample; see _summarize_directory).
            if p.is_dir():
                try:
                    info.update(_summarize_directory(p))
                except PermissionError:
                    info["listing_error"] = "Permission denied"
                except OSError as exc:
                    info["listing_error"] = str(exc)

            # Content preview for text files; None means binary/unreadable,
            # in which case no "preview" key is emitted.
            elif preview_lines > 0:
                preview = _read_preview(p, preview_lines)
                if preview is not None:
                    info["preview"] = preview

        except PermissionError:
            info["error"] = "Permission denied"
        except OSError as exc:
            info["error"] = str(exc)

        output_parts.append(json.dumps(info, indent=2, ensure_ascii=False))

    return "\n---\n".join(output_parts)
402
+
403
+
404
+ # ═══════════════════════════════════════════════════════════════════════════
405
+ # Tool 5: everything_count_stats - Quick Analytics
406
+ # ═══════════════════════════════════════════════════════════════════════════
407
+
408
+
409
class CountStatsInput(BaseModel):
    """Input schema for ``everything_count_stats``."""

    model_config = ConfigDict(str_strip_whitespace=True, extra="forbid")

    query: str = Field(
        ...,
        description=(
            "Search query to count/measure. Same syntax as everything_search. "
            "Examples: 'ext:py path:C:\\Projects', 'ext:log size:>1mb', '*.tmp'"
        ),
        min_length=1,
        max_length=2000,
    )
    include_size: bool = Field(
        default=True,
        description="Also calculate total size of all matching files",
    )
    breakdown_by_extension: bool = Field(
        default=False,
        # Fixed: the tool implementation samples up to 500 results
        # (sample_limit = 500); the old text incorrectly said 200.
        description="Break down count and size by file extension (samples up to 500 results)",
    )
431
+
432
+
433
@mcp.tool(
    name="everything_count_stats",
    annotations={
        "title": "Count & Size Statistics",
        "readOnlyHint": True,
        "destructiveHint": False,
        "idempotentHint": True,
        "openWorldHint": False,
    },
)
async def everything_count_stats(params: CountStatsInput) -> str:
    """Get count and size statistics for files matching a query.

    Fast way to understand the scope of a query without listing every file.
    Optionally breaks down by extension for a high-level overview.
    Each stage below is best-effort: a stage failure is recorded as a note
    in the JSON output instead of failing the whole tool call.
    """
    try:
        backend = _get_backend()
        output: dict = {"query": params.query}

        # Count — may be unsupported by older es.exe builds.
        try:
            output["total_count"] = await backend.count(params.query)
        except Exception:
            output["count_note"] = "Count not available (es.exe may not support -get-result-count)"

        # Total size — a negative value is treated as "unknown" and omitted.
        if params.include_size:
            try:
                total_size = await backend.get_total_size(params.query)
                if total_size >= 0:
                    output["total_size"] = total_size
                    output["total_size_human"] = human_size(total_size)
            except Exception:
                output["size_note"] = "Total size not available"

        # Extension breakdown — aggregates counts/sizes from a bounded
        # sample of results; directories are skipped entirely.
        if params.breakdown_by_extension:
            try:
                sample_limit = 500
                results = await backend.search(
                    params.query,
                    max_results=sample_limit,
                    sort="name",
                )
                ext_stats: dict[str, dict] = {}
                sampled_files = 0
                for r in results:
                    if r.is_dir:
                        continue
                    sampled_files += 1
                    ext = r.extension or "(no extension)"
                    entry = ext_stats.setdefault(ext, {"count": 0, "total_size": 0})
                    entry["count"] += 1
                    # Negative size means "size unknown"; exclude from totals.
                    if r.size >= 0:
                        entry["total_size"] += r.size

                # Report at most the 30 most common extensions, busiest first.
                sorted_exts = sorted(ext_stats.items(), key=lambda x: x[1]["count"], reverse=True)
                breakdown = {}
                for ext, stats in sorted_exts[:30]:
                    breakdown[ext] = {
                        "count": stats["count"],
                        "total_size": stats["total_size"],
                        "total_size_human": human_size(stats["total_size"]),
                    }
                output["extension_breakdown"] = breakdown
                output["breakdown_note"] = (
                    f"Based on {sampled_files} sampled files from first {len(results)} "
                    f"results (max sample {sample_limit}); directories excluded."
                )
            except Exception as exc:
                output["breakdown_error"] = str(exc)

        return json.dumps(output, indent=2, ensure_ascii=False)
    except Exception as exc:
        return f"Error: {exc}"
509
+
510
+
511
+ # ═══════════════════════════════════════════════════════════════════════════
512
+ # Resource: Health Check
513
+ # ═══════════════════════════════════════════════════════════════════════════
514
+
515
+
516
@mcp.resource("everything://status")
async def get_status() -> str:
    """Get the current status of the Everything connection."""
    # Before the lifespan hook has created the backend, report a placeholder.
    payload = await _backend.health_check() if _backend else {"status": "not initialised"}
    return json.dumps(payload, indent=2)
524
+
525
+
526
+ # ═══════════════════════════════════════════════════════════════════════════
527
+ # Helpers
528
+ # ═══════════════════════════════════════════════════════════════════════════
529
+
530
+
531
+ def _format_search_results(
532
+ results: list,
533
+ query_label: str,
534
+ max_results: int,
535
+ offset: int = 0,
536
+ ) -> str:
537
+ """Format search results into a clean, readable string for LLM consumption."""
538
+ if not results:
539
+ return f"No results found for: {query_label}"
540
+
541
+ header = f"Found {len(results)} results for: {query_label}"
542
+ if offset > 0:
543
+ header += f" (offset: {offset})"
544
+ lines = [header, ""]
545
+
546
+ for r in results:
547
+ d = r.to_dict() if hasattr(r, "to_dict") else r
548
+ path = d.get("path", "?")
549
+ ftype = d.get("type", "file")
550
+ size_h = d.get("size_human", "")
551
+ dm = d.get("date_modified", "")
552
+
553
+ prefix = "[DIR]" if ftype == "folder" else "[FILE]"
554
+ meta_parts: list[str] = []
555
+ if size_h:
556
+ meta_parts.append(size_h)
557
+ if dm:
558
+ meta_parts.append(dm)
559
+
560
+ meta = f" ({', '.join(meta_parts)})" if meta_parts else ""
561
+ lines.append(f" {prefix} {path}{meta}")
562
+
563
+ if len(results) >= max_results:
564
+ lines.append("")
565
+ lines.append(
566
+ f"Showing first {max_results} results. "
567
+ "Use 'offset' to paginate or refine the query."
568
+ )
569
+
570
+ return "\n".join(lines)
571
+
572
+
573
+ # ── Text file preview ─────────────────────────────────────────────────────
574
+
575
+ # Extensions we can safely read as text
576
# Allow-list of extensions _read_preview treats as text without sniffing.
_TEXT_EXTENSIONS: frozenset[str] = frozenset({
    # Text & docs
    "txt", "md", "mdx", "rst", "adoc", "org",
    # Python
    "py", "pyi", "pyw", "pyx", "pxd",
    # JavaScript/TypeScript
    "js", "mjs", "cjs", "ts", "mts", "cts", "jsx", "tsx",
    # Web frameworks
    "vue", "svelte", "astro", "marko",
    # C family
    "c", "cpp", "cc", "cxx", "h", "hpp", "hxx", "cs", "java", "m", "mm",
    # Systems languages
    "go", "rs", "rb", "php", "swift", "kt", "kts", "scala", "r", "lua",
    # Shell
    "sh", "bash", "zsh", "fish", "ps1", "psm1", "psd1", "bat", "cmd",
    # Database & query
    "sql", "prisma", "graphql", "gql",
    # Web
    "html", "htm", "css", "scss", "sass", "less", "styl", "pcss",
    # Data formats
    "json", "jsonc", "json5", "jsonl", "ndjson",
    "xml", "xsl", "xslt", "xsd", "svg", "rss", "atom",
    "yaml", "yml", "toml", "ini", "cfg", "conf", "env", "properties",
    "csv", "tsv", "log",
    # Config files (with extensions)
    "gitignore", "gitattributes", "gitmodules", "npmrc", "nvmrc", "yarnrc",
    "dockerignore", "editorconfig", "eslintrc", "prettierrc", "babelrc",
    "stylelintrc", "browserslistrc",
    # Build tools
    "makefile", "dockerfile", "cmake", "gradle", "sbt", "cabal", "bazel",
    # Academic
    "tex", "bib", "cls", "sty",
    # Hardware
    "asm", "s", "v", "sv", "vhd", "vhdl",
    # Modern languages
    "dart", "zig", "nim", "hx", "odin", "jai", "vlang",
    # Functional
    "ex", "exs", "erl", "hrl", "hs", "lhs", "ml", "mli", "fs", "fsi", "fsx",
    "clj", "cljs", "cljc", "edn", "lisp", "el", "rkt", "scm", "fnl",
    # Other
    "pro", "pri", "qml", "proto", "thrift", "capnp",
    "tf", "hcl", "nix", "dhall", "jsonnet", "cue",
    "http", "rest", "lock",
})

# Filenames (no extension) that are always text; matched against both the
# full lowercased name and the stem in _read_preview.
_TEXT_FILENAMES: frozenset[str] = frozenset({
    "makefile", "dockerfile", "cmakelists.txt", "rakefile", "gemfile",
    "procfile", "vagrantfile", "brewfile", "justfile", "taskfile",
    "license", "licence", "readme", "authors", "contributors",
    "changelog", "changes", "history", "news", "todo",
})
628
+
629
+ _MAX_DIR_SCAN_ITEMS = 10_000
630
+ _MAX_SUBDIRECTORY_SAMPLE = 20
631
+ _MAX_FILE_SAMPLE = 30
632
+ _MAX_PREVIEW_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
633
+ _MAX_PREVIEW_CHARS = 50_000
634
+
635
+
636
+ def _summarize_directory(path: Path) -> dict[str, object]:
637
+ """Return bounded directory metadata without loading all entries in memory."""
638
+ dirs: list[str] = []
639
+ files: list[str] = []
640
+ scanned = 0
641
+ truncated = False
642
+
643
+ with os.scandir(path) as entries:
644
+ for entry in entries:
645
+ if scanned >= _MAX_DIR_SCAN_ITEMS:
646
+ truncated = True
647
+ break
648
+ scanned += 1
649
+ try:
650
+ if entry.is_dir(follow_symlinks=False):
651
+ if len(dirs) < _MAX_SUBDIRECTORY_SAMPLE:
652
+ dirs.append(entry.name)
653
+ elif entry.is_file(follow_symlinks=False) and len(files) < _MAX_FILE_SAMPLE:
654
+ files.append(entry.name)
655
+ except OSError:
656
+ continue
657
+
658
+ summary: dict[str, object] = {
659
+ "item_count": scanned,
660
+ "subdirectories": sorted(dirs),
661
+ "files_sample": sorted(files),
662
+ }
663
+ if truncated:
664
+ summary["note"] = (
665
+ f"Directory scan capped at {_MAX_DIR_SCAN_ITEMS} entries; samples may be incomplete"
666
+ )
667
+ elif scanned > (_MAX_SUBDIRECTORY_SAMPLE + _MAX_FILE_SAMPLE):
668
+ summary["note"] = f"Showing first items of {scanned} total"
669
+ return summary
670
+
671
+
672
def _read_preview(path: Path, max_lines: int) -> str | None:
    """Read the first *max_lines* lines of a text file.

    Returns ``None`` for binary files or files that can't be read, and a
    short placeholder string for oversized or undecodable files.
    """
    # Refuse very large files outright — previewing them isn't useful and
    # readline over a multi-GB file would still cost real I/O.
    try:
        if path.stat().st_size > _MAX_PREVIEW_FILE_SIZE:
            return "(file too large for preview)"
    except OSError:
        return None

    ext = path.suffix.lstrip(".").lower()
    name_lower = path.name.lower()
    stem_lower = path.stem.lower()

    # Fast path: known-text extension, known-text filename (full name or
    # stem), or a dotfile.
    is_text = (
        ext in _TEXT_EXTENSIONS
        or name_lower in _TEXT_FILENAMES
        or stem_lower in _TEXT_FILENAMES
        or name_lower.startswith(".")  # dotfiles are usually text
    )

    if not is_text:
        # Sniff for binary content: a NUL byte in the first 512 bytes is
        # treated as a reliable binary marker.
        try:
            with open(path, "rb") as f:
                chunk = f.read(512)
                if b"\x00" in chunk:
                    return None  # binary
                is_text = True
        except (OSError, PermissionError):
            return None

    if not is_text:
        return None

    # Read lines with encoding fallback. latin-1 maps every byte, so it
    # acts as the terminal fallback that cannot raise UnicodeDecodeError.
    for encoding in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            with open(path, encoding=encoding) as f:
                lines: list[str] = []
                total_chars = 0
                truncated = False
                for _ in range(max_lines):
                    remaining = _MAX_PREVIEW_CHARS - total_chars
                    if remaining <= 0:
                        truncated = True
                        break

                    # Bound each read to avoid huge single-line payloads.
                    line = f.readline(remaining + 1)
                    if not line:
                        break  # EOF

                    if len(line) > remaining:
                        line = line[:remaining]
                        truncated = True

                    total_chars += len(line)
                    lines.append(line.rstrip("\n\r"))

                    if total_chars >= _MAX_PREVIEW_CHARS:
                        truncated = True
                        break

                if truncated:
                    lines.append("... [preview truncated]")
                return "\n".join(lines)
        except UnicodeDecodeError:
            # Try the next encoding in the fallback chain.
            continue
        except (OSError, PermissionError):
            return None

    return "(unable to decode file content)"