mcp-kb 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_kb/cli/__init__.py +1 -0
- mcp_kb/cli/args.py +175 -0
- mcp_kb/cli/main.py +181 -0
- mcp_kb/cli/reindex.py +113 -0
- mcp_kb/cli/runtime_config.py +421 -0
- mcp_kb/data/KNOWLEDBASE_DOC.md +151 -0
- mcp_kb/data/__init__.py +1 -0
- mcp_kb/ingest/__init__.py +1 -0
- mcp_kb/ingest/chroma.py +1287 -0
- mcp_kb/knowledge/__init__.py +1 -0
- mcp_kb/knowledge/bootstrap.py +44 -0
- mcp_kb/knowledge/events.py +105 -0
- mcp_kb/knowledge/search.py +177 -0
- mcp_kb/knowledge/store.py +294 -0
- mcp_kb/security/__init__.py +1 -0
- mcp_kb/security/path_validation.py +108 -0
- mcp_kb/server/__init__.py +1 -0
- mcp_kb/server/app.py +201 -0
- mcp_kb/ui/__init__.py +17 -0
- mcp_kb/ui/api.py +377 -0
- mcp_kb/ui/assets/assets/index.css +1 -0
- mcp_kb/ui/assets/index.html +62 -0
- mcp_kb/ui/server.py +332 -0
- mcp_kb/utils/__init__.py +1 -0
- mcp_kb/utils/filesystem.py +128 -0
- mcp_kb-0.3.3.dist-info/METADATA +338 -0
- mcp_kb-0.3.3.dist-info/RECORD +32 -0
- mcp_kb-0.3.1.dist-info/METADATA +0 -181
- mcp_kb-0.3.1.dist-info/RECORD +0 -7
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/WHEEL +0 -0
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/entry_points.txt +0 -0
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/top_level.txt +0 -0
mcp_kb/ui/server.py
ADDED
@@ -0,0 +1,332 @@
|
|
1
|
+
"""Lightweight HTTP server that hosts the human UI and JSON endpoints.
|
2
|
+
|
3
|
+
The implementation uses :mod:`http.server` from the Python standard library to
|
4
|
+
avoid adding web framework dependencies. Static assets are loaded from package
|
5
|
+
resources, while dynamic endpoints call into a shared
|
6
|
+
:class:`~mcp_kb.knowledge.store.KnowledgeBase` instance.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from __future__ import annotations
|
10
|
+
|
11
|
+
import json
|
12
|
+
import threading
|
13
|
+
from http import HTTPStatus
|
14
|
+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
15
|
+
from typing import ClassVar, Optional
|
16
|
+
from urllib.parse import parse_qs, urlparse
|
17
|
+
|
18
|
+
from importlib import resources
|
19
|
+
import logging
|
20
|
+
from mcp_kb.knowledge.store import KnowledgeBase
|
21
|
+
|
22
|
+
from .api import (
|
23
|
+
build_tree_json,
|
24
|
+
read_file_json,
|
25
|
+
write_file,
|
26
|
+
search_json,
|
27
|
+
vector_status_json,
|
28
|
+
vector_embeddings_json,
|
29
|
+
vector_query_embedding_json,
|
30
|
+
vector_reindex_json,
|
31
|
+
vector_refit_json,
|
32
|
+
)
|
33
|
+
|
34
|
+
|
35
|
+
# Module-level logger named after this module; the request handler uses it to
# record exceptions raised while serving API requests.
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
class _UIRequestHandler(BaseHTTPRequestHandler):
|
38
|
+
"""Request handler serving the web UI and API endpoints.
|
39
|
+
|
40
|
+
The handler reads assets from ``mcp_kb.ui.assets`` and forwards JSON API
|
41
|
+
requests to the injected knowledge base instance. An instance is attached
|
42
|
+
to the handler class at server startup via ``kb``.
|
43
|
+
"""
|
44
|
+
|
45
|
+
kb: ClassVar[Optional[KnowledgeBase]] = None
|
46
|
+
|
47
|
+
# Silence default log output; the main process already logs startup info
|
48
|
+
def log_message(self, format: str, *args) -> None: # pragma: no cover - noise
|
49
|
+
return
|
50
|
+
|
51
|
+
def do_GET(self) -> None: # noqa: N802 - HTTP verb name
|
52
|
+
"""Serve the index page, static assets, or a JSON read endpoint."""
|
53
|
+
|
54
|
+
assert self.kb is not None, "UI server not initialized with a KnowledgeBase"
|
55
|
+
parsed = urlparse(self.path)
|
56
|
+
if parsed.path == "/" or parsed.path == "/index.html":
|
57
|
+
self._serve_asset("index.html", content_type="text/html; charset=utf-8")
|
58
|
+
return
|
59
|
+
if parsed.path.startswith("/static/"):
|
60
|
+
name = parsed.path.split("/static/", 1)[1]
|
61
|
+
ctype = (
|
62
|
+
"text/javascript; charset=utf-8" if name.endswith(".js") else
|
63
|
+
"text/css; charset=utf-8" if name.endswith(".css") else
|
64
|
+
"application/octet-stream"
|
65
|
+
)
|
66
|
+
self._serve_asset(name, content_type=ctype)
|
67
|
+
return
|
68
|
+
if parsed.path == "/api/tree":
|
69
|
+
body = json.dumps(build_tree_json(self.kb)).encode("utf-8")
|
70
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
71
|
+
return
|
72
|
+
if parsed.path == "/api/file":
|
73
|
+
params = parse_qs(parsed.query)
|
74
|
+
rel_path = params.get("path", [""])[0]
|
75
|
+
try:
|
76
|
+
payload = read_file_json(self.kb, rel_path)
|
77
|
+
except Exception as exc: # pragma: no cover - defensive
|
78
|
+
logger.exception(exc)
|
79
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
80
|
+
return
|
81
|
+
body = json.dumps(payload.model_dump()).encode("utf-8")
|
82
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
83
|
+
return
|
84
|
+
if parsed.path == "/api/search":
|
85
|
+
params = parse_qs(parsed.query)
|
86
|
+
query = params.get("query", [""])[0]
|
87
|
+
if not query:
|
88
|
+
body = json.dumps([]).encode("utf-8")
|
89
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
90
|
+
return
|
91
|
+
limit_raw = params.get("limit", params.get("n", [None]))[0]
|
92
|
+
limit = None
|
93
|
+
if limit_raw is not None:
|
94
|
+
try:
|
95
|
+
limit = max(1, int(limit_raw))
|
96
|
+
except ValueError:
|
97
|
+
self._send_error(HTTPStatus.BAD_REQUEST, "Invalid limit value")
|
98
|
+
return
|
99
|
+
try:
|
100
|
+
results = search_json(self.kb, query, limit=limit)
|
101
|
+
except Exception as exc: # pragma: no cover - defensive
|
102
|
+
logger.exception(exc)
|
103
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
104
|
+
return
|
105
|
+
body = json.dumps(results).encode("utf-8")
|
106
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
107
|
+
return
|
108
|
+
|
109
|
+
if parsed.path == "/api/vector/status":
|
110
|
+
body = json.dumps(vector_status_json(self.kb)).encode("utf-8")
|
111
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
112
|
+
return
|
113
|
+
|
114
|
+
if parsed.path == "/api/vector/embeddings":
|
115
|
+
params = parse_qs(parsed.query)
|
116
|
+
try:
|
117
|
+
raw_limit = params.get("limit", ["1000"])[0]
|
118
|
+
raw_offset = params.get("offset", ["0"])[0]
|
119
|
+
limit = max(1, int(raw_limit))
|
120
|
+
offset = max(0, int(raw_offset))
|
121
|
+
except ValueError:
|
122
|
+
self._send_error(HTTPStatus.BAD_REQUEST, "Invalid limit/offset value")
|
123
|
+
return
|
124
|
+
path = params.get("path", [None])[0]
|
125
|
+
try:
|
126
|
+
results = vector_embeddings_json(self.kb, limit=limit, offset=offset, path=path)
|
127
|
+
except Exception as exc: # pragma: no cover - defensive
|
128
|
+
logger.exception(exc)
|
129
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
130
|
+
return
|
131
|
+
body = json.dumps(results).encode("utf-8")
|
132
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
133
|
+
return
|
134
|
+
|
135
|
+
if parsed.path == "/api/vector/query_embedding":
|
136
|
+
params = parse_qs(parsed.query)
|
137
|
+
query = params.get("query", [""])[0]
|
138
|
+
if not query:
|
139
|
+
body = json.dumps({"embedding": [], "used_model": None}).encode("utf-8")
|
140
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
141
|
+
return
|
142
|
+
result = vector_query_embedding_json(self.kb, query)
|
143
|
+
body = json.dumps(result).encode("utf-8")
|
144
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
145
|
+
return
|
146
|
+
|
147
|
+
self._send_error(HTTPStatus.NOT_FOUND, "Not Found")
|
148
|
+
|
149
|
+
def do_PUT(self) -> None: # noqa: N802 - HTTP verb name
|
150
|
+
"""Handle file save requests at ``/api/file`` with JSON payloads."""
|
151
|
+
|
152
|
+
assert self.kb is not None, "UI server not initialized with a KnowledgeBase"
|
153
|
+
parsed = urlparse(self.path)
|
154
|
+
if parsed.path != "/api/file":
|
155
|
+
self._send_error(HTTPStatus.NOT_FOUND, "Not Found")
|
156
|
+
return
|
157
|
+
try:
|
158
|
+
content_len = int(self.headers.get("Content-Length", "0"))
|
159
|
+
except ValueError:
|
160
|
+
self._send_error(HTTPStatus.LENGTH_REQUIRED, "Content-Length required")
|
161
|
+
return
|
162
|
+
raw = self.rfile.read(content_len)
|
163
|
+
try:
|
164
|
+
payload = json.loads(raw.decode("utf-8"))
|
165
|
+
path = payload["path"]
|
166
|
+
content = payload.get("content", "")
|
167
|
+
write_file(self.kb, path, content)
|
168
|
+
except KeyError:
|
169
|
+
logger.exception(exc)
|
170
|
+
self._send_error(HTTPStatus.BAD_REQUEST, "Missing 'path' in JSON body")
|
171
|
+
return
|
172
|
+
except Exception as exc: # pragma: no cover - defensive
|
173
|
+
logger.exception(exc)
|
174
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
175
|
+
return
|
176
|
+
self._send_response(HTTPStatus.NO_CONTENT, b"", "application/json")
|
177
|
+
|
178
|
+
def do_POST(self) -> None: # noqa: N802 - HTTP verb name
|
179
|
+
"""Handle administrative vector actions exposed via POST endpoints."""
|
180
|
+
|
181
|
+
assert self.kb is not None, "UI server not initialized with a KnowledgeBase"
|
182
|
+
parsed = urlparse(self.path)
|
183
|
+
if parsed.path == "/api/vector/reindex":
|
184
|
+
try:
|
185
|
+
payload = vector_reindex_json(self.kb)
|
186
|
+
body = json.dumps(payload).encode("utf-8")
|
187
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
188
|
+
except Exception as exc: # pragma: no cover - defensive
|
189
|
+
logger.exception(exc)
|
190
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
191
|
+
return
|
192
|
+
if parsed.path == "/api/vector/refit":
|
193
|
+
try:
|
194
|
+
payload = vector_refit_json(self.kb)
|
195
|
+
body = json.dumps(payload).encode("utf-8")
|
196
|
+
self._send_response(HTTPStatus.OK, body, "application/json")
|
197
|
+
except Exception as exc: # pragma: no cover - defensive
|
198
|
+
logger.exception(exc)
|
199
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
200
|
+
return
|
201
|
+
self._send_error(HTTPStatus.NOT_FOUND, "Not Found")
|
202
|
+
|
203
|
+
def do_DELETE(self) -> None: # noqa: N802 - HTTP verb name
|
204
|
+
"""Soft delete the requested file using ``kb.soft_delete``.
|
205
|
+
|
206
|
+
Endpoint: ``DELETE /api/file?path=...``
|
207
|
+
Returns ``204 No Content`` on success or ``400`` when validation fails.
|
208
|
+
"""
|
209
|
+
|
210
|
+
assert self.kb is not None, "UI server not initialized with a KnowledgeBase"
|
211
|
+
parsed = urlparse(self.path)
|
212
|
+
if parsed.path != "/api/file":
|
213
|
+
self._send_error(HTTPStatus.NOT_FOUND, "Not Found")
|
214
|
+
return
|
215
|
+
params = parse_qs(parsed.query)
|
216
|
+
rel_path = params.get("path", [""])[0]
|
217
|
+
try:
|
218
|
+
self.kb.soft_delete(rel_path)
|
219
|
+
except Exception as exc: # pragma: no cover - defensive
|
220
|
+
logger.exception(exc)
|
221
|
+
self._send_error(HTTPStatus.BAD_REQUEST, str(exc))
|
222
|
+
return
|
223
|
+
self._send_response(HTTPStatus.NO_CONTENT, b"", "application/json")
|
224
|
+
|
225
|
+
def _serve_asset(self, name: str, *, content_type: str) -> None:
|
226
|
+
"""Serve an embedded static asset by ``name`` with ``content_type``."""
|
227
|
+
|
228
|
+
try:
|
229
|
+
data = resources.files("mcp_kb.ui.assets").joinpath(name).read_bytes()
|
230
|
+
except FileNotFoundError:
|
231
|
+
self._send_error(HTTPStatus.NOT_FOUND, "Asset not found")
|
232
|
+
return
|
233
|
+
self._send_response(HTTPStatus.OK, data, content_type)
|
234
|
+
|
235
|
+
def _send_response(self, status: HTTPStatus, body: bytes, content_type: str) -> None:
|
236
|
+
"""Write an HTTP response with headers and body."""
|
237
|
+
|
238
|
+
self.send_response(status)
|
239
|
+
self.send_header("Content-Type", content_type)
|
240
|
+
self.send_header("Content-Length", str(len(body)))
|
241
|
+
self.end_headers()
|
242
|
+
if body:
|
243
|
+
self.wfile.write(body)
|
244
|
+
|
245
|
+
def _send_error(self, status: HTTPStatus, message: str) -> None:
|
246
|
+
"""Return a JSON error payload with ``status`` and ``message``."""
|
247
|
+
|
248
|
+
payload = json.dumps({"error": message}).encode("utf-8")
|
249
|
+
self._send_response(status, payload, "application/json")
|
250
|
+
|
251
|
+
|
252
|
+
# First port UIServer tries when the caller does not supply one; UIServer
# probes upward from this value until a bind succeeds.
DEFAULT_UI_PORT = 8765
|
253
|
+
|
254
|
+
|
255
|
+
class UIServer:
    """Container managing the lifecycle of the UI HTTP server.

    The listening socket is bound in the constructor; :meth:`start` then runs
    the serve loop in a background daemon thread so the UI can run alongside
    the MCP transports. Call :meth:`stop` to shut it down from tests or other
    code.

    By default the server attempts to bind to :data:`DEFAULT_UI_PORT` and
    increments by one until a free port is found. Callers can provide a
    ``port`` to override the starting point.

    Attributes
    ----------
    host:
        Host the server was asked to bind to.
    port:
        Port the socket is actually bound to.
    """

    def __init__(self, kb: KnowledgeBase, host: str = "127.0.0.1", port: int | None = None) -> None:
        """Create a server bound to ``host:port`` serving ``kb``.

        Binding strategy
        ----------------
        - When ``port`` is ``None``, the server starts scanning from
          :data:`DEFAULT_UI_PORT`.
        - When ``port`` is provided, scanning starts from that value.
        - On ``OSError`` (for example, address already in use), the server
          increments the port by one and retries, trying at most 200
          consecutive ports.

        Raises
        ------
        RuntimeError
            If none of the candidate ports could be bound.
        """

        # The handler class carries the knowledge base as a class attribute.
        _UIRequestHandler.kb = kb
        start = DEFAULT_UI_PORT if port is None else port
        httpd = None
        last_error: Exception | None = None
        for candidate in range(start, start + 200):
            try:
                httpd = ThreadingHTTPServer((host, candidate), _UIRequestHandler)
            except OSError as exc:  # port in use or permission error
                last_error = exc
                continue
            break
        if httpd is None:
            raise RuntimeError(
                f"Failed to bind UI server on {host}:{start}-{start+199}: {last_error}"
            )
        self._httpd = httpd
        self.host = host
        # Report the port the socket is really bound to. This equals the
        # requested candidate except for port 0, where the OS picks a free
        # port that callers need in order to connect.
        self.port = httpd.server_address[1]
        self._thread: Optional[threading.Thread] = None

    def start(self) -> None:
        """Start the HTTP server in a daemon thread.

        Calling this while the serve thread is already running is a no-op, so
        two threads never run ``serve_forever`` on the same socket.
        """

        if self._thread is not None and self._thread.is_alive():
            return

        def _target() -> None:
            self._httpd.serve_forever(poll_interval=0.5)

        self._thread = threading.Thread(target=_target, name="kb-ui", daemon=True)
        self._thread.start()

    def stop(self) -> None:
        """Shutdown the server and wait for the thread to exit.

        Safe to call when :meth:`start` was never invoked: ``shutdown()``
        blocks until a running ``serve_forever`` loop acknowledges it, so it
        is only called when a serve thread exists.
        """

        worker = self._thread
        if worker is not None:
            self._httpd.shutdown()
        self._httpd.server_close()
        if worker is not None:
            worker.join(timeout=5)
            self._thread = None
|
315
|
+
|
316
|
+
|
317
|
+
def start_ui_server(kb: KnowledgeBase, host: str = "127.0.0.1", port: int | None = None) -> UIServer:
    """Build a :class:`UIServer` for ``kb``, start it, and hand it back.

    ``host`` and ``port`` are passed through to :class:`UIServer`. With
    ``port`` left as ``None`` the server begins at :data:`DEFAULT_UI_PORT`
    and moves upward until a port can be bound.
    """

    server = UIServer(kb, host=host, port=port)
    server.start()
    return server
|
327
|
+
|
328
|
+
|
329
|
+
# Public API of this module; the request handler class stays private.
__all__ = [
    "UIServer",
    "start_ui_server",
]
|
mcp_kb/utils/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
"""Utility helpers shared across the knowledge base server modules."""
|
@@ -0,0 +1,128 @@
|
|
1
|
+
"""Filesystem helpers wrapping Python's standard library primitives.
|
2
|
+
|
3
|
+
The knowledge base server performs numerous file operations. Consolidating the
|
4
|
+
logic in this module keeps the rest of the code focused on business semantics
|
5
|
+
such as validating incoming requests and shaping responses. Each helper function
|
6
|
+
is intentionally small so that callers can compose them for different workflows
|
7
|
+
without duplicating the low-level boilerplate.
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
from contextlib import contextmanager
|
13
|
+
from pathlib import Path
|
14
|
+
from threading import Lock
|
15
|
+
from typing import Dict, Iterator
|
16
|
+
|
17
|
+
|
18
|
+
class FileLockRegistry:
    """Process-local registry handing out one mutex per file path.

    Holding the lock for a path while writing keeps concurrent writers from
    interleaving their output. A lock is created the first time a path is
    requested and the same lock object is returned for every later request
    for that path; entries stay in the registry for its whole lifetime.
    """

    def __init__(self) -> None:
        """Set up an empty path-to-lock table and the mutex guarding it."""

        self._global_lock = Lock()
        self._locks: Dict[Path, Lock] = {}

    @contextmanager
    def acquire(self, path: Path) -> Iterator[None]:
        """Hold the lock belonging to ``path`` for the body of a ``with`` block.

        The registry mutex is taken only long enough to look up (or create)
        the per-path lock; the per-path lock is then held until the caller's
        block exits, whether normally or through an exception.

        Parameters
        ----------
        path:
            Key identifying the file to protect. Callers are expected to pass
            an absolute path so that equal files map to the same lock.
        """

        with self._global_lock:
            path_lock = self._locks.get(path)
            if path_lock is None:
                path_lock = Lock()
                self._locks[path] = path_lock
        with path_lock:
            yield
|
54
|
+
|
55
|
+
|
56
|
+
def write_text(path: Path, content: str) -> None:
    """Replace the contents of ``path`` with ``content``, encoded as UTF-8."""

    utf8 = "utf-8"
    path.write_text(content, encoding=utf8)
|
60
|
+
|
61
|
+
|
62
|
+
def append_text(path: Path, content: str) -> None:
    """Add ``content`` to the end of ``path`` as UTF-8, creating the file if needed."""

    with path.open(mode="a", encoding="utf-8") as sink:
        sink.write(content)
|
67
|
+
|
68
|
+
|
69
|
+
def read_text(path: Path) -> str:
    """Return the full contents of ``path`` decoded as UTF-8."""

    with path.open(mode="r", encoding="utf-8") as source:
        return source.read()
|
73
|
+
|
74
|
+
|
75
|
+
def ensure_parent_directory(path: Path) -> None:
    """Create the directory containing ``path``, including missing ancestors.

    Nothing happens when the directory is already present.
    """

    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
|
79
|
+
|
80
|
+
|
81
|
+
def rename(path: Path, target: Path) -> None:
    """Move ``path`` to ``target``; behaves exactly like ``Path.rename``."""

    _ = path.rename(target)
|
85
|
+
|
86
|
+
|
87
|
+
def is_text_file(path: Path, max_bytes: int = 2048) -> bool:
    """Heuristically determine whether ``path`` contains UTF-8 text.

    The check is designed to be fast and conservative for use when iterating
    a directory tree. It reads at most ``max_bytes`` from the file in binary
    mode and applies two filters:

    - Reject files that contain NUL bytes, which are extremely uncommon in
      textual formats and a strong indicator of binary content.
    - Attempt to decode the sampled bytes as UTF-8. If decoding fails, the
      file is treated as binary. A multi-byte character cut in half by the
      sample boundary is not counted as a failure.

    Parameters
    ----------
    path:
        Absolute path to the file on disk.
    max_bytes:
        Upper bound on the number of bytes to sample from the head of the
        file. A small sample keeps directory scans fast while remaining
        accurate for typical text formats such as ``.md``, ``.txt``, ``.xml``,
        and source files.

    Returns
    -------
    bool
        ``True`` if the file appears to be UTF-8 text; ``False`` otherwise,
        including when the path cannot be opened or read (missing file,
        directory, permission error).
    """

    # Local import keeps this helper self-contained.
    import codecs

    try:
        with path.open("rb") as handle:
            sample = handle.read(max_bytes)
    except OSError:  # pragma: no cover - defensive
        # Unreadable paths (missing, directories, no permission) are not text.
        return False

    if b"\x00" in sample:
        return False

    # When the sample fills the whole budget the file may continue, so the
    # last bytes can be the start of a multi-byte character. Decoding with
    # ``final=False`` tolerates such an incomplete tail while still raising
    # on invalid bytes anywhere else.
    may_be_truncated = max_bytes is not None and 0 <= max_bytes <= len(sample)
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(sample, final=not may_be_truncated)
    except UnicodeDecodeError:
        return False
    return True
|