mcp-plesk-dev-docs 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_plesk_dev_docs-0.4.2.dist-info/METADATA +221 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/RECORD +30 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/WHEEL +5 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/entry_points.txt +2 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/LICENSE +21 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/licenses/NOTICE +0 -0
- mcp_plesk_dev_docs-0.4.2.dist-info/top_level.txt +1 -0
- plesk_unified/__init__.py +3 -0
- plesk_unified/ai_client.py +257 -0
- plesk_unified/benchmark_engines.py +330 -0
- plesk_unified/benchmark_gates.py +254 -0
- plesk_unified/benchmark_reporting.py +107 -0
- plesk_unified/benchmark_runner.py +433 -0
- plesk_unified/benchmark_suites.py +30 -0
- plesk_unified/chunking.py +360 -0
- plesk_unified/error_handling.py +112 -0
- plesk_unified/html_utils.py +217 -0
- plesk_unified/indexing.py +53 -0
- plesk_unified/io_utils.py +287 -0
- plesk_unified/log_handler.py +209 -0
- plesk_unified/model_config.py +218 -0
- plesk_unified/platform_utils.py +214 -0
- plesk_unified/settings.py +93 -0
- plesk_unified/summary_cache.py +55 -0
- plesk_unified/tq_index.py +85 -0
- plesk_unified/turboquant/__init__.py +21 -0
- plesk_unified/turboquant/compressors.py +190 -0
- plesk_unified/turboquant/lloyd_max.py +190 -0
- plesk_unified/turboquant/turboquant.py +249 -0
- plesk_unified/types.py +27 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import re
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("plesk_unified")
|
|
7
|
+
|
|
8
|
+
# Bump this version whenever the chunking logic or context injection changes
|
|
9
|
+
# to force a re-embedding of changed chunks while preserving identical ones.
|
|
10
|
+
CHUNK_VERSION = "v15"
|
|
11
|
+
|
|
12
|
+
# Global registry for tree-sitter languages to avoid repeated lookups
|
|
13
|
+
_TS_LANGS = {}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_ts_lang(lang_name: str):
    """Return the tree-sitter language object for ``lang_name``, or ``None``.

    Successful lookups are memoised in the module-level ``_TS_LANGS`` dict so
    the grammar is only resolved once per language. Any failure while
    importing ``tree_sitter_languages`` or resolving the grammar yields
    ``None``; failures are not cached.
    """
    try:
        return _TS_LANGS[lang_name]
    except KeyError:
        pass  # Not resolved yet; fall through to the real lookup.

    try:
        from tree_sitter_languages import get_language

        resolved = get_language(lang_name)
    except Exception:
        return None

    _TS_LANGS[lang_name] = resolved
    return resolved
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _get_ts_query(lang_name: str) -> Optional[str]:
|
|
32
|
+
"""Return the tree-sitter query string for the given language."""
|
|
33
|
+
if lang_name == "php":
|
|
34
|
+
return """
|
|
35
|
+
(class_declaration) @decl
|
|
36
|
+
(function_declaration) @decl
|
|
37
|
+
(method_declaration) @decl
|
|
38
|
+
(interface_declaration) @decl
|
|
39
|
+
(trait_declaration) @decl
|
|
40
|
+
"""
|
|
41
|
+
if lang_name in ("javascript", "typescript"):
|
|
42
|
+
return """
|
|
43
|
+
(class_declaration) @decl
|
|
44
|
+
(function_declaration) @decl
|
|
45
|
+
(method_definition) @decl
|
|
46
|
+
(export_statement) @decl
|
|
47
|
+
"""
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def chunk_by_ast(
    text: str, lang_name: str, max_chars: int = 1500, overlap: int = 200
) -> Optional[List[str]]:
    """Chunk code using tree-sitter AST nodes (classes, functions, methods).

    The source is parsed with the grammar for ``lang_name`` and every node
    captured by the language's query becomes a chunk. Text between captured
    nodes, and after the last one, is emitted as its own chunk so nothing is
    dropped. Any piece longer than ``max_chars`` is further split with
    ``chunk_by_chars(piece, max_chars, overlap)``.

    Args:
        text: Source code to chunk.
        lang_name: Language key understood by ``_get_ts_lang`` and
            ``_get_ts_query``.
        max_chars: Maximum characters per chunk before falling back to
            character windows.
        overlap: Character overlap used by that fallback.

    Returns:
        The list of chunks, or ``None`` when the language or its query is
        unavailable, when the query captures nothing, or when parsing fails
        (the failure is logged as a warning). Callers are expected to fall
        back to another chunking strategy on ``None``.
    """
    lang = _get_ts_lang(lang_name)
    query_str = _get_ts_query(lang_name)
    if not lang or not query_str:
        return None

    try:
        from tree_sitter import Parser

        parser = Parser()
        parser.set_language(lang)

        # tree-sitter reports *byte* offsets into the encoded buffer, so all
        # slicing is done on the bytes and decoded afterwards. Slicing the
        # original ``str`` with those offsets misaligns every chunk as soon as
        # the source contains a non-ASCII character.
        source = text.encode("utf-8")
        tree = parser.parse(source)
        captures = lang.query(query_str).captures(tree.root_node)

        if not captures:
            return None

        chunks: List[str] = []

        def _emit(raw: bytes) -> None:
            """Decode one byte span and append it, splitting if oversized."""
            piece = raw.decode("utf-8", errors="replace").strip()
            if not piece:
                return
            if len(piece) > max_chars:
                chunks.extend(chunk_by_chars(piece, max_chars, overlap))
            else:
                chunks.append(piece)

        covered = 0  # Byte offset up to which the source has been emitted.
        for node, _ in captures:
            # Text between the covered region and this node.
            if node.start_byte > covered:
                _emit(source[covered : node.start_byte])

            _emit(source[node.start_byte : node.end_byte])

            # Captures can nest (a class and the methods inside it). Never
            # move the watermark backwards, otherwise text inside an
            # already-emitted outer node is re-emitted as "gap" or "tail".
            covered = max(covered, node.end_byte)

        # Trailing text after the last captured node.
        if covered < len(source):
            _emit(source[covered:])

        return chunks
    except Exception as e:
        logger.warning("AST chunking failed for %s: %s", lang_name, e)
        return None
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def chunk_by_chars(text: str, size: int = 1500, overlap: int = 200) -> List[str]:
    """Chunk text by a fixed character window with overlap.

    Windows are ``size`` characters wide and advance by ``size - overlap``
    characters (at least 1, so an overlap that is too large cannot stall the
    loop). Each window is stripped and empty windows are skipped.

    Args:
        text: Text to split.
        size: Window width in characters.
        overlap: Characters shared between consecutive windows.

    Returns:
        The list of non-empty chunks, or an empty list for empty input.
    """
    if not text:
        return []

    chunks: List[str] = []
    n = len(text)
    step = max(1, size - overlap)
    for start in range(0, n, step):
        end = min(n, start + size)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside it; stop instead of emitting a redundant tail.
        if end >= n:
            break
    return chunks
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def chunk_by_lines(text: str, chunk_size: int, overlap: int = 0) -> List[str]:
    """Chunk text by lines with optional overlap.

    Args:
        text: Text to split; it is broken into lines with ``str.splitlines``.
        chunk_size: Number of lines per chunk.
        overlap: Number of lines shared between consecutive chunks. The
            window advances by ``chunk_size - overlap`` lines, at least 1.

    Returns:
        Chunks joined with a newline, skipping whitespace-only chunks. Empty
        input yields an empty list.
    """
    if not text:
        return []
    lines = text.splitlines()
    if not lines:
        return []

    chunks: List[str] = []
    total = len(lines)
    step = max(1, chunk_size - overlap)
    for i in range(0, total, step):
        chunk = "\n".join(lines[i : i + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
        # This window already includes the last line; later windows would
        # only repeat its tail, so stop here.
        if i + chunk_size >= total:
            break
    return chunks
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _split_sentences(text: str) -> List[str]:
|
|
149
|
+
"""Split prose into sentences using a lightweight regex heuristic."""
|
|
150
|
+
if not text:
|
|
151
|
+
return []
|
|
152
|
+
normalized = re.sub(r"\s+", " ", text).strip()
|
|
153
|
+
if not normalized:
|
|
154
|
+
return []
|
|
155
|
+
parts = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9\"'`])", normalized)
|
|
156
|
+
return [p.strip() for p in parts if p.strip()]
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def chunk_by_sentence_window(
    text: str, window_size: int = 5, overlap: int = 2
) -> List[str]:
    """Group sentences into overlapping windows.

    The text is split with ``_split_sentences``. If it holds no more than
    ``window_size`` sentences, a single chunk is returned. Otherwise windows
    of ``window_size`` sentences are emitted, advancing by
    ``window_size - overlap`` sentences (at least 1). Iteration stops as soon
    as a window reaches the final sentence, so no redundant tail is produced.
    """
    if not text:
        return []

    sentences = _split_sentences(text)
    total = len(sentences)
    if total == 0:
        return []
    if total <= window_size:
        return [" ".join(sentences)]

    stride = max(1, window_size - overlap)
    windows: List[str] = []
    start = 0
    while start < total:
        window = " ".join(sentences[start : start + window_size]).strip()
        if window:
            windows.append(window)
        # The window that touches the last sentence is the final one.
        if start + window_size >= total:
            break
        start += stride
    return windows
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def chunk_php_hierarchical(
    text: str, section_max_lines: int = 150, overlap: int = 20
) -> List[str]:
    """Chunk PHP by declarations, preserving docblocks and injecting context.

    The source is cut at every ``class``/``interface``/``trait``/``function``
    declaration, together with the docblock that immediately precedes it.
    Each section gets a ``// Context: ...`` header line: ``<kind> <name>`` for
    class-like declarations, and ``<class>::<function>`` for functions seen
    after a class-like declaration. Text before the first declaration (for
    example ``<?php``) becomes its own section.

    Args:
        text: PHP source code.
        section_max_lines: Sections longer than this are split further with
            ``chunk_by_lines``.
        overlap: Line overlap used by that split.

    Returns:
        The list of chunks. Falls back to ``chunk_by_lines`` over the whole
        text when no declaration is found, and returns an empty list for
        empty input.
    """
    if not text:
        return []

    # Declaration boundary, optionally preceded by its docblock.
    # The docblock body is matched with a tempered pattern that cannot cross
    # a "*/" terminator. A plain lazy "[\s\S]*?" would backtrack past the
    # first "*/", so an inline "/** @var ... */" inside a method body could
    # be taken as the start of the *next* declaration's section.
    boundary_regex = (
        r"(?:/\*\*(?:(?!\*/)[\s\S])*\*/\s*)?"
        r"^\s*(?:abstract\s+|final\s+|public\s+|protected\s+|private\s+|static\s+)*"
        r"(class|interface|trait|function)\s+([a-zA-Z0-9_]+)"
    )

    matches = list(re.finditer(boundary_regex, text, re.MULTILINE))

    if not matches:
        return chunk_by_lines(text, section_max_lines, overlap)

    sections = []
    current_class = ""

    for i, match in enumerate(matches):
        m_type = match.group(1)
        m_name = match.group(2)

        # Preamble before the first declaration (like <?php).
        if i == 0 and match.start() > 0:
            sections.append(text[0 : match.start()].strip())

        # Context header for this section.
        header = ""
        if m_type == "function" and current_class:
            header = f"// Context: {current_class}::{m_name}\n"
        elif m_type in ("class", "interface", "trait"):
            header = f"// Context: {m_type} {m_name}\n"
            current_class = m_name

        # A section runs up to the start of the next declaration.
        next_start = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_text = text[match.start() : next_start].strip()

        if section_text:
            sections.append(f"{header}{section_text}")

    chunks: List[str] = []
    for section in sections:
        if not section:
            continue  # The preamble can strip down to nothing.
        line_count = len(section.splitlines())
        if line_count > section_max_lines:
            chunks.extend(chunk_by_lines(section, section_max_lines, overlap))
        else:
            chunks.append(section)

    return chunks
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def chunk_js_hierarchical(
    text: str, section_max_lines: int = 60, overlap: int = 10
) -> List[str]:
    """Chunk JS/TS by export/class/function boundaries, preserving docblocks.

    A boundary is a line that starts (after optional indentation and an
    optional ``export`` / ``export default``) with one of ``class``,
    ``function``, ``const``, ``let``, ``var``, ``describe``, ``test`` or
    ``it``. A docblock directly above the line belongs to that boundary.

    Args:
        text: JavaScript or TypeScript source.
        section_max_lines: Sections longer than this are split further with
            ``chunk_by_lines``.
        overlap: Line overlap used by that split.

    Returns:
        The list of chunks. Falls back to ``chunk_by_lines`` over the whole
        text when no boundary is found, and returns an empty list for empty
        input.
    """
    if not text:
        return []

    # The docblock body uses a tempered pattern that cannot cross a "*/"
    # terminator. A plain lazy "[\s\S]*?" would backtrack past the first
    # "*/", so an inline "/** @type ... */" in a function body could be
    # taken as the start of the next declaration's section.
    boundary_regex = (
        r"(?:/\*\*(?:(?!\*/)[\s\S])*\*/\s*)?"
        r"^\s*(?:export\s+(?:default\s+)*)?"
        r"(?:class|function|const|let|var|describe|test|it)\b"
    )

    sections = []
    matches = list(re.finditer(boundary_regex, text, re.MULTILINE))

    if not matches:
        return chunk_by_lines(text, section_max_lines, overlap)

    # Each section spans from one boundary to the next; text before the
    # first boundary forms its own section.
    last_pos = 0
    for match in matches:
        if match.start() > last_pos:
            section = text[last_pos : match.start()].strip()
            if section:
                sections.append(section)
        last_pos = match.start()

    if last_pos < len(text):
        sections.append(text[last_pos:].strip())

    chunks: List[str] = []
    for section in sections:
        line_count = len(section.splitlines())
        if line_count > section_max_lines:
            chunks.extend(chunk_by_lines(section, section_max_lines, overlap))
        else:
            chunks.append(section)

    return chunks
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def build_doc_records(filename: str, chunks: List[str], meta: Dict) -> List[Dict]:
    """Build a list of document dicts suitable for DB insertion.

    Each record includes `text`, `title`, `filename`, `category`, `breadcrumb`,
    `doctype`, `endpoint`, `summary`, `chunk_id` and `chunk_hash`.

    The `text` field is the chunk prefixed with a metadata header (category,
    doctype, title, breadcrumb and, when present, endpoint and summary) so
    that the context is embedded together with the content. `chunk_hash` is
    the SHA-256 of ``CHUNK_VERSION`` plus that enriched text.

    Args:
        filename: Source file name stored on every record.
        chunks: Chunk texts, in document order; the index becomes `chunk_id`.
        meta: Document metadata. All keys are optional.

    Returns:
        One record per chunk, in the same order as `chunks`.
    """
    title = meta.get("title") or ""
    breadcrumb = meta.get("breadcrumb") or ""
    summary = meta.get("summary")
    endpoint = meta.get("endpoint")

    # `category` may be present but explicitly None; calling .upper() on the
    # raw lookup would then raise AttributeError, so default it here.
    raw_category = meta.get("category")
    category = ("unknown" if raw_category is None else raw_category).upper()
    doctype = meta.get("doctype", "unknown")

    # The header depends only on `meta`, so build it once for all chunks.
    header = f"[{category}] DocType: {doctype}\n"
    header += f"[Title: {title} | Path: {breadcrumb}] \n"
    if endpoint:
        header += f"[Endpoint: {endpoint}] \n"
    if summary:
        header += f"[Summary: {summary}] \n"

    records: List[Dict] = []
    for i, c in enumerate(chunks):
        # Prepend the context to the text before embedding.
        enriched_text = f"{header}\n {c}"

        # Per-chunk fingerprint: covers the enriched text (which carries all
        # context) and the chunking logic version.
        chunk_hash = hashlib.sha256(
            f"{CHUNK_VERSION}:{enriched_text}".encode("utf-8")
        ).hexdigest()

        records.append(
            {
                "text": enriched_text,
                "title": title,
                "filename": filename,
                "category": meta.get("category"),
                "breadcrumb": breadcrumb,
                "doctype": meta.get("doctype", "unknown"),
                "endpoint": endpoint,
                "summary": summary,
                "chunk_id": i,
                "chunk_hash": chunk_hash,
            }
        )
    return records
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def persist_batch(table, docs: List[Dict]):
    """Write a batch of docs through `table` and return its result.

    `table` is normally a LanceDB-like object exposing `add(iterable)`. An
    object without an `add` attribute is treated as a callable and invoked
    with the docs instead. An empty batch is a no-op that returns ``None``.

    Keeping the write behind this thin wrapper makes call sites easy to test.
    """
    if not docs:
        return None
    if not hasattr(table, "add"):
        # Fallback: treat the table itself as a callable sink.
        return table(docs)
    return table.add(docs)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Callable, TypeVar
|
|
4
|
+
import inspect
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("plesk_unified")
|
|
7
|
+
|
|
8
|
+
# Try to import LanceDB exceptions for precise matching
|
|
9
|
+
try:
|
|
10
|
+
import lancedb.exceptions as lancedb_exc
|
|
11
|
+
|
|
12
|
+
LANCEDB_EXCEPTIONS_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
LANCEDB_EXCEPTIONS_AVAILABLE = False
|
|
15
|
+
|
|
16
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _classify_error(exc: Exception) -> str: # noqa: PLR0911
|
|
20
|
+
"""Map known exception types to user-friendly guidance strings."""
|
|
21
|
+
exc_msg = str(exc)
|
|
22
|
+
exc_msg_lower = exc_msg.lower()
|
|
23
|
+
exc_type_name = type(exc).__name__
|
|
24
|
+
|
|
25
|
+
# 1. LanceDB TableNotFoundError
|
|
26
|
+
if (
|
|
27
|
+
LANCEDB_EXCEPTIONS_AVAILABLE
|
|
28
|
+
and hasattr(lancedb_exc, "TableNotFoundError")
|
|
29
|
+
and isinstance(exc, lancedb_exc.TableNotFoundError)
|
|
30
|
+
):
|
|
31
|
+
return (
|
|
32
|
+
"[ERROR] Knowledge base not indexed. "
|
|
33
|
+
"Call refresh_knowledge(reset_db=True) first."
|
|
34
|
+
)
|
|
35
|
+
if exc_type_name == "TableNotFoundError":
|
|
36
|
+
return (
|
|
37
|
+
"[ERROR] Knowledge base not indexed. "
|
|
38
|
+
"Call refresh_knowledge(reset_db=True) first."
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Handle ValueError that some versions of LanceDB raise when a table is missing
|
|
42
|
+
if isinstance(exc, ValueError) and "was not found" in exc_msg_lower:
|
|
43
|
+
return (
|
|
44
|
+
"[ERROR] Knowledge base not indexed. "
|
|
45
|
+
"Call refresh_knowledge(reset_db=True) first."
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# 2. LanceDB connection error
|
|
49
|
+
# Connection errors in LanceDB can manifest as various exceptions
|
|
50
|
+
# depending on the storage backend
|
|
51
|
+
if (
|
|
52
|
+
"lancedb" in exc_msg_lower
|
|
53
|
+
or "database" in exc_msg_lower
|
|
54
|
+
or "connection" in exc_msg_lower
|
|
55
|
+
):
|
|
56
|
+
if (
|
|
57
|
+
"not found" not in exc_msg_lower
|
|
58
|
+
): # Avoid collision with TableNotFoundError if not caught above
|
|
59
|
+
return (
|
|
60
|
+
"[ERROR] Database unavailable. Check storage/lancedb/ path. "
|
|
61
|
+
"Call daemon_health for details."
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# 3. RuntimeError containing "model"
|
|
65
|
+
if isinstance(exc, RuntimeError) and "model" in exc_msg_lower:
|
|
66
|
+
return "[ERROR] Embedding model not loaded. Call warmup_server first."
|
|
67
|
+
|
|
68
|
+
# 4. PermissionError
|
|
69
|
+
if isinstance(exc, PermissionError):
|
|
70
|
+
return "[ERROR] Path traversal detected. Operation rejected."
|
|
71
|
+
|
|
72
|
+
# --- NEW FIX: Classify ValueError from _validate_category ---
|
|
73
|
+
if isinstance(exc, ValueError) and "invalid category" in exc_msg_lower:
|
|
74
|
+
# Re-raise as a more specific error message, keeping context
|
|
75
|
+
return f"[ERROR] Invalid argument: {exc_msg}. Check allowed category values."
|
|
76
|
+
|
|
77
|
+
# 5. Generic fallback
|
|
78
|
+
return (
|
|
79
|
+
"[ERROR] Unexpected server error. "
|
|
80
|
+
"Call daemon_health to check server state, then retry."
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def tool_error_boundary(fn: F) -> F:
    """
    Wrap an MCP tool so that failures become sanitized guidance strings.

    Works for both plain functions and coroutine functions. When the wrapped
    tool raises, the full traceback is written to the module logger and the
    caller receives the ``[ERROR] ...`` message chosen by ``_classify_error``
    instead of the exception. Successful results pass through unchanged.
    """
    if not inspect.iscoroutinefunction(fn):

        @functools.wraps(fn)
        def sync_wrapper(*args: Any, **kwargs: Any) -> str:
            try:
                outcome = fn(*args, **kwargs)
            except Exception as exc:
                logger.error("Tool %s failed", fn.__name__, exc_info=True)
                return _classify_error(exc)
            return outcome

        return sync_wrapper  # type: ignore

    @functools.wraps(fn)
    async def async_wrapper(*args: Any, **kwargs: Any) -> str:
        try:
            outcome = await fn(*args, **kwargs)
        except Exception as exc:
            logger.error("Tool %s failed", fn.__name__, exc_info=True)
            return _classify_error(exc)
        return outcome

    return async_wrapper  # type: ignore
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple, Set
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from markdownify import markdownify as _md
|
|
9
|
+
|
|
10
|
+
from plesk_unified.ai_client import AIClient
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Constants for cognitive load rules
|
|
15
|
+
MAX_TABLE_ROWS = 10
|
|
16
|
+
MIN_PACKET_LEN = 5
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _is_table_complex(table) -> bool:
|
|
20
|
+
"""Heuristic to determine if a table is complex enough to need LLM normalization."""
|
|
21
|
+
# merged cells
|
|
22
|
+
if table.find_all(attrs={"colspan": True}) or table.find_all(
|
|
23
|
+
attrs={"rowspan": True}
|
|
24
|
+
):
|
|
25
|
+
return True
|
|
26
|
+
|
|
27
|
+
rows = table.find_all("tr")
|
|
28
|
+
if len(rows) > MAX_TABLE_ROWS: # Oversized table
|
|
29
|
+
return True
|
|
30
|
+
|
|
31
|
+
# Check for multi-level headers (th in non-first row or multiple rows with th)
|
|
32
|
+
th_rows = 0
|
|
33
|
+
for row in rows:
|
|
34
|
+
if row.find("th"):
|
|
35
|
+
th_rows += 1
|
|
36
|
+
if th_rows > 1:
|
|
37
|
+
return True
|
|
38
|
+
|
|
39
|
+
return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _normalize_table_with_llm(
    table, ai_client: AIClient, model: Optional[str] = None
) -> str:
    """Convert a complex HTML table into prose using an LLM.

    The table's HTML is embedded in a prompt and sent through
    ``ai_client.generate_description``. This is best-effort: a failed call
    is logged and reported as an empty string so the caller can fall back to
    rule-based conversion.

    Args:
        table: Table element; ``str(table)`` must yield its HTML.
        ai_client: Client used to call the model.
        model: Model identifier. Defaults to ``google/gemini-2.5-flash-lite``.

    Returns:
        The generated prose, or ``""`` when the call fails, returns nothing,
        or returns the ``"Description unavailable."`` placeholder.
    """
    table_html = str(table)
    prompt = (
        "Convert the following HTML table into descriptive prose sentences "
        "that preserve row-column semantic relationships. Output ONLY the prose.\n\n"
        f"Table:\n{table_html}"
    )

    try:
        models = [model] if model else ["google/gemini-2.5-flash-lite"]
        res = ai_client.generate_description(prompt, model_list=models)
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.warning(
            "LLM table normalization failed; falling back to rule-based prose",
            exc_info=True,
        )
        return ""

    if res and res != "Description unavailable.":
        return res
    return ""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _normalize_table_to_sentences(table) -> str:
|
|
65
|
+
"""Convert an HTML table into descriptive prose sentences."""
|
|
66
|
+
rows = table.find_all("tr")
|
|
67
|
+
if not rows:
|
|
68
|
+
return ""
|
|
69
|
+
|
|
70
|
+
matrix = []
|
|
71
|
+
for row in rows:
|
|
72
|
+
cells = row.find_all(["th", "td"])
|
|
73
|
+
values = [c.get_text(" ", strip=True) for c in cells]
|
|
74
|
+
if any(values):
|
|
75
|
+
matrix.append(values)
|
|
76
|
+
|
|
77
|
+
if not matrix:
|
|
78
|
+
return ""
|
|
79
|
+
|
|
80
|
+
header_row = rows[0].find_all("th")
|
|
81
|
+
has_header = bool(header_row)
|
|
82
|
+
|
|
83
|
+
headers = matrix[0] if has_header else []
|
|
84
|
+
data_rows = matrix[1:] if has_header else matrix
|
|
85
|
+
|
|
86
|
+
sentences = []
|
|
87
|
+
for row in data_rows:
|
|
88
|
+
if headers and len(headers) == len(row):
|
|
89
|
+
parts = [f"{headers[i]}: {row[i]}" for i in range(len(row)) if row[i]]
|
|
90
|
+
sentence = ", ".join(parts)
|
|
91
|
+
else:
|
|
92
|
+
sentence = " | ".join(cell for cell in row if cell)
|
|
93
|
+
sentence = sentence.strip()
|
|
94
|
+
if sentence:
|
|
95
|
+
sentences.append(sentence + ".")
|
|
96
|
+
|
|
97
|
+
return "\n".join(sentences)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _replace_tables_with_prose(
    soup: BeautifulSoup,
    llm_enabled: bool = False,
    ai_client: Optional[AIClient] = None,
    llm_model: Optional[str] = None,
) -> None:
    """Swap every ``<table>`` in ``soup`` for a ``<p>`` holding prose.

    Complex tables are sent to the LLM when ``llm_enabled`` is set and an
    ``ai_client`` is supplied. Whenever that yields nothing, or does not
    apply, the rule-based converter is used. Tables that produce no prose at
    all are removed from the tree. The soup is modified in place.
    """
    for table in soup.find_all("table"):
        llm_prose = (
            _normalize_table_with_llm(table, ai_client, llm_model)
            if llm_enabled and ai_client and _is_table_complex(table)
            else ""
        )
        prose = llm_prose or _normalize_table_to_sentences(table)

        if prose:
            paragraph = soup.new_tag("p")
            paragraph.string = prose
            table.replace_with(paragraph)
        else:
            # Nothing worth keeping: drop the table entirely.
            table.decompose()
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def extract_api_endpoints(html: str) -> Optional[str]:
    """Extract API endpoint signatures (REST, XML, CLI) from HTML content.

    Three regex heuristics run over the raw markup:

    - REST: an HTTP verb followed, within 100 characters on the same line, by
      a path such as ``/api/v2/domains`` -> ``"GET /api/v2/domains"``.
    - XML: identifiers ending in ``_list``, ``_get`` or ``_set`` that are
      longer than ``MIN_PACKET_LEN`` -> ``"XML: <name>"``.
    - CLI: ``plesk [bin] <object> <command>`` -> ``"CLI: <object> <command>"``.

    Args:
        html: Raw HTML text.

    Returns:
        The distinct signatures, sorted and joined with ``", "``, or ``None``
        when nothing was found.
    """
    found_endpoints: Set[str] = set()

    # 1. REST API patterns.
    # The word boundaries keep the verb from matching inside other words
    # ("TARGET", "GETTING"). The lookbehind stops a closing tag such as
    # "</code>" from being read as the path "/code" when the verb sits
    # inside markup.
    rest_matches = re.findall(
        r"\b(GET|POST|PUT|DELETE|PATCH)\b.{0,100}?(?<!<)"
        r"((?:/api/v2)?/[a-zA-Z0-9\/\-\_{}]+)",
        html,
    )
    for method, path in rest_matches:
        found_endpoints.add(f"{method} {path}")

    # 2. XML API patterns
    xml_matches = re.findall(r"([a-z0-9\_]+(?:_list|_get|_set))", html)
    for packet in xml_matches:
        if len(packet) > MIN_PACKET_LEN:  # avoid tiny noise
            found_endpoints.add(f"XML: {packet}")

    # 3. CLI patterns
    cli_matches = re.findall(r"plesk\s+(?:bin\s+)?([a-z0-9\_]+)\s+([a-z0-9\_]+)", html)
    for obj, cmd in cli_matches:
        found_endpoints.add(f"CLI: {obj} {cmd}")

    if not found_endpoints:
        return None

    return ", ".join(sorted(found_endpoints))
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def clean_dom_tree(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip page chrome from the DOM and turn tables into prose.

    Navigation, footers, scripts, styles, asides, sidebars, tables of
    contents and ``<noscript>`` elements are removed. Tables are then
    replaced with prose; the LLM path is used only when the
    ``PLESK_HTML_LLM_TABLE_NORMALIZE`` environment variable equals ``"1"``.
    The same soup object is modified in place and returned.
    """
    noisy_selector = "nav, footer, script, style, aside, .sidebar, .toc, noscript"
    for element in soup.select(noisy_selector):
        element.decompose()

    use_llm = os.environ.get("PLESK_HTML_LLM_TABLE_NORMALIZE") == "1"
    client = AIClient() if use_llm else None
    _replace_tables_with_prose(soup, llm_enabled=use_llm, ai_client=client)

    return soup
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def convert_soup_to_markdown(soup: BeautifulSoup) -> str:
    """Render the main content of a cleaned soup as Markdown.

    The content root is the first of ``<main>``, ``<article>`` or ``<body>``
    that exists; without any of them the whole soup is used. Headings are
    written ATX-style, links are reduced to their text, and runs of three or
    more newlines are collapsed to a single blank line.
    """
    content_root = soup.find("main") or soup.find("article") or soup.body
    if content_root:
        raw_html = str(content_root)
    else:
        raw_html = str(soup)

    # Markdown keeps code blocks and headings intact.
    markdown = _md(raw_html, heading_style="ATX", strip=["a"])

    # Collapse runs of 3+ newlines.
    return re.sub(r"\n{3,}", "\n\n", markdown).strip()
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def parse_html(
    path: Path, toc_meta: Optional[dict] = None
) -> Tuple[str, str, Optional[str], str, Optional[str]]:
    """Parse an HTML file into (filename, title, breadcrumb, text, endpoint).

    The file is read as UTF-8 with undecodable bytes ignored. Endpoints are
    extracted from the raw markup before any cleaning. The title comes from
    the ``<title>`` tag, falling back to ``toc_meta["title"]`` and then to an
    empty string. The breadcrumb comes from ``toc_meta`` only. The text is
    the cleaned DOM rendered as Markdown.
    """
    path = Path(path)
    toc = toc_meta or {}
    html = path.read_text(encoding="utf-8", errors="ignore")

    # Endpoint signatures are taken from the untouched markup.
    endpoint = extract_api_endpoints(html)

    soup = BeautifulSoup(html, "html.parser")

    # Title: prefer the document's own <title>.
    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text(strip=True)
    else:
        title = toc.get("title", "")

    # Clean the DOM, then convert what is left to Markdown.
    text = convert_soup_to_markdown(clean_dom_tree(soup))

    return path.name, title, toc.get("breadcrumb"), text, endpoint
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def clean_html_for_markdown(html: str) -> str:
    """Clean an HTML string and return the markup of its main content.

    The document is parsed and passed through ``clean_dom_tree``. The result
    is the serialized ``<main>``, ``<article>`` or ``<body>`` element,
    whichever is found first, or the whole cleaned document when none exists.
    """
    cleaned = clean_dom_tree(BeautifulSoup(html, "html.parser"))
    content_root = cleaned.find("main") or cleaned.find("article") or cleaned.body
    if content_root:
        return str(content_root)
    return str(cleaned)
|