mcp-plesk-dev-docs 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,360 @@
1
+ import hashlib
2
+ import re
3
+ import logging
4
+ from typing import Dict, List, Optional
5
+
6
# Module logger, registered under the shared "plesk_unified" name.
logger = logging.getLogger("plesk_unified")

# Bump this version whenever the chunking logic or context injection changes
# to force a re-embedding of changed chunks while preserving identical ones.
# The value is mixed into every per-chunk hash computed by build_doc_records.
CHUNK_VERSION = "v15"

# Global registry for tree-sitter languages to avoid repeated lookups.
# Filled lazily by _get_ts_lang, keyed by the language name it was asked for.
_TS_LANGS = {}
14
+
15
+
16
def _get_ts_lang(lang_name: str):
    """Return the tree-sitter language for ``lang_name``, or ``None``.

    Successful lookups are memoized in the module-level ``_TS_LANGS``
    registry. Any failure while importing ``tree_sitter_languages`` or while
    resolving the language yields ``None``; failures are not cached, so a
    later call retries the lookup.
    """
    try:
        return _TS_LANGS[lang_name]
    except KeyError:
        pass  # not loaded yet; fall through to the real lookup

    try:
        from tree_sitter_languages import get_language

        loaded = get_language(lang_name)
    except Exception:
        return None

    _TS_LANGS[lang_name] = loaded
    return loaded
29
+
30
+
31
+ def _get_ts_query(lang_name: str) -> Optional[str]:
32
+ """Return the tree-sitter query string for the given language."""
33
+ if lang_name == "php":
34
+ return """
35
+ (class_declaration) @decl
36
+ (function_declaration) @decl
37
+ (method_declaration) @decl
38
+ (interface_declaration) @decl
39
+ (trait_declaration) @decl
40
+ """
41
+ if lang_name in ("javascript", "typescript"):
42
+ return """
43
+ (class_declaration) @decl
44
+ (function_declaration) @decl
45
+ (method_definition) @decl
46
+ (export_statement) @decl
47
+ """
48
+ return None
49
+
50
+
51
def chunk_by_ast(
    text: str, lang_name: str, max_chars: int = 1500, overlap: int = 200
) -> Optional[List[str]]:
    """Chunk code using tree-sitter AST nodes (classes, functions, methods).

    The text is parsed with the grammar for ``lang_name`` and every node
    captured by that language's query becomes a chunk. Text lying between
    captured nodes, and after the last one, is emitted as additional chunks.
    Any piece longer than ``max_chars`` characters is further split with
    ``chunk_by_chars(piece, max_chars, overlap)``.

    Args:
        text: Source code to chunk.
        lang_name: Language name used to look up the grammar and the query.
        max_chars: Maximum characters per emitted chunk before re-splitting.
        overlap: Character overlap passed to ``chunk_by_chars``.

    Returns:
        The list of chunks, or ``None`` when the language/query is not
        available, when nothing was captured, or when parsing fails (the
        failure is logged as a warning).
    """
    lang = _get_ts_lang(lang_name)
    query_str = _get_ts_query(lang_name)
    if not lang or not query_str:
        return None

    def _pieces(raw: bytes) -> List[str]:
        # Decode one byte span and return it as zero, one or several chunks.
        piece = raw.decode("utf-8", errors="replace").strip()
        if not piece:
            return []
        if len(piece) > max_chars:
            return chunk_by_chars(piece, max_chars, overlap)
        return [piece]

    try:
        from tree_sitter import Parser

        parser = Parser()
        parser.set_language(lang)
        # tree-sitter reports node positions as BYTE offsets into this
        # buffer, so all slicing below is done on the encoded bytes rather
        # than on ``text`` (the two diverge as soon as the source contains
        # any non-ASCII character).
        source = text.encode("utf-8")
        tree = parser.parse(source)
        query = lang.query(query_str)
        captures = query.captures(tree.root_node)

        if not captures:
            return None

        chunks: List[str] = []
        last_end = 0

        for node, _ in captures:
            # Handle gap before this node
            if node.start_byte > last_end:
                chunks.extend(_pieces(source[last_end : node.start_byte]))

            chunks.extend(_pieces(source[node.start_byte : node.end_byte]))
            # NOTE(review): captures can be nested (e.g. a method inside a
            # captured class), in which case this offset moves backwards and
            # the nested text is emitted again - presumably intentional to
            # get method-level chunks; confirm before changing.
            last_end = node.end_byte

        # Handle tail
        if last_end < len(source):
            chunks.extend(_pieces(source[last_end:]))

        return chunks
    except Exception as e:
        logger.warning("AST chunking failed for %s: %s", lang_name, e)
        return None
109
+
110
+
111
def chunk_by_chars(text: str, size: int = 1500, overlap: int = 200) -> List[str]:
    """Chunk text by fixed character window with overlap.

    Windows of ``size`` characters are taken every ``size - overlap``
    characters (at least 1). Each window is stripped and empty results are
    dropped. Iteration stops as soon as a window reaches the end of the
    text, so no trailing chunk that is wholly contained in the previous one
    is produced.

    Args:
        text: Text to split.
        size: Window length in characters.
        overlap: Number of characters shared by consecutive windows.

    Returns:
        The list of non-empty chunks; ``[]`` for empty input.
    """
    if not text:
        return []
    chunks: List[str] = []
    n = len(text)
    step = max(1, size - overlap)
    for start in range(0, n, step):
        end = min(n, start + size)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # The window already covers the end of the text: any further
        # window would only repeat a suffix of this one.
        if end >= n:
            break
    return chunks
126
+
127
+
128
def chunk_by_lines(text: str, chunk_size: int, overlap: int = 0) -> List[str]:
    """Chunk text by lines with optional overlap.

    `chunk_size` is number of lines per chunk. `overlap` is number of lines
    to overlap between consecutive chunks. Chunks that contain only
    whitespace are dropped. Iteration stops once a chunk reaches the last
    line, so no trailing chunk fully contained in the previous one is
    produced.

    Args:
        text: Text to split.
        chunk_size: Number of lines per chunk.
        overlap: Number of lines shared by consecutive chunks.

    Returns:
        The list of chunks (lines joined with ``"\\n"``); ``[]`` for empty
        input.
    """
    if not text:
        return []
    lines = text.splitlines()
    if not lines:
        return []
    chunks: List[str] = []
    total = len(lines)
    step = max(1, chunk_size - overlap)
    for i in range(0, total, step):
        chunk = "\n".join(lines[i : i + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
        # This chunk already includes the last line; later windows would
        # only repeat its tail.
        if i + chunk_size >= total:
            break
    return chunks
146
+
147
+
148
+ def _split_sentences(text: str) -> List[str]:
149
+ """Split prose into sentences using a lightweight regex heuristic."""
150
+ if not text:
151
+ return []
152
+ normalized = re.sub(r"\s+", " ", text).strip()
153
+ if not normalized:
154
+ return []
155
+ parts = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9\"'`])", normalized)
156
+ return [p.strip() for p in parts if p.strip()]
157
+
158
+
159
def chunk_by_sentence_window(
    text: str, window_size: int = 5, overlap: int = 2
) -> List[str]:
    """Group sentences into overlapping windows.

    The text is split with ``_split_sentences``. If it has no more than
    ``window_size`` sentences a single chunk is returned. Otherwise windows
    of ``window_size`` sentences are taken every ``window_size - overlap``
    sentences (at least 1), stopping once a window reaches the last
    sentence so that no redundant tail window is produced.
    """
    if not text:
        return []

    sentences = _split_sentences(text)
    total = len(sentences)
    if total == 0:
        return []
    if total <= window_size:
        return [" ".join(sentences)]

    stride = max(1, window_size - overlap)
    windows: List[str] = []
    start = 0
    while start < total:
        window = " ".join(sentences[start : start + window_size]).strip()
        if window:
            windows.append(window)
        # The window touched the final sentence: nothing new can follow.
        if start + window_size >= total:
            break
        start += stride
    return windows
185
+
186
+
187
def chunk_php_hierarchical(
    text: str, section_max_lines: int = 150, overlap: int = 20
) -> List[str]:
    """Split PHP source at declaration boundaries and tag each section.

    A boundary is a ``class``/``interface``/``trait``/``function``
    declaration, optionally preceded by modifiers and by a ``/** ... */``
    docblock (which stays attached to its declaration). Each section is
    prefixed with a ``// Context: ...`` line: type and name for
    class-like declarations, ``Class::name`` for functions seen after a
    class-like declaration. Text before the first declaration becomes its
    own section. Sections longer than ``section_max_lines`` lines are
    re-split with ``chunk_by_lines``; when no declaration is found the
    whole text is split with ``chunk_by_lines``.
    """
    if not text:
        return []

    declaration_re = re.compile(
        r"(?:/\*\*[\s\S]*?\*/\s*)?"
        r"^\s*(?:abstract\s+|final\s+|public\s+|protected\s+|private\s+|static\s+)*"
        r"(class|interface|trait|function)\s+([a-zA-Z0-9_]+)",
        re.MULTILINE,
    )
    found = list(declaration_re.finditer(text))
    if not found:
        return chunk_by_lines(text, section_max_lines, overlap)

    pieces: List[str] = []

    # Anything ahead of the first declaration (e.g. the opening <?php tag).
    preamble = text[: found[0].start()].strip()
    if preamble:
        pieces.append(preamble)

    # Each section runs up to the start of the next declaration (or EOF).
    stops = [later.start() for later in found[1:]] + [len(text)]
    enclosing = ""
    for decl, stop in zip(found, stops):
        kind, name = decl.group(1), decl.group(2)
        if kind == "function":
            prefix = f"// Context: {enclosing}::{name}\n" if enclosing else ""
        else:
            prefix = f"// Context: {kind} {name}\n"
            enclosing = name

        body = text[decl.start() : stop].strip()
        if body:
            pieces.append(f"{prefix}{body}")

    chunks: List[str] = []
    for piece in pieces:
        if len(piece.splitlines()) > section_max_lines:
            chunks.extend(chunk_by_lines(piece, section_max_lines, overlap))
        else:
            chunks.append(piece)
    return chunks
248
+
249
+
250
def chunk_js_hierarchical(
    text: str, section_max_lines: int = 60, overlap: int = 10
) -> List[str]:
    """Split JS/TS source at export/class/function-style boundaries.

    A boundary is a line starting (after optional ``export``/``default``)
    with ``class``, ``function``, ``const``, ``let``, ``var``,
    ``describe``, ``test`` or ``it``, optionally preceded by a
    ``/** ... */`` docblock that stays attached to it. Sections longer than
    ``section_max_lines`` lines are re-split with ``chunk_by_lines``; when
    no boundary is found the whole text is split with ``chunk_by_lines``.
    """
    if not text:
        return []

    boundary_re = re.compile(
        r"(?:/\*\*[\s\S]*?\*/\s*)?"
        r"^\s*(?:export\s+(?:default\s+)*)?"
        r"(?:class|function|const|let|var|describe|test|it)\b",
        re.MULTILINE,
    )
    starts = [hit.start() for hit in boundary_re.finditer(text)]
    if not starts:
        return chunk_by_lines(text, section_max_lines, overlap)

    sections: List[str] = []
    cursor = 0
    for pos in starts:
        if pos > cursor:
            leading = text[cursor:pos].strip()
            if leading:
                sections.append(leading)
        cursor = pos

    # Everything from the last boundary to the end of the file.
    if cursor < len(text):
        sections.append(text[cursor:].strip())

    chunks: List[str] = []
    for part in sections:
        if len(part.splitlines()) > section_max_lines:
            chunks.extend(chunk_by_lines(part, section_max_lines, overlap))
        else:
            chunks.append(part)
    return chunks
293
+
294
+
295
def build_doc_records(filename: str, chunks: List[str], meta: Dict) -> List[Dict]:
    """Build a list of document dicts suitable for DB insertion.

    Each record includes `text`, `title`, `filename`, `category`, `breadcrumb`,
    `doctype`, `endpoint` and `summary`, plus `chunk_id` (position of the
    chunk in ``chunks``) and `chunk_hash`.

    The `text` field is enriched with metadata for better retrieval: a
    header built from the category, doctype, title, breadcrumb and, when
    present, endpoint and summary is prepended to every chunk.

    Args:
        filename: Stored verbatim in every record.
        chunks: Chunk texts, one record is produced per entry.
        meta: Metadata dict; the keys read are ``title``, ``breadcrumb``,
            ``summary``, ``endpoint``, ``category`` and ``doctype``.

    Returns:
        One dict per chunk, in input order.
    """
    title = meta.get("title") or ""
    breadcrumb = meta.get("breadcrumb") or ""
    summary = meta.get("summary")
    endpoint = meta.get("endpoint")
    raw_category = meta.get("category")
    doctype = meta.get("doctype", "unknown")

    # dict.get only applies its default when the key is missing, so an
    # explicit ``"category": None`` must be mapped to "unknown" by hand
    # before calling .upper().
    category_label = ("unknown" if raw_category is None else raw_category).upper()

    # Task B & Phase 2: Prepend context to the text before embedding.
    # The header depends only on ``meta``, so it is built once.
    header = f"[{category_label}] DocType: {doctype}\n"
    header += f"[Title: {title} | Path: {breadcrumb}] \n"
    if endpoint:
        header += f"[Endpoint: {endpoint}] \n"
    if summary:
        header += f"[Summary: {summary}] \n"

    records: List[Dict] = []
    for i, c in enumerate(chunks):
        enriched_text = f"{header}\n {c}"

        # Strategy 2: Per-chunk fingerprinting
        # Includes enriched_text (which has all context) and logic version.
        chunk_hash = hashlib.sha256(
            f"{CHUNK_VERSION}:{enriched_text}".encode("utf-8")
        ).hexdigest()

        records.append(
            {
                "text": enriched_text,
                "title": title,
                "filename": filename,
                "category": raw_category,
                "breadcrumb": breadcrumb,
                "doctype": doctype,
                "endpoint": endpoint,
                "summary": summary,
                "chunk_id": i,
                "chunk_hash": chunk_hash,
            }
        )
    return records
344
+
345
+
346
def persist_batch(table, docs: List[Dict]):
    """Write ``docs`` to ``table`` and return whatever the sink returns.

    ``table`` is normally an object exposing ``add(iterable)``
    (LanceDB-like). When it has no ``add`` attribute it is called directly
    with ``docs`` instead. An empty batch is a no-op that returns ``None``.

    Keeping this as a thin wrapper makes the call site easy to test.
    """
    if not docs:
        return None
    # Prefer the table's own ``add``; otherwise treat the table as a callable.
    sink = table.add if hasattr(table, "add") else table
    return sink(docs)
@@ -0,0 +1,112 @@
1
+ import functools
2
+ import logging
3
+ from typing import Any, Callable, TypeVar
4
+ import inspect
5
+
6
# Module logger, registered under the shared "plesk_unified" name.
logger = logging.getLogger("plesk_unified")

# Try to import LanceDB exceptions for precise matching
try:
    import lancedb.exceptions as lancedb_exc

    LANCEDB_EXCEPTIONS_AVAILABLE = True
except ImportError:
    # ``lancedb_exc`` is left unbound in this case, so code must test this
    # flag before touching it.
    LANCEDB_EXCEPTIONS_AVAILABLE = False

# Type variable bound to any callable; used to annotate the decorator in
# this module so it is typed as returning what it was given.
F = TypeVar("F", bound=Callable[..., Any])
17
+
18
+
19
+ def _classify_error(exc: Exception) -> str: # noqa: PLR0911
20
+ """Map known exception types to user-friendly guidance strings."""
21
+ exc_msg = str(exc)
22
+ exc_msg_lower = exc_msg.lower()
23
+ exc_type_name = type(exc).__name__
24
+
25
+ # 1. LanceDB TableNotFoundError
26
+ if (
27
+ LANCEDB_EXCEPTIONS_AVAILABLE
28
+ and hasattr(lancedb_exc, "TableNotFoundError")
29
+ and isinstance(exc, lancedb_exc.TableNotFoundError)
30
+ ):
31
+ return (
32
+ "[ERROR] Knowledge base not indexed. "
33
+ "Call refresh_knowledge(reset_db=True) first."
34
+ )
35
+ if exc_type_name == "TableNotFoundError":
36
+ return (
37
+ "[ERROR] Knowledge base not indexed. "
38
+ "Call refresh_knowledge(reset_db=True) first."
39
+ )
40
+
41
+ # Handle ValueError that some versions of LanceDB raise when a table is missing
42
+ if isinstance(exc, ValueError) and "was not found" in exc_msg_lower:
43
+ return (
44
+ "[ERROR] Knowledge base not indexed. "
45
+ "Call refresh_knowledge(reset_db=True) first."
46
+ )
47
+
48
+ # 2. LanceDB connection error
49
+ # Connection errors in LanceDB can manifest as various exceptions
50
+ # depending on the storage backend
51
+ if (
52
+ "lancedb" in exc_msg_lower
53
+ or "database" in exc_msg_lower
54
+ or "connection" in exc_msg_lower
55
+ ):
56
+ if (
57
+ "not found" not in exc_msg_lower
58
+ ): # Avoid collision with TableNotFoundError if not caught above
59
+ return (
60
+ "[ERROR] Database unavailable. Check storage/lancedb/ path. "
61
+ "Call daemon_health for details."
62
+ )
63
+
64
+ # 3. RuntimeError containing "model"
65
+ if isinstance(exc, RuntimeError) and "model" in exc_msg_lower:
66
+ return "[ERROR] Embedding model not loaded. Call warmup_server first."
67
+
68
+ # 4. PermissionError
69
+ if isinstance(exc, PermissionError):
70
+ return "[ERROR] Path traversal detected. Operation rejected."
71
+
72
+ # --- NEW FIX: Classify ValueError from _validate_category ---
73
+ if isinstance(exc, ValueError) and "invalid category" in exc_msg_lower:
74
+ # Re-raise as a more specific error message, keeping context
75
+ return f"[ERROR] Invalid argument: {exc_msg}. Check allowed category values."
76
+
77
+ # 5. Generic fallback
78
+ return (
79
+ "[ERROR] Unexpected server error. "
80
+ "Call daemon_health to check server state, then retry."
81
+ )
82
+
83
+
84
def tool_error_boundary(fn: F) -> F:
    """
    Wrap an MCP tool so that exceptions become sanitized guidance strings.

    Works for both plain and ``async def`` tools. When the tool raises, the
    full traceback is logged through the module logger and the wrapper
    returns the ``[ERROR] ...`` message chosen by ``_classify_error``
    instead of propagating the exception. Successful results are returned
    unchanged.
    """
    if not inspect.iscoroutinefunction(fn):

        @functools.wraps(fn)
        def guarded(*args: Any, **kwargs: Any) -> str:
            try:
                return fn(*args, **kwargs)
            except Exception as exc:
                logger.error("Tool %s failed", fn.__name__, exc_info=True)
                return _classify_error(exc)

        return guarded  # type: ignore

    @functools.wraps(fn)
    async def guarded_async(*args: Any, **kwargs: Any) -> str:
        try:
            return await fn(*args, **kwargs)
        except Exception as exc:
            logger.error("Tool %s failed", fn.__name__, exc_info=True)
            return _classify_error(exc)

    return guarded_async  # type: ignore
@@ -0,0 +1,217 @@
1
+ import logging
2
+ import os
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple, Set
6
+
7
+ from bs4 import BeautifulSoup
8
+ from markdownify import markdownify as _md
9
+
10
+ from plesk_unified.ai_client import AIClient
11
+
12
# Module logger named after this module.
logger = logging.getLogger(__name__)

# Constants for cognitive load rules
# A table with more than this many <tr> rows is treated as complex.
MAX_TABLE_ROWS = 10
# XML packet names must be longer than this to be kept (filters tiny noise).
MIN_PACKET_LEN = 5
17
+
18
+
19
def _is_table_complex(table) -> bool:
    """Decide heuristically whether a table warrants LLM normalization.

    A table counts as complex when any of these holds:
    - some element inside it carries a ``colspan`` or ``rowspan`` attribute;
    - it has more than ``MAX_TABLE_ROWS`` rows;
    - more than one row contains a ``<th>`` cell (multi-level headers).
    """
    has_merged_cells = bool(
        table.find_all(attrs={"colspan": True})
        or table.find_all(attrs={"rowspan": True})
    )
    if has_merged_cells:
        return True

    rows = table.find_all("tr")
    if len(rows) > MAX_TABLE_ROWS:  # Oversized table
        return True

    header_row_count = sum(1 for row in rows if row.find("th"))
    return header_row_count > 1
40
+
41
+
42
def _normalize_table_with_llm(
    table, ai_client: AIClient, model: Optional[str] = None
) -> str:
    """Convert a complex HTML table into prose using an LLM.

    The table's HTML is embedded in a prompt and sent through
    ``ai_client.generate_description``. This is best-effort: any exception
    is logged and swallowed so that the caller can fall back to another
    strategy.

    Args:
        table: Table element; ``str(table)`` is used as the HTML payload.
        ai_client: Client exposing ``generate_description(prompt, model_list=...)``.
        model: Model to use; defaults to ``google/gemini-2.5-flash-lite``.

    Returns:
        The generated prose, or ``""`` when the call fails, returns nothing,
        or returns the ``"Description unavailable."`` placeholder.
    """
    table_html = str(table)
    prompt = (
        "Convert the following HTML table into descriptive prose sentences "
        "that preserve row-column semantic relationships. Output ONLY the prose.\n\n"
        f"Table:\n{table_html}"
    )

    try:
        models = [model] if model else ["google/gemini-2.5-flash-lite"]
        res = ai_client.generate_description(prompt, model_list=models)
        if res and res != "Description unavailable.":
            return res
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.warning("LLM table normalization failed", exc_info=True)

    return ""
62
+
63
+
64
def _normalize_table_to_sentences(table) -> str:
    """Convert an HTML table into descriptive prose sentences.

    The first ``<tr>`` is treated as the header row when it contains at
    least one ``<th>``. Every other non-empty row becomes one sentence:
    ``"Header: value, Header: value."`` when the row has as many cells as
    the header, otherwise its non-empty cells joined with ``" | "``.
    Sentences are joined with newlines.

    The header is always taken from the first row itself. A header row
    whose cells are all empty is ignored, rather than letting the first
    data row be consumed as headers.

    Returns:
        The prose text, or ``""`` when the table has no rows or no content.
    """
    rows = table.find_all("tr")
    if not rows:
        return ""

    def _cell_texts(row):
        # Text of each th/td cell in the row, whitespace-normalized.
        return [c.get_text(" ", strip=True) for c in row.find_all(["th", "td"])]

    headers = []
    body_rows = rows
    if rows[0].find_all("th"):
        body_rows = rows[1:]
        first_values = _cell_texts(rows[0])
        if any(first_values):
            headers = first_values

    sentences = []
    for row in body_rows:
        values = _cell_texts(row)
        if not any(values):
            continue  # skip rows without any text
        if headers and len(headers) == len(values):
            sentence = ", ".join(
                f"{head}: {value}" for head, value in zip(headers, values) if value
            )
        else:
            sentence = " | ".join(cell for cell in values if cell)
        sentence = sentence.strip()
        if sentence:
            sentences.append(sentence + ".")

    return "\n".join(sentences)
98
+
99
+
100
def _replace_tables_with_prose(
    soup: BeautifulSoup,
    llm_enabled: bool = False,
    ai_client: Optional[AIClient] = None,
    llm_model: Optional[str] = None,
) -> None:
    """Swap every ``<table>`` in ``soup`` for a ``<p>`` holding prose.

    Complex tables go through the LLM first when ``llm_enabled`` is set and
    an ``ai_client`` is supplied; if that yields nothing (or does not
    apply), the rule-based sentence conversion is used. Tables for which no
    prose can be produced are removed. The tree is modified in place.
    """
    use_llm = bool(llm_enabled and ai_client)

    for table in soup.find_all("table"):
        prose = ""
        if use_llm and _is_table_complex(table):
            prose = _normalize_table_with_llm(table, ai_client, llm_model)

        # Fall back to the deterministic conversion when the LLM gave nothing.
        prose = prose or _normalize_table_to_sentences(table)

        if prose:
            paragraph = soup.new_tag("p")
            paragraph.string = prose
            table.replace_with(paragraph)
        else:
            table.decompose()
121
+
122
+
123
def extract_api_endpoints(html: str) -> Optional[str]:
    """Extract API endpoint signatures (REST, XML, CLI) from HTML content.

    Three regex heuristics are applied to the raw HTML:

    - REST: an HTTP method (as a whole word) followed within 100 characters
      on the same line by a path. A slash that directly follows ``<`` is
      skipped so that closing tags such as ``</code>`` are not taken for
      paths.
    - XML: identifiers ending in ``_list``, ``_get`` or ``_set`` that are
      longer than ``MIN_PACKET_LEN``.
    - CLI: ``plesk [bin] <utility> <command>``, where the command may start
      with one or two dashes (e.g. ``plesk bin domain --create``); the
      literal ``bin`` is never reported as the utility.

    Args:
        html: Raw HTML text.

    Returns:
        The distinct signatures, sorted and joined with ``", "``, or
        ``None`` when nothing was found.
    """
    found_endpoints: Set[str] = set()

    # 1. REST API patterns
    rest_matches = re.findall(
        r"\b(GET|POST|PUT|DELETE|PATCH)\b.{0,100}?"
        r"(?<!<)((?:/api/v2)?/[a-zA-Z0-9\/\-\_{}]+)",
        html,
    )
    for method, path in rest_matches:
        found_endpoints.add(f"{method} {path}")

    # 2. XML API patterns
    xml_matches = re.findall(r"([a-z0-9\_]+(?:_list|_get|_set))", html)
    for packet in xml_matches:
        if len(packet) > MIN_PACKET_LEN:  # avoid tiny noise
            found_endpoints.add(f"XML: {packet}")

    # 3. CLI patterns
    cli_matches = re.findall(
        r"plesk\s+(?:bin\s+)?(?!bin\b)([a-z0-9\_]+)\s+(-{0,2}[a-z0-9\_][a-z0-9\_\-]*)",
        html,
    )
    for obj, cmd in cli_matches:
        found_endpoints.add(f"CLI: {obj} {cmd}")

    if not found_endpoints:
        return None

    return ", ".join(sorted(found_endpoints))
150
+
151
+
152
def clean_dom_tree(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip navigation/boilerplate elements and turn tables into prose.

    Removes ``nav``, ``footer``, ``script``, ``style``, ``aside``,
    ``noscript`` elements and anything with class ``sidebar`` or ``toc``,
    then replaces tables with prose. LLM-based table normalization is
    switched on only when the ``PLESK_HTML_LLM_TABLE_NORMALIZE`` environment
    variable equals ``"1"``. The same (mutated) soup is returned.
    """
    noisy = soup.select("nav, footer, script, style, aside, .sidebar, .toc, noscript")
    for element in noisy:
        element.decompose()

    use_llm = os.environ.get("PLESK_HTML_LLM_TABLE_NORMALIZE") == "1"
    # Only build a client when the feature is actually enabled.
    client = AIClient() if use_llm else None
    _replace_tables_with_prose(soup, llm_enabled=use_llm, ai_client=client)

    return soup
164
+
165
+
166
def convert_soup_to_markdown(soup: BeautifulSoup) -> str:
    """Render a cleaned BeautifulSoup tree as Markdown text.

    The first available of ``<main>``, ``<article>`` or ``<body>`` is used
    as the content root, falling back to the whole document. Headings use
    ATX style, ``<a>`` tags are stripped, and runs of three or more newlines
    are collapsed to a single blank line.
    """
    root = soup.find("main") or soup.find("article") or soup.body
    raw_html = str(root if root else soup)

    # Markdown keeps code blocks and headings intact.
    markdown = _md(raw_html, heading_style="ATX", strip=["a"])

    return re.sub(r"\n{3,}", "\n\n", markdown).strip()
177
+
178
+
179
def parse_html(
    path: Path, toc_meta: Optional[dict] = None
) -> Tuple[str, str, Optional[str], str, Optional[str]]:
    """Parse an HTML file into ``(filename, title, breadcrumb, text, endpoint)``.

    - ``filename`` is the file's base name.
    - ``title`` comes from the ``<title>`` tag, or from ``toc_meta["title"]``
      (default ``""``) when the tag is absent.
    - ``breadcrumb`` is ``toc_meta["breadcrumb"]`` when provided.
    - ``text`` is the cleaned document converted to Markdown.
    - ``endpoint`` is extracted from the raw HTML before any cleaning.

    The file is read as UTF-8; undecodable bytes are ignored.
    """
    path = Path(path)
    html = path.read_text(encoding="utf-8", errors="ignore")
    toc = toc_meta or {}

    # Endpoints are searched in the untouched HTML.
    endpoint = extract_api_endpoints(html)

    soup = BeautifulSoup(html, "html.parser")

    # The title must be read before cleaning mutates the tree.
    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text(strip=True)
    else:
        title = toc.get("title", "")

    text = convert_soup_to_markdown(clean_dom_tree(soup))
    breadcrumb = toc.get("breadcrumb")

    return path.name, title, breadcrumb, text, endpoint
209
+
210
+
211
def clean_html_for_markdown(html: str) -> str:
    """Clean ``html`` and return the content root as an HTML string.

    The document is parsed, passed through ``clean_dom_tree``, and the first
    available of ``<main>``, ``<article>`` or ``<body>`` is serialized; when
    none exists the whole cleaned document is returned.
    """
    cleaned = clean_dom_tree(BeautifulSoup(html, "html.parser"))
    root = cleaned.find("main") or cleaned.find("article") or cleaned.body
    return str(root if root else cleaned)