docs-kit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,227 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+
6
+ def _sliding_window(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
7
+ """Split normalized text into overlapping windows."""
8
+ chunks: list[str] = []
9
+ start = 0
10
+ length = len(text)
11
+ while start < length:
12
+ end = min(length, start + chunk_size)
13
+ chunks.append(text[start:end].strip())
14
+ if end >= length:
15
+ break
16
+ start = max(0, end - chunk_overlap)
17
+ return [c for c in chunks if c]
18
+
19
+
20
def chunk_text(text: str, chunk_size: int = 800, chunk_overlap: int = 120) -> list[str]:
    """Chunk plain text into overlapping character windows.

    All runs of whitespace are collapsed to single spaces before windowing.

    Args:
        text: Raw input text.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        The chunks in order, or an empty list when ``text`` has no
        non-whitespace content.

    Raises:
        ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``
            (only checked when there is text to chunk).
    """
    words = text.split()
    if not words:
        return []
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    collapsed = " ".join(words)
    return _sliding_window(collapsed, chunk_size, chunk_overlap)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Markdown-aware chunking
31
+ # ---------------------------------------------------------------------------
32
+
33
+ _HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)", re.MULTILINE)
34
+ _FENCE_OPEN_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")
35
+
36
+
37
+ def _fence_ranges(text: str) -> list[tuple[int, int]]:
38
+ """Return (start, end) char ranges for every fenced code block in *text*."""
39
+ ranges: list[tuple[int, int]] = []
40
+ pos = 0
41
+ fence_char: str | None = None
42
+ fence_len: int = 0
43
+ fence_start: int = 0
44
+
45
+ for line in text.splitlines(keepends=True):
46
+ m = _FENCE_OPEN_RE.match(line)
47
+ if fence_char is None:
48
+ if m:
49
+ fence_char = m.group(1)[0]
50
+ fence_len = len(m.group(1))
51
+ fence_start = pos
52
+ else:
53
+ if m and m.group(1)[0] == fence_char and len(m.group(1)) >= fence_len:
54
+ ranges.append((fence_start, pos + len(line)))
55
+ fence_char = None
56
+ pos += len(line)
57
+
58
+ if fence_char is not None: # unclosed fence - treat rest of text as fenced
59
+ ranges.append((fence_start, len(text)))
60
+
61
+ return ranges
62
+
63
+
64
def _parse_sections(text: str) -> list[tuple[list[str], str]]:
    """
    Break markdown text into header-delimited sections.

    Returns a list of (header_titles, body_text) pairs, where header_titles is
    the chain of enclosing header titles ending with the section's own header.
    Text preceding the first header is returned with an empty title list (only
    when it is not blank). Header-looking lines that fall inside fenced code
    blocks are not treated as headers.
    """
    code_spans = _fence_ranges(text)

    def _is_fenced(offset: int) -> bool:
        for lo, hi in code_spans:
            if lo <= offset < hi:
                return True
        return False

    headers = [h for h in _HEADER_RE.finditer(text) if not _is_fenced(h.start())]
    if not headers:
        return [([], text)]

    result: list[tuple[list[str], str]] = []

    lead_in = text[: headers[0].start()]
    if lead_in.strip():
        result.append(([], lead_in))

    # Each section body runs up to the next header, or to the end of the text.
    body_ends = [h.start() for h in headers[1:]] + [len(text)]

    trail: list[tuple[int, str]] = []  # (level, title) of the enclosing headers
    for header, body_end in zip(headers, body_ends):
        depth = len(header.group(1))
        # Drop siblings and deeper headers before pushing the current one.
        while trail and trail[-1][0] >= depth:
            trail.pop()
        trail.append((depth, header.group(2).strip()))

        titles = [title for _, title in trail]
        result.append((titles, text[header.end():body_end]))

    return result
104
+
105
+
106
+ def _build_header_prefix(header_stack: list[str]) -> str:
107
+ if not header_stack:
108
+ return ""
109
+ hashes = ["#" * (i + 1) for i in range(len(header_stack))]
110
+ parts = " > ".join(f"{h} {t}" for h, t in zip(hashes, header_stack))
111
+ return f"[{parts}]\n"
112
+
113
+
114
+ def _is_list_heavy(body: str) -> bool:
115
+ lines = [line for line in body.splitlines() if line.strip()]
116
+ if not lines:
117
+ return False
118
+ bullet_lines = sum(
119
+ 1 for line in lines if re.match(r"^\s*[-*+]\s+|^\s*\d+\.\s+", line)
120
+ )
121
+ return bullet_lines / len(lines) >= 0.5
122
+
123
+
124
+ def _split_into_bullet_items(body: str) -> list[str]:
125
+ """Split body text into individual bullet items (preserving sub-bullets and intro prose).
126
+
127
+ Only top-level (unindented) bullets start a new item. Indented sub-bullets
128
+ are continuation lines that stay attached to their parent bullet.
129
+ """
130
+ items: list[str] = []
131
+ current: list[str] = []
132
+ in_bullets = False
133
+
134
+ for line in body.splitlines():
135
+ # Top-level bullet: no leading whitespace before the marker
136
+ if re.match(r"^[-*+]\s+|^\d+\.\s+", line):
137
+ if current and not in_bullets:
138
+ # flush intro prose as its own item
139
+ items.append("\n".join(current).strip())
140
+ current = []
141
+ elif current:
142
+ items.append("\n".join(current).strip())
143
+ current = []
144
+ in_bullets = True
145
+ current = [line]
146
+ elif current:
147
+ current.append(line)
148
+ else:
149
+ current.append(line)
150
+
151
+ if current:
152
+ items.append("\n".join(current).strip())
153
+
154
+ return [item for item in items if item]
155
+
156
+
157
def _chunk_list_section(body: str, prefix: str, chunk_size: int) -> list[str]:
    """Pack the bullet items of a section into prefixed chunks.

    Items are added to the current chunk until the next one would push the
    running length (prefix plus each item plus one separator character) past
    ``chunk_size``; a new chunk then starts with that item. Items are never
    split, so a single item longer than ``chunk_size`` yields an oversized
    chunk. If the body produces no items, its whitespace-collapsed text is
    returned as one chunk (or nothing when it is blank).
    """
    bullets = _split_into_bullet_items(body)
    if not bullets:
        flat = " ".join(body.split())
        if not flat:
            return []
        return [f"{prefix}{flat}"]

    base_len = len(prefix)
    out: list[str] = []
    batch: list[str] = []
    used = base_len

    for bullet in bullets:
        cost = len(bullet) + 1  # the extra 1 accounts for the joining newline
        if batch and used + cost > chunk_size:
            joined = "\n".join(batch)
            out.append(f"{prefix}{joined}")
            batch = []
            used = base_len
        batch.append(bullet)
        used += cost

    if batch:
        joined = "\n".join(batch)
        out.append(f"{prefix}{joined}")

    return out
181
+
182
+
183
+ def _chunk_prose_section(body: str, prefix: str, chunk_size: int, chunk_overlap: int) -> list[str]:
184
+ normalized = " ".join(body.split())
185
+ if not normalized:
186
+ return []
187
+ # Subtract prefix length from the window size so the total chunk fits within chunk_size.
188
+ # Clamp overlap to stay strictly below effective_size to keep _sliding_window advancing.
189
+ effective_size = max(1, chunk_size - len(prefix))
190
+ effective_overlap = min(chunk_overlap, max(0, effective_size - 1))
191
+ windows = _sliding_window(normalized, effective_size, effective_overlap)
192
+ return [f"{prefix}{w}" for w in windows]
193
+
194
+
195
+ def _merge_small_chunks(chunks: list[str], chunk_size: int) -> list[str]:
196
+ """Merge adjacent chunks that are both smaller than chunk_size/2."""
197
+ if not chunks:
198
+ return chunks
199
+ threshold = chunk_size // 2
200
+ merged: list[str] = [chunks[0]]
201
+ for chunk in chunks[1:]:
202
+ prev = merged[-1]
203
+ if len(prev) < threshold and len(chunk) < threshold and len(prev) + len(chunk) + 1 <= chunk_size:
204
+ merged[-1] = prev + "\n" + chunk
205
+ else:
206
+ merged.append(chunk)
207
+ return merged
208
+
209
+
210
def chunk_markdown(text: str, chunk_size: int = 800, chunk_overlap: int = 120) -> list[str]:
    """Chunk markdown text with structure-awareness.

    The text is split into header-delimited sections. List-heavy sections are
    packed bullet by bullet, the others are windowed as prose. Every chunk is
    prefixed with its header breadcrumb, and small adjacent chunks are merged
    at the end.

    Raises:
        ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``
            (only checked for non-blank text).
    """
    if not text.strip():
        return []
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    pieces: list[str] = []
    for titles, body in _parse_sections(text):
        label = _build_header_prefix(titles)
        if _is_list_heavy(body):
            produced = _chunk_list_section(body, label, chunk_size)
        else:
            produced = _chunk_prose_section(body, label, chunk_size, chunk_overlap)
        # Whitespace-only chunks are dropped before merging.
        pieces.extend(p for p in produced if p.strip())

    return _merge_small_chunks(pieces, chunk_size)
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import yaml
6
+ from pydantic import BaseModel, Field
7
+ from pydantic_settings import BaseSettings, SettingsConfigDict
8
+
9
+
10
class EmbeddingConfig(BaseSettings):
    """Embedding backend settings.

    Declared as pydantic-settings ``BaseSettings`` with the ``EMBEDDING_``
    environment prefix; extra inputs are ignored.
    """

    # Name of the embedding backend.
    provider: str = "fastembed"
    # Model identifier handed to that backend.
    model: str = "BAAI/bge-small-en-v1.5"
    model_config = SettingsConfigDict(env_prefix="EMBEDDING_", extra="ignore")
14
+
15
+
16
class VectorStoreConfig(BaseSettings):
    """Vector store connection and retrieval settings.

    Declared as pydantic-settings ``BaseSettings`` with the ``VECTOR_STORE_``
    environment prefix; extra inputs are ignored.
    """

    provider: str = "qdrant"
    # Remote endpoint; an empty value means the local store is used (see use_local).
    url: str = ""
    collection_name: str = "knowledge_base"
    # Path of the local store; from_yaml resolves a relative value against the
    # YAML file's directory.
    local_path: str = ".docs-kit/qdrant"
    # NOTE(review): the four values below look like retrieval tuning knobs
    # (result count, minimum score, per-branch prefetch sizes) - confirm
    # against the vector store code, which is not shown here.
    retrieval_limit: int = 5
    score_threshold: float = 0.35
    dense_prefetch_limit: int = 20
    sparse_prefetch_limit: int = 20
    model_config = SettingsConfigDict(env_prefix="VECTOR_STORE_", extra="ignore")

    @property
    def use_local(self) -> bool:
        """Return True when no ``url`` is configured."""
        return not bool(self.url)
30
+
31
+
32
class IngestionConfig(BaseSettings):
    """Ingestion settings.

    Declared as pydantic-settings ``BaseSettings`` with the ``INGESTION_``
    environment prefix; extra inputs are ignored.
    """

    # Same defaults as the chunk_size / chunk_overlap parameters of the chunkers.
    chunk_size: int = 800
    chunk_overlap: int = 120
    # NOTE(review): presumably the sparse (BM25) model identifier - confirm.
    bm25_model: str = "Qdrant/bm25"
    model_config = SettingsConfigDict(env_prefix="INGESTION_", extra="ignore")
37
+
38
+
39
class McpConfig(BaseSettings):
    """MCP server settings.

    Declared as pydantic-settings ``BaseSettings`` with the ``MCP_``
    environment prefix; extra inputs are ignored.
    """

    # NOTE(review): expected values are presumably "stdio" or "sse", matching
    # the run_stdio / run_sse entry points - confirm where this is consumed.
    transport: str = "stdio"
    host: str = "localhost"
    port: int = 3001
    model_config = SettingsConfigDict(env_prefix="MCP_", extra="ignore")
44
+
45
+
46
class DocsKitConfig(BaseModel):
    """Top-level docs-kit configuration grouping every settings section."""

    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = Field(default_factory=VectorStoreConfig)
    ingestion: IngestionConfig = Field(default_factory=IngestionConfig)
    mcp: McpConfig = Field(default_factory=McpConfig)

    @classmethod
    def from_yaml(cls, path: Path | str) -> DocsKitConfig:
        """Load the configuration from a YAML file.

        Args:
            path: Location of the YAML file. An empty file yields the defaults.

        Returns:
            The parsed configuration, with a relative
            ``vector_store.local_path`` resolved against the YAML file's
            directory.

        Raises:
            TypeError: If the top-level YAML value is not a mapping.
        """
        path = Path(path)
        # Decode as UTF-8 explicitly: the platform default encoding would make
        # the same file load differently (or fail) from one machine to another.
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        if not isinstance(data, dict):
            raise TypeError(
                f"{path}: top-level YAML value must be a mapping, got {type(data).__name__}"
            )
        config = cls(**data)
        # Resolve a relative local_path against the YAML file's directory so the
        # vector store is always found regardless of the process's working directory.
        local_path = Path(config.vector_store.local_path)
        if not local_path.is_absolute():
            config.vector_store.local_path = str((path.parent / local_path).resolve())
        return config

    @classmethod
    def from_env(cls) -> DocsKitConfig:
        """Build the configuration from defaults and the environment only."""
        return cls()
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ _TAG_RE = re.compile(r"<[^>]+>")
6
+ _MULTI_NEWLINE_RE = re.compile(r"\n{3,}")
7
+ _HTML_ENTITIES = {
8
+ "&amp;": "&", "&lt;": "<", "&gt;": ">",
9
+ "&quot;": '"', "&#39;": "'", "&nbsp;": " ",
10
+ }
11
+
12
+
13
+ def _decode_entities(text: str) -> str:
14
+ for entity, char in _HTML_ENTITIES.items():
15
+ text = text.replace(entity, char)
16
+ return text
17
+
18
+
19
def _cell_text(cell_html: str) -> str:
    """Return the plain text of a table cell: tags removed, entities decoded, trimmed."""
    without_tags = _TAG_RE.sub("", cell_html)
    decoded = _decode_entities(without_tags)
    return decoded.strip()
21
+
22
+
23
+ def _table_to_text(table_html: str) -> str:
24
+ """Convert an HTML table to pipe-separated plain text rows."""
25
+ rows = re.findall(r"<tr[^>]*>(.*?)</tr>", table_html, re.DOTALL | re.IGNORECASE)
26
+ lines = []
27
+ for row in rows:
28
+ cells = re.findall(r"<t[hd][^>]*>(.*?)</t[hd]>", row, re.DOTALL | re.IGNORECASE)
29
+ texts = [_cell_text(c) for c in cells]
30
+ if any(texts):
31
+ lines.append(" | ".join(texts))
32
+ return "\n".join(lines)
33
+
34
+
35
def clean_html(content: str) -> str:
    """Turn inline HTML in document content into plain text.

    Steps, in order:

    - every ``<table>`` block is rewritten as pipe-separated rows
    - all remaining tags are removed
    - common HTML entities are decoded
    - runs of three or more newlines are reduced to two
    - the result is stripped

    Content that contains no "<" at all is returned exactly as given.
    """
    if "<" not in content:
        return content

    def _render_table(found):
        # Keep a newline after the table so following text starts on its own line.
        return _table_to_text(found.group(0)) + "\n"

    table_pattern = re.compile(r"<table[^>]*>.*?</table>", re.DOTALL | re.IGNORECASE)
    text = table_pattern.sub(_render_table, content)
    text = _decode_entities(_TAG_RE.sub("", text))
    return _MULTI_NEWLINE_RE.sub("\n\n", text).strip()
66
+
67
+
68
def extract_main_content(html: str) -> str:
    """Extract the main content section from an HTML page, then clean it.

    The first ``<article>`` element is preferred, then the first ``<main>``
    element, so that navigation, headers and footers are excluded. When
    neither is present the whole page is cleaned.

    Args:
        html: Full HTML source of the page.

    Returns:
        The cleaned plain text of the selected section.
    """
    for tag in ("article", "main"):
        # The tag name must be followed by whitespace or ">" so that longer
        # names which merely start with it (e.g. <article-card>) are not
        # mistaken for the container element.
        match = re.search(
            rf"<{tag}(?:\s[^>]*)?>(.*?)</{tag}>", html, re.DOTALL | re.IGNORECASE
        )
        if match:
            return clean_html(match.group(1))
    return clean_html(html)
@@ -0,0 +1,28 @@
1
+ from __future__ import annotations
2
+ from typing import Any
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class Document(BaseModel):
    """A loaded document before chunking."""

    # Where the document came from (presumably a URL or file path - confirm
    # against the loaders).
    source: str
    # Full text of the document.
    content: str
    # Free-form extra data; defaults to a fresh empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
11
+
12
+
13
class Chunk(BaseModel):
    """A chunk of text after splitting a document."""

    # Text of this chunk.
    text: str
    # Source of the document the chunk was cut from.
    source: str
    # Position of the chunk (presumably its index within the source document -
    # confirm against the ingestion code).
    chunk_index: int
    # Free-form extra data; defaults to a fresh empty dict per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
19
+
20
+
21
class RetrievedChunk(BaseModel):
    """A chunk returned from vector store retrieval with a relevance score."""

    # Source of the document the chunk belongs to.
    source: str
    # Position of the chunk (presumably within its source document - confirm).
    chunk_index: int
    # Text of the chunk.
    text: str
    # Relevance score reported by the retrieval step.
    score: float
27
+
28
+
File without changes
docs_kit/mcp/server.py ADDED
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from docs_kit.core.config import DocsKitConfig
6
+ from docs_kit.agent import DocsKitAgent
7
+
8
+
9
def _create_server(agent: DocsKitAgent, host: str = "127.0.0.1", port: int = 3001):
    """Create a FastMCP server wired to the given agent.

    Every tool registered below is a thin wrapper that delegates to a method
    of ``agent`` and returns a string: JSON text for most tools, plain text for
    ``get_full_document``.

    Args:
        agent: Agent that performs the search, ingestion and removal work.
        host: Host passed to the ``FastMCP`` constructor.
        port: Port passed to the ``FastMCP`` constructor.

    Returns:
        The configured ``FastMCP`` instance; starting it is left to the caller.
    """
    # Imported here rather than at module level, so importing this module does
    # not by itself import the MCP SDK.
    from mcp.server.fastmcp import FastMCP

    mcp_server = FastMCP("docs-kit", host=host, port=port)

    @mcp_server.tool()
    def search_docs(query: str, limit: int = 5) -> str:
        """Search the knowledge base using hybrid retrieval. Returns relevant document chunks with source attribution."""
        chunks = agent.query(query, limit=limit)
        # Flatten each result to plain JSON-serializable fields; scores are
        # rounded to 4 decimals for readability.
        results = [
            {
                "source": c.source,
                "chunk_index": c.chunk_index,
                "score": round(c.score, 4),
                "text": c.text,
            }
            for c in chunks
        ]
        return json.dumps(results, indent=2)

    @mcp_server.tool()
    def list_sources() -> str:
        """List all ingested document sources in the knowledge base."""
        sources = agent.list_sources()
        return json.dumps(sources, indent=2)

    @mcp_server.tool()
    def get_collection_info() -> str:
        """Get statistics about the vector store collection."""
        info = agent.get_collection_info()
        return json.dumps(info, indent=2)

    @mcp_server.tool()
    def get_full_document(source: str) -> str:
        """Retrieve the full text of a specific document by its source URL or path."""
        document = agent.get_document(source)
        # A falsy result (nothing stored for this source) is reported as a
        # plain-text message rather than an error.
        if not document:
            return f"No document found with source: {source}"
        return document

    @mcp_server.tool()
    def ingest_urls(urls: str, provider: str = "auto") -> str:
        """Ingest one or more URLs (comma-separated) into the knowledge base.

        Args:
        urls: Comma-separated list of documentation site URLs.
        provider: Documentation platform — "auto" (default), "gitbook", or "mintlify".
        "auto" tries llms-full.txt → llms.txt → sitemap.xml automatically.
        """
        # Split on commas and drop empty entries (e.g. from a trailing comma).
        url_list = [u.strip() for u in urls.split(",") if u.strip()]
        if not url_list:
            return json.dumps({"error": "No URLs provided"})
        # "auto" is passed to the agent as None.
        resolved_provider = provider if provider != "auto" else None
        results = []
        for url in url_list:
            # Best effort per URL: a failure is recorded in the result list and
            # the remaining URLs are still processed.
            try:
                count = agent.ingest_url(url, provider=resolved_provider)
                results.append({"url": url, "status": "ok", "chunks_ingested": count})
            except Exception as exc:
                results.append({"url": url, "status": "error", "error": str(exc)})
        return json.dumps(results, indent=2)

    @mcp_server.tool()
    def remove_source(source: str) -> str:
        """Remove a previously ingested source (URL or file path) and all its chunks from the knowledge base."""
        deleted = agent.remove_source(source)
        # A truthy return value from the agent is reported as "ok", anything
        # else as "not_found".
        if deleted:
            return json.dumps({"status": "ok", "message": f"Removed source: {source}"})
        return json.dumps({"status": "not_found", "message": f"No data found for source: {source}"})

    @mcp_server.tool()
    def list_ingested_sources() -> str:
        """List all ingested document sources with their ingestion dates."""
        entries = agent.list_sources_with_dates()
        return json.dumps(entries, indent=2)

    return mcp_server
87
+
88
+
89
def run_stdio(config: DocsKitConfig) -> None:
    """Build the agent and MCP server from *config* and serve over stdio."""
    mcp_settings = config.mcp
    server = _create_server(
        DocsKitAgent(config=config),
        host=mcp_settings.host,
        port=mcp_settings.port,
    )
    server.run(transport="stdio")
94
+
95
+
96
def run_sse(config: DocsKitConfig) -> None:
    """Build the agent and MCP server from *config* and serve over SSE (HTTP)."""
    mcp_settings = config.mcp
    server = _create_server(
        DocsKitAgent(config=config),
        host=mcp_settings.host,
        port=mcp_settings.port,
    )
    server.run(transport="sse")
docs_kit/mcp/tools.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ MCP Tools exposed by docs-kit:
3
+
4
+ - search_docs(query, limit=5): Hybrid RAG search. Returns JSON array of {source, chunk_index, score, text}.
5
+ - list_sources(): Returns JSON array of all ingested source strings.
6
+ - get_collection_info(): Returns JSON object with collection stats.
7
+ - get_full_document(source): Returns full reconstructed text of a document.
8
+
9
+ Tools are registered in server.py via the mcp SDK.
10
+ """