PyPI - docs-kit - Versions diffs - 0.1.1__py3-none-any.whl - Mend

docs-kit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

docs_kit/__init__.py +32 -0
docs_kit/__main__.py +4 -0
docs_kit/_version.py +1 -0
docs_kit/agent.py +190 -0
docs_kit/cli/__init__.py +0 -0
docs_kit/cli/__main__.py +34 -0
docs_kit/cli/commands.py +542 -0
docs_kit/cli/help.py +140 -0
docs_kit/connectors/__init__.py +0 -0
docs_kit/connectors/embeddings/__init__.py +3 -0
docs_kit/connectors/embeddings/base.py +9 -0
docs_kit/connectors/embeddings/fastembed.py +30 -0
docs_kit/connectors/fetchers/__init__.py +0 -0
docs_kit/connectors/fetchers/base.py +8 -0
docs_kit/connectors/fetchers/gitbook.py +7 -0
docs_kit/connectors/fetchers/llms_txt.py +85 -0
docs_kit/connectors/fetchers/mintlify.py +94 -0
docs_kit/connectors/parsers/__init__.py +4 -0
docs_kit/connectors/parsers/base.py +8 -0
docs_kit/connectors/parsers/markdown.py +8 -0
docs_kit/connectors/parsers/text.py +8 -0
docs_kit/connectors/vector_stores/__init__.py +3 -0
docs_kit/connectors/vector_stores/base.py +15 -0
docs_kit/connectors/vector_stores/qdrant.py +279 -0
docs_kit/core/__init__.py +0 -0
docs_kit/core/chunking.py +227 -0
docs_kit/core/config.py +67 -0
docs_kit/core/html_utils.py +78 -0
docs_kit/core/models.py +28 -0
docs_kit/mcp/__init__.py +0 -0
docs_kit/mcp/server.py +100 -0
docs_kit/mcp/tools.py +10 -0
docs_kit-0.1.1.dist-info/METADATA +268 -0
docs_kit-0.1.1.dist-info/RECORD +37 -0
docs_kit-0.1.1.dist-info/WHEEL +4 -0
docs_kit-0.1.1.dist-info/entry_points.txt +2 -0
docs_kit-0.1.1.dist-info/licenses/LICENSE +21 -0

docs_kit/core/chunking.py ADDED Viewed

@@ -0,0 +1,227 @@
+from __future__ import annotations
+import re
+def _sliding_window(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
+    """Split normalized text into overlapping windows."""
+    chunks: list[str] = []
+    start = 0
+    length = len(text)
+    while start < length:
+        end = min(length, start + chunk_size)
+        chunks.append(text[start:end].strip())
+        if end >= length:
+            break
+        start = max(0, end - chunk_overlap)
+    return [c for c in chunks if c]
def chunk_text(text: str, chunk_size: int = 800, chunk_overlap: int = 120) -> list[str]:
    """Split plain text into overlapping character windows.

    All runs of whitespace are collapsed to single spaces before splitting.

    Args:
        text: Raw text to chunk.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The chunks in order, or an empty list when ``text`` contains nothing
        but whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size``.
    """
    normalized = " ".join(text.split())
    if not normalized:
        return []
    # A non-positive size combined with an even smaller overlap used to pass
    # the check below and silently produce no chunks at all.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")
    return _sliding_window(normalized, chunk_size, chunk_overlap)
+# ---------------------------------------------------------------------------
+# Markdown-aware chunking
+# ---------------------------------------------------------------------------
+_HEADER_RE = re.compile(r"^(#{1,6})\s+(.*)", re.MULTILINE)
+_FENCE_OPEN_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")
def _fence_ranges(text: str) -> list[tuple[int, int]]:
    """Return (start, end) char ranges for every fenced code block in *text*.

    Each range starts at the first character of the opening fence line and
    ends just past the closing fence line. An unclosed fence extends to the
    end of *text*.

    Fence recognition follows the CommonMark rules that matter here:
    a backtick opener may not have another backtick later on the same line
    (that is inline code such as ```x```, not a fence), and a closing fence
    uses the same character, is at least as long as the opener, and is
    followed by nothing but whitespace.
    """
    ranges: list[tuple[int, int]] = []
    offset = 0
    open_char: str | None = None
    open_len = 0
    open_start = 0
    for line in text.splitlines(keepends=True):
        m = _FENCE_OPEN_RE.match(line)
        if m:
            marker = m.group(1)
            rest = line[m.end():]
            if open_char is None:
                # Backtick fences cannot carry backticks in their info string.
                if not (marker[0] == "`" and "`" in rest):
                    open_char = marker[0]
                    open_len = len(marker)
                    open_start = offset
            elif (
                marker[0] == open_char
                and len(marker) >= open_len
                and not rest.strip()  # a closer has no info string
            ):
                ranges.append((open_start, offset + len(line)))
                open_char = None
        offset += len(line)
    if open_char is not None:  # unclosed fence - treat rest of text as fenced
        ranges.append((open_start, len(text)))
    return ranges
def _parse_sections(text: str) -> list[tuple[list[str], str]]:
    """
    Break markdown text into header-delimited sections.
    Each result is a (titles, body) pair where ``titles`` lists the enclosing
    header titles from outermost to innermost and ``body`` is the raw text
    between this header line and the next header.
    Text that precedes the first header is returned first with an empty
    ``titles`` list (only when it is not blank). Header-looking lines that
    fall inside fenced code blocks are not treated as headers.
    """
    code_spans = _fence_ranges(text)

    def _is_fenced(offset: int) -> bool:
        for lo, hi in code_spans:
            if lo <= offset < hi:
                return True
        return False

    headers = [h for h in _HEADER_RE.finditer(text) if not _is_fenced(h.start())]
    if not headers:
        return [([], text)]

    result: list[tuple[list[str], str]] = []
    lead = text[: headers[0].start()]
    if lead.strip():
        result.append(([], lead))

    # A section body runs up to the start of the next header, or to the end
    # of the text for the last header.
    body_ends = [h.start() for h in headers[1:]] + [len(text)]
    stack: list[tuple[int, str]] = []  # (level, title) of the open ancestors
    for header, body_end in zip(headers, body_ends):
        depth = len(header.group(1))
        # Close every open header that is at the same depth or deeper.
        while stack and stack[-1][0] >= depth:
            stack.pop()
        stack.append((depth, header.group(2).strip()))
        titles = [title for _, title in stack]
        result.append((titles, text[header.end():body_end]))
    return result
+def _build_header_prefix(header_stack: list[str]) -> str:
+    if not header_stack:
+        return ""
+    hashes = ["#" * (i + 1) for i in range(len(header_stack))]
+    parts = " > ".join(f"{h} {t}" for h, t in zip(hashes, header_stack))
+    return f"[{parts}]\n"
+def _is_list_heavy(body: str) -> bool:
+    lines = [line for line in body.splitlines() if line.strip()]
+    if not lines:
+        return False
+    bullet_lines = sum(
+        1 for line in lines if re.match(r"^\s*[-*+]\s+|^\s*\d+\.\s+", line)
+    )
+    return bullet_lines / len(lines) >= 0.5
+def _split_into_bullet_items(body: str) -> list[str]:
+    """Split body text into individual bullet items (preserving sub-bullets and intro prose).
+    Only top-level (unindented) bullets start a new item. Indented sub-bullets
+    are continuation lines that stay attached to their parent bullet.
+    """
+    items: list[str] = []
+    current: list[str] = []
+    in_bullets = False
+    for line in body.splitlines():
+        # Top-level bullet: no leading whitespace before the marker
+        if re.match(r"^[-*+]\s+|^\d+\.\s+", line):
+            if current and not in_bullets:
+                # flush intro prose as its own item
+                items.append("\n".join(current).strip())
+                current = []
+            elif current:
+                items.append("\n".join(current).strip())
+                current = []
+            in_bullets = True
+            current = [line]
+        elif current:
+            current.append(line)
+        else:
+            current.append(line)
+    if current:
+        items.append("\n".join(current).strip())
+    return [item for item in items if item]
def _chunk_list_section(body: str, prefix: str, chunk_size: int) -> list[str]:
    """Pack the list items of a section into prefixed chunks.

    Items are grouped greedily: a group is closed as soon as adding the next
    item (plus one character for its newline) would push the running length,
    which starts at ``len(prefix)``, past ``chunk_size``. Items are never
    split, so a single item longer than ``chunk_size`` yields an oversized
    chunk. Every chunk is ``prefix`` followed by its newline-joined items.
    """
    items = _split_into_bullet_items(body)
    if not items:
        flat = " ".join(body.split())
        return [f"{prefix}{flat}"] if flat else []

    base_len = len(prefix)
    out: list[str] = []
    pending: list[str] = []
    used = base_len
    for item in items:
        cost = len(item) + 1  # the item plus its joining newline
        if pending and used + cost > chunk_size:
            out.append(prefix + "\n".join(pending))
            pending, used = [], base_len
        pending.append(item)
        used += cost
    if pending:
        out.append(prefix + "\n".join(pending))
    return out
+def _chunk_prose_section(body: str, prefix: str, chunk_size: int, chunk_overlap: int) -> list[str]:
+    normalized = " ".join(body.split())
+    if not normalized:
+        return []
+    # Subtract prefix length from the window size so the total chunk fits within chunk_size.
+    # Clamp overlap to stay strictly below effective_size to keep _sliding_window advancing.
+    effective_size = max(1, chunk_size - len(prefix))
+    effective_overlap = min(chunk_overlap, max(0, effective_size - 1))
+    windows = _sliding_window(normalized, effective_size, effective_overlap)
+    return [f"{prefix}{w}" for w in windows]
+def _merge_small_chunks(chunks: list[str], chunk_size: int) -> list[str]:
+    """Merge adjacent chunks that are both smaller than chunk_size/2."""
+    if not chunks:
+        return chunks
+    threshold = chunk_size // 2
+    merged: list[str] = [chunks[0]]
+    for chunk in chunks[1:]:
+        prev = merged[-1]
+        if len(prev) < threshold and len(chunk) < threshold and len(prev) + len(chunk) + 1 <= chunk_size:
+            merged[-1] = prev + "\n" + chunk
+        else:
+            merged.append(chunk)
+    return merged
def chunk_markdown(text: str, chunk_size: int = 800, chunk_overlap: int = 120) -> list[str]:
    """Chunk markdown text section by section.

    The text is split at its headers. Each section is chunked as a list when
    at least half of its lines are list items, otherwise as prose, and every
    chunk carries a breadcrumb of its enclosing headers. Adjacent small
    chunks are merged at the end.

    Returns an empty list for blank input. Raises ValueError when
    ``chunk_overlap`` is not smaller than ``chunk_size``.
    """
    if not text.strip():
        return []
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    pieces: list[str] = []
    for titles, body in _parse_sections(text):
        prefix = _build_header_prefix(titles)
        if _is_list_heavy(body):
            section_chunks = _chunk_list_section(body, prefix, chunk_size)
        else:
            section_chunks = _chunk_prose_section(body, prefix, chunk_size, chunk_overlap)
        pieces += section_chunks
    kept = [piece for piece in pieces if piece.strip()]
    return _merge_small_chunks(kept, chunk_size)

docs_kit/core/config.py ADDED Viewed

@@ -0,0 +1,67 @@
+from __future__ import annotations
+from pathlib import Path
+import yaml
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
class EmbeddingConfig(BaseSettings):
    # Embedding backend settings. The settings config below uses the
    # EMBEDDING_ environment prefix and ignores unknown keys.
    model_config = SettingsConfigDict(env_prefix="EMBEDDING_", extra="ignore")

    provider: str = "fastembed"  # name of the embedding backend
    model: str = "BAAI/bge-small-en-v1.5"  # model identifier handed to the backend
class VectorStoreConfig(BaseSettings):
    # Vector store settings. The settings config below uses the VECTOR_STORE_
    # environment prefix and ignores unknown keys.
    model_config = SettingsConfigDict(env_prefix="VECTOR_STORE_", extra="ignore")

    provider: str = "qdrant"
    url: str = ""  # empty means "no remote server" (see use_local)
    collection_name: str = "knowledge_base"
    local_path: str = ".docs-kit/qdrant"
    retrieval_limit: int = 5
    score_threshold: float = 0.35
    dense_prefetch_limit: int = 20
    sparse_prefetch_limit: int = 20

    @property
    def use_local(self) -> bool:
        """Whether no ``url`` is configured, i.e. ``url`` is empty."""
        return not self.url
class IngestionConfig(BaseSettings):
    # Ingestion settings. The settings config below uses the INGESTION_
    # environment prefix and ignores unknown keys.
    model_config = SettingsConfigDict(env_prefix="INGESTION_", extra="ignore")

    chunk_size: int = 800  # same default as chunk_text / chunk_markdown
    chunk_overlap: int = 120  # same default as chunk_text / chunk_markdown
    bm25_model: str = "Qdrant/bm25"
class McpConfig(BaseSettings):
    # MCP server settings. The settings config below uses the MCP_
    # environment prefix and ignores unknown keys.
    model_config = SettingsConfigDict(env_prefix="MCP_", extra="ignore")

    transport: str = "stdio"
    host: str = "localhost"
    port: int = 3001
class DocsKitConfig(BaseModel):
    # Top-level configuration: one settings object per component. Each default
    # factory builds the component's settings class with no arguments.
    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)
    vector_store: VectorStoreConfig = Field(default_factory=VectorStoreConfig)
    ingestion: IngestionConfig = Field(default_factory=IngestionConfig)
    mcp: McpConfig = Field(default_factory=McpConfig)

    @classmethod
    def from_yaml(cls, path: Path | str) -> DocsKitConfig:
        """Load the configuration from a YAML file.

        Args:
            path: Location of the YAML file. An empty file yields the defaults.

        Returns:
            The parsed configuration. A relative ``vector_store.local_path``
            is rewritten to an absolute path based on the YAML file's
            directory.

        Raises:
            TypeError: If the top level of the YAML document is not a mapping.
        """
        path = Path(path)
        # Read as UTF-8 rather than the platform's locale encoding so the same
        # file parses identically on every system.
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        if not isinstance(data, dict):
            raise TypeError(
                f"Expected a mapping at the top level of {path}, "
                f"got {type(data).__name__}"
            )
        config = cls(**data)
        # Resolve a relative local_path against the YAML file's directory so the
        # vector store is always found regardless of the process's working directory.
        local_path = Path(config.vector_store.local_path)
        if not local_path.is_absolute():
            config.vector_store.local_path = str((path.parent / local_path).resolve())
        return config

    @classmethod
    def from_env(cls) -> DocsKitConfig:
        """Build the configuration from each component's defaults.

        No values are passed in, so every component settings class is
        constructed by its default factory.
        """
        return cls()

docs_kit/core/html_utils.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import re
+_TAG_RE = re.compile(r"<[^>]+>")
+_MULTI_NEWLINE_RE = re.compile(r"\n{3,}")
+_HTML_ENTITIES = {
+    "&amp;": "&", "&lt;": "<", "&gt;": ">",
+    "&quot;": '"', "&#39;": "'", "&nbsp;": " ",
+}
+def _decode_entities(text: str) -> str:
+    for entity, char in _HTML_ENTITIES.items():
+        text = text.replace(entity, char)
+    return text
def _cell_text(cell_html: str) -> str:
    """Return the plain text of one table cell: tags removed, entities decoded, stripped."""
    without_tags = _TAG_RE.sub("", cell_html)
    decoded = _decode_entities(without_tags)
    return decoded.strip()
+def _table_to_text(table_html: str) -> str:
+    """Convert an HTML table to pipe-separated plain text rows."""
+    rows = re.findall(r"<tr[^>]*>(.*?)</tr>", table_html, re.DOTALL | re.IGNORECASE)
+    lines = []
+    for row in rows:
+        cells = re.findall(r"<t[hd][^>]*>(.*?)</t[hd]>", row, re.DOTALL | re.IGNORECASE)
+        texts = [_cell_text(c) for c in cells]
+        if any(texts):
+            lines.append(" | ".join(texts))
+    return "\n".join(lines)
def clean_html(content: str) -> str:
    """Turn inline HTML inside document content into plain text.

    Steps, in order:
    - each <table>...</table> block becomes pipe-separated rows
    - all remaining tags are removed
    - the known HTML entities are decoded
    - runs of three or more newlines shrink to one blank line
    - the result is stripped

    Content without any "<" character is returned untouched (entities
    included), so plain markdown passes through unchanged.
    """
    if "<" not in content:
        return content

    def _render_table(match: re.Match[str]) -> str:
        return _table_to_text(match.group(0)) + "\n"

    # Tables go first: once tags are stripped their row/cell structure is gone.
    table_re = re.compile(r"<table[^>]*>.*?</table>", re.DOTALL | re.IGNORECASE)
    text = table_re.sub(_render_table, content)
    text = _TAG_RE.sub("", text)
    text = _decode_entities(text)
    text = _MULTI_NEWLINE_RE.sub("\n\n", text)
    return text.strip()
def extract_main_content(html: str) -> str:
    """Clean the main content region of an HTML page.

    The first <article> element is used when there is one, otherwise the
    first <main> element, otherwise the whole page. Narrowing the input
    before tags are stripped keeps text outside that element out of the
    result.
    """
    flags = re.DOTALL | re.IGNORECASE
    candidates = (
        re.search(rf"<{tag}[^>]*>(.*?)</{tag}>", html, flags)
        for tag in ("article", "main")
    )
    # The generator is lazy, so <main> is only searched when <article> is absent.
    found = next((m for m in candidates if m), None)
    return clean_html(found.group(1) if found else html)

docs_kit/core/models.py ADDED Viewed

@@ -0,0 +1,28 @@
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field
class Document(BaseModel):
    """A loaded document before chunking."""
    # Identifier of where the document came from
    # (presumably a URL or file path; confirm against the fetchers).
    source: str
    # Full text of the document.
    content: str
    # Free-form extra data; a fresh dict is created per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
class Chunk(BaseModel):
    """A chunk of text after splitting a document."""
    # Text of this chunk.
    text: str
    # Source identifier of the document the chunk belongs to.
    source: str
    # Index of the chunk (presumably its position within the document; verify at the call site).
    chunk_index: int
    # Free-form extra data; a fresh dict is created per instance.
    metadata: dict[str, Any] = Field(default_factory=dict)
class RetrievedChunk(BaseModel):
    """A chunk returned from vector store retrieval with a relevance score."""
    # Source identifier of the document the chunk belongs to.
    source: str
    # Index of the chunk (presumably its position within the document; verify at the call site).
    chunk_index: int
    # Text of the chunk.
    text: str
    # Relevance score reported by the retrieval step.
    score: float

docs_kit/mcp/__init__.py ADDED Viewed

File without changes

docs_kit/mcp/server.py ADDED Viewed

@@ -0,0 +1,100 @@
+from __future__ import annotations
+import json
+from docs_kit.core.config import DocsKitConfig
+from docs_kit.agent import DocsKitAgent
def _create_server(agent: DocsKitAgent, host: str = "127.0.0.1", port: int = 3001):
    """Build a FastMCP server named "docs-kit" whose tools delegate to *agent*.

    Args:
        agent: Object that performs the actual querying, listing, ingestion
            and removal; every tool is a thin wrapper around one of its methods.
        host: Host passed to the FastMCP constructor.
        port: Port passed to the FastMCP constructor.

    Returns:
        The FastMCP instance with all tools registered. It is not started here.
    """
    # Imported here rather than at module import time.
    from mcp.server.fastmcp import FastMCP

    mcp_server = FastMCP("docs-kit", host=host, port=port)

    @mcp_server.tool()
    def search_docs(query: str, limit: int = 5) -> str:
        """Search the knowledge base using hybrid retrieval. Returns relevant document chunks with source attribution."""
        payload = []
        for hit in agent.query(query, limit=limit):
            payload.append(
                {
                    "source": hit.source,
                    "chunk_index": hit.chunk_index,
                    "score": round(hit.score, 4),
                    "text": hit.text,
                }
            )
        return json.dumps(payload, indent=2)

    @mcp_server.tool()
    def list_sources() -> str:
        """List all ingested document sources in the knowledge base."""
        return json.dumps(agent.list_sources(), indent=2)

    @mcp_server.tool()
    def get_collection_info() -> str:
        """Get statistics about the vector store collection."""
        return json.dumps(agent.get_collection_info(), indent=2)

    @mcp_server.tool()
    def get_full_document(source: str) -> str:
        """Retrieve the full text of a specific document by its source URL or path."""
        document = agent.get_document(source)
        return document if document else f"No document found with source: {source}"

    @mcp_server.tool()
    def ingest_urls(urls: str, provider: str = "auto") -> str:
        """Ingest one or more URLs (comma-separated) into the knowledge base.
        Args:
            urls: Comma-separated list of documentation site URLs.
            provider: Documentation platform — "auto" (default), "gitbook", or "mintlify".
                      "auto" tries llms-full.txt → llms.txt → sitemap.xml automatically.
        """
        url_list = [part for part in (raw.strip() for raw in urls.split(",")) if part]
        if not url_list:
            return json.dumps({"error": "No URLs provided"})
        # "auto" is passed to the agent as "no explicit provider".
        resolved_provider = None if provider == "auto" else provider
        results = []
        for url in url_list:
            # One failing URL is reported in its own entry and does not stop the rest.
            try:
                count = agent.ingest_url(url, provider=resolved_provider)
            except Exception as exc:
                results.append({"url": url, "status": "error", "error": str(exc)})
            else:
                results.append({"url": url, "status": "ok", "chunks_ingested": count})
        return json.dumps(results, indent=2)

    @mcp_server.tool()
    def remove_source(source: str) -> str:
        """Remove a previously ingested source (URL or file path) and all its chunks from the knowledge base."""
        if agent.remove_source(source):
            status, message = "ok", f"Removed source: {source}"
        else:
            status, message = "not_found", f"No data found for source: {source}"
        return json.dumps({"status": status, "message": message})

    @mcp_server.tool()
    def list_ingested_sources() -> str:
        """List all ingested document sources with their ingestion dates."""
        return json.dumps(agent.list_sources_with_dates(), indent=2)

    return mcp_server
def run_stdio(config: DocsKitConfig) -> None:
    """Create an agent from *config* and run the MCP server with the "stdio" transport."""
    agent = DocsKitAgent(config=config)
    mcp_settings = config.mcp
    _create_server(agent, host=mcp_settings.host, port=mcp_settings.port).run(transport="stdio")
def run_sse(config: DocsKitConfig) -> None:
    """Create an agent from *config* and run the MCP server with the "sse" (HTTP) transport."""
    agent = DocsKitAgent(config=config)
    mcp_settings = config.mcp
    _create_server(agent, host=mcp_settings.host, port=mcp_settings.port).run(transport="sse")

docs_kit/mcp/tools.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""
+MCP Tools exposed by docs-kit:
+- search_docs(query, limit=5): Hybrid RAG search. Returns JSON array of {source, chunk_index, score, text}.
+- list_sources(): Returns JSON array of all ingested source strings.
+- get_collection_info(): Returns JSON object with collection stats.
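+- get_full_document(source): Returns full reconstructed text of a document.
+- ingest_urls(urls, provider="auto"): Ingests comma-separated URLs. Returns JSON array of {url, status, chunks_ingested} or {url, status, error}.
+- remove_source(source): Removes a source and all its chunks. Returns JSON object {status, message}.
+- list_ingested_sources(): Returns JSON listing of ingested sources with their ingestion dates.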
+Tools are registered in server.py via the mcp SDK.
+"""