docs-kit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs_kit/__init__.py +32 -0
- docs_kit/__main__.py +4 -0
- docs_kit/_version.py +1 -0
- docs_kit/agent.py +190 -0
- docs_kit/cli/__init__.py +0 -0
- docs_kit/cli/__main__.py +34 -0
- docs_kit/cli/commands.py +542 -0
- docs_kit/cli/help.py +140 -0
- docs_kit/connectors/__init__.py +0 -0
- docs_kit/connectors/embeddings/__init__.py +3 -0
- docs_kit/connectors/embeddings/base.py +9 -0
- docs_kit/connectors/embeddings/fastembed.py +30 -0
- docs_kit/connectors/fetchers/__init__.py +0 -0
- docs_kit/connectors/fetchers/base.py +8 -0
- docs_kit/connectors/fetchers/gitbook.py +7 -0
- docs_kit/connectors/fetchers/llms_txt.py +85 -0
- docs_kit/connectors/fetchers/mintlify.py +94 -0
- docs_kit/connectors/parsers/__init__.py +4 -0
- docs_kit/connectors/parsers/base.py +8 -0
- docs_kit/connectors/parsers/markdown.py +8 -0
- docs_kit/connectors/parsers/text.py +8 -0
- docs_kit/connectors/vector_stores/__init__.py +3 -0
- docs_kit/connectors/vector_stores/base.py +15 -0
- docs_kit/connectors/vector_stores/qdrant.py +279 -0
- docs_kit/core/__init__.py +0 -0
- docs_kit/core/chunking.py +227 -0
- docs_kit/core/config.py +67 -0
- docs_kit/core/html_utils.py +78 -0
- docs_kit/core/models.py +28 -0
- docs_kit/mcp/__init__.py +0 -0
- docs_kit/mcp/server.py +100 -0
- docs_kit/mcp/tools.py +10 -0
- docs_kit-0.1.1.dist-info/METADATA +268 -0
- docs_kit-0.1.1.dist-info/RECORD +37 -0
- docs_kit-0.1.1.dist-info/WHEEL +4 -0
- docs_kit-0.1.1.dist-info/entry_points.txt +2 -0
- docs_kit-0.1.1.dist-info/licenses/LICENSE +21 -0
docs_kit/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""docs-kit: Fetch docs, embed locally, expose via MCP for AI agents."""

from docs_kit._version import __version__

# Public API surface. Only __version__ is imported eagerly; the remaining
# names are resolved lazily by the module-level __getattr__ below so that
# `import docs_kit` stays cheap.
__all__ = [
    "__version__",
    "DocsKitAgent",
    "DocsKitConfig",
    "Chunk",
    "Document",
    "RetrievedChunk",
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def __getattr__(name: str):
    """Lazily import heavy public attributes on first access (PEP 562).

    Deferring these imports keeps `import docs_kit` fast; the agent, config,
    and model classes are only loaded when a caller actually references them.
    """
    if name == "DocsKitAgent":
        from docs_kit.agent import DocsKitAgent

        return DocsKitAgent
    if name == "DocsKitConfig":
        from docs_kit.core.config import DocsKitConfig

        return DocsKitConfig
    if name in {"Chunk", "Document", "RetrievedChunk"}:
        from docs_kit.core import models

        return getattr(models, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
docs_kit/__main__.py
ADDED
docs_kit/_version.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Single source of truth for the package version; re-exported as
# docs_kit.__version__ by the package __init__.
__version__ = "0.1.1"
|
docs_kit/agent.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from docs_kit.core.config import DocsKitConfig
|
|
8
|
+
from docs_kit.core.models import Chunk, Document, RetrievedChunk
|
|
9
|
+
|
|
10
|
+
# Module-level logger; handlers/levels are left to the host application.
logger = logging.getLogger(__name__)

# Registry mapping a lowercase file suffix to the "module.path:ClassName"
# spec of the loader that parses it. Specs are resolved lazily via
# _import_class (see DocsKitAgent._get_loader) so parser modules are only
# imported when a matching file is actually ingested.
_PARSERS = {
    ".txt": "docs_kit.connectors.parsers.text:TextLoader",
    ".md": "docs_kit.connectors.parsers.markdown:MarkdownLoader",
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _import_class(dotted_path: str) -> type:
|
|
19
|
+
module_path, class_name = dotted_path.rsplit(":", 1)
|
|
20
|
+
module = importlib.import_module(module_path)
|
|
21
|
+
return getattr(module, class_name)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DocsKitAgent:
    """High-level API for docs-kit.

    Wires together the embedding providers and the vector store so callers
    can ingest documentation (local files or a docs-site URL) and run hybrid
    dense+sparse retrieval over it.

    Usage:
        agent = DocsKitAgent()
        agent.ingest("./docs/")
        results = agent.query("how do I get started?")
    """

    def __init__(self, config: DocsKitConfig | None = None, _lazy_init: bool = False):
        """Create an agent.

        Args:
            config: Settings to use; a default ``DocsKitConfig()`` is built
                when omitted.
            _lazy_init: Internal hook — when True, skip constructing the
                embedders and vector store (no heavy imports happen).
        """
        if config is None:
            config = DocsKitConfig()
        self.config = config
        # Heavy components; stay None until _init_components runs.
        self._dense_embedder = None
        self._sparse_embedder = None
        self._vector_store = None

        if not _lazy_init:
            self._init_components()

    def _init_components(self) -> None:
        """Instantiate embedders and vector store from the config.

        Raises:
            ValueError: if the configured embedding or vector-store provider
                is not a supported backend.
        """
        if self.config.embedding.provider != "fastembed":
            raise ValueError(
                f"Unsupported embedding provider '{self.config.embedding.provider}'. Supported: fastembed."
            )
        # Imported lazily so constructing with _lazy_init=True stays cheap.
        from docs_kit.connectors.embeddings.fastembed import FastEmbedDenseEmbedding, FastEmbedSparseEmbedding
        self._dense_embedder = FastEmbedDenseEmbedding(model=self.config.embedding.model)
        self._sparse_embedder = FastEmbedSparseEmbedding(model=self.config.ingestion.bm25_model)

        if self.config.vector_store.provider != "qdrant":
            raise ValueError(
                f"Unsupported vector store provider '{self.config.vector_store.provider}'. Supported: qdrant."
            )
        from docs_kit.connectors.vector_stores.qdrant import QdrantStore
        self._vector_store = QdrantStore(
            collection_name=self.config.vector_store.collection_name,
            url=self.config.vector_store.url,
            local_path=self.config.vector_store.local_path,
            dense_prefetch_limit=self.config.vector_store.dense_prefetch_limit,
            sparse_prefetch_limit=self.config.vector_store.sparse_prefetch_limit,
        )

    def _get_loader(self, suffix: str):
        """Return a loader instance for a file suffix (e.g. ``".md"``).

        Raises:
            ValueError: if no parser is registered for the suffix.
        """
        parser_path = _PARSERS.get(suffix)
        if not parser_path:
            raise ValueError(f"No parser for '{suffix}'. Supported: {list(_PARSERS.keys())}")
        cls = _import_class(parser_path)
        return cls()

    def _document_is_markdown(self, doc: Document) -> bool:
        """Heuristic: a document is Markdown if its source path ends in .md,
        its content_type mentions markdown, or its format says "markdown"."""
        content_type = str(doc.metadata.get("content_type", "")).lower()
        doc_format = str(doc.metadata.get("format", "")).lower()
        return doc.source.lower().endswith(".md") or "markdown" in content_type or doc_format == "markdown"

    def _build_chunks(self, doc: Document) -> list[Chunk]:
        """Clean a document's content and split it into Chunk objects.

        Markdown documents get the Markdown-aware chunker; everything else
        falls back to plain text chunking. Each chunk carries a copy of the
        document's metadata.
        """
        from docs_kit.core.chunking import chunk_markdown, chunk_text
        from docs_kit.core.html_utils import clean_html
        chunk_size = self.config.ingestion.chunk_size
        chunk_overlap = self.config.ingestion.chunk_overlap
        content = clean_html(doc.content)
        if self._document_is_markdown(doc):
            text_chunks = chunk_markdown(content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        else:
            text_chunks = chunk_text(content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return [
            Chunk(text=text, source=doc.source, chunk_index=index, metadata=dict(doc.metadata))
            for index, text in enumerate(text_chunks)
        ]

    def ingest_documents(self, documents: list[Document], recreate: bool = False) -> int:
        """Chunk, embed, and upsert documents into the vector store.

        Args:
            documents: Already-fetched/parsed documents.
            recreate: Drop and recreate the collection before the first upsert.

        Returns:
            Total number of chunks written.
        """
        total = 0
        should_recreate = recreate
        for doc in documents:
            chunks = self._build_chunks(doc)
            if not chunks:
                continue
            dense_vecs = self._dense_embedder.embed([chunk.text for chunk in chunks])
            sparse_vecs = self._sparse_embedder.embed([chunk.text for chunk in chunks])
            count = self._vector_store.upsert(chunks, dense_vecs, sparse_vecs, recreate=should_recreate)
            self._vector_store.upsert_document(doc.source, doc.content, recreate=should_recreate)
            # Recreate the collection at most once, on the first real upsert.
            should_recreate = False
            total += count
        return total

    def ingest(self, path: str | Path, recreate: bool = False) -> int:
        """Ingest a file, or every supported file under a directory.

        Args:
            path: A file, or a directory scanned recursively for suffixes
                registered in _PARSERS.
            recreate: Drop and recreate the collection before ingesting.

        Returns:
            Total number of chunks written.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            ValueError: if no supported documents are found.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Path not found: {path}")
        if path.is_file():
            files = [path]
        else:
            supported = set(_PARSERS.keys())
            files = sorted(f for f in path.rglob("*") if f.suffix.lower() in supported)
        if not files:
            raise ValueError("No supported documents found.")
        total = 0
        should_recreate = recreate
        for file_path in files:
            loader = self._get_loader(file_path.suffix.lower())
            doc = loader.load(file_path)
            count = self.ingest_documents([doc], recreate=should_recreate)
            if count > 0:
                # Only stop recreating once something was actually written.
                should_recreate = False
            total += count
            logger.info("Ingested %d chunks from %s", count, file_path)
        return total

    def _get_fetcher(self, provider: str | None, fetcher):
        """Pick a fetcher: an explicit instance wins, then the provider name."""
        if fetcher is not None:
            return fetcher
        if provider == "gitbook":
            from docs_kit.connectors.fetchers.gitbook import GitBookFetcher
            return GitBookFetcher()
        # "mintlify" or "auto" or None — MintlifyFetcher is the superset fetcher:
        # it tries llms-full.txt → llms.txt → sitemap.xml, so it works for both.
        from docs_kit.connectors.fetchers.mintlify import MintlifyFetcher
        return MintlifyFetcher()

    def ingest_url(self, url: str, recreate: bool = False, fetcher=None, provider: str | None = None) -> int:
        """Fetch documents from a URL and ingest them.

        Args:
            url: The base URL of the documentation site.
            recreate: Drop and recreate the collection before ingesting.
            fetcher: Optional custom fetcher instance (overrides provider).
            provider: One of "auto" (default), "gitbook", or "mintlify".

        Returns:
            Total number of chunks written.
        """
        documents = self._get_fetcher(provider, fetcher).fetch(url)
        return self.ingest_documents(documents, recreate=recreate)

    def fetch_documents(self, url: str, fetcher=None, provider: str | None = None) -> list[Document]:
        """Fetch documents from a URL without ingesting."""
        return self._get_fetcher(provider, fetcher).fetch(url)

    def query(self, text: str, limit: int | None = None) -> list[RetrievedChunk]:
        """Run hybrid dense+sparse retrieval for a natural-language query.

        Args:
            text: The query text.
            limit: Max results; defaults to the configured retrieval limit.
        """
        effective_limit = limit if limit is not None else self.config.vector_store.retrieval_limit
        dense_vec = self._dense_embedder.embed([text])[0]
        sparse_vec = self._sparse_embedder.embed([text])[0]
        return self._vector_store.query(
            dense_vector=dense_vec,
            sparse_vector=sparse_vec,
            limit=effective_limit,
            score_threshold=self.config.vector_store.score_threshold,
        )

    def collection_stats(self) -> dict:
        """Get stats about the vector store collection."""
        return self._vector_store.collection_stats()

    def get_collection_info(self) -> dict:
        """Alias of collection_stats(); kept for backward compatibility."""
        return self.collection_stats()

    def list_sources(self) -> list[str]:
        """List all unique document sources in the knowledge base."""
        return self._vector_store.list_sources()

    def get_document(self, source: str) -> str | None:
        """Return the exact stored source document content."""
        return self._vector_store.get_document_content(source)

    def remove_source(self, source: str) -> bool:
        """Remove all chunks and the document for a given source. Returns True if anything was deleted."""
        return self._vector_store.delete_source(source)

    def list_sources_with_dates(self) -> list[dict]:
        """List all ingested document sources with their ingestion timestamps."""
        return self._vector_store.list_sources_with_dates()
|
docs_kit/cli/__init__.py
ADDED
|
File without changes
|
docs_kit/cli/__main__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import click
|
|
2
|
+
from docs_kit.cli.commands import init_cmd, ingest_cmd, serve_cmd, inspect_cmd, doctor_cmd, query_cmd, fetch_cmd, install_cmd, remove_cmd, list_cmd
|
|
3
|
+
from docs_kit.cli.help import DocsKitGroup, HELP_CONTEXT_SETTINGS, format_examples
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Root command group. DocsKitGroup customizes how click renders the group,
# and HELP_CONTEXT_SETTINGS supplies shared context settings — both come
# from docs_kit.cli.help.
@click.group(
    cls=DocsKitGroup,
    context_settings=HELP_CONTEXT_SETTINGS,
    # Worked examples appended to the bottom of `docs-kit --help` output.
    epilog=format_examples(
        "docs-kit ingest https://docs.example.com",
        'docs-kit query "How do I authenticate?"',
        "docs-kit serve --transport sse --port 3001",
    ),
)
# --version reads the installed "docs-kit" distribution metadata.
@click.version_option(package_name="docs-kit")
def cli():
    """Fetch docs, embed them locally, and expose retrieval over MCP."""
    pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Register each subcommand under its user-facing name; the command objects
# are defined in docs_kit.cli.commands.
cli.add_command(init_cmd, "init")
cli.add_command(ingest_cmd, "ingest")
cli.add_command(serve_cmd, "serve")
cli.add_command(inspect_cmd, "inspect")
cli.add_command(doctor_cmd, "doctor")
cli.add_command(query_cmd, "query")
cli.add_command(fetch_cmd, "fetch")
cli.add_command(install_cmd, "install")
cli.add_command(remove_cmd, "remove")
cli.add_command(list_cmd, "list")


# Allow direct invocation (e.g. `python -m docs_kit.cli`).
if __name__ == "__main__":
    cli()
|