docs-kit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docs_kit/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """docs-kit: Fetch docs, embed locally, expose via MCP for AI agents."""
2
+
3
+ from docs_kit._version import __version__
4
+
5
# Names exported by ``from docs_kit import *``. Only ``__version__`` is bound
# eagerly by the import above; the remaining names are resolved on first
# access by the module-level ``__getattr__`` defined in this file.
__all__ = [
    "__version__",
    "DocsKitAgent",
    "DocsKitConfig",
    "Chunk",
    "Document",
    "RetrievedChunk",
]
13
+
14
+
15
def __getattr__(name: str):
    """Resolve the package's lazily exported names on attribute access.

    Python invokes this module-level hook only when ``name`` is not found in
    the module namespace, so each submodule below is imported on demand
    rather than when ``docs_kit`` itself is imported.

    Args:
        name: The attribute being looked up on the package.

    Returns:
        The requested class from the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name in {"Chunk", "Document", "RetrievedChunk"}:
        from docs_kit.core.models import Chunk, Document, RetrievedChunk

        model_classes = {
            "Chunk": Chunk,
            "Document": Document,
            "RetrievedChunk": RetrievedChunk,
        }
        return model_classes[name]

    if name == "DocsKitConfig":
        from docs_kit.core.config import DocsKitConfig

        return DocsKitConfig

    if name == "DocsKitAgent":
        from docs_kit.agent import DocsKitAgent

        return DocsKitAgent

    # Mirror the message Python itself produces for a missing module attribute.
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
docs_kit/__main__.py ADDED
@@ -0,0 +1,4 @@
1
# Entry point for ``python -m docs_kit``: run the ``cli`` command imported
# from ``docs_kit.cli.__main__``.
from docs_kit.cli.__main__ import cli

# Only invoke the CLI when executed as a script, not when merely imported.
if __name__ == "__main__":
    cli()
docs_kit/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.1"
docs_kit/agent.py ADDED
@@ -0,0 +1,190 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ from docs_kit.core.config import DocsKitConfig
8
+ from docs_kit.core.models import Chunk, Document, RetrievedChunk
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Maps a file suffix (with its leading dot) to the loader class that parses
# it. Each value is a "package.module:ClassName" spec that ``_import_class``
# resolves, so a loader module is imported only when a loader is requested.
# Callers in this file lowercase the suffix before looking it up.
_PARSERS = {
    ".txt": "docs_kit.connectors.parsers.text:TextLoader",
    ".md": "docs_kit.connectors.parsers.markdown:MarkdownLoader",
}
16
+
17
+
18
def _import_class(dotted_path: str) -> type:
    """Import and return the object named by a ``"package.module:Name"`` spec.

    Args:
        dotted_path: Module path and attribute name separated by a colon.
            Only the last colon is treated as the separator.

    Returns:
        The attribute ``Name`` looked up on the imported module.

    Raises:
        ValueError: If ``dotted_path`` contains no ``":"`` separator.
        ImportError: If the module cannot be imported.
        AttributeError: If the module has no attribute ``Name``.
    """
    module_path, separator, class_name = dotted_path.rpartition(":")
    if not separator:
        # Unpacking a failed split would surface as an opaque "not enough
        # values to unpack" error; report the malformed spec instead.
        raise ValueError(
            f"Invalid class path {dotted_path!r}: expected 'package.module:ClassName'."
        )
    module = importlib.import_module(module_path)
    return getattr(module, class_name)
22
+
23
+
24
class DocsKitAgent:
    """High-level API for docs-kit.

    Holds a dense embedder, a sparse embedder and a vector store built from a
    ``DocsKitConfig`` and exposes ingestion and retrieval on top of them.

    Usage:
        agent = DocsKitAgent()
        agent.ingest("./docs/")
        results = agent.query("how do I get started?")
    """

    def __init__(self, config: DocsKitConfig | None = None, _lazy_init: bool = False):
        """Create an agent.

        Args:
            config: Settings to use; a default ``DocsKitConfig()`` is built
                when omitted.
            _lazy_init: When true, skip building the embedders and the vector
                store. They stay ``None`` until ``_init_components()`` is
                called or they are assigned directly, so methods that use
                them fail before then.
        """
        if config is None:
            config = DocsKitConfig()
        self.config = config
        self._dense_embedder = None
        self._sparse_embedder = None
        self._vector_store = None

        if not _lazy_init:
            self._init_components()

    def _init_components(self) -> None:
        """Build the embedders and the vector store described by ``self.config``.

        Raises:
            ValueError: If the configured embedding provider is not
                ``"fastembed"`` or the vector store provider is not ``"qdrant"``.
        """
        # Validate both providers before constructing anything, so a bad
        # vector-store setting is reported without first building embedders.
        embedding_provider = self.config.embedding.provider
        if embedding_provider != "fastembed":
            raise ValueError(
                f"Unsupported embedding provider '{embedding_provider}'. Supported: fastembed."
            )
        vector_store_provider = self.config.vector_store.provider
        if vector_store_provider != "qdrant":
            raise ValueError(
                f"Unsupported vector store provider '{vector_store_provider}'. Supported: qdrant."
            )

        from docs_kit.connectors.embeddings.fastembed import FastEmbedDenseEmbedding, FastEmbedSparseEmbedding

        self._dense_embedder = FastEmbedDenseEmbedding(model=self.config.embedding.model)
        self._sparse_embedder = FastEmbedSparseEmbedding(model=self.config.ingestion.bm25_model)

        from docs_kit.connectors.vector_stores.qdrant import QdrantStore

        store_config = self.config.vector_store
        self._vector_store = QdrantStore(
            collection_name=store_config.collection_name,
            url=store_config.url,
            local_path=store_config.local_path,
            dense_prefetch_limit=store_config.dense_prefetch_limit,
            sparse_prefetch_limit=store_config.sparse_prefetch_limit,
        )

    def _get_loader(self, suffix: str):
        """Instantiate the loader registered in ``_PARSERS`` for ``suffix``.

        Raises:
            ValueError: If no parser is registered for ``suffix``.
        """
        parser_path = _PARSERS.get(suffix)
        if not parser_path:
            raise ValueError(f"No parser for '{suffix}'. Supported: {list(_PARSERS.keys())}")
        loader_cls = _import_class(parser_path)
        return loader_cls()

    def _document_is_markdown(self, doc: Document) -> bool:
        """Return whether ``doc`` should be chunked as Markdown.

        A document counts as Markdown when its source ends with ``.md``
        (case-insensitively), its ``content_type`` metadata mentions
        ``markdown``, or its ``format`` metadata equals ``markdown``.
        """
        content_type = str(doc.metadata.get("content_type", "")).lower()
        doc_format = str(doc.metadata.get("format", "")).lower()
        return doc.source.lower().endswith(".md") or "markdown" in content_type or doc_format == "markdown"

    def _build_chunks(self, doc: Document) -> list[Chunk]:
        """Clean ``doc.content`` and split it into ``Chunk`` objects.

        Each chunk carries the document's source, its position in the
        document, and its own copy of the document metadata.
        """
        from docs_kit.core.chunking import chunk_markdown, chunk_text
        from docs_kit.core.html_utils import clean_html

        chunk_size = self.config.ingestion.chunk_size
        chunk_overlap = self.config.ingestion.chunk_overlap
        content = clean_html(doc.content)
        splitter = chunk_markdown if self._document_is_markdown(doc) else chunk_text
        text_chunks = splitter(content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return [
            # dict(...) gives every chunk an independent metadata mapping.
            Chunk(text=text, source=doc.source, chunk_index=index, metadata=dict(doc.metadata))
            for index, text in enumerate(text_chunks)
        ]

    def ingest_documents(self, documents: list[Document], recreate: bool = False) -> int:
        """Chunk, embed and store ``documents``.

        Args:
            documents: Documents to ingest; documents that yield no chunks
                are skipped entirely.
            recreate: Forwarded to the vector store for the first document
                that yields chunks; later documents always pass ``False``.

        Returns:
            The sum of the counts returned by the vector store's ``upsert``.
        """
        total = 0
        should_recreate = recreate
        for doc in documents:
            chunks = self._build_chunks(doc)
            if not chunks:
                continue
            # Build the text list once and share it between both embedders.
            texts = [chunk.text for chunk in chunks]
            dense_vecs = self._dense_embedder.embed(texts)
            sparse_vecs = self._sparse_embedder.embed(texts)
            count = self._vector_store.upsert(chunks, dense_vecs, sparse_vecs, recreate=should_recreate)
            self._vector_store.upsert_document(doc.source, doc.content, recreate=should_recreate)
            should_recreate = False
            total += count
        return total

    @staticmethod
    def _discover_files(root: Path, suffixes) -> list[Path]:
        """Return the regular files under ``root`` whose suffix is in ``suffixes``.

        The search is recursive, suffix matching is case-insensitive, and the
        result is sorted. Directories are excluded even when their name ends
        with a supported suffix, since a loader cannot read them.
        """
        return sorted(
            candidate
            for candidate in root.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() in suffixes
        )

    def ingest(self, path: str | Path, recreate: bool = False) -> int:
        """Ingest a single file, or every supported file under a directory.

        Args:
            path: File or directory to ingest.
            recreate: Forwarded to ``ingest_documents`` until one file has
                produced at least one chunk; ``False`` afterwards.

        Returns:
            Total number of chunks reported across all files.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If a directory holds no supported files, or a file's
                suffix has no registered parser.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Path not found: {path}")
        if path.is_file():
            files = [path]
        else:
            files = self._discover_files(path, frozenset(_PARSERS))
        if not files:
            raise ValueError("No supported documents found.")
        total = 0
        should_recreate = recreate
        for file_path in files:
            loader = self._get_loader(file_path.suffix.lower())
            doc = loader.load(file_path)
            count = self.ingest_documents([doc], recreate=should_recreate)
            if count > 0:
                should_recreate = False
            total += count
            logger.info("Ingested %d chunks from %s", count, file_path)
        return total

    def _get_fetcher(self, provider: str | None, fetcher):
        """Return ``fetcher`` if given, otherwise a fetcher chosen by ``provider``."""
        if fetcher is not None:
            return fetcher
        if provider == "gitbook":
            from docs_kit.connectors.fetchers.gitbook import GitBookFetcher

            return GitBookFetcher()
        # "mintlify" or "auto" or None — MintlifyFetcher is the superset fetcher:
        # it tries llms-full.txt → llms.txt → sitemap.xml, so it works for both.
        from docs_kit.connectors.fetchers.mintlify import MintlifyFetcher

        return MintlifyFetcher()

    def ingest_url(self, url: str, recreate: bool = False, fetcher=None, provider: str | None = None) -> int:
        """Fetch documents from a URL and ingest them.

        Args:
            url: The base URL of the documentation site.
            recreate: Drop and recreate the collection before ingesting.
            fetcher: Optional custom fetcher instance (overrides provider).
            provider: ``"gitbook"`` selects the GitBook fetcher; any other
                value, including the default ``None``, ``"auto"`` and
                ``"mintlify"``, selects the Mintlify fetcher.

        Returns:
            Number of chunks reported by ``ingest_documents``.
        """
        documents = self._get_fetcher(provider, fetcher).fetch(url)
        return self.ingest_documents(documents, recreate=recreate)

    def fetch_documents(self, url: str, fetcher=None, provider: str | None = None) -> list[Document]:
        """Fetch documents from a URL without ingesting."""
        return self._get_fetcher(provider, fetcher).fetch(url)

    def query(self, text: str, limit: int | None = None) -> list[RetrievedChunk]:
        """Embed ``text`` with both embedders and query the vector store.

        Args:
            text: The query string.
            limit: Maximum number of results; falls back to the configured
                ``vector_store.retrieval_limit`` when ``None``.
        """
        effective_limit = limit if limit is not None else self.config.vector_store.retrieval_limit
        dense_vec = self._dense_embedder.embed([text])[0]
        sparse_vec = self._sparse_embedder.embed([text])[0]
        return self._vector_store.query(
            dense_vector=dense_vec,
            sparse_vector=sparse_vec,
            limit=effective_limit,
            score_threshold=self.config.vector_store.score_threshold,
        )

    def collection_stats(self) -> dict:
        """Return the vector store's collection statistics."""
        return self._vector_store.collection_stats()

    def get_collection_info(self) -> dict:
        """Get stats about the vector store collection.

        Alias of ``collection_stats``.
        """
        return self.collection_stats()

    def list_sources(self) -> list[str]:
        """List all unique document sources in the knowledge base."""
        return self._vector_store.list_sources()

    def get_document(self, source: str) -> str | None:
        """Return the exact stored source document content."""
        return self._vector_store.get_document_content(source)

    def remove_source(self, source: str) -> bool:
        """Remove all chunks and the document for a given source. Returns True if anything was deleted."""
        return self._vector_store.delete_source(source)

    def list_sources_with_dates(self) -> list[dict]:
        """List all ingested document sources with their ingestion timestamps."""
        return self._vector_store.list_sources_with_dates()
File without changes
@@ -0,0 +1,34 @@
1
+ import click
2
+ from docs_kit.cli.commands import init_cmd, ingest_cmd, serve_cmd, inspect_cmd, doctor_cmd, query_cmd, fetch_cmd, install_cmd, remove_cmd, list_cmd
3
+ from docs_kit.cli.help import DocsKitGroup, HELP_CONTEXT_SETTINGS, format_examples
4
+
5
+
6
# Root command group. The epilog lists example invocations, and the version
# option reports the version of the installed "docs-kit" distribution.
@click.group(
    cls=DocsKitGroup,
    context_settings=HELP_CONTEXT_SETTINGS,
    epilog=format_examples(
        "docs-kit ingest https://docs.example.com",
        'docs-kit query "How do I authenticate?"',
        "docs-kit serve --transport sse --port 3001",
    ),
)
@click.version_option(package_name="docs-kit")
def cli():
    """Fetch docs, embed them locally, and expose retrieval over MCP."""
19
+
20
+
21
# Register each subcommand on the root group under its public CLI name.
cli.add_command(init_cmd, "init")
cli.add_command(ingest_cmd, "ingest")
cli.add_command(serve_cmd, "serve")
cli.add_command(inspect_cmd, "inspect")
cli.add_command(doctor_cmd, "doctor")
cli.add_command(query_cmd, "query")
cli.add_command(fetch_cmd, "fetch")
cli.add_command(install_cmd, "install")
cli.add_command(remove_cmd, "remove")
cli.add_command(list_cmd, "list")


# Run the CLI only when this module is executed directly.
if __name__ == "__main__":
    cli()