docs-kit 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ from __future__ import annotations
2
+
3
+ from fastembed import SparseTextEmbedding, TextEmbedding
4
+ from qdrant_client.models import SparseVector
5
+
6
+
7
class FastEmbedDenseEmbedding:
    """Dense text embedder backed by a local fastembed ``TextEmbedding`` model.

    Embeddings are computed locally via fastembed, so no API key is required.
    """

    def __init__(self, model: str = "BAAI/bge-small-en-v1.5"):
        # The fastembed model is built once and reused by every embed() call.
        self._embedder = TextEmbedding(model_name=model)

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return one vector per text.

        Each item produced by the underlying embedder is converted with its
        ``tolist()`` method before being returned.
        """
        vectors: list[list[float]] = []
        for raw_vector in self._embedder.embed(texts):
            vectors.append(raw_vector.tolist())
        return vectors
15
+
16
+
17
class FastEmbedSparseEmbedding:
    """Sparse BM25 text embedder backed by fastembed's ``SparseTextEmbedding``."""

    def __init__(self, model: str = "Qdrant/bm25"):
        # The fastembed model is built once and reused by every embed() call.
        self._embedder = SparseTextEmbedding(model_name=model)

    def embed(self, texts: list[str]) -> list[SparseVector]:
        """Embed ``texts`` and return one Qdrant ``SparseVector`` per text."""

        def as_list(sequence):
            # Array-like results expose tolist(); anything else is copied
            # element by element with list().
            if hasattr(sequence, "tolist"):
                return sequence.tolist()
            return list(sequence)

        return [
            SparseVector(indices=as_list(item.indices), values=as_list(item.values))
            for item in self._embedder.embed(texts)
        ]
File without changes
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+ from typing import Protocol, runtime_checkable
3
+ from docs_kit.core.models import Document
4
+
5
+
6
@runtime_checkable
class Fetcher(Protocol):
    """Structural interface for objects that fetch documents from a URL.

    The protocol is ``runtime_checkable``, so ``isinstance(obj, Fetcher)``
    succeeds for any object that exposes a ``fetch`` attribute.
    """

    def fetch(self, url: str) -> list[Document]:
        """Return the documents fetched for ``url``."""
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+
3
+ from docs_kit.connectors.fetchers.llms_txt import LlmsTxtFetcher
4
+
5
+
6
class GitBookFetcher(LlmsTxtFetcher):
    """Fetches documentation from a GitBook site using its AI-native endpoints.

    Nothing is overridden here: construction and ``fetch`` behaviour are
    inherited unchanged from ``LlmsTxtFetcher``.
    """
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ import httpx
6
+
7
+ from docs_kit.core.models import Document
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class LlmsTxtFetcher:
    """Fetches documentation from any site that supports the llms.txt standard.

    Tries /llms-full.txt first (entire docs in one file), then /llms.txt
    (index of individual pages). Raises ValueError if neither is available.

    This is a shared base class for GitBookFetcher and MintlifyFetcher.
    """

    # "[Title](https://example.com/page "optional title")" -> group 1 is the URL
    # alone; an optional whitespace-separated link title is not part of it.
    _MARKDOWN_LINK_RE = re.compile(r'\((https?://[^\s)]+)(?:\s+[^)]*)?\)')
    # Leading markdown list marker ("- ", "* ", "+ ") in front of a bare URL.
    _LIST_MARKER_RE = re.compile(r'^[-*+]\s+')

    def __init__(self, timeout: float = 30.0):
        """Store the per-request timeout (seconds) passed to ``httpx.Client``."""
        self._timeout = timeout

    def fetch(self, base_url: str) -> list[Document]:
        """Fetch the documentation published under ``base_url``.

        Args:
            base_url: Site root; trailing slashes are ignored.

        Returns:
            The documents produced by the first strategy that yields any.

        Raises:
            ValueError: If no strategy produced documents, or if a network
                error occurred while requesting one of the index files.
        """
        base_url = base_url.rstrip("/")
        try:
            with httpx.Client(timeout=self._timeout, follow_redirects=True) as client:
                docs = self._try_llms_full_txt(base_url, client)
                if docs is not None:
                    return docs
                docs = self._try_llms_txt(base_url, client)
                if docs is not None:
                    return docs
                raise ValueError(
                    f"Could not fetch docs from {base_url!r}: "
                    "no /llms-full.txt or /llms.txt found, or all discovered pages failed."
                )
        except httpx.RequestError as exc:
            raise ValueError(f"Network error fetching {base_url!r}: {exc}") from exc

    def _try_llms_full_txt(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Return /llms-full.txt as a single document, or None if unavailable or empty."""
        resp = client.get(f"{base_url}/llms-full.txt")
        if resp.status_code == 200 and resp.text.strip():
            return [Document(
                source=f"{base_url}/llms-full.txt",
                content=resp.text,
                metadata={"format": "markdown", "fetch_method": "llms-full.txt"},
            )]
        return None

    def _try_llms_txt(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Fetch every page listed in /llms.txt.

        Pages that fail (network error, non-200 status, or empty body) are
        logged and skipped so one bad link does not abort the whole crawl.
        Returns None when the index is missing or no page could be fetched.
        """
        resp = client.get(f"{base_url}/llms.txt")
        if resp.status_code != 200:
            return None
        documents = []
        for url in self._parse_llms_txt(resp.text):
            try:
                page_resp = client.get(url)
            except httpx.RequestError as exc:
                # A single unreachable page is treated like any other failed page.
                logger.warning("Skipped %s (%s)", url, exc)
                continue
            if page_resp.status_code == 200 and page_resp.text.strip():
                documents.append(Document(
                    source=url,
                    content=page_resp.text,
                    metadata={"format": "markdown", "fetch_method": "llms.txt"},
                ))
            else:
                logger.warning("Skipped %s (status %d)", url, page_resp.status_code)
        return documents if documents else None

    @staticmethod
    def _parse_llms_txt(content: str) -> list[str]:
        """Extract page URLs from llms.txt index format.

        Recognises, per line, either the first markdown link
        (``[Title](url)``, optionally with a quoted link title) or a bare
        http(s) URL, optionally preceded by a list marker. Blank lines and
        lines starting with ``#`` are ignored. Each URL is returned once, in
        order of first appearance.
        """
        urls: list[str] = []
        for line in content.splitlines():
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Markdown link: [Title](url) or [Title](url "title")
            match = LlmsTxtFetcher._MARKDOWN_LINK_RE.search(line)
            if match:
                urls.append(match.group(1))
                continue
            # Bare URL, possibly written as a list item ("- https://...")
            candidate = LlmsTxtFetcher._LIST_MARKER_RE.sub("", line)
            if candidate.startswith(("http://", "https://")):
                urls.append(candidate.split()[0])  # take first token
        # dict.fromkeys drops duplicates while preserving first-seen order.
        return list(dict.fromkeys(urls))
@@ -0,0 +1,94 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ import xml.etree.ElementTree as ET
6
+ import httpx
7
+
8
+ from docs_kit.connectors.fetchers.llms_txt import LlmsTxtFetcher
9
+ from docs_kit.core.html_utils import extract_main_content
10
+ from docs_kit.core.models import Document
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class MintlifyFetcher(LlmsTxtFetcher):
    """Fetches documentation from a Mintlify site.

    Strategy order:
      1. /llms-full.txt  — full docs in one file (if the site owner enabled it)
      2. /llms.txt       — index of individual pages (if the site owner enabled it)
      3. /sitemap.xml    — fallback: parse the sitemap and scrape each page's HTML

    Most Mintlify sites support the llms.txt standard so strategies 1 and 2
    will succeed. Strategy 3 is a fallback for sites where llms.txt is disabled.
    """

    def fetch(self, base_url: str) -> list[Document]:
        """Fetch the documentation published under ``base_url``.

        Args:
            base_url: Site root; trailing slashes are ignored.

        Returns:
            The documents produced by the first strategy that yields any.

        Raises:
            ValueError: If no strategy produced documents, or if a network
                error occurred while requesting one of the index files.
        """
        base_url = base_url.rstrip("/")
        try:
            with httpx.Client(timeout=self._timeout, follow_redirects=True) as client:
                docs = self._try_llms_full_txt(base_url, client)
                if docs is not None:
                    return docs

                docs = self._try_llms_txt(base_url, client)
                if docs is not None:
                    return docs

                docs = self._try_sitemap(base_url, client)
                if docs is not None:
                    return docs

                raise ValueError(
                    f"Could not fetch docs from {base_url!r}: "
                    "no /llms-full.txt, /llms.txt, or /sitemap.xml found, "
                    "or all discovered pages failed. Is this a public Mintlify site?"
                )
        except httpx.RequestError as exc:
            raise ValueError(f"Network error fetching {base_url!r}: {exc}") from exc

    def _try_sitemap(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Scrape every page listed in /sitemap.xml.

        Pages that fail (network error, non-200 status, empty body, or no
        extractable main content) are logged and skipped so one bad page does
        not abort the whole crawl. Returns None when the sitemap is missing,
        lists no URLs, or no page yielded content.
        """
        resp = client.get(f"{base_url}/sitemap.xml")
        if resp.status_code != 200 or not resp.text.strip():
            return None

        urls = self._parse_sitemap(resp.text)
        if not urls:
            return None

        documents = []
        for url in urls:
            try:
                page_resp = client.get(url)
            except httpx.RequestError as exc:
                # A single unreachable page is treated like any other failed page.
                logger.warning("Skipped %s (%s)", url, exc)
                continue
            if page_resp.status_code != 200 or not page_resp.text.strip():
                logger.warning("Skipped %s (status %d)", url, page_resp.status_code)
                continue
            content = extract_main_content(page_resp.text)
            if not content:
                logger.warning("Skipped %s (no main content extracted)", url)
                continue
            documents.append(Document(
                source=url,
                content=content,
                metadata={"format": "markdown", "fetch_method": "sitemap.xml"},
            ))

        return documents if documents else None

    @staticmethod
    def _parse_sitemap(xml_content: str) -> list[str]:
        """Extract page URLs from a standard XML sitemap.

        Looks for ``<loc>`` elements in the sitemaps.org namespace first and
        falls back to un-namespaced ``<loc>`` elements. Blank entries are
        dropped and each URL is returned once, in document order. Returns an
        empty list (after logging a warning) if the XML cannot be parsed.
        """
        # NOTE(review): the sitemap is untrusted remote XML and the Python docs
        # warn that xml.etree is not secure against maliciously constructed
        # data; consider a hardened parser if a new dependency is acceptable.
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError as exc:
            logger.warning("Failed to parse sitemap.xml: %s", exc)
            return []

        def loc_texts(elements) -> list[str]:
            # Keep only <loc> entries that contain a non-blank URL.
            return [el.text.strip() for el in elements if el.text and el.text.strip()]

        # Sitemap namespace
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        urls = loc_texts(root.findall(".//sm:loc", ns))
        # Fallback: no namespace
        if not urls:
            urls = loc_texts(root.findall(".//loc"))
        # dict.fromkeys drops duplicates while preserving first-seen order.
        return list(dict.fromkeys(urls))
@@ -0,0 +1,4 @@
1
+ from docs_kit.connectors.parsers.text import TextLoader
2
+ from docs_kit.connectors.parsers.markdown import MarkdownLoader
3
+
4
+ __all__ = ["TextLoader", "MarkdownLoader"]
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ from typing import Protocol
4
+ from docs_kit.core.models import Document
5
+
6
class DocumentLoader(Protocol):
    """Structural interface for loaders that read one file into a ``Document``."""

    # File suffixes this loader declares it handles.
    supported_extensions: list[str]

    def load(self, path: Path) -> Document:
        """Load the file at ``path`` and return it as a ``Document``."""
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ from docs_kit.core.models import Document
4
+
5
class MarkdownLoader:
    """Loads ``.md`` files into ``Document`` objects."""

    supported_extensions = [".md"]

    def load(self, path: Path) -> Document:
        """Read ``path`` as UTF-8 text and return it as a ``Document``.

        The document's ``source`` is the string form of ``path``.
        """
        text = path.read_text(encoding="utf-8")
        return Document(source=str(path), content=text)
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+ from pathlib import Path
3
+ from docs_kit.core.models import Document
4
+
5
class TextLoader:
    """Loads ``.txt`` files into ``Document`` objects."""

    supported_extensions = [".txt"]

    def load(self, path: Path) -> Document:
        """Read ``path`` as UTF-8 text and return it as a ``Document``.

        The document's ``source`` is the string form of ``path``.
        """
        text = path.read_text(encoding="utf-8")
        return Document(source=str(path), content=text)
@@ -0,0 +1,3 @@
1
+ from docs_kit.connectors.vector_stores.qdrant import QdrantStore
2
+
3
+ __all__ = ["QdrantStore"]
@@ -0,0 +1,15 @@
1
+ from __future__ import annotations
2
+ from typing import Protocol
3
+ from qdrant_client.models import SparseVector
4
+ from docs_kit.core.models import Chunk, RetrievedChunk
5
+
6
class VectorStore(Protocol):
    """Structural interface for the vector store used to index and retrieve chunks."""

    def ensure_collection(self, vector_size: int) -> None:
        """Make sure the chunk collection is usable for vectors of ``vector_size``."""

    def upsert(
        self,
        chunks: list[Chunk],
        dense_vectors: list[list[float]],
        sparse_vectors: list[SparseVector] | None = None,
        recreate: bool = False,
    ) -> int:
        """Store ``chunks`` with their vectors and return a count as an int."""

    def query(
        self,
        dense_vector: list[float],
        sparse_vector: SparseVector | None = None,
        limit: int = 5,
        score_threshold: float = 0.0,
    ) -> list[RetrievedChunk]:
        """Return retrieved chunks for the given query vector(s)."""

    def collection_stats(self) -> dict[str, bool | int | None]:
        """Return statistics about the chunk collection."""

    def list_sources(self) -> list[str]:
        """Return the source identifiers known to the store."""

    def get_by_source(self, source: str) -> list[RetrievedChunk]:
        """Return the stored chunks belonging to ``source``."""

    def upsert_document(self, source: str, content: str, recreate: bool = False) -> None:
        """Store the full ``content`` of the document identified by ``source``."""

    def get_document_content(self, source: str) -> str | None:
        """Return the stored content for ``source``, or None."""

    def close(self) -> None:
        """Release the store's resources."""
@@ -0,0 +1,279 @@
1
+ from __future__ import annotations
2
+ import uuid
3
+ from datetime import datetime, timezone
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client.http import models as rest
6
+ from qdrant_client.http.exceptions import UnexpectedResponse
7
+ from qdrant_client.models import FieldCondition, Filter, Fusion, FusionQuery, MatchValue, Prefetch, SparseVector
8
+ from docs_kit.core.models import Chunk, RetrievedChunk
9
+
10
class QdrantStore:
    """Qdrant-backed store for chunk vectors and whole-document payloads.

    Two collections are used:

    * ``<collection_name>`` holds one point per chunk, with a named ``dense``
      vector and, when provided, a ``bm25`` sparse vector.
    * ``<collection_name>__documents`` holds one point per source document
      with the full content in the payload (its 1-dimensional ``doc`` vector
      is a placeholder).
    """

    def __init__(self, client: QdrantClient | None = None, collection_name: str = "knowledge_base",
                 url: str = "", local_path: str = ".qdrant",
                 dense_prefetch_limit: int = 20, sparse_prefetch_limit: int = 20):
        """Create the store.

        Args:
            client: Existing client to use; takes precedence over ``url`` and
                ``local_path``.
            collection_name: Name of the chunk collection.
            url: Qdrant server URL, used when no ``client`` is given.
            local_path: On-disk path for a local client, used when neither
                ``client`` nor ``url`` is given.
            dense_prefetch_limit: Candidates prefetched from the dense index
                for hybrid queries.
            sparse_prefetch_limit: Candidates prefetched from the bm25 index
                for hybrid queries.
        """
        if client is not None:
            self._client = client
        elif url:
            self._client = QdrantClient(url=url)
        else:
            self._client = QdrantClient(path=local_path)
        self._collection_name = collection_name
        self._documents_collection_name = f"{collection_name}__documents"
        self._dense_prefetch_limit = dense_prefetch_limit
        self._sparse_prefetch_limit = sparse_prefetch_limit

    def ensure_collection(self, vector_size: int) -> None:
        """Create the chunk collection, or validate it if it already exists.

        Raises:
            ValueError: If an existing collection lacks the named ``dense``
                vector, has a different dense dimension than ``vector_size``,
                or lacks the ``bm25`` sparse index.
        """
        if not self._client.collection_exists(self._collection_name):
            self._client.create_collection(
                collection_name=self._collection_name,
                vectors_config={"dense": rest.VectorParams(size=vector_size, distance=rest.Distance.COSINE)},
                sparse_vectors_config={"bm25": rest.SparseVectorParams(modifier=rest.Modifier.IDF)},
            )
            return

        info = self._client.get_collection(self._collection_name)
        params = getattr(getattr(info, "config", None), "params", None)
        vectors_config = getattr(params, "vectors", None)
        sparse_config = getattr(params, "sparse_vectors", None)

        # Must have named dense+bm25 schema (not legacy single-vector).
        has_named_dense = isinstance(vectors_config, dict) and "dense" in vectors_config
        if not has_named_dense:
            raise ValueError(
                f"Collection '{self._collection_name}' exists but uses the old single-vector schema. "
                "Re-ingest with recreate=True to upgrade to the hybrid dense+BM25 schema."
            )

        # Dense dimension must match the current embedding model.
        existing_size = getattr(vectors_config.get("dense"), "size", None)
        if existing_size is not None and existing_size != vector_size:
            raise ValueError(
                f"Collection '{self._collection_name}' has dense vectors of size {existing_size} "
                f"but current embedding model produces size {vector_size}. "
                "Re-ingest with recreate=True to rebuild the collection."
            )

        # Must have the bm25 sparse index for hybrid retrieval.
        has_bm25 = isinstance(sparse_config, dict) and "bm25" in sparse_config
        if not has_bm25:
            raise ValueError(
                f"Collection '{self._collection_name}' is missing the 'bm25' sparse vector index "
                "required for hybrid retrieval. Re-ingest with recreate=True."
            )

    def ensure_documents_collection(self) -> None:
        """Create the documents collection if it does not exist yet."""
        if self._client.collection_exists(self._documents_collection_name):
            return
        self._client.create_collection(
            collection_name=self._documents_collection_name,
            vectors_config={"doc": rest.VectorParams(size=1, distance=rest.Distance.COSINE)},
        )

    def upsert(self, chunks: list[Chunk], dense_vectors: list[list[float]],
               sparse_vectors: list[SparseVector] | None = None, recreate: bool = False) -> int:
        """Insert ``chunks`` with their vectors into the chunk collection.

        Args:
            chunks: Chunks to store; an empty list is a no-op.
            dense_vectors: One dense vector per chunk, in the same order.
            sparse_vectors: Optional bm25 vectors, one per chunk; a falsy
                value stores dense vectors only.
            recreate: Drop both the chunk and documents collections first.

        Returns:
            Number of points written.

        Raises:
            ValueError: If there are fewer vectors than chunks, or the
                collection schema check in ``ensure_collection`` fails.
        """
        if not chunks:
            return 0
        # Validate before the destructive recreate step so bad input cannot
        # leave the store wiped and half-written.
        if len(dense_vectors) < len(chunks):
            raise ValueError(
                f"Got {len(dense_vectors)} dense vectors for {len(chunks)} chunks; "
                "one dense vector per chunk is required."
            )
        if sparse_vectors and len(sparse_vectors) < len(chunks):
            raise ValueError(
                f"Got {len(sparse_vectors)} sparse vectors for {len(chunks)} chunks; "
                "one sparse vector per chunk is required."
            )
        if recreate:
            for name in (self._collection_name, self._documents_collection_name):
                if self._client.collection_exists(name):
                    self._client.delete_collection(name)
        self.ensure_collection(len(dense_vectors[0]))

        ingested_at = datetime.now(timezone.utc).isoformat()
        points = []
        for i, chunk in enumerate(chunks):
            vector: dict = {"dense": dense_vectors[i]}
            if sparse_vectors:
                vector["bm25"] = sparse_vectors[i]
            points.append(rest.PointStruct(
                id=str(uuid.uuid4()), vector=vector,
                payload={"source": chunk.source, "chunk_index": chunk.chunk_index,
                         "text": chunk.text, "ingested_at": ingested_at},
            ))
        self._client.upsert(collection_name=self._collection_name, points=points)
        return len(points)

    def upsert_document(self, source: str, content: str, recreate: bool = False) -> None:
        """Store the full ``content`` for ``source``, replacing any previous entry.

        With ``recreate`` the documents collection is dropped first.
        """
        if recreate and self._client.collection_exists(self._documents_collection_name):
            self._client.delete_collection(self._documents_collection_name)
        self.ensure_documents_collection()
        # Remove earlier points for this source so only one copy is kept.
        self._client.delete(
            collection_name=self._documents_collection_name,
            points_selector=self._source_filter(source),
        )
        point = rest.PointStruct(
            id=str(uuid.uuid4()),
            vector={"doc": [1.0]},
            payload={"source": source, "content": content,
                     "ingested_at": datetime.now(timezone.utc).isoformat()},
        )
        self._client.upsert(collection_name=self._documents_collection_name, points=[point])

    def query(self, dense_vector: list[float], sparse_vector: SparseVector | None = None,
              limit: int = 5, score_threshold: float = 0.0) -> list[RetrievedChunk]:
        """Retrieve the best-matching chunks.

        When ``sparse_vector`` is given, bm25 and dense candidates are
        prefetched and fused with RRF; otherwise a dense-only query is run.
        Points whose payload has no text are dropped from the result.
        """
        if sparse_vector is not None:
            search_result = self._client.query_points(
                collection_name=self._collection_name,
                prefetch=[
                    Prefetch(query=sparse_vector, using="bm25", limit=self._sparse_prefetch_limit),
                    Prefetch(query=dense_vector, using="dense", limit=self._dense_prefetch_limit),
                ],
                query=FusionQuery(fusion=Fusion.RRF), limit=limit,
                score_threshold=score_threshold, with_payload=True,
            )
        else:
            search_result = self._client.query_points(
                collection_name=self._collection_name, query=dense_vector, using="dense",
                limit=limit, score_threshold=score_threshold, with_payload=True,
            )
        # Accept either a response object with .points or a plain sequence.
        points = getattr(search_result, "points", search_result)
        results: list[RetrievedChunk] = []
        for point in points:
            payload = point.payload or {}
            text = str(payload.get("text", ""))
            if text:
                results.append(RetrievedChunk(
                    source=str(payload.get("source", "unknown")),
                    chunk_index=int(payload.get("chunk_index", 0)),
                    text=text, score=float(getattr(point, "score", 0.0)),
                ))
        return results

    def collection_stats(self) -> dict[str, bool | int | None]:
        """Report whether the chunk collection exists and how many points it has.

        A missing collection yields ``{"collection_exists": False,
        "points_count": 0}``. Existence is checked with ``collection_exists``
        first, so this does not depend on which exception the client raises
        for a missing collection; the 404 handler remains as a guard for a
        collection deleted between the two calls.
        """
        missing: dict[str, bool | int | None] = {"collection_exists": False, "points_count": 0}
        if not self._client.collection_exists(self._collection_name):
            return missing
        try:
            info = self._client.get_collection(self._collection_name)
        except UnexpectedResponse as exc:
            if getattr(exc, "status_code", None) == 404:
                return missing
            raise
        points_count = getattr(info, "points_count", None)
        if points_count is None:
            # Some response shapes nest the info under a .result attribute.
            result = getattr(info, "result", None)
            points_count = getattr(result, "points_count", None)
        return {"collection_exists": True,
                "points_count": int(points_count) if points_count is not None else None}

    def list_sources(self) -> list[str]:
        """Return the sorted, de-duplicated sources found in the chunk collection."""
        if not self._client.collection_exists(self._collection_name):
            return []
        sources: set[str] = set()
        for point in self._scroll_all(self._collection_name, with_payload=["source"]):
            src = (point.payload or {}).get("source")
            if src is not None:
                sources.add(str(src))
        return sorted(sources)

    def get_by_source(self, source: str) -> list[RetrievedChunk]:
        """Return all chunks stored for ``source``, ordered by chunk index.

        Every returned chunk carries a fixed score of 1.0 because no
        similarity search is involved.
        """
        if not self._client.collection_exists(self._collection_name):
            return []
        results = []
        for point in self._scroll_all(self._collection_name, with_payload=True,
                                      scroll_filter=self._source_filter(source)):
            payload = point.payload or {}
            text = str(payload.get("text", ""))
            if text:
                results.append(RetrievedChunk(
                    source=str(payload.get("source", source)),
                    chunk_index=int(payload.get("chunk_index", 0)),
                    text=text,
                    score=1.0,
                ))
        return sorted(results, key=lambda c: c.chunk_index)

    def get_document_content(self, source: str) -> str | None:
        """Return the stored full content for ``source``, or None if absent."""
        if not self._client.collection_exists(self._documents_collection_name):
            return None
        points, _next_offset = self._client.scroll(
            collection_name=self._documents_collection_name,
            scroll_filter=self._source_filter(source),
            limit=1,
            with_payload=True,
            with_vectors=False,
        )
        if not points:
            return None
        content = (points[0].payload or {}).get("content")
        if content is None:
            return None
        return str(content)

    def delete_source(self, source: str) -> bool:
        """Delete all chunks and the document for a given source.

        Returns True only if at least one matching point existed in either
        collection; deleting an unknown source returns False.
        """
        source_filter = self._source_filter(source)
        deleted_chunks = self._delete_matching(self._collection_name, source_filter)
        deleted_document = self._delete_matching(self._documents_collection_name, source_filter)
        return deleted_chunks or deleted_document

    def list_sources_with_dates(self) -> list[dict]:
        """List all ingested document sources with their ingestion timestamps.

        Reads the documents collection; entries are sorted by source and a
        missing timestamp is reported as ``"unknown"``.
        """
        if not self._client.collection_exists(self._documents_collection_name):
            return []
        entries = []
        for point in self._scroll_all(self._documents_collection_name,
                                      with_payload=["source", "ingested_at"]):
            payload = point.payload or {}
            src = payload.get("source")
            if src is not None:
                entries.append({
                    "source": str(src),
                    "ingested_at": str(payload.get("ingested_at", "unknown")),
                })
        return sorted(entries, key=lambda e: e["source"])

    def close(self) -> None:
        """Close the underlying Qdrant client."""
        self._client.close()

    @staticmethod
    def _source_filter(source: str) -> Filter:
        """Build the filter matching points whose payload ``source`` equals ``source``."""
        return Filter(must=[FieldCondition(key="source", match=MatchValue(value=source))])

    def _scroll_all(self, collection_name: str, with_payload, scroll_filter: Filter | None = None):
        """Yield every point of ``collection_name`` matching ``scroll_filter``.

        Pages through the collection 100 points at a time, without vectors,
        until the client reports no further offset.
        """
        offset = None
        while True:
            points, offset = self._client.scroll(
                collection_name=collection_name,
                scroll_filter=scroll_filter,
                limit=100,
                offset=offset,
                with_payload=with_payload,
                with_vectors=False,
            )
            yield from points
            if offset is None:
                break

    def _delete_matching(self, collection_name: str, points_filter: Filter) -> bool:
        """Delete points matching ``points_filter``; return whether any existed."""
        if not self._client.collection_exists(collection_name):
            return False
        # The delete call does not report how many points it removed, so count
        # the matches first to give callers a truthful answer.
        matched = self._client.count(
            collection_name=collection_name, count_filter=points_filter, exact=True,
        ).count
        if not matched:
            return False
        self._client.delete(collection_name=collection_name, points_selector=points_filter)
        return True
File without changes