docs-kit 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs_kit/__init__.py +32 -0
- docs_kit/__main__.py +4 -0
- docs_kit/_version.py +1 -0
- docs_kit/agent.py +190 -0
- docs_kit/cli/__init__.py +0 -0
- docs_kit/cli/__main__.py +34 -0
- docs_kit/cli/commands.py +542 -0
- docs_kit/cli/help.py +140 -0
- docs_kit/connectors/__init__.py +0 -0
- docs_kit/connectors/embeddings/__init__.py +3 -0
- docs_kit/connectors/embeddings/base.py +9 -0
- docs_kit/connectors/embeddings/fastembed.py +30 -0
- docs_kit/connectors/fetchers/__init__.py +0 -0
- docs_kit/connectors/fetchers/base.py +8 -0
- docs_kit/connectors/fetchers/gitbook.py +7 -0
- docs_kit/connectors/fetchers/llms_txt.py +85 -0
- docs_kit/connectors/fetchers/mintlify.py +94 -0
- docs_kit/connectors/parsers/__init__.py +4 -0
- docs_kit/connectors/parsers/base.py +8 -0
- docs_kit/connectors/parsers/markdown.py +8 -0
- docs_kit/connectors/parsers/text.py +8 -0
- docs_kit/connectors/vector_stores/__init__.py +3 -0
- docs_kit/connectors/vector_stores/base.py +15 -0
- docs_kit/connectors/vector_stores/qdrant.py +279 -0
- docs_kit/core/__init__.py +0 -0
- docs_kit/core/chunking.py +227 -0
- docs_kit/core/config.py +67 -0
- docs_kit/core/html_utils.py +78 -0
- docs_kit/core/models.py +28 -0
- docs_kit/mcp/__init__.py +0 -0
- docs_kit/mcp/server.py +100 -0
- docs_kit/mcp/tools.py +10 -0
- docs_kit-0.1.1.dist-info/METADATA +268 -0
- docs_kit-0.1.1.dist-info/RECORD +37 -0
- docs_kit-0.1.1.dist-info/WHEEL +4 -0
- docs_kit-0.1.1.dist-info/entry_points.txt +2 -0
- docs_kit-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from fastembed import SparseTextEmbedding, TextEmbedding
|
|
4
|
+
from qdrant_client.models import SparseVector
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FastEmbedDenseEmbedding:
    """Dense text embedder backed by a local fastembed ``TextEmbedding`` model.

    Embeddings are computed locally through fastembed, so no API key is required.
    """

    def __init__(self, model: str = "BAAI/bge-small-en-v1.5"):
        # Build the fastembed model once; every embed() call reuses it.
        self._embedder = TextEmbedding(model_name=model)

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one dense vector per input text, each as a plain Python list."""
        vectors: list[list[float]] = []
        for raw_vector in self._embedder.embed(texts):
            # The embedder yields array-like items; tolist() turns each into a list.
            vectors.append(raw_vector.tolist())
        return vectors
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FastEmbedSparseEmbedding:
    """Sparse text embedder backed by a fastembed ``SparseTextEmbedding`` model (BM25 by default)."""

    def __init__(self, model: str = "Qdrant/bm25"):
        # Build the fastembed sparse model once; every embed() call reuses it.
        self._embedder = SparseTextEmbedding(model_name=model)

    def embed(self, texts: list[str]) -> list[SparseVector]:
        """Return one Qdrant ``SparseVector`` per input text."""

        def as_plain_list(sequence):
            # Array-like objects expose tolist(); anything else is copied via list().
            return sequence.tolist() if hasattr(sequence, "tolist") else list(sequence)

        return [
            SparseVector(indices=as_plain_list(item.indices), values=as_plain_list(item.values))
            for item in self._embedder.embed(texts)
        ]
|
|
File without changes
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from docs_kit.core.models import Document
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LlmsTxtFetcher:
    """Fetches documentation from any site that supports the llms.txt standard.

    Tries /llms-full.txt first (entire docs in one file), then /llms.txt
    (index of individual pages). Raises ValueError if neither is available.

    This is a shared base class for GitBookFetcher and MintlifyFetcher.
    """

    # Markdown link target "(url)" or "(url "title")".  The URL stops at the
    # first whitespace or ")" so an optional link title is never captured.
    _MARKDOWN_URL_RE = re.compile(r"\((https?://[^\s)]+)(?:\s[^)]*)?\)")

    def __init__(self, timeout: float = 30.0):
        # Timeout (seconds) handed to the httpx client created in fetch().
        self._timeout = timeout

    def fetch(self, base_url: str) -> list[Document]:
        """Download the documentation published under ``base_url``.

        Args:
            base_url: Site root; trailing slashes are ignored.

        Returns:
            The documents produced by the first strategy that yields any.

        Raises:
            ValueError: If no strategy yields a document, or if a request
                for one of the index files fails at the network level.
        """
        base_url = base_url.rstrip("/")
        try:
            with httpx.Client(timeout=self._timeout, follow_redirects=True) as client:
                for strategy in (self._try_llms_full_txt, self._try_llms_txt):
                    docs = strategy(base_url, client)
                    if docs is not None:
                        return docs
        except httpx.RequestError as exc:
            raise ValueError(f"Network error fetching {base_url!r}: {exc}") from exc
        raise ValueError(
            f"Could not fetch docs from {base_url!r}: "
            "no /llms-full.txt or /llms.txt found, or all discovered pages failed."
        )

    def _try_llms_full_txt(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Return the single-file docs as a one-element list, or None if unavailable."""
        resp = client.get(f"{base_url}/llms-full.txt")
        if resp.status_code == 200 and resp.text.strip():
            return [Document(
                source=f"{base_url}/llms-full.txt",
                content=resp.text,
                metadata={"format": "markdown", "fetch_method": "llms-full.txt"},
            )]
        return None

    def _try_llms_txt(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Fetch every page listed in /llms.txt; None if the index or all pages fail."""
        resp = client.get(f"{base_url}/llms.txt")
        if resp.status_code != 200:
            return None
        documents = []
        for url in self._parse_llms_txt(resp.text):
            try:
                page_resp = client.get(url)
            except httpx.RequestError as exc:
                # One unreachable page must not discard the pages that did
                # load; treat it like a non-200 response and move on.
                logger.warning("Skipped %s (%s)", url, exc)
                continue
            if page_resp.status_code == 200 and page_resp.text.strip():
                documents.append(Document(
                    source=url,
                    content=page_resp.text,
                    metadata={"format": "markdown", "fetch_method": "llms.txt"},
                ))
            else:
                logger.warning("Skipped %s (status %d)", url, page_resp.status_code)
        return documents or None

    @staticmethod
    def _parse_llms_txt(content: str) -> list[str]:
        """Extract URLs from llms.txt index format.

        Recognises Markdown links (``[Title](url)``) and bare URLs at the
        start of a line.  Blank lines and lines starting with ``#`` are
        ignored.  Each URL is returned once, in order of first appearance.
        """
        # A dict keeps insertion order and drops repeated links, so the same
        # page is never fetched (and ingested) twice.
        urls: dict[str, None] = {}
        for raw_line in content.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            match = LlmsTxtFetcher._MARKDOWN_URL_RE.search(line)
            if match:
                urls.setdefault(match.group(1))
            elif line.startswith(("http://", "https://")):
                # Bare URL: keep only the first whitespace-delimited token.
                urls.setdefault(line.split()[0])
        return list(urls)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import xml.etree.ElementTree as ET
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from docs_kit.connectors.fetchers.llms_txt import LlmsTxtFetcher
|
|
9
|
+
from docs_kit.core.html_utils import extract_main_content
|
|
10
|
+
from docs_kit.core.models import Document
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MintlifyFetcher(LlmsTxtFetcher):
    """Fetches documentation from a Mintlify site.

    Strategy order:
    1. /llms-full.txt — full docs in one file (if the site owner enabled it)
    2. /llms.txt — index of individual pages (if the site owner enabled it)
    3. /sitemap.xml — fallback: parse the sitemap and scrape each page's HTML

    Most Mintlify sites support the llms.txt standard so strategies 1 and 2
    will succeed. Strategy 3 is a fallback for sites where llms.txt is disabled.
    """

    def fetch(self, base_url: str) -> list[Document]:
        """Download the documentation published under ``base_url``.

        Args:
            base_url: Site root; trailing slashes are ignored.

        Returns:
            The documents produced by the first strategy that yields any.

        Raises:
            ValueError: If no strategy yields a document, or if a request
                for one of the index files fails at the network level.
        """
        base_url = base_url.rstrip("/")
        try:
            with httpx.Client(timeout=self._timeout, follow_redirects=True) as client:
                strategies = (self._try_llms_full_txt, self._try_llms_txt, self._try_sitemap)
                for strategy in strategies:
                    docs = strategy(base_url, client)
                    if docs is not None:
                        return docs
        except httpx.RequestError as exc:
            raise ValueError(f"Network error fetching {base_url!r}: {exc}") from exc
        raise ValueError(
            f"Could not fetch docs from {base_url!r}: "
            "no /llms-full.txt, /llms.txt, or /sitemap.xml found, "
            "or all discovered pages failed. Is this a public Mintlify site?"
        )

    def _try_sitemap(self, base_url: str, client: httpx.Client) -> list[Document] | None:
        """Scrape every page listed in /sitemap.xml; None if nothing usable is found."""
        resp = client.get(f"{base_url}/sitemap.xml")
        if resp.status_code != 200 or not resp.text.strip():
            return None

        documents = []
        for url in self._parse_sitemap(resp.text):
            try:
                page_resp = client.get(url)
            except httpx.RequestError as exc:
                # One unreachable page must not discard the pages that did
                # load; treat it like a non-200 response and move on.
                logger.warning("Skipped %s (%s)", url, exc)
                continue
            if page_resp.status_code == 200 and page_resp.text.strip():
                content = extract_main_content(page_resp.text)
                if content:
                    documents.append(Document(
                        source=url,
                        content=content,
                        metadata={"format": "markdown", "fetch_method": "sitemap.xml"},
                    ))
            else:
                logger.warning("Skipped %s (status %d)", url, page_resp.status_code)

        return documents or None

    @staticmethod
    def _parse_sitemap(xml_content: str) -> list[str]:
        """Extract page URLs from a standard XML sitemap.

        Every ``<loc>`` element is collected regardless of its parent, first
        with the sitemaps.org namespace and then, if that finds nothing,
        without a namespace.  Empty entries are dropped and each URL is
        returned once, in document order.  Unparseable XML yields ``[]``.
        """
        # NOTE(review): the sitemap comes from a remote site and is parsed
        # with the stdlib XML parser, which the Python docs describe as not
        # secure against maliciously constructed data.
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError as exc:
            logger.warning("Failed to parse sitemap.xml: %s", exc)
            return []

        def collect(elements) -> list[str]:
            # Strip first so whitespace-only <loc> values are discarded too.
            stripped = (element.text.strip() for element in elements if element.text)
            return [value for value in stripped if value]

        # Sitemap namespace
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        urls = collect(root.findall(".//sm:loc", ns))
        # Fallback: no namespace
        if not urls:
            urls = collect(root.findall(".//loc"))
        # dict.fromkeys removes repeated entries while keeping their order.
        return list(dict.fromkeys(urls))
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from docs_kit.core.models import Document
|
|
4
|
+
|
|
5
|
+
class MarkdownLoader:
    """Loads a Markdown file from disk into a ``Document``."""

    # File extensions advertised by this loader.
    supported_extensions = [".md"]

    def load(self, path: Path) -> Document:
        """Read ``path`` as UTF-8 text and wrap it in a Document keyed by the path string."""
        text = path.read_text(encoding="utf-8")
        return Document(source=str(path), content=text)
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from docs_kit.core.models import Document
|
|
4
|
+
|
|
5
|
+
class TextLoader:
    """Loads a plain-text file from disk into a ``Document``."""

    # File extensions advertised by this loader.
    supported_extensions = [".txt"]

    def load(self, path: Path) -> Document:
        """Read ``path`` as UTF-8 text and wrap it in a Document keyed by the path string."""
        text = path.read_text(encoding="utf-8")
        return Document(source=str(path), content=text)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Protocol
|
|
3
|
+
from qdrant_client.models import SparseVector
|
|
4
|
+
from docs_kit.core.models import Chunk, RetrievedChunk
|
|
5
|
+
|
|
6
|
+
class VectorStore(Protocol):
    """Structural interface that a vector-store backend must provide."""

    def ensure_collection(self, vector_size: int) -> None:
        """Make sure the chunk collection exists for dense vectors of ``vector_size``."""

    def upsert(
        self,
        chunks: list[Chunk],
        dense_vectors: list[list[float]],
        sparse_vectors: list[SparseVector] | None = None,
        recreate: bool = False,
    ) -> int:
        """Store ``chunks`` with their dense (and optional sparse) vectors; returns a count."""

    def query(
        self,
        dense_vector: list[float],
        sparse_vector: SparseVector | None = None,
        limit: int = 5,
        score_threshold: float = 0.0,
    ) -> list[RetrievedChunk]:
        """Return up to ``limit`` retrieved chunks for the given query vector(s)."""

    def collection_stats(self) -> dict[str, bool | int | None]:
        """Return summary information about the collection as a dict."""

    def list_sources(self) -> list[str]:
        """Return the source identifiers known to the store."""

    def get_by_source(self, source: str) -> list[RetrievedChunk]:
        """Return the stored chunks belonging to ``source``."""

    def upsert_document(self, source: str, content: str, recreate: bool = False) -> None:
        """Store the full ``content`` of the document identified by ``source``."""

    def get_document_content(self, source: str) -> str | None:
        """Return the stored full content for ``source``, or None."""

    def close(self) -> None:
        """Release any resources held by the store."""
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import uuid
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from qdrant_client import QdrantClient
|
|
5
|
+
from qdrant_client.http import models as rest
|
|
6
|
+
from qdrant_client.http.exceptions import UnexpectedResponse
|
|
7
|
+
from qdrant_client.models import FieldCondition, Filter, Fusion, FusionQuery, MatchValue, Prefetch, SparseVector
|
|
8
|
+
from docs_kit.core.models import Chunk, RetrievedChunk
|
|
9
|
+
|
|
10
|
+
class QdrantStore:
    """Vector store backed by Qdrant with named ``dense`` and ``bm25`` vectors.

    Chunks are stored in ``collection_name``.  Full document texts are stored
    in a sibling collection named ``<collection_name>__documents``.
    """

    def __init__(self, client: QdrantClient | None = None, collection_name: str = "knowledge_base",
                 url: str = "", local_path: str = ".qdrant",
                 dense_prefetch_limit: int = 20, sparse_prefetch_limit: int = 20):
        """Create the store.

        Args:
            client: Existing client to use; takes precedence over ``url`` and
                ``local_path``.
            collection_name: Name of the chunk collection.
            url: Qdrant server URL, used when no ``client`` is given.
            local_path: Path for a local client, used when neither ``client``
                nor ``url`` is given.
            dense_prefetch_limit: Prefetch size of the dense branch in hybrid queries.
            sparse_prefetch_limit: Prefetch size of the bm25 branch in hybrid queries.
        """
        if client is not None:
            self._client = client
        elif url:
            self._client = QdrantClient(url=url)
        else:
            self._client = QdrantClient(path=local_path)
        self._collection_name = collection_name
        self._documents_collection_name = f"{collection_name}__documents"
        self._dense_prefetch_limit = dense_prefetch_limit
        self._sparse_prefetch_limit = sparse_prefetch_limit

    @staticmethod
    def _source_filter(source: str) -> Filter:
        """Build a filter matching points whose ``source`` payload equals ``source``."""
        return Filter(must=[FieldCondition(key="source", match=MatchValue(value=source))])

    def _scroll_all(self, collection_name: str, scroll_filter: Filter | None = None,
                    with_payload: bool | list[str] = True):
        """Yield every point of ``collection_name`` (optionally filtered), 100 per page."""
        offset = None
        while True:
            points, next_offset = self._client.scroll(
                collection_name=collection_name,
                scroll_filter=scroll_filter,
                limit=100,
                offset=offset,
                with_payload=with_payload,
                with_vectors=False,
            )
            yield from points
            if next_offset is None:
                return
            offset = next_offset

    def ensure_collection(self, vector_size: int) -> None:
        """Create the chunk collection, or validate the schema of an existing one.

        Raises:
            ValueError: If the collection exists but lacks the named ``dense``
                vector, has a different dense size than ``vector_size``, or
                lacks the ``bm25`` sparse vector.
        """
        collections = self._client.get_collections().collections
        if not any(c.name == self._collection_name for c in collections):
            self._client.create_collection(
                collection_name=self._collection_name,
                vectors_config={"dense": rest.VectorParams(size=vector_size, distance=rest.Distance.COSINE)},
                sparse_vectors_config={"bm25": rest.SparseVectorParams(modifier=rest.Modifier.IDF)},
            )
            return

        info = self._client.get_collection(self._collection_name)
        params = getattr(getattr(info, "config", None), "params", None)
        vectors_config = getattr(params, "vectors", None)
        sparse_config = getattr(params, "sparse_vectors", None)

        # Must have named dense+bm25 schema (not legacy single-vector).
        has_named_dense = isinstance(vectors_config, dict) and "dense" in vectors_config
        if not has_named_dense:
            raise ValueError(
                f"Collection '{self._collection_name}' exists but uses the old single-vector schema. "
                "Re-ingest with recreate=True to upgrade to the hybrid dense+BM25 schema."
            )

        # Dense dimension must match the current embedding model.
        existing_size = getattr(vectors_config.get("dense"), "size", None)
        if existing_size is not None and existing_size != vector_size:
            raise ValueError(
                f"Collection '{self._collection_name}' has dense vectors of size {existing_size} "
                f"but current embedding model produces size {vector_size}. "
                "Re-ingest with recreate=True to rebuild the collection."
            )

        # Must have the bm25 sparse index for hybrid retrieval.
        has_bm25 = isinstance(sparse_config, dict) and "bm25" in sparse_config
        if not has_bm25:
            raise ValueError(
                f"Collection '{self._collection_name}' is missing the 'bm25' sparse vector index "
                "required for hybrid retrieval. Re-ingest with recreate=True."
            )

    def ensure_documents_collection(self) -> None:
        """Create the documents collection if it does not exist yet."""
        collections = self._client.get_collections().collections
        if any(c.name == self._documents_collection_name for c in collections):
            return
        # Documents are looked up by payload only, so a 1-dimensional
        # placeholder vector is all the collection needs.
        self._client.create_collection(
            collection_name=self._documents_collection_name,
            vectors_config={"doc": rest.VectorParams(size=1, distance=rest.Distance.COSINE)},
        )

    def upsert(self, chunks: list[Chunk], dense_vectors: list[list[float]],
               sparse_vectors: list[SparseVector] | None = None, recreate: bool = False) -> int:
        """Write one point per chunk and return the number of points written.

        Args:
            chunks: Chunks to store; an empty list is a no-op returning 0.
            dense_vectors: One dense vector per chunk, in the same order.
            sparse_vectors: Optional bm25 vectors, one per chunk.
            recreate: Drop both the chunk and the documents collection first.

        Raises:
            ValueError: If fewer vectors than chunks are supplied, or the
                existing collection schema is incompatible.
        """
        if not chunks:
            return 0
        # Validate before anything is deleted: with recreate=True a later
        # failure would otherwise leave the store emptied.
        if len(dense_vectors) < len(chunks):
            raise ValueError(
                f"Expected {len(chunks)} dense vectors (one per chunk) but got {len(dense_vectors)}."
            )
        if sparse_vectors and len(sparse_vectors) < len(chunks):
            raise ValueError(
                f"Expected {len(chunks)} sparse vectors (one per chunk) but got {len(sparse_vectors)}."
            )
        if recreate:
            for name in (self._collection_name, self._documents_collection_name):
                if self._client.collection_exists(name):
                    self._client.delete_collection(name)
        self.ensure_collection(len(dense_vectors[0]))

        # A single timestamp is shared by every point of this batch.
        ingested_at = datetime.now(timezone.utc).isoformat()
        points = []
        for index, chunk in enumerate(chunks):
            vector: dict = {"dense": dense_vectors[index]}
            if sparse_vectors:
                vector["bm25"] = sparse_vectors[index]
            points.append(rest.PointStruct(
                id=str(uuid.uuid4()), vector=vector,
                payload={"source": chunk.source, "chunk_index": chunk.chunk_index,
                         "text": chunk.text, "ingested_at": ingested_at},
            ))
        self._client.upsert(collection_name=self._collection_name, points=points)
        return len(points)

    def upsert_document(self, source: str, content: str, recreate: bool = False) -> None:
        """Store the full text of ``source``, replacing any previous entry for it."""
        if recreate and self._client.collection_exists(self._documents_collection_name):
            self._client.delete_collection(self._documents_collection_name)
        self.ensure_documents_collection()
        # Remove the previous version so each source has exactly one document point.
        self._client.delete(
            collection_name=self._documents_collection_name,
            points_selector=self._source_filter(source),
        )
        point = rest.PointStruct(
            id=str(uuid.uuid4()),
            vector={"doc": [1.0]},
            payload={"source": source, "content": content, "ingested_at": datetime.now(timezone.utc).isoformat()},
        )
        self._client.upsert(collection_name=self._documents_collection_name, points=[point])

    def query(self, dense_vector: list[float], sparse_vector: SparseVector | None = None,
              limit: int = 5, score_threshold: float = 0.0) -> list[RetrievedChunk]:
        """Retrieve chunks for a query.

        With ``sparse_vector`` the bm25 and dense branches are prefetched and
        fused with RRF; without it only the dense vector is searched.  Points
        whose payload has no text are dropped from the result.
        """
        if sparse_vector is not None:
            search_result = self._client.query_points(
                collection_name=self._collection_name,
                prefetch=[
                    Prefetch(query=sparse_vector, using="bm25", limit=self._sparse_prefetch_limit),
                    Prefetch(query=dense_vector, using="dense", limit=self._dense_prefetch_limit),
                ],
                query=FusionQuery(fusion=Fusion.RRF), limit=limit,
                score_threshold=score_threshold, with_payload=True,
            )
        else:
            search_result = self._client.query_points(
                collection_name=self._collection_name, query=dense_vector, using="dense",
                limit=limit, score_threshold=score_threshold, with_payload=True,
            )
        # Accept either a response object with .points or a plain sequence.
        points = getattr(search_result, "points", search_result)
        results: list[RetrievedChunk] = []
        for point in points:
            payload = point.payload or {}
            text = str(payload.get("text", ""))
            if text:
                results.append(RetrievedChunk(
                    source=str(payload.get("source", "unknown")),
                    chunk_index=int(payload.get("chunk_index", 0)),
                    text=text, score=float(getattr(point, "score", 0.0)),
                ))
        return results

    def collection_stats(self) -> dict[str, bool | int | None]:
        """Return ``collection_exists`` and ``points_count`` for the chunk collection."""
        missing: dict[str, bool | int | None] = {"collection_exists": False, "points_count": 0}
        # Check existence explicitly, as the other read methods do: catching
        # only the HTTP 404 response does not cover every client mode.
        if not self._client.collection_exists(self._collection_name):
            return missing
        try:
            info = self._client.get_collection(self._collection_name)
        except UnexpectedResponse as exc:
            # The collection may vanish between the check and the lookup.
            if getattr(exc, "status_code", None) == 404:
                return missing
            raise
        points_count = getattr(info, "points_count", None)
        if points_count is None:
            result = getattr(info, "result", None)
            points_count = getattr(result, "points_count", None)
        return {"collection_exists": True, "points_count": int(points_count) if points_count is not None else None}

    def list_sources(self) -> list[str]:
        """Return the sorted, de-duplicated ``source`` values of all stored chunks."""
        if not self._client.collection_exists(self._collection_name):
            return []
        sources: set[str] = set()
        for point in self._scroll_all(self._collection_name, with_payload=["source"]):
            src = (point.payload or {}).get("source")
            if src is not None:
                sources.add(str(src))
        return sorted(sources)

    def get_by_source(self, source: str) -> list[RetrievedChunk]:
        """Return every chunk of ``source`` ordered by chunk index (score fixed at 1.0)."""
        if not self._client.collection_exists(self._collection_name):
            return []
        results = []
        for point in self._scroll_all(self._collection_name, scroll_filter=self._source_filter(source)):
            payload = point.payload or {}
            text = str(payload.get("text", ""))
            if text:
                results.append(RetrievedChunk(
                    source=str(payload.get("source", source)),
                    chunk_index=int(payload.get("chunk_index", 0)),
                    text=text,
                    score=1.0,
                ))
        return sorted(results, key=lambda c: c.chunk_index)

    def get_document_content(self, source: str) -> str | None:
        """Return the stored full text of ``source``, or None if there is none."""
        if not self._client.collection_exists(self._documents_collection_name):
            return None
        points, _next_offset = self._client.scroll(
            collection_name=self._documents_collection_name,
            scroll_filter=self._source_filter(source),
            limit=1,
            with_payload=True,
            with_vectors=False,
        )
        if not points:
            return None
        content = (points[0].payload or {}).get("content")
        return None if content is None else str(content)

    def _delete_by_source(self, collection_name: str, selector: Filter) -> bool:
        """Delete the points matching ``selector``; True only if some point matched."""
        if not self._client.collection_exists(collection_name):
            return False
        # The delete call reports a status even when nothing matched, so the
        # matching points are counted first to learn whether any exist.
        matched = self._client.count(
            collection_name=collection_name, count_filter=selector, exact=True,
        ).count
        if not matched:
            return False
        self._client.delete(collection_name=collection_name, points_selector=selector)
        return True

    def delete_source(self, source: str) -> bool:
        """Delete all chunks and the document for a given source. Returns True if anything was deleted."""
        selector = self._source_filter(source)
        removed_chunks = self._delete_by_source(self._collection_name, selector)
        removed_document = self._delete_by_source(self._documents_collection_name, selector)
        return removed_chunks or removed_document

    def list_sources_with_dates(self) -> list[dict]:
        """List all ingested document sources with their ingestion timestamps."""
        if not self._client.collection_exists(self._documents_collection_name):
            return []
        entries = []
        for point in self._scroll_all(self._documents_collection_name, with_payload=["source", "ingested_at"]):
            payload = point.payload or {}
            src = payload.get("source")
            if src is not None:
                entries.append({
                    "source": str(src),
                    "ingested_at": str(payload.get("ingested_at", "unknown")),
                })
        return sorted(entries, key=lambda e: e["source"])

    def close(self) -> None:
        """Close the underlying Qdrant client."""
        self._client.close()
|
|
File without changes
|