docsgraph 0.1.0a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cairn/__init__.py +5 -0
- cairn/bench/__init__.py +37 -0
- cairn/bench/baseline.py +236 -0
- cairn/bench/dataset.py +109 -0
- cairn/bench/judge.py +126 -0
- cairn/bench/metrics.py +32 -0
- cairn/bench/report.py +143 -0
- cairn/bench/runner.py +219 -0
- cairn/cli/__init__.py +5 -0
- cairn/cli/app.py +776 -0
- cairn/cli/config.py +105 -0
- cairn/core/__init__.py +41 -0
- cairn/core/errors.py +68 -0
- cairn/core/types.py +147 -0
- cairn/embed/__init__.py +17 -0
- cairn/embed/base.py +31 -0
- cairn/embed/doubao.py +167 -0
- cairn/embed/fake.py +36 -0
- cairn/embed/openai_compatible.py +155 -0
- cairn/engine/__init__.py +18 -0
- cairn/engine/indexer.py +298 -0
- cairn/engine/manifest.py +83 -0
- cairn/entity/__init__.py +21 -0
- cairn/entity/base.py +52 -0
- cairn/entity/fake.py +34 -0
- cairn/entity/heuristic.py +148 -0
- cairn/index/__init__.py +39 -0
- cairn/index/entities.py +244 -0
- cairn/index/summaries.py +269 -0
- cairn/index/tree.py +274 -0
- cairn/index/vectors.py +287 -0
- cairn/index/xrefs.py +195 -0
- cairn/ingest/__init__.py +36 -0
- cairn/ingest/base.py +46 -0
- cairn/ingest/markdown.py +244 -0
- cairn/ingest/markitdown.py +145 -0
- cairn/ingest/pdf.py +357 -0
- cairn/inspection.py +971 -0
- cairn/mcp/__init__.py +12 -0
- cairn/mcp/schemas.py +547 -0
- cairn/mcp/server.py +363 -0
- cairn/providers.py +50 -0
- cairn/py.typed +0 -0
- cairn/repo.py +1486 -0
- cairn/repo_search.py +1505 -0
- cairn/summarize/__init__.py +18 -0
- cairn/summarize/base.py +56 -0
- cairn/summarize/cache.py +66 -0
- cairn/summarize/fake.py +43 -0
- cairn/summarize/openai_compatible.py +148 -0
- cairn/summarize/prompts.py +73 -0
- cairn/tools/__init__.py +31 -0
- cairn/tools/base.py +126 -0
- cairn/tools/find_mentions.py +93 -0
- cairn/tools/get_related.py +140 -0
- cairn/tools/get_section.py +130 -0
- cairn/tools/outline.py +75 -0
- cairn/tools/read_range.py +94 -0
- cairn/tools/search_keyword.py +94 -0
- cairn/tools/search_semantic.py +181 -0
- cairn/xref/__init__.py +24 -0
- cairn/xref/base.py +50 -0
- cairn/xref/fake.py +40 -0
- cairn/xref/heuristic.py +217 -0
- docsgraph-0.1.0a2.dist-info/METADATA +688 -0
- docsgraph-0.1.0a2.dist-info/RECORD +69 -0
- docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
- docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
- docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/cli/config.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""CLI configuration — environment variables with Ollama defaults.
|
|
2
|
+
|
|
3
|
+
Per CLAUDE.md P4 ("local-first must always work"), the defaults target a
|
|
4
|
+
local Ollama instance and require no API key.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from typing import Literal, cast
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, ConfigDict
|
|
13
|
+
|
|
14
|
+
EmbedProvider = Literal["openai-compatible", "doubao-vision"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LLMConfig(BaseModel):
    """Connection settings for the summarizer's LLM endpoint.

    Instances are frozen and reject unknown fields. The defaults target a
    local Ollama instance and need no API key.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Root URL of the endpoint.
    base_url: str = "http://localhost:11434/v1"
    # Model identifier requested from the endpoint.
    model: str = "llama3.2:3b"
    # Optional credential; None means no key is configured.
    api_key: str | None = None
    # Request timeout (presumably seconds -- confirm against the client).
    timeout: float = 60.0
    # Retry budget handed to the client.
    max_retries: int = 2
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EmbedConfig(BaseModel):
    """Connection settings for the embedding endpoint.

    Instances are frozen and reject unknown fields. The defaults target a
    local Ollama instance and need no API key.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Which embedder implementation to use.
    provider: EmbedProvider = "openai-compatible"
    # Root URL of the endpoint.
    base_url: str = "http://localhost:11434/v1"
    # Model identifier requested from the endpoint.
    model: str = "nomic-embed-text"
    # Expected length of every returned vector.
    dim: int = 768
    # Optional credential; None means no key is configured.
    api_key: str | None = None
    # Request timeout (presumably seconds -- confirm against the client).
    timeout: float = 60.0
    # Retry budget handed to the client.
    max_retries: int = 2
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class IndexConfig(BaseModel):
    """Performance knobs for building an index.

    Instances are frozen and reject unknown fields.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Concurrency setting for summary generation
    # (presumably the number of in-flight requests -- confirm in the indexer).
    summary_concurrency: int = 4
    # Batch size used when embedding
    # (presumably texts per embed call -- confirm in the indexer).
    embed_batch_size: int = 32
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def load_llm_config() -> LLMConfig:
    """Build the summarizer config from ``CAIRN_LLM_*`` environment variables.

    Unset variables fall back to the local defaults. An empty
    ``CAIRN_LLM_API_KEY`` is treated the same as an unset one.
    """
    env = os.environ
    base_url = env.get("CAIRN_LLM_BASE_URL", "http://localhost:11434/v1")
    model = env.get("CAIRN_LLM_MODEL", "llama3.2:3b")
    # ``or None`` collapses an empty string to "no key".
    api_key = env.get("CAIRN_LLM_API_KEY") or None
    timeout = _float_env("CAIRN_LLM_TIMEOUT", 60.0)
    max_retries = _int_env("CAIRN_LLM_MAX_RETRIES", 2)
    return LLMConfig(
        base_url=base_url,
        model=model,
        api_key=api_key,
        timeout=timeout,
        max_retries=max_retries,
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def load_embed_config() -> EmbedConfig:
    """Read embedder config from ``CAIRN_EMBED_*`` environment variables.

    ``CAIRN_EMBED_PROVIDER`` selects the provider and, with it, the defaults
    for base URL, model, and vector dimension. Each of those can still be
    overridden individually. An empty ``CAIRN_EMBED_API_KEY`` is treated as
    no key.

    Raises:
        ValueError: if ``CAIRN_EMBED_DIM`` or ``CAIRN_EMBED_MAX_RETRIES`` is
            set to a non-integer, or ``CAIRN_EMBED_TIMEOUT`` to a non-number.
    """
    env = os.environ
    provider = env.get("CAIRN_EMBED_PROVIDER", "openai-compatible")
    if provider == "doubao-vision":
        default_base_url = "https://ark.cn-beijing.volces.com/api/v3"
        default_model = "doubao-embedding-vision-251215"
        default_dim = 2048
    else:
        default_base_url = "http://localhost:11434/v1"
        default_model = "nomic-embed-text"
        default_dim = 768

    return EmbedConfig(
        # ``cast`` only informs the type checker; it performs no runtime check.
        provider=cast(EmbedProvider, provider),
        base_url=env.get("CAIRN_EMBED_BASE_URL", default_base_url),
        model=env.get("CAIRN_EMBED_MODEL", default_model),
        # Parsed with the same helper as the other integer knobs so that a
        # blank value falls back to the provider default instead of crashing.
        dim=_int_env("CAIRN_EMBED_DIM", default_dim),
        api_key=env.get("CAIRN_EMBED_API_KEY") or None,
        timeout=_float_env("CAIRN_EMBED_TIMEOUT", 60.0),
        max_retries=_int_env("CAIRN_EMBED_MAX_RETRIES", 2),
    )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def load_index_config() -> IndexConfig:
    """Build the index-build performance config from the environment.

    Reads ``CAIRN_SUMMARY_CONCURRENCY`` (default 4) and
    ``CAIRN_EMBED_BATCH_SIZE`` (default 32).
    """
    concurrency = _int_env("CAIRN_SUMMARY_CONCURRENCY", 4)
    batch_size = _int_env("CAIRN_EMBED_BATCH_SIZE", 32)
    return IndexConfig(summary_concurrency=concurrency, embed_batch_size=batch_size)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _int_env(name: str, default: int) -> int:
    """Return environment variable ``name`` parsed as an integer.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Raises:
        ValueError: if the variable is set to something ``int()`` rejects.
            The message names the variable so the misconfiguration can be
            located.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as exc:
        msg = f"environment variable {name} must be an integer; got {raw!r}"
        raise ValueError(msg) from exc
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _float_env(name: str, default: float) -> float:
    """Return environment variable ``name`` parsed as a float.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Raises:
        ValueError: if the variable is set to something ``float()`` rejects.
            The message names the variable so the misconfiguration can be
            located.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return float(raw)
    except ValueError as exc:
        msg = f"environment variable {name} must be a number; got {raw!r}"
        raise ValueError(msg) from exc
|
cairn/core/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Core types, errors, and configuration for Cairn."""
|
|
2
|
+
|
|
3
|
+
from cairn.core.errors import (
|
|
4
|
+
CairnError,
|
|
5
|
+
ConfigError,
|
|
6
|
+
IndexBuildError,
|
|
7
|
+
IndexNotFoundError,
|
|
8
|
+
IndexStaleError,
|
|
9
|
+
ParseError,
|
|
10
|
+
ToolError,
|
|
11
|
+
)
|
|
12
|
+
from cairn.core.types import (
|
|
13
|
+
Document,
|
|
14
|
+
Entity,
|
|
15
|
+
EntityKind,
|
|
16
|
+
Mention,
|
|
17
|
+
SectionNode,
|
|
18
|
+
Span,
|
|
19
|
+
SummarySet,
|
|
20
|
+
XRef,
|
|
21
|
+
XRefKind,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"CairnError",
|
|
26
|
+
"ConfigError",
|
|
27
|
+
"Document",
|
|
28
|
+
"Entity",
|
|
29
|
+
"EntityKind",
|
|
30
|
+
"IndexBuildError",
|
|
31
|
+
"IndexNotFoundError",
|
|
32
|
+
"IndexStaleError",
|
|
33
|
+
"Mention",
|
|
34
|
+
"ParseError",
|
|
35
|
+
"SectionNode",
|
|
36
|
+
"Span",
|
|
37
|
+
"SummarySet",
|
|
38
|
+
"ToolError",
|
|
39
|
+
"XRef",
|
|
40
|
+
"XRefKind",
|
|
41
|
+
]
|
cairn/core/errors.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Cairn error hierarchy.
|
|
2
|
+
|
|
3
|
+
Every error raised by Cairn library code derives from `CairnError`. Tool-layer
|
|
4
|
+
code translates these into structured MCP error envelopes and never lets them
|
|
5
|
+
escape to the transport.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CairnError(Exception):
    """Root of the Cairn exception hierarchy.

    Carries a human-readable ``message``, a free-form ``details`` mapping,
    and a class-level ``code`` that subclasses override.
    """

    code: str = "INTERNAL"

    def __init__(self, message: str, *, details: dict[str, Any] | None = None) -> None:
        """Store ``message`` and ``details`` (an empty dict when none given)."""
        super().__init__(message)
        self.message = message
        self.details: dict[str, Any] = details if details else {}

    def to_envelope(self) -> dict[str, Any]:
        """Return the structured MCP error payload for this error.

        The payload has the keys ``code``, ``message`` and ``details``.
        See docs/specs/mcp-tools.md §0 for the envelope shape.
        """
        return dict(code=self.code, message=self.message, details=self.details)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ParseError(CairnError):
    """Raised when a source document cannot be parsed into a canonical Document AST."""

    code = "PARSE_FAILED"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class IndexBuildError(CairnError):
    """Raised when an index builder fails while constructing or updating an artifact."""

    code = "INDEX_BUILD_FAILED"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class IndexNotFoundError(CairnError):
    """Raised when a referenced index or section does not exist."""

    code = "NOT_FOUND"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class IndexStaleError(CairnError):
    """Raised when the on-disk index is older than its source and must be rebuilt."""

    code = "INDEX_STALE"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class ConfigError(CairnError):
    """Raised when configuration is invalid or missing."""

    code = "INVALID_CONFIG"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class ToolError(CairnError):
    """Raised when an MCP tool receives invalid input or cannot produce a result."""

    code = "INVALID_INPUT"
|
cairn/core/types.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Canonical Cairn data model.
|
|
2
|
+
|
|
3
|
+
These are the types that flow across layer boundaries. They are the contract
|
|
4
|
+
between ingestion, indexing, retrieval, and the MCP server. Treat them as the
|
|
5
|
+
schema; never substitute ad-hoc dicts.
|
|
6
|
+
|
|
7
|
+
Mirrors ARCHITECTURE.md §4. Changes here are breaking and require an ADR.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Literal
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator
|
|
17
|
+
|
|
18
|
+
EntityKind = Literal["term", "code", "proper", "defined"]
|
|
19
|
+
XRefKind = Literal["link", "textual", "entity"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _Frozen(BaseModel):
    """Shared base for the data model: immutable, strict pydantic models.

    Models are frozen, reject unknown fields, leave string whitespace
    untouched, and validate on assignment.
    """

    model_config = ConfigDict(
        frozen=True, extra="forbid", str_strip_whitespace=False, validate_assignment=True
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Span(_Frozen):
    """Half-open byte range ``[start, end)`` within the source document.

    Both bounds are non-negative and ``end`` may not precede ``start``.
    ``len(span)`` is ``end - start``.
    """

    start: int = Field(ge=0)
    end: int = Field(ge=0)

    @field_validator("end")
    @classmethod
    def _end_after_start(cls, end: int, info: ValidationInfo) -> int:
        """Reject an ``end`` smaller than the already-validated ``start``."""
        start = info.data.get("start")
        # ``start`` is absent from ``info.data`` when its own validation
        # failed; in that case there is nothing to compare against.
        if not isinstance(start, int) or end >= start:
            return end
        msg = f"Span.end ({end}) must be >= Span.start ({start})"
        raise ValueError(msg)

    def __len__(self) -> int:
        """Return the number of bytes covered by the span."""
        return self.end - self.start
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class SectionNode(_Frozen):
    """One node of the document's structural tree.

    ``id`` is a hierarchical, slug-based identifier that stays stable when
    the same document is re-indexed, e.g. ``hooks/use-effect/cleanup``.

    ``raw_text`` holds only the body that belongs **directly** to this
    section; bodies of descendant sections are excluded. For continuous text
    that includes descendants, read the source through ``span`` or use the
    ``read_range`` retrieval tool.
    """

    id: str = Field(min_length=1)
    title: str
    level: int = Field(ge=1, le=6)
    parent: str | None
    children: tuple[str, ...] = ()
    span: Span
    path: tuple[str, ...]
    raw_text: str

    @field_validator("id")
    @classmethod
    def _id_well_formed(cls, section_id: str) -> str:
        """Reject ids with a leading/trailing ``/`` or an empty ``//`` segment."""
        has_edge_slash = section_id.startswith("/") or section_id.endswith("/")
        if has_edge_slash:
            msg = f"section id must not start or end with '/': {section_id!r}"
            raise ValueError(msg)
        if "//" in section_id:
            msg = f"section id must not contain '//': {section_id!r}"
            raise ValueError(msg)
        return section_id
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class SummarySet(_Frozen):
    """Summaries of a single section at several granularities.

    Produced by a ``Summarizer`` while indexing; never at query time.

    ``digest`` is optional: v0.1 generates only the gist and synopsis, and
    the deeper level lands in v0.2. A ``None`` digest therefore means "not
    generated when this summary was built", not "no digest is possible for
    this section".
    """

    section_id: str
    gist: str = Field(description="≤ 20 words; the 'scent' in IFT terms")
    synopsis: str = Field(description="one paragraph; ≤ 80 words")
    digest: str | None = Field(
        default=None,
        description="multi-paragraph; ≤ 300 words. None until v0.2.",
    )
    model: str = Field(description="identifier of the LLM that produced these")
    section_hash: str = Field(
        description=(
            "sha256 hex of (title + raw_text) at generation time; "
            "used to detect stale summaries when raw_text changes."
        )
    )
    generated_at: datetime
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class Mention(_Frozen):
    """One occurrence of an entity within a section.

    ``section_id`` names the containing section and ``span`` locates the
    occurrence.
    """

    section_id: str
    span: Span
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class Entity(_Frozen):
    """A canonicalized term or concept plus every mention of it in the document.

    ``canonical`` must be non-empty; ``surface_forms`` and ``mentions`` are
    stored as tuples.
    """

    canonical: str = Field(min_length=1)
    surface_forms: tuple[str, ...]
    kind: EntityKind
    mentions: tuple[Mention, ...]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class XRef(_Frozen):
    """A directed edge of the cross-reference graph, from ``src`` to ``dst``.

    ``confidence`` is constrained to the closed interval ``[0.0, 1.0]``.
    """

    src: str = Field(description="section_id of the source")
    dst: str = Field(description="section_id of the destination")
    kind: XRefKind
    confidence: float = Field(ge=0.0, le=1.0)
    span: Span
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class Document(_Frozen):
    """A fully ingested document.

    Layer 1 (Ingestion) produces it; it is the sole input to Layer 2 (Index).
    """

    id: str = Field(min_length=1, description="human-readable doc_id, slug-based")
    source_path: Path
    source_hash: str = Field(description="sha256 of the source bytes, hex-encoded")
    sections: tuple[SectionNode, ...]
    indexed_at: datetime
    cairn_version: str
|
cairn/embed/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Embedding layer — pluggable text → vector encoders.
|
|
2
|
+
|
|
3
|
+
Used by the index layer (`cairn.index.vectors.VectorBuilder`) at indexing time.
|
|
4
|
+
Never invoked at query time except for embedding the user's query string.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from cairn.embed.base import Embedder
|
|
8
|
+
from cairn.embed.doubao import DoubaoVisionEmbedder
|
|
9
|
+
from cairn.embed.fake import FakeEmbedder
|
|
10
|
+
from cairn.embed.openai_compatible import OpenAICompatibleEmbedder
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"DoubaoVisionEmbedder",
|
|
14
|
+
"Embedder",
|
|
15
|
+
"FakeEmbedder",
|
|
16
|
+
"OpenAICompatibleEmbedder",
|
|
17
|
+
]
|
cairn/embed/base.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Embedder protocol.
|
|
2
|
+
|
|
3
|
+
An ``Embedder`` turns a list of texts into a list of dense vectors. Batching
|
|
4
|
+
is the responsibility of the implementation — callers pass a list and may
|
|
5
|
+
assume the implementation chooses an efficient batching strategy.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Protocol, runtime_checkable
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural interface for a pluggable text-embedding model.

    ``name`` combines the implementation family with the model identifier
    (e.g. ``"openai-compat:nomic-embed-text"``), so consumers can use it as
    a cache-invalidation key and as a manifest marker.

    Every vector returned by ``embed`` MUST have exactly ``dim`` entries;
    consumers may rely on that when building typed vector stores. An empty
    input list must produce an empty list rather than raise.
    """

    name: str
    dim: int

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one ``dim``-dimensional vector for each entry of ``texts``."""
        ...
|
cairn/embed/doubao.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Volcengine/Doubao embedding adapters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
from cairn.core.errors import IndexBuildError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DoubaoVisionEmbedder:
    """Client for Doubao's multimodal embedding endpoint.

    ``doubao-embedding-vision-*`` models do not use the OpenAI-compatible
    ``/embeddings`` wire shape. They are served at
    ``/embeddings/multimodal`` and return ``{"data": {"embedding": ...}}``
    for a single multimodal input. Cairn's embedder protocol expects one
    vector per text, so this adapter issues one request per input text.

    All transport, HTTP-status, malformed-body, and dimension-mismatch
    failures are reported as ``IndexBuildError``.
    """

    def __init__(
        self,
        *,
        base_url: str = "https://ark.cn-beijing.volces.com/api/v3",
        model: str = "doubao-embedding-vision-251215",
        dim: int = 2048,
        api_key: str | None = None,
        timeout: float = 60.0,
        max_retries: int = 2,
        retry_base_delay: float = 0.5,
    ) -> None:
        """Validate and store the client settings.

        Args:
            base_url: API root; a trailing ``/`` is stripped.
            model: Model identifier sent with every request.
            dim: Expected vector length; responses of any other length are
                rejected.
            api_key: Bearer token; no ``Authorization`` header is sent when
                it is empty or ``None``.
            timeout: Timeout passed to ``httpx.AsyncClient``.
            max_retries: Additional attempts after the first failed one.
            retry_base_delay: Base of the exponential backoff, in seconds;
                ``0`` disables sleeping between attempts.

        Raises:
            ValueError: if ``dim < 1``, ``max_retries < 0`` or
                ``retry_base_delay < 0``.
        """
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        if max_retries < 0:
            msg = f"max_retries must be >= 0; got {max_retries}"
            raise ValueError(msg)
        if retry_base_delay < 0:
            msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
            raise ValueError(msg)
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.dim = dim
        self.api_key = api_key
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_base_delay = retry_base_delay
        self.name = f"doubao-vision:{model}"

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed each text through Doubao's multimodal vectorization API.

        Requests are issued one at a time, in input order, over a single
        HTTP client. Empty input returns an empty list without any request.

        Raises:
            IndexBuildError: if a request fails after all retries, the
                endpoint answers with an error status, the body is not valid
                JSON or has an unexpected shape, or a vector's length
                differs from ``dim``.
        """
        if not texts:
            return []

        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        vectors: list[list[float]] = []
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            for index, text in enumerate(texts):
                payload: dict[str, Any] = {
                    "model": self.model,
                    "input": [{"type": "text", "text": text}],
                }
                response = await self._post_with_retries(
                    client, payload, headers, index=index
                )

                try:
                    body = response.json()
                except ValueError as exc:
                    # A successful status with a non-JSON body (e.g. a proxy
                    # error page) must not leak a raw decode error; JSON
                    # decode errors are ValueError subclasses.
                    msg = "doubao vision embedder response was not valid JSON"
                    raise IndexBuildError(
                        msg,
                        details={
                            "status": response.status_code,
                            "model": self.model,
                            "base_url": self.base_url,
                            "index": index,
                        },
                    ) from exc

                vector = _extract_vector(body, index=index)
                if len(vector) != self.dim:
                    msg = (
                        f"doubao vision embedder returned dim={len(vector)} "
                        f"but client expects dim={self.dim} "
                        f"(model {self.model!r}, index {index})"
                    )
                    raise IndexBuildError(msg)
                vectors.append(vector)

        return vectors

    async def _post_with_retries(
        self,
        client: httpx.AsyncClient,
        payload: dict[str, Any],
        headers: dict[str, str],
        *,
        index: int,
    ) -> httpx.Response:
        """POST ``payload`` to the multimodal endpoint, retrying transient failures.

        ``httpx.HTTPError`` and the statuses 429/500/502/503/504 are retried
        up to ``max_retries`` times with exponential backoff. ``index`` is
        only used to label error details.

        Raises:
            IndexBuildError: when retries are exhausted or the endpoint
                returns any status >= 400 that is not retried.
        """
        last_exc: httpx.HTTPError | None = None
        url = f"{self.base_url}/embeddings/multimodal"
        for attempt in range(self.max_retries + 1):
            try:
                response = await client.post(url, json=payload, headers=headers)
            except httpx.HTTPError as exc:
                last_exc = exc
                if attempt < self.max_retries:
                    await self._sleep_before_retry(attempt)
                    continue
                msg = f"doubao vision embedder request failed: {exc}"
                raise IndexBuildError(
                    msg,
                    details={
                        "model": self.model,
                        "base_url": self.base_url,
                        "index": index,
                        "error_type": type(exc).__name__,
                        "attempts": attempt + 1,
                    },
                ) from exc

            if response.status_code in (429, 500, 502, 503, 504) and attempt < self.max_retries:
                await self._sleep_before_retry(attempt)
                continue
            if response.status_code >= 400:
                msg = (
                    f"doubao vision embedder endpoint returned HTTP "
                    f"{response.status_code}: {response.text[:200]}"
                )
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "index": index,
                        "attempts": attempt + 1,
                    },
                )
            return response

        # Defensive fallthrough: with max_retries >= 0 the final loop
        # iteration always returns or raises, so this should be unreachable.
        msg = "doubao vision embedder request failed without a response"
        raise IndexBuildError(
            msg,
            details={
                "model": self.model,
                "base_url": self.base_url,
                "index": index,
                "error_type": type(last_exc).__name__ if last_exc else None,
            },
        )

    async def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep ``retry_base_delay * 2**attempt`` seconds; no-op when the base is 0."""
        if self.retry_base_delay == 0:
            return
        await asyncio.sleep(self.retry_base_delay * (2**attempt))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _extract_vector(data: dict[str, Any], *, index: int) -> list[float]:
    """Pull the dense vector out of a Doubao multimodal response body.

    Expects ``data["data"]["embedding"]`` to be a list of values that
    ``float()`` accepts. ``index`` only labels error details.

    Raises:
        IndexBuildError: if the path is missing, the embedding is not a
            list, or any element cannot be converted to ``float``.
    """
    try:
        raw = data["data"]["embedding"]
    except (KeyError, TypeError) as exc:
        msg = "doubao vision embedder response did not match expected shape"
        raise IndexBuildError(msg, details={"response": data, "index": index}) from exc

    if not isinstance(raw, list):
        msg = "doubao vision embedder embedding is not a list"
        raise IndexBuildError(msg, details={"response": data, "index": index})

    try:
        return list(map(float, raw))
    except (TypeError, ValueError) as exc:
        msg = "doubao vision embedder embedding contains non-numeric values"
        raise IndexBuildError(msg, details={"response": data, "index": index}) from exc
|
cairn/embed/fake.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Deterministic, network-free embedder for tests and offline development.
|
|
2
|
+
|
|
3
|
+
Implementation is a sparse bag-of-words hash projection: each word in the
|
|
4
|
+
input lowers onto exactly one dimension (chosen by sha256 hash mod dim).
|
|
5
|
+
Vectors are similarity-respecting — two texts that share words land near
|
|
6
|
+
each other in cosine space — but the embedder has no semantic understanding.
|
|
7
|
+
Suitable for unit tests and pipeline plumbing checks; never for production.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FakeEmbedder:
    """Deterministic bag-of-words hash embedder that never touches the network.

    Each lower-cased, whitespace-separated word increments one dimension,
    chosen from the first 8 bytes of its sha256 digest modulo ``dim``.
    """

    name = "fake:bow-hash"

    def __init__(self, dim: int = 64) -> None:
        """Store ``dim``; raise ``ValueError`` when it is below 1."""
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        self.dim = dim

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one ``dim``-length count vector per input text."""
        return list(map(self._embed_one, texts))

    def _embed_one(self, text: str) -> list[float]:
        """Hash every word of ``text`` into a bucket and count the hits."""
        tokens = text.lower().split()
        if not tokens:
            # Text without words still yields a non-zero vector.
            tokens = ["__empty__"]
        buckets = [0.0] * self.dim
        for token in tokens:
            token_hash = hashlib.sha256(token.encode("utf-8")).digest()
            slot = int.from_bytes(token_hash[:8], "big") % self.dim
            buckets[slot] += 1.0
        return buckets
|