docsgraph 0.1.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. cairn/__init__.py +5 -0
  2. cairn/bench/__init__.py +37 -0
  3. cairn/bench/baseline.py +236 -0
  4. cairn/bench/dataset.py +109 -0
  5. cairn/bench/judge.py +126 -0
  6. cairn/bench/metrics.py +32 -0
  7. cairn/bench/report.py +143 -0
  8. cairn/bench/runner.py +219 -0
  9. cairn/cli/__init__.py +5 -0
  10. cairn/cli/app.py +776 -0
  11. cairn/cli/config.py +105 -0
  12. cairn/core/__init__.py +41 -0
  13. cairn/core/errors.py +68 -0
  14. cairn/core/types.py +147 -0
  15. cairn/embed/__init__.py +17 -0
  16. cairn/embed/base.py +31 -0
  17. cairn/embed/doubao.py +167 -0
  18. cairn/embed/fake.py +36 -0
  19. cairn/embed/openai_compatible.py +155 -0
  20. cairn/engine/__init__.py +18 -0
  21. cairn/engine/indexer.py +298 -0
  22. cairn/engine/manifest.py +83 -0
  23. cairn/entity/__init__.py +21 -0
  24. cairn/entity/base.py +52 -0
  25. cairn/entity/fake.py +34 -0
  26. cairn/entity/heuristic.py +148 -0
  27. cairn/index/__init__.py +39 -0
  28. cairn/index/entities.py +244 -0
  29. cairn/index/summaries.py +269 -0
  30. cairn/index/tree.py +274 -0
  31. cairn/index/vectors.py +287 -0
  32. cairn/index/xrefs.py +195 -0
  33. cairn/ingest/__init__.py +36 -0
  34. cairn/ingest/base.py +46 -0
  35. cairn/ingest/markdown.py +244 -0
  36. cairn/ingest/markitdown.py +145 -0
  37. cairn/ingest/pdf.py +357 -0
  38. cairn/inspection.py +971 -0
  39. cairn/mcp/__init__.py +12 -0
  40. cairn/mcp/schemas.py +547 -0
  41. cairn/mcp/server.py +363 -0
  42. cairn/providers.py +50 -0
  43. cairn/py.typed +0 -0
  44. cairn/repo.py +1486 -0
  45. cairn/repo_search.py +1505 -0
  46. cairn/summarize/__init__.py +18 -0
  47. cairn/summarize/base.py +56 -0
  48. cairn/summarize/cache.py +66 -0
  49. cairn/summarize/fake.py +43 -0
  50. cairn/summarize/openai_compatible.py +148 -0
  51. cairn/summarize/prompts.py +73 -0
  52. cairn/tools/__init__.py +31 -0
  53. cairn/tools/base.py +126 -0
  54. cairn/tools/find_mentions.py +93 -0
  55. cairn/tools/get_related.py +140 -0
  56. cairn/tools/get_section.py +130 -0
  57. cairn/tools/outline.py +75 -0
  58. cairn/tools/read_range.py +94 -0
  59. cairn/tools/search_keyword.py +94 -0
  60. cairn/tools/search_semantic.py +181 -0
  61. cairn/xref/__init__.py +24 -0
  62. cairn/xref/base.py +50 -0
  63. cairn/xref/fake.py +40 -0
  64. cairn/xref/heuristic.py +217 -0
  65. docsgraph-0.1.0a2.dist-info/METADATA +688 -0
  66. docsgraph-0.1.0a2.dist-info/RECORD +69 -0
  67. docsgraph-0.1.0a2.dist-info/WHEEL +4 -0
  68. docsgraph-0.1.0a2.dist-info/entry_points.txt +3 -0
  69. docsgraph-0.1.0a2.dist-info/licenses/LICENSE +201 -0
cairn/cli/config.py ADDED
@@ -0,0 +1,105 @@
1
+ """CLI configuration — environment variables with Ollama defaults.
2
+
3
+ Per CLAUDE.md P4 ("local-first must always work"), the defaults target a
4
+ local Ollama instance and require no API key.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from typing import Literal, cast
11
+
12
+ from pydantic import BaseModel, ConfigDict
13
+
14
+ EmbedProvider = Literal["openai-compatible", "doubao-vision"]
15
+
16
+
17
class LLMConfig(BaseModel):
    """Connection settings for the summarizer's LLM endpoint.

    Instances are immutable (``frozen=True``) and reject unknown fields
    (``extra="forbid"``). The defaults point at a server on
    ``localhost:11434`` and carry no API key.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Root URL of the endpoint the summarizer talks to.
    base_url: str = "http://localhost:11434/v1"
    # Identifier of the model to request.
    model: str = "llama3.2:3b"
    # Optional credential; ``None`` means no key is configured.
    api_key: str | None = None
    # Per-request timeout (presumably seconds; confirm against the client).
    timeout: float = 60.0
    # Retry budget handed to the client.
    max_retries: int = 2
27
+
28
+
29
class EmbedConfig(BaseModel):
    """Connection settings for the embedding endpoint.

    Instances are immutable (``frozen=True``) and reject unknown fields
    (``extra="forbid"``). The defaults describe an ``openai-compatible``
    provider on ``localhost:11434`` with no API key.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Which adapter family to use; restricted to the ``EmbedProvider`` literals.
    provider: EmbedProvider = "openai-compatible"
    # Root URL of the embedding endpoint.
    base_url: str = "http://localhost:11434/v1"
    # Identifier of the embedding model to request.
    model: str = "nomic-embed-text"
    # Expected length of each returned vector.
    dim: int = 768
    # Optional credential; ``None`` means no key is configured.
    api_key: str | None = None
    # Per-request timeout (presumably seconds; confirm against the client).
    timeout: float = 60.0
    # Retry budget handed to the client.
    max_retries: int = 2
41
+
42
+
43
class IndexConfig(BaseModel):
    """Tuning parameters for index builds.

    Instances are immutable (``frozen=True``) and reject unknown fields
    (``extra="forbid"``).
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Concurrency setting for summary generation.
    summary_concurrency: int = 4
    # Batch size setting for embedding.
    embed_batch_size: int = 32
50
+
51
+
52
def load_llm_config() -> LLMConfig:
    """Build the summarizer configuration from ``CAIRN_LLM_*`` variables.

    Unset variables fall back to the same values as the ``LLMConfig`` field
    defaults. An empty ``CAIRN_LLM_API_KEY`` is treated as "no key".
    """
    env = os.environ
    base_url = env.get("CAIRN_LLM_BASE_URL", "http://localhost:11434/v1")
    model = env.get("CAIRN_LLM_MODEL", "llama3.2:3b")
    raw_key = env.get("CAIRN_LLM_API_KEY")
    timeout = _float_env("CAIRN_LLM_TIMEOUT", 60.0)
    max_retries = _int_env("CAIRN_LLM_MAX_RETRIES", 2)
    return LLMConfig(
        base_url=base_url,
        model=model,
        # Collapse "" (and an unset variable) to None.
        api_key=raw_key if raw_key else None,
        timeout=timeout,
        max_retries=max_retries,
    )
61
+
62
+
63
def load_embed_config() -> EmbedConfig:
    """Read embedder config from ``CAIRN_EMBED_*`` environment variables.

    The fallback base URL, model, and dimension depend on
    ``CAIRN_EMBED_PROVIDER``: ``doubao-vision`` selects the Volcengine
    endpoint defaults, anything else selects the local defaults. The provider
    string itself is passed through to ``EmbedConfig`` unchanged, so an
    unsupported value is left for the model to reject.

    ``CAIRN_EMBED_DIM`` is parsed with ``_int_env`` like every other numeric
    setting, so a blank value falls back to the provider default instead of
    raising.

    Returns:
        The populated, immutable ``EmbedConfig``.

    Raises:
        ValueError: If a numeric variable holds a non-blank, non-numeric value.
    """
    provider = os.environ.get("CAIRN_EMBED_PROVIDER", "openai-compatible")
    if provider == "doubao-vision":
        default_base_url = "https://ark.cn-beijing.volces.com/api/v3"
        default_model = "doubao-embedding-vision-251215"
        default_dim = 2048
    else:
        default_base_url = "http://localhost:11434/v1"
        default_model = "nomic-embed-text"
        default_dim = 768

    return EmbedConfig(
        # cast() only satisfies the type checker; it performs no runtime check.
        provider=cast(EmbedProvider, provider),
        base_url=os.environ.get("CAIRN_EMBED_BASE_URL", default_base_url),
        model=os.environ.get("CAIRN_EMBED_MODEL", default_model),
        dim=_int_env("CAIRN_EMBED_DIM", default_dim),
        # An empty key is treated the same as an unset one.
        api_key=os.environ.get("CAIRN_EMBED_API_KEY") or None,
        timeout=_float_env("CAIRN_EMBED_TIMEOUT", 60.0),
        max_retries=_int_env("CAIRN_EMBED_MAX_RETRIES", 2),
    )
84
+
85
+
86
def load_index_config() -> IndexConfig:
    """Build the index-build tuning config from environment variables.

    Reads ``CAIRN_SUMMARY_CONCURRENCY`` and ``CAIRN_EMBED_BATCH_SIZE``; an
    unset or blank variable falls back to 4 and 32 respectively.
    """
    concurrency = _int_env("CAIRN_SUMMARY_CONCURRENCY", 4)
    batch_size = _int_env("CAIRN_EMBED_BATCH_SIZE", 32)
    return IndexConfig(summary_concurrency=concurrency, embed_batch_size=batch_size)
92
+
93
+
94
def _int_env(name: str, default: int) -> int:
    """Return environment variable ``name`` parsed as an integer.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default``.

    Raises:
        ValueError: If the variable holds a non-blank value that is not a
            valid integer literal. The message names the variable so the
            misconfiguration can be located.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as exc:
        msg = f"environment variable {name} must be an integer; got {raw!r}"
        raise ValueError(msg) from exc
99
+
100
+
101
def _float_env(name: str, default: float) -> float:
    """Return environment variable ``name`` parsed as a float.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed float, or ``default``.

    Raises:
        ValueError: If the variable holds a non-blank value that ``float()``
            cannot parse. The message names the variable so the
            misconfiguration can be located.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return float(raw)
    except ValueError as exc:
        msg = f"environment variable {name} must be a number; got {raw!r}"
        raise ValueError(msg) from exc
cairn/core/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """Core types, errors, and configuration for Cairn."""
2
+
3
+ from cairn.core.errors import (
4
+ CairnError,
5
+ ConfigError,
6
+ IndexBuildError,
7
+ IndexNotFoundError,
8
+ IndexStaleError,
9
+ ParseError,
10
+ ToolError,
11
+ )
12
+ from cairn.core.types import (
13
+ Document,
14
+ Entity,
15
+ EntityKind,
16
+ Mention,
17
+ SectionNode,
18
+ Span,
19
+ SummarySet,
20
+ XRef,
21
+ XRefKind,
22
+ )
23
+
24
+ __all__ = [
25
+ "CairnError",
26
+ "ConfigError",
27
+ "Document",
28
+ "Entity",
29
+ "EntityKind",
30
+ "IndexBuildError",
31
+ "IndexNotFoundError",
32
+ "IndexStaleError",
33
+ "Mention",
34
+ "ParseError",
35
+ "SectionNode",
36
+ "Span",
37
+ "SummarySet",
38
+ "ToolError",
39
+ "XRef",
40
+ "XRefKind",
41
+ ]
cairn/core/errors.py ADDED
@@ -0,0 +1,68 @@
1
+ """Cairn error hierarchy.
2
+
3
+ Every error raised by Cairn library code derives from `CairnError`. Tool-layer
4
+ code translates these into structured MCP error envelopes; never lets them
5
+ escape to the transport.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+
13
class CairnError(Exception):
    """Root of the Cairn exception hierarchy.

    Carries a human-readable ``message``, a free-form ``details`` mapping, and
    a class-level ``code`` string that subclasses override.
    """

    code: str = "INTERNAL"

    def __init__(self, message: str, *, details: dict[str, Any] | None = None) -> None:
        super().__init__(message)
        self.message = message
        # A missing or empty mapping is replaced by a fresh empty dict.
        self.details: dict[str, Any] = details if details else {}

    def to_envelope(self) -> dict[str, Any]:
        """Return the structured MCP error payload for this error.

        The payload has exactly three keys: ``code``, ``message``, and
        ``details``. See docs/specs/mcp-tools.md §0 for the envelope shape.
        """
        return dict(code=self.code, message=self.message, details=self.details)
33
+
34
+
35
class ParseError(CairnError):
    """Raised when a source document cannot be parsed into the canonical Document AST."""

    code = "PARSE_FAILED"
39
+
40
+
41
class IndexBuildError(CairnError):
    """Raised when an index builder fails to construct or update an artifact."""

    code = "INDEX_BUILD_FAILED"
45
+
46
+
47
class IndexNotFoundError(CairnError):
    """Raised when a referenced index or section does not exist."""

    code = "NOT_FOUND"
51
+
52
+
53
class IndexStaleError(CairnError):
    """Raised when the on-disk index is older than its source and needs a rebuild."""

    code = "INDEX_STALE"
57
+
58
+
59
class ConfigError(CairnError):
    """Raised when configuration is invalid or missing."""

    code = "INVALID_CONFIG"
63
+
64
+
65
class ToolError(CairnError):
    """Raised when an MCP tool gets invalid input or cannot produce a result."""

    code = "INVALID_INPUT"
cairn/core/types.py ADDED
@@ -0,0 +1,147 @@
1
+ """Canonical Cairn data model.
2
+
3
+ These are the types that flow across layer boundaries. They are the contract
4
+ between ingestion, indexing, retrieval, and the MCP server. Treat them as the
5
+ schema; never substitute ad-hoc dicts.
6
+
7
+ Mirrors ARCHITECTURE.md §4. Changes here are breaking and require an ADR.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Literal
15
+
16
+ from pydantic import BaseModel, ConfigDict, Field, ValidationInfo, field_validator
17
+
18
+ EntityKind = Literal["term", "code", "proper", "defined"]
19
+ XRefKind = Literal["link", "textual", "entity"]
20
+
21
+
22
class _Frozen(BaseModel):
    """Shared base for the immutable, strict data-model classes in this module."""

    model_config = ConfigDict(
        frozen=True,  # instances cannot be mutated after construction
        extra="forbid",  # unknown fields are rejected
        str_strip_whitespace=False,  # string values are kept verbatim
        validate_assignment=True,
    )
31
+
32
+
33
class Span(_Frozen):
    """Half-open range ``[start, end)`` of byte offsets into the source document.

    Both bounds are non-negative and ``end`` may not precede ``start``.
    ``len(span)`` is the number of bytes covered.
    """

    start: int = Field(ge=0)
    end: int = Field(ge=0)

    @field_validator("end")
    @classmethod
    def _end_after_start(cls, end: int, info: ValidationInfo) -> int:
        """Reject an ``end`` that lies before the already-validated ``start``."""
        lower = info.data.get("start")
        # The comparison is skipped when no integer ``start`` is available.
        if not isinstance(lower, int) or end >= lower:
            return end
        msg = f"Span.end ({end}) must be >= Span.start ({lower})"
        raise ValueError(msg)

    def __len__(self) -> int:
        return self.end - self.start
50
+
51
+
52
class SectionNode(_Frozen):
    """One node of a document's structural tree.

    ``id`` is a hierarchical, slug-based identifier that stays stable when
    the same document is re-indexed, e.g. ``hooks/use-effect/cleanup``.

    ``raw_text`` holds only the body that belongs **directly** to this
    section; the bodies of descendant sections are excluded. For continuous
    text that includes descendants, read ``span`` from the source or use the
    ``read_range`` retrieval tool.
    """

    id: str = Field(min_length=1)
    title: str
    level: int = Field(ge=1, le=6)
    parent: str | None
    children: tuple[str, ...] = ()
    span: Span
    path: tuple[str, ...]
    raw_text: str

    @field_validator("id")
    @classmethod
    def _id_well_formed(cls, value: str) -> str:
        """Reject ids with a leading/trailing slash or an empty path segment."""
        has_edge_slash = value.startswith("/") or value.endswith("/")
        if has_edge_slash:
            msg = f"section id must not start or end with '/': {value!r}"
            raise ValueError(msg)
        if value.find("//") != -1:
            msg = f"section id must not contain '//': {value!r}"
            raise ValueError(msg)
        return value
83
+
84
+
85
class SummarySet(_Frozen):
    """Summaries of one section at several levels of detail.

    A ``Summarizer`` produces these during indexing; they are never produced
    at query time.

    ``digest`` is optional because v0.1 generates only the gist and the
    synopsis, with the deeper level arriving in v0.2. A ``None`` digest means
    it had not been generated when this summary was built, not that the
    section cannot have one.
    """

    section_id: str
    gist: str = Field(description="≤ 20 words; the 'scent' in IFT terms")
    synopsis: str = Field(description="one paragraph; ≤ 80 words")
    digest: str | None = Field(
        default=None,
        description="multi-paragraph; ≤ 300 words. None until v0.2.",
    )
    model: str = Field(description="identifier of the LLM that produced these")
    section_hash: str = Field(
        description=(
            "sha256 hex of (title + raw_text) at generation time; "
            "used to detect stale summaries when raw_text changes."
        )
    )
    generated_at: datetime
108
+
109
+
110
class Mention(_Frozen):
    """One occurrence of an entity: the section it appears in and where."""

    section_id: str
    span: Span
115
+
116
+
117
class Entity(_Frozen):
    """A canonicalized term or concept together with all of its mentions in the document."""

    # Non-empty canonical name.
    canonical: str = Field(min_length=1)
    surface_forms: tuple[str, ...]
    # One of the ``EntityKind`` literals.
    kind: EntityKind
    mentions: tuple[Mention, ...]
124
+
125
+
126
class XRef(_Frozen):
    """One directed edge of the cross-reference graph, from ``src`` to ``dst``."""

    src: str = Field(description="section_id of the source")
    dst: str = Field(description="section_id of the destination")
    # One of the ``XRefKind`` literals.
    kind: XRefKind
    # Bounded to the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    span: Span
134
+
135
+
136
class Document(_Frozen):
    """A document after ingestion has completed.

    Layer 1 (Ingestion) produces it, and it is the sole input to
    Layer 2 (Index).
    """

    id: str = Field(min_length=1, description="human-readable doc_id, slug-based")
    source_path: Path
    source_hash: str = Field(description="sha256 of the source bytes, hex-encoded")
    sections: tuple[SectionNode, ...]
    indexed_at: datetime
    cairn_version: str
cairn/embed/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """Embedding layer — pluggable text → vector encoders.
2
+
3
+ Used by the index layer (`cairn.index.vectors.VectorBuilder`) at indexing time.
4
+ Never invoked at query time except for embedding the user's query string.
5
+ """
6
+
7
+ from cairn.embed.base import Embedder
8
+ from cairn.embed.doubao import DoubaoVisionEmbedder
9
+ from cairn.embed.fake import FakeEmbedder
10
+ from cairn.embed.openai_compatible import OpenAICompatibleEmbedder
11
+
12
+ __all__ = [
13
+ "DoubaoVisionEmbedder",
14
+ "Embedder",
15
+ "FakeEmbedder",
16
+ "OpenAICompatibleEmbedder",
17
+ ]
cairn/embed/base.py ADDED
@@ -0,0 +1,31 @@
1
+ """Embedder protocol.
2
+
3
+ An ``Embedder`` turns a list of texts into a list of dense vectors. Batching
4
+ is the responsibility of the implementation — callers pass a list and may
5
+ assume the implementation chooses an efficient batching strategy.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Protocol, runtime_checkable
11
+
12
+
13
@runtime_checkable
class Embedder(Protocol):
    """Structural interface for a pluggable text-embedding model.

    ``name`` combines the implementation family with the model identifier
    (for example ``"openai-compat:nomic-embed-text"``), which lets consumers
    use it as a cache-invalidation key and as a manifest marker.

    Every vector returned by ``embed`` MUST have exactly ``dim`` elements;
    consumers may rely on that when building typed vector stores. An empty
    input list must yield an empty list rather than an exception.
    """

    name: str
    dim: int

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one ``dim``-dimensional vector for each entry of ``texts``."""
        ...
cairn/embed/doubao.py ADDED
@@ -0,0 +1,167 @@
1
+ """Volcengine/Doubao embedding adapters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from typing import Any
7
+
8
+ import httpx
9
+
10
+ from cairn.core.errors import IndexBuildError
11
+
12
+
13
class DoubaoVisionEmbedder:
    """Client for Doubao's multimodal embedding endpoint.

    ``doubao-embedding-vision-*`` models are served at
    ``/embeddings/multimodal`` rather than the OpenAI-compatible
    ``/embeddings`` route, and answer with ``{"data": {"embedding": ...}}``
    for a single multimodal input. The embedder protocol expects one vector
    per text, so this adapter issues one request per input text.

    Transport failures, HTTP error statuses, undecodable bodies, unexpected
    response shapes, and dimension mismatches are all reported as
    ``IndexBuildError``.
    """

    # Statuses that are retried while attempts remain.
    _RETRYABLE_STATUSES = (429, 500, 502, 503, 504)

    def __init__(
        self,
        *,
        base_url: str = "https://ark.cn-beijing.volces.com/api/v3",
        model: str = "doubao-embedding-vision-251215",
        dim: int = 2048,
        api_key: str | None = None,
        timeout: float = 60.0,
        max_retries: int = 2,
        retry_base_delay: float = 0.5,
    ) -> None:
        """Store the endpoint settings after validating the numeric ones.

        Args:
            base_url: API root; trailing slashes are stripped.
            model: Model identifier sent with every request.
            dim: Vector length every response must have.
            api_key: Bearer token; omitted from requests when falsy.
            timeout: Timeout passed to the HTTP client.
            max_retries: Extra attempts after the first one.
            retry_base_delay: Base of the exponential backoff; 0 disables
                sleeping between attempts.

        Raises:
            ValueError: If ``dim < 1``, ``max_retries < 0``, or
                ``retry_base_delay < 0``.
        """
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        if max_retries < 0:
            msg = f"max_retries must be >= 0; got {max_retries}"
            raise ValueError(msg)
        if retry_base_delay < 0:
            msg = f"retry_base_delay must be >= 0; got {retry_base_delay}"
            raise ValueError(msg)
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.dim = dim
        self.api_key = api_key
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_base_delay = retry_base_delay
        self.name = f"doubao-vision:{model}"

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed each text through Doubao's multimodal vectorization API.

        Requests are sent one at a time, in input order, over a single
        client that is closed when the call finishes.

        Args:
            texts: Texts to embed. An empty list returns ``[]`` without
                opening a connection.

        Returns:
            One vector of length ``self.dim`` per input text, in order.

        Raises:
            IndexBuildError: On request failure, an HTTP error status, a
                body that is not valid JSON, an unexpected response shape,
                or a vector whose length differs from ``self.dim``.
        """
        if not texts:
            return []

        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        vectors: list[list[float]] = []
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            for index, text in enumerate(texts):
                payload: dict[str, Any] = {
                    "model": self.model,
                    "input": [{"type": "text", "text": text}],
                }
                response = await self._post_with_retries(
                    client, payload, headers, index=index
                )

                # A successful status does not guarantee a JSON body (e.g. a
                # proxy error page); keep the failure inside the Cairn error
                # hierarchy instead of leaking a decode error.
                try:
                    body = response.json()
                except ValueError as exc:
                    msg = (
                        f"doubao vision embedder returned a non-JSON body: "
                        f"{response.text[:200]}"
                    )
                    raise IndexBuildError(
                        msg,
                        details={
                            "status": response.status_code,
                            "model": self.model,
                            "base_url": self.base_url,
                            "index": index,
                        },
                    ) from exc

                vector = _extract_vector(body, index=index)
                if len(vector) != self.dim:
                    msg = (
                        f"doubao vision embedder returned dim={len(vector)} "
                        f"but client expects dim={self.dim} "
                        f"(model {self.model!r}, index {index})"
                    )
                    raise IndexBuildError(
                        msg,
                        details={
                            "model": self.model,
                            "base_url": self.base_url,
                            "index": index,
                            "expected_dim": self.dim,
                            "actual_dim": len(vector),
                        },
                    )
                vectors.append(vector)

        return vectors

    async def _post_with_retries(
        self,
        client: httpx.AsyncClient,
        payload: dict[str, Any],
        headers: dict[str, str],
        *,
        index: int,
    ) -> httpx.Response:
        """POST ``payload`` and return the first non-error response.

        Transport errors and the statuses in ``_RETRYABLE_STATUSES`` are
        retried with exponential backoff until ``max_retries`` extra attempts
        are used up; any other status >= 400 fails immediately.

        Raises:
            IndexBuildError: When the final attempt fails or a non-retryable
                error status is returned.
        """
        last_exc: httpx.HTTPError | None = None
        url = f"{self.base_url}/embeddings/multimodal"
        for attempt in range(self.max_retries + 1):
            try:
                response = await client.post(url, json=payload, headers=headers)
            except httpx.HTTPError as exc:
                last_exc = exc
                if attempt < self.max_retries:
                    await self._sleep_before_retry(attempt)
                    continue
                msg = f"doubao vision embedder request failed: {exc}"
                raise IndexBuildError(
                    msg,
                    details={
                        "model": self.model,
                        "base_url": self.base_url,
                        "index": index,
                        "error_type": type(exc).__name__,
                        "attempts": attempt + 1,
                    },
                ) from exc

            retryable = response.status_code in self._RETRYABLE_STATUSES
            if retryable and attempt < self.max_retries:
                await self._sleep_before_retry(attempt)
                continue
            if response.status_code >= 400:
                msg = (
                    f"doubao vision embedder endpoint returned HTTP "
                    f"{response.status_code}: {response.text[:200]}"
                )
                raise IndexBuildError(
                    msg,
                    details={
                        "status": response.status_code,
                        "model": self.model,
                        "base_url": self.base_url,
                        "index": index,
                        "attempts": attempt + 1,
                    },
                )
            return response

        # Defensive fallback: only reached if the loop body never runs, e.g.
        # ``max_retries`` was set to a negative value after construction.
        msg = "doubao vision embedder request failed without a response"
        raise IndexBuildError(
            msg,
            details={
                "model": self.model,
                "base_url": self.base_url,
                "index": index,
                "error_type": type(last_exc).__name__ if last_exc else None,
            },
        )

    async def _sleep_before_retry(self, attempt: int) -> None:
        """Sleep ``retry_base_delay * 2**attempt``; no-op when the base is 0."""
        if self.retry_base_delay == 0:
            return
        await asyncio.sleep(self.retry_base_delay * (2**attempt))
149
+
150
+
151
def _extract_vector(data: dict[str, Any], *, index: int) -> list[float]:
    """Pull the dense vector out of a decoded Doubao multimodal response.

    Args:
        data: Decoded response body; the vector is read from
            ``data["data"]["embedding"]``.
        index: Position of the corresponding input text, recorded in error
            details.

    Returns:
        The embedding with every element converted to ``float``.

    Raises:
        IndexBuildError: If the expected keys are missing, the embedding is
            not a list, or an element cannot be converted to ``float``.
    """
    try:
        raw = data["data"]["embedding"]
    except (KeyError, TypeError) as exc:
        msg = "doubao vision embedder response did not match expected shape"
        raise IndexBuildError(msg, details={"response": data, "index": index}) from exc

    if isinstance(raw, list):
        try:
            return list(map(float, raw))
        except (TypeError, ValueError) as exc:
            msg = "doubao vision embedder embedding contains non-numeric values"
            raise IndexBuildError(msg, details={"response": data, "index": index}) from exc

    msg = "doubao vision embedder embedding is not a list"
    raise IndexBuildError(msg, details={"response": data, "index": index})
cairn/embed/fake.py ADDED
@@ -0,0 +1,36 @@
1
+ """Deterministic, network-free embedder for tests and offline development.
2
+
3
+ Implementation is a sparse bag-of-words hash projection: each word in the
4
+ input lowers onto exactly one dimension (chosen by sha256 hash mod dim).
5
+ Vectors are similarity-respecting — two texts that share words land near
6
+ each other in cosine space — but the embedder has no semantic understanding.
7
+ Suitable for unit tests and pipeline plumbing checks; never for production.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+
14
+
15
class FakeEmbedder:
    """Deterministic bag-of-words hash embedder that never touches the network.

    Each whitespace-separated, lower-cased word adds 1.0 to a single
    dimension selected from the word's sha256 digest.
    """

    name = "fake:bow-hash"

    def __init__(self, dim: int = 64) -> None:
        if dim < 1:
            msg = f"dim must be >= 1; got {dim}"
            raise ValueError(msg)
        self.dim = dim

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one ``dim``-length count vector per input text."""
        return list(map(self._embed_one, texts))

    def _embed_one(self, text: str) -> list[float]:
        """Project one text onto ``dim`` buckets by hashing each word."""
        tokens = text.lower().split()
        if not tokens:
            # Text with no words still lands on one bucket.
            tokens = ["__empty__"]
        buckets = [0.0] * self.dim
        for token in tokens:
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # The first 8 digest bytes, read big-endian, select the bucket.
            slot = int.from_bytes(digest[:8], "big") % self.dim
            buckets[slot] += 1.0
        return buckets