riff-kg-kit 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. riff_kg_kit-0.7.0/.gitignore +12 -0
  2. riff_kg_kit-0.7.0/CHANGELOG.md +11 -0
  3. riff_kg_kit-0.7.0/LICENSE +17 -0
  4. riff_kg_kit-0.7.0/PKG-INFO +100 -0
  5. riff_kg_kit-0.7.0/README.md +72 -0
  6. riff_kg_kit-0.7.0/pyproject.toml +52 -0
  7. riff_kg_kit-0.7.0/src/riff_kg/__init__.py +19 -0
  8. riff_kg_kit-0.7.0/src/riff_kg/commit.py +272 -0
  9. riff_kg_kit-0.7.0/src/riff_kg/config.py +39 -0
  10. riff_kg_kit-0.7.0/src/riff_kg/embedder.py +13 -0
  11. riff_kg_kit-0.7.0/src/riff_kg/extract.py +157 -0
  12. riff_kg_kit-0.7.0/src/riff_kg/ingest.py +161 -0
  13. riff_kg_kit-0.7.0/src/riff_kg/migrate.py +56 -0
  14. riff_kg_kit-0.7.0/src/riff_kg/migrations/001_extensions.sql +9 -0
  15. riff_kg_kit-0.7.0/src/riff_kg/migrations/002_signals_and_segments.sql +38 -0
  16. riff_kg_kit-0.7.0/src/riff_kg/migrations/003_extraction_and_staging.sql +39 -0
  17. riff_kg_kit-0.7.0/src/riff_kg/migrations/004_canonical_graph_and_jobs.sql +60 -0
  18. riff_kg_kit-0.7.0/src/riff_kg/migrations/__init__.py +8 -0
  19. riff_kg_kit-0.7.0/src/riff_kg/models.py +18 -0
  20. riff_kg_kit-0.7.0/src/riff_kg/pipeline.py +46 -0
  21. riff_kg_kit-0.7.0/src/riff_kg/py.typed +1 -0
  22. riff_kg_kit-0.7.0/src/riff_kg/search.py +418 -0
  23. riff_kg_kit-0.7.0/src/riff_kg/stage.py +17 -0
  24. riff_kg_kit-0.7.0/src/riff_kg/text.py +60 -0
  25. riff_kg_kit-0.7.0/tests/test_commit.py +181 -0
  26. riff_kg_kit-0.7.0/tests/test_config.py +18 -0
  27. riff_kg_kit-0.7.0/tests/test_extract.py +90 -0
  28. riff_kg_kit-0.7.0/tests/test_ingest_integration.py +65 -0
  29. riff_kg_kit-0.7.0/tests/test_migrations.py +25 -0
  30. riff_kg_kit-0.7.0/tests/test_search.py +195 -0
  31. riff_kg_kit-0.7.0/tests/test_text.py +37 -0
@@ -0,0 +1,12 @@
1
+ .venv/
2
+ venv/
3
+ __pycache__/
4
+ *.py[cod]
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ dist/
8
+ build/
9
+ *.egg-info/
10
+ .coverage
11
+ htmlcov/
12
+ .mypy_cache/
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## 0.7.0
4
+
5
+ - Hybrid retrieval improvements: `search_segments_hybrid` now supports `rrf` and `weighted` strategies via `merge_segment_hits_hybrid`.
6
+ - Graph-hop retrieval: added `graph_hop_subgraph` with `GraphEntityNode` and `GraphRelationEdge`.
7
+ - Docs: added publishing checklist and retrieval examples.
8
+
9
+ ## 0.6.0 and earlier
10
+
11
+ - Phases 0–4: package skeleton, migrations, ingest, extract, commit/reject, and docs.
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Ellen
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: riff-kg-kit
3
+ Version: 0.7.0
4
+ Summary: Opinionated knowledge-graph ingest, staged proposals, and retrieval for Riff (FastAPI) apps
5
+ Project-URL: Homepage, https://pypi.org/project/riff-kg-kit/
6
+ Author: Ellen
7
+ License-Expression: Apache-2.0
8
+ License-File: LICENSE
9
+ Keywords: agents,knowledge-graph,rag,riff
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Typing :: Typed
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: pydantic<3,>=2.7
18
+ Provides-Extra: dev
19
+ Requires-Dist: asyncpg>=0.29; extra == 'dev'
20
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
21
+ Requires-Dist: pytest>=8.0; extra == 'dev'
22
+ Requires-Dist: ruff>=0.6; extra == 'dev'
23
+ Provides-Extra: postgres
24
+ Requires-Dist: asyncpg>=0.29; extra == 'postgres'
25
+ Provides-Extra: yaml
26
+ Requires-Dist: pyyaml>=6.0; extra == 'yaml'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # riff-kg-kit
30
+
31
+ PyPI-first Python library for **staged knowledge-graph ingestion** and **agent-approved commits**, designed to plug into **Riff** apps (FastAPI + Postgres + `pgvector`).
32
+
33
+ ## Status
34
+
35
+ **0.7.0 — Retrieval+ updates.** Adds hybrid reranking strategies and graph-hop retrieval helpers on top of vector/FTS segment search and `pack_context`.
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install riff-kg-kit
41
+ ```
42
+
43
+ Editable (local dev):
44
+
45
+ ```bash
46
+ pip install -e ".[dev,yaml]"
47
+ ```
48
+
49
+ ## Quick use
50
+
51
+ ```python
52
+ from riff_kg import KgConfig
53
+
54
+ cfg = KgConfig.model_validate_json('{"embedding_dimension": 768}')
55
+ assert cfg.embedding_dimension == 768
56
+ ```
57
+
58
+ ## Roadmap (summary)
59
+
60
+ 1. Migrations + core tables (`signal`, `signal_segment`, `staged_proposal`, …)
61
+ 2. Normalize → segment → embed
62
+ 3. Extract → stage (LLM proposals only)
63
+ 4. Approve → commit (validated canonical graph)
64
+ 5. Search / pack-context / retrieval (implemented: `riff_kg.search`)
65
+
66
+ ### Retrieval (Phase 5)
67
+
68
+ ```python
69
+ from riff_kg import KgConfig
70
+ from riff_kg.search import pack_context, search_segments_vector
71
+
72
+ # After migrations and ingest with embeddings, pass a query embedding
73
+ # (same dimension as KgConfig.embedding_dimension, e.g. 768):
74
+ # hits = await search_segments_vector(conn, cfg, query_vec, scope_id="my_scope")
75
+ # text = pack_context(hits)
76
+ ```
77
+
78
+ ### Hybrid rerank and graph hops
79
+
80
+ ```python
81
+ from riff_kg.search import graph_hop_subgraph, search_segments_hybrid
82
+
83
+ # Hybrid search with Reciprocal Rank Fusion (default)
84
+ # hits = await search_segments_hybrid(conn, cfg, query_embedding=qvec, fts_query="topic")
85
+
86
+ # Graph neighborhood around a committed entity
87
+ # nodes, edges = await graph_hop_subgraph(conn, root_entity_id=entity_id, max_hops=2, scope_id="my_scope")
88
+ ```
89
+
90
+ ## Publishing checklist
91
+
92
+ 1. Bump version in `pyproject.toml` and `src/riff_kg/__init__.py`.
93
+ 2. Update `CHANGELOG.md` and README status/examples.
94
+ 3. Run `python -m ruff check src tests` and `python -m pytest tests -q`.
95
+ 4. Build distributions: `python -m build`.
96
+ 5. Upload to TestPyPI first, verify install, then publish to PyPI.
97
+
98
+ ## License
99
+
100
+ Apache-2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,72 @@
1
+ # riff-kg-kit
2
+
3
+ PyPI-first Python library for **staged knowledge-graph ingestion** and **agent-approved commits**, designed to plug into **Riff** apps (FastAPI + Postgres + `pgvector`).
4
+
5
+ ## Status
6
+
7
+ **0.7.0 — Retrieval+ updates.** Adds hybrid reranking strategies and graph-hop retrieval helpers on top of vector/FTS segment search and `pack_context`.
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install riff-kg-kit
13
+ ```
14
+
15
+ Editable (local dev):
16
+
17
+ ```bash
18
+ pip install -e ".[dev,yaml]"
19
+ ```
20
+
21
+ ## Quick use
22
+
23
+ ```python
24
+ from riff_kg import KgConfig
25
+
26
+ cfg = KgConfig.model_validate_json('{"embedding_dimension": 768}')
27
+ assert cfg.embedding_dimension == 768
28
+ ```
29
+
30
+ ## Roadmap (summary)
31
+
32
+ 1. Migrations + core tables (`signal`, `signal_segment`, `staged_proposal`, …)
33
+ 2. Normalize → segment → embed
34
+ 3. Extract → stage (LLM proposals only)
35
+ 4. Approve → commit (validated canonical graph)
36
+ 5. Search / pack-context / retrieval (implemented: `riff_kg.search`)
37
+
38
+ ### Retrieval (Phase 5)
39
+
40
+ ```python
41
+ from riff_kg import KgConfig
42
+ from riff_kg.search import pack_context, search_segments_vector
43
+
44
+ # After migrations and ingest with embeddings, pass a query embedding
45
+ # (same dimension as KgConfig.embedding_dimension, e.g. 768):
46
+ # hits = await search_segments_vector(conn, cfg, query_vec, scope_id="my_scope")
47
+ # text = pack_context(hits)
48
+ ```
49
+
50
+ ### Hybrid rerank and graph hops
51
+
52
+ ```python
53
+ from riff_kg.search import graph_hop_subgraph, search_segments_hybrid
54
+
55
+ # Hybrid search with Reciprocal Rank Fusion (default)
56
+ # hits = await search_segments_hybrid(conn, cfg, query_embedding=qvec, fts_query="topic")
57
+
58
+ # Graph neighborhood around a committed entity
59
+ # nodes, edges = await graph_hop_subgraph(conn, root_entity_id=entity_id, max_hops=2, scope_id="my_scope")
60
+ ```
61
+
62
+ ## Publishing checklist
63
+
64
+ 1. Bump version in `pyproject.toml` and `src/riff_kg/__init__.py`.
65
+ 2. Update `CHANGELOG.md` and README status/examples.
66
+ 3. Run `python -m ruff check src tests` and `python -m pytest tests -q`.
67
+ 4. Build distributions: `python -m build`.
68
+ 5. Upload to TestPyPI first, verify install, then publish to PyPI.
69
+
70
+ ## License
71
+
72
+ Apache-2.0 — see [LICENSE](LICENSE).
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.24.0"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "riff-kg-kit"
7
+ version = "0.7.0"
8
+ description = "Opinionated knowledge-graph ingest, staged proposals, and retrieval for Riff (FastAPI) apps"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.11"
12
+ authors = [{ name = "Ellen" }]
13
+ keywords = ["knowledge-graph", "riff", "rag", "agents"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Typing :: Typed",
21
+ ]
22
+ dependencies = ["pydantic>=2.7,<3"]
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest>=8.0",
27
+ "pytest-asyncio>=0.24",
28
+ "ruff>=0.6",
29
+ "asyncpg>=0.29",
30
+ ]
31
+ postgres = ["asyncpg>=0.29"]
32
+ yaml = ["PyYAML>=6.0"]
33
+
34
+ [project.urls]
35
+ Homepage = "https://pypi.org/project/riff-kg-kit/"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/riff_kg"]
39
+
40
+ [tool.hatch.build.targets.sdist]
41
+ include = ["/src", "/tests", "/README.md", "/CHANGELOG.md", "/LICENSE"]
42
+
43
+ [tool.ruff]
44
+ line-length = 100
45
+ target-version = "py311"
46
+
47
+ [tool.ruff.lint]
48
+ select = ["E", "F", "I", "UP"]
49
+
50
+ [tool.pytest.ini_options]
51
+ asyncio_mode = "auto"
52
+ testpaths = ["tests"]
@@ -0,0 +1,19 @@
1
+ """Riff knowledge-graph kit: staged ingest, approve/commit, retrieval."""
2
+
3
+ from riff_kg import commit, extract, ingest, models, pipeline, search, stage, text
4
+ from riff_kg.config import KgConfig
5
+
6
+ __version__ = "0.7.0"
7
+
8
+ __all__ = [
9
+ "__version__",
10
+ "KgConfig",
11
+ "commit",
12
+ "extract",
13
+ "ingest",
14
+ "models",
15
+ "pipeline",
16
+ "search",
17
+ "stage",
18
+ "text",
19
+ ]
@@ -0,0 +1,272 @@
1
+ """Phase 4: validate staged proposals and write canonical entities / relations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import uuid
7
+ from dataclasses import dataclass
8
+ from typing import Any, Literal
9
+
10
+ import asyncpg
11
+
12
+ from riff_kg.config import KgConfig
13
+
14
+
15
+ def _payload_dict(raw: Any) -> dict[str, Any]:
16
+ if raw is None:
17
+ return {}
18
+ if isinstance(raw, dict):
19
+ return raw
20
+ if isinstance(raw, str):
21
+ return json.loads(raw)
22
+ raise TypeError("payload must be dict or json string")
23
+
24
+
25
+ def _require_str(d: dict[str, Any], key: str) -> str:
26
+ v = d.get(key)
27
+ if not isinstance(v, str) or not v.strip():
28
+ raise ValueError(f"payload missing or invalid string field: {key!r}")
29
+ return v.strip()
30
+
31
+
32
+ def _optional_str(d: dict[str, Any], key: str) -> str | None:
33
+ v = d.get(key)
34
+ if v is None:
35
+ return None
36
+ if isinstance(v, str):
37
+ s = v.strip()
38
+ return s or None
39
+ raise ValueError(f"payload field {key!r} must be string or null")
40
+
41
+
42
+ def _optional_dict(d: dict[str, Any], key: str) -> dict[str, Any]:
43
+ v = d.get(key)
44
+ if v is None:
45
+ return {}
46
+ if isinstance(v, dict):
47
+ return v
48
+ raise ValueError(f"payload field {key!r} must be object")
49
+
50
+
51
+ def _parse_uuid(s: str) -> uuid.UUID:
52
+ try:
53
+ return uuid.UUID(str(s).strip())
54
+ except ValueError as e:
55
+ raise ValueError(f"invalid uuid: {s!r}") from e
56
+
57
+
58
@dataclass(frozen=True)
class CommitResult:
    """Immutable record describing one staged proposal that was committed."""

    # Id of the staged proposal this result belongs to.
    proposal_id: uuid.UUID
    # Which kind of canonical record the proposal produced.
    record_kind: Literal["entity", "relation"]
    # Id of the entity or relation row reported for the proposal.
    record_id: uuid.UUID
65
+
66
+
67
@dataclass(frozen=True)
class CommitSummary:
    """Immutable batch result returned by `commit_proposals`."""

    # One entry per proposal that was written during the call.
    committed: tuple[CommitResult, ...]
    # Ids of proposals that were already `approved` and therefore left alone.
    skipped_already_approved: tuple[uuid.UUID, ...]
73
+
74
+
75
async def _mark_proposal_approved(conn: asyncpg.Connection, proposal_id: uuid.UUID) -> None:
    """Set one staged proposal to ``approved`` and clear its resolution note."""
    await conn.execute(
        """
        UPDATE riff_kg.staged_proposals
        SET status = 'approved', resolution_note = NULL
        WHERE id = $1
        """,
        proposal_id,
    )


async def _commit_entity_payload(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    scope_id: str,
    payload: dict[str, Any],
) -> uuid.UUID:
    """Validate an entity payload and upsert it; return the entity id."""
    entity_type = _require_str(payload, "entity_type")
    if entity_type not in cfg.entity_types:
        raise ValueError(f"entity_type {entity_type!r} not allowed by KgConfig")
    label = _require_str(payload, "label")
    can_key = _optional_str(payload, "canonical_key")
    attrs = json.dumps(_optional_dict(payload, "attrs"))

    if can_key:
        # A canonical key identifies an entity within (scope, type): reuse the
        # existing row, overwrite its label and merge the new attrs into it.
        existing = await conn.fetchrow(
            """
            SELECT id FROM riff_kg.entities
            WHERE scope_id = $1 AND entity_type = $2 AND canonical_key = $3
            """,
            scope_id,
            entity_type,
            can_key,
        )
        if existing:
            entity_id = existing["id"]
            await conn.execute(
                """
                UPDATE riff_kg.entities
                SET label = $1, attrs = attrs || $2::jsonb
                WHERE id = $3
                """,
                label,
                attrs,
                entity_id,
            )
            return entity_id

    # No canonical key, or no row with that key yet: insert a fresh entity.
    # ``can_key`` is ``None`` in the keyless case and is stored as SQL NULL.
    return await conn.fetchval(
        """
        INSERT INTO riff_kg.entities (
            scope_id, entity_type, label, canonical_key, attrs
        )
        VALUES ($1, $2, $3, $4, $5::jsonb)
        RETURNING id
        """,
        scope_id,
        entity_type,
        label,
        can_key,
        attrs,
    )


async def _commit_relation_payload(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    scope_id: str,
    row: Any,
    payload: dict[str, Any],
    committed_by: str | None,
) -> uuid.UUID:
    """Validate a relation payload and insert it with its evidence; return the relation id."""
    rel_type = _require_str(payload, "relation_type")
    if rel_type not in cfg.relation_types:
        raise ValueError(f"relation_type {rel_type!r} not allowed by KgConfig")
    src = _parse_uuid(_require_str(payload, "src_entity_id"))
    dst = _parse_uuid(_require_str(payload, "dst_entity_id"))
    attrs = json.dumps(_optional_dict(payload, "attrs"))

    return await conn.fetchval(
        """
        INSERT INTO riff_kg.relations (
            scope_id, src_entity_id, dst_entity_id, relation_type, attrs,
            evidence_segment_id, evidence_char_start, evidence_char_end,
            staged_proposal_id, extraction_run_id, committed_by
        )
        VALUES (
            $1, $2, $3, $4, $5::jsonb,
            $6, $7, $8, $9, $10, $11
        )
        RETURNING id
        """,
        scope_id,
        src,
        dst,
        rel_type,
        attrs,
        row["segment_id"],
        row["char_start"],
        row["char_end"],
        row["id"],
        row["extraction_run_id"],
        committed_by,
    )


async def commit_proposals(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    proposal_ids: list[uuid.UUID],
    *,
    committed_by: str | None = None,
) -> CommitSummary:
    """Commit pending proposals: entities first, then relations. Idempotent: skips `approved`.

    Args:
        conn: asyncpg connection; all writes run inside one ``conn.transaction()``.
        cfg: supplies the allowed ``entity_types`` / ``relation_types``.
        proposal_ids: staged proposal ids; duplicates are collapsed.
        committed_by: optional value stored in ``committed_by`` on inserted relations.

    Returns:
        A `CommitSummary` listing committed proposals (entities before relations,
        each group ordered by proposal id text) and the ids that were already
        ``approved``.

    Raises:
        ValueError: if an id cannot be loaded, a proposal is neither
            ``pending_review`` nor ``approved``, its kind is empty or unsupported,
            or its payload fails validation. Raised inside the transaction, so
            the exception propagates out of the ``async with`` block.
        TypeError: if a payload is neither a dict nor a JSON string.
    """
    if not proposal_ids:
        return CommitSummary(committed=(), skipped_already_approved=())

    # dict.fromkeys de-duplicates while keeping first-seen order.
    uniq = list(dict.fromkeys(proposal_ids))
    # NOTE(review): status is read here, before the transaction starts and without
    # a row lock; two concurrent commits of the same proposal could both see
    # ``pending_review``. Confirm whether callers serialize commits.
    rows = await conn.fetch(
        """
        SELECT sp.id, sp.proposal_kind, sp.payload, sp.segment_id, sp.char_start, sp.char_end,
               sp.status AS proposal_status, s.scope_id, sp.extraction_run_id
        FROM riff_kg.staged_proposals sp
        JOIN riff_kg.signal_segments seg ON seg.id = sp.segment_id
        JOIN riff_kg.signals s ON s.id = seg.signal_id
        WHERE sp.id = ANY($1::uuid[])
        """,
        uniq,
    )
    by_id = {r["id"]: r for r in rows}
    missing = [pid for pid in uniq if pid not in by_id]
    if missing:
        raise ValueError(f"unknown or inaccessible proposal ids: {missing}")

    # Entities sort ahead of everything else; ties break on the id text so the
    # processing order is deterministic.
    sorted_rows = sorted(
        rows,
        key=lambda r: (0 if str(r["proposal_kind"]).strip() == "entity" else 1, str(r["id"])),
    )

    committed: list[CommitResult] = []
    skipped: list[uuid.UUID] = []

    async with conn.transaction():
        for row in sorted_rows:
            pid: uuid.UUID = row["id"]
            st = row["proposal_status"]
            if st == "approved":
                skipped.append(pid)
                continue
            if st != "pending_review":
                raise ValueError(f"proposal {pid} is not pending_review ({st})")

            scope_raw = row["scope_id"]
            scope_id = scope_raw if isinstance(scope_raw, str) else str(scope_raw or "")
            pk = row["proposal_kind"]
            kind = str(pk).strip() if pk is not None else ""
            if not kind:
                raise ValueError(f"proposal {pid} has empty proposal_kind")
            payload = _payload_dict(row["payload"])

            if kind == "entity":
                record_id = await _commit_entity_payload(conn, cfg, scope_id, payload)
            elif kind == "relation":
                record_id = await _commit_relation_payload(
                    conn, cfg, scope_id, row, payload, committed_by
                )
            else:
                raise ValueError(f"unsupported proposal_kind for commit: {kind!r}")

            await _mark_proposal_approved(conn, pid)
            committed.append(CommitResult(pid, kind, record_id))

    return CommitSummary(
        committed=tuple(committed),
        skipped_already_approved=tuple(skipped),
    )
248
+
249
+
250
async def reject_proposals(
    conn: asyncpg.Connection,
    proposal_ids: list[uuid.UUID],
    *,
    reason: str,
) -> int:
    """Reject proposals that are still ``pending_review``.

    Args:
        conn: asyncpg connection used for the single UPDATE.
        proposal_ids: ids to reject; duplicates are collapsed, and ids whose
            status is not ``pending_review`` are left untouched by the query.
        reason: stored as ``resolution_note``, truncated to 10,000 characters.

    Returns:
        The row count parsed from the command tag, or 0 when no ids were
        given or the tag does not start with ``UPDATE``.
    """
    if not proposal_ids:
        return 0

    unique_ids = list(dict.fromkeys(proposal_ids))
    note = reason[:10_000]
    command_tag = await conn.execute(
        """
        UPDATE riff_kg.staged_proposals
        SET status = 'rejected', resolution_note = $2
        WHERE id = ANY($1::uuid[]) AND status = 'pending_review'
        """,
        unique_ids,
        note,
    )
    # asyncpg returns the command tag as text, e.g. "UPDATE 3".
    if not command_tag.startswith("UPDATE "):
        return 0
    return int(command_tag.split()[-1])
@@ -0,0 +1,39 @@
1
+ """Runtime configuration: chunking, embeddings, closed-world type registry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class KgConfig(BaseModel):
    """Validated settings for the kit: embedding size, chunking, allowed graph types.

    Unknown keys are rejected (``extra = "forbid"``), so a misspelled option
    fails validation instead of being silently ignored.
    """

    model_config = {"extra": "forbid"}

    # Embedding vector length; bounded to 32..4096.
    embedding_dimension: int = Field(default=768, ge=32, le=4096)

    # Chunking knobs, all measured in characters.
    # NOTE(review): nothing here enforces chunk_overlap_chars < chunk_max_chars;
    # confirm the chunker tolerates an overlap that is >= the chunk size.
    chunk_max_chars: int = Field(default=8000, ge=500)
    chunk_overlap_chars: int = Field(default=400, ge=0)
    chunk_if_longer_than_chars: int = Field(
        default=12000,
        ge=1000,
        description="Below this size, keep a single segment (still stored as segment index 0).",
    )

    # Closed-world type registry used for commit validation.
    entity_types: tuple[str, ...] = Field(
        default=("Entity",),
        description="Allowed entity type labels at commit validation.",
    )
    relation_types: tuple[str, ...] = Field(
        default=("related_to",),
        description="Allowed relation predicates at commit validation.",
    )

    profile_uri: str | None = Field(
        default=None,
        description="Optional storage key or path to extraction JSON schema profile.",
    )

    def effective_chunking(self) -> dict[str, int]:
        """Return the three chunking settings as a plain dict keyed by field name."""
        field_names = (
            "chunk_max_chars",
            "chunk_overlap_chars",
            "chunk_if_longer_than_chars",
        )
        return {name: getattr(self, name) for name in field_names}
@@ -0,0 +1,13 @@
1
+ """Pluggable text embedding (Phase 2)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+
8
@runtime_checkable
class Embedder(Protocol):
    """Structural interface for anything that can embed segment bodies.

    Every returned vector must have length `KgConfig.embedding_dimension`.
    Because the protocol is runtime-checkable, ``isinstance(obj, Embedder)``
    only verifies that an ``embed_texts`` attribute exists, not its signature.
    """

    async def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return their vectors.

        NOTE(review): presumably one vector per input text, in input order —
        confirm against the ingest caller.
        """
        ...