riff-kg-kit 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- riff_kg_kit-0.7.0/.gitignore +12 -0
- riff_kg_kit-0.7.0/CHANGELOG.md +11 -0
- riff_kg_kit-0.7.0/LICENSE +17 -0
- riff_kg_kit-0.7.0/PKG-INFO +100 -0
- riff_kg_kit-0.7.0/README.md +72 -0
- riff_kg_kit-0.7.0/pyproject.toml +52 -0
- riff_kg_kit-0.7.0/src/riff_kg/__init__.py +19 -0
- riff_kg_kit-0.7.0/src/riff_kg/commit.py +272 -0
- riff_kg_kit-0.7.0/src/riff_kg/config.py +39 -0
- riff_kg_kit-0.7.0/src/riff_kg/embedder.py +13 -0
- riff_kg_kit-0.7.0/src/riff_kg/extract.py +157 -0
- riff_kg_kit-0.7.0/src/riff_kg/ingest.py +161 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrate.py +56 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrations/001_extensions.sql +9 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrations/002_signals_and_segments.sql +38 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrations/003_extraction_and_staging.sql +39 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrations/004_canonical_graph_and_jobs.sql +60 -0
- riff_kg_kit-0.7.0/src/riff_kg/migrations/__init__.py +8 -0
- riff_kg_kit-0.7.0/src/riff_kg/models.py +18 -0
- riff_kg_kit-0.7.0/src/riff_kg/pipeline.py +46 -0
- riff_kg_kit-0.7.0/src/riff_kg/py.typed +1 -0
- riff_kg_kit-0.7.0/src/riff_kg/search.py +418 -0
- riff_kg_kit-0.7.0/src/riff_kg/stage.py +17 -0
- riff_kg_kit-0.7.0/src/riff_kg/text.py +60 -0
- riff_kg_kit-0.7.0/tests/test_commit.py +181 -0
- riff_kg_kit-0.7.0/tests/test_config.py +18 -0
- riff_kg_kit-0.7.0/tests/test_extract.py +90 -0
- riff_kg_kit-0.7.0/tests/test_ingest_integration.py +65 -0
- riff_kg_kit-0.7.0/tests/test_migrations.py +25 -0
- riff_kg_kit-0.7.0/tests/test_search.py +195 -0
- riff_kg_kit-0.7.0/tests/test_text.py +37 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.7.0
|
|
4
|
+
|
|
5
|
+
- Hybrid retrieval improvements: `search_segments_hybrid` now supports `rrf` and `weighted` strategies via `merge_segment_hits_hybrid`.
|
|
6
|
+
- Graph-hop retrieval: added `graph_hop_subgraph` with `GraphEntityNode` and `GraphRelationEdge`.
|
|
7
|
+
- Docs: added publishing checklist and retrieval examples.
|
|
8
|
+
|
|
9
|
+
## 0.6.0 and earlier
|
|
10
|
+
|
|
11
|
+
- Phases 0–4: package skeleton, migrations, ingest, extract, commit/reject, and docs.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2026 Ellen
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
|
|
13
|
+
Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
See the License for the specific language governing permissions and
|
|
17
|
+
limitations under the License.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: riff-kg-kit
|
|
3
|
+
Version: 0.7.0
|
|
4
|
+
Summary: Opinionated knowledge-graph ingest, staged proposals, and retrieval for Riff (FastAPI) apps
|
|
5
|
+
Project-URL: Homepage, https://pypi.org/project/riff-kg-kit/
|
|
6
|
+
Author: Ellen
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: agents,knowledge-graph,rag,riff
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: pydantic<3,>=2.7
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: asyncpg>=0.29; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
21
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
23
|
+
Provides-Extra: postgres
|
|
24
|
+
Requires-Dist: asyncpg>=0.29; extra == 'postgres'
|
|
25
|
+
Provides-Extra: yaml
|
|
26
|
+
Requires-Dist: pyyaml>=6.0; extra == 'yaml'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# riff-kg-kit
|
|
30
|
+
|
|
31
|
+
PyPI-first Python library for **staged knowledge-graph ingestion** and **agent-approved commits**, designed to plug into **Riff** apps (FastAPI + Postgres + `pgvector`).
|
|
32
|
+
|
|
33
|
+
## Status
|
|
34
|
+
|
|
35
|
+
**0.7.0 — Retrieval+ updates.** Adds hybrid reranking strategies and graph-hop retrieval helpers on top of vector/FTS segment search and `pack_context`.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install riff-kg-kit
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Editable (local dev):
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install -e ".[dev,yaml]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick use
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from riff_kg import KgConfig
|
|
53
|
+
|
|
54
|
+
cfg = KgConfig.model_validate_json('{"embedding_dimension": 768}')
|
|
55
|
+
assert cfg.embedding_dimension == 768
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Roadmap (summary)
|
|
59
|
+
|
|
60
|
+
1. Migrations + core tables (`riff_kg.signals`, `riff_kg.signal_segments`, `riff_kg.staged_proposals`, …)
|
|
61
|
+
2. Normalize → segment → embed
|
|
62
|
+
3. Extract → stage (LLM proposals only)
|
|
63
|
+
4. Approve → commit (validated canonical graph)
|
|
64
|
+
5. Search / pack-context / retrieval (implemented: `riff_kg.search`)
|
|
65
|
+
|
|
66
|
+
### Retrieval (Phase 5)
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from riff_kg import KgConfig
|
|
70
|
+
from riff_kg.search import pack_context, search_segments_vector
|
|
71
|
+
|
|
72
|
+
# After migrations and ingest with embeddings, pass a query embedding
|
|
73
|
+
# (same dimension as KgConfig.embedding_dimension, e.g. 768):
|
|
74
|
+
# hits = await search_segments_vector(conn, cfg, query_vec, scope_id="my_scope")
|
|
75
|
+
# text = pack_context(hits)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Hybrid rerank and graph hops
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from riff_kg.search import graph_hop_subgraph, search_segments_hybrid
|
|
82
|
+
|
|
83
|
+
# Hybrid search with Reciprocal Rank Fusion (default)
|
|
84
|
+
# hits = await search_segments_hybrid(conn, cfg, query_embedding=qvec, fts_query="topic")
|
|
85
|
+
|
|
86
|
+
# Graph neighborhood around a committed entity
|
|
87
|
+
# nodes, edges = await graph_hop_subgraph(conn, root_entity_id=entity_id, max_hops=2, scope_id="my_scope")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Publishing checklist
|
|
91
|
+
|
|
92
|
+
1. Bump version in `pyproject.toml` and `src/riff_kg/__init__.py`.
|
|
93
|
+
2. Update `CHANGELOG.md` and README status/examples.
|
|
94
|
+
3. Run `python -m ruff check src tests` and `python -m pytest tests -q`.
|
|
95
|
+
4. Build distributions: `python -m build`.
|
|
96
|
+
5. Upload to TestPyPI first, verify install, then publish to PyPI.
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# riff-kg-kit
|
|
2
|
+
|
|
3
|
+
PyPI-first Python library for **staged knowledge-graph ingestion** and **agent-approved commits**, designed to plug into **Riff** apps (FastAPI + Postgres + `pgvector`).
|
|
4
|
+
|
|
5
|
+
## Status
|
|
6
|
+
|
|
7
|
+
**0.7.0 — Retrieval+ updates.** Adds hybrid reranking strategies and graph-hop retrieval helpers on top of vector/FTS segment search and `pack_context`.
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install riff-kg-kit
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Editable (local dev):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install -e ".[dev,yaml]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Quick use
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from riff_kg import KgConfig
|
|
25
|
+
|
|
26
|
+
cfg = KgConfig.model_validate_json('{"embedding_dimension": 768}')
|
|
27
|
+
assert cfg.embedding_dimension == 768
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Roadmap (summary)
|
|
31
|
+
|
|
32
|
+
1. Migrations + core tables (`riff_kg.signals`, `riff_kg.signal_segments`, `riff_kg.staged_proposals`, …)
|
|
33
|
+
2. Normalize → segment → embed
|
|
34
|
+
3. Extract → stage (LLM proposals only)
|
|
35
|
+
4. Approve → commit (validated canonical graph)
|
|
36
|
+
5. Search / pack-context / retrieval (implemented: `riff_kg.search`)
|
|
37
|
+
|
|
38
|
+
### Retrieval (Phase 5)
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from riff_kg import KgConfig
|
|
42
|
+
from riff_kg.search import pack_context, search_segments_vector
|
|
43
|
+
|
|
44
|
+
# After migrations and ingest with embeddings, pass a query embedding
|
|
45
|
+
# (same dimension as KgConfig.embedding_dimension, e.g. 768):
|
|
46
|
+
# hits = await search_segments_vector(conn, cfg, query_vec, scope_id="my_scope")
|
|
47
|
+
# text = pack_context(hits)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Hybrid rerank and graph hops
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from riff_kg.search import graph_hop_subgraph, search_segments_hybrid
|
|
54
|
+
|
|
55
|
+
# Hybrid search with Reciprocal Rank Fusion (default)
|
|
56
|
+
# hits = await search_segments_hybrid(conn, cfg, query_embedding=qvec, fts_query="topic")
|
|
57
|
+
|
|
58
|
+
# Graph neighborhood around a committed entity
|
|
59
|
+
# nodes, edges = await graph_hop_subgraph(conn, root_entity_id=entity_id, max_hops=2, scope_id="my_scope")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Publishing checklist
|
|
63
|
+
|
|
64
|
+
1. Bump version in `pyproject.toml` and `src/riff_kg/__init__.py`.
|
|
65
|
+
2. Update `CHANGELOG.md` and README status/examples.
|
|
66
|
+
3. Run `python -m ruff check src tests` and `python -m pytest tests -q`.
|
|
67
|
+
4. Build distributions: `python -m build`.
|
|
68
|
+
5. Upload to TestPyPI first, verify install, then publish to PyPI.
|
|
69
|
+
|
|
70
|
+
## License
|
|
71
|
+
|
|
72
|
+
Apache-2.0 — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.24.0"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "riff-kg-kit"
|
|
7
|
+
version = "0.7.0"
|
|
8
|
+
description = "Opinionated knowledge-graph ingest, staged proposals, and retrieval for Riff (FastAPI) apps"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [{ name = "Ellen" }]
|
|
13
|
+
keywords = ["knowledge-graph", "riff", "rag", "agents"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Typing :: Typed",
|
|
21
|
+
]
|
|
22
|
+
dependencies = ["pydantic>=2.7,<3"]
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=8.0",
|
|
27
|
+
"pytest-asyncio>=0.24",
|
|
28
|
+
"ruff>=0.6",
|
|
29
|
+
"asyncpg>=0.29",
|
|
30
|
+
]
|
|
31
|
+
postgres = ["asyncpg>=0.29"]
|
|
32
|
+
yaml = ["PyYAML>=6.0"]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://pypi.org/project/riff-kg-kit/"
|
|
36
|
+
|
|
37
|
+
[tool.hatch.build.targets.wheel]
|
|
38
|
+
packages = ["src/riff_kg"]
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.sdist]
|
|
41
|
+
include = ["/src", "/tests", "/README.md", "/CHANGELOG.md", "/LICENSE"]
|
|
42
|
+
|
|
43
|
+
[tool.ruff]
|
|
44
|
+
line-length = 100
|
|
45
|
+
target-version = "py311"
|
|
46
|
+
|
|
47
|
+
[tool.ruff.lint]
|
|
48
|
+
select = ["E", "F", "I", "UP"]
|
|
49
|
+
|
|
50
|
+
[tool.pytest.ini_options]
|
|
51
|
+
asyncio_mode = "auto"
|
|
52
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Riff knowledge-graph kit: staged ingest, approve/commit, retrieval."""
|
|
2
|
+
|
|
3
|
+
from riff_kg import commit, extract, ingest, models, pipeline, search, stage, text
|
|
4
|
+
from riff_kg.config import KgConfig
|
|
5
|
+
|
|
6
|
+
__version__ = "0.7.0"
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"__version__",
|
|
10
|
+
"KgConfig",
|
|
11
|
+
"commit",
|
|
12
|
+
"extract",
|
|
13
|
+
"ingest",
|
|
14
|
+
"models",
|
|
15
|
+
"pipeline",
|
|
16
|
+
"search",
|
|
17
|
+
"stage",
|
|
18
|
+
"text",
|
|
19
|
+
]
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""Phase 4: validate staged proposals and write canonical entities / relations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import uuid
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
import asyncpg
|
|
11
|
+
|
|
12
|
+
from riff_kg.config import KgConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _payload_dict(raw: Any) -> dict[str, Any]:
    """Normalize a staged-proposal payload value into a dict.

    Accepts ``None`` (treated as an empty payload), an already-decoded dict
    (returned as-is, not copied), or a JSON string encoding an object.

    Raises:
        TypeError: if ``raw`` is any other type, or if the JSON string decodes
            to something other than an object or ``null``.
        json.JSONDecodeError: if ``raw`` is a string that is not valid JSON.
    """
    if raw is None:
        return {}
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        decoded = json.loads(raw)
        # JSON ``null`` is handled like a Python ``None`` payload.
        if decoded is None:
            return {}
        # Arrays and scalars would otherwise be returned as a "dict" and only
        # blow up later with an unrelated AttributeError on ``.get``.
        if not isinstance(decoded, dict):
            raise TypeError("payload json must decode to an object")
        return decoded
    raise TypeError("payload must be dict or json string")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _require_str(d: dict[str, Any], key: str) -> str:
    """Return the whitespace-trimmed string stored under ``key``.

    Raises:
        ValueError: if the key is absent, the value is not a ``str``, or the
            value is empty once trimmed.
    """
    value = d.get(key)
    trimmed = value.strip() if isinstance(value, str) else ""
    if trimmed:
        return trimmed
    raise ValueError(f"payload missing or invalid string field: {key!r}")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _optional_str(d: dict[str, Any], key: str) -> str | None:
|
|
33
|
+
v = d.get(key)
|
|
34
|
+
if v is None:
|
|
35
|
+
return None
|
|
36
|
+
if isinstance(v, str):
|
|
37
|
+
s = v.strip()
|
|
38
|
+
return s or None
|
|
39
|
+
raise ValueError(f"payload field {key!r} must be string or null")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _optional_dict(d: dict[str, Any], key: str) -> dict[str, Any]:
    """Return the dict stored under ``key``, or an empty dict when absent/null.

    The stored dict is returned as-is (not copied).

    Raises:
        ValueError: if the value is present but is not a dict.
    """
    value = d.get(key)
    if isinstance(value, dict):
        return value
    if value is not None:
        raise ValueError(f"payload field {key!r} must be object")
    return {}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _parse_uuid(s: str) -> uuid.UUID:
    """Parse ``s`` (surrounding whitespace ignored) into a ``uuid.UUID``.

    Raises:
        ValueError: if the text is not a valid UUID; the message carries the
            original input and the underlying error is chained as the cause.
    """
    try:
        parsed = uuid.UUID(str(s).strip())
    except ValueError as exc:
        raise ValueError(f"invalid uuid: {s!r}") from exc
    else:
        return parsed
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass(frozen=True)
class CommitResult:
    """Outcome of committing a single staged proposal."""

    # Id of the staged proposal that was committed.
    proposal_id: uuid.UUID
    # Which kind of canonical record the proposal produced.
    record_kind: Literal["entity", "relation"]
    # Id of the entity or relation row that was written (or, for an entity
    # matched by canonical key, updated).
    record_id: uuid.UUID
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass(frozen=True)
class CommitSummary:
    """Aggregate result returned by `commit_proposals` for one batch."""

    # One entry per proposal committed by this call.
    committed: tuple[CommitResult, ...]
    # Ids of proposals that were already `approved` and therefore left untouched.
    skipped_already_approved: tuple[uuid.UUID, ...]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def _mark_proposal_approved(conn: asyncpg.Connection, proposal_id: uuid.UUID) -> None:
    """Set one staged proposal to ``approved`` and clear its resolution note."""
    await conn.execute(
        """
        UPDATE riff_kg.staged_proposals
        SET status = 'approved', resolution_note = NULL
        WHERE id = $1
        """,
        proposal_id,
    )


async def _commit_entity_payload(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    scope_id: str,
    payload: dict[str, Any],
) -> uuid.UUID:
    """Validate an entity payload and upsert it; return the entity id."""
    entity_type = _require_str(payload, "entity_type")
    if entity_type not in cfg.entity_types:
        raise ValueError(f"entity_type {entity_type!r} not allowed by KgConfig")
    label = _require_str(payload, "label")
    can_key = _optional_str(payload, "canonical_key")
    attrs = json.dumps(_optional_dict(payload, "attrs"))

    if can_key:
        # A canonical key identifies an entity within (scope, type): update the
        # existing row in place instead of inserting a duplicate.
        existing = await conn.fetchrow(
            """
            SELECT id FROM riff_kg.entities
            WHERE scope_id = $1 AND entity_type = $2 AND canonical_key = $3
            """,
            scope_id,
            entity_type,
            can_key,
        )
        if existing:
            entity_id = existing["id"]
            # ``attrs || $2`` merges the new keys over the stored attributes.
            await conn.execute(
                """
                UPDATE riff_kg.entities
                SET label = $1, attrs = attrs || $2::jsonb
                WHERE id = $3
                """,
                label,
                attrs,
                entity_id,
            )
            return entity_id

    # ``can_key`` is None when no canonical key was supplied, which is sent as
    # SQL NULL, so one INSERT covers both the keyed and the un-keyed case.
    return await conn.fetchval(
        """
        INSERT INTO riff_kg.entities (
            scope_id, entity_type, label, canonical_key, attrs
        )
        VALUES ($1, $2, $3, $4, $5::jsonb)
        RETURNING id
        """,
        scope_id,
        entity_type,
        label,
        can_key,
        attrs,
    )


async def _commit_relation_payload(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    scope_id: str,
    payload: dict[str, Any],
    row: Any,
    committed_by: str | None,
) -> uuid.UUID:
    """Validate a relation payload and insert it with evidence; return its id."""
    rel_type = _require_str(payload, "relation_type")
    if rel_type not in cfg.relation_types:
        raise ValueError(f"relation_type {rel_type!r} not allowed by KgConfig")
    src = _parse_uuid(_require_str(payload, "src_entity_id"))
    dst = _parse_uuid(_require_str(payload, "dst_entity_id"))
    rattrs = json.dumps(_optional_dict(payload, "attrs"))

    return await conn.fetchval(
        """
        INSERT INTO riff_kg.relations (
            scope_id, src_entity_id, dst_entity_id, relation_type, attrs,
            evidence_segment_id, evidence_char_start, evidence_char_end,
            staged_proposal_id, extraction_run_id, committed_by
        )
        VALUES (
            $1, $2, $3, $4, $5::jsonb,
            $6, $7, $8, $9, $10, $11
        )
        RETURNING id
        """,
        scope_id,
        src,
        dst,
        rel_type,
        rattrs,
        row["segment_id"],
        row["char_start"],
        row["char_end"],
        row["id"],
        row["extraction_run_id"],
        committed_by,
    )


async def commit_proposals(
    conn: asyncpg.Connection,
    cfg: KgConfig,
    proposal_ids: list[uuid.UUID],
    *,
    committed_by: str | None = None,
) -> CommitSummary:
    """Commit pending proposals: entities first, then relations.

    Idempotent: proposals whose status is already ``approved`` are skipped and
    reported in ``skipped_already_approved``. Duplicate ids in ``proposal_ids``
    are processed once. All work happens in a single transaction, so any
    validation failure rolls back the whole batch.

    The proposal rows are read inside the transaction with ``FOR UPDATE`` so
    that a concurrent commit or reject of the same proposal cannot slip in
    between the status check and the write.

    Args:
        conn: asyncpg connection to run on.
        cfg: supplies the allowed ``entity_types`` / ``relation_types``.
        proposal_ids: ids of staged proposals to commit.
        committed_by: stored on inserted relation rows only; the entity
            statements here do not write it.

    Returns:
        A ``CommitSummary`` listing committed and skipped proposals.

    Raises:
        ValueError: for unknown ids, a status other than ``pending_review`` /
            ``approved``, an empty or unsupported ``proposal_kind``, or an
            invalid payload.
        TypeError: if a payload is neither a dict nor a JSON string.
    """
    if not proposal_ids:
        return CommitSummary(committed=(), skipped_already_approved=())

    uniq = list(dict.fromkeys(proposal_ids))
    committed: list[CommitResult] = []
    skipped: list[uuid.UUID] = []

    async with conn.transaction():
        # Lock only the proposal rows (``OF sp``); ORDER BY keeps the lock
        # acquisition order stable across concurrent callers.
        rows = await conn.fetch(
            """
            SELECT sp.id, sp.proposal_kind, sp.payload, sp.segment_id, sp.char_start, sp.char_end,
                   sp.status AS proposal_status, s.scope_id, sp.extraction_run_id
            FROM riff_kg.staged_proposals sp
            JOIN riff_kg.signal_segments seg ON seg.id = sp.segment_id
            JOIN riff_kg.signals s ON s.id = seg.signal_id
            WHERE sp.id = ANY($1::uuid[])
            ORDER BY sp.id
            FOR UPDATE OF sp
            """,
            uniq,
        )
        found = {r["id"] for r in rows}
        missing = [pid for pid in uniq if pid not in found]
        if missing:
            raise ValueError(f"unknown or inaccessible proposal ids: {missing}")

        # Entities sort before everything else so they are committed before any
        # relation in the same batch; the id makes the order deterministic.
        ordered = sorted(
            rows,
            key=lambda r: (0 if str(r["proposal_kind"]).strip() == "entity" else 1, str(r["id"])),
        )

        for row in ordered:
            pid: uuid.UUID = row["id"]
            st = row["proposal_status"]
            if st == "approved":
                skipped.append(pid)
                continue
            if st != "pending_review":
                raise ValueError(f"proposal {pid} is not pending_review ({st})")

            scope_raw = row["scope_id"]
            scope_id = scope_raw if isinstance(scope_raw, str) else str(scope_raw or "")
            pk = row["proposal_kind"]
            kind = str(pk).strip() if pk is not None else ""
            if not kind:
                raise ValueError(f"proposal {pid} has empty proposal_kind")
            payload = _payload_dict(row["payload"])

            if kind == "entity":
                record_id = await _commit_entity_payload(conn, cfg, scope_id, payload)
            elif kind == "relation":
                record_id = await _commit_relation_payload(
                    conn, cfg, scope_id, payload, row, committed_by
                )
            else:
                raise ValueError(f"unsupported proposal_kind for commit: {kind!r}")

            await _mark_proposal_approved(conn, pid)
            committed.append(CommitResult(pid, kind, record_id))

    return CommitSummary(
        committed=tuple(committed),
        skipped_already_approved=tuple(skipped),
    )
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
async def reject_proposals(
    conn: asyncpg.Connection,
    proposal_ids: list[uuid.UUID],
    *,
    reason: str,
) -> int:
    """Reject the given proposals that are still ``pending_review``.

    Duplicate ids are collapsed and ``reason`` is truncated to 10,000
    characters before being stored as the resolution note. Proposals in any
    other status are left untouched.

    Returns:
        The number of rows updated (0 for an empty ``proposal_ids``).
    """
    if not proposal_ids:
        return 0

    unique_ids = list(dict.fromkeys(proposal_ids))
    note = reason[:10_000]
    command_tag = await conn.execute(
        """
        UPDATE riff_kg.staged_proposals
        SET status = 'rejected', resolution_note = $2
        WHERE id = ANY($1::uuid[]) AND status = 'pending_review'
        """,
        unique_ids,
        note,
    )
    # The command tag looks like "UPDATE N"; anything else counts as zero rows.
    if not command_tag.startswith("UPDATE "):
        return 0
    return int(command_tag.split()[-1])
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Runtime configuration: chunking, embeddings, closed-world type registry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class KgConfig(BaseModel):
    """Kit behavior for one Riff app deployment. Load from JSON/YAML or env."""

    # Unknown keys are rejected rather than silently ignored.
    model_config = {"extra": "forbid"}

    # --- embeddings ---------------------------------------------------------
    embedding_dimension: int = Field(default=768, ge=32, le=4096)

    # --- chunking -----------------------------------------------------------
    chunk_max_chars: int = Field(default=8000, ge=500)
    chunk_overlap_chars: int = Field(default=400, ge=0)
    chunk_if_longer_than_chars: int = Field(
        default=12000,
        ge=1000,
        description="Below this size, keep a single segment (still stored as segment index 0).",
    )

    # --- closed-world type registry, checked when proposals are committed ---
    entity_types: tuple[str, ...] = Field(
        default=("Entity",),
        description="Allowed entity type labels at commit validation.",
    )
    relation_types: tuple[str, ...] = Field(
        default=("related_to",),
        description="Allowed relation predicates at commit validation.",
    )

    # --- extraction profile -------------------------------------------------
    profile_uri: str | None = Field(
        default=None,
        description="Optional storage key or path to extraction JSON schema profile.",
    )

    def effective_chunking(self) -> dict[str, int]:
        """Return the three chunking settings as a plain dict keyed by field name."""
        names = ("chunk_max_chars", "chunk_overlap_chars", "chunk_if_longer_than_chars")
        return {name: getattr(self, name) for name in names}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Pluggable text embedding (Phase 2)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural interface for objects that embed segment bodies.

    The length of every returned vector must match
    `KgConfig.embedding_dimension`.
    """

    async def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return the resulting vectors."""
        ...
|