kglite-docs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kglite_docs-0.0.1/.gitignore +42 -0
- kglite_docs-0.0.1/LICENSE +21 -0
- kglite_docs-0.0.1/PKG-INFO +135 -0
- kglite_docs-0.0.1/README.md +87 -0
- kglite_docs-0.0.1/pyproject.toml +113 -0
- kglite_docs-0.0.1/src/kglite_docs/__init__.py +62 -0
- kglite_docs-0.0.1/src/kglite_docs/activity.py +146 -0
- kglite_docs-0.0.1/src/kglite_docs/agents.py +98 -0
- kglite_docs-0.0.1/src/kglite_docs/cli.py +291 -0
- kglite_docs-0.0.1/src/kglite_docs/cluster.py +277 -0
- kglite_docs-0.0.1/src/kglite_docs/context.py +81 -0
- kglite_docs-0.0.1/src/kglite_docs/corpus.py +754 -0
- kglite_docs-0.0.1/src/kglite_docs/embed.py +37 -0
- kglite_docs-0.0.1/src/kglite_docs/enrich.py +343 -0
- kglite_docs-0.0.1/src/kglite_docs/errors.py +65 -0
- kglite_docs-0.0.1/src/kglite_docs/export.py +334 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/__init__.py +12 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/chunker.py +242 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/formats.py +242 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/hashing.py +55 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/parser.py +76 -0
- kglite_docs-0.0.1/src/kglite_docs/ingest/pipeline.py +272 -0
- kglite_docs-0.0.1/src/kglite_docs/mcp_server/__init__.py +2 -0
- kglite_docs-0.0.1/src/kglite_docs/mcp_server/__main__.py +37 -0
- kglite_docs-0.0.1/src/kglite_docs/mcp_server/server.py +42 -0
- kglite_docs-0.0.1/src/kglite_docs/mcp_server/tools.py +432 -0
- kglite_docs-0.0.1/src/kglite_docs/ocr.py +244 -0
- kglite_docs-0.0.1/src/kglite_docs/quality.py +212 -0
- kglite_docs-0.0.1/src/kglite_docs/review.py +511 -0
- kglite_docs-0.0.1/src/kglite_docs/schema.py +93 -0
- kglite_docs-0.0.1/src/kglite_docs/store.py +215 -0
- kglite_docs-0.0.1/src/kglite_docs/tagging.py +161 -0
- kglite_docs-0.0.1/src/kglite_docs/translate.py +157 -0
- kglite_docs-0.0.1/src/kglite_docs/types.py +306 -0
- kglite_docs-0.0.1/tests/__init__.py +0 -0
- kglite_docs-0.0.1/tests/conftest.py +67 -0
- kglite_docs-0.0.1/tests/test_chunker.py +73 -0
- kglite_docs-0.0.1/tests/test_cluster.py +47 -0
- kglite_docs-0.0.1/tests/test_enrich.py +62 -0
- kglite_docs-0.0.1/tests/test_export.py +55 -0
- kglite_docs-0.0.1/tests/test_formats.py +72 -0
- kglite_docs-0.0.1/tests/test_hashing.py +43 -0
- kglite_docs-0.0.1/tests/test_ingest.py +108 -0
- kglite_docs-0.0.1/tests/test_mcp_smoke.py +121 -0
- kglite_docs-0.0.1/tests/test_ocr_flow.py +226 -0
- kglite_docs-0.0.1/tests/test_quality.py +51 -0
- kglite_docs-0.0.1/tests/test_review.py +207 -0
- kglite_docs-0.0.1/tests/test_search_and_compose.py +60 -0
- kglite_docs-0.0.1/tests/test_tagging_and_activity.py +67 -0
- kglite_docs-0.0.1/tests/test_translate.py +47 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info/
|
|
8
|
+
.eggs/
|
|
9
|
+
|
|
10
|
+
# Virtual environments
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
|
|
14
|
+
# IDE
|
|
15
|
+
.vscode/
|
|
16
|
+
.idea/
|
|
17
|
+
*.swp
|
|
18
|
+
|
|
19
|
+
# Testing / coverage
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
.mypy_cache/
|
|
22
|
+
.ruff_cache/
|
|
23
|
+
htmlcov/
|
|
24
|
+
.coverage
|
|
25
|
+
coverage.xml
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Local artifacts
|
|
29
|
+
*.kgl
|
|
30
|
+
!sample_data/*.kgl
|
|
31
|
+
tmp/
|
|
32
|
+
.DS_Store
|
|
33
|
+
|
|
34
|
+
# Built docs
|
|
35
|
+
site/
|
|
36
|
+
|
|
37
|
+
# Sample PDFs (downloadable via scripts/fetch_sample_pdfs.sh — too big for the repo)
|
|
38
|
+
sample_data/pdfs/
|
|
39
|
+
sample_data/subset/
|
|
40
|
+
demo.kgl
|
|
41
|
+
synthesis_*.md
|
|
42
|
+
factcheck_*.json
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kristian dF Kollsgård
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kglite-docs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Agent-first PDF knowledge base — chunk, embed, cluster, enrich, and serve over MCP. Built on kglite + bge-m3.
|
|
5
|
+
Project-URL: Homepage, https://github.com/kkollsga/kglite-docs
|
|
6
|
+
Project-URL: Repository, https://github.com/kkollsga/kglite-docs
|
|
7
|
+
Project-URL: Issues, https://github.com/kkollsga/kglite-docs/issues
|
|
8
|
+
Author-email: Kristian dF Kollsgård <kkollsg@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: bge-m3,embeddings,kglite,knowledge-graph,mcp,pdf,rag
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
21
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: huggingface-hub>=0.26
|
|
24
|
+
Requires-Dist: kglite>=0.10.4
|
|
25
|
+
Requires-Dist: markdownify>=0.13
|
|
26
|
+
Requires-Dist: numpy>=1.26
|
|
27
|
+
Requires-Dist: onnxruntime>=1.18
|
|
28
|
+
Requires-Dist: pandas>=2.0
|
|
29
|
+
Requires-Dist: pymupdf4llm>=0.0.17
|
|
30
|
+
Requires-Dist: pymupdf>=1.24
|
|
31
|
+
Requires-Dist: python-docx>=1.0
|
|
32
|
+
Requires-Dist: python-pptx>=0.6
|
|
33
|
+
Requires-Dist: reportlab>=4.0
|
|
34
|
+
Requires-Dist: tokenizers>=0.20
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: mypy>=1.13; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: ruff>=0.7; extra == 'dev'
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
42
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
43
|
+
Requires-Dist: mkdocstrings[python]>=0.27; extra == 'docs'
|
|
44
|
+
Provides-Extra: mcp
|
|
45
|
+
Requires-Dist: mcp-methods>=0.3; extra == 'mcp'
|
|
46
|
+
Requires-Dist: mcp>=1.0; extra == 'mcp'
|
|
47
|
+
Description-Content-Type: text/markdown
|
|
48
|
+
|
|
49
|
+
# kglite-docs
|
|
50
|
+
|
|
51
|
+
> **Agent-first knowledge base for documents.** Ingest PDFs, Office files, Markdown, HTML, or images; chunk + embed them with [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3); cluster, tag, summarise, fact-check, translate, and review them — and serve the whole thing to AI agents over MCP.
|
|
52
|
+
|
|
53
|
+
[![PyPI version](https://img.shields.io/pypi/v/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
|
|
54
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
|
|
55
|
+
[![Documentation](https://readthedocs.org/projects/kglite-docs/badge/?version=latest)](https://kglite-docs.readthedocs.io/)
|
|
56
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
57
|
+
|
|
58
|
+
Built on [`kglite`](https://github.com/kkollsga/kglite) (storage + vector search + clustering) and [`mcp-methods`](https://github.com/kkollsga/mcp-methods) (MCP framework).
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Why this and not generic RAG?
|
|
63
|
+
|
|
64
|
+
Most "RAG libraries" hand the agent `search(query) → list[chunk]` and stop. kglite-docs treats the corpus as a *living* knowledge graph that records who did what — and gives the agent typed tools to act on it.
|
|
65
|
+
|
|
66
|
+
- 📄 **Multi-format ingest** — PDF, DOCX, PPTX, MD, HTML, TXT, images. All flow into the same `Document → Page → Chunk` shape.
|
|
67
|
+
- 🤝 **Agents are first-class nodes** — their views, tags, summaries, verifications, and reviews are all queryable.
|
|
68
|
+
- ✅ **Cross-checked summaries** — one agent writes, a *different* agent verifies. Self-verification is rejected server-side.
|
|
69
|
+
- 📋 **Review kanban** — chunks move through `new → in_review → reviewed` with an immutable audit trail.
|
|
70
|
+
- 🛡️ **Grounding checks** — score how well an agent's summary aligns with its sources. Catch hallucinations before they ship.
|
|
71
|
+
- 🌍 **Translations** — per-chunk, multi-translator, with author/reviewer provenance.
|
|
72
|
+
- 🖼️ **Agent-driven OCR** — scanned pages handed back as rendered PNGs; agent transcribes and the graph absorbs the result.
|
|
73
|
+
|
|
74
|
+
## Install
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install "kglite-docs[mcp]"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## 30 seconds of Python
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from kglite_docs import Corpus
|
|
84
|
+
|
|
85
|
+
with Corpus.create("kb.kgl") as corpus: # auto-saves on exit
|
|
86
|
+
corpus.ingest_dir("./papers") # PDF / DOCX / PPTX / MD / HTML / images
|
|
87
|
+
hits = corpus.search("transformer attention", top_k=5, agent_id="me")
|
|
88
|
+
ctx = corpus.compose_context("transformer attention", max_tokens=3000)
|
|
89
|
+
# ctx["items"] is a ranked, token-budgeted bundle ready for your LLM prompt
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## 30 seconds of agent loop
|
|
93
|
+
|
|
94
|
+
Cross-checked enrichment in five lines:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
sid = corpus.add_summary(
|
|
98
|
+
target_id=hits[0]["id"], text="DPR uses a dual BERT encoder…",
|
|
99
|
+
agent_id="writer", model="opus-4.7",
|
|
100
|
+
)
|
|
101
|
+
# A different agent verifies — self-verification is rejected
|
|
102
|
+
corpus.verify_summary(sid, verdict="verified",
|
|
103
|
+
verifier_agent_id="reviewer", notes="checked p.5")
|
|
104
|
+
# Score how grounded the summary is in its source chunks
|
|
105
|
+
print(corpus.check_grounding(sid)["supported_fraction"]) # → 1.0
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Run it as an MCP server
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
kglite-docs-mcp --db kb.kgl
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Register with Claude Code:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
claude mcp add kglite-docs -- kglite-docs-mcp --db /abs/path/kb.kgl
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The agent now sees ~30 typed tools (`search`, `compose_context`, `add_summary`, `verify_summary`, `tag_chunk`, `cluster_chunks`, `claim_next_review`, …) plus `cypher_query` as an escape hatch.
|
|
121
|
+
|
|
122
|
+
## Read the docs
|
|
123
|
+
|
|
124
|
+
📖 **Full documentation at [kglite-docs.readthedocs.io](https://kglite-docs.readthedocs.io/)**
|
|
125
|
+
|
|
126
|
+
- [Getting started](https://kglite-docs.readthedocs.io/en/latest/getting-started/) — 10 minutes from `pip install` to a running agent
|
|
127
|
+
- [Agent workflows](https://kglite-docs.readthedocs.io/en/latest/workflows/) — research, comparison, fact-checking, OCR loops, hallucination guards
|
|
128
|
+
- [Architecture](https://kglite-docs.readthedocs.io/en/latest/architecture/) — graph model, design rationale, the 30+ typed MCP tools
|
|
129
|
+
- [API reference](https://kglite-docs.readthedocs.io/en/latest/api/corpus/) — every method, every argument, IDE-friendly type stubs
|
|
130
|
+
- [Troubleshooting](https://kglite-docs.readthedocs.io/en/latest/troubleshooting/) — common failure modes
|
|
131
|
+
- [Changelog](https://kglite-docs.readthedocs.io/en/latest/changelog/)
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
MIT.
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# kglite-docs
|
|
2
|
+
|
|
3
|
+
> **Agent-first knowledge base for documents.** Ingest PDFs, Office files, Markdown, HTML, or images; chunk + embed them with [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3); cluster, tag, summarise, fact-check, translate, and review them — and serve the whole thing to AI agents over MCP.
|
|
4
|
+
|
|
5
|
+
[![PyPI version](https://img.shields.io/pypi/v/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
|
|
6
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
|
|
7
|
+
[![Documentation](https://readthedocs.org/projects/kglite-docs/badge/?version=latest)](https://kglite-docs.readthedocs.io/)
|
|
8
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
9
|
+
|
|
10
|
+
Built on [`kglite`](https://github.com/kkollsga/kglite) (storage + vector search + clustering) and [`mcp-methods`](https://github.com/kkollsga/mcp-methods) (MCP framework).
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Why this and not generic RAG?
|
|
15
|
+
|
|
16
|
+
Most "RAG libraries" hand the agent `search(query) → list[chunk]` and stop. kglite-docs treats the corpus as a *living* knowledge graph that records who did what — and gives the agent typed tools to act on it.
|
|
17
|
+
|
|
18
|
+
- 📄 **Multi-format ingest** — PDF, DOCX, PPTX, MD, HTML, TXT, images. All flow into the same `Document → Page → Chunk` shape.
|
|
19
|
+
- 🤝 **Agents are first-class nodes** — their views, tags, summaries, verifications, and reviews are all queryable.
|
|
20
|
+
- ✅ **Cross-checked summaries** — one agent writes, a *different* agent verifies. Self-verification is rejected server-side.
|
|
21
|
+
- 📋 **Review kanban** — chunks move through `new → in_review → reviewed` with an immutable audit trail.
|
|
22
|
+
- 🛡️ **Grounding checks** — score how well an agent's summary aligns with its sources. Catch hallucinations before they ship.
|
|
23
|
+
- 🌍 **Translations** — per-chunk, multi-translator, with author/reviewer provenance.
|
|
24
|
+
- 🖼️ **Agent-driven OCR** — scanned pages handed back as rendered PNGs; agent transcribes and the graph absorbs the result.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install "kglite-docs[mcp]"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## 30 seconds of Python
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from kglite_docs import Corpus
|
|
36
|
+
|
|
37
|
+
with Corpus.create("kb.kgl") as corpus: # auto-saves on exit
|
|
38
|
+
corpus.ingest_dir("./papers") # PDF / DOCX / PPTX / MD / HTML / images
|
|
39
|
+
hits = corpus.search("transformer attention", top_k=5, agent_id="me")
|
|
40
|
+
ctx = corpus.compose_context("transformer attention", max_tokens=3000)
|
|
41
|
+
# ctx["items"] is a ranked, token-budgeted bundle ready for your LLM prompt
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## 30 seconds of agent loop
|
|
45
|
+
|
|
46
|
+
Cross-checked enrichment in five lines:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
sid = corpus.add_summary(
|
|
50
|
+
target_id=hits[0]["id"], text="DPR uses a dual BERT encoder…",
|
|
51
|
+
agent_id="writer", model="opus-4.7",
|
|
52
|
+
)
|
|
53
|
+
# A different agent verifies — self-verification is rejected
|
|
54
|
+
corpus.verify_summary(sid, verdict="verified",
|
|
55
|
+
verifier_agent_id="reviewer", notes="checked p.5")
|
|
56
|
+
# Score how grounded the summary is in its source chunks
|
|
57
|
+
print(corpus.check_grounding(sid)["supported_fraction"]) # → 1.0
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Run it as an MCP server
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
kglite-docs-mcp --db kb.kgl
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Register with Claude Code:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
claude mcp add kglite-docs -- kglite-docs-mcp --db /abs/path/kb.kgl
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
The agent now sees ~30 typed tools (`search`, `compose_context`, `add_summary`, `verify_summary`, `tag_chunk`, `cluster_chunks`, `claim_next_review`, …) plus `cypher_query` as an escape hatch.
|
|
73
|
+
|
|
74
|
+
## Read the docs
|
|
75
|
+
|
|
76
|
+
📖 **Full documentation at [kglite-docs.readthedocs.io](https://kglite-docs.readthedocs.io/)**
|
|
77
|
+
|
|
78
|
+
- [Getting started](https://kglite-docs.readthedocs.io/en/latest/getting-started/) — 10 minutes from `pip install` to a running agent
|
|
79
|
+
- [Agent workflows](https://kglite-docs.readthedocs.io/en/latest/workflows/) — research, comparison, fact-checking, OCR loops, hallucination guards
|
|
80
|
+
- [Architecture](https://kglite-docs.readthedocs.io/en/latest/architecture/) — graph model, design rationale, the 30+ typed MCP tools
|
|
81
|
+
- [API reference](https://kglite-docs.readthedocs.io/en/latest/api/corpus/) — every method, every argument, IDE-friendly type stubs
|
|
82
|
+
- [Troubleshooting](https://kglite-docs.readthedocs.io/en/latest/troubleshooting/) — common failure modes
|
|
83
|
+
- [Changelog](https://kglite-docs.readthedocs.io/en/latest/changelog/)
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
MIT.
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "kglite-docs"
|
|
7
|
+
# Bump this and push to main — the release.yml workflow checks PyPI
|
|
8
|
+
# and publishes if the version is new. No git tag needed.
|
|
9
|
+
version = "0.0.1"
|
|
10
|
+
description = "Agent-first PDF knowledge base — chunk, embed, cluster, enrich, and serve over MCP. Built on kglite + bge-m3."
|
|
11
|
+
readme = "README.md"
|
|
12
|
+
license = "MIT"
|
|
13
|
+
license-files = ["LICENSE"]
|
|
14
|
+
authors = [
|
|
15
|
+
{ name = "Kristian dF Kollsgård", email = "kkollsg@gmail.com" },
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.10"
|
|
18
|
+
keywords = ["pdf", "rag", "embeddings", "knowledge-graph", "mcp", "kglite", "bge-m3"]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 3 - Alpha",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Programming Language :: Python :: 3.13",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
29
|
+
"Topic :: Text Processing :: Indexing",
|
|
30
|
+
]
|
|
31
|
+
dependencies = [
|
|
32
|
+
"kglite>=0.10.4",
|
|
33
|
+
"pymupdf4llm>=0.0.17",
|
|
34
|
+
"pymupdf>=1.24",
|
|
35
|
+
"tokenizers>=0.20",
|
|
36
|
+
"huggingface-hub>=0.26",
|
|
37
|
+
"onnxruntime>=1.18",
|
|
38
|
+
"numpy>=1.26",
|
|
39
|
+
"pandas>=2.0",
|
|
40
|
+
"python-docx>=1.0",
|
|
41
|
+
"python-pptx>=0.6",
|
|
42
|
+
"markdownify>=0.13",
|
|
43
|
+
"reportlab>=4.0",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.optional-dependencies]
|
|
47
|
+
mcp = [
|
|
48
|
+
"mcp>=1.0",
|
|
49
|
+
"mcp-methods>=0.3",
|
|
50
|
+
]
|
|
51
|
+
dev = [
|
|
52
|
+
"pytest>=8.0",
|
|
53
|
+
"pytest-cov>=5.0",
|
|
54
|
+
"ruff>=0.7",
|
|
55
|
+
"mypy>=1.13",
|
|
56
|
+
]
|
|
57
|
+
docs = [
|
|
58
|
+
"mkdocs>=1.6",
|
|
59
|
+
"mkdocs-material>=9.5",
|
|
60
|
+
"mkdocstrings[python]>=0.27",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
[project.scripts]
|
|
64
|
+
kglite-docs = "kglite_docs.cli:main"
|
|
65
|
+
kglite-docs-mcp = "kglite_docs.mcp_server.__main__:main"
|
|
66
|
+
|
|
67
|
+
[project.urls]
|
|
68
|
+
Homepage = "https://github.com/kkollsga/kglite-docs"
|
|
69
|
+
Repository = "https://github.com/kkollsga/kglite-docs"
|
|
70
|
+
Issues = "https://github.com/kkollsga/kglite-docs/issues"
|
|
71
|
+
|
|
72
|
+
[tool.hatch.build.targets.wheel]
|
|
73
|
+
packages = ["src/kglite_docs"]
|
|
74
|
+
|
|
75
|
+
[tool.hatch.build.targets.sdist]
|
|
76
|
+
include = [
|
|
77
|
+
"src/kglite_docs",
|
|
78
|
+
"tests",
|
|
79
|
+
"README.md",
|
|
80
|
+
"LICENSE",
|
|
81
|
+
"pyproject.toml",
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
[tool.ruff]
|
|
85
|
+
line-length = 100
|
|
86
|
+
target-version = "py310"
|
|
87
|
+
|
|
88
|
+
[tool.ruff.lint]
|
|
89
|
+
select = ["E", "F", "I", "B", "UP", "W", "N", "SIM"]
|
|
90
|
+
ignore = ["E501"]
|
|
91
|
+
|
|
92
|
+
[tool.ruff.lint.per-file-ignores]
|
|
93
|
+
# Scientific code uses `X` for the feature matrix (sklearn convention)
|
|
94
|
+
"src/kglite_docs/cluster.py" = ["N803", "N806"]
|
|
95
|
+
# Public exception API — `ReviewConflict` reads better than
|
|
96
|
+
# `ReviewConflictError` and is already exported through __init__.
|
|
97
|
+
"src/kglite_docs/errors.py" = ["N818"]
|
|
98
|
+
# Tests are allowed semicolons for compact setup lines.
|
|
99
|
+
"tests/*" = ["E702"]
|
|
100
|
+
|
|
101
|
+
[tool.mypy]
|
|
102
|
+
python_version = "3.10"
|
|
103
|
+
strict = true
|
|
104
|
+
warn_unused_ignores = true
|
|
105
|
+
exclude = ["tests/", "build/", "dist/"]
|
|
106
|
+
|
|
107
|
+
[tool.pytest.ini_options]
|
|
108
|
+
testpaths = ["tests"]
|
|
109
|
+
addopts = "-ra -q"
|
|
110
|
+
markers = [
|
|
111
|
+
"embed: tests that require the real bge-m3 model (slow; skipped if model not available)",
|
|
112
|
+
"mcp: tests that exercise the MCP server surface",
|
|
113
|
+
]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""kglite-docs — agent-first PDF knowledge base on top of kglite."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations

from importlib.metadata import PackageNotFoundError as _PackageNotFoundError
from importlib.metadata import version as _pkg_version

from kglite_docs.corpus import Corpus
from kglite_docs.errors import (
    ConcurrencyError,
    GroundingError,
    IngestError,
    InvalidEnumError,
    KgliteDocsError,
    MissingSourceError,
    ReviewConflict,
    SelfVerificationError,
    UnsupportedFormatError,
)
from kglite_docs.schema import (
    AGENT,
    CHUNK,
    CHUNK_TEXT_EMB,
    CLUSTER,
    DOCUMENT,
    DOCUMENT_TITLE_EMB,
    NOTE,
    PAGE,
    SUMMARY,
    SUMMARY_TEXT_EMB,
    TAG,
    VIEW,
)

# Read the version from the installed distribution's metadata. Only the
# "distribution is not installed" case (e.g. running from a source checkout)
# falls back to the local placeholder; any other failure is left to propagate
# instead of being masked by a blanket `except Exception`.
try:
    __version__ = _pkg_version("kglite-docs")
except _PackageNotFoundError:  # pragma: no cover - not installed (e.g. running from source)
    __version__ = "0.0.0+local"

# Public API: schema label/embedding constants first, then the Corpus entry
# point and the exception types, then the version string.
__all__ = [
    "AGENT",
    "CHUNK",
    "CHUNK_TEXT_EMB",
    "CLUSTER",
    "DOCUMENT",
    "DOCUMENT_TITLE_EMB",
    "NOTE",
    "PAGE",
    "SUMMARY",
    "SUMMARY_TEXT_EMB",
    "TAG",
    "VIEW",
    "ConcurrencyError",
    "Corpus",
    "GroundingError",
    "IngestError",
    "InvalidEnumError",
    "KgliteDocsError",
    "MissingSourceError",
    "ReviewConflict",
    "SelfVerificationError",
    "UnsupportedFormatError",
    "__version__",
]
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Agent identity + view tracking.
|
|
2
|
+
|
|
3
|
+
Agents are lazily registered on their first mutation; views can be
|
|
4
|
+
recorded explicitly (with context) or implicitly (when `search` /
|
|
5
|
+
`get_chunk` are called with an `agent_id`).
|
|
6
|
+
|
|
7
|
+
Aggregate `view_count` + `last_viewed_at` on the Chunk is updated on
|
|
8
|
+
every recorded view — a cheap denormalisation so listings can sort by
|
|
9
|
+
attention without joining View nodes.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import uuid
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from kglite_docs.schema import (
|
|
19
|
+
AGENT,
|
|
20
|
+
AUTHORED,
|
|
21
|
+
CHUNK,
|
|
22
|
+
VIEW,
|
|
23
|
+
VIEWED,
|
|
24
|
+
)
|
|
25
|
+
from kglite_docs.store import Store
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _now() -> str:
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
from kglite_docs.store import rows as _df_dicts # noqa: E402
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def register_agent(
    store: Store, *, agent_id: str, kind: str = "llm", model: str = ""
) -> dict[str, Any]:
    """Ensure an Agent node exists for `agent_id` and stamp it as just seen.

    An agent that is already stored only has `last_seen` refreshed and
    `action_count` incremented; `kind` and `model` are written solely when
    the node is first created.

    Returns a dict with the agent `id`, a `created` flag (True only when the
    node was created by this call) and the `last_seen` timestamp written.
    """
    timestamp = _now()
    lookup = store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id", params={"id": agent_id}
    )
    is_new = not _df_dicts(lookup)
    if is_new:
        # First sighting: create the node with its counters initialised.
        fresh_agent = {
            "id": agent_id,
            "title": agent_id,
            "kind": kind,
            "model": model,
            "first_seen": timestamp,
            "last_seen": timestamp,
            "action_count": 1,
        }
        store.upsert_nodes(AGENT, [fresh_agent])
    else:
        # Known agent: touch the timestamp and count one more action.
        store.cypher(
            "MATCH (a:Agent {id: $id}) "
            "SET a.last_seen = $now, "
            "a.action_count = coalesce(a.action_count, 0) + 1",
            params={"id": agent_id, "now": timestamp},
        )
    return {"id": agent_id, "created": is_new, "last_seen": timestamp}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def list_agents(store: Store) -> list[dict[str, Any]]:
    """Return one row per Agent node, most recently seen first.

    Each row exposes `id`, `kind`, `model`, `first_seen`, `last_seen` and
    `actions` (the node's `action_count`).
    """
    query = (
        "MATCH (a:Agent) RETURN a.id AS id, a.kind AS kind, a.model AS model, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions "
        "ORDER BY a.last_seen DESC"
    )
    return _df_dicts(store.cypher(query))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def record_view(
    store: Store,
    *,
    agent_id: str,
    target_id: str,
    target_kind: str = CHUNK,
    context: str = "",
) -> dict[str, Any]:
    """Log that `agent_id` looked at `target_id`, registering the agent first.

    - Chunk targets get `view_count` incremented and `last_viewed_at`
      refreshed; no counters are touched here for other target kinds.
    - With an empty `context` nothing else is written and the result's
      `view_node` is `None` (pure visits stay out of the graph).
    - With a non-empty `context` a `View` node is stored (its title is the
      first 60 characters of the context) together with an AUTHORED edge
      from the agent; chunk targets additionally get an Agent → Chunk
      VIEWED edge carrying the timestamp and context.

    Returns `{"recorded": True, "view_node": <View id or None>}`.
    """
    register_agent(store, agent_id=agent_id)
    seen_at = _now()
    is_chunk = target_kind == CHUNK

    if is_chunk:
        # Denormalised attention counters kept on the chunk itself.
        store.cypher(
            "MATCH (c:Chunk {id: $id}) "
            "SET c.view_count = coalesce(c.view_count, 0) + 1, c.last_viewed_at = $now",
            params={"id": target_id, "now": seen_at},
        )

    view_id = None
    if context:
        view_id = str(uuid.uuid4())
        view_node = {
            "id": view_id,
            "title": context[:60],
            "agent_id": agent_id,
            "target_id": target_id,
            "target_kind": target_kind,
            "at": seen_at,
            "context": context,
        }
        store.upsert_nodes(VIEW, [view_node])
        store.upsert_edges(
            AUTHORED,
            [{"src": agent_id, "dst": view_id}],
            source_type=AGENT,
            target_type=VIEW,
        )
        if is_chunk:
            # Repeated VIEWED writes for the same pair are tolerated;
            # uniqueness is not required for this edge.
            viewed_edge = {
                "src": agent_id,
                "dst": target_id,
                "at": seen_at,
                "context": context,
            }
            store.upsert_edges(
                VIEWED,
                [viewed_edge],
                source_type=AGENT,
                target_type=CHUNK,
            )

    return {"recorded": True, "view_node": view_id}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def agent_activity(store: Store, agent_id: str, *, limit: int = 50) -> dict[str, Any]:
    """Return an agent's profile row plus its most recent views and summaries.

    Args:
        store: Graph store the Cypher queries are run against.
        agent_id: Id of the Agent node to report on.
        limit: Maximum number of views and of summaries to return. It is
            coerced with `int()` before being placed in the query text.

    Returns:
        A dict with the keys `agent`, `views`, `summaries` and `tags` on
        every path. `agent` is `None` (and the lists are empty) when no
        Agent node matches `agent_id`. `views` come from authored `View`
        nodes ordered by `at` descending, and `summaries` from authored
        `Summary` nodes ordered by `created_at` descending. `tags` is
        always an empty list: this function does not query tag activity,
        and the key is present only so both return paths share one shape.
    """
    profile = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id, a.kind AS kind, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions",
        params={"id": agent_id},
    ))
    if not profile:
        return {"agent": None, "views": [], "summaries": [], "tags": []}

    # `int()` guarantees only a number is interpolated into the query text.
    row_cap = int(limit)
    views = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(v:View) "
        "RETURN v.target_id AS target_id, v.target_kind AS target_kind, v.context AS context, v.at AS at "
        f"ORDER BY v.at DESC LIMIT {row_cap}",
        params={"id": agent_id},
    ))
    sums = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(s:Summary) "
        "RETURN s.id AS id, s.target_id AS target_id, s.text AS text, s.verification_status AS status "
        f"ORDER BY s.created_at DESC LIMIT {row_cap}",
        params={"id": agent_id},
    ))
    return {"agent": profile[0], "views": views, "summaries": sums, "tags": []}
|