kglite-docs 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. kglite_docs-0.0.1/.gitignore +42 -0
  2. kglite_docs-0.0.1/LICENSE +21 -0
  3. kglite_docs-0.0.1/PKG-INFO +135 -0
  4. kglite_docs-0.0.1/README.md +87 -0
  5. kglite_docs-0.0.1/pyproject.toml +113 -0
  6. kglite_docs-0.0.1/src/kglite_docs/__init__.py +62 -0
  7. kglite_docs-0.0.1/src/kglite_docs/activity.py +146 -0
  8. kglite_docs-0.0.1/src/kglite_docs/agents.py +98 -0
  9. kglite_docs-0.0.1/src/kglite_docs/cli.py +291 -0
  10. kglite_docs-0.0.1/src/kglite_docs/cluster.py +277 -0
  11. kglite_docs-0.0.1/src/kglite_docs/context.py +81 -0
  12. kglite_docs-0.0.1/src/kglite_docs/corpus.py +754 -0
  13. kglite_docs-0.0.1/src/kglite_docs/embed.py +37 -0
  14. kglite_docs-0.0.1/src/kglite_docs/enrich.py +343 -0
  15. kglite_docs-0.0.1/src/kglite_docs/errors.py +65 -0
  16. kglite_docs-0.0.1/src/kglite_docs/export.py +334 -0
  17. kglite_docs-0.0.1/src/kglite_docs/ingest/__init__.py +12 -0
  18. kglite_docs-0.0.1/src/kglite_docs/ingest/chunker.py +242 -0
  19. kglite_docs-0.0.1/src/kglite_docs/ingest/formats.py +242 -0
  20. kglite_docs-0.0.1/src/kglite_docs/ingest/hashing.py +55 -0
  21. kglite_docs-0.0.1/src/kglite_docs/ingest/parser.py +76 -0
  22. kglite_docs-0.0.1/src/kglite_docs/ingest/pipeline.py +272 -0
  23. kglite_docs-0.0.1/src/kglite_docs/mcp_server/__init__.py +2 -0
  24. kglite_docs-0.0.1/src/kglite_docs/mcp_server/__main__.py +37 -0
  25. kglite_docs-0.0.1/src/kglite_docs/mcp_server/server.py +42 -0
  26. kglite_docs-0.0.1/src/kglite_docs/mcp_server/tools.py +432 -0
  27. kglite_docs-0.0.1/src/kglite_docs/ocr.py +244 -0
  28. kglite_docs-0.0.1/src/kglite_docs/quality.py +212 -0
  29. kglite_docs-0.0.1/src/kglite_docs/review.py +511 -0
  30. kglite_docs-0.0.1/src/kglite_docs/schema.py +93 -0
  31. kglite_docs-0.0.1/src/kglite_docs/store.py +215 -0
  32. kglite_docs-0.0.1/src/kglite_docs/tagging.py +161 -0
  33. kglite_docs-0.0.1/src/kglite_docs/translate.py +157 -0
  34. kglite_docs-0.0.1/src/kglite_docs/types.py +306 -0
  35. kglite_docs-0.0.1/tests/__init__.py +0 -0
  36. kglite_docs-0.0.1/tests/conftest.py +67 -0
  37. kglite_docs-0.0.1/tests/test_chunker.py +73 -0
  38. kglite_docs-0.0.1/tests/test_cluster.py +47 -0
  39. kglite_docs-0.0.1/tests/test_enrich.py +62 -0
  40. kglite_docs-0.0.1/tests/test_export.py +55 -0
  41. kglite_docs-0.0.1/tests/test_formats.py +72 -0
  42. kglite_docs-0.0.1/tests/test_hashing.py +43 -0
  43. kglite_docs-0.0.1/tests/test_ingest.py +108 -0
  44. kglite_docs-0.0.1/tests/test_mcp_smoke.py +121 -0
  45. kglite_docs-0.0.1/tests/test_ocr_flow.py +226 -0
  46. kglite_docs-0.0.1/tests/test_quality.py +51 -0
  47. kglite_docs-0.0.1/tests/test_review.py +207 -0
  48. kglite_docs-0.0.1/tests/test_search_and_compose.py +60 -0
  49. kglite_docs-0.0.1/tests/test_tagging_and_activity.py +67 -0
  50. kglite_docs-0.0.1/tests/test_translate.py +47 -0
@@ -0,0 +1,42 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info/
8
+ .eggs/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+
14
+ # IDE
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+
19
+ # Testing / coverage
20
+ .pytest_cache/
21
+ .mypy_cache/
22
+ .ruff_cache/
23
+ htmlcov/
24
+ .coverage
25
+ coverage.xml
26
+
27
+
28
+ # Local artifacts
29
+ *.kgl
30
+ !sample_data/*.kgl
31
+ tmp/
32
+ .DS_Store
33
+
34
+ # Built docs
35
+ site/
36
+
37
+ # Sample PDFs (downloadable via scripts/fetch_sample_pdfs.sh — too big for the repo)
38
+ sample_data/pdfs/
39
+ sample_data/subset/
40
+ demo.kgl
41
+ synthesis_*.md
42
+ factcheck_*.json
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kristian dF Kollsgård
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,135 @@
1
+ Metadata-Version: 2.4
2
+ Name: kglite-docs
3
+ Version: 0.0.1
4
+ Summary: Agent-first PDF knowledge base — chunk, embed, cluster, enrich, and serve over MCP. Built on kglite + bge-m3.
5
+ Project-URL: Homepage, https://github.com/kkollsga/kglite-docs
6
+ Project-URL: Repository, https://github.com/kkollsga/kglite-docs
7
+ Project-URL: Issues, https://github.com/kkollsga/kglite-docs/issues
8
+ Author-email: Kristian dF Kollsgård <kkollsg@gmail.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: bge-m3,embeddings,kglite,knowledge-graph,mcp,pdf,rag
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
21
+ Classifier: Topic :: Text Processing :: Indexing
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: huggingface-hub>=0.26
24
+ Requires-Dist: kglite>=0.10.4
25
+ Requires-Dist: markdownify>=0.13
26
+ Requires-Dist: numpy>=1.26
27
+ Requires-Dist: onnxruntime>=1.18
28
+ Requires-Dist: pandas>=2.0
29
+ Requires-Dist: pymupdf4llm>=0.0.17
30
+ Requires-Dist: pymupdf>=1.24
31
+ Requires-Dist: python-docx>=1.0
32
+ Requires-Dist: python-pptx>=0.6
33
+ Requires-Dist: reportlab>=4.0
34
+ Requires-Dist: tokenizers>=0.20
35
+ Provides-Extra: dev
36
+ Requires-Dist: mypy>=1.13; extra == 'dev'
37
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
38
+ Requires-Dist: pytest>=8.0; extra == 'dev'
39
+ Requires-Dist: ruff>=0.7; extra == 'dev'
40
+ Provides-Extra: docs
41
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
42
+ Requires-Dist: mkdocs>=1.6; extra == 'docs'
43
+ Requires-Dist: mkdocstrings[python]>=0.27; extra == 'docs'
44
+ Provides-Extra: mcp
45
+ Requires-Dist: mcp-methods>=0.3; extra == 'mcp'
46
+ Requires-Dist: mcp>=1.0; extra == 'mcp'
47
+ Description-Content-Type: text/markdown
48
+
49
+ # kglite-docs
50
+
51
+ > **Agent-first knowledge base for documents.** Ingest PDFs, Office files, Markdown, HTML, or images; chunk + embed them with [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3); cluster, tag, summarise, fact-check, translate, and review them — and serve the whole thing to AI agents over MCP.
52
+
53
+ [![PyPI](https://img.shields.io/pypi/v/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
54
+ [![Python](https://img.shields.io/pypi/pyversions/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
55
+ [![Docs](https://readthedocs.org/projects/kglite-docs/badge/?version=latest)](https://kglite-docs.readthedocs.io/)
56
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
57
+
58
+ Built on [`kglite`](https://github.com/kkollsga/kglite) (storage + vector search + clustering) and [`mcp-methods`](https://github.com/kkollsga/mcp-methods) (MCP framework).
59
+
60
+ ---
61
+
62
+ ## Why this and not generic RAG?
63
+
64
+ Most "RAG libraries" hand the agent `search(query) → list[chunk]` and stop. kglite-docs treats the corpus as a *living* knowledge graph that records who did what — and gives the agent typed tools to act on it.
65
+
66
+ - 📄 **Multi-format ingest** — PDF, DOCX, PPTX, MD, HTML, TXT, images. All flow into the same `Document → Page → Chunk` shape.
67
+ - 🤝 **Agents are first-class nodes** — their views, tags, summaries, verifications, and reviews are all queryable.
68
+ - ✅ **Cross-checked summaries** — one agent writes, a *different* agent verifies. Self-verification is rejected server-side.
69
+ - 📋 **Review kanban** — chunks move through `new → in_review → reviewed` with an immutable audit trail.
70
+ - 🛡️ **Grounding checks** — score how well an agent's summary aligns with its sources. Catch hallucinations before they ship.
71
+ - 🌍 **Translations** — per-chunk, multi-translator, with author/reviewer provenance.
72
+ - 🖼️ **Agent-driven OCR** — scanned pages handed back as rendered PNGs; agent transcribes and the graph absorbs the result.
73
+
74
+ ## Install
75
+
76
+ ```bash
77
+ pip install "kglite-docs[mcp]"
78
+ ```
79
+
80
+ ## 30 seconds of Python
81
+
82
+ ```python
83
+ from kglite_docs import Corpus
84
+
85
+ with Corpus.create("kb.kgl") as corpus: # auto-saves on exit
86
+     corpus.ingest_dir("./papers") # PDF / DOCX / PPTX / MD / HTML / images
87
+     hits = corpus.search("transformer attention", top_k=5, agent_id="me")
88
+     ctx = corpus.compose_context("transformer attention", max_tokens=3000)
89
+     # ctx["items"] is a ranked, token-budgeted bundle ready for your LLM prompt
90
+ ```
91
+
92
+ ## 30 seconds of agent loop
93
+
94
+ Cross-checked enrichment in five lines:
95
+
96
+ ```python
97
+ sid = corpus.add_summary(
98
+ target_id=hits[0]["id"], text="DPR uses a dual BERT encoder…",
99
+ agent_id="writer", model="opus-4.7",
100
+ )
101
+ # A different agent verifies — self-verification is rejected
102
+ corpus.verify_summary(sid, verdict="verified",
103
+ verifier_agent_id="reviewer", notes="checked p.5")
104
+ # Score how grounded the summary is in its source chunks
105
+ print(corpus.check_grounding(sid)["supported_fraction"]) # → 1.0
106
+ ```
107
+
108
+ ## Run it as an MCP server
109
+
110
+ ```bash
111
+ kglite-docs-mcp --db kb.kgl
112
+ ```
113
+
114
+ Register with Claude Code:
115
+
116
+ ```bash
117
+ claude mcp add kglite-docs -- kglite-docs-mcp --db /abs/path/kb.kgl
118
+ ```
119
+
120
+ The agent now sees ~30 typed tools (`search`, `compose_context`, `add_summary`, `verify_summary`, `tag_chunk`, `cluster_chunks`, `claim_next_review`, …) plus `cypher_query` as an escape hatch.
121
+
122
+ ## Read the docs
123
+
124
+ 📖 **Full documentation at [kglite-docs.readthedocs.io](https://kglite-docs.readthedocs.io/)**
125
+
126
+ - [Getting started](https://kglite-docs.readthedocs.io/en/latest/getting-started/) — 10 minutes from `pip install` to a running agent
127
+ - [Agent workflows](https://kglite-docs.readthedocs.io/en/latest/workflows/) — research, comparison, fact-checking, OCR loops, hallucination guards
128
+ - [Architecture](https://kglite-docs.readthedocs.io/en/latest/architecture/) — graph model, design rationale, the 30+ typed MCP tools
129
+ - [API reference](https://kglite-docs.readthedocs.io/en/latest/api/corpus/) — every method, every argument, IDE-friendly type stubs
130
+ - [Troubleshooting](https://kglite-docs.readthedocs.io/en/latest/troubleshooting/) — common failure modes
131
+ - [Changelog](https://kglite-docs.readthedocs.io/en/latest/changelog/)
132
+
133
+ ## License
134
+
135
+ MIT.
@@ -0,0 +1,87 @@
1
+ # kglite-docs
2
+
3
+ > **Agent-first knowledge base for documents.** Ingest PDFs, Office files, Markdown, HTML, or images; chunk + embed them with [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3); cluster, tag, summarise, fact-check, translate, and review them — and serve the whole thing to AI agents over MCP.
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/kglite-docs.svg)](https://pypi.org/project/kglite-docs/)
7
+ [![Docs](https://readthedocs.org/projects/kglite-docs/badge/?version=latest)](https://kglite-docs.readthedocs.io/)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
9
+
10
+ Built on [`kglite`](https://github.com/kkollsga/kglite) (storage + vector search + clustering) and [`mcp-methods`](https://github.com/kkollsga/mcp-methods) (MCP framework).
11
+
12
+ ---
13
+
14
+ ## Why this and not generic RAG?
15
+
16
+ Most "RAG libraries" hand the agent `search(query) → list[chunk]` and stop. kglite-docs treats the corpus as a *living* knowledge graph that records who did what — and gives the agent typed tools to act on it.
17
+
18
+ - 📄 **Multi-format ingest** — PDF, DOCX, PPTX, MD, HTML, TXT, images. All flow into the same `Document → Page → Chunk` shape.
19
+ - 🤝 **Agents are first-class nodes** — their views, tags, summaries, verifications, and reviews are all queryable.
20
+ - ✅ **Cross-checked summaries** — one agent writes, a *different* agent verifies. Self-verification is rejected server-side.
21
+ - 📋 **Review kanban** — chunks move through `new → in_review → reviewed` with an immutable audit trail.
22
+ - 🛡️ **Grounding checks** — score how well an agent's summary aligns with its sources. Catch hallucinations before they ship.
23
+ - 🌍 **Translations** — per-chunk, multi-translator, with author/reviewer provenance.
24
+ - 🖼️ **Agent-driven OCR** — scanned pages handed back as rendered PNGs; agent transcribes and the graph absorbs the result.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install "kglite-docs[mcp]"
30
+ ```
31
+
32
+ ## 30 seconds of Python
33
+
34
+ ```python
35
+ from kglite_docs import Corpus
36
+
37
+ with Corpus.create("kb.kgl") as corpus: # auto-saves on exit
38
+     corpus.ingest_dir("./papers") # PDF / DOCX / PPTX / MD / HTML / images
39
+     hits = corpus.search("transformer attention", top_k=5, agent_id="me")
40
+     ctx = corpus.compose_context("transformer attention", max_tokens=3000)
41
+     # ctx["items"] is a ranked, token-budgeted bundle ready for your LLM prompt
42
+ ```
43
+
44
+ ## 30 seconds of agent loop
45
+
46
+ Cross-checked enrichment in five lines:
47
+
48
+ ```python
49
+ sid = corpus.add_summary(
50
+ target_id=hits[0]["id"], text="DPR uses a dual BERT encoder…",
51
+ agent_id="writer", model="opus-4.7",
52
+ )
53
+ # A different agent verifies — self-verification is rejected
54
+ corpus.verify_summary(sid, verdict="verified",
55
+ verifier_agent_id="reviewer", notes="checked p.5")
56
+ # Score how grounded the summary is in its source chunks
57
+ print(corpus.check_grounding(sid)["supported_fraction"]) # → 1.0
58
+ ```
59
+
60
+ ## Run it as an MCP server
61
+
62
+ ```bash
63
+ kglite-docs-mcp --db kb.kgl
64
+ ```
65
+
66
+ Register with Claude Code:
67
+
68
+ ```bash
69
+ claude mcp add kglite-docs -- kglite-docs-mcp --db /abs/path/kb.kgl
70
+ ```
71
+
72
+ The agent now sees ~30 typed tools (`search`, `compose_context`, `add_summary`, `verify_summary`, `tag_chunk`, `cluster_chunks`, `claim_next_review`, …) plus `cypher_query` as an escape hatch.
73
+
74
+ ## Read the docs
75
+
76
+ 📖 **Full documentation at [kglite-docs.readthedocs.io](https://kglite-docs.readthedocs.io/)**
77
+
78
+ - [Getting started](https://kglite-docs.readthedocs.io/en/latest/getting-started/) — 10 minutes from `pip install` to a running agent
79
+ - [Agent workflows](https://kglite-docs.readthedocs.io/en/latest/workflows/) — research, comparison, fact-checking, OCR loops, hallucination guards
80
+ - [Architecture](https://kglite-docs.readthedocs.io/en/latest/architecture/) — graph model, design rationale, the 30+ typed MCP tools
81
+ - [API reference](https://kglite-docs.readthedocs.io/en/latest/api/corpus/) — every method, every argument, IDE-friendly type stubs
82
+ - [Troubleshooting](https://kglite-docs.readthedocs.io/en/latest/troubleshooting/) — common failure modes
83
+ - [Changelog](https://kglite-docs.readthedocs.io/en/latest/changelog/)
84
+
85
+ ## License
86
+
87
+ MIT.
@@ -0,0 +1,113 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "kglite-docs"
7
+ # Bump this and push to main — the release.yml workflow checks PyPI
8
+ # and publishes if the version is new. No git tag needed.
9
+ version = "0.0.1"
10
+ description = "Agent-first PDF knowledge base — chunk, embed, cluster, enrich, and serve over MCP. Built on kglite + bge-m3."
11
+ readme = "README.md"
12
+ license = "MIT"
13
+ license-files = ["LICENSE"]
14
+ authors = [
15
+ { name = "Kristian dF Kollsgård", email = "kkollsg@gmail.com" },
16
+ ]
17
+ requires-python = ">=3.10"
18
+ keywords = ["pdf", "rag", "embeddings", "knowledge-graph", "mcp", "kglite", "bge-m3"]
19
+ classifiers = [
20
+ "Development Status :: 3 - Alpha",
21
+ "Intended Audience :: Developers",
22
+ "License :: OSI Approved :: MIT License",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Programming Language :: Python :: 3.13",
28
+ "Topic :: Scientific/Engineering :: Information Analysis",
29
+ "Topic :: Text Processing :: Indexing",
30
+ ]
31
+ dependencies = [
32
+ "kglite>=0.10.4",
33
+ "pymupdf4llm>=0.0.17",
34
+ "pymupdf>=1.24",
35
+ "tokenizers>=0.20",
36
+ "huggingface-hub>=0.26",
37
+ "onnxruntime>=1.18",
38
+ "numpy>=1.26",
39
+ "pandas>=2.0",
40
+ "python-docx>=1.0",
41
+ "python-pptx>=0.6",
42
+ "markdownify>=0.13",
43
+ "reportlab>=4.0",
44
+ ]
45
+
46
+ [project.optional-dependencies]
47
+ mcp = [
48
+ "mcp>=1.0",
49
+ "mcp-methods>=0.3",
50
+ ]
51
+ dev = [
52
+ "pytest>=8.0",
53
+ "pytest-cov>=5.0",
54
+ "ruff>=0.7",
55
+ "mypy>=1.13",
56
+ ]
57
+ docs = [
58
+ "mkdocs>=1.6",
59
+ "mkdocs-material>=9.5",
60
+ "mkdocstrings[python]>=0.27",
61
+ ]
62
+
63
+ [project.scripts]
64
+ kglite-docs = "kglite_docs.cli:main"
65
+ kglite-docs-mcp = "kglite_docs.mcp_server.__main__:main"
66
+
67
+ [project.urls]
68
+ Homepage = "https://github.com/kkollsga/kglite-docs"
69
+ Repository = "https://github.com/kkollsga/kglite-docs"
70
+ Issues = "https://github.com/kkollsga/kglite-docs/issues"
71
+
72
+ [tool.hatch.build.targets.wheel]
73
+ packages = ["src/kglite_docs"]
74
+
75
+ [tool.hatch.build.targets.sdist]
76
+ include = [
77
+ "src/kglite_docs",
78
+ "tests",
79
+ "README.md",
80
+ "LICENSE",
81
+ "pyproject.toml",
82
+ ]
83
+
84
+ [tool.ruff]
85
+ line-length = 100
86
+ target-version = "py310"
87
+
88
+ [tool.ruff.lint]
89
+ select = ["E", "F", "I", "B", "UP", "W", "N", "SIM"]
90
+ ignore = ["E501"]
91
+
92
+ [tool.ruff.lint.per-file-ignores]
93
+ # Scientific code uses `X` for the feature matrix (sklearn convention)
94
+ "src/kglite_docs/cluster.py" = ["N803", "N806"]
95
+ # Public exception API — `ReviewConflict` reads better than
96
+ # `ReviewConflictError` and is already exported through __init__.
97
+ "src/kglite_docs/errors.py" = ["N818"]
98
+ # Tests are allowed semicolons for compact setup lines.
99
+ "tests/*" = ["E702"]
100
+
101
+ [tool.mypy]
102
+ python_version = "3.10"
103
+ strict = true
104
+ warn_unused_ignores = true
105
+ exclude = ["tests/", "build/", "dist/"]
106
+
107
+ [tool.pytest.ini_options]
108
+ testpaths = ["tests"]
109
+ addopts = "-ra -q"
110
+ markers = [
111
+ "embed: tests that require the real bge-m3 model (slow; skipped if model not available)",
112
+ "mcp: tests that exercise the MCP server surface",
113
+ ]
@@ -0,0 +1,62 @@
1
+ """kglite-docs — agent-first PDF knowledge base on top of kglite."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from kglite_docs.corpus import Corpus
6
+ from kglite_docs.errors import (
7
+ ConcurrencyError,
8
+ GroundingError,
9
+ IngestError,
10
+ InvalidEnumError,
11
+ KgliteDocsError,
12
+ MissingSourceError,
13
+ ReviewConflict,
14
+ SelfVerificationError,
15
+ UnsupportedFormatError,
16
+ )
17
+ from kglite_docs.schema import (
18
+ AGENT,
19
+ CHUNK,
20
+ CHUNK_TEXT_EMB,
21
+ CLUSTER,
22
+ DOCUMENT,
23
+ DOCUMENT_TITLE_EMB,
24
+ NOTE,
25
+ PAGE,
26
+ SUMMARY,
27
+ SUMMARY_TEXT_EMB,
28
+ TAG,
29
+ VIEW,
30
+ )
31
+
32
+ try:
33
+ from importlib.metadata import version as _pkg_version
34
+ __version__ = _pkg_version("kglite-docs")
35
+ except Exception: # pragma: no cover - not installed (e.g. running from source)
36
+ __version__ = "0.0.0+local"
37
+
38
+ __all__ = [
39
+ "AGENT",
40
+ "CHUNK",
41
+ "CHUNK_TEXT_EMB",
42
+ "CLUSTER",
43
+ "DOCUMENT",
44
+ "DOCUMENT_TITLE_EMB",
45
+ "NOTE",
46
+ "PAGE",
47
+ "SUMMARY",
48
+ "SUMMARY_TEXT_EMB",
49
+ "TAG",
50
+ "VIEW",
51
+ "ConcurrencyError",
52
+ "Corpus",
53
+ "GroundingError",
54
+ "IngestError",
55
+ "InvalidEnumError",
56
+ "KgliteDocsError",
57
+ "MissingSourceError",
58
+ "ReviewConflict",
59
+ "SelfVerificationError",
60
+ "UnsupportedFormatError",
61
+ "__version__",
62
+ ]
@@ -0,0 +1,146 @@
1
+ """Agent identity + view tracking.
2
+
3
+ Agents are lazily registered on their first mutation; views can be
4
+ recorded explicitly (with context) or implicitly (when `search` /
5
+ `get_chunk` are called with an `agent_id`).
6
+
7
+ Aggregate `view_count` + `last_viewed_at` on the Chunk is updated on
8
+ every recorded view — a cheap denormalisation so listings can sort by
9
+ attention without joining View nodes.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from typing import Any
17
+
18
+ from kglite_docs.schema import (
19
+ AGENT,
20
+ AUTHORED,
21
+ CHUNK,
22
+ VIEW,
23
+ VIEWED,
24
+ )
25
+ from kglite_docs.store import Store
26
+
27
+
28
+ def _now() -> str:
29
+ return datetime.now(timezone.utc).isoformat()
30
+
31
+
32
+ from kglite_docs.store import rows as _df_dicts # noqa: E402
33
+
34
+
35
def register_agent(
    store: Store, *, agent_id: str, kind: str = "llm", model: str = ""
) -> dict[str, Any]:
    """Register `agent_id`, or refresh it when it is already known.

    An existing Agent node gets `last_seen` set to the current timestamp and
    its `action_count` incremented. `kind` and `model` are only written when
    the node is created for the first time.

    Returns a dict with the agent `id`, a `created` flag, and the `last_seen`
    timestamp that was written.
    """
    timestamp = _now()
    lookup = store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id", params={"id": agent_id}
    )

    if not _df_dicts(lookup):
        # First sighting: create the node with its counters initialised.
        new_agent = {
            "id": agent_id,
            "title": agent_id,
            "kind": kind,
            "model": model,
            "first_seen": timestamp,
            "last_seen": timestamp,
            "action_count": 1,
        }
        store.upsert_nodes(AGENT, [new_agent])
        return {"id": agent_id, "created": True, "last_seen": timestamp}

    # Known agent: only touch the bookkeeping fields.
    store.cypher(
        "MATCH (a:Agent {id: $id}) "
        "SET a.last_seen = $now, "
        "a.action_count = coalesce(a.action_count, 0) + 1",
        params={"id": agent_id, "now": timestamp},
    )
    return {"id": agent_id, "created": False, "last_seen": timestamp}
62
+
63
+
64
def list_agents(store: Store) -> list[dict[str, Any]]:
    """Return one row per Agent node, ordered by `last_seen` descending.

    Each row exposes `id`, `kind`, `model`, `first_seen`, `last_seen` and
    `actions` (the node's `action_count`).
    """
    query = (
        "MATCH (a:Agent) "
        "RETURN a.id AS id, a.kind AS kind, a.model AS model, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, "
        "a.action_count AS actions "
        "ORDER BY a.last_seen DESC"
    )
    return _df_dicts(store.cypher(query))
71
+
72
+
73
def record_view(
    store: Store,
    *,
    agent_id: str,
    target_id: str,
    target_kind: str = CHUNK,
    context: str = "",
) -> dict[str, Any]:
    """Record that `agent_id` looked at `target_id`.

    The agent is registered (or refreshed) first via `register_agent`.

    - When `target_kind` is `CHUNK`, the chunk's `view_count` is incremented
      and its `last_viewed_at` is set to the current timestamp. Other target
      kinds get no counter update.
    - When `context` is empty, nothing else is written and `view_node` in the
      result is `None`.
    - When `context` is non-empty, a `View` node is created and linked from
      the agent with an `AUTHORED` edge; for chunk targets an Agent → Chunk
      `VIEWED` edge carrying the timestamp and context is written as well.

    Returns `{"recorded": True, "view_node": <View id or None>}`.
    """
    register_agent(store, agent_id=agent_id)
    viewed_at = _now()
    targets_chunk = target_kind == CHUNK

    if targets_chunk:
        store.cypher(
            "MATCH (c:Chunk {id: $id}) "
            "SET c.view_count = coalesce(c.view_count, 0) + 1, c.last_viewed_at = $now",
            params={"id": target_id, "now": viewed_at},
        )

    # A visit without context only updates the counters above.
    if not context:
        return {"recorded": True, "view_node": None}

    view_id = str(uuid.uuid4())
    view_row = {
        "id": view_id,
        "title": context[:60],
        "agent_id": agent_id,
        "target_id": target_id,
        "target_kind": target_kind,
        "at": viewed_at,
        "context": context,
    }
    store.upsert_nodes(VIEW, [view_row])

    authored_edge = {"src": agent_id, "dst": view_id}
    store.upsert_edges(
        AUTHORED, [authored_edge], source_type=AGENT, target_type=VIEW
    )

    # Agent → Chunk VIEWED edge. Repeated writes are tolerated; uniqueness is
    # not required here.
    if targets_chunk:
        viewed_edge = {
            "src": agent_id,
            "dst": target_id,
            "at": viewed_at,
            "context": context,
        }
        store.upsert_edges(
            VIEWED, [viewed_edge], source_type=AGENT, target_type=CHUNK
        )

    return {"recorded": True, "view_node": view_id}
123
+
124
+
125
def agent_activity(store: Store, agent_id: str, *, limit: int = 50) -> dict[str, Any]:
    """Return an agent's profile together with its most recent activity.

    Args:
        store: Graph store to query.
        agent_id: Id of the Agent node to look up.
        limit: Maximum number of rows returned for each of `views` and
            `summaries`. Coerced with `int()` before being placed in the
            query text.

    Returns:
        A dict with the keys `agent`, `views`, `summaries` and `tags`.
        `agent` is `None` and every list is empty when no Agent node matches.
        Otherwise `agent` is the matching row, `views` holds the agent's
        authored View nodes (newest `at` first) and `summaries` its authored
        Summary nodes (newest `created_at` first).

        `tags` is always an empty list: this function runs no tag query. The
        key is present on both return paths so callers see one result shape
        whether or not the agent exists.
    """
    agent_rows = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id}) RETURN a.id AS id, a.kind AS kind, "
        "a.first_seen AS first_seen, a.last_seen AS last_seen, a.action_count AS actions",
        params={"id": agent_id},
    ))
    if not agent_rows:
        return {"agent": None, "views": [], "summaries": [], "tags": []}

    # Coerce once; int() guarantees only a number is interpolated into the query.
    row_limit = int(limit)
    params = {"id": agent_id}

    views = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(v:View) "
        "RETURN v.target_id AS target_id, v.target_kind AS target_kind, v.context AS context, v.at AS at "
        f"ORDER BY v.at DESC LIMIT {row_limit}",
        params=params,
    ))
    sums = _df_dicts(store.cypher(
        "MATCH (a:Agent {id: $id})-[:AUTHORED]->(s:Summary) "
        "RETURN s.id AS id, s.target_id AS target_id, s.text AS text, s.verification_status AS status "
        f"ORDER BY s.created_at DESC LIMIT {row_limit}",
        params=params,
    ))
    # NOTE(review): tags are not queried here; wire in the tagging lookup once
    # the relevant edge type is confirmed.
    return {"agent": agent_rows[0], "views": views, "summaries": sums, "tags": []}