graph-sieve 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graph_sieve-1.0.0/LICENSE +21 -0
- graph_sieve-1.0.0/PKG-INFO +110 -0
- graph_sieve-1.0.0/README.md +83 -0
- graph_sieve-1.0.0/pyproject.toml +44 -0
- graph_sieve-1.0.0/setup.cfg +4 -0
- graph_sieve-1.0.0/src/graph_sieve/__init__.py +15 -0
- graph_sieve-1.0.0/src/graph_sieve/agent.py +270 -0
- graph_sieve-1.0.0/src/graph_sieve/config.py +31 -0
- graph_sieve-1.0.0/src/graph_sieve/discovery.py +192 -0
- graph_sieve-1.0.0/src/graph_sieve/extractor.py +119 -0
- graph_sieve-1.0.0/src/graph_sieve/filters.py +34 -0
- graph_sieve-1.0.0/src/graph_sieve/graph_engine.py +194 -0
- graph_sieve-1.0.0/src/graph_sieve/hashing.py +51 -0
- graph_sieve-1.0.0/src/graph_sieve/hebrew_utils.py +10 -0
- graph_sieve-1.0.0/src/graph_sieve/llm_client.py +162 -0
- graph_sieve-1.0.0/src/graph_sieve/mcp_server.py +27 -0
- graph_sieve-1.0.0/src/graph_sieve/models.py +170 -0
- graph_sieve-1.0.0/src/graph_sieve/run_scanner.py +65 -0
- graph_sieve-1.0.0/src/graph_sieve/storage.py +44 -0
- graph_sieve-1.0.0/src/graph_sieve/strategic_filter.py +35 -0
- graph_sieve-1.0.0/src/graph_sieve/tools.py +76 -0
- graph_sieve-1.0.0/src/graph_sieve/validator.py +81 -0
- graph_sieve-1.0.0/src/graph_sieve/visualize.py +24 -0
- graph_sieve-1.0.0/src/graph_sieve/whois.py +72 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/PKG-INFO +110 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/SOURCES.txt +43 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/dependency_links.txt +1 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/entry_points.txt +6 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/requires.txt +11 -0
- graph_sieve-1.0.0/src/graph_sieve.egg-info/top_level.txt +1 -0
- graph_sieve-1.0.0/tests/test_clustering.py +29 -0
- graph_sieve-1.0.0/tests/test_discovery.py +52 -0
- graph_sieve-1.0.0/tests/test_doc_counter.py +42 -0
- graph_sieve-1.0.0/tests/test_extractor.py +40 -0
- graph_sieve-1.0.0/tests/test_filters.py +17 -0
- graph_sieve-1.0.0/tests/test_graph_engine.py +46 -0
- graph_sieve-1.0.0/tests/test_hebrew_support.py +35 -0
- graph_sieve-1.0.0/tests/test_learning.py +27 -0
- graph_sieve-1.0.0/tests/test_learning_priority.py +63 -0
- graph_sieve-1.0.0/tests/test_pipeline.py +55 -0
- graph_sieve-1.0.0/tests/test_storage.py +30 -0
- graph_sieve-1.0.0/tests/test_storage_layered.py +22 -0
- graph_sieve-1.0.0/tests/test_strategic_filter.py +14 -0
- graph_sieve-1.0.0/tests/test_validator.py +27 -0
- graph_sieve-1.0.0/tests/test_workflow_final.py +55 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 graph-sieve contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: graph-sieve
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Full Spectrum Graph Sieve - Automated Technical Term Extraction and Relationship Mapping
|
|
5
|
+
Author-email: graph-sieve contributors <contributors@graph-sieve.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: knowledge-graph,llm,extraction,nlp,mcp,graph-database
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: markitdown[all]
|
|
16
|
+
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: pydantic
|
|
18
|
+
Requires-Dist: pydantic-settings
|
|
19
|
+
Requires-Dist: tqdm
|
|
20
|
+
Requires-Dist: click
|
|
21
|
+
Requires-Dist: mcp
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: hyperextract
|
|
25
|
+
Requires-Dist: platformdirs
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# Graph-Sieve 🕸️📊
|
|
29
|
+
|
|
30
|
+
**Full Spectrum Graph Sieve - Automated Technical Term Extraction and Relationship Mapping**
|
|
31
|
+
|
|
32
|
+
`graph-sieve` is a standalone utility and service designed to extract relationship-aware domain knowledge from internal documents (.docx, .pptx, .msg, .pdf). It uses a multi-gate verifiable pipeline with local or remote models (OpenAI, Ollama, vLLM) to build a structured knowledge graph of technical terms and their relationships.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- **Multi-Gate Extraction**: A robust pipeline (Detection -> Extraction -> Validation) ensuring high-fidelity term capture.
|
|
37
|
+
- **Relationship Mapping**: Beyond simple term lookup—builds a Property Graph of how terms relate.
|
|
38
|
+
- **Multi-Format Support**: Handles PDF, PPTX, DOCX, MSG, and images (via OCR).
|
|
39
|
+
- **Flexible LLM Backend**: Run locally with Ollama/vLLM for privacy, or use OpenAI for scale.
|
|
40
|
+
- **Interactive Visualization**: Generate dynamic, relationship-aware graph visualizations.
|
|
41
|
+
- **MCP Server**: Integrated Model Context Protocol (MCP) server for seamless AI agent integration.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install graph-sieve
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick Start
|
|
50
|
+
|
|
51
|
+
1. **Configure Your LLM**:
|
|
52
|
+
Create a `.env` file in your working directory:
|
|
53
|
+
```env
|
|
54
|
+
LLM_PROVIDER=openai
|
|
55
|
+
OPENAI_API_KEY=your_key_here
|
|
56
|
+
```
|
|
57
|
+
*Or use Ollama:*
|
|
58
|
+
```env
|
|
59
|
+
LLM_PROVIDER=ollama
|
|
60
|
+
OLLAMA_BASE_URL=http://localhost:11434
|
|
61
|
+
MODEL_NAME=llama3
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
2. **Scan a Directory**:
|
|
65
|
+
```bash
|
|
66
|
+
graph-sieve-scan ./path/to/documents --dict my_dictionary.json
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
3. **Visualize the Results**:
|
|
70
|
+
```bash
|
|
71
|
+
graph-sieve-visualize --dict my_dictionary.json
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## CLI Commands
|
|
75
|
+
|
|
76
|
+
- `graph-sieve-scan`: Extract terms from a directory or file.
|
|
77
|
+
- `graph-sieve-lookup`: Query terms and their graph context.
|
|
78
|
+
- `graph-sieve-visualize`: Generate an interactive HTML graph.
|
|
79
|
+
- `graph-sieve-mcp`: Launch the MCP server.
|
|
80
|
+
- `graph-sieve-whois`: Find the source document for a specific term.
|
|
81
|
+
|
|
82
|
+
## Configuration (Environment Variables)
|
|
83
|
+
|
|
84
|
+
| Variable | Description | Default |
|
|
85
|
+
|----------|-------------|---------|
|
|
86
|
+
| `LLM_PROVIDER` | `openai`, `ollama`, or `vllm` | `ollama` |
|
|
87
|
+
| `OPENAI_API_KEY` | Required if using OpenAI | None |
|
|
88
|
+
| `OLLAMA_BASE_URL`| URL for Ollama API | `http://localhost:11434` |
|
|
89
|
+
| `MODEL_NAME` | Model to use for extraction | `gpt-4o-mini` / `llama3` |
|
|
90
|
+
| `STORAGE_DIR` | Directory for graph data | Platform-specific |
|
|
91
|
+
|
|
92
|
+
## AI Agent Integration
|
|
93
|
+
|
|
94
|
+
### Claude Desktop / Gemini CLI
|
|
95
|
+
To use Graph-Sieve as a tool, add it to your agent's config:
|
|
96
|
+
|
|
97
|
+
```json
|
|
98
|
+
{
|
|
99
|
+
"mcpServers": {
|
|
100
|
+
"graph-sieve": {
|
|
101
|
+
"command": "graph-sieve-mcp",
|
|
102
|
+
"args": []
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## License
|
|
109
|
+
|
|
110
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Graph-Sieve 🕸️📊
|
|
2
|
+
|
|
3
|
+
**Full Spectrum Graph Sieve - Automated Technical Term Extraction and Relationship Mapping**
|
|
4
|
+
|
|
5
|
+
`graph-sieve` is a standalone utility and service designed to extract relationship-aware domain knowledge from internal documents (.docx, .pptx, .msg, .pdf). It uses a multi-gate verifiable pipeline with local or remote models (OpenAI, Ollama, vLLM) to build a structured knowledge graph of technical terms and their relationships.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Multi-Gate Extraction**: A robust pipeline (Detection -> Extraction -> Validation) ensuring high-fidelity term capture.
|
|
10
|
+
- **Relationship Mapping**: Beyond simple term lookup—builds a Property Graph of how terms relate.
|
|
11
|
+
- **Multi-Format Support**: Handles PDF, PPTX, DOCX, MSG, and images (via OCR).
|
|
12
|
+
- **Flexible LLM Backend**: Run locally with Ollama/vLLM for privacy, or use OpenAI for scale.
|
|
13
|
+
- **Interactive Visualization**: Generate dynamic, relationship-aware graph visualizations.
|
|
14
|
+
- **MCP Server**: Integrated Model Context Protocol (MCP) server for seamless AI agent integration.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install graph-sieve
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
1. **Configure Your LLM**:
|
|
25
|
+
Create a `.env` file in your working directory:
|
|
26
|
+
```env
|
|
27
|
+
LLM_PROVIDER=openai
|
|
28
|
+
OPENAI_API_KEY=your_key_here
|
|
29
|
+
```
|
|
30
|
+
*Or use Ollama:*
|
|
31
|
+
```env
|
|
32
|
+
LLM_PROVIDER=ollama
|
|
33
|
+
OLLAMA_BASE_URL=http://localhost:11434
|
|
34
|
+
MODEL_NAME=llama3
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
2. **Scan a Directory**:
|
|
38
|
+
```bash
|
|
39
|
+
graph-sieve-scan ./path/to/documents --dict my_dictionary.json
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
3. **Visualize the Results**:
|
|
43
|
+
```bash
|
|
44
|
+
graph-sieve-visualize --dict my_dictionary.json
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## CLI Commands
|
|
48
|
+
|
|
49
|
+
- `graph-sieve-scan`: Extract terms from a directory or file.
|
|
50
|
+
- `graph-sieve-lookup`: Query terms and their graph context.
|
|
51
|
+
- `graph-sieve-visualize`: Generate an interactive HTML graph.
|
|
52
|
+
- `graph-sieve-mcp`: Launch the MCP server.
|
|
53
|
+
- `graph-sieve-whois`: Find the source document for a specific term.
|
|
54
|
+
|
|
55
|
+
## Configuration (Environment Variables)
|
|
56
|
+
|
|
57
|
+
| Variable | Description | Default |
|
|
58
|
+
|----------|-------------|---------|
|
|
59
|
+
| `LLM_PROVIDER` | `openai`, `ollama`, or `vllm` | `openai` |
|
|
60
|
+
| `OPENAI_API_KEY` | Required if using OpenAI | None |
|
|
61
|
+
| `OLLAMA_BASE_URL`| URL for Ollama API | `http://localhost:11434` |
|
|
62
|
+
| `MODEL_NAME` | Model to use for extraction | `gpt-4o-mini` / `llama3` |
|
|
63
|
+
| `STORAGE_DIR` | Directory for graph data | Platform-specific |
|
|
64
|
+
|
|
65
|
+
## AI Agent Integration
|
|
66
|
+
|
|
67
|
+
### Claude Desktop / Gemini CLI
|
|
68
|
+
To use Graph-Sieve as a tool, add it to your agent's config:
|
|
69
|
+
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"mcpServers": {
|
|
73
|
+
"graph-sieve": {
|
|
74
|
+
"command": "graph-sieve-mcp",
|
|
75
|
+
"args": []
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "graph-sieve"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Full Spectrum Graph Sieve - Automated Technical Term Extraction and Relationship Mapping"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "graph-sieve contributors", email = "contributors@graph-sieve.org" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["knowledge-graph", "llm", "extraction", "nlp", "mcp", "graph-database"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"markitdown[all]",
|
|
24
|
+
"openai",
|
|
25
|
+
"pydantic",
|
|
26
|
+
"pydantic-settings",
|
|
27
|
+
"tqdm",
|
|
28
|
+
"click",
|
|
29
|
+
"mcp",
|
|
30
|
+
"numpy",
|
|
31
|
+
"pandas",
|
|
32
|
+
"hyperextract",
|
|
33
|
+
"platformdirs",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
graph-sieve-scan = "graph_sieve.run_scanner:main"
|
|
38
|
+
graph-sieve-lookup = "graph_sieve.tools:main"
|
|
39
|
+
graph-sieve-mcp = "graph_sieve.mcp_server:mcp.run"
|
|
40
|
+
graph-sieve-visualize = "graph_sieve.visualize:main"
|
|
41
|
+
graph-sieve-whois = "graph_sieve.whois:main"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools]
|
|
44
|
+
package-dir = { "" = "src" }
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Public API for the graph_sieve package.

Re-exports the high-level agent, the core data models, and the
dictionary persistence helpers so callers can simply
``from graph_sieve import DictionaryAgent``.
"""
from .agent import DictionaryAgent
from .models import Dictionary, DictionaryEntry, UsageExample, GraphTriplet
from .storage import load_dictionary, save_dictionary

# Names exported by `from graph_sieve import *`.
__all__ = [
    "DictionaryAgent",
    "Dictionary",
    "DictionaryEntry",
    "UsageExample",
    "GraphTriplet",
    "load_dictionary",
    "save_dictionary",
]

# Keep in sync with the version declared in pyproject.toml.
__version__ = "1.0.0"
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Optional
|
|
2
|
+
import os
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from .extractor import extract_all
|
|
5
|
+
from .filters import is_domain_specific
|
|
6
|
+
from .discovery import extract_with_llm, process_discovered_terms
|
|
7
|
+
from .validator import validate_with_llm
|
|
8
|
+
from .models import Dictionary, DictionaryEntry, UsageExample
|
|
9
|
+
from .graph_engine import GraphKnowledgeEngine
|
|
10
|
+
from .hashing import HashStore
|
|
11
|
+
from .strategic_filter import StrategicSieve
|
|
12
|
+
from .storage import load_dictionary, save_dictionary
|
|
13
|
+
from .llm_client import get_llm_client
|
|
14
|
+
|
|
15
|
+
class DictionaryAgent:
    """
    The main entry point for the Dictionary Agent.

    Consolidates the 5-gate pipeline logic and the high-level agent
    interface: Gate 1 (strategic sieve), Gate 2 (extraction), Gate 3
    (base-knowledge filter), Gate 4 (temporal reconciliation) and
    Gate 5 (zero-trust validation).
    """

    def __init__(self, dictionary_path: Optional[str] = None,
                 whitelist: Optional[List[str]] = None,
                 hash_store: Optional["HashStore"] = None,
                 strategic_sieve: Optional["StrategicSieve"] = None,
                 seed_paths: Optional[List[str]] = None):
        """
        Args:
            dictionary_path: Optional path to a persisted dictionary file;
                when given, the dictionary is loaded from and saved to it.
            whitelist: Terms that always pass the base-knowledge filter.
            hash_store: Per-document content-hash store used to skip
                unchanged files (a fresh one is created when omitted).
            strategic_sieve: Gate-1 relevance filter for candidate files.
            seed_paths: Files/directories to process first, as trusted seeds.
        """
        self.dictionary_path = dictionary_path
        # Load an existing dictionary when a path is supplied; otherwise start empty.
        self.dictionary = load_dictionary(dictionary_path) if dictionary_path else Dictionary()
        self.whitelist = whitelist or []
        self.graph_engine = GraphKnowledgeEngine(self.dictionary)
        self.hash_store = hash_store or HashStore()
        self.strategic_sieve = strategic_sieve or StrategicSieve()
        self.seed_paths = seed_paths or []

    def scan_file(self, file_path: str):
        """Simple interface for scanning a single file."""
        return self.process_document(file_path)

    def process_directory(self, dir_path: str) -> List[str]:
        """
        Processes documents, starting with seed documents if provided.

        Seeds run first; remaining files under ``dir_path`` are then
        processed newest-first, skipping files the strategic sieve rejects
        or whose content hash is unchanged. Returns all added terms.
        """
        all_added = []

        # 1. Process Seeds First
        for seed in self.seed_paths:
            if os.path.isfile(seed):
                all_added.extend(self.process_document(seed, is_seed=True))
            elif os.path.isdir(seed):
                for root, _, filenames in os.walk(seed):
                    for f in filenames:
                        path = os.path.join(root, f)
                        all_added.extend(self.process_document(path, is_seed=True))

        # 2. Process Remaining Docs in reverse-chronological order
        files = []
        for root, _, filenames in os.walk(dir_path):
            for f in filenames:
                path = os.path.join(root, f)
                # Skip if already processed as seed (only file seeds are
                # compared; directory seeds were walked above).
                is_already_seed = any(os.path.samefile(path, s) for s in self.seed_paths if os.path.exists(s) and os.path.isfile(s))
                if is_already_seed:
                    continue
                files.append((path, os.path.getmtime(path)))

        # Newest documents first, so recent knowledge lands before stale docs.
        files.sort(key=lambda x: x[1], reverse=True)

        for file_path, _ in files:
            # Gate 1: Strategic Sieve
            if not self.strategic_sieve.is_relevant(file_path):
                continue

            # Incremental scan: skip documents whose content hash is unchanged.
            if not self.hash_store.has_changed(file_path):
                continue

            added = self.process_document(file_path)
            all_added.extend(added)

        # Persist incremental-scan state and (optionally) the dictionary.
        self.hash_store.save()
        if self.dictionary_path:
            save_dictionary(self.dictionary, self.dictionary_path)
        return all_added

    def get_bounty_hints(self) -> Optional[str]:
        """
        Pitfall 3 Fix: Bounty System. Identifies high-velocity pending terms.

        Returns a hint sentence listing undefined terms seen in >= 3
        documents, or None when there are none.
        Bounty Capping (Second-Order Fix): Limit to top 10 to protect
        the context window.
        """
        pending_entries = [
            e for e in self.dictionary.entries.values()
            if e.status == "PENDING_DEFINITION" and e.document_count >= 3
        ]
        # Sort by ubiquity to find most important missing terms
        pending_entries.sort(key=lambda x: x.document_count, reverse=True)
        bounty_terms = [e.term for e in pending_entries[:10]]  # Capped at 10

        if not bounty_terms:
            return None
        return "The following terms have been seen multiple times but lack a definition. Please prioritize finding their meanings: " + ", ".join(bounty_terms)

    def merge_conflicting_definitions(self, existing: "DictionaryEntry", new_item: Dict[str, Any]) -> bool:
        """
        Synthesis Gate: Merges conflicting high-confidence definitions using an LLM.

        Mutates ``existing`` in place on success. Returns True when a merge
        happened, False when no merge was needed or the LLM call failed.
        """
        # If both are high confidence (GOLD or SILVER) and overviews differ significantly
        confidence_order = {"PENDING": 0, "BRONZE": 1, "SILVER": 2, "GOLD": 3}
        new_conf = new_item.get("confidence_level", "BRONZE")

        if (confidence_order.get(existing.confidence_level, 0) >= 2 and
            confidence_order.get(new_conf, 0) >= 2 and
            existing.overview.strip().lower() != new_item.get("overview", "").strip().lower()):

            print(f"Triggering Synthesis Gate for term: {existing.term}")
            client = get_llm_client()
            prompt = f"""You are a Master Lexicographer. Two authoritative sources have defined the same term differently.
Your task is to SYNTHESIZE them into a single, unified definition that captures the nuances of both.

Term: {existing.term}
Definition A: {existing.overview}
Definition B: {new_item.get("overview")}

Deep Dive A: {existing.deep_dive or "None"}
Deep Dive B: {new_item.get("deep_dive") or "None"}

Provide a single JSON response with:
- overview: The synthesized 1-sentence summary.
- deep_dive: The combined technical details.
"""
            messages = [
                {"role": "system", "content": "Return JSON with 'overview' and 'deep_dive'."},
                {"role": "user", "content": prompt}
            ]

            try:
                synthesized = client.chat(messages, json_mode=True)
                existing.overview = synthesized.get("overview", existing.overview)
                existing.deep_dive = synthesized.get("deep_dive", existing.deep_dive)
                # Upgrade to highest confidence if merged
                existing.confidence_level = "GOLD" if "GOLD" in [existing.confidence_level, new_conf] else "SILVER"
                return True
            except Exception:
                # Bug fix: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit. Best-effort merge: on LLM
                # failure keep the existing definition untouched.
                return False
        return False

    def reconcile_supersedes(self, subject: str, target: str):
        """
        Gate 4 Extension: If a new term SUPERSEDES an old one, update statuses.

        Marks the superseded entry and every relationship that touches it
        as LEGACY. No-op when ``target`` is unknown.
        """
        if target in self.dictionary.entries:
            old_entry = self.dictionary.entries[target]
            if old_entry.status != "LEGACY":
                print(f"Temporal Reconciliation: {subject} SUPERSEDES {target}. Marking {target} as LEGACY.")
                old_entry.status = "LEGACY"

                # Also mark related relationships as legacy
                for rel in self.dictionary.relationships:
                    if rel.subject == target or rel.object == target:
                        if rel.status != "LEGACY":
                            rel.status = "LEGACY"

    def process_document(self, file_path: str, is_seed: bool = False) -> List[str]:
        """
        Runs the 5-gate pipeline on a document by processing it in
        metadata-aware chunks.

        Returns the deduplicated list of terms added from this document.
        """
        # chunk_text is not imported at module level; import lazily here.
        from .extractor import chunk_text

        # Gate 2: Extraction
        full_text = extract_all(file_path)
        if full_text.startswith("[Error"):
            # Extractor signals failure via an "[Error..." sentinel string.
            return []

        # Pitfall: Large documents need chunking
        chunks = chunk_text(full_text, file_path)
        all_added_terms = []

        for chunk in chunks:
            # Pitfall 4 Fix: Recursive Context Injection
            context_hints = self.get_bounty_hints()
            discovered = extract_with_llm(chunk, context_hints=context_hints)

            valid_items = []
            confidence_map = {}

            for item in discovered:
                term = item.get("term")
                if not term:
                    continue

                # Gate 3: Base-Knowledge Filter
                if not is_domain_specific(term, self.whitelist):
                    continue

                # Gate 5: Zero-Trust Validation (Pitfall 1 & 6 Fix)
                # Note: We validate against the chunk to ensure the anchor is present
                validation = validate_with_llm(chunk, item)
                if not validation.get("is_valid", False):
                    # Don't print for every common word failure to avoid noise
                    if validation.get("status") == "HALLUCINATION" and "Hard-Anchor" in validation.get("reasoning", ""):
                        print(f"Discarding Hallucination (No Anchor): {term}")
                    continue

                valid_items.append(item)
                confidence_map[term] = validation.get("confidence_level", "BRONZE")

                # Temporal Reconciliation: Check for SUPERSEDES
                for rel in item.get("relationships", []):
                    if rel.get("type") == "SUPERSEDES":
                        self.reconcile_supersedes(term, rel.get("target"))

            # Gate 4: Temporal Reconciliation
            # Check for conflicts before processing
            for item in valid_items:
                term = item.get("term")
                if term in self.dictionary.entries:
                    item["confidence_level"] = confidence_map.get(term, "BRONZE")
                    self.merge_conflicting_definitions(self.dictionary.entries[term], item)

            added_terms = process_discovered_terms(
                valid_items,
                self.dictionary,
                file_path,
                is_seed=is_seed,
                graph_engine=self.graph_engine,
                confidence_levels=confidence_map
            )
            all_added_terms.extend(added_terms)

        return list(set(all_added_terms))  # Deduplicate added terms

    def generate_summary(self) -> str:
        """
        Generates a comprehensive summary of the agent's learning journey.

        Returns a human-readable, multi-line report covering counts, the
        most common terms, and bounty/pending terms.
        """
        entries = list(self.dictionary.entries.values())
        total_terms = len(entries)
        active_entries = [e for e in entries if e.status == "ACTIVE"]
        pending_entries = [e for e in entries if e.status == "PENDING_DEFINITION"]

        # Sort by document_count to find most common terms
        common_terms = sorted(active_entries, key=lambda x: x.document_count, reverse=True)[:5]

        lines = [
            "\n" + "="*50,
            "📊 DICTIONARY AGENT LEARNING SUMMARY",
            "="*50,
            f"Total Concepts Tracked: {total_terms}",
            f"Fully Defined Terms: {len(active_entries)}",
            f"Pending Definitions: {len(pending_entries)}",
            f"Knowledge Graph Links: {len(self.dictionary.relationships)}",
            f"Identified Communities: {len(self.dictionary.community_reports)}",
            "\n⭐ MOST COMMON TERMS:",
        ]

        for entry in common_terms:
            lines.append(f"• {entry.term} (seen in {entry.document_count} files)")
            lines.append(f"  Confidence: {entry.confidence_level}")
            lines.append(f"  Overview: {entry.overview}")
            if entry.deep_dive:
                # Show first 100 chars of deep dive
                dd_snippet = (entry.deep_dive[:100] + "...") if len(entry.deep_dive) > 100 else entry.deep_dive
                lines.append(f"  Deep Dive: {dd_snippet}")
            lines.append("")

        bounty_candidates = [e for e in pending_entries if e.document_count >= 3]
        if bounty_candidates:
            lines.append("🎯 BOUNTY LIST (Critical missing definitions):")
            for entry in sorted(bounty_candidates, key=lambda x: x.document_count, reverse=True)[:10]:
                lines.append(f"• {entry.term} (seen {entry.document_count} times)")
        elif pending_entries:
            lines.append("⏳ TOP PENDING CONCEPTS (Need more context):")
            for entry in sorted(pending_entries, key=lambda x: x.document_count, reverse=True)[:3]:
                lines.append(f"• {entry.term} (seen {entry.document_count} times)")

        lines.append("\n" + "="*50 + "\n")
        return "\n".join(lines)
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
5
|
+
from platformdirs import user_data_dir
|
|
6
|
+
|
|
7
|
+
class Settings(BaseSettings):
    """Runtime configuration, loaded from the environment and a local .env file."""

    # Values come from environment variables (case-insensitive field-name
    # match) and an optional ".env" file; unknown keys are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore"
    )

    # LLM Settings
    # NOTE(review): the README claims Ollama is the default provider, but the
    # code default here is "openai" — confirm which is intended.
    llm_provider: str = "openai" # openai, ollama, vllm
    openai_api_key: Optional[str] = None  # required only when llm_provider == "openai"
    ollama_base_url: str = "http://localhost:11434"
    model_name: str = "gpt-4o-mini"

    # Storage Settings
    # Platform-specific per-user data directory (e.g. ~/.local/share on Linux).
    storage_dir: Path = Path(user_data_dir("graph-sieve", "graph-sieve"))

    @property
    def hashes_path(self) -> Path:
        """Path of the JSON file that persists per-document content hashes."""
        return self.storage_dir / "hashes.json"

    def ensure_dirs(self):
        """Ensure that the storage directory exists."""
        self.storage_dir.mkdir(parents=True, exist_ok=True)

# Module-level singleton; import `settings` from this module to read config.
settings = Settings()