graphrag-core 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/PKG-INFO +4 -1
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/pyproject.toml +3 -2
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/__init__.py +15 -1
- graphrag_core-0.4.0/src/graphrag_core/extraction/__init__.py +5 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/extraction/engine.py +91 -91
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/interfaces.py +18 -0
- graphrag_core-0.4.0/src/graphrag_core/llm/__init__.py +17 -0
- graphrag_core-0.4.0/src/graphrag_core/llm/anthropic.py +65 -0
- graphrag_core-0.4.0/src/graphrag_core/llm/base.py +66 -0
- graphrag_core-0.4.0/src/graphrag_core/llm/openai.py +64 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/models.py +8 -0
- graphrag_core-0.4.0/tests/test_extraction/test_engine.py +583 -0
- graphrag_core-0.4.0/tests/test_extraction_engine.py +128 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_interfaces.py +12 -0
- graphrag_core-0.4.0/tests/test_llm_anthropic.py +77 -0
- graphrag_core-0.4.0/tests/test_llm_base.py +75 -0
- graphrag_core-0.4.0/tests/test_llm_openai.py +125 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_models.py +23 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/uv.lock +260 -1
- graphrag_core-0.2.0/src/graphrag_core/extraction/__init__.py +0 -5
- graphrag_core-0.2.0/src/graphrag_core/llm/__init__.py +0 -9
- graphrag_core-0.2.0/src/graphrag_core/llm/anthropic.py +0 -35
- graphrag_core-0.2.0/tests/test_extraction/test_engine.py +0 -271
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.github/workflows/release.yml +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.github/workflows/test.yml +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.gitignore +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/CHANGELOG.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/CLAUDE.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/LICENSE +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/README.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/graphrag_core_interface_spec.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-11-bb1-document-ingestion.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb2-bb3-extraction-and-graph.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb4-hybrid-search.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb5-bb6-curation-and-registry.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb7-bb8-tools-and-agents.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-release-readiness.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-11-bb1-document-ingestion-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb2-bb3-extraction-and-graph-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb4-hybrid-search-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb5-bb6-curation-and-registry-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb7-bb8-tools-and-agents-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-release-readiness-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/_cypher.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/context.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/orchestrator.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/detection.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/chunker.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/parsers.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/py.typed +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/matching.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/fusion.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/core_tools.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/library.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/conftest.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/test_context.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/test_orchestrator.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/test_detection.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/test_pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_extraction/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/test_memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/test_neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_chunker.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_parsers.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_integration/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_integration/test_ingest_to_graph.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/test_matching.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/test_memory_registry.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_fusion.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_neo4j_search.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/test_core_tools.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/test_library.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graphrag-core
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs
|
|
5
5
|
Project-URL: Homepage, https://github.com/cdel1/graphrag-core
|
|
6
6
|
Project-URL: Repository, https://github.com/cdel1/graphrag-core
|
|
@@ -22,10 +22,13 @@ Requires-Dist: python-docx>=1.0
|
|
|
22
22
|
Provides-Extra: all
|
|
23
23
|
Requires-Dist: anthropic>=0.40; extra == 'all'
|
|
24
24
|
Requires-Dist: neo4j>=5.0; extra == 'all'
|
|
25
|
+
Requires-Dist: openai>=1.0; extra == 'all'
|
|
25
26
|
Provides-Extra: anthropic
|
|
26
27
|
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
27
28
|
Provides-Extra: neo4j
|
|
28
29
|
Requires-Dist: neo4j>=5.0; extra == 'neo4j'
|
|
30
|
+
Provides-Extra: openai
|
|
31
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
29
32
|
Description-Content-Type: text/markdown
|
|
30
33
|
|
|
31
34
|
# graphrag-core
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-core"
|
|
3
|
-
version = "0.2.0"
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs"
|
|
5
5
|
license = "MIT"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -36,7 +36,8 @@ markers = [
|
|
|
36
36
|
[project.optional-dependencies]
|
|
37
37
|
anthropic = ["anthropic>=0.40"]
|
|
38
38
|
neo4j = ["neo4j>=5.0"]
|
|
39
|
-
|
|
39
|
+
openai = ["openai>=1.0"]
|
|
40
|
+
all = ["graphrag-core[anthropic,neo4j,openai]"]
|
|
40
41
|
|
|
41
42
|
[project.urls]
|
|
42
43
|
Homepage = "https://github.com/cdel1/graphrag-core"
|
|
@@ -11,6 +11,7 @@ from graphrag_core.interfaces import (
|
|
|
11
11
|
EmbeddingModel,
|
|
12
12
|
EntityRegistry,
|
|
13
13
|
ExtractionEngine,
|
|
14
|
+
ExtractionPromptBuilder,
|
|
14
15
|
GraphStore,
|
|
15
16
|
IngestionPipeline,
|
|
16
17
|
LLMClient,
|
|
@@ -26,15 +27,17 @@ from graphrag_core.ingestion import (
|
|
|
26
27
|
TextParser,
|
|
27
28
|
TokenChunker,
|
|
28
29
|
)
|
|
29
|
-
from graphrag_core.extraction import LLMExtractionEngine
|
|
30
|
+
from graphrag_core.extraction import DefaultPromptBuilder, LLMExtractionEngine
|
|
30
31
|
from graphrag_core.graph import InMemoryGraphStore
|
|
31
32
|
from graphrag_core.search import InMemorySearchEngine
|
|
32
33
|
from graphrag_core.registry import InMemoryEntityRegistry
|
|
33
34
|
from graphrag_core.curation import DeterministicDetectionLayer, CurationPipeline
|
|
34
35
|
from graphrag_core.tools import Tool, ToolLibrary, register_core_tools
|
|
35
36
|
from graphrag_core.agents import AgentContext, SequentialOrchestrator
|
|
37
|
+
from graphrag_core.llm import BaseLLMClient
|
|
36
38
|
from graphrag_core.models import (
|
|
37
39
|
AgentResult,
|
|
40
|
+
ChunkExtractionResult,
|
|
38
41
|
CurationIssue,
|
|
39
42
|
CurationReport,
|
|
40
43
|
DocumentChunk,
|
|
@@ -65,6 +68,7 @@ __all__ = [
|
|
|
65
68
|
"EmbeddingModel",
|
|
66
69
|
"EntityRegistry",
|
|
67
70
|
"ExtractionEngine",
|
|
71
|
+
"ExtractionPromptBuilder",
|
|
68
72
|
"GraphStore",
|
|
69
73
|
"IngestionPipeline",
|
|
70
74
|
"LLMClient",
|
|
@@ -78,7 +82,10 @@ __all__ = [
|
|
|
78
82
|
"PdfParser",
|
|
79
83
|
"TextParser",
|
|
80
84
|
"TokenChunker",
|
|
85
|
+
# LLM base
|
|
86
|
+
"BaseLLMClient",
|
|
81
87
|
# BB2 implementations
|
|
88
|
+
"DefaultPromptBuilder",
|
|
82
89
|
"LLMExtractionEngine",
|
|
83
90
|
# BB3 implementations
|
|
84
91
|
"InMemoryGraphStore",
|
|
@@ -98,6 +105,7 @@ __all__ = [
|
|
|
98
105
|
"SequentialOrchestrator",
|
|
99
106
|
# Models
|
|
100
107
|
"AgentResult",
|
|
108
|
+
"ChunkExtractionResult",
|
|
101
109
|
"CurationIssue",
|
|
102
110
|
"CurationReport",
|
|
103
111
|
"DocumentChunk",
|
|
@@ -136,3 +144,9 @@ try:
|
|
|
136
144
|
__all__.append("AnthropicLLMClient")
|
|
137
145
|
except ImportError:
|
|
138
146
|
pass
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
from graphrag_core.llm import OpenAILLMClient
|
|
150
|
+
__all__.append("OpenAILLMClient")
|
|
151
|
+
except ImportError:
|
|
152
|
+
pass
|
|
@@ -2,10 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
|
-
|
|
7
|
-
from graphrag_core.interfaces import LLMClient
|
|
5
|
+
from graphrag_core.interfaces import ExtractionPromptBuilder, LLMClient
|
|
8
6
|
from graphrag_core.models import (
|
|
7
|
+
ChunkExtractionResult,
|
|
9
8
|
DocumentChunk,
|
|
10
9
|
ExtractedNode,
|
|
11
10
|
ExtractedRelationship,
|
|
@@ -16,11 +15,94 @@ from graphrag_core.models import (
|
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
|
|
18
|
+
def validate_extraction(
|
|
19
|
+
nodes: list[ExtractedNode],
|
|
20
|
+
rels: list[ExtractedRelationship],
|
|
21
|
+
schema: OntologySchema,
|
|
22
|
+
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
23
|
+
"""Filter extracted nodes and relationships to match schema constraints.
|
|
24
|
+
|
|
25
|
+
Removes:
|
|
26
|
+
- Nodes with labels not in the schema
|
|
27
|
+
- Relationships with types not in the schema
|
|
28
|
+
- Relationships referencing non-existent node IDs
|
|
29
|
+
- Relationships violating source/target type constraints
|
|
30
|
+
"""
|
|
31
|
+
allowed_labels = {nt.label for nt in schema.node_types}
|
|
32
|
+
allowed_rel_types = {rt.type for rt in schema.relationship_types}
|
|
33
|
+
rel_constraints = {
|
|
34
|
+
rt.type: (set(rt.source_types), set(rt.target_types))
|
|
35
|
+
for rt in schema.relationship_types
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
valid_nodes = [n for n in nodes if n.label in allowed_labels]
|
|
39
|
+
valid_node_ids = {n.id for n in valid_nodes}
|
|
40
|
+
node_labels = {n.id: n.label for n in valid_nodes}
|
|
41
|
+
|
|
42
|
+
valid_rels = []
|
|
43
|
+
for rel in rels:
|
|
44
|
+
if rel.type not in allowed_rel_types:
|
|
45
|
+
continue
|
|
46
|
+
if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
|
|
47
|
+
continue
|
|
48
|
+
source_types, target_types = rel_constraints[rel.type]
|
|
49
|
+
if node_labels[rel.source_id] not in source_types:
|
|
50
|
+
continue
|
|
51
|
+
if node_labels[rel.target_id] not in target_types:
|
|
52
|
+
continue
|
|
53
|
+
valid_rels.append(rel)
|
|
54
|
+
|
|
55
|
+
return valid_nodes, valid_rels
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class DefaultPromptBuilder:
|
|
59
|
+
"""Builds the default system prompt for LLM-based entity extraction."""
|
|
60
|
+
|
|
61
|
+
def build_system_prompt(self, schema: OntologySchema) -> str:
|
|
62
|
+
node_descriptions = []
|
|
63
|
+
for nt in schema.node_types:
|
|
64
|
+
props = ", ".join(
|
|
65
|
+
f"{p.name} ({p.type}{', required' if p.required else ''})"
|
|
66
|
+
for p in nt.properties
|
|
67
|
+
)
|
|
68
|
+
line = f"- {nt.label}: properties=[{props}]"
|
|
69
|
+
if nt.description:
|
|
70
|
+
line += f" \u2014 {nt.description}"
|
|
71
|
+
node_descriptions.append(line)
|
|
72
|
+
|
|
73
|
+
rel_descriptions = []
|
|
74
|
+
for rt in schema.relationship_types:
|
|
75
|
+
line = f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
|
|
76
|
+
if rt.description:
|
|
77
|
+
line += f" \u2014 {rt.description}"
|
|
78
|
+
rel_descriptions.append(line)
|
|
79
|
+
|
|
80
|
+
return (
|
|
81
|
+
"You are an entity extraction engine. Extract entities and relationships "
|
|
82
|
+
"from the provided text according to this schema.\n\n"
|
|
83
|
+
"ALLOWED NODE TYPES:\n"
|
|
84
|
+
+ "\n".join(node_descriptions)
|
|
85
|
+
+ "\n\nALLOWED RELATIONSHIP TYPES:\n"
|
|
86
|
+
+ "\n".join(rel_descriptions)
|
|
87
|
+
+ "\n\nDo not extract entities or relationships not listed above.\n\n"
|
|
88
|
+
"Rules:\n"
|
|
89
|
+
"- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
|
|
90
|
+
"- Only use node types and relationship types listed above\n"
|
|
91
|
+
"- Include all required properties for each node type\n"
|
|
92
|
+
"- Return empty arrays if no entities are found"
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
|
|
19
96
|
class LLMExtractionEngine:
|
|
20
97
|
"""Extracts entities and relationships from text using an LLM, guided by an ontology schema."""
|
|
21
98
|
|
|
22
|
-
def __init__(
|
|
99
|
+
def __init__(
|
|
100
|
+
self,
|
|
101
|
+
llm_client: LLMClient,
|
|
102
|
+
prompt_builder: ExtractionPromptBuilder | None = None,
|
|
103
|
+
) -> None:
|
|
23
104
|
self._llm = llm_client
|
|
105
|
+
self._prompt_builder = prompt_builder or DefaultPromptBuilder()
|
|
24
106
|
|
|
25
107
|
async def extract(
|
|
26
108
|
self,
|
|
@@ -32,7 +114,7 @@ class LLMExtractionEngine:
|
|
|
32
114
|
all_rels: list[ExtractedRelationship] = []
|
|
33
115
|
all_provenance: list[ProvenanceLink] = []
|
|
34
116
|
|
|
35
|
-
system_prompt = self._build_system_prompt(schema)
|
|
117
|
+
system_prompt = self._prompt_builder.build_system_prompt(schema)
|
|
36
118
|
|
|
37
119
|
for chunk in chunks:
|
|
38
120
|
nodes, rels = await self._extract_chunk(chunk, system_prompt)
|
|
@@ -55,71 +137,13 @@ class LLMExtractionEngine:
|
|
|
55
137
|
async def _extract_chunk(
|
|
56
138
|
self, chunk: DocumentChunk, system_prompt: str
|
|
57
139
|
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
58
|
-
|
|
140
|
+
result = await self._llm.complete_json(
|
|
59
141
|
messages=[{"role": "user", "content": chunk.text}],
|
|
142
|
+
schema=ChunkExtractionResult,
|
|
60
143
|
system=system_prompt,
|
|
61
144
|
temperature=0.0,
|
|
62
145
|
)
|
|
63
|
-
return
|
|
64
|
-
|
|
65
|
-
def _build_system_prompt(self, schema: OntologySchema) -> str:
|
|
66
|
-
node_descriptions = []
|
|
67
|
-
for nt in schema.node_types:
|
|
68
|
-
props = ", ".join(
|
|
69
|
-
f"{p.name} ({p.type}{', required' if p.required else ''})"
|
|
70
|
-
for p in nt.properties
|
|
71
|
-
)
|
|
72
|
-
node_descriptions.append(f"- {nt.label}: properties=[{props}]")
|
|
73
|
-
|
|
74
|
-
rel_descriptions = []
|
|
75
|
-
for rt in schema.relationship_types:
|
|
76
|
-
rel_descriptions.append(
|
|
77
|
-
f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
return (
|
|
81
|
-
"You are an entity extraction engine. Extract entities and relationships "
|
|
82
|
-
"from the provided text according to this schema.\n\n"
|
|
83
|
-
"ALLOWED NODE TYPES:\n"
|
|
84
|
-
+ "\n".join(node_descriptions)
|
|
85
|
-
+ "\n\nALLOWED RELATIONSHIP TYPES:\n"
|
|
86
|
-
+ "\n".join(rel_descriptions)
|
|
87
|
-
+ "\n\nDo not extract entities or relationships not listed above.\n\n"
|
|
88
|
-
"Respond with ONLY a JSON object in this exact format:\n"
|
|
89
|
-
'{"nodes": [{"id": "<unique_id>", "label": "<NodeType>", "properties": {<key>: <value>}}], '
|
|
90
|
-
'"relationships": [{"source_id": "<node_id>", "target_id": "<node_id>", "type": "<RelType>", "properties": {}}]}\n\n'
|
|
91
|
-
"Rules:\n"
|
|
92
|
-
"- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
|
|
93
|
-
"- Only use node types and relationship types listed above\n"
|
|
94
|
-
"- Include all required properties for each node type\n"
|
|
95
|
-
"- Return empty arrays if no entities are found"
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
def _parse_response(
|
|
99
|
-
self, response: str
|
|
100
|
-
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
101
|
-
data = json.loads(response)
|
|
102
|
-
|
|
103
|
-
nodes = [
|
|
104
|
-
ExtractedNode(
|
|
105
|
-
id=n["id"],
|
|
106
|
-
label=n["label"],
|
|
107
|
-
properties=n.get("properties", {}),
|
|
108
|
-
)
|
|
109
|
-
for n in data.get("nodes", [])
|
|
110
|
-
]
|
|
111
|
-
|
|
112
|
-
rels = [
|
|
113
|
-
ExtractedRelationship(
|
|
114
|
-
source_id=r["source_id"],
|
|
115
|
-
target_id=r["target_id"],
|
|
116
|
-
type=r["type"],
|
|
117
|
-
properties=r.get("properties", {}),
|
|
118
|
-
)
|
|
119
|
-
for r in data.get("relationships", [])
|
|
120
|
-
]
|
|
121
|
-
|
|
122
|
-
return nodes, rels
|
|
146
|
+
return result.nodes, result.relationships
|
|
123
147
|
|
|
124
148
|
def _validate(
|
|
125
149
|
self,
|
|
@@ -127,28 +151,4 @@ class LLMExtractionEngine:
|
|
|
127
151
|
rels: list[ExtractedRelationship],
|
|
128
152
|
schema: OntologySchema,
|
|
129
153
|
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
130
|
-
allowed_labels = {nt.label for nt in schema.node_types}
|
131
|
-
allowed_rel_types = {rt.type for rt in schema.relationship_types}
|
|
132
|
-
rel_constraints = {
|
|
133
|
-
rt.type: (set(rt.source_types), set(rt.target_types))
|
|
134
|
-
for rt in schema.relationship_types
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
valid_nodes = [n for n in nodes if n.label in allowed_labels]
|
|
138
|
-
valid_node_ids = {n.id for n in valid_nodes}
|
|
139
|
-
node_labels = {n.id: n.label for n in valid_nodes}
|
|
140
|
-
|
|
141
|
-
valid_rels = []
|
|
142
|
-
for rel in rels:
|
|
143
|
-
if rel.type not in allowed_rel_types:
|
|
144
|
-
continue
|
|
145
|
-
if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
|
|
146
|
-
continue
|
|
147
|
-
source_types, target_types = rel_constraints[rel.type]
|
|
148
|
-
if node_labels[rel.source_id] not in source_types:
|
|
149
|
-
continue
|
|
150
|
-
if node_labels[rel.target_id] not in target_types:
|
|
151
|
-
continue
|
|
152
|
-
valid_rels.append(rel)
|
|
153
|
-
|
|
154
|
-
return valid_nodes, valid_rels
|
|
154
|
+
return validate_extraction(nodes, rels, schema)
|
|
@@ -4,6 +4,8 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Protocol, runtime_checkable
|
|
6
6
|
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
7
9
|
from graphrag_core.models import (
|
|
8
10
|
AgentResult,
|
|
9
11
|
ApplyResult,
|
|
@@ -77,6 +79,15 @@ class LLMClient(Protocol):
|
|
|
77
79
|
max_tokens: int = 4096,
|
|
78
80
|
) -> str: ...
|
|
79
81
|
|
|
82
|
+
async def complete_json(
|
|
83
|
+
self,
|
|
84
|
+
messages: list[dict[str, str]],
|
|
85
|
+
schema: type[BaseModel],
|
|
86
|
+
system: str | None = None,
|
|
87
|
+
temperature: float = 0.0,
|
|
88
|
+
max_tokens: int = 4096,
|
|
89
|
+
) -> BaseModel: ...
|
|
90
|
+
|
|
80
91
|
|
|
81
92
|
# ---------------------------------------------------------------------------
|
|
82
93
|
# BB2: Schema-Guided Entity Extraction
|
|
@@ -94,6 +105,13 @@ class ExtractionEngine(Protocol):
|
|
|
94
105
|
) -> ExtractionResult: ...
|
|
95
106
|
|
|
96
107
|
|
|
108
|
+
@runtime_checkable
|
|
109
|
+
class ExtractionPromptBuilder(Protocol):
|
|
110
|
+
"""Builds the system prompt for LLM-based entity extraction."""
|
|
111
|
+
|
|
112
|
+
def build_system_prompt(self, schema: OntologySchema) -> str: ...
|
|
113
|
+
|
|
114
|
+
|
|
97
115
|
# ---------------------------------------------------------------------------
|
|
98
116
|
# BB3: Provenance-Native Knowledge Graph
|
|
99
117
|
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""LLM client implementations."""
|
|
2
|
+
|
|
3
|
+
from graphrag_core.llm.base import BaseLLMClient
|
|
4
|
+
|
|
5
|
+
__all__: list[str] = ["BaseLLMClient"]
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from graphrag_core.llm.anthropic import AnthropicLLMClient
|
|
9
|
+
__all__.append("AnthropicLLMClient")
|
|
10
|
+
except ImportError:
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from graphrag_core.llm.openai import OpenAILLMClient
|
|
15
|
+
__all__.append("OpenAILLMClient")
|
|
16
|
+
except ImportError:
|
|
17
|
+
pass
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Anthropic Claude LLM client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from anthropic import AsyncAnthropic
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AnthropicLLMClient:
|
|
10
|
+
"""Thin wrapper around the Anthropic SDK implementing the LLMClient Protocol."""
|
|
11
|
+
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
model: str = "claude-sonnet-4-20250514",
|
|
15
|
+
api_key: str | None = None,
|
|
16
|
+
) -> None:
|
|
17
|
+
self._model = model
|
|
18
|
+
self._client = AsyncAnthropic(api_key=api_key)
|
|
19
|
+
|
|
20
|
+
async def complete(
|
|
21
|
+
self,
|
|
22
|
+
messages: list[dict[str, str]],
|
|
23
|
+
system: str | None = None,
|
|
24
|
+
temperature: float = 0.0,
|
|
25
|
+
max_tokens: int = 4096,
|
|
26
|
+
) -> str:
|
|
27
|
+
kwargs: dict = {
|
|
28
|
+
"model": self._model,
|
|
29
|
+
"messages": messages,
|
|
30
|
+
"temperature": temperature,
|
|
31
|
+
"max_tokens": max_tokens,
|
|
32
|
+
}
|
|
33
|
+
if system is not None:
|
|
34
|
+
kwargs["system"] = system
|
|
35
|
+
response = await self._client.messages.create(**kwargs)
|
|
36
|
+
return response.content[0].text
|
|
37
|
+
|
|
38
|
+
async def complete_json(
|
|
39
|
+
self,
|
|
40
|
+
messages: list[dict[str, str]],
|
|
41
|
+
schema: type[BaseModel],
|
|
42
|
+
system: str | None = None,
|
|
43
|
+
temperature: float = 0.0,
|
|
44
|
+
max_tokens: int = 4096,
|
|
45
|
+
) -> BaseModel:
|
|
46
|
+
json_schema = schema.model_json_schema()
|
|
47
|
+
kwargs: dict = {
|
|
48
|
+
"model": self._model,
|
|
49
|
+
"messages": messages,
|
|
50
|
+
"temperature": temperature,
|
|
51
|
+
"max_tokens": max_tokens,
|
|
52
|
+
"tools": [
|
|
53
|
+
{
|
|
54
|
+
"name": "extract",
|
|
55
|
+
"description": "Extract structured data",
|
|
56
|
+
"input_schema": json_schema,
|
|
57
|
+
},
|
|
58
|
+
],
|
|
59
|
+
"tool_choice": {"type": "tool", "name": "extract"},
|
|
60
|
+
}
|
|
61
|
+
if system is not None:
|
|
62
|
+
kwargs["system"] = system
|
|
63
|
+
response = await self._client.messages.create(**kwargs)
|
|
64
|
+
tool_block = next(b for b in response.content if b.type == "tool_use")
|
|
65
|
+
return schema.model_validate(tool_block.input)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Base LLM client with fallback complete_json() via prompt + parse + retry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ValidationError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseLLMClient:
|
|
11
|
+
"""Default complete_json() via prompt + parse + retry.
|
|
12
|
+
|
|
13
|
+
Providers with native structured output (OpenAI, Anthropic) override
|
|
14
|
+
complete_json() directly. This base class provides a working fallback
|
|
15
|
+
for providers without native support (e.g., local model clients).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
async def complete(
|
|
19
|
+
self,
|
|
20
|
+
messages: list[dict[str, str]],
|
|
21
|
+
system: str | None = None,
|
|
22
|
+
temperature: float = 0.0,
|
|
23
|
+
max_tokens: int = 4096,
|
|
24
|
+
) -> str:
|
|
25
|
+
raise NotImplementedError
|
|
26
|
+
|
|
27
|
+
async def complete_json(
|
|
28
|
+
self,
|
|
29
|
+
messages: list[dict[str, str]],
|
|
30
|
+
schema: type[BaseModel],
|
|
31
|
+
system: str | None = None,
|
|
32
|
+
temperature: float = 0.0,
|
|
33
|
+
max_tokens: int = 4096,
|
|
34
|
+
) -> BaseModel:
|
|
35
|
+
schema_text = json.dumps(schema.model_json_schema(), indent=2)
|
|
36
|
+
augmented_system = (system or "") + (
|
|
37
|
+
f"\n\nRespond with ONLY a JSON object matching this schema:\n{schema_text}\n"
|
|
38
|
+
"No markdown fences. No explanation. Just the JSON object."
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
for attempt in range(2):
|
|
42
|
+
response = await self.complete(
|
|
43
|
+
messages, system=augmented_system, temperature=temperature, max_tokens=max_tokens,
|
|
44
|
+
)
|
|
45
|
+
text = self._strip_json(response)
|
|
46
|
+
try:
|
|
47
|
+
return schema.model_validate_json(text)
|
|
48
|
+
except (json.JSONDecodeError, ValidationError) as exc:
|
|
49
|
+
if attempt == 0:
|
|
50
|
+
augmented_system += (
|
|
51
|
+
f"\n\nYour previous response failed validation: {exc}\n"
|
|
52
|
+
"Try again. Return ONLY valid JSON."
|
|
53
|
+
)
|
|
54
|
+
else:
|
|
55
|
+
raise
|
|
56
|
+
raise RuntimeError("unreachable")
|
|
57
|
+
|
|
58
|
+
@staticmethod
|
|
59
|
+
def _strip_json(text: str) -> str:
|
|
60
|
+
text = text.strip()
|
|
61
|
+
if text.startswith("```"):
|
|
62
|
+
nl = text.find("\n")
|
|
63
|
+
text = text[nl + 1 :] if nl != -1 else ""
|
|
64
|
+
if text.endswith("```"):
|
|
65
|
+
text = text[:-3]
|
|
66
|
+
return text.strip()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""OpenAI LLM client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openai import AsyncOpenAI
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OpenAILLMClient:
|
|
10
|
+
"""Thin wrapper around the OpenAI SDK implementing the LLMClient Protocol."""
|
|
11
|
+
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
model: str = "gpt-4o",
|
|
15
|
+
api_key: str | None = None,
|
|
16
|
+
) -> None:
|
|
17
|
+
self._model = model
|
|
18
|
+
self._client = AsyncOpenAI(api_key=api_key)
|
|
19
|
+
|
|
20
|
+
async def complete(
|
|
21
|
+
self,
|
|
22
|
+
messages: list[dict[str, str]],
|
|
23
|
+
system: str | None = None,
|
|
24
|
+
temperature: float = 0.0,
|
|
25
|
+
max_tokens: int = 4096,
|
|
26
|
+
) -> str:
|
|
27
|
+
full_messages = list(messages)
|
|
28
|
+
if system is not None:
|
|
29
|
+
full_messages.insert(0, {"role": "system", "content": system})
|
|
30
|
+
response = await self._client.chat.completions.create(
|
|
31
|
+
model=self._model,
|
|
32
|
+
messages=full_messages,
|
|
33
|
+
temperature=temperature,
|
|
34
|
+
max_tokens=max_tokens,
|
|
35
|
+
)
|
|
36
|
+
return response.choices[0].message.content
|
|
37
|
+
|
|
38
|
+
async def complete_json(
|
|
39
|
+
self,
|
|
40
|
+
messages: list[dict[str, str]],
|
|
41
|
+
schema: type[BaseModel],
|
|
42
|
+
system: str | None = None,
|
|
43
|
+
temperature: float = 0.0,
|
|
44
|
+
max_tokens: int = 4096,
|
|
45
|
+
) -> BaseModel:
|
|
46
|
+
full_messages = list(messages)
|
|
47
|
+
if system is not None:
|
|
48
|
+
full_messages.insert(0, {"role": "system", "content": system})
|
|
49
|
+
json_schema = schema.model_json_schema()
|
|
50
|
+
response = await self._client.chat.completions.create(
|
|
51
|
+
model=self._model,
|
|
52
|
+
messages=full_messages,
|
|
53
|
+
temperature=temperature,
|
|
54
|
+
max_tokens=max_tokens,
|
|
55
|
+
response_format={
|
|
56
|
+
"type": "json_schema",
|
|
57
|
+
"json_schema": {
|
|
58
|
+
"name": schema.__name__,
|
|
59
|
+
"schema": json_schema,
|
|
60
|
+
"strict": True,
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
)
|
|
64
|
+
return schema.model_validate_json(response.choices[0].message.content)
|
|
@@ -68,12 +68,14 @@ class NodeTypeDefinition(BaseModel):
|
|
|
68
68
|
label: str
|
|
69
69
|
properties: list[PropertyDefinition]
|
|
70
70
|
required_properties: list[str] = []
|
|
71
|
+
description: str | None = None
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
class RelationshipTypeDefinition(BaseModel):
|
|
74
75
|
type: str
|
|
75
76
|
source_types: list[str]
|
|
76
77
|
target_types: list[str]
|
|
78
|
+
description: str | None = None
|
|
77
79
|
|
|
78
80
|
|
|
79
81
|
class OntologySchema(BaseModel):
|
|
@@ -106,6 +108,12 @@ class ExtractionResult(BaseModel):
|
|
|
106
108
|
provenance: list[ProvenanceLink]
|
|
107
109
|
|
|
108
110
|
|
|
111
|
+
class ChunkExtractionResult(BaseModel):
|
|
112
|
+
"""LLM extraction output for a single chunk (no provenance — engine adds that)."""
|
|
113
|
+
nodes: list[ExtractedNode]
|
|
114
|
+
relationships: list[ExtractedRelationship]
|
|
115
|
+
|
|
116
|
+
|
|
109
117
|
# ---------------------------------------------------------------------------
|
|
110
118
|
# BB3: Provenance-Native Knowledge Graph
|
|
111
119
|
# ---------------------------------------------------------------------------
|