graphrag-core 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/PKG-INFO +4 -1
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/pyproject.toml +3 -2
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/__init__.py +11 -0
- graphrag_core-0.3.0/src/graphrag_core/extraction/__init__.py +5 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/extraction/engine.py +53 -62
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/interfaces.py +11 -0
- graphrag_core-0.3.0/src/graphrag_core/llm/__init__.py +17 -0
- graphrag_core-0.3.0/src/graphrag_core/llm/anthropic.py +65 -0
- graphrag_core-0.3.0/src/graphrag_core/llm/base.py +66 -0
- graphrag_core-0.3.0/src/graphrag_core/llm/openai.py +64 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/models.py +8 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_extraction/test_engine.py +223 -8
- graphrag_core-0.3.0/tests/test_extraction_engine.py +128 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_interfaces.py +12 -0
- graphrag_core-0.3.0/tests/test_llm_anthropic.py +77 -0
- graphrag_core-0.3.0/tests/test_llm_base.py +75 -0
- graphrag_core-0.3.0/tests/test_llm_openai.py +125 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_models.py +23 -0
- graphrag_core-0.2.0/src/graphrag_core/extraction/__init__.py +0 -5
- graphrag_core-0.2.0/src/graphrag_core/llm/__init__.py +0 -9
- graphrag_core-0.2.0/src/graphrag_core/llm/anthropic.py +0 -35
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.github/workflows/release.yml +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.github/workflows/test.yml +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.gitignore +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/CHANGELOG.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/CLAUDE.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/LICENSE +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/README.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/graphrag_core_interface_spec.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-11-bb1-document-ingestion.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb2-bb3-extraction-and-graph.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb4-hybrid-search.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb5-bb6-curation-and-registry.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb7-bb8-tools-and-agents.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-release-readiness.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-11-bb1-document-ingestion-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb2-bb3-extraction-and-graph-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb4-hybrid-search-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb5-bb6-curation-and-registry-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb7-bb8-tools-and-agents-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-release-readiness-design.md +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/_cypher.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/context.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/orchestrator.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/detection.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/chunker.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/parsers.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/py.typed +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/matching.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/fusion.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/core_tools.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/library.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/conftest.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/test_context.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/test_orchestrator.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/test_detection.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/test_pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_extraction/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/test_memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/test_neo4j.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_chunker.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_parsers.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_pipeline.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_integration/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_integration/test_ingest_to_graph.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/test_matching.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/test_memory_registry.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_fusion.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_memory.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_neo4j_search.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/__init__.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/test_core_tools.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/test_library.py +0 -0
- {graphrag_core-0.2.0 → graphrag_core-0.3.0}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: graphrag-core
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs
|
|
5
5
|
Project-URL: Homepage, https://github.com/cdel1/graphrag-core
|
|
6
6
|
Project-URL: Repository, https://github.com/cdel1/graphrag-core
|
|
@@ -22,10 +22,13 @@ Requires-Dist: python-docx>=1.0
|
|
|
22
22
|
Provides-Extra: all
|
|
23
23
|
Requires-Dist: anthropic>=0.40; extra == 'all'
|
|
24
24
|
Requires-Dist: neo4j>=5.0; extra == 'all'
|
|
25
|
+
Requires-Dist: openai>=1.0; extra == 'all'
|
|
25
26
|
Provides-Extra: anthropic
|
|
26
27
|
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
27
28
|
Provides-Extra: neo4j
|
|
28
29
|
Requires-Dist: neo4j>=5.0; extra == 'neo4j'
|
|
30
|
+
Provides-Extra: openai
|
|
31
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
29
32
|
Description-Content-Type: text/markdown
|
|
30
33
|
|
|
31
34
|
# graphrag-core
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "graphrag-core"
|
|
3
|
-
version = "0.2.0"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs"
|
|
5
5
|
license = "MIT"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -36,7 +36,8 @@ markers = [
|
|
|
36
36
|
[project.optional-dependencies]
|
|
37
37
|
anthropic = ["anthropic>=0.40"]
|
|
38
38
|
neo4j = ["neo4j>=5.0"]
|
|
39
|
-
|
|
39
|
+
openai = ["openai>=1.0"]
|
|
40
|
+
all = ["graphrag-core[anthropic,neo4j,openai]"]
|
|
40
41
|
|
|
41
42
|
[project.urls]
|
|
42
43
|
Homepage = "https://github.com/cdel1/graphrag-core"
|
|
@@ -33,8 +33,10 @@ from graphrag_core.registry import InMemoryEntityRegistry
|
|
|
33
33
|
from graphrag_core.curation import DeterministicDetectionLayer, CurationPipeline
|
|
34
34
|
from graphrag_core.tools import Tool, ToolLibrary, register_core_tools
|
|
35
35
|
from graphrag_core.agents import AgentContext, SequentialOrchestrator
|
|
36
|
+
from graphrag_core.llm import BaseLLMClient
|
|
36
37
|
from graphrag_core.models import (
|
|
37
38
|
AgentResult,
|
|
39
|
+
ChunkExtractionResult,
|
|
38
40
|
CurationIssue,
|
|
39
41
|
CurationReport,
|
|
40
42
|
DocumentChunk,
|
|
@@ -78,6 +80,8 @@ __all__ = [
|
|
|
78
80
|
"PdfParser",
|
|
79
81
|
"TextParser",
|
|
80
82
|
"TokenChunker",
|
|
83
|
+
# LLM base
|
|
84
|
+
"BaseLLMClient",
|
|
81
85
|
# BB2 implementations
|
|
82
86
|
"LLMExtractionEngine",
|
|
83
87
|
# BB3 implementations
|
|
@@ -98,6 +102,7 @@ __all__ = [
|
|
|
98
102
|
"SequentialOrchestrator",
|
|
99
103
|
# Models
|
|
100
104
|
"AgentResult",
|
|
105
|
+
"ChunkExtractionResult",
|
|
101
106
|
"CurationIssue",
|
|
102
107
|
"CurationReport",
|
|
103
108
|
"DocumentChunk",
|
|
@@ -136,3 +141,9 @@ try:
|
|
|
136
141
|
__all__.append("AnthropicLLMClient")
|
|
137
142
|
except ImportError:
|
|
138
143
|
pass
|
|
144
|
+
|
|
145
|
+
try:
|
|
146
|
+
from graphrag_core.llm import OpenAILLMClient
|
|
147
|
+
__all__.append("OpenAILLMClient")
|
|
148
|
+
except ImportError:
|
|
149
|
+
pass
|
|
@@ -2,10 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import json
|
|
6
|
-
|
|
7
5
|
from graphrag_core.interfaces import LLMClient
|
|
8
6
|
from graphrag_core.models import (
|
|
7
|
+
ChunkExtractionResult,
|
|
9
8
|
DocumentChunk,
|
|
10
9
|
ExtractedNode,
|
|
11
10
|
ExtractedRelationship,
|
|
@@ -16,6 +15,46 @@ from graphrag_core.models import (
|
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
|
|
18
|
+
def validate_extraction(
|
|
19
|
+
nodes: list[ExtractedNode],
|
|
20
|
+
rels: list[ExtractedRelationship],
|
|
21
|
+
schema: OntologySchema,
|
|
22
|
+
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
23
|
+
"""Filter extracted nodes and relationships to match schema constraints.
|
|
24
|
+
|
|
25
|
+
Removes:
|
|
26
|
+
- Nodes with labels not in the schema
|
|
27
|
+
- Relationships with types not in the schema
|
|
28
|
+
- Relationships referencing non-existent node IDs
|
|
29
|
+
- Relationships violating source/target type constraints
|
|
30
|
+
"""
|
|
31
|
+
allowed_labels = {nt.label for nt in schema.node_types}
|
|
32
|
+
allowed_rel_types = {rt.type for rt in schema.relationship_types}
|
|
33
|
+
rel_constraints = {
|
|
34
|
+
rt.type: (set(rt.source_types), set(rt.target_types))
|
|
35
|
+
for rt in schema.relationship_types
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
valid_nodes = [n for n in nodes if n.label in allowed_labels]
|
|
39
|
+
valid_node_ids = {n.id for n in valid_nodes}
|
|
40
|
+
node_labels = {n.id: n.label for n in valid_nodes}
|
|
41
|
+
|
|
42
|
+
valid_rels = []
|
|
43
|
+
for rel in rels:
|
|
44
|
+
if rel.type not in allowed_rel_types:
|
|
45
|
+
continue
|
|
46
|
+
if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
|
|
47
|
+
continue
|
|
48
|
+
source_types, target_types = rel_constraints[rel.type]
|
|
49
|
+
if node_labels[rel.source_id] not in source_types:
|
|
50
|
+
continue
|
|
51
|
+
if node_labels[rel.target_id] not in target_types:
|
|
52
|
+
continue
|
|
53
|
+
valid_rels.append(rel)
|
|
54
|
+
|
|
55
|
+
return valid_nodes, valid_rels
|
|
56
|
+
|
|
57
|
+
|
|
19
58
|
class LLMExtractionEngine:
|
|
20
59
|
"""Extracts entities and relationships from text using an LLM, guided by an ontology schema."""
|
|
21
60
|
|
|
@@ -55,12 +94,13 @@ class LLMExtractionEngine:
|
|
|
55
94
|
async def _extract_chunk(
|
|
56
95
|
self, chunk: DocumentChunk, system_prompt: str
|
|
57
96
|
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
58
|
-
|
|
97
|
+
result = await self._llm.complete_json(
|
|
59
98
|
messages=[{"role": "user", "content": chunk.text}],
|
|
99
|
+
schema=ChunkExtractionResult,
|
|
60
100
|
system=system_prompt,
|
|
61
101
|
temperature=0.0,
|
|
62
102
|
)
|
|
63
|
-
return
|
|
103
|
+
return result.nodes, result.relationships
|
|
64
104
|
|
|
65
105
|
def _build_system_prompt(self, schema: OntologySchema) -> str:
|
|
66
106
|
node_descriptions = []
|
|
@@ -69,13 +109,17 @@ class LLMExtractionEngine:
|
|
|
69
109
|
f"{p.name} ({p.type}{', required' if p.required else ''})"
|
|
70
110
|
for p in nt.properties
|
|
71
111
|
)
|
|
72
|
-
|
|
112
|
+
line = f"- {nt.label}: properties=[{props}]"
|
|
113
|
+
if nt.description:
|
|
114
|
+
line += f" \u2014 {nt.description}"
|
|
115
|
+
node_descriptions.append(line)
|
|
73
116
|
|
|
74
117
|
rel_descriptions = []
|
|
75
118
|
for rt in schema.relationship_types:
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
119
|
+
line = f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
|
|
120
|
+
if rt.description:
|
|
121
|
+
line += f" \u2014 {rt.description}"
|
|
122
|
+
rel_descriptions.append(line)
|
|
79
123
|
|
|
80
124
|
return (
|
|
81
125
|
"You are an entity extraction engine. Extract entities and relationships "
|
|
@@ -85,9 +129,6 @@ class LLMExtractionEngine:
|
|
|
85
129
|
+ "\n\nALLOWED RELATIONSHIP TYPES:\n"
|
|
86
130
|
+ "\n".join(rel_descriptions)
|
|
87
131
|
+ "\n\nDo not extract entities or relationships not listed above.\n\n"
|
|
88
|
-
"Respond with ONLY a JSON object in this exact format:\n"
|
|
89
|
-
'{"nodes": [{"id": "<unique_id>", "label": "<NodeType>", "properties": {<key>: <value>}}], '
|
|
90
|
-
'"relationships": [{"source_id": "<node_id>", "target_id": "<node_id>", "type": "<RelType>", "properties": {}}]}\n\n'
|
|
91
132
|
"Rules:\n"
|
|
92
133
|
"- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
|
|
93
134
|
"- Only use node types and relationship types listed above\n"
|
|
@@ -95,60 +136,10 @@ class LLMExtractionEngine:
|
|
|
95
136
|
"- Return empty arrays if no entities are found"
|
|
96
137
|
)
|
|
97
138
|
|
|
98
|
-
def _parse_response(
|
|
99
|
-
self, response: str
|
|
100
|
-
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
101
|
-
data = json.loads(response)
|
|
102
|
-
|
|
103
|
-
nodes = [
|
|
104
|
-
ExtractedNode(
|
|
105
|
-
id=n["id"],
|
|
106
|
-
label=n["label"],
|
|
107
|
-
properties=n.get("properties", {}),
|
|
108
|
-
)
|
|
109
|
-
for n in data.get("nodes", [])
|
|
110
|
-
]
|
|
111
|
-
|
|
112
|
-
rels = [
|
|
113
|
-
ExtractedRelationship(
|
|
114
|
-
source_id=r["source_id"],
|
|
115
|
-
target_id=r["target_id"],
|
|
116
|
-
type=r["type"],
|
|
117
|
-
properties=r.get("properties", {}),
|
|
118
|
-
)
|
|
119
|
-
for r in data.get("relationships", [])
|
|
120
|
-
]
|
|
121
|
-
|
|
122
|
-
return nodes, rels
|
|
123
|
-
|
|
124
139
|
def _validate(
|
|
125
140
|
self,
|
|
126
141
|
nodes: list[ExtractedNode],
|
|
127
142
|
rels: list[ExtractedRelationship],
|
|
128
143
|
schema: OntologySchema,
|
|
129
144
|
) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
|
|
130
|
-
allowed_labels = {nt.label for nt in schema.node_types}
|
131
|
-
allowed_rel_types = {rt.type for rt in schema.relationship_types}
|
|
132
|
-
rel_constraints = {
|
|
133
|
-
rt.type: (set(rt.source_types), set(rt.target_types))
|
|
134
|
-
for rt in schema.relationship_types
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
valid_nodes = [n for n in nodes if n.label in allowed_labels]
|
|
138
|
-
valid_node_ids = {n.id for n in valid_nodes}
|
|
139
|
-
node_labels = {n.id: n.label for n in valid_nodes}
|
|
140
|
-
|
|
141
|
-
valid_rels = []
|
|
142
|
-
for rel in rels:
|
|
143
|
-
if rel.type not in allowed_rel_types:
|
|
144
|
-
continue
|
|
145
|
-
if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
|
|
146
|
-
continue
|
|
147
|
-
source_types, target_types = rel_constraints[rel.type]
|
|
148
|
-
if node_labels[rel.source_id] not in source_types:
|
|
149
|
-
continue
|
|
150
|
-
if node_labels[rel.target_id] not in target_types:
|
|
151
|
-
continue
|
|
152
|
-
valid_rels.append(rel)
|
|
153
|
-
|
|
154
|
-
return valid_nodes, valid_rels
|
|
145
|
+
return validate_extraction(nodes, rels, schema)
|
|
@@ -4,6 +4,8 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Protocol, runtime_checkable
|
|
6
6
|
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
7
9
|
from graphrag_core.models import (
|
|
8
10
|
AgentResult,
|
|
9
11
|
ApplyResult,
|
|
@@ -77,6 +79,15 @@ class LLMClient(Protocol):
|
|
|
77
79
|
max_tokens: int = 4096,
|
|
78
80
|
) -> str: ...
|
|
79
81
|
|
|
82
|
+
async def complete_json(
|
|
83
|
+
self,
|
|
84
|
+
messages: list[dict[str, str]],
|
|
85
|
+
schema: type[BaseModel],
|
|
86
|
+
system: str | None = None,
|
|
87
|
+
temperature: float = 0.0,
|
|
88
|
+
max_tokens: int = 4096,
|
|
89
|
+
) -> BaseModel: ...
|
|
90
|
+
|
|
80
91
|
|
|
81
92
|
# ---------------------------------------------------------------------------
|
|
82
93
|
# BB2: Schema-Guided Entity Extraction
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""LLM client implementations."""
|
|
2
|
+
|
|
3
|
+
from graphrag_core.llm.base import BaseLLMClient
|
|
4
|
+
|
|
5
|
+
__all__: list[str] = ["BaseLLMClient"]
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from graphrag_core.llm.anthropic import AnthropicLLMClient
|
|
9
|
+
__all__.append("AnthropicLLMClient")
|
|
10
|
+
except ImportError:
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
from graphrag_core.llm.openai import OpenAILLMClient
|
|
15
|
+
__all__.append("OpenAILLMClient")
|
|
16
|
+
except ImportError:
|
|
17
|
+
pass
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Anthropic Claude LLM client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from anthropic import AsyncAnthropic
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AnthropicLLMClient:
|
|
10
|
+
"""Thin wrapper around the Anthropic SDK implementing the LLMClient Protocol."""
|
|
11
|
+
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
model: str = "claude-sonnet-4-20250514",
|
|
15
|
+
api_key: str | None = None,
|
|
16
|
+
) -> None:
|
|
17
|
+
self._model = model
|
|
18
|
+
self._client = AsyncAnthropic(api_key=api_key)
|
|
19
|
+
|
|
20
|
+
async def complete(
|
|
21
|
+
self,
|
|
22
|
+
messages: list[dict[str, str]],
|
|
23
|
+
system: str | None = None,
|
|
24
|
+
temperature: float = 0.0,
|
|
25
|
+
max_tokens: int = 4096,
|
|
26
|
+
) -> str:
|
|
27
|
+
kwargs: dict = {
|
|
28
|
+
"model": self._model,
|
|
29
|
+
"messages": messages,
|
|
30
|
+
"temperature": temperature,
|
|
31
|
+
"max_tokens": max_tokens,
|
|
32
|
+
}
|
|
33
|
+
if system is not None:
|
|
34
|
+
kwargs["system"] = system
|
|
35
|
+
response = await self._client.messages.create(**kwargs)
|
|
36
|
+
return response.content[0].text
|
|
37
|
+
|
|
38
|
+
async def complete_json(
|
|
39
|
+
self,
|
|
40
|
+
messages: list[dict[str, str]],
|
|
41
|
+
schema: type[BaseModel],
|
|
42
|
+
system: str | None = None,
|
|
43
|
+
temperature: float = 0.0,
|
|
44
|
+
max_tokens: int = 4096,
|
|
45
|
+
) -> BaseModel:
|
|
46
|
+
json_schema = schema.model_json_schema()
|
|
47
|
+
kwargs: dict = {
|
|
48
|
+
"model": self._model,
|
|
49
|
+
"messages": messages,
|
|
50
|
+
"temperature": temperature,
|
|
51
|
+
"max_tokens": max_tokens,
|
|
52
|
+
"tools": [
|
|
53
|
+
{
|
|
54
|
+
"name": "extract",
|
|
55
|
+
"description": "Extract structured data",
|
|
56
|
+
"input_schema": json_schema,
|
|
57
|
+
},
|
|
58
|
+
],
|
|
59
|
+
"tool_choice": {"type": "tool", "name": "extract"},
|
|
60
|
+
}
|
|
61
|
+
if system is not None:
|
|
62
|
+
kwargs["system"] = system
|
|
63
|
+
response = await self._client.messages.create(**kwargs)
|
|
64
|
+
tool_block = next(b for b in response.content if b.type == "tool_use")
|
|
65
|
+
return schema.model_validate(tool_block.input)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Base LLM client with fallback complete_json() via prompt + parse + retry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ValidationError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseLLMClient:
|
|
11
|
+
"""Default complete_json() via prompt + parse + retry.
|
|
12
|
+
|
|
13
|
+
Providers with native structured output (OpenAI, Anthropic) override
|
|
14
|
+
complete_json() directly. This base class provides a working fallback
|
|
15
|
+
for providers without native support (e.g., local model clients).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
async def complete(
|
|
19
|
+
self,
|
|
20
|
+
messages: list[dict[str, str]],
|
|
21
|
+
system: str | None = None,
|
|
22
|
+
temperature: float = 0.0,
|
|
23
|
+
max_tokens: int = 4096,
|
|
24
|
+
) -> str:
|
|
25
|
+
raise NotImplementedError
|
|
26
|
+
|
|
27
|
+
async def complete_json(
|
|
28
|
+
self,
|
|
29
|
+
messages: list[dict[str, str]],
|
|
30
|
+
schema: type[BaseModel],
|
|
31
|
+
system: str | None = None,
|
|
32
|
+
temperature: float = 0.0,
|
|
33
|
+
max_tokens: int = 4096,
|
|
34
|
+
) -> BaseModel:
|
|
35
|
+
schema_text = json.dumps(schema.model_json_schema(), indent=2)
|
|
36
|
+
augmented_system = (system or "") + (
|
|
37
|
+
f"\n\nRespond with ONLY a JSON object matching this schema:\n{schema_text}\n"
|
|
38
|
+
"No markdown fences. No explanation. Just the JSON object."
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
for attempt in range(2):
|
|
42
|
+
response = await self.complete(
|
|
43
|
+
messages, system=augmented_system, temperature=temperature, max_tokens=max_tokens,
|
|
44
|
+
)
|
|
45
|
+
text = self._strip_json(response)
|
|
46
|
+
try:
|
|
47
|
+
return schema.model_validate_json(text)
|
|
48
|
+
except (json.JSONDecodeError, ValidationError) as exc:
|
|
49
|
+
if attempt == 0:
|
|
50
|
+
augmented_system += (
|
|
51
|
+
f"\n\nYour previous response failed validation: {exc}\n"
|
|
52
|
+
"Try again. Return ONLY valid JSON."
|
|
53
|
+
)
|
|
54
|
+
else:
|
|
55
|
+
raise
|
|
56
|
+
raise RuntimeError("unreachable")
|
|
57
|
+
|
|
58
|
+
@staticmethod
|
|
59
|
+
def _strip_json(text: str) -> str:
|
|
60
|
+
text = text.strip()
|
|
61
|
+
if text.startswith("```"):
|
|
62
|
+
nl = text.find("\n")
|
|
63
|
+
text = text[nl + 1 :] if nl != -1 else ""
|
|
64
|
+
if text.endswith("```"):
|
|
65
|
+
text = text[:-3]
|
|
66
|
+
return text.strip()
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""OpenAI LLM client."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openai import AsyncOpenAI
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OpenAILLMClient:
|
|
10
|
+
"""Thin wrapper around the OpenAI SDK implementing the LLMClient Protocol."""
|
|
11
|
+
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
model: str = "gpt-4o",
|
|
15
|
+
api_key: str | None = None,
|
|
16
|
+
) -> None:
|
|
17
|
+
self._model = model
|
|
18
|
+
self._client = AsyncOpenAI(api_key=api_key)
|
|
19
|
+
|
|
20
|
+
async def complete(
|
|
21
|
+
self,
|
|
22
|
+
messages: list[dict[str, str]],
|
|
23
|
+
system: str | None = None,
|
|
24
|
+
temperature: float = 0.0,
|
|
25
|
+
max_tokens: int = 4096,
|
|
26
|
+
) -> str:
|
|
27
|
+
full_messages = list(messages)
|
|
28
|
+
if system is not None:
|
|
29
|
+
full_messages.insert(0, {"role": "system", "content": system})
|
|
30
|
+
response = await self._client.chat.completions.create(
|
|
31
|
+
model=self._model,
|
|
32
|
+
messages=full_messages,
|
|
33
|
+
temperature=temperature,
|
|
34
|
+
max_tokens=max_tokens,
|
|
35
|
+
)
|
|
36
|
+
return response.choices[0].message.content
|
|
37
|
+
|
|
38
|
+
async def complete_json(
|
|
39
|
+
self,
|
|
40
|
+
messages: list[dict[str, str]],
|
|
41
|
+
schema: type[BaseModel],
|
|
42
|
+
system: str | None = None,
|
|
43
|
+
temperature: float = 0.0,
|
|
44
|
+
max_tokens: int = 4096,
|
|
45
|
+
) -> BaseModel:
|
|
46
|
+
full_messages = list(messages)
|
|
47
|
+
if system is not None:
|
|
48
|
+
full_messages.insert(0, {"role": "system", "content": system})
|
|
49
|
+
json_schema = schema.model_json_schema()
|
|
50
|
+
response = await self._client.chat.completions.create(
|
|
51
|
+
model=self._model,
|
|
52
|
+
messages=full_messages,
|
|
53
|
+
temperature=temperature,
|
|
54
|
+
max_tokens=max_tokens,
|
|
55
|
+
response_format={
|
|
56
|
+
"type": "json_schema",
|
|
57
|
+
"json_schema": {
|
|
58
|
+
"name": schema.__name__,
|
|
59
|
+
"schema": json_schema,
|
|
60
|
+
"strict": True,
|
|
61
|
+
},
|
|
62
|
+
},
|
|
63
|
+
)
|
|
64
|
+
return schema.model_validate_json(response.choices[0].message.content)
|
|
@@ -68,12 +68,14 @@ class NodeTypeDefinition(BaseModel):
|
|
|
68
68
|
label: str
|
|
69
69
|
properties: list[PropertyDefinition]
|
|
70
70
|
required_properties: list[str] = []
|
|
71
|
+
description: str | None = None
|
|
71
72
|
|
|
72
73
|
|
|
73
74
|
class RelationshipTypeDefinition(BaseModel):
|
|
74
75
|
type: str
|
|
75
76
|
source_types: list[str]
|
|
76
77
|
target_types: list[str]
|
|
78
|
+
description: str | None = None
|
|
77
79
|
|
|
78
80
|
|
|
79
81
|
class OntologySchema(BaseModel):
|
|
@@ -106,6 +108,12 @@ class ExtractionResult(BaseModel):
|
|
|
106
108
|
provenance: list[ProvenanceLink]
|
|
107
109
|
|
|
108
110
|
|
|
111
|
+
class ChunkExtractionResult(BaseModel):
|
|
112
|
+
"""LLM extraction output for a single chunk (no provenance — engine adds that)."""
|
|
113
|
+
nodes: list[ExtractedNode]
|
|
114
|
+
relationships: list[ExtractedRelationship]
|
|
115
|
+
|
|
116
|
+
|
|
109
117
|
# ---------------------------------------------------------------------------
|
|
110
118
|
# BB3: Provenance-Native Knowledge Graph
|
|
111
119
|
# ---------------------------------------------------------------------------
|