graphrag-core 0.2.0__tar.gz → 0.4.0__tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/PKG-INFO +4 -1
  2. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/pyproject.toml +3 -2
  3. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/__init__.py +15 -1
  4. graphrag_core-0.4.0/src/graphrag_core/extraction/__init__.py +5 -0
  5. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/extraction/engine.py +91 -91
  6. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/interfaces.py +18 -0
  7. graphrag_core-0.4.0/src/graphrag_core/llm/__init__.py +17 -0
  8. graphrag_core-0.4.0/src/graphrag_core/llm/anthropic.py +65 -0
  9. graphrag_core-0.4.0/src/graphrag_core/llm/base.py +66 -0
  10. graphrag_core-0.4.0/src/graphrag_core/llm/openai.py +64 -0
  11. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/models.py +8 -0
  12. graphrag_core-0.4.0/tests/test_extraction/test_engine.py +583 -0
  13. graphrag_core-0.4.0/tests/test_extraction_engine.py +128 -0
  14. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_interfaces.py +12 -0
  15. graphrag_core-0.4.0/tests/test_llm_anthropic.py +77 -0
  16. graphrag_core-0.4.0/tests/test_llm_base.py +75 -0
  17. graphrag_core-0.4.0/tests/test_llm_openai.py +125 -0
  18. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_models.py +23 -0
  19. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/uv.lock +260 -1
  20. graphrag_core-0.2.0/src/graphrag_core/extraction/__init__.py +0 -5
  21. graphrag_core-0.2.0/src/graphrag_core/llm/__init__.py +0 -9
  22. graphrag_core-0.2.0/src/graphrag_core/llm/anthropic.py +0 -35
  23. graphrag_core-0.2.0/tests/test_extraction/test_engine.py +0 -271
  24. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.github/workflows/release.yml +0 -0
  25. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.github/workflows/test.yml +0 -0
  26. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/.gitignore +0 -0
  27. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/CHANGELOG.md +0 -0
  28. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/CLAUDE.md +0 -0
  29. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/LICENSE +0 -0
  30. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/README.md +0 -0
  31. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/graphrag_core_interface_spec.md +0 -0
  32. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-11-bb1-document-ingestion.md +0 -0
  33. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb2-bb3-extraction-and-graph.md +0 -0
  34. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb4-hybrid-search.md +0 -0
  35. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb5-bb6-curation-and-registry.md +0 -0
  36. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-bb7-bb8-tools-and-agents.md +0 -0
  37. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/plans/2026-04-12-release-readiness.md +0 -0
  38. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-11-bb1-document-ingestion-design.md +0 -0
  39. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb2-bb3-extraction-and-graph-design.md +0 -0
  40. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb4-hybrid-search-design.md +0 -0
  41. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb5-bb6-curation-and-registry-design.md +0 -0
  42. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-bb7-bb8-tools-and-agents-design.md +0 -0
  43. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/docs/superpowers/specs/2026-04-12-release-readiness-design.md +0 -0
  44. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/_cypher.py +0 -0
  45. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/__init__.py +0 -0
  46. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/context.py +0 -0
  47. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/agents/orchestrator.py +0 -0
  48. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/__init__.py +0 -0
  49. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/detection.py +0 -0
  50. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/curation/pipeline.py +0 -0
  51. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/__init__.py +0 -0
  52. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/memory.py +0 -0
  53. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/graph/neo4j.py +0 -0
  54. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/__init__.py +0 -0
  55. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/chunker.py +0 -0
  56. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/parsers.py +0 -0
  57. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/ingestion/pipeline.py +0 -0
  58. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/py.typed +0 -0
  59. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/__init__.py +0 -0
  60. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/matching.py +0 -0
  61. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/registry/memory.py +0 -0
  62. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/__init__.py +0 -0
  63. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/fusion.py +0 -0
  64. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/memory.py +0 -0
  65. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/search/neo4j.py +0 -0
  66. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/__init__.py +0 -0
  67. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/core_tools.py +0 -0
  68. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/tools/library.py +0 -0
  69. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/__init__.py +0 -0
  70. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/conftest.py +0 -0
  71. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/__init__.py +0 -0
  72. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/test_context.py +0 -0
  73. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_agents/test_orchestrator.py +0 -0
  74. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/__init__.py +0 -0
  75. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/test_detection.py +0 -0
  76. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_curation/test_pipeline.py +0 -0
  77. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_extraction/__init__.py +0 -0
  78. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/__init__.py +0 -0
  79. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/test_memory.py +0 -0
  80. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_graph/test_neo4j.py +0 -0
  81. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/__init__.py +0 -0
  82. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_chunker.py +0 -0
  83. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_parsers.py +0 -0
  84. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_ingestion/test_pipeline.py +0 -0
  85. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_integration/__init__.py +0 -0
  86. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_integration/test_ingest_to_graph.py +0 -0
  87. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/__init__.py +0 -0
  88. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/test_matching.py +0 -0
  89. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_registry/test_memory_registry.py +0 -0
  90. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/__init__.py +0 -0
  91. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_fusion.py +0 -0
  92. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_memory.py +0 -0
  93. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_search/test_neo4j_search.py +0 -0
  94. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/__init__.py +0 -0
  95. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/test_core_tools.py +0 -0
  96. {graphrag_core-0.2.0 → graphrag_core-0.4.0}/tests/test_tools/test_library.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graphrag-core
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs
5
5
  Project-URL: Homepage, https://github.com/cdel1/graphrag-core
6
6
  Project-URL: Repository, https://github.com/cdel1/graphrag-core
@@ -22,10 +22,13 @@ Requires-Dist: python-docx>=1.0
22
22
  Provides-Extra: all
23
23
  Requires-Dist: anthropic>=0.40; extra == 'all'
24
24
  Requires-Dist: neo4j>=5.0; extra == 'all'
25
+ Requires-Dist: openai>=1.0; extra == 'all'
25
26
  Provides-Extra: anthropic
26
27
  Requires-Dist: anthropic>=0.40; extra == 'anthropic'
27
28
  Provides-Extra: neo4j
28
29
  Requires-Dist: neo4j>=5.0; extra == 'neo4j'
30
+ Provides-Extra: openai
31
+ Requires-Dist: openai>=1.0; extra == 'openai'
29
32
  Description-Content-Type: text/markdown
30
33
 
31
34
  # graphrag-core
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-core"
3
- version = "0.2.0"
3
+ version = "0.4.0"
4
4
  description = "Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs"
5
5
  license = "MIT"
6
6
  requires-python = ">=3.12"
@@ -36,7 +36,8 @@ markers = [
36
36
  [project.optional-dependencies]
37
37
  anthropic = ["anthropic>=0.40"]
38
38
  neo4j = ["neo4j>=5.0"]
39
- all = ["graphrag-core[anthropic,neo4j]"]
39
+ openai = ["openai>=1.0"]
40
+ all = ["graphrag-core[anthropic,neo4j,openai]"]
40
41
 
41
42
  [project.urls]
42
43
  Homepage = "https://github.com/cdel1/graphrag-core"
@@ -11,6 +11,7 @@ from graphrag_core.interfaces import (
11
11
  EmbeddingModel,
12
12
  EntityRegistry,
13
13
  ExtractionEngine,
14
+ ExtractionPromptBuilder,
14
15
  GraphStore,
15
16
  IngestionPipeline,
16
17
  LLMClient,
@@ -26,15 +27,17 @@ from graphrag_core.ingestion import (
26
27
  TextParser,
27
28
  TokenChunker,
28
29
  )
29
- from graphrag_core.extraction import LLMExtractionEngine
30
+ from graphrag_core.extraction import DefaultPromptBuilder, LLMExtractionEngine
30
31
  from graphrag_core.graph import InMemoryGraphStore
31
32
  from graphrag_core.search import InMemorySearchEngine
32
33
  from graphrag_core.registry import InMemoryEntityRegistry
33
34
  from graphrag_core.curation import DeterministicDetectionLayer, CurationPipeline
34
35
  from graphrag_core.tools import Tool, ToolLibrary, register_core_tools
35
36
  from graphrag_core.agents import AgentContext, SequentialOrchestrator
37
+ from graphrag_core.llm import BaseLLMClient
36
38
  from graphrag_core.models import (
37
39
  AgentResult,
40
+ ChunkExtractionResult,
38
41
  CurationIssue,
39
42
  CurationReport,
40
43
  DocumentChunk,
@@ -65,6 +68,7 @@ __all__ = [
65
68
  "EmbeddingModel",
66
69
  "EntityRegistry",
67
70
  "ExtractionEngine",
71
+ "ExtractionPromptBuilder",
68
72
  "GraphStore",
69
73
  "IngestionPipeline",
70
74
  "LLMClient",
@@ -78,7 +82,10 @@ __all__ = [
78
82
  "PdfParser",
79
83
  "TextParser",
80
84
  "TokenChunker",
85
+ # LLM base
86
+ "BaseLLMClient",
81
87
  # BB2 implementations
88
+ "DefaultPromptBuilder",
82
89
  "LLMExtractionEngine",
83
90
  # BB3 implementations
84
91
  "InMemoryGraphStore",
@@ -98,6 +105,7 @@ __all__ = [
98
105
  "SequentialOrchestrator",
99
106
  # Models
100
107
  "AgentResult",
108
+ "ChunkExtractionResult",
101
109
  "CurationIssue",
102
110
  "CurationReport",
103
111
  "DocumentChunk",
@@ -136,3 +144,9 @@ try:
136
144
  __all__.append("AnthropicLLMClient")
137
145
  except ImportError:
138
146
  pass
147
+
148
+ try:
149
+ from graphrag_core.llm import OpenAILLMClient
150
+ __all__.append("OpenAILLMClient")
151
+ except ImportError:
152
+ pass
@@ -0,0 +1,5 @@
1
+ """BB2: Schema-guided entity extraction."""
2
+
3
+ from graphrag_core.extraction.engine import DefaultPromptBuilder, LLMExtractionEngine, validate_extraction
4
+
5
+ __all__ = ["DefaultPromptBuilder", "LLMExtractionEngine", "validate_extraction"]
@@ -2,10 +2,9 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import json
6
-
7
- from graphrag_core.interfaces import LLMClient
5
+ from graphrag_core.interfaces import ExtractionPromptBuilder, LLMClient
8
6
  from graphrag_core.models import (
7
+ ChunkExtractionResult,
9
8
  DocumentChunk,
10
9
  ExtractedNode,
11
10
  ExtractedRelationship,
@@ -16,11 +15,94 @@ from graphrag_core.models import (
16
15
  )
17
16
 
18
17
 
18
+ def validate_extraction(
19
+ nodes: list[ExtractedNode],
20
+ rels: list[ExtractedRelationship],
21
+ schema: OntologySchema,
22
+ ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
23
+ """Filter extracted nodes and relationships to match schema constraints.
24
+
25
+ Removes:
26
+ - Nodes with labels not in the schema
27
+ - Relationships with types not in the schema
28
+ - Relationships referencing non-existent node IDs
29
+ - Relationships violating source/target type constraints
30
+ """
31
+ allowed_labels = {nt.label for nt in schema.node_types}
32
+ allowed_rel_types = {rt.type for rt in schema.relationship_types}
33
+ rel_constraints = {
34
+ rt.type: (set(rt.source_types), set(rt.target_types))
35
+ for rt in schema.relationship_types
36
+ }
37
+
38
+ valid_nodes = [n for n in nodes if n.label in allowed_labels]
39
+ valid_node_ids = {n.id for n in valid_nodes}
40
+ node_labels = {n.id: n.label for n in valid_nodes}
41
+
42
+ valid_rels = []
43
+ for rel in rels:
44
+ if rel.type not in allowed_rel_types:
45
+ continue
46
+ if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
47
+ continue
48
+ source_types, target_types = rel_constraints[rel.type]
49
+ if node_labels[rel.source_id] not in source_types:
50
+ continue
51
+ if node_labels[rel.target_id] not in target_types:
52
+ continue
53
+ valid_rels.append(rel)
54
+
55
+ return valid_nodes, valid_rels
56
+
57
+
58
+ class DefaultPromptBuilder:
59
+ """Builds the default system prompt for LLM-based entity extraction."""
60
+
61
+ def build_system_prompt(self, schema: OntologySchema) -> str:
62
+ node_descriptions = []
63
+ for nt in schema.node_types:
64
+ props = ", ".join(
65
+ f"{p.name} ({p.type}{', required' if p.required else ''})"
66
+ for p in nt.properties
67
+ )
68
+ line = f"- {nt.label}: properties=[{props}]"
69
+ if nt.description:
70
+ line += f" \u2014 {nt.description}"
71
+ node_descriptions.append(line)
72
+
73
+ rel_descriptions = []
74
+ for rt in schema.relationship_types:
75
+ line = f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
76
+ if rt.description:
77
+ line += f" \u2014 {rt.description}"
78
+ rel_descriptions.append(line)
79
+
80
+ return (
81
+ "You are an entity extraction engine. Extract entities and relationships "
82
+ "from the provided text according to this schema.\n\n"
83
+ "ALLOWED NODE TYPES:\n"
84
+ + "\n".join(node_descriptions)
85
+ + "\n\nALLOWED RELATIONSHIP TYPES:\n"
86
+ + "\n".join(rel_descriptions)
87
+ + "\n\nDo not extract entities or relationships not listed above.\n\n"
88
+ "Rules:\n"
89
+ "- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
90
+ "- Only use node types and relationship types listed above\n"
91
+ "- Include all required properties for each node type\n"
92
+ "- Return empty arrays if no entities are found"
93
+ )
94
+
95
+
19
96
  class LLMExtractionEngine:
20
97
  """Extracts entities and relationships from text using an LLM, guided by an ontology schema."""
21
98
 
22
- def __init__(self, llm_client: LLMClient) -> None:
99
+ def __init__(
100
+ self,
101
+ llm_client: LLMClient,
102
+ prompt_builder: ExtractionPromptBuilder | None = None,
103
+ ) -> None:
23
104
  self._llm = llm_client
105
+ self._prompt_builder = prompt_builder or DefaultPromptBuilder()
24
106
 
25
107
  async def extract(
26
108
  self,
@@ -32,7 +114,7 @@ class LLMExtractionEngine:
32
114
  all_rels: list[ExtractedRelationship] = []
33
115
  all_provenance: list[ProvenanceLink] = []
34
116
 
35
- system_prompt = self._build_system_prompt(schema)
117
+ system_prompt = self._prompt_builder.build_system_prompt(schema)
36
118
 
37
119
  for chunk in chunks:
38
120
  nodes, rels = await self._extract_chunk(chunk, system_prompt)
@@ -55,71 +137,13 @@ class LLMExtractionEngine:
55
137
  async def _extract_chunk(
56
138
  self, chunk: DocumentChunk, system_prompt: str
57
139
  ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
58
- response = await self._llm.complete(
140
+ result = await self._llm.complete_json(
59
141
  messages=[{"role": "user", "content": chunk.text}],
142
+ schema=ChunkExtractionResult,
60
143
  system=system_prompt,
61
144
  temperature=0.0,
62
145
  )
63
- return self._parse_response(response)
64
-
65
- def _build_system_prompt(self, schema: OntologySchema) -> str:
66
- node_descriptions = []
67
- for nt in schema.node_types:
68
- props = ", ".join(
69
- f"{p.name} ({p.type}{', required' if p.required else ''})"
70
- for p in nt.properties
71
- )
72
- node_descriptions.append(f"- {nt.label}: properties=[{props}]")
73
-
74
- rel_descriptions = []
75
- for rt in schema.relationship_types:
76
- rel_descriptions.append(
77
- f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
78
- )
79
-
80
- return (
81
- "You are an entity extraction engine. Extract entities and relationships "
82
- "from the provided text according to this schema.\n\n"
83
- "ALLOWED NODE TYPES:\n"
84
- + "\n".join(node_descriptions)
85
- + "\n\nALLOWED RELATIONSHIP TYPES:\n"
86
- + "\n".join(rel_descriptions)
87
- + "\n\nDo not extract entities or relationships not listed above.\n\n"
88
- "Respond with ONLY a JSON object in this exact format:\n"
89
- '{"nodes": [{"id": "<unique_id>", "label": "<NodeType>", "properties": {<key>: <value>}}], '
90
- '"relationships": [{"source_id": "<node_id>", "target_id": "<node_id>", "type": "<RelType>", "properties": {}}]}\n\n'
91
- "Rules:\n"
92
- "- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
93
- "- Only use node types and relationship types listed above\n"
94
- "- Include all required properties for each node type\n"
95
- "- Return empty arrays if no entities are found"
96
- )
97
-
98
- def _parse_response(
99
- self, response: str
100
- ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
101
- data = json.loads(response)
102
-
103
- nodes = [
104
- ExtractedNode(
105
- id=n["id"],
106
- label=n["label"],
107
- properties=n.get("properties", {}),
108
- )
109
- for n in data.get("nodes", [])
110
- ]
111
-
112
- rels = [
113
- ExtractedRelationship(
114
- source_id=r["source_id"],
115
- target_id=r["target_id"],
116
- type=r["type"],
117
- properties=r.get("properties", {}),
118
- )
119
- for r in data.get("relationships", [])
120
- ]
121
-
122
- return nodes, rels
146
+ return result.nodes, result.relationships
123
147
 
124
148
  def _validate(
125
149
  self,
@@ -127,28 +151,4 @@ class LLMExtractionEngine:
127
151
  rels: list[ExtractedRelationship],
128
152
  schema: OntologySchema,
129
153
  ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
130
- allowed_labels = {nt.label for nt in schema.node_types}
131
- allowed_rel_types = {rt.type for rt in schema.relationship_types}
132
- rel_constraints = {
133
- rt.type: (set(rt.source_types), set(rt.target_types))
134
- for rt in schema.relationship_types
135
- }
136
-
137
- valid_nodes = [n for n in nodes if n.label in allowed_labels]
138
- valid_node_ids = {n.id for n in valid_nodes}
139
- node_labels = {n.id: n.label for n in valid_nodes}
140
-
141
- valid_rels = []
142
- for rel in rels:
143
- if rel.type not in allowed_rel_types:
144
- continue
145
- if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
146
- continue
147
- source_types, target_types = rel_constraints[rel.type]
148
- if node_labels[rel.source_id] not in source_types:
149
- continue
150
- if node_labels[rel.target_id] not in target_types:
151
- continue
152
- valid_rels.append(rel)
153
-
154
- return valid_nodes, valid_rels
154
+ return validate_extraction(nodes, rels, schema)
@@ -4,6 +4,8 @@ from __future__ import annotations
4
4
 
5
5
  from typing import Protocol, runtime_checkable
6
6
 
7
+ from pydantic import BaseModel
8
+
7
9
  from graphrag_core.models import (
8
10
  AgentResult,
9
11
  ApplyResult,
@@ -77,6 +79,15 @@ class LLMClient(Protocol):
77
79
  max_tokens: int = 4096,
78
80
  ) -> str: ...
79
81
 
82
+ async def complete_json(
83
+ self,
84
+ messages: list[dict[str, str]],
85
+ schema: type[BaseModel],
86
+ system: str | None = None,
87
+ temperature: float = 0.0,
88
+ max_tokens: int = 4096,
89
+ ) -> BaseModel: ...
90
+
80
91
 
81
92
  # ---------------------------------------------------------------------------
82
93
  # BB2: Schema-Guided Entity Extraction
@@ -94,6 +105,13 @@ class ExtractionEngine(Protocol):
94
105
  ) -> ExtractionResult: ...
95
106
 
96
107
 
108
+ @runtime_checkable
109
+ class ExtractionPromptBuilder(Protocol):
110
+ """Builds the system prompt for LLM-based entity extraction."""
111
+
112
+ def build_system_prompt(self, schema: OntologySchema) -> str: ...
113
+
114
+
97
115
  # ---------------------------------------------------------------------------
98
116
  # BB3: Provenance-Native Knowledge Graph
99
117
  # ---------------------------------------------------------------------------
@@ -0,0 +1,17 @@
1
+ """LLM client implementations."""
2
+
3
+ from graphrag_core.llm.base import BaseLLMClient
4
+
5
+ __all__: list[str] = ["BaseLLMClient"]
6
+
7
+ try:
8
+ from graphrag_core.llm.anthropic import AnthropicLLMClient
9
+ __all__.append("AnthropicLLMClient")
10
+ except ImportError:
11
+ pass
12
+
13
+ try:
14
+ from graphrag_core.llm.openai import OpenAILLMClient
15
+ __all__.append("OpenAILLMClient")
16
+ except ImportError:
17
+ pass
@@ -0,0 +1,65 @@
1
+ """Anthropic Claude LLM client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from anthropic import AsyncAnthropic
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class AnthropicLLMClient:
10
+ """Thin wrapper around the Anthropic SDK implementing the LLMClient Protocol."""
11
+
12
+ def __init__(
13
+ self,
14
+ model: str = "claude-sonnet-4-20250514",
15
+ api_key: str | None = None,
16
+ ) -> None:
17
+ self._model = model
18
+ self._client = AsyncAnthropic(api_key=api_key)
19
+
20
+ async def complete(
21
+ self,
22
+ messages: list[dict[str, str]],
23
+ system: str | None = None,
24
+ temperature: float = 0.0,
25
+ max_tokens: int = 4096,
26
+ ) -> str:
27
+ kwargs: dict = {
28
+ "model": self._model,
29
+ "messages": messages,
30
+ "temperature": temperature,
31
+ "max_tokens": max_tokens,
32
+ }
33
+ if system is not None:
34
+ kwargs["system"] = system
35
+ response = await self._client.messages.create(**kwargs)
36
+ return response.content[0].text
37
+
38
+ async def complete_json(
39
+ self,
40
+ messages: list[dict[str, str]],
41
+ schema: type[BaseModel],
42
+ system: str | None = None,
43
+ temperature: float = 0.0,
44
+ max_tokens: int = 4096,
45
+ ) -> BaseModel:
46
+ json_schema = schema.model_json_schema()
47
+ kwargs: dict = {
48
+ "model": self._model,
49
+ "messages": messages,
50
+ "temperature": temperature,
51
+ "max_tokens": max_tokens,
52
+ "tools": [
53
+ {
54
+ "name": "extract",
55
+ "description": "Extract structured data",
56
+ "input_schema": json_schema,
57
+ },
58
+ ],
59
+ "tool_choice": {"type": "tool", "name": "extract"},
60
+ }
61
+ if system is not None:
62
+ kwargs["system"] = system
63
+ response = await self._client.messages.create(**kwargs)
64
+ tool_block = next(b for b in response.content if b.type == "tool_use")
65
+ return schema.model_validate(tool_block.input)
@@ -0,0 +1,66 @@
1
+ """Base LLM client with fallback complete_json() via prompt + parse + retry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from pydantic import BaseModel, ValidationError
8
+
9
+
10
+ class BaseLLMClient:
11
+ """Default complete_json() via prompt + parse + retry.
12
+
13
+ Providers with native structured output (OpenAI, Anthropic) override
14
+ complete_json() directly. This base class provides a working fallback
15
+ for providers without native support (e.g., local model clients).
16
+ """
17
+
18
+ async def complete(
19
+ self,
20
+ messages: list[dict[str, str]],
21
+ system: str | None = None,
22
+ temperature: float = 0.0,
23
+ max_tokens: int = 4096,
24
+ ) -> str:
25
+ raise NotImplementedError
26
+
27
+ async def complete_json(
28
+ self,
29
+ messages: list[dict[str, str]],
30
+ schema: type[BaseModel],
31
+ system: str | None = None,
32
+ temperature: float = 0.0,
33
+ max_tokens: int = 4096,
34
+ ) -> BaseModel:
35
+ schema_text = json.dumps(schema.model_json_schema(), indent=2)
36
+ augmented_system = (system or "") + (
37
+ f"\n\nRespond with ONLY a JSON object matching this schema:\n{schema_text}\n"
38
+ "No markdown fences. No explanation. Just the JSON object."
39
+ )
40
+
41
+ for attempt in range(2):
42
+ response = await self.complete(
43
+ messages, system=augmented_system, temperature=temperature, max_tokens=max_tokens,
44
+ )
45
+ text = self._strip_json(response)
46
+ try:
47
+ return schema.model_validate_json(text)
48
+ except (json.JSONDecodeError, ValidationError) as exc:
49
+ if attempt == 0:
50
+ augmented_system += (
51
+ f"\n\nYour previous response failed validation: {exc}\n"
52
+ "Try again. Return ONLY valid JSON."
53
+ )
54
+ else:
55
+ raise
56
+ raise RuntimeError("unreachable")
57
+
58
+ @staticmethod
59
+ def _strip_json(text: str) -> str:
60
+ text = text.strip()
61
+ if text.startswith("```"):
62
+ nl = text.find("\n")
63
+ text = text[nl + 1 :] if nl != -1 else ""
64
+ if text.endswith("```"):
65
+ text = text[:-3]
66
+ return text.strip()
@@ -0,0 +1,64 @@
1
+ """OpenAI LLM client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from openai import AsyncOpenAI
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class OpenAILLMClient:
10
+ """Thin wrapper around the OpenAI SDK implementing the LLMClient Protocol."""
11
+
12
+ def __init__(
13
+ self,
14
+ model: str = "gpt-4o",
15
+ api_key: str | None = None,
16
+ ) -> None:
17
+ self._model = model
18
+ self._client = AsyncOpenAI(api_key=api_key)
19
+
20
+ async def complete(
21
+ self,
22
+ messages: list[dict[str, str]],
23
+ system: str | None = None,
24
+ temperature: float = 0.0,
25
+ max_tokens: int = 4096,
26
+ ) -> str:
27
+ full_messages = list(messages)
28
+ if system is not None:
29
+ full_messages.insert(0, {"role": "system", "content": system})
30
+ response = await self._client.chat.completions.create(
31
+ model=self._model,
32
+ messages=full_messages,
33
+ temperature=temperature,
34
+ max_tokens=max_tokens,
35
+ )
36
+ return response.choices[0].message.content
37
+
38
+ async def complete_json(
39
+ self,
40
+ messages: list[dict[str, str]],
41
+ schema: type[BaseModel],
42
+ system: str | None = None,
43
+ temperature: float = 0.0,
44
+ max_tokens: int = 4096,
45
+ ) -> BaseModel:
46
+ full_messages = list(messages)
47
+ if system is not None:
48
+ full_messages.insert(0, {"role": "system", "content": system})
49
+ json_schema = schema.model_json_schema()
50
+ response = await self._client.chat.completions.create(
51
+ model=self._model,
52
+ messages=full_messages,
53
+ temperature=temperature,
54
+ max_tokens=max_tokens,
55
+ response_format={
56
+ "type": "json_schema",
57
+ "json_schema": {
58
+ "name": schema.__name__,
59
+ "schema": json_schema,
60
+ "strict": True,
61
+ },
62
+ },
63
+ )
64
+ return schema.model_validate_json(response.choices[0].message.content)
@@ -68,12 +68,14 @@ class NodeTypeDefinition(BaseModel):
68
68
  label: str
69
69
  properties: list[PropertyDefinition]
70
70
  required_properties: list[str] = []
71
+ description: str | None = None
71
72
 
72
73
 
73
74
  class RelationshipTypeDefinition(BaseModel):
74
75
  type: str
75
76
  source_types: list[str]
76
77
  target_types: list[str]
78
+ description: str | None = None
77
79
 
78
80
 
79
81
  class OntologySchema(BaseModel):
@@ -106,6 +108,12 @@ class ExtractionResult(BaseModel):
106
108
  provenance: list[ProvenanceLink]
107
109
 
108
110
 
111
+ class ChunkExtractionResult(BaseModel):
112
+ """LLM extraction output for a single chunk (no provenance — engine adds that)."""
113
+ nodes: list[ExtractedNode]
114
+ relationships: list[ExtractedRelationship]
115
+
116
+
109
117
  # ---------------------------------------------------------------------------
110
118
  # BB3: Provenance-Native Knowledge Graph
111
119
  # ---------------------------------------------------------------------------