graphrag-core 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/PKG-INFO +4 -1
  2. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/pyproject.toml +3 -2
  3. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/__init__.py +11 -0
  4. graphrag_core-0.3.0/src/graphrag_core/extraction/__init__.py +5 -0
  5. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/extraction/engine.py +53 -62
  6. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/interfaces.py +11 -0
  7. graphrag_core-0.3.0/src/graphrag_core/llm/__init__.py +17 -0
  8. graphrag_core-0.3.0/src/graphrag_core/llm/anthropic.py +65 -0
  9. graphrag_core-0.3.0/src/graphrag_core/llm/base.py +66 -0
  10. graphrag_core-0.3.0/src/graphrag_core/llm/openai.py +64 -0
  11. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/models.py +8 -0
  12. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_extraction/test_engine.py +223 -8
  13. graphrag_core-0.3.0/tests/test_extraction_engine.py +128 -0
  14. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_interfaces.py +12 -0
  15. graphrag_core-0.3.0/tests/test_llm_anthropic.py +77 -0
  16. graphrag_core-0.3.0/tests/test_llm_base.py +75 -0
  17. graphrag_core-0.3.0/tests/test_llm_openai.py +125 -0
  18. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_models.py +23 -0
  19. graphrag_core-0.2.0/src/graphrag_core/extraction/__init__.py +0 -5
  20. graphrag_core-0.2.0/src/graphrag_core/llm/__init__.py +0 -9
  21. graphrag_core-0.2.0/src/graphrag_core/llm/anthropic.py +0 -35
  22. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.github/workflows/release.yml +0 -0
  23. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.github/workflows/test.yml +0 -0
  24. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/.gitignore +0 -0
  25. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/CHANGELOG.md +0 -0
  26. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/CLAUDE.md +0 -0
  27. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/LICENSE +0 -0
  28. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/README.md +0 -0
  29. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/graphrag_core_interface_spec.md +0 -0
  30. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-11-bb1-document-ingestion.md +0 -0
  31. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb2-bb3-extraction-and-graph.md +0 -0
  32. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb4-hybrid-search.md +0 -0
  33. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb5-bb6-curation-and-registry.md +0 -0
  34. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-bb7-bb8-tools-and-agents.md +0 -0
  35. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/plans/2026-04-12-release-readiness.md +0 -0
  36. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-11-bb1-document-ingestion-design.md +0 -0
  37. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb2-bb3-extraction-and-graph-design.md +0 -0
  38. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb4-hybrid-search-design.md +0 -0
  39. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb5-bb6-curation-and-registry-design.md +0 -0
  40. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-bb7-bb8-tools-and-agents-design.md +0 -0
  41. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/docs/superpowers/specs/2026-04-12-release-readiness-design.md +0 -0
  42. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/_cypher.py +0 -0
  43. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/__init__.py +0 -0
  44. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/context.py +0 -0
  45. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/agents/orchestrator.py +0 -0
  46. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/__init__.py +0 -0
  47. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/detection.py +0 -0
  48. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/curation/pipeline.py +0 -0
  49. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/__init__.py +0 -0
  50. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/memory.py +0 -0
  51. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/graph/neo4j.py +0 -0
  52. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/__init__.py +0 -0
  53. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/chunker.py +0 -0
  54. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/parsers.py +0 -0
  55. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/ingestion/pipeline.py +0 -0
  56. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/py.typed +0 -0
  57. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/__init__.py +0 -0
  58. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/matching.py +0 -0
  59. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/registry/memory.py +0 -0
  60. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/__init__.py +0 -0
  61. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/fusion.py +0 -0
  62. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/memory.py +0 -0
  63. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/search/neo4j.py +0 -0
  64. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/__init__.py +0 -0
  65. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/core_tools.py +0 -0
  66. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/src/graphrag_core/tools/library.py +0 -0
  67. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/__init__.py +0 -0
  68. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/conftest.py +0 -0
  69. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/__init__.py +0 -0
  70. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/test_context.py +0 -0
  71. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_agents/test_orchestrator.py +0 -0
  72. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/__init__.py +0 -0
  73. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/test_detection.py +0 -0
  74. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_curation/test_pipeline.py +0 -0
  75. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_extraction/__init__.py +0 -0
  76. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/__init__.py +0 -0
  77. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/test_memory.py +0 -0
  78. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_graph/test_neo4j.py +0 -0
  79. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/__init__.py +0 -0
  80. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_chunker.py +0 -0
  81. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_parsers.py +0 -0
  82. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_ingestion/test_pipeline.py +0 -0
  83. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_integration/__init__.py +0 -0
  84. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_integration/test_ingest_to_graph.py +0 -0
  85. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/__init__.py +0 -0
  86. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/test_matching.py +0 -0
  87. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_registry/test_memory_registry.py +0 -0
  88. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/__init__.py +0 -0
  89. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_fusion.py +0 -0
  90. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_memory.py +0 -0
  91. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_search/test_neo4j_search.py +0 -0
  92. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/__init__.py +0 -0
  93. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/test_core_tools.py +0 -0
  94. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/tests/test_tools/test_library.py +0 -0
  95. {graphrag_core-0.2.0 → graphrag_core-0.3.0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: graphrag-core
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs
5
5
  Project-URL: Homepage, https://github.com/cdel1/graphrag-core
6
6
  Project-URL: Repository, https://github.com/cdel1/graphrag-core
@@ -22,10 +22,13 @@ Requires-Dist: python-docx>=1.0
22
22
  Provides-Extra: all
23
23
  Requires-Dist: anthropic>=0.40; extra == 'all'
24
24
  Requires-Dist: neo4j>=5.0; extra == 'all'
25
+ Requires-Dist: openai>=1.0; extra == 'all'
25
26
  Provides-Extra: anthropic
26
27
  Requires-Dist: anthropic>=0.40; extra == 'anthropic'
27
28
  Provides-Extra: neo4j
28
29
  Requires-Dist: neo4j>=5.0; extra == 'neo4j'
30
+ Provides-Extra: openai
31
+ Requires-Dist: openai>=1.0; extra == 'openai'
29
32
  Description-Content-Type: text/markdown
30
33
 
31
34
  # graphrag-core
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "graphrag-core"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  description = "Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs"
5
5
  license = "MIT"
6
6
  requires-python = ">=3.12"
@@ -36,7 +36,8 @@ markers = [
36
36
  [project.optional-dependencies]
37
37
  anthropic = ["anthropic>=0.40"]
38
38
  neo4j = ["neo4j>=5.0"]
39
- all = ["graphrag-core[anthropic,neo4j]"]
39
+ openai = ["openai>=1.0"]
40
+ all = ["graphrag-core[anthropic,neo4j,openai]"]
40
41
 
41
42
  [project.urls]
42
43
  Homepage = "https://github.com/cdel1/graphrag-core"
@@ -33,8 +33,10 @@ from graphrag_core.registry import InMemoryEntityRegistry
33
33
  from graphrag_core.curation import DeterministicDetectionLayer, CurationPipeline
34
34
  from graphrag_core.tools import Tool, ToolLibrary, register_core_tools
35
35
  from graphrag_core.agents import AgentContext, SequentialOrchestrator
36
+ from graphrag_core.llm import BaseLLMClient
36
37
  from graphrag_core.models import (
37
38
  AgentResult,
39
+ ChunkExtractionResult,
38
40
  CurationIssue,
39
41
  CurationReport,
40
42
  DocumentChunk,
@@ -78,6 +80,8 @@ __all__ = [
78
80
  "PdfParser",
79
81
  "TextParser",
80
82
  "TokenChunker",
83
+ # LLM base
84
+ "BaseLLMClient",
81
85
  # BB2 implementations
82
86
  "LLMExtractionEngine",
83
87
  # BB3 implementations
@@ -98,6 +102,7 @@ __all__ = [
98
102
  "SequentialOrchestrator",
99
103
  # Models
100
104
  "AgentResult",
105
+ "ChunkExtractionResult",
101
106
  "CurationIssue",
102
107
  "CurationReport",
103
108
  "DocumentChunk",
@@ -136,3 +141,9 @@ try:
136
141
  __all__.append("AnthropicLLMClient")
137
142
  except ImportError:
138
143
  pass
144
+
145
+ try:
146
+ from graphrag_core.llm import OpenAILLMClient
147
+ __all__.append("OpenAILLMClient")
148
+ except ImportError:
149
+ pass
@@ -0,0 +1,5 @@
1
+ """BB2: Schema-guided entity extraction."""
2
+
3
+ from graphrag_core.extraction.engine import LLMExtractionEngine, validate_extraction
4
+
5
+ __all__ = ["LLMExtractionEngine", "validate_extraction"]
@@ -2,10 +2,9 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- import json
6
-
7
5
  from graphrag_core.interfaces import LLMClient
8
6
  from graphrag_core.models import (
7
+ ChunkExtractionResult,
9
8
  DocumentChunk,
10
9
  ExtractedNode,
11
10
  ExtractedRelationship,
@@ -16,6 +15,46 @@ from graphrag_core.models import (
16
15
  )
17
16
 
18
17
 
18
+ def validate_extraction(
19
+ nodes: list[ExtractedNode],
20
+ rels: list[ExtractedRelationship],
21
+ schema: OntologySchema,
22
+ ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
23
+ """Filter extracted nodes and relationships to match schema constraints.
24
+
25
+ Removes:
26
+ - Nodes with labels not in the schema
27
+ - Relationships with types not in the schema
28
+ - Relationships referencing non-existent node IDs
29
+ - Relationships violating source/target type constraints
30
+ """
31
+ allowed_labels = {nt.label for nt in schema.node_types}
32
+ allowed_rel_types = {rt.type for rt in schema.relationship_types}
33
+ rel_constraints = {
34
+ rt.type: (set(rt.source_types), set(rt.target_types))
35
+ for rt in schema.relationship_types
36
+ }
37
+
38
+ valid_nodes = [n for n in nodes if n.label in allowed_labels]
39
+ valid_node_ids = {n.id for n in valid_nodes}
40
+ node_labels = {n.id: n.label for n in valid_nodes}
41
+
42
+ valid_rels = []
43
+ for rel in rels:
44
+ if rel.type not in allowed_rel_types:
45
+ continue
46
+ if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
47
+ continue
48
+ source_types, target_types = rel_constraints[rel.type]
49
+ if node_labels[rel.source_id] not in source_types:
50
+ continue
51
+ if node_labels[rel.target_id] not in target_types:
52
+ continue
53
+ valid_rels.append(rel)
54
+
55
+ return valid_nodes, valid_rels
56
+
57
+
19
58
  class LLMExtractionEngine:
20
59
  """Extracts entities and relationships from text using an LLM, guided by an ontology schema."""
21
60
 
@@ -55,12 +94,13 @@ class LLMExtractionEngine:
55
94
  async def _extract_chunk(
56
95
  self, chunk: DocumentChunk, system_prompt: str
57
96
  ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
58
- response = await self._llm.complete(
97
+ result = await self._llm.complete_json(
59
98
  messages=[{"role": "user", "content": chunk.text}],
99
+ schema=ChunkExtractionResult,
60
100
  system=system_prompt,
61
101
  temperature=0.0,
62
102
  )
63
- return self._parse_response(response)
103
+ return result.nodes, result.relationships
64
104
 
65
105
  def _build_system_prompt(self, schema: OntologySchema) -> str:
66
106
  node_descriptions = []
@@ -69,13 +109,17 @@ class LLMExtractionEngine:
69
109
  f"{p.name} ({p.type}{', required' if p.required else ''})"
70
110
  for p in nt.properties
71
111
  )
72
- node_descriptions.append(f"- {nt.label}: properties=[{props}]")
112
+ line = f"- {nt.label}: properties=[{props}]"
113
+ if nt.description:
114
+ line += f" \u2014 {nt.description}"
115
+ node_descriptions.append(line)
73
116
 
74
117
  rel_descriptions = []
75
118
  for rt in schema.relationship_types:
76
- rel_descriptions.append(
77
- f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
78
- )
119
+ line = f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
120
+ if rt.description:
121
+ line += f" \u2014 {rt.description}"
122
+ rel_descriptions.append(line)
79
123
 
80
124
  return (
81
125
  "You are an entity extraction engine. Extract entities and relationships "
@@ -85,9 +129,6 @@ class LLMExtractionEngine:
85
129
  + "\n\nALLOWED RELATIONSHIP TYPES:\n"
86
130
  + "\n".join(rel_descriptions)
87
131
  + "\n\nDo not extract entities or relationships not listed above.\n\n"
88
- "Respond with ONLY a JSON object in this exact format:\n"
89
- '{"nodes": [{"id": "<unique_id>", "label": "<NodeType>", "properties": {<key>: <value>}}], '
90
- '"relationships": [{"source_id": "<node_id>", "target_id": "<node_id>", "type": "<RelType>", "properties": {}}]}\n\n'
91
132
  "Rules:\n"
92
133
  "- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
93
134
  "- Only use node types and relationship types listed above\n"
@@ -95,60 +136,10 @@ class LLMExtractionEngine:
95
136
  "- Return empty arrays if no entities are found"
96
137
  )
97
138
 
98
- def _parse_response(
99
- self, response: str
100
- ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
101
- data = json.loads(response)
102
-
103
- nodes = [
104
- ExtractedNode(
105
- id=n["id"],
106
- label=n["label"],
107
- properties=n.get("properties", {}),
108
- )
109
- for n in data.get("nodes", [])
110
- ]
111
-
112
- rels = [
113
- ExtractedRelationship(
114
- source_id=r["source_id"],
115
- target_id=r["target_id"],
116
- type=r["type"],
117
- properties=r.get("properties", {}),
118
- )
119
- for r in data.get("relationships", [])
120
- ]
121
-
122
- return nodes, rels
123
-
124
139
  def _validate(
125
140
  self,
126
141
  nodes: list[ExtractedNode],
127
142
  rels: list[ExtractedRelationship],
128
143
  schema: OntologySchema,
129
144
  ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
130
- allowed_labels = {nt.label for nt in schema.node_types}
131
- allowed_rel_types = {rt.type for rt in schema.relationship_types}
132
- rel_constraints = {
133
- rt.type: (set(rt.source_types), set(rt.target_types))
134
- for rt in schema.relationship_types
135
- }
136
-
137
- valid_nodes = [n for n in nodes if n.label in allowed_labels]
138
- valid_node_ids = {n.id for n in valid_nodes}
139
- node_labels = {n.id: n.label for n in valid_nodes}
140
-
141
- valid_rels = []
142
- for rel in rels:
143
- if rel.type not in allowed_rel_types:
144
- continue
145
- if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
146
- continue
147
- source_types, target_types = rel_constraints[rel.type]
148
- if node_labels[rel.source_id] not in source_types:
149
- continue
150
- if node_labels[rel.target_id] not in target_types:
151
- continue
152
- valid_rels.append(rel)
153
-
154
- return valid_nodes, valid_rels
145
+ return validate_extraction(nodes, rels, schema)
@@ -4,6 +4,8 @@ from __future__ import annotations
4
4
 
5
5
  from typing import Protocol, runtime_checkable
6
6
 
7
+ from pydantic import BaseModel
8
+
7
9
  from graphrag_core.models import (
8
10
  AgentResult,
9
11
  ApplyResult,
@@ -77,6 +79,15 @@ class LLMClient(Protocol):
77
79
  max_tokens: int = 4096,
78
80
  ) -> str: ...
79
81
 
82
+ async def complete_json(
83
+ self,
84
+ messages: list[dict[str, str]],
85
+ schema: type[BaseModel],
86
+ system: str | None = None,
87
+ temperature: float = 0.0,
88
+ max_tokens: int = 4096,
89
+ ) -> BaseModel: ...
90
+
80
91
 
81
92
  # ---------------------------------------------------------------------------
82
93
  # BB2: Schema-Guided Entity Extraction
@@ -0,0 +1,17 @@
1
+ """LLM client implementations."""
2
+
3
+ from graphrag_core.llm.base import BaseLLMClient
4
+
5
+ __all__: list[str] = ["BaseLLMClient"]
6
+
7
+ try:
8
+ from graphrag_core.llm.anthropic import AnthropicLLMClient
9
+ __all__.append("AnthropicLLMClient")
10
+ except ImportError:
11
+ pass
12
+
13
+ try:
14
+ from graphrag_core.llm.openai import OpenAILLMClient
15
+ __all__.append("OpenAILLMClient")
16
+ except ImportError:
17
+ pass
@@ -0,0 +1,65 @@
1
+ """Anthropic Claude LLM client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from anthropic import AsyncAnthropic
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class AnthropicLLMClient:
10
+ """Thin wrapper around the Anthropic SDK implementing the LLMClient Protocol."""
11
+
12
+ def __init__(
13
+ self,
14
+ model: str = "claude-sonnet-4-20250514",
15
+ api_key: str | None = None,
16
+ ) -> None:
17
+ self._model = model
18
+ self._client = AsyncAnthropic(api_key=api_key)
19
+
20
+ async def complete(
21
+ self,
22
+ messages: list[dict[str, str]],
23
+ system: str | None = None,
24
+ temperature: float = 0.0,
25
+ max_tokens: int = 4096,
26
+ ) -> str:
27
+ kwargs: dict = {
28
+ "model": self._model,
29
+ "messages": messages,
30
+ "temperature": temperature,
31
+ "max_tokens": max_tokens,
32
+ }
33
+ if system is not None:
34
+ kwargs["system"] = system
35
+ response = await self._client.messages.create(**kwargs)
36
+ return response.content[0].text
37
+
38
+ async def complete_json(
39
+ self,
40
+ messages: list[dict[str, str]],
41
+ schema: type[BaseModel],
42
+ system: str | None = None,
43
+ temperature: float = 0.0,
44
+ max_tokens: int = 4096,
45
+ ) -> BaseModel:
46
+ json_schema = schema.model_json_schema()
47
+ kwargs: dict = {
48
+ "model": self._model,
49
+ "messages": messages,
50
+ "temperature": temperature,
51
+ "max_tokens": max_tokens,
52
+ "tools": [
53
+ {
54
+ "name": "extract",
55
+ "description": "Extract structured data",
56
+ "input_schema": json_schema,
57
+ },
58
+ ],
59
+ "tool_choice": {"type": "tool", "name": "extract"},
60
+ }
61
+ if system is not None:
62
+ kwargs["system"] = system
63
+ response = await self._client.messages.create(**kwargs)
64
+ tool_block = next(b for b in response.content if b.type == "tool_use")
65
+ return schema.model_validate(tool_block.input)
@@ -0,0 +1,66 @@
1
+ """Base LLM client with fallback complete_json() via prompt + parse + retry."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from pydantic import BaseModel, ValidationError
8
+
9
+
10
+ class BaseLLMClient:
11
+ """Default complete_json() via prompt + parse + retry.
12
+
13
+ Providers with native structured output (OpenAI, Anthropic) override
14
+ complete_json() directly. This base class provides a working fallback
15
+ for providers without native support (e.g., local model clients).
16
+ """
17
+
18
+ async def complete(
19
+ self,
20
+ messages: list[dict[str, str]],
21
+ system: str | None = None,
22
+ temperature: float = 0.0,
23
+ max_tokens: int = 4096,
24
+ ) -> str:
25
+ raise NotImplementedError
26
+
27
+ async def complete_json(
28
+ self,
29
+ messages: list[dict[str, str]],
30
+ schema: type[BaseModel],
31
+ system: str | None = None,
32
+ temperature: float = 0.0,
33
+ max_tokens: int = 4096,
34
+ ) -> BaseModel:
35
+ schema_text = json.dumps(schema.model_json_schema(), indent=2)
36
+ augmented_system = (system or "") + (
37
+ f"\n\nRespond with ONLY a JSON object matching this schema:\n{schema_text}\n"
38
+ "No markdown fences. No explanation. Just the JSON object."
39
+ )
40
+
41
+ for attempt in range(2):
42
+ response = await self.complete(
43
+ messages, system=augmented_system, temperature=temperature, max_tokens=max_tokens,
44
+ )
45
+ text = self._strip_json(response)
46
+ try:
47
+ return schema.model_validate_json(text)
48
+ except (json.JSONDecodeError, ValidationError) as exc:
49
+ if attempt == 0:
50
+ augmented_system += (
51
+ f"\n\nYour previous response failed validation: {exc}\n"
52
+ "Try again. Return ONLY valid JSON."
53
+ )
54
+ else:
55
+ raise
56
+ raise RuntimeError("unreachable")
57
+
58
+ @staticmethod
59
+ def _strip_json(text: str) -> str:
60
+ text = text.strip()
61
+ if text.startswith("```"):
62
+ nl = text.find("\n")
63
+ text = text[nl + 1 :] if nl != -1 else ""
64
+ if text.endswith("```"):
65
+ text = text[:-3]
66
+ return text.strip()
@@ -0,0 +1,64 @@
1
+ """OpenAI LLM client."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from openai import AsyncOpenAI
6
+ from pydantic import BaseModel
7
+
8
+
9
+ class OpenAILLMClient:
10
+ """Thin wrapper around the OpenAI SDK implementing the LLMClient Protocol."""
11
+
12
+ def __init__(
13
+ self,
14
+ model: str = "gpt-4o",
15
+ api_key: str | None = None,
16
+ ) -> None:
17
+ self._model = model
18
+ self._client = AsyncOpenAI(api_key=api_key)
19
+
20
+ async def complete(
21
+ self,
22
+ messages: list[dict[str, str]],
23
+ system: str | None = None,
24
+ temperature: float = 0.0,
25
+ max_tokens: int = 4096,
26
+ ) -> str:
27
+ full_messages = list(messages)
28
+ if system is not None:
29
+ full_messages.insert(0, {"role": "system", "content": system})
30
+ response = await self._client.chat.completions.create(
31
+ model=self._model,
32
+ messages=full_messages,
33
+ temperature=temperature,
34
+ max_tokens=max_tokens,
35
+ )
36
+ return response.choices[0].message.content
37
+
38
+ async def complete_json(
39
+ self,
40
+ messages: list[dict[str, str]],
41
+ schema: type[BaseModel],
42
+ system: str | None = None,
43
+ temperature: float = 0.0,
44
+ max_tokens: int = 4096,
45
+ ) -> BaseModel:
46
+ full_messages = list(messages)
47
+ if system is not None:
48
+ full_messages.insert(0, {"role": "system", "content": system})
49
+ json_schema = schema.model_json_schema()
50
+ response = await self._client.chat.completions.create(
51
+ model=self._model,
52
+ messages=full_messages,
53
+ temperature=temperature,
54
+ max_tokens=max_tokens,
55
+ response_format={
56
+ "type": "json_schema",
57
+ "json_schema": {
58
+ "name": schema.__name__,
59
+ "schema": json_schema,
60
+ "strict": True,
61
+ },
62
+ },
63
+ )
64
+ return schema.model_validate_json(response.choices[0].message.content)
@@ -68,12 +68,14 @@ class NodeTypeDefinition(BaseModel):
68
68
  label: str
69
69
  properties: list[PropertyDefinition]
70
70
  required_properties: list[str] = []
71
+ description: str | None = None
71
72
 
72
73
 
73
74
  class RelationshipTypeDefinition(BaseModel):
74
75
  type: str
75
76
  source_types: list[str]
76
77
  target_types: list[str]
78
+ description: str | None = None
77
79
 
78
80
 
79
81
  class OntologySchema(BaseModel):
@@ -106,6 +108,12 @@ class ExtractionResult(BaseModel):
106
108
  provenance: list[ProvenanceLink]
107
109
 
108
110
 
111
+ class ChunkExtractionResult(BaseModel):
112
+ """LLM extraction output for a single chunk (no provenance — engine adds that)."""
113
+ nodes: list[ExtractedNode]
114
+ relationships: list[ExtractedRelationship]
115
+
116
+
109
117
  # ---------------------------------------------------------------------------
110
118
  # BB3: Provenance-Native Knowledge Graph
111
119
  # ---------------------------------------------------------------------------