PyPI - graphrag-core - Versions diffs - 0.2.0__tar.gz → 0.4.0__tar.gz - Mend

graphrag-core 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: graphrag-core
-Version: 0.2.0
+Version: 0.4.0
 Summary: Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs
 Project-URL: Homepage, https://github.com/cdel1/graphrag-core
 Project-URL: Repository, https://github.com/cdel1/graphrag-core
@@ -22,10 +22,13 @@ Requires-Dist: python-docx>=1.0
 Provides-Extra: all
 Requires-Dist: anthropic>=0.40; extra == 'all'
 Requires-Dist: neo4j>=5.0; extra == 'all'
+Requires-Dist: openai>=1.0; extra == 'all'
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.40; extra == 'anthropic'
 Provides-Extra: neo4j
 Requires-Dist: neo4j>=5.0; extra == 'neo4j'
+Provides-Extra: openai
+Requires-Dist: openai>=1.0; extra == 'openai'
 Description-Content-Type: text/markdown
 # graphrag-core

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "graphrag-core"
-version = "0.2.0"
+version = "0.4.0"
 description = "Domain-agnostic Graph RAG framework for building governed, auditable Knowledge Graphs"
 license = "MIT"
 requires-python = ">=3.12"
@@ -36,7 +36,8 @@ markers = [
 [project.optional-dependencies]
 anthropic = ["anthropic>=0.40"]
 neo4j = ["neo4j>=5.0"]
-all = ["graphrag-core[anthropic,neo4j]"]
+openai = ["openai>=1.0"]
+all = ["graphrag-core[anthropic,neo4j,openai]"]
 [project.urls]
 Homepage = "https://github.com/cdel1/graphrag-core"

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/__init__.py RENAMED Viewed

@@ -11,6 +11,7 @@ from graphrag_core.interfaces import (
     EmbeddingModel,
     EntityRegistry,
     ExtractionEngine,
+    ExtractionPromptBuilder,
     GraphStore,
     IngestionPipeline,
     LLMClient,
@@ -26,15 +27,17 @@ from graphrag_core.ingestion import (
     TextParser,
     TokenChunker,
 )
-from graphrag_core.extraction import LLMExtractionEngine
+from graphrag_core.extraction import DefaultPromptBuilder, LLMExtractionEngine
 from graphrag_core.graph import InMemoryGraphStore
 from graphrag_core.search import InMemorySearchEngine
 from graphrag_core.registry import InMemoryEntityRegistry
 from graphrag_core.curation import DeterministicDetectionLayer, CurationPipeline
 from graphrag_core.tools import Tool, ToolLibrary, register_core_tools
 from graphrag_core.agents import AgentContext, SequentialOrchestrator
+from graphrag_core.llm import BaseLLMClient
 from graphrag_core.models import (
     AgentResult,
+    ChunkExtractionResult,
     CurationIssue,
     CurationReport,
     DocumentChunk,
@@ -65,6 +68,7 @@ __all__ = [
     "EmbeddingModel",
     "EntityRegistry",
     "ExtractionEngine",
+    "ExtractionPromptBuilder",
     "GraphStore",
     "IngestionPipeline",
     "LLMClient",
@@ -78,7 +82,10 @@ __all__ = [
     "PdfParser",
     "TextParser",
     "TokenChunker",
+    # LLM base
+    "BaseLLMClient",
     # BB2 implementations
+    "DefaultPromptBuilder",
     "LLMExtractionEngine",
     # BB3 implementations
     "InMemoryGraphStore",
@@ -98,6 +105,7 @@ __all__ = [
     "SequentialOrchestrator",
     # Models
     "AgentResult",
+    "ChunkExtractionResult",
     "CurationIssue",
     "CurationReport",
     "DocumentChunk",
@@ -136,3 +144,9 @@ try:
     __all__.append("AnthropicLLMClient")
 except ImportError:
     pass
+try:
+    from graphrag_core.llm import OpenAILLMClient
+    __all__.append("OpenAILLMClient")
+except ImportError:
+    pass

graphrag_core-0.4.0/src/graphrag_core/extraction/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""BB2: Schema-guided entity extraction."""
+from graphrag_core.extraction.engine import DefaultPromptBuilder, LLMExtractionEngine, validate_extraction
+__all__ = ["DefaultPromptBuilder", "LLMExtractionEngine", "validate_extraction"]

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/extraction/engine.py RENAMED Viewed

@@ -2,10 +2,9 @@
 from __future__ import annotations
-import json
-from graphrag_core.interfaces import LLMClient
+from graphrag_core.interfaces import ExtractionPromptBuilder, LLMClient
 from graphrag_core.models import (
+    ChunkExtractionResult,
     DocumentChunk,
     ExtractedNode,
     ExtractedRelationship,
@@ -16,11 +15,94 @@ from graphrag_core.models import (
 )
+def validate_extraction(
+    nodes: list[ExtractedNode],
+    rels: list[ExtractedRelationship],
+    schema: OntologySchema,
+) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
+    """Filter extracted nodes and relationships to match schema constraints.
+    Removes:
+    - Nodes with labels not in the schema
+    - Relationships with types not in the schema
+    - Relationships referencing non-existent node IDs
+    - Relationships violating source/target type constraints
+    """
+    allowed_labels = {nt.label for nt in schema.node_types}
+    allowed_rel_types = {rt.type for rt in schema.relationship_types}
+    rel_constraints = {
+        rt.type: (set(rt.source_types), set(rt.target_types))
+        for rt in schema.relationship_types
+    }
+    valid_nodes = [n for n in nodes if n.label in allowed_labels]
+    valid_node_ids = {n.id for n in valid_nodes}
+    node_labels = {n.id: n.label for n in valid_nodes}
+    valid_rels = []
+    for rel in rels:
+        if rel.type not in allowed_rel_types:
+            continue
+        if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
+            continue
+        source_types, target_types = rel_constraints[rel.type]
+        if node_labels[rel.source_id] not in source_types:
+            continue
+        if node_labels[rel.target_id] not in target_types:
+            continue
+        valid_rels.append(rel)
+    return valid_nodes, valid_rels
+class DefaultPromptBuilder:
+    """Builds the default system prompt for LLM-based entity extraction."""
+    def build_system_prompt(self, schema: OntologySchema) -> str:
+        node_descriptions = []
+        for nt in schema.node_types:
+            props = ", ".join(
+                f"{p.name} ({p.type}{', required' if p.required else ''})"
+                for p in nt.properties
+            )
+            line = f"- {nt.label}: properties=[{props}]"
+            if nt.description:
+                line += f" \u2014 {nt.description}"
+            node_descriptions.append(line)
+        rel_descriptions = []
+        for rt in schema.relationship_types:
+            line = f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
+            if rt.description:
+                line += f" \u2014 {rt.description}"
+            rel_descriptions.append(line)
+        return (
+            "You are an entity extraction engine. Extract entities and relationships "
+            "from the provided text according to this schema.\n\n"
+            "ALLOWED NODE TYPES:\n"
+            + "\n".join(node_descriptions)
+            + "\n\nALLOWED RELATIONSHIP TYPES:\n"
+            + "\n".join(rel_descriptions)
+            + "\n\nDo not extract entities or relationships not listed above.\n\n"
+            "Rules:\n"
+            "- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
+            "- Only use node types and relationship types listed above\n"
+            "- Include all required properties for each node type\n"
+            "- Return empty arrays if no entities are found"
+        )
 class LLMExtractionEngine:
     """Extracts entities and relationships from text using an LLM, guided by an ontology schema."""
-    def __init__(self, llm_client: LLMClient) -> None:
+    def __init__(
+        self,
+        llm_client: LLMClient,
+        prompt_builder: ExtractionPromptBuilder | None = None,
+    ) -> None:
         self._llm = llm_client
+        self._prompt_builder = prompt_builder or DefaultPromptBuilder()
     async def extract(
         self,
@@ -32,7 +114,7 @@ class LLMExtractionEngine:
         all_rels: list[ExtractedRelationship] = []
         all_provenance: list[ProvenanceLink] = []
-        system_prompt = self._build_system_prompt(schema)
+        system_prompt = self._prompt_builder.build_system_prompt(schema)
         for chunk in chunks:
             nodes, rels = await self._extract_chunk(chunk, system_prompt)
@@ -55,71 +137,13 @@ class LLMExtractionEngine:
     async def _extract_chunk(
         self, chunk: DocumentChunk, system_prompt: str
     ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
-        response = await self._llm.complete(
+        result = await self._llm.complete_json(
             messages=[{"role": "user", "content": chunk.text}],
+            schema=ChunkExtractionResult,
             system=system_prompt,
             temperature=0.0,
         )
-        return self._parse_response(response)
-    def _build_system_prompt(self, schema: OntologySchema) -> str:
-        node_descriptions = []
-        for nt in schema.node_types:
-            props = ", ".join(
-                f"{p.name} ({p.type}{', required' if p.required else ''})"
-                for p in nt.properties
-            )
-            node_descriptions.append(f"- {nt.label}: properties=[{props}]")
-        rel_descriptions = []
-        for rt in schema.relationship_types:
-            rel_descriptions.append(
-                f"- {rt.type}: {rt.source_types} -> {rt.target_types}"
-            )
-        return (
-            "You are an entity extraction engine. Extract entities and relationships "
-            "from the provided text according to this schema.\n\n"
-            "ALLOWED NODE TYPES:\n"
-            + "\n".join(node_descriptions)
-            + "\n\nALLOWED RELATIONSHIP TYPES:\n"
-            + "\n".join(rel_descriptions)
-            + "\n\nDo not extract entities or relationships not listed above.\n\n"
-            "Respond with ONLY a JSON object in this exact format:\n"
-            '{"nodes": [{"id": "<unique_id>", "label": "<NodeType>", "properties": {<key>: <value>}}], '
-            '"relationships": [{"source_id": "<node_id>", "target_id": "<node_id>", "type": "<RelType>", "properties": {}}]}\n\n'
-            "Rules:\n"
-            "- Every node id must be unique and descriptive (e.g., 'person-alice', 'company-acme')\n"
-            "- Only use node types and relationship types listed above\n"
-            "- Include all required properties for each node type\n"
-            "- Return empty arrays if no entities are found"
-        )
-    def _parse_response(
-        self, response: str
-    ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
-        data = json.loads(response)
-        nodes = [
-            ExtractedNode(
-                id=n["id"],
-                label=n["label"],
-                properties=n.get("properties", {}),
-            )
-            for n in data.get("nodes", [])
-        ]
-        rels = [
-            ExtractedRelationship(
-                source_id=r["source_id"],
-                target_id=r["target_id"],
-                type=r["type"],
-                properties=r.get("properties", {}),
-            )
-            for r in data.get("relationships", [])
-        ]
-        return nodes, rels
+        return result.nodes, result.relationships
     def _validate(
         self,
@@ -127,28 +151,4 @@ class LLMExtractionEngine:
         rels: list[ExtractedRelationship],
         schema: OntologySchema,
     ) -> tuple[list[ExtractedNode], list[ExtractedRelationship]]:
-        allowed_labels = {nt.label for nt in schema.node_types}
-        allowed_rel_types = {rt.type for rt in schema.relationship_types}
-        rel_constraints = {
-            rt.type: (set(rt.source_types), set(rt.target_types))
-            for rt in schema.relationship_types
-        }
-        valid_nodes = [n for n in nodes if n.label in allowed_labels]
-        valid_node_ids = {n.id for n in valid_nodes}
-        node_labels = {n.id: n.label for n in valid_nodes}
-        valid_rels = []
-        for rel in rels:
-            if rel.type not in allowed_rel_types:
-                continue
-            if rel.source_id not in valid_node_ids or rel.target_id not in valid_node_ids:
-                continue
-            source_types, target_types = rel_constraints[rel.type]
-            if node_labels[rel.source_id] not in source_types:
-                continue
-            if node_labels[rel.target_id] not in target_types:
-                continue
-            valid_rels.append(rel)
-        return valid_nodes, valid_rels
+        return validate_extraction(nodes, rels, schema)

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/interfaces.py RENAMED Viewed

@@ -4,6 +4,8 @@ from __future__ import annotations
 from typing import Protocol, runtime_checkable
+from pydantic import BaseModel
 from graphrag_core.models import (
     AgentResult,
     ApplyResult,
@@ -77,6 +79,15 @@ class LLMClient(Protocol):
         max_tokens: int = 4096,
     ) -> str: ...
+    async def complete_json(
+        self,
+        messages: list[dict[str, str]],
+        schema: type[BaseModel],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> BaseModel: ...
 # ---------------------------------------------------------------------------
 # BB2: Schema-Guided Entity Extraction
@@ -94,6 +105,13 @@ class ExtractionEngine(Protocol):
     ) -> ExtractionResult: ...
+@runtime_checkable
+class ExtractionPromptBuilder(Protocol):
+    """Builds the system prompt for LLM-based entity extraction."""
+    def build_system_prompt(self, schema: OntologySchema) -> str: ...
 # ---------------------------------------------------------------------------
 # BB3: Provenance-Native Knowledge Graph
 # ---------------------------------------------------------------------------

graphrag_core-0.4.0/src/graphrag_core/llm/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+"""LLM client implementations."""
+from graphrag_core.llm.base import BaseLLMClient
+__all__: list[str] = ["BaseLLMClient"]
+try:
+    from graphrag_core.llm.anthropic import AnthropicLLMClient
+    __all__.append("AnthropicLLMClient")
+except ImportError:
+    pass
+try:
+    from graphrag_core.llm.openai import OpenAILLMClient
+    __all__.append("OpenAILLMClient")
+except ImportError:
+    pass

graphrag_core-0.4.0/src/graphrag_core/llm/anthropic.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Anthropic Claude LLM client."""
+from __future__ import annotations
+from anthropic import AsyncAnthropic
+from pydantic import BaseModel
+class AnthropicLLMClient:
+    """Thin wrapper around the Anthropic SDK implementing the LLMClient Protocol."""
+    def __init__(
+        self,
+        model: str = "claude-sonnet-4-20250514",
+        api_key: str | None = None,
+    ) -> None:
+        self._model = model
+        self._client = AsyncAnthropic(api_key=api_key)
+    async def complete(
+        self,
+        messages: list[dict[str, str]],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> str:
+        kwargs: dict = {
+            "model": self._model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        if system is not None:
+            kwargs["system"] = system
+        response = await self._client.messages.create(**kwargs)
+        return response.content[0].text
+    async def complete_json(
+        self,
+        messages: list[dict[str, str]],
+        schema: type[BaseModel],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> BaseModel:
+        json_schema = schema.model_json_schema()
+        kwargs: dict = {
+            "model": self._model,
+            "messages": messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "tools": [
+                {
+                    "name": "extract",
+                    "description": "Extract structured data",
+                    "input_schema": json_schema,
+                },
+            ],
+            "tool_choice": {"type": "tool", "name": "extract"},
+        }
+        if system is not None:
+            kwargs["system"] = system
+        response = await self._client.messages.create(**kwargs)
+        tool_block = next(b for b in response.content if b.type == "tool_use")
+        return schema.model_validate(tool_block.input)

graphrag_core-0.4.0/src/graphrag_core/llm/base.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Base LLM client with fallback complete_json() via prompt + parse + retry."""
+from __future__ import annotations
+import json
+from pydantic import BaseModel, ValidationError
+class BaseLLMClient:
+    """Default complete_json() via prompt + parse + retry.
+    Providers with native structured output (OpenAI, Anthropic) override
+    complete_json() directly. This base class provides a working fallback
+    for providers without native support (e.g., local model clients).
+    """
+    async def complete(
+        self,
+        messages: list[dict[str, str]],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> str:
+        raise NotImplementedError
+    async def complete_json(
+        self,
+        messages: list[dict[str, str]],
+        schema: type[BaseModel],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> BaseModel:
+        schema_text = json.dumps(schema.model_json_schema(), indent=2)
+        augmented_system = (system or "") + (
+            f"\n\nRespond with ONLY a JSON object matching this schema:\n{schema_text}\n"
+            "No markdown fences. No explanation. Just the JSON object."
+        )
+        for attempt in range(2):
+            response = await self.complete(
+                messages, system=augmented_system, temperature=temperature, max_tokens=max_tokens,
+            )
+            text = self._strip_json(response)
+            try:
+                return schema.model_validate_json(text)
+            except (json.JSONDecodeError, ValidationError) as exc:
+                if attempt == 0:
+                    augmented_system += (
+                        f"\n\nYour previous response failed validation: {exc}\n"
+                        "Try again. Return ONLY valid JSON."
+                    )
+                else:
+                    raise
+        raise RuntimeError("unreachable")
+    @staticmethod
+    def _strip_json(text: str) -> str:
+        text = text.strip()
+        if text.startswith("```"):
+            nl = text.find("\n")
+            text = text[nl + 1 :] if nl != -1 else ""
+        if text.endswith("```"):
+            text = text[:-3]
+        return text.strip()

graphrag_core-0.4.0/src/graphrag_core/llm/openai.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""OpenAI LLM client."""
+from __future__ import annotations
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+class OpenAILLMClient:
+    """Thin wrapper around the OpenAI SDK implementing the LLMClient Protocol."""
+    def __init__(
+        self,
+        model: str = "gpt-4o",
+        api_key: str | None = None,
+    ) -> None:
+        self._model = model
+        self._client = AsyncOpenAI(api_key=api_key)
+    async def complete(
+        self,
+        messages: list[dict[str, str]],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> str:
+        full_messages = list(messages)
+        if system is not None:
+            full_messages.insert(0, {"role": "system", "content": system})
+        response = await self._client.chat.completions.create(
+            model=self._model,
+            messages=full_messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+        )
+        return response.choices[0].message.content
+    async def complete_json(
+        self,
+        messages: list[dict[str, str]],
+        schema: type[BaseModel],
+        system: str | None = None,
+        temperature: float = 0.0,
+        max_tokens: int = 4096,
+    ) -> BaseModel:
+        full_messages = list(messages)
+        if system is not None:
+            full_messages.insert(0, {"role": "system", "content": system})
+        json_schema = schema.model_json_schema()
+        response = await self._client.chat.completions.create(
+            model=self._model,
+            messages=full_messages,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": schema.__name__,
+                    "schema": json_schema,
+                    "strict": True,
+                },
+            },
+        )
+        return schema.model_validate_json(response.choices[0].message.content)

{graphrag_core-0.2.0 → graphrag_core-0.4.0}/src/graphrag_core/models.py RENAMED Viewed

@@ -68,12 +68,14 @@ class NodeTypeDefinition(BaseModel):
     label: str
     properties: list[PropertyDefinition]
     required_properties: list[str] = []
+    description: str | None = None
 class RelationshipTypeDefinition(BaseModel):
     type: str
     source_types: list[str]
     target_types: list[str]
+    description: str | None = None
 class OntologySchema(BaseModel):
@@ -106,6 +108,12 @@ class ExtractionResult(BaseModel):
     provenance: list[ProvenanceLink]
+class ChunkExtractionResult(BaseModel):
+    """LLM extraction output for a single chunk (no provenance — engine adds that)."""
+    nodes: list[ExtractedNode]
+    relationships: list[ExtractedRelationship]
 # ---------------------------------------------------------------------------
 # BB3: Provenance-Native Knowledge Graph
 # ---------------------------------------------------------------------------

graphrag-core 0.2.0__tar.gz → 0.4.0__tar.gz

graphrag-core 0.2.0__tar.gz → 0.4.0__tar.gz