graphite-engine 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- graphite/__init__.py +47 -0
- graphite/adapters/__init__.py +1 -0
- graphite/adapters/alphaearth.py +213 -0
- graphite/adapters/weathernext.py +93 -0
- graphite/assembler.py +299 -0
- graphite/cache.py +66 -0
- graphite/claim.py +333 -0
- graphite/claim_store.py +133 -0
- graphite/confidence.py +225 -0
- graphite/domain.py +195 -0
- graphite/enums.py +70 -0
- graphite/evidence.py +97 -0
- graphite/features/__init__.py +1 -0
- graphite/features/alphaearth_enricher.py +125 -0
- graphite/features/embedding_similarity.py +193 -0
- graphite/geo_evidence/__init__.py +1 -0
- graphite/geo_evidence/geo_foundation.py +86 -0
- graphite/graph.py +182 -0
- graphite/io.py +194 -0
- graphite/llm.py +150 -0
- graphite/py.typed +0 -0
- graphite/rules.py +100 -0
- graphite/scenario.py +157 -0
- graphite/scenarios/__init__.py +1 -0
- graphite/scenarios/weathernext_forecast.py +140 -0
- graphite/schemas.py +171 -0
- graphite/simulate.py +245 -0
- graphite/text.py +268 -0
- graphite_engine-0.3.0.dist-info/METADATA +205 -0
- graphite_engine-0.3.0.dist-info/RECORD +33 -0
- graphite_engine-0.3.0.dist-info/WHEEL +5 -0
- graphite_engine-0.3.0.dist-info/licenses/LICENSE +191 -0
- graphite_engine-0.3.0.dist-info/top_level.txt +1 -0
graphite/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graphite — Open-source claim verification engine for high-stakes decisions.
|
|
3
|
+
|
|
4
|
+
Core primitives:
|
|
5
|
+
- Claim: the atomic unit of trust — structured assertion with provenance
|
|
6
|
+
- ClaimStore: SQLite-backed registry for persisting and querying claims
|
|
7
|
+
- Provenance: first-class evidence source (document, quote, confidence)
|
|
8
|
+
- ConfidenceScorer: explainable confidence scoring with named factors
|
|
9
|
+
|
|
10
|
+
Pipeline:
|
|
11
|
+
Fetcher → DocumentContext → Extractor → Claim[] → ClaimStore → Verify
|
|
12
|
+
|
|
13
|
+
Also included:
|
|
14
|
+
Graph assembly (GraphAssembler) and shock propagation (simulate, scenario)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
# ── Core schemas ──
|
|
18
|
+
from .schemas import ExtractedEdge, NodeRef, Provenance, InferenceBasis, ExtractionError
|
|
19
|
+
from .enums import EdgeType, NodeType, SourceType, ConfidenceLevel, AssertionMode, EvidenceType
|
|
20
|
+
from .evidence import EvidencePacket, EvidenceData
|
|
21
|
+
|
|
22
|
+
# ── Trust engine primitives ──
|
|
23
|
+
from .claim import Claim, ClaimType, ClaimStatus, ClaimGranularity, ReviewState, ClaimOrigin
|
|
24
|
+
from .claim import ConfidenceFactor, ConfidenceResult
|
|
25
|
+
from .claim_store import ClaimStore
|
|
26
|
+
from .confidence import ConfidenceScorer
|
|
27
|
+
|
|
28
|
+
# ── Assembly ──
|
|
29
|
+
from .assembler import GraphAssembler
|
|
30
|
+
|
|
31
|
+
# ── Domain plugin contracts ──
|
|
32
|
+
from .domain import BaseFetcher, BaseExtractor, BasePipeline, DocumentContext, DomainSpec
|
|
33
|
+
from .domain import register_domain, get_domain, list_domains
|
|
34
|
+
|
|
35
|
+
# ── Rules ──
|
|
36
|
+
from .rules import BaseRuleEngine, RuleResult, ScoreBreakdown
|
|
37
|
+
|
|
38
|
+
# ── Simulation ──
|
|
39
|
+
from .simulate import top_k_paths_from_source, build_blast_radius, map_to_tier
|
|
40
|
+
|
|
41
|
+
# ── Scenario ──
|
|
42
|
+
from .scenario import ScenarioShock, ScenarioRunner
|
|
43
|
+
|
|
44
|
+
# ── I/O ──
|
|
45
|
+
from .io import save_graph, load_graph
|
|
46
|
+
|
|
47
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""graphite.adapters — External data adapters for Graphite Terra."""
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graphite/adapters/alphaearth.py — Cache-first AlphaEarth embedding adapter.
|
|
3
|
+
|
|
4
|
+
Reads 64-dimensional AlphaEarth Foundations embeddings for geographic locations.
|
|
5
|
+
Primary path: local .npy cache files (deterministic, no network needed).
|
|
6
|
+
Optional: GCS COGs at gs://alphaearth_foundations/ (Requester Pays).
|
|
7
|
+
|
|
8
|
+
AlphaEarth Foundations:
|
|
9
|
+
- 64-dim embeddings at ~10m/pixel resolution
|
|
10
|
+
- Annual layers: 2017–2025
|
|
11
|
+
- Multi-sensor: Sentinel-1/2, Landsat, climate sims, 3D laser
|
|
12
|
+
- Earth Engine dataset: GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL
|
|
13
|
+
- GCS bucket: gs://alphaearth_foundations (Requester Pays)
|
|
14
|
+
"""
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Dict, List, Optional, Tuple
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ── Constants ──
EMBEDDING_DIM = 64  # AlphaEarth embeddings are 64-dimensional vectors
GCS_BUCKET = "gs://alphaearth_foundations"  # Requester Pays bucket (needs billing project)
EE_DATASET = "GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL"  # Earth Engine dataset ID
AVAILABLE_YEARS = list(range(2017, 2026))  # annual layers 2017–2025 inclusive
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AlphaEarthAdapter:
    """Read AlphaEarth embeddings — local cache first, GCS optional.

    Primary path: cache/{year}/{node_id}.npy
      - Pre-fetched embeddings stored as numpy arrays
      - Deterministic: same result every run
      - No network dependency for demos/CI

    Optional path: GCS COGs (Requester Pays)
      - Requires billing_project to be set
      - Reads Cloud Optimized GeoTIFFs via rasterio
      - Results cached locally after first fetch

    Usage:
        adapter = AlphaEarthAdapter(cache_dir="cache/alphaearth")

        # From cache (fast, deterministic)
        emb = adapter.get_embedding(29.7355, -95.2690, year=2017)

        # With GCS fallback (requires rasterio + billing project)
        adapter = AlphaEarthAdapter(
            cache_dir="cache/alphaearth",
            billing_project="my-gcp-project",
        )
        emb = adapter.get_embedding(29.7355, -95.2690, year=2017)
    """

    def __init__(
        self,
        cache_dir: str = "cache/alphaearth",
        billing_project: Optional[str] = None,
    ):
        # Root of the .npy cache, laid out as {year}/{key}.npy.
        self.cache_dir = Path(cache_dir)
        # GCP project billed for Requester-Pays reads; None disables GCS.
        self.billing_project = billing_project

    def get_embedding(
        self,
        lat: float,
        lon: float,
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> np.ndarray:
        """Return 64-dim embedding for a point location.

        Checks the local cache first (keyed by node_id or a lat/lon string);
        a GCS fetch is attempted only when billing_project is configured.

        Args:
            lat: Latitude
            lon: Longitude
            year: Annual embedding year (2017–2025)
            node_id: Optional cache key (e.g., "PORT_HOUSTON")

        Returns:
            numpy array of shape (64,)

        Raises:
            FileNotFoundError: when neither the cache nor GCS yields data.
        """
        key = node_id or f"{lat:.4f}_{lon:.4f}"

        # 1. Cache hit is the happy path for demos/CI.
        hit = self._read_cache(key, year)
        if hit is not None:
            return hit

        # 2. GCS is opt-in: only attempted when a billing project is set.
        if self.billing_project:
            fetched = self._fetch_from_gcs(lat, lon, year)
            if fetched is not None:
                self._write_cache(key, year, fetched)
                return fetched

        # 3. No data available anywhere.
        raise FileNotFoundError(
            f"No AlphaEarth embedding found for {key} (year={year}). "
            f"Either pre-fetch to {self.cache_dir}/{year}/{key}.npy "
            f"or set billing_project for GCS access."
        )

    def get_area_embedding(
        self,
        bbox: Tuple[float, float, float, float],
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> np.ndarray:
        """Return mean 64-dim embedding for a bounding box area.

        In cache mode this degrades to get_embedding at the bbox centroid;
        a full GCS implementation would read and average the whole region.

        Args:
            bbox: (min_lat, min_lon, max_lat, max_lon)
            year: Annual embedding year
            node_id: Optional cache key

        Returns:
            numpy array of shape (64,)
        """
        min_lat, min_lon, max_lat, max_lon = bbox
        key = node_id or f"bbox_{min_lat:.4f}_{min_lon:.4f}_{max_lat:.4f}_{max_lon:.4f}"

        hit = self._read_cache(key, year)
        if hit is not None:
            return hit

        # Fallback: point embedding at the centroid; the bbox key is passed
        # through so any GCS fetch gets cached under it.
        return self.get_embedding(
            (min_lat + max_lat) / 2,
            (min_lon + max_lon) / 2,
            year,
            node_id=key,
        )

    def get_embedding_safe(
        self,
        lat: float,
        lon: float,
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> Optional[np.ndarray]:
        """Like get_embedding but returns None instead of raising."""
        try:
            return self.get_embedding(lat, lon, year, node_id)
        except FileNotFoundError:
            return None

    # ── Cache I/O ──

    def _cache_path(self, key: str, year: int) -> Path:
        """Path of the cached .npy file for (key, year)."""
        return self.cache_dir / str(year) / f"{key}.npy"

    def _read_cache(self, key: str, year: int) -> Optional[np.ndarray]:
        """Load a cached embedding; None on miss or unexpected shape."""
        path = self._cache_path(key, year)
        if not path.exists():
            return None
        arr = np.load(path)
        # Reject anything that is not a 64-vector (e.g. stale/odd files).
        return arr if arr.shape == (EMBEDDING_DIM,) else None

    def _write_cache(self, key: str, year: int, embedding: np.ndarray):
        """Persist an embedding, creating the year directory if needed."""
        path = self._cache_path(key, year)
        path.parent.mkdir(parents=True, exist_ok=True)
        np.save(path, embedding)

    # ── GCS fetch (optional, requires rasterio) ──

    def _fetch_from_gcs(
        self, lat: float, lon: float, year: int
    ) -> Optional[np.ndarray]:
        """Fetch embedding from GCS Cloud Optimized GeoTIFF.

        Requires:
          - rasterio package installed
          - GCP billing project with access to requester-pays buckets

        Currently a stub: always returns None after the import check.
        """
        try:
            import rasterio
            from rasterio.crs import CRS
        except ImportError:
            return None

        # GCS COGs are organized by year and UTM zone. A real implementation
        # would:
        #   1. Determine UTM zone from lat/lon
        #   2. Construct GCS path: gs://alphaearth_foundations/{year}/{zone}/...
        #   3. Open COG with rasterio using GDAL_HTTP_HEADER_AUTH
        #   4. Sample at lat/lon → 64-dim array
        #
        # Left as a stub because the bucket is Requester Pays (needs a
        # billing project), demos must work offline from cache, and a full
        # implementation needs GDAL/rasterio + GCP auth setup.
        return None

    # ── Batch operations ──

    def list_cached(self, year: int = 2017) -> List[str]:
        """List all cached embedding keys for a year."""
        year_dir = self.cache_dir / str(year)
        if not year_dir.exists():
            return []
        return [p.stem for p in year_dir.glob("*.npy")]

    def cache_stats(self) -> Dict[str, int]:
        """Return count of cached embeddings per year (non-empty years only)."""
        counts = ((str(y), len(self.list_cached(y))) for y in AVAILABLE_YEARS)
        return {year: n for year, n in counts if n > 0}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graphite/adapters/weathernext.py — Sample-first WeatherNext 2 forecast adapter.
|
|
3
|
+
|
|
4
|
+
Reads WeatherNext 2 ensemble forecast data for geographic locations.
|
|
5
|
+
Primary path: local forecast_snapshot.json (deterministic, no network needed).
|
|
6
|
+
Optional (--live): Earth Engine / BigQuery query (requires approved data request form).
|
|
7
|
+
|
|
8
|
+
WeatherNext 2:
|
|
9
|
+
- 0.25° resolution, 64-member ensemble
|
|
10
|
+
- Fields: temperature, wind, precipitation, humidity, pressure
|
|
11
|
+
- Coverage: 2022-present, 6-hour init times, up to 15-day lead time
|
|
12
|
+
- Access: EE/BigQuery (requires data request form)
|
|
13
|
+
- Note: Experimental dataset, not validated for real-world use
|
|
14
|
+
"""
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class WeatherNextAdapter:
    """Read WeatherNext 2 forecasts — sample snapshot first, live optional.

    Primary path: forecast_snapshot.json
      - Deterministic forecast data for demo nodes
      - No network dependency for demos/CI

    Optional path (live=True): Earth Engine / BigQuery
      - Requires approved data request form
      - Not implemented in v1

    Usage:
        adapter = WeatherNextAdapter(snapshot_path="forecast_snapshot.json")
        forecast = adapter.get_forecast("asset:PORT_HOUSTON")
    """

    def __init__(
        self,
        snapshot_path: Optional[str] = None,
        live: bool = False,
    ):
        # live=True enables the (stub) Earth Engine / BigQuery path.
        self.live = live
        # node_id → forecast point dict; None until a snapshot is loaded.
        self._data: Optional[Dict[str, Dict[str, Any]]] = None
        # BUG FIX: _meta was previously only assigned inside _load_snapshot(),
        # so reading `.meta` on an adapter built without a snapshot raised
        # AttributeError. Initialize it here so `.meta` is always safe.
        self._meta: Dict[str, Any] = {}
        self._snapshot_path = snapshot_path

        if snapshot_path:
            self._load_snapshot(snapshot_path)

    def _load_snapshot(self, path: str):
        """Load forecast data from a snapshot JSON file.

        Expects a JSON object with optional "meta" and "forecast_points"
        keys; points without a "node_id" are skipped.
        """
        with open(path) as f:
            raw = json.load(f)

        self._meta = raw.get("meta", {})
        self._data = {}

        for point in raw.get("forecast_points", []):
            node_id = point.get("node_id", "")
            if node_id:
                self._data[node_id] = point

    @property
    def meta(self) -> Dict[str, Any]:
        """Return forecast metadata ({} when no snapshot is loaded)."""
        return self._meta if self._meta else {}

    def get_forecast(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Return forecast fields for a node.

        Returns None if the node is not in the snapshot (and live mode is
        off or yields nothing).
        """
        if self._data and node_id in self._data:
            return self._data[node_id]

        if self.live:
            return self._fetch_live(node_id)

        return None

    def get_all_forecasts(self) -> Dict[str, Dict[str, Any]]:
        """Return all forecast points from the snapshot (copy)."""
        return dict(self._data) if self._data else {}

    def list_nodes(self) -> List[str]:
        """List all node IDs with forecast data."""
        return list(self._data.keys()) if self._data else []

    def _fetch_live(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Fetch live forecast from Earth Engine / BigQuery.

        Not implemented in v1 — requires approved data request form.
        """
        # Stub: would use ee.ImageCollection or BigQuery SQL
        return None
|
graphite/assembler.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graphite/assembler.py — Graph assembly from extracted edges.
|
|
3
|
+
|
|
4
|
+
The assembler sits between extractors and writers:
|
|
5
|
+
Extractor → ExtractedEdge[] → **Assembler** → nx.DiGraph → Writer
|
|
6
|
+
|
|
7
|
+
Responsibilities:
|
|
8
|
+
- Deduplicate edges (merge provenance when same relationship from different sources)
|
|
9
|
+
- Resolve conflicting attributes (merge policy with priority)
|
|
10
|
+
- Normalize nodes
|
|
11
|
+
- Collect extraction errors
|
|
12
|
+
- Stamp graph metadata
|
|
13
|
+
"""
|
|
14
|
+
import json
|
|
15
|
+
import math
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from typing import Dict, List, Optional, Set
|
|
19
|
+
|
|
20
|
+
import networkx as nx
|
|
21
|
+
|
|
22
|
+
from .enums import AssertionMode, ConfidenceLevel, SourceType, EvidenceType
|
|
23
|
+
from .schemas import ExtractedEdge, ExtractionError, Provenance
|
|
24
|
+
from .domain import DomainSpec
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ═══════════════════════════════════════
# Merge Policy
# ═══════════════════════════════════════
# Priority tables consulted when two extractions of the same edge disagree.
# Higher numbers win; see _provenance_score for how the first three combine.

_EVIDENCE_PRIORITY = {
    EvidenceType.TABLE_CELL: 4,  # structured table data: strongest evidence
    EvidenceType.TEXT_QUOTE: 3,  # verbatim quote from a document
    EvidenceType.DERIVED: 2,     # computed/derived from other evidence
    EvidenceType.MANUAL: 1,      # hand-entered: weakest
}

_SOURCE_PRIORITY = {
    SourceType.USGS_MCS: 4,  # authoritative government dataset
    SourceType.SEC_10K: 3,   # regulatory filings
    SourceType.SEC_20F: 3,
    SourceType.PDF: 2,
    SourceType.WEB: 1,
    SourceType.MANUAL: 1,
}

_ASSERTION_PRIORITY = {
    AssertionMode.EXTRACTED: 3,  # directly extracted from a source
    AssertionMode.INFERRED: 2,   # inferred by a rule/model
    AssertionMode.SEEDED: 1,     # seeded manually/by bootstrap data
}

_CONFIDENCE_PRIORITY = {
    ConfidenceLevel.HIGH: 3,
    ConfidenceLevel.MEDIUM: 2,
    ConfidenceLevel.LOW: 1,
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _provenance_score(p: Provenance) -> int:
    """Score a provenance record for merge priority (higher = prefer).

    Weights evidence type (x100) over source type (x10) over confidence
    so that a stronger evidence kind always beats a stronger source.
    """
    evidence = _EVIDENCE_PRIORITY.get(p.evidence_type, 0)
    source = _SOURCE_PRIORITY.get(p.source_type, 0)
    confidence = _CONFIDENCE_PRIORITY.get(p.confidence, 0)
    return evidence * 100 + source * 10 + confidence
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class GraphAssembler:
    """Assemble a NetworkX graph from normalized ExtractedEdge objects.

    Usage:
        assembler = GraphAssembler()
        G = assembler.assemble(edges)
    """

    def __init__(
        self,
        pipeline_version: str = "1.0",
        domain_spec: Optional[DomainSpec] = None,
        drop_zero_provenance: bool = True,
        drop_low_inferred: bool = False,
    ):
        self.pipeline_version = pipeline_version
        self.domain_spec = domain_spec
        # Quality gate: drop edges that carry no provenance at all.
        self.drop_zero_provenance = drop_zero_provenance
        # Optional gate: drop INFERRED edges whose best confidence is LOW.
        self.drop_low_inferred = drop_low_inferred
        # Accumulates validation/filter errors across assemble() calls.
        self.errors: List[ExtractionError] = []

    def assemble(
        self,
        edges: List[ExtractedEdge],
        node_labels: Optional[Dict[str, str]] = None,
    ) -> nx.DiGraph:
        """Assemble edges into a NetworkX DiGraph.

        Args:
            edges: Normalized extracted edges
            node_labels: Optional node_id → display label mapping

        Returns:
            Assembled and stamped nx.DiGraph
        """
        # 1. Validate edge types against domain registry
        if self.domain_spec:
            edges = self._validate_edge_types(edges)

        # 2. Quality filters
        edges = self._quality_filter(edges)

        # 3. Deduplicate
        deduped = self.dedupe_edges(edges)

        # 4. Build graph
        G = nx.DiGraph()
        labels = node_labels or {}

        # Collect all nodes (first reference wins for type/name)
        all_nodes: Dict[str, dict] = {}
        for edge in deduped:
            for nref in (edge.from_node, edge.to_node):
                if nref.node_id not in all_nodes:
                    all_nodes[nref.node_id] = {
                        "node_type": nref.node_type.value,
                        "name": nref.label or labels.get(nref.node_id, nref.node_id),
                    }

        for nid, attrs in all_nodes.items():
            G.add_node(nid, **attrs)

        # Add edges
        for edge in deduped:
            weight = edge.attributes.get("bucket_weight", 0.5)
            # -log(weight) turns multiplicative path weight into additive
            # path cost; clamp at 0.01 to keep the cost finite.
            cost = -math.log(max(weight, 0.01))

            edge_attrs = {
                "edge_type": edge.edge_type,
                "assertion_mode": edge.assertion_mode.value,
                "bucket_weight": weight,
                "cost": round(cost, 6),
                "confidence": edge.best_confidence.value,
            }

            # Flatten domain attributes (but not nested structures)
            for k, v in edge.attributes.items():
                if k not in edge_attrs and not isinstance(v, (dict, list)):
                    edge_attrs[k] = v

            # Serialize provenance as JSON string (GraphML compatible)
            edge_attrs["provenance_json"] = json.dumps(
                [p.model_dump() for p in edge.provenance], default=str
            )
            edge_attrs["provenance_count"] = len(edge.provenance)
            edge_attrs["evidence"] = edge.provenance[0].evidence_quote if edge.provenance else ""
            edge_attrs["data_source"] = edge.provenance[0].source_type.value if edge.provenance else ""

            # Inference basis
            if edge.inference_basis:
                edge_attrs["inference_method"] = edge.inference_basis.method
                edge_attrs["inference_reason"] = edge.inference_basis.reason

            # Claim linkage (trust engine v1)
            if edge.claim_ids:
                edge_attrs["claim_ids"] = json.dumps(edge.claim_ids)

            G.add_edge(edge.from_node.node_id, edge.to_node.node_id, **edge_attrs)

        # 5. Stamp
        return self._stamp_graph(G)

    def dedupe_edges(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Merge edges with same (from, to, type) — combine provenances."""
        by_key: Dict[str, ExtractedEdge] = {}

        for edge in edges:
            key = edge.edge_key
            if key in by_key:
                by_key[key] = self._merge_edge_pair(by_key[key], edge)
            else:
                by_key[key] = edge

        return list(by_key.values())

    def _merge_edge_pair(self, a: ExtractedEdge, b: ExtractedEdge) -> ExtractedEdge:
        """Merge two edges with the same key.

        Provenance lists are concatenated (deduped by source_id), the
        stronger assertion mode wins, and conflicting attribute values are
        resolved by provenance score while every reported value is recorded
        under "{key}_reported_values".
        """
        # Merge provenance (dedupe by source_id)
        seen_sources = {p.source_id for p in a.provenance}
        merged_prov = list(a.provenance)
        for p in b.provenance:
            if p.source_id not in seen_sources:
                merged_prov.append(p)
                seen_sources.add(p.source_id)

        # Stronger assertion mode (EXTRACTED > INFERRED > SEEDED)
        mode = (
            a.assertion_mode
            if _ASSERTION_PRIORITY.get(a.assertion_mode, 0)
            >= _ASSERTION_PRIORITY.get(b.assertion_mode, 0)
            else b.assertion_mode
        )

        # Provenance strength decides which side wins attribute conflicts.
        # Hoisted out of the loop: the scores do not depend on the key.
        a_score = max((_provenance_score(p) for p in a.provenance), default=0)
        b_score = max((_provenance_score(p) for p in b.provenance), default=0)

        # Merge attributes with conflict tracking
        merged_attrs = dict(a.attributes)
        for k, v in b.attributes.items():
            if k in merged_attrs and merged_attrs[k] != v:
                # Record every reported value so downstream consumers can
                # see the disagreement, not just the winner.
                rv_key = f"{k}_reported_values"
                existing_rv = merged_attrs.get(rv_key, [])
                if not existing_rv:
                    a_prov = a.provenance[0] if a.provenance else None
                    existing_rv.append({
                        "value": merged_attrs[k],
                        "source": a_prov.source_type.value if a_prov else "unknown",
                        "confidence": a_prov.confidence.value if a_prov else "LOW",
                    })
                b_prov = b.provenance[0] if b.provenance else None
                existing_rv.append({
                    "value": v,
                    "source": b_prov.source_type.value if b_prov else "unknown",
                    "confidence": b_prov.confidence.value if b_prov else "LOW",
                })
                merged_attrs[rv_key] = existing_rv

                # BUG FIX: previously both branches of this comparison
                # assigned b's value, making the provenance comparison dead
                # code — b silently won every conflict. Now b's value only
                # replaces a's when b is strictly better-sourced.
                if b_score > a_score:
                    merged_attrs[k] = v
            else:
                # No conflict: take b's value (also adds keys a lacked).
                merged_attrs[k] = v

        basis = a.inference_basis or b.inference_basis

        return ExtractedEdge(
            from_node=a.from_node,
            to_node=a.to_node,
            edge_type=a.edge_type,
            assertion_mode=mode,
            attributes=merged_attrs,
            provenance=merged_prov,
            inference_basis=basis,
        )

    def _validate_edge_types(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Check edge types against the domain registry; log rejects."""
        allowed = set(self.domain_spec.allowed_edge_types) if self.domain_spec else set()
        if not allowed:
            # Empty registry means "no restriction".
            return edges

        valid = []
        for edge in edges:
            if edge.edge_type in allowed:
                valid.append(edge)
            else:
                self.errors.append(ExtractionError(
                    entity_id=edge.from_node.node_id,
                    source_type=edge.provenance[0].source_type if edge.provenance else SourceType.MANUAL,
                    error_type="validation_failed",
                    message=f"Edge type '{edge.edge_type}' not in domain allowed types: {allowed}",
                ))
        return valid

    def _quality_filter(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Apply quality filters (zero-provenance, low-confidence inferred)."""
        filtered = []
        for edge in edges:
            if self.drop_zero_provenance and not edge.provenance:
                self.errors.append(ExtractionError(
                    entity_id=edge.from_node.node_id,
                    source_type=SourceType.MANUAL,
                    error_type="no_edges",
                    message=f"Edge {edge.edge_key} dropped: zero provenance",
                ))
                continue
            if (self.drop_low_inferred
                    and edge.assertion_mode == AssertionMode.INFERRED
                    and edge.best_confidence == ConfidenceLevel.LOW):
                continue
            filtered.append(edge)
        return filtered

    def _stamp_graph(self, G: nx.DiGraph) -> nx.DiGraph:
        """Add build metadata (counts, type histograms, domain) to the graph."""
        edge_types = defaultdict(int)
        assertion_modes = defaultdict(int)
        for _, _, d in G.edges(data=True):
            edge_types[d.get("edge_type", "?")] += 1
            assertion_modes[d.get("assertion_mode", "?")] += 1

        G.graph["built_at"] = datetime.now(timezone.utc).isoformat()
        G.graph["pipeline_version"] = self.pipeline_version
        G.graph["node_count"] = G.number_of_nodes()
        G.graph["edge_count"] = G.number_of_edges()
        # JSON-encoded so the metadata survives GraphML serialization.
        G.graph["edge_types"] = json.dumps(dict(edge_types))
        G.graph["assertion_modes"] = json.dumps(dict(assertion_modes))
        if self.domain_spec:
            G.graph["domain"] = self.domain_spec.name

        return G
|