graphite-engine 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
graphite/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ Graphite — Open-source claim verification engine for high-stakes decisions.
3
+
4
+ Core primitives:
5
+ - Claim: the atomic unit of trust — structured assertion with provenance
6
+ - ClaimStore: SQLite-backed registry for persisting and querying claims
7
+ - Provenance: first-class evidence source (document, quote, confidence)
8
+ - ConfidenceScorer: explainable confidence scoring with named factors
9
+
10
+ Pipeline:
11
+ Fetcher → DocumentContext → Extractor → Claim[] → ClaimStore → Verify
12
+
13
+ Also included:
14
+ Graph assembly (GraphAssembler) and shock propagation (simulate, scenario)
15
+ """
16
+
17
+ # ── Core schemas ──
18
+ from .schemas import ExtractedEdge, NodeRef, Provenance, InferenceBasis, ExtractionError
19
+ from .enums import EdgeType, NodeType, SourceType, ConfidenceLevel, AssertionMode, EvidenceType
20
+ from .evidence import EvidencePacket, EvidenceData
21
+
22
+ # ── Trust engine primitives ──
23
+ from .claim import Claim, ClaimType, ClaimStatus, ClaimGranularity, ReviewState, ClaimOrigin
24
+ from .claim import ConfidenceFactor, ConfidenceResult
25
+ from .claim_store import ClaimStore
26
+ from .confidence import ConfidenceScorer
27
+
28
+ # ── Assembly ──
29
+ from .assembler import GraphAssembler
30
+
31
+ # ── Domain plugin contracts ──
32
+ from .domain import BaseFetcher, BaseExtractor, BasePipeline, DocumentContext, DomainSpec
33
+ from .domain import register_domain, get_domain, list_domains
34
+
35
+ # ── Rules ──
36
+ from .rules import BaseRuleEngine, RuleResult, ScoreBreakdown
37
+
38
+ # ── Simulation ──
39
+ from .simulate import top_k_paths_from_source, build_blast_radius, map_to_tier
40
+
41
+ # ── Scenario ──
42
+ from .scenario import ScenarioShock, ScenarioRunner
43
+
44
+ # ── I/O ──
45
+ from .io import save_graph, load_graph
46
+
47
+ __version__ = "0.3.0"
@@ -0,0 +1 @@
1
+ """graphite.adapters — External data adapters for Graphite Terra."""
@@ -0,0 +1,213 @@
1
+ """
2
+ graphite/adapters/alphaearth.py — Cache-first AlphaEarth embedding adapter.
3
+
4
+ Reads 64-dimensional AlphaEarth Foundations embeddings for geographic locations.
5
+ Primary path: local .npy cache files (deterministic, no network needed).
6
+ Optional: GCS COGs at gs://alphaearth_foundations/ (Requester Pays).
7
+
8
+ AlphaEarth Foundations:
9
+ - 64-dim embeddings at ~10m/pixel resolution
10
+ - Annual layers: 2017–2025
11
+ - Multi-sensor: Sentinel-1/2, Landsat, climate sims, 3D laser
12
+ - Earth Engine dataset: GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL
13
+ - GCS bucket: gs://alphaearth_foundations (Requester Pays)
14
+ """
15
+ import json
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional, Tuple
19
+
20
+ import numpy as np
21
+
22
+
23
# ── Constants ──
EMBEDDING_DIM = 64
GCS_BUCKET = "gs://alphaearth_foundations"
EE_DATASET = "GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL"
AVAILABLE_YEARS = list(range(2017, 2026))


class AlphaEarthAdapter:
    """Read AlphaEarth embeddings — local cache first, GCS optional.

    Resolution happens in two stages:

    1. Local cache: ``cache/{year}/{key}.npy`` — pre-fetched numpy arrays,
       deterministic and network-free (the demo/CI path).
    2. GCS COGs (Requester Pays) — only attempted when ``billing_project``
       is configured; successful fetches are written back into the cache.

    Usage:
        adapter = AlphaEarthAdapter(cache_dir="cache/alphaearth")

        # From cache (fast, deterministic)
        emb = adapter.get_embedding(29.7355, -95.2690, year=2017)

        # With GCS fallback (requires rasterio + billing project)
        adapter = AlphaEarthAdapter(
            cache_dir="cache/alphaearth",
            billing_project="my-gcp-project",
        )
        emb = adapter.get_embedding(29.7355, -95.2690, year=2017)
    """

    def __init__(
        self,
        cache_dir: str = "cache/alphaearth",
        billing_project: Optional[str] = None,
    ):
        self.cache_dir = Path(cache_dir)
        self.billing_project = billing_project

    def get_embedding(
        self,
        lat: float,
        lon: float,
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> np.ndarray:
        """Return the 64-dim embedding for a point location.

        The cache is consulted first (keyed by ``node_id`` or a formatted
        lat/lon string); GCS is tried only when a billing project is set.

        Args:
            lat: Latitude
            lon: Longitude
            year: Annual embedding year (2017–2025)
            node_id: Optional cache key (e.g., "PORT_HOUSTON")

        Returns:
            numpy array of shape (64,)

        Raises:
            FileNotFoundError: when neither cache nor GCS yields data.
        """
        key = node_id if node_id else f"{lat:.4f}_{lon:.4f}"

        # 1. Local cache
        hit = self._read_cache(key, year)
        if hit is not None:
            return hit

        # 2. Optional GCS fallback, cached on success
        if self.billing_project:
            fetched = self._fetch_from_gcs(lat, lon, year)
            if fetched is not None:
                self._write_cache(key, year, fetched)
                return fetched

        # 3. Nothing available
        raise FileNotFoundError(
            f"No AlphaEarth embedding found for {key} (year={year}). "
            f"Either pre-fetch to {self.cache_dir}/{year}/{key}.npy "
            f"or set billing_project for GCS access."
        )

    def get_area_embedding(
        self,
        bbox: Tuple[float, float, float, float],
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> np.ndarray:
        """Return the mean 64-dim embedding for a bounding box area.

        Cache mode falls back to ``get_embedding`` at the bbox centroid;
        GCS mode would read and average the full bbox region.

        Args:
            bbox: (min_lat, min_lon, max_lat, max_lon)
            year: Annual embedding year
            node_id: Optional cache key

        Returns:
            numpy array of shape (64,)
        """
        lat_lo, lon_lo, lat_hi, lon_hi = bbox
        key = node_id if node_id else (
            f"bbox_{lat_lo:.4f}_{lon_lo:.4f}_{lat_hi:.4f}_{lon_hi:.4f}"
        )

        hit = self._read_cache(key, year)
        if hit is not None:
            return hit

        # Not cached: approximate the area by its centroid point.
        return self.get_embedding(
            (lat_lo + lat_hi) / 2,
            (lon_lo + lon_hi) / 2,
            year,
            node_id=key,
        )

    def get_embedding_safe(
        self,
        lat: float,
        lon: float,
        year: int = 2017,
        node_id: Optional[str] = None,
    ) -> Optional[np.ndarray]:
        """Like get_embedding but returns None instead of raising."""
        try:
            return self.get_embedding(lat, lon, year, node_id)
        except FileNotFoundError:
            return None

    # ── Cache I/O ──

    def _cache_path(self, key: str, year: int) -> Path:
        """Location of one cached embedding: cache_dir/{year}/{key}.npy."""
        return self.cache_dir / str(year) / f"{key}.npy"

    def _read_cache(self, key: str, year: int) -> Optional[np.ndarray]:
        """Load a cached embedding; None when missing or wrongly shaped."""
        target = self._cache_path(key, year)
        if not target.exists():
            return None
        loaded = np.load(target)
        return loaded if loaded.shape == (EMBEDDING_DIM,) else None

    def _write_cache(self, key: str, year: int, embedding: np.ndarray):
        """Persist an embedding, creating the year directory if needed."""
        target = self._cache_path(key, year)
        target.parent.mkdir(parents=True, exist_ok=True)
        np.save(target, embedding)

    # ── GCS fetch (optional, requires rasterio) ──

    def _fetch_from_gcs(
        self, lat: float, lon: float, year: int
    ) -> Optional[np.ndarray]:
        """Fetch an embedding from a GCS Cloud Optimized GeoTIFF.

        Requires:
        - rasterio package installed
        - GCP billing project with access to requester-pays buckets

        Returns None whenever the dependency or data is unavailable.
        """
        try:
            import rasterio
            from rasterio.crs import CRS
        except ImportError:
            return None

        # Intentionally stubbed. A real implementation would: derive the
        # UTM zone from lat/lon, build the COG path under
        # gs://alphaearth_foundations/{year}/{zone}/..., open it with
        # rasterio (GDAL_HTTP_HEADER_AUTH), and sample a 64-dim vector at
        # the point. Left unimplemented because the bucket is Requester
        # Pays and demos must run from cache without network/GDAL setup.
        return None

    # ── Batch operations ──

    def list_cached(self, year: int = 2017) -> List[str]:
        """List all cached embedding keys for a year."""
        year_dir = self.cache_dir / str(year)
        if not year_dir.exists():
            return []
        return [entry.stem for entry in year_dir.glob("*.npy")]

    def cache_stats(self) -> Dict[str, int]:
        """Return count of cached embeddings per year (non-empty years only)."""
        return {
            str(year): total
            for year in AVAILABLE_YEARS
            if (total := len(self.list_cached(year))) > 0
        }
@@ -0,0 +1,93 @@
1
+ """
2
+ graphite/adapters/weathernext.py — Sample-first WeatherNext 2 forecast adapter.
3
+
4
+ Reads WeatherNext 2 ensemble forecast data for geographic locations.
5
+ Primary path: local forecast_snapshot.json (deterministic, no network needed).
6
+ Optional (--live): Earth Engine / BigQuery query (requires approved data request form).
7
+
8
+ WeatherNext 2:
9
+ - 0.25° resolution, 64-member ensemble
10
+ - Fields: temperature, wind, precipitation, humidity, pressure
11
+ - Coverage: 2022-present, 6-hour init times, up to 15-day lead time
12
+ - Access: EE/BigQuery (requires data request form)
13
+ - Note: Experimental dataset, not validated for real-world use
14
+ """
15
+ import json
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional
18
+
19
+
20
class WeatherNextAdapter:
    """Read WeatherNext 2 forecasts — sample snapshot first, live optional.

    Primary path: forecast_snapshot.json
    - Deterministic forecast data for demo nodes
    - No network dependency for demos/CI

    Optional path (live=True): Earth Engine / BigQuery
    - Requires approved data request form
    - Not implemented in v1

    Usage:
        adapter = WeatherNextAdapter(snapshot_path="forecast_snapshot.json")
        forecast = adapter.get_forecast("asset:PORT_HOUSTON")
    """

    def __init__(
        self,
        snapshot_path: Optional[str] = None,
        live: bool = False,
    ):
        self.live = live
        self._data: Optional[Dict[str, Dict[str, Any]]] = None
        # Fix: initialize _meta unconditionally. It was previously set only
        # inside _load_snapshot, so constructing the adapter without a
        # snapshot_path made the `meta` property raise AttributeError.
        self._meta: Dict[str, Any] = {}
        self._snapshot_path = snapshot_path

        if snapshot_path:
            self._load_snapshot(snapshot_path)

    def _load_snapshot(self, path: str):
        """Load forecast data from a snapshot JSON file.

        Populates ``self._meta`` and indexes forecast points by node_id;
        points with a missing/empty node_id are skipped.
        """
        # Explicit encoding so snapshot parsing is platform-independent.
        with open(path, encoding="utf-8") as f:
            raw = json.load(f)

        self._meta = raw.get("meta", {})
        self._data = {}

        for point in raw.get("forecast_points", []):
            node_id = point.get("node_id", "")
            if node_id:
                self._data[node_id] = point

    @property
    def meta(self) -> Dict[str, Any]:
        """Return forecast metadata ({} when no snapshot is loaded)."""
        return self._meta if self._meta else {}

    def get_forecast(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Return forecast fields for a node.

        Returns None if the node is not in the snapshot (and live mode is
        off or yields nothing).
        """
        if self._data and node_id in self._data:
            return self._data[node_id]

        if self.live:
            return self._fetch_live(node_id)

        return None

    def get_all_forecasts(self) -> Dict[str, Dict[str, Any]]:
        """Return all forecast points from the snapshot (shallow copy)."""
        return dict(self._data) if self._data else {}

    def list_nodes(self) -> List[str]:
        """List all node IDs with forecast data."""
        return list(self._data.keys()) if self._data else []

    def _fetch_live(self, node_id: str) -> Optional[Dict[str, Any]]:
        """Fetch live forecast from Earth Engine / BigQuery.

        Not implemented in v1 — requires approved data request form.
        """
        # Stub: would use ee.ImageCollection or BigQuery SQL
        return None
graphite/assembler.py ADDED
@@ -0,0 +1,299 @@
1
+ """
2
+ graphite/assembler.py — Graph assembly from extracted edges.
3
+
4
+ The assembler sits between extractors and writers:
5
+ Extractor → ExtractedEdge[] → **Assembler** → nx.DiGraph → Writer
6
+
7
+ Responsibilities:
8
+ - Deduplicate edges (merge provenance when same relationship from different sources)
9
+ - Resolve conflicting attributes (merge policy with priority)
10
+ - Normalize nodes
11
+ - Collect extraction errors
12
+ - Stamp graph metadata
13
+ """
14
+ import json
15
+ import math
16
+ from collections import defaultdict
17
+ from datetime import datetime, timezone
18
+ from typing import Dict, List, Optional, Set
19
+
20
+ import networkx as nx
21
+
22
+ from .enums import AssertionMode, ConfidenceLevel, SourceType, EvidenceType
23
+ from .schemas import ExtractedEdge, ExtractionError, Provenance
24
+ from .domain import DomainSpec
25
+
26
+
27
# ═══════════════════════════════════════
# Merge Policy
# ═══════════════════════════════════════
# Priority tables consumed by _provenance_score and the edge-merge logic:
# when two extracted edges disagree about the same attribute, the side
# backed by the higher-ranked provenance wins. Larger number = preferred.

# How the evidence was captured: structured table cells beat free-text
# quotes, which beat derived values and manual entries.
_EVIDENCE_PRIORITY = {
    EvidenceType.TABLE_CELL: 4,
    EvidenceType.TEXT_QUOTE: 3,
    EvidenceType.DERIVED: 2,
    EvidenceType.MANUAL: 1,
}

# Where the document came from: USGS Mineral Commodity Summaries rank
# above SEC filings, which rank above generic PDF/web/manual sources.
_SOURCE_PRIORITY = {
    SourceType.USGS_MCS: 4,
    SourceType.SEC_10K: 3,
    SourceType.SEC_20F: 3,
    SourceType.PDF: 2,
    SourceType.WEB: 1,
    SourceType.MANUAL: 1,
}

# How the edge was asserted: directly extracted beats inferred, which
# beats seeded/bootstrap edges.
_ASSERTION_PRIORITY = {
    AssertionMode.EXTRACTED: 3,
    AssertionMode.INFERRED: 2,
    AssertionMode.SEEDED: 1,
}

# Extractor-reported confidence level (lowest-weight tiebreaker).
_CONFIDENCE_PRIORITY = {
    ConfidenceLevel.HIGH: 3,
    ConfidenceLevel.MEDIUM: 2,
    ConfidenceLevel.LOW: 1,
}
58
+
59
+
60
def _provenance_score(p: Provenance) -> int:
    """Score a provenance for merge priority. Higher = prefer."""
    evidence_rank = _EVIDENCE_PRIORITY.get(p.evidence_type, 0)
    source_rank = _SOURCE_PRIORITY.get(p.source_type, 0)
    confidence_rank = _CONFIDENCE_PRIORITY.get(p.confidence, 0)
    # Weighted so evidence type dominates source type, which dominates
    # extractor confidence (each rank fits in a single decimal digit).
    return evidence_rank * 100 + source_rank * 10 + confidence_rank
67
+
68
+
69
class GraphAssembler:
    """Assemble a NetworkX graph from normalized ExtractedEdge objects.

    Pipeline position: Extractor → ExtractedEdge[] → Assembler → nx.DiGraph.
    Validates edge types against the domain registry, applies quality
    filters, deduplicates (merging provenance), builds the graph, and
    stamps metadata. Edges dropped along the way are recorded as
    ExtractionError entries in ``self.errors``.

    Usage:
        assembler = GraphAssembler()
        G = assembler.assemble(edges)
    """

    def __init__(
        self,
        pipeline_version: str = "1.0",
        domain_spec: Optional[DomainSpec] = None,
        drop_zero_provenance: bool = True,
        drop_low_inferred: bool = False,
    ):
        self.pipeline_version = pipeline_version
        self.domain_spec = domain_spec
        # Quality knobs: drop edges with no provenance at all, and
        # optionally drop low-confidence inferred edges.
        self.drop_zero_provenance = drop_zero_provenance
        self.drop_low_inferred = drop_low_inferred
        self.errors: List[ExtractionError] = []

    def assemble(
        self,
        edges: List[ExtractedEdge],
        node_labels: Optional[Dict[str, str]] = None,
    ) -> nx.DiGraph:
        """Assemble edges into a NetworkX DiGraph.

        Args:
            edges: Normalized extracted edges
            node_labels: Optional node_id → display label mapping

        Returns:
            Assembled and stamped nx.DiGraph
        """
        # 1. Validate edge types against domain registry
        if self.domain_spec:
            edges = self._validate_edge_types(edges)

        # 2. Quality filters
        edges = self._quality_filter(edges)

        # 3. Deduplicate (merges provenance across sources)
        deduped = self.dedupe_edges(edges)

        # 4. Build graph
        G = nx.DiGraph()
        labels = node_labels or {}

        # Register each endpoint node once, preferring the edge-supplied
        # label, then the caller-supplied mapping, then the raw id.
        all_nodes: Dict[str, dict] = {}
        for edge in deduped:
            for nref in (edge.from_node, edge.to_node):
                if nref.node_id not in all_nodes:
                    all_nodes[nref.node_id] = {
                        "node_type": nref.node_type.value,
                        "name": nref.label or labels.get(nref.node_id, nref.node_id),
                    }

        for nid, attrs in all_nodes.items():
            G.add_node(nid, **attrs)

        for edge in deduped:
            G.add_edge(
                edge.from_node.node_id,
                edge.to_node.node_id,
                **self._edge_attributes(edge),
            )

        # 5. Stamp
        return self._stamp_graph(G)

    def _edge_attributes(self, edge: ExtractedEdge) -> dict:
        """Build the flat, GraphML-safe attribute dict for one edge."""
        weight = edge.attributes.get("bucket_weight", 0.5)
        # Path-finding cost: -log(weight), with weight floored at 0.01 so
        # a zero/near-zero weight cannot produce an infinite cost.
        cost = -math.log(max(weight, 0.01))

        edge_attrs = {
            "edge_type": edge.edge_type,
            "assertion_mode": edge.assertion_mode.value,
            "bucket_weight": weight,
            "cost": round(cost, 6),
            "confidence": edge.best_confidence.value,
        }

        # Flatten domain attributes (but not nested structures)
        for k, v in edge.attributes.items():
            if k not in edge_attrs and not isinstance(v, (dict, list)):
                edge_attrs[k] = v

        # Serialize provenance as JSON string (GraphML compatible)
        edge_attrs["provenance_json"] = json.dumps(
            [p.model_dump() for p in edge.provenance], default=str
        )
        edge_attrs["provenance_count"] = len(edge.provenance)
        edge_attrs["evidence"] = edge.provenance[0].evidence_quote if edge.provenance else ""
        edge_attrs["data_source"] = edge.provenance[0].source_type.value if edge.provenance else ""

        # Inference basis
        if edge.inference_basis:
            edge_attrs["inference_method"] = edge.inference_basis.method
            edge_attrs["inference_reason"] = edge.inference_basis.reason

        # Claim linkage (trust engine v1)
        if edge.claim_ids:
            edge_attrs["claim_ids"] = json.dumps(edge.claim_ids)

        return edge_attrs

    def dedupe_edges(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Merge edges with same (from, to, type) — combine provenances."""
        by_key: Dict[str, ExtractedEdge] = {}
        for edge in edges:
            key = edge.edge_key
            if key in by_key:
                by_key[key] = self._merge_edge_pair(by_key[key], edge)
            else:
                by_key[key] = edge
        return list(by_key.values())

    def _merge_edge_pair(self, a: ExtractedEdge, b: ExtractedEdge) -> ExtractedEdge:
        """Merge two edges with the same key.

        Provenances are unioned (deduped by source_id), the stronger
        assertion mode wins, and conflicting attribute values are resolved
        by provenance priority — with every reported value preserved in a
        ``{key}_reported_values`` audit trail.
        """
        # Merge provenance (dedupe by source_id)
        seen_sources = {p.source_id for p in a.provenance}
        merged_prov = list(a.provenance)
        for p in b.provenance:
            if p.source_id not in seen_sources:
                merged_prov.append(p)
                seen_sources.add(p.source_id)

        # Stronger assertion mode (EXTRACTED > INFERRED > SEEDED)
        mode = (
            a.assertion_mode
            if _ASSERTION_PRIORITY.get(a.assertion_mode, 0)
            >= _ASSERTION_PRIORITY.get(b.assertion_mode, 0)
            else b.assertion_mode
        )

        # Each side's best provenance score is loop-invariant — compute once.
        a_score = max((_provenance_score(p) for p in a.provenance), default=0)
        b_score = max((_provenance_score(p) for p in b.provenance), default=0)

        # Merge attributes with conflict tracking
        merged_attrs = dict(a.attributes)
        for k, v in b.attributes.items():
            if k not in merged_attrs:
                merged_attrs[k] = v
                continue
            if merged_attrs[k] == v:
                continue

            # Conflict: record every reported value for auditability...
            rv_key = f"{k}_reported_values"
            existing_rv = merged_attrs.get(rv_key, [])
            if not existing_rv:
                a_prov = a.provenance[0] if a.provenance else None
                existing_rv.append({
                    "value": merged_attrs[k],
                    "source": a_prov.source_type.value if a_prov else "unknown",
                    "confidence": a_prov.confidence.value if a_prov else "LOW",
                })
            b_prov = b.provenance[0] if b.provenance else None
            existing_rv.append({
                "value": v,
                "source": b_prov.source_type.value if b_prov else "unknown",
                "confidence": b_prov.confidence.value if b_prov else "LOW",
            })
            merged_attrs[rv_key] = existing_rv

            # ...then let the higher-priority provenance pick the winner.
            # Fix: both branches previously assigned `v`, so b's value
            # always won regardless of priority; now a's value is kept
            # unless b is strictly better-sourced.
            if b_score > a_score:
                merged_attrs[k] = v

        basis = a.inference_basis or b.inference_basis

        # Fix: preserve claim linkage across merges — previously merged
        # edges silently dropped claim_ids from both inputs. Order-stable
        # union via dict.fromkeys.
        merged_claims = list(dict.fromkeys([*(a.claim_ids or []), *(b.claim_ids or [])]))

        return ExtractedEdge(
            from_node=a.from_node,
            to_node=a.to_node,
            edge_type=a.edge_type,
            assertion_mode=mode,
            attributes=merged_attrs,
            provenance=merged_prov,
            inference_basis=basis,
            claim_ids=merged_claims,
        )

    def _validate_edge_types(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Check edge types against the domain registry; log invalid ones."""
        allowed = set(self.domain_spec.allowed_edge_types) if self.domain_spec else set()
        if not allowed:
            return edges

        valid = []
        for edge in edges:
            if edge.edge_type in allowed:
                valid.append(edge)
            else:
                self.errors.append(ExtractionError(
                    entity_id=edge.from_node.node_id,
                    source_type=edge.provenance[0].source_type if edge.provenance else SourceType.MANUAL,
                    error_type="validation_failed",
                    message=f"Edge type '{edge.edge_type}' not in domain allowed types: {allowed}",
                ))
        return valid

    def _quality_filter(self, edges: List[ExtractedEdge]) -> List[ExtractedEdge]:
        """Apply quality filters (zero provenance, low-confidence inferred)."""
        filtered = []
        for edge in edges:
            if self.drop_zero_provenance and not edge.provenance:
                self.errors.append(ExtractionError(
                    entity_id=edge.from_node.node_id,
                    source_type=SourceType.MANUAL,
                    error_type="no_edges",
                    message=f"Edge {edge.edge_key} dropped: zero provenance",
                ))
                continue
            if (self.drop_low_inferred
                    and edge.assertion_mode == AssertionMode.INFERRED
                    and edge.best_confidence == ConfidenceLevel.LOW):
                continue
            filtered.append(edge)
        return filtered

    def _stamp_graph(self, G: nx.DiGraph) -> nx.DiGraph:
        """Stamp build metadata (counts, type histograms, domain) onto G."""
        edge_types: Dict[str, int] = defaultdict(int)
        assertion_modes: Dict[str, int] = defaultdict(int)
        for _, _, d in G.edges(data=True):
            edge_types[d.get("edge_type", "?")] += 1
            assertion_modes[d.get("assertion_mode", "?")] += 1

        G.graph["built_at"] = datetime.now(timezone.utc).isoformat()
        G.graph["pipeline_version"] = self.pipeline_version
        G.graph["node_count"] = G.number_of_nodes()
        G.graph["edge_count"] = G.number_of_edges()
        # Histograms serialized to JSON so the graph survives GraphML I/O.
        G.graph["edge_types"] = json.dumps(dict(edge_types))
        G.graph["assertion_modes"] = json.dumps(dict(assertion_modes))
        if self.domain_spec:
            G.graph["domain"] = self.domain_spec.name

        return G