biocypher 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biocypher/__init__.py +8 -0
- biocypher/_graph.py +819 -0
- biocypher/_metadata.py +21 -8
- biocypher/_workflow.py +798 -0
- biocypher/output/in_memory/_airr.py +1 -1
- {biocypher-0.10.1.dist-info → biocypher-0.11.0.dist-info}/METADATA +22 -24
- {biocypher-0.10.1.dist-info → biocypher-0.11.0.dist-info}/RECORD +15 -13
- {biocypher-0.10.1.dist-info → biocypher-0.11.0.dist-info}/WHEEL +1 -1
- {biocypher-0.10.1.dist-info → biocypher-0.11.0.dist-info/licenses}/LICENSE +0 -0
biocypher/_workflow.py
ADDED
@@ -0,0 +1,798 @@
"""Unified BioCypher Workflow API for knowledge graph workflows.

This module provides a streamlined interface for creating and managing
knowledge graphs using the unified Graph representation, with optional
schema and ontology support. Designed for both agentic and deterministic
workflows.

## Design Philosophy

This API is designed with the following principles:

1. **Agentic-First**: Optimized for LLM agent workflows with computable functions
2. **Zero Dependencies**: Pure Python implementation for maximum compatibility
3. **Future-Proof**: Native BioCypher objects enable advanced agentic features
4. **Migration-Ready**: Wrapper methods provide compatibility with existing tools
5. **Progressive Validation**: Optional validation and deduplication with flexible modes

## Validation and Deduplication

Unlike the legacy BioCypher, which enforces strict validation and deduplication,
this API provides **progressive validation** with three modes:

- **"none"** (default): No validation or deduplication - maximum flexibility for agents
- **"warn"**: Log warnings for schema violations and duplicates but continue processing
- **"strict"**: Enforce schema validation and deduplication - fail fast on violations

This approach allows:

- **Agents** to work with maximum flexibility (no validation overhead)
- **Deterministic workflows** to use validation when needed
- **Gradual migration** from legacy BioCypher (start with "none", add validation later)
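
A minimal sketch of the three modes (illustrative; `create_workflow` is the
convenience constructor defined at the bottom of this module):

    wf = create_workflow(validation_mode="none")    # agent-friendly, no checks
    wf = create_workflow(validation_mode="warn")    # log violations, keep going
    wf = create_workflow(validation_mode="strict", deduplication=True)  # fail fast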

## Future Migration Path

This module represents the future direction of BioCypher's in-memory graph
functionality. The plan is to:

1. **Phase 1** (Current): Keep separate from legacy code, provide compatibility wrappers
2. **Phase 2**: Replace legacy in-memory implementations (PandasKG, NetworkxKG)
3. **Phase 3**: Add advanced agentic features (computable functions, decision logging)
4. **Phase 4**: Integrate with main BioCypher class as unified interface

## Agentic Features (Future)

- Computable functions attached to nodes/edges
- Decision logging and reasoning traces
- Counterfactual inference capabilities
- MCP (Model Context Protocol) interface integration
- Local graph computation for agent workflows

"""

import json

from typing import Any

import yaml

from ._graph import Edge, Graph, HyperEdge, Node
from ._logger import logger


class BioCypherWorkflow:
    """Unified BioCypher interface for knowledge graph workflows.

    This class provides a clean, simple API for creating and managing
    knowledge graphs with optional schema and ontology support. Designed
    for both agentic and deterministic workflows.
    """

    def __init__(
        self,
        name: str = "workflow_graph",
        directed: bool = True,
        schema: dict[str, Any] | None = None,
        schema_file: str | None = None,
        head_ontology_url: str | None = None,
        validation_mode: str = "none",
        deduplication: bool = False,
    ):
        """Initialize the workflow with a unified graph.

        Args:
            name: Name of the knowledge graph
            directed: Whether the graph is directed (default: True)
            schema: Dictionary defining the knowledge graph schema
            schema_file: Path to YAML schema file
            head_ontology_url: URL to ontology file (defaults to Biolink model)
            validation_mode: Validation level ("none", "warn", "strict")
            deduplication: Whether to enable deduplication (default: False)
        """
        self.graph = Graph(name=name, directed=directed)
        self.name = name
        self.schema = schema
        self.schema_file = schema_file
        self.head_ontology_url = head_ontology_url
        self.validation_mode = validation_mode
        self.deduplication = deduplication

        # Track seen entities for deduplication
        self._seen_nodes = set()
        self._seen_edges = set()

        # Initialize schema if provided
        if schema_file:
            self._load_schema_from_file(schema_file)
        elif schema:
            self._load_schema_from_dict(schema)

    def _load_schema_from_file(self, schema_file: str) -> None:
        """Load schema from YAML file."""
        try:
            with open(schema_file, "r") as f:
                self.schema = yaml.safe_load(f)
            logger.info(f"Loaded schema from {schema_file}")
        except Exception as e:
            logger.warning(f"Could not load schema from {schema_file}: {e}")

    def _load_schema_from_dict(self, schema: dict[str, Any]) -> None:
        """Load schema from dictionary."""
        self.schema = schema
        logger.info("Loaded schema from dictionary")

    # ==================== NODE OPERATIONS ====================

    def add_node(self, node_id: str, node_type: str, **properties) -> bool:
        """Add a node to the knowledge graph.

        Args:
            node_id: Unique identifier for the node
            node_type: Type/category of the node
            **properties: Node properties as keyword arguments

        Returns:
            bool: True if node was added, False if it already exists

        Example:
            workflow.add_node("protein_1", "protein", name="TP53", function="tumor_suppressor")
        """
        # Check for duplicates if deduplication is enabled
        if self.deduplication:
            if node_id in self._seen_nodes:
                if self.validation_mode == "warn":
                    logger.warning(f"Duplicate node ID '{node_id}' detected")
                elif self.validation_mode == "strict":
                    raise ValueError(f"Duplicate node ID '{node_id}' not allowed in strict mode")
                return False
            self._seen_nodes.add(node_id)

        # Validate against schema if validation is enabled
        if self.validation_mode in ["warn", "strict"]:
            is_valid = self.validate_against_schema(node_type, properties)
            if not is_valid:
                if self.validation_mode == "strict":
                    raise ValueError(f"Node '{node_id}' of type '{node_type}' failed schema validation")
                elif self.validation_mode == "warn":
                    logger.warning(f"Node '{node_id}' of type '{node_type}' failed schema validation")

        # Try to add node to graph (Graph class handles its own deduplication)
        result = self.graph.add_node(node_id, node_type, properties)

        # If deduplication is enabled and we're tracking, update our tracking
        if self.deduplication and result:
            self._seen_nodes.add(node_id)

        return result

    def get_node(self, node_id: str) -> Node | None:
        """Get a node by ID.

        Args:
            node_id: Node identifier

        Returns:
            Node object or None if not found
        """
        return self.graph.get_node(node_id)

    def get_nodes(self, node_type: str | None = None) -> list[Node]:
        """Get all nodes, optionally filtered by type.

        Args:
            node_type: Optional filter by node type

        Returns:
            List of Node objects
        """
        return self.graph.get_nodes(node_type)

    def has_node(self, node_id: str) -> bool:
        """Check if a node exists.

        Args:
            node_id: Node identifier

        Returns:
            bool: True if node exists
        """
        return self.graph.has_node(node_id)

    def remove_node(self, node_id: str) -> bool:
        """Remove a node and all its connected edges.

        Args:
            node_id: Node identifier

        Returns:
            bool: True if node was removed, False if not found
        """
        return self.graph.remove_node(node_id)

    # ==================== EDGE OPERATIONS ====================

    def add_edge(self, edge_id: str, edge_type: str, source: str, target: str, **properties) -> bool:
        """Add an edge to the knowledge graph.

        Args:
            edge_id: Unique identifier for the edge
            edge_type: Type/category of the edge
            source: Source node ID
            target: Target node ID
            **properties: Edge properties as keyword arguments

        Returns:
            bool: True if edge was added, False if it already exists

        Example:
            workflow.add_edge("interaction_1", "interaction", "protein_1", "protein_2",
                              confidence=0.8, method="yeast_two_hybrid")
        """
        # Check for duplicates if deduplication is enabled
        if self.deduplication:
            edge_key = (edge_id, edge_type)
            if edge_key in self._seen_edges:
                if self.validation_mode == "warn":
                    logger.warning(f"Duplicate edge ID '{edge_id}' of type '{edge_type}' detected")
                elif self.validation_mode == "strict":
                    raise ValueError(f"Duplicate edge ID '{edge_id}' not allowed in strict mode")
                return False
            self._seen_edges.add(edge_key)

        # Validate against schema if validation is enabled
        if self.validation_mode in ["warn", "strict"]:
            is_valid = self.validate_against_schema(edge_type, properties)
            if not is_valid:
                if self.validation_mode == "strict":
                    raise ValueError(f"Edge '{edge_id}' of type '{edge_type}' failed schema validation")
                elif self.validation_mode == "warn":
                    logger.warning(f"Edge '{edge_id}' of type '{edge_type}' failed schema validation")

        # Try to add edge to graph (Graph class handles its own deduplication)
        result = self.graph.add_edge(edge_id, edge_type, source, target, properties)

        # If deduplication is enabled and we're tracking, update our tracking
        if self.deduplication and result:
            edge_key = (edge_id, edge_type)
            self._seen_edges.add(edge_key)

        return result

    def get_edge(self, edge_id: str) -> Edge | None:
        """Get an edge by ID.

        Args:
            edge_id: Edge identifier

        Returns:
            Edge object or None if not found
        """
        return self.graph.get_edge(edge_id)

    def get_edges(self, edge_type: str | None = None) -> list[Edge]:
        """Get all edges, optionally filtered by type.

        Args:
            edge_type: Optional filter by edge type

        Returns:
            List of Edge objects
        """
        return self.graph.get_edges(edge_type)

    def get_edges_between(self, source: str, target: str, edge_type: str | None = None) -> list[Edge]:
        """Get edges between two nodes.

        Args:
            source: Source node ID
            target: Target node ID
            edge_type: Optional filter by edge type

        Returns:
            List of Edge objects
        """
        return self.graph.get_edges_between(source, target, edge_type)

    def has_edge(self, edge_id: str) -> bool:
        """Check if an edge exists.

        Args:
            edge_id: Edge identifier

        Returns:
            bool: True if edge exists
        """
        return self.graph.has_edge(edge_id)

    def remove_edge(self, edge_id: str) -> bool:
        """Remove an edge from the graph.

        Args:
            edge_id: Edge identifier

        Returns:
            bool: True if edge was removed, False if not found
        """
        return self.graph.remove_edge(edge_id)

    # ==================== HYPEREDGE OPERATIONS ====================

    def add_hyperedge(self, hyperedge_id: str, hyperedge_type: str, nodes: set[str], **properties) -> bool:
        """Add a hyperedge connecting multiple nodes.

        Args:
            hyperedge_id: Unique identifier for the hyperedge
            hyperedge_type: Type/category of the hyperedge
            nodes: Set of node IDs to connect
            **properties: Hyperedge properties as keyword arguments

        Returns:
            bool: True if hyperedge was added, False if it already exists

        Example:
            workflow.add_hyperedge("complex_1", "protein_complex", {"protein_1", "protein_2", "protein_3"},
                                   name="transcription_factor_complex")
        """
        return self.graph.add_hyperedge(hyperedge_id, hyperedge_type, nodes, properties)

    def get_hyperedge(self, hyperedge_id: str) -> HyperEdge | None:
        """Get a hyperedge by ID.

        Args:
            hyperedge_id: Hyperedge identifier

        Returns:
            HyperEdge object or None if not found
        """
        return self.graph.get_hyperedge(hyperedge_id)

    def get_hyperedges(self, hyperedge_type: str | None = None) -> list[HyperEdge]:
        """Get all hyperedges, optionally filtered by type.

        Args:
            hyperedge_type: Optional filter by hyperedge type

        Returns:
            List of HyperEdge objects
        """
        return self.graph.get_hyperedges(hyperedge_type)

    def has_hyperedge(self, hyperedge_id: str) -> bool:
        """Check if a hyperedge exists.

        Args:
            hyperedge_id: Hyperedge identifier

        Returns:
            bool: True if hyperedge exists
        """
        return self.graph.has_hyperedge(hyperedge_id)

    # ==================== GRAPH TRAVERSAL ====================

    def get_neighbors(self, node_id: str, direction: str = "both") -> set[str]:
        """Get neighboring nodes.

        Args:
            node_id: Node identifier
            direction: "in", "out", or "both"

        Returns:
            Set of neighboring node IDs
        """
        return self.graph.get_neighbors(node_id, direction)

    def get_connected_edges(self, node_id: str, direction: str = "both") -> list[Edge]:
        """Get edges connected to a node.

        Args:
            node_id: Node identifier
            direction: "in", "out", or "both"

        Returns:
            List of connected Edge objects
        """
        return self.graph.get_connected_edges(node_id, direction)

    def find_paths(self, source: str, target: str, max_length: int = 3) -> list[list[Edge]]:
        """Find all paths between two nodes.

        Args:
            source: Source node ID
            target: Target node ID
            max_length: Maximum path length

        Returns:
            List of paths, each path is a list of Edge objects
        """
        return self.graph.find_paths(source, target, max_length)

    # ==================== QUERY INTERFACE ====================

    def query_nodes(self, node_type: str | None = None) -> list[dict[str, Any]]:
        """Query nodes in the knowledge graph.

        Args:
            node_type: Optional filter by node type

        Returns:
            List of node dictionaries
        """
        nodes = self.graph.get_nodes(node_type)
        return [node.to_dict() for node in nodes]

    def query_edges(self, edge_type: str | None = None) -> list[dict[str, Any]]:
        """Query edges in the knowledge graph.

        Args:
            edge_type: Optional filter by edge type

        Returns:
            List of edge dictionaries
        """
        edges = self.graph.get_edges(edge_type)
        return [edge.to_dict() for edge in edges]

    def query_hyperedges(self, hyperedge_type: str | None = None) -> list[dict[str, Any]]:
        """Query hyperedges in the knowledge graph.

        Args:
            hyperedge_type: Optional filter by hyperedge type

        Returns:
            List of hyperedge dictionaries
        """
        hyperedges = self.graph.get_hyperedges(hyperedge_type)
        return [hyperedge.to_dict() for hyperedge in hyperedges]

    def find_connected_components(self, node_id: str, max_depth: int = 2) -> dict[str, Any]:
        """Find connected components around a node.

        Args:
            node_id: Starting node ID
            max_depth: Maximum depth to explore

        Returns:
            Dictionary with the nodes, edges, and hyperedges in the component, plus summary statistics
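
        Example (illustrative):
            component = workflow.find_connected_components("protein_1", max_depth=2)
            print(len(component["nodes"]), len(component["edges"]))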
        """
        if not self.graph.has_node(node_id):
            return {"nodes": [], "edges": [], "hyperedges": []}

        # Collect nodes within max_depth
        component_nodes = {node_id}
        current_level = {node_id}

        for depth in range(max_depth):
            next_level = set()
            for node in current_level:
                neighbors = self.graph.get_neighbors(node)
                next_level.update(neighbors)
            current_level = next_level - component_nodes
            component_nodes.update(current_level)

            if not current_level:
                break

        # Get subgraph
        subgraph = self.graph.get_subgraph(component_nodes)

        return {
            "nodes": [node.to_dict() for node in subgraph.get_nodes()],
            "edges": [edge.to_dict() for edge in subgraph.get_edges()],
            "hyperedges": [hyperedge.to_dict() for hyperedge in subgraph.get_hyperedges()],
            "statistics": subgraph.get_statistics(),
        }

    # ==================== GRAPH ANALYSIS ====================

    def get_statistics(self) -> dict[str, Any]:
        """Get comprehensive graph statistics.

        Returns:
            Dictionary with graph statistics
        """
        return self.graph.get_statistics()

    def get_summary(self) -> dict[str, Any]:
        """Get a human-readable summary of the graph.

        Returns:
            Dictionary with graph summary
        """
        stats = self.graph.get_statistics()

        # Get top node types
        node_types = stats["node_types"]
        top_node_types = sorted(node_types.items(), key=lambda x: x[1], reverse=True)[:5]

        # Get top edge types
        edge_types = stats["edge_types"]
        top_edge_types = sorted(edge_types.items(), key=lambda x: x[1], reverse=True)[:5]

        return {
            "name": self.name,
            "total_nodes": stats["basic"]["nodes"],
            "total_edges": stats["basic"]["edges"],
            "total_hyperedges": stats["basic"]["hyperedges"],
            "top_node_types": top_node_types,
            "top_edge_types": top_edge_types,
            "connectivity": stats["connectivity"],
        }

    # ==================== SCHEMA AND ONTOLOGY SUPPORT ====================

    def get_schema(self) -> dict[str, Any] | None:
        """Get the current schema configuration.

        Returns:
            Dictionary representing the schema or None if no schema
        """
        return self.schema

    def export_schema(self, filepath: str) -> None:
        """Export the current schema to a YAML file.

        Args:
            filepath: Path to save the schema file
        """
        if self.schema:
            with open(filepath, "w") as f:
                yaml.dump(self.schema, f, default_flow_style=False)
            logger.info(f"Schema exported to {filepath}")
        else:
            logger.warning("No schema to export")

    def validate_against_schema(self, node_type: str, properties: dict[str, Any]) -> bool:
        """Validate node properties against schema (if available).

        Args:
            node_type: Type of node to validate
            properties: Properties to validate

        Returns:
            bool: True if valid, False otherwise
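
        Example (illustrative, showing the schema shape this method expects,
        i.e. a mapping from type name to a "properties" dict of name -> type string):
            schema = {"protein": {"properties": {"name": "str", "score": "float"}}}
            workflow = BioCypherWorkflow(schema=schema, validation_mode="warn")
            workflow.validate_against_schema("protein", {"name": "TP53", "score": 0.9})  # True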
        """
        if not self.schema or node_type not in self.schema:
            return True  # No schema or type not in schema, assume valid

        schema_entry = self.schema[node_type]
        if "properties" not in schema_entry:
            return True  # No property constraints

        required_properties = schema_entry["properties"]

        # Check if all required properties are present and have correct types
        for prop_name, prop_type in required_properties.items():
            if prop_name not in properties:
                logger.warning(f"Missing required property '{prop_name}' for node type '{node_type}'")
                return False

            # Check property type
            actual_value = properties[prop_name]
            if not self._validate_property_type(actual_value, prop_type):
                logger.warning(
                    f"Property '{prop_name}' has wrong type. Expected {prop_type}, got {type(actual_value).__name__}"
                )
                return False

        return True

    def _validate_property_type(self, value: Any, expected_type: str) -> bool:
        """Validate that a property value matches the expected type.

        Args:
            value: The actual value
            expected_type: The expected type as string (e.g., 'str', 'int', 'float')

        Returns:
            bool: True if type matches, False otherwise
        """
        type_mapping = {
            "str": str,
            "int": int,
            "float": float,
            "bool": bool,
            "list": list,
            "dict": dict,
        }

        if expected_type not in type_mapping:
            return True  # Unknown type, assume valid

        expected_python_type = type_mapping[expected_type]
        return isinstance(value, expected_python_type)

    # ==================== SERIALIZATION ====================

    def to_json(self) -> str:
        """Export the knowledge graph to JSON format.

        Returns:
            JSON string representation of the graph
        """
        return self.graph.to_json()

    def from_json(self, json_data: str) -> None:
        """Import knowledge graph from JSON format.

        Args:
            json_data: JSON string containing graph data
        """
        data = json.loads(json_data)
        self.graph = Graph.from_dict(data)
        self.name = self.graph.name

    def save(self, filepath: str) -> None:
        """Save the graph to a file.

        Args:
            filepath: Path to save the graph
        """
        with open(filepath, "w") as f:
            f.write(self.to_json())
        logger.info(f"Graph saved to {filepath}")

    def load(self, filepath: str) -> None:
        """Load the graph from a file.

        Args:
            filepath: Path to load the graph from
        """
        with open(filepath, "r") as f:
            json_data = f.read()
        self.from_json(json_data)
        logger.info(f"Graph loaded from {filepath}")

    # ==================== UTILITY METHODS ====================

    def clear(self) -> None:
        """Clear all nodes and edges from the graph."""
        self.graph = Graph(name=self.name, directed=self.graph.directed)
        logger.info("Graph cleared")

    def copy(self) -> "BioCypherWorkflow":
        """Create a copy of the workflow and its graph.

        Returns:
            New BioCypherWorkflow instance
        """
        new_workflow = BioCypherWorkflow(
            name=self.name,
            directed=self.graph.directed,
            schema=self.schema,
            head_ontology_url=self.head_ontology_url,
            # Carry over validation settings so the copy behaves like the original
            validation_mode=self.validation_mode,
            deduplication=self.deduplication,
        )
        new_workflow.from_json(self.to_json())
        # Preserve deduplication tracking state
        new_workflow._seen_nodes = set(self._seen_nodes)
        new_workflow._seen_edges = set(self._seen_edges)
        return new_workflow

    def get_graph(self) -> Graph:
        """Get the underlying Graph object.

        Returns:
            Graph object
        """
        return self.graph

    def __len__(self) -> int:
        """Return the number of nodes in the graph."""
        return len(self.graph)

    def __contains__(self, node_id: str) -> bool:
        """Check if a node exists in the graph."""
        return node_id in self.graph

    def __str__(self) -> str:
        """String representation of the workflow."""
        stats = self.get_statistics()
        return (
            f"BioCypherWorkflow(name='{self.name}', "
            f"nodes={stats['basic']['nodes']}, edges={stats['basic']['edges']}, "
            f"hyperedges={stats['basic']['hyperedges']})"
        )

    def __repr__(self) -> str:
        return self.__str__()

    # ==================== COMPATIBILITY WRAPPER METHODS ====================

    def to_networkx(self):
        """Convert to NetworkX graph for compatibility with existing tools.

        Returns:
            networkx.DiGraph: NetworkX representation of the graph

        Note:
            This method provides compatibility with existing NetworkX-based
            tools while maintaining the native BioCypher object structure.
            Future versions may use this as the primary backend.
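
        Example (illustrative):
            nx_graph = workflow.to_networkx()
            print(nx_graph.number_of_nodes())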
        """
        try:
            import networkx as nx
        except ImportError:
            raise ImportError("NetworkX is required for to_networkx() conversion. Install with: pip install networkx")

        g = nx.DiGraph() if self.graph.directed else nx.Graph()

        # Add nodes with properties
        for node in self.graph._nodes.values():
            attrs = node.properties.copy()
            attrs["node_type"] = node.type
            g.add_node(node.id, **attrs)

        # Add edges with properties
        for edge in self.graph._edges.values():
            attrs = edge.properties.copy()
            attrs["edge_type"] = edge.type
            g.add_edge(edge.source, edge.target, **attrs)

        return g

    def to_pandas(self):
        """Convert to Pandas DataFrames for compatibility with existing tools.

        Returns:
            dict[str, pd.DataFrame]: Dictionary of DataFrames, one per node/edge type

        Note:
            This method provides compatibility with existing Pandas-based
            tools while maintaining the native BioCypher object structure.
            Future versions may use this as the primary backend.
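
        Example (illustrative; assumes nodes of type "protein" were added):
            dfs = workflow.to_pandas()
            print(dfs["protein"].head())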
        """
        try:
            import pandas as pd
        except ImportError:
            raise ImportError("Pandas is required for to_pandas() conversion. Install with: pip install pandas")

        dfs = {}

        # Create node DataFrames by type
        for node_type, node_ids in self.graph._node_types.items():
            nodes = [self.graph._nodes[node_id] for node_id in node_ids]
            data = []
            for node in nodes:
                row = {"node_id": node.id, "node_type": node.type}
                row.update(node.properties)
                data.append(row)
            dfs[node_type] = pd.DataFrame(data)

        # Create edge DataFrames by type
        for edge_type, edge_ids in self.graph._edge_types.items():
            edges = [self.graph._edges[edge_id] for edge_id in edge_ids]
            data = []
            for edge in edges:
                row = {"edge_id": edge.id, "edge_type": edge.type, "source_id": edge.source, "target_id": edge.target}
                row.update(edge.properties)
                data.append(row)
            dfs[edge_type] = pd.DataFrame(data)

        return dfs


# Convenience function for quick workflow creation
def create_workflow(
    name: str = "knowledge_graph",
    directed: bool = True,
    schema: dict[str, Any] | None = None,
    schema_file: str | None = None,
    head_ontology_url: str | None = None,
    validation_mode: str = "none",
    deduplication: bool = False,
) -> BioCypherWorkflow:
    """Create a new knowledge graph workflow.

    Args:
        name: Name of the knowledge graph
        directed: Whether the graph is directed
        schema: Dictionary defining the knowledge graph schema
        schema_file: Path to YAML schema file
        head_ontology_url: URL to ontology file
        validation_mode: Validation level ("none", "warn", "strict")
        deduplication: Whether to enable deduplication

    Returns:
        BioCypherWorkflow instance
    """
    return BioCypherWorkflow(
        name=name,
        directed=directed,
        schema=schema,
        schema_file=schema_file,
        head_ontology_url=head_ontology_url,
        validation_mode=validation_mode,
        deduplication=deduplication,
    )
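
A minimal end-to-end sketch of the API added in this file (illustrative: the workflow name, entity IDs, and properties below are made up, and `wf` is an arbitrary variable name):

    from biocypher._workflow import create_workflow

    wf = create_workflow(name="demo", validation_mode="warn", deduplication=True)
    wf.add_node("protein_1", "protein", name="TP53")
    wf.add_node("protein_2", "protein", name="MDM2")
    wf.add_edge("interaction_1", "interaction", "protein_1", "protein_2", confidence=0.8)
    print(wf.get_summary())
    wf.save("demo_graph.json")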