biocypher 0.9.6__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

biocypher/_metadata.py CHANGED
@@ -10,7 +10,7 @@ import pathlib
10
10
 
11
11
  import toml
12
12
 
13
- _VERSION = "0.9.6"
13
+ _VERSION = "0.10.0"
14
14
 
15
15
 
16
16
  def get_metadata():
biocypher/_translate.py CHANGED
@@ -37,15 +37,12 @@ class Translator:
37
37
 
38
38
  Args:
39
39
  ----
40
- leaves:
41
- Dictionary detailing the leaves of the hierarchy
42
- tree representing the structure of the graph; the leaves are
43
- the entities that will be direct components of the graph,
44
- while the intermediary nodes are additional labels for
45
- filtering purposes.
40
+ ontology (Ontology): An Ontology object providing schema and mapping details.
46
41
  strict_mode:
47
- If True, the translator will raise an error if input data do not
48
- carry source, licence, and version information.
42
+ strict_mode (bool, optional): If True, enforces that every node and edge carries
43
+ the required 'source', 'licence', and 'version' properties. Raises ValueError
44
+ if these are missing. Defaults to False.
45
+
49
46
 
50
47
  """
51
48
  self.ontology = ontology
@@ -0,0 +1,491 @@
1
+ from typing import Any, Optional
2
+
3
+ from biocypher._create import BioCypherEdge, BioCypherNode
4
+ from biocypher._deduplicate import Deduplicator
5
+ from biocypher._logger import logger
6
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
7
+
8
# scirpy is an optional extra. The module must stay importable without it,
# because the in-memory KG factory imports this module unconditionally.
try:
    from scirpy.io import AirrCell

    HAS_SCIRPY = True
except ImportError:
    # Keep the name bound: signatures in this module reference ``AirrCell`` in
    # annotations that are evaluated at definition time, so leaving it
    # undefined would raise NameError on import instead of the friendly
    # ImportError produced by ``AirrKG._check_dependencies``.
    AirrCell = None
    HAS_SCIRPY = False
14
+
15
+
16
class AirrKG(_InMemoryKG):
    """Knowledge graph for AIRR (Adaptive Immune Receptor Repertoire) data.

    This class implements the AIRR data model for representing immune receptor sequences
    (antibodies and T cell receptors) and their annotations. To ensure proper conversion
    to AIRR format, your schema file should define immune receptor entities with property
    names that match the AIRR standards.

    Key property names in your schema for immune receptor entities:
        - locus: The gene locus (e.g., "TRA", "TRB", "IGH", "IGK", "IGL")
        - junction_aa: The amino acid sequence of the junction region (CDR3)
        - v_call: The V gene assignment
        - j_call: The J gene assignment
        - productive: Whether the sequence is productive

    Pairing strategies (selected via the ``indirect_pairings`` argument of ``get_kg``),
    for the case where an epitope is matched with only ONE of two paired receptors:
        - Indirect pairings allowed: the "paired" AIRR cell is created.
        - Indirect pairings not allowed: no "paired" AIRR cell is created.

    For a complete list of available fields and their descriptions, see:
    https://docs.airr-community.org/en/stable/datarep/rearrangements.html#fields

    All properties from the biocypher schema defined by user will be preserved in the AIRR format.
    """

    # Property keys that are BioCypher bookkeeping and must not leak into AIRR output
    _INTERNAL_PROPERTIES = frozenset(["node_id", "node_label", "id", "preferred_id"])

    def __init__(
        self,
        deduplicator: Optional["Deduplicator"] = None,
        metadata_entity_type: str = "epitope",
    ) -> None:
        """Initialize AirrKG with configurable metadata node type.

        Args:
        ----
            deduplicator: Deduplicator instance; a fresh one is created when omitted
            metadata_entity_type: String specifying the metadata node type (default: "epitope")

        """
        super().__init__()
        self.deduplicator = deduplicator or Deduplicator()
        self.metadata_entity_type = metadata_entity_type

        # Raw entities grouped by type, and the cells derived from them
        self.adjacency_list = {}
        self.airr_cells = []
        # Pairing mode the cached ``airr_cells`` were built with (None = no cache yet)
        self._airr_cells_pairing = None

        # These will be populated when nodes and edges are processed
        self.sequence_entity_types = {}
        self.chain_relationship_types = []
        self.chain_to_epitope_relationship_types = []

    def _check_dependencies(self) -> None:
        """Verify that scirpy is available.

        Raises
        ------
            ImportError: If the optional scirpy dependency could not be imported.

        """
        if not HAS_SCIRPY:
            msg = (
                "AirrCell module from scirpy not detected. "
                "Install it with 'poetry add biocypher[scirpy]' or 'poetry add scirpy'."
            )
            raise ImportError(msg)

    def get_kg(self, indirect_pairings: bool = True) -> list["AirrCell"]:
        """Convert directly to AIRR format using AirrCell from scirpy.

        The result is cached. The cache is rebuilt when it is empty, when
        entities were added since the last call, or when a different pairing
        strategy is requested.

        Args:
        ----
            indirect_pairings: Boolean controlling pairing strategy (default: True),
                for an epitope matched with only ONE of the paired receptors
                - True: the "paired" AIRR cell will be created
                - False: no "paired" AIRR cell will be created

        Returns:
        -------
            list: List of generated AIRR cells

        Raises:
        ------
            ImportError: If scirpy is not installed.
            ValueError: If no entities have been added yet.

        """
        self._check_dependencies()
        cache_is_stale = self._airr_cells_pairing != indirect_pairings
        if not self.airr_cells or cache_is_stale:
            self.airr_cells = self._to_airr_cells(self.adjacency_list, indirect_pairings)
            self._airr_cells_pairing = indirect_pairings
        return self.airr_cells

    def add_nodes(self, nodes: list["BioCypherNode"]) -> None:
        """Add BioCypher nodes, organizing them by type."""
        self._add_to_entities_by_type(nodes)

    def add_edges(self, edges: list["BioCypherEdge"]) -> None:
        """Add BioCypher edges, organizing them by type."""
        self._add_to_entities_by_type(edges)

    def _add_to_entities_by_type(self, entities: list[Any]) -> None:
        """Add all entities (both nodes and edges) to a common adjacency list.

        Args:
        ----
            entities: Iterable of BioCypher node/edge objects (mixed types allowed)

        """
        grouped = self._separate_entity_types(entities)
        for _type, _entities in grouped.items():
            self.adjacency_list.setdefault(_type, []).extend(_entities)

        # New input invalidates previously generated cells; otherwise get_kg
        # would keep returning the result computed before this call.
        self.airr_cells = []
        self._airr_cells_pairing = None

    def _process_entities(self, entities: dict[str, list[Any]]) -> tuple[dict, dict, dict]:
        """Process entities and organize them into sequence nodes, metadata nodes, and receptor-epitope mappings.

        Also refreshes ``sequence_entity_types``, ``chain_relationship_types`` and
        ``chain_to_epitope_relationship_types`` on the instance.

        Args:
        ----
            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects

        Returns:
        -------
            tuple: (sequence_nodes, metadata_nodes, receptor_epitope_mapping)

        """
        sequence_nodes = {}
        metadata_nodes = {}
        receptor_epitope_mapping = {}
        edge_types = set()

        for entity_type, entities_list in entities.items():
            if not entities_list:  # Skip empty lists
                continue

            # The first element decides whether this type holds nodes or edges
            first = entities_list[0]
            if isinstance(first, BioCypherNode):
                by_id = {node.get_id(): node for node in entities_list}
                if entity_type == self.metadata_entity_type:
                    metadata_nodes.update(by_id)
                else:
                    sequence_nodes.update(by_id)
                    self.sequence_entity_types[entity_type] = entity_type.replace(" sequence", "").upper()
            elif isinstance(first, BioCypherEdge):
                edge_types.add(entity_type)

        # An edge type is a chain-to-epitope relationship when its name mentions
        # the metadata entity type. Compare case-insensitively on both sides so a
        # capitalised ``metadata_entity_type`` still matches.
        marker = self.metadata_entity_type.lower()
        self.chain_relationship_types = [edge_type for edge_type in edge_types if marker not in edge_type.lower()]
        self.chain_to_epitope_relationship_types = [
            edge_type for edge_type in edge_types if marker in edge_type.lower()
        ]

        for entity_type in self.chain_to_epitope_relationship_types:
            self._update_receptor_epitope_mapping(entities[entity_type], receptor_epitope_mapping)

        return sequence_nodes, metadata_nodes, receptor_epitope_mapping

    def _update_receptor_epitope_mapping(self, edges: list["BioCypherEdge"], mapping: dict[str, set]) -> None:
        """Update receptor-epitope mapping with new edges.

        Args:
        ----
            edges: List of edges to process
            mapping: Dictionary to update in place; maps source id -> set of target ids

        """
        for edge in edges:
            mapping.setdefault(edge.get_source_id(), set()).add(edge.get_target_id())

    def _build_cells(
        self,
        cell_id: str,
        source_node: Optional["BioCypherNode"],
        target_node: Optional["BioCypherNode"],
        metadata_ids: set,
        metadata_nodes: dict,
        paired: bool,
        receptor_epitope_mapping: dict,
    ) -> tuple[list["AirrCell"], int]:
        """Resolve metadata ids and build the AIRR cell(s) for one chain or chain pair.

        Returns:
        -------
            tuple: (generated cells, 1 if more than one cell was generated else 0).
            Returns ``([], 0)`` when none of the ids resolve to a known metadata node.

        """
        resolved = self._get_metadata_nodes(metadata_ids, metadata_nodes)
        if not resolved:
            return [], 0

        cells = self._generate_airr_cell(
            cell_id=cell_id,
            source_node=source_node,
            target_node=target_node,
            metadata_nodes=resolved,
            paired=paired,
            receptor_epitope_mapping=receptor_epitope_mapping,
        )
        return cells, int(len(cells) > 1)

    def _process_paired_chains(
        self,
        entities: dict[str, list[Any]],
        sequence_nodes: dict[str, "BioCypherNode"],
        metadata_nodes: dict[str, "BioCypherNode"],
        receptor_epitope_mapping: dict[str, set],
        indirect_pairings: bool = True,
    ) -> tuple[list["AirrCell"], set[str], int]:
        """Process paired chains and generate AIRR cells.

        Args:
        ----
            entities: Dictionary of all entities
            sequence_nodes: Dictionary of sequence nodes
            metadata_nodes: Dictionary of metadata nodes
            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
            indirect_pairings: Boolean controlling pairing strategy

        Returns:
        -------
            tuple: (list of generated cells, set of processed chain IDs, count of cells with multiple epitopes)

        """
        airr_cells = []
        processed_chains = set()
        n_metacells = 0

        for entity_type, edges in entities.items():
            if entity_type not in self.chain_relationship_types:
                continue

            for edge in edges:
                source_id, target_id = edge.get_source_id(), edge.get_target_id()
                processed_chains.update([source_id, target_id])

                source_metadata = receptor_epitope_mapping.get(source_id, set())
                target_metadata = receptor_epitope_mapping.get(target_id, set())
                source_node = sequence_nodes.get(source_id)
                target_node = sequence_nodes.get(target_id)

                # Each plan entry: (cell_id, first chain, second chain, metadata ids, paired)
                if indirect_pairings:
                    # Union: create paired cell if either chain binds epitopes
                    plan = [
                        (edge.get_id(), source_node, target_node, source_metadata | target_metadata, True),
                    ]
                else:
                    # Intersection: paired cell only for shared epitopes; epitopes
                    # unique to one chain yield an unpaired cell for that chain.
                    plan = [
                        (edge.get_id(), source_node, target_node, source_metadata & target_metadata, True),
                        (f"unpaired_{source_id}", source_node, None, source_metadata - target_metadata, False),
                        (f"unpaired_{target_id}", target_node, None, target_metadata - source_metadata, False),
                    ]

                for cell_id, first, second, metadata_ids, paired in plan:
                    cells, multi = self._build_cells(
                        cell_id,
                        first,
                        second,
                        metadata_ids,
                        metadata_nodes,
                        paired,
                        receptor_epitope_mapping,
                    )
                    airr_cells.extend(cells)
                    n_metacells += multi

        return airr_cells, processed_chains, n_metacells

    def _process_unpaired_chains(
        self,
        receptor_epitope_mapping: dict[str, set],
        sequence_nodes: dict[str, "BioCypherNode"],
        metadata_nodes: dict[str, "BioCypherNode"],
        processed_chains: set[str],
    ) -> tuple[list["AirrCell"], int]:
        """Process unpaired chains and generate AIRR cells.

        Args:
        ----
            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
            sequence_nodes: Dictionary of sequence nodes
            metadata_nodes: Dictionary of metadata nodes
            processed_chains: Set of already processed chain IDs

        Returns:
        -------
            tuple: (List of generated cells, count of cells with multiple epitopes)

        """
        airr_cells = []
        n_metacells = 0

        for chain_id, metadata_ids in receptor_epitope_mapping.items():
            if chain_id in processed_chains:
                continue

            cells, multi = self._build_cells(
                f"unpaired_{chain_id}",
                sequence_nodes.get(chain_id),
                None,
                metadata_ids,
                metadata_nodes,
                False,
                receptor_epitope_mapping,
            )
            airr_cells.extend(cells)
            n_metacells += multi

        return airr_cells, n_metacells

    def _to_airr_cells(self, entities: dict[str, list[Any]], indirect_pairings: bool = True) -> list["AirrCell"]:
        """Convert BioCypher entities to AIRR cells using configurable mappings.

        Args:
        ----
            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
            indirect_pairings: Boolean controlling pairing strategy (default: True),
                for an epitope matched with only ONE of the paired receptors
                - True: the "paired" AIRR cell will be created
                - False: no "paired" AIRR cell will be created

        Returns:
        -------
            list: List of generated AIRR cells

        Raises:
        ------
            ValueError: If ``entities`` is empty.

        """
        if not entities:
            msg = "No entities provided for conversion."
            raise ValueError(msg)

        logger.info("Starting conversion to AIRR cells")

        sequence_nodes, metadata_nodes, receptor_epitope_mapping = self._process_entities(entities)

        airr_cells, processed_chains, paired_metacells = self._process_paired_chains(
            entities,
            sequence_nodes,
            metadata_nodes,
            receptor_epitope_mapping,
            indirect_pairings,
        )

        # Chains that take part in no pairing edge are handled separately
        unpaired_cells, unpaired_metacells = self._process_unpaired_chains(
            receptor_epitope_mapping,
            sequence_nodes,
            metadata_nodes,
            processed_chains,
        )
        airr_cells.extend(unpaired_cells)

        total_metacells = paired_metacells + unpaired_metacells

        logger.info(f"Generated total of {len(airr_cells)} AIRR cells")
        if total_metacells > 0:
            logger.info(f"{total_metacells} cells with more than 1 epitope were detected")

        return airr_cells

    def _get_metadata_nodes(
        self,
        metadata_ids: set[str],
        metadata_nodes: dict[str, "BioCypherNode"],
    ) -> list["BioCypherNode"]:
        """Get metadata nodes for a set of metadata IDs.

        IDs without an entry in ``metadata_nodes`` are silently skipped.

        Args:
        ----
            metadata_ids: Set of metadata IDs
            metadata_nodes: Dictionary of metadata nodes

        Returns:
        -------
            list: List of metadata nodes

        """
        return [metadata_nodes[ep_id] for ep_id in metadata_ids if ep_id in metadata_nodes]

    def _generate_airr_cell(
        self,
        cell_id: str,
        source_node: Optional["BioCypherNode"],
        target_node: Optional["BioCypherNode"],
        metadata_nodes: list["BioCypherNode"],
        paired: bool,
        receptor_epitope_mapping: Optional[dict[str, set]] = None,
    ) -> list["AirrCell"]:
        """Build the AIRR cell(s) for up to two chains plus their metadata.

        Args:
        ----
            cell_id: Identifier of the (first) generated cell
            source_node: First chain node, or None
            target_node: Second chain node, or None
            metadata_nodes: Metadata nodes to attach; one cell is returned per node
            paired: Whether the cell represents a chain pair
            receptor_epitope_mapping: Optional mapping chain id -> set of metadata ids,
                used to fill the ``validated_epitope`` chain field

        Returns:
        -------
            list: Cells produced by ``add_metadata``

        """
        cell = AirrCell(cell_id=cell_id)

        for node in (source_node, target_node):
            if not node:  # Skip if node is None
                continue

            chain = AirrCell.empty_chain_dict()

            # Copy every schema property except BioCypher-internal ones
            for key, value in node.get_properties().items():
                if key not in self._INTERNAL_PROPERTIES:
                    chain[key] = value

            # NOTE(review): the three assignments below overwrite any value of the
            # same name supplied by the schema properties - confirm this is intended.
            label = node.get_label()
            chain["locus"] = self.sequence_entity_types.get(label, label)
            chain["consensus_count"] = 0
            chain["productive"] = True

            # Flag whether this chain itself has at least one epitope link
            chain["validated_epitope"] = bool(
                receptor_epitope_mapping and receptor_epitope_mapping.get(node.get_id())
            )

            cell.add_chain(chain)

        return self.add_metadata(metadata_nodes, cell, paired)

    def add_metadata(self, metadata_nodes: list["BioCypherNode"], cell: "AirrCell", paired: bool) -> list["AirrCell"]:
        """Add metadata from nodes to cell(s) and return a list of cells.

        The first metadata node is written onto ``cell`` itself. Every further
        node gets its own cell (id suffixed ``_meta<k>``) carrying the same chains.
        Without metadata nodes, ``cell`` is returned with only the bookkeeping
        fields set.

        Args:
        ----
            metadata_nodes: List of metadata nodes to add
            cell: Base cell to add metadata to
            paired: Whether the cell is paired

        Returns:
        -------
            List of cells with metadata added

        """
        if not metadata_nodes:
            cell["data_source"] = "BioCypher"
            cell["is_paired"] = paired
            return [cell]

        cells = []
        for i, node in enumerate(metadata_nodes):
            if i == 0:
                meta_cell = cell
            else:
                # Additional metadata nodes each get a sibling cell with the same chains
                meta_cell = AirrCell(cell_id=f"{cell.cell_id}_meta{i+1}")
                for chain in cell.chains:
                    meta_cell.add_chain(chain)

            for key, value in node.get_properties().items():
                if key not in self._INTERNAL_PROPERTIES:
                    meta_cell[key] = value

            meta_cell["data_source"] = "BioCypher"
            meta_cell["is_paired"] = paired

            cells.append(meta_cell)
        return cells
@@ -8,6 +8,7 @@ from __future__ import annotations
8
8
  from typing import TYPE_CHECKING
9
9
 
10
10
  from biocypher._logger import logger
11
+ from biocypher.output.in_memory._airr import AirrKG
11
12
  from biocypher.output.in_memory._networkx import NetworkxKG
12
13
  from biocypher.output.in_memory._pandas import PandasKG
13
14
 
@@ -19,7 +20,7 @@ logger.debug(f"Loading module {__name__}.")
19
20
 
20
21
  __all__ = ["get_in_memory_kg"]
21
22
 
22
- IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
23
+ IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx", "airr"]
23
24
 
24
25
 
25
26
  def get_in_memory_kg(
@@ -35,10 +36,11 @@ def get_in_memory_kg(
35
36
  """
36
37
  if dbms in ["csv", "pandas", "tabular"]:
37
38
  return PandasKG(deduplicator)
38
-
39
39
  if dbms == "networkx":
40
40
  return NetworkxKG(deduplicator)
41
-
42
- msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
43
- logger.error(msg)
44
- raise NotImplementedError(msg)
41
+ elif dbms == "airr":
42
+ return AirrKG(deduplicator)
43
+ else:
44
+ msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
45
+ logger.error(msg)
46
+ raise NotImplementedError(msg)
@@ -1,5 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
 
3
+ from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
+
3
5
 
4
6
  class _InMemoryKG(ABC):
5
7
  """Abstract class for handling the in-memory Knowledge Graph instance.
@@ -9,6 +11,7 @@ class _InMemoryKG(ABC):
9
11
  - add_nodes
10
12
  - add_edges
11
13
  - get_kg
14
+ - _separate_entity_types
12
15
 
13
16
  Raises:
14
17
  NotImplementedError: InMemoryKG implementation must override 'add_nodes'
@@ -38,3 +41,57 @@ class _InMemoryKG(ABC):
38
41
  def get_kg(self):
39
42
  """Return the in-memory knowledge graph."""
40
43
  raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
44
+
45
+ def _separate_entity_types(self, entities):
46
+ """
47
+ Given mixed iterable of BioCypher objects, separate them into lists by
48
+ type. Also deduplicates using the `Deduplicator` instance.
49
+ """
50
+ lists = {}
51
+ for entity in entities:
52
+ if (
53
+ not isinstance(entity, BioCypherNode)
54
+ and not isinstance(entity, BioCypherEdge)
55
+ and not isinstance(entity, BioCypherRelAsNode)
56
+ ):
57
+ raise TypeError(
58
+ "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
59
+ )
60
+
61
+ if isinstance(entity, BioCypherNode):
62
+ seen = self.deduplicator.node_seen(entity)
63
+ elif isinstance(entity, BioCypherEdge):
64
+ seen = self.deduplicator.edge_seen(entity)
65
+ elif isinstance(entity, BioCypherRelAsNode):
66
+ seen = self.deduplicator.rel_as_node_seen(entity)
67
+
68
+ if seen:
69
+ continue
70
+
71
+ if isinstance(entity, BioCypherRelAsNode):
72
+ node = entity.get_node()
73
+ source_edge = entity.get_source_edge()
74
+ target_edge = entity.get_target_edge()
75
+
76
+ _type = node.get_type()
77
+ if _type not in lists:
78
+ lists[_type] = []
79
+ lists[_type].append(node)
80
+
81
+ _source_type = source_edge.get_type()
82
+ if _source_type not in lists:
83
+ lists[_source_type] = []
84
+ lists[_source_type].append(source_edge)
85
+
86
+ _target_type = target_edge.get_type()
87
+ if _target_type not in lists:
88
+ lists[_target_type] = []
89
+ lists[_target_type].append(target_edge)
90
+ continue
91
+
92
+ _type = entity.get_type()
93
+ if _type not in lists:
94
+ lists[_type] = []
95
+ lists[_type].append(entity)
96
+
97
+ return lists
@@ -1,6 +1,5 @@
1
1
  import pandas as pd
2
2
 
3
- from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
3
  from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
5
4
 
6
5
 
@@ -20,65 +19,8 @@ class PandasKG(_InMemoryKG):
20
19
  def add_edges(self, edges):
21
20
  self.add_tables(edges)
22
21
 
23
- def _separate_entity_types(self, entities):
24
- """
25
- Given mixed iterable of BioCypher objects, separate them into lists by
26
- type. Also deduplicates using the `Deduplicator` instance.
27
- """
28
- lists = {}
29
- for entity in entities:
30
- if (
31
- not isinstance(entity, BioCypherNode)
32
- and not isinstance(entity, BioCypherEdge)
33
- and not isinstance(entity, BioCypherRelAsNode)
34
- ):
35
- raise TypeError(
36
- "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
37
- )
38
-
39
- if isinstance(entity, BioCypherNode):
40
- seen = self.deduplicator.node_seen(entity)
41
- elif isinstance(entity, BioCypherEdge):
42
- seen = self.deduplicator.edge_seen(entity)
43
- elif isinstance(entity, BioCypherRelAsNode):
44
- seen = self.deduplicator.rel_as_node_seen(entity)
45
-
46
- if seen:
47
- continue
48
-
49
- if isinstance(entity, BioCypherRelAsNode):
50
- node = entity.get_node()
51
- source_edge = entity.get_source_edge()
52
- target_edge = entity.get_target_edge()
53
-
54
- _type = node.get_type()
55
- if _type not in lists:
56
- lists[_type] = []
57
- lists[_type].append(node)
58
-
59
- _source_type = source_edge.get_type()
60
- if _source_type not in lists:
61
- lists[_source_type] = []
62
- lists[_source_type].append(source_edge)
63
-
64
- _target_type = target_edge.get_type()
65
- if _target_type not in lists:
66
- lists[_target_type] = []
67
- lists[_target_type].append(target_edge)
68
- continue
69
-
70
- _type = entity.get_type()
71
- if _type not in lists:
72
- lists[_type] = []
73
- lists[_type].append(entity)
74
-
75
- return lists
76
-
77
22
  def add_tables(self, entities):
78
- """
79
- Add Pandas dataframes for each node and edge type in the input.
80
- """
81
-
23
+ """Add Pandas dataframes for each node and edge type in the input."""
82
24
  lists = self._separate_entity_types(entities)
83
25
 
84
26
  for _type, _entities in lists.items():
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING
9
9
  from biocypher._config import config as _config
10
10
  from biocypher._logger import logger
11
11
  from biocypher.output.write._batch_writer import _BatchWriter
12
+ from biocypher.output.write.graph._airr import _AirrWriter
12
13
  from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
13
14
  from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
14
15
  from biocypher.output.write.graph._networkx import _NetworkXWriter
@@ -50,6 +51,7 @@ DBMS_TO_CLASS = {
50
51
  "Tabular": _PandasCSVWriter,
51
52
  "networkx": _NetworkXWriter,
52
53
  "NetworkX": _NetworkXWriter,
54
+ "airr": _AirrWriter,
53
55
  }
54
56
 
55
57
 
@@ -0,0 +1,32 @@
1
+ """Module to provide the AnnData writer class for BioCypher."""
2
+
3
+ from biocypher._logger import logger
4
+ from biocypher.output.write._writer import _Writer
5
+
6
+
7
class _AirrWriter(_Writer):
    """Placeholder writer registered for the "airr" DBMS.

    It satisfies the ``_Writer`` interface but writes nothing: every hook only
    logs that it was called and reports success.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``_Writer`` and log the initialization."""
        super().__init__(*args, **kwargs)
        logger.info("Placeholder writer initialized")

    def _write_node_data(self, nodes) -> bool:
        """Accept nodes without processing them; always returns True."""
        logger.info("Placeholder: Node data received but not processed")
        return True

    def _write_edge_data(self, edges) -> bool:
        """Accept edges without processing them; always returns True."""
        logger.info("Placeholder: Edge data received but not processed")
        return True

    def _construct_import_call(self) -> str:
        """Return the text of a stub import script."""
        script = (
            "# This is a placeholder import script\n"
            "print('No actual import functionality implemented')"
        )
        return script

    def _get_import_script_name(self) -> str:
        """Return the file name used for the stub import script."""
        script_name = "placeholder_import.py"
        return script_name
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biocypher
3
- Version: 0.9.6
3
+ Version: 0.10.0
4
4
  Summary: A unifying framework for biomedical research knowledge graphs
5
5
  Home-page: https://github.com/biocypher/biocypher
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Provides-Extra: scirpy
22
23
  Requires-Dist: PyYAML (>=5.0)
23
24
  Requires-Dist: appdirs
24
25
  Requires-Dist: more_itertools
@@ -27,6 +28,7 @@ Requires-Dist: networkx (>=3.0,<4.0)
27
28
  Requires-Dist: pandas (>=2.0.1,<3.0.0)
28
29
  Requires-Dist: pooch (>=1.7.0,<2.0.0)
29
30
  Requires-Dist: rdflib (>=6.2.0,<7.0.0)
31
+ Requires-Dist: scirpy (>=0.22.0,<0.23.0) ; extra == "scirpy"
30
32
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
31
33
  Requires-Dist: treelib (==1.6.4)
32
34
  Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
@@ -62,7 +64,7 @@ the docs [here](https://biocypher.org).
62
64
  margin-left: auto;
63
65
  margin-right: auto;
64
66
  width: 70%;"
65
- src="docs/graphical_abstract.png"
67
+ src="docs/assets/img/graphical-abstract-biocypher.png"
66
68
  alt="Graphical Abstract">
67
69
  </img>
68
70
 
@@ -11,24 +11,26 @@ biocypher/_deduplicate.py,sha256=rtglcaLRaVzNjLtaPwTGP8VvCM4PHYQ5CZ-cm32CrKQ,484
11
11
  biocypher/_get.py,sha256=_wUjhRjH2J6Qhq0Ndy3kdfaWhHDTT-dxyCvtuH36My4,14868
12
12
  biocypher/_logger.py,sha256=y9dh3SPJOCWXnkFSYSK7aj_-pB7zlAkNCf43Dp1lt74,2941
13
13
  biocypher/_mapping.py,sha256=ntspG2C_NaQODhWTBFk0CDvolkOCjtqlQ9E-NkJAuTg,9030
14
- biocypher/_metadata.py,sha256=Jb4Uva2PzrPbxzio7DMQnX0WuIFrl_pgyDW89L1R1oQ,1415
14
+ biocypher/_metadata.py,sha256=m8xeGsUl8MT9Tdlh_KKcPCa8Pf8Tn84yUlsWkJoxi2M,1416
15
15
  biocypher/_misc.py,sha256=YzlY7zwa0mim9QFg9HwXErkJFIH3cvLrbgjF8tKOIT8,6353
16
16
  biocypher/_ontology.py,sha256=lipZxU3aj6zrTbBrJZmCW6IRCuz-KQG3AfbYCVq6aFE,33133
17
- biocypher/_translate.py,sha256=9E19eLRL0VnxxDuiNhZ5vu54XyKXnfLuBhCgNcL9yAE,17000
17
+ biocypher/_translate.py,sha256=NKSM9lxNjNNbgQrK_24eWYh3B41TS7kjnSwjySnK3s0,16851
18
18
  biocypher/output/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  biocypher/output/connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  biocypher/output/connect/_get_connector.py,sha256=Qimv3kTkXYkhJZRT6nq8mwIM2wORCnyqqHqF2IByuuc,1152
21
21
  biocypher/output/connect/_neo4j_driver.py,sha256=kXjOXW12wZFfEp7plAuo40bPSvOfd-i9m4YaXoMq-p0,12357
22
22
  biocypher/output/in_memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- biocypher/output/in_memory/_get_in_memory_kg.py,sha256=29DvmOdRUuzEblyMxy5001R4zjEXW3eHNv0htko7c4Y,1115
24
- biocypher/output/in_memory/_in_memory_kg.py,sha256=g1TPN8PkeAyXbrRuTAjshqC8voI6EmLqR8S_otmviwU,1423
23
+ biocypher/output/in_memory/_airr.py,sha256=tOYww8eHaH4EOElRKFxwShlVW83tXx4xbv7ZfrInphg,20461
24
+ biocypher/output/in_memory/_get_in_memory_kg.py,sha256=NQ-AT8jgAvb-aTK7zsYrItvlDEow_Mfni5tKEJjwjx0,1256
25
+ biocypher/output/in_memory/_in_memory_kg.py,sha256=BTBtqb2ZC_zxXdOJ59BQKUoGXZSpiaRG6VecjZGWCm0,3526
25
26
  biocypher/output/in_memory/_networkx.py,sha256=cSOSAreP7S3oeGT6noZ1kAIvSnkVnU3NUp1OY4yqzn0,1515
26
- biocypher/output/in_memory/_pandas.py,sha256=Ot2jbK5t_YLHqw0BUv9Z_qWNy9r6IX1LYEyejOSJzos,3288
27
+ biocypher/output/in_memory/_pandas.py,sha256=ndZcAAdsw38qZW3nWehcSxhpBGM8pXsn3DPoCcppI0U,1196
27
28
  biocypher/output/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
29
  biocypher/output/write/_batch_writer.py,sha256=_Dao7z4KN0Uhr86oOOWYEDrIUikR7T0v1SJC2Btd8Y4,38745
29
- biocypher/output/write/_get_writer.py,sha256=JozRWCMhvh65aQAlcGiiD5x3Nl1HSW8mK1Zf2nTSOzI,4385
30
+ biocypher/output/write/_get_writer.py,sha256=fXputhpt6K4mF8Ti6LFgwZIMU0GrK3aHkWIj1g4liwI,4469
30
31
  biocypher/output/write/_writer.py,sha256=y0dWI-RyQdrBLr9Fs91Y9KcCMjnlCaKJT0eWsIS2hG4,7158
31
32
  biocypher/output/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ biocypher/output/write/graph/_airr.py,sha256=XWzcqwoMZKZ8n6f2Y77hv1ZSRyg2tOVqlzX133-biG8,1274
32
34
  biocypher/output/write/graph/_arangodb.py,sha256=xue3hm_DVB5pMR5qqfGXlXll3RpILA0tXos2J-as1-E,7906
33
35
  biocypher/output/write/graph/_neo4j.py,sha256=tBPhxn8JAmSS6KmiePofwr9LpGjHQH9BTnpHVK2ellM,12042
34
36
  biocypher/output/write/graph/_networkx.py,sha256=2WYkw5ZM3Bp236iwAxEAp3A1DxHKT4_hEPNMUKvPHp4,2320
@@ -38,7 +40,7 @@ biocypher/output/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
38
40
  biocypher/output/write/relational/_csv.py,sha256=m0BSQXts88Qu5AEvoIgnwRz54ia38g4VN3PaA3LCYM8,2807
39
41
  biocypher/output/write/relational/_postgresql.py,sha256=75iJvv-0ewSsQXhVeoYoGnmYQnKY_B4iItZV7DpEBto,12190
40
42
  biocypher/output/write/relational/_sqlite.py,sha256=BuGWOeeNA83lbUvjpkzqcR9_baWLsbfmLXBKe4O1EPE,2105
41
- biocypher-0.9.6.dist-info/LICENSE,sha256=oejgxuxyjSnyPw3YPloz6-dCBB_nYizJ4jDQnr-xZUU,1082
42
- biocypher-0.9.6.dist-info/METADATA,sha256=8zvMLLWli78WBCjcBMhkqdwQb2rEqIinHz4aePh8RZw,10600
43
- biocypher-0.9.6.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
44
- biocypher-0.9.6.dist-info/RECORD,,
43
+ biocypher-0.10.0.dist-info/LICENSE,sha256=oejgxuxyjSnyPw3YPloz6-dCBB_nYizJ4jDQnr-xZUU,1082
44
+ biocypher-0.10.0.dist-info/METADATA,sha256=TAd1YjHL94Sdp0_aW0ag7Qkbr5cJOCm7zvc6DqvG_ko,10706
45
+ biocypher-0.10.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
46
+ biocypher-0.10.0.dist-info/RECORD,,