biocypher-0.9.7-py3-none-any.whl → biocypher-0.10.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

biocypher/_metadata.py CHANGED
@@ -10,7 +10,7 @@ import pathlib
10
10
 
11
11
  import toml
12
12
 
13
- _VERSION = "0.9.7"
13
+ _VERSION = "0.10.1"
14
14
 
15
15
 
16
16
  def get_metadata():
biocypher/output/in_memory/_airr.py ADDED
@@ -0,0 +1,499 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Optional
4
+
5
+ from biocypher._create import BioCypherEdge, BioCypherNode
6
+ from biocypher._deduplicate import Deduplicator
7
+ from biocypher._logger import logger
8
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
9
+
10
+ if TYPE_CHECKING:
11
+ from scirpy.io import AirrCell
12
+
13
+ try:
14
+ from scirpy.io import AirrCell
15
+
16
+ HAS_SCIRPY = True
17
+ except ImportError:
18
+ HAS_SCIRPY = False
19
+
20
+
21
+ class AirrKG(_InMemoryKG):
22
+ """Knowledge graph for AIRR (Adaptive Immune Receptor Repertoire) data.
23
+
24
+ This class implements the AIRR data model for representing immune receptor sequences
25
+ (antibodies and T cell receptors) and their annotations. To ensure proper conversion
26
+ to AIRR format, your schema file should define immune receptor entities with property
27
+ names that match the AIRR standards.
28
+
29
+ Key property names in your schema for immune receptor entities:
30
+ - locus: The gene locus (e.g., "TRA", "TRB", "IGH", "IGK", "IGL")
31
+ - junction_aa: The amino acid sequence of the junction region (CDR3)
32
+ - v_call: The V gene assignment
33
+ - j_call: The J gene assignment
34
+ - productive: Whether the sequence is productive
35
+
36
+ Pairing Strategies specified in get_kg method:
37
+ - Indirect pairings allowed:
38
+ Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
39
+ - Indirect pairings not allowed:
40
+ Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
41
+
42
+ For a complete list of available fields and their descriptions, see:
43
+ https://docs.airr-community.org/en/stable/datarep/rearrangements.html#fields
44
+
45
+ All properties from the biocypher schema defined by user will be preserved in the AIRR format.
46
+ """
47
+
48
+ # Constants for internal property filtering
49
+ _INTERNAL_PROPERTIES = frozenset(["node_id", "node_label", "id", "preferred_id"])
50
+
51
+ def __init__(
52
+ self,
53
+ deduplicator: Optional["Deduplicator"] = None,
54
+ metadata_entity_type: str = "epitope",
55
+ ) -> None:
56
+ """Initialize AirrKG with configurable metadata node type.
57
+
58
+ Args:
59
+ ----
60
+ deduplicator: Deduplicator instance
61
+ metadata_entity_type: String specifying the metadata node type (default: "epitope")
62
+
63
+ """
64
+ super().__init__()
65
+ self.deduplicator = deduplicator or Deduplicator()
66
+ self.metadata_entity_type = metadata_entity_type
67
+
68
+ # Initialize storage for processed cells
69
+ self.adjacency_list = {}
70
+ self.airr_cells = []
71
+
72
+ # These will be populated when nodes and edges are added
73
+ self.sequence_entity_types = {}
74
+ self.chain_relationship_types = []
75
+ self.chain_to_epitope_relationship_types = []
76
+
77
+ def _check_dependencies(self) -> None:
78
+ """Verify that scirpy is available."""
79
+ if not HAS_SCIRPY:
80
+ msg = (
81
+ "AirrCell module from scirpy not detected. "
82
+ "Install it with 'poetry add biocypher[scirpy]' or 'poetry add scirpy'."
83
+ )
84
+ raise ImportError(msg)
85
+
86
+ def get_kg(self, indirect_pairings: bool = True) -> list[AirrCell]:
87
+ """Convert directly to AIRR format using AirCell from scirpy.
88
+
89
+ Args:
90
+ ----
91
+ indirect_pairings: Boolean controlling pairing strategy (default: True)
92
+ - True:
93
+ Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
94
+ - False:
95
+ Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
96
+
97
+ Returns:
98
+ -------
99
+ list: List of generated AIRR cells
100
+
101
+ """
102
+ self._check_dependencies()
103
+ if not self.airr_cells:
104
+ self.airr_cells = self._to_airr_cells(self.adjacency_list, indirect_pairings)
105
+ return self.airr_cells
106
+
107
+ def add_nodes(self, nodes: list[BioCypherNode]) -> None:
108
+ """Add BioCypher nodes, organizing them by type."""
109
+ self._add_to_entities_by_type(nodes)
110
+
111
+ def add_edges(self, edges: list[BioCypherEdge]) -> None:
112
+ """Add BioCypher edges, organizing them by type."""
113
+ self._add_to_entities_by_type(edges)
114
+
115
+ def _add_to_entities_by_type(self, entities: dict[str, list[Any]]) -> None:
116
+ """Add all entities (both nodes and edges) to a common adj. list."""
117
+ lists = self._separate_entity_types(entities)
118
+ for _type, _entities in lists.items():
119
+ if _type not in self.adjacency_list:
120
+ self.adjacency_list[_type] = []
121
+ self.adjacency_list[_type].extend(_entities)
122
+
123
+ def _process_entities(self, entities: dict[str, list[Any]]) -> tuple[dict, dict, dict]:
124
+ """Process entities and organize them into sequence nodes, metadata nodes, and receptor-epitope mappings.
125
+
126
+ Args:
127
+ ----
128
+ entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
129
+
130
+ Returns:
131
+ -------
132
+ tuple: (sequence_nodes, metadata_nodes, receptor_epitope_mapping)
133
+
134
+ """
135
+ sequence_nodes = {}
136
+ metadata_nodes = {}
137
+ receptor_epitope_mapping = {}
138
+
139
+ # Determine entity types while processing
140
+ all_node_types = set()
141
+ all_edge_types = set()
142
+
143
+ for entity_type, entities_list in entities.items():
144
+ if not entities_list: # Skip empty lists
145
+ continue
146
+
147
+ # Determine if this is a node or edge type
148
+ if isinstance(entities_list[0], BioCypherNode):
149
+ all_node_types.add(entity_type)
150
+ if entity_type == self.metadata_entity_type:
151
+ metadata_nodes.update({node.get_id(): node for node in entities_list})
152
+ else:
153
+ sequence_nodes.update({node.get_id(): node for node in entities_list})
154
+ self.sequence_entity_types[entity_type] = entity_type.replace(" sequence", "").upper()
155
+ elif isinstance(entities_list[0], BioCypherEdge):
156
+ all_edge_types.add(entity_type)
157
+
158
+ # Update relationship types
159
+ self.chain_relationship_types = [
160
+ edge_type for edge_type in all_edge_types if self.metadata_entity_type not in edge_type.lower()
161
+ ]
162
+
163
+ self.chain_to_epitope_relationship_types = [
164
+ edge_type for edge_type in all_edge_types if self.metadata_entity_type in edge_type.lower()
165
+ ]
166
+
167
+ # Process chain-to-epitope relationships
168
+ for entity_type in self.chain_to_epitope_relationship_types:
169
+ self._update_receptor_epitope_mapping(entities[entity_type], receptor_epitope_mapping)
170
+
171
+ return sequence_nodes, metadata_nodes, receptor_epitope_mapping
172
+
173
+ def _update_receptor_epitope_mapping(self, edges: list[BioCypherEdge], mapping: dict[str, set]) -> None:
174
+ """Update receptor-epitope mapping with new edges.
175
+
176
+ Args:
177
+ ----
178
+ edges: List of edges to process
179
+ mapping: Dictionary to update with receptor-epitope mappings
180
+
181
+ """
182
+ for edge in edges:
183
+ source_id = edge.get_source_id()
184
+ if source_id not in mapping:
185
+ mapping[source_id] = set()
186
+ mapping[source_id].add(edge.get_target_id())
187
+
188
+ def _process_paired_chains(
189
+ self,
190
+ entities: dict[str, list[Any]],
191
+ sequence_nodes: dict[str, BioCypherNode],
192
+ metadata_nodes: dict[str, BioCypherNode],
193
+ receptor_epitope_mapping: dict[str, set],
194
+ indirect_pairings: bool = True,
195
+ ) -> tuple[list[AirrCell], set[str], int]:
196
+ """Process paired chains and generate AIRR cells.
197
+
198
+ Args:
199
+ ----
200
+ entities: Dictionary of all entities
201
+ sequence_nodes: Dictionary of sequence nodes
202
+ metadata_nodes: Dictionary of metadata nodes
203
+ receptor_epitope_mapping: Dictionary of receptor-epitope mappings
204
+ indirect_pairings: Boolean controlling pairing strategy
205
+
206
+ Returns:
207
+ -------
208
+ tuple: (list of generated cells, set of processed chain IDs, count of cells with multiple epitopes)
209
+
210
+ """
211
+ airr_cells = []
212
+ processed_chains = set()
213
+ n_metacells = 0
214
+
215
+ for entity_type, edges in entities.items():
216
+ if entity_type in self.chain_relationship_types:
217
+ for edge in edges:
218
+ source_id, target_id = edge.get_source_id(), edge.get_target_id()
219
+ processed_chains.update([source_id, target_id])
220
+
221
+ # Use conditional logic for pairing strategy
222
+ source_metadata = receptor_epitope_mapping.get(source_id, set())
223
+ target_metadata = receptor_epitope_mapping.get(target_id, set())
224
+
225
+ if indirect_pairings:
226
+ # Union: create paired cell if either chain binds epitopes
227
+ metadata_ids = source_metadata | target_metadata
228
+
229
+ metadata_nodes_cell = self._get_metadata_nodes(metadata_ids, metadata_nodes)
230
+ if metadata_nodes_cell:
231
+ cell_s = self._generate_airr_cell(
232
+ cell_id=edge.get_id(),
233
+ source_node=sequence_nodes.get(source_id),
234
+ target_node=sequence_nodes.get(target_id),
235
+ metadata_nodes=metadata_nodes_cell,
236
+ paired=True,
237
+ receptor_epitope_mapping=receptor_epitope_mapping,
238
+ )
239
+ airr_cells.extend(cell_s)
240
+ if len(cell_s) > 1:
241
+ n_metacells += 1
242
+ else:
243
+ # Intersection: create paired cell only if both chains bind same epitopes
244
+ shared_metadata_ids = source_metadata & target_metadata
245
+
246
+ # Create paired cell if there are shared epitopes
247
+ if shared_metadata_ids:
248
+ shared_metadata_nodes = self._get_metadata_nodes(shared_metadata_ids, metadata_nodes)
249
+ if shared_metadata_nodes:
250
+ cell_s = self._generate_airr_cell(
251
+ cell_id=edge.get_id(),
252
+ source_node=sequence_nodes.get(source_id),
253
+ target_node=sequence_nodes.get(target_id),
254
+ metadata_nodes=shared_metadata_nodes,
255
+ paired=True,
256
+ receptor_epitope_mapping=receptor_epitope_mapping,
257
+ )
258
+ airr_cells.extend(cell_s)
259
+ if len(cell_s) > 1:
260
+ n_metacells += 1
261
+
262
+ # Create unpaired cells for chains with non-overlapping epitopes
263
+ source_only_metadata = source_metadata - target_metadata
264
+ target_only_metadata = target_metadata - source_metadata
265
+
266
+ # Create unpaired cell for source chain if it has unique epitopes
267
+ if source_only_metadata:
268
+ source_only_nodes = self._get_metadata_nodes(source_only_metadata, metadata_nodes)
269
+ if source_only_nodes:
270
+ source_cells = self._generate_airr_cell(
271
+ cell_id=f"unpaired_{source_id}",
272
+ source_node=sequence_nodes.get(source_id),
273
+ target_node=None,
274
+ metadata_nodes=source_only_nodes,
275
+ paired=False,
276
+ receptor_epitope_mapping=receptor_epitope_mapping,
277
+ )
278
+ airr_cells.extend(source_cells)
279
+ if len(source_cells) > 1:
280
+ n_metacells += 1
281
+
282
+ # Create unpaired cell for target chain if it has unique epitopes
283
+ if target_only_metadata:
284
+ target_only_nodes = self._get_metadata_nodes(target_only_metadata, metadata_nodes)
285
+ if target_only_nodes:
286
+ target_cells = self._generate_airr_cell(
287
+ cell_id=f"unpaired_{target_id}",
288
+ source_node=sequence_nodes.get(target_id),
289
+ target_node=None,
290
+ metadata_nodes=target_only_nodes,
291
+ paired=False,
292
+ receptor_epitope_mapping=receptor_epitope_mapping,
293
+ )
294
+ airr_cells.extend(target_cells)
295
+ if len(target_cells) > 1:
296
+ n_metacells += 1
297
+
298
+ return airr_cells, processed_chains, n_metacells
299
+
300
+ def _process_unpaired_chains(
301
+ self,
302
+ receptor_epitope_mapping: dict[str, set],
303
+ sequence_nodes: dict[str, BioCypherNode],
304
+ metadata_nodes: dict[str, BioCypherNode],
305
+ processed_chains: set[str],
306
+ ) -> tuple[list[AirrCell], int]:
307
+ """Process unpaired chains and generate AIRR cells.
308
+
309
+ Args:
310
+ ----
311
+ receptor_epitope_mapping: Dictionary of receptor-epitope mappings
312
+ sequence_nodes: Dictionary of sequence nodes
313
+ metadata_nodes: Dictionary of metadata nodes
314
+ processed_chains: Set of already processed chain IDs
315
+
316
+ Returns:
317
+ -------
318
+ tuple: (List of generated cells, count of cells with multiple epitopes)
319
+
320
+ """
321
+ airr_cells = []
322
+ n_metacells = 0
323
+
324
+ for chain_id in receptor_epitope_mapping:
325
+ if chain_id not in processed_chains:
326
+ # Get all metadata nodes for this unpaired chain
327
+ metadata_nodes_cell = self._get_metadata_nodes(receptor_epitope_mapping[chain_id], metadata_nodes)
328
+
329
+ if metadata_nodes_cell:
330
+ cell_s = self._generate_airr_cell(
331
+ cell_id=f"unpaired_{chain_id}",
332
+ source_node=sequence_nodes.get(chain_id),
333
+ target_node=None,
334
+ metadata_nodes=metadata_nodes_cell,
335
+ paired=False,
336
+ receptor_epitope_mapping=receptor_epitope_mapping,
337
+ )
338
+ airr_cells.extend(cell_s)
339
+ # Check if multiple cells were generated (indicating multiple epitopes)
340
+ if len(cell_s) > 1:
341
+ n_metacells += 1
342
+
343
+ return airr_cells, n_metacells
344
+
345
+ def _to_airr_cells(self, entities: dict[str, list[Any]], indirect_pairings: bool = True) -> list[AirrCell]:
346
+ """Convert BioCypher entities to AIRR cells using configurable mappings.
347
+
348
+ Args:
349
+ ----
350
+ entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
351
+ indirect_pairings: Boolean controlling pairing strategy (default: True)
352
+ - True:
353
+ Epitope is only matched with ONE of the paired receptors -> the "paired" AIRR cell will be created
354
+ - False:
355
+ Epitope is only matched with ONE of the paired receptors -> no "paired" AIRR cell will be created
356
+
357
+ Returns:
358
+ -------
359
+ list: List of generated AIRR cells
360
+
361
+ """
362
+ self._check_dependencies()
363
+ if not entities:
364
+ msg = "No entities provided for conversion."
365
+ raise ValueError(msg)
366
+
367
+ logger.info("Starting conversion to AIRR cells")
368
+
369
+ # Process all entities
370
+ sequence_nodes, metadata_nodes, receptor_epitope_mapping = self._process_entities(entities)
371
+
372
+ # Process paired chains
373
+ airr_cells, processed_chains, paired_metacells = self._process_paired_chains(
374
+ entities,
375
+ sequence_nodes,
376
+ metadata_nodes,
377
+ receptor_epitope_mapping,
378
+ indirect_pairings,
379
+ )
380
+
381
+ # Process unpaired chains
382
+ unpaired_cells, unpaired_metacells = self._process_unpaired_chains(
383
+ receptor_epitope_mapping,
384
+ sequence_nodes,
385
+ metadata_nodes,
386
+ processed_chains,
387
+ )
388
+ airr_cells.extend(unpaired_cells)
389
+
390
+ # Calculate total cells with multiple epitopes
391
+ total_metacells = paired_metacells + unpaired_metacells
392
+
393
+ # Log information about cells
394
+ logger.info(f"Generated total of {len(airr_cells)} AIRR cells")
395
+ if total_metacells > 0:
396
+ logger.info(f"{total_metacells} cells with more than 1 epitope were detected")
397
+
398
+ return airr_cells
399
+
400
+ def _get_metadata_nodes(
401
+ self,
402
+ metadata_ids: set[str],
403
+ metadata_nodes: dict[str, BioCypherNode],
404
+ ) -> list[BioCypherNode]:
405
+ """Get metadata nodes for a set of metadata IDs.
406
+
407
+ Args:
408
+ ----
409
+ metadata_ids: Set of metadata IDs
410
+ metadata_nodes: Dictionary of metadata nodes
411
+
412
+ Returns:
413
+ -------
414
+ list: List of metadata nodes
415
+
416
+ """
417
+ return [metadata_nodes[ep_id] for ep_id in metadata_ids if ep_id in metadata_nodes]
418
+
419
+ def _generate_airr_cell(
420
+ self,
421
+ cell_id: str,
422
+ source_node: BioCypherNode | None,
423
+ target_node: BioCypherNode | None,
424
+ metadata_nodes: list[BioCypherNode],
425
+ paired: bool,
426
+ receptor_epitope_mapping: dict[str, set] | None = None,
427
+ ) -> list[AirrCell]:
428
+ self._check_dependencies()
429
+ cell = AirrCell(cell_id=cell_id)
430
+
431
+ # Process both chains
432
+ for node in [source_node, target_node]:
433
+ if not node: # Skip if node is None
434
+ continue
435
+
436
+ props = node.get_properties()
437
+ chain = AirrCell.empty_chain_dict()
438
+
439
+ # Add all properties except internal ones
440
+ for key, value in props.items():
441
+ if key not in self._INTERNAL_PROPERTIES:
442
+ chain[key] = value
443
+
444
+ # Add locus based on node type
445
+ chain["locus"] = self.sequence_entity_types.get(node.get_label(), node.get_label())
446
+ chain["consensus_count"] = 0
447
+ chain["productive"] = True
448
+
449
+ # Add binds_epitope field based on receptor_epitope_mapping
450
+ if receptor_epitope_mapping and node.get_id() in receptor_epitope_mapping:
451
+ chain["validated_epitope"] = bool(receptor_epitope_mapping[node.get_id()])
452
+ else:
453
+ chain["validated_epitope"] = False
454
+
455
+ cell.add_chain(chain)
456
+
457
+ # Add metadata
458
+ return self.add_metadata(metadata_nodes, cell, paired)
459
+
460
+ def add_metadata(self, metadata_nodes: list[BioCypherNode], cell: AirrCell, paired: bool) -> list[AirrCell]:
461
+ """Add metadata from nodes to cell(s) and return a list of cells.
462
+
463
+ Args:
464
+ ----
465
+ metadata_nodes: List of metadata nodes to add
466
+ cell: Base cell to add metadata to
467
+ paired: Whether the cell is paired
468
+
469
+ Returns:
470
+ -------
471
+ List of cells with metadata added
472
+
473
+ """
474
+ self._check_dependencies()
475
+ cells = []
476
+ if not metadata_nodes:
477
+ cell["data_source"] = "BioCypher"
478
+ cell["is_paired"] = paired
479
+ cells.append(cell)
480
+ else:
481
+ for i, node in enumerate(metadata_nodes):
482
+ # Create a new AirrCell for each metadata node
483
+ if i > 0:
484
+ cell_id_new = f"{cell.cell_id}_meta{i+1}"
485
+ meta_cell = AirrCell(cell_id=cell_id_new)
486
+ for chain in cell.chains:
487
+ meta_cell.add_chain(chain)
488
+ else:
489
+ meta_cell = cell
490
+ props = node.get_properties()
491
+ for key, value in props.items():
492
+ if key not in self._INTERNAL_PROPERTIES:
493
+ meta_cell[key] = value
494
+
495
+ meta_cell["data_source"] = "BioCypher"
496
+ meta_cell["is_paired"] = paired
497
+
498
+ cells.append(meta_cell)
499
+ return cells
biocypher/output/in_memory/_get_in_memory_kg.py CHANGED
@@ -8,6 +8,7 @@ from __future__ import annotations
8
8
  from typing import TYPE_CHECKING
9
9
 
10
10
  from biocypher._logger import logger
11
+ from biocypher.output.in_memory._airr import AirrKG
11
12
  from biocypher.output.in_memory._networkx import NetworkxKG
12
13
  from biocypher.output.in_memory._pandas import PandasKG
13
14
 
@@ -19,7 +20,7 @@ logger.debug(f"Loading module {__name__}.")
19
20
 
20
21
  __all__ = ["get_in_memory_kg"]
21
22
 
22
- IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
23
+ IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx", "airr"]
23
24
 
24
25
 
25
26
  def get_in_memory_kg(
@@ -35,10 +36,11 @@ def get_in_memory_kg(
35
36
  """
36
37
  if dbms in ["csv", "pandas", "tabular"]:
37
38
  return PandasKG(deduplicator)
38
-
39
39
  if dbms == "networkx":
40
40
  return NetworkxKG(deduplicator)
41
-
42
- msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
43
- logger.error(msg)
44
- raise NotImplementedError(msg)
41
+ elif dbms == "airr":
42
+ return AirrKG(deduplicator)
43
+ else:
44
+ msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
45
+ logger.error(msg)
46
+ raise NotImplementedError(msg)
biocypher/output/in_memory/_in_memory_kg.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
 
3
+ from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
+
3
5
 
4
6
  class _InMemoryKG(ABC):
5
7
  """Abstract class for handling the in-memory Knowledge Graph instance.
@@ -9,6 +11,7 @@ class _InMemoryKG(ABC):
9
11
  - add_nodes
10
12
  - add_edges
11
13
  - get_kg
14
+ - _separate_entity_types
12
15
 
13
16
  Raises:
14
17
  NotImplementedError: InMemoryKG implementation must override 'add_nodes'
@@ -38,3 +41,57 @@ class _InMemoryKG(ABC):
38
41
  def get_kg(self):
39
42
  """Return the in-memory knowledge graph."""
40
43
  raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
44
+
45
+ def _separate_entity_types(self, entities):
46
+ """
47
+ Given mixed iterable of BioCypher objects, separate them into lists by
48
+ type. Also deduplicates using the `Deduplicator` instance.
49
+ """
50
+ lists = {}
51
+ for entity in entities:
52
+ if (
53
+ not isinstance(entity, BioCypherNode)
54
+ and not isinstance(entity, BioCypherEdge)
55
+ and not isinstance(entity, BioCypherRelAsNode)
56
+ ):
57
+ raise TypeError(
58
+ "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
59
+ )
60
+
61
+ if isinstance(entity, BioCypherNode):
62
+ seen = self.deduplicator.node_seen(entity)
63
+ elif isinstance(entity, BioCypherEdge):
64
+ seen = self.deduplicator.edge_seen(entity)
65
+ elif isinstance(entity, BioCypherRelAsNode):
66
+ seen = self.deduplicator.rel_as_node_seen(entity)
67
+
68
+ if seen:
69
+ continue
70
+
71
+ if isinstance(entity, BioCypherRelAsNode):
72
+ node = entity.get_node()
73
+ source_edge = entity.get_source_edge()
74
+ target_edge = entity.get_target_edge()
75
+
76
+ _type = node.get_type()
77
+ if _type not in lists:
78
+ lists[_type] = []
79
+ lists[_type].append(node)
80
+
81
+ _source_type = source_edge.get_type()
82
+ if _source_type not in lists:
83
+ lists[_source_type] = []
84
+ lists[_source_type].append(source_edge)
85
+
86
+ _target_type = target_edge.get_type()
87
+ if _target_type not in lists:
88
+ lists[_target_type] = []
89
+ lists[_target_type].append(target_edge)
90
+ continue
91
+
92
+ _type = entity.get_type()
93
+ if _type not in lists:
94
+ lists[_type] = []
95
+ lists[_type].append(entity)
96
+
97
+ return lists
biocypher/output/in_memory/_pandas.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import pandas as pd
2
2
 
3
- from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
3
  from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
5
4
 
6
5
 
@@ -20,65 +19,8 @@ class PandasKG(_InMemoryKG):
20
19
  def add_edges(self, edges):
21
20
  self.add_tables(edges)
22
21
 
23
- def _separate_entity_types(self, entities):
24
- """
25
- Given mixed iterable of BioCypher objects, separate them into lists by
26
- type. Also deduplicates using the `Deduplicator` instance.
27
- """
28
- lists = {}
29
- for entity in entities:
30
- if (
31
- not isinstance(entity, BioCypherNode)
32
- and not isinstance(entity, BioCypherEdge)
33
- and not isinstance(entity, BioCypherRelAsNode)
34
- ):
35
- raise TypeError(
36
- "Expected a BioCypherNode / BioCypherEdge / " f"BioCypherRelAsNode, got {type(entity)}."
37
- )
38
-
39
- if isinstance(entity, BioCypherNode):
40
- seen = self.deduplicator.node_seen(entity)
41
- elif isinstance(entity, BioCypherEdge):
42
- seen = self.deduplicator.edge_seen(entity)
43
- elif isinstance(entity, BioCypherRelAsNode):
44
- seen = self.deduplicator.rel_as_node_seen(entity)
45
-
46
- if seen:
47
- continue
48
-
49
- if isinstance(entity, BioCypherRelAsNode):
50
- node = entity.get_node()
51
- source_edge = entity.get_source_edge()
52
- target_edge = entity.get_target_edge()
53
-
54
- _type = node.get_type()
55
- if _type not in lists:
56
- lists[_type] = []
57
- lists[_type].append(node)
58
-
59
- _source_type = source_edge.get_type()
60
- if _source_type not in lists:
61
- lists[_source_type] = []
62
- lists[_source_type].append(source_edge)
63
-
64
- _target_type = target_edge.get_type()
65
- if _target_type not in lists:
66
- lists[_target_type] = []
67
- lists[_target_type].append(target_edge)
68
- continue
69
-
70
- _type = entity.get_type()
71
- if _type not in lists:
72
- lists[_type] = []
73
- lists[_type].append(entity)
74
-
75
- return lists
76
-
77
22
  def add_tables(self, entities):
78
- """
79
- Add Pandas dataframes for each node and edge type in the input.
80
- """
81
-
23
+ """Add Pandas dataframes for each node and edge type in the input."""
82
24
  lists = self._separate_entity_types(entities)
83
25
 
84
26
  for _type, _entities in lists.items():
biocypher/output/write/_get_writer.py CHANGED
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING
9
9
  from biocypher._config import config as _config
10
10
  from biocypher._logger import logger
11
11
  from biocypher.output.write._batch_writer import _BatchWriter
12
+ from biocypher.output.write.graph._airr import _AirrWriter
12
13
  from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
13
14
  from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
14
15
  from biocypher.output.write.graph._networkx import _NetworkXWriter
@@ -50,6 +51,7 @@ DBMS_TO_CLASS = {
50
51
  "Tabular": _PandasCSVWriter,
51
52
  "networkx": _NetworkXWriter,
52
53
  "NetworkX": _NetworkXWriter,
54
+ "airr": _AirrWriter,
53
55
  }
54
56
 
55
57
 
biocypher/output/write/graph/_airr.py ADDED
@@ -0,0 +1,32 @@
1
+ """Module to provide the AnnData writer class for BioCypher."""
2
+
3
+ from biocypher._logger import logger
4
+ from biocypher.output.write._writer import _Writer
5
+
6
+
7
+ class _AirrWriter(_Writer):
8
+ """A minimal placeholder writer class that implements the required methods
9
+ but performs no actual writing operations, since there is an existing anndata native writer functionality
10
+ """
11
+
12
+ def __init__(self, *args, **kwargs):
13
+ super().__init__(*args, **kwargs)
14
+ logger.info("Placeholder writer initialized")
15
+
16
+ def _write_node_data(self, nodes) -> bool:
17
+ """Required implementation that does nothing with nodes."""
18
+ logger.info("Placeholder: Node data received but not processed")
19
+ return True
20
+
21
+ def _write_edge_data(self, edges) -> bool:
22
+ """Required implementation that does nothing with edges."""
23
+ logger.info("Placeholder: Edge data received but not processed")
24
+ return True
25
+
26
+ def _construct_import_call(self) -> str:
27
+ """Return a placeholder import script."""
28
+ return "# This is a placeholder import script\nprint('No actual import functionality implemented')"
29
+
30
+ def _get_import_script_name(self) -> str:
31
+ """Return a placeholder script name."""
32
+ return "placeholder_import.py"
biocypher-0.9.7.dist-info/METADATA → biocypher-0.10.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biocypher
3
- Version: 0.9.7
3
+ Version: 0.10.1
4
4
  Summary: A unifying framework for biomedical research knowledge graphs
5
5
  Home-page: https://github.com/biocypher/biocypher
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Provides-Extra: scirpy
22
23
  Requires-Dist: PyYAML (>=5.0)
23
24
  Requires-Dist: appdirs
24
25
  Requires-Dist: more_itertools
@@ -27,6 +28,7 @@ Requires-Dist: networkx (>=3.0,<4.0)
27
28
  Requires-Dist: pandas (>=2.0.1,<3.0.0)
28
29
  Requires-Dist: pooch (>=1.7.0,<2.0.0)
29
30
  Requires-Dist: rdflib (>=6.2.0,<7.0.0)
31
+ Requires-Dist: scirpy (>=0.22.0,<0.23.0) ; extra == "scirpy"
30
32
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
31
33
  Requires-Dist: treelib (==1.6.4)
32
34
  Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
@@ -79,7 +81,7 @@ Board](https://github.com/orgs/biocypher/projects/3/views/2).
79
81
 
80
82
  Install the package from PyPI using `pip install biocypher`. More comprehensive
81
83
  installation and configuration instructions can be found
82
- [here](https://biocypher.org/installation.html).
84
+ [here](https://biocypher.org/BioCypher/installation/).
83
85
 
84
86
  Exemplary usage of BioCypher to build a graph database is shown in our tutorial
85
87
  and the various pipelines we have created. You can find these on the [Components
biocypher-0.9.7.dist-info/RECORD → biocypher-0.10.1.dist-info/RECORD CHANGED
@@ -11,7 +11,7 @@ biocypher/_deduplicate.py,sha256=rtglcaLRaVzNjLtaPwTGP8VvCM4PHYQ5CZ-cm32CrKQ,484
11
11
  biocypher/_get.py,sha256=_wUjhRjH2J6Qhq0Ndy3kdfaWhHDTT-dxyCvtuH36My4,14868
12
12
  biocypher/_logger.py,sha256=y9dh3SPJOCWXnkFSYSK7aj_-pB7zlAkNCf43Dp1lt74,2941
13
13
  biocypher/_mapping.py,sha256=ntspG2C_NaQODhWTBFk0CDvolkOCjtqlQ9E-NkJAuTg,9030
14
- biocypher/_metadata.py,sha256=MzfAA6RQw7SDP9sRRrQQbAUKSBX_VIpqeh10qERhTg4,1415
14
+ biocypher/_metadata.py,sha256=dz_VeWCq0wx6Cr1hCzscvEVPLSRY1F8Tecu2saeAhkg,1416
15
15
  biocypher/_misc.py,sha256=YzlY7zwa0mim9QFg9HwXErkJFIH3cvLrbgjF8tKOIT8,6353
16
16
  biocypher/_ontology.py,sha256=lipZxU3aj6zrTbBrJZmCW6IRCuz-KQG3AfbYCVq6aFE,33133
17
17
  biocypher/_translate.py,sha256=NKSM9lxNjNNbgQrK_24eWYh3B41TS7kjnSwjySnK3s0,16851
@@ -20,15 +20,17 @@ biocypher/output/connect/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZ
20
20
  biocypher/output/connect/_get_connector.py,sha256=Qimv3kTkXYkhJZRT6nq8mwIM2wORCnyqqHqF2IByuuc,1152
21
21
  biocypher/output/connect/_neo4j_driver.py,sha256=kXjOXW12wZFfEp7plAuo40bPSvOfd-i9m4YaXoMq-p0,12357
22
22
  biocypher/output/in_memory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- biocypher/output/in_memory/_get_in_memory_kg.py,sha256=29DvmOdRUuzEblyMxy5001R4zjEXW3eHNv0htko7c4Y,1115
24
- biocypher/output/in_memory/_in_memory_kg.py,sha256=g1TPN8PkeAyXbrRuTAjshqC8voI6EmLqR8S_otmviwU,1423
23
+ biocypher/output/in_memory/_airr.py,sha256=wOYg2x1WCs7DTImtdxNyyoCmjDUVg1Ovh29kzMOQ688,20671
24
+ biocypher/output/in_memory/_get_in_memory_kg.py,sha256=NQ-AT8jgAvb-aTK7zsYrItvlDEow_Mfni5tKEJjwjx0,1256
25
+ biocypher/output/in_memory/_in_memory_kg.py,sha256=BTBtqb2ZC_zxXdOJ59BQKUoGXZSpiaRG6VecjZGWCm0,3526
25
26
  biocypher/output/in_memory/_networkx.py,sha256=cSOSAreP7S3oeGT6noZ1kAIvSnkVnU3NUp1OY4yqzn0,1515
26
- biocypher/output/in_memory/_pandas.py,sha256=Ot2jbK5t_YLHqw0BUv9Z_qWNy9r6IX1LYEyejOSJzos,3288
27
+ biocypher/output/in_memory/_pandas.py,sha256=ndZcAAdsw38qZW3nWehcSxhpBGM8pXsn3DPoCcppI0U,1196
27
28
  biocypher/output/write/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
29
  biocypher/output/write/_batch_writer.py,sha256=_Dao7z4KN0Uhr86oOOWYEDrIUikR7T0v1SJC2Btd8Y4,38745
29
- biocypher/output/write/_get_writer.py,sha256=JozRWCMhvh65aQAlcGiiD5x3Nl1HSW8mK1Zf2nTSOzI,4385
30
+ biocypher/output/write/_get_writer.py,sha256=fXputhpt6K4mF8Ti6LFgwZIMU0GrK3aHkWIj1g4liwI,4469
30
31
  biocypher/output/write/_writer.py,sha256=y0dWI-RyQdrBLr9Fs91Y9KcCMjnlCaKJT0eWsIS2hG4,7158
31
32
  biocypher/output/write/graph/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ biocypher/output/write/graph/_airr.py,sha256=XWzcqwoMZKZ8n6f2Y77hv1ZSRyg2tOVqlzX133-biG8,1274
32
34
  biocypher/output/write/graph/_arangodb.py,sha256=xue3hm_DVB5pMR5qqfGXlXll3RpILA0tXos2J-as1-E,7906
33
35
  biocypher/output/write/graph/_neo4j.py,sha256=tBPhxn8JAmSS6KmiePofwr9LpGjHQH9BTnpHVK2ellM,12042
34
36
  biocypher/output/write/graph/_networkx.py,sha256=2WYkw5ZM3Bp236iwAxEAp3A1DxHKT4_hEPNMUKvPHp4,2320
@@ -38,7 +40,7 @@ biocypher/output/write/relational/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
38
40
  biocypher/output/write/relational/_csv.py,sha256=m0BSQXts88Qu5AEvoIgnwRz54ia38g4VN3PaA3LCYM8,2807
39
41
  biocypher/output/write/relational/_postgresql.py,sha256=75iJvv-0ewSsQXhVeoYoGnmYQnKY_B4iItZV7DpEBto,12190
40
42
  biocypher/output/write/relational/_sqlite.py,sha256=BuGWOeeNA83lbUvjpkzqcR9_baWLsbfmLXBKe4O1EPE,2105
41
- biocypher-0.9.7.dist-info/LICENSE,sha256=oejgxuxyjSnyPw3YPloz6-dCBB_nYizJ4jDQnr-xZUU,1082
42
- biocypher-0.9.7.dist-info/METADATA,sha256=uJ0y5nC0-_fSDjV9WTfVvMzGU7K0LunAkSmZrID1IYo,10621
43
- biocypher-0.9.7.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
44
- biocypher-0.9.7.dist-info/RECORD,,
43
+ biocypher-0.10.1.dist-info/LICENSE,sha256=oejgxuxyjSnyPw3YPloz6-dCBB_nYizJ4jDQnr-xZUU,1082
44
+ biocypher-0.10.1.dist-info/METADATA,sha256=Pxq7NgLmZcVn5zrge_QpB0xozDZWrtghWm2RMuOWZGU,10712
45
+ biocypher-0.10.1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
46
+ biocypher-0.10.1.dist-info/RECORD,,