biocypher 0.9.7__tar.gz → 0.10.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biocypher might be problematic. Click here for more details.

Files changed (47) hide show
  1. {biocypher-0.9.7 → biocypher-0.10.0}/PKG-INFO +3 -1
  2. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_metadata.py +1 -1
  3. biocypher-0.10.0/biocypher/output/in_memory/_airr.py +491 -0
  4. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/in_memory/_get_in_memory_kg.py +8 -6
  5. biocypher-0.9.7/biocypher/output/in_memory/_pandas.py → biocypher-0.10.0/biocypher/output/in_memory/_in_memory_kg.py +34 -32
  6. biocypher-0.10.0/biocypher/output/in_memory/_pandas.py +37 -0
  7. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/_get_writer.py +2 -0
  8. biocypher-0.10.0/biocypher/output/write/graph/_airr.py +32 -0
  9. {biocypher-0.9.7 → biocypher-0.10.0}/pyproject.toml +6 -2
  10. biocypher-0.9.7/biocypher/output/in_memory/_in_memory_kg.py +0 -40
  11. {biocypher-0.9.7 → biocypher-0.10.0}/LICENSE +0 -0
  12. {biocypher-0.9.7 → biocypher-0.10.0}/README.md +0 -0
  13. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/__init__.py +0 -0
  14. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/__init__.py +0 -0
  15. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/biocypher_config.yaml +0 -0
  16. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/test_config.yaml +0 -0
  17. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/test_schema_config.yaml +0 -0
  18. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/test_schema_config_disconnected.yaml +0 -0
  19. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_config/test_schema_config_extended.yaml +0 -0
  20. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_core.py +0 -0
  21. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_create.py +0 -0
  22. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_deduplicate.py +0 -0
  23. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_get.py +0 -0
  24. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_logger.py +0 -0
  25. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_mapping.py +0 -0
  26. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_misc.py +0 -0
  27. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_ontology.py +0 -0
  28. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/_translate.py +0 -0
  29. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/__init__.py +0 -0
  30. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/connect/__init__.py +0 -0
  31. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/connect/_get_connector.py +0 -0
  32. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/connect/_neo4j_driver.py +0 -0
  33. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/in_memory/__init__.py +0 -0
  34. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/in_memory/_networkx.py +0 -0
  35. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/__init__.py +0 -0
  36. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/_batch_writer.py +0 -0
  37. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/_writer.py +0 -0
  38. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/__init__.py +0 -0
  39. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/_arangodb.py +0 -0
  40. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/_neo4j.py +0 -0
  41. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/_networkx.py +0 -0
  42. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/_owl.py +0 -0
  43. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/graph/_rdf.py +0 -0
  44. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/relational/__init__.py +0 -0
  45. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/relational/_csv.py +0 -0
  46. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/relational/_postgresql.py +0 -0
  47. {biocypher-0.9.7 → biocypher-0.10.0}/biocypher/output/write/relational/_sqlite.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: biocypher
3
- Version: 0.9.7
3
+ Version: 0.10.0
4
4
  Summary: A unifying framework for biomedical research knowledge graphs
5
5
  Home-page: https://github.com/biocypher/biocypher
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Provides-Extra: scirpy
22
23
  Requires-Dist: PyYAML (>=5.0)
23
24
  Requires-Dist: appdirs
24
25
  Requires-Dist: more_itertools
@@ -27,6 +28,7 @@ Requires-Dist: networkx (>=3.0,<4.0)
27
28
  Requires-Dist: pandas (>=2.0.1,<3.0.0)
28
29
  Requires-Dist: pooch (>=1.7.0,<2.0.0)
29
30
  Requires-Dist: rdflib (>=6.2.0,<7.0.0)
31
+ Requires-Dist: scirpy (>=0.22.0,<0.23.0) ; extra == "scirpy"
30
32
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
31
33
  Requires-Dist: treelib (==1.6.4)
32
34
  Project-URL: Bug Tracker, https://github.com/biocypher/biocypher/issues
@@ -10,7 +10,7 @@ import pathlib
10
10
 
11
11
  import toml
12
12
 
13
- _VERSION = "0.9.7"
13
+ _VERSION = "0.10.0"
14
14
 
15
15
 
16
16
  def get_metadata():
@@ -0,0 +1,491 @@
1
+ from typing import Any, Optional
2
+
3
+ from biocypher._create import BioCypherEdge, BioCypherNode
4
+ from biocypher._deduplicate import Deduplicator
5
+ from biocypher._logger import logger
6
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
7
+
8
+ try:
9
+ from scirpy.io import AirrCell
10
+
11
+ HAS_SCIRPY = True
12
+ except ImportError:
13
+ HAS_SCIRPY = False
14
+
15
+
16
class AirrKG(_InMemoryKG):
    """Knowledge graph for AIRR (Adaptive Immune Receptor Repertoire) data.

    This class implements the AIRR data model for representing immune receptor
    sequences (antibodies and T cell receptors) and their annotations. To
    ensure proper conversion to AIRR format, your schema file should define
    immune receptor entities with property names that match the AIRR standards.

    Key property names in your schema for immune receptor entities:
        - locus: The gene locus (e.g., "TRA", "TRB", "IGH", "IGK", "IGL")
        - junction_aa: The amino acid sequence of the junction region (CDR3)
        - v_call: The V gene assignment
        - j_call: The J gene assignment
        - productive: Whether the sequence is productive

    Pairing strategies (``indirect_pairings`` argument of ``get_kg``), for the
    case where an epitope is linked to only ONE chain of a receptor pair:
        - ``True``: the epitope is attributed to the pair as a whole, i.e. a
          "paired" AIRR cell is created from the union of both chains'
          epitopes.
        - ``False``: a "paired" AIRR cell is created only for epitopes linked
          to BOTH chains; chain-specific epitopes yield "unpaired" cells.

    For a complete list of available fields and their descriptions, see:
    https://docs.airr-community.org/en/stable/datarep/rearrangements.html#fields

    All user-defined schema properties other than the internal ones listed in
    ``_INTERNAL_PROPERTIES`` are copied into the AIRR output.

    NOTE: every ``AirrCell`` annotation in this class is deliberately a string.
    ``scirpy`` is an optional dependency; unquoted annotations are evaluated at
    definition time and would raise ``NameError`` on import when it is missing.
    """

    # Constants for internal property filtering
    _INTERNAL_PROPERTIES = frozenset(["node_id", "node_label", "id", "preferred_id"])

    def __init__(
        self,
        deduplicator: Optional["Deduplicator"] = None,
        metadata_entity_type: str = "epitope",
    ) -> None:
        """Initialize AirrKG with configurable metadata node type.

        Args:
        ----
            deduplicator: Deduplicator instance; a fresh one is created if omitted
            metadata_entity_type: String specifying the metadata node type (default: "epitope")

        """
        super().__init__()
        self.deduplicator = deduplicator or Deduplicator()
        self.metadata_entity_type = metadata_entity_type

        # Raw nodes and edges grouped by entity type, plus the cached conversion
        self.adjacency_list = {}
        self.airr_cells = []
        # Pairing mode the cache was built with (None = nothing cached yet)
        self._cached_indirect_pairings = None

        # These will be populated when the entities are converted
        self.sequence_entity_types = {}
        self.chain_relationship_types = []
        self.chain_to_epitope_relationship_types = []

    def _check_dependencies(self) -> None:
        """Verify that scirpy is available.

        Raises
        ------
            ImportError: if ``scirpy`` could not be imported.

        """
        if not HAS_SCIRPY:
            msg = (
                "AirrCell module from scirpy not detected. "
                "Install it with 'poetry add biocypher[scirpy]' or 'poetry add scirpy'."
            )
            raise ImportError(msg)

    def get_kg(self, indirect_pairings: bool = True) -> "list[AirrCell]":
        """Convert the collected entities to AIRR format using AirrCell from scirpy.

        The result is cached. The cache is rebuilt when entities were added
        since the last call or when ``indirect_pairings`` differs from the
        value the cache was built with.

        Args:
        ----
            indirect_pairings: Boolean controlling pairing strategy (default: True)
                - True: an epitope linked to only one chain of a pair is
                  attributed to the "paired" AIRR cell
                - False: "paired" AIRR cells are created only for epitopes
                  linked to both chains

        Returns:
        -------
            list: List of generated AIRR cells

        Raises:
        ------
            ImportError: if ``scirpy`` is not installed.
            ValueError: if no nodes or edges have been added yet.

        """
        self._check_dependencies()
        # getattr keeps instances created without __init__ (e.g. unpickled) working
        cached_mode = getattr(self, "_cached_indirect_pairings", None)
        if not self.airr_cells or cached_mode != indirect_pairings:
            self.airr_cells = self._to_airr_cells(self.adjacency_list, indirect_pairings)
            self._cached_indirect_pairings = indirect_pairings
        return self.airr_cells

    def add_nodes(self, nodes: list[BioCypherNode]) -> None:
        """Add BioCypher nodes, organizing them by type."""
        self._add_to_entities_by_type(nodes)

    def add_edges(self, edges: list[BioCypherEdge]) -> None:
        """Add BioCypher edges, organizing them by type."""
        self._add_to_entities_by_type(edges)

    def _add_to_entities_by_type(self, entities: list[Any]) -> None:
        """Add all entities (both nodes and edges) to a common adj. list.

        Args:
        ----
            entities: Collection of BioCypherNode or BioCypherEdge objects

        """
        lists = self._separate_entity_types(entities)
        for _type, _entities in lists.items():
            self.adjacency_list.setdefault(_type, []).extend(_entities)
        # New data invalidates any previously converted cells. Rebind instead
        # of clearing so a list already handed out by get_kg is not mutated.
        self.airr_cells = []
        self._cached_indirect_pairings = None

    def _process_entities(self, entities: dict[str, list[Any]]) -> tuple[dict, dict, dict]:
        """Process entities and organize them into sequence nodes, metadata nodes, and receptor-epitope mappings.

        Also refreshes ``sequence_entity_types``, ``chain_relationship_types``
        and ``chain_to_epitope_relationship_types`` on the instance.

        Args:
        ----
            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects

        Returns:
        -------
            tuple: (sequence_nodes, metadata_nodes, receptor_epitope_mapping)

        """
        sequence_nodes = {}
        metadata_nodes = {}
        receptor_epitope_mapping = {}
        all_edge_types = set()

        for entity_type, entities_list in entities.items():
            if not entities_list:  # Skip empty lists
                continue

            # The first element decides whether the whole list is nodes or edges
            first = entities_list[0]
            if isinstance(first, BioCypherNode):
                by_id = {node.get_id(): node for node in entities_list}
                if entity_type == self.metadata_entity_type:
                    metadata_nodes.update(by_id)
                else:
                    sequence_nodes.update(by_id)
                    # e.g. "tra sequence" -> "TRA", later used as the chain locus
                    self.sequence_entity_types[entity_type] = entity_type.replace(" sequence", "").upper()
            elif isinstance(first, BioCypherEdge):
                all_edge_types.add(entity_type)

        # An edge type whose name mentions the metadata type links a chain to
        # metadata; every other edge type links two chains. Both sides are
        # lowercased so a capitalised metadata_entity_type still matches.
        needle = self.metadata_entity_type.lower()
        self.chain_relationship_types = [edge_type for edge_type in all_edge_types if needle not in edge_type.lower()]
        self.chain_to_epitope_relationship_types = [
            edge_type for edge_type in all_edge_types if needle in edge_type.lower()
        ]

        # Process chain-to-epitope relationships
        for entity_type in self.chain_to_epitope_relationship_types:
            self._update_receptor_epitope_mapping(entities[entity_type], receptor_epitope_mapping)

        return sequence_nodes, metadata_nodes, receptor_epitope_mapping

    def _update_receptor_epitope_mapping(self, edges: list[BioCypherEdge], mapping: dict[str, set]) -> None:
        """Update receptor-epitope mapping with new edges.

        Args:
        ----
            edges: List of edges to process (source = chain, target = metadata node)
            mapping: Dictionary to update in place; maps chain ID to a set of metadata IDs

        """
        for edge in edges:
            mapping.setdefault(edge.get_source_id(), set()).add(edge.get_target_id())

    def _cells_for_metadata(
        self,
        cell_id: str,
        source_node: Optional[BioCypherNode],
        target_node: Optional[BioCypherNode],
        metadata_ids: set[str],
        metadata_nodes: dict[str, BioCypherNode],
        receptor_epitope_mapping: dict[str, set],
        paired: bool,
    ) -> "list[AirrCell]":
        """Build the cells for one chain (pair); empty list if no metadata ID resolves to a known node."""
        resolved = self._get_metadata_nodes(metadata_ids, metadata_nodes)
        if not resolved:
            return []
        return self._generate_airr_cell(
            cell_id=cell_id,
            source_node=source_node,
            target_node=target_node,
            metadata_nodes=resolved,
            paired=paired,
            receptor_epitope_mapping=receptor_epitope_mapping,
        )

    def _process_paired_chains(
        self,
        entities: dict[str, list[Any]],
        sequence_nodes: dict[str, BioCypherNode],
        metadata_nodes: dict[str, BioCypherNode],
        receptor_epitope_mapping: dict[str, set],
        indirect_pairings: bool = True,
    ) -> "tuple[list[AirrCell], set[str], int]":
        """Process paired chains and generate AIRR cells.

        Args:
        ----
            entities: Dictionary of all entities
            sequence_nodes: Dictionary of sequence nodes
            metadata_nodes: Dictionary of metadata nodes
            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
            indirect_pairings: Boolean controlling pairing strategy

        Returns:
        -------
            tuple: (list of generated cells, set of processed chain IDs, count of cells with multiple epitopes)

        """
        airr_cells = []
        processed_chains = set()
        n_metacells = 0

        for entity_type, edges in entities.items():
            if entity_type not in self.chain_relationship_types:
                continue

            for edge in edges:
                source_id, target_id = edge.get_source_id(), edge.get_target_id()
                processed_chains.update([source_id, target_id])

                source_node = sequence_nodes.get(source_id)
                target_node = sequence_nodes.get(target_id)
                source_metadata = receptor_epitope_mapping.get(source_id, set())
                target_metadata = receptor_epitope_mapping.get(target_id, set())

                # Each request: (cell_id, first chain, second chain, metadata IDs, paired)
                if indirect_pairings:
                    # Union: create paired cell if either chain binds epitopes
                    requests = [
                        (edge.get_id(), source_node, target_node, source_metadata | target_metadata, True),
                    ]
                else:
                    requests = [
                        # Intersection: paired cell only for epitopes bound by both chains
                        (edge.get_id(), source_node, target_node, source_metadata & target_metadata, True),
                        # Unpaired cells for epitopes unique to one of the chains
                        (f"unpaired_{source_id}", source_node, None, source_metadata - target_metadata, False),
                        (f"unpaired_{target_id}", target_node, None, target_metadata - source_metadata, False),
                    ]

                for cell_id, first, second, metadata_ids, paired in requests:
                    cells = self._cells_for_metadata(
                        cell_id,
                        first,
                        second,
                        metadata_ids,
                        metadata_nodes,
                        receptor_epitope_mapping,
                        paired,
                    )
                    airr_cells.extend(cells)
                    # More than one cell means the chain(s) matched several epitopes
                    if len(cells) > 1:
                        n_metacells += 1

        return airr_cells, processed_chains, n_metacells

    def _process_unpaired_chains(
        self,
        receptor_epitope_mapping: dict[str, set],
        sequence_nodes: dict[str, BioCypherNode],
        metadata_nodes: dict[str, BioCypherNode],
        processed_chains: set[str],
    ) -> "tuple[list[AirrCell], int]":
        """Process unpaired chains and generate AIRR cells.

        Args:
        ----
            receptor_epitope_mapping: Dictionary of receptor-epitope mappings
            sequence_nodes: Dictionary of sequence nodes
            metadata_nodes: Dictionary of metadata nodes
            processed_chains: Set of already processed chain IDs

        Returns:
        -------
            tuple: (List of generated cells, count of cells with multiple epitopes)

        """
        airr_cells = []
        n_metacells = 0

        for chain_id, metadata_ids in receptor_epitope_mapping.items():
            if chain_id in processed_chains:
                continue

            cells = self._cells_for_metadata(
                f"unpaired_{chain_id}",
                sequence_nodes.get(chain_id),
                None,
                metadata_ids,
                metadata_nodes,
                receptor_epitope_mapping,
                False,
            )
            airr_cells.extend(cells)
            # Check if multiple cells were generated (indicating multiple epitopes)
            if len(cells) > 1:
                n_metacells += 1

        return airr_cells, n_metacells

    def _to_airr_cells(self, entities: dict[str, list[Any]], indirect_pairings: bool = True) -> "list[AirrCell]":
        """Convert BioCypher entities to AIRR cells using configurable mappings.

        Args:
        ----
            entities: Dictionary mapping entity types to lists of BioCypherNode/BioCypherEdge objects
            indirect_pairings: Boolean controlling pairing strategy (default: True)
                - True: an epitope linked to only one chain of a pair is
                  attributed to the "paired" AIRR cell
                - False: "paired" AIRR cells are created only for epitopes
                  linked to both chains

        Returns:
        -------
            list: List of generated AIRR cells

        Raises:
        ------
            ValueError: if ``entities`` is empty.

        """
        if not entities:
            msg = "No entities provided for conversion."
            raise ValueError(msg)

        logger.info("Starting conversion to AIRR cells")

        # Process all entities
        sequence_nodes, metadata_nodes, receptor_epitope_mapping = self._process_entities(entities)

        # Process paired chains
        airr_cells, processed_chains, paired_metacells = self._process_paired_chains(
            entities,
            sequence_nodes,
            metadata_nodes,
            receptor_epitope_mapping,
            indirect_pairings,
        )

        # Process chains that are not part of any chain-to-chain relationship
        unpaired_cells, unpaired_metacells = self._process_unpaired_chains(
            receptor_epitope_mapping,
            sequence_nodes,
            metadata_nodes,
            processed_chains,
        )
        airr_cells.extend(unpaired_cells)

        # Calculate total cells with multiple epitopes
        total_metacells = paired_metacells + unpaired_metacells

        # Log information about cells
        logger.info(f"Generated total of {len(airr_cells)} AIRR cells")
        if total_metacells > 0:
            logger.info(f"{total_metacells} cells with more than 1 epitope were detected")

        return airr_cells

    def _get_metadata_nodes(
        self,
        metadata_ids: set[str],
        metadata_nodes: dict[str, BioCypherNode],
    ) -> list[BioCypherNode]:
        """Get metadata nodes for a set of metadata IDs.

        IDs without a matching node are skipped. The IDs are visited in sorted
        order so the ``_meta{n}`` cell-ID suffixes derived from the result are
        reproducible between runs (set iteration order is not).

        Args:
        ----
            metadata_ids: Set of metadata IDs
            metadata_nodes: Dictionary of metadata nodes

        Returns:
        -------
            list: List of metadata nodes

        """
        return [metadata_nodes[ep_id] for ep_id in sorted(metadata_ids, key=str) if ep_id in metadata_nodes]

    def _generate_airr_cell(
        self,
        cell_id: str,
        source_node: Optional[BioCypherNode],
        target_node: Optional[BioCypherNode],
        metadata_nodes: list[BioCypherNode],
        paired: bool,
        receptor_epitope_mapping: Optional[dict[str, set]] = None,
    ) -> "list[AirrCell]":
        """Build an AIRR cell from up to two chain nodes and attach metadata.

        Args:
        ----
            cell_id: Identifier of the cell to create
            source_node: First chain node, or None
            target_node: Second chain node, or None
            metadata_nodes: Metadata nodes to attach (one resulting cell per node)
            paired: Whether the cell represents a chain pair
            receptor_epitope_mapping: Chain ID -> metadata IDs; used to set ``validated_epitope``

        Returns:
        -------
            list: The cell(s) produced by ``add_metadata``

        """
        cell = AirrCell(cell_id=cell_id)

        # Process both chains
        for node in (source_node, target_node):
            if not node:  # Skip if node is None
                continue

            chain = AirrCell.empty_chain_dict()

            # Add all properties except internal ones
            for key, value in node.get_properties().items():
                if key not in self._INTERNAL_PROPERTIES:
                    chain[key] = value

            # Add locus based on node type
            # NOTE(review): these three assignments overwrite same-named node
            # properties ("locus", "consensus_count", "productive") copied
            # above - confirm this is intended.
            chain["locus"] = self.sequence_entity_types.get(node.get_label(), node.get_label())
            chain["consensus_count"] = 0
            chain["productive"] = True

            # Flag whether this particular chain has at least one epitope link
            linked = receptor_epitope_mapping.get(node.get_id()) if receptor_epitope_mapping else None
            chain["validated_epitope"] = bool(linked)

            cell.add_chain(chain)

        # Add metadata
        return self.add_metadata(metadata_nodes, cell, paired)

    def add_metadata(self, metadata_nodes: list[BioCypherNode], cell: "AirrCell", paired: bool) -> "list[AirrCell]":
        """Add metadata from nodes to cell(s) and return a list of cells.

        The first metadata node is written onto ``cell`` itself. Every further
        node gets its own cell named ``{cell_id}_meta{n}`` holding copies of
        the chains of ``cell``.

        Args:
        ----
            metadata_nodes: List of metadata nodes to add
            cell: Base cell to add metadata to
            paired: Whether the cell is paired

        Returns:
        -------
            List of cells with metadata added

        """
        if not metadata_nodes:
            cell["data_source"] = "BioCypher"
            cell["is_paired"] = paired
            return [cell]

        cells = []
        for i, node in enumerate(metadata_nodes):
            if i == 0:
                meta_cell = cell
            else:
                # Create a new AirrCell for each additional metadata node
                meta_cell = AirrCell(cell_id=f"{cell.cell_id}_meta{i+1}")
                for chain in cell.chains:
                    # Copy so the cells do not share mutable chain dicts
                    meta_cell.add_chain(dict(chain))

            for key, value in node.get_properties().items():
                if key not in self._INTERNAL_PROPERTIES:
                    meta_cell[key] = value

            meta_cell["data_source"] = "BioCypher"
            meta_cell["is_paired"] = paired

            cells.append(meta_cell)
        return cells
@@ -8,6 +8,7 @@ from __future__ import annotations
8
8
  from typing import TYPE_CHECKING
9
9
 
10
10
  from biocypher._logger import logger
11
+ from biocypher.output.in_memory._airr import AirrKG
11
12
  from biocypher.output.in_memory._networkx import NetworkxKG
12
13
  from biocypher.output.in_memory._pandas import PandasKG
13
14
 
@@ -19,7 +20,7 @@ logger.debug(f"Loading module {__name__}.")
19
20
 
20
21
  __all__ = ["get_in_memory_kg"]
21
22
 
22
- IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx"]
23
+ IN_MEMORY_DBMS = ["csv", "pandas", "tabular", "networkx", "airr"]
23
24
 
24
25
 
25
26
  def get_in_memory_kg(
@@ -35,10 +36,11 @@ def get_in_memory_kg(
35
36
  """
36
37
  if dbms in ["csv", "pandas", "tabular"]:
37
38
  return PandasKG(deduplicator)
38
-
39
39
  if dbms == "networkx":
40
40
  return NetworkxKG(deduplicator)
41
-
42
- msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
43
- logger.error(msg)
44
- raise NotImplementedError(msg)
41
+ elif dbms == "airr":
42
+ return AirrKG(deduplicator)
43
+ else:
44
+ msg = f"Getting the in memory BioCypher KG is not supported for the DBMS {dbms}. Supported: {IN_MEMORY_DBMS}."
45
+ logger.error(msg)
46
+ raise NotImplementedError(msg)
@@ -1,24 +1,46 @@
1
- import pandas as pd
1
+ from abc import ABC, abstractmethod
2
2
 
3
3
  from biocypher._create import BioCypherEdge, BioCypherNode, BioCypherRelAsNode
4
- from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
5
4
 
6
5
 
7
- class PandasKG(_InMemoryKG):
8
- def __init__(self, deduplicator):
9
- super().__init__() # keeping in spite of ABC not having __init__
10
- self.deduplicator = deduplicator
6
+ class _InMemoryKG(ABC):
7
+ """Abstract class for handling the in-memory Knowledge Graph instance.
8
+ Specifics of the different in-memory implementations (e.g. csv, networkx)
9
+ are implemented in the child classes. Any concrete in-memory implementation
10
+ needs to implement at least:
11
+ - add_nodes
12
+ - add_edges
13
+ - get_kg
14
+ - _separate_entity_types
11
15
 
12
- self.dfs = {}
13
-
14
- def get_kg(self):
15
- return self.dfs
16
+ Raises:
17
+ NotImplementedError: InMemoryKG implementation must override 'add_nodes'
18
+ NotImplementedError: InMemoryKG implementation must override 'add_edges'
19
+ NotImplementedError: InMemoryKG implementation must override 'get_kg'
20
+ """
16
21
 
22
+ @abstractmethod
17
23
  def add_nodes(self, nodes):
18
- self.add_tables(nodes)
24
+ """Add nodes to the in-memory knowledge graph.
25
+
26
+ Args:
27
+ nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
28
+ """
29
+ raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
19
30
 
31
+ @abstractmethod
20
32
  def add_edges(self, edges):
21
- self.add_tables(edges)
33
+ """Add edges to the in-memory knowledge graph.
34
+
35
+ Args:
36
+ edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
37
+ """
38
+ raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
39
+
40
+ @abstractmethod
41
+ def get_kg(self):
42
+ """Return the in-memory knowledge graph."""
43
+ raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
22
44
 
23
45
  def _separate_entity_types(self, entities):
24
46
  """
@@ -73,23 +95,3 @@ class PandasKG(_InMemoryKG):
73
95
  lists[_type].append(entity)
74
96
 
75
97
  return lists
76
-
77
- def add_tables(self, entities):
78
- """
79
- Add Pandas dataframes for each node and edge type in the input.
80
- """
81
-
82
- lists = self._separate_entity_types(entities)
83
-
84
- for _type, _entities in lists.items():
85
- self._add_entity_df(_type, _entities)
86
-
87
- def _add_entity_df(self, _type, _entities):
88
- df = pd.DataFrame(pd.json_normalize([node.get_dict() for node in _entities]))
89
- # replace "properties." with "" in column names
90
- df.columns = [col.replace("properties.", "") for col in df.columns]
91
- if _type not in self.dfs:
92
- self.dfs[_type] = df
93
- else:
94
- self.dfs[_type] = pd.concat([self.dfs[_type], df], ignore_index=True)
95
- return self.dfs[_type]
@@ -0,0 +1,37 @@
1
+ import pandas as pd
2
+
3
+ from biocypher.output.in_memory._in_memory_kg import _InMemoryKG
4
+
5
+
6
class PandasKG(_InMemoryKG):
    """In-memory knowledge graph kept as one pandas DataFrame per entity type."""

    def __init__(self, deduplicator):
        """Store the deduplicator and start with an empty table registry."""
        super().__init__()  # keeping in spite of ABC not having __init__
        self.dfs = {}
        self.deduplicator = deduplicator

    def get_kg(self):
        """Return the dict mapping entity type to its DataFrame."""
        return self.dfs

    def add_nodes(self, nodes):
        """Add nodes to the tables of their respective types."""
        self.add_tables(nodes)

    def add_edges(self, edges):
        """Add edges to the tables of their respective types."""
        self.add_tables(edges)

    def add_tables(self, entities):
        """Add Pandas dataframes for each node and edge type in the input."""
        grouped = self._separate_entity_types(entities)
        for entity_type in grouped:
            self._add_entity_df(entity_type, grouped[entity_type])

    def _add_entity_df(self, _type, _entities):
        """Create or extend the DataFrame for ``_type`` and return it.

        Each entity contributes one row built from its ``get_dict()`` output;
        the text "properties." is removed from the resulting column names.
        """
        records = [entity.get_dict() for entity in _entities]
        frame = pd.DataFrame(pd.json_normalize(records))
        # replace "properties." with "" in column names
        frame.columns = [column.replace("properties.", "") for column in frame.columns]

        if _type in self.dfs:
            # Append to the existing table, renumbering the index
            frame = pd.concat([self.dfs[_type], frame], ignore_index=True)
        self.dfs[_type] = frame
        return self.dfs[_type]
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING
9
9
  from biocypher._config import config as _config
10
10
  from biocypher._logger import logger
11
11
  from biocypher.output.write._batch_writer import _BatchWriter
12
+ from biocypher.output.write.graph._airr import _AirrWriter
12
13
  from biocypher.output.write.graph._arangodb import _ArangoDBBatchWriter
13
14
  from biocypher.output.write.graph._neo4j import _Neo4jBatchWriter
14
15
  from biocypher.output.write.graph._networkx import _NetworkXWriter
@@ -50,6 +51,7 @@ DBMS_TO_CLASS = {
50
51
  "Tabular": _PandasCSVWriter,
51
52
  "networkx": _NetworkXWriter,
52
53
  "NetworkX": _NetworkXWriter,
54
+ "airr": _AirrWriter,
53
55
  }
54
56
 
55
57
 
@@ -0,0 +1,32 @@
1
+ """Module to provide the placeholder AIRR writer class for BioCypher."""
2
+
3
+ from biocypher._logger import logger
4
+ from biocypher.output.write._writer import _Writer
5
+
6
+
7
class _AirrWriter(_Writer):
    """No-op writer registered for the "airr" output format.

    It implements the methods required of a writer but performs no actual
    writing operations, since there is an existing anndata native writer
    functionality.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base writer and log the initialization."""
        super().__init__(*args, **kwargs)
        logger.info("Placeholder writer initialized")

    def _write_node_data(self, nodes) -> bool:
        """Accept nodes without writing them; always reports success."""
        logger.info("Placeholder: Node data received but not processed")
        return True

    def _write_edge_data(self, edges) -> bool:
        """Accept edges without writing them; always reports success."""
        logger.info("Placeholder: Edge data received but not processed")
        return True

    def _construct_import_call(self) -> str:
        """Return a placeholder import script."""
        script_lines = (
            "# This is a placeholder import script",
            "print('No actual import functionality implemented')",
        )
        return "\n".join(script_lines)

    def _get_import_script_name(self) -> str:
        """Return a placeholder script name."""
        script_name = "placeholder_import.py"
        return script_name
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "biocypher"
3
- version = "0.9.7"
3
+ version = "0.10.0"
4
4
  description = "A unifying framework for biomedical research knowledge graphs"
5
5
  authors = [
6
6
  "Sebastian Lobentanzer <sebastian.lobentanzer@gmail.com>",
@@ -44,6 +44,10 @@ neo4j-utils = "0.0.7"
44
44
  pandas = "^2.0.1"
45
45
  pooch = "^1.7.0"
46
46
  tqdm = "^4.65.0"
47
+ scirpy = { version = "^0.22.0", optional = true }
48
+
49
+ [tool.poetry.extras]
50
+ scirpy = ["scirpy"]
47
51
 
48
52
  [tool.poetry.group.dev.dependencies]
49
53
  mkdocs-material = "^9.5"
@@ -59,7 +63,7 @@ hypothesis = "^6.50.1"
59
63
  ipython = "^8.7.0"
60
64
  ipykernel = "^6.23.1"
61
65
  coverage-badge = "^1.1.0"
62
- nbsphinx = "^0.9.7"
66
+ nbsphinx = "^0.9.6"
63
67
  ruff = "^0.2.0"
64
68
  mike = "^2.1.3"
65
69
 
@@ -1,40 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class _InMemoryKG(ABC):
5
- """Abstract class for handling the in-memory Knowledge Graph instance.
6
- Specifics of the different in-memory implementations (e.g. csv, networkx)
7
- are implemented in the child classes. Any concrete in-memory implementation
8
- needs to implement at least:
9
- - add_nodes
10
- - add_edges
11
- - get_kg
12
-
13
- Raises:
14
- NotImplementedError: InMemoryKG implementation must override 'add_nodes'
15
- NotImplementedError: InMemoryKG implementation must override 'add_edges'
16
- NotImplementedError: InMemoryKG implementation must override 'get_kg'
17
- """
18
-
19
- @abstractmethod
20
- def add_nodes(self, nodes):
21
- """Add nodes to the in-memory knowledge graph.
22
-
23
- Args:
24
- nodes (Iterable[BioCypherNode]): Iterable of BioCypherNode objects.
25
- """
26
- raise NotImplementedError("InMemoryKG implementation must override 'add_nodes'")
27
-
28
- @abstractmethod
29
- def add_edges(self, edges):
30
- """Add edges to the in-memory knowledge graph.
31
-
32
- Args:
33
- edges (Iterable[BioCypherEdge]): Iterable of BioCypherEdge objects.
34
- """
35
- raise NotImplementedError("InMemoryKG implementation must override 'add_edges'")
36
-
37
- @abstractmethod
38
- def get_kg(self):
39
- """Return the in-memory knowledge graph."""
40
- raise NotImplementedError("InMemoryKG implementation must override 'get_kg'")
File without changes
File without changes
File without changes
File without changes
File without changes