napistu 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff shows the changes between publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -6,19 +6,79 @@ import numpy as np
6
6
  import pandas as pd
7
7
  import pytest
8
8
  from napistu import sbml_dfs_core
9
+ from napistu.source import Source
9
10
  from napistu.ingestion import sbml
10
11
  from napistu.modify import pathwayannot
11
12
 
12
13
  from napistu import identifiers as napistu_identifiers
13
14
  from napistu.constants import (
14
15
  SBML_DFS,
15
- SBOTERM_NAMES,
16
16
  BQB_DEFINING_ATTRS,
17
17
  BQB_DEFINING_ATTRS_LOOSE,
18
18
  BQB,
19
- IDENTIFIERS,
20
19
  )
21
20
  from napistu.sbml_dfs_core import SBML_dfs
21
+ from unittest.mock import patch
22
+
23
+
24
+ @pytest.fixture
25
+ def test_data():
26
+ """Create test data for SBML integration tests."""
27
+
28
+ # Test compartments
29
+ compartments_df = pd.DataFrame(
30
+ [
31
+ {"c_name": "nucleus", "c_Identifiers": None},
32
+ {"c_name": "cytoplasm", "c_Identifiers": None},
33
+ ]
34
+ )
35
+
36
+ # Test species with extra data
37
+ species_df = pd.DataFrame(
38
+ [
39
+ {
40
+ "s_name": "TP53",
41
+ "s_Identifiers": None,
42
+ "gene_type": "tumor_suppressor",
43
+ },
44
+ {"s_name": "MDM2", "s_Identifiers": None, "gene_type": "oncogene"},
45
+ {
46
+ "s_name": "CDKN1A",
47
+ "s_Identifiers": None,
48
+ "gene_type": "cell_cycle",
49
+ },
50
+ ]
51
+ )
52
+
53
+ # Test interactions with extra data
54
+ interaction_edgelist = pd.DataFrame(
55
+ [
56
+ {
57
+ "upstream_name": "TP53",
58
+ "downstream_name": "CDKN1A",
59
+ "upstream_compartment": "nucleus",
60
+ "downstream_compartment": "nucleus",
61
+ "r_name": "TP53_activates_CDKN1A",
62
+ "sbo_term": "SBO:0000459",
63
+ "r_Identifiers": None,
64
+ "r_isreversible": False,
65
+ "confidence": 0.95,
66
+ },
67
+ {
68
+ "upstream_name": "MDM2",
69
+ "downstream_name": "TP53",
70
+ "upstream_compartment": "cytoplasm",
71
+ "downstream_compartment": "nucleus",
72
+ "r_name": "MDM2_inhibits_TP53",
73
+ "sbo_term": "SBO:0000020",
74
+ "r_Identifiers": None,
75
+ "r_isreversible": False,
76
+ "confidence": 0.87,
77
+ },
78
+ ]
79
+ )
80
+
81
+ return [interaction_edgelist, species_df, compartments_df, Source(init=True)]
22
82
 
23
83
 
24
84
  def test_drop_cofactors(sbml_dfs):
@@ -212,26 +272,6 @@ def test_sbml_dfs_remove_reactions_check_species(sbml_dfs):
212
272
  sbml_dfs.validate()
213
273
 
214
274
 
215
- def test_formula(sbml_dfs):
216
- # create a formula string
217
-
218
- an_r_id = sbml_dfs.reactions.index[0]
219
-
220
- reaction_species_df = sbml_dfs.reaction_species[
221
- sbml_dfs.reaction_species["r_id"] == an_r_id
222
- ].merge(sbml_dfs.compartmentalized_species, left_on="sc_id", right_index=True)
223
-
224
- formula_str = sbml_dfs_core.construct_formula_string(
225
- reaction_species_df, sbml_dfs.reactions, name_var="sc_name"
226
- )
227
-
228
- assert isinstance(formula_str, str)
229
- assert (
230
- formula_str
231
- == "CO2 [extracellular region] -> CO2 [cytosol] ---- modifiers: AQP1 tetramer [plasma membrane]]"
232
- )
233
-
234
-
235
275
  def test_read_sbml_with_invalid_ids():
236
276
  SBML_W_BAD_IDS = "R-HSA-166658.sbml"
237
277
  test_path = os.path.abspath(os.path.join(__file__, os.pardir))
@@ -243,17 +283,6 @@ def test_read_sbml_with_invalid_ids():
243
283
  assert isinstance(sbml_dfs_core.SBML_dfs(sbml_w_bad_ids), sbml_dfs_core.SBML_dfs)
244
284
 
245
285
 
246
- def test_stubbed_compartment():
247
- compartment = sbml_dfs_core._stub_compartments()
248
-
249
- assert compartment["c_Identifiers"].iloc[0].ids[0] == {
250
- "ontology": "go",
251
- "identifier": "GO:0005575",
252
- "url": "https://www.ebi.ac.uk/QuickGO/term/GO:0005575",
253
- "bqb": "BQB_IS",
254
- }
255
-
256
-
257
286
  def test_get_table(sbml_dfs):
258
287
  assert isinstance(sbml_dfs.get_table("species"), pd.DataFrame)
259
288
  assert isinstance(sbml_dfs.get_table("species", {"id"}), pd.DataFrame)
@@ -304,10 +333,20 @@ def test_species_status(sbml_dfs):
304
333
  select_species = species[species["s_name"] == "OxyHbA"]
305
334
  assert select_species.shape[0] == 1
306
335
 
307
- status = sbml_dfs_core.species_status(select_species.index[0], sbml_dfs)
336
+ status = sbml_dfs.species_status(select_species.index[0])
337
+
338
+ # expected columns
339
+ expected_columns = [
340
+ SBML_DFS.SC_NAME,
341
+ SBML_DFS.STOICHIOMETRY,
342
+ SBML_DFS.R_NAME,
343
+ "r_formula_str",
344
+ ]
345
+ assert all(col in status.columns for col in expected_columns)
346
+
308
347
  assert (
309
348
  status["r_formula_str"][0]
310
- == "4.0 H+ + OxyHbA + 4.0 CO2 -> 4.0 O2 + Protonated Carbamino DeoxyHbA [cytosol]"
349
+ == "cytosol: 4.0 CO2 + 4.0 H+ + OxyHbA -> 4.0 O2 + Protonated Carbamino DeoxyHbA"
311
350
  )
312
351
 
313
352
 
@@ -374,91 +413,6 @@ def test_get_identifiers_handles_missing_values():
374
413
  ), "Only Identifiers objects should be returned."
375
414
 
376
415
 
377
- def test_find_underspecified_reactions():
378
-
379
- reaction_w_regulators = pd.DataFrame(
380
- {
381
- SBML_DFS.SC_ID: ["A", "B", "C", "D", "E", "F", "G"],
382
- SBML_DFS.STOICHIOMETRY: [-1, -1, 1, 1, 0, 0, 0],
383
- SBML_DFS.SBO_TERM: [
384
- SBOTERM_NAMES.REACTANT,
385
- SBOTERM_NAMES.REACTANT,
386
- SBOTERM_NAMES.PRODUCT,
387
- SBOTERM_NAMES.PRODUCT,
388
- SBOTERM_NAMES.CATALYST,
389
- SBOTERM_NAMES.CATALYST,
390
- SBOTERM_NAMES.STIMULATOR,
391
- ],
392
- }
393
- ).assign(r_id="bar")
394
- reaction_w_regulators[SBML_DFS.RSC_ID] = [
395
- f"rsc_{i}" for i in range(len(reaction_w_regulators))
396
- ]
397
- reaction_w_regulators.set_index(SBML_DFS.RSC_ID, inplace=True)
398
- reaction_w_regulators = sbml_dfs_core.add_sbo_role(reaction_w_regulators)
399
-
400
- reaction_w_interactors = pd.DataFrame(
401
- {
402
- SBML_DFS.SC_ID: ["A", "B"],
403
- SBML_DFS.STOICHIOMETRY: [-1, 1],
404
- SBML_DFS.SBO_TERM: [SBOTERM_NAMES.REACTANT, SBOTERM_NAMES.REACTANT],
405
- }
406
- ).assign(r_id="baz")
407
- reaction_w_interactors[SBML_DFS.RSC_ID] = [
408
- f"rsc_{i}" for i in range(len(reaction_w_interactors))
409
- ]
410
- reaction_w_interactors.set_index(SBML_DFS.RSC_ID, inplace=True)
411
- reaction_w_interactors = sbml_dfs_core.add_sbo_role(reaction_w_interactors)
412
-
413
- working_reactions = reaction_w_regulators.copy()
414
- working_reactions["new"] = True
415
- working_reactions.loc["rsc_0", "new"] = False
416
- working_reactions
417
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
418
- assert result == {"bar"}
419
-
420
- # missing one enzyme -> operable
421
- working_reactions = reaction_w_regulators.copy()
422
- working_reactions["new"] = True
423
- working_reactions.loc["rsc_4", "new"] = False
424
- working_reactions
425
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
426
- assert result == set()
427
-
428
- # missing one product -> inoperable
429
- working_reactions = reaction_w_regulators.copy()
430
- working_reactions["new"] = True
431
- working_reactions.loc["rsc_2", "new"] = False
432
- working_reactions
433
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
434
- assert result == {"bar"}
435
-
436
- # missing all enzymes -> inoperable
437
- working_reactions = reaction_w_regulators.copy()
438
- working_reactions["new"] = True
439
- working_reactions.loc["rsc_4", "new"] = False
440
- working_reactions.loc["rsc_5", "new"] = False
441
- working_reactions
442
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
443
- assert result == {"bar"}
444
-
445
- # missing regulators -> operable
446
- working_reactions = reaction_w_regulators.copy()
447
- working_reactions["new"] = True
448
- working_reactions.loc["rsc_6", "new"] = False
449
- working_reactions
450
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
451
- assert result == set()
452
-
453
- # remove an interactor
454
- working_reactions = reaction_w_interactors.copy()
455
- working_reactions["new"] = True
456
- working_reactions.loc["rsc_0", "new"] = False
457
- working_reactions
458
- result = sbml_dfs_core.find_underspecified_reactions(working_reactions)
459
- assert result == {"baz"}
460
-
461
-
462
416
  def test_remove_entity_data_success(sbml_dfs_w_data):
463
417
  """Test successful removal of entity data."""
464
418
  # Get initial data
@@ -502,82 +456,158 @@ def test_remove_entity_data_nonexistent(sbml_dfs_w_data, caplog):
502
456
  sbml_dfs_w_data.validate()
503
457
 
504
458
 
505
- def test_filter_to_characteristic_species_ids():
506
-
507
- species_ids_dict = {
508
- SBML_DFS.S_ID: ["large_complex"] * 6
509
- + ["small_complex"] * 2
510
- + ["proteinA", "proteinB"]
511
- + ["proteinC"] * 3
512
- + [
513
- "promiscuous_complexA",
514
- "promiscuous_complexB",
515
- "promiscuous_complexC",
516
- "promiscuous_complexD",
517
- "promiscuous_complexE",
518
- ],
519
- IDENTIFIERS.ONTOLOGY: ["complexportal"]
520
- + ["HGNC"] * 7
521
- + ["GO"] * 2
522
- + ["ENSG", "ENSP", "pubmed"]
523
- + ["HGNC"] * 5,
524
- IDENTIFIERS.IDENTIFIER: [
525
- "CPX-BIG",
526
- "mem1",
527
- "mem2",
528
- "mem3",
529
- "mem4",
530
- "mem5",
531
- "part1",
532
- "part2",
533
- "GO:1",
534
- "GO:2",
535
- "dna_seq",
536
- "protein_seq",
537
- "my_cool_pub",
538
- ]
539
- + ["promiscuous_complex"] * 5,
540
- IDENTIFIERS.BQB: [BQB.IS]
541
- + [BQB.HAS_PART] * 7
542
- + [BQB.IS] * 2
543
- + [
544
- # these are retained if BQB_DEFINING_ATTRS_LOOSE is used
545
- BQB.ENCODES,
546
- BQB.IS_ENCODED_BY,
547
- # this should always be removed
548
- BQB.IS_DESCRIBED_BY,
549
- ]
550
- + [BQB.HAS_PART] * 5,
459
+ def test_get_characteristic_species_ids():
460
+ """
461
+ Test get_characteristic_species_ids function with both dogmatic and non-dogmatic cases.
462
+ """
463
+ # Create mock species identifiers data
464
+ mock_species_ids = pd.DataFrame(
465
+ {
466
+ "s_id": ["s1", "s2", "s3", "s4", "s5"],
467
+ "identifier": ["P12345", "CHEBI:15377", "GO:12345", "P67890", "P67890"],
468
+ "ontology": ["uniprot", "chebi", "go", "uniprot", "chebi"],
469
+ "bqb": [
470
+ "BQB_IS",
471
+ "BQB_IS",
472
+ "BQB_HAS_PART",
473
+ "BQB_HAS_VERSION",
474
+ "BQB_ENCODES",
475
+ ],
476
+ }
477
+ )
478
+
479
+ # Create minimal required tables for SBML_dfs
480
+ compartments = pd.DataFrame(
481
+ {"c_name": ["cytosol"], "c_Identifiers": [None]}, index=["C1"]
482
+ )
483
+ compartments.index.name = "c_id"
484
+ species = pd.DataFrame(
485
+ {"s_name": ["A"], "s_Identifiers": [None], "s_source": [None]}, index=["s1"]
486
+ )
487
+ species.index.name = "s_id"
488
+ compartmentalized_species = pd.DataFrame(
489
+ {
490
+ "sc_name": ["A [cytosol]"],
491
+ "s_id": ["s1"],
492
+ "c_id": ["C1"],
493
+ "sc_source": [None],
494
+ },
495
+ index=["SC1"],
496
+ )
497
+ compartmentalized_species.index.name = "sc_id"
498
+ reactions = pd.DataFrame(
499
+ {
500
+ "r_name": ["rxn1"],
501
+ "r_Identifiers": [None],
502
+ "r_source": [None],
503
+ "r_isreversible": [False],
504
+ },
505
+ index=["R1"],
506
+ )
507
+ reactions.index.name = "r_id"
508
+ reaction_species = pd.DataFrame(
509
+ {
510
+ "r_id": ["R1"],
511
+ "sc_id": ["SC1"],
512
+ "stoichiometry": [1],
513
+ "sbo_term": ["SBO:0000459"],
514
+ },
515
+ index=["RSC1"],
516
+ )
517
+ reaction_species.index.name = "rsc_id"
518
+
519
+ sbml_dict = {
520
+ "compartments": compartments,
521
+ "species": species,
522
+ "compartmentalized_species": compartmentalized_species,
523
+ "reactions": reactions,
524
+ "reaction_species": reaction_species,
551
525
  }
526
+ sbml_dfs = SBML_dfs(sbml_dict, validate=False, resolve=False)
527
+
528
+ # Test dogmatic case (default)
529
+ expected_bqbs = BQB_DEFINING_ATTRS + [BQB.HAS_PART] # noqa: F841
530
+ with patch.object(sbml_dfs, "get_identifiers", return_value=mock_species_ids):
531
+ dogmatic_result = sbml_dfs.get_characteristic_species_ids()
532
+ expected_dogmatic = mock_species_ids.query("bqb in @expected_bqbs")
533
+ pd.testing.assert_frame_equal(
534
+ dogmatic_result, expected_dogmatic, check_like=True
535
+ )
536
+
537
+ # Test non-dogmatic case
538
+ expected_bqbs = BQB_DEFINING_ATTRS_LOOSE + [BQB.HAS_PART] # noqa: F841
539
+ with patch.object(sbml_dfs, "get_identifiers", return_value=mock_species_ids):
540
+ non_dogmatic_result = sbml_dfs.get_characteristic_species_ids(dogmatic=False)
541
+ expected_non_dogmatic = mock_species_ids.query("bqb in @expected_bqbs")
542
+ pd.testing.assert_frame_equal(
543
+ non_dogmatic_result, expected_non_dogmatic, check_like=True
544
+ )
545
+
546
+
547
+ def test_sbml_basic_functionality(test_data):
548
+ """Test basic SBML_dfs creation from edgelist."""
549
+ interaction_edgelist, species_df, compartments_df, interaction_source = test_data
550
+
551
+ result = sbml_dfs_core.sbml_dfs_from_edgelist(
552
+ interaction_edgelist, species_df, compartments_df, interaction_source
553
+ )
554
+
555
+ assert isinstance(result, SBML_dfs)
556
+ assert len(result.species) == 3
557
+ assert len(result.compartments) == 2
558
+ assert len(result.reactions) == 2
559
+ assert (
560
+ len(result.compartmentalized_species) == 3
561
+ ) # TP53[nucleus], CDKN1A[nucleus], MDM2[cytoplasm]
562
+ assert len(result.reaction_species) == 4 # 2 reactions * 2 species each
563
+
564
+
565
+ def test_sbml_extra_data_preservation(test_data):
566
+ """Test that extra columns are preserved when requested."""
567
+ interaction_edgelist, species_df, compartments_df, interaction_source = test_data
568
+
569
+ result = sbml_dfs_core.sbml_dfs_from_edgelist(
570
+ interaction_edgelist,
571
+ species_df,
572
+ compartments_df,
573
+ interaction_source,
574
+ keep_species_data=True,
575
+ keep_reactions_data="experiment",
576
+ )
552
577
 
553
- species_ids = pd.DataFrame(species_ids_dict)
578
+ assert hasattr(result, "species_data")
579
+ assert hasattr(result, "reactions_data")
580
+ assert "gene_type" in result.species_data["source"].columns
581
+ assert "confidence" in result.reactions_data["experiment"].columns
554
582
 
555
- characteristic_ids_narrow = sbml_dfs_core.filter_to_characteristic_species_ids(
556
- species_ids,
557
- defining_biological_qualifiers=BQB_DEFINING_ATTRS,
558
- max_complex_size=4,
559
- max_promiscuity=4,
583
+
584
+ def test_sbml_compartmentalized_naming(test_data):
585
+ """Test compartmentalized species naming convention."""
586
+ interaction_edgelist, species_df, compartments_df, interaction_source = test_data
587
+
588
+ result = sbml_dfs_core.sbml_dfs_from_edgelist(
589
+ interaction_edgelist, species_df, compartments_df, interaction_source
560
590
  )
561
591
 
562
- EXPECTED_IDS = ["CPX-BIG", "GO:1", "GO:2", "part1", "part2"]
563
- assert characteristic_ids_narrow[IDENTIFIERS.IDENTIFIER].tolist() == EXPECTED_IDS
592
+ comp_names = result.compartmentalized_species["sc_name"].tolist()
593
+ assert "TP53 [nucleus]" in comp_names
594
+ assert "MDM2 [cytoplasm]" in comp_names
595
+ assert "CDKN1A [nucleus]" in comp_names
596
+
597
+
598
+ def test_sbml_custom_stoichiometry(test_data):
599
+ """Test custom stoichiometry parameters."""
600
+ interaction_edgelist, species_df, compartments_df, interaction_source = test_data
564
601
 
565
- characteristic_ids_loose = sbml_dfs_core.filter_to_characteristic_species_ids(
566
- species_ids,
567
- # include encodes and is_encoded_by as equivalent to is
568
- defining_biological_qualifiers=BQB_DEFINING_ATTRS_LOOSE,
569
- max_complex_size=4,
570
- # expand promiscuity to default value
571
- max_promiscuity=20,
602
+ result = sbml_dfs_core.sbml_dfs_from_edgelist(
603
+ interaction_edgelist,
604
+ species_df,
605
+ compartments_df,
606
+ interaction_source,
607
+ upstream_stoichiometry=2,
608
+ downstream_stoichiometry=3,
572
609
  )
573
610
 
574
- EXPECTED_IDS = [
575
- "CPX-BIG",
576
- "GO:1",
577
- "GO:2",
578
- "dna_seq",
579
- "protein_seq",
580
- "part1",
581
- "part2",
582
- ] + ["promiscuous_complex"] * 5
583
- assert characteristic_ids_loose[IDENTIFIERS.IDENTIFIER].tolist() == EXPECTED_IDS
611
+ stoichiometries = result.reaction_species["stoichiometry"].unique()
612
+ assert 2 in stoichiometries # upstream
613
+ assert 3 in stoichiometries # downstream