gsrap 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. gsrap/.ipynb_checkpoints/__init__-checkpoint.py +2 -0
  2. gsrap/__init__.py +2 -0
  3. gsrap/assets/kegg_compound_to_others.pickle +0 -0
  4. gsrap/assets/kegg_reaction_to_others.pickle +0 -0
  5. gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +96 -4
  6. gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +72 -1
  7. gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +2 -2
  8. gsrap/commons/downloads.py +96 -4
  9. gsrap/commons/escherutils.py +72 -1
  10. gsrap/commons/excelhub.py +2 -2
  11. gsrap/getmaps/.ipynb_checkpoints/getmaps-checkpoint.py +14 -5
  12. gsrap/getmaps/.ipynb_checkpoints/kdown-checkpoint.py +75 -4
  13. gsrap/getmaps/getmaps.py +14 -5
  14. gsrap/getmaps/kdown.py +75 -4
  15. gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +9 -0
  16. gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +45 -11
  17. gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +10 -0
  18. gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +40 -19
  19. gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +2 -2
  20. gsrap/parsedb/annotation.py +9 -0
  21. gsrap/parsedb/completeness.py +45 -11
  22. gsrap/parsedb/manual.py +10 -0
  23. gsrap/parsedb/parsedb.py +40 -19
  24. gsrap/parsedb/repeating.py +2 -2
  25. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/METADATA +1 -1
  26. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/RECORD +29 -29
  27. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/LICENSE.txt +0 -0
  28. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/WHEEL +0 -0
  29. {gsrap-0.8.2.dist-info → gsrap-0.9.0.dist-info}/entry_points.txt +0 -0
gsrap/.ipynb_checkpoints/__init__-checkpoint.py CHANGED
@@ -75,12 +75,14 @@ def main():
  parsedb_parser.add_argument("-z", "--initialize", metavar='', type=str, default='-', help="Initialize the universe on the provided medium. By default, the first medium in --media is used. Use 'none' to avoid initialization.")
  parsedb_parser.add_argument("--precursors", action='store_true', help="Verify biosynthesis of biomass precursors and show blocked ones.")
  parsedb_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
+ parsedb_parser.add_argument("-t", "--taxon", metavar='', type=str, default='-', help="High-level taxon of interest. If provided, it must follow the syntax '{level}:{name}', where {level} is 'kingdom' or 'phylum'.")
  parsedb_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the optional eggnog-mapper annotation table(s).")
  parsedb_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code. If provided, it takes precedence over --eggnog.")
  parsedb_parser.add_argument("--goodbefore", metavar='', type=str, default='-', help="Syntax is {pure_mid}-{rid1}-{rid2}. From top to bottom, build the universe until reaction {rid1}, transport {rid2} and metabolite {pure_mid} are reached.")
  parsedb_parser.add_argument("--onlyauthor", metavar='', type=str, default='-', help="Build the universe by parsing contents of the specified author ID only. Contents affected by --goodbefore are parsed anyway.")
  parsedb_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
  parsedb_parser.add_argument("-j", "--justparse", action='store_true', help="Just parse the database without performing extra activities (saves time during universe expansion).")
+ parsedb_parser.add_argument("-d", "--keepdisconn", action='store_true', help="Do not remove disconnected metabolites.")
 
 
 
gsrap/__init__.py CHANGED
@@ -75,12 +75,14 @@ def main():
  parsedb_parser.add_argument("-z", "--initialize", metavar='', type=str, default='-', help="Initialize the universe on the provided medium. By default, the first medium in --media is used. Use 'none' to avoid initialization.")
  parsedb_parser.add_argument("--precursors", action='store_true', help="Verify biosynthesis of biomass precursors and show blocked ones.")
  parsedb_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
+ parsedb_parser.add_argument("-t", "--taxon", metavar='', type=str, default='-', help="High-level taxon of interest. If provided, it must follow the syntax '{level}:{name}', where {level} is 'kingdom' or 'phylum'.")
  parsedb_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the optional eggnog-mapper annotation table(s).")
  parsedb_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code. If provided, it takes precedence over --eggnog.")
  parsedb_parser.add_argument("--goodbefore", metavar='', type=str, default='-', help="Syntax is {pure_mid}-{rid1}-{rid2}. From top to bottom, build the universe until reaction {rid1}, transport {rid2} and metabolite {pure_mid} are reached.")
  parsedb_parser.add_argument("--onlyauthor", metavar='', type=str, default='-', help="Build the universe by parsing contents of the specified author ID only. Contents affected by --goodbefore are parsed anyway.")
  parsedb_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")
  parsedb_parser.add_argument("-j", "--justparse", action='store_true', help="Just parse the database without performing extra activities (saves time during universe expansion).")
+ parsedb_parser.add_argument("-d", "--keepdisconn", action='store_true', help="Do not remove disconnected metabolites.")
 
 
 
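For reference, the two new parsedb flags combine with the existing ones like this (hypothetical invocation; the eggnog path is a placeholder, while 'kingdom:Bacteria' follows the documented '{level}:{name}' syntax):

    gsrap parsedb -e sample.emapper.annotations -t kingdom:Bacteria -d

Here -t declares the high-level taxon of interest and -d keeps disconnected metabolites in the resulting universe.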
gsrap/assets/kegg_compound_to_others.pickle CHANGED
Binary file
gsrap/assets/kegg_reaction_to_others.pickle CHANGED
Binary file
gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py CHANGED
@@ -243,7 +243,99 @@ def format_expansion(logger, eggnog):
 
 
 
-
-
-
-
+ def check_taxon(logger, taxon, idcollection_dict):
+
+
+     # verify presence of needed assets
+     if 'ko_to_taxa' not in idcollection_dict.keys():
+         logger.error(f"Asset 'ko_to_taxa' not found in 'gsrap.maps'. Please update 'gsrap.maps' with 'gsrap getmaps'.")
+         return 1
+
+
+     # extract level and name
+     try: level, name = taxon.split(':')
+     except:
+         logger.error(f"Provided --taxon is not well formatted: '{taxon}'.")
+         return 1
+
+
+     # compute available levels and check
+     avail_levels = set(['kingdom', 'phylum'])
+     if level not in avail_levels:
+         logger.error(f"Provided level is not acceptable: '{level}' (see --taxon). Acceptable levels are {avail_levels}.")
+         return 1
+
+
+     # compute available taxa at input level
+     avail_taxa_at_level = set()
+     ko_to_taxa = idcollection_dict['ko_to_taxa']
+     for ko in ko_to_taxa.keys():
+         for taxon_name in ko_to_taxa[ko][level]:
+             avail_taxa_at_level.add(taxon_name)
+     if name not in avail_taxa_at_level:
+         logger.error(f"Provided taxon name is not acceptable: '{name}' (see --taxon). Acceptable taxon names for level '{level}' are {avail_taxa_at_level}.")
+         return 1
+
+
+     """
+     sorted(list(df.query("kingdom == 'Bacteria'")['phylum'].unique()))
+     ['Acidobacteriota',
+      'Actinomycetota',
+      'Alphaproteobacteria',
+      'Aquificota',
+      'Armatimonadota',
+      'Atribacterota',
+      'Bacilli',
+      'Bacteria incertae sedis',
+      'Bacteroidota',
+      'Balneolota',
+      'Bdellovibrionota',
+      'Betaproteobacteria',
+      'Caldisericota',
+      'Calditrichota',
+      'Campylobacterota',
+      'Chlamydiota',
+      'Chlorobiota',
+      'Chloroflexota',
+      'Chrysiogenota',
+      'Cloacimonadota',
+      'Clostridia',
+      'Coprothermobacterota',
+      'Cyanobacteriota',
+      'Deferribacterota',
+      'Deinococcota',
+      'Deltaproteobacteria',
+      'Dictyoglomota',
+      'Elusimicrobiota',
+      'Enterobacteria',
+      'Fibrobacterota',
+      'Fidelibacterota',
+      'Fusobacteriota',
+      'Gemmatimonadota',
+      'Ignavibacteriota',
+      'Kiritimatiellota',
+      'Lentisphaerota',
+      'Melainabacteria',
+      'Mycoplasmatota',
+      'Myxococcota',
+      'Nitrospinota',
+      'Nitrospirota',
+      'Omnitrophota',
+      'Planctomycetota',
+      'Rhodothermota',
+      'Spirochaetota',
+      'Synergistota',
+      'Thermodesulfobacteriota',
+      'Thermodesulfobiota',
+      'Thermomicrobiota',
+      'Thermosulfidibacterota',
+      'Thermotogota',
+      'Verrucomicrobiota',
+      'Vulcanimicrobiota',
+      'other Bacillota',
+      'other Gammaproteobacteria',
+      'other Pseudomonadota',
+      'unclassified Bacteria']
+     """
+
+     return 0
gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py CHANGED
@@ -1,3 +1,9 @@
+ import warnings
+ import logging
+
+
+ import cobra
+
 
 
  def print_json_tree(data, level=0, max_level=2):
@@ -17,7 +23,7 @@ def print_json_tree(data, level=0, max_level=2):
 
 
 
- def count_undrawn_rids(logger, universe, lastmap):
+ def count_undrawn_rids(logger, universe, lastmap, focus):
 
 
      rids = set([r.id for r in universe.reactions])
@@ -32,6 +38,71 @@ def count_undrawn_rids(logger, universe, lastmap):
      logger.debug(f"Last universal map version detected: '{filename}'.")
      if len(remainings) > 0:
          logger.warning(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
+         if focus == '-':
+             logger.warning(f"Drawing is eased when using '--focus'...")
      else:
          logger.info(f"Our universal map is {len(remainings)} reactions behind. Thank you ♥")
+
+
+
+ def count_undrawn_rids_focus(logger, universe, lastmap, focus, outdir):
+
+
+     # get modeled rids for this --focus:
+     rids = set()
+     try: gr = universe.groups.get_by_id(focus)
+     except:
+         logger.warning(f"Group '{focus}' not found!")
+         return
+     for r in gr.members:
+         rids.add(r.id)
+
+
+     # get rids on Escher:
+     drawn_rids = set()
+     for key, value in lastmap['json'][1]['reactions'].items():
+         drawn_rids.add(value['bigg_id'])
+
+
+     # get remaining rids for this map:
+     remainings = rids - drawn_rids
+     remainings_krs = set()
+     for rid in remainings:
+         r = universe.reactions.get_by_id(rid)
+         krs = r.annotation['kegg.reaction']
+         for kr in krs:
+             remainings_krs.add(kr)
+
+
+     if len(remainings) > 0:
+         if focus != 'transport':
+             logger.warning(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind: {' '.join(list(remainings_krs))}.")
+         else:
+             logger.warning(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind.")  # usually no KEGG codes for transport reactions
+
+
+         # subset the universe to ease the drawing:
+         universe_focus = universe.copy()
+         to_remove = [r for r in universe_focus.reactions if r.id not in rids]
+
+
+         # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning: need to pass in a list"
+         # triggered when trying to remove reactions that are included in groups.
+         with warnings.catch_warnings():  # temporarily suppress warnings for this block
+             warnings.simplefilter("ignore")  # ignore all warnings
+             cobra_logger = logging.getLogger("cobra.util.solver")
+             old_level = cobra_logger.level
+             cobra_logger.setLevel(logging.ERROR)
+
+             universe_focus.remove_reactions(to_remove, remove_orphans=True)
+
+             # restore original behaviour:
+             cobra_logger.setLevel(old_level)
+
+
+         # save the subset for drawing in Escher!
+         logger.info(f"Writing '{outdir}/{focus}.json' to ease your drawing workflow...")
+         cobra.io.save_json_model(universe_focus, f'{outdir}/{focus}.json')
+     else:
+         logger.info(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind. Thank you ♥")
 
gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py CHANGED
@@ -148,7 +148,7 @@ def write_excel_model(model, filepath, nofigs, memote_results_dict, df_E, df_B,
      else: df_T.append(row_dict)
 
      for g in model.genes:
-         row_dict = {'gid': g.id, 'involved_in': '; '.join([r.id for r in g.reactions])}
+         row_dict = {'gid': g.id, 'name': g.name, 'involved_in': '; '.join([r.id for r in g.reactions])}
 
          for db in g.annotation.keys():
              annots = g.annotation[db]
@@ -171,7 +171,7 @@ def write_excel_model(model, filepath, nofigs, memote_results_dict, df_E, df_B,
      df_R = df_R[df_R_first_cols + sorted([c for c in df_R.columns if c not in df_R_first_cols])]
      df_T = df_T[df_R_first_cols + sorted([c for c in df_T.columns if c not in df_R_first_cols])]
      df_A = df_A[df_R_first_cols + sorted([c for c in df_A.columns if c not in df_R_first_cols])]
-     df_G_first_cols = ['gid', 'involved_in']
+     df_G_first_cols = ['gid', 'name', 'involved_in']
      df_G = df_G[df_G_first_cols + sorted([c for c in df_G.columns if c not in df_G_first_cols])]
 
 
gsrap/commons/downloads.py CHANGED
@@ -243,7 +243,99 @@ def format_expansion(logger, eggnog):
 
 
 
-
-
-
-
+ def check_taxon(logger, taxon, idcollection_dict):
+
+
+     # verify presence of needed assets
+     if 'ko_to_taxa' not in idcollection_dict.keys():
+         logger.error(f"Asset 'ko_to_taxa' not found in 'gsrap.maps'. Please update 'gsrap.maps' with 'gsrap getmaps'.")
+         return 1
+
+
+     # extract level and name
+     try: level, name = taxon.split(':')
+     except:
+         logger.error(f"Provided --taxon is not well formatted: '{taxon}'.")
+         return 1
+
+
+     # compute available levels and check
+     avail_levels = set(['kingdom', 'phylum'])
+     if level not in avail_levels:
+         logger.error(f"Provided level is not acceptable: '{level}' (see --taxon). Acceptable levels are {avail_levels}.")
+         return 1
+
+
+     # compute available taxa at input level
+     avail_taxa_at_level = set()
+     ko_to_taxa = idcollection_dict['ko_to_taxa']
+     for ko in ko_to_taxa.keys():
+         for taxon_name in ko_to_taxa[ko][level]:
+             avail_taxa_at_level.add(taxon_name)
+     if name not in avail_taxa_at_level:
+         logger.error(f"Provided taxon name is not acceptable: '{name}' (see --taxon). Acceptable taxon names for level '{level}' are {avail_taxa_at_level}.")
+         return 1
+
+
+     """
+     sorted(list(df.query("kingdom == 'Bacteria'")['phylum'].unique()))
+     ['Acidobacteriota',
+      'Actinomycetota',
+      'Alphaproteobacteria',
+      'Aquificota',
+      'Armatimonadota',
+      'Atribacterota',
+      'Bacilli',
+      'Bacteria incertae sedis',
+      'Bacteroidota',
+      'Balneolota',
+      'Bdellovibrionota',
+      'Betaproteobacteria',
+      'Caldisericota',
+      'Calditrichota',
+      'Campylobacterota',
+      'Chlamydiota',
+      'Chlorobiota',
+      'Chloroflexota',
+      'Chrysiogenota',
+      'Cloacimonadota',
+      'Clostridia',
+      'Coprothermobacterota',
+      'Cyanobacteriota',
+      'Deferribacterota',
+      'Deinococcota',
+      'Deltaproteobacteria',
+      'Dictyoglomota',
+      'Elusimicrobiota',
+      'Enterobacteria',
+      'Fibrobacterota',
+      'Fidelibacterota',
+      'Fusobacteriota',
+      'Gemmatimonadota',
+      'Ignavibacteriota',
+      'Kiritimatiellota',
+      'Lentisphaerota',
+      'Melainabacteria',
+      'Mycoplasmatota',
+      'Myxococcota',
+      'Nitrospinota',
+      'Nitrospirota',
+      'Omnitrophota',
+      'Planctomycetota',
+      'Rhodothermota',
+      'Spirochaetota',
+      'Synergistota',
+      'Thermodesulfobacteriota',
+      'Thermodesulfobiota',
+      'Thermomicrobiota',
+      'Thermosulfidibacterota',
+      'Thermotogota',
+      'Verrucomicrobiota',
+      'Vulcanimicrobiota',
+      'other Bacillota',
+      'other Gammaproteobacteria',
+      'other Pseudomonadota',
+      'unclassified Bacteria']
+     """
+
+     return 0
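
check_taxon validates a --taxon string purely against the 'ko_to_taxa' asset. A minimal driving sketch, assuming a 'gsrap.maps' pickle previously produced by 'gsrap getmaps' (the logger setup is illustrative, not from this diff):

    import pickle
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('gsrap')

    with open('gsrap.maps', 'rb') as rb_handler:
        idcollection_dict = pickle.load(rb_handler)['idcollection_dict']

    check_taxon(logger, 'kingdom:Bacteria', idcollection_dict)  # 0 if 'Bacteria' is a known kingdom
    check_taxon(logger, 'class:Bacilli', idcollection_dict)     # 1: 'class' is not an accepted level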
gsrap/commons/escherutils.py CHANGED
@@ -1,3 +1,9 @@
+ import warnings
+ import logging
+
+
+ import cobra
+
 
 
  def print_json_tree(data, level=0, max_level=2):
@@ -17,7 +23,7 @@ def print_json_tree(data, level=0, max_level=2):
 
 
 
- def count_undrawn_rids(logger, universe, lastmap):
+ def count_undrawn_rids(logger, universe, lastmap, focus):
 
 
      rids = set([r.id for r in universe.reactions])
@@ -32,6 +38,71 @@ def count_undrawn_rids(logger, universe, lastmap):
      logger.debug(f"Last universal map version detected: '{filename}'.")
      if len(remainings) > 0:
          logger.warning(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
+         if focus == '-':
+             logger.warning(f"Drawing is eased when using '--focus'...")
      else:
          logger.info(f"Our universal map is {len(remainings)} reactions behind. Thank you ♥")
+
+
+
+ def count_undrawn_rids_focus(logger, universe, lastmap, focus, outdir):
+
+
+     # get modeled rids for this --focus:
+     rids = set()
+     try: gr = universe.groups.get_by_id(focus)
+     except:
+         logger.warning(f"Group '{focus}' not found!")
+         return
+     for r in gr.members:
+         rids.add(r.id)
+
+
+     # get rids on Escher:
+     drawn_rids = set()
+     for key, value in lastmap['json'][1]['reactions'].items():
+         drawn_rids.add(value['bigg_id'])
+
+
+     # get remaining rids for this map:
+     remainings = rids - drawn_rids
+     remainings_krs = set()
+     for rid in remainings:
+         r = universe.reactions.get_by_id(rid)
+         krs = r.annotation['kegg.reaction']
+         for kr in krs:
+             remainings_krs.add(kr)
+
+
+     if len(remainings) > 0:
+         if focus != 'transport':
+             logger.warning(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind: {' '.join(list(remainings_krs))}.")
+         else:
+             logger.warning(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind.")  # usually no KEGG codes for transport reactions
+
+
+         # subset the universe to ease the drawing:
+         universe_focus = universe.copy()
+         to_remove = [r for r in universe_focus.reactions if r.id not in rids]
+
+
+         # trick to avoid the WARNING "cobra/core/group.py:147: UserWarning: need to pass in a list"
+         # triggered when trying to remove reactions that are included in groups.
+         with warnings.catch_warnings():  # temporarily suppress warnings for this block
+             warnings.simplefilter("ignore")  # ignore all warnings
+             cobra_logger = logging.getLogger("cobra.util.solver")
+             old_level = cobra_logger.level
+             cobra_logger.setLevel(logging.ERROR)
+
+             universe_focus.remove_reactions(to_remove, remove_orphans=True)
+
+             # restore original behaviour:
+             cobra_logger.setLevel(old_level)
+
+
+         # save the subset for drawing in Escher!
+         logger.info(f"Writing '{outdir}/{focus}.json' to ease your drawing workflow...")
+         cobra.io.save_json_model(universe_focus, f'{outdir}/{focus}.json')
+     else:
+         logger.info(f"Focusing on '{focus}', our universal map is {len(remainings)} reactions behind. Thank you ♥")
 
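count_undrawn_rids_focus expects 'focus' to match the ID of a cobra Group in the universe, and a reduced JSON model is written only when the focused reactions are not all drawn yet. A sketch of the call pattern (the group name and output directory are hypothetical; 'lastmap' is the dict used above, holding the Escher map under lastmap['json']):

    count_undrawn_rids_focus(logger, universe, lastmap, focus='glycolysis', outdir='./escher')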
gsrap/commons/excelhub.py CHANGED
@@ -148,7 +148,7 @@ def write_excel_model(model, filepath, nofigs, memote_results_dict, df_E, df_B,
      else: df_T.append(row_dict)
 
      for g in model.genes:
-         row_dict = {'gid': g.id, 'involved_in': '; '.join([r.id for r in g.reactions])}
+         row_dict = {'gid': g.id, 'name': g.name, 'involved_in': '; '.join([r.id for r in g.reactions])}
 
          for db in g.annotation.keys():
              annots = g.annotation[db]
@@ -171,7 +171,7 @@ def write_excel_model(model, filepath, nofigs, memote_results_dict, df_E, df_B,
      df_R = df_R[df_R_first_cols + sorted([c for c in df_R.columns if c not in df_R_first_cols])]
      df_T = df_T[df_R_first_cols + sorted([c for c in df_T.columns if c not in df_R_first_cols])]
      df_A = df_A[df_R_first_cols + sorted([c for c in df_A.columns if c not in df_R_first_cols])]
-     df_G_first_cols = ['gid', 'involved_in']
+     df_G_first_cols = ['gid', 'name', 'involved_in']
      df_G = df_G[df_G_first_cols + sorted([c for c in df_G.columns if c not in df_G_first_cols])]
 
 
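The gene sheet of the Excel export now reports the gene name right after its ID. A toy illustration of the new row layout (the model, reaction, and gene are invented for the example):

    import cobra

    model = cobra.Model('toy')
    r = cobra.Reaction('PFK')
    model.add_reactions([r])
    r.gene_reaction_rule = 'b3916'
    model.genes.get_by_id('b3916').name = 'pfkA'

    g = model.genes.get_by_id('b3916')
    row_dict = {'gid': g.id, 'name': g.name, 'involved_in': '; '.join([rx.id for rx in g.reactions])}
    print(row_dict)  # {'gid': 'b3916', 'name': 'pfkA', 'involved_in': 'PFK'}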
@@ -4,6 +4,7 @@ import pickle
 
 
  from .kdown import download_raw_txtfiles
+ from .kdown import create_dict_keggorg
  from .kdown import create_dict_ko
  from .kdown import create_dict_c
  from .kdown import create_dict_r
@@ -20,13 +21,19 @@ def do_kdown(logger, outdir, usecache, keeptmp):
      logger.info(f"Respectfully retrieving metabolic information from KEGG. Raw data are being saved into '{outdir}/kdown/'. Be patient, could take a couple of days...")
      os.makedirs(f'{outdir}/kdown/', exist_ok=True)
 
+
      response = download_raw_txtfiles(logger, outdir, usecache)
      if type(response) == int: return 1
      else: RELEASE_kegg = response
 
+
 
      logger.info("Parsing downloaded KEGG information...")
-
+
+     response = create_dict_keggorg(logger, outdir)
+     if type(response) == int: return 1
+     else: dict_keggorg = response
+
      response = create_dict_ko(logger, outdir)
      if type(response) == int: return 1
      else: dict_ko = response
@@ -49,7 +56,7 @@ def do_kdown(logger, outdir, usecache, keeptmp):
 
 
      # create 'idcollection_dict' and 'summary_dict' dictionaries
-     idcollection_dict = create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md)
+     idcollection_dict = create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md)
      summary_dict = create_summary_dict(dict_c, dict_r, dict_map, dict_md)
 
@@ -57,7 +64,6 @@ def do_kdown(logger, outdir, usecache, keeptmp):
 
 
 
-
  def main(args, logger):
 
 
@@ -67,7 +73,7 @@ def main(args, logger):
      os.makedirs(f'{args.outdir}/', exist_ok=True)
 
 
-     # KEGG
+     # KEGG download
      response = do_kdown(logger, args.outdir, args.usecache, args.keeptmp)
      if type(response) == int: return 1
      else: RELEASE_kegg, idcollection_dict, summary_dict = response[0], response[1], response[2]
@@ -76,7 +82,9 @@ def main(args, logger):
      # create 'gsrap.maps':
      with open(f'{args.outdir}/gsrap.maps', 'wb') as wb_handler:
          pickle.dump({
-             'RELEASE_kegg': RELEASE_kegg, 'idcollection_dict': idcollection_dict, 'summary_dict': summary_dict,
+             'RELEASE_kegg': RELEASE_kegg,
+             'idcollection_dict': idcollection_dict,
+             'summary_dict': summary_dict,
          }, wb_handler)
      logger.info(f"'{args.outdir}/gsrap.maps' created!")
 
@@ -87,4 +95,5 @@ def main(args, logger):
      logger.info(f"Temporary raw files deleted!")
 
 
+
      return 0
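
After the reshaped pickle.dump call, 'gsrap.maps' is still a single pickled dict with exactly three keys. A minimal read-back sketch (path assumed relative to the getmaps output directory):

    import pickle

    with open('gsrap.maps', 'rb') as rb_handler:
        maps = pickle.load(rb_handler)
    print(sorted(maps.keys()))  # ['RELEASE_kegg', 'idcollection_dict', 'summary_dict']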
@@ -34,6 +34,7 @@ def download_raw_txtfiles(logger, outdir, usecache):
          'orthology',
          'module',
          'pathway',
+         'organism',
      ]
      for db in databases:
          time.sleep(0.5)
@@ -45,8 +46,9 @@ def download_raw_txtfiles(logger, outdir, usecache):
 
      # mix the items to download to be respectful/compliant
      items_to_download = []
-
      for db in databases:
+         if db == 'organism':
+             continue  # here we just need the list
          with open(f"{outdir}/kdown/{db}.txt", 'r') as file:
              res_string = file.read()
          rows = res_string.split('\n')
@@ -54,7 +56,6 @@ def download_raw_txtfiles(logger, outdir, usecache):
          item_id = row.split('\t', 1)[0]
          if item_id == '': continue
          items_to_download.append({'db': db, 'id': item_id})
-
      random.shuffle(items_to_download)
 
 
@@ -79,6 +80,51 @@ def download_raw_txtfiles(logger, outdir, usecache):
 
 
 
+ def create_dict_keggorg(logger, outdir):
+
+     organisms_raw = open(f'{outdir}/kdown/organism.txt', 'r').read()
+
+     # create a dataframe listing all organisms in KEGG;
+     # columns are [tnumber, keggorg, name, domain, kingdom, phylum, classification]
+     df = []  # list of dicts
+     for line in organisms_raw.strip().split("\n"):
+         fields = line.split("\t")
+         if len(fields) == 4:
+             tnumber, keggorg, name, classification = fields
+             levels = classification.split(";")
+             domain = levels[0]
+             kingdom = levels[1]
+             phylum = levels[2]
+             df.append({
+                 'tnumber': tnumber,
+                 'keggorg': keggorg,
+                 'name': name,
+                 'domain': domain,
+                 'kingdom': kingdom,
+                 'phylum': phylum,
+                 'classification': classification
+             })
+         else:
+             # never verified during tests!
+             logger.warning(f'Strange number of fields found in this line of "organism.txt": """{line}""".')
+     df = pnd.DataFrame.from_records(df)
+     df = df.set_index('keggorg', drop=True, verify_integrity=True)
+
+
+     # convert dataframe to dict
+     dict_keggorg = {}
+     for keggorg, row in df.iterrows():
+         dict_keggorg[keggorg] = {
+             'kingdom': row['kingdom'],
+             'phylum': row['phylum'],
+             #'name': row['name'],  # not strictly needed. Commented to save disk space.
+         }
+
+     if logger != None: logger.info(f'Number of unique items (org): {len(dict_keggorg.keys())}.')
+     return dict_keggorg
+
+
+
  def create_dict_ko(logger, outdir):
 
      dict_ko = {}  # main output
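
create_dict_keggorg relies on the four tab-separated columns of KEGG's organism list. A worked example on a single line (the lineage string is illustrative of the KEGG format; only levels 1 and 2 are kept):

    # T-number, organism code, name, semicolon-separated lineage
    line = "T00007\teco\tEscherichia coli K-12 MG1655\tProkaryotes;Bacteria;Enterobacteria;Escherichia"
    tnumber, keggorg, name, classification = line.split("\t")
    levels = classification.split(";")
    print({keggorg: {'kingdom': levels[1], 'phylum': levels[2]}})
    # {'eco': {'kingdom': 'Bacteria', 'phylum': 'Enterobacteria'}}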
@@ -98,6 +144,7 @@ def create_dict_ko(logger, outdir):
              'ecs': set(),
              'cogs': set(),
              'gos': set(),
+             'keggorgs': set(),
          }
      else:
          logger.error(f"{ko_id} already included!")
@@ -175,7 +222,13 @@ def create_dict_ko(logger, outdir):
          gos = content[len('GO: '):].strip().split(' ')
          for go in gos:
              dict_ko[ko_id]['gos'].add(go)
-
+
+
+     # parse the organism-specific genes
+     if curr_header == 'GENES ':
+         keggorg = content.split(': ', 1)[0]
+         dict_ko[ko_id]['keggorgs'].add(keggorg.lower())  # organism.txt has IDs in lowercase
+
 
      # parse the reactions
      if curr_header == 'REACTION ':
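
In a KO flat-file entry, each GENES line starts with an uppercase organism code followed by ': ' and the gene identifiers; only the code is harvested here. A tiny sketch of the extraction (the line content is illustrative of the KEGG layout):

    content = "ECO: b0002(thrA) b3940(metL)"
    keggorg = content.split(': ', 1)[0]
    print(keggorg.lower())  # 'eco', matching the lowercase codes used in organism.txt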
@@ -547,7 +600,7 @@ def create_dict_md(logger, outdir):
 
 
 
- def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
+ def create_idcollection_dict(dict_keggorg, dict_ko, dict_c, dict_r, dict_map, dict_md):
 
      idcollection_dict = {}
 
@@ -620,6 +673,24 @@ def create_idcollection_dict(dict_ko, dict_c, dict_r, dict_map, dict_md):
          for go in dict_ko[ko_id]['gos']:
              idcollection_dict['ko_to_gos'][ko_id].add(go)
 
+
+     # creation of 'ko_to_keggorgs' skipped as it takes too much disk space. Replaced with 'ko_to_taxa'.
+     idcollection_dict['ko_to_taxa'] = {}
+     missing_keggorgs = set()
+     for ko_id in dict_ko.keys():
+         idcollection_dict['ko_to_taxa'][ko_id] = {'kingdom': set(), 'phylum': set()}
+         for keggorg in dict_ko[ko_id]['keggorgs']:
+             try:
+                 kingdom = dict_keggorg[keggorg]['kingdom']
+                 phylum = dict_keggorg[keggorg]['phylum']
+             except:
+                 if keggorg not in missing_keggorgs:
+                     missing_keggorgs.add(keggorg)
+                     #print(f"Organism '{keggorg}' appears in 'orthology/' but not in 'organism.txt'.")
+                 continue
+             idcollection_dict['ko_to_taxa'][ko_id]['kingdom'].add(kingdom)
+             idcollection_dict['ko_to_taxa'][ko_id]['phylum'].add(phylum)
+
 
      idcollection_dict['map_to_name'] = {}
      for map_id in dict_map.keys():
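
The net result is a compact taxonomic profile per KO: 'ko_to_taxa' maps each KO ID to the sets of kingdoms and phyla of the organisms carrying it. A sketch of one entry (the KO ID and taxa shown are illustrative):

    ko_to_taxa = idcollection_dict['ko_to_taxa']
    print(ko_to_taxa['K00001'])
    # e.g. {'kingdom': {'Bacteria', 'Fungi'}, 'phylum': {'Actinomycetota', 'Bacilli', ...}}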