gsrap 0.7.1__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gsrap-0.7.1 → gsrap-0.8.0}/PKG-INFO +3 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/pyproject.toml +4 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/.ipynb_checkpoints/__init__-checkpoint.py +5 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/__init__.py +5 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/downloads-checkpoint.py +1 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/escherutils-checkpoint.py +1 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/excelhub-checkpoint.py +94 -37
- gsrap-0.8.0/src/gsrap/commons/.ipynb_checkpoints/figures-checkpoint.py +119 -0
- gsrap-0.8.0/src/gsrap/commons/.ipynb_checkpoints/keggutils-checkpoint.py +145 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/__init__.py +1 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/downloads.py +1 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/escherutils.py +1 -1
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/excelhub.py +94 -37
- gsrap-0.8.0/src/gsrap/commons/figures.py +119 -0
- gsrap-0.8.0/src/gsrap/commons/keggutils.py +145 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/mkmodel-checkpoint.py +64 -20
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/pruner-checkpoint.py +72 -7
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/mkmodel.py +64 -20
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/pruner.py +72 -7
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/completeness-checkpoint.py +124 -64
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/introduce-checkpoint.py +8 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/parsedb-checkpoint.py +12 -5
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/completeness.py +124 -64
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/introduce.py +8 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/parsedb.py +12 -5
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/simplegrowth-checkpoint.py +2 -2
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/simplegrowth.py +2 -2
- {gsrap-0.7.1 → gsrap-0.8.0}/LICENSE.txt +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/README.md +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/.ipynb_checkpoints/PM1-checkpoint.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/.ipynb_checkpoints/PM2A-checkpoint.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/.ipynb_checkpoints/PM3B-checkpoint.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/.ipynb_checkpoints/PM4A-checkpoint.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/PM1.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/PM2A.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/PM3B.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/PM4A.csv +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/__init__.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/kegg_compound_to_others.pickle +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/assets/kegg_reaction_to_others.pickle +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/biomass-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/coeffs-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/fluxbal-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/logutils-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/medium-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/metrics-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/.ipynb_checkpoints/sbmlutils-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/biomass.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/coeffs.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/fluxbal.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/logutils.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/medium.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/metrics.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/sbmlutils.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/.ipynb_checkpoints/getmaps-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/.ipynb_checkpoints/kdown-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/__init__.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/getmaps.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/getmaps/kdown.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/biologcuration-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/gapfill-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/gapfillutils-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/.ipynb_checkpoints/polishing-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/__init__.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/biologcuration.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/gapfill.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/gapfillutils.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/mkmodel/polishing.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/annotation-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/manual-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/.ipynb_checkpoints/repeating-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/__init__.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/annotation.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/manual.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/parsedb/repeating.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/biosynth-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/cnps-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/essentialgenes-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/growthfactors-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/precursors-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/runsims-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/.ipynb_checkpoints/singleomission-checkpoint.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/__init__.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/biosynth.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/cnps.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/essentialgenes.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/growthfactors.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/precursors.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/runsims.py +0 -0
- {gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/runsims/singleomission.py +0 -0

{gsrap-0.7.1 → gsrap-0.8.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: gsrap
-Version: 0.7.1
+Version: 0.8.0
 Summary:
 License: GNU General Public License v3.0
 Author: Gioele Lazzari
@@ -17,9 +17,11 @@ Requires-Dist: cobra (>=0.29)
 Requires-Dist: colorlog (>=6.9.0)
 Requires-Dist: gdown (>=5.2.0)
 Requires-Dist: gempipe (>=1.38.1)
+Requires-Dist: matplotlib (>=3.9.0)
 Requires-Dist: memote (>=0.17.0)
 Requires-Dist: openpyxl (>=3.1.0)
 Requires-Dist: pandas (>=2.0.0)
+Requires-Dist: xlsxwriter (>=3.1.0)
 Description-Content-Type: text/markdown

 Source code for `gsrap`.

{gsrap-0.7.1 → gsrap-0.8.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gsrap"
-version = "0.7.1"
+version = "0.8.0"
 description = ""
 authors = ["Gioele Lazzari"]
 license = "GNU General Public License v3.0"
@@ -16,6 +16,9 @@ gempipe = ">=1.38.1"
 gdown = ">=5.2.0"
 colorlog = ">=6.9.0"
 memote = ">=0.17.0"
+matplotlib = ">=3.9.0"
+xlsxwriter = ">=3.1.0"
+

 [build-system]
 requires = ["poetry-core>=1.0.0"]

{gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/__init__.py (the same hunks appear in .ipynb_checkpoints/__init__-checkpoint.py)
@@ -72,9 +72,10 @@ def main():
     parsedb_parser.add_argument("--precursors", action='store_true', help="Verify biosynthesis of biomass precursors and show blocked ones.")
     parsedb_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
     parsedb_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the optional eggnog-mapper annotation table(s).")
-
+    parsedb_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
     parsedb_parser.add_argument("--goodbefore", metavar='', type=str, default='-', help="Syntax is {pure_mid}-{rid1}-{rid2}. From top to bottom, build the universe until reaction {rid1}, transport {rid2} and metabolite {pure_mid} are reached.")
     parsedb_parser.add_argument("--onlyauthor", metavar='', type=str, default='-', help="Build the universe by parsing contents of the specified author ID only. Contents affected by --goodbefore are parsed anyway.")
+    parsedb_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")


     # add arguments for the 'mkmodel' command
@@ -84,6 +85,7 @@ def main():
     mkmodel_parser.add_argument("-c", "--cores", metavar='', type=int, default=0, help="Number of cores to use (if 0, use all available cores).")
     mkmodel_parser.add_argument("-o", "--outdir", metavar='', type=str, default='./', help="Main output directory (will be created if not existing).")
     mkmodel_parser.add_argument("-e", "--eggnog", nargs='+', metavar='', type=str, default='-', help="Path to the eggnog-mapper annotation table(s).")
+    mkmodel_parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
     mkmodel_parser.add_argument("-u", "--universe", metavar='', type=str, default='-', help="Path to the universe model (SBML format).")
     mkmodel_parser.add_argument("-i", "--force_inclusion", metavar='', type=str, default='-', help="Force the inclusion of the specified reactions (comma-separated IDs).")
     mkmodel_parser.add_argument("-f", "--gap_fill", metavar='', type=str, default='-', help="Media to use during gap-filling (comma-separated IDs); if not provided, gap-filling will be skipped.")
@@ -94,6 +96,7 @@ def main():
     mkmodel_parser.add_argument("--conditional", metavar='', type=float, default=0.5, help="Expected minimum fraction of reactions in a biosynthetic pathway for an actually present conditional biomass precursor.")
     mkmodel_parser.add_argument("--biosynth", action='store_true', help="Check biosynthesis of all metabolites and detect dead-ends.")
     mkmodel_parser.add_argument("-b", "--biomass", metavar='', type=str, default='-', help="Strain ID associated to experimental biomass data.")
+    mkmodel_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")


     # add arguments for the 'runsims' command
@@ -110,6 +113,7 @@ def main():
     runsims_parser.add_argument("--omission", action='store_true', help="Perform single omission experiments to study auxotrophies.")
     runsims_parser.add_argument("--essential", action='store_true', help="Predict essential genes (single-gene knock-out simulations).")
     runsims_parser.add_argument("--factors", action='store_true', help="Predict putative growth factors.")
+    runsims_parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")


     # check the inputted subcommand, automatic sys.exit(1) if a bad subprogram was specied.

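For orientation, the two new options behave like standard argparse flags: -k/--keggorg stores a plain string (with '-' as the "not provided" sentinel used throughout gsrap), while --nofigs is a boolean switch. A minimal standalone sketch, reusing only the two add_argument() calls from the diff above; the parser name is hypothetical and this is not the real gsrap CLI wiring:

import argparse

parser = argparse.ArgumentParser(prog='gsrap parsedb')   # illustrative prog name only
parser.add_argument("-k", "--keggorg", metavar='', type=str, default='-', help="A single KEGG Organism code.")
parser.add_argument("--nofigs", action='store_true', help="Do not generate figures.")

args = parser.parse_args(['-k', 'lpl'])
print(args.keggorg)   # 'lpl'
print(args.nofigs)    # False, since --nofigs was not passed
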
{gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/escherutils.py (the same hunk appears in .ipynb_checkpoints/escherutils-checkpoint.py)
@@ -31,7 +31,7 @@ def count_undrawn_rids(logger, universe, lastmap):
     filename = lastmap['filename']
     logger.debug(f"Last universal map version detected: '{filename}'.")
     if len(remainings) > 0:
-        logger.
+        logger.warning(f"Our universal map is {len(remainings)} reactions behind. Please draw!")
     else:
         logger.info(f"Our universal map is {len(remainings)} reactions behind. Thank you ♥")

{gsrap-0.7.1 → gsrap-0.8.0}/src/gsrap/commons/excelhub.py (the same hunks appear in .ipynb_checkpoints/excelhub-checkpoint.py; content of some removed lines was not captured by the extraction and is shown as bare "-" markers)
@@ -1,14 +1,20 @@
 import pandas as pnd


+from .figures import figure_df_C_F1

-def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
+
+
+def write_excel_model(model, filepath, nofigs, df_E, df_B, df_P, df_S, df_C=None):

-    df_M = []
-    df_R = []
-    df_T = []
-    df_A = []

+    # generate figures
+    if nofigs == False:
+
+        if df_C is not None:
+            df_C_F1 = figure_df_C_F1(df_C)
+
+

     # format df_E: # biomass precursors biosynthesis
     if df_E is not None:
@@ -33,64 +39,112 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
     df_S.insert(0, 'mid', '') # new columns as first
     df_S['mid'] = df_S.index
     df_S = df_S.reset_index(drop=True)
+
+    # format df_C: universal reaction coverage
+    if df_C is not None:
+        df_C.insert(0, 'kr', '') # new columns as first
+        df_C['kr'] = df_C.index
+        df_C = df_C.reset_index(drop=True)


+
+    # define dict-lists, future dataframes
+    df_M = []
+    df_R = []
+    df_T = []
+    df_G = []
+    df_A = []
+
     for m in model.metabolites:
+        row_dict = {'mid': m.id, 'name': m.name, 'formula': m.formula, 'charge': m.charge,}

-
-
-
-
-
-
-
-        df_M.append({'mid': m.id, 'formula': m.formula, 'charge': m.charge, 'kc': kc_ids, 'name': m.name})
-
+        for db in m.annotation.keys():
+            annots = m.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots
+        df_M.append(row_dict)

     for r in model.reactions:
+        row_dict = {'rid': r.id, 'name': r.name, 'rstring': r.reaction, 'gpr': "Not applicable", 'bounds': r.bounds}
+
+        for db in r.annotation.keys():
+            annots = r.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots

         # handle artificial reactions
         if r.id == 'Biomass':
-
+            # commented as the type is inplicit in the ID
+            #row_dict['type'] = 'biomass'
+            df_A.append(row_dict)

         elif len(r.metabolites) == 1:
+            # commented as the type is inplicit in the ID
+            """
             if len(r.metabolites)==1 and list(r.metabolites)[0].id.rsplit('_',1)[-1] == 'e':
-
+                row_dict['type'] = 'exchange'
             elif r.lower_bound < 0 and r.upper_bound > 0:
-
+                row_dict['type'] = 'sink'
             elif r.lower_bound == 0 and r.upper_bound > 0:
-
+                row_dict['type'] = 'demand'
+            """
+            df_A.append(row_dict)

         else: # more than 1 metabolite involved
+            row_dict['gpr'] = r.gene_reaction_rule

-            # get kr codes:
-            if 'kegg.reaction' not in r.annotation.keys(): kr_ids = ''
-            else:
-                kr_ids = r.annotation['kegg.reaction']
-                if type(kr_ids) == str: kr_ids = [kr_ids]
-                kr_ids = '; '.join([i for i in kr_ids if i!='RXXXXX'])
-
             # introduce reaction in the correct table:
-            r_dict = {'rid': r.id, 'rstring': r.reaction, 'kr': kr_ids, 'gpr': r.gene_reaction_rule, 'name': r.name}
             if len(set([m.id.rsplit('_',1)[-1] for m in r.metabolites])) == 1:
-                df_R.append(
-            else: df_T.append(
-
+                df_R.append(row_dict)
+            else: df_T.append(row_dict)
+
+    for g in model.genes:
+        row_dict = {'gid': g.id, 'involved_in': '; '.join([r.id for r in g.reactions])}
+
+        for db in g.annotation.keys():
+            annots = g.annotation[db]
+            if type(annots) == str: annots = [annots]
+            annots = '; '.join([i for i in annots])
+            row_dict[db] = annots
+        df_G.append(row_dict)

+    # create dataframes from dict-lists
     df_M = pnd.DataFrame.from_records(df_M)
     df_R = pnd.DataFrame.from_records(df_R)
     df_T = pnd.DataFrame.from_records(df_T)
     df_A = pnd.DataFrame.from_records(df_A)
-
+    df_G = pnd.DataFrame.from_records(df_G)
+
+    # sort columns
+    df_M_first_cols = ['mid', 'name', 'formula', 'charge']
+    df_M = df_M[df_M_first_cols + sorted([c for c in df_M.columns if c not in df_M_first_cols])]
+    df_R_first_cols = ['rid', 'name', 'rstring', 'gpr', 'bounds']
+    df_R = df_R[df_R_first_cols + sorted([c for c in df_R.columns if c not in df_R_first_cols])]
+    df_T = df_T[df_R_first_cols + sorted([c for c in df_T.columns if c not in df_R_first_cols])]
+    df_A = df_A[df_R_first_cols + sorted([c for c in df_A.columns if c not in df_R_first_cols])]
+    df_G_first_cols = ['gid', 'involved_in']
+    df_G = df_G[df_G_first_cols + sorted([c for c in df_G.columns if c not in df_G_first_cols])]
+
+
+
+    with pnd.ExcelWriter(filepath, engine='xlsxwriter') as writer:
         df_M.to_excel(writer, sheet_name='Metabolites', index=False)
         df_R.to_excel(writer, sheet_name='Reactions', index=False)
         df_T.to_excel(writer, sheet_name='Transporters', index=False)
+        df_G.to_excel(writer, sheet_name='Genes', index=False)
         df_A.to_excel(writer, sheet_name='Artificials', index=False)
         if df_E is not None and len(df_E)!=0: df_E.to_excel(writer, sheet_name='Precursors', index=False)
         if df_B is not None: df_B.to_excel(writer, sheet_name='Biomass', index=False)
         if df_P is not None and len(df_P)!=0: df_P.to_excel(writer, sheet_name='Biolog®', index=False)
         if df_S is not None and len(df_S.columns)>2: df_S.to_excel(writer, sheet_name='Biosynth', index=False)
-
+        if df_C is not None:
+            df_C.to_excel(writer, sheet_name='Coverage', index=False)
+            if nofigs == False:
+                worksheet = writer.sheets['Coverage']
+                worksheet.insert_image('E3', 'df_C_F1.png', {'image_data': df_C_F1})
+

     sheets_dict = {
         'model_id': model.id,
@@ -102,6 +156,7 @@ def write_excel_model(model, filepath, df_E, df_B, df_P, df_S):
         'Biomass': df_B,
         'Biolog': df_P,
         'Biosynth': df_S,
+        'Coverage': df_C,
     }
     return sheets_dict

@@ -115,9 +170,10 @@ def comparative_table(logger, outdir, sheets_dicts):
     for sheets_dict in sheets_dicts:
         for index, row in sheets_dict['Reactions'].iterrows():
             if row['rid'] not in df_topology.index:
-                df_topology.loc[row['rid'], '
-
-
+                df_topology.loc[row['rid'], 'rid'] = row['rid']
+                for key, value in row.to_dict().items():
+                    # force string to avoid errors with bounds
+                    df_topology.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
             df_topology.loc[row['rid'], sheets_dict['model_id']] = 1
     for sheets_dict in sheets_dicts: # replace missing values:
         df_topology = df_topology.fillna({sheets_dict['model_id']: 0})
@@ -128,9 +184,10 @@ def comparative_table(logger, outdir, sheets_dicts):
     for sheets_dict in sheets_dicts:
         for index, row in sheets_dict['Reactions'].iterrows():
             if row['rid'] not in df_gprs.index:
-                df_gprs.loc[row['rid'], '
-
-
+                df_gprs.loc[row['rid'], 'rid'] = row['rid']
+                for key, value in row.to_dict().items():
+                    # force string to avoid errors with bounds
+                    df_gprs.loc[row['rid'], key] = '' if pnd.isna(value) else str(value)
             df_gprs.loc[row['rid'], sheets_dict['model_id']] = row['gpr']
     for sheets_dict in sheets_dicts: # replace missing values:
         df_gprs = df_gprs.fillna({sheets_dict['model_id']: 'missing'})

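The recurring pattern in the rewritten write_excel_model() is to build one row dict per metabolite, reaction or gene and to flatten every annotation database into its own column, joining multiple IDs with '; '. A small self-contained sketch of that flattening step, with a hypothetical annotation dict standing in for a cobra object's .annotation attribute:

# hypothetical annotation payload; real values come from model.metabolites / .reactions / .genes
row_dict = {'gid': 'gene_0001', 'involved_in': 'RXN1; RXN2'}
annotation = {'kegg.genes': 'lpl:lp_0026', 'ncbiprotein': ['WP_000001.1', 'WP_000002.1']}

for db, annots in annotation.items():
    if isinstance(annots, str):     # annotations arrive either as a string or a list of strings
        annots = [annots]
    row_dict[db] = '; '.join(annots)

# row_dict now holds one string cell per annotation database, ready for pandas.DataFrame.from_records()
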
gsrap-0.8.0/src/gsrap/commons/figures.py (new file; also added as .ipynb_checkpoints/figures-checkpoint.py)
@@ -0,0 +1,119 @@
+from io import BytesIO
+
+import numpy as np
+import pandas as pnd
+
+from scipy.spatial.distance import pdist
+from scipy.cluster.hierarchy import linkage, cut_tree, dendrogram, leaves_list
+
+import matplotlib.pyplot as plt
+from matplotlib.patches import Patch
+
+
+
+def figure_df_C_F1(df_coverage):
+
+
+
+    # prepare the binary matrix:
+    modeled_rs = df_coverage[df_coverage['modeled']==True].index
+    unmodeled_rs = df_coverage[df_coverage['modeled']==False].index
+    # remove useless columns
+    bin_matrix = df_coverage[[i for i in df_coverage.columns if i not in ['map_ids', 'modeled']]]
+    # sort rows: upper rows are present in more strains
+    bin_matrix = bin_matrix.loc[bin_matrix.sum(axis=1).sort_values(ascending=False).index]
+    # split in 2: modeled above, non-modeled below:
+    bin_matrix = pnd.concat([
+        bin_matrix.loc[[i for i in bin_matrix.index if i in modeled_rs], ],
+        bin_matrix.loc[[i for i in bin_matrix.index if i in unmodeled_rs], ]
+    ])
+    strains = bin_matrix.columns
+    bin_matrix = bin_matrix.T # features in column
+
+
+    # pdist() / linkage() will loose the accession information. So here we save a dict:
+    index_to_strain = {i: strain for i, strain in enumerate(bin_matrix.index)}
+
+    # Calculate the linkage matrix using Ward clustering and Jaccard dissimilarity
+    distances = pdist(bin_matrix, 'jaccard')
+    linkage_matrix = linkage(distances, method='ward')
+
+
+    # PART 0: create the frame
+    fig, axs = plt.subplots(
+        nrows=2, ncols=2,
+        figsize=(15, 10),
+        gridspec_kw={ # suplots width proportions.
+            'width_ratios': [0.5, 1.0],
+            'height_ratios': [0.015, 0.985]
+        }
+    )
+
+    # PART 1: dendrogram
+    dn = dendrogram(
+        linkage_matrix, ax=axs[1,0],
+        orientation='left',
+        color_threshold=0, above_threshold_color='black',
+    )
+
+
+    ### PART 2: heatmap
+    ord_leaves = leaves_list(linkage_matrix)
+    ord_leaves = np.flip(ord_leaves) # because leaves are returned in the inverse sense.
+    ord_leaves = [index_to_strain[i] for i in ord_leaves] # convert index as number to index as accession
+    bin_matrix = bin_matrix.loc[ord_leaves, :] # reordered dataframe.
+    axs[1,1].matshow(
+        bin_matrix,
+        cmap='viridis',
+        aspect='auto', # non-squared pixels to fit the axis
+    )
+
+
+    ### PART 3: coverage bar
+    axs[0,1].matshow(
+        df_coverage.loc[bin_matrix.T.index, ['modeled']].T,
+        cmap='cool_r',
+        aspect='auto', # non-squared pixels to fit the axis
+    )
+
+
+    ### PART 4: legends
+    legend_feat = [
+        Patch(facecolor=plt.colormaps.get_cmap('viridis')(0.0), edgecolor='black', label='Absent'),
+        Patch(facecolor=plt.colormaps.get_cmap('viridis')(1.0), edgecolor='black', label='Probably present'),
+    ]
+    legend_cov = [
+        Patch(facecolor=plt.colormaps.get_cmap('cool_r')(0.0), edgecolor='black', label='Not modeled'),
+        Patch(facecolor=plt.colormaps.get_cmap('cool_r')(1.0), edgecolor='black', label='Modeled'),
+    ]
+    l1 = axs[1,0].legend(handles=legend_cov, title='Universe coverage', loc='upper left')
+    l2 = axs[1,0].legend(handles=legend_feat, title='KEGG reaction in strain', loc='lower left')
+    axs[1,0].add_artist(l1) # keep both legends visible
+
+
+    ### PART 5: aesthetics
+    plt.subplots_adjust(wspace=0, hspace=0) # adjust the space between subplots:
+    axs[0,0].axis('off') # remove frame and axis
+    axs[1,0].axis('off') # remove frame and axis
+
+    axs[0,1].yaxis.set_visible(False) # remove ticks, tick labels, axis label
+
+    axs[1,1].xaxis.set_ticks([]) # remove ticks
+    axs[1,1].set_xticklabels([]) # remove tick labels
+    axs[1,1].xaxis.set_label_position("bottom")
+    axs[1,1].set_xlabel("KEGG reactions")
+
+    axs[1,1].yaxis.set_ticks([]) # remove ticks
+    axs[1,1].set_yticklabels([]) # remove tick labels
+    axs[1,1].yaxis.set_label_position("right")
+    axs[1,1].set_ylabel(f"{len(strains)} strains", rotation=270, labelpad=13) # labelpad is in points (1 point = 1/72 inch)
+
+
+    ### PART 6: save fig
+    buf = BytesIO()
+    fig.savefig(buf, dpi=300, bbox_inches='tight') # labelpad is in inches (1 point = 1/72 inch)
+    plt.close(fig)
+    buf.seek(0) # rewind the buffer to the beginning
+
+
+    return buf

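figure_df_C_F1() deliberately returns a BytesIO buffer rather than writing a file, so excelhub.py can embed the rendered heatmap directly into the 'Coverage' sheet through xlsxwriter's image_data option (see worksheet.insert_image() above). A minimal sketch of that hand-off with dummy data; the output file name and the DataFrame below are hypothetical:

from io import BytesIO

import pandas as pnd
import matplotlib.pyplot as plt

# dummy presence/absence matrix standing in for the real coverage table
df = pnd.DataFrame({'strainA': [1, 0], 'strainB': [1, 1]}, index=['R00001', 'R00002'])

fig, ax = plt.subplots(figsize=(4, 2))
ax.matshow(df, cmap='viridis', aspect='auto')
buf = BytesIO()
fig.savefig(buf, dpi=150, bbox_inches='tight')
plt.close(fig)
buf.seek(0)                                   # rewind before handing the buffer to xlsxwriter

with pnd.ExcelWriter('example.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Coverage', index=True)
    writer.sheets['Coverage'].insert_image('E3', 'coverage.png', {'image_data': buf})
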
gsrap-0.8.0/src/gsrap/commons/keggutils.py (new file; also added as .ipynb_checkpoints/keggutils-checkpoint.py)
@@ -0,0 +1,145 @@
+import time
+import os
+import sys
+import pickle
+
+
+import pandas as pnd
+from Bio.KEGG import REST
+
+
+
+def download_keggorg(logger, keggorg='lpl', outdir='./', ):
+
+
+    # check if already downloaded
+    outfile = os.path.join(outdir, f'{keggorg}.keggorg')
+    if os.path.exists(outfile):
+        logger.info(f"Organism code '{keggorg}' already downloaded ('{os.path.join(outdir, f'{keggorg}.keggorg')}').")
+        return 0
+
+
+    # donwload entire txt:
+    logger.info(f"Verifying existence of organism code '{keggorg}' on KEGG...")
+    time.sleep(0.5) # be respectful
+    try: response = REST.kegg_list(keggorg).read()
+    except:
+        logger.error(f"Organism code '{keggorg}' not found in KEGG database.")
+        return 1
+    # response is now a string similar to:
+    """
+    lpl:lp_0026    CDS    31317..32084    hydrolase, HAD superfamily, Cof family
+    lpl:lp_0027    CDS    complement(32236..32907)    pgmB1; beta-phosphoglucomutase
+    """
+
+
+    # extract the gene IDs list:
+    gene_ids = [line.split('\t')[0] for line in response.strip().split('\n')]
+    # example of gene_id: "lpl:lp_0005"
+    logger.info(f"Respectfully downloading {len(gene_ids)} genes from KEGG...")
+
+
+
+    # respectfully download in batch
+    # 10 is the max number of elements that can be downloaded
+    batch_size = 10
+    n_batches = len(gene_ids) // batch_size + (1 if (len(gene_ids) % batch_size) > 0 else 0)
+
+
+    n_attempts = 5
+    attempts_left = n_attempts
+    default_sleep = 0.5
+    sleep_time = default_sleep
+
+
+    completed_batches = 0
+    completed_genes = 0
+    res_string_list = []
+    while completed_batches < n_batches:
+
+        # be respectful
+        time.sleep(sleep_time)
+
+        # extract batch
+        start_index = completed_batches *batch_size
+        end_index = (completed_batches+1) *batch_size
+        if end_index > len(gene_ids): end_index = len(gene_ids)
+        curr_batch = gene_ids[start_index: end_index]
+
+
+        # download batch
+        try:
+            res_string = REST.kegg_get(curr_batch).read()
+            for item in res_string.split("///\n\n"):
+                res_string_list.append(item.replace('///\n', ''))
+            completed_batches += 1
+            completed_genes += len(curr_batch)
+
+            print(f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!", end='\r', file=sys.stderr)
+
+            attempts_left = n_attempts
+            sleep_time = default_sleep
+        except:
+            attempts_left -= 1
+            sleep_time = default_sleep *4 # increase sleep time to be more respectful
+            logger.warning(f"An error occurred during kegg_get() of batch {curr_batch}. Remaining attempts: {attempts_left}.")
+
+
+        if attempts_left == 0:
+            logger.error("No attemps left! Shutting down...")
+            return 1
+
+
+    # hide last progress trace ('sheets_dicts' unused if not in multi-strain mode):
+    last_trace = f"{completed_genes}/{len(gene_ids)} ({int(completed_genes/len(gene_ids)*100)}%) completed!"
+    whitewash = ''.join([' ' for i in range(len(last_trace))])
+    print(whitewash, end='\r', file=sys.stderr)
+
+
+
+    # extract info into a formatted df:
+    df = [] # list of dicts, future df
+    for entry in res_string_list:
+
+        entry_dict = {}
+        curr_header = None
+
+        for line in entry.split('\n'):
+            if line == '': continue
+
+            header = line[:12]
+            content = line[12:]
+            if header != ' '*12:
+                curr_header = header
+
+            if curr_header == 'ENTRY       ':
+                gid = content.split(' ', 1)[0]
+                entry_dict['gid'] = gid
+
+            if curr_header == 'POSITION    ':
+                entry_dict['pos'] = content.strip()
+
+            if curr_header == 'ORTHOLOGY   ':
+                ko = content.split(' ', 1)[0]
+                entry_dict['ko'] = ko
+
+            if curr_header == 'MOTIF       ':
+                db, value = content.strip().split(': ', 1)
+                entry_dict[db] = value.split(' ')
+
+            if curr_header == 'DBLINKS     ':
+                db, value = content.strip().split(': ', 1)
+                entry_dict[db] = value.split(' ')
+
+        df.append(entry_dict)
+    df = pnd.DataFrame.from_records(df)
+
+
+    # save dataframe in the output dir:
+    with open(outfile, 'wb') as wb_handler:
+        pickle.dump(df, wb_handler)
+    logger.info(f"'{outfile}' created!")
+
+
+
+    return 0

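download_keggorg() writes its result as a pickled pandas DataFrame named '<keggorg>.keggorg' in the output directory and returns 0 on success (1 on failure). A hedged usage sketch, with a plain logging.Logger standing in for the logger object gsrap normally passes around:

import logging
import pickle

from gsrap.commons.keggutils import download_keggorg

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('gsrap')

# batched, rate-limited download of every gene entry of KEGG organism 'lpl'
if download_keggorg(logger, keggorg='lpl', outdir='./') == 0:
    with open('./lpl.keggorg', 'rb') as rb_handler:
        df = pickle.load(rb_handler)
    print(df.head())    # columns such as 'gid', 'pos' and 'ko', plus MOTIF/DBLINKS databases when present
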