mutts 1.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mutts-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,18 @@
+ Metadata-Version: 2.3
+ Name: mutts
+ Version: 1.0.0
+ Summary: Metadata for User facility Template Transformations
+ Author: Sujay Patil
+ Author-email: spatil@lbl.gov
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: click (>=8.1.3,<9.0.0)
+ Requires-Dist: openpyxl (>=3.0.10,<4.0.0)
+ Requires-Dist: pandas (>=1.5.2,<2.0.0)
+ Requires-Dist: python-dotenv (>=0.21.1,<0.22.0)
+ Requires-Dist: requests (>=2.28.2,<3.0.0)
mutts-1.0.0/pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [tool.poetry]
+ name = "mutts"
+ version = "1.0.0"
+ description = "Metadata for User facility Template Transformations"
+ authors = [
+     "Sujay Patil <spatil@lbl.gov>",
+     "Cristina Stone Pedraza <cristina.stonepedraza@pnnl.gov>",
+     "Montana Smith <montana.smith@pnnl.gov>",
+ ]
+ packages = [{include = "mutts", from = "src"}]
+
+ [tool.poetry.scripts]
+ mutts = "mutts.cli:cli"
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ pandas = "^1.5.2"
+ openpyxl = "^3.0.10"
+ click = "^8.1.3"
+ python-dotenv = "^0.21.1"
+ requests = "^2.28.2"
+
+ [tool.poetry.dev-dependencies]
+ black = "^22.12.0"
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
mutts-1.0.0/src/mutts/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """MUTTs - Metadata for User facility Template Transformations"""
+
+ from mutts.retriever import MetadataRetriever
+ from mutts.spreadsheet import SpreadsheetCreator
+
+ __all__ = ["MetadataRetriever", "SpreadsheetCreator"]
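The two classes re-exported here are the package's public API, so the transformation can also be driven without the CLI. A minimal sketch of that path, under stated assumptions: the submission ID, mapper, and output path below are hypothetical placeholders, and a .env file supplying SUBMISSION_PORTAL_BASE_URL and DATA_PORTAL_REFRESH_TOKEN must be discoverable as described in retriever.py further down.

# Sketch: programmatic use of the public API; the submission ID, mapper,
# and output path are hypothetical placeholders, not values from this package.
from mutts import MetadataRetriever, SpreadsheetCreator

retriever = MetadataRetriever("00000000-0000-0000-0000-000000000000", "jgi_mg")
metadata_df = retriever.retrieve_metadata_records("samp_name")

mapper = {"Sample Name": {"header": "Sample Name", "sub_port_mapping": "samp_name"}}
creator = SpreadsheetCreator("jgi_mg", mapper, metadata_df)
creator.create_spreadsheet(header=False).to_excel("jgi_mg_template.xlsx", index=False)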
mutts-1.0.0/src/mutts/cli.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import os
+
+ import click
+ import pandas as pd
+ from dotenv import load_dotenv, dotenv_values
+ from openpyxl.styles import Alignment
+ from typing import Dict, List, Union
+
+ from mutts.retriever import MetadataRetriever
+ from mutts.spreadsheet import SpreadsheetCreator
+
+
+ def format_worksheet(worksheet):
+     """
+     Apply formatting to a worksheet for better readability.
+
+     :param worksheet: The openpyxl worksheet to format.
+     """
+     # Enable text wrapping and adjust column widths
+     for column in worksheet.columns:
+         max_length = 0
+         column_letter = column[0].column_letter
+
+         for cell in column:
+             # Enable text wrapping for all cells
+             cell.alignment = Alignment(wrap_text=True, vertical='top')
+
+             # Calculate max length for column width
+             try:
+                 if cell.value:
+                     cell_length = len(str(cell.value))
+                     if cell_length > max_length:
+                         max_length = cell_length
+             except Exception:
+                 pass
+
+         # Set column width with reasonable limits (min 10, max 50)
+         adjusted_width = min(max(max_length + 2, 10), 50)
+         worksheet.column_dimensions[column_letter].width = adjusted_width
+
+
+ @click.command()
+ @click.option("--submission", "-s", required=True, help="Metadata submission id.")
+ @click.option(
+     "--user-facility", "-u", required=True, help="User facility to send data to."
+ )
+ @click.option("--header/--no-header", "-h", default=False, show_default=True)
+ @click.option(
+     "--mapper",
+     "-m",
+     required=True,
+     type=click.Path(exists=True),
+     help="Path to user facility specific JSON file.",
+ )
+ @click.option(
+     "--unique-field",
+     "-uf",
+     required=True,
+     help="Unique field to identify the metadata records.",
+ )
+ @click.option(
+     "--output",
+     "-o",
+     required=True,
+     help="Path to result output XLSX file.",
+ )
+ def cli(
+     submission: str,
+     user_facility: str,
+     header: bool,
+     mapper: str,
+     unique_field: str,
+     output: str,
+ ) -> None:
+     """
+     Command-line interface for creating a spreadsheet based on metadata records.
+
+     :param submission: The ID of the metadata submission.
+     :param user_facility: The user facility to send data to.
+     :param header: True if the headers should be included, False otherwise.
+     :param mapper: Path to the JSON mapper specifying column mappings.
+     :param unique_field: Unique field to identify the metadata records.
+     :param output: Path to the output XLSX file.
+     """
+     load_dotenv()
+     env_path = os.path.join(os.getcwd(), ".env")
+     env_vars = dotenv_values(env_path)
+     for key, value in env_vars.items():
+         os.environ[key] = value
+
+     metadata_retriever = MetadataRetriever(submission, user_facility)
+     metadata_df = metadata_retriever.retrieve_metadata_records(unique_field)
+
+     with open(mapper, "r") as f:
+         json_mapper: Dict[str, Dict[str, Union[str, List[str]]]] = json.load(f)
+
+     spreadsheet_creator = SpreadsheetCreator(user_facility, json_mapper, metadata_df)
+     user_facility_spreadsheet = spreadsheet_creator.create_spreadsheet(header)
+
+     # Write the main data sheet and copy static sheets from the template
+     with pd.ExcelWriter(output, engine='openpyxl') as writer:
+         # Write the generated data to 'DATA SHEET'
+         user_facility_spreadsheet.to_excel(writer, index=False, sheet_name='DATA SHEET')
+
+         # Path to the static JGI v15 Excel template
+         static_excel_path = os.path.join(
+             os.path.dirname(__file__), '..', '..',
+             'input-files', 'static-excel-tabs', 'JGI.Metagenome.NA.v15.xlsx'
+         )
+
+         # Copy the INSTRUCTIONS and PLATE LOCATIONS sheets from the JGI v15
+         # static template file if it exists
+         if os.path.exists(static_excel_path):
+             static_excel = pd.ExcelFile(static_excel_path)
+             if 'INSTRUCTIONS' in static_excel.sheet_names:
+                 instructions_df = pd.read_excel(static_excel, 'INSTRUCTIONS')
+                 instructions_df.to_excel(writer, index=False, sheet_name='INSTRUCTIONS')
+             if 'PLATE LOCATIONS' in static_excel.sheet_names:
+                 plate_locations_df = pd.read_excel(static_excel, 'PLATE LOCATIONS')
+                 plate_locations_df.to_excel(writer, index=False, sheet_name='PLATE LOCATIONS')
+
+         # Apply formatting to all sheets
+         for sheet_name in writer.book.sheetnames:
+             worksheet = writer.book[sheet_name]
+             format_worksheet(worksheet)
+
+
+ if __name__ == "__main__":
+     cli()
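Because the entry point is a click command (exposed as the `mutts` console script in pyproject.toml), it can be exercised in-process with click's test runner. A sketch under stated assumptions: every argument value below is a hypothetical placeholder, and the mapper path must point at an existing JSON file since the option is declared with click.Path(exists=True).

# Sketch: invoking the CLI in-process; all argument values are placeholders.
from click.testing import CliRunner

from mutts.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "--submission", "00000000-0000-0000-0000-000000000000",
        "--user-facility", "jgi_mg",
        "--mapper", "mappers/jgi_mg.json",  # hypothetical path; must exist on disk
        "--unique-field", "samp_name",
        "--output", "jgi_mg_template.xlsx",
        "--no-header",
    ],
)
print(result.exit_code, result.output)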
mutts-1.0.0/src/mutts/retriever.py ADDED
@@ -0,0 +1,212 @@
+ import calendar
+ import os
+
+ import pandas as pd
+ import requests
+
+ from typing import Dict, Any
+ from dotenv import dotenv_values
+
+
+ class MetadataRetriever:
+     """
+     Retrieves metadata records from a given submission ID and user facility.
+     """
+
+     USER_FACILITY_DICT: Dict[str, str] = {
+         "emsl": "emsl_data",
+         "jgi_mg": "jgi_mg_data",
+         "jgi_mg_lr": "jgi_mg_lr_data",
+         "jgi_mt": "jgi_mt_data",
+     }
+
+     def __init__(self, metadata_submission_id: str, user_facility: str) -> None:
+         """
+         Initialize the MetadataRetriever.
+
+         :param metadata_submission_id: The ID of the metadata submission.
+         :param user_facility: The user facility to retrieve data from.
+         """
+         self.metadata_submission_id = metadata_submission_id
+         self.user_facility = user_facility
+         self.load_and_set_env_vars()
+         self.base_url = self.env.get("SUBMISSION_PORTAL_BASE_URL")
+
+     def load_and_set_env_vars(self):
+         """Loads and sets environment variables from the .env file."""
+         env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
+         env_vars = dotenv_values(env_path)
+         for key, value in env_vars.items():
+             os.environ[key] = value
+
+         self.env: Dict[str, str] = dict(os.environ)
+
+     def retrieve_metadata_records(self, unique_field: str) -> pd.DataFrame:
+         """
+         Retrieves the metadata records for the given submission ID and user facility.
+
+         :return: The retrieved metadata records as a Pandas DataFrame.
+         """
+         self.load_and_set_env_vars()
+
+         refresh_response = requests.post(
+             f"{self.base_url}/auth/refresh",
+             json={"refresh_token": self.env["DATA_PORTAL_REFRESH_TOKEN"]},
+         )
+         refresh_response.raise_for_status()
+         refresh_body = refresh_response.json()
+         access_token = refresh_body["access_token"]
+
+         headers = {
+             "content-type": "application/json; charset=UTF-8",
+             "Authorization": f"Bearer {access_token}",
+         }
+         response: Dict[str, Any] = requests.get(
+             f"{self.base_url}/api/metadata_submission/{self.metadata_submission_id}",
+             headers=headers,
+         ).json()
+
+         # Get user-facility key data
+         common_df: pd.DataFrame = pd.DataFrame()
+         if self.user_facility in self.USER_FACILITY_DICT:
+             user_facility_data: Dict[str, Any] = response["metadata_submission"][
+                 "sampleData"
+             ].get(self.USER_FACILITY_DICT[self.user_facility], {})
+             common_df = pd.DataFrame(user_facility_data)
+
+         # Check if common_df is empty
+         if common_df.empty:
+             raise ValueError(
+                 f"No key {self.user_facility} exists in submission metadata record {self.metadata_submission_id}"
+             )
+         else:
+             df = common_df
+
+         # Find non-user-facility keys (i.e., plant_associated, water, etc.)
+         all_keys_data = response["metadata_submission"]["sampleData"]
+         user_facility_keys = [
+             "emsl_data",
+             "jgi_mg_data",
+             "jgi_mg_lr_data",
+             "jgi_mt_data",
+         ]
+         sample_data_keys = [
+             key for key in all_keys_data if key not in user_facility_keys
+         ]
+
+         # Create an empty list to store dataframes for each key
+         sample_data_dfs = []
+
+         # Loop through resulting keys and combine with common_df by samp_name
+         for key in sample_data_keys:
+
+             sample_data: Dict[str, Any] = response["metadata_submission"][
+                 "sampleData"
+             ].get(key, {})
+
+             # Begin collecting detailed sample data
+
+             # If there's sample data, create a DataFrame and add it to the list
+             if sample_data:
+                 sample_data_df = pd.DataFrame(sample_data)
+
+                 # Add the non-UF key name into the df for the 'Sample Isolated From' col in JGI MG/MT
+                 sample_data_df["sample_isolated_from"] = key
+                 # Append to the list of dfs
+                 sample_data_dfs.append(sample_data_df)
+
+         # Concatenate sample dataframes into one (if they exist)
+         if sample_data_dfs:
+             all_sample_data_df = pd.concat(sample_data_dfs, ignore_index=True)
+             # Merge the combined sample data with df on samp_name
+             if not df.empty and not all_sample_data_df.empty:
+                 df = pd.merge(df, all_sample_data_df, on="samp_name", how="outer")
+
+         # Auto-fill depth with 0 for JGI facilities if no value is provided
+         if self.user_facility in ["jgi_mg", "jgi_mt", "jgi_mg_lr"]:
+             if "depth" not in df.columns:
+                 df["depth"] = 0
+             else:
+                 df["depth"] = df["depth"].fillna(0)
+
+         for index, row in df.iterrows():
+
+             if "lat_lon" in df.columns:
+
+                 # Check if lat_lon is NaN before trying to split it
+                 if pd.isnull(row["lat_lon"]):
+                     df.at[index, "latitude"] = None
+                     df.at[index, "longitude"] = None
+                 else:
+                     values = str(row["lat_lon"]).split(" ", 1)
+                     # Assign the split values back to the row
+                     df.at[index, "latitude"] = values[0]
+                     df.at[index, "longitude"] = values[1]
+
+             if "depth" in df.columns:
+
+                 # Case - different delimiters used
+                 row["depth"] = str(row["depth"]).replace("-", " - ")
+
+                 # Case - only one value provided for depth (single value will be max and min)
+                 # The value is a string here because of the str() coercion above
+                 if isinstance(row["depth"], str):
+                     values = row["depth"].split(" - ")
+                     # Check if only one value
+                     if len(values) == 1:
+                         df.at[index, "minimum_depth"] = float(values[0])
+                         df.at[index, "maximum_depth"] = float(values[0])
+                     # Check if it's a range
+                     elif len(values) == 2:
+                         df.at[index, "minimum_depth"] = float(values[0])
+                         df.at[index, "maximum_depth"] = float(values[1])
+                 else:
+                     df.at[index, "minimum_depth"] = row["depth"]
+                     df.at[index, "maximum_depth"] = row["depth"]
+
+         if "geo_loc_name" in df.columns:
+             df["country_name"] = df["geo_loc_name"].str.split(":").str[0]
+
+         if "collection_date" in df.columns:
+             df["collection_year"] = df["collection_date"].str.split("-").str[0]
+             df["collection_month"] = df["collection_date"].str.split("-").str[1]
+             df["collection_day"] = df["collection_date"].str.split("-").str[2]
+
+             # Safely map collection_month to month_name (account for NaN values)
+             def get_month_name(month):
+                 try:
+                     return calendar.month_name[int(month)]
+                 except (ValueError, TypeError):
+                     return ""  # return empty string for invalid cases
+
+             df["collection_month_name"] = df["collection_month"].apply(get_month_name)
+
+         # Ensure 'analysis_type' exists in df before modifying it
+         if "analysis_type" in df.columns:
+             df["analysis_type"] = df["analysis_type"].apply(
+                 lambda x: "; ".join(x) if isinstance(x, list) else x
+             )
+
+         # Address 'Was sample DNAse treated?' col
+         # Change from 'yes/no' to 'Y/N'
+         if self.user_facility in ["jgi_mg", "jgi_mt"] and "dnase" in df.columns:
+             df.loc[df["dnase"] == "yes", "dnase"] = "Y"
+             df.loc[df["dnase"] == "no", "dnase"] = "N"
+
+         # Address standardizing "USA" country name for MG and MT
+         # Replace "country_name" with "USA" if it exists
+         usa_names = [
+             "United States",
+             "United States of America",
+             "US",
+             "America",
+             "usa",
+             "united states",
+             "united states of america",
+             "us",
+             "america",
+         ]
+         if self.user_facility in ["jgi_mg", "jgi_mt"] and "country_name" in df.columns:
+             df["country_name"] = df["country_name"].replace(usa_names, "USA")
+
+         return df
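The normalization steps in retrieve_metadata_records derive several columns from the raw submission fields. A worked illustration on one hypothetical record (all values below are made up; the expected outputs follow directly from the code above):

import pandas as pd

# One hypothetical jgi_mg record before normalization.
record = pd.DataFrame(
    {
        "samp_name": ["soil-01"],
        "lat_lon": ["38.8977 -77.0365"],    # -> latitude / longitude
        "depth": ["0.1-0.5"],               # -> minimum_depth / maximum_depth
        "geo_loc_name": ["USA: Maryland"],  # -> country_name (text before ':')
        "collection_date": ["2021-06-15"],  # -> year / month / day / month name
        "dnase": ["yes"],                   # -> 'Y' for jgi_mg / jgi_mt
    }
)
# Per the transformations above, the returned frame would gain:
#   latitude "38.8977", longitude "-77.0365",
#   minimum_depth 0.1, maximum_depth 0.5,
#   country_name "USA", collection_year "2021", collection_month "06",
#   collection_day "15", collection_month_name "June", dnase "Y"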
mutts-1.0.0/src/mutts/spreadsheet.py ADDED
@@ -0,0 +1,125 @@
+ import pandas as pd
+ from typing import Dict, List, Union
+
+
+ class SpreadsheetCreator:
+     """
+     Creates a spreadsheet based on a JSON mapper and metadata DataFrame.
+     """
+
+     def __init__(
+         self,
+         user_facility: str,
+         json_mapper: Dict[str, Dict[str, Union[str, List[str]]]],
+         metadata_df: pd.DataFrame,
+     ) -> None:
+         """
+         Initialize the SpreadsheetCreator.
+
+         :param json_mapper: The JSON mapper specifying column mappings.
+         :param metadata_df: The metadata DataFrame to create the spreadsheet from.
+         """
+         self.user_facility = user_facility
+         self.json_mapper = json_mapper
+         self.metadata_df = metadata_df
+
+     def combine_headers_df(self, header: bool) -> pd.DataFrame:
+         """
+         Combines and formats the headers DataFrame.
+
+         :param header: True if the headers should be included, False otherwise.
+         :return: The combined headers DataFrame.
+         """
+         d: Dict[str, List[Union[str, List[str]]]] = {}
+         for k, v in self.json_mapper.items():
+             header_values: List[Union[str, List[str]]] = [
+                 h for h_n, h in v.items() if h_n != "sub_port_mapping"
+             ]
+             d[k] = header_values
+
+         headers_df: pd.DataFrame = pd.DataFrame(d)
+
+         if header:
+             last_row = headers_df.iloc[-1]
+             column_values: List[str] = list(last_row)
+
+             headers_df = headers_df.drop(headers_df.index[-1])
+             headers_df.loc[len(headers_df)] = headers_df.columns.to_list()
+             headers_df.columns = column_values
+
+             shift = 1
+             headers_df = pd.concat(
+                 [headers_df.iloc[-shift:], headers_df.iloc[:-shift]], ignore_index=True
+             )
+
+         return headers_df
+
+     def combine_sample_rows_df(self) -> pd.DataFrame:
+         """
+         Combines and formats the sample rows DataFrame.
+
+         :return: The combined sample rows DataFrame.
+         """
+         rows_df: pd.DataFrame = pd.DataFrame()
+         for k, v in self.json_mapper.items():
+             if (
+                 "sub_port_mapping" in v
+                 and v["sub_port_mapping"] in self.metadata_df.columns.to_list()
+             ):
+                 if "header" in v:
+                     rows_df[v["header"]] = self.metadata_df[v["sub_port_mapping"]]
+                 else:
+                     rows_df[k] = self.metadata_df[v["sub_port_mapping"]]
+
+         return rows_df
+
+     def combine_headers_and_rows(
+         self, headers_df: pd.DataFrame, rows_df: pd.DataFrame
+     ) -> pd.DataFrame:
+         """
+         Combines the headers and sample rows DataFrames.
+
+         :param headers_df: The headers DataFrame.
+         :param rows_df: The sample rows DataFrame.
+         :return: The combined DataFrame.
+         """
+
+         # Account for the specialized EMSL user facility mapping:
+         if self.user_facility == "emsl":
+
+             # Extract the header mapping keywords and column titles from headers_df
+             # These will be used to map the info in rows_df into the new df
+             mapping_keywords = headers_df.iloc[2].values
+             column_titles = headers_df.columns
+
+             # Go through rows_df data and select cols where the mapping keywords match
+             # (exist in both headers_df and rows_df), and insert NaN for missing data
+             matched_data = {
+                 title: rows_df.get(keyword, pd.Series([None] * len(rows_df)))
+                 for title, keyword in zip(column_titles, mapping_keywords)
+             }
+
+             # Create a new df for the aligned column data
+             matching_rows_df = pd.DataFrame(matched_data)
+
+             # Combine the aligned data with headers_df by keeping the header and
+             # appending the aligned rows_df data
+             combined = pd.concat([headers_df, matching_rows_df], ignore_index=True)
+
+             return combined
+
+         # Otherwise, JGI user facility:
+         else:
+             return pd.concat([headers_df, rows_df], ignore_index=True)
+
+     def create_spreadsheet(self, header: bool) -> pd.DataFrame:
+         """
+         Creates the spreadsheet based on the JSON mapper and metadata DataFrame.
+
+         :param header: True if the headers should be included, False otherwise.
+         :return: The created spreadsheet.
+         """
+         headers_df = self.combine_headers_df(header)
+         rows_df = self.combine_sample_rows_df()
+         spreadsheet = self.combine_headers_and_rows(headers_df, rows_df)
+         return spreadsheet
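The mapper shape this class consumes can be read off combine_headers_df and combine_sample_rows_df: each top-level key names a template column, every entry other than sub_port_mapping contributes a header row, the optional header value becomes the output column title, and sub_port_mapping names the submission-portal column to copy sample values from. A hypothetical two-column mapper as a sketch (keys and values are illustrative, not taken from a real facility template):

example_mapper = {
    "Sample Name": {
        "section": "Required metadata",   # non-sub_port_mapping entries become header rows
        "header": "Sample Name",          # output column title used by combine_sample_rows_df
        "sub_port_mapping": "samp_name",  # metadata_df column to copy values from
    },
    "Collection Date": {
        "section": "Required metadata",
        "header": "Collection Date",
        "sub_port_mapping": "collection_date",
    },
}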