mutts 1.0.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mutts-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,18 @@
+ Metadata-Version: 2.3
+ Name: mutts
+ Version: 1.0.0
+ Summary: Metadata for User facility Template Transformations
+ Author: Sujay Patil
+ Author-email: spatil@lbl.gov
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: click (>=8.1.3,<9.0.0)
+ Requires-Dist: openpyxl (>=3.0.10,<4.0.0)
+ Requires-Dist: pandas (>=1.5.2,<2.0.0)
+ Requires-Dist: python-dotenv (>=0.21.1,<0.22.0)
+ Requires-Dist: requests (>=2.28.2,<3.0.0)
mutts-1.0.0/pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [tool.poetry]
+ name = "mutts"
+ version = "1.0.0"
+ description = "Metadata for User facility Template Transformations"
+ authors = [
+     "Sujay Patil <spatil@lbl.gov>",
+     "Cristina Stone Pedraza <cristina.stonepedraza@pnnl.gov>",
+     "Montana Smith <montana.smith@pnnl.gov>",
+ ]
+ packages = [{include = "mutts", from = "src"}]
+
+ [tool.poetry.scripts]
+ mutts = "mutts.cli:cli"
+
+ [tool.poetry.dependencies]
+ python = "^3.9"
+ pandas = "^1.5.2"
+ openpyxl = "^3.0.10"
+ click = "^8.1.3"
+ python-dotenv = "^0.21.1"
+ requests = "^2.28.2"
+
+ [tool.poetry.dev-dependencies]
+ black = "^22.12.0"
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
mutts-1.0.0/src/mutts/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """MUTTs - Metadata for User facility Template Transformations"""
+
+ from mutts.retriever import MetadataRetriever
+ from mutts.spreadsheet import SpreadsheetCreator
+
+ __all__ = ["MetadataRetriever", "SpreadsheetCreator"]
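The two classes re-exported here are the package's public API, so the transformation can also be driven without the CLI. A minimal sketch of that path, under stated assumptions: the submission ID, mapper, and output path below are hypothetical placeholders, and a .env file supplying SUBMISSION_PORTAL_BASE_URL and DATA_PORTAL_REFRESH_TOKEN must be discoverable as described in retriever.py further down.

# Sketch: programmatic use of the public API; the submission ID, mapper,
# and output path are hypothetical placeholders, not values from this package.
from mutts import MetadataRetriever, SpreadsheetCreator

retriever = MetadataRetriever("00000000-0000-0000-0000-000000000000", "jgi_mg")
metadata_df = retriever.retrieve_metadata_records("samp_name")

mapper = {"Sample Name": {"header": "Sample Name", "sub_port_mapping": "samp_name"}}
creator = SpreadsheetCreator("jgi_mg", mapper, metadata_df)
creator.create_spreadsheet(header=False).to_excel("jgi_mg_template.xlsx", index=False)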
mutts-1.0.0/src/mutts/cli.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import os
+
+ import click
+ import pandas as pd
+ from dotenv import load_dotenv, dotenv_values
+ from openpyxl.styles import Alignment
+ from typing import Dict, List, Union
+
+ from mutts.retriever import MetadataRetriever
+ from mutts.spreadsheet import SpreadsheetCreator
+
+
+ def format_worksheet(worksheet):
+     """
+     Apply formatting to a worksheet for better readability.
+
+     :param worksheet: The openpyxl worksheet to format.
+     """
+     # Enable text wrapping and adjust column widths
+     for column in worksheet.columns:
+         max_length = 0
+         column_letter = column[0].column_letter
+
+         for cell in column:
+             # Enable text wrapping for all cells
+             cell.alignment = Alignment(wrap_text=True, vertical='top')
+
+             # Calculate max length for column width
+             try:
+                 if cell.value:
+                     cell_length = len(str(cell.value))
+                     if cell_length > max_length:
+                         max_length = cell_length
+             except Exception:
+                 pass
+
+         # Set column width with reasonable limits (min 10, max 50)
+         adjusted_width = min(max(max_length + 2, 10), 50)
+         worksheet.column_dimensions[column_letter].width = adjusted_width
+
+
+ @click.command()
+ @click.option("--submission", "-s", required=True, help="Metadata submission id.")
+ @click.option(
+     "--user-facility", "-u", required=True, help="User facility to send data to."
+ )
+ @click.option("--header/--no-header", "-h", default=False, show_default=True)
+ @click.option(
+     "--mapper",
+     "-m",
+     required=True,
+     type=click.Path(exists=True),
+     help="Path to user facility specific JSON file.",
+ )
+ @click.option(
+     "--unique-field",
+     "-uf",
+     required=True,
+     help="Unique field to identify the metadata records.",
+ )
+ @click.option(
+     "--output",
+     "-o",
+     required=True,
+     help="Path to result output XLSX file.",
+ )
+ def cli(
+     submission: str,
+     user_facility: str,
+     header: bool,
+     mapper: str,
+     unique_field: str,
+     output: str,
+ ) -> None:
+     """
+     Command-line interface for creating a spreadsheet based on metadata records.
+
+     :param submission: The ID of the metadata submission.
+     :param user_facility: The user facility to send data to.
+     :param header: True if the headers should be included, False otherwise.
+     :param mapper: Path to the JSON mapper specifying column mappings.
+     :param unique_field: Unique field to identify the metadata records.
+     :param output: Path to the output XLSX file.
+     """
+     load_dotenv()
+     env_path = os.path.join(os.getcwd(), ".env")
+     env_vars = dotenv_values(env_path)
+     for key, value in env_vars.items():
+         os.environ[key] = value
+
+     metadata_retriever = MetadataRetriever(submission, user_facility)
+     metadata_df = metadata_retriever.retrieve_metadata_records(unique_field)
+
+     with open(mapper, "r") as f:
+         json_mapper: Dict[str, Dict[str, Union[str, List[str]]]] = json.load(f)
+
+     spreadsheet_creator = SpreadsheetCreator(user_facility, json_mapper, metadata_df)
+     user_facility_spreadsheet = spreadsheet_creator.create_spreadsheet(header)
+
+     # Write the main data sheet and copy static sheets from the template
+     with pd.ExcelWriter(output, engine='openpyxl') as writer:
+         # Write the generated data to 'DATA SHEET'
+         user_facility_spreadsheet.to_excel(writer, index=False, sheet_name='DATA SHEET')
+
+         # Path to the static JGI v15 Excel template
+         static_excel_path = os.path.join(
+             os.path.dirname(__file__), '..', '..',
+             'input-files', 'static-excel-tabs', 'JGI.Metagenome.NA.v15.xlsx'
+         )
+
+         # Copy the INSTRUCTIONS and PLATE LOCATIONS sheets from the JGI v15
+         # static template file if it exists
+         if os.path.exists(static_excel_path):
+             static_excel = pd.ExcelFile(static_excel_path)
+             if 'INSTRUCTIONS' in static_excel.sheet_names:
+                 instructions_df = pd.read_excel(static_excel, 'INSTRUCTIONS')
+                 instructions_df.to_excel(writer, index=False, sheet_name='INSTRUCTIONS')
+             if 'PLATE LOCATIONS' in static_excel.sheet_names:
+                 plate_locations_df = pd.read_excel(static_excel, 'PLATE LOCATIONS')
+                 plate_locations_df.to_excel(writer, index=False, sheet_name='PLATE LOCATIONS')
+
+         # Apply formatting to all sheets
+         for sheet_name in writer.book.sheetnames:
+             worksheet = writer.book[sheet_name]
+             format_worksheet(worksheet)
+
+
+ if __name__ == "__main__":
+     cli()
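Because the entry point is a click command (exposed as the `mutts` console script in pyproject.toml), it can be exercised in-process with click's test runner. A sketch under stated assumptions: every argument value below is a hypothetical placeholder, and the mapper path must point at an existing JSON file since the option is declared with click.Path(exists=True).

# Sketch: invoking the CLI in-process; all argument values are placeholders.
from click.testing import CliRunner

from mutts.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "--submission", "00000000-0000-0000-0000-000000000000",
        "--user-facility", "jgi_mg",
        "--mapper", "mappers/jgi_mg.json",  # hypothetical path; must exist on disk
        "--unique-field", "samp_name",
        "--output", "jgi_mg_template.xlsx",
        "--no-header",
    ],
)
print(result.exit_code, result.output)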
mutts-1.0.0/src/mutts/retriever.py ADDED
@@ -0,0 +1,212 @@
+ import calendar
+ import os
+
+ import pandas as pd
+ import requests
+
+ from typing import Dict, Any
+ from dotenv import dotenv_values
+
+
+ class MetadataRetriever:
+     """
+     Retrieves metadata records from a given submission ID and user facility.
+     """
+
+     USER_FACILITY_DICT: Dict[str, str] = {
+         "emsl": "emsl_data",
+         "jgi_mg": "jgi_mg_data",
+         "jgi_mg_lr": "jgi_mg_lr_data",
+         "jgi_mt": "jgi_mt_data",
+     }
+
+     def __init__(self, metadata_submission_id: str, user_facility: str) -> None:
+         """
+         Initialize the MetadataRetriever.
+
+         :param metadata_submission_id: The ID of the metadata submission.
+         :param user_facility: The user facility to retrieve data from.
+         """
+         self.metadata_submission_id = metadata_submission_id
+         self.user_facility = user_facility
+         self.load_and_set_env_vars()
+         self.base_url = self.env.get("SUBMISSION_PORTAL_BASE_URL")
+
+     def load_and_set_env_vars(self):
+         """Loads and sets environment variables from the .env file."""
+         env_path = os.path.join(os.path.dirname(__file__), "..", "..", ".env")
+         env_vars = dotenv_values(env_path)
+         for key, value in env_vars.items():
+             os.environ[key] = value
+
+         self.env: Dict[str, str] = dict(os.environ)
+
+     def retrieve_metadata_records(self, unique_field: str) -> pd.DataFrame:
+         """
+         Retrieves the metadata records for the given submission ID and user facility.
+
+         :return: The retrieved metadata records as a Pandas DataFrame.
+         """
+         self.load_and_set_env_vars()
+
+         refresh_response = requests.post(
+             f"{self.base_url}/auth/refresh",
+             json={"refresh_token": self.env["DATA_PORTAL_REFRESH_TOKEN"]},
+         )
+         refresh_response.raise_for_status()
+         refresh_body = refresh_response.json()
+         access_token = refresh_body["access_token"]
+
+         headers = {
+             "content-type": "application/json; charset=UTF-8",
+             "Authorization": f"Bearer {access_token}",
+         }
+         response: Dict[str, Any] = requests.get(
+             f"{self.base_url}/api/metadata_submission/{self.metadata_submission_id}",
+             headers=headers,
+         ).json()
+
+         # Get user-facility key data
+         common_df: pd.DataFrame = pd.DataFrame()
+         if self.user_facility in self.USER_FACILITY_DICT:
+             user_facility_data: Dict[str, Any] = response["metadata_submission"][
+                 "sampleData"
+             ].get(self.USER_FACILITY_DICT[self.user_facility], {})
+             common_df = pd.DataFrame(user_facility_data)
+
+         # Check if common_df is empty
+         if common_df.empty:
+             raise ValueError(
+                 f"No key {self.user_facility} exists in submission metadata record {self.metadata_submission_id}"
+             )
+         else:
+             df = common_df
+
+         # Find non-user-facility keys (i.e., plant_associated, water, etc.)
+         all_keys_data = response["metadata_submission"]["sampleData"]
+         user_facility_keys = [
+             "emsl_data",
+             "jgi_mg_data",
+             "jgi_mg_lr_data",
+             "jgi_mt_data",
+         ]
+         sample_data_keys = [
+             key for key in all_keys_data if key not in user_facility_keys
+         ]
+
+         # Create an empty list to store dataframes for each key
+         sample_data_dfs = []
+
+         # Loop through resulting keys and combine with common_df by samp_name
+         for key in sample_data_keys:
+
+             sample_data: Dict[str, Any] = response["metadata_submission"][
+                 "sampleData"
+             ].get(key, {})
+
+             # Begin collecting detailed sample data
+
+             # If there's sample data, create a DataFrame and add it to the list
+             if sample_data:
+                 sample_data_df = pd.DataFrame(sample_data)
+
+                 # Add the non-UF key name into the df for the 'Sample Isolated From' col in JGI MG/MT
+                 sample_data_df["sample_isolated_from"] = key
+                 # Append to the list of dfs
+                 sample_data_dfs.append(sample_data_df)
+
+         # Concatenate sample dataframes into one (if they exist)
+         if sample_data_dfs:
+             all_sample_data_df = pd.concat(sample_data_dfs, ignore_index=True)
+             # Merge the combined sample data with df on samp_name
+             if not df.empty and not all_sample_data_df.empty:
+                 df = pd.merge(df, all_sample_data_df, on="samp_name", how="outer")
+
+         # Auto-fill depth with 0 for JGI facilities if no value is provided
+         if self.user_facility in ["jgi_mg", "jgi_mt", "jgi_mg_lr"]:
+             if "depth" not in df.columns:
+                 df["depth"] = 0
+             else:
+                 df["depth"] = df["depth"].fillna(0)
+
+         for index, row in df.iterrows():
+
+             if "lat_lon" in df.columns:
+
+                 # Check if lat_lon is NaN before trying to split it
+                 if pd.isnull(row["lat_lon"]):
+                     df.at[index, "latitude"] = None
+                     df.at[index, "longitude"] = None
+                 else:
+                     values = str(row["lat_lon"]).split(" ", 1)
+                     # Assign the split values back to the row
+                     df.at[index, "latitude"] = values[0]
+                     df.at[index, "longitude"] = values[1]
+
+             if "depth" in df.columns:
+
+                 # Case - different delimiters used
+                 row["depth"] = str(row["depth"]).replace("-", " - ")
+
+                 # Case - only one value provided for depth (single value will be max and min)
+                 # The value is a string here because of the str() coercion above
+                 if isinstance(row["depth"], str):
+                     values = row["depth"].split(" - ")
+                     # Check if only one value
+                     if len(values) == 1:
+                         df.at[index, "minimum_depth"] = float(values[0])
+                         df.at[index, "maximum_depth"] = float(values[0])
+                     # Check if it's a range
+                     elif len(values) == 2:
+                         df.at[index, "minimum_depth"] = float(values[0])
+                         df.at[index, "maximum_depth"] = float(values[1])
+                 else:
+                     df.at[index, "minimum_depth"] = row["depth"]
+                     df.at[index, "maximum_depth"] = row["depth"]
+
+         if "geo_loc_name" in df.columns:
+             df["country_name"] = df["geo_loc_name"].str.split(":").str[0]
+
+         if "collection_date" in df.columns:
+             df["collection_year"] = df["collection_date"].str.split("-").str[0]
+             df["collection_month"] = df["collection_date"].str.split("-").str[1]
+             df["collection_day"] = df["collection_date"].str.split("-").str[2]
+
+             # Safely map collection_month to month_name (account for NaN values)
+             def get_month_name(month):
+                 try:
+                     return calendar.month_name[int(month)]
+                 except (ValueError, TypeError):
+                     return ""  # return empty string for invalid cases
+
+             df["collection_month_name"] = df["collection_month"].apply(get_month_name)
+
+         # Ensure 'analysis_type' exists in df before modifying it
+         if "analysis_type" in df.columns:
+             df["analysis_type"] = df["analysis_type"].apply(
+                 lambda x: "; ".join(x) if isinstance(x, list) else x
+             )
+
+         # Address 'Was sample DNAse treated?' col
+         # Change from 'yes/no' to 'Y/N'
+         if self.user_facility in ["jgi_mg", "jgi_mt"] and "dnase" in df.columns:
+             df.loc[df["dnase"] == "yes", "dnase"] = "Y"
+             df.loc[df["dnase"] == "no", "dnase"] = "N"
+
+         # Address standardizing "USA" country name for MG and MT
+         # Replace "country_name" with "USA" if it exists
+         usa_names = [
+             "United States",
+             "United States of America",
+             "US",
+             "America",
+             "usa",
+             "united states",
+             "united states of america",
+             "us",
+             "america",
+         ]
+         if self.user_facility in ["jgi_mg", "jgi_mt"] and "country_name" in df.columns:
+             df["country_name"] = df["country_name"].replace(usa_names, "USA")
+
+         return df
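The normalization steps in retrieve_metadata_records derive several columns from the raw submission fields. A worked illustration on one hypothetical record (all values below are made up; the expected outputs follow directly from the code above):

import pandas as pd

# One hypothetical jgi_mg record before normalization.
record = pd.DataFrame(
    {
        "samp_name": ["soil-01"],
        "lat_lon": ["38.8977 -77.0365"],    # -> latitude / longitude
        "depth": ["0.1-0.5"],               # -> minimum_depth / maximum_depth
        "geo_loc_name": ["USA: Maryland"],  # -> country_name (text before ':')
        "collection_date": ["2021-06-15"],  # -> year / month / day / month name
        "dnase": ["yes"],                   # -> 'Y' for jgi_mg / jgi_mt
    }
)
# Per the transformations above, the returned frame would gain:
#   latitude "38.8977", longitude "-77.0365",
#   minimum_depth 0.1, maximum_depth 0.5,
#   country_name "USA", collection_year "2021", collection_month "06",
#   collection_day "15", collection_month_name "June", dnase "Y"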
mutts-1.0.0/src/mutts/spreadsheet.py ADDED
@@ -0,0 +1,125 @@
+ import pandas as pd
+ from typing import Dict, List, Union
+
+
+ class SpreadsheetCreator:
+     """
+     Creates a spreadsheet based on a JSON mapper and metadata DataFrame.
+     """
+
+     def __init__(
+         self,
+         user_facility: str,
+         json_mapper: Dict[str, Dict[str, Union[str, List[str]]]],
+         metadata_df: pd.DataFrame,
+     ) -> None:
+         """
+         Initialize the SpreadsheetCreator.
+
+         :param json_mapper: The JSON mapper specifying column mappings.
+         :param metadata_df: The metadata DataFrame to create the spreadsheet from.
+         """
+         self.user_facility = user_facility
+         self.json_mapper = json_mapper
+         self.metadata_df = metadata_df
+
+     def combine_headers_df(self, header: bool) -> pd.DataFrame:
+         """
+         Combines and formats the headers DataFrame.
+
+         :param header: True if the headers should be included, False otherwise.
+         :return: The combined headers DataFrame.
+         """
+         d: Dict[str, List[Union[str, List[str]]]] = {}
+         for k, v in self.json_mapper.items():
+             header_values: List[Union[str, List[str]]] = [
+                 h for h_n, h in v.items() if h_n != "sub_port_mapping"
+             ]
+             d[k] = header_values
+
+         headers_df: pd.DataFrame = pd.DataFrame(d)
+
+         if header:
+             last_row = headers_df.iloc[-1]
+             column_values: List[str] = list(last_row)
+
+             headers_df = headers_df.drop(headers_df.index[-1])
+             headers_df.loc[len(headers_df)] = headers_df.columns.to_list()
+             headers_df.columns = column_values
+
+             shift = 1
+             headers_df = pd.concat(
+                 [headers_df.iloc[-shift:], headers_df.iloc[:-shift]], ignore_index=True
+             )
+
+         return headers_df
+
+     def combine_sample_rows_df(self) -> pd.DataFrame:
+         """
+         Combines and formats the sample rows DataFrame.
+
+         :return: The combined sample rows DataFrame.
+         """
+         rows_df: pd.DataFrame = pd.DataFrame()
+         for k, v in self.json_mapper.items():
+             if (
+                 "sub_port_mapping" in v
+                 and v["sub_port_mapping"] in self.metadata_df.columns.to_list()
+             ):
+                 if "header" in v:
+                     rows_df[v["header"]] = self.metadata_df[v["sub_port_mapping"]]
+                 else:
+                     rows_df[k] = self.metadata_df[v["sub_port_mapping"]]
+
+         return rows_df
+
+     def combine_headers_and_rows(
+         self, headers_df: pd.DataFrame, rows_df: pd.DataFrame
+     ) -> pd.DataFrame:
+         """
+         Combines the headers and sample rows DataFrames.
+
+         :param headers_df: The headers DataFrame.
+         :param rows_df: The sample rows DataFrame.
+         :return: The combined DataFrame.
+         """
+
+         # Account for the specialized EMSL user facility mapping:
+         if self.user_facility == "emsl":
+
+             # Extract the header mapping keywords and column titles from headers_df
+             # These will be used to map the info in rows_df into the new df
+             mapping_keywords = headers_df.iloc[2].values
+             column_titles = headers_df.columns
+
+             # Go through rows_df data and select cols where the mapping keywords match
+             # (exist in both headers_df and rows_df), and insert NaN for missing data
+             matched_data = {
+                 title: rows_df.get(keyword, pd.Series([None] * len(rows_df)))
+                 for title, keyword in zip(column_titles, mapping_keywords)
+             }
+
+             # Create a new df for the aligned column data
+             matching_rows_df = pd.DataFrame(matched_data)
+
+             # Combine the aligned data with headers_df by keeping the header and
+             # appending the aligned rows_df data
+             combined = pd.concat([headers_df, matching_rows_df], ignore_index=True)
+
+             return combined
+
+         # Otherwise, JGI user facility:
+         else:
+             return pd.concat([headers_df, rows_df], ignore_index=True)
+
+     def create_spreadsheet(self, header: bool) -> pd.DataFrame:
+         """
+         Creates the spreadsheet based on the JSON mapper and metadata DataFrame.
+
+         :param header: True if the headers should be included, False otherwise.
+         :return: The created spreadsheet.
+         """
+         headers_df = self.combine_headers_df(header)
+         rows_df = self.combine_sample_rows_df()
+         spreadsheet = self.combine_headers_and_rows(headers_df, rows_df)
+         return spreadsheet
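The mapper shape this class consumes can be read off combine_headers_df and combine_sample_rows_df: each top-level key names a template column, every entry other than sub_port_mapping contributes a header row, the optional header value becomes the output column title, and sub_port_mapping names the submission-portal column to copy sample values from. A hypothetical two-column mapper as a sketch (keys and values are illustrative, not taken from a real facility template):

example_mapper = {
    "Sample Name": {
        "section": "Required metadata",   # non-sub_port_mapping entries become header rows
        "header": "Sample Name",          # output column title used by combine_sample_rows_df
        "sub_port_mapping": "samp_name",  # metadata_df column to copy values from
    },
    "Collection Date": {
        "section": "Required metadata",
        "header": "Collection Date",
        "sub_port_mapping": "collection_date",
    },
}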