linkml-store 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic. Click here for more details.
- linkml_store/api/collection.py +2 -2
- linkml_store/api/database.py +1 -12
- linkml_store/api/stores/duckdb/duckdb_collection.py +25 -23
- linkml_store/api/stores/duckdb/duckdb_database.py +2 -2
- linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +39 -25
- linkml_store/api/stores/neo4j/neo4j_database.py +1 -1
- linkml_store/api/stores/solr/solr_collection.py +102 -19
- linkml_store/cli.py +1 -2
- linkml_store/index/implementations/llm_indexer.py +0 -1
- linkml_store/index/indexer.py +2 -1
- linkml_store/inference/implementations/llm_inference_engine.py +2 -4
- linkml_store/inference/inference_config.py +1 -1
- linkml_store/inference/inference_engine.py +1 -1
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +172 -0
- linkml_store/plotting/heatmap.py +356 -0
- linkml_store/utils/dat_parser.py +1 -1
- linkml_store/utils/enrichment_analyzer.py +7 -7
- linkml_store/utils/llm_utils.py +1 -1
- linkml_store/utils/object_utils.py +9 -3
- {linkml_store-0.2.9.dist-info → linkml_store-0.2.10.dist-info}/METADATA +1 -1
- {linkml_store-0.2.9.dist-info → linkml_store-0.2.10.dist-info}/RECORD +26 -23
- {linkml_store-0.2.9.dist-info → linkml_store-0.2.10.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.9.dist-info → linkml_store-0.2.10.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.9.dist-info → linkml_store-0.2.10.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for the plotting package.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, Union
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
|
|
11
|
+
from linkml_store.plotting.heatmap import heatmap_from_file, export_heatmap_data
|
|
12
|
+
from linkml_store.utils.format_utils import Format
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@click.group()
def plot_cli() -> None:
    """Plotting utilities for LinkML data."""
    # Container group only: subcommands register themselves on it through
    # the @plot_cli.command() decorator, so there is nothing to execute here.
    # (The docstring above is what click prints as the group's help text.)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _heatmap_records_from_axes(ax, y_column: str) -> Optional[list]:
    """Rebuild the plotted heatmap matrix as a list of row records.

    The matrix is read back from the ``QuadMesh`` artist found among the
    children of *ax*, and labelled with the axes' tick-label texts.

    Args:
        ax: Axes object returned by ``heatmap_from_file``.
        y_column: Name given to the column that holds the row labels.

    Returns:
        One dict per heatmap row (as produced by
        ``DataFrame.to_dict(orient="records")``), or ``None`` when no mesh is
        found or its size does not match the number of tick labels.
    """
    # Imported lazily so that merely importing the CLI module stays cheap.
    import numpy as np
    import pandas as pd
    from matplotlib.collections import QuadMesh

    if not hasattr(ax, "get_children"):
        return None

    mesh = next((child for child in ax.get_children() if isinstance(child, QuadMesh)), None)
    if mesh is None or mesh.get_array() is None:
        return None

    row_labels = [tick.get_text() for tick in ax.get_yticklabels()]
    col_labels = [tick.get_text() for tick in ax.get_xticklabels()]

    # get_array() may be flat or 2-D depending on the matplotlib version, so
    # compare total size rather than len() and reshape explicitly.
    values = np.ma.asarray(mesh.get_array())
    if values.size != len(row_labels) * len(col_labels):
        return None

    if np.ma.is_masked(values):
        # Masked cells are exported as NaN instead of the `masked` constant.
        values = values.astype(float).filled(np.nan)
    else:
        values = np.asarray(values)
    grid = values.reshape(len(row_labels), len(col_labels))

    frame = pd.DataFrame(grid, index=row_labels, columns=col_labels)
    # Skip rows/columns whose tick label is empty, as there is no name to export them under.
    frame = frame.loc[[bool(label) for label in row_labels], [bool(label) for label in col_labels]]

    # Turn the row labels into a regular column named after the y column.
    result = frame.reset_index().rename(columns={"index": y_column})
    return result.to_dict(orient="records")


@plot_cli.command()
@click.argument("input_file", required=False)
@click.option("--x-column", "-x", required=True, help="Column to use for x-axis")
@click.option("--y-column", "-y", required=True, help="Column to use for y-axis")
@click.option("--value-column", "-v", help="Column containing values (if not provided, counts will be used)")
@click.option("--title", "-t", help="Title for the heatmap")
@click.option("--width", "-w", type=int, default=10, show_default=True, help="Width of the figure in inches")
@click.option("--height", "-h", type=int, default=8, show_default=True, help="Height of the figure in inches")
@click.option("--cmap", "-c", default="YlGnBu", show_default=True, help="Colormap to use")
@click.option("--output", "-o", required=True, help="Output file path")
@click.option("--format", "-f", help="Input file format")
@click.option("--dpi", type=int, default=300, show_default=True, help="DPI for output image")
@click.option("--square/--no-square", default=False, show_default=True, help="Make cells square")
@click.option("--annotate/--no-annotate", default=True, show_default=True, help="Annotate cells with values")
@click.option("--font-size", type=int, default=10, show_default=True, help="Font size for annotations and labels")
@click.option("--robust/--no-robust", default=False, show_default=True, help="Use robust quantiles for colormap scaling")
@click.option(
    "--remove-duplicates/--no-remove-duplicates",
    default=True,
    show_default=True,
    help="Remove duplicate x,y combinations (default) or keep all occurrences",
)
@click.option(
    "--cluster",
    type=click.Choice(["none", "both", "x", "y"]),
    default="none",
    show_default=True,
    help="Cluster axes: none (default), both, x-axis only, or y-axis only",
)
@click.option(
    "--cluster-method",
    type=click.Choice(["complete", "average", "single", "ward"]),
    default="complete",
    show_default=True,
    help="Linkage method for hierarchical clustering",
)
@click.option(
    "--cluster-metric",
    type=click.Choice(["euclidean", "correlation", "cosine", "cityblock"]),
    default="euclidean",
    show_default=True,
    help="Distance metric for clustering",
)
@click.option("--export-data", "-e", help="Export the heatmap data to this file")
@click.option(
    "--export-format",
    "-E",
    type=click.Choice([f.value for f in Format]),
    default="csv",
    show_default=True,
    help="Format for exported data",
)
def heatmap(
    input_file: Optional[str],
    x_column: str,
    y_column: str,
    value_column: Optional[str],
    title: Optional[str],
    width: int,
    height: int,
    cmap: str,
    output: str,
    format: Optional[str],
    dpi: int,
    square: bool,
    annotate: bool,
    font_size: int,
    robust: bool,
    remove_duplicates: bool,
    cluster: str,
    cluster_method: str,
    cluster_metric: str,
    export_data: Optional[str],
    export_format: Union[str, Format],
):
    """
    Create a heatmap from a tabular data file.

    Examples:
        # From a file
        linkml-store plot heatmap data.csv -x species -y country -o heatmap.png

        # From stdin
        cat data.csv | linkml-store plot heatmap -x species -y country -o heatmap.png

    This will create a heatmap showing the frequency counts of species by country.
    If you want to use a specific value column instead of counts:

        linkml-store plot heatmap data.csv -x species -y country -v population -o heatmap.png
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # kept user-facing; implementation notes live in comments instead.

    # No positional argument means "read from stdin", which is spelled "-".
    if input_file is None:
        input_file = "-"

    # The CLI spells "no clustering" as the string 'none'; the plotting API expects False.
    use_cluster = False if cluster == "none" else cluster

    # Render the heatmap (this call also writes the image to `output`).
    fig, ax = heatmap_from_file(
        file_path=input_file,
        x_column=x_column,
        y_column=y_column,
        value_column=value_column,
        title=title,
        figsize=(width, height),
        cmap=cmap,
        output_file=output,
        format=format,
        dpi=dpi,
        square=square,
        annot=annotate,
        font_size=font_size,
        robust=robust,
        remove_duplicates=remove_duplicates,
        cluster=use_cluster,
        cluster_method=cluster_method,
        cluster_metric=cluster_metric,
    )

    if export_data:
        # Reuse what was plotted instead of re-reading the input: stdin cannot
        # be read a second time ("I/O operation on closed file").
        from linkml_store.utils.format_utils import write_output

        records = _heatmap_records_from_axes(ax, y_column)
        if records is None:
            click.echo("Warning: Could not export data from the plot")
        else:
            write_output(records, format=export_format, target=export_data)
            click.echo(f"Heatmap data exported to {export_data}")

    click.echo(f"Heatmap created at {output}")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
if __name__ == "__main__":
|
|
172
|
+
plot_cli()
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Heatmap visualization module for LinkML data.
|
|
3
|
+
|
|
4
|
+
This module provides functions to generate heatmaps from pandas DataFrames or tabular data files.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
11
|
+
|
|
12
|
+
import matplotlib.pyplot as plt
|
|
13
|
+
import numpy as np
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import seaborn as sns
|
|
16
|
+
from matplotlib.colors import LinearSegmentedColormap
|
|
17
|
+
from scipy.cluster import hierarchy
|
|
18
|
+
from scipy.spatial import distance
|
|
19
|
+
|
|
20
|
+
from linkml_store.utils.format_utils import Format, load_objects, write_output
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _clustered_leaf_order(matrix: np.ndarray, method: str, metric: str) -> List[int]:
    """Return the row order of *matrix* produced by hierarchical clustering."""
    distances = distance.pdist(matrix, metric=metric)
    linkage = hierarchy.linkage(distances, method=method)
    return hierarchy.dendrogram(linkage, no_plot=True)["leaves"]


def create_heatmap(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    title: Optional[str] = None,
    figsize: Tuple[int, int] = (10, 8),
    cmap: Union[str, LinearSegmentedColormap] = "YlGnBu",
    annot: bool = True,
    fmt: Optional[str] = None,  # Dynamically determined based on data
    linewidths: float = 0.5,
    linecolor: str = "white",
    square: bool = False,
    output_file: Optional[str] = None,
    dpi: int = 300,
    missing_value: Any = np.nan,
    vmin: Optional[float] = None,
    vmax: Optional[float] = None,
    robust: bool = False,
    remove_duplicates: bool = True,
    font_size: int = 10,
    cluster: Union[bool, Literal["both", "x", "y"]] = False,
    cluster_method: str = "complete",  # linkage method: complete, average, single, etc.
    cluster_metric: str = "euclidean",  # distance metric: euclidean, cosine, etc.
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a heatmap from a pandas DataFrame.

    The frame is pivoted into a y-by-x matrix (mean of ``value_column`` per
    cell, or a frequency cross-tabulation when no value column is given),
    optionally reordered by hierarchical clustering, and drawn with seaborn.

    Args:
        data: Input DataFrame containing the data to plot
        x_column: Column to use for x-axis categories
        y_column: Column to use for y-axis categories
        value_column: Column containing values for the heatmap. If None, frequency counts will be used.
        title: Title for the heatmap
        figsize: Minimum figure size as (width, height) in inches; enlarged for large matrices
        cmap: Colormap for the heatmap
        annot: Whether to annotate cells with values
        fmt: String formatting code for annotations (auto-detected if None)
        linewidths: Width of lines between cells
        linecolor: Color of lines between cells
        square: Whether to make cells square
        output_file: File path to save the figure (optional)
        dpi: Resolution for saved figure
        missing_value: Value to use for missing data (defaults to NaN)
        vmin: Minimum value for colormap scaling
        vmax: Maximum value for colormap scaling
        robust: If True, compute colormap limits using robust quantiles instead of min/max
        remove_duplicates: If True, removes duplicate rows before creating the heatmap
        font_size: Font size for annotations
        cluster: Whether and which axes to cluster:
            - False: No clustering (default)
            - True or "both": Cluster both x and y axes
            - "x": Cluster only x-axis
            - "y": Cluster only y-axis
        cluster_method: Linkage method for hierarchical clustering
            (e.g., "single", "complete", "average", "ward")
        cluster_metric: Distance metric for clustering (e.g., "euclidean", "correlation", "cosine")
        **kwargs: Additional keyword arguments to pass to seaborn's heatmap function

    Returns:
        Tuple containing the figure and axes objects

    Raises:
        ValueError: If ``x_column``, ``y_column`` or a given ``value_column``
            is not a column of ``data``.
    """
    # Validate input
    required_columns = {"x_column": x_column, "y_column": y_column}
    if value_column:
        required_columns["value_column"] = value_column
    for role, column in required_columns.items():
        if column not in data.columns:
            raise ValueError(f"{role} '{column}' not found in DataFrame columns: {list(data.columns)}")

    # Remove duplicates by default (assume they're accidents unless user overrides)
    if remove_duplicates:
        data = data.drop_duplicates()

    # Prepare the data
    if value_column:
        # Use the provided value column
        pivot_data = data.pivot_table(
            index=y_column,
            columns=x_column,
            values=value_column,
            aggfunc="mean",
            fill_value=missing_value,
        )
    else:
        # Use frequency counts
        pivot_data = pd.crosstab(data[y_column], data[x_column])

    # Auto-detect format string if not provided
    if fmt is None:
        all_integer = pivot_data.dtypes.apply(pd.api.types.is_integer_dtype).all()
        # Integer format for pure-integer tables, one decimal place otherwise
        fmt = "d" if all_integer else ".1f"

    n_rows, n_cols = len(pivot_data.index), len(pivot_data.columns)

    # Make sure all cells have a reasonable minimum size
    min_height = max(4, 80 / n_rows if n_rows > 0 else 10)
    min_width = max(4, 80 / n_cols if n_cols > 0 else 10)

    # Adjust figure size based on the number of rows and columns
    adjusted_figsize = (
        max(figsize[0], min_width * n_cols / 10),
        max(figsize[1], min_height * n_rows / 10),
    )

    # Create figure and axes
    fig, ax = plt.subplots(figsize=adjusted_figsize)

    # Apply clustering if requested
    if cluster:
        cluster_axes = "both" if cluster is True else cluster

        # Distances cannot be computed over NaN, so cluster on a zero-filled
        # copy; the plotted table keeps its missing values.
        clustering_values = pivot_data.fillna(0).values

        # Clustering is best-effort: a failure is logged and the axis keeps
        # its original order rather than aborting the plot.
        if cluster_axes in ["both", "y"]:
            try:
                pivot_data = pivot_data.iloc[_clustered_leaf_order(clustering_values, cluster_method, cluster_metric)]
                logger.info(
                    "Applied clustering to rows using %s linkage and %s metric", cluster_method, cluster_metric
                )
            except Exception as e:
                logger.warning("Failed to cluster rows: %s", e)

        if cluster_axes in ["both", "x"]:
            try:
                # Transpose so that columns are the observations being clustered.
                col_order = _clustered_leaf_order(clustering_values.T, cluster_method, cluster_metric)
                pivot_data = pivot_data.iloc[:, col_order]
                logger.info(
                    "Applied clustering to columns using %s linkage and %s metric", cluster_method, cluster_metric
                )
            except Exception as e:
                logger.warning("Failed to cluster columns: %s", e)

    # Create the heatmap
    sns.heatmap(
        pivot_data,
        cmap=cmap,
        annot=annot,
        fmt=fmt,
        linewidths=linewidths,
        linecolor=linecolor,
        square=square,
        vmin=vmin,
        vmax=vmax,
        robust=robust,
        ax=ax,
        annot_kws={"fontsize": font_size},
        **kwargs,
    )

    # Set title if provided
    if title:
        ax.set_title(title, fontsize=font_size + 4)

    # Style the tick labels of *this* axes explicitly rather than relying on
    # pyplot's notion of the current axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=font_size)
    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=font_size)

    # The cell borders already delimit the table; keep the axes grid off.
    ax.grid(False)

    # Draw a visible black frame around the heatmap
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color("black")
        spine.set_linewidth(1)

    # Adjust layout
    fig.tight_layout()

    # Save the figure if output file is specified
    if output_file:
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_file, dpi=dpi, bbox_inches="tight")
        logger.info("Heatmap saved to %s", output_file)

    return fig, ax
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def heatmap_from_file(
    file_path: str,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    format: Optional[Union[Format, str]] = None,
    compression: Optional[str] = None,
    output_file: Optional[str] = None,
    remove_duplicates: bool = True,
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a heatmap from a file (CSV, TSV, etc.).

    Uncompressed delimited text is read directly with pandas; everything else
    is loaded through ``load_objects``.

    Args:
        file_path: Path to the input file or "-" for stdin
        x_column: Column to use for x-axis categories
        y_column: Column to use for y-axis categories
        value_column: Column containing values for the heatmap. If None, frequency counts will be used.
        format: Format of the input file. If None, stdin is read as CSV and
            regular files are classified by their suffix (unknown suffixes
            are read as CSV).
        compression: Compression format ('gz' or 'tgz')
        output_file: File path to save the figure (optional)
        remove_duplicates: If True, removes duplicate rows before creating the heatmap
        **kwargs: Additional arguments to pass to create_heatmap

    Returns:
        Tuple containing the figure and axes objects

    Raises:
        click.UsageError: If stdin was already closed when a non-delimited
            format is read from it.
    """
    import sys

    import click

    # Accept both Format enum members and plain strings, case-insensitively.
    format_name = str(getattr(format, "value", format)).lower() if format else None

    if file_path == "-":
        if format_name in (None, "csv", "tsv"):
            # Read delimited text straight from stdin (CSV when no format is given).
            delimiter = "\t" if format_name == "tsv" else ","
            df = pd.read_csv(sys.stdin, delimiter=delimiter)
        else:
            try:
                objs = load_objects(file_path, format=format, compression=compression)
            except ValueError as e:
                if "I/O operation on closed file" in str(e):
                    logger.warning("Could not read from stdin. It may have been consumed already.")
                    raise click.UsageError(
                        "Error reading from stdin. Please provide a file path or ensure stdin has data."
                    ) from e
                raise
            df = pd.DataFrame(objs)
    else:
        if format_name is None and not compression:
            # No explicit format: classify by suffix so that a .tsv file gets
            # a tab separator and structured files are not parsed as CSV.
            suffix = Path(file_path).suffix.lower()
            if suffix == ".tsv":
                format_name = "tsv"
            elif suffix not in (".json", ".jsonl", ".yaml", ".yml"):
                format_name = "csv"  # unknown suffixes keep the historical CSV default
            # NOTE(review): for the structured suffixes this relies on
            # load_objects detecting the format itself when format is None -- confirm.

        if format_name in ("csv", "tsv") and not compression:
            df = pd.read_csv(file_path, sep="\t" if format_name == "tsv" else ",")
        else:
            objs = load_objects(file_path, format=format, compression=compression)
            df = pd.DataFrame(objs)

    # Create the heatmap
    return create_heatmap(
        data=df,
        x_column=x_column,
        y_column=y_column,
        value_column=value_column,
        output_file=output_file,
        remove_duplicates=remove_duplicates,
        **kwargs,
    )
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def export_heatmap_data(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    output_file: Optional[str] = None,
    format: Union[Format, str] = Format.CSV,
    missing_value: Any = np.nan,
    remove_duplicates: bool = True,
) -> pd.DataFrame:
    """
    Build the heatmap's pivot table and optionally write it to a file.

    Args:
        data: Input DataFrame containing the data
        x_column: Column whose values become the table's columns
        y_column: Column whose values become the table's rows
        value_column: Column averaged into each cell. If None, each cell holds
            a frequency count instead.
        output_file: File path to save the data (optional)
        format: Output format for the file
        missing_value: Fill value for cells with no data (value-column mode)
        remove_duplicates: If True, only the first row of each
            (x_column, y_column) combination is kept before pivoting

    Returns:
        The pivot table with the y_column turned back into a regular column
    """
    # De-duplication is keyed on the (x, y) pair only, not on whole rows.
    frame = data.drop_duplicates(subset=[x_column, y_column]) if remove_duplicates else data

    if not value_column:
        # Frequency counts of each y/x combination
        table = pd.crosstab(frame[y_column], frame[x_column])
    else:
        # Mean of the value column per y/x combination
        table = frame.pivot_table(
            index=y_column,
            columns=x_column,
            values=value_column,
            aggfunc="mean",
            fill_value=missing_value,
        )

    # Move the y labels out of the index so they are exported as a column.
    flattened = table.reset_index()

    if not output_file:
        return flattened

    # write_output is handed a list of row dicts
    write_output(flattened.to_dict(orient="records"), format=format, target=output_file)
    logger.info(f"Heatmap data saved to {output_file}")
    return flattened
|
linkml_store/utils/dat_parser.py
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
import numpy as np
|
|
3
|
-
from linkml_store.api import Collection
|
|
4
|
-
from scipy import stats
|
|
1
|
+
from collections import Counter
|
|
5
2
|
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
6
|
from pydantic import BaseModel
|
|
7
|
+
from scipy import stats
|
|
8
|
+
|
|
9
|
+
from linkml_store.api import Collection
|
|
7
10
|
|
|
8
11
|
|
|
9
12
|
class EnrichedCategory(BaseModel):
|
|
@@ -17,9 +20,6 @@ class EnrichedCategory(BaseModel):
|
|
|
17
20
|
adjusted_p_value: float
|
|
18
21
|
|
|
19
22
|
|
|
20
|
-
from collections import Counter, defaultdict
|
|
21
|
-
|
|
22
|
-
|
|
23
23
|
class EnrichmentAnalyzer:
|
|
24
24
|
def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
|
|
25
25
|
"""
|
linkml_store/utils/llm_utils.py
CHANGED
|
@@ -83,15 +83,21 @@ def object_path_get(obj: Union[BaseModel, Dict[str, Any]], path: str, default_va
|
|
|
83
83
|
'NA'
|
|
84
84
|
"""
|
|
85
85
|
if isinstance(obj, BaseModel):
|
|
86
|
-
obj = obj.
|
|
86
|
+
obj = obj.model_dump()
|
|
87
87
|
parts = path.split(".")
|
|
88
88
|
for part in parts:
|
|
89
89
|
if "[" in part:
|
|
90
90
|
key, index = part[:-1].split("[")
|
|
91
91
|
index = int(index)
|
|
92
|
-
obj
|
|
92
|
+
if key in obj and obj[key] is not None:
|
|
93
|
+
obj = obj[key][index]
|
|
94
|
+
else:
|
|
95
|
+
return default_value
|
|
93
96
|
else:
|
|
94
|
-
|
|
97
|
+
if isinstance(obj, list):
|
|
98
|
+
obj = [v1.get(part, default_value) for v1 in obj]
|
|
99
|
+
else:
|
|
100
|
+
obj = obj.get(part, default_value)
|
|
95
101
|
return obj
|
|
96
102
|
|
|
97
103
|
|