linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/plotting/heatmap.py
@@ -0,0 +1,383 @@

"""
Heatmap visualization module for LinkML data.

This module provides functions to generate heatmaps from pandas DataFrames or tabular data files.
"""

import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from scipy.cluster import hierarchy
from scipy.spatial import distance

from linkml_store.utils.format_utils import Format, load_objects, write_output

logger = logging.getLogger(__name__)


def create_heatmap(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    minimum_value: Optional[float] = None,
    title: Optional[str] = None,
    figsize: Tuple[int, int] = (10, 8),
    cmap: Union[str, LinearSegmentedColormap] = "YlGnBu",
    annot: bool = True,
    fmt: Optional[str] = None,  # Dynamically determined based on data
    linewidths: float = 0.5,
    linecolor: str = "white",
    square: bool = False,
    output_file: Optional[str] = None,
    dpi: int = 300,
    missing_value: Any = np.nan,
    vmin: Optional[float] = None,
    vmax: Optional[float] = None,
    robust: bool = False,
    remove_duplicates: bool = False,
    font_size: int = 10,
    cluster: Union[bool, Literal["both", "x", "y"]] = False,
    cluster_method: str = "complete",  # linkage method: complete, average, single, etc.
    cluster_metric: str = "euclidean",  # distance metric: euclidean, cosine, etc.
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a heatmap from a pandas DataFrame.

    Args:
        data: Input DataFrame containing the data to plot
        x_column: Column to use for x-axis categories
        y_column: Column to use for y-axis categories
        value_column: Column containing values for the heatmap. If None, frequency counts will be used.
        minimum_value: Minimum value to include in the heatmap
        title: Title for the heatmap
        figsize: Figure size as (width, height) in inches
        cmap: Colormap for the heatmap
        annot: Whether to annotate cells with values
        fmt: String formatting code for annotations (auto-detected if None)
        linewidths: Width of lines between cells
        linecolor: Color of lines between cells
        square: Whether to make cells square
        output_file: File path to save the figure (optional)
        dpi: Resolution for saved figure
        missing_value: Value to use for missing data (defaults to NaN)
        vmin: Minimum value for colormap scaling
        vmax: Maximum value for colormap scaling
        robust: If True, compute colormap limits using robust quantiles instead of min/max
        remove_duplicates: If True, removes duplicate rows before creating the heatmap
        font_size: Font size for annotations
        cluster: Whether and which axes to cluster:
            - False: No clustering (default)
            - True or "both": Cluster both x and y axes
            - "x": Cluster only x-axis
            - "y": Cluster only y-axis
        cluster_method: Linkage method for hierarchical clustering
            (e.g., "single", "complete", "average", "ward")
        cluster_metric: Distance metric for clustering (e.g., "euclidean", "correlation", "cosine")
        **kwargs: Additional keyword arguments to pass to seaborn's heatmap function

    Returns:
        Tuple containing the figure and axes objects
    """
    # Validate input
    if x_column not in data.columns:
        raise ValueError(f"x_column '{x_column}' not found in DataFrame columns: {list(data.columns)}")
    if y_column not in data.columns:
        raise ValueError(f"y_column '{y_column}' not found in DataFrame columns: {list(data.columns)}")
    if value_column and value_column not in data.columns:
        raise ValueError(f"value_column '{value_column}' not found in DataFrame columns: {list(data.columns)}")

    cols = [x_column, y_column]
    if value_column:
        cols.append(value_column)

    # Select only the needed columns from the data
    data = data[cols]

    if any(isinstance(val, (list, set, tuple)) for val in data[x_column].dropna().head(100)):
        logger.info(f"Exploding list values in x_column '{x_column}'")
        data = data.explode(x_column).dropna(subset=[x_column])

    if any(isinstance(val, (list, set, tuple)) for val in data[y_column].dropna().head(100)):
        logger.info(f"Exploding list values in y_column '{y_column}'")
        data = data.explode(y_column).dropna(subset=[y_column])

    if value_column:
        if any(isinstance(val, (list, set, tuple)) for val in data[value_column].dropna().head(100)):
            logger.info(f"Exploding list values in value_column '{value_column}'")
            data = data.explode(value_column).dropna(subset=[value_column])

    # Remove duplicate rows if requested (off by default for this function)
    if remove_duplicates:
        data = data.drop_duplicates()

    if value_column and minimum_value is not None:
        data = data[data[value_column] >= minimum_value]

    # Prepare the data
    if value_column:
        # Use the provided value column, averaging duplicates per cell
        pivot_data = data.pivot_table(
            index=y_column,
            columns=x_column,
            values=value_column,
            aggfunc='mean',
            fill_value=missing_value
        )
    else:
        # Use frequency counts
        cross_tab = pd.crosstab(data[y_column], data[x_column])
        pivot_data = cross_tab

    # Auto-detect format string if not provided
    if fmt is None:
        # Check if the pivot table contains integers only
        if pivot_data.dtypes.apply(lambda x: pd.api.types.is_integer_dtype(x)).all():
            fmt = 'd'  # Integer format
        else:
            fmt = '.1f'  # One decimal place for floats

    # Make sure all cells have a reasonable minimum size
    min_height = max(4, 80 / len(pivot_data.index) if len(pivot_data.index) > 0 else 10)
    min_width = max(4, 80 / len(pivot_data.columns) if len(pivot_data.columns) > 0 else 10)

    # Adjust figure size based on the number of rows and columns
    adjusted_height = max(figsize[1], min_height * len(pivot_data.index) / 10)
    adjusted_width = max(figsize[0], min_width * len(pivot_data.columns) / 10)
    adjusted_figsize = (adjusted_width, adjusted_height)

    # Create figure and axes
    fig, ax = plt.subplots(figsize=adjusted_figsize)

    # Apply clustering if requested
    row_linkage = None
    col_linkage = None

    if cluster:
        cluster_axes = cluster
        if cluster_axes is True:
            cluster_axes = "both"

        # Fill NAs for clustering
        pivot_data_for_clustering = pivot_data.fillna(0)

        # Cluster rows (y-axis)
        if cluster_axes in ["both", "y"]:
            try:
                # Calculate distance matrix and linkage for rows
                row_distances = distance.pdist(pivot_data_for_clustering.values, metric=cluster_metric)
                row_linkage = hierarchy.linkage(row_distances, method=cluster_method)

                # Reorder rows based on clustering
                row_dendrogram = hierarchy.dendrogram(row_linkage, no_plot=True)
                row_order = row_dendrogram['leaves']
                pivot_data = pivot_data.iloc[row_order]

                logger.info(f"Applied clustering to rows using {cluster_method} linkage and {cluster_metric} metric")
            except Exception as e:
                logger.warning(f"Failed to cluster rows: {e}")

        # Cluster columns (x-axis)
        if cluster_axes in ["both", "x"]:
            try:
                # Calculate distance matrix and linkage for columns
                col_distances = distance.pdist(pivot_data_for_clustering.values.T, metric=cluster_metric)
                col_linkage = hierarchy.linkage(col_distances, method=cluster_method)

                # Reorder columns based on clustering
                col_dendrogram = hierarchy.dendrogram(col_linkage, no_plot=True)
                col_order = col_dendrogram['leaves']
                pivot_data = pivot_data.iloc[:, col_order]

                logger.info(f"Applied clustering to columns using {cluster_method} linkage and {cluster_metric} metric")
            except Exception as e:
                logger.warning(f"Failed to cluster columns: {e}")

    # Create the heatmap
    sns.heatmap(
        pivot_data,
        cmap=cmap,
        annot=annot,
        fmt=fmt,
        linewidths=linewidths,
        linecolor=linecolor,
        square=square,
        vmin=vmin,
        vmax=vmax,
        robust=robust,
        ax=ax,
        annot_kws={'fontsize': font_size},
        **kwargs
    )

    # Set title if provided
    if title:
        ax.set_title(title, fontsize=font_size + 4)

    # Improve display of tick labels
    plt.xticks(rotation=45, ha="right", fontsize=font_size)
    plt.yticks(rotation=0, fontsize=font_size)

    # Disable the axes grid; cell borders are already drawn via linewidths/linecolor
    ax.grid(False)

    # Improve contrast for better readability
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_color('black')
        spine.set_linewidth(1)

    # Adjust layout
    plt.tight_layout()

    # Save the figure if an output file is specified
    if output_file:
        output_path = Path(output_file)
        output_dir = output_path.parent
        if not output_dir.exists():
            output_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_file, dpi=dpi, bbox_inches="tight")
        logger.info(f"Heatmap saved to {output_file}")

    return fig, ax


def heatmap_from_file(
    file_path: str,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    format: Optional[Union[Format, str]] = None,
    compression: Optional[str] = None,
    output_file: Optional[str] = None,
    remove_duplicates: bool = True,
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """
    Create a heatmap from a file (CSV, TSV, etc.).

    Args:
        file_path: Path to the input file or "-" for stdin
        x_column: Column to use for x-axis categories
        y_column: Column to use for y-axis categories
        value_column: Column containing values for the heatmap. If None, frequency counts will be used.
        format: Format of the input file (auto-detected if None)
        compression: Compression format ('gz' or 'tgz')
        output_file: File path to save the figure (optional)
        remove_duplicates: If True, removes duplicate rows before creating the heatmap
        **kwargs: Additional arguments to pass to create_heatmap

    Returns:
        Tuple containing the figure and axes objects
    """
    # Handle stdin input safely
    import sys

    import click

    # Load the data
    if file_path == "-":
        # Read directly from stdin, since format_utils uses sys.stdin, which may already be consumed
        if not format or str(format).lower() in ['csv', 'tsv']:
            # Default to CSV if no format is specified
            delimiter = ',' if not format or str(format).lower() == 'csv' else '\t'
            df = pd.read_csv(sys.stdin, delimiter=delimiter)
        else:
            # Try to use format_utils, but with a backup plan
            try:
                objs = load_objects(file_path, format=format, compression=compression)
                df = pd.DataFrame(objs)
            except ValueError as e:
                if "I/O operation on closed file" in str(e):
                    logger.warning("Could not read from stdin. It may have been consumed already.")
                    raise click.UsageError("Error reading from stdin. Please provide a file path or ensure stdin has data.")
                else:
                    raise
    else:
        # For regular files, use format_utils as normal
        if (not format or format in ["csv", "tsv"]) and not compression:
            # Use a tab delimiter for TSV; otherwise default to comma
            df = pd.read_csv(file_path, sep='\t' if str(format).lower() == 'tsv' else ',')
        else:
            objs = load_objects(file_path, format=format, compression=compression)
            df = pd.DataFrame(objs)

    # Create the heatmap
    return create_heatmap(
        data=df,
        x_column=x_column,
        y_column=y_column,
        value_column=value_column,
        output_file=output_file,
        remove_duplicates=remove_duplicates,
        **kwargs
    )


def export_heatmap_data(
    data: pd.DataFrame,
    x_column: str,
    y_column: str,
    value_column: Optional[str] = None,
    output_file: Optional[str] = None,
    format: Union[Format, str] = Format.CSV,
    missing_value: Any = np.nan,
    remove_duplicates: bool = True,
) -> pd.DataFrame:
    """
    Export heatmap data to a file or return it as a DataFrame.

    Args:
        data: Input DataFrame containing the data
        x_column: Column to use for x-axis categories
        y_column: Column to use for y-axis categories
        value_column: Column containing values for the heatmap. If None, frequency counts will be used.
        output_file: File path to save the data (optional)
        format: Output format for the file
        missing_value: Value to use for missing data
        remove_duplicates: If True, removes duplicate rows before creating the pivot table

    Returns:
        DataFrame containing the pivot table data
    """
    # Remove duplicates by default (assume they are accidental unless the user overrides)
    if remove_duplicates:
        # Keep the first occurrence of each (x_column, y_column) combination
        data = data.drop_duplicates(subset=[x_column, y_column])

    # Prepare the data
    if value_column:
        # Use the provided value column
        pivot_data = data.pivot_table(
            index=y_column,
            columns=x_column,
            values=value_column,
            aggfunc='mean',
            fill_value=missing_value
        )
    else:
        # Use frequency counts
        cross_tab = pd.crosstab(data[y_column], data[x_column])
        pivot_data = cross_tab

    # Reset the index to make the y_column a regular column
    result_df = pivot_data.reset_index()

    # Write to file if output_file is provided
    if output_file:
        # Convert to records format for writing
        records = result_df.to_dict(orient='records')
        write_output(records, format=format, target=output_file)
        logger.info(f"Heatmap data saved to {output_file}")

    return result_df
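
For orientation, here is a minimal usage sketch of this module. The DataFrame contents and file name are invented for illustration; clustering reorders rows and columns by dendrogram leaf order, as in the code above.

import pandas as pd
from linkml_store.plotting.heatmap import create_heatmap, export_heatmap_data

# Hypothetical long-form data: one row per (gene, tissue) observation
df = pd.DataFrame({
    "tissue": ["liver", "liver", "brain", "brain"],
    "gene": ["TP53", "BRCA1", "TP53", "BRCA1"],
    "expression": [5.2, 1.1, 3.4, 0.7],
})

# Mean expression per cell, hierarchically clustered on both axes, saved as a PNG
fig, ax = create_heatmap(
    df,
    x_column="tissue",
    y_column="gene",
    value_column="expression",
    cluster="both",
    output_file="expression_heatmap.png",
)

# The same pivot table as a DataFrame instead of a figure
pivot_df = export_heatmap_data(df, x_column="tissue", y_column="gene", value_column="expression")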
linkml_store/utils/__init__.py
File without changes
linkml_store/utils/change_utils.py
@@ -0,0 +1,17 @@

from typing import List

from linkml_store.api.collection import OBJECT


def insert_operation_to_patches(objs: List[OBJECT], **kwargs):
    """
    Translate a list of objects to a list of JSON Patch operations for insertion.

    Note: inserts are always treated as being at the start of a list.

    :param objs: objects to insert
    :param kwargs: additional arguments
    :return: list of JSON Patch ``add`` operations
    """
    patches = []
    for obj in objs:
        patches.append({"op": "add", "path": "/0", "value": obj})
    return patches
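
A small usage sketch: each inserted object becomes a JSON Patch ``add`` operation targeting index 0 of the target list, so applying the patches in order leaves the last inserted object at the front.

from linkml_store.utils.change_utils import insert_operation_to_patches

patches = insert_operation_to_patches([{"id": "X:1"}, {"id": "X:2"}])
# -> [{'op': 'add', 'path': '/0', 'value': {'id': 'X:1'}},
#     {'op': 'add', 'path': '/0', 'value': {'id': 'X:2'}}]
# Applying both to an empty list yields [{'id': 'X:2'}, {'id': 'X:1'}]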
linkml_store/utils/dat_parser.py
@@ -0,0 +1,95 @@

from typing import Any, Dict, List, Optional, Tuple

ENTRY = Dict[str, Any]


def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
    """
    Parse SIB/Swiss-Prot format data into structured dictionaries.

    Args:
        text (str): The text in SIB/Swiss-Prot format

    Returns:
        A (header, entries) tuple: the optional header block (an entry without an ID line),
        plus a list of parsed entries keyed by their two-letter field codes
    """
    # Split the text into entries (separated by //)
    entries = text.split("//\n")
    header = None

    # Initialize the results list
    results = []

    # Parse each entry
    for entry in entries:
        if not entry.strip():
            continue

        # Initialize dictionary for the current entry
        current_entry = {}
        current_code = None

        # Process each line
        for line in entry.strip().split("\n"):
            if not line.strip():
                continue

            # Check if this is a new field (starts with a 2-letter code followed by a space)
            if len(line) > 2 and line[2] == " ":
                current_code = line[0:2]
                # Remove the code and the following space(s)
                value = line[3:].strip()

                # Initialize as a list if needed for multi-line fields
                if current_code not in current_entry:
                    current_entry[current_code] = []

                current_entry[current_code].append(value)

            # Continuation of the previous field
            elif current_code is not None:
                # Handle continuation lines (typically indented)
                if current_code == "CC":
                    # For comments, preserve the indentation
                    current_entry[current_code].append(line)
                else:
                    # For other fields, strip and append
                    current_entry[current_code].append(line.strip())

        # Combine multiline comments; e.g.
        # -!- ...
        #     ...
        # -!- ...
        ccs = current_entry.get("CC", [])
        new_ccs = []
        for cc in ccs:
            if not cc.startswith("-!-") and new_ccs:
                new_ccs[-1] += " " + cc
            else:
                new_ccs.append(cc)
        current_entry["CC"] = new_ccs
        for k, vs in current_entry.items():
            if k != "CC":
                combined = "".join(vs)
                combined = combined.strip()
                if combined.endswith("."):
                    combined = combined.split(".")
                    combined = [c.strip() for c in combined if c.strip()]
                    if k == "DE":
                        combined = combined[0]
                current_entry[k] = combined

        if "ID" in current_entry:
            results.append(current_entry)
        else:
            header = current_entry

    return header, results


# Example usage:
# header, entries = parse_sib_format(text)
# for entry in entries:
#     print(f"Entry: {entry.get('ID')}")
#     for code, values in entry.items():
#         print(f"  {code}: {values}")
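
A brief sketch of the parser on a made-up, ENZYME-style record (the entry content is invented; only the two-letter field codes follow the real format):

from linkml_store.utils.dat_parser import parse_sib_format

text = (
    "ID   1.1.1.1\n"
    "DE   Alcohol dehydrogenase.\n"
    "CC   -!- Acts on primary or secondary alcohols.\n"
    "//\n"
)
header, entries = parse_sib_format(text)
assert header is None          # this snippet has no leading header block
print(entries[0]["ID"])        # '1.1.1.1'
print(entries[0]["DE"])        # 'Alcohol dehydrogenase'
print(entries[0]["CC"])        # ['-!- Acts on primary or secondary alcohols.']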