linkml-store 0.2.6__py3-none-any.whl → 0.2.10rc1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of linkml-store has been flagged; see the registry listing for details.

Files changed (35)
  1. linkml_store/api/client.py +2 -3
  2. linkml_store/api/collection.py +63 -8
  3. linkml_store/api/database.py +20 -3
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +168 -4
  5. linkml_store/api/stores/duckdb/duckdb_database.py +5 -5
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  8. linkml_store/api/stores/mongodb/mongodb_collection.py +132 -15
  9. linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  10. linkml_store/api/stores/neo4j/neo4j_database.py +1 -1
  11. linkml_store/api/stores/solr/solr_collection.py +107 -18
  12. linkml_store/cli.py +201 -21
  13. linkml_store/index/implementations/llm_indexer.py +13 -6
  14. linkml_store/index/indexer.py +9 -5
  15. linkml_store/inference/implementations/llm_inference_engine.py +15 -13
  16. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  17. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  18. linkml_store/inference/inference_config.py +2 -1
  19. linkml_store/inference/inference_engine.py +1 -1
  20. linkml_store/plotting/__init__.py +5 -0
  21. linkml_store/plotting/cli.py +172 -0
  22. linkml_store/plotting/heatmap.py +356 -0
  23. linkml_store/utils/dat_parser.py +95 -0
  24. linkml_store/utils/enrichment_analyzer.py +217 -0
  25. linkml_store/utils/format_utils.py +124 -3
  26. linkml_store/utils/llm_utils.py +4 -2
  27. linkml_store/utils/object_utils.py +9 -3
  28. linkml_store/utils/pandas_utils.py +1 -1
  29. linkml_store/utils/sql_utils.py +1 -1
  30. linkml_store/utils/vector_utils.py +3 -10
  31. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/METADATA +3 -1
  32. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/RECORD +35 -30
  33. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/WHEEL +1 -1
  34. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/LICENSE +0 -0
  35. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10rc1.dist-info}/entry_points.txt +0 -0
linkml_store/inference/implementations/llm_inference_engine.py
@@ -1,18 +1,16 @@
- import json
  import logging
  from dataclasses import dataclass
  from pathlib import Path
  from typing import ClassVar, List, Optional, TextIO, Union

  import yaml
- from linkml_store.utils.llm_utils import parse_yaml_payload
  from llm import get_key
  from pydantic import BaseModel

- from linkml_store.api.collection import OBJECT, Collection
+ from linkml_store.api.collection import OBJECT
  from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
  from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
- from linkml_store.utils.object_utils import select_nested
+ from linkml_store.utils.llm_utils import parse_yaml_payload

  logger = logging.getLogger(__name__)

@@ -79,21 +77,24 @@ class LLMInferenceEngine(InferenceEngine):
  def _schema_str(self) -> str:
  db = self.training_data.base_collection.parent
  from linkml_runtime.dumpers import json_dumper
+
  schema_dict = json_dumper.to_dict(db.schema_view.schema)
  return yaml.dump(schema_dict)

- def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+ def derive(
+ self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+ ) -> Optional[LLMInference]:
  import llm

  model: llm.Model = self.model
- #model_name = self.config.llm_config.model_name
- #feature_attributes = self.config.feature_attributes
+ # model_name = self.config.llm_config.model_name
+ # feature_attributes = self.config.feature_attributes
  target_attributes = self.config.target_attributes
  query_text = self.object_to_text(object)

  if not target_attributes:
  target_attributes = [k for k, v in object.items() if v is None or v == ""]
- #if not feature_attributes:
+ # if not feature_attributes:
  # feature_attributes = [k for k, v in object.items() if v is not None and v != ""]

  system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
@@ -107,7 +108,9 @@ class LLMInferenceEngine(InferenceEngine):
  "```yaml\n"
  f"{stub}\n"
  "```\n"
- "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+ "---\nQuery:\n"
+ f"## INCOMPLETE OBJECT:\n{query_text}\n"
+ "## OUTPUT:\n"
  )
  logger.info(f"Prompt: {prompt}")
  response = model.prompt(prompt, system=system_prompt)
@@ -130,9 +133,8 @@ class LLMInferenceEngine(InferenceEngine):
  "\nThis was invalid.\n",
  "Validation errors:\n",
  ] + [self.object_to_text(e) for e in errs]
- return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
- return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
-
+ return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+ return LLMInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)

  def export_model(
  self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
@@ -149,4 +151,4 @@ class LLMInferenceEngine(InferenceEngine):

  @classmethod
  def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
- raise NotImplementedError("Does not make sense for this engine")
+ raise NotImplementedError("Does not make sense for this engine")
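The derive signature above is only reflowed, not changed. A rough usage sketch of how it is called and what comes back (the engine setup is assumed and not shown in this diff; only the derive call and the LLMInference fields come from the code above):

# engine: an already-configured LLMInferenceEngine (construction not shown in this diff)
incomplete = {"genus": "Felis", "species": None}   # keys with None/"" values become the target attributes
inference = engine.derive(incomplete)
if inference is not None:                          # derive returns Optional[LLMInference]
    print(inference.predicted_object)              # object parsed from the model's YAML reply
    print(inference.iterations)                    # attempts made, counting validation-driven retries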
linkml_store/inference/implementations/rag_inference_engine.py
@@ -111,7 +111,9 @@ class RAGInferenceEngine(InferenceEngine):
  def object_to_text(self, object: OBJECT) -> str:
  return yaml.dump(object)

- def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[RAGInference]:
+ def derive(
+ self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None
+ ) -> Optional[RAGInference]:
  import llm
  from tiktoken import encoding_for_model

@@ -131,8 +133,9 @@ class RAGInferenceEngine(InferenceEngine):
  if not self.rag_collection.indexers:
  raise ValueError("RAG collection must have an indexer attached")
  logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
- rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
- mmr_relevance_factor=mmr_relevance_factor)
+ rs = self.rag_collection.search(
+ query_text, limit=num_examples, index_name="llm", mmr_relevance_factor=mmr_relevance_factor
+ )
  examples = rs.rows
  logger.info(f"Found {len(examples)} examples")
  if not examples:
@@ -153,11 +156,11 @@ class RAGInferenceEngine(InferenceEngine):
  input_obj_text = self.object_to_text(input_obj)
  if input_obj_text == query_text:
  continue
- #raise ValueError(
+ # raise ValueError(
  # f"Query object {query_text} is the same as example object {input_obj_text}\n"
  # "This indicates possible test data leakage\n."
  # "TODO: allow an option that allows user to treat this as a basic lookup\n"
- #)
+ # )
  output_obj = select_nested(example, target_attributes)
  prompt_clause = (
  "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -176,9 +179,9 @@ class RAGInferenceEngine(InferenceEngine):
  except KeyError:
  encoding = encoding_for_model("gpt-4")
  token_limit = get_token_limit(model_name)
- prompt = render_formatted_text(make_text, values=prompt_clauses,
- encoding=encoding, token_limit=token_limit,
- additional_text=system_prompt)
+ prompt = render_formatted_text(
+ make_text, values=prompt_clauses, encoding=encoding, token_limit=token_limit, additional_text=system_prompt
+ )
  logger.info(f"Prompt: {prompt}")
  response = model.prompt(prompt, system=system_prompt)
  yaml_str = response.text()
@@ -199,8 +202,8 @@ class RAGInferenceEngine(InferenceEngine):
  "\nThis was invalid.\n",
  "Validation errors:\n",
  ] + [self.object_to_text(e) for e in errs]
- return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
- return RAGInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+ return self.derive(object, iteration=iteration + 1, additional_prompt_texts=extra_texts)
+ return RAGInference(predicted_object=predicted_object, iterations=iteration + 1, query=object)

  def _parse_yaml_payload(self, yaml_str: str, strict=False) -> Optional[OBJECT]:
  if "```" in yaml_str:
linkml_store/inference/implementations/sklearn_inference_engine.py
@@ -94,6 +94,8 @@ class SklearnInferenceEngine(InferenceEngine):
  if not feature_cols:
  feature_cols = df.columns.difference(target_cols).tolist()
  self.config.feature_attributes = feature_cols
+ if not feature_cols:
+ raise ValueError("No features found in the data")
  target_col = target_cols[0]
  logger.info(f"Feature columns: {feature_cols}")
  X = df[feature_cols].copy()
@@ -102,6 +104,8 @@ class SklearnInferenceEngine(InferenceEngine):

  # find list of features to skip (categorical with > N categories)
  skip_features = []
+ if not len(X.columns):
+ raise ValueError("No features to train on")
  for col in X.columns:
  unique_values = self._get_unique_values(X[col])
  if len(unique_values) > self.maximum_proportion_distinct_features * len(X[col]):
@@ -115,6 +119,8 @@ class SklearnInferenceEngine(InferenceEngine):

  # Encode features
  encoded_features = []
+ if not len(X.columns):
+ raise ValueError(f"No features to train on from after skipping {skip_features}")
  for col in X.columns:
  logger.info(f"Checking whether to encode: {col}")
  col_encoder = self._get_encoder(X[col])
@@ -153,7 +159,7 @@ class SklearnInferenceEngine(InferenceEngine):
  y = y_encoder.fit_transform(y.values.ravel()) # Convert to 1D numpy array
  self.transformed_targets = y_encoder.classes_

- # print(f"Fitting model with features: {X.columns}")
+ # print(f"Fitting model with features: {X.columns}, y={y}, X={X}")
  clf = DecisionTreeClassifier(random_state=42)
  clf.fit(X, y)
  self.classifier = clf
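The three new guards fail fast with a clear message instead of letting an empty feature matrix reach DecisionTreeClassifier.fit. A small illustration of the first condition (the toy DataFrame is an assumption; the column-difference expression is the one used in the hunk):

import pandas as pd

# A frame that only contains the target column, so there is nothing to train on.
df = pd.DataFrame({"label": ["a", "b", "a"]})
target_cols = ["label"]

feature_cols = df.columns.difference(target_cols).tolist()
print(feature_cols)  # [] -> the engine now raises ValueError("No features found in the data")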
linkml_store/inference/inference_config.py
@@ -1,5 +1,5 @@
  import logging
- from typing import List, Optional, Tuple, Any
+ from typing import Any, List, Optional, Tuple

  from pydantic import BaseModel, ConfigDict, Field

@@ -59,6 +59,7 @@ class Inference(BaseModel, extra="forbid"):
  """
  Result of an inference derivation.
  """
+
  query: Optional[OBJECT] = Field(default=None, description="The query object.")
  predicted_object: OBJECT = Field(..., description="The predicted object.")
  confidence: Optional[float] = Field(default=None, description="The confidence of the prediction.", le=1.0, ge=0.0)
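The added blank line is cosmetic. For reference, the Inference result model shown above can be instantiated directly; a minimal sketch (the values are made up, the field names and the 0-1 bound on confidence come from this hunk):

from linkml_store.inference.inference_config import Inference

result = Inference(
    query={"genus": "Felis", "species": None},
    predicted_object={"genus": "Felis", "species": "catus"},
    confidence=0.9,  # validated against ge=0.0, le=1.0
)
# extra="forbid" on the model means unexpected keyword arguments raise a pydantic ValidationError.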
linkml_store/inference/inference_engine.py
@@ -4,7 +4,7 @@ from abc import ABC
  from dataclasses import dataclass
  from enum import Enum
  from pathlib import Path
- from typing import Optional, TextIO, Tuple, Union, Any
+ from typing import Any, Optional, TextIO, Tuple, Union

  import pandas as pd
  from pydantic import BaseModel, ConfigDict
linkml_store/plotting/__init__.py (new file)
@@ -0,0 +1,5 @@
+ """
+ Visualization and plotting functions for LinkML data.
+ """
+
+ __version__ = "0.1.0"
linkml_store/plotting/cli.py (new file)
@@ -0,0 +1,172 @@
+ """
+ Command-line interface for the plotting package.
+ """
+
+ import logging
+ from pathlib import Path
+ from typing import Optional, Union
+
+ import click
+
+ from linkml_store.plotting.heatmap import heatmap_from_file, export_heatmap_data
+ from linkml_store.utils.format_utils import Format
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.group()
+ def plot_cli():
+ """Plotting utilities for LinkML data."""
+ pass
+
+
+ @plot_cli.command()
+ @click.argument("input_file", required=False)
+ @click.option("--x-column", "-x", required=True, help="Column to use for x-axis")
+ @click.option("--y-column", "-y", required=True, help="Column to use for y-axis")
+ @click.option("--value-column", "-v", help="Column containing values (if not provided, counts will be used)")
+ @click.option("--title", "-t", help="Title for the heatmap")
+ @click.option("--width", "-w", type=int, default=10, show_default=True, help="Width of the figure in inches")
+ @click.option("--height", "-h", type=int, default=8, show_default=True, help="Height of the figure in inches")
+ @click.option("--cmap", "-c", default="YlGnBu", show_default=True, help="Colormap to use")
+ @click.option("--output", "-o", required=True, help="Output file path")
+ @click.option("--format", "-f", help="Input file format")
+ @click.option("--dpi", type=int, default=300, show_default=True, help="DPI for output image")
+ @click.option("--square/--no-square", default=False, show_default=True, help="Make cells square")
+ @click.option("--annotate/--no-annotate", default=True, show_default=True, help="Annotate cells with values")
+ @click.option("--font-size", type=int, default=10, show_default=True, help="Font size for annotations and labels")
+ @click.option("--robust/--no-robust", default=False, show_default=True, help="Use robust quantiles for colormap scaling")
+ @click.option("--remove-duplicates/--no-remove-duplicates", default=True, show_default=True,
+ help="Remove duplicate x,y combinations (default) or keep all occurrences")
+ @click.option("--cluster", type=click.Choice(["none", "both", "x", "y"]), default="none", show_default=True,
+ help="Cluster axes: none (default), both, x-axis only, or y-axis only")
+ @click.option("--cluster-method", type=click.Choice(["complete", "average", "single", "ward"]), default="complete", show_default=True,
+ help="Linkage method for hierarchical clustering")
+ @click.option("--cluster-metric", type=click.Choice(["euclidean", "correlation", "cosine", "cityblock"]), default="euclidean", show_default=True,
+ help="Distance metric for clustering")
+ @click.option("--export-data", "-e", help="Export the heatmap data to this file")
+ @click.option("--export-format", "-E", type=click.Choice([f.value for f in Format]), default="csv", show_default=True,
+ help="Format for exported data")
+ def heatmap(
+ input_file: Optional[str],
+ x_column: str,
+ y_column: str,
+ value_column: Optional[str],
+ title: Optional[str],
+ width: int,
+ height: int,
+ cmap: str,
+ output: str,
+ format: Optional[str],
+ dpi: int,
+ square: bool,
+ annotate: bool,
+ font_size: int,
+ robust: bool,
+ remove_duplicates: bool,
+ cluster: str,
+ cluster_method: str,
+ cluster_metric: str,
+ export_data: Optional[str],
+ export_format: Union[str, Format],
+ ):
+ """
+ Create a heatmap from a tabular data file.
+
+ Examples:
+ # From a file
+ linkml-store plot heatmap data.csv -x species -y country -o heatmap.png
+
+ # From stdin
+ cat data.csv | linkml-store plot heatmap -x species -y country -o heatmap.png
+
+ This will create a heatmap showing the frequency counts of species by country.
+ If you want to use a specific value column instead of counts:
+
+ linkml-store plot heatmap data.csv -x species -y country -v population -o heatmap.png
+ """
+ # Handle file path - if None, use stdin
+ if input_file is None:
+ input_file = "-" # format_utils treats "-" as stdin
+
+ # Convert 'none' to False for clustering parameter
+ use_cluster = False if cluster == "none" else cluster
+
+ # Create heatmap visualization
+ fig, ax = heatmap_from_file(
+ file_path=input_file,
+ x_column=x_column,
+ y_column=y_column,
+ value_column=value_column,
+ title=title,
+ figsize=(width, height),
+ cmap=cmap,
+ output_file=output,
+ format=format,
+ dpi=dpi,
+ square=square,
+ annot=annotate,
+ font_size=font_size,
+ robust=robust,
+ remove_duplicates=remove_duplicates,
+ cluster=use_cluster,
+ cluster_method=cluster_method,
+ cluster_metric=cluster_metric,
+ )
+
+ # Export data if requested
+ if export_data:
+ # For export, reuse the data already loaded for the heatmap instead of loading again
+ # This avoids the "I/O operation on closed file" error when input_file is stdin
+ import pandas as pd
+ from matplotlib.axes import Axes
+
+ # Extract the data directly from the plot
+ if hasattr(ax, 'get_figure') and hasattr(ax, 'get_children'):
+ # Extract the heatmap data from the plot itself
+ heatmap_data = {}
+ for child in ax.get_children():
+ if isinstance(child, plt.matplotlib.collections.QuadMesh):
+ # Get the colormap data
+ data_values = child.get_array()
+ rows = ax.get_yticks()
+ cols = ax.get_xticks()
+ row_labels = [item.get_text() for item in ax.get_yticklabels()]
+ col_labels = [item.get_text() for item in ax.get_xticklabels()]
+
+ # Create a dataframe from the plot data
+ heatmap_df = pd.DataFrame(
+ index=[label for label in row_labels if label],
+ columns=[label for label in col_labels if label]
+ )
+
+ # Fill in the values (if we can)
+ if len(data_values) == len(row_labels) * len(col_labels):
+ for i, row in enumerate(row_labels):
+ for j, col in enumerate(col_labels):
+ if row and col: # Skip empty labels
+ idx = i * len(col_labels) + j
+ if idx < len(data_values):
+ heatmap_df.at[row, col] = data_values[idx]
+
+ # Reset index to make the y_column a regular column
+ result_df = heatmap_df.reset_index()
+ result_df.rename(columns={'index': y_column}, inplace=True)
+
+ # Export the data
+ from linkml_store.utils.format_utils import write_output
+ records = result_df.to_dict(orient='records')
+ write_output(records, format=export_format, target=export_data)
+ click.echo(f"Heatmap data exported to {export_data}")
+ break
+ else:
+ # If we couldn't extract data from the plot, inform the user
+ click.echo("Warning: Could not export data from the plot")
+ else:
+ click.echo("Warning: Could not export data from the plot")
+
+ click.echo(f"Heatmap created at {output}")
+
+
+ if __name__ == "__main__":
+ plot_cli()
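The heatmap command delegates all plotting to heatmap_from_file, so the same figure can be produced programmatically. A sketch using only the parameter names that appear in the call above (the data file and column names are assumptions):

from linkml_store.plotting.heatmap import heatmap_from_file

# Mirrors the CLI call above; "-" as file_path reads stdin, value_column=None means frequency counts.
fig, ax = heatmap_from_file(
    file_path="data.csv",
    x_column="species",
    y_column="country",
    value_column=None,
    output_file="heatmap.png",
    cmap="YlGnBu",
    figsize=(10, 8),
)

Note that the export branch above references plt.matplotlib.collections.QuadMesh but the module never imports matplotlib.pyplot as plt, so the --export-data path would raise a NameError as written.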