linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/utils/schema_utils.py
@@ -0,0 +1,23 @@
+from typing import List
+
+from linkml_runtime import SchemaView
+from linkml_runtime.linkml_model import SlotDefinition
+
+
+def path_to_attribute_list(class_name: str, path: str, schema_view: SchemaView) -> List[SlotDefinition]:
+    """
+    Convert a path to a list of attributes.
+
+    :param path:
+    :return:
+    """
+    parts = path.split(".")
+    att_list = []
+    while parts:
+        part = parts.pop(0)
+        att = schema_view.induced_slot(part, class_name)
+        if not att:
+            raise ValueError(f"Attribute {part} not found in class {class_name}")
+        att_list.append(att)
+        class_name = att.range
+    return att_list
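For orientation, a minimal usage sketch of path_to_attribute_list; the schema file and class/slot names below are hypothetical, not taken from this package, and the function itself is assumed to be in scope (e.g. imported from linkml_store.utils.schema_utils):

from linkml_runtime import SchemaView

# Hypothetical schema in which Person.address ranges over Address, which has a street slot.
sv = SchemaView("personinfo.yaml")
atts = path_to_attribute_list("Person", "address.street", sv)
print([a.name for a in atts])  # ['address', 'street']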
linkml_store/utils/sklearn_utils.py
@@ -0,0 +1,193 @@
+import logging
+import os
+import re
+import shutil
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+from linkml_runtime.utils.formatutils import underscore
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.tree import DecisionTreeClassifier, _tree, export_graphviz
+
+logger = logging.getLogger(__name__)
+
+
+def tree_to_nested_expression(
+    tree: DecisionTreeClassifier,
+    feature_names: List[str],
+    categorical_features: Optional[List[str]] = None,
+    feature_encoders: Optional[Dict[str, Union[OneHotEncoder, LabelEncoder]]] = None,
+    target_encoder: Optional[LabelEncoder] = None,
+) -> str:
+    """
+    Convert a trained scikit-learn DecisionTreeClassifier to a nested Python conditional expression.
+
+    Args:
+        tree (DecisionTreeClassifier): A trained decision tree classifier.
+        feature_names (list): List of feature names (including one-hot encoded feature names).
+        categorical_features (list): List of original categorical feature names.
+        feature_encoders (dict): Dictionary mapping feature names to their respective OneHotEncoders or LabelEncoders.
+        target_encoder (LabelEncoder, optional): LabelEncoder for the target variable if it's categorical.
+
+    Returns:
+        str: A string representing the nested Python conditional expression.
+
+    Example:
+        >>> import numpy as np
+        >>> from sklearn.tree import DecisionTreeClassifier
+        >>> from sklearn.preprocessing import OneHotEncoder, LabelEncoder
+        >>>
+        >>> # Prepare sample data
+        >>> X = np.array([[0, 'A'], [0, 'B'], [1, 'A'], [1, 'B']])
+        >>> y = np.array(['No', 'Yes', 'Yes', 'No'])
+        >>>
+        >>> # Prepare the encoders
+        >>> feature_encoders = {'feature2': OneHotEncoder(sparse_output=False, handle_unknown='ignore')}
+        >>> target_encoder = LabelEncoder()
+        >>>
+        >>> # Encode the categorical feature and target
+        >>> X_encoded = np.column_stack([
+        ...     X[:, 0],
+        ...     feature_encoders['feature2'].fit_transform(X[:, 1].reshape(-1, 1))
+        ... ])
+        >>> y_encoded = target_encoder.fit_transform(y)
+        >>>
+        >>> # Train the decision tree
+        >>> clf = DecisionTreeClassifier(random_state=42)
+        >>> clf.fit(X_encoded, y_encoded)
+        DecisionTreeClassifier(random_state=42)
+        >>>
+        >>> # Convert to nested expression
+        >>> feature_names = ['feature1', 'feature2_A', 'feature2_B']
+        >>> categorical_features = ['feature2']
+        >>> expression = tree_to_nested_expression(clf, feature_names,
+        ...     categorical_features, feature_encoders, target_encoder)
+        >>> print(expression)
+        (("Yes" if ({feature1} <= 0.5000) else "No") if ({feature2} == "A")
+        else ("No" if ({feature1} <= 0.5000) else "Yes"))
+    """
+    tree_ = tree.tree_
+    feature_name = [feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature]
+
+    categorical_features = set(categorical_features or [])
+
+    def get_original_feature_name(name):
+        return name.split("_")[0] if "_" in name else name
+
+    def recurse(node):
+        if tree_.feature[node] != _tree.TREE_UNDEFINED:
+            name = feature_name[node]
+            threshold = tree_.threshold[node]
+            original_name = get_original_feature_name(name)
+            original_name_safe = underscore(original_name)
+            name_safe = underscore(name)
+
+            original_name_safe = "{" + original_name_safe + "}"
+            name_safe = "{" + name_safe + "}"
+
+            if original_name in categorical_features:
+                if feature_encoders is None or original_name not in feature_encoders:
+                    raise ValueError(f"Encoder is required for categorical feature {original_name}")
+
+                encoder = feature_encoders[original_name]
+                if isinstance(encoder, OneHotEncoder):
+                    # For one-hot encoded features, we check if the specific category is present
+                    category = name.split("_", 1)[1]  # Get everything after the first underscore
+                    condition = f'{original_name_safe} == "{category}"'
+                elif isinstance(encoder, LabelEncoder):
+                    category = encoder.inverse_transform([int(threshold)])[0]
+                    condition = f'{original_name_safe} == "{category}"'
+                else:
+                    raise ValueError(f"Unsupported encoder type for feature {original_name}")
+            else:
+                if np.isinf(threshold):
+                    condition = "True"
+                else:
+                    condition = f"{name_safe} <= {threshold:.4f}"
+
+            left_expr = recurse(tree_.children_left[node])
+            right_expr = recurse(tree_.children_right[node])
+
+            return f"({left_expr} if ({condition}) else {right_expr})"
+        else:
+            class_index = np.argmax(tree_.value[node])
+            if target_encoder:
+                class_label = target_encoder.inverse_transform([class_index])[0]
+                return f'"{class_label}"'
+            else:
+                return str(class_index)
+
+    return recurse(0)
+
+
+def escape_label(s: str) -> str:
+    """Escape special characters in label strings."""
+    s = str(s)
+    return re.sub(r"([<>])", r"\\\1", s)
+
+
+def visualize_decision_tree(
+    clf: DecisionTreeClassifier,
+    feature_names: List[str],
+    class_names: List[str] = None,
+    output_file: Union[Path, str] = "decision_tree.png",
+) -> None:
+    """
+    Generate a visualization of the decision tree and save it as a PNG file.
+
+    :param clf: Trained DecisionTreeClassifier
+    :param feature_names: List of feature names
+    :param class_names: List of class names (optional)
+    :param output_file: The name of the file to save the visualization (default: "decision_tree.png")
+
+    >>> # Create a sample dataset
+    >>> import pandas as pd
+    >>> data = pd.DataFrame({
+    ...     'age': [25, 30, 35, 40, 45],
+    ...     'income': [50000, 60000, 70000, 80000, 90000],
+    ...     'credit_score': [600, 650, 700, 750, 800],
+    ...     'approved': ['No', 'No', 'Yes', 'Yes', 'Yes']
+    ... })
+    >>>
+    >>> # Prepare features and target
+    >>> X = data[['age', 'income', 'credit_score']]
+    >>> y = data['approved']
+    >>>
+    >>> # Encode target variable
+    >>> le = LabelEncoder()
+    >>> y_encoded = le.fit_transform(y)
+    >>>
+    >>> # Train a decision tree
+    >>> clf = DecisionTreeClassifier(random_state=42)
+    >>> _ = clf.fit(X, y_encoded)
+    >>> # Visualize the tree
+    >>> visualize_decision_tree(clf, X.columns.tolist(), le.classes_, "tests/output/test_tree.png")
+    """
+    # Escape special characters in feature names and class names
+    escaped_feature_names = [escape_label(name) for name in feature_names]
+    escaped_class_names = [escape_label(name) for name in (class_names if class_names is not None else [])]
+
+    import graphviz
+
+    dot_data = export_graphviz(
+        clf,
+        out_file=None,
+        feature_names=escaped_feature_names,
+        class_names=escaped_class_names,
+        filled=True,
+        rounded=True,
+        special_characters=True,
+    )
+    # dot_data = escape_label(dot_data)
+    logger.info(f"Dot: {dot_data}")
+    dot_path = shutil.which("dot")
+    if not dot_path:
+        logger.warning("Graphviz 'dot' executable not found in PATH. Skipping visualization.")
+        return
+    os.environ["GRAPHVIZ_DOT"] = dot_path
+
+    graph = graphviz.Source(dot_data)
+    if isinstance(output_file, Path):
+        output_file = str(output_file)
+    graph.render(output_file.rsplit(".", 1)[0], format="png", cleanup=True)
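The expressions returned by tree_to_nested_expression use {placeholder} syntax for feature values. A hedged sketch of one way such an expression could be filled in and evaluated for a single row (the expression and row below are illustrative, not produced by this package):

# Illustrative only: substitute feature values into the placeholders, then evaluate.
expr = '("Yes" if ({feature1} <= 0.5000) else "No")'
row = {"feature1": 0.2}
print(eval(expr.format(**{k: repr(v) for k, v in row.items()})))  # Yes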
linkml_store/utils/sql_utils.py
@@ -0,0 +1,177 @@
+import logging
+from typing import Any, Optional, Tuple, Type, Union
+
+import sqlalchemy
+import sqlalchemy.sql.sqltypes as sqlt
+from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition
+from linkml_runtime.utils.schema_builder import SchemaBuilder
+from sqlalchemy import MetaData, quoted_name
+
+from linkml_store.api.queries import Query
+
+logger = logging.getLogger(__name__)
+
+TYPE_MAP = {
+    sqlt.TEXT: "string",
+    sqlt.INTEGER: "integer",
+    sqlt.FLOAT: "float",
+}
+
+OP_MAP = {
+    "eq": "=",
+    "$in": "ARRAY_CONTAINS",  ## mongodb
+    "$contains": "ARRAY_CONTAINS",  ## TODO: this is chromadb-specific
+    "in": "ARRAY_CONTAINS",
+}
+
+
+def _map_type(typ: Type) -> str:
+    for k, v in TYPE_MAP.items():
+        if isinstance(typ, k):
+            return v
+    return "string"
+
+
+def where_clause_to_sql(query: Query) -> str:
+    if not query.where_clause:
+        return ""
+    if isinstance(query.where_clause, str):
+        where_clause_sql = query.where_clause
+    elif isinstance(query.where_clause, list):
+        where_clause_sql = " AND ".join(query.where_clause)
+    elif isinstance(query.where_clause, dict):
+        conjs = []
+        for k, v in query.where_clause.items():
+            conjs.extend(col_val_constraints_to_conjs(k, v))
+        where_clause_sql = " AND ".join(conjs)
+
+    else:
+        raise ValueError(f"Invalid where_clause type: {type(query.where_clause)}")
+    return "WHERE " + where_clause_sql
+
+
+def col_val_constraints_to_conjs(col_name: str, val_constraints: Any) -> list:
+    if val_constraints is None:
+        return []
+
+    def _quote(v: Any):
+        if isinstance(v, str):
+            # escape internal vs
+            v = v.replace("'", "''")
+            return f"'{v}'"
+        else:
+            return v
+
+    if isinstance(val_constraints, dict):
+        conjs = []
+        for k, v in val_constraints.items():
+            if k in OP_MAP:
+                if k == "$in" and isinstance(v, list):
+                    v_mapped = [_quote(v1) for v1 in v]
+                    t = f"{col_name} IN ({', '.join(v_mapped)})"
+                else:
+                    t = f"{OP_MAP[k]}({col_name}, {_quote(v)})"
+            else:
+                t = f"{col_name} {k} {_quote(v)}"
+            conjs.append(t)
+        return conjs
+    else:
+        return [f"{col_name} = {_quote(val_constraints)}"]
+
+
+def query_to_sql(query: Query, count=False, limit=None, offset: Optional[int] = None):
+    select_cols = query.select_cols if query.select_cols else ["*"]
+    if count:
+        sql_str = ["SELECT COUNT(*)"]
+    else:
+        sql_str = [f"SELECT {', '.join(select_cols)}"]
+    sql_str.append(f"FROM {query.from_table}")
+    sql_str.append(where_clause_to_sql(query))
+    if not count:
+        if query.sort_by:
+            sql_str.append(f"ORDER BY {', '.join(query.sort_by)}")
+    if not count:
+        if limit is None:
+            limit = query.limit
+        if limit is None:
+            limit = 100
+        if limit < 0:
+            limit = None
+        if limit is not None:
+            sql_str.append(f" LIMIT {limit}")
+        offset = offset if offset else query.offset
+        if offset:
+            sql_str.append(f" OFFSET {offset}")
+    sql_str = [line for line in sql_str if line]
+    return "\n".join(sql_str)
+
+
+def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], multivalued=False, limit=100) -> str:
+    # Create a modified WHERE clause that excludes conditions directly related to facet_column
+    modified_where = None
+    if query.where_clause:
+        where_clause_sql = where_clause_to_sql(query)
+        # Split the where clause into conditions and exclude those related to the facet_column
+        conditions = [cond for cond in where_clause_sql.split(" AND ") if not cond.startswith(f"{facet_column} ")]
+        modified_where = " AND ".join(conditions)
+
+    def make_col_safe(col):
+        return '"' + quoted_name(col, True) + '"' if " " in col else col
+
+    if isinstance(facet_column, str):
+        facet_column = make_col_safe(facet_column)
+    if isinstance(facet_column, tuple):
+        facet_column = [make_col_safe(col) for col in facet_column]
+        if multivalued:
+            raise NotImplementedError("Multivalued facets are not supported for multiple columns")
+        facet_column = ", ".join(facet_column)
+    from_table = query.from_table
+    if multivalued:
+        from_table = f"(SELECT UNNEST({facet_column}) as {facet_column} FROM {query.from_table}"
+        from_table += f" {modified_where}" if modified_where else ""
+        from_table += ")"
+    else:
+        from_table += f" {modified_where}" if modified_where else ""
+    sql_str = [f"SELECT {facet_column}, COUNT(*) as count", f"FROM {from_table}"]
+    # if modified_where:
+    #     sql_str.append(f"{modified_where}")
+    sql_str.append(f"GROUP BY {facet_column}")
+    sql_str.append("ORDER BY count DESC")  # Optional, order by count for convenience
+    if limit is not None:
+        sql_str.append(f"LIMIT {limit}")
+    return "\n".join(sql_str)
+
+
+def introspect_schema(engine: sqlalchemy.Engine) -> SchemaDefinition:
+    """
+    Introspect a database schema and return a SchemaDefinition object
+
+    :param engine:
+    :return:
+    """
+    metadata_obj = MetaData()
+    logging.info(f"Reflecting using {engine}")
+    metadata_obj.reflect(bind=engine)
+    sb = SchemaBuilder()
+    schema = sb.schema
+    for table in metadata_obj.sorted_tables:
+        logging.info(f"Importing {table.name}")
+        sb.add_class(table.name)
+        cls = schema.classes[table.name]
+        pks = [column for column in table.columns if column.primary_key]
+        if len(pks) == 1:
+            pk = pks.pop().name
+        else:
+            pk = None
+        for column in table.columns:
+            slot = SlotDefinition(column.name)
+            cls.attributes[slot.name] = slot
+            if pk and pk == column.name:
+                slot.identifier = True
+            if column.foreign_keys:
+                for fk in column.foreign_keys:
+                    [fk_table, fk_table_col] = str(fk.column).split(".")
+                    slot.range = fk_table
+            else:
+                slot.range = _map_type(column.type)
+    return schema
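A few example conjunctions produced by col_val_constraints_to_conjs, worked out by reading the function above (expected results shown as comments):

col_val_constraints_to_conjs("status", "active")          # ["status = 'active'"]
col_val_constraints_to_conjs("age", {">": 30})            # ['age > 30']
col_val_constraints_to_conjs("tag", {"$in": ["a", "b"]})  # ["tag IN ('a', 'b')"]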
linkml_store/utils/stats_utils.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+
+
+def predictive_power(df, target_col, feature_cols, cv=5):
+    from sklearn.model_selection import cross_val_score
+    from sklearn.preprocessing import LabelEncoder
+    from sklearn.tree import DecisionTreeClassifier
+
+    # Prepare the data
+    X = df[feature_cols].copy()  # Create an explicit copy
+    y = df[target_col].copy()
+
+    # Encode categorical variables
+    for col in X.columns:
+        if X[col].dtype == "object":
+            X[col] = LabelEncoder().fit_transform(X[col].astype(str))
+
+    if y.dtype == "object":
+        y = LabelEncoder().fit_transform(y.astype(str))
+
+    # Adjust cv based on the number of unique values in y
+    n_unique = len(np.unique(y))
+    cv = min(cv, n_unique)
+
+    # Train a decision tree and get cross-validated accuracy
+    clf = DecisionTreeClassifier(random_state=42)
+
+    if cv < 2:
+        # If cv is less than 2, we can't do cross-validation, so we'll just fit and score
+        clf.fit(X, y)
+        return clf.score(X, y)
+    else:
+        scores = cross_val_score(clf, X, y, cv=cv)
+        return scores.mean()
+
+
+def analyze_predictive_power(df, columns=None, cv=5):
+    if columns is None:
+        columns = df.columns
+    results = pd.DataFrame(index=columns, columns=["predictive_power", "features"])
+
+    for target_col in columns:
+        feature_cols = [col for col in columns if col != target_col]
+        try:
+            power = predictive_power(df, target_col, feature_cols, cv)
+            results.loc[target_col, "predictive_power"] = power
+            results.loc[target_col, "features"] = ", ".join(feature_cols)
+        except Exception as e:
+            print(f"Error processing {target_col}: {str(e)}")
+            results.loc[target_col, "predictive_power"] = np.nan
+
+    return results
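A small illustrative run of analyze_predictive_power; the DataFrame below is made up for demonstration and the functions above are assumed to be in scope:

import pandas as pd

df = pd.DataFrame({
    "color": ["red", "red", "blue", "blue"],
    "size": ["S", "L", "S", "L"],
    "label": ["a", "a", "b", "b"],
})
# Each column is scored by how well the remaining columns predict it (mean CV accuracy).
print(analyze_predictive_power(df, cv=2))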
linkml_store/utils/vector_utils.py
@@ -0,0 +1,158 @@
+import logging
+from typing import List, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+LOL = List[List[float]]
+
+
+def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
+    """
+    Calculate the cosine similarity between two vectors.
+
+    >>> v100 = np.array([1, 0, 0])
+    >>> v010 = np.array([0, 1, 0])
+    >>> v001 = np.array([0, 0, 1])
+    >>> v011 = np.array([0, 1, 1])
+    >>> pairwise_cosine_similarity(v100, v010)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v010, v001)
+    0.0
+    >>> pairwise_cosine_similarity(v100, v100)
+    1.0
+    >>> f"{pairwise_cosine_similarity(v010, v011):0.3f}"
+    '0.707'
+
+    :param vector1:
+    :param vector2:
+    :return:
+    """
+    dot_product = np.dot(vector1, vector2)
+    norm1 = np.linalg.norm(vector1)
+    norm2 = np.linalg.norm(vector2)
+    return float(dot_product / (norm1 * norm2))
+
+
+def compute_cosine_similarity_matrix(list1: LOL, list2: LOL) -> np.ndarray:
+    """
+    Compute cosine similarity between two lists of vectors.
+
+    Result is a two column vector sim[ROW][COL] where ROW is from list1 and COL is from list2.
+
+    :param list1:
+    :param list2:
+    :return:
+    """
+    # Convert lists to numpy arrays
+    matrix1 = np.array(list1)
+    matrix2 = np.array(list2)
+
+    # Normalize the vectors in both matrices
+    matrix1_norm = matrix1 / np.linalg.norm(matrix1, axis=1)[:, np.newaxis]
+    matrix2_norm = matrix2 / np.linalg.norm(matrix2, axis=1)[:, np.newaxis]
+
+    # Compute dot products (resulting in cosine similarity values)
+    cosine_similarity_matrix = np.dot(matrix1_norm, matrix2_norm.T)
+
+    return cosine_similarity_matrix
+
+
+def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Find the top match for each row in the cosine similarity matrix.
+
+    :param cosine_similarity_matrix:
+    :return:
+    """
+    # Find the index of the maximum value in each row
+    top_match_indices = np.argmax(cosine_similarity_matrix, axis=1)
+
+    # Find the maximum similarity value in each row
+    top_match_values = np.amax(cosine_similarity_matrix, axis=1)
+
+    return top_match_indices, top_match_values
+
+
+def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
+    # Find the indices that would sort each row in descending order
+    sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)
+
+    # Take the first n indices from the sorted indices to get the top n matches
+    top_n_indices = sorted_indices[:, :n]
+
+    # Take the first n values from the sorted values to get the top n match values
+    top_n_values = -np.sort(-cosine_similarity_matrix, axis=1)[:, :n]
+
+    return top_n_indices, top_n_values
+
+
+def mmr_diversified_search(
+    query_vector: np.ndarray, document_vectors: List[np.ndarray], relevance_factor=0.5, top_n=None
+) -> List[int]:
+    """
+    Perform diversified search using Maximal Marginal Relevance (MMR).
+
+    :param query_vector: The vector representing the query.
+    :param document_vectors: The vectors representing the documents.
+    :param relevance_factor: The balance parameter between relevance and diversity.
+    :param top_n: The number of results to return. If None, return all.
+    :return: A list of indices representing the diversified order of documents.
+    """
+    if top_n is None:
+        # If no specific number of results is specified, return all
+        top_n = len(document_vectors)
+
+    if top_n == 0:
+        return []
+
+    # Calculate cosine similarities between query and all documents
+    norms_query = np.linalg.norm(query_vector)
+    norms_docs = np.linalg.norm(document_vectors, axis=1)
+    similarities = np.dot(document_vectors, query_vector) / (norms_docs * norms_query)
+
+    # Initialize set of selected indices and results list
+    selected_indices = set()
+    result_indices = []
+
+    # Diversified search loop
+    for _ in range(top_n):
+        max_mmr = float("-inf")
+        best_index = None
+
+        # Loop over all documents
+        for idx, _doc_vector in enumerate(document_vectors):
+            if idx not in selected_indices:
+                relevance = relevance_factor * similarities[idx]
+                diversity = 0
+
+                # Penalize based on similarity to already selected documents
+                if selected_indices:
+                    max_sim_to_selected = max(
+                        [
+                            np.dot(document_vectors[idx], document_vectors[s])
+                            / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s]))
+                            for s in selected_indices
+                        ]
+                    )
+                    diversity = (1 - relevance_factor) * max_sim_to_selected
+
+                mmr_score = relevance - diversity
+
+                # Update best MMR score and index
+                if mmr_score > max_mmr:
+                    max_mmr = mmr_score
+                    best_index = idx
+
+        # Add the best document to the result and mark it as selected
+        if best_index is None:
+            logger.warning(f"No best index found over {len(document_vectors)} documents.")
+            continue
+        result_indices.append(best_index)
+        selected_indices.add(best_index)
+
+    return result_indices
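A short sketch of mmr_diversified_search on toy vectors, showing how the diversity penalty reorders near-duplicates (vectors and parameters chosen purely for illustration, with the function above assumed in scope):

import numpy as np

query = np.array([1.0, 0.0])
docs = [np.array([1.0, 0.0]), np.array([0.99, 0.14]), np.array([0.7, 0.7])]
# With a low relevance_factor, the near-duplicate of docs[0] is deferred in
# favour of the more distinct docs[2]; expected order: [0, 2, 1].
print(mmr_diversified_search(query, docs, relevance_factor=0.3, top_n=3))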
File without changes
linkml_store/webapi/html/base.html.j2
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{% block title %}LinkML Store API{% endblock %}</title>
+    <style>
+        body { font-family: Arial, sans-serif; line-height: 1.6; padding: 20px; }
+        h1 { color: #333; }
+        a { color: #0066cc; }
+        .navigation { margin-bottom: 20px; }
+        .content { margin-top: 20px; }
+    </style>
+</head>
+<body>
+    <div class="navigation">
+        <a href="/pages/">Home</a> |
+        <a href="/pages/databases">Databases</a>
+    </div>
+    <div class="content">
+        {% block content %}{% endblock %}
+    </div>
+</body>
+</html>
linkml_store/webapi/html/collection_details.html.j2
@@ -0,0 +1,15 @@
+{% extends "base.html.j2" %}
+
+{% block content %}
+<h1>{{ response.meta.title }}</h1>
+<p>Name: {{ params.collection_name }}</p>
+
+<h2>Collections</h2>
+<ul>
+{% for collection in response.data.collections %}
+    <li>
+        <a href="/pages{{ collection.links|selectattr('rel', 'equalto', 'self')|first|attr('href') }}">{{ collection.name }}</a>
+    </li>
+{% endfor %}
+</ul>
+{% endblock %}
linkml_store/webapi/html/database_details.html.j2
@@ -0,0 +1,16 @@
+{% extends "base.html.j2" %}
+
+{% block content %}
+<h1>{{ response.meta.title }}</h1>
+<p>Handle: {{ response.data.handle }}</p>
+<p>Number of collections: {{ response.data.num_collections }}</p>
+
+<h2>Collections</h2>
+<ul>
+{% for collection in response.data.collections %}
+    <li>
+        <a href="/pages{{ collection.links|selectattr('rel', 'equalto', 'self')|first|attr('href') }}">{{ collection.name }}</a>
+    </li>
+{% endfor %}
+</ul>
+{% endblock %}
linkml_store/webapi/html/databases.html.j2
@@ -0,0 +1,14 @@
+{% extends "base.html.j2" %}
+
+{% block title %}LinkML Store API - Databases{% endblock %}
+
+{% block content %}
+<h1>Databases</h1>
+<ul>
+{% for db in response.data.databases %}
+    <li>
+        <a href="/pages/databases/{{ db.name }}">{{ db.name }}</a>
+    </li>
+{% endfor %}
+</ul>
+{% endblock %}