linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/utils/mongodb_utils.py
@@ -0,0 +1,145 @@
+import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+from pymongo.database import Database
+
+logger = logging.getLogger(__name__)
+
+
+def extract_connection_info(db: Database):
+    client = db.client
+
+    # Get the host and port
+    host_info = client.address
+    if host_info:
+        host, port = host_info
+    else:
+        # For replica sets or sharded clusters, we might need to get this differently
+        host = client.HOST
+        port = client.PORT
+
+    # Get the database name
+    db_name = db.name
+
+    # Get username if available
+    username = None
+    if hasattr(client, "options") and hasattr(client.options, "credentials"):
+        credentials = client.options.credentials
+        if credentials:
+            username = credentials.username
+
+    return {"host": host, "port": port, "db_name": db_name, "username": username}
+
+
+def get_connection_string(client: MongoClient):
+    """
+    Extract a connection string from the MongoClient.
+    This avoids triggering truth value testing on Database objects.
+    """
+    if client.address:
+        host, port = client.address
+        return f"{host}:{port}"
+    elif client.hosts:
+        # For replica sets, return all hosts
+        return ",".join(f"{host}:{port}" for host, port in client.hosts)
+    elif hasattr(client, "HOST"):
+        # If we can't determine hosts, use the entire URI
+        parsed_uri = urlparse(client.HOST)
+        return f"{parsed_uri.hostname}:{parsed_uri.port}"
+    else:
+        raise ValueError("Unable to determine connection string from client")
+
+
+def get_connection_info(db: Database):
+    """
+    Extract connection information from the Database object.
+    """
+    # Get the name of the database
+    db_name = db.name
+
+    # Get the client's node list (this should work for single nodes and replica sets)
+    node_list = db.client.nodes
+
+    if not node_list:
+        raise ValueError("Unable to determine connection information from database")
+
+    # Use the first node (client.nodes is a frozenset, so it is not indexable;
+    # for single-node setups this is the only node)
+    first_node = next(iter(node_list))
+    host, port = first_node
+
+    return host, port, db_name
+
+
+def get_auth_from_client(client: MongoClient):
+    """Extract authentication details from MongoClient."""
+    if hasattr(client, "_MongoClient__options"):
+        # For older versions of PyMongo
+        options = client._MongoClient__options
+    elif hasattr(client, "options"):
+        # For newer versions of PyMongo
+        options = client.options
+    else:
+        return None, None, None
+
+    if hasattr(options, "credentials") and options.credentials:
+        creds = options.credentials
+        return creds.username, creds.password, creds.source
+    return None, None, None
+
+
+def connection_from_handle(handle: str):
+    if handle.startswith("mongodb://"):
+        handle = handle.replace("mongodb://", "")
+        host, db = handle.split("/")
+        return host, db
+    raise ValueError(f"Unsupported MongoDB handle: {handle}")
+
+
+def export_mongodb(handle: str, location: str, password: Optional[str] = None):
+    host, db_name = connection_from_handle(handle)
+
+    # Construct the mongodump command
+    cmd = ["mongodump", f"--host={host}", f"--db={db_name}"]
+    logger.info(f"Exporting MongoDB database {db_name} from {host} to {location}")
+    cmd.extend(["--out", location])
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    logger.info(f"MongoDB export completed successfully. Output: {result.stdout}")
+
+
+def import_mongodb(handle: str, dump_dir: str, drop: bool = False):
+    host, db_name = connection_from_handle(handle)
+
+    # The dump directory must contain exactly one per-database subdirectory
+    dir_path = Path(dump_dir)
+    if not dir_path.is_dir():
+        raise ValueError(f"{dir_path} is not a directory")
+    directories = os.listdir(dump_dir)
+    if len(directories) != 1:
+        raise ValueError(f"Expected exactly one database in {dump_dir}, got: {directories}")
+    src_db_name = directories[0]
+
+    # Construct the mongorestore command
+    cmd = [
+        "mongorestore",
+        f"--host={host}",
+        f"--nsFrom={src_db_name}.*",
+        f"--nsTo={db_name}.*",
+        str(dump_dir),
+    ]
+
+    # Add drop option if specified
+    if drop:
+        cmd.append("--drop")
+    logger.info(f"CMD={cmd}")
+    # Execute mongorestore
+    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    if result.stderr:
+        logger.warning(result.stderr)
+    logger.info(f"MongoDB import completed successfully. Output: {result.stdout} // {result.stderr}")
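
Taken together, export_mongodb and import_mongodb give a dump/restore round trip driven by a connection handle. A minimal usage sketch (the handle, database names, and snapshot directory are illustrative; the mongodump/mongorestore binaries must be on the PATH and the server reachable):

    from linkml_store.utils.mongodb_utils import export_mongodb, import_mongodb

    # Dump the "prod" database into ./snapshot/ (mongodump creates snapshot/prod/)
    export_mongodb("mongodb://localhost:27017/prod", "snapshot")

    # Restore the snapshot into a differently named database; --nsFrom/--nsTo
    # remap the namespace, and drop=True replaces any existing collections
    import_mongodb("mongodb://localhost:27017/prod_copy", "snapshot", drop=True)
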
linkml_store/utils/neo4j_utils.py
@@ -0,0 +1,42 @@
+import networkx as nx
+from py2neo import Graph
+
+
+def draw_neo4j_graph(handle="bolt://localhost:7687", auth=("neo4j", None)):
+    """Draw a sample of a Neo4j graph onto the current matplotlib figure."""
+    # Connect to Neo4j
+    graph = Graph(handle, auth=auth)
+
+    # Run a Cypher query
+    query = """
+    MATCH (n)-[r]->(m)
+    RETURN n, r, m
+    LIMIT 100
+    """
+    result = graph.run(query)
+
+    # Create a NetworkX graph
+    G = nx.DiGraph()  # Use DiGraph for directed edges
+    for record in result:
+        n = record["n"]
+        m = record["m"]
+        r = record["r"]
+        G.add_node(n["name"], label=list(n.labels or ["-"])[0])
+        G.add_node(m["name"], label=list(m.labels or ["-"])[0])
+        G.add_edge(n["name"], m["name"], type=type(r).__name__)
+
+    # Draw the graph
+    pos = nx.spring_layout(G)
+
+    # Draw nodes
+    nx.draw_networkx_nodes(G, pos, node_color="lightblue", node_size=10000)
+
+    # Draw edges
+    nx.draw_networkx_edges(G, pos, edge_color="gray", arrows=True)
+
+    # Add node labels
+    node_labels = nx.get_node_attributes(G, "label")
+    nx.draw_networkx_labels(G, pos, {node: f"{node}\n({label})" for node, label in node_labels.items()}, font_size=16)
+
+    # Add edge labels
+    edge_labels = nx.get_edge_attributes(G, "type")
+    nx.draw_networkx_edge_labels(G, pos, edge_labels, font_size=16)
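
A sketch of how this helper might be driven from a script (the connection details and output path are illustrative; the query assumes every node has a name property, and since the function only draws, the caller displays or saves the figure):

    import matplotlib.pyplot as plt

    from linkml_store.utils.neo4j_utils import draw_neo4j_graph

    # Render up to 100 relationships from a local Neo4j instance
    draw_neo4j_graph("bolt://localhost:7687", auth=("neo4j", "s3cr3t"))
    plt.axis("off")
    plt.savefig("graph.png", dpi=150)
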
linkml_store/utils/object_utils.py
@@ -0,0 +1,190 @@
+import json
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+
+def object_path_update(
+    obj: Union[BaseModel, Dict[str, Any]], path: str, value: Any
+) -> Union[BaseModel, Dict[str, Any]]:
+    """
+    Updates a nested object based on a path description and a value. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The object to be updated.
+    :type obj: Union[BaseModel, Dict[str, Any]]
+    :param path: The path string indicating where to place the value within the object.
+    :type path: str
+    :param value: The value to be set at the specified path.
+    :type value: Any
+    :return: A new object with the update applied; the input is not modified.
+    :rtype: Union[BaseModel, Dict[str, Any]]
+
+    **Example**::
+
+        >>> data = {}
+        >>> object_path_update(data, 'persons[0].foo.bar', 1)
+        {'persons': [{'foo': {'bar': 1}}]}
+    """
+    if isinstance(obj, BaseModel):
+        typ = type(obj)
+        obj = obj.model_dump(exclude_none=True)
+        obj = object_path_update(obj, path, value)
+        return typ(**obj)
+    obj = deepcopy(obj)
+    ret_obj = obj
+    parts = path.split(".")
+    for part in parts[:-1]:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            obj = obj.setdefault(key, [])
+            while len(obj) <= index:
+                obj.append({})
+            obj = obj[index]
+        else:
+            if part in obj and obj[part] is None:
+                del obj[part]
+            obj = obj.setdefault(part, {})
+    last_part = parts[-1]
+    if "[" in last_part:
+        key, index = last_part[:-1].split("[")
+        index = int(index)
+        if key not in obj or not isinstance(obj[key], list):
+            obj[key] = [{} for _ in range(index + 1)]
+        while len(obj[key]) <= index:
+            obj[key].append({})
+        obj[key][index] = value
+    else:
+        obj[last_part] = value
+    return ret_obj
+
+
+def object_path_get(obj: Union[BaseModel, Dict[str, Any]], path: str, default_value=None) -> Any:
+    """
+    Retrieves a value from a nested object based on a path description. The path to the
+    desired field is given in dot and bracket notation (e.g., 'a[0].b.c[1]').
+
+    :param obj: The object to read from.
+    :type obj: Union[BaseModel, Dict[str, Any]]
+    :param path: The path string indicating which value to retrieve.
+    :type path: str
+    :param default_value: The value returned when the path cannot be resolved.
+    :return: The value at the specified path.
+    :rtype: Any
+
+    **Example**::
+
+        >>> data = {'persons': [{'foo': {'bar': 1}}]}
+        >>> object_path_get(data, 'persons[0].foo.bar')
+        1
+        >>> object_path_get(data, 'persons[0].foo')
+        {'bar': 1}
+        >>> object_path_get({}, 'not there', "NA")
+        'NA'
+    """
+    if isinstance(obj, BaseModel):
+        obj = obj.model_dump()
+    parts = path.split(".")
+    for part in parts:
+        if "[" in part:
+            key, index = part[:-1].split("[")
+            index = int(index)
+            if key in obj and obj[key] is not None:
+                obj = obj[key][index]
+            else:
+                return default_value
+        else:
+            if isinstance(obj, list):
+                obj = [v1.get(part, default_value) for v1 in obj]
+            else:
+                obj = obj.get(part, default_value)
+    return obj
+
+
+def parse_update_expression(expr: str) -> Union[tuple[str, Any], None]:
+    """
+    Parse a string expression of the form 'path.to.field=value' into a path and a value.
+
+    :param expr: expression of the form 'path=value', where the value is parsed as JSON
+    :return: a (path, value) tuple, or None if the expression cannot be parsed
+    """
+    try:
+        path, val = expr.split("=", 1)
+        val = json.loads(val)
+    except ValueError:
+        return None
+    return path, val
+
+
+def clean_empties(value: Union[Dict, List]) -> Any:
+    """Recursively remove None values from nested dicts and lists."""
+    if isinstance(value, dict):
+        value = {k: v for k, v in ((k, clean_empties(v)) for k, v in value.items()) if v is not None}
+    elif isinstance(value, list):
+        value = [v for v in (clean_empties(v) for v in value) if v is not None]
+    return value
+
+
+def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=None) -> Optional[dict]:
+    """
+    Select nested attributes from a complex dictionary based on selector strings.
+
+    Args:
+        data (dict): The input nested dictionary.
+        paths (list): A list of selector strings.
+
+    Returns:
+        dict: A new dictionary with the same structure, but only the selected attributes.
+
+    Example:
+        >>> data = {
+        ...     "person": {
+        ...         "name": "John Doe",
+        ...         "age": 30,
+        ...         "address": {
+        ...             "street": "123 Main St",
+        ...             "city": "Anytown",
+        ...             "country": "USA"
+        ...         },
+        ...         "phones": [
+        ...             {"type": "home", "number": "555-1234"},
+        ...             {"type": "work", "number": "555-5678"}
+        ...         ]
+        ...     },
+        ...     "company": {
+        ...         "name": "Acme Inc",
+        ...         "location": "New York"
+        ...     }
+        ... }
+        >>> select_nested(data, ["person.address.street", "person.address.city"])
+        {'person': {'address': {'street': '123 Main St', 'city': 'Anytown'}}}
+        >>> select_nested(data, ["person.phones.number", "person.phones.type"])
+        {'person': {'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person"])  # doctest: +NORMALIZE_WHITESPACE
+        {'person': {'name': 'John Doe', 'age': 30, 'address': {'street': '123 Main St', 'city': 'Anytown',
+        'country': 'USA'}, 'phones': [{'type': 'home', 'number': '555-1234'}, {'type': 'work', 'number': '555-5678'}]}}
+        >>> select_nested(data, ["person.phones.type"])
+        {'person': {'phones': [{'type': 'home'}, {'type': 'work'}]}}
+    """
+    if current_path is None:
+        current_path = []
+    matching_paths = []
+    if not paths:
+        raise ValueError("No paths provided")
+    for path in paths:
+        if isinstance(path, str):
+            path = path.split(".")
+        if path == current_path:
+            return data
+        if path[: len(current_path)] == current_path:
+            matching_paths.append(path)
+    if not matching_paths:
+        return None
+    if isinstance(data, dict):
+        new_obj = {k: select_nested(v, matching_paths, current_path + [k]) for k, v in data.items()}
+        new_obj = {k: v for k, v in new_obj.items() if v is not None}
+        return new_obj
+    if isinstance(data, list):
+        # List items do not extend the path
+        new_obj = [select_nested(v, matching_paths, list(current_path)) for v in data]
+        new_obj = [v for v in new_obj if v is not None]
+        return new_obj
+    return data
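
The doctests above exercise plain dicts; here is a small sketch of the BaseModel branch, where object_path_update round-trips through model_dump and rebuilds the model (Person and Pet are made-up classes for illustration):

    from typing import List, Optional

    from pydantic import BaseModel

    from linkml_store.utils.object_utils import object_path_get, object_path_update


    class Pet(BaseModel):
        name: str


    class Person(BaseModel):
        name: str
        pets: Optional[List[Pet]] = None


    p = Person(name="Alice")
    p2 = object_path_update(p, "pets[0].name", "Rex")
    assert isinstance(p2, Person)
    assert object_path_get(p2, "pets[0].name") == "Rex"
    assert p.pets is None  # the original model is not modified
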
linkml_store/utils/pandas_utils.py
@@ -0,0 +1,93 @@
+import logging
+from typing import Any, Dict, List, Tuple, Union
+
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+
+def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = ".") -> Dict[str, Any]:
+    """
+    Recursively flatten a nested dictionary.
+
+    Args:
+        d (Dict[str, Any]): The dictionary to flatten.
+        parent_key (str): The parent key for nested dictionaries.
+        sep (str): The separator to use between keys.
+
+    Returns:
+        Dict[str, Any]: A flattened dictionary.
+
+    >>> flatten_dict({'a': 1, 'b': {'c': 2, 'd': {'e': 3}}})
+    {'a': 1, 'b.c': 2, 'b.d.e': 3}
+    """
+    items = []
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_dict(v, new_key, sep=sep).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
+def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
+    """
+    Convert a list of nested objects to a flattened pandas DataFrame.
+
+    Args:
+        data (List[Dict[str, Any]]): A list of nested dictionaries.
+
+    Returns:
+        pd.DataFrame: A flattened DataFrame.
+
+    >>> data = [
+    ...     {"person": {"name": "Alice", "age": 30}, "job": {"title": "Engineer", "salary": 75000}},
+    ...     {"person": {"name": "Bob", "age": 35}, "job": {"title": "Manager", "salary": 85000}}
+    ... ]
+    >>> df = nested_objects_to_dataframe(data)
+    >>> df.columns.tolist()
+    ['person.name', 'person.age', 'job.title', 'job.salary']
+    >>> df['person.name'].tolist()
+    ['Alice', 'Bob']
+    """
+    flattened_data = [flatten_dict(item) for item in data]
+    return pd.DataFrame(flattened_data)
+
+
+def facet_summary_to_dataframe_unmelted(
+    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
+) -> pd.DataFrame:
+    """Convert a facet summary (facet name(s) -> (category, count) pairs) to a wide-format DataFrame."""
+    rows = []
+
+    for facet_type, facet_data in facet_summary.items():
+        if isinstance(facet_type, str):
+            # Single facet type
+            for category, value in facet_data:
+                rows.append({facet_type: category, "Value": value})
+        else:
+            # Multiple facet types
+            for cat_val_tuple in facet_data:
+                if len(cat_val_tuple) == 2:
+                    categories, value = cat_val_tuple
+                else:
+                    categories, value = cat_val_tuple[:-1], cat_val_tuple[-1]
+                row = {"Value": value}
+                for i, facet in enumerate(facet_type):
+                    logger.debug(f"FT={facet_type} i={i} Facet: {facet}, categories: {categories}")
+                    row[facet] = categories[i] if len(categories) > i else None
+                rows.append(row)
+
+    df = pd.DataFrame(rows)
+
+    # Ensure all columns are present, fill with None if missing
+    all_columns = set(col for facet in facet_summary.keys() for col in (facet if isinstance(facet, tuple) else [facet]))
+    for col in all_columns:
+        if col not in df.columns:
+            df[col] = None
+
+    # Move 'Value' to the end
+    cols = [col for col in df.columns if col != "Value"] + ["Value"]
+    df = df[cols]
+
+    return df
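
For facet_summary_to_dataframe_unmelted, a sketch of the expected input and output shapes (the facet data is invented for illustration):

    from linkml_store.utils.pandas_utils import facet_summary_to_dataframe_unmelted

    facet_summary = {
        "species": [("human", 10), ("mouse", 5)],
        ("species", "disease"): [("human", "flu", 3), ("mouse", "cold", 2)],
    }
    df = facet_summary_to_dataframe_unmelted(facet_summary)
    # One row per (category, count) pair; facet columns not used by a row are
    # left as NaN, and the count always lands in the trailing "Value" column,
    # roughly:
    #       species disease  Value
    #    0    human     NaN     10
    #    1    mouse     NaN      5
    #    2    human     flu      3
    #    3    mouse    cold      2
    print(df)
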
linkml_store/utils/patch_utils.py
@@ -0,0 +1,126 @@
+from typing import Any, Dict, List, Optional, TypedDict
+
+import jsonpatch
+
+
+class PatchDict(TypedDict):
+    op: str
+    path: str
+    value: Optional[Any]
+    _from: Optional[str]  # stands in for the JSON Patch "from" field ("from" is a Python keyword)
+
+
+def apply_patches(obj: Any, patches: List[PatchDict], primary_key: Optional[str] = None, in_place=False) -> Any:
+    """
+    Apply a set of patches to an object.
+
+    If the object is a list, the primary key must be specified.
+
+    >>> objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches = [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+    >>> apply_patches(objs, patches, primary_key='id')
+    [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+
+    :param obj: object to patch
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects (if obj is a list)
+    :param in_place: whether to apply the patches in place
+    :return: the patched object
+    """
+    if isinstance(obj, dict):
+        patch_obj = jsonpatch.JsonPatch(patches)
+        return patch_obj.apply(obj, in_place=in_place)
+    elif isinstance(obj, list):
+        if not primary_key:
+            raise ValueError("Primary key must be specified for list objects")
+        return apply_patches_to_list(obj, patches, primary_key, in_place=in_place)
+    else:
+        raise ValueError(f"Unsupported object type: {type(obj)}")
+
+
+def apply_patches_to_list(
+    objects: List[Dict[str, Any]], patches: List[PatchDict], primary_key: str, in_place=False
+) -> List[Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects.
+
+    :param objects: list of objects
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param primary_key: key to use as the primary key for the objects
+    :param in_place: whether to apply the patches in place
+    :return: the patched list of objects
+    """
+    objs_as_dict = {obj[primary_key]: obj for obj in objects}
+    result = apply_patches_to_keyed_list(objs_as_dict, patches, in_place=in_place)
+    return list(result.values())
+
+
+def apply_patches_to_keyed_list(
+    objs_as_dict: Dict[str, Dict[str, Any]], patches: List[PatchDict], in_place=False
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Apply a set of patches to a list of objects, where the objects are keyed by a primary key.
+
+    :param objs_as_dict: objects keyed by primary key
+    :param patches: list of patches, conforming to the JSON Patch format
+    :param in_place: whether to apply the patches in place
+    :return: the patched objects, keyed by primary key
+    """
+    patch_obj = jsonpatch.JsonPatch(patches)
+    result = patch_obj.apply(objs_as_dict, in_place=in_place)
+    return result
+
+
+def patches_from_objects_lists(
+    src_objs: List[Dict[str, Any]], dst_objs: List[Dict[str, Any]], primary_key: str, exclude_none=True
+) -> List[PatchDict]:
+    """
+    Generate a set of patches to transform src_objs into dst_objs.
+
+    >>> src_objs = [{'id': 'F1', 'name': 'Cheese'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': 'Bread'}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}]
+
+    By default exclude_none is True, so None values are excluded from the patch:
+
+    >>> tgt_objs = [{'id': 'F1', 'name': 'Toast'}, {'id': 'F2', 'name': None}]
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id')
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'remove', 'path': '/F2/name'}]
+
+    If exclude_none is False, None values are treated as being set to None:
+
+    >>> patches_from_objects_lists(src_objs, tgt_objs, primary_key='id', exclude_none=False)
+    [{'op': 'replace', 'path': '/F1/name', 'value': 'Toast'}, {'op': 'replace', 'path': '/F2/name', 'value': None}]
+
+    See also: `<https://github.com/orgs/linkml/discussions/1975>`_
+
+    Note the patches are sorted deterministically, first by path, then by operation.
+    This helps ensure operations on the same object are grouped together.
+
+    :param src_objs: source objects
+    :param dst_objs: target objects
+    :param primary_key: key to use as the primary key for the objects
+    :param exclude_none: whether to exclude None values from the patch
+    :return: list of patches, sorted by path and then by operation
+    """
+    src_objs_as_dict = {obj[primary_key]: obj for obj in src_objs}
+    dst_objs_as_dict = {obj[primary_key]: obj for obj in dst_objs}
+    if exclude_none:
+        src_objs_as_dict = {k: remove_nones(v) for k, v in src_objs_as_dict.items()}
+        dst_objs_as_dict = {k: remove_nones(v) for k, v in dst_objs_as_dict.items()}
+    patch_obj = jsonpatch.JsonPatch.from_diff(src_objs_as_dict, dst_objs_as_dict)
+    pl = patch_obj.patch
+    return sorted(pl, key=lambda x: (x["path"], x["op"]))
+
+
+def remove_nones(obj: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Remove None values from a dictionary.
+
+    :param obj: dictionary to filter
+    :return: a new dictionary without None-valued keys
+    """
+    return {k: v for k, v in obj.items() if v is not None}
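
The two halves above compose into a diff/patch round trip; a minimal sketch built from the doctest data:

    from linkml_store.utils.patch_utils import apply_patches, patches_from_objects_lists

    src = [{"id": "F1", "name": "Cheese"}, {"id": "F2", "name": "Bread"}]
    dst = [{"id": "F1", "name": "Toast"}, {"id": "F2", "name": "Bread"}]

    # Diff the two lists, keyed by "id", then replay the patches on src
    patches = patches_from_objects_lists(src, dst, primary_key="id")
    assert apply_patches(src, patches, primary_key="id") == dst
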
linkml_store/utils/query_utils.py
@@ -0,0 +1,89 @@
+import operator
+from typing import Any, Callable, Dict
+
+MONGO_OPERATORS = {
+    "$eq": operator.eq,
+    "$ne": operator.ne,
+    "$gt": operator.gt,
+    "$gte": operator.ge,
+    "$lt": operator.lt,
+    "$lte": operator.le,
+    "$in": lambda a, b: any(x in b for x in (a if isinstance(a, list) else [a])),
+    "$nin": lambda a, b: all(x not in b for x in (a if isinstance(a, list) else [a])),
+}
+
+
+def mongo_query_to_match_function(where: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
+    """
+    Convert a MongoDB-style query to a matching function.
+
+    >>> query = {"name": "foo", "age": {"$gt": 25}}
+    >>> matcher = mongo_query_to_match_function(query)
+    >>> matcher({"name": "foo", "age": 30})
+    True
+    >>> matcher({"name": "foo", "age": 20})
+    False
+    >>> matcher({"name": "bar", "age": 30})
+    False
+
+    >>> nested_query = {"nested.job": "engineer", "skills": {"$in": ["python", "mongodb"]}}
+    >>> nested_matcher = mongo_query_to_match_function(nested_query)
+    >>> nested_matcher({"nested": {"job": "engineer"}, "skills": ["python", "javascript"]})
+    True
+    >>> nested_matcher({"nested": {"job": "designer"}, "skills": ["python", "mongodb"]})
+    False
+    >>> nested_matcher({"nested": {"job": "engineer"}, "skills": ["java", "c++"]})
+    False
+
+    >>> complex_query = {"name": "foo", "age": {"$gte": 25, "$lt": 40}, "nested.salary": {"$gt": 50000}}
+    >>> complex_matcher = mongo_query_to_match_function(complex_query)
+    >>> complex_matcher({"name": "foo", "age": 30, "nested": {"salary": 60000}})
+    True
+    >>> complex_matcher({"name": "foo", "age": 45, "nested": {"salary": 70000}})
+    False
+    >>> complex_matcher({"name": "foo", "age": 35, "nested": {"salary": 40000}})
+    False
+
+    >>> invalid_query = {"age": {"$invalid": 25}}
+    >>> invalid_matcher = mongo_query_to_match_function(invalid_query)
+    >>> invalid_matcher({"age": 30})
+    Traceback (most recent call last):
+    ...
+    ValueError: Unsupported operator: $invalid
+    """
+    if where is None:
+        where = {}
+
+    def matches(obj: Dict[str, Any]) -> bool:
+        def check_condition(target: Any, key: str, condition: Any) -> bool:
+            value = get_nested_value(target, key)
+            if isinstance(condition, dict) and any(k.startswith("$") for k in condition.keys()):
+                # Operator condition, e.g. {"$gte": 25, "$lt": 40}
+                for op, op_value in condition.items():
+                    if op in MONGO_OPERATORS:
+                        if not MONGO_OPERATORS[op](value, op_value):
+                            return False
+                    else:
+                        raise ValueError(f"Unsupported operator: {op}")
+                return True
+            elif isinstance(condition, dict):
+                # Nested condition: match each sub-condition against the sub-document
+                return all(check_condition(value, k, v) for k, v in condition.items())
+            else:
+                return value == condition
+
+        def get_nested_value(target: Any, key: str) -> Any:
+            parts = key.split(".")
+            for part in parts:
+                if isinstance(target, dict):
+                    target = target.get(part)
+                else:
+                    return None
+            return target
+
+        return all(check_condition(obj, k, v) for k, v in where.items())
+
+    return matches
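
The returned matcher acts as an in-memory stand-in for a MongoDB-style find over plain dicts; for example, filtering a list of invented records:

    from linkml_store.utils.query_utils import mongo_query_to_match_function

    records = [
        {"name": "foo", "age": 30, "skills": ["python"]},
        {"name": "bar", "age": 20, "skills": ["java"]},
        {"name": "baz", "age": 40, "skills": ["python", "mongodb"]},
    ]

    # Keep records over 25 that list at least one of the wanted skills
    matcher = mongo_query_to_match_function(
        {"age": {"$gt": 25}, "skills": {"$in": ["python", "mongodb"]}}
    )
    hits = [r for r in records if matcher(r)]
    assert [r["name"] for r in hits] == ["foo", "baz"]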