linkml-store 0.2.5__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.
Files changed (28)
  1. linkml_store/api/client.py +9 -6
  2. linkml_store/api/collection.py +118 -5
  3. linkml_store/api/database.py +45 -14
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +176 -8
  5. linkml_store/api/stores/duckdb/duckdb_database.py +52 -19
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/mongodb/mongodb_collection.py +186 -0
  8. linkml_store/api/stores/mongodb/mongodb_database.py +8 -3
  9. linkml_store/api/stores/solr/solr_collection.py +7 -1
  10. linkml_store/cli.py +202 -21
  11. linkml_store/index/implementations/llm_indexer.py +14 -6
  12. linkml_store/index/indexer.py +7 -4
  13. linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  14. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  15. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  16. linkml_store/inference/inference_config.py +1 -0
  17. linkml_store/utils/dat_parser.py +95 -0
  18. linkml_store/utils/enrichment_analyzer.py +217 -0
  19. linkml_store/utils/format_utils.py +183 -3
  20. linkml_store/utils/llm_utils.py +3 -1
  21. linkml_store/utils/pandas_utils.py +1 -1
  22. linkml_store/utils/sql_utils.py +7 -1
  23. linkml_store/utils/vector_utils.py +4 -11
  24. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/METADATA +4 -3
  25. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
  26. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
  27. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
  28. {linkml_store-0.2.5.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/utils/dat_parser.py
@@ -0,0 +1,95 @@
+ from typing import Tuple, Optional, Dict, Any, List
+
+ ENTRY = Dict[str, Any]
+
+
+ def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+     """
+     Parse SIB/Swiss-Prot format data into a structured dictionary.
+
+     Args:
+         text (str): The text in SIB/Swiss-Prot format
+
+     Returns:
+         dict: A dictionary with entry IDs as keys and parsed data as values
+     """
+     # Split the text into entries (separated by //)
+     entries = text.split("//\n")
+     header = None
+
+     # Initialize results dictionary
+     results = []
+
+     # Parse each entry
+     for entry in entries:
+         if not entry.strip():
+             continue
+
+         # Initialize dictionary for current entry
+         current_entry = {}
+         current_code = None
+
+         # Process each line
+         for line in entry.strip().split("\n"):
+             if not line.strip():
+                 continue
+
+             # Check if this is a new field (starts with a 2-letter code followed by space)
+             if len(line) > 2 and line[2] == " ":
+                 current_code = line[0:2]
+                 # Remove the code and the following space(s)
+                 value = line[3:].strip()
+
+                 # Initialize as list if needed for multi-line fields
+                 if current_code not in current_entry:
+                     current_entry[current_code] = []
+
+                 current_entry[current_code].append(value)
+
+             # Continuation of previous field
+             elif current_code is not None:
+                 # Handle continuation lines (typically indented)
+                 if current_code == "CC":
+                     # For comments, preserve the indentation
+                     current_entry[current_code].append(line)
+                 else:
+                     # For other fields, strip and append
+                     current_entry[current_code].append(line.strip())
+
+         # Combine multiline comments; e.g
+         # -!- ...
+         # ...
+         # -!- ...
+         ccs = current_entry.get("CC", [])
+         new_ccs = []
+         for cc in ccs:
+             if not cc.startswith("-!-") and new_ccs:
+                 new_ccs[-1] += " " + cc
+             else:
+                 new_ccs.append(cc)
+         current_entry["CC"] = new_ccs
+         for k, vs in current_entry.items():
+             if k != "CC":
+                 combined = "".join(vs)
+                 combined = combined.strip()
+                 if combined.endswith("."):
+                     combined = combined.split(".")
+                     combined = [c.strip() for c in combined if c.strip()]
+                     if k == "DE":
+                         combined = combined[0]
+                 current_entry[k] = combined
+
+         if "ID" in current_entry:
+             results.append(current_entry)
+         else:
+             header = current_entry
+
+     return header, results
+
+
+ # Example usage:
+ # data = parse_sib_format(text)
+ # for entry_id, entry_data in data.items():
+ #     print(f"Entry: {entry_id}")
+ #     for code, values in entry_data.items():
+ #         print(f"  {code}: {values}")
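Note: despite the docstring and the trailing example comment, parse_sib_format returns a (header, entries) tuple, where each entry is a dict keyed by two-letter line codes. A minimal usage sketch (the enzyme.dat file name is a hypothetical example of SIB/Swiss-Prot-style input):

from linkml_store.utils.dat_parser import parse_sib_format

# hypothetical input file; any SIB/Swiss-Prot flat-file text should work
with open("enzyme.dat") as stream:
    header, entries = parse_sib_format(stream.read())

for entry in entries:
    # line codes such as ID, DE, CC become dictionary keys
    print(entry.get("ID"), entry.get("DE"))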
linkml_store/utils/enrichment_analyzer.py
@@ -0,0 +1,217 @@
+ import pandas as pd
+ import numpy as np
+ from linkml_store.api import Collection
+ from scipy import stats
+ from typing import Dict, List
+ from pydantic import BaseModel
+
+
+ class EnrichedCategory(BaseModel):
+     """
+     Information about a category enriched in a sample
+     """
+
+     category: str
+     fold_change: float
+     original_p_value: float
+     adjusted_p_value: float
+
+
+ from collections import Counter, defaultdict
+
+
+ class EnrichmentAnalyzer:
+     def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
+         """
+         Initialize the analyzer with a DataFrame and key column names.
+         Precomputes category frequencies for the entire dataset.
+
+         Args:
+             df: DataFrame containing the data
+             sample_key: Column name for sample IDs
+             classification_key: Column name for category lists
+         """
+         self.df = df
+         self.sample_key = sample_key
+         self.classification_key = classification_key
+
+         # Precompute global category statistics
+         self.global_stats = self._compute_global_stats()
+
+         # Cache for sample-specific category counts
+         self.sample_cache: Dict[str, Counter] = {}
+
+     @classmethod
+     def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer":
+         """
+         Initialize the analyzer with a Collection and key column names.
+         Precomputes category frequencies for the entire dataset.
+
+         Args:
+             collection: Collection containing the data
+             sample_key: Column name for sample IDs
+             classification_key: Column name for category lists
+         """
+         column_atts = [sample_key, classification_key]
+         results = collection.find(select_cols=column_atts, limit=-1)
+         df = results.rows_dataframe
+         ea = cls(df, sample_key=sample_key, classification_key=classification_key)
+         return ea
+
+     def _compute_global_stats(self) -> Dict[str, int]:
+         """
+         Compute global category frequencies across all samples.
+         Returns a dictionary of category -> count
+         """
+         global_counter = Counter()
+
+         # Flatten all categories and count
+         for categories in self.df[self.classification_key]:
+             if isinstance(categories, list):
+                 global_counter.update(categories)
+             else:
+                 # Handle case where categories might be a string
+                 global_counter.update([categories])
+
+         return global_counter
+
+     @property
+     def sample_ids(self) -> List[str]:
+         df = self.df
+         return df[self.sample_key].unique().tolist()
+
+     def _get_sample_stats(self, sample_id: str) -> Counter:
+         """
+         Get category frequencies for a specific sample.
+         Uses caching to avoid recomputation.
+         """
+         if sample_id in self.sample_cache:
+             return self.sample_cache[sample_id]
+
+         sample_data = self.df[self.df[self.sample_key] == sample_id]
+         if sample_data.empty:
+             raise KeyError(f"Sample ID '{sample_id}' not found")
+         sample_data = sample_data.dropna()
+         # if sample_data.empty:
+         #     raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA")
+         counter = Counter()
+
+         for categories in sample_data[self.classification_key]:
+             if isinstance(categories, list):
+                 counter.update(categories)
+             else:
+                 counter.update([categories])
+
+         self.sample_cache[sample_id] = counter
+         return counter
+
+     def find_enriched_categories(
+         self,
+         sample_id: str,
+         min_occurrences: int = 5,
+         p_value_threshold: float = 0.05,
+         multiple_testing_correction: str = "bh",
+     ) -> List[EnrichedCategory]:
+         """
+         Find categories that are enriched in the given sample.
+
+         Args:
+             sample_id: ID of the sample to analyze
+             min_occurrences: Minimum number of occurrences required for a category
+             p_value_threshold: P-value threshold for significance
+
+         Returns:
+             List of tuples (category, fold_change, p_value) sorted by significance
+         """
+         sample_stats = self._get_sample_stats(sample_id)
+         total_sample_annotations = sum(sample_stats.values())
+         total_global_annotations = sum(self.global_stats.values())
+
+         results = []
+
+         for category, sample_count in sample_stats.items():
+             global_count = self.global_stats[category]
+
+             # Skip rare categories
+             if global_count < min_occurrences:
+                 continue
+
+             # Calculate fold change
+             sample_freq = sample_count / total_sample_annotations
+             global_freq = global_count / total_global_annotations
+             fold_change = sample_freq / global_freq if global_freq > 0 else float("inf")
+
+             # Perform Fisher's exact test
+             contingency_table = np.array(
+                 [
+                     [sample_count, global_count - sample_count],
+                     [
+                         total_sample_annotations - sample_count,
+                         total_global_annotations - total_sample_annotations - (global_count - sample_count),
+                     ],
+                 ]
+             )
+
+             _, p_value = stats.fisher_exact(contingency_table)
+
+             if p_value < p_value_threshold:
+                 results.append((category, fold_change, p_value))
+
+         if not results:
+             return results
+
+         # Sort by p-value
+         results.sort(key=lambda x: x[2])
+
+         # Apply multiple testing correction
+         categories, fold_changes, p_values = zip(*results)
+
+         if multiple_testing_correction.lower() == "bonf":
+             # Bonferroni correction
+             n_tests = len(self.global_stats)  # Total number of categories tested
+             adjusted_p_values = [min(1.0, p * n_tests) for p in p_values]
+
+         elif multiple_testing_correction.lower() == "bh":
+             # Benjamini-Hochberg correction
+             n = len(p_values)
+             sorted_indices = np.argsort(p_values)
+             sorted_p_values = np.array(p_values)[sorted_indices]
+
+             # Calculate BH adjusted p-values
+             adjusted_p_values = np.zeros(n)
+             for i, p in enumerate(sorted_p_values):
+                 adjusted_p_values[i] = p * n / (i + 1)
+
+             # Ensure monotonicity
+             for i in range(n - 2, -1, -1):
+                 adjusted_p_values[i] = min(adjusted_p_values[i], adjusted_p_values[i + 1])
+
+             # Restore original order
+             inverse_indices = np.argsort(sorted_indices)
+             adjusted_p_values = adjusted_p_values[inverse_indices]
+
+             # Ensure we don't exceed 1.0
+             adjusted_p_values = np.minimum(adjusted_p_values, 1.0)
+
+         else:
+             # No correction
+             adjusted_p_values = p_values
+
+         # Filter by adjusted p-value threshold and create final results
+         # Create EnrichedCategory objects
+         final_results = [
+             EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p)
+             for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values)
+             if adj_p < p_value_threshold
+         ]
+
+         # Sort by adjusted p-value
+         final_results.sort(key=lambda x: x.adjusted_p_value)
+         return final_results
+
+
+ # Example usage:
+ # analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories')
+ # enriched = analyzer.find_enriched_categories('sample1')
+ # for category, fold_change, p_value in enriched:
+ #     print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})")
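Note: find_enriched_categories returns EnrichedCategory models (category, fold_change, original_p_value, adjusted_p_value) rather than the tuples shown in the trailing example comment. A minimal sketch with hypothetical toy data and column names:

import pandas as pd

from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

# hypothetical data: one row per object, pairing a sample id with a list of category labels
df = pd.DataFrame(
    {
        "sample_id": ["s1", "s1", "s2", "s2", "s2"],
        "categories": [["kinase"], ["kinase"], ["membrane"], ["nucleus"], ["membrane"]],
    }
)

analyzer = EnrichmentAnalyzer(df, sample_key="sample_id", classification_key="categories")
# thresholds relaxed so the toy example yields output
for ec in analyzer.find_enriched_categories("s1", min_occurrences=1, p_value_threshold=0.5):
    print(ec.category, round(ec.fold_change, 2), ec.adjusted_p_value)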
linkml_store/utils/format_utils.py
@@ -1,8 +1,10 @@
  import csv
  import gzip
+ import hashlib
  import io
  import json
  import logging
+ import re
  import sys
  import tarfile
  from enum import Enum
@@ -28,13 +30,27 @@ class Format(Enum):
      JSONL = "jsonl"
      YAML = "yaml"
      YAMLL = "yamll"
+     TOML = "toml"
      TSV = "tsv"
      CSV = "csv"
      XML = "xml"
+     TURTLE = "turtle"
+     RDFXML = "rdfxml"
+     TEXT = "text"
+     TEXTLINES = "textlines"
+     OBO = "obo"
+     FASTA = "fasta"
+     GMT = "gmt"
+     DAT = "dat"
+     MARKDOWN = "markdown"
+     PKL = "pkl"
      PYTHON = "python"
      PARQUET = "parquet"
+     HDF5 = "hdf5"
+     NETCDF = "netcdf"
      FORMATTED = "formatted"
      TABLE = "table"
+     XLSX = "xlsx"
      SQLDUMP_DUCKDB = "duckdb"
      SQLDUMP_POSTGRES = "postgres"
      DUMP_MONGODB = "mongodb"
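Note: together with the suffix mappings added in the next hunk, these new enum members let Format.guess_format resolve a format from a file name alone; a small illustration (file names are hypothetical, and the expected values assume guess_format dispatches on the suffix as the mapping suggests):

from linkml_store.utils.format_utils import Format

print(Format.guess_format("ontology.ttl"))   # expected: Format.TURTLE
print(Format.guess_format("ontology.owl"))   # expected: Format.RDFXML
print(Format.guess_format("notes.md"))       # expected: Format.MARKDOWN
print(Format.guess_format("readme.txt"))     # expected: Format.TEXT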
@@ -51,7 +67,12 @@ class Format(Enum):
              ".yamll": cls.YAMLL,
              ".tsv": cls.TSV,
              ".csv": cls.CSV,
+             ".txt": cls.TEXT,
              ".xml": cls.XML,
+             ".owx": cls.XML,
+             ".owl": cls.RDFXML,
+             ".ttl": cls.TURTLE,
+             ".md": cls.MARKDOWN,
              ".py": cls.PYTHON,
              ".parquet": cls.PARQUET,
              ".pq": cls.PARQUET,
@@ -67,6 +88,9 @@
      def is_dump_format(self):
          return self in [Format.SQLDUMP_DUCKDB, Format.SQLDUMP_POSTGRES, Format.DUMP_MONGODB]

+     def is_binary_format(self):
+         return self in [Format.PARQUET, Format.XLSX]
+
      def is_xsv(self):
          return self in [Format.TSV, Format.CSV]

@@ -95,12 +119,45 @@ def load_objects_from_url(
      return objs


+ def clean_pandas_value(v):
+     """Clean a single value from pandas."""
+     import math
+
+     if isinstance(v, float):
+         if math.isnan(v) or math.isinf(v):
+             return None
+         return float(v)  # Ensures proper float type
+     return v
+
+
+ def clean_nested_structure(obj):
+     """Recursively clean a nested structure of dicts/lists from pandas."""
+     if isinstance(obj, dict):
+         return {k: clean_nested_structure(v) for k, v in obj.items()}
+     elif isinstance(obj, list):
+         return [clean_nested_structure(item) for item in obj]  # Fixed: using 'item' instead of 'v'
+     else:
+         return clean_pandas_value(obj)
+
+
  def process_file(
-     f: IO, format: Format, expected_type: Optional[Type] = None, header_comment_token: Optional[str] = None
+     f: IO,
+     format: Format,
+     expected_type: Optional[Type] = None,
+     header_comment_token: Optional[str] = None,
+     format_options: Optional[Dict[str, Any]] = None,
  ) -> List[Dict[str, Any]]:
      """
      Process a single file and return a list of objects.
+
+     :param f: The file object.
+     :param format: The format of the file.
+     :param expected_type: The expected type of the objects.
+     :param header_comment_token: Token used for header comments to be skipped
+     :return:
      """
+     if format_options is None:
+         format_options = {}
      if format == Format.YAMLL:
          format = Format.YAML
          expected_type = list
@@ -115,6 +172,14 @@ def process_file(
              objs = [obj for obj in objs if obj is not None]
          else:
              objs = yaml.safe_load(f)
+     elif format == Format.TOML:
+         import toml
+
+         objs = toml.load(f)
+         if not isinstance(objs, list):
+             objs = [objs]
+     elif format == Format.TEXTLINES:
+         objs = f.readlines()
      elif format in [Format.TSV, Format.CSV]:
          if header_comment_token:
              while True:
@@ -128,6 +193,87 @@ def process_file(
          objs = list(reader)
      elif format == Format.XML:
          objs = xmltodict.parse(f.read())
+     elif format == Format.PKL:
+         objs = pd.read_pickle(f).to_dict(orient="records")
+     elif format == Format.XLSX:
+         xls = pd.ExcelFile(f)
+         objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+     elif format == Format.TEXT:
+         txt = f.read()
+         objs = [
+             {
+                 "name": Path(f.name).name,
+                 "path": f.name,
+                 "content": txt,
+                 "size": len(txt),
+                 "lines": txt.count("\n") + 1,
+                 "md5": hashlib.md5(txt.encode()).hexdigest(),
+             }
+         ]
+     elif format == Format.GMT:
+         objs = []
+         lib_name = Path(f.name).name
+         for line in f:
+             parts = line.strip().split("\t")
+             desc = parts[1]
+             objs.append(
+                 {
+                     "library": lib_name,
+                     "uid": f"{lib_name}.{parts[0]}",
+                     "name": parts[0],
+                     "description": desc if desc else None,
+                     "genes": parts[2:],
+                 }
+             )
+     elif format == Format.FASTA:
+         objs = []
+         current_obj = None
+         for line in f:
+             line = line.strip()
+             if line.startswith(">"):
+                 if current_obj:
+                     objs.append(current_obj)
+                 current_obj = {"id": line[1:], "sequence": ""}
+             else:
+                 current_obj["sequence"] += line
+         if current_obj:
+             objs.append(current_obj)
+     elif format == Format.OBO:
+         blocks = split_document(f.read(), "\n\n")
+         id_pattern = re.compile(r"id: (\S+)")
+
+         def get_id(block):
+             m = id_pattern.search(block)
+             return m.group(1) if m else None
+
+         objs = [{"id": get_id(block), "content": block} for block in blocks]
+         objs = [obj for obj in objs if obj["id"]]
+     elif format == Format.DAT:
+         from linkml_store.utils.dat_parser import parse_sib_format
+
+         _, objs = parse_sib_format(f.read())
+     elif format in (Format.RDFXML, Format.TURTLE):
+         import lightrdf
+
+         parser = lightrdf.Parser()
+         objs = []
+         ext_fmt = "rdfxml"
+         if format == Format.TURTLE:
+             ext_fmt = "ttl"
+         bytesio = io.BytesIO(f.read().encode("utf-8"))
+         buffer = io.BufferedReader(bytesio)
+         for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+             obj = {
+                 "subject": s,
+                 "predicate": p,
+                 "object": o,
+             }
+             if format_options.get("pivot", False):
+                 obj = {
+                     "subject": s,
+                     p: o,
+                 }
+             objs.append(obj)
      elif format == Format.PARQUET:
          import pyarrow.parquet as pq

@@ -162,11 +308,20 @@ def load_objects(
      :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
      :param expected_type: The target type to load the objects into, e.g. list
      :param header_comment_token: Token used for header comments to be skipped
+     :param select_query: JSONPath query to select specific objects from the loaded data.
      :return: A list of dictionaries representing the loaded objects.
      """
      if isinstance(file_path, Path):
          file_path = str(file_path)

+     for url_scheme in ["http", "https", "ftp"]:
+         if file_path.startswith(f"{url_scheme}://"):
+             return load_objects_from_url(
+                 file_path,
+                 format=format,
+                 expected_type=expected_type,
+             )
+
      if isinstance(format, str):
          format = Format(format)

@@ -185,9 +340,9 @@ def load_objects(
      else:
          if Path(file_path).is_dir():
              raise ValueError(f"{file_path} is a dir, which is invalid for {format}")
-         mode = "rb" if format == Format.PARQUET or compression == "gz" else "r"
          open_func = gzip.open if compression == "gz" else open
          format = Format.guess_format(file_path) if not format else format
+         mode = "rb" if (format and format.is_binary_format()) or compression == "gz" else "r"
          with open_func(file_path, mode) if file_path != "-" else sys.stdin as f:
              if compression == "gz" and mode == "r":
                  f = io.TextIOWrapper(f)
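Note: the new per-format branches in process_file are reachable through the public load_objects entry point, which now also dispatches http/https/ftp URLs to load_objects_from_url; a hedged sketch (all paths and the URL are hypothetical):

from linkml_store.utils.format_utils import Format, load_objects

# FASTA records become {"id": ..., "sequence": ...} dicts
sequences = load_objects("proteins.fasta", format=Format.FASTA)

# GMT gene-set libraries yield one object per line: library, uid, name, description, genes
gene_sets = load_objects("hallmark.gmt", format=Format.GMT)

# SIB/Swiss-Prot .dat files are routed through the new dat_parser module
enzymes = load_objects("enzyme.dat", format=Format.DAT)

# remote files are now fetched via load_objects_from_url under the hood
remote = load_objects("https://example.org/data.yaml", format=Format.YAML)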
@@ -242,7 +397,8 @@


  def render_output(
-     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame], format: Optional[Union[Format, str]] = Format.YAML
+     data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+     format: Optional[Union[Format, str]] = Format.YAML,
  ) -> str:
      """
      Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -275,6 +431,12 @@ def render_output(
      if isinstance(data, pd.DataFrame):
          data = data.to_dict(orient="records")

+     if isinstance(data, BaseModel):
+         data = data.model_dump()
+
+     if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+         data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
      if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
          data = [data]

@@ -287,8 +449,15 @@ def render_output(
          return "\n".join(json.dumps(obj) for obj in data)
      elif format == Format.PYTHON:
          return str(data)
+     elif format == Format.MARKDOWN:
+
+         def as_markdown(obj: dict):
+             return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+         return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
      elif format == Format.TABLE:
          from tabulate import tabulate
+
          return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
      elif format == Format.YAML:
          if isinstance(data, list):
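Note: render_output now unwraps pydantic models before serializing, and the new MARKDOWN branch emits a bulleted "## Object" section per object; a small sketch (the Person model is hypothetical):

from pydantic import BaseModel

from linkml_store.utils.format_utils import Format, render_output


class Person(BaseModel):  # hypothetical model for illustration
    name: str
    age: int


people = [Person(name="Alice", age=33), Person(name="Bob", age=41)]

# models are converted via model_dump() before the format-specific branches run
print(render_output(people, format=Format.MARKDOWN))
print(render_output(people, format=Format.YAML))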
@@ -343,3 +512,14 @@ def guess_format(path: str) -> Optional[Format]:
      :return: The guessed format.
      """
      return Format.guess_format(path)
+
+
+ def split_document(doc: str, delimiter: str):
+     """
+     Split a document into parts based on a delimiter.
+
+     :param doc: The document to split.
+     :param delimiter: The delimiter.
+     :return: The parts of the document.
+     """
+     return doc.split(delimiter)
linkml_store/utils/llm_utils.py
@@ -76,6 +76,7 @@ def render_formatted_text(
          return text
      if not values:
          raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+     # remove last element and try again
      return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)


@@ -104,6 +105,7 @@ def get_token_limit(model_name: str) -> int:

  def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
      import yaml
+
      if "```" in yaml_str:
          yaml_str = yaml_str.split("```")[1].strip()
          if yaml_str.startswith("yaml"):
@@ -114,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
          if strict:
              raise e
          logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
-         return None
+         return None
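Note: the changes to llm_utils are cosmetic (a clarifying comment and whitespace), but for context, parse_yaml_payload strips a markdown code fence and an optional leading "yaml" tag before parsing; for example (expected behaviour, not verified against this exact version):

from linkml_store.utils.llm_utils import parse_yaml_payload

# typical LLM response wrapping YAML in a fenced block
response = "```yaml\nname: test\nvalue: 5\n```"

print(parse_yaml_payload(response))  # expected: {'name': 'test', 'value': 5}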
linkml_store/utils/pandas_utils.py
@@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:


  def facet_summary_to_dataframe_unmelted(
-     facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
+     facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
  ) -> pd.DataFrame:
      rows = []

linkml_store/utils/sql_utils.py
@@ -5,7 +5,7 @@ import sqlalchemy
  import sqlalchemy.sql.sqltypes as sqlt
  from linkml_runtime.linkml_model import SchemaDefinition, SlotDefinition
  from linkml_runtime.utils.schema_builder import SchemaBuilder
- from sqlalchemy import MetaData
+ from sqlalchemy import MetaData, quoted_name

  from linkml_store.api.queries import Query

@@ -115,7 +115,13 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul
      conditions = [cond for cond in where_clause_sql.split(" AND ") if not cond.startswith(f"{facet_column} ")]
      modified_where = " AND ".join(conditions)

+     def make_col_safe(col):
+         return '"' + quoted_name(col, True) + '"' if " " in col else col
+
+     if isinstance(facet_column, str):
+         facet_column = make_col_safe(facet_column)
      if isinstance(facet_column, tuple):
+         facet_column = [make_col_safe(col) for col in facet_column]
          if multivalued:
              raise NotImplementedError("Multivalued facets are not supported for multiple columns")
          facet_column = ", ".join(facet_column)
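Note: the intent of the new helper is to double-quote facet columns whose names contain spaces before they are interpolated into the generated facet-count SQL. An illustrative standalone sketch of the same rule:

from sqlalchemy import quoted_name


def make_col_safe(col: str) -> str:
    # mirror of the helper added to facet_count_sql: quote only names containing a space
    return '"' + quoted_name(col, True) + '"' if " " in col else col


print(make_col_safe("category"))      # category
print(make_col_safe("sample label"))  # "sample label"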