linkml-store 0.2.6.tar.gz → 0.2.9.tar.gz

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (83)
  1. {linkml_store-0.2.6 → linkml_store-0.2.9}/PKG-INFO +3 -1
  2. {linkml_store-0.2.6 → linkml_store-0.2.9}/pyproject.toml +3 -1
  3. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/client.py +2 -3
  4. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/collection.py +63 -8
  5. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/database.py +30 -2
  6. linkml_store-0.2.9/src/linkml_store/api/stores/duckdb/duckdb_collection.py +337 -0
  7. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/duckdb/duckdb_database.py +3 -3
  8. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/filesystem/__init__.py +1 -1
  9. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +115 -12
  10. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  11. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/solr/solr_collection.py +7 -1
  12. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/cli.py +201 -20
  13. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/index/implementations/llm_indexer.py +14 -6
  14. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/index/indexer.py +7 -4
  15. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/implementations/llm_inference_engine.py +13 -9
  16. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  17. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  18. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/inference_config.py +1 -0
  19. linkml_store-0.2.9/src/linkml_store/utils/dat_parser.py +95 -0
  20. linkml_store-0.2.9/src/linkml_store/utils/enrichment_analyzer.py +217 -0
  21. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/format_utils.py +124 -3
  22. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/llm_utils.py +3 -1
  23. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/pandas_utils.py +1 -1
  24. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/sql_utils.py +1 -1
  25. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/vector_utils.py +3 -10
  26. linkml_store-0.2.6/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -175
  27. {linkml_store-0.2.6 → linkml_store-0.2.9}/LICENSE +0 -0
  28. {linkml_store-0.2.6 → linkml_store-0.2.9}/README.md +0 -0
  29. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/__init__.py +0 -0
  30. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/__init__.py +0 -0
  31. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/config.py +0 -0
  32. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/queries.py +0 -0
  33. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/__init__.py +0 -0
  34. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  35. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  36. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  37. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  38. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  39. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  40. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  41. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  42. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  43. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  44. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  45. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  46. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  47. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  48. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  49. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  50. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  51. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/api/types.py +0 -0
  52. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/constants.py +0 -0
  53. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/graphs/__init__.py +0 -0
  54. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/graphs/graph_map.py +0 -0
  55. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/index/__init__.py +0 -0
  56. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/index/implementations/__init__.py +0 -0
  57. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  58. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/__init__.py +0 -0
  59. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/evaluation.py +0 -0
  60. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/implementations/__init__.py +0 -0
  61. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  62. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/inference_engine.py +0 -0
  63. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  64. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/__init__.py +0 -0
  65. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/change_utils.py +0 -0
  66. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/file_utils.py +0 -0
  67. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/io.py +0 -0
  68. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/mongodb_utils.py +0 -0
  69. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/neo4j_utils.py +0 -0
  70. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/object_utils.py +0 -0
  71. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/patch_utils.py +0 -0
  72. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/query_utils.py +0 -0
  73. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/schema_utils.py +0 -0
  74. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/sklearn_utils.py +0 -0
  75. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/utils/stats_utils.py +0 -0
  76. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/__init__.py +0 -0
  77. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/__init__.py +0 -0
  78. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  79. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  80. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  81. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  82. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  83. {linkml_store-0.2.6 → linkml_store-0.2.9}/src/linkml_store/webapi/main.py +0 -0
--- linkml_store-0.2.6/PKG-INFO
+++ linkml_store-0.2.9/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: linkml-store
- Version: 0.2.6
+ Version: 0.2.9
  Summary: linkml-store
  License: MIT
  Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: map
  Provides-Extra: mongodb
  Provides-Extra: neo4j
  Provides-Extra: pyarrow
+ Provides-Extra: rdf
  Provides-Extra: renderer
  Provides-Extra: scipy
  Provides-Extra: tests
@@ -39,6 +40,7 @@ Requires-Dist: h5py ; extra == "h5py"
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
  Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
  Requires-Dist: jsonpatch (>=1.33)
+ Requires-Dist: lightrdf ; extra == "rdf"
  Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
  Requires-Dist: linkml-runtime (>=1.8.0)
  Requires-Dist: linkml_map ; extra == "map"
--- linkml_store-0.2.6/pyproject.toml
+++ linkml_store-0.2.9/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "linkml-store"
- version = "0.2.6"
+ version = "0.2.9"
  description = "linkml-store"
  authors = ["Author 1 <author@org.org>"]
  license = "MIT"
@@ -23,6 +23,7 @@ pystow = "^0.5.4"
  black = { version=">=24.0.0", optional = true }
  ruff = { version=">=0.6.2", optional = true }
  llm = { version="*", optional = true }
+ lightrdf = { version="*", optional = true }
  tiktoken = { version="*", optional = true }
  pymongo = "^4.11"
  neo4j = { version="*", optional = true }
@@ -91,6 +92,7 @@ renderer = ["linkml_renderer"]
  fastapi = ["fastapi", "uvicorn"]
  frictionless = ["frictionless"]
  scipy = ["scipy", "scikit-learn"]
+ rdf = ["lightrdf"]
  #ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
  bigquery = ["google-cloud-bigquery"]
  all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
--- linkml_store-0.2.6/src/linkml_store/api/client.py
+++ linkml_store-0.2.9/src/linkml_store/api/client.py
@@ -12,7 +12,6 @@ from linkml_store.api.config import ClientConfig
  logger = logging.getLogger(__name__)
  
  
-
  HANDLE_MAP = {
      "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
      "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
@@ -220,14 +219,14 @@ class Client:
          scheme, _ = handle.split(":", 1)
          if scheme not in HANDLE_MAP:
              raise ValueError(f"Unknown scheme: {scheme}")
-         module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+         module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
          try:
              module = importlib.import_module(module_path)
              cls = getattr(module, class_name)
          except ImportError as e:
              raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
  
-         #cls = HANDLE_MAP[scheme]
+         # cls = HANDLE_MAP[scheme]
          db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
          if schema_view:
              db.set_schema_view(schema_view)
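
To see what the hunk above does end to end, here is the resolution path a handle takes, as a minimal sketch assembled from the HANDLE_MAP and importlib logic shown (the in-memory handle string is illustrative):

import importlib

HANDLE_MAP = {
    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
}

handle = "duckdb:///:memory:"  # illustrative handle
scheme, _ = handle.split(":", 1)
module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
# The import is deferred until a handle is attached, so a missing backend
# surfaces as an ImportError pointing at the right extra, not at import time.
cls = getattr(importlib.import_module(module_path), class_name)
db = cls(handle=handle)
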
--- linkml_store-0.2.6/src/linkml_store/api/collection.py
+++ linkml_store-0.2.9/src/linkml_store/api/collection.py
@@ -211,7 +211,7 @@ class Collection(Generic[DatabaseType]):
          """
          raise NotImplementedError
  
-     def index (
+     def index(
          self,
          objs: Union[OBJECT, List[OBJECT]],
          index_name: Optional[str] = None,
@@ -231,10 +231,13 @@
          """
          raise NotImplementedError
  
-     def upsert(self,
-                objs: Union[OBJECT, List[OBJECT]],
-                filter_fields: List[str],
-                update_fields: Union[List[str], None] = None, **kwargs):
+     def upsert(
+         self,
+         objs: Union[OBJECT, List[OBJECT]],
+         filter_fields: List[str],
+         update_fields: Union[List[str], None] = None,
+         **kwargs,
+     ):
          """
          Add one or more objects to the collection.
  
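
A minimal usage sketch for the reformatted `upsert` signature (the collection object and field names are illustrative):

objs = [{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}]
collection.upsert(
    objs,
    filter_fields=["id"],      # fields used to match existing objects
    update_fields=["name"],    # fields written when a match is found
)
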
@@ -454,7 +457,12 @@
              return qr.rows[0]
          return None
  
-     def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
+     def find(
+         self,
+         where: Optional[Any] = None,
+         select_cols: Optional[List[str]] = None,
+         **kwargs,
+     ) -> QueryResult:
          """
          Find objects in the collection using a where query.
  
@@ -484,10 +492,14 @@
  
  
          :param where:
+         :param select_cols:
          :param kwargs:
          :return:
          """
-         query = self._create_query(where_clause=where)
+         query = self._create_query(
+             where_clause=where,
+             select_cols=select_cols,
+         )
          self._pre_query_hook(query)
          return self.query(query, **kwargs)
  
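
The functional change in the two hunks above is the new `select_cols` parameter, which `find` now forwards to `_create_query`. A minimal usage sketch (collection, filter, and column names are illustrative):

qr = collection.find(
    where={"country": "US"},
    select_cols=["id", "name"],  # restrict the columns returned in each row
    limit=10,
)
for row in qr.rows:
    print(row)
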
@@ -587,6 +599,7 @@
          assert ix_coll.size() > 0
          qr = ix_coll.find(where=where, limit=-1, **kwargs)
          index_col = ix.index_field
+
          # TODO: optimize this for large indexes
          def row2array(row):
              v = row[index_col]
@@ -594,6 +607,7 @@
                  # sqlite stores arrays as strings
                  v = json.loads(v)
              return np.array(v, dtype=float)
+
          vector_pairs = [(row, row2array(row)) for row in qr.rows]
          results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
          for r in results:
@@ -608,6 +622,47 @@
          new_qr.rows = [r[1] for r in results]
          return new_qr
  
+     def group_by(
+         self,
+         group_by_fields: List[str],
+         inlined_field="objects",
+         agg_map: Optional[Dict[str, str]] = None,
+         where: Optional[Dict] = None,
+         **kwargs,
+     ) -> QueryResult:
+         """
+         Group objects in the collection by a column.
+
+         :param group_by:
+         :param where:
+         :param kwargs:
+         :return:
+         """
+         if isinstance(group_by_fields, str):
+             group_by_fields = [group_by_fields]
+         df = self.find(where=where, limit=-1).rows_dataframe
+
+         # Handle the case where agg_map is None
+         if agg_map is None:
+             agg_map = {}
+
+         pk_fields = agg_map.get("first", []) + group_by_fields
+         list_fields = agg_map.get("list", [])
+         if not list_fields:
+             list_fields = [a for a in df.columns if a not in pk_fields]
+
+         grouped_objs = defaultdict(list)
+         for _, row in df.iterrows():
+             pk = tuple(row[pk_fields])
+             grouped_objs[pk].append({k: row[k] for k in list_fields})
+         results = []
+         for pk, objs in grouped_objs.items():
+             top_obj = {k: v for k, v in zip(pk_fields, pk)}
+             top_obj[inlined_field] = objs
+             results.append(top_obj)
+         r = QueryResult(num_rows=len(results), rows=results)
+         return r
+
      @property
      def is_internal(self) -> bool:
          """
@@ -1062,7 +1117,7 @@
              multivalued = any(multivalueds)
              inlined = any(inlineds)
              if multivalued and False in multivalueds:
-                 raise ValueError(f"Mixed list non list: {vs} // inferred= {multivalueds}")
+                 logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
              # if not rngs:
              # raise AssertionError(f"Empty rngs for {k} = {vs}")
              rng = rngs[0] if rngs else None
--- linkml_store-0.2.6/src/linkml_store/api/database.py
+++ linkml_store-0.2.9/src/linkml_store/api/database.py
@@ -595,7 +595,31 @@ class Database(ABC, Generic[CollectionType]):
              sb.add_class(coll.target_class_name)
          return SchemaView(sb.schema)
  
-     def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
+     def validate_database(self, **kwargs) -> List["ValidationResult"]:
+         """
+         Validate the contents of the database.
+
+         As `iter_validate_database`, but returns a list of validation results.
+
+         :param kwargs:
+         :return:
+         """
+         return list(self.iter_validate_database(**kwargs))
+
+     def validate_database(self, **kwargs) -> List["ValidationResult"]:
+         """
+         Validate the contents of the database.
+
+         As `iter_validate_database`, but returns a list of validation results.
+
+         :param kwargs:
+         :return:
+         """
+         return list(self.iter_validate_database(**kwargs))
+
+     def iter_validate_database(
+         self, ensure_referential_integrity: bool = None, **kwargs
+     ) -> Iterator["ValidationResult"]:
          """
          Validate the contents of the database.
  
@@ -635,12 +659,14 @@
              'capital' is a required property
              'continent' is a required proper
  
+         :param ensure_referential_integrity: ensure referential integrity
          :param kwargs:
          :return: iterator over validation results
          """
          for collection in self.list_collections():
              yield from collection.iter_validate_collection(**kwargs)
-         if self.metadata.ensure_referential_integrity:
+         if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+             logger.info(f"Validating referential integrity on {self.alias}")
              yield from self._validate_referential_integrity(**kwargs)
  
      def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
@@ -661,7 +687,9 @@
          induced_slots = sv.class_induced_slots(cd.name)
          slot_map = {s.name: s for s in induced_slots}
          # rmap = {s.name: s.range for s in induced_slots}
+         # map slot ranges to a collection where that range is stored
          sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+         logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
          for obj in collection.find_iter():
              for k, v in obj.items():
                  if k not in sr_to_coll:
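
Two things stand out in the database.py hunks: `validate_database` is an eager wrapper over `iter_validate_database` (note the 0.2.9 source adds the identical method twice, as the hunk shows), and referential-integrity checking can now be requested per call instead of only via database metadata. A minimal sketch of the call (the db object is illustrative):

results = db.validate_database(ensure_referential_integrity=True)
for r in results:
    print(r)  # each r is a ValidationResult
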
--- /dev/null
+++ linkml_store-0.2.9/src/linkml_store/api/stores/duckdb/duckdb_collection.py
@@ -0,0 +1,337 @@
+ import logging
+ from typing import Any, Dict, List, Optional, Union, Tuple
+
+ import sqlalchemy as sqla
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+ from sqlalchemy import Column, Table, delete, insert, inspect, text
+ from sqlalchemy.sql.ddl import CreateTable
+
+ from linkml_store.api import Collection
+ from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+ from linkml_store.api.queries import Query, QueryResult
+ from linkml_store.api.stores.duckdb.mappings import TMAP
+ from linkml_store.utils.sql_utils import facet_count_sql
+
+ logger = logging.getLogger(__name__)
+
+
+ class DuckDBCollection(Collection):
+     _table_created: bool = None
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+         logger.debug(f"Inserting {len(objs)}")
+         if not isinstance(objs, list):
+             objs = [objs]
+         if not objs:
+             return
+         cd = self.class_definition()
+         if not cd:
+             logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
+             cd = self.induce_class_definition_from_objects(objs)
+         self._create_table(cd)
+         table = self._sqla_table(cd)
+         logger.info(f"Inserting into: {self.alias} // T={table.name}")
+         engine = self.parent.engine
+         col_names = [c.name for c in table.columns]
+         bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
+         if bad_objs:
+             logger.error(f"Bad objects: {bad_objs}")
+         objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+         with engine.connect() as conn:
+             with conn.begin():
+                 conn.execute(insert(table), objs)
+             conn.commit()
+         self._post_insert_hook(objs)
+
+     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+         if not isinstance(objs, list):
+             objs = [objs]
+         cd = self.class_definition()
+         if not cd or not cd.attributes:
+             cd = self.induce_class_definition_from_objects(objs)
+         assert cd.attributes
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         with engine.connect() as conn:
+             for obj in objs:
+                 conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                 stmt = delete(table).where(*conditions)
+                 stmt = stmt.compile(engine)
+                 conn.execute(stmt)
+                 conn.commit()
+         self._post_delete_hook()
+         return None
+
+     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+         logger.info(f"Deleting from {self.target_class_name} where: {where}")
+         if where is None:
+             where = {}
+         cd = self.class_definition()
+         if not cd:
+             logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated")
+             return 0
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+         inspector = inspect(engine)
+         table_exists = table.name in inspector.get_table_names()
+         if not table_exists:
+             logger.info(f"Table {table.name} does not exist, assuming no data")
+             return 0
+         with engine.connect() as conn:
+             conditions = [table.c[k] == v for k, v in where.items()]
+             stmt = delete(table).where(*conditions)
+             stmt = stmt.compile(engine)
+             result = conn.execute(stmt)
+             deleted_rows_count = result.rowcount
+             if deleted_rows_count == 0 and not missing_ok:
+                 raise ValueError(f"No rows found for {where}")
+             conn.commit()
+         self._post_delete_hook()
+         return deleted_rows_count if deleted_rows_count > -1 else None
+
+     def query_facets(
+         self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+     ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+         if facet_limit is None:
+             facet_limit = DEFAULT_FACET_LIMIT
+         results = {}
+         cd = self.class_definition()
+         with self.parent.engine.connect() as conn:
+             if not facet_columns:
+                 if not cd:
+                     raise ValueError(f"No class definition found for {self.target_class_name}")
+                 facet_columns = list(cd.attributes.keys())
+             for col in facet_columns:
+                 logger.debug(f"Faceting on {col}")
+                 if isinstance(col, tuple):
+                     sd = SlotDefinition(name="PLACEHOLDER")
+                 else:
+                     sd = cd.attributes[col]
+                 facet_query = self._create_query(where_clause=where)
+                 facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                 logger.debug(f"Facet query: {facet_query_str}")
+                 rows = list(conn.execute(text(facet_query_str)))
+                 results[col] = [tuple(row) for row in rows]
+         return results
+
+     def _sqla_table(self, cd: ClassDefinition) -> Table:
+         schema_view = self.parent.schema_view
+         metadata_obj = sqla.MetaData()
+         cols = []
+         for att in schema_view.class_induced_slots(cd.name):
+             typ = TMAP.get(att.range, sqla.String)
+             if att.inlined or att.inlined_as_list:
+                 typ = sqla.JSON
+             if att.multivalued:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             if att.array:
+                 typ = sqla.ARRAY(typ, dimensions=1)
+             col = Column(att.name, typ)
+             cols.append(col)
+         t = Table(self.alias, metadata_obj, *cols)
+         return t
+
+     def _check_if_initialized(self) -> bool:
+         # if self._initialized:
+         #     return True
+         query = Query(
+             from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+         )
+         qr = self.parent.query(query)
+         if qr.num_rows > 0:
+             return True
+         return False
+
+     def group_by(
+         self,
+         group_by_fields: List[str],
+         inlined_field="objects",
+         agg_map: Optional[Dict[str, str]] = None,
+         where: Optional[Dict] = None,
+         **kwargs,
+     ) -> QueryResult:
+         """
+         Group objects in the collection by specified fields using SQLAlchemy.
+
+         This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+         :param group_by_fields: List of fields to group by
+         :param inlined_field: Field name to store aggregated objects
+         :param agg_map: Dictionary mapping aggregation types to fields
+         :param where: Filter conditions
+         :param kwargs: Additional arguments
+         :return: Query result containing grouped data
+         """
+         if isinstance(group_by_fields, str):
+             group_by_fields = [group_by_fields]
+
+         cd = self.class_definition()
+         if not cd:
+             logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         # Check if the table exists
+         if not self.parent._table_exists(self.alias):
+             logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         # Get table definition
+         table = self._sqla_table(cd)
+         engine = self.parent.engine
+
+         # Create a SQLAlchemy select statement for groups
+         from sqlalchemy import select, func, and_, or_
+         group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+         if not group_cols:
+             logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+         stmt = select(*group_cols).distinct()
+
+         # Add where conditions if specified
+         if where:
+             conditions = []
+             for k, v in where.items():
+                 if k in table.columns.keys():
+                     # Handle different operator types (dict values for operators)
+                     if isinstance(v, dict):
+                         for op, val in v.items():
+                             if op == "$gt":
+                                 conditions.append(table.c[k] > val)
+                             elif op == "$gte":
+                                 conditions.append(table.c[k] >= val)
+                             elif op == "$lt":
+                                 conditions.append(table.c[k] < val)
+                             elif op == "$lte":
+                                 conditions.append(table.c[k] <= val)
+                             elif op == "$ne":
+                                 conditions.append(table.c[k] != val)
+                             elif op == "$in":
+                                 conditions.append(table.c[k].in_(val))
+                             else:
+                                 # Default to equality for unknown operators
+                                 logger.warning(f"Unknown operator {op}, using equality")
+                                 conditions.append(table.c[k] == val)
+                     else:
+                         # Direct equality comparison
+                         conditions.append(table.c[k] == v)
+
+             if conditions:
+                 for condition in conditions:
+                     stmt = stmt.where(condition)
+
+         results = []
+         try:
+             with engine.connect() as conn:
+                 # Get all distinct groups
+                 group_result = conn.execute(stmt)
+                 group_rows = list(group_result)
+
+                 # For each group, get all objects
+                 for group_row in group_rows:
+                     # Build conditions for this group
+                     group_conditions = []
+                     group_dict = {}
+
+                     for i, field in enumerate(group_by_fields):
+                         if field in table.columns.keys():
+                             value = group_row[i]
+                             group_dict[field] = value
+                             if value is None:
+                                 group_conditions.append(table.c[field].is_(None))
+                             else:
+                                 group_conditions.append(table.c[field] == value)
+
+                     # Get all rows for this group
+                     row_stmt = select(*table.columns)
+                     for condition in group_conditions:
+                         row_stmt = row_stmt.where(condition)
+
+                     # Add original where conditions
+                     if where:
+                         for k, v in where.items():
+                             if k in table.columns.keys():
+                                 # Handle different operator types for the row query as well
+                                 if isinstance(v, dict):
+                                     for op, val in v.items():
+                                         if op == "$gt":
+                                             row_stmt = row_stmt.where(table.c[k] > val)
+                                         elif op == "$gte":
+                                             row_stmt = row_stmt.where(table.c[k] >= val)
+                                         elif op == "$lt":
+                                             row_stmt = row_stmt.where(table.c[k] < val)
+                                         elif op == "$lte":
+                                             row_stmt = row_stmt.where(table.c[k] <= val)
+                                         elif op == "$ne":
+                                             row_stmt = row_stmt.where(table.c[k] != val)
+                                         elif op == "$in":
+                                             row_stmt = row_stmt.where(table.c[k].in_(val))
+                                         else:
+                                             # Default to equality for unknown operators
+                                             row_stmt = row_stmt.where(table.c[k] == val)
+                                 else:
+                                     # Direct equality comparison
+                                     row_stmt = row_stmt.where(table.c[k] == v)
+
+                     row_result = conn.execute(row_stmt)
+                     rows = list(row_result)
+
+                     # Convert rows to dictionaries
+                     objects = []
+                     for row in rows:
+                         obj = {}
+                         for i, col in enumerate(row._fields):
+                             obj[col] = row[i]
+                         objects.append(obj)
+
+                     # Apply agg_map to filter fields if specified
+                     if agg_map and "list" in agg_map:
+                         list_fields = agg_map["list"]
+                         if list_fields:
+                             objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                     # Create the result object
+                     result_obj = group_dict.copy()
+                     result_obj[inlined_field] = objects
+                     results.append(result_obj)
+
+             return QueryResult(num_rows=len(results), rows=results)
+         except Exception as e:
+             logger.warning(f"Error in DuckDB group_by: {e}")
+             # Fall back to parent implementation
+             return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+     def _create_table(self, cd: ClassDefinition):
+         if self._table_created or self.metadata.is_prepopulated:
+             logger.info(f"Already have table for: {cd.name}")
+             return
+         if self.parent._table_exists(self.alias):
+             logger.info(f"Table already exists for {cd.name}")
+             self._table_created = True
+             self._initialized = True
+             self.metadata.is_prepopulated = True
+             return
+         # query = Query(
+         #     from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+         # )
+         # qr = self.parent.query(query)
+         # if qr.num_rows > 0:
+         #     logger.info(f"Table already exists for {cd.name}")
+         #     self._table_created = True
+         #     self._initialized = True
+         #     self.metadata.is_prepopulated = True
+         #     return
+         logger.info(f"Creating table for {cd.name}")
+         t = self._sqla_table(cd)
+         ct = CreateTable(t)
+         ddl = str(ct.compile(self.parent.engine))
+         with self.parent.engine.connect() as conn:
+             conn.execute(text(ddl))
+             conn.commit()
+         self._table_created = True
+         self._initialized = True
+         self.metadata.is_prepopulated = True
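
A minimal usage sketch for the DuckDB-backed `group_by` above, which pushes the distinct-group selection into SQL and accepts MongoDB-style comparison operators in `where` (data and field names are illustrative; unknown operators fall back to equality with a warning, and any error falls back to the generic in-memory implementation):

qr = duckdb_collection.group_by(
    ["country"],
    where={"age": {"$gte": 18}},       # translated to table.c["age"] >= 18
    agg_map={"list": ["id", "name"]},  # trim each grouped object to these fields
)
for group in qr.rows:
    print(group["country"], len(group["objects"]))  # "objects" is the default inlined_field
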
--- linkml_store-0.2.6/src/linkml_store/api/stores/duckdb/duckdb_database.py
+++ linkml_store-0.2.9/src/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -100,9 +100,9 @@ class DuckDBDatabase(Database):
              meta_query = Query(
                  from_table="sqlite_master",
                  where_clause={
-                     #"type": "table",
+                     # "type": "table",
                      "name": table,
-                 }
+                 },
              )
          else:
              if table.startswith("information_schema"):
@@ -112,7 +112,7 @@
                  where_clause={
                      "table_type": "BASE TABLE",
                      "table_name": table,
-                 }
+                 },
              )
  
          qr = self.query(meta_query)
--- linkml_store-0.2.6/src/linkml_store/api/stores/filesystem/__init__.py
+++ linkml_store-0.2.9/src/linkml_store/api/stores/filesystem/__init__.py
@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
  Handles have the form:
  
  - ``file:<path>`` for a local file
- """
+ """
  
  from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
  from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase