linkml-store 0.2.6__tar.gz → 0.2.10rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.

Files changed (87)
  1. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/PKG-INFO +3 -1
  2. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/pyproject.toml +7 -1
  3. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/client.py +2 -3
  4. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/collection.py +63 -8
  5. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/database.py +20 -3
  6. linkml_store-0.2.10rc1/src/linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  7. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/duckdb/duckdb_database.py +5 -5
  8. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/filesystem/__init__.py +1 -1
  9. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +132 -15
  10. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  11. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/neo4j/neo4j_database.py +1 -1
  12. linkml_store-0.2.10rc1/src/linkml_store/api/stores/solr/solr_collection.py +222 -0
  13. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/cli.py +201 -21
  14. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/index/implementations/llm_indexer.py +13 -6
  15. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/index/indexer.py +9 -5
  16. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/implementations/llm_inference_engine.py +15 -13
  17. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  18. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  19. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/inference_config.py +2 -1
  20. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/inference_engine.py +1 -1
  21. linkml_store-0.2.10rc1/src/linkml_store/plotting/__init__.py +5 -0
  22. linkml_store-0.2.10rc1/src/linkml_store/plotting/cli.py +172 -0
  23. linkml_store-0.2.10rc1/src/linkml_store/plotting/heatmap.py +356 -0
  24. linkml_store-0.2.10rc1/src/linkml_store/utils/dat_parser.py +95 -0
  25. linkml_store-0.2.10rc1/src/linkml_store/utils/enrichment_analyzer.py +217 -0
  26. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/format_utils.py +124 -3
  27. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/llm_utils.py +4 -2
  28. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/object_utils.py +9 -3
  29. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/pandas_utils.py +1 -1
  30. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/sql_utils.py +1 -1
  31. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/vector_utils.py +3 -10
  32. linkml_store-0.2.6/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -175
  33. linkml_store-0.2.6/src/linkml_store/api/stores/solr/solr_collection.py +0 -133
  34. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/LICENSE +0 -0
  35. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/README.md +0 -0
  36. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/__init__.py +0 -0
  37. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/__init__.py +0 -0
  38. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/config.py +0 -0
  39. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/queries.py +0 -0
  40. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/__init__.py +0 -0
  41. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  42. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  43. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  44. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  45. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  46. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  47. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  48. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  49. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  50. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  51. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  52. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  53. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  54. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  55. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  56. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  57. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/api/types.py +0 -0
  58. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/constants.py +0 -0
  59. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/graphs/__init__.py +0 -0
  60. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/graphs/graph_map.py +0 -0
  61. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/index/__init__.py +0 -0
  62. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/index/implementations/__init__.py +0 -0
  63. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  64. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/__init__.py +0 -0
  65. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/evaluation.py +0 -0
  66. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/implementations/__init__.py +0 -0
  67. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  68. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  69. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/__init__.py +0 -0
  70. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/change_utils.py +0 -0
  71. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/file_utils.py +0 -0
  72. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/io.py +0 -0
  73. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/mongodb_utils.py +0 -0
  74. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/neo4j_utils.py +0 -0
  75. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/patch_utils.py +0 -0
  76. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/query_utils.py +0 -0
  77. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/schema_utils.py +0 -0
  78. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/sklearn_utils.py +0 -0
  79. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/utils/stats_utils.py +0 -0
  80. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/__init__.py +0 -0
  81. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/__init__.py +0 -0
  82. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  83. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  84. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  85. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  86. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  87. {linkml_store-0.2.6 → linkml_store-0.2.10rc1}/src/linkml_store/webapi/main.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.6
+Version: 0.2.10rc1
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: map
 Provides-Extra: mongodb
 Provides-Extra: neo4j
 Provides-Extra: pyarrow
+Provides-Extra: rdf
 Provides-Extra: renderer
 Provides-Extra: scipy
 Provides-Extra: tests
@@ -39,6 +40,7 @@ Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: jsonpatch (>=1.33)
+Requires-Dist: lightrdf ; extra == "rdf"
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.6"
+version = "0.2.10rc1"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -23,6 +23,7 @@ pystow = "^0.5.4"
 black = { version=">=24.0.0", optional = true }
 ruff = { version=">=0.6.2", optional = true }
 llm = { version="*", optional = true }
+lightrdf = { version="*", optional = true }
 tiktoken = { version="*", optional = true }
 pymongo = "^4.11"
 neo4j = { version="*", optional = true }
@@ -66,6 +67,10 @@ jupyter = "*"
 jupysql = "*"
 papermill = "*"
 nbdime = "*"
+codespell = {version = ">=2.3.0"}
+tomli = {version = ">=2.0.1"}
+black = {version = ">=24.0.0"}
+ruff = {version = ">=0.6.2"}
 
 [tool.poetry.group.tests.dependencies]
 pytest = "^7.4.0"
@@ -91,6 +96,7 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
+rdf = ["lightrdf"]
 #ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
 bigquery = ["google-cloud-bigquery"]
 all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
src/linkml_store/api/client.py
@@ -12,7 +12,6 @@ from linkml_store.api.config import ClientConfig
 logger = logging.getLogger(__name__)
 
 
-
 HANDLE_MAP = {
     "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
     "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
@@ -220,14 +219,14 @@ class Client:
         scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
         try:
             module = importlib.import_module(module_path)
             cls = getattr(module, class_name)
         except ImportError as e:
             raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
 
-        #cls = HANDLE_MAP[scheme]
+        # cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
             db.set_schema_view(schema_view)
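
Aside: the hunk above is a formatting pass over the scheme-based lazy-import mechanism in Client: the handle's scheme selects a dotted path in HANDLE_MAP, which is imported only on demand so optional backends need not be installed. A minimal standalone sketch of that pattern, reusing the HANDLE_MAP entries shown in the diff (the helper name resolve_database_class is hypothetical):

import importlib

HANDLE_MAP = {
    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
    "sqlite": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
}

def resolve_database_class(handle: str):
    # "duckdb:///tmp/test.db" -> scheme "duckdb"
    scheme, _ = handle.split(":", 1)
    if scheme not in HANDLE_MAP:
        raise ValueError(f"Unknown scheme: {scheme}")
    module_path, class_name = HANDLE_MAP[scheme].rsplit(".", 1)
    module = importlib.import_module(module_path)  # deferred import of the backend
    return getattr(module, class_name)
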
src/linkml_store/api/collection.py
@@ -211,7 +211,7 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError
 
-    def index (
+    def index(
         self,
         objs: Union[OBJECT, List[OBJECT]],
         index_name: Optional[str] = None,
@@ -231,10 +231,13 @@ class Collection(Generic[DatabaseType]):
         """
         raise NotImplementedError
 
-    def upsert(self,
-               objs: Union[OBJECT, List[OBJECT]],
-               filter_fields: List[str],
-               update_fields: Union[List[str], None] = None, **kwargs):
+    def upsert(
+        self,
+        objs: Union[OBJECT, List[OBJECT]],
+        filter_fields: List[str],
+        update_fields: Union[List[str], None] = None,
+        **kwargs,
+    ):
         """
         Add one or more objects to the collection.
 
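Aside: a possible call against the reformatted upsert signature, assuming a backend collection that actually implements it (the base method shown here raises NotImplementedError) and objects keyed on a hypothetical id field:

# hypothetical data; filter_fields identifies existing rows, update_fields limits what is overwritten
collection.upsert(
    [{"id": "P1", "name": "Alice"}, {"id": "P2", "name": "Bob"}],
    filter_fields=["id"],
    update_fields=["name"],
)
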
@@ -454,7 +457,12 @@ class Collection(Generic[DatabaseType]):
             return qr.rows[0]
         return None
 
-    def find(self, where: Optional[Any] = None, **kwargs) -> QueryResult:
+    def find(
+        self,
+        where: Optional[Any] = None,
+        select_cols: Optional[List[str]] = None,
+        **kwargs,
+    ) -> QueryResult:
         """
         Find objects in the collection using a where query.
 
@@ -484,10 +492,14 @@ class Collection(Generic[DatabaseType]):
 
 
         :param where:
+        :param select_cols:
         :param kwargs:
         :return:
         """
-        query = self._create_query(where_clause=where)
+        query = self._create_query(
+            where_clause=where,
+            select_cols=select_cols,
+        )
         self._pre_query_hook(query)
         return self.query(query, **kwargs)
 
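Aside: a sketch of how the new select_cols parameter might be used, assuming a populated collection of country records (field names are illustrative):

# restrict the returned columns to name and capital
qr = collection.find(
    where={"continent": "Europe"},
    select_cols=["name", "capital"],
)
for row in qr.rows:
    print(row["name"], row["capital"])
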
@@ -587,6 +599,7 @@ class Collection(Generic[DatabaseType]):
         assert ix_coll.size() > 0
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
+
         # TODO: optimize this for large indexes
         def row2array(row):
             v = row[index_col]
@@ -594,6 +607,7 @@ class Collection(Generic[DatabaseType]):
                 # sqlite stores arrays as strings
                 v = json.loads(v)
             return np.array(v, dtype=float)
+
         vector_pairs = [(row, row2array(row)) for row in qr.rows]
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
@@ -608,6 +622,47 @@ class Collection(Generic[DatabaseType]):
         new_qr.rows = [r[1] for r in results]
         return new_qr
 
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by a column.
+
+        :param group_by:
+        :param where:
+        :param kwargs:
+        :return:
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+        df = self.find(where=where, limit=-1).rows_dataframe
+
+        # Handle the case where agg_map is None
+        if agg_map is None:
+            agg_map = {}
+
+        pk_fields = agg_map.get("first", []) + group_by_fields
+        list_fields = agg_map.get("list", [])
+        if not list_fields:
+            list_fields = [a for a in df.columns if a not in pk_fields]
+
+        grouped_objs = defaultdict(list)
+        for _, row in df.iterrows():
+            pk = tuple(row[pk_fields])
+            grouped_objs[pk].append({k: row[k] for k in list_fields})
+        results = []
+        for pk, objs in grouped_objs.items():
+            top_obj = {k: v for k, v in zip(pk_fields, pk)}
+            top_obj[inlined_field] = objs
+            results.append(top_obj)
+        r = QueryResult(num_rows=len(results), rows=results)
+        return r
+
     @property
     def is_internal(self) -> bool:
         """
@@ -1062,7 +1117,7 @@ class Collection(Generic[DatabaseType]):
         multivalued = any(multivalueds)
         inlined = any(inlineds)
         if multivalued and False in multivalueds:
-            raise ValueError(f"Mixed list non list: {vs} // inferred= {multivalueds}")
+            logger.info(f"Mixed list non list: {vs} // inferred= {multivalueds}")
         # if not rngs:
         #     raise AssertionError(f"Empty rngs for {k} = {vs}")
         rng = rngs[0] if rngs else None
src/linkml_store/api/database.py
@@ -595,11 +595,24 @@ class Database(ABC, Generic[CollectionType]):
             sb.add_class(coll.target_class_name)
         return SchemaView(sb.schema)
 
-    def iter_validate_database(self, **kwargs) -> Iterator["ValidationResult"]:
+    def validate_database(self, **kwargs) -> List["ValidationResult"]:
         """
         Validate the contents of the database.
 
-        An an example, let's create a database with a predefined schema
+        As `iter_validate_database`, but returns a list of validation results.
+
+        :param kwargs:
+        :return:
+        """
+        return list(self.iter_validate_database(**kwargs))
+
+    def iter_validate_database(
+        self, ensure_referential_integrity: bool = None, **kwargs
+    ) -> Iterator["ValidationResult"]:
+        """
+        Validate the contents of the database.
+
+        An example, let's create a database with a predefined schema
         from the countries.linkml.yaml file:
 
         >>> from linkml_store.api.client import Client
@@ -635,12 +648,14 @@ class Database(ABC, Generic[CollectionType]):
         'capital' is a required property
         'continent' is a required proper
 
+        :param ensure_referential_integrity: ensure referential integrity
         :param kwargs:
         :return: iterator over validation results
         """
         for collection in self.list_collections():
             yield from collection.iter_validate_collection(**kwargs)
-        if self.metadata.ensure_referential_integrity:
+        if self.metadata.ensure_referential_integrity or ensure_referential_integrity:
+            logger.info(f"Validating referential integrity on {self.alias}")
             yield from self._validate_referential_integrity(**kwargs)
 
     def _validate_referential_integrity(self, **kwargs) -> Iterator["ValidationResult"]:
@@ -661,7 +676,9 @@ class Database(ABC, Generic[CollectionType]):
             induced_slots = sv.class_induced_slots(cd.name)
             slot_map = {s.name: s for s in induced_slots}
             # rmap = {s.name: s.range for s in induced_slots}
+            # map slot ranges to a collection where that range is stored
             sr_to_coll = {s.name: cmap.get(s.range, []) for s in induced_slots if s.range}
+            logger.debug(f"Validating referential integrity for {collection.target_class_name} // {sr_to_coll}")
             for obj in collection.find_iter():
                 for k, v in obj.items():
                     if k not in sr_to_coll:
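
Aside: a sketch of the new validate_database wrapper, which simply materializes iter_validate_database; db is assumed to be a configured Database with a schema attached. Keyword arguments pass through, so the new ensure_referential_integrity flag can be set here too:

# collect all validation results eagerly instead of iterating lazily
results = db.validate_database(ensure_referential_integrity=True)
for result in results:
    print(result)
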
src/linkml_store/api/stores/duckdb/duckdb_collection.py (new file)
@@ -0,0 +1,339 @@
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sqlalchemy as sqla
+from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
+from sqlalchemy import Column, Table, delete, insert, inspect, text
+from sqlalchemy.sql.ddl import CreateTable
+
+from linkml_store.api import Collection
+from linkml_store.api.collection import DEFAULT_FACET_LIMIT, OBJECT
+from linkml_store.api.queries import Query, QueryResult
+from linkml_store.api.stores.duckdb.mappings import TMAP
+from linkml_store.utils.sql_utils import facet_count_sql
+
+logger = logging.getLogger(__name__)
+
+
+class DuckDBCollection(Collection):
+    _table_created: bool = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def insert(self, objs: Union[OBJECT, List[OBJECT]], **kwargs):
+        logger.debug(f"Inserting {len(objs)}")
+        if not isinstance(objs, list):
+            objs = [objs]
+        if not objs:
+            return
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}; will induce")
+            cd = self.induce_class_definition_from_objects(objs)
+        self._create_table(cd)
+        table = self._sqla_table(cd)
+        logger.info(f"Inserting into: {self.alias} // T={table.name}")
+        engine = self.parent.engine
+        col_names = [c.name for c in table.columns]
+        bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
+        if bad_objs:
+            logger.error(f"Bad objects: {bad_objs}")
+        objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
+        with engine.connect() as conn:
+            with conn.begin():
+                conn.execute(insert(table), objs)
+            conn.commit()
+        self._post_insert_hook(objs)
+
+    def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
+        if not isinstance(objs, list):
+            objs = [objs]
+        cd = self.class_definition()
+        if not cd or not cd.attributes:
+            cd = self.induce_class_definition_from_objects(objs)
+        assert cd.attributes
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        with engine.connect() as conn:
+            for obj in objs:
+                conditions = [table.c[k] == v for k, v in obj.items() if k in cd.attributes]
+                stmt = delete(table).where(*conditions)
+                stmt = stmt.compile(engine)
+                conn.execute(stmt)
+            conn.commit()
+        self._post_delete_hook()
+        return None
+
+    def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
+        logger.info(f"Deleting from {self.target_class_name} where: {where}")
+        if where is None:
+            where = {}
+        cd = self.class_definition()
+        if not cd:
+            logger.info(f"No class definition found for {self.target_class_name}, assuming not prepopulated")
+            return 0
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+        inspector = inspect(engine)
+        table_exists = table.name in inspector.get_table_names()
+        if not table_exists:
+            logger.info(f"Table {table.name} does not exist, assuming no data")
+            return 0
+        with engine.connect() as conn:
+            conditions = [table.c[k] == v for k, v in where.items()]
+            stmt = delete(table).where(*conditions)
+            stmt = stmt.compile(engine)
+            result = conn.execute(stmt)
+            deleted_rows_count = result.rowcount
+            if deleted_rows_count == 0 and not missing_ok:
+                raise ValueError(f"No rows found for {where}")
+            conn.commit()
+        self._post_delete_hook()
+        return deleted_rows_count if deleted_rows_count > -1 else None
+
+    def query_facets(
+        self, where: Dict = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        if facet_limit is None:
+            facet_limit = DEFAULT_FACET_LIMIT
+        results = {}
+        cd = self.class_definition()
+        with self.parent.engine.connect() as conn:
+            if not facet_columns:
+                if not cd:
+                    raise ValueError(f"No class definition found for {self.target_class_name}")
+                facet_columns = list(cd.attributes.keys())
+            for col in facet_columns:
+                logger.debug(f"Faceting on {col}")
+                if isinstance(col, tuple):
+                    sd = SlotDefinition(name="PLACEHOLDER")
+                else:
+                    sd = cd.attributes[col]
+                facet_query = self._create_query(where_clause=where)
+                facet_query_str = facet_count_sql(facet_query, col, multivalued=sd.multivalued)
+                logger.debug(f"Facet query: {facet_query_str}")
+                rows = list(conn.execute(text(facet_query_str)))
+                results[col] = [tuple(row) for row in rows]
+        return results
+
+    def _sqla_table(self, cd: ClassDefinition) -> Table:
+        schema_view = self.parent.schema_view
+        metadata_obj = sqla.MetaData()
+        cols = []
+        for att in schema_view.class_induced_slots(cd.name):
+            typ = TMAP.get(att.range, sqla.String)
+            if att.inlined or att.inlined_as_list:
+                typ = sqla.JSON
+            if att.multivalued:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            if att.array:
+                typ = sqla.ARRAY(typ, dimensions=1)
+            col = Column(att.name, typ)
+            cols.append(col)
+        t = Table(self.alias, metadata_obj, *cols)
+        return t
+
+    def _check_if_initialized(self) -> bool:
+        # if self._initialized:
+        #     return True
+        query = Query(
+            from_table="information_schema.tables", where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        )
+        qr = self.parent.query(query)
+        if qr.num_rows > 0:
+            return True
+        return False
+
+    def group_by(
+        self,
+        group_by_fields: List[str],
+        inlined_field="objects",
+        agg_map: Optional[Dict[str, str]] = None,
+        where: Optional[Dict] = None,
+        **kwargs,
+    ) -> QueryResult:
+        """
+        Group objects in the collection by specified fields using SQLAlchemy.
+
+        This implementation leverages DuckDB's SQL capabilities for more efficient grouping.
+
+        :param group_by_fields: List of fields to group by
+        :param inlined_field: Field name to store aggregated objects
+        :param agg_map: Dictionary mapping aggregation types to fields
+        :param where: Filter conditions
+        :param kwargs: Additional arguments
+        :return: Query result containing grouped data
+        """
+        if isinstance(group_by_fields, str):
+            group_by_fields = [group_by_fields]
+
+        cd = self.class_definition()
+        if not cd:
+            logger.debug(f"No class definition defined for {self.alias} {self.target_class_name}")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Check if the table exists
+        if not self.parent._table_exists(self.alias):
+            logger.debug(f"Table {self.alias} doesn't exist, falling back to parent implementation")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        # Get table definition
+        table = self._sqla_table(cd)
+        engine = self.parent.engine
+
+        # Create a SQLAlchemy select statement for groups
+        from sqlalchemy import select
+
+        group_cols = [table.c[field] for field in group_by_fields if field in table.columns.keys()]
+
+        if not group_cols:
+            logger.warning(f"None of the group_by fields {group_by_fields} found in table columns")
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+        stmt = select(*group_cols).distinct()
+
+        # Add where conditions if specified
+        if where:
+            conditions = []
+            for k, v in where.items():
+                if k in table.columns.keys():
+                    # Handle different operator types (dict values for operators)
+                    if isinstance(v, dict):
+                        for op, val in v.items():
+                            if op == "$gt":
+                                conditions.append(table.c[k] > val)
+                            elif op == "$gte":
+                                conditions.append(table.c[k] >= val)
+                            elif op == "$lt":
+                                conditions.append(table.c[k] < val)
+                            elif op == "$lte":
+                                conditions.append(table.c[k] <= val)
+                            elif op == "$ne":
+                                conditions.append(table.c[k] != val)
+                            elif op == "$in":
+                                conditions.append(table.c[k].in_(val))
+                            else:
+                                # Default to equality for unknown operators
+                                logger.warning(f"Unknown operator {op}, using equality")
+                                conditions.append(table.c[k] == val)
+                    else:
+                        # Direct equality comparison
+                        conditions.append(table.c[k] == v)
+
+            if conditions:
+                for condition in conditions:
+                    stmt = stmt.where(condition)
+
+        results = []
+        try:
+            with engine.connect() as conn:
+                # Get all distinct groups
+                group_result = conn.execute(stmt)
+                group_rows = list(group_result)
+
+                # For each group, get all objects
+                for group_row in group_rows:
+                    # Build conditions for this group
+                    group_conditions = []
+                    group_dict = {}
+
+                    for i, field in enumerate(group_by_fields):
+                        if field in table.columns.keys():
+                            value = group_row[i]
+                            group_dict[field] = value
+                            if value is None:
+                                group_conditions.append(table.c[field].is_(None))
+                            else:
+                                group_conditions.append(table.c[field] == value)
+
+                    # Get all rows for this group
+                    row_stmt = select(*table.columns)
+                    for condition in group_conditions:
+                        row_stmt = row_stmt.where(condition)
+
+                    # Add original where conditions
+                    if where:
+                        for k, v in where.items():
+                            if k in table.columns.keys():
+                                # Handle different operator types for the row query as well
+                                if isinstance(v, dict):
+                                    for op, val in v.items():
+                                        if op == "$gt":
+                                            row_stmt = row_stmt.where(table.c[k] > val)
+                                        elif op == "$gte":
+                                            row_stmt = row_stmt.where(table.c[k] >= val)
+                                        elif op == "$lt":
+                                            row_stmt = row_stmt.where(table.c[k] < val)
+                                        elif op == "$lte":
+                                            row_stmt = row_stmt.where(table.c[k] <= val)
+                                        elif op == "$ne":
+                                            row_stmt = row_stmt.where(table.c[k] != val)
+                                        elif op == "$in":
+                                            row_stmt = row_stmt.where(table.c[k].in_(val))
+                                        else:
+                                            # Default to equality for unknown operators
+                                            row_stmt = row_stmt.where(table.c[k] == val)
+                                else:
+                                    # Direct equality comparison
+                                    row_stmt = row_stmt.where(table.c[k] == v)
+
+                    row_result = conn.execute(row_stmt)
+                    rows = list(row_result)
+
+                    # Convert rows to dictionaries
+                    objects = []
+                    for row in rows:
+                        obj = {}
+                        for i, col in enumerate(row._fields):
+                            obj[col] = row[i]
+                        objects.append(obj)
+
+                    # Apply agg_map to filter fields if specified
+                    if agg_map and "list" in agg_map:
+                        list_fields = agg_map["list"]
+                        if list_fields:
+                            objects = [{k: obj.get(k) for k in list_fields if k in obj} for obj in objects]
+
+                    # Create the result object
+                    result_obj = group_dict.copy()
+                    result_obj[inlined_field] = objects
+                    results.append(result_obj)
+
+            return QueryResult(num_rows=len(results), rows=results)
+        except Exception as e:
+            logger.warning(f"Error in DuckDB group_by: {e}")
+            # Fall back to parent implementation
+            return super().group_by(group_by_fields, inlined_field, agg_map, where, **kwargs)
+
+    def _create_table(self, cd: ClassDefinition):
+        if self._table_created or self.metadata.is_prepopulated:
+            logger.info(f"Already have table for: {cd.name}")
+            return
+        if self.parent._table_exists(self.alias):
+            logger.info(f"Table already exists for {cd.name}")
+            self._table_created = True
+            self._initialized = True
+            self.metadata.is_prepopulated = True
+            return
+        # query = Query(
+        #     from_table="information_schema.tables",
+        #     where_clause={"table_type": "BASE TABLE", "table_name": self.alias}
+        # )
+        # qr = self.parent.query(query)
+        # if qr.num_rows > 0:
+        #     logger.info(f"Table already exists for {cd.name}")
+        #     self._table_created = True
+        #     self._initialized = True
+        #     self.metadata.is_prepopulated = True
+        #     return
+        logger.info(f"Creating table for {cd.name}")
+        t = self._sqla_table(cd)
+        ct = CreateTable(t)
+        ddl = str(ct.compile(self.parent.engine))
+        with self.parent.engine.connect() as conn:
+            conn.execute(text(ddl))
+            conn.commit()
+        self._table_created = True
+        self._initialized = True
+        self.metadata.is_prepopulated = True
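
Aside: the DuckDB group_by override above pushes grouping into SQL and accepts MongoDB-style operators in `where`, falling back to the base implementation on any error. A hypothetical call exercising the `$gte` branch, assuming a collection with a numeric population column:

# group by continent, keeping only rows with population >= 1,000,000
qr = coll.group_by(
    ["continent"],
    inlined_field="countries",
    where={"population": {"$gte": 1_000_000}},
)
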
src/linkml_store/api/stores/duckdb/duckdb_database.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional, Union, List
+from typing import List, Optional, Union
 
 import pandas as pd
 import sqlalchemy
@@ -14,7 +14,7 @@ from linkml_store.api import Database
 from linkml_store.api.queries import Query, QueryResult
 from linkml_store.api.stores.duckdb.duckdb_collection import DuckDBCollection
 from linkml_store.utils.format_utils import Format
-from linkml_store.utils.sql_utils import introspect_schema, query_to_sql, where_clause_to_sql
+from linkml_store.utils.sql_utils import introspect_schema, query_to_sql
 
 TYPE_MAP = {
     "VARCHAR": "string",
@@ -100,9 +100,9 @@ class DuckDBDatabase(Database):
             meta_query = Query(
                 from_table="sqlite_master",
                 where_clause={
-                    #"type": "table",
+                    # "type": "table",
                     "name": table,
-                }
+                },
             )
         else:
             if table.startswith("information_schema"):
@@ -112,7 +112,7 @@ class DuckDBDatabase(Database):
                 where_clause={
                     "table_type": "BASE TABLE",
                     "table_name": table,
-                }
+                },
            )
 
         qr = self.query(meta_query)
src/linkml_store/api/stores/filesystem/__init__.py
@@ -4,7 +4,7 @@ Adapter for FileSystem wrapper
 Handles have the form:
 
 - ``file:<path>`` for a local file
-"""
+"""
 
 from linkml_store.api.stores.filesystem.filesystem_collection import FileSystemCollection
 from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase