linkml-store 0.1.14__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic. Click here for more details.

Files changed (80) hide show
  1. {linkml_store-0.1.14 → linkml_store-0.2.1}/PKG-INFO +9 -1
  2. {linkml_store-0.1.14 → linkml_store-0.2.1}/README.md +3 -0
  3. {linkml_store-0.1.14 → linkml_store-0.2.1}/pyproject.toml +9 -18
  4. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/collection.py +48 -5
  5. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/database.py +7 -1
  6. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/queries.py +3 -1
  7. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +8 -2
  8. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/cli.py +44 -18
  9. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/llm_indexer.py +20 -2
  10. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/indexer.py +51 -1
  11. linkml_store-0.2.1/src/linkml_store/inference/evaluation.py +195 -0
  12. linkml_store-0.2.1/src/linkml_store/inference/implementations/rag_inference_engine.py +232 -0
  13. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +15 -4
  14. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +20 -2
  15. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_config.py +1 -0
  16. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_engine.py +53 -19
  17. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/format_utils.py +6 -0
  18. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/llm_utils.py +2 -0
  19. linkml_store-0.2.1/src/linkml_store/utils/object_utils.py +182 -0
  20. linkml_store-0.1.14/src/linkml_store/inference/implementations/rag_inference_engine.py +0 -145
  21. linkml_store-0.1.14/src/linkml_store/utils/object_utils.py +0 -83
  22. {linkml_store-0.1.14 → linkml_store-0.2.1}/LICENSE +0 -0
  23. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/__init__.py +0 -0
  24. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/__init__.py +0 -0
  25. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/client.py +0 -0
  26. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/config.py +0 -0
  27. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/__init__.py +0 -0
  28. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  29. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  30. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  31. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  32. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  33. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  34. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  35. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  36. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  37. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  38. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  39. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  40. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  41. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
  42. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  43. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  44. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  45. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  46. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  47. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  48. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  49. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  50. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/api/types.py +0 -0
  51. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/constants.py +0 -0
  52. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/graphs/__init__.py +0 -0
  53. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/graphs/graph_map.py +0 -0
  54. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/__init__.py +0 -0
  55. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/__init__.py +0 -0
  56. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  57. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/__init__.py +0 -0
  58. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/implementations/__init__.py +0 -0
  59. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  60. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/__init__.py +0 -0
  61. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/change_utils.py +0 -0
  62. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/file_utils.py +0 -0
  63. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/io.py +0 -0
  64. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/mongodb_utils.py +0 -0
  65. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/neo4j_utils.py +0 -0
  66. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/pandas_utils.py +0 -0
  67. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/patch_utils.py +0 -0
  68. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/query_utils.py +0 -0
  69. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/schema_utils.py +0 -0
  70. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/sklearn_utils.py +0 -0
  71. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/sql_utils.py +0 -0
  72. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/utils/stats_utils.py +0 -0
  73. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/__init__.py +0 -0
  74. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/__init__.py +0 -0
  75. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  76. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  77. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  78. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  79. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  80. {linkml_store-0.1.14 → linkml_store-0.2.1}/src/linkml_store/webapi/main.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: linkml-store
3
- Version: 0.1.14
3
+ Version: 0.2.1
4
4
  Summary: linkml-store
5
5
  License: MIT
6
6
  Author: Author 1
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
18
18
  Provides-Extra: fastapi
19
19
  Provides-Extra: frictionless
20
20
  Provides-Extra: h5py
21
+ Provides-Extra: ibis
21
22
  Provides-Extra: llm
22
23
  Provides-Extra: map
23
24
  Provides-Extra: mongodb
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
34
35
  Requires-Dist: duckdb-engine (>=0.11.2)
35
36
  Requires-Dist: fastapi ; extra == "fastapi"
36
37
  Requires-Dist: frictionless ; extra == "frictionless"
38
+ Requires-Dist: gcsfs ; extra == "ibis"
37
39
  Requires-Dist: h5py ; extra == "h5py"
40
+ Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
38
41
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
39
42
  Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
40
43
  Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
43
46
  Requires-Dist: linkml_renderer ; extra == "renderer"
44
47
  Requires-Dist: llm ; extra == "llm"
45
48
  Requires-Dist: matplotlib ; extra == "analytics"
49
+ Requires-Dist: multipledispatch ; extra == "ibis"
46
50
  Requires-Dist: neo4j ; extra == "neo4j"
47
51
  Requires-Dist: networkx ; extra == "neo4j"
48
52
  Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
52
56
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
53
57
  Requires-Dist: pymongo ; extra == "mongodb"
54
58
  Requires-Dist: pystow (>=0.5.4,<0.6.0)
59
+ Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
55
60
  Requires-Dist: scikit-learn ; extra == "scipy"
56
61
  Requires-Dist: scipy ; extra == "scipy"
57
62
  Requires-Dist: seaborn ; extra == "analytics"
@@ -70,6 +75,8 @@ common query, index, and storage operations.
70
75
 
71
76
  For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
72
77
 
78
+ See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
79
+
73
80
  __Warning__ LinkML-Store is still undergoing changes and refactoring,
74
81
  APIs and command line options are subject to change!
75
82
 
@@ -196,3 +203,4 @@ make app
196
203
 
197
204
  See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
198
205
 
206
+
@@ -7,6 +7,8 @@ common query, index, and storage operations.
7
7
 
8
8
  For full documentation, see [https://linkml.io/linkml-store/](https://linkml.io/linkml-store/)
9
9
 
10
+ See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for a high level overview.
11
+
10
12
  __Warning__ LinkML-Store is still undergoing changes and refactoring,
11
13
  APIs and command line options are subject to change!
12
14
 
@@ -132,3 +134,4 @@ make app
132
134
  ## Background
133
135
 
134
136
  See [these slides](https://docs.google.com/presentation/d/e/2PACX-1vSgtWUNUW0qNO_ZhMAGQ6fYhlXZJjBNMYT0OiZz8DDx8oj7iG9KofRs6SeaMXBBOICGknoyMG2zaHnm/embed?start=false&loop=false&delayms=3000) for more details
137
+
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "linkml-store"
3
- version = "0.1.14"
3
+ version = "0.2.1"
4
4
  description = "linkml-store"
5
5
  authors = ["Author 1 <author@org.org>"]
6
6
  license = "MIT"
@@ -20,6 +20,7 @@ seaborn = { version = "*", optional = true }
20
20
  plotly = { version = "*", optional = true }
21
21
  pystow = "^0.5.4"
22
22
  black = { version=">=24.0.0", optional = true }
23
+ ruff = { version=">=0.6.2", optional = true }
23
24
  llm = { version="*", optional = true }
24
25
  tiktoken = { version="*", optional = true }
25
26
  pymongo = { version="*", optional = true }
@@ -35,6 +36,9 @@ linkml = { version=">=1.8.0", optional = true }
35
36
  linkml_map = { version="*", optional = true }
36
37
  linkml_renderer = { version="*", optional = true }
37
38
  frictionless = { version="*", optional = true }
39
+ ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
40
+ gcsfs = { version="*", optional = true }
41
+ multipledispatch = { version="*" }
38
42
  pandas = ">=2.2.1"
39
43
  jinja2 = "^3.1.4"
40
44
  jsonlines = "^4.0.0"
@@ -69,7 +73,7 @@ numpy = [
69
73
  [tool.poetry.extras]
70
74
  analytics = ["pandas", "matplotlib", "seaborn", "plotly"]
71
75
  app = ["streamlit"]
72
- tests = ["black"]
76
+ tests = ["black", "ruff"]
73
77
  llm = ["llm", "tiktoken"]
74
78
  mongodb = ["pymongo"]
75
79
  neo4j = ["neo4j", "py2neo", "networkx"]
@@ -82,6 +86,7 @@ renderer = ["linkml_renderer"]
82
86
  fastapi = ["fastapi", "uvicorn"]
83
87
  frictionless = ["frictionless"]
84
88
  scipy = ["scipy", "scikit-learn"]
89
+ ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
85
90
 
86
91
  [tool.poetry.scripts]
87
92
  linkml-store = "linkml_store.cli:cli"
@@ -119,27 +124,13 @@ extend-exclude = [
119
124
  ]
120
125
  force-exclude = true
121
126
  line-length = 120
122
- extend-ignore = ["E203"]
123
- select = [
127
+ lint.extend-ignore = ["E203"]
128
+ lint.select = [
124
129
  "E", # pycodestyle errors
125
130
  "F", # Pyflakes
126
131
  "I", # isort
127
132
  ]
128
- # Assume Python 3.8
129
- target-version = "py38"
130
133
 
131
- [tool.ruff.per-file-ignores]
132
- # These templates can have long lines
133
- "linkml/generators/sqlalchemy/sqlalchemy_declarative_template.py" = ["E501"]
134
- "linkml/generators/sqlalchemy/sqlalchemy_imperative_template.py" = ["E501"]
135
-
136
- # Notebooks can have unsorted imports
137
- "tests/test_notebooks/input/*" = ["E402"]
138
-
139
-
140
- [tool.ruff.mccabe]
141
- # Unlike Flake8, default to a complexity level of 10.
142
- max-complexity = 10
143
134
 
144
135
 
145
136
  [tool.codespell]
@@ -226,6 +226,18 @@ class Collection(Generic[DatabaseType]):
226
226
  self._initialized = True
227
227
  patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
228
228
  self._broadcast(patches, **kwargs)
229
+ self._post_modification_hook(**kwargs)
230
+
231
+ def _post_delete_hook(self, **kwargs):
232
+ self._post_modification_hook(**kwargs)
233
+
234
+ def _post_modification_hook(self, **kwargs):
235
+ for indexer in self.indexers.values():
236
+ ix_collection_name = self.get_index_collection_name(indexer)
237
+ ix_collection = self.parent.get_collection(ix_collection_name)
238
+ # Currently updating the source triggers complete reindexing
239
+ # TODO: make this more efficient by only deleting modified
240
+ ix_collection.delete_where({})
229
241
 
230
242
  def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
231
243
  """
@@ -476,7 +488,7 @@ class Collection(Generic[DatabaseType]):
476
488
  Now let's index, using the simple trigram-based index
477
489
 
478
490
  >>> index = get_indexer("simple")
479
- >>> collection.attach_indexer(index)
491
+ >>> _ = collection.attach_indexer(index)
480
492
 
481
493
  Now let's find all objects:
482
494
 
@@ -514,7 +526,10 @@ class Collection(Generic[DatabaseType]):
514
526
  if ix_coll.size() == 0:
515
527
  logger.info(f"Index {index_name} is empty; indexing all objects")
516
528
  all_objs = self.find(limit=-1).rows
517
- self.index_objects(all_objs, index_name, replace=True, **kwargs)
529
+ if all_objs:
530
+ # print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
531
+ self.index_objects(all_objs, index_name, replace=True, **kwargs)
532
+ assert ix_coll.size() > 0
518
533
  qr = ix_coll.find(where=where, limit=-1, **kwargs)
519
534
  index_col = ix.index_field
520
535
  # TODO: optimize this for large indexes
@@ -648,7 +663,31 @@ class Collection(Generic[DatabaseType]):
648
663
  """
649
664
  return self.find({}, limit=1).num_rows
650
665
 
651
- def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
666
+ def rows_iter(self) -> Iterable[OBJECT]:
667
+ """
668
+ Return an iterator over the objects in the collection.
669
+
670
+ :return:
671
+ """
672
+ yield from self.find({}, limit=-1).rows
673
+
674
+ def rows(self) -> List[OBJECT]:
675
+ """
676
+ Return a list of objects in the collection.
677
+
678
+ :return:
679
+ """
680
+ return list(self.rows_iter())
681
+
682
+ def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
683
+ """
684
+ Return a list of objects in the collection, with scores.
685
+ """
686
+ return [(n, obj) for n, obj in enumerate(self.rows_iter())]
687
+
688
+ def attach_indexer(
689
+ self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
690
+ ) -> Indexer:
652
691
  """
653
692
  Attach an index to the collection.
654
693
 
@@ -669,8 +708,8 @@ class Collection(Generic[DatabaseType]):
669
708
  >>> full_index.name = "full"
670
709
  >>> name_index = get_indexer("simple", text_template="{name}")
671
710
  >>> name_index.name = "name"
672
- >>> collection.attach_indexer(full_index)
673
- >>> collection.attach_indexer(name_index)
711
+ >>> _ = collection.attach_indexer(full_index)
712
+ >>> _ = collection.attach_indexer(name_index)
674
713
 
675
714
  Now let's find objects using the full index, using the string "France".
676
715
  We expect the country France to be the top hit, but the score will
@@ -713,6 +752,10 @@ class Collection(Generic[DatabaseType]):
713
752
  all_objs = self.find(limit=-1).rows
714
753
  logger.info(f"Auto-indexing {len(all_objs)} objects")
715
754
  self.index_objects(all_objs, index_name, replace=True, **kwargs)
755
+ return index
756
+
757
+ def get_index_collection_name(self, indexer: Indexer) -> str:
758
+ return self._index_collection_name(indexer.name)
716
759
 
717
760
  def _index_collection_name(self, index_name: str) -> str:
718
761
  """
@@ -268,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
268
268
  metadata: Optional[CollectionConfig] = None,
269
269
  recreate_if_exists=False,
270
270
  **kwargs,
271
- ) -> CollectionType:
271
+ ) -> Collection:
272
272
  """
273
273
  Create a new collection in the current database.
274
274
 
@@ -760,6 +760,12 @@ class Database(ABC, Generic[CollectionType]):
760
760
  """
761
761
  Export a database to a file or location.
762
762
 
763
+ >>> from linkml_store.api.client import Client
764
+ >>> client = Client()
765
+ >>> db = client.attach_database("duckdb", alias="test")
766
+ >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
767
+ >>> db.export_database("/tmp/iris.yaml", Format.YAML)
768
+
763
769
  :param location: location of the file
764
770
  :param target_format: target format
765
771
  :param kwargs: additional arguments
@@ -40,7 +40,9 @@ class FacetCountResult(BaseModel):
40
40
 
41
41
  class QueryResult(BaseModel):
42
42
  """
43
- A query result
43
+ A query result.
44
+
45
+ TODO: make this a subclass of Collection
44
46
  """
45
47
 
46
48
  query: Optional[Query] = None
@@ -36,6 +36,9 @@ class DuckDBCollection(Collection):
36
36
  logger.info(f"Inserting into: {self.alias} // T={table.name}")
37
37
  engine = self.parent.engine
38
38
  col_names = [c.name for c in table.columns]
39
+ bad_objs = [obj for obj in objs if not isinstance(obj, dict)]
40
+ if bad_objs:
41
+ logger.error(f"Bad objects: {bad_objs}")
39
42
  objs = [{k: obj.get(k, None) for k in col_names} for obj in objs]
40
43
  with engine.connect() as conn:
41
44
  with conn.begin():
@@ -47,8 +50,9 @@ class DuckDBCollection(Collection):
47
50
  if not isinstance(objs, list):
48
51
  objs = [objs]
49
52
  cd = self.class_definition()
50
- if not cd:
53
+ if not cd or not cd.attributes:
51
54
  cd = self.induce_class_definition_from_objects(objs)
55
+ assert cd.attributes
52
56
  table = self._sqla_table(cd)
53
57
  engine = self.parent.engine
54
58
  with engine.connect() as conn:
@@ -58,7 +62,8 @@ class DuckDBCollection(Collection):
58
62
  stmt = stmt.compile(engine)
59
63
  conn.execute(stmt)
60
64
  conn.commit()
61
- return
65
+ self._post_delete_hook()
66
+ return None
62
67
 
63
68
  def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
64
69
  logger.info(f"Deleting from {self.target_class_name} where: {where}")
@@ -84,6 +89,7 @@ class DuckDBCollection(Collection):
84
89
  if deleted_rows_count == 0 and not missing_ok:
85
90
  raise ValueError(f"No rows found for {where}")
86
91
  conn.commit()
92
+ self._post_delete_hook()
87
93
  return deleted_rows_count if deleted_rows_count > -1 else None
88
94
 
89
95
  def query_facets(
@@ -7,6 +7,7 @@ from typing import Optional
7
7
  import click
8
8
  import yaml
9
9
  from linkml_runtime.dumpers import json_dumper
10
+ from linkml_runtime.utils.formatutils import underscore
10
11
  from pydantic import BaseModel
11
12
 
12
13
  from linkml_store import Client
@@ -17,6 +18,7 @@ from linkml_store.index import get_indexer
17
18
  from linkml_store.index.implementations.simple_indexer import SimpleIndexer
18
19
  from linkml_store.index.indexer import Indexer
19
20
  from linkml_store.inference import get_inference_engine
21
+ from linkml_store.inference.evaluation import evaluate_predictor, score_text_overlap
20
22
  from linkml_store.inference.inference_config import InferenceConfig
21
23
  from linkml_store.inference.inference_engine import ModelSerialization
22
24
  from linkml_store.utils.format_utils import Format, guess_format, load_objects, render_output, write_output
@@ -74,6 +76,8 @@ class ContextSettings(BaseModel):
74
76
  if name is None:
75
77
  # if len(self.database.list_collections()) > 1:
76
78
  # raise ValueError("Collection must be specified if there are multiple collections.")
79
+ if not self.database:
80
+ return None
77
81
  if not self.database.list_collections():
78
82
  return None
79
83
  name = list(self.database.list_collections())[0]
@@ -130,7 +134,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
130
134
  logger.setLevel(logging.ERROR)
131
135
  ctx.ensure_object(dict)
132
136
  if input:
133
- stem = Path(input).stem
137
+ stem = underscore(Path(input).stem)
134
138
  database = "duckdb"
135
139
  collection = stem
136
140
  config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
@@ -216,7 +220,10 @@ def insert(ctx, files, object, format):
216
220
  @click.option("--object", "-i", multiple=True, help="Input object as YAML")
217
221
  @click.pass_context
218
222
  def store(ctx, files, object, format):
219
- """Store objects from files (JSON, YAML, TSV) into the specified collection."""
223
+ """Store objects from files (JSON, YAML, TSV) into the database.
224
+
225
+ Note: this is similar to insert, but a collection does not need to be specified
226
+ """
220
227
  settings = ctx.obj["settings"]
221
228
  db = settings.database
222
229
  if not files and not object:
@@ -496,12 +503,16 @@ def describe(ctx, where, output_type, output, limit):
496
503
  @click.option(
497
504
  "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
498
505
  )
506
+ @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
507
+ @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
499
508
  @click.option("--query", "-q", type=click.STRING, help="query term")
500
509
  @click.pass_context
501
510
  def infer(
502
511
  ctx,
503
512
  inference_config_file,
504
513
  query,
514
+ evaluation_count,
515
+ evaluation_match_function,
505
516
  training_test_data_split,
506
517
  predictor_type,
507
518
  target_attribute,
@@ -545,25 +556,28 @@ def infer(
545
556
  else:
546
557
  query_obj = None
547
558
  collection = ctx.obj["settings"].collection
548
- atts = collection.class_definition().attributes.keys()
559
+ if collection:
560
+ atts = collection.class_definition().attributes.keys()
561
+ else:
562
+ atts = []
563
+ if feature_attributes:
564
+ features = feature_attributes.split(",")
565
+ features = [f.strip() for f in features]
566
+ else:
567
+ if query_obj:
568
+ features = query_obj.keys()
569
+ else:
570
+ features = None
571
+ if target_attribute:
572
+ target_attributes = list(target_attribute)
573
+ else:
574
+ target_attributes = [att for att in atts if att not in features]
549
575
  if model_format:
550
576
  model_format = ModelSerialization(model_format)
551
577
  if load_model:
552
578
  predictor = get_inference_engine(predictor_type)
553
579
  predictor = type(predictor).load_model(load_model)
554
580
  else:
555
- if feature_attributes:
556
- features = feature_attributes.split(",")
557
- features = [f.strip() for f in features]
558
- else:
559
- if query_obj:
560
- features = query_obj.keys()
561
- else:
562
- features = None
563
- if target_attribute:
564
- target_attributes = list(target_attribute)
565
- else:
566
- target_attributes = [att for att in atts if att not in features]
567
581
  if inference_config_file:
568
582
  config = InferenceConfig.from_file(inference_config_file)
569
583
  else:
@@ -571,14 +585,26 @@ def infer(
571
585
  if training_test_data_split:
572
586
  config.train_test_split = training_test_data_split
573
587
  predictor = get_inference_engine(predictor_type, config=config)
574
- predictor.load_and_split_data(collection)
588
+ if collection:
589
+ predictor.load_and_split_data(collection)
575
590
  predictor.initialize_model()
576
591
  if export_model:
577
592
  logger.info(f"Exporting model to {export_model} in {model_format}")
578
593
  predictor.export_model(export_model, model_format)
579
594
  if not query_obj:
580
- if not export_model:
581
- raise ValueError("Query must be specified if not exporting model")
595
+ if not export_model and not evaluation_count:
596
+ raise ValueError("Query or evaluate must be specified if not exporting model")
597
+ if evaluation_count:
598
+ if evaluation_match_function == "score_text_overlap":
599
+ match_function_fn = score_text_overlap
600
+ elif evaluation_match_function is not None:
601
+ raise ValueError(f"Unknown match function: {evaluation_match_function}")
602
+ else:
603
+ match_function_fn = None
604
+ outcome = evaluate_predictor(
605
+ predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
606
+ )
607
+ print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
582
608
  if query_obj:
583
609
  result = predictor.derive(query_obj)
584
610
  dumped_obj = result.model_dump(exclude_none=True)
@@ -1,11 +1,13 @@
1
1
  import logging
2
2
  from pathlib import Path
3
- from typing import TYPE_CHECKING, List
3
+ from typing import TYPE_CHECKING, List, Optional
4
4
 
5
5
  import numpy as np
6
+ from tiktoken import encoding_for_model
6
7
 
7
8
  from linkml_store.api.config import CollectionConfig
8
9
  from linkml_store.index.indexer import INDEX_ITEM, Indexer
10
+ from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
9
11
 
10
12
  if TYPE_CHECKING:
11
13
  import llm
@@ -29,6 +31,7 @@ class LLMIndexer(Indexer):
29
31
  cached_embeddings_database: str = None
30
32
  cached_embeddings_collection: str = None
31
33
  cache_queries: bool = False
34
+ truncation_method: Optional[str] = None
32
35
 
33
36
  @property
34
37
  def embedding_model(self):
@@ -62,6 +65,21 @@ class LLMIndexer(Indexer):
62
65
  """
63
66
  logging.info(f"Converting {len(texts)} texts to vectors")
64
67
  model = self.embedding_model
68
+ token_limit = get_token_limit(model.model_id)
69
+ encoding = encoding_for_model("gpt-4o")
70
+
71
+ def truncate_text(text: str) -> str:
72
+ # split into tokens every 1000 chars:
73
+ parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
74
+ return render_formatted_text(
75
+ lambda x: "".join(x),
76
+ parts,
77
+ encoding,
78
+ token_limit,
79
+ )
80
+
81
+ texts = [truncate_text(text) for text in texts]
82
+
65
83
  if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
66
84
  model_id = model.model_id
67
85
  if not model_id:
@@ -88,7 +106,7 @@ class LLMIndexer(Indexer):
88
106
  embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
89
107
  else:
90
108
  embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
91
- texts = list(texts)
109
+
92
110
  embeddings = list([None] * len(texts))
93
111
  uncached_texts = []
94
112
  n = 0
@@ -36,6 +36,54 @@ def cosine_similarity(vector1, vector2) -> float:
36
36
  class Indexer(BaseModel):
37
37
  """
38
38
  An indexer operates on a collection in order to search for objects.
39
+
40
  + You should use a subclass of this; this can be looked up dynamically:
41
+
42
+ >>> from linkml_store.index import get_indexer
43
+ >>> indexer = get_indexer("simple")
44
+
45
+ You can customize how objects are indexed by passing in a text template.
46
+ For example, if your collection has objects with "name" and "profession" attributes,
47
+ you can index them as "{name} {profession}".
48
+
49
+ >>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
50
+
51
+ By default, python fstrings are assumed.
52
+
53
+ We can test this works using the :ref:`object_to_text` method (normally
54
+ you would never need to call this directly, but it's useful for testing):
55
+
56
+ >>> obj = {"name": "John", "profession": "doctor"}
57
+ >>> indexer.object_to_text(obj)
58
+ 'John :: doctor'
59
+
60
+ You can also use Jinja2 templates; this gives more flexibility and logic,
61
+ e.g. conditional formatting:
62
+
63
+ >>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
64
+ >>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
65
+ >>> indexer.object_to_text(obj)
66
+ 'John :: doctor'
67
+ >>> indexer.object_to_text({"name": "John"})
68
+ 'John'
69
+
70
+ You can also specify which attributes to index:
71
+
72
+ >>> indexer = get_indexer("simple", index_attributes=["name"])
73
+ >>> indexer.object_to_text(obj)
74
+ 'John'
75
+
76
+ The purpose of an indexer is to translate a collection of objects into a collection of objects
77
+ such as vectors for purposes such as search. Unless you are implementing your own indexer, you
78
+ generally don't need to use the methods that return vectors, but we can examine their behavior
79
+ to get a sense of how they work.
80
+
81
+ >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
82
+ >>> assert cosine_similarity(vectors[0], vectors[1]) > cosine_similarity(vectors[0], vectors[2])
83
+
84
+ Note you should consult the documentation for the specific indexer you are using for more details on
85
+ how text is converted to vectors.
86
+
39
87
  """
40
88
 
41
89
  name: Optional[str] = None
@@ -122,7 +170,9 @@ class Indexer(BaseModel):
122
170
  self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
123
171
  ) -> List[Tuple[float, Any]]:
124
172
  """
125
- Search the index for a query string
173
+ Use the indexer to search against a database of vectors.
174
+
175
+ Note: this is a low-level method, typically you would use the :ref:`search` method on a :ref:`Collection`.
126
176
 
127
177
  :param query: The query string to search for
128
178
  :param vectors: A list of indexed items, where each item is a tuple of (id, vector)