linkml-store 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of linkml-store might be problematic.
Files changed (80)
  1. {linkml_store-0.2.0 → linkml_store-0.2.2}/PKG-INFO +6 -1
  2. {linkml_store-0.2.0 → linkml_store-0.2.2}/pyproject.toml +9 -18
  3. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/collection.py +50 -6
  4. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/database.py +7 -1
  5. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/queries.py +3 -1
  6. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +5 -2
  7. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/cli.py +58 -13
  8. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/index/implementations/llm_indexer.py +20 -2
  9. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/index/indexer.py +70 -16
  10. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/evaluation.py +9 -3
  11. linkml_store-0.2.2/src/linkml_store/inference/implementations/rag_inference_engine.py +262 -0
  12. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +1 -1
  13. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/inference_config.py +5 -2
  14. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine.py +20 -13
  15. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/llm_utils.py +1 -0
  16. linkml_store-0.2.2/src/linkml_store/utils/vector_utils.py +165 -0
  17. linkml_store-0.2.0/src/linkml_store/inference/implementations/rag_inference_engine.py +0 -145
  18. {linkml_store-0.2.0 → linkml_store-0.2.2}/LICENSE +0 -0
  19. {linkml_store-0.2.0 → linkml_store-0.2.2}/README.md +0 -0
  20. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/__init__.py +0 -0
  21. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/__init__.py +0 -0
  22. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/client.py +0 -0
  23. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/config.py +0 -0
  24. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/__init__.py +0 -0
  25. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  26. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  27. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  28. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  29. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  30. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  31. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  32. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  33. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
  34. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  35. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  36. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  37. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  38. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
  39. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  40. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  41. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  42. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  43. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  44. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  45. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  46. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  47. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/api/types.py +0 -0
  48. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/constants.py +0 -0
  49. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/graphs/__init__.py +0 -0
  50. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/graphs/graph_map.py +0 -0
  51. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/index/__init__.py +0 -0
  52. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/index/implementations/__init__.py +0 -0
  53. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  54. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/__init__.py +0 -0
  55. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/__init__.py +0 -0
  56. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  57. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  58. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/__init__.py +0 -0
  59. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/change_utils.py +0 -0
  60. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/file_utils.py +0 -0
  61. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/format_utils.py +0 -0
  62. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/io.py +0 -0
  63. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/mongodb_utils.py +0 -0
  64. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/neo4j_utils.py +0 -0
  65. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/object_utils.py +0 -0
  66. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/pandas_utils.py +0 -0
  67. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/patch_utils.py +0 -0
  68. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/query_utils.py +0 -0
  69. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/schema_utils.py +0 -0
  70. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/sklearn_utils.py +0 -0
  71. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/sql_utils.py +0 -0
  72. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/utils/stats_utils.py +0 -0
  73. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/__init__.py +0 -0
  74. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/__init__.py +0 -0
  75. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  76. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  77. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  78. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  79. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  80. {linkml_store-0.2.0 → linkml_store-0.2.2}/src/linkml_store/webapi/main.py +0 -0
--- linkml_store-0.2.0/PKG-INFO
+++ linkml_store-0.2.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: linkml-store
-Version: 0.2.0
+Version: 0.2.2
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -18,6 +18,7 @@ Provides-Extra: chromadb
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
+Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -34,7 +35,9 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
+Requires-Dist: gcsfs ; extra == "ibis"
 Requires-Dist: h5py ; extra == "h5py"
+Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
@@ -43,6 +46,7 @@ Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm"
 Requires-Dist: matplotlib ; extra == "analytics"
+Requires-Dist: multipledispatch ; extra == "ibis"
 Requires-Dist: neo4j ; extra == "neo4j"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -52,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
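
The new ibis extra bundles ibis-framework (with its duckdb and examples extras), multipledispatch, and gcsfs. A hypothetical install command, assuming the extra is published as declared above:

    pip install "linkml-store[ibis]"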
--- linkml_store-0.2.0/pyproject.toml
+++ linkml_store-0.2.2/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.0"
+version = "0.2.2"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
@@ -20,6 +20,7 @@ seaborn = { version = "*", optional = true }
 plotly = { version = "*", optional = true }
 pystow = "^0.5.4"
 black = { version=">=24.0.0", optional = true }
+ruff = { version=">=0.6.2", optional = true }
 llm = { version="*", optional = true }
 tiktoken = { version="*", optional = true }
 pymongo = { version="*", optional = true }
@@ -35,6 +36,9 @@ linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
 frictionless = { version="*", optional = true }
+ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
+gcsfs = { version="*", optional = true }
+multipledispatch = { version="*" }
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
@@ -69,7 +73,7 @@ numpy = [
 [tool.poetry.extras]
 analytics = ["pandas", "matplotlib", "seaborn", "plotly"]
 app = ["streamlit"]
-tests = ["black"]
+tests = ["black", "ruff"]
 llm = ["llm", "tiktoken"]
 mongodb = ["pymongo"]
 neo4j = ["neo4j", "py2neo", "networkx"]
@@ -82,6 +86,7 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
+ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"
@@ -119,27 +124,13 @@ extend-exclude = [
 ]
 force-exclude = true
 line-length = 120
-extend-ignore = ["E203"]
-select = [
+lint.extend-ignore = ["E203"]
+lint.select = [
     "E", # pycodestyle errors
     "F", # Pyflakes
     "I", # isort
 ]
-# Assume Python 3.8
-target-version = "py38"
 
-[tool.ruff.per-file-ignores]
-# These templates can have long lines
-"linkml/generators/sqlalchemy/sqlalchemy_declarative_template.py" = ["E501"]
-"linkml/generators/sqlalchemy/sqlalchemy_imperative_template.py" = ["E501"]
-
-# Notebooks can have unsorted imports
-"tests/test_notebooks/input/*" = ["E402"]
-
-
-[tool.ruff.mccabe]
-# Unlike Flake8, default to a complexity level of 10.
-max-complexity = 10
 
 
 [tool.codespell]
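
This change keeps the same rule set but moves the lint options under ruff's lint namespace and drops settings that referenced files from another repository. A sketch of the resulting table, reconstructed from the hunk above:

    [tool.ruff]
    force-exclude = true
    line-length = 120
    lint.extend-ignore = ["E203"]
    lint.select = [
        "E", # pycodestyle errors
        "F", # Pyflakes
        "I", # isort
    ]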
--- linkml_store-0.2.0/src/linkml_store/api/collection.py
+++ linkml_store-0.2.2/src/linkml_store/api/collection.py
@@ -226,6 +226,18 @@ class Collection(Generic[DatabaseType]):
         self._initialized = True
         patches = [{"op": "add", "path": "/0", "value": obj} for obj in objs]
         self._broadcast(patches, **kwargs)
+        self._post_modification_hook(**kwargs)
+
+    def _post_delete_hook(self, **kwargs):
+        self._post_modification_hook(**kwargs)
+
+    def _post_modification_hook(self, **kwargs):
+        for indexer in self.indexers.values():
+            ix_collection_name = self.get_index_collection_name(indexer)
+            ix_collection = self.parent.get_collection(ix_collection_name)
+            # Currently updating the source triggers complete reindexing
+            # TODO: make this more efficient by only deleting modified
+            ix_collection.delete_where({})
 
     def delete(self, objs: Union[OBJECT, List[OBJECT]], **kwargs) -> Optional[int]:
         """
@@ -458,6 +470,7 @@
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
         """
@@ -476,7 +489,7 @@
         Now let's index, using the simple trigram-based index
 
         >>> index = get_indexer("simple")
-        >>> collection.attach_indexer(index)
+        >>> _ = collection.attach_indexer(index)
 
         Now let's find all objects:
 
@@ -514,12 +527,15 @@
         if ix_coll.size() == 0:
             logger.info(f"Index {index_name} is empty; indexing all objects")
             all_objs = self.find(limit=-1).rows
-            self.index_objects(all_objs, index_name, replace=True, **kwargs)
+            if all_objs:
+                # print(f"Index {index_name} is empty; indexing all objects {len(all_objs)}")
+                self.index_objects(all_objs, index_name, replace=True, **kwargs)
+                assert ix_coll.size() > 0
         qr = ix_coll.find(where=where, limit=-1, **kwargs)
         index_col = ix.index_field
         # TODO: optimize this for large indexes
         vector_pairs = [(row, np.array(row[index_col], dtype=float)) for row in qr.rows]
-        results = ix.search(query, vector_pairs, limit=limit)
+        results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
         new_qr = QueryResult(num_rows=len(results))
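
The new mmr_relevance_factor parameter threads through to Indexer.search (see the indexer.py hunks below). A minimal usage sketch, untested, with the collection setup mirroring the doctests above:

    from linkml_store.api.client import Client
    from linkml_store.index import get_indexer

    client = Client()
    db = client.attach_database("duckdb", alias="demo")
    collection = db.create_collection("Country")
    collection.insert([{"name": "France"}, {"name": "Germany"}, {"name": "Belgium"}])
    _ = collection.attach_indexer(get_indexer("simple"))
    # 1.0 ranks purely by relevance; lower values trade relevance
    # for diversity among the returned rows
    qr = collection.search("France", limit=2, mmr_relevance_factor=0.8)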
@@ -648,7 +664,31 @@
         """
         return self.find({}, limit=1).num_rows
 
-    def attach_indexer(self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs):
+    def rows_iter(self) -> Iterable[OBJECT]:
+        """
+        Return an iterator over the objects in the collection.
+
+        :return:
+        """
+        yield from self.find({}, limit=-1).rows
+
+    def rows(self) -> List[OBJECT]:
+        """
+        Return a list of objects in the collection.
+
+        :return:
+        """
+        return list(self.rows_iter())
+
+    def ranked_rows(self) -> List[Tuple[float, OBJECT]]:
+        """
+        Return a list of objects in the collection, with scores.
+        """
+        return [(n, obj) for n, obj in enumerate(self.rows_iter())]
+
+    def attach_indexer(
+        self, index: Union[Indexer, str], name: Optional[str] = None, auto_index=True, **kwargs
+    ) -> Indexer:
         """
         Attach an index to the collection.
 
@@ -669,8 +709,8 @@
         >>> full_index.name = "full"
         >>> name_index = get_indexer("simple", text_template="{name}")
         >>> name_index.name = "name"
-        >>> collection.attach_indexer(full_index)
-        >>> collection.attach_indexer(name_index)
+        >>> _ = collection.attach_indexer(full_index)
+        >>> _ = collection.attach_indexer(name_index)
 
         Now let's find objects using the full index, using the string "France".
         We expect the country France to be the top hit, but the score will
@@ -713,6 +753,10 @@
             all_objs = self.find(limit=-1).rows
             logger.info(f"Auto-indexing {len(all_objs)} objects")
             self.index_objects(all_objs, index_name, replace=True, **kwargs)
+        return index
+
+    def get_index_collection_name(self, indexer: Indexer) -> str:
+        return self._index_collection_name(indexer.name)
 
     def _index_collection_name(self, index_name: str) -> str:
         """
--- linkml_store-0.2.0/src/linkml_store/api/database.py
+++ linkml_store-0.2.2/src/linkml_store/api/database.py
@@ -268,7 +268,7 @@ class Database(ABC, Generic[CollectionType]):
         metadata: Optional[CollectionConfig] = None,
         recreate_if_exists=False,
         **kwargs,
-    ) -> CollectionType:
+    ) -> Collection:
         """
         Create a new collection in the current database.
 
@@ -760,6 +760,12 @@
         """
         Export a database to a file or location.
 
+        >>> from linkml_store.api.client import Client
+        >>> client = Client()
+        >>> db = client.attach_database("duckdb", alias="test")
+        >>> db.import_database("tests/input/iris.csv", Format.CSV, collection_name="iris")
+        >>> db.export_database("/tmp/iris.yaml", Format.YAML)
+
         :param location: location of the file
         :param target_format: target format
         :param kwargs: additional arguments
--- linkml_store-0.2.0/src/linkml_store/api/queries.py
+++ linkml_store-0.2.2/src/linkml_store/api/queries.py
@@ -40,7 +40,9 @@ class FacetCountResult(BaseModel):
 
 class QueryResult(BaseModel):
     """
-    A query result
+    A query result.
+
+    TODO: make this a subclass of Collection
     """
 
     query: Optional[Query] = None
--- linkml_store-0.2.0/src/linkml_store/api/stores/duckdb/duckdb_collection.py
+++ linkml_store-0.2.2/src/linkml_store/api/stores/duckdb/duckdb_collection.py
@@ -50,8 +50,9 @@ class DuckDBCollection(Collection):
         if not isinstance(objs, list):
             objs = [objs]
         cd = self.class_definition()
-        if not cd:
+        if not cd or not cd.attributes:
             cd = self.induce_class_definition_from_objects(objs)
+        assert cd.attributes
         table = self._sqla_table(cd)
         engine = self.parent.engine
         with engine.connect() as conn:
@@ -61,7 +62,8 @@
             stmt = stmt.compile(engine)
             conn.execute(stmt)
             conn.commit()
-        return
+        self._post_delete_hook()
+        return None
 
     def delete_where(self, where: Optional[Dict[str, Any]] = None, missing_ok=True, **kwargs) -> Optional[int]:
         logger.info(f"Deleting from {self.target_class_name} where: {where}")
@@ -87,6 +89,7 @@
             if deleted_rows_count == 0 and not missing_ok:
                 raise ValueError(f"No rows found for {where}")
             conn.commit()
+        self._post_delete_hook()
         return deleted_rows_count if deleted_rows_count > -1 else None
 
     def query_facets(
--- linkml_store-0.2.0/src/linkml_store/cli.py
+++ linkml_store-0.2.2/src/linkml_store/cli.py
@@ -1,8 +1,9 @@
 import logging
 import sys
 import warnings
+from collections import defaultdict
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple, Any
 
 import click
 import yaml
@@ -76,6 +77,8 @@ class ContextSettings(BaseModel):
         if name is None:
             # if len(self.database.list_collections()) > 1:
             #     raise ValueError("Collection must be specified if there are multiple collections.")
+            if not self.database:
+                return None
             if not self.database.list_collections():
                 return None
             name = list(self.database.list_collections())[0]
@@ -218,7 +221,10 @@
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
 def store(ctx, files, object, format):
-    """Store objects from files (JSON, YAML, TSV) into the specified collection."""
+    """Store objects from files (JSON, YAML, TSV) into the database.
+
+    Note: this is similar to insert, but a collection does not need to be specified
+    """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
@@ -410,14 +416,6 @@ def list_collections(ctx, **kwargs):
 def fq(ctx, where, limit, columns, output_type, wide, output):
     """
     Query facets from the specified collection.
-
-    :param ctx:
-    :param where:
-    :param limit:
-    :param columns:
-    :param output_type:
-    :param output:
-    :return:
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -483,6 +481,41 @@ def describe(ctx, where, output_type, output, limit):
     write_output(df.describe(include="all").transpose(), output_type, target=output)
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--index", "-I", help="Attributes to index on in pivot")
+@click.option("--columns", "-A", help="Attributes to use as columns in pivot")
+@click.option("--values", "-V", help="Attributes to use as values in pivot")
+@click.pass_context
+def pivot(ctx, where, limit, index, columns, values, output_type, output):
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = columns.split(",") if columns else None
+    value_atts = values.split(",") if values else None
+    index_atts = index.split(",") if index else None
+    results = collection.find(where_clause, limit=limit)
+    pivoted = defaultdict(dict)
+    for row in results.rows:
+        index_key = tuple([row.get(att) for att in index_atts])
+        column_key = tuple([row.get(att) for att in column_atts])
+        value_key = tuple([row.get(att) for att in value_atts])
+        pivoted[index_key][column_key] = value_key
+    pivoted_objs = []
+    def detuple(t: Tuple) -> Any:
+        if len(t) == 1:
+            return t[0]
+        return str(t)
+    for index_key, data in pivoted.items():
+        obj = {att: key for att, key in zip(index_atts, index_key)}
+        for column_key, value_key in data.items():
+            obj[detuple(column_key)] = detuple(value_key)
+        pivoted_objs.append(obj)
+    write_output(pivoted_objs, output_type, target=output)
+
+
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -499,6 +532,7 @@ def describe(ctx, where, output_type, output, limit):
     "--predictor-type", "-t", default="sklearn", show_default=True, type=click.STRING, help="Type of predictor"
 )
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
+@click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
@@ -506,6 +540,7 @@
     inference_config_file,
     query,
     evaluation_count,
+    evaluation_match_function,
     training_test_data_split,
     predictor_type,
     target_attribute,
@@ -549,7 +584,10 @@
     else:
         query_obj = None
     collection = ctx.obj["settings"].collection
-    atts = collection.class_definition().attributes.keys()
+    if collection:
+        atts = collection.class_definition().attributes.keys()
+    else:
+        atts = []
     if feature_attributes:
         features = feature_attributes.split(",")
         features = [f.strip() for f in features]
@@ -575,7 +613,8 @@
     if training_test_data_split:
         config.train_test_split = training_test_data_split
     predictor = get_inference_engine(predictor_type, config=config)
-    predictor.load_and_split_data(collection)
+    if collection:
+        predictor.load_and_split_data(collection)
     predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
@@ -584,8 +623,14 @@
     if not export_model and not evaluation_count:
         raise ValueError("Query or evaluate must be specified if not exporting model")
     if evaluation_count:
+        if evaluation_match_function == "score_text_overlap":
+            match_function_fn = score_text_overlap
+        elif evaluation_match_function is not None:
+            raise ValueError(f"Unknown match function: {evaluation_match_function}")
+        else:
+            match_function_fn = None
         outcome = evaluate_predictor(
-            predictor, target_attributes, evaluation_count=evaluation_count, match_function=score_text_overlap
+            predictor, target_attributes, evaluation_count=evaluation_count, match_function=match_function_fn
         )
         print(f"Outcome: {outcome} // accuracy: {outcome.accuracy}")
     if query_obj:
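
Only score_text_overlap is recognized by name; any other value raises ValueError, and omitting the option falls back to the default match. A hypothetical evaluation run, with illustrative database and collection names:

    linkml-store -d duckdb:///db.ddb -c iris infer -t sklearn -n 20 \
        --evaluation-match-function score_text_overlap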
--- linkml_store-0.2.0/src/linkml_store/index/implementations/llm_indexer.py
+++ linkml_store-0.2.2/src/linkml_store/index/implementations/llm_indexer.py
@@ -1,11 +1,13 @@
 import logging
 from pathlib import Path
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
+from tiktoken import encoding_for_model
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
+from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
 
 if TYPE_CHECKING:
     import llm
@@ -29,6 +31,7 @@ class LLMIndexer(Indexer):
     cached_embeddings_database: str = None
     cached_embeddings_collection: str = None
     cache_queries: bool = False
+    truncation_method: Optional[str] = None
 
     @property
     def embedding_model(self):
@@ -62,6 +65,21 @@
         """
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
+        token_limit = get_token_limit(model.model_id)
+        encoding = encoding_for_model("gpt-4o")
+
+        def truncate_text(text: str) -> str:
+            # split into tokens every 1000 chars:
+            parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
+            return render_formatted_text(
+                lambda x: "".join(x),
+                parts,
+                encoding,
+                token_limit,
+            )
+
+        texts = [truncate_text(text) for text in texts]
+
         if self.cached_embeddings_database and (cache is None or cache or self.cache_queries):
             model_id = model.model_id
             if not model_id:
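
The truncation above splits each text into 1000-character chunks and hands them to render_formatted_text to keep as many leading chunks as fit the model's token budget. A rough sketch of the intended effect, assuming only tiktoken (the real helper lives in linkml_store.utils.llm_utils and may behave differently):

    def naive_truncate(text: str, encoding, token_limit: int) -> str:
        # drop trailing 1000-char chunks until the joined text
        # tokenizes within the limit
        parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
        while parts and len(encoding.encode("".join(parts))) > token_limit:
            parts.pop()
        return "".join(parts)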
@@ -88,7 +106,7 @@
                 embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
             else:
                 embeddings_collection = embeddings_db.create_collection(coll_name, metadata=config)
-        texts = list(texts)
+
         embeddings = list([None] * len(texts))
         uncached_texts = []
         n = 0
--- linkml_store-0.2.0/src/linkml_store/index/indexer.py
+++ linkml_store-0.2.2/src/linkml_store/index/indexer.py
@@ -3,6 +3,7 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
+from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel
 
 INDEX_ITEM = np.ndarray
@@ -19,23 +20,57 @@ class TemplateSyntaxEnum(str, Enum):
     fstring = "fstring"
 
 
-def cosine_similarity(vector1, vector2) -> float:
+class Indexer(BaseModel):
     """
-    Calculate the cosine similarity between two vectors
+    An indexer operates on a collection in order to search for objects.
 
-    :param vector1:
-    :param vector2:
-    :return:
-    """
-    dot_product = np.dot(vector1, vector2)
-    norm1 = np.linalg.norm(vector1)
-    norm2 = np.linalg.norm(vector2)
-    return dot_product / (norm1 * norm2)
+    You should use a subclass of this; this can be looked up dynamically:
 
+    >>> from linkml_store.index import get_indexer
+    >>> indexer = get_indexer("simple")
+
+    You can customize how objects are indexed by passing in a text template.
+    For example, if your collection has objects with "name" and "profession" attributes,
+    you can index them as "{name} {profession}".
+
+    >>> indexer = get_indexer("simple", text_template="{name} :: {profession}")
+
+    By default, python fstrings are assumed.
+
+    We can test this works using the :ref:`object_to_text` method (normally
+    you would never need to call this directly, but it's useful for testing):
+
+    >>> obj = {"name": "John", "profession": "doctor"}
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+
+    You can also use Jinja2 templates; this gives more flexibility and logic,
+    e.g. conditional formatting:
+
+    >>> tmpl = "{{name}}{% if profession %} :: {{profession}}{% endif %}"
+    >>> indexer = get_indexer("simple", text_template=tmpl, text_template_syntax=TemplateSyntaxEnum.jinja2)
+    >>> indexer.object_to_text(obj)
+    'John :: doctor'
+    >>> indexer.object_to_text({"name": "John"})
+    'John'
+
+    You can also specify which attributes to index:
+
+    >>> indexer = get_indexer("simple", index_attributes=["name"])
+    >>> indexer.object_to_text(obj)
+    'John'
+
+    The purpose of an indexer is to translate a collection of objects into a collection of items
+    such as vectors, for purposes such as search. Unless you are implementing your own indexer, you
+    generally don't need to use the methods that return vectors, but we can examine their behavior
+    to get a sense of how they work.
+
+    >>> vectors = indexer.objects_to_vectors([{"name": "Aardvark"}, {"name": "Aardwolf"}, {"name": "Zesty"}])
+    >>> assert pairwise_cosine_similarity(vectors[0], vectors[1]) > pairwise_cosine_similarity(vectors[0], vectors[2])
+
+    Note you should consult the documentation for the specific indexer you are using for more details on
+    how text is converted to vectors.
 
-class Indexer(BaseModel):
-    """
-    An indexer operates on a collection in order to search for objects.
     """
 
     name: Optional[str] = None
@@ -119,10 +154,13 @@
         return str(obj)
 
     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None
+        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None
     ) -> List[Tuple[float, Any]]:
         """
-        Search the index for a query string
+        Use the indexer to search against a database of vectors.
+
+        Note: this is a low-level method, typically you would use the :ref:`search` method on a :ref:`Collection`.
 
         :param query: The query string to search for
         :param vectors: A list of indexed items, where each item is a tuple of (id, vector)
@@ -133,13 +171,29 @@
         # Convert the query string to a vector
         query_vector = self.text_to_vector(query, cache=False)
 
+        if mmr_relevance_factor is not None:
+            vlist = [v for _, v in vectors]
+            idlist = [id for id, _ in vectors]
+            sorted_indices = mmr_diversified_search(
+                query_vector, vlist,
+                relevance_factor=mmr_relevance_factor, top_n=limit)
+            results = []
+            # TODO: this is inefficient when limit is high
+            for i in range(limit):
+                if i >= len(sorted_indices):
+                    break
+                pos = sorted_indices[i]
+                score = pairwise_cosine_similarity(query_vector, vlist[pos])
+                results.append((score, idlist[pos]))
+            return results
+
         distances = []
 
         # Iterate over each indexed item
         for item_id, item_vector in vectors:
             # Calculate the Euclidean distance between the query vector and the item vector
             # distance = 1-np.linalg.norm(query_vector - item_vector)
-            distance = cosine_similarity(query_vector, item_vector)
+            distance = pairwise_cosine_similarity(query_vector, item_vector)
             distances.append((distance, item_id))
 
         # Sort the distances in ascending order
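
For context: maximal marginal relevance (MMR) re-ranks candidates by balancing similarity to the query against similarity to what has already been picked. With relevance factor lambda, each step selects the remaining document d maximizing lambda * sim(q, d) - (1 - lambda) * max(sim(d, s) for selected s). A minimal sketch consistent with the call above; the packaged implementation lives in linkml_store.utils.vector_utils and may differ:

    import numpy as np

    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    def mmr_sketch(query_vec, vectors, relevance_factor=0.8, top_n=10):
        sims = [cos(query_vec, v) for v in vectors]  # relevance of each candidate
        selected, remaining = [], list(range(len(vectors)))
        while remaining and len(selected) < top_n:
            def score(i):
                # penalize closeness to anything already selected
                diversity = max((cos(vectors[i], vectors[j]) for j in selected), default=0.0)
                return relevance_factor * sims[i] - (1 - relevance_factor) * diversity
            best = max(remaining, key=score)
            selected.append(best)
            remaining.remove(best)
        return selected  # candidate indices, relevant-but-diverse first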
--- linkml_store-0.2.0/src/linkml_store/inference/evaluation.py
+++ linkml_store-0.2.2/src/linkml_store/inference/evaluation.py
@@ -20,6 +20,8 @@ def score_match(target: Optional[Any], candidate: Optional[Any], match_function:
     1.0
     >>> score_match("a", "b")
     0.0
+    >>> score_match("abcd", "abcde")
+    0.0
     >>> score_match("a", None)
     0.0
     >>> score_match(None, "a")
@@ -52,7 +54,7 @@
 
     :param target:
     :param candidate:
-    :param match_function:
+    :param match_function: defaults to strict equality
     :return:
     """
     if target == candidate:
@@ -99,7 +101,8 @@
     :param predictor:
     :param target_attributes:
     :param feature_attributes:
-    :param evaluation_count:
+    :param evaluation_count: max iterations
+    :param match_function: function to use for matching
     :return:
     """
     n = 0
@@ -113,8 +116,8 @@
         else:
             test_obj = row
         result = predictor.derive(test_obj)
-        logger.info(f"Predicted: {result.predicted_object} Expected: {expected_obj}")
         tp += score_match(result.predicted_object, expected_obj, match_function)
+        logger.info(f"TP={tp} MF={match_function} Predicted: {result.predicted_object} Expected: {expected_obj}")
         n += 1
         if evaluation_count is not None and n >= evaluation_count:
             break
@@ -125,6 +128,9 @@ def score_text_overlap(str1: Any, str2: Any) -> float:
     """
     Compute the overlap score between two strings.
 
+    >>> score_text_overlap("abc", "bcde")
+    0.5
+
     :param str1:
     :param str2:
     :return:
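
The new doctest pins the behavior on a single pair: "abc" vs "bcde" scores 0.5. One implementation consistent with that (a guess, not the package's actual code) divides the longest common substring length by the longer string's length:

    def overlap_sketch(s1: str, s2: str) -> float:
        # longest common substring via dynamic programming
        best = 0
        dp = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
        for i in range(1, len(s1) + 1):
            for j in range(1, len(s2) + 1):
                if s1[i - 1] == s2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                    best = max(best, dp[i][j])
        return best / max(len(s1), len(s2))  # "abc", "bcde" -> 2/4 = 0.5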