linkml-store 0.2.2__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic.

Files changed (80):
  1. {linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO +15 -12
  2. {linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml +13 -6
  3. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py +34 -15
  4. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py +8 -0
  5. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py +5 -1
  6. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py +2 -0
  7. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  8. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py +49 -15
  9. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py +7 -4
  10. linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py +152 -0
  11. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py +20 -9
  12. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py +6 -4
  13. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py +6 -1
  14. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py +23 -3
  15. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py +3 -1
  16. {linkml_store-0.2.2 → linkml_store-0.2.5}/LICENSE +0 -0
  17. {linkml_store-0.2.2 → linkml_store-0.2.5}/README.md +0 -0
  18. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/__init__.py +0 -0
  19. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/__init__.py +0 -0
  20. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/queries.py +0 -0
  21. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/__init__.py +0 -0
  22. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
  23. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
  24. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
  25. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
  26. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
  27. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
  28. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
  29. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
  30. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
  31. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
  32. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
  33. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
  34. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
  35. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
  36. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
  37. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
  38. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
  39. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
  40. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/__init__.py +0 -0
  41. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
  42. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
  43. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
  44. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/types.py +0 -0
  45. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/constants.py +0 -0
  46. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/__init__.py +0 -0
  47. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/graphs/graph_map.py +0 -0
  48. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/__init__.py +0 -0
  49. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/__init__.py +0 -0
  50. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
  51. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/indexer.py +0 -0
  52. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/__init__.py +0 -0
  53. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/evaluation.py +0 -0
  54. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/__init__.py +0 -0
  55. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
  56. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
  57. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_config.py +0 -0
  58. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine_registry.py +0 -0
  59. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/__init__.py +0 -0
  60. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/change_utils.py +0 -0
  61. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/file_utils.py +0 -0
  62. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/io.py +0 -0
  63. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/mongodb_utils.py +0 -0
  64. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/neo4j_utils.py +0 -0
  65. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/pandas_utils.py +0 -0
  66. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/patch_utils.py +0 -0
  67. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/query_utils.py +0 -0
  68. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/schema_utils.py +0 -0
  69. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sklearn_utils.py +0 -0
  70. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/sql_utils.py +0 -0
  71. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/stats_utils.py +0 -0
  72. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/vector_utils.py +0 -0
  73. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/__init__.py +0 -0
  74. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/__init__.py +0 -0
  75. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/base.html.j2 +0 -0
  76. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
  77. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
  78. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
  79. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
  80. {linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/webapi/main.py +0 -0

{linkml_store-0.2.2 → linkml_store-0.2.5}/PKG-INFO

@@ -1,24 +1,24 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.2
+Version: 0.2.5
 Summary: linkml-store
 License: MIT
 Author: Author 1
 Author-email: author@org.org
-Requires-Python: >=3.9, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*, !=3.8.*
+Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Provides-Extra: all
 Provides-Extra: analytics
 Provides-Extra: app
-Provides-Extra: chromadb
+Provides-Extra: bigquery
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
-Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -29,25 +29,25 @@ Provides-Extra: scipy
 Provides-Extra: tests
 Provides-Extra: validation
 Requires-Dist: black (>=24.0.0) ; extra == "tests"
-Requires-Dist: chromadb ; extra == "chromadb"
 Requires-Dist: click
 Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
-Requires-Dist: gcsfs ; extra == "ibis"
+Requires-Dist: gcsfs
+Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
-Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
+Requires-Dist: jsonpatch (>=1.33)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
-Requires-Dist: llm ; extra == "llm"
+Requires-Dist: llm ; extra == "llm" or extra == "all"
 Requires-Dist: matplotlib ; extra == "analytics"
-Requires-Dist: multipledispatch ; extra == "ibis"
-Requires-Dist: neo4j ; extra == "neo4j"
+Requires-Dist: multipledispatch
+Requires-Dist: neo4j ; extra == "neo4j" or extra == "all"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
 Requires-Dist: plotly ; extra == "analytics"
@@ -56,14 +56,17 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
 Requires-Dist: seaborn ; extra == "analytics"
 Requires-Dist: sqlalchemy
 Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
+Requires-Dist: tabulate
 Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
+Requires-Dist: xmltodict (>=0.13.0)
 Description-Content-Type: text/markdown
 
 # linkml-store

{linkml_store-0.2.2 → linkml_store-0.2.5}/pyproject.toml

@@ -1,18 +1,19 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.2"
+version = "0.2.5"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.9, !=3.9.7"
+python = "^3.10"
 click = "*"
 pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
 streamlit = { version = "^1.32.2", optional = true }
 sqlalchemy = "*"
+google-cloud-bigquery = "*"
 duckdb = ">=0.10.1"
 duckdb-engine = ">=0.11.2"
 matplotlib = { version = "*", optional = true }
@@ -27,7 +28,7 @@ pymongo = { version="*", optional = true }
 neo4j = { version="*", optional = true }
 py2neo = { version="*", optional = true }
 networkx = { version="*", optional = true }
-chromadb = { version="*", optional = true }
+#chromadb = { version="*", optional = true }
 pyarrow = { version="*", optional = true }
 h5py = { version="*", optional = true }
 scipy = { version="*", optional = true }
@@ -36,14 +37,18 @@ linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
 frictionless = { version="*", optional = true }
-ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
+#ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
 gcsfs = { version="*", optional = true }
 multipledispatch = { version="*" }
+tabulate = "*"
 pandas = ">=2.2.1"
 jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
 fastapi = { version="*", optional = true }
 uvicorn = { version="*", optional = true }
+xmltodict = ">=0.13.0"
+jsonpatch = ">=1.33"
+python-dotenv = "^1.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -77,7 +82,7 @@ tests = ["black", "ruff"]
 llm = ["llm", "tiktoken"]
 mongodb = ["pymongo"]
 neo4j = ["neo4j", "py2neo", "networkx"]
-chromadb = ["chromadb"]
+#chromadb = ["chromadb"]
 h5py = ["h5py"]
 pyarrow = ["pyarrow"]
 validation = ["linkml"]
@@ -86,7 +91,9 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
-ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+#ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+bigquery = ["google-cloud-bigquery"]
+all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/client.py

@@ -1,3 +1,4 @@
+import importlib
 import logging
 from pathlib import Path
 from typing import Dict, Optional, Union
@@ -7,23 +8,22 @@ from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
 from linkml_store.api.config import ClientConfig
-from linkml_store.api.stores.chromadb.chromadb_database import ChromaDBDatabase
-from linkml_store.api.stores.duckdb.duckdb_database import DuckDBDatabase
-from linkml_store.api.stores.filesystem.filesystem_database import FileSystemDatabase
-from linkml_store.api.stores.mongodb.mongodb_database import MongoDBDatabase
-from linkml_store.api.stores.neo4j.neo4j_database import Neo4jDatabase
-from linkml_store.api.stores.solr.solr_database import SolrDatabase
 
 logger = logging.getLogger(__name__)
 
 
+
 HANDLE_MAP = {
-    "duckdb": DuckDBDatabase,
-    "solr": SolrDatabase,
-    "mongodb": MongoDBDatabase,
-    "chromadb": ChromaDBDatabase,
-    "neo4j": Neo4jDatabase,
-    "file": FileSystemDatabase,
+    "duckdb": "linkml_store.api.stores.duckdb.duckdb_database.DuckDBDatabase",
+    "solr": "linkml_store.api.stores.solr.solr_database.SolrDatabase",
+    "mongodb": "linkml_store.api.stores.mongodb.mongodb_database.MongoDBDatabase",
+    "chromadb": "linkml_store.api.stores.chromadb.chromadb_database.ChromaDBDatabase",
+    "neo4j": "linkml_store.api.stores.neo4j.neo4j_database.Neo4jDatabase",
+    "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
+}
+
+SUFFIX_MAP = {
+    "ddb": "duckdb:///{path}",
 }
 
 
@@ -155,6 +155,9 @@ class Client:
             if auto_attach:
                 db = self.attach_database(handle, alias=name, **kwargs)
                 db.from_config(db_config)
+                if db_config.source:
+                    db = self.get_database(name)
+                    db.store(db_config.source.data)
 
     def _set_database_config(self, db: Database):
         """
@@ -198,6 +201,12 @@
         :param kwargs:
         :return:
         """
+        if ":" not in handle:
+            if alias is None:
+                alias = handle
+            suffix = handle.split(".")[-1]
+            if suffix in SUFFIX_MAP:
+                handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
@@ -207,14 +216,23 @@
             scheme, _ = handle.split(":", 1)
         if scheme not in HANDLE_MAP:
             raise ValueError(f"Unknown scheme: {scheme}")
-        cls = HANDLE_MAP[scheme]
+        module_path, class_name = HANDLE_MAP[scheme].rsplit('.', 1)
+        try:
+            module = importlib.import_module(module_path)
+            cls = getattr(module, class_name)
+        except ImportError as e:
+            raise ImportError(f"Failed to import {scheme} database. Make sure the correct extras are installed: {e}")
+
+        #cls = HANDLE_MAP[scheme]
         db = cls(handle=handle, recreate_if_exists=recreate_if_exists, **kwargs)
         if schema_view:
            db.set_schema_view(schema_view)
        if not alias:
            alias = handle
        if not self._databases:
+            logger.info("Initializing databases")
            self._databases = {}
+        logger.info(f"Attaching {alias}")
        self._databases[alias] = db
        db.parent = self
        if db.alias:
@@ -257,8 +275,9 @@
             self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
-                logger.info(f"Creating database: {name}")
-                self.attach_database(name, **kwargs)
+                logger.info(f"Creating/attaching database: {name}")
+                db = self.attach_database(name, **kwargs)
+                name = db.alias
             else:
                 raise ValueError(f"Database {name} does not exist")
         db = self._databases[name]
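
The net effect of these client.py hunks is that backend classes are now resolved lazily from dotted import paths, and a bare filename with a known suffix is expanded into a handle. A minimal usage sketch, assuming the documented Client entry point; the .ddb file path is hypothetical:

    from linkml_store import Client

    client = Client()
    # "my_data.ddb" has no scheme, so SUFFIX_MAP expands it to "duckdb:///my_data.ddb",
    # and because no alias is given the original filename becomes the alias.
    db = client.attach_database("my_data.ddb")
    # The DuckDB backend class is only imported at this point via importlib;
    # a missing optional dependency now surfaces as an ImportError naming the scheme.
    print(db.handle)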

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/collection.py

@@ -470,6 +470,7 @@ class Collection(Generic[DatabaseType]):
         where: Optional[Any] = None,
         index_name: Optional[str] = None,
         limit: Optional[int] = None,
+        select_cols: Optional[List[str]] = None,
         mmr_relevance_factor: Optional[float] = None,
         **kwargs,
     ) -> QueryResult:
@@ -503,6 +504,7 @@
         :param where:
         :param index_name:
         :param limit:
+        :param select_cols:
         :param kwargs:
         :return:
         """
@@ -538,6 +540,11 @@
         results = ix.search(query, vector_pairs, limit=limit, mmr_relevance_factor=mmr_relevance_factor, **kwargs)
         for r in results:
             del r[1][index_col]
+        if select_cols:
+            new_results = []
+            for r in results:
+                new_results.append((r[0], {k: v for k, v in r[1].items() if k in select_cols}))
+            results = new_results
         new_qr = QueryResult(num_rows=len(results))
         new_qr.ranked_rows = results
         new_qr.rows = [r[1] for r in results]
@@ -672,6 +679,7 @@
         """
         yield from self.find({}, limit=-1).rows
 
+    @property
     def rows(self) -> List[OBJECT]:
         """
         Return a list of objects in the collection.
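
A short sketch of the new select_cols filter on search results; collection is assumed to be an already-populated Collection with an indexer attached:

    # Only "id" and "name" survive in each ranked row; other keys are dropped after scoring.
    result = collection.search("east asian countries", limit=5, select_cols=["id", "name"])
    for score, row in result.ranked_rows:
        print(f"{score:.3f}", row)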

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/config.py

@@ -91,7 +91,7 @@ class CollectionConfig(ConfiguredBaseModel):
     )
     source: Optional[CollectionSource] = Field(
         default=None,
-        description="Metadata about the source",
+        description="Source for the collection",
     )
     derived_from: Optional[List[DerivationConfiguration]] = Field(
         default=None,
@@ -154,6 +154,10 @@ class DatabaseConfig(ConfiguredBaseModel):
         default=False,
         description="Whether to ensure referential integrity",
     )
+    source: Optional[CollectionSource] = Field(
+        default=None,
+        description="Source for the database",
+    )
 
 
 class ClientConfig(ConfiguredBaseModel):
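
A hedged sketch of the new database-level source field. The local_path key is taken from the cli.py hunk later in this diff; other CollectionSource fields and the exact DatabaseConfig attributes are assumptions:

    from linkml_store.api.config import DatabaseConfig

    db_config = DatabaseConfig(
        handle="duckdb:///db/countries.db",
        source={"local_path": "data/countries.json"},  # coerced into a CollectionSource by pydantic
    )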

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/database.py

@@ -470,6 +470,7 @@ class Database(ABC, Generic[CollectionType]):
         if not self._schema_view:
             self._initialize_schema()
         if not self._schema_view:
+            logger.info("Inducing schema view")
             self._schema_view = self.induce_schema_view()
         return self._schema_view
 
@@ -505,6 +506,7 @@
         if isinstance(schema_view, str):
             schema_view = SchemaView(schema_view)
         self._schema_view = schema_view
+        logger.info(f"Setting schema view for {self.handle}")
         # self._schema_view = SchemaView(schema_view.materialize_derived_schema())
         if not self._collections:
             return

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py

@@ -3,7 +3,7 @@ from pathlib import Path
 from typing import Optional
 
 import yaml
-from linkml.utils.schema_builder import SchemaBuilder
+from linkml_runtime.utils.schema_builder import SchemaBuilder
 from linkml_runtime import SchemaView
 
 from linkml_store.api import Database
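
The only change here swaps the SchemaBuilder import from the linkml package to linkml-runtime; the class is used the same way, so the filesystem store no longer needs the heavier linkml package at import time. A minimal sketch of the corrected import, with illustrative class and slot names:

    from linkml_runtime.utils.schema_builder import SchemaBuilder

    sb = SchemaBuilder()
    sb.add_class("Person", slots=["name", "age"])
    schema = sb.schema  # a SchemaDefinition that can back a SchemaView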

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/cli.py

@@ -99,6 +99,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal"
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
 @click.option("--input", "-i", help="Input file (alternative to database/collection)")
+@click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
 @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
@@ -111,7 +112,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal"
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, input, **kwargs):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -135,12 +136,17 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        stem = underscore(Path(input).stem)
-        database = "duckdb"
-        collection = stem
+        database = "duckdb" # default: store in duckdb
+        if input.startswith("http"):
+            parts = input.split("/")
+            collection = parts[-1]
+            collection = collection.split(".")[0]
+        else:
+            stem = underscore(Path(input).stem)
+            collection = stem
+        logger.info(f"Using input file: {input}, "
+                    f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
-        # collection = Path(input).stem
-        # database = f"file:{Path(input).parent}"
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
     if config is None and DEFAULT_GLOBAL_CONF_PATH.exists():
@@ -153,6 +159,9 @@
     client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
+    if schema:
+        db = settings.database
+        db.set_schema_view(schema)
     if settings.database_name:
         db = client.get_database(database)
         if set:
@@ -178,10 +187,11 @@
 
 @cli.command()
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
+@click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
 @click.pass_context
-def insert(ctx, files, object, format):
+def insert(ctx, files, replace, object, format):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -195,7 +205,6 @@ def insert(ctx, files, object, format):
     collection = settings.collection
     if not collection:
         raise ValueError("Collection must be specified.")
-    objects = []
     if not files and not object:
         files = ["-"]
     for file_path in files:
@@ -204,13 +213,19 @@
         else:
             objects = load_objects(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
-        collection.insert(objects)
+        if replace:
+            collection.replace(objects)
+        else:
+            collection.insert(objects)
         click.echo(f"Inserted {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
     if object:
         for object_str in object:
             logger.info(f"Parsing: {object_str}")
             objects = yaml.safe_load(object_str)
-            collection.insert(objects)
+            if replace:
+                collection.replace(objects)
+            else:
+                collection.insert(objects)
             click.echo(f"Inserted {len(objects)} objects from {object_str} into collection '{collection.alias}'.")
     collection.commit()
 
@@ -523,6 +538,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
+@click.option("--training-collection", type=click.STRING,help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -534,14 +550,17 @@
 @click.option("--evaluation-count", "-n", type=click.INT, help="Number of examples to evaluate over")
 @click.option("--evaluation-match-function", help="Name of function to use for matching objects in eval")
 @click.option("--query", "-q", type=click.STRING, help="query term")
+@click.option("--where", "-w", type=click.STRING, help="query term")
 @click.pass_context
 def infer(
     ctx,
     inference_config_file,
+    where,
     query,
     evaluation_count,
     evaluation_match_function,
     training_test_data_split,
+    training_collection,
     predictor_type,
     target_attribute,
     feature_attributes,
@@ -579,6 +598,7 @@ def infer(
        linkml-store -i tests/input/iris.csv inference -t sklearn \
          -q '{"sepal_length": 5.1, "sepal_width": 3.5, "petal_length": 1.4, "petal_width": 0.2}'
     """
+    where_clause = yaml.safe_load(where) if where else None
     if query:
         query_obj = yaml.safe_load(query)
     else:
@@ -603,6 +623,7 @@
     if model_format:
         model_format = ModelSerialization(model_format)
     if load_model:
+        logger.info(f"Loading predictor from {load_model}")
         predictor = get_inference_engine(predictor_type)
         predictor = type(predictor).load_model(load_model)
     else:
@@ -613,13 +634,18 @@
         if training_test_data_split:
             config.train_test_split = training_test_data_split
         predictor = get_inference_engine(predictor_type, config=config)
-        if collection:
-            predictor.load_and_split_data(collection)
+        training_collection_obj = collection
+        if training_collection:
+            training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
+        if training_collection_obj:
+            logger.info(f"Using collection: {training_collection_obj.alias} for inference")
+            split = training_test_data_split or (1.0, 0.0)
+            predictor.load_and_split_data(training_collection_obj, split=split)
     predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
         predictor.export_model(export_model, model_format)
-    if not query_obj:
+    if not query_obj and where_clause is None:
         if not export_model and not evaluation_count:
             raise ValueError("Query or evaluate must be specified if not exporting model")
         if evaluation_count:
@@ -637,6 +663,12 @@
         result = predictor.derive(query_obj)
         dumped_obj = result.model_dump(exclude_none=True)
         write_output([dumped_obj], output_type, target=output)
+    if where_clause is not None:
+        predicted_objs = []
+        for query_obj in collection.find(where_clause).rows:
+            result = predictor.derive(query_obj)
+            predicted_objs.append(result.predicted_object)
+        write_output(predicted_objs, output_type, target=output)
 
 
 @cli.command()
@@ -681,6 +713,7 @@ def schema(ctx, output_type, output):
 @cli.command()
 @click.argument("search_term")
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the search")
+@click.option("--select", "-s", type=click.STRING, help="SELECT clause for the query, as YAML")
 @click.option("--limit", "-l", type=click.INT, help="Maximum number of search results")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -689,13 +722,14 @@
 )
 @index_type_option
 @click.pass_context
-def search(ctx, search_term, where, limit, index_type, output_type, output, auto_index):
+def search(ctx, search_term, where, select, limit, index_type, output_type, output, auto_index):
     """Search objects in the specified collection."""
     collection = ctx.obj["settings"].collection
     ix = get_indexer(index_type)
     logger.info(f"Attaching index to collection {collection.alias}: {ix.model_dump()}")
     collection.attach_indexer(ix, auto_index=auto_index)
-    result = collection.search(search_term, where=where, limit=limit)
+    select_cols = yaml.safe_load(select) if select else None
+    result = collection.search(search_term, where=where, select_cols=select_cols, limit=limit)
     output_data = render_output([{"score": row[0], **row[1]} for row in result.ranked_rows], output_type)
     if output:
         with open(output, "w") as f:
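
Taken together, the CLI gains a global --schema option, --replace on insert, --training-collection and --where on infer, and --select on search. A hedged sketch of the new search --select option, driven through click's test runner rather than a shell; the input file and data are hypothetical:

    from click.testing import CliRunner
    from linkml_store.cli import cli

    runner = CliRunner()
    # --select is parsed with yaml.safe_load, so "[name, capital]" becomes a list of column names
    result = runner.invoke(
        cli, ["-i", "countries.csv", "search", "europe", "--select", "[name, capital]", "--limit", "3"]
    )
    print(result.output)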

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
-from tiktoken import encoding_for_model
 
 from linkml_store.api.config import CollectionConfig
 from linkml_store.index.indexer import INDEX_ITEM, Indexer
@@ -55,7 +54,7 @@ class LLMIndexer(Indexer):
 
     def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
         """
-        Use LLM to embed
+        Use LLM to embed.
 
         >>> indexer = LLMIndexer(cached_embeddings_database="tests/input/llm_cache.db")
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
@@ -63,20 +62,24 @@
         :param texts:
         :return:
         """
+        from tiktoken import encoding_for_model
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
-        token_limit = get_token_limit(model.model_id)
+        # TODO: make this more accurate
+        token_limit = get_token_limit(model.model_id) - 200
         encoding = encoding_for_model("gpt-4o")
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
             parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
-            return render_formatted_text(
+            truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
                 encoding,
                 token_limit,
             )
+            logger.debug(f"Truncated text from {len(text)} to {len(truncated)}")
+            return truncated
 
         texts = [truncate_text(text) for text in texts]
 

linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py (new file)

@@ -0,0 +1,152 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
+
+import yaml
+from linkml_store.utils.llm_utils import parse_yaml_payload
+from llm import get_key
+from pydantic import BaseModel
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested
+
+logger = logging.getLogger(__name__)
+
+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+
+SYSTEM_PROMPT = """
+Your task is to inference the complete YAML
+object output given the YAML object input. I will provide you
+with contextual information, including the schema,
+to help with the inference. You can use the following
+
+You should return ONLY valid YAML in your response.
+"""
+
+
+class TrainedModel(BaseModel, extra="forbid"):
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
+class LLMInference(Inference):
+    iterations: int = 0
+
+
+@dataclass
+class LLMInferenceEngine(InferenceEngine):
+    """
+    LLM based predictor.
+
+    Unlike the RAG predictor this performs few-shot inference
+    """
+
+    _model: "llm.Model" = None  # noqa: F821
+
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
+    def __post_init__(self):
+        if not self.config:
+            self.config = InferenceConfig()
+        if not self.config.llm_config:
+            self.config.llm_config = LLMConfig()
+
+    @property
+    def model(self) -> "llm.Model":  # noqa: F821
+        import llm
+
+        if self._model is None:
+            self._model = llm.get_model(self.config.llm_config.model_name)
+            if self._model.needs_key:
+                key = get_key(None, key_alias=self._model.needs_key)
+                self._model.key = key
+
+        return self._model
+
+    def initialize_model(self, **kwargs):
+        logger.info(f"Initializing model {self.model}")
+
+    def object_to_text(self, object: OBJECT) -> str:
+        return yaml.dump(object)
+
+    def _schema_str(self) -> str:
+        db = self.training_data.base_collection.parent
+        from linkml_runtime.dumpers import json_dumper
+        schema_dict = json_dumper.to_dict(db.schema_view.schema)
+        return yaml.dump(schema_dict)
+
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+        import llm
+
+        model: llm.Model = self.model
+        #model_name = self.config.llm_config.model_name
+        #feature_attributes = self.config.feature_attributes
+        target_attributes = self.config.target_attributes
+        query_text = self.object_to_text(object)
+
+        if not target_attributes:
+            target_attributes = [k for k, v in object.items() if v is None or v == ""]
+        #if not feature_attributes:
+        #    feature_attributes = [k for k, v in object.items() if v is not None and v != ""]
+
+        system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+
+        system_prompt += "\n## SCHEMA:\n\n" + self._schema_str()
+
+        stub = ", ".join([f"{k}: ..." for k in target_attributes])
+        stub = "{" + stub + "}"
+        prompt = (
+            "Provide a YAML object of the form"
+            "```yaml\n"
+            f"{stub}\n"
+            "```\n"
+            "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+        )
+        logger.info(f"Prompt: {prompt}")
+        response = model.prompt(prompt, system=system_prompt)
+        yaml_str = response.text()
+        logger.info(f"Response: {yaml_str}")
+        predicted_object = parse_yaml_payload(yaml_str, strict=True)
+        predicted_object = {**object, **predicted_object}
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+        raise NotImplementedError("Does not make sense for this engine")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
+        raise NotImplementedError("Does not make sense for this engine")
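
A hedged usage sketch for the new engine. The database handle, collection name, and model name are assumptions; the call sequence (load data, initialize, derive) follows the methods defined in the file above, and missing values in the query object are treated as the attributes to predict:

    from linkml_store import Client
    from linkml_store.inference.implementations.llm_inference_engine import LLMInferenceEngine
    from linkml_store.inference.inference_config import InferenceConfig, LLMConfig

    client = Client()
    db = client.attach_database("duckdb:///countries.db")
    collection = db.get_collection("countries")

    engine = LLMInferenceEngine(config=InferenceConfig(llm_config=LLMConfig(model_name="gpt-4o-mini")))
    engine.load_and_split_data(collection)  # derive() reads the schema via training_data.base_collection
    engine.initialize_model()
    result = engine.derive({"name": "Uruguay", "capital": None})  # None values become target attributes
    print(result.predicted_object)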

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py

@@ -20,7 +20,7 @@ DEFAULT_NUM_EXAMPLES = 20
 DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
 
 SYSTEM_PROMPT = """
-You are a {llm_config.role}, your task is to inference the YAML
+You are a {llm_config.role}, your task is to infer the YAML
 object output given the YAML object input. I will provide you
 with a collection of examples that will provide guidance both
 on the desired structure of the response, as well as the kind
@@ -130,23 +130,34 @@ class RAGInferenceEngine(InferenceEngine):
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
+            logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
             rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
                                             mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
+            logger.info(f"Found {len(examples)} examples")
             if not examples:
                 raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
-        query_obj = select_nested(object, feature_attributes)
+        this_feature_attributes = feature_attributes
+        if not this_feature_attributes:
+            this_feature_attributes = list(set(object.keys()) - set(target_attributes))
+        query_obj = select_nested(object, this_feature_attributes)
         query_text = self.object_to_text(query_obj)
         for example in examples:
-            input_obj = select_nested(example, feature_attributes)
+            this_feature_attributes = feature_attributes
+            if not this_feature_attributes:
+                this_feature_attributes = list(set(example.keys()) - set(target_attributes))
+            if not this_feature_attributes:
+                raise ValueError(f"No feature attributes found in example {example}")
+            input_obj = select_nested(example, this_feature_attributes)
             input_obj_text = self.object_to_text(input_obj)
             if input_obj_text == query_text:
-                raise ValueError(
-                    f"Query object {query_text} is the same as example object {input_obj_text}\n"
-                    "This indicates possible test data leakage\n."
-                    "TODO: allow an option that allows user to treat this as a basic lookup\n"
-                )
+                continue
+                #raise ValueError(
+                #    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                #    "This indicates possible test data leakage\n."
+                #    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                #)
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -169,7 +180,7 @@
                                         encoding=encoding, token_limit=token_limit,
                                         additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
-        response = model.prompt(prompt, system_prompt)
+        response = model.prompt(prompt, system=system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
         predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
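
A minimal illustration of the new fallback introduced above: when no feature attributes are configured, everything except the target attributes is treated as an input feature (shapes are made up):

    example = {"name": "Chile", "capital": "Santiago", "continent": None}
    target_attributes = ["continent"]
    feature_attributes = list(set(example.keys()) - set(target_attributes))
    # -> ["name", "capital"] (order not guaranteed, since it goes through a set)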

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py

@@ -4,7 +4,7 @@ from abc import ABC
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Optional, TextIO, Tuple, Union
+from typing import Optional, TextIO, Tuple, Union, Any
 
 import pandas as pd
 from pydantic import BaseModel, ConfigDict
@@ -67,13 +67,14 @@ class CollectionSlice(BaseModel):
     # slice: Tuple[Optional[int], Optional[int]] = Field(default=(None, None))
     indices: Optional[Tuple[int, ...]] = None
     _collection: Optional[Collection] = None
+    where: Any = None
 
     @property
     def collection(self) -> Collection:
         if not self._collection and not self.indices:
             return self.base_collection
         if not self._collection:
-            rows = self.base_collection.find({}, limit=-1).rows
+            rows = self.base_collection.rows
             subset = [rows[i] for i in self.indices]
             db = self.base_collection.parent
             subset_name = self.slice_alias
@@ -94,6 +95,7 @@ class CollectionSlice(BaseModel):
         """
         Return the slice of the collection as a dataframe.
 
+        :param flattened: flattned nested objects to give keys like foo.bar
         :return:
         """
         rs = self.collection.find({}, limit=-1)
@@ -122,7 +124,7 @@
         Load the data and split it into training and testing sets.
 
         :param collection:
-        :param split:
+        :param split: Tuple of training and testing split ratios.
         :param randomize:
         :return:
         """
@@ -134,7 +136,7 @@
             self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
             self.testing_data = None
             return
-        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        logger.info(f"Loading and splitting data {split} from collection {collection.alias}")
         size = collection.size()
         indices = range(size)
         if randomize:
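
For reference, the split parameter documented above is a (train, test) ratio pair; the CLI hunk earlier in this diff passes (1.0, 0.0) to keep every row for training. A one-line sketch, with engine and collection assumed to exist:

    engine.load_and_split_data(collection, split=(0.7, 0.3), randomize=True)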

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py

@@ -12,9 +12,9 @@ from typing import IO, Any, Dict, List, Optional, TextIO, Type, Union
 
 import pandas as pd
 import pystow
+import xmltodict
 import yaml
 from pydantic import BaseModel
-from tabulate import tabulate
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +30,7 @@ class Format(Enum):
     YAMLL = "yamll"
     TSV = "tsv"
     CSV = "csv"
+    XML = "xml"
     PYTHON = "python"
     PARQUET = "parquet"
     FORMATTED = "formatted"
@@ -50,6 +51,7 @@
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".xml": cls.XML,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -124,6 +126,8 @@ def process_file(
         delimiter = "\t" if format == Format.TSV else ","
         reader = csv.DictReader(f, delimiter=delimiter)
         objs = list(reader)
+    elif format == Format.XML:
+        objs = xmltodict.parse(f.read())
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq
 
@@ -284,6 +288,7 @@
     elif format == Format.PYTHON:
        return str(data)
    elif format == Format.TABLE:
+        from tabulate import tabulate
        return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
    elif format == Format.YAML:
        if isinstance(data, list):
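
The new XML path simply hands the whole document to xmltodict, so the parsed result is a nested dict keyed by the root element. A small sketch; the file name is hypothetical:

    import xmltodict

    with open("records.xml") as f:
        objs = xmltodict.parse(f.read())
    print(list(objs))  # the root element name(s)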

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py

@@ -1,6 +1,10 @@
-from typing import Callable, List, Optional
+import logging
+from typing import Callable, List, Optional, TYPE_CHECKING
 
-from tiktoken import Encoding
+if TYPE_CHECKING:
+    import tiktoken
+
+logger = logging.getLogger(__name__)
 
 MODEL_TOKEN_MAPPING = {
     "gpt-4o-mini": 128_000,
@@ -40,7 +44,7 @@ MODEL_TOKEN_MAPPING = {
 def render_formatted_text(
     render_func: Callable,
     values: List[str],
-    encoding: Encoding,
+    encoding: "tiktoken.Encoding",
     token_limit: int,
     additional_text: Optional[str] = None,
 ) -> str:
@@ -67,6 +71,7 @@
     if additional_text:
         token_limit -= len(encoding.encode(additional_text))
     text_length = len(encoding.encode(text))
+    logger.debug(f"Encoding length: {text_length} (original: {len(text)})")
     if text_length <= token_limit:
         return text
     if not values:
@@ -95,3 +100,18 @@
         if model in model_name:
             return token_limit
     return 4096
+
+
+def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
+    import yaml
+    if "```" in yaml_str:
+        yaml_str = yaml_str.split("```")[1].strip()
+        if yaml_str.startswith("yaml"):
+            yaml_str = yaml_str[4:].strip()
+    try:
+        return yaml.safe_load(yaml_str)
+    except Exception as e:
+        if strict:
+            raise e
+        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
+        return None
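
A quick illustration of the new parse_yaml_payload helper on a typical fenced LLM reply; the values are made up:

    from linkml_store.utils.llm_utils import parse_yaml_payload

    reply = "```yaml\nname: Uruguay\ncapital: Montevideo\n```"
    print(parse_yaml_payload(reply))  # {'name': 'Uruguay', 'capital': 'Montevideo'}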

{linkml_store-0.2.2 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py

@@ -124,7 +124,7 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
 
     Args:
         data (dict): The input nested dictionary.
-        selectors (list): A list of selector strings.
+        paths (list): A list of selector strings.
 
     Returns:
         dict: A new dictionary with the same structure, but only the selected attributes.
@@ -162,6 +162,8 @@
     if current_path is None:
         current_path = []
     matching_paths = []
+    if not paths:
+        raise ValueError("No paths provided")
     for path in paths:
         if isinstance(path, str):
             path = path.split(".")
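
And a small sketch of select_nested with the corrected paths argument; dotted paths select into nested objects, and the data here is made up:

    from linkml_store.utils.object_utils import select_nested

    obj = {"name": "Chile", "capital": {"name": "Santiago", "population": 7_000_000}}
    print(select_nested(obj, ["name", "capital.name"]))
    # expected: {'name': 'Chile', 'capital': {'name': 'Santiago'}}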