linkml-store 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of linkml-store might be problematic.
- {linkml_store-0.2.4 → linkml_store-0.2.5}/PKG-INFO +7 -9
- {linkml_store-0.2.4 → linkml_store-0.2.5}/pyproject.toml +8 -7
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/client.py +15 -2
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/database.py +2 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/cli.py +22 -4
- linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py +152 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py +20 -9
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py +2 -2
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py +15 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py +3 -1
- {linkml_store-0.2.4 → linkml_store-0.2.5}/LICENSE +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/README.md +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/config.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/queries.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/chromadb/chromadb_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/duckdb_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/duckdb/mappings.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/filesystem/filesystem_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/hdf5/hdf5_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/mongodb/mongodb_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/neo4j/neo4j_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_collection.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_database.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/stores/solr/solr_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/types.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/constants.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/graphs/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/graphs/graph_map.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/index/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/index/implementations/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/index/implementations/llm_indexer.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/index/implementations/simple_indexer.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/index/indexer.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/evaluation.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rule_based_inference_engine.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/sklearn_inference_engine.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/inference_config.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine_registry.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/change_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/file_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/format_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/io.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/mongodb_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/neo4j_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/pandas_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/patch_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/query_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/schema_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/sklearn_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/sql_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/stats_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/vector_utils.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/__init__.py +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/base.html.j2 +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/collection_details.html.j2 +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/database_details.html.j2 +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/databases.html.j2 +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/html/generic.html.j2 +0 -0
- {linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/webapi/main.py +0 -0
{linkml_store-0.2.4 → linkml_store-0.2.5}/PKG-INFO

@@ -1,14 +1,13 @@
 Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.4
+Version: 0.2.5
 Summary: linkml-store
 License: MIT
 Author: Author 1
 Author-email: author@org.org
-Requires-Python: >=3.
+Requires-Python: >=3.10,<4.0
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -20,7 +19,6 @@ Provides-Extra: bigquery
 Provides-Extra: fastapi
 Provides-Extra: frictionless
 Provides-Extra: h5py
-Provides-Extra: ibis
 Provides-Extra: llm
 Provides-Extra: map
 Provides-Extra: mongodb
@@ -36,20 +34,19 @@ Requires-Dist: duckdb (>=0.10.1)
 Requires-Dist: duckdb-engine (>=0.11.2)
 Requires-Dist: fastapi ; extra == "fastapi"
 Requires-Dist: frictionless ; extra == "frictionless"
-Requires-Dist: gcsfs
+Requires-Dist: gcsfs
 Requires-Dist: google-cloud-bigquery ; extra == "bigquery"
 Requires-Dist: h5py ; extra == "h5py"
-Requires-Dist: ibis-framework[duckdb,examples] (>=9.3.0) ; extra == "ibis"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
-Requires-Dist: jsonpatch (>=1.33
+Requires-Dist: jsonpatch (>=1.33)
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"
 Requires-Dist: linkml_renderer ; extra == "renderer"
 Requires-Dist: llm ; extra == "llm" or extra == "all"
 Requires-Dist: matplotlib ; extra == "analytics"
-Requires-Dist: multipledispatch
+Requires-Dist: multipledispatch
 Requires-Dist: neo4j ; extra == "neo4j" or extra == "all"
 Requires-Dist: networkx ; extra == "neo4j"
 Requires-Dist: pandas (>=2.2.1) ; extra == "analytics"
@@ -59,6 +56,7 @@ Requires-Dist: pyarrow ; extra == "pyarrow"
 Requires-Dist: pydantic (>=2.0.0,<3.0.0)
 Requires-Dist: pymongo ; extra == "mongodb"
 Requires-Dist: pystow (>=0.5.4,<0.6.0)
+Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: ruff (>=0.6.2) ; extra == "tests"
 Requires-Dist: scikit-learn ; extra == "scipy"
 Requires-Dist: scipy ; extra == "scipy"
@@ -68,7 +66,7 @@ Requires-Dist: streamlit (>=1.32.2,<2.0.0) ; extra == "app"
 Requires-Dist: tabulate
 Requires-Dist: tiktoken ; extra == "llm"
 Requires-Dist: uvicorn ; extra == "fastapi"
-Requires-Dist: xmltodict (>=0.13.0
+Requires-Dist: xmltodict (>=0.13.0)
 Description-Content-Type: text/markdown
 
 # linkml-store

{linkml_store-0.2.4 → linkml_store-0.2.5}/pyproject.toml

@@ -1,13 +1,13 @@
 [tool.poetry]
 name = "linkml-store"
-version = "0.2.4"
+version = "0.2.5"
 description = "linkml-store"
 authors = ["Author 1 <author@org.org>"]
 license = "MIT"
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.
+python = "^3.10"
 click = "*"
 pydantic = "^2.0.0"
 linkml-runtime = ">=1.8.0"
@@ -37,7 +37,7 @@ linkml = { version=">=1.8.0", optional = true }
 linkml_map = { version="*", optional = true }
 linkml_renderer = { version="*", optional = true }
 frictionless = { version="*", optional = true }
-ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
+#ibis-framework = { version=">=9.3.0", extras = ["duckdb", "examples"], optional = true }
 gcsfs = { version="*", optional = true }
 multipledispatch = { version="*" }
 tabulate = "*"
@@ -46,8 +46,9 @@ jinja2 = "^3.1.4"
 jsonlines = "^4.0.0"
 fastapi = { version="*", optional = true }
 uvicorn = { version="*", optional = true }
-xmltodict = "
-jsonpatch = "
+xmltodict = ">=0.13.0"
+jsonpatch = ">=1.33"
+python-dotenv = "^1.0.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = {version = ">=7.1.2"}
@@ -90,9 +91,9 @@ renderer = ["linkml_renderer"]
 fastapi = ["fastapi", "uvicorn"]
 frictionless = ["frictionless"]
 scipy = ["scipy", "scikit-learn"]
-ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
+#ibis = ["ibis-framework", "multipledispatch", "gcsfs"]
 bigquery = ["google-cloud-bigquery"]
-all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "
+all = ["llm", "mongodb", "neo4j", "validation", "map", "renderer", "bigquery"]
 
 [tool.poetry.scripts]
 linkml-store = "linkml_store.cli:cli"

{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/client.py

@@ -22,6 +22,10 @@ HANDLE_MAP = {
     "file": "linkml_store.api.stores.filesystem.filesystem_database.FileSystemDatabase",
 }
 
+SUFFIX_MAP = {
+    "ddb": "duckdb:///{path}",
+}
+
 
 class Client:
     """
@@ -197,6 +201,12 @@ class Client:
         :param kwargs:
         :return:
         """
+        if ":" not in handle:
+            if alias is None:
+                alias = handle
+            suffix = handle.split(".")[-1]
+            if suffix in SUFFIX_MAP:
+                handle = SUFFIX_MAP[suffix].format(path=handle)
         if ":" not in handle:
             scheme = handle
             handle = None
@@ -220,7 +230,9 @@ class Client:
         if not alias:
             alias = handle
         if not self._databases:
+            logger.info("Initializing databases")
            self._databases = {}
+        logger.info(f"Attaching {alias}")
         self._databases[alias] = db
         db.parent = self
         if db.alias:
@@ -263,8 +275,9 @@ class Client:
             self._databases[name] = db
         if name not in self._databases:
             if create_if_not_exists:
-                logger.info(f"Creating database: {name}")
-                self.attach_database(name, **kwargs)
+                logger.info(f"Creating/attaching database: {name}")
+                db = self.attach_database(name, **kwargs)
+                name = db.alias
             else:
                 raise ValueError(f"Database {name} does not exist")
         db = self._databases[name]
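
The new SUFFIX_MAP lets a bare file name act as a database handle: when the handle has no scheme, the suffix is looked up and expanded into a full handle, and the original file name becomes the alias. A minimal sketch of how a caller might rely on this (the file name is illustrative):

```python
from linkml_store.api.client import Client

client = Client()

# "mydata.ddb" has no ":" scheme, so the "ddb" suffix is resolved via
# SUFFIX_MAP to the handle "duckdb:///mydata.ddb", keeping "mydata.ddb"
# as the database alias.
db = client.attach_database("mydata.ddb")

# get_database() with create_if_not_exists goes through the same attach
# path, so the same shorthand should work there as well.
db2 = client.get_database("mydata.ddb", create_if_not_exists=True)
```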
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/api/database.py

@@ -470,6 +470,7 @@ class Database(ABC, Generic[CollectionType]):
         if not self._schema_view:
             self._initialize_schema()
         if not self._schema_view:
+            logger.info("Inducing schema view")
             self._schema_view = self.induce_schema_view()
         return self._schema_view
 
@@ -505,6 +506,7 @@ class Database(ABC, Generic[CollectionType]):
         if isinstance(schema_view, str):
             schema_view = SchemaView(schema_view)
         self._schema_view = schema_view
+        logger.info(f"Setting schema view for {self.handle}")
         # self._schema_view = SchemaView(schema_view.materialize_derived_schema())
         if not self._collections:
             return
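
As the second hunk shows, set_schema_view accepts either a SchemaView or a path to a LinkML schema, with the string case wrapped in SchemaView internally. A small sketch (the handle and schema path are illustrative):

```python
from linkml_store.api.client import Client

client = Client()
db = client.attach_database("duckdb:///mydata.ddb")

# A string argument is wrapped in SchemaView(...) before being stored
# as the database's schema view.
db.set_schema_view("personinfo.yaml")
```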
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/cli.py

@@ -99,6 +99,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
 @click.option("--database", "-d", help="Database name")
 @click.option("--collection", "-c", help="Collection name")
 @click.option("--input", "-i", help="Input file (alternative to database/collection)")
+@click.option("--schema", "-S", help="Path to schema (LinkML yaml)")
 @click.option("--config", "-C", type=click.Path(exists=True), help="Path to the configuration file")
 @click.option("--set", help="Metadata settings in the form PATHEXPR=value", multiple=True)
 @click.option("-v", "--verbose", count=True)
@@ -111,7 +112,7 @@ include_internal_option = click.option("--include-internal/--no-include-internal
     help="If set then show full stacktrace on error",
 )
 @click.pass_context
-def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, config, set, input, **kwargs):
+def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection, schema, config, set, input, **kwargs):
     """A CLI for interacting with the linkml-store."""
     if not stacktrace:
         sys.tracebacklimit = 0
@@ -158,6 +159,9 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
     client = Client().from_config(config, **kwargs) if config else Client()
     settings = ContextSettings(client=client, database_name=database, collection_name=collection)
     ctx.obj["settings"] = settings
+    if schema:
+        db = settings.database
+        db.set_schema_view(schema)
     if settings.database_name:
         db = client.get_database(database)
     if set:
@@ -534,6 +538,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
+@click.option("--training-collection", type=click.STRING, help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -555,6 +560,7 @@ def infer(
     evaluation_count,
     evaluation_match_function,
     training_test_data_split,
+    training_collection,
     predictor_type,
     target_attribute,
     feature_attributes,
@@ -617,6 +623,7 @@ def infer(
     if model_format:
         model_format = ModelSerialization(model_format)
     if load_model:
+        logger.info(f"Loading predictor from {load_model}")
         predictor = get_inference_engine(predictor_type)
         predictor = type(predictor).load_model(load_model)
     else:
@@ -627,13 +634,18 @@ def infer(
         if training_test_data_split:
             config.train_test_split = training_test_data_split
         predictor = get_inference_engine(predictor_type, config=config)
-
-
+        training_collection_obj = collection
+        if training_collection:
+            training_collection_obj = ctx.obj["settings"].database.get_collection(training_collection)
+        if training_collection_obj:
+            logger.info(f"Using collection: {training_collection_obj.alias} for inference")
+            split = training_test_data_split or (1.0, 0.0)
+            predictor.load_and_split_data(training_collection_obj, split=split)
         predictor.initialize_model()
     if export_model:
         logger.info(f"Exporting model to {export_model} in {model_format}")
         predictor.export_model(export_model, model_format)
-    if not query_obj:
+    if not query_obj and where_clause is None:
         if not export_model and not evaluation_count:
             raise ValueError("Query or evaluate must be specified if not exporting model")
     if evaluation_count:
@@ -651,6 +663,12 @@ def infer(
         result = predictor.derive(query_obj)
         dumped_obj = result.model_dump(exclude_none=True)
         write_output([dumped_obj], output_type, target=output)
+    if where_clause is not None:
+        predicted_objs = []
+        for query_obj in collection.find(where_clause).rows:
+            result = predictor.derive(query_obj)
+            predicted_objs.append(result.predicted_object)
+        write_output(predicted_objs, output_type, target=output)
 
 
 @cli.command()
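
Taken together, the infer command can now train on one collection and emit predictions for every row matched by a where clause. A rough Python equivalent of the new code path, as a sketch only: the import path for get_inference_engine, the "sklearn" engine name, the collection names, and the where clause are all illustrative assumptions.

```python
from linkml_store.api.client import Client
from linkml_store.inference import get_inference_engine  # import path assumed

client = Client()
db = client.attach_database("duckdb:///mydata.ddb")

# Train on a dedicated training collection (mirrors --training-collection).
predictor = get_inference_engine("sklearn")  # engine type name is illustrative
predictor.load_and_split_data(db.get_collection("training_samples"), split=(1.0, 0.0))
predictor.initialize_model()

# Derive predictions for every row matching a where clause, mirroring the
# new batch-prediction loop added to `infer`.
target = db.get_collection("samples")
predicted = [
    predictor.derive(row).predicted_object
    for row in target.find({"status": "pending"}).rows
]
```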
linkml_store-0.2.5/src/linkml_store/inference/implementations/llm_inference_engine.py (new file)

@@ -0,0 +1,152 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar, List, Optional, TextIO, Union
+
+import yaml
+from linkml_store.utils.llm_utils import parse_yaml_payload
+from llm import get_key
+from pydantic import BaseModel
+
+from linkml_store.api.collection import OBJECT, Collection
+from linkml_store.inference.inference_config import Inference, InferenceConfig, LLMConfig
+from linkml_store.inference.inference_engine import InferenceEngine, ModelSerialization
+from linkml_store.utils.object_utils import select_nested
+
+logger = logging.getLogger(__name__)
+
+MAX_ITERATIONS = 5
+DEFAULT_NUM_EXAMPLES = 20
+
+SYSTEM_PROMPT = """
+Your task is to inference the complete YAML
+object output given the YAML object input. I will provide you
+with contextual information, including the schema,
+to help with the inference. You can use the following
+
+You should return ONLY valid YAML in your response.
+"""
+
+
+class TrainedModel(BaseModel, extra="forbid"):
+    index_rows: List[OBJECT]
+    config: Optional[InferenceConfig] = None
+
+
+class LLMInference(Inference):
+    iterations: int = 0
+
+
+@dataclass
+class LLMInferenceEngine(InferenceEngine):
+    """
+    LLM based predictor.
+
+    Unlike the RAG predictor this performs few-shot inference
+    """
+
+    _model: "llm.Model" = None  # noqa: F821
+
+    PERSIST_COLS: ClassVar[List[str]] = [
+        "config",
+    ]
+
+    def __post_init__(self):
+        if not self.config:
+            self.config = InferenceConfig()
+        if not self.config.llm_config:
+            self.config.llm_config = LLMConfig()
+
+    @property
+    def model(self) -> "llm.Model":  # noqa: F821
+        import llm
+
+        if self._model is None:
+            self._model = llm.get_model(self.config.llm_config.model_name)
+            if self._model.needs_key:
+                key = get_key(None, key_alias=self._model.needs_key)
+                self._model.key = key
+
+        return self._model
+
+    def initialize_model(self, **kwargs):
+        logger.info(f"Initializing model {self.model}")
+
+    def object_to_text(self, object: OBJECT) -> str:
+        return yaml.dump(object)
+
+    def _schema_str(self) -> str:
+        db = self.training_data.base_collection.parent
+        from linkml_runtime.dumpers import json_dumper
+        schema_dict = json_dumper.to_dict(db.schema_view.schema)
+        return yaml.dump(schema_dict)
+
+    def derive(self, object: OBJECT, iteration=0, additional_prompt_texts: Optional[List[str]] = None) -> Optional[LLMInference]:
+        import llm
+
+        model: llm.Model = self.model
+        #model_name = self.config.llm_config.model_name
+        #feature_attributes = self.config.feature_attributes
+        target_attributes = self.config.target_attributes
+        query_text = self.object_to_text(object)
+
+        if not target_attributes:
+            target_attributes = [k for k, v in object.items() if v is None or v == ""]
+        #if not feature_attributes:
+        #    feature_attributes = [k for k, v in object.items() if v is not None and v != ""]
+
+        system_prompt = SYSTEM_PROMPT.format(llm_config=self.config.llm_config)
+
+        system_prompt += "\n## SCHEMA:\n\n" + self._schema_str()
+
+        stub = ", ".join([f"{k}: ..." for k in target_attributes])
+        stub = "{" + stub + "}"
+        prompt = (
+            "Provide a YAML object of the form"
+            "```yaml\n"
+            f"{stub}\n"
+            "```\n"
+            "---\nQuery:\n" f"## INCOMPLETE OBJECT:\n{query_text}\n" "## OUTPUT:\n"
+        )
+        logger.info(f"Prompt: {prompt}")
+        response = model.prompt(prompt, system=system_prompt)
+        yaml_str = response.text()
+        logger.info(f"Response: {yaml_str}")
+        predicted_object = parse_yaml_payload(yaml_str, strict=True)
+        predicted_object = {**object, **predicted_object}
+        if self.config.validate_results:
+            base_collection = self.training_data.base_collection
+            errs = list(base_collection.iter_validate_collection([predicted_object]))
+            if errs:
+                print(f"{iteration} // FAILED TO VALIDATE: {yaml_str}")
+                print(f"PARSED: {predicted_object}")
+                print(f"ERRORS: {errs}")
+                if iteration > MAX_ITERATIONS:
+                    raise ValueError(f"Validation errors: {errs}")
+                extra_texts = [
+                    "Make sure results conform to the schema. Previously you provided:\n",
+                    yaml_str,
+                    "\nThis was invalid.\n",
+                    "Validation errors:\n",
+                ] + [self.object_to_text(e) for e in errs]
+                return self.derive(object, iteration=iteration+1, additional_prompt_texts=extra_texts)
+        return LLMInference(predicted_object=predicted_object, iterations=iteration+1, query=object)
+
+
+    def export_model(
+        self, output: Optional[Union[str, Path, TextIO]], model_serialization: ModelSerialization = None, **kwargs
+    ):
+        self.save_model(output)
+
+    def save_model(self, output: Union[str, Path]) -> None:
+        """
+        Save the trained model and related data to a file.
+
+        :param output: Path to save the model
+        """
+        raise NotImplementedError("Does not make sense for this engine")
+
+    @classmethod
+    def load_model(cls, file_path: Union[str, Path]) -> "LLMInferenceEngine":
+        raise NotImplementedError("Does not make sense for this engine")
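
The new LLMInferenceEngine fills in missing attributes of an object by prompting an LLM with the database schema plus the incomplete object, and, when validate_results is set, retries with the validation errors appended. A minimal usage sketch, assuming InferenceConfig and LLMConfig accept these fields as keyword arguments; the handle, collection, model name, and partial object are illustrative:

```python
from linkml_store.api.client import Client
from linkml_store.inference.implementations.llm_inference_engine import LLMInferenceEngine
from linkml_store.inference.inference_config import InferenceConfig, LLMConfig

client = Client()
db = client.attach_database("duckdb:///mydata.ddb")
collection = db.get_collection("persons")

config = InferenceConfig(
    target_attributes=["occupation"],                 # attributes the LLM should fill in
    llm_config=LLMConfig(model_name="gpt-4o-mini"),   # any model known to the `llm` library
)
engine = LLMInferenceEngine(config=config)

# derive() builds the prompt from the schema of the training collection's
# database and merges the parsed YAML answer back into the input object.
engine.load_and_split_data(collection, split=(1.0, 0.0))
engine.initialize_model()
result = engine.derive({"name": "Alice", "occupation": None})
print(result.predicted_object)
```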
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/implementations/rag_inference_engine.py

@@ -20,7 +20,7 @@ DEFAULT_NUM_EXAMPLES = 20
 DEFAULT_MMR_RELEVANCE_FACTOR = 0.8
 
 SYSTEM_PROMPT = """
-You are a {llm_config.role}, your task is to
+You are a {llm_config.role}, your task is to infer the YAML
 object output given the YAML object input. I will provide you
 with a collection of examples that will provide guidance both
 on the desired structure of the response, as well as the kind
@@ -130,23 +130,34 @@ class RAGInferenceEngine(InferenceEngine):
         else:
             if not self.rag_collection.indexers:
                 raise ValueError("RAG collection must have an indexer attached")
+            logger.info(f"Searching {self.rag_collection.alias} for examples for: {query_text}")
             rs = self.rag_collection.search(query_text, limit=num_examples, index_name="llm",
                                             mmr_relevance_factor=mmr_relevance_factor)
             examples = rs.rows
+            logger.info(f"Found {len(examples)} examples")
         if not examples:
             raise ValueError(f"No examples found for {query_text}; size = {self.rag_collection.size()}")
         prompt_clauses = []
-
+        this_feature_attributes = feature_attributes
+        if not this_feature_attributes:
+            this_feature_attributes = list(set(object.keys()) - set(target_attributes))
+        query_obj = select_nested(object, this_feature_attributes)
         query_text = self.object_to_text(query_obj)
         for example in examples:
-
+            this_feature_attributes = feature_attributes
+            if not this_feature_attributes:
+                this_feature_attributes = list(set(example.keys()) - set(target_attributes))
+            if not this_feature_attributes:
+                raise ValueError(f"No feature attributes found in example {example}")
+            input_obj = select_nested(example, this_feature_attributes)
             input_obj_text = self.object_to_text(input_obj)
             if input_obj_text == query_text:
-
-
-
-
-
+                continue
+                #raise ValueError(
+                #    f"Query object {query_text} is the same as example object {input_obj_text}\n"
+                #    "This indicates possible test data leakage\n."
+                #    "TODO: allow an option that allows user to treat this as a basic lookup\n"
+                #)
             output_obj = select_nested(example, target_attributes)
             prompt_clause = (
                 "---\nExample:\n" f"## INPUT:\n{input_obj_text}\n" f"## OUTPUT:\n{self.object_to_text(output_obj)}\n"
@@ -169,7 +180,7 @@ class RAGInferenceEngine(InferenceEngine):
                                     encoding=encoding, token_limit=token_limit,
                                     additional_text=system_prompt)
         logger.info(f"Prompt: {prompt}")
-        response = model.prompt(prompt, system_prompt)
+        response = model.prompt(prompt, system=system_prompt)
         yaml_str = response.text()
         logger.info(f"Response: {yaml_str}")
         predicted_object = self._parse_yaml_payload(yaml_str, strict=True)
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/inference/inference_engine.py

@@ -124,7 +124,7 @@ class InferenceEngine(ABC):
         Load the data and split it into training and testing sets.
 
         :param collection:
-        :param split:
+        :param split: Tuple of training and testing split ratios.
         :param randomize:
         :return:
         """
@@ -136,7 +136,7 @@ class InferenceEngine(ABC):
             self.training_data = CollectionSlice(name="train", base_collection=collection, indices=None)
             self.testing_data = None
             return
-        logger.info(f"Loading and splitting data from collection {collection.alias}")
+        logger.info(f"Loading and splitting data {split} from collection {collection.alias}")
         size = collection.size()
         indices = range(size)
         if randomize:
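
Per the updated docstring, split is a (train, test) ratio pair. A tiny sketch, assuming `engine` is any InferenceEngine subclass and `collection` is a Collection:

```python
# split=(0.7, 0.3): roughly 70% of the rows go to engine.training_data
# and 30% to engine.testing_data (both exposed as CollectionSlice objects).
engine.load_and_split_data(collection, split=(0.7, 0.3))
```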
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/llm_utils.py

@@ -100,3 +100,18 @@ def get_token_limit(model_name: str) -> int:
         if model in model_name:
             return token_limit
     return 4096
+
+
+def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
+    import yaml
+    if "```" in yaml_str:
+        yaml_str = yaml_str.split("```")[1].strip()
+        if yaml_str.startswith("yaml"):
+            yaml_str = yaml_str[4:].strip()
+    try:
+        return yaml.safe_load(yaml_str)
+    except Exception as e:
+        if strict:
+            raise e
+        logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
+        return None
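
parse_yaml_payload strips an optional fenced yaml block from an LLM response before parsing; with strict=False it logs and returns None on malformed YAML instead of raising. A small illustration (the response text is made up; the fence is built by concatenation only to keep this example readable inside a code block):

```python
from linkml_store.utils.llm_utils import parse_yaml_payload

fence = "```"
response_text = f"Here is the object:\n{fence}yaml\nname: Alice\noccupation: engineer\n{fence}\n"

obj = parse_yaml_payload(response_text)
assert obj == {"name": "Alice", "occupation": "engineer"}

# Malformed YAML returns None unless strict=True, which re-raises the parser error.
assert parse_yaml_payload("not: [valid", strict=False) is None
```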
{linkml_store-0.2.4 → linkml_store-0.2.5}/src/linkml_store/utils/object_utils.py

@@ -124,7 +124,7 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
 
     Args:
         data (dict): The input nested dictionary.
-
+        paths (list): A list of selector strings.
 
     Returns:
         dict: A new dictionary with the same structure, but only the selected attributes.
@@ -162,6 +162,8 @@ def select_nested(data: dict, paths: List[Union[str, List[str]]], current_path=N
     if current_path is None:
         current_path = []
     matching_paths = []
+    if not paths:
+        raise ValueError("No paths provided")
     for path in paths:
         if isinstance(path, str):
             path = path.split(".")
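
select_nested copies only the attributes named by dotted selector paths, and with this change it raises a ValueError when no paths are given. A small illustration (the data and paths are made up; the expected result follows from the docstring):

```python
from linkml_store.utils.object_utils import select_nested

data = {"person": {"name": "Alice", "age": 33}, "id": 7, "note": "x"}

# Keep only person.name and the top-level id.
subset = select_nested(data, ["person.name", "id"])
# expected: {"person": {"name": "Alice"}, "id": 7}

select_nested(data, [])  # raises ValueError("No paths provided")
```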