linkml-store 0.2.6__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of linkml-store might be problematic.

Files changed (35)
  1. linkml_store/api/client.py +2 -3
  2. linkml_store/api/collection.py +63 -8
  3. linkml_store/api/database.py +20 -3
  4. linkml_store/api/stores/duckdb/duckdb_collection.py +168 -4
  5. linkml_store/api/stores/duckdb/duckdb_database.py +5 -5
  6. linkml_store/api/stores/filesystem/__init__.py +1 -1
  7. linkml_store/api/stores/filesystem/filesystem_database.py +1 -1
  8. linkml_store/api/stores/mongodb/mongodb_collection.py +132 -15
  9. linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
  10. linkml_store/api/stores/neo4j/neo4j_database.py +1 -1
  11. linkml_store/api/stores/solr/solr_collection.py +107 -18
  12. linkml_store/cli.py +201 -21
  13. linkml_store/index/implementations/llm_indexer.py +13 -6
  14. linkml_store/index/indexer.py +9 -5
  15. linkml_store/inference/implementations/llm_inference_engine.py +15 -13
  16. linkml_store/inference/implementations/rag_inference_engine.py +13 -10
  17. linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
  18. linkml_store/inference/inference_config.py +2 -1
  19. linkml_store/inference/inference_engine.py +1 -1
  20. linkml_store/plotting/__init__.py +5 -0
  21. linkml_store/plotting/cli.py +172 -0
  22. linkml_store/plotting/heatmap.py +356 -0
  23. linkml_store/utils/dat_parser.py +95 -0
  24. linkml_store/utils/enrichment_analyzer.py +217 -0
  25. linkml_store/utils/format_utils.py +124 -3
  26. linkml_store/utils/llm_utils.py +4 -2
  27. linkml_store/utils/object_utils.py +9 -3
  28. linkml_store/utils/pandas_utils.py +1 -1
  29. linkml_store/utils/sql_utils.py +1 -1
  30. linkml_store/utils/vector_utils.py +3 -10
  31. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10.dist-info}/METADATA +3 -1
  32. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10.dist-info}/RECORD +35 -30
  33. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10.dist-info}/WHEEL +1 -1
  34. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10.dist-info}/LICENSE +0 -0
  35. {linkml_store-0.2.6.dist-info → linkml_store-0.2.10.dist-info}/entry_points.txt +0 -0
linkml_store/api/stores/neo4j/neo4j_database.py CHANGED
@@ -27,7 +27,7 @@ class Neo4jDatabase(Database):
         if handle is None:
             handle = "bolt://localhost:7687/neo4j"
         if handle.startswith("neo4j:"):
-            handle = handle.replace("neo4j:", "bolt:")
+            handle = handle.replace("neo4j:", "bolt:", 1)
         super().__init__(handle=handle, **kwargs)
 
     @property
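
Note on the change above: the added count argument to str.replace limits the rewrite to the scheme prefix. A generic illustration of that Python behaviour (not code from the package):

    s = "neo4j:one neo4j:two"
    s.replace("neo4j:", "bolt:")      # 'bolt:one bolt:two' (every occurrence)
    s.replace("neo4j:", "bolt:", 1)   # 'bolt:one neo4j:two' (first occurrence only)
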
linkml_store/api/stores/solr/solr_collection.py CHANGED
@@ -2,7 +2,7 @@
 
 import logging
 from copy import copy
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Tuple
 
 import requests
 
@@ -56,32 +56,121 @@ class SolrCollection(Collection):
         response.raise_for_status()
 
         data = response.json()
+        logger.debug(f"Response: {data}")
         num_rows = data["response"]["numFound"]
         rows = data["response"]["docs"]
 
         return QueryResult(query=query, num_rows=num_rows, rows=rows)
 
     def query_facets(
-        self, where: Optional[Dict] = None, facet_columns: List[str] = None, facet_limit=DEFAULT_FACET_LIMIT, **kwargs
-    ) -> Dict[str, Dict[str, int]]:
+        self,
+        where: Optional[Dict] = None,
+        facet_columns: List[Union[str, Tuple[str, ...]]] = None,
+        facet_limit=DEFAULT_FACET_LIMIT,
+        facet_min_count: int = 1,
+        **kwargs,
+    ) -> Dict[Union[str, Tuple[str, ...]], List[Tuple[Any, int]]]:
+        """
+        Query facet counts for fields or field combinations.
+
+        :param where: Filter conditions
+        :param facet_columns: List of fields to facet on. Elements can be:
+            - Simple strings for single field facets
+            - Tuples of strings for field combinations (pivot facets)
+        :param facet_limit: Maximum number of facet values to return
+        :param facet_min_count: Minimum count for facet values to be included
+        :return: Dictionary mapping fields or field tuples to lists of (value, count) tuples
+        """
         solr_query = self._build_solr_query(where)
-        solr_query["facet"] = "true"
-        solr_query["facet.field"] = facet_columns
-        solr_query["facet.limit"] = facet_limit
-
-        logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
-
-        response = requests.get(f"{self._collection_base}/select", params=solr_query)
-        response.raise_for_status()
-
-        data = response.json()
-        facet_counts = data["facet_counts"]["facet_fields"]
-
+
+        # Separate single fields and tuple fields
+        single_fields = []
+        tuple_fields = []
+
+        if facet_columns:
+            for field in facet_columns:
+                if isinstance(field, str):
+                    single_fields.append(field)
+                elif isinstance(field, tuple):
+                    tuple_fields.append(field)
+
+        # Process regular facets
         results = {}
-        for facet_field, counts in facet_counts.items():
-            results[facet_field] = list(zip(counts[::2], counts[1::2]))
-
+        if single_fields:
+            solr_query["facet"] = "true"
+            solr_query["facet.field"] = single_fields
+            solr_query["facet.limit"] = facet_limit
+            solr_query["facet.mincount"] = facet_min_count
+
+            logger.info(f"Querying Solr collection {self.alias} for facets with query: {solr_query}")
+            response = requests.get(f"{self._collection_base}/select", params=solr_query)
+            response.raise_for_status()
+
+            data = response.json()
+            facet_counts = data["facet_counts"]["facet_fields"]
+
+            for facet_field, counts in facet_counts.items():
+                results[facet_field] = list(zip(counts[::2], counts[1::2]))
+
+        # Process pivot facets for tuple fields
+        if tuple_fields:
+            # TODO: Add a warning if Solr < 4.0, when this was introduced
+            for field_tuple in tuple_fields:
+                # Create a query for this specific field tuple
+                pivot_query = self._build_solr_query(where)
+                pivot_query["facet"] = "true"
+
+                # Create pivot facet
+                field_str = ','.join(field_tuple)
+                pivot_query["facet.pivot"] = field_str
+                pivot_query["facet.pivot.mincount"] = facet_min_count
+                pivot_query["facet.limit"] = facet_limit
+
+                logger.info(f"Querying Solr collection {self.alias} for pivot facets with query: {pivot_query}")
+                response = requests.get(f"{self._collection_base}/select", params=pivot_query)
+                response.raise_for_status()
+
+                data = response.json()
+                pivot_facets = data.get("facet_counts", {}).get("facet_pivot", {})
+
+                # Process pivot facets into the same format as MongoDB results
+                field_str = ','.join(field_tuple)
+                pivot_data = pivot_facets.get(field_str, [])
+
+                # Build a list of tuples (field values, count)
+                pivot_results = []
+                self._process_pivot_facets(pivot_data, [], pivot_results, field_tuple)
+
+                results[field_tuple] = pivot_results
+
         return results
+
+    def _process_pivot_facets(self, pivot_data, current_values, results, field_tuple):
+        """
+        Recursively process pivot facet results to extract combinations of field values.
+
+        :param pivot_data: The pivot facet data from Solr
+        :param current_values: The current path of values in the recursion
+        :param results: The result list to populate
+        :param field_tuple: The original field tuple for reference
+        """
+        for item in pivot_data:
+            # Add the current field value
+            value = item.get("value")
+            count = item.get("count", 0)
+
+            # Update the current path with this value
+            values = current_values + [value]
+
+            # If we have all the fields from the tuple, add a result
+            if len(values) == len(field_tuple):
+                # Create a tuple of values corresponding to the field tuple
+                results.append((tuple(values), count))
+
+            # Process child pivot fields recursively
+            pivot = item.get("pivot", [])
+            if pivot and len(values) < len(field_tuple):
+                self._process_pivot_facets(pivot, values, results, field_tuple)
 
     def _build_solr_query(
         self, query: Union[Query, Dict], search_term="*:*", extra: Optional[Dict] = None
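
To make the new query_facets contract concrete, here is a hedged usage sketch against a hypothetical SolrCollection; the where clause and field names are made up, but the return shape follows the docstring above, and the nested structure in the comments is the standard Solr facet_pivot response that _process_pivot_facets walks:

    results = collection.query_facets(
        where={"type": "gene"},
        facet_columns=["species", ("species", "chromosome")],
        facet_limit=10,
    )
    # results["species"]                 -> [("Homo sapiens", 120), ("Mus musculus", 98), ...]
    # results[("species", "chromosome")] -> [(("Homo sapiens", "1"), 14), (("Homo sapiens", "2"), 11), ...]
    #
    # A facet_pivot entry such as
    #   {"value": "Homo sapiens", "count": 120,
    #    "pivot": [{"value": "1", "count": 14}, {"value": "2", "count": 11}]}
    # is flattened by _process_pivot_facets into
    #   [(("Homo sapiens", "1"), 14), (("Homo sapiens", "2"), 11)]
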
linkml_store/cli.py CHANGED
@@ -3,7 +3,7 @@ import sys
 import warnings
 from collections import defaultdict
 from pathlib import Path
-from typing import Optional, Tuple, Any
+from typing import Any, Optional, Tuple
 
 import click
 import yaml
@@ -37,6 +37,11 @@ index_type_option = click.option(
     show_default=True,
     help="Type of index to create. Values: simple, llm",
 )
+json_select_query_option = click.option(
+    "--json-select-query",
+    "-J",
+    help="JSON SELECT query",
+)
 
 logger = logging.getLogger(__name__)
 
@@ -136,7 +141,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         logger.setLevel(logging.ERROR)
     ctx.ensure_object(dict)
     if input:
-        database = "duckdb" # default: store in duckdb
+        database = "duckdb"  # default: store in duckdb
         if input.startswith("http"):
             parts = input.split("/")
             collection = parts[-1]
@@ -144,8 +149,7 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         else:
             stem = underscore(Path(input).stem)
             collection = stem
-        logger.info(f"Using input file: {input}, "
-                    f"default storage is {database} and collection is {collection}")
+        logger.info(f"Using input file: {input}, " f"default storage is {database} and collection is {collection}")
         config = ClientConfig(databases={"duckdb": {"collections": {stem: {"source": {"local_path": input}}}}})
     if config is None and DEFAULT_LOCAL_CONF_PATH.exists():
         config = DEFAULT_LOCAL_CONF_PATH
@@ -185,13 +189,25 @@ def cli(ctx, verbose: int, quiet: bool, stacktrace: bool, database, collection,
         settings.collection_name = collection.alias
 
 
+@cli.command()
+@click.pass_context
+def drop(ctx):
+    """
+    Drop database and all its collections.
+    """
+    database = ctx.obj["settings"].database
+    database.drop()
+
+
 @cli.command()
 @click.argument("files", type=click.Path(), nargs=-1)
 @click.option("--replace/--no-replace", default=False, show_default=True, help="Replace existing objects")
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
+@click.option("--source-field", help="If provided, inject file path source as this field")
+@json_select_query_option
 @click.pass_context
-def insert(ctx, files, replace, object, format):
+def insert(ctx, files, replace, object, format, source_field, json_select_query):
     """Insert objects from files (JSON, YAML, TSV) into the specified collection.
 
     Using a configuration:
@@ -207,11 +223,17 @@ def insert(ctx, files, replace, object, format):
         raise ValueError("Collection must be specified.")
     if not files and not object:
         files = ["-"]
+    load_objects_args = {}
+    if json_select_query:
+        load_objects_args["select_query"] = json_select_query
     for file_path in files:
         if format:
-            objects = load_objects(file_path, format=format)
+            objects = load_objects(file_path, format=format, **load_objects_args)
         else:
-            objects = load_objects(file_path)
+            objects = load_objects(file_path, **load_objects_args)
+        if source_field:
+            for obj in objects:
+                obj[source_field] = str(file_path)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into collection '{collection.alias}'.")
         if replace:
             collection.replace(objects)
@@ -222,6 +244,8 @@ def insert(ctx, files, replace, object, format):
     for object_str in object:
         logger.info(f"Parsing: {object_str}")
         objects = yaml.safe_load(object_str)
+        if not isinstance(objects, list):
+            objects = [objects]
         if replace:
             collection.replace(objects)
         else:
@@ -234,21 +258,41 @@
 @click.argument("files", type=click.Path(exists=True), nargs=-1)
 @click.option("--format", "-f", type=format_choice, help="Input format")
 @click.option("--object", "-i", multiple=True, help="Input object as YAML")
+@json_select_query_option
 @click.pass_context
-def store(ctx, files, object, format):
+def store(ctx, files, object, format, json_select_query):
     """Store objects from files (JSON, YAML, TSV) into the database.
 
-    Note: this is similar to insert, but a collection does not need to be specified
+    Note: this is similar to insert, but a collection does not need to be specified.
+
+    For example, assume that `my-collection` is a dict with multiple keys,
+    and we want one collection per key:
+
+        linkml-store -d my.ddb store my-collection.yaml
+
+    Loading JSON (e.g OBO-JSON), with a --json-select-query:
+
+        linkml-store -d cl.ddb store -J graphs cl.obo.json
+
+    Loading XML (e.g OWL-XML), with a --json-select-query:
+
+        linkml-store -d cl.ddb store -J Ontology cl.owx
+
+    Because the XML uses a top level Ontology, with multiple
+
     """
     settings = ctx.obj["settings"]
     db = settings.database
     if not files and not object:
         files = ["-"]
+    load_objects_args = {}
+    if json_select_query:
+        load_objects_args["select_query"] = json_select_query
     for file_path in files:
         if format:
-            objects = load_objects(file_path, format=format)
+            objects = load_objects(file_path, format=format, **load_objects_args)
         else:
-            objects = load_objects(file_path)
+            objects = load_objects(file_path, **load_objects_args)
         logger.info(f"Inserting {len(objects)} objects from {file_path} into database '{db}'.")
         for obj in objects:
             db.store(obj)
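
The --json-select-query value is passed straight through to load_objects as select_query, so the `-J graphs` example above amounts to selecting a sub-structure of the parsed document before storing. A sketch with made-up data (the exact selection semantics live in format_utils.load_objects):

    # Minimal OBO-JSON-like wrapper; illustrative only
    doc = {"graphs": [{"id": "g1", "nodes": []}, {"id": "g2", "nodes": []}]}

    # Without -J the single wrapper dict would be stored as one object;
    # with -J graphs the graph objects under that key are stored instead,
    # roughly: objects = load_objects("cl.obo.json", select_query="graphs")
    objects = doc["graphs"]
    assert [o["id"] for o in objects] == ["g1", "g2"]
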
@@ -422,15 +466,32 @@ def list_collections(ctx, **kwargs):
 
 @cli.command()
 @click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
-@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
+@click.option("--facet-min-count", "-M", type=click.INT, help="Minimum count for a facet to be included")
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
-@click.option("--columns", "-S", help="Columns to facet on")
+@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
 @click.option("--wide/--no-wide", "-U/--no-U", default=False, show_default=True, help="Wide table")
 @click.pass_context
-def fq(ctx, where, limit, columns, output_type, wide, output):
+def fq(ctx, where, limit, columns, output_type, wide, output, **kwargs):
     """
-    Query facets from the specified collection.
+    Query facet counts from the specified collection.
+
+    Assuming your .linkml.yaml includes an entry mapping `phenopackets` to a
+    mongodb
+
+    Facet counts (all columns)
+
+        linkml-store -d phenopackets fq
+
+    Nested columns:
+
+        linkml-store -d phenopackets fq subject.timeAtLastEncounter.age
+
+    Compound keys:
+
+        linkml-store -d phenopackets fq subject.sex+subject.timeAtLastEncounter.age
+
     """
     collection = ctx.obj["settings"].collection
     where_clause = yaml.safe_load(where) if where else None
@@ -439,7 +500,7 @@ def fq(ctx, where, limit, columns, output_type, wide, output):
         columns = [col.strip() for col in columns]
         columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
     logger.info(f"Faceting on columns: {columns}")
-    results = collection.query_facets(where_clause, facet_columns=columns, limit=limit)
+    results = collection.query_facets(where_clause, facet_columns=columns, facet_limit=limit, **kwargs)
     logger.info(f"Facet results: {results}")
 
     def _untuple(key):
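
For reference, the -S parsing above turns a comma-separated column spec into a mix of plain names and tuples, which is exactly what the reworked query_facets accepts. A small sketch, reusing field names from the phenopackets examples:

    spec = "subject.sex,subject.sex+subject.timeAtLastEncounter.age"
    cols = [c.strip() for c in spec.split(",")]
    cols = [(tuple(c.split("+")) if "+" in c else c) for c in cols]
    # cols == ["subject.sex", ("subject.sex", "subject.timeAtLastEncounter.age")]
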
@@ -471,6 +532,56 @@ def fq(ctx, where, limit, columns, output_type, wide, output):
         click.echo(output_data)
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return per facet")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--columns", "-S", help="Columns to facet on. Comma-separated, join combined facets with +")
+@click.pass_context
+def groupby(ctx, where, limit, columns, output_type, output, **kwargs):
+    """
+    Group by columns in the specified collection.
+
+    Assume a simple triple model:
+
+        linkml-store -d cl.ddb -c triple insert cl.owl
+
+    This makes a flat subject/predicate/object table
+
+    This can be grouped, e.g by subject:
+
+        linkml-store -d cl.ddb -c triple groupby -s subject
+
+    Or subject and predicate:
+
+        linkml-store -d cl.ddb -c triple groupby -s '[subject,predicate]'
+
+    """
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    columns = columns.split(",") if columns else None
+    if columns:
+        columns = [col.strip() for col in columns]
+        columns = [(tuple(col.split("+")) if "+" in col else col) for col in columns]
+    logger.info(f"Group by: {columns}")
+    result = collection.group_by(
+        group_by_fields=columns,
+        where_clause=where_clause,
+        agg_map={},
+        limit=limit,
+        **kwargs,
+    )
+    logger.info(f"Group by results: {result}")
+    output_data = render_output(result.rows, output_type)
+    if output:
+        with open(output, "w") as f:
+            f.write(output_data)
+        click.echo(f"Query results saved to {output}")
+    else:
+        click.echo(output_data)
+
+
 def _get_index(index_type=None, **kwargs) -> Indexer:
     if index_type is None or index_type == "simple":
         return SimpleIndexer(name="test", **kwargs)
@@ -519,10 +630,12 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
             value_key = tuple([row.get(att) for att in value_atts])
             pivoted[index_key][column_key] = value_key
     pivoted_objs = []
+
     def detuple(t: Tuple) -> Any:
         if len(t) == 1:
             return t[0]
         return str(t)
+
     for index_key, data in pivoted.items():
         obj = {att: key for att, key in zip(index_atts, index_key)}
         for column_key, value_key in data.items():
@@ -531,6 +644,57 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
     write_output(pivoted_objs, output_type, target=output)
 
 
+@cli.command()
+@click.option("--where", "-w", type=click.STRING, help="WHERE clause for the query")
+@click.option("--limit", "-l", type=click.INT, help="Maximum number of results to return")
+@click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
+@click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option("--sample-field", "-I", help="Field to use as the sample identifier")
+@click.option("--classification-field", "-L", help="Field to use as for classification")
+@click.option(
+    "--p-value-threshold",
+    "-P",
+    type=click.FLOAT,
+    default=0.05,
+    show_default=True,
+    help="P-value threshold for enrichment",
+)
+@click.option(
+    "--multiple-testing-correction",
+    "-M",
+    type=click.STRING,
+    default="bh",
+    show_default=True,
+    help="Multiple test correction method",
+)
+@click.argument("samples", type=click.STRING, nargs=-1)
+@click.pass_context
+def enrichment(ctx, where, limit, output_type, output, sample_field, classification_field, samples, **kwargs):
+    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer
+
+    collection = ctx.obj["settings"].collection
+    where_clause = yaml.safe_load(where) if where else None
+    column_atts = [sample_field, classification_field]
+    results = collection.find(where_clause, select_cols=column_atts, limit=-1)
+    df = results.rows_dataframe
+    ea = EnrichmentAnalyzer(df, sample_key=sample_field, classification_key=classification_field)
+    if not samples:
+        samples = df[sample_field].unique()
+    enrichment_results = []
+    for sample in samples:
+        enriched = ea.find_enriched_categories(sample, **kwargs)
+        for e in enriched:
+            obj = {"sample": sample, **e.model_dump()}
+            enrichment_results.append(obj)
+    output_data = render_output(enrichment_results, output_type)
+    if output:
+        with open(output, "w") as f:
+            f.write(output_data)
+        click.echo(f"Search results saved to {output}")
+    else:
+        click.echo(output_data)
+
+
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default=Format.YAML.value, help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
@@ -538,7 +702,7 @@ def pivot(ctx, where, limit, index, columns, values, output_type, output):
 @click.option(
     "--feature-attributes", "-F", type=click.STRING, help="Feature attributes for inference (comma separated)"
 )
-@click.option("--training-collection", type=click.STRING,help="Collection to use for training")
+@click.option("--training-collection", type=click.STRING, help="Collection to use for training")
 @click.option("--inference-config-file", "-Y", type=click.Path(), help="Path to inference configuration file")
 @click.option("--export-model", "-E", type=click.Path(), help="Export model to file")
 @click.option("--load-model", "-L", type=click.Path(), help="Load model from file")
@@ -753,12 +917,28 @@ def indexes(ctx):
 @cli.command()
 @click.option("--output-type", "-O", type=format_choice, default="json", help="Output format")
 @click.option("--output", "-o", type=click.Path(), help="Output file path")
+@click.option(
+    "--collection-only/--no-collection-only",
+    default=False,
+    show_default=True,
+    help="Only validate specified collection",
+)
+@click.option(
+    "--ensure-referential-integrity/--no-ensure-referential-integrity",
+    default=True,
+    show_default=True,
+    help="Ensure referential integrity",
+)
 @click.pass_context
-def validate(ctx, output_type, output):
+def validate(ctx, output_type, output, collection_only, **kwargs):
     """Validate objects in the specified collection."""
-    collection = ctx.obj["settings"].collection
-    logger.info(f"Validating collection {collection.alias}")
-    validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection()]
+    if collection_only:
+        collection = ctx.obj["settings"].collection
+        logger.info(f"Validating collection {collection.alias}")
+        validation_results = [json_dumper.to_dict(x) for x in collection.iter_validate_collection(**kwargs)]
+    else:
+        db = ctx.obj["settings"].database
+        validation_results = [json_dumper.to_dict(x) for x in db.validate_database(**kwargs)]
     output_data = render_output(validation_results, output_type)
     if output:
         with open(output, "w") as f:
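
With these options, validation now defaults to checking the whole database (including referential-integrity checks) rather than a single collection. A hedged pair of invocations, reusing a database alias from the examples above:

    linkml-store -d phenopackets validate
    linkml-store -d phenopackets -c phenopackets validate --collection-only --no-ensure-referential-integrity
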
linkml_store/index/implementations/llm_indexer.py CHANGED
@@ -11,6 +11,7 @@ from linkml_store.utils.llm_utils import get_token_limit, render_formatted_text
 if TYPE_CHECKING:
     import llm
 
+CHUNK_SIZE = 1000
 
 logger = logging.getLogger(__name__)
 
@@ -25,7 +26,7 @@ class LLMIndexer(Indexer):
     >>> vector = indexer.text_to_vector("hello")
     """
 
-    embedding_model_name: str = "ada-002"
+    embedding_model_name: str = "text-embedding-ada-002"
     _embedding_model: "llm.EmbeddingModel" = None
     cached_embeddings_database: str = None
     cached_embeddings_collection: str = None
@@ -52,7 +53,9 @@
         """
         return self.texts_to_vectors([text], cache=cache, **kwargs)[0]
 
-    def texts_to_vectors(self, texts: List[str], cache: bool = None, **kwargs) -> List[INDEX_ITEM]:
+    def texts_to_vectors(
+        self, texts: List[str], cache: bool = None, token_limit_penalty=0, **kwargs
+    ) -> List[INDEX_ITEM]:
         """
         Use LLM to embed.
 
@@ -60,18 +63,22 @@
         >>> vectors = indexer.texts_to_vectors(["hello", "goodbye"])
 
         :param texts:
+        :param cache:
+        :param token_limit_penalty:
         :return:
         """
         from tiktoken import encoding_for_model
+
         logging.info(f"Converting {len(texts)} texts to vectors")
         model = self.embedding_model
         # TODO: make this more accurate
-        token_limit = get_token_limit(model.model_id) - 200
-        encoding = encoding_for_model("gpt-4o")
+        token_limit = get_token_limit(model.model_id) - token_limit_penalty
+        logging.info(f"Token limit for {model.model_id}: {token_limit}")
+        encoding = encoding_for_model(self.embedding_model_name)
 
         def truncate_text(text: str) -> str:
             # split into tokens every 1000 chars:
-            parts = [text[i : i + 1000] for i in range(0, len(text), 1000)]
+            parts = [text[i : i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
             truncated = render_formatted_text(
                 lambda x: "".join(x),
                 parts,
@@ -140,5 +147,5 @@
             embeddings_collection.commit()
         else:
             logger.info(f"Embedding {len(texts)} texts")
-            embeddings = model.embed_multi(texts)
+            embeddings = list(model.embed_multi(texts, batch_size=1))
         return [np.array(v, dtype=float) for v in embeddings]
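
The token-limit handling above leans on tiktoken. A small standalone sketch of the same idea (count tokens for the embedding model and clip to a budget), using only tiktoken calls that exist as written; the budget value is made up, and this is not the package's render_formatted_text logic:

    import tiktoken

    enc = tiktoken.encoding_for_model("text-embedding-ada-002")  # resolves to cl100k_base

    def truncate_to_tokens(text: str, token_budget: int = 8000) -> str:
        # Encode, clip to the budget, and decode back to text
        tokens = enc.encode(text)
        if len(tokens) <= token_budget:
            return text
        return enc.decode(tokens[:token_budget])
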
linkml_store/index/indexer.py CHANGED
@@ -3,9 +3,10 @@ from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import numpy as np
-from linkml_store.utils.vector_utils import pairwise_cosine_similarity, mmr_diversified_search
 from pydantic import BaseModel
 
+from linkml_store.utils.vector_utils import mmr_diversified_search, pairwise_cosine_similarity
+
 INDEX_ITEM = np.ndarray
 
 logger = logging.getLogger(__name__)
@@ -154,8 +155,11 @@
         return str(obj)
 
     def search(
-        self, query: str, vectors: List[Tuple[str, INDEX_ITEM]], limit: Optional[int] = None,
-        mmr_relevance_factor: Optional[float] = None
+        self,
+        query: str,
+        vectors: List[Tuple[str, INDEX_ITEM]],
+        limit: Optional[int] = None,
+        mmr_relevance_factor: Optional[float] = None,
     ) -> List[Tuple[float, Any]]:
         """
         Use the indexer to search against a database of vectors.
@@ -175,8 +179,8 @@
         vlist = [v for _, v in vectors]
         idlist = [id for id, _ in vectors]
         sorted_indices = mmr_diversified_search(
-            query_vector, vlist,
-            relevance_factor=mmr_relevance_factor, top_n=limit)
+            query_vector, vlist, relevance_factor=mmr_relevance_factor, top_n=limit
+        )
         results = []
         # TODO: this is inefficient when limit is high
         for i in range(limit):
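
mmr_diversified_search itself is not shown in this diff; for orientation, maximal marginal relevance (which it is named after) greedily picks the candidate that best balances relevance to the query against similarity to what has already been selected. A generic sketch of that heuristic (not the vector_utils implementation):

    import numpy as np

    def mmr_sketch(query_vec, candidate_vecs, relevance_factor=0.8, top_n=5):
        # Greedy MMR: at each step maximize
        #   relevance_factor * sim(query, c) - (1 - relevance_factor) * max sim(c, selected)
        def cos(a, b):
            return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

        remaining, selected = list(range(len(candidate_vecs))), []
        while remaining and len(selected) < top_n:
            def score(i):
                rel = cos(query_vec, candidate_vecs[i])
                div = max((cos(candidate_vecs[i], candidate_vecs[j]) for j in selected), default=0.0)
                return relevance_factor * rel - (1 - relevance_factor) * div
            best = max(remaining, key=score)
            selected.append(best)
            remaining.remove(best)
        return selected  # candidate indices, ordered relevant-but-diverse first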