linkml-store 0.2.6__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of linkml-store might be problematic.
- linkml_store/api/client.py +2 -3
- linkml_store/api/collection.py +63 -8
- linkml_store/api/database.py +30 -2
- linkml_store/api/stores/duckdb/duckdb_collection.py +165 -3
- linkml_store/api/stores/duckdb/duckdb_database.py +3 -3
- linkml_store/api/stores/filesystem/__init__.py +1 -1
- linkml_store/api/stores/mongodb/mongodb_collection.py +115 -12
- linkml_store/api/stores/mongodb/mongodb_database.py +2 -1
- linkml_store/api/stores/solr/solr_collection.py +7 -1
- linkml_store/cli.py +201 -20
- linkml_store/index/implementations/llm_indexer.py +14 -6
- linkml_store/index/indexer.py +7 -4
- linkml_store/inference/implementations/llm_inference_engine.py +13 -9
- linkml_store/inference/implementations/rag_inference_engine.py +13 -10
- linkml_store/inference/implementations/sklearn_inference_engine.py +7 -1
- linkml_store/inference/inference_config.py +1 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/format_utils.py +124 -3
- linkml_store/utils/llm_utils.py +3 -1
- linkml_store/utils/pandas_utils.py +1 -1
- linkml_store/utils/sql_utils.py +1 -1
- linkml_store/utils/vector_utils.py +3 -10
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA +3 -1
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/RECORD +28 -26
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/WHEEL +1 -1
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/LICENSE +0 -0
- {linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/entry_points.txt +0 -0
linkml_store/utils/dat_parser.py ADDED

@@ -0,0 +1,95 @@
+from typing import Tuple, Optional, Dict, Any, List
+
+ENTRY = Dict[str, Any]
+
+
+def parse_sib_format(text) -> Tuple[Optional[ENTRY], List[ENTRY]]:
+    """
+    Parse SIB/Swiss-Prot format data into a structured dictionary.
+
+    Args:
+        text (str): The text in SIB/Swiss-Prot format
+
+    Returns:
+        dict: A dictionary with entry IDs as keys and parsed data as values
+    """
+    # Split the text into entries (separated by //)
+    entries = text.split("//\n")
+    header = None
+
+    # Initialize results dictionary
+    results = []
+
+    # Parse each entry
+    for entry in entries:
+        if not entry.strip():
+            continue
+
+        # Initialize dictionary for current entry
+        current_entry = {}
+        current_code = None
+
+        # Process each line
+        for line in entry.strip().split("\n"):
+            if not line.strip():
+                continue
+
+            # Check if this is a new field (starts with a 2-letter code followed by space)
+            if len(line) > 2 and line[2] == " ":
+                current_code = line[0:2]
+                # Remove the code and the following space(s)
+                value = line[3:].strip()
+
+                # Initialize as list if needed for multi-line fields
+                if current_code not in current_entry:
+                    current_entry[current_code] = []
+
+                current_entry[current_code].append(value)
+
+            # Continuation of previous field
+            elif current_code is not None:
+                # Handle continuation lines (typically indented)
+                if current_code == "CC":
+                    # For comments, preserve the indentation
+                    current_entry[current_code].append(line)
+                else:
+                    # For other fields, strip and append
+                    current_entry[current_code].append(line.strip())
+
+        # Combine multiline comments; e.g
+        # -!- ...
+        #     ...
+        # -!- ...
+        ccs = current_entry.get("CC", [])
+        new_ccs = []
+        for cc in ccs:
+            if not cc.startswith("-!-") and new_ccs:
+                new_ccs[-1] += " " + cc
+            else:
+                new_ccs.append(cc)
+        current_entry["CC"] = new_ccs
+        for k, vs in current_entry.items():
+            if k != "CC":
+                combined = "".join(vs)
+                combined = combined.strip()
+                if combined.endswith("."):
+                    combined = combined.split(".")
+                    combined = [c.strip() for c in combined if c.strip()]
+                    if k == "DE":
+                        combined = combined[0]
+                current_entry[k] = combined
+
+        if "ID" in current_entry:
+            results.append(current_entry)
+        else:
+            header = current_entry
+
+    return header, results
+
+
+# Example usage:
+# data = parse_sib_format(text)
+# for entry_id, entry_data in data.items():
+#     print(f"Entry: {entry_id}")
+#     for code, values in entry_data.items():
+#         print(f"  {code}: {values}")
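The trailing usage comment in dat_parser.py still describes an earlier dict-keyed design; as implemented, parse_sib_format returns a (header, entries) tuple in which entries is a list of dicts keyed by two-letter field codes. A minimal sketch of the current call shape (the .dat file name is illustrative only):

    from linkml_store.utils.dat_parser import parse_sib_format

    # "enzyme.dat" is a hypothetical SIB/Swiss-Prot style file
    with open("enzyme.dat") as f:
        header, entries = parse_sib_format(f.read())

    for entry in entries:
        # each entry is keyed by two-letter codes such as "ID", "DE", "CC"
        print(entry.get("ID"), entry.get("DE"))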
linkml_store/utils/enrichment_analyzer.py ADDED

@@ -0,0 +1,217 @@
+import pandas as pd
+import numpy as np
+from linkml_store.api import Collection
+from scipy import stats
+from typing import Dict, List
+from pydantic import BaseModel
+
+
+class EnrichedCategory(BaseModel):
+    """
+    Information about a category enriched in a sample
+    """
+
+    category: str
+    fold_change: float
+    original_p_value: float
+    adjusted_p_value: float
+
+
+from collections import Counter, defaultdict
+
+
+class EnrichmentAnalyzer:
+    def __init__(self, df: pd.DataFrame, sample_key: str, classification_key: str):
+        """
+        Initialize the analyzer with a DataFrame and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            df: DataFrame containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        self.df = df
+        self.sample_key = sample_key
+        self.classification_key = classification_key
+
+        # Precompute global category statistics
+        self.global_stats = self._compute_global_stats()
+
+        # Cache for sample-specific category counts
+        self.sample_cache: Dict[str, Counter] = {}
+
+    @classmethod
+    def from_collection(cls, collection: Collection, sample_key: str, classification_key: str) -> "EnrichmentAnalyzer":
+        """
+        Initialize the analyzer with a Collection and key column names.
+        Precomputes category frequencies for the entire dataset.
+
+        Args:
+            collection: Collection containing the data
+            sample_key: Column name for sample IDs
+            classification_key: Column name for category lists
+        """
+        column_atts = [sample_key, classification_key]
+        results = collection.find(select_cols=column_atts, limit=-1)
+        df = results.rows_dataframe
+        ea = cls(df, sample_key=sample_key, classification_key=classification_key)
+        return ea
+
+    def _compute_global_stats(self) -> Dict[str, int]:
+        """
+        Compute global category frequencies across all samples.
+        Returns a dictionary of category -> count
+        """
+        global_counter = Counter()
+
+        # Flatten all categories and count
+        for categories in self.df[self.classification_key]:
+            if isinstance(categories, list):
+                global_counter.update(categories)
+            else:
+                # Handle case where categories might be a string
+                global_counter.update([categories])
+
+        return global_counter
+
+    @property
+    def sample_ids(self) -> List[str]:
+        df = self.df
+        return df[self.sample_key].unique().tolist()
+
+    def _get_sample_stats(self, sample_id: str) -> Counter:
+        """
+        Get category frequencies for a specific sample.
+        Uses caching to avoid recomputation.
+        """
+        if sample_id in self.sample_cache:
+            return self.sample_cache[sample_id]
+
+        sample_data = self.df[self.df[self.sample_key] == sample_id]
+        if sample_data.empty:
+            raise KeyError(f"Sample ID '{sample_id}' not found")
+        sample_data = sample_data.dropna()
+        # if sample_data.empty:
+        #     raise ValueError(f"Sample ID '{sample_id}' has missing values after dropping NA")
+        counter = Counter()
+
+        for categories in sample_data[self.classification_key]:
+            if isinstance(categories, list):
+                counter.update(categories)
+            else:
+                counter.update([categories])
+
+        self.sample_cache[sample_id] = counter
+        return counter
+
+    def find_enriched_categories(
+        self,
+        sample_id: str,
+        min_occurrences: int = 5,
+        p_value_threshold: float = 0.05,
+        multiple_testing_correction: str = "bh",
+    ) -> List[EnrichedCategory]:
+        """
+        Find categories that are enriched in the given sample.
+
+        Args:
+            sample_id: ID of the sample to analyze
+            min_occurrences: Minimum number of occurrences required for a category
+            p_value_threshold: P-value threshold for significance
+
+        Returns:
+            List of tuples (category, fold_change, p_value) sorted by significance
+        """
+        sample_stats = self._get_sample_stats(sample_id)
+        total_sample_annotations = sum(sample_stats.values())
+        total_global_annotations = sum(self.global_stats.values())
+
+        results = []
+
+        for category, sample_count in sample_stats.items():
+            global_count = self.global_stats[category]
+
+            # Skip rare categories
+            if global_count < min_occurrences:
+                continue
+
+            # Calculate fold change
+            sample_freq = sample_count / total_sample_annotations
+            global_freq = global_count / total_global_annotations
+            fold_change = sample_freq / global_freq if global_freq > 0 else float("inf")
+
+            # Perform Fisher's exact test
+            contingency_table = np.array(
+                [
+                    [sample_count, global_count - sample_count],
+                    [
+                        total_sample_annotations - sample_count,
+                        total_global_annotations - total_sample_annotations - (global_count - sample_count),
+                    ],
+                ]
+            )
+
+            _, p_value = stats.fisher_exact(contingency_table)
+
+            if p_value < p_value_threshold:
+                results.append((category, fold_change, p_value))
+
+        if not results:
+            return results
+
+        # Sort by p-value
+        results.sort(key=lambda x: x[2])
+
+        # Apply multiple testing correction
+        categories, fold_changes, p_values = zip(*results)
+
+        if multiple_testing_correction.lower() == "bonf":
+            # Bonferroni correction
+            n_tests = len(self.global_stats)  # Total number of categories tested
+            adjusted_p_values = [min(1.0, p * n_tests) for p in p_values]
+
+        elif multiple_testing_correction.lower() == "bh":
+            # Benjamini-Hochberg correction
+            n = len(p_values)
+            sorted_indices = np.argsort(p_values)
+            sorted_p_values = np.array(p_values)[sorted_indices]
+
+            # Calculate BH adjusted p-values
+            adjusted_p_values = np.zeros(n)
+            for i, p in enumerate(sorted_p_values):
+                adjusted_p_values[i] = p * n / (i + 1)
+
+            # Ensure monotonicity
+            for i in range(n - 2, -1, -1):
+                adjusted_p_values[i] = min(adjusted_p_values[i], adjusted_p_values[i + 1])
+
+            # Restore original order
+            inverse_indices = np.argsort(sorted_indices)
+            adjusted_p_values = adjusted_p_values[inverse_indices]
+
+            # Ensure we don't exceed 1.0
+            adjusted_p_values = np.minimum(adjusted_p_values, 1.0)
+
+        else:
+            # No correction
+            adjusted_p_values = p_values
+
+        # Filter by adjusted p-value threshold and create final results
+        # Create EnrichedCategory objects
+        final_results = [
+            EnrichedCategory(category=cat, fold_change=fc, original_p_value=p, adjusted_p_value=adj_p)
+            for cat, fc, p, adj_p in zip(categories, fold_changes, p_values, adjusted_p_values)
+            if adj_p < p_value_threshold
+        ]
+
+        # Sort by adjusted p-value
+        final_results.sort(key=lambda x: x.adjusted_p_value)
+        return final_results
+
+
+# Example usage:
+# analyzer = EnrichmentAnalyzer(df, 'sample_id', 'categories')
+# enriched = analyzer.find_enriched_categories('sample1')
+# for category, fold_change, p_value in enriched:
+#     print(f"{category}: {fold_change:.2f}x enrichment (p={p_value:.2e})")
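As with dat_parser.py, the trailing example comment lags the implementation: find_enriched_categories returns EnrichedCategory objects, not (category, fold_change, p_value) tuples. A minimal sketch of the intended flow, using a hypothetical toy DataFrame (too small to yield significant hits, but it shows the call shape):

    import pandas as pd
    from linkml_store.utils.enrichment_analyzer import EnrichmentAnalyzer

    # each row carries a sample ID and a list of category labels
    df = pd.DataFrame(
        [
            {"sample_id": "s1", "categories": ["kinase", "membrane"]},
            {"sample_id": "s1", "categories": ["kinase"]},
            {"sample_id": "s2", "categories": ["membrane"]},
        ]
    )
    analyzer = EnrichmentAnalyzer(df, "sample_id", "categories")
    for ec in analyzer.find_enriched_categories("s1", min_occurrences=1):
        print(f"{ec.category}: {ec.fold_change:.2f}x (adj p={ec.adjusted_p_value:.2e})")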
linkml_store/utils/format_utils.py CHANGED

@@ -1,5 +1,6 @@
 import csv
 import gzip
+import hashlib
 import io
 import json
 import logging
@@ -29,13 +30,24 @@ class Format(Enum):
     JSONL = "jsonl"
     YAML = "yaml"
     YAMLL = "yamll"
+    TOML = "toml"
     TSV = "tsv"
     CSV = "csv"
     XML = "xml"
+    TURTLE = "turtle"
+    RDFXML = "rdfxml"
+    TEXT = "text"
+    TEXTLINES = "textlines"
     OBO = "obo"
+    FASTA = "fasta"
+    GMT = "gmt"
+    DAT = "dat"
+    MARKDOWN = "markdown"
     PKL = "pkl"
     PYTHON = "python"
     PARQUET = "parquet"
+    HDF5 = "hdf5"
+    NETCDF = "netcdf"
     FORMATTED = "formatted"
     TABLE = "table"
     XLSX = "xlsx"
@@ -55,7 +67,12 @@ class Format(Enum):
             ".yamll": cls.YAMLL,
             ".tsv": cls.TSV,
             ".csv": cls.CSV,
+            ".txt": cls.TEXT,
             ".xml": cls.XML,
+            ".owx": cls.XML,
+            ".owl": cls.RDFXML,
+            ".ttl": cls.TURTLE,
+            ".md": cls.MARKDOWN,
             ".py": cls.PYTHON,
             ".parquet": cls.PARQUET,
             ".pq": cls.PARQUET,
@@ -122,12 +139,25 @@ def clean_nested_structure(obj):
     else:
         return clean_pandas_value(obj)

+
 def process_file(
-    f: IO,
+    f: IO,
+    format: Format,
+    expected_type: Optional[Type] = None,
+    header_comment_token: Optional[str] = None,
+    format_options: Optional[Dict[str, Any]] = None,
 ) -> List[Dict[str, Any]]:
     """
     Process a single file and return a list of objects.
+
+    :param f: The file object.
+    :param format: The format of the file.
+    :param expected_type: The expected type of the objects.
+    :param header_comment_token: Token used for header comments to be skipped
+    :return:
     """
+    if format_options is None:
+        format_options = {}
     if format == Format.YAMLL:
         format = Format.YAML
         expected_type = list
@@ -142,6 +172,14 @@ def process_file(
             objs = [obj for obj in objs if obj is not None]
         else:
             objs = yaml.safe_load(f)
+    elif format == Format.TOML:
+        import toml
+
+        objs = toml.load(f)
+        if not isinstance(objs, list):
+            objs = [objs]
+    elif format == Format.TEXTLINES:
+        objs = f.readlines()
     elif format in [Format.TSV, Format.CSV]:
         if header_comment_token:
             while True:
@@ -160,14 +198,82 @@ def process_file(
     elif format == Format.XLSX:
         xls = pd.ExcelFile(f)
         objs = {sheet: clean_nested_structure(xls.parse(sheet).to_dict(orient="records")) for sheet in xls.sheet_names}
+    elif format == Format.TEXT:
+        txt = f.read()
+        objs = [
+            {
+                "name": Path(f.name).name,
+                "path": f.name,
+                "content": txt,
+                "size": len(txt),
+                "lines": txt.count("\n") + 1,
+                "md5": hashlib.md5(txt.encode()).hexdigest(),
+            }
+        ]
+    elif format == Format.GMT:
+        objs = []
+        lib_name = Path(f.name).name
+        for line in f:
+            parts = line.strip().split("\t")
+            desc = parts[1]
+            objs.append(
+                {
+                    "library": lib_name,
+                    "uid": f"{lib_name}.{parts[0]}",
+                    "name": parts[0],
+                    "description": desc if desc else None,
+                    "genes": parts[2:],
+                }
+            )
+    elif format == Format.FASTA:
+        objs = []
+        current_obj = None
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_obj:
+                    objs.append(current_obj)
+                current_obj = {"id": line[1:], "sequence": ""}
+            else:
+                current_obj["sequence"] += line
+        if current_obj:
+            objs.append(current_obj)
     elif format == Format.OBO:
         blocks = split_document(f.read(), "\n\n")
         id_pattern = re.compile(r"id: (\S+)")
+
         def get_id(block):
             m = id_pattern.search(block)
             return m.group(1) if m else None
+
         objs = [{"id": get_id(block), "content": block} for block in blocks]
         objs = [obj for obj in objs if obj["id"]]
+    elif format == Format.DAT:
+        from linkml_store.utils.dat_parser import parse_sib_format
+
+        _, objs = parse_sib_format(f.read())
+    elif format in (Format.RDFXML, Format.TURTLE):
+        import lightrdf
+
+        parser = lightrdf.Parser()
+        objs = []
+        ext_fmt = "rdfxml"
+        if format == Format.TURTLE:
+            ext_fmt = "ttl"
+        bytesio = io.BytesIO(f.read().encode("utf-8"))
+        buffer = io.BufferedReader(bytesio)
+        for s, p, o in parser.parse(buffer, base_iri=None, format=ext_fmt):
+            obj = {
+                "subject": s,
+                "predicate": p,
+                "object": o,
+            }
+            if format_options.get("pivot", False):
+                obj = {
+                    "subject": s,
+                    p: o,
+                }
+            objs.append(obj)
     elif format == Format.PARQUET:
         import pyarrow.parquet as pq

@@ -202,6 +308,7 @@ def load_objects(
     :param compression: The compression type. Supports 'gz' for gzip and 'tgz' for tar.gz.
     :param expected_type: The target type to load the objects into, e.g. list
     :param header_comment_token: Token used for header comments to be skipped
+    :param select_query: JSONPath query to select specific objects from the loaded data.
     :return: A list of dictionaries representing the loaded objects.
     """
     if isinstance(file_path, Path):
@@ -290,7 +397,8 @@ def write_output(


 def render_output(
-    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame
+    data: Union[List[Dict[str, Any]], Dict[str, Any], pd.DataFrame, List[BaseModel]],
+    format: Optional[Union[Format, str]] = Format.YAML,
 ) -> str:
     """
     Render output data in JSON, JSONLines, YAML, CSV, or TSV format.
@@ -323,6 +431,12 @@ def render_output(
     if isinstance(data, pd.DataFrame):
         data = data.to_dict(orient="records")

+    if isinstance(data, BaseModel):
+        data = data.model_dump()
+
+    if data and isinstance(data, list) and isinstance(data[0], BaseModel):
+        data = [d.model_dump() if isinstance(d, BaseModel) else d for d in data]
+
     if isinstance(data, dict) and format in [Format.TSV, Format.CSV]:
         data = [data]

@@ -335,8 +449,15 @@ def render_output(
         return "\n".join(json.dumps(obj) for obj in data)
     elif format == Format.PYTHON:
         return str(data)
+    elif format == Format.MARKDOWN:
+
+        def as_markdown(obj: dict):
+            return "## Object\n\n" + "\n".join([f" * {k}: {v}" for k, v in obj.items()])
+
+        return "\n\n".join([as_markdown(obj) for obj in data]) if isinstance(data, list) else as_markdown(data)
     elif format == Format.TABLE:
         from tabulate import tabulate
+
         return tabulate(pd.DataFrame(data), headers="keys", tablefmt="psql")
     elif format == Format.YAML:
         if isinstance(data, list):
@@ -401,4 +522,4 @@ def split_document(doc: str, delimiter: str):
     :param delimiter: The delimiter.
     :return: The parts of the document.
     """
-    return doc.split(delimiter)
+    return doc.split(delimiter)
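Since the new Format members are dispatched inside process_file, the new readers can be exercised directly. A minimal sketch, assuming local FASTA and Turtle files (file names are illustrative; the Turtle branch needs the optional lightrdf dependency):

    from linkml_store.utils.format_utils import Format, process_file

    # FASTA: one object per record, with "id" and "sequence" keys
    with open("sequences.fasta") as f:
        records = process_file(f, Format.FASTA)

    # Turtle: one object per triple; the "pivot" option keys each triple by its predicate
    with open("ontology.ttl") as f:
        triples = process_file(f, Format.TURTLE, format_options={"pivot": True})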
linkml_store/utils/llm_utils.py CHANGED

@@ -76,6 +76,7 @@ def render_formatted_text(
         return text
     if not values:
         raise ValueError(f"Cannot fit text into token limit: {text_length} > {token_limit}")
+    # remove last element and try again
     return render_formatted_text(render_func, values[0:-1], encoding=encoding, token_limit=token_limit)


@@ -104,6 +105,7 @@ def get_token_limit(model_name: str) -> int:

 def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
     import yaml
+
     if "```" in yaml_str:
         yaml_str = yaml_str.split("```")[1].strip()
         if yaml_str.startswith("yaml"):
@@ -114,4 +116,4 @@ def parse_yaml_payload(yaml_str: str, strict=False) -> Optional[dict]:
         if strict:
             raise e
         logger.error(f"Error parsing YAML: {yaml_str}\n{e}")
-        return None
+        return None
linkml_store/utils/pandas_utils.py CHANGED

@@ -56,7 +56,7 @@ def nested_objects_to_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:


 def facet_summary_to_dataframe_unmelted(
-    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]]
+    facet_summary: Dict[Union[str, Tuple[str, ...]], List[Tuple[Union[str, Tuple[str, ...]], int]]],
 ) -> pd.DataFrame:
     rows = []

linkml_store/utils/sql_utils.py CHANGED

@@ -116,7 +116,7 @@ def facet_count_sql(query: Query, facet_column: Union[str, Tuple[str, ...]], mul
     modified_where = " AND ".join(conditions)

     def make_col_safe(col):
-        return '"' + quoted_name(col, True) + '"' if
+        return '"' + quoted_name(col, True) + '"' if " " in col else col

     if isinstance(facet_column, str):
         facet_column = make_col_safe(facet_column)
linkml_store/utils/vector_utils.py CHANGED

@@ -8,6 +8,7 @@ logger = logging.getLogger(__name__)

 LOL = List[List[float]]

+
 def pairwise_cosine_similarity(vector1: np.array, vector2: np.array) -> float:
     """
     Calculate the cosine similarity between two vectors.
@@ -77,9 +78,7 @@ def top_matches(cosine_similarity_matrix: np.ndarray) -> Tuple[np.ndarray, np.nd
     return top_match_indices, top_match_values


-def top_n_matches(
-    cosine_similarity_matrix: np.ndarray, n: int = 10
-) -> Tuple[np.ndarray, np.ndarray]:
+def top_n_matches(cosine_similarity_matrix: np.ndarray, n: int = 10) -> Tuple[np.ndarray, np.ndarray]:
     # Find the indices that would sort each row in descending order
     sorted_indices = np.argsort(-cosine_similarity_matrix, axis=1)

@@ -136,10 +135,7 @@ def mmr_diversified_search(
             max_sim_to_selected = max(
                 [
                     np.dot(document_vectors[idx], document_vectors[s])
-                    / (
-                        np.linalg.norm(document_vectors[idx])
-                        * np.linalg.norm(document_vectors[s])
-                    )
+                    / (np.linalg.norm(document_vectors[idx]) * np.linalg.norm(document_vectors[s]))
                     for s in selected_indices
                 ]
             )
@@ -160,6 +156,3 @@ def mmr_diversified_search(
         selected_indices.add(best_index)

     return result_indices
-
-
-
{linkml_store-0.2.6.dist-info → linkml_store-0.2.9.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: linkml-store
-Version: 0.2.6
+Version: 0.2.9
 Summary: linkml-store
 License: MIT
 Author: Author 1
@@ -24,6 +24,7 @@ Provides-Extra: map
 Provides-Extra: mongodb
 Provides-Extra: neo4j
 Provides-Extra: pyarrow
+Provides-Extra: rdf
 Provides-Extra: renderer
 Provides-Extra: scipy
 Provides-Extra: tests
@@ -39,6 +40,7 @@ Requires-Dist: h5py ; extra == "h5py"
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
 Requires-Dist: jsonlines (>=4.0.0,<5.0.0)
 Requires-Dist: jsonpatch (>=1.33)
+Requires-Dist: lightrdf ; extra == "rdf"
 Requires-Dist: linkml (>=1.8.0) ; extra == "validation"
 Requires-Dist: linkml-runtime (>=1.8.0)
 Requires-Dist: linkml_map ; extra == "map"