mseep_txtai-9.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0

txtai/vectors/dense/factory.py
@@ -0,0 +1,121 @@
+"""
+Factory module
+"""
+
+from ...util import Resolver
+
+from .external import External
+from .huggingface import HFVectors
+from .litellm import LiteLLM
+from .llama import LlamaCpp
+from .m2v import Model2Vec
+from .sbert import STVectors
+from .words import WordVectors
+
+
+class VectorsFactory:
+    """
+    Methods to create dense vector models.
+    """
+
+    @staticmethod
+    def create(config, scoring=None, models=None):
+        """
+        Create a Vectors model instance.
+
+        Args:
+            config: vector configuration
+            scoring: scoring instance
+            models: models cache
+
+        Returns:
+            Vectors
+        """
+
+        # Determine vector method
+        method = VectorsFactory.method(config)
+
+        # External vectors
+        if method == "external":
+            return External(config, scoring, models)
+
+        # LiteLLM vectors
+        if method == "litellm":
+            return LiteLLM(config, scoring, models)
+
+        # llama.cpp vectors
+        if method == "llama.cpp":
+            return LlamaCpp(config, scoring, models)
+
+        # Model2Vec vectors
+        if method == "model2vec":
+            return Model2Vec(config, scoring, models)
+
+        # Sentence Transformers vectors
+        if method == "sentence-transformers":
+            return STVectors(config, scoring, models) if config and config.get("path") else None
+
+        # Word vectors
+        if method == "words":
+            return WordVectors(config, scoring, models)
+
+        # Transformers vectors
+        if HFVectors.ismethod(method):
+            return HFVectors(config, scoring, models) if config and config.get("path") else None
+
+        # Resolve custom method
+        return VectorsFactory.resolve(method, config, scoring, models) if method else None
+
+    @staticmethod
+    def method(config):
+        """
+        Get or derive the vector method.
+
+        Args:
+            config: vector configuration
+
+        Returns:
+            vector method
+        """
+
+        # Determine vector method
+        method = config.get("method")
+        path = config.get("path")
+
+        # Infer method from path, if blank
+        if not method:
+            if path:
+                if LiteLLM.ismodel(path):
+                    method = "litellm"
+                elif LlamaCpp.ismodel(path):
+                    method = "llama.cpp"
+                elif Model2Vec.ismodel(path):
+                    method = "model2vec"
+                elif WordVectors.ismodel(path):
+                    method = "words"
+                else:
+                    method = "transformers"
+            elif config.get("transform"):
+                method = "external"
+
+        return method
+
+    @staticmethod
+    def resolve(backend, config, scoring, models):
+        """
+        Attempt to resolve a custom backend.

+        Args:
+            backend: backend class
+            config: vector configuration
+            scoring: scoring instance
+            models: models cache
+
+        Returns:
+            Vectors
+        """
+
+        try:
+            return Resolver()(backend)(config, scoring, models)
+        except Exception as e:
+            raise ImportError(f"Unable to resolve vectors backend: '{backend}'") from e
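
The factory first derives a method (explicitly configured or inferred from the model path), then dispatches to the matching backend. A minimal usage sketch, assuming txtai is installed; the model path is illustrative:

```python
from txtai.vectors import VectorsFactory

# No explicit method: the factory probes the path. A .gguf file maps to
# llama.cpp, a LiteLLM provider id to litellm, a config.json with a
# model2vec/staticvectors model_type to those backends; anything else
# falls through to "transformers".
config = {"path": "sentence-transformers/all-MiniLM-L6-v2"}
print(VectorsFactory.method(config))  # "transformers"

# Build the vectors model and encode a small batch
vectors = VectorsFactory.create(config)
print(vectors.encode(["hello world"]).shape)
```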

txtai/vectors/dense/huggingface.py
@@ -0,0 +1,44 @@
+"""
+Hugging Face module
+"""
+
+from ...models import Models, PoolingFactory
+
+from ..base import Vectors
+
+
+class HFVectors(Vectors):
+    """
+    Builds vectors using the Hugging Face transformers library.
+    """
+
+    @staticmethod
+    def ismethod(method):
+        """
+        Checks if this method uses local transformers-based models.
+
+        Args:
+            method: input method
+
+        Returns:
+            True if this is a local transformers-based model, False otherwise
+        """
+
+        return method in ("transformers", "pooling", "clspooling", "meanpooling")
+
+    def loadmodel(self, path):
+        # Build embeddings with transformers pooling
+        return PoolingFactory.create(
+            {
+                "method": self.config.get("method"),
+                "path": path,
+                "device": Models.deviceid(self.config.get("gpu", True)),
+                "tokenizer": self.config.get("tokenizer"),
+                "maxlength": self.config.get("maxlength"),
+                "modelargs": self.config.get("vectors", {}),
+            }
+        )
+
+    def encode(self, data, category=None):
+        # Encode data using vectors model
+        return self.model.encode(data, batch=self.encodebatch, category=category)
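
HFVectors delegates model construction to PoolingFactory, so the configured method selects the pooling strategy. A hedged sketch forcing mean pooling; the model path and settings are illustrative:

```python
from txtai.vectors import VectorsFactory

config = {
    "method": "meanpooling",                           # mean pooling over token embeddings
    "path": "sentence-transformers/all-MiniLM-L6-v2",  # illustrative model
    "maxlength": 256,                                  # truncate inputs to 256 tokens
    "gpu": False,                                      # run on CPU
}

vectors = VectorsFactory.create(config)
print(vectors.encode(["mean pooled embedding"]).shape)
```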

txtai/vectors/dense/litellm.py
@@ -0,0 +1,86 @@
+"""
+LiteLLM module
+"""
+
+import numpy as np
+
+from transformers.utils import cached_file
+
+# Conditional import
+try:
+    import litellm as api
+
+    LITELLM = True
+except ImportError:
+    LITELLM = False
+
+from ..base import Vectors
+
+
+class LiteLLM(Vectors):
+    """
+    Builds vectors using an external embeddings API via LiteLLM.
+    """
+
+    @staticmethod
+    def ismodel(path):
+        """
+        Checks if path is a LiteLLM model.
+
+        Args:
+            path: input path
+
+        Returns:
+            True if this is a LiteLLM model, False otherwise
+        """
+
+        # pylint: disable=W0702
+        if isinstance(path, str) and LITELLM:
+            debug = api.suppress_debug_info
+            try:
+                # Suppress debug messages for this test
+                api.suppress_debug_info = True
+                return api.get_llm_provider(path) and not LiteLLM.ishub(path)
+            except:
+                return False
+            finally:
+                # Restore debug info value to original value
+                api.suppress_debug_info = debug
+
+        return False
+
+    @staticmethod
+    def ishub(path):
+        """
+        Checks if path is available on the HF Hub.
+
+        Args:
+            path: input path
+
+        Returns:
+            True if this is a model on the HF Hub
+        """
+
+        # pylint: disable=W0702
+        try:
+            return cached_file(path_or_repo_id=path, filename="config.json") is not None if "/" in path else False
+        except:
+            return False
+
+    def __init__(self, config, scoring, models):
+        # Check before parent constructor since it calls loadmodel
+        if not LITELLM:
+            raise ImportError('LiteLLM is not available - install "vectors" extra to enable')
+
+        super().__init__(config, scoring, models)
+
+    def loadmodel(self, path):
+        return None
+
+    def encode(self, data, category=None):
+        # Call external embeddings API using LiteLLM
+        # Batching is handled server-side
+        response = api.embedding(model=self.config.get("path"), input=data, **self.config.get("vectors", {}))
+
+        # Read response into a NumPy array
+        return np.array([x["embedding"] for x in response.data], dtype=np.float32)
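
Since batching happens server-side, encode sends the full input list to the provider in a single api.embedding call. A sketch, assuming a LiteLLM-recognized OpenAI model id and an API key in the environment; both are placeholders:

```python
import os

from txtai.vectors import VectorsFactory

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder credential

config = {
    "path": "text-embedding-3-small",  # provider model id resolved by LiteLLM
    "vectors": {"dimensions": 256},    # extra kwargs forwarded to api.embedding
}

vectors = VectorsFactory.create(config)
print(vectors.encode(["remote embedding"]).shape)  # expect (1, 256)
```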

txtai/vectors/dense/llama.py
@@ -0,0 +1,84 @@
+"""
+Llama module
+"""
+
+import os
+
+import numpy as np
+
+from huggingface_hub import hf_hub_download
+
+# Conditional import
+try:
+    from llama_cpp import Llama
+
+    LLAMA_CPP = True
+except ImportError:
+    LLAMA_CPP = False
+
+from ..base import Vectors
+
+
+class LlamaCpp(Vectors):
+    """
+    Builds vectors using llama.cpp.
+    """
+
+    @staticmethod
+    def ismodel(path):
+        """
+        Checks if path is a llama.cpp model.
+
+        Args:
+            path: input path
+
+        Returns:
+            True if this is a llama.cpp model, False otherwise
+        """
+
+        return isinstance(path, str) and path.lower().endswith(".gguf")
+
+    def __init__(self, config, scoring, models):
+        # Check before parent constructor since it calls loadmodel
+        if not LLAMA_CPP:
+            raise ImportError('llama.cpp is not available - install "vectors" extra to enable')
+
+        super().__init__(config, scoring, models)
+
+    def loadmodel(self, path):
+        # Check if this is a local path, otherwise download from the HF Hub
+        path = path if os.path.exists(path) else self.download(path)
+
+        # Additional model arguments
+        modelargs = self.config.get("vectors", {})
+
+        # Default GPU layers if not already set
+        modelargs["n_gpu_layers"] = modelargs.get("n_gpu_layers", -1 if self.config.get("gpu", os.environ.get("LLAMA_NO_METAL") != "1") else 0)
+
+        # Create llama.cpp instance
+        return Llama(path, n_ctx=0, verbose=modelargs.pop("verbose", False), embedding=True, **modelargs)
+
+    def encode(self, data, category=None):
+        # Generate embeddings and return as a NumPy array
+        # llama-cpp-python has its own batching built in via the n_batch parameter
+        return np.array(self.model.embed(data), dtype=np.float32)
+
+    def download(self, path):
+        """
+        Downloads path from the Hugging Face Hub.
+
+        Args:
+            path: full model path
+
+        Returns:
+            local cached model path
+        """
+
+        # Split into parts
+        parts = path.split("/")
+
+        # Calculate repo id split
+        repo = 2 if len(parts) > 2 else 1
+
+        # Download and cache file
+        return hf_hub_download(repo_id="/".join(parts[:repo]), filename="/".join(parts[repo:]))
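
download() treats the first two path segments as the Hub repo id and the remainder as the filename. A sketch with an assumed GGUF repo and filename (not verified against the Hub):

```python
from txtai.vectors import VectorsFactory

config = {
    # repo id: nomic-ai/nomic-embed-text-v1.5-GGUF, filename: the .gguf file
    "path": "nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
    "vectors": {"n_gpu_layers": 0},  # override the all-layers GPU default
}

vectors = VectorsFactory.create(config)
print(vectors.encode(["gguf embedding"]).shape)
```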

txtai/vectors/dense/m2v.py
@@ -0,0 +1,67 @@
+"""
+Model2Vec module
+"""
+
+import json
+
+from huggingface_hub.errors import HFValidationError
+from transformers.utils import cached_file
+
+# Conditional import
+try:
+    from model2vec import StaticModel
+
+    MODEL2VEC = True
+except ImportError:
+    MODEL2VEC = False
+
+from ..base import Vectors
+
+
+class Model2Vec(Vectors):
+    """
+    Builds vectors using Model2Vec.
+    """
+
+    @staticmethod
+    def ismodel(path):
+        """
+        Checks if path is a Model2Vec model.
+
+        Args:
+            path: input path
+
+        Returns:
+            True if this is a Model2Vec model, False otherwise
+        """
+
+        try:
+            # Download file and parse JSON
+            path = cached_file(path_or_repo_id=path, filename="config.json")
+            if path:
+                with open(path, encoding="utf-8") as f:
+                    config = json.load(f)
+                    return config.get("model_type") == "model2vec"
+
+        # Ignore this error - invalid repo or directory
+        except (HFValidationError, OSError):
+            pass
+
+        return False
+
+    def __init__(self, config, scoring, models):
+        # Check before parent constructor since it calls loadmodel
+        if not MODEL2VEC:
+            raise ImportError('Model2Vec is not available - install "vectors" extra to enable')
+
+        super().__init__(config, scoring, models)
+
+    def loadmodel(self, path):
+        return StaticModel.from_pretrained(path)
+
+    def encode(self, data, category=None):
+        # Additional model arguments
+        modelargs = self.config.get("vectors", {})
+
+        # Encode data
+        return self.model.encode(data, batch_size=self.encodebatch, **modelargs)
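
Detection reads model_type from the repo's config.json, so no explicit method is needed. A sketch with an illustrative Model2Vec model path:

```python
from txtai.vectors import VectorsFactory

config = {"path": "minishlab/potion-base-8M"}  # illustrative static model

vectors = VectorsFactory.create(config)
print(vectors.encode(["static embedding"]).shape)
```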

txtai/vectors/dense/sbert.py
@@ -0,0 +1,92 @@
+"""
+Sentence Transformers module
+"""
+
+# Conditional import
+try:
+    from sentence_transformers import SentenceTransformer
+
+    SENTENCE_TRANSFORMERS = True
+except ImportError:
+    SENTENCE_TRANSFORMERS = False
+
+from ...models import Models
+
+from ..base import Vectors
+
+
+class STVectors(Vectors):
+    """
+    Builds vectors using sentence-transformers (aka SBERT).
+    """
+
+    def __init__(self, config, scoring, models):
+        # Check before parent constructor since it calls loadmodel
+        if not SENTENCE_TRANSFORMERS:
+            raise ImportError('sentence-transformers is not available - install "vectors" extra to enable')
+
+        # Pool parameter created here since loadmodel is called from parent constructor
+        self.pool = None
+
+        super().__init__(config, scoring, models)
+
+    def loadmodel(self, path):
+        # Get target device
+        gpu, pool = self.config.get("gpu", True), False
+
+        # Default mode uses a single GPU. Setting to all spawns a process per GPU.
+        if isinstance(gpu, str) and gpu == "all":
+            # Get number of accelerator devices available
+            devices = Models.acceleratorcount()
+
+            # Enable multiprocessing pooling only when multiple devices are available
+            gpu, pool = devices <= 1, devices > 1
+
+        # Tensor device id
+        deviceid = Models.deviceid(gpu)
+
+        # Additional model arguments
+        modelargs = self.config.get("vectors", {})
+
+        # Load sentence-transformers encoder
+        model = self.loadencoder(path, device=Models.device(deviceid), **modelargs)
+
+        # Start process pool for multiple GPUs
+        if pool:
+            self.pool = model.start_multi_process_pool()
+
+        # Return model
+        return model
+
+    def encode(self, data, category=None):
+        # Get encode method based on input category
+        encode = self.model.encode_query if category == "query" else self.model.encode_document if category == "data" else self.model.encode
+
+        # Additional encoding arguments
+        encodeargs = self.config.get("encodeargs", {})
+
+        # Encode with sentence-transformers encoder
+        return encode(data, pool=self.pool, batch_size=self.encodebatch, **encodeargs)
+
+    def close(self):
+        # Close pool before model is closed in parent method
+        if self.pool:
+            self.model.stop_multi_process_pool(self.pool)
+            self.pool = None
+
+        super().close()
+
+    def loadencoder(self, path, device, **kwargs):
+        """
+        Loads the embeddings encoder model from path.
+
+        Args:
+            path: model path
+            device: tensor device
+            kwargs: additional keyword args
+
+        Returns:
+            embeddings encoder
+        """
+
+        return SentenceTransformer(path, device=device, **kwargs)
|
|
1
|
+
"""
|
2
|
+
Word Vectors module
|
3
|
+
"""
|
4
|
+
|
5
|
+
import json
|
6
|
+
import logging
|
7
|
+
import os
|
8
|
+
import tempfile
|
9
|
+
|
10
|
+
from multiprocessing import Pool
|
11
|
+
|
12
|
+
import numpy as np
|
13
|
+
|
14
|
+
from huggingface_hub.errors import HFValidationError
|
15
|
+
from transformers.utils import cached_file
|
16
|
+
|
17
|
+
# Conditional import
|
18
|
+
try:
|
19
|
+
from staticvectors import Database, StaticVectors
|
20
|
+
|
21
|
+
STATICVECTORS = True
|
22
|
+
except ImportError:
|
23
|
+
STATICVECTORS = False
|
24
|
+
|
25
|
+
from ...pipeline import Tokenizer
|
26
|
+
|
27
|
+
from ..base import Vectors
|
28
|
+
|
29
|
+
# Logging configuration
|
30
|
+
logger = logging.getLogger(__name__)
|
31
|
+
|
32
|
+
# Multiprocessing helper methods
|
33
|
+
# pylint: disable=W0603
|
34
|
+
PARAMETERS, VECTORS = None, None
|
35
|
+
|
36
|
+
|
37
|
+
def create(config, scoring):
|
38
|
+
"""
|
39
|
+
Multiprocessing helper method. Creates a global embeddings object to be accessed in a new subprocess.
|
40
|
+
|
41
|
+
Args:
|
42
|
+
config: vector configuration
|
43
|
+
scoring: scoring instance
|
44
|
+
"""
|
45
|
+
|
46
|
+
global PARAMETERS
|
47
|
+
global VECTORS
|
48
|
+
|
49
|
+
# Store model parameters for lazy loading
|
50
|
+
PARAMETERS, VECTORS = (config, scoring, None), None
|
51
|
+
|
52
|
+
|
53
|
+
def transform(document):
|
54
|
+
"""
|
55
|
+
Multiprocessing helper method. Transforms document into an embeddings vector.
|
56
|
+
|
57
|
+
Args:
|
58
|
+
document: (id, data, tags)
|
59
|
+
|
60
|
+
Returns:
|
61
|
+
(id, embedding)
|
62
|
+
"""
|
63
|
+
|
64
|
+
# Lazy load vectors model
|
65
|
+
global VECTORS
|
66
|
+
if not VECTORS:
|
67
|
+
VECTORS = WordVectors(*PARAMETERS)
|
68
|
+
|
69
|
+
return (document[0], VECTORS.transform(document))
|
70
|
+
|
71
|
+
|
72
|
+
class WordVectors(Vectors):
|
73
|
+
"""
|
74
|
+
Builds vectors using weighted word embeddings.
|
75
|
+
"""
|
76
|
+
|
77
|
+
@staticmethod
|
78
|
+
def ismodel(path):
|
79
|
+
"""
|
80
|
+
Checks if path is a WordVectors model.
|
81
|
+
|
82
|
+
Args:
|
83
|
+
path: input path
|
84
|
+
|
85
|
+
Returns:
|
86
|
+
True if this is a WordVectors model, False otherwise
|
87
|
+
"""
|
88
|
+
|
89
|
+
# Check if this is a SQLite database
|
90
|
+
if WordVectors.isdatabase(path):
|
91
|
+
return True
|
92
|
+
|
93
|
+
try:
|
94
|
+
# Download file and parse JSON
|
95
|
+
path = cached_file(path_or_repo_id=path, filename="config.json")
|
96
|
+
if path:
|
97
|
+
with open(path, encoding="utf-8") as f:
|
98
|
+
config = json.load(f)
|
99
|
+
return config.get("model_type") == "staticvectors"
|
100
|
+
|
101
|
+
# Ignore this error - invalid repo or directory
|
102
|
+
except (HFValidationError, OSError):
|
103
|
+
pass
|
104
|
+
|
105
|
+
return False
|
106
|
+
|
107
|
+
@staticmethod
|
108
|
+
def isdatabase(path):
|
109
|
+
"""
|
110
|
+
Checks if this is a SQLite database file which is the file format used for word vectors databases.
|
111
|
+
|
112
|
+
Args:
|
113
|
+
path: path to check
|
114
|
+
|
115
|
+
Returns:
|
116
|
+
True if this is a SQLite database
|
117
|
+
"""
|
118
|
+
|
119
|
+
return isinstance(path, str) and STATICVECTORS and Database.isdatabase(path)
|
120
|
+
|
121
|
+
def __init__(self, config, scoring, models):
|
122
|
+
# Check before parent constructor since it calls loadmodel
|
123
|
+
if not STATICVECTORS:
|
124
|
+
raise ImportError('staticvectors is not available - install "vectors" extra to enable')
|
125
|
+
|
126
|
+
super().__init__(config, scoring, models)
|
127
|
+
|
128
|
+
def loadmodel(self, path):
|
129
|
+
return StaticVectors(path)
|
130
|
+
|
131
|
+
def encode(self, data, category=None):
|
132
|
+
# Iterate over each data element, tokenize (if necessary) and build an aggregated embeddings vector
|
133
|
+
embeddings = []
|
134
|
+
for tokens in data:
|
135
|
+
# Convert to tokens, if necessary. If tokenized list is empty, use input string.
|
136
|
+
if isinstance(tokens, str):
|
137
|
+
tokenlist = Tokenizer.tokenize(tokens)
|
138
|
+
tokens = tokenlist if tokenlist else [tokens]
|
139
|
+
|
140
|
+
# Generate weights for each vector using a scoring method
|
141
|
+
weights = self.scoring.weights(tokens) if self.scoring else None
|
142
|
+
|
143
|
+
# pylint: disable=E1133
|
144
|
+
if weights and [x for x in weights if x > 0]:
|
145
|
+
# Build weighted average embeddings vector. Create weights array as float32 to match embeddings precision.
|
146
|
+
embedding = np.average(self.lookup(tokens), weights=np.array(weights, dtype=np.float32), axis=0)
|
147
|
+
else:
|
148
|
+
# If no weights, use mean
|
149
|
+
embedding = np.mean(self.lookup(tokens), axis=0)
|
150
|
+
|
151
|
+
embeddings.append(embedding)
|
152
|
+
|
153
|
+
return np.array(embeddings, dtype=np.float32)
|
154
|
+
|
155
|
+
def index(self, documents, batchsize=500, checkpoint=None):
|
156
|
+
# Derive number of parallel processes
|
157
|
+
parallel = self.config.get("parallel", True)
|
158
|
+
parallel = os.cpu_count() if parallel and isinstance(parallel, bool) else int(parallel)
|
159
|
+
|
160
|
+
# Use default single process indexing logic
|
161
|
+
if not parallel:
|
162
|
+
return super().index(documents, batchsize)
|
163
|
+
|
164
|
+
# Customize indexing logic with multiprocessing pool to efficiently build vectors
|
165
|
+
ids, dimensions, batches, stream = [], None, 0, None
|
166
|
+
|
167
|
+
# Shared objects with Pool
|
168
|
+
args = (self.config, self.scoring)
|
169
|
+
|
170
|
+
# Convert all documents to embedding arrays, stream embeddings to disk to control memory usage
|
171
|
+
with Pool(parallel, initializer=create, initargs=args) as pool:
|
172
|
+
with tempfile.NamedTemporaryFile(mode="wb", suffix=".npy", delete=False) as output:
|
173
|
+
stream = output.name
|
174
|
+
embeddings = []
|
175
|
+
for uid, embedding in pool.imap(transform, documents, self.encodebatch):
|
176
|
+
if not dimensions:
|
177
|
+
# Set number of dimensions for embeddings
|
178
|
+
dimensions = embedding.shape[0]
|
179
|
+
|
180
|
+
ids.append(uid)
|
181
|
+
embeddings.append(embedding)
|
182
|
+
|
183
|
+
if len(embeddings) == batchsize:
|
184
|
+
np.save(output, np.array(embeddings, dtype=np.float32), allow_pickle=False)
|
185
|
+
batches += 1
|
186
|
+
|
187
|
+
embeddings = []
|
188
|
+
|
189
|
+
# Final embeddings batch
|
190
|
+
if embeddings:
|
191
|
+
np.save(output, np.array(embeddings, dtype=np.float32), allow_pickle=False)
|
192
|
+
batches += 1
|
193
|
+
|
194
|
+
return (ids, dimensions, batches, stream)
|
195
|
+
|
196
|
+
def lookup(self, tokens):
|
197
|
+
"""
|
198
|
+
Queries word vectors for given list of input tokens.
|
199
|
+
|
200
|
+
Args:
|
201
|
+
tokens: list of tokens to query
|
202
|
+
|
203
|
+
Returns:
|
204
|
+
word vectors array
|
205
|
+
"""
|
206
|
+
|
207
|
+
return self.model.embeddings(tokens)
|
208
|
+
|
209
|
+
def tokens(self, data):
|
210
|
+
# Skip tokenization rules
|
211
|
+
return data
|
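
index() fans document transforms out to a multiprocessing pool and streams batches of vectors to a temporary .npy file. A sketch; the model name is an assumption and the backend requires the staticvectors package:

```python
from txtai.vectors import VectorsFactory

config = {
    "method": "words",
    "path": "neuml/glove-6B",  # assumed pretrained static word vectors model
    "parallel": 4,             # worker processes used by index()
}

vectors = VectorsFactory.create(config)

# Each string is tokenized and its token vectors are averaged, weighted by
# scoring when a scoring instance is passed to the constructor
print(vectors.encode(["the quick brown fox"]).shape)
```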