mseep-txtai 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
@@ -0,0 +1,404 @@
|
|
1
|
+
"""
|
2
|
+
Expression module
|
3
|
+
"""
|
4
|
+
|
5
|
+
from .token import Token
|
6
|
+
|
7
|
+
|
8
|
+
class Expression:
    """
    Parses expression statements and runs a set of substitution/formatting rules.
    """

    def __init__(self, resolver, tolist):
        """
        Creates a new expression parser.

        Args:
            resolver: function to call to resolve query column names with database column names
            tolist: outputs expression lists if True, text if False
        """

        # Callable mapping a query column name (and optional alias) to a database column name
        self.resolver = resolver

        # Output mode: list of expression components if True, single text string if False
        self.tolist = tolist

    def __call__(self, tokens, alias=False, aliases=None, similar=None):
        """
        Parses and formats a list of tokens as follows:
            - Replaces query column names with database column names
            - Adds similar query placeholders and extracts similar function parameters
            - Rewrites expression and returns

        Args:
            tokens: input expression
            alias: if True, column aliases should be generated and added to aliases dict
            aliases: dict of generated aliases, if present these tokens should NOT be resolved
            similar: list of similar queries, if present new similar queries are appended to this list

        Returns:
            rewritten clause
        """

        # Processes token expressions and applies a set of transformation rules.
        # Note: a copy of tokens is passed since process mutates the list in place.
        transformed = self.process(list(tokens), alias, aliases, similar)

        # Re-write alias expression and return
        if alias and not self.tolist:
            return self.buildalias(transformed, tokens, aliases)

        # Re-write input expression and return
        return self.buildlist(transformed) if self.tolist is True else self.buildtext(transformed)

    def process(self, tokens, alias, aliases, similar):
        """
        Replaces query column names with database column names, adds similar query placeholders and
        extracts similar function parameters.

        Args:
            tokens: input expression
            alias: if True, column aliases should be generated and added to aliases dict
            aliases: dict of generated aliases, if present these tokens should NOT be resolved
            similar: list of similar queries, if present new similar queries are appended to this list

        Returns:
            transformed tokens
        """

        # Create clause index and token iterator. Iterator skips distinct tokens.
        # The iterator is shared: helper methods (bracket, similar, function, alias, compound)
        # advance it to consume multi-token constructs, and mark consumed entries in tokens as None.
        index, iterator = 0, ((x, token) for x, token in enumerate(tokens) if not Token.isdistinct(token))
        for x, token in iterator:
            # Check if separator, increment clause index
            if Token.isseparator(token):
                index += 1

            # Check if token is a square bracket
            elif Token.isbracket(token):
                # Resolve bracket expression
                self.bracket(iterator, tokens, x)

            # Check if token is a similar function
            elif Token.issimilar(tokens, x, similar):
                # Resolve similar expression
                self.similar(iterator, tokens, x, similar)

            # Check if token is a function
            elif Token.isfunction(tokens, x):
                # Resolve function expression
                self.function(iterator, tokens, token, aliases, similar)

            # Check for alias expression
            elif Token.isalias(tokens, x, alias):
                # Process alias expression
                self.alias(iterator, tokens, x, aliases, index)

            # Check for attribute expression
            elif Token.isattribute(tokens, x):
                # Resolve attribute expression
                self.attribute(tokens, x, aliases)

            # Check for compound expression
            elif Token.iscompound(tokens, x):
                # Resolve compound expression
                self.compound(iterator, tokens, x, aliases, similar)

        # Remove replaced tokens (entries cleared to None by the helpers above)
        return [token for token in tokens if token]

    def buildtext(self, tokens):
        """
        Builds a new expression from tokens. This method applies a set of rules to generate whitespace between tokens.

        Args:
            tokens: input expression

        Returns:
            expression text
        """

        # Rebuild expression
        text = ""
        for token in tokens:
            # Write token with whitespace rules applied
            text += Token.wrapspace(text, token)

        # Remove any leading/trailing whitespace and return
        return text.strip()

    def buildlist(self, tokens):
        """
        Builds a new expression from tokens. This method returns a list of expression components. These components can be joined together
        on commas to form a text expression.

        Args:
            tokens: input expression

        Returns:
            expression list
        """

        # Accumulated parts, tokens of the current part, and open paren/bracket depth counters
        parts, current, parens, brackets = [], [], 0, 0

        for token in tokens:
            # Create new part - only split on top-level commas (not inside parens/brackets)
            if token == "," and not parens and not brackets:
                parts.append(self.buildtext(current))
                current = []
            else:
                # Accumulate tokens
                if token == "(":
                    parens += 1
                elif token == ")":
                    parens -= 1
                elif token == "[":
                    brackets += 1
                elif token == "]":
                    brackets -= 1
                elif Token.issortorder(token):
                    # Sort order keywords (e.g. asc/desc) need a leading space to stay attached correctly
                    token = f" {token}"
                current.append(token)

        # Add last part
        if current:
            parts.append(self.buildtext(current))

        return parts

    def buildalias(self, transformed, tokens, aliases):
        """
        Builds new alias text expression from transformed and input tokens.

        Args:
            transformed: transformed tokens
            tokens: original input tokens
            aliases: dict of column aliases

        Returns:
            alias text expression
        """

        # Convert tokens to expressions
        transformed = self.buildlist(transformed)
        tokens = self.buildlist(tokens)

        expression = []
        for x, token in enumerate(transformed):
            # Only generate an alias for clauses that don't already have one (aliases maps alias -> clause index)
            if x not in aliases.values():
                alias = tokens[x]

                # Strip leading/trailing brackets from alias name that doesn't have operators
                if not any(Token.isoperator(t) for t in alias) and alias[0] in ("[", "(") and alias[-1] in ("]", ")"):
                    alias = alias[1:-1]

                # Strip leading distinct keyword
                values = alias.split()
                if len(values) > 0 and Token.isdistinct(values[0]):
                    alias = " ".join(values[1:])

                # Resolve alias
                token = self.resolver(token, alias)

            expression.append(token)

        # Build alias text expression
        return ", ".join(expression)

    def bracket(self, iterator, tokens, x):
        """
        Consumes a [bracket] expression.

        Args:
            iterator: tokens iterator
            tokens: input tokens
            x: current position
        """

        # Function parameters
        params = []

        # Clear token from stream
        token = tokens[x]
        tokens[x] = None

        # Bracket counter (current token is an open bracket)
        brackets = 1

        # Read until token is an end bracket
        while token and (token != "]" or brackets > 0):
            x, token = next(iterator, (None, None))

            # Increase/decrease bracket counter to support nested brackets
            if token == "[":
                brackets += 1
            elif token == "]":
                brackets -= 1

            # Accumulate tokens (skip only the final closing bracket)
            if token != "]" or brackets > 0:
                params.append(token)

            # Clear token from stream
            tokens[x] = None

        # Set last token to resolved bracket expression
        # NOTE: x is the position of the closing bracket here; the resolved value replaces it
        tokens[x] = self.resolve(self.buildtext(params), None)

    def similar(self, iterator, tokens, x, similar):
        """
        Substitutes a similar() function call with a placeholder that can later be used to add
        embeddings query results as a filter.

        Args:
            iterator: tokens iterator
            tokens: input tokens
            x: current position
            similar: list where similar function call parameters are stored
        """

        # Function parameters
        params = []

        # Clear token from stream
        token = tokens[x]
        tokens[x] = None

        # Read until token is a closing paren
        while token and token != ")":
            x, token = next(iterator, (None, None))
            if token and token not in ["(", ",", ")"]:
                # Strip quotes and accumulate tokens
                params.append(token.replace("'", "").replace('"', ""))

            # Clear token from stream
            tokens[x] = None

        # Add placeholder for embedding similarity results
        # Placeholder index matches position of params in the similar list
        tokens[x] = f"{Token.SIMILAR_TOKEN}{len(similar)}"

        # Save parameters
        similar.append(params)

    def function(self, iterator, tokens, token, aliases, similar):
        """
        Resolves column names within the function's parameters.

        Args:
            iterator: tokens iterator
            tokens: input tokens
            token: current token
            aliases: dict of generated aliases, if present these tokens should NOT be resolved
            similar: list where similar function call parameters are stored
        """

        # Consume function parameters until the closing paren
        while token and token != ")":
            x, token = next(iterator, (None, None))

            # Check if token is a square bracket
            if Token.isbracket(token):
                # Resolve bracket expression
                self.bracket(iterator, tokens, x)

            # Check if token is a similar function
            elif Token.issimilar(tokens, x, similar):
                # Resolve similar expression
                self.similar(iterator, tokens, x, similar)

            # Check if token is a function
            elif Token.isfunction(tokens, x):
                # Resolve function parameters that are functions (recursive for nested calls)
                self.function(iterator, tokens, token, aliases, similar)

            # Check for attribute expression
            elif Token.isattribute(tokens, x):
                # Resolve attributes
                self.attribute(tokens, x, aliases)

            # Check for compound expression
            elif Token.iscompound(tokens, x):
                # Resolve compound expressions
                self.compound(iterator, tokens, x, aliases, similar)

    def alias(self, iterator, tokens, x, aliases, index):
        """
        Reads an alias clause and stores it in aliases.

        Args:
            iterator: tokens iterator
            tokens: input tokens
            x: current position
            aliases: dict where aliases are stored - stores {alias: clause index}
            index: clause index, used to match aliases with columns
        """

        token = tokens[x]

        # If this is an alias token (e.g. AS), get next token, which holds the alias name
        if token in Token.ALIAS:
            x, token = next(iterator, (None, None))

        # Consume tokens until end of stream or a separator is found. Evaluate next token to prevent consuming here.
        while x + 1 < len(tokens) and not Token.isseparator(Token.get(tokens, x + 1)):
            x, token = next(iterator, (None, None))

        # Add normalized alias and clause index
        aliases[Token.normalize(token)] = index

    def attribute(self, tokens, x, aliases):
        """
        Resolves an attribute column name.

        Args:
            tokens: input tokens
            x: current token position
            aliases: dict of generated aliases, if present these tokens should NOT be resolved
        """

        # Resolve attribute expression in place
        tokens[x] = self.resolve(tokens[x], aliases)

    def compound(self, iterator, tokens, x, aliases, similar):
        """
        Resolves column names in a compound expression (left side <operator(s)> right side).

        Args:
            iterator: tokens iterator
            tokens: input tokens
            x: current token position
            aliases: dict of generated aliases, if present these tokens should NOT be resolved
            similar: list where similar function call parameters are stored
        """

        # Resolve left side (left side already had function processing applied through standard loop)
        if Token.iscolumn(tokens[x - 1]):
            tokens[x - 1] = self.resolve(tokens[x - 1], aliases)

        # Consume operator(s), handle both single and compound operators, i.e. column NOT LIKE 1
        token = tokens[x]
        while token and Token.isoperator(token):
            x, token = next(iterator, (None, None))

        # Resolve right side
        if token and Token.iscolumn(token):
            # Need to process functions since it hasn't gone through the standard loop yet
            if Token.isfunction(tokens, x):
                self.function(iterator, tokens, token, aliases, similar)
            else:
                tokens[x] = self.resolve(token, aliases)

    def resolve(self, token, aliases):
        """
        Resolves this token's value if it is not an alias or a bind parameter.

        Args:
            token: token to resolve
            aliases: dict of generated aliases, if present these tokens should NOT be resolved

        Returns:
            resolved token value
        """

        # Check for alias or bind parameter (bind parameters start with ":")
        if (aliases and Token.normalize(token) in aliases) or (token.startswith(":")):
            return token

        return self.resolver(token)