dstklib-1.0.0-py3-none-any.whl
- dstk/__init__.py +12 -0
- dstk/collocations.py +121 -0
- dstk/count_models.py +112 -0
- dstk/geometric_distance.py +107 -0
- dstk/lib_types/__init__.py +9 -0
- dstk/lib_types/dstk_types.py +26 -0
- dstk/lib_types/fasttext_types.py +1 -0
- dstk/lib_types/gensim_types.py +1 -0
- dstk/lib_types/matplotlib_types.py +4 -0
- dstk/lib_types/nltk_types.py +1 -0
- dstk/lib_types/numpy_types.py +2 -0
- dstk/lib_types/pandas_types.py +1 -0
- dstk/lib_types/sklearn_types.py +1 -0
- dstk/lib_types/spacy_types.py +6 -0
- dstk/matrix_base.py +113 -0
- dstk/pipeline_tools.py +27 -0
- dstk/pipelines.py +114 -0
- dstk/plot_embeddings.py +240 -0
- dstk/predict_models.py +189 -0
- dstk/text_matrix_builder.py +87 -0
- dstk/text_processor.py +450 -0
- dstk/weight_matrix.py +71 -0
- dstk/workflow_tools.py +257 -0
- dstklib-1.0.0.dist-info/LICENSE +674 -0
- dstklib-1.0.0.dist-info/METADATA +360 -0
- dstklib-1.0.0.dist-info/RECORD +28 -0
- dstklib-1.0.0.dist-info/WHEEL +5 -0
- dstklib-1.0.0.dist-info/top_level.txt +1 -0
dstk/__init__.py
ADDED
@@ -0,0 +1,12 @@
from .count_models import *
from .geometric_distance import *
from .plot_embeddings import *
from .predict_models import *
from .collocations import *
from .text_matrix_builder import *
from .text_processor import *
from .weight_matrix import *
from .matrix_base import *
from .workflow_tools import *
from .pipeline_tools import *
from .pipelines import *
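A usage sketch of the flat namespace these star imports produce (assuming the wheel is installed and no submodule restricts its exports with __all__):

# Classes defined in the submodules are reachable directly on the package.
import dstk

coll = dstk.Collocations(["the", "cat", "sat", "on", "the", "mat"])
models = dstk.CountModels()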
dstk/collocations.py
ADDED
@@ -0,0 +1,121 @@
from collections import Counter
from dataclasses import dataclass
import matplotlib.pyplot as plt
from .workflow_tools import workflow, requires, WorkflowManager

from .lib_types import BarContainer, Collocate

STAGES = [
    "start",       # Before any processing
    "collocates",  # Manipulation of collocates
    "count",       # Operations dealing with the counts of the words appearing around the target word
    "end"          # End of the workflow. After this stage the user must call result to continue with the analysis
]

@dataclass
class Ngrams:
    collocates: list[tuple[str, ...]]
    bigrams: list[Collocate]

class Collocations(WorkflowManager):
    """
    Extracts n-grams for a target word within a context window (either directed or undirected). Counts and plots the terms that co-occur with the target.

    :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
    """

    _start: list[str]
    _end: BarContainer

    def __init__(self, tokens: list[str] | None = None):
        """
        Initializes Collocations with the given attributes.
        """

        super().__init__()

        # Stages

        self._collocates: Ngrams
        self._count: Counter[str]

        self._set_workflow(input_arg=tokens)

    @requires(stages=["start"])
    @workflow(input_arg="tokens", input_process="_start", output_process="_collocates", next_stage="collocates")
    def extract_ngrams(self, *, tokens: list[str], target_word: str, window_size: tuple[int, int], directed: bool = False) -> Ngrams:
        """
        Extracts both the context words of the target collocation, returned as tuples whose length corresponds to the specified window_size, and the collocations of the target word, in either a directed or undirected manner.

        :param tokens: A list of tokenized words (e.g. from a text or corpus) in which to search for the target word and its collocates.
        :param target_word: Target word whose collocations are to be identified.
        :param window_size: Context window represented as a tuple (left, right) of the number of words to include to the left and right of the target word.
        :param directed: If True, the position of collocates relative to the target word is considered (i.e., direction matters); if False, direction is ignored. Defaults to False.
        """

        collocates: list[tuple[str, ...]] = []
        bigrams: list[Collocate] = []

        for index, word in enumerate(tokens):
            if word == target_word:
                start: int = max(0, index - window_size[0])
                end: int = min(len(tokens), index + window_size[1] + 1)

                left_context: list[str] = tokens[start:index]
                right_context: list[str] = tokens[index + 1:end]

                context: list[str] = left_context + right_context

                collocates.append(tuple(context))

                if directed:
                    bigrams.extend([(word, ("L", target_word)) for word in left_context] + [(word, ("R", target_word)) for word in right_context])
                else:
                    bigrams.extend([(word, target_word) for word in context])

        return Ngrams(collocates, bigrams)

    @requires(stages=["collocates"])
    @workflow(input_arg="collocates", input_attrs={"collocates": "collocates"}, input_process="_collocates", output_process="_count", next_stage="count")
    def count_collocates(self, *, collocates: list[tuple[str, ...]]) -> Counter[str]:
        """
        Counts the collocates of the target word.

        :param collocates: A list of collocates to count.
        """

        all_words: list[str] = [word for collocation in collocates for word in collocation]
        word_counts: Counter[str] = Counter(all_words)

        return word_counts

    @requires(stages=["count"])
    @workflow(input_arg="word_counts", input_process="_count", output_process="_end", next_stage="end")
    def plot(self, *, word_counts: Counter[str], size: int = 10, show: bool = True, path: str | None = None) -> BarContainer:
        """
        Plots the counts of the collocates.

        :param word_counts: A Counter object with the counts of each word.
        :param size: The number of most common collocates to plot. Defaults to 10.
        :param show: If True, shows the plot. Defaults to True.
        :param path: If provided, saves the plot to the specified path. Defaults to None.
        """

        counts: list[tuple[str, int]] = word_counts.most_common(size)

        words: tuple[str, ...]
        values: tuple[int, ...]
        words, values = zip(*counts)

        bars: BarContainer = plt.bar(words, values)
        plt.xlabel("Words")
        plt.ylabel("Counts")

        if path:
            plt.savefig(path)

        if show:
            plt.show()

        return bars
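Taken together, the three methods form a small pipeline: extract the windowed n-grams, count the context words, then plot the top counts. A minimal usage sketch with a hypothetical token list (assumes the @workflow decorators forward each stored stage result into the next call, so the per-call inputs can be omitted):

from dstk.collocations import Collocations

tokens = ["the", "cat", "sat", "on", "the", "mat", "the", "cat", "slept"]

coll = Collocations(tokens)

# start -> collocates: windows of one word on each side of "cat".
coll.extract_ngrams(target_word="cat", window_size=(1, 1))

# collocates -> count: assumed to read the stored Ngrams.collocates.
coll.count_collocates()

# count -> end: plot the five most frequent collocates without opening a window.
coll.plot(size=5, show=False)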
dstk/count_models.py
ADDED
@@ -0,0 +1,112 @@
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from .workflow_tools import requires, workflow, WorkflowManager
from .matrix_base import MatrixRepresentation, accept_matrix_representation, matrix_to_dataframe

from .lib_types import ndarray, DataFrame

STAGES = [
    "start",       # Before any transformation to the co-occurrence matrix is applied
    "embeddings",  # Result of the embeddings
    "end"          # Embeddings transformed to a dataframe
]

class CountModels(WorkflowManager):
    """
    Generates word embeddings by applying dimensionality reduction techniques, such as SVD and PCA, to a co-occurrence matrix.

    :param co_ocurrence_matrix: A co-occurrence matrix from which embeddings will be generated.
    """

    _start: ndarray | DataFrame
    _end: DataFrame

    def __init__(self, co_ocurrence_matrix: DataFrame | None = None):
        """
        Initializes CountModels with the given attributes.
        """
        super().__init__()

        self._embeddings: MatrixRepresentation

        self._set_workflow(input_arg=co_ocurrence_matrix)

    @requires(stages=["start"])
    @workflow(input_arg="matrix", input_process="_start", output_process="_embeddings", next_stage="embeddings")
    @accept_matrix_representation()
    def scale_matrix(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Scales the input matrix to have zero mean and unit variance for each feature.

        This method applies standardization using scikit-learn's StandardScaler, which transforms the data such that each column (feature) has a mean of 0 and a standard deviation of 1.

        :param matrix: The input data to scale.
        :param kwargs: Additional keyword arguments to pass to sklearn's StandardScaler. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        scaler: StandardScaler = StandardScaler(**kwargs)
        scaled_matrix: ndarray = scaler.fit_transform(matrix)

        return MatrixRepresentation(scaled_matrix, matrix.index if isinstance(matrix, DataFrame) else None, matrix.columns if isinstance(matrix, DataFrame) else None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
    @accept_matrix_representation(override=("columns", None))
    def svd_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Generates word embeddings using truncated Singular Value Decomposition (SVD).

        :param matrix: A co-occurrence matrix from which embeddings will be generated.
        :param kwargs: Additional keyword arguments to pass to sklearn's TruncatedSVD.
            Common options include:
                - n_components: Specifies the number of dimensions to reduce the co-occurrence matrix to.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        svd: TruncatedSVD = TruncatedSVD(**kwargs)
        embeddings: ndarray = svd.fit_transform(matrix)

        return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_embeddings")
    @accept_matrix_representation(override=("columns", None))
    def pca_embeddings(self, *, matrix: ndarray | DataFrame, **kwargs) -> MatrixRepresentation:
        """
        Generates word embeddings using Principal Component Analysis (PCA).

        :param matrix: A co-occurrence matrix from which embeddings will be generated.
        :param kwargs: Additional keyword arguments to pass to sklearn's PCA.
            Common options include:
                - n_components: If an integer, specifies the number of dimensions to reduce the co-occurrence matrix to. If a float, the amount of variance to preserve during PCA.
            For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

        This method supports different matrix forms due to decorator-based preprocessing:
            - matrix: ndarray or DataFrame
            - matrix representation: MatrixRepresentation
        """

        pca: PCA = PCA(**kwargs)
        embeddings: ndarray = pca.fit_transform(matrix)

        return MatrixRepresentation(embeddings, matrix.index if isinstance(matrix, DataFrame) else None, None)

    @requires(stages=["embeddings"])
    @workflow(input_arg="matrix", input_process="_embeddings", output_process="_end", next_stage="end")
    def to_dataframe(self, *, matrix: MatrixRepresentation, **kwargs) -> DataFrame:
        """
        Creates a dataframe from a matrix representation.

        :param matrix: A matrix representation from which to create a dataframe.
        :param kwargs: Additional keyword arguments to pass to pandas' DataFrame.
        """

        return matrix_to_dataframe(matrix=matrix, **kwargs)
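A usage sketch with a hypothetical co-occurrence matrix (same assumption as above: the @workflow decorators feed each stored stage result into the next call):

import pandas as pd
from dstk.count_models import CountModels

vocab = ["cat", "dog", "mat", "bone"]
cooc = pd.DataFrame(
    [[0, 2, 5, 0],
     [2, 0, 0, 4],
     [5, 0, 0, 1],
     [0, 4, 1, 0]],
    index=vocab, columns=vocab)

models = CountModels(cooc)
models.scale_matrix()                  # start -> embeddings (zero mean, unit variance)
models.pca_embeddings(n_components=2)  # stays in the embeddings stage; drops column labels
embeddings = models.to_dataframe()     # embeddings -> end; rows keep the vocabulary labels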
dstk/geometric_distance.py
ADDED
@@ -0,0 +1,107 @@
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from .workflow_tools import requires, workflow, WorkflowManager

from .lib_types import ndarray, Series, DataFrame

STAGES = [
    "start",  # The embeddings array
    "end"     # After a distance has been applied
]

class GeometricDistance(WorkflowManager):
    """
    Provides a set of methods to calculate the distance between the embeddings of words, such as Euclidean distance, Manhattan distance, cosine similarity, and nearest neighbors.
    """

    def __init__(self, embeddings: DataFrame | None = None):
        """
        Initializes GeometricDistance with the given attributes.

        :param embeddings: A matrix of word embeddings.
        """

        super().__init__()

        self._set_workflow(input_arg=embeddings)

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    # It would be interesting to select a set of distances and have result return all of them for comparison, or to call different word pairs and return an array with all of their results.
    def euclidean_distance(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the Euclidean distance between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.
        """

        first_word_vector: Series = embeddings.loc[first_word]
        second_word_vector: Series = embeddings.loc[second_word]

        return float(np.linalg.norm(first_word_vector - second_word_vector))

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def manhattan_distance(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the Manhattan distance between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.
        """

        first_word_vector: Series = embeddings.loc[first_word]
        second_word_vector: Series = embeddings.loc[second_word]

        return float(np.sum(np.abs(first_word_vector - second_word_vector)))

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def cos_similarity(self, *, embeddings: DataFrame, first_word: str, second_word: str) -> float:
        """
        Computes the cosine similarity between the embeddings of two words.

        :param embeddings: A dataframe containing the word embeddings.
        :param first_word: The first word in the pair.
        :param second_word: The second word in the pair.
        """

        first_word_vector: ndarray = np.array(embeddings.loc[first_word]).reshape(1, -1)
        second_word_vector: ndarray = np.array(embeddings.loc[second_word]).reshape(1, -1)

        cos_sim: ndarray = cosine_similarity(first_word_vector, second_word_vector)

        return float(cos_sim[0][0])

    @requires(stages=["start", "end"], multiple_calls=True)
    @workflow(input_arg="embeddings", input_process="_start", output_process="_end", next_stage="end")
    def nearest_neighbors(self, *, embeddings: DataFrame, word: str, metric: str, n_words: int = 5, **kwargs) -> list[tuple[str, float]]:
        """
        Returns the top N most semantically similar words to a given target word, based on the specified distance or similarity metric.

        :param embeddings: A dataframe containing the word embeddings.
        :param word: The target word to find neighbors for.
        :param metric: The distance or similarity metric to use (e.g., 'cosine', 'euclidean').
        :param n_words: Number of nearest neighbors to return. Defaults to 5.
        :param kwargs: Additional keyword arguments to pass to sklearn's NearestNeighbors. For more information check: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
        """

        neighbors: NearestNeighbors = NearestNeighbors(n_neighbors=n_words, algorithm="auto", metric=metric, **kwargs)
        neighbors.fit(embeddings.to_numpy())

        word_vector: ndarray = embeddings.loc[word].to_numpy().reshape(1, -1)

        distances: ndarray
        indices: ndarray
        # Ask for one extra neighbor so the target word itself can be dropped below.
        distances, indices = neighbors.kneighbors(word_vector, n_neighbors=n_words + 1)

        neighbor_tuples = zip(indices[0], distances[0])

        # Note: 1 - distance reads as a similarity score, which presumes a cosine-like metric.
        results: list[tuple[str, float]] = [(embeddings.index[index], 1 - distance) for index, distance in neighbor_tuples if embeddings.index[index] != word]

        return results
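A usage sketch, reusing the hypothetical `embeddings` DataFrame from the CountModels example (assumes multiple_calls=True lets each call draw on the stored embeddings):

from dstk.geometric_distance import GeometricDistance

distance = GeometricDistance(embeddings)

distance.euclidean_distance(first_word="cat", second_word="dog")
distance.cos_similarity(first_word="cat", second_word="mat")
distance.nearest_neighbors(word="cat", metric="cosine", n_words=2)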
dstk/lib_types/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .spacy_types import *
from .sklearn_types import *
from .numpy_types import *
from .pandas_types import *
from .matplotlib_types import *
from .fasttext_types import *
from .gensim_types import *
from .nltk_types import *
from .dstk_types import *
dstk/lib_types/dstk_types.py
ADDED
@@ -0,0 +1,26 @@
from typing import TypeAlias, Callable, TypeVar, Any
from .spacy_types import Doc, Span, Token
from .sklearn_types import csc_matrix, csr_matrix
from .numpy_types import ndarray, NDArray, str_
from .pandas_types import Index

# TextProcessor
TokenIterator: TypeAlias = Doc | Span | list[Token]
POSIterator: TypeAlias = list[tuple[Token, str]]
SentenceIterator: TypeAlias = list[Span] | list[list[Token]] | list[POSIterator]
TextIterator: TypeAlias = list[str] | list[tuple[str, str]] | list[list[tuple[str, str]]]

Sentence: TypeAlias = Span | list[Token] | POSIterator | list[tuple[str, str]]
Sentences: TypeAlias = list[Sentence]
POSTags: TypeAlias = list[tuple[Token | str, str]]
Function = TypeVar("Function", bound=Callable[..., object])

# TargetCollocations
Collocate: TypeAlias = tuple[str, tuple[str, str]] | tuple[str, str]

# Matrices
Matrix: TypeAlias = csr_matrix | csc_matrix | ndarray
Labels: TypeAlias = NDArray[str_] | Index | list[str] | None

# Workflow
MethodSpec: TypeAlias = dict[str, dict[str, Any]]
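The two arms of `Collocate` correspond to the directed and undirected bigrams produced by `Collocations.extract_ngrams`; a couple of illustrative values:

# Undirected: (collocate, target) — direction is ignored.
undirected: Collocate = ("sat", "cat")

# Directed: (collocate, ("L" | "R", target)) — records which side of the
# target word the collocate appeared on.
directed: Collocate = ("sat", ("R", "cat"))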
dstk/lib_types/fasttext_types.py
ADDED
@@ -0,0 +1 @@
from fasttext.FastText import _FastText as FastText

dstk/lib_types/gensim_types.py
ADDED
@@ -0,0 +1 @@
from gensim.models import Word2Vec

dstk/lib_types/nltk_types.py
ADDED
@@ -0,0 +1 @@
from nltk import Text

dstk/lib_types/pandas_types.py
ADDED
@@ -0,0 +1 @@
from pandas import DataFrame, Series, Index

dstk/lib_types/sklearn_types.py
ADDED
@@ -0,0 +1 @@
from scipy.sparse import csr_matrix, csc_matrix
dstk/matrix_base.py
ADDED
@@ -0,0 +1,113 @@
from dataclasses import dataclass, field
import pandas as pd
from .lib_types import Matrix, Labels, ndarray, DataFrame, csr_matrix, csc_matrix
from .workflow_tools import accepts_generic

from typing import Any, Callable

@dataclass
class MatrixRepresentation:
    """
    Container for a matrix and its associated row and column labels, with optional metadata.

    :param matrix: The core matrix data, typically a NumPy array.
    :param rows: Optional row labels.
    :param columns: Optional column labels.
    :param meta: Optional dictionary to store additional metadata.
    """

    matrix: Matrix
    rows: Labels = None
    columns: Labels = None
    meta: dict[str, Any] = field(default_factory=dict)

def accept_matrix_representation(accepts: bool = True, custom_error_message: str = "", intercept: bool = True, meta: str | None = None, override: tuple[str, Any] | None = None) -> Callable:
    """
    Decorator that allows a method to accept a MatrixRepresentation object as input.

    Extracts the underlying matrix for processing, and optionally reattaches labels, overrides attributes, or captures non-matrix outputs into metadata.

    :param accepts: Whether to accept MatrixRepresentation inputs.
    :param custom_error_message: Optional error message if the input type is not accepted.
    :param intercept: Whether to intercept the input and repackage the output.
    :param meta: Optional metadata key under which to store non-matrix outputs in the result.
    :param override: Optional (attribute, value) pair set on the resulting representation, e.g. ("columns", None) when a transformation invalidates the column labels.

    :return: A decorated method that supports MatrixRepresentation as input.
    """

    def is_matrix_representation(matrix: Any) -> bool:
        """
        Returns True if matrix is an instance of MatrixRepresentation, False otherwise.

        :param matrix: The object whose type to check.
        """

        return isinstance(matrix, MatrixRepresentation)

    def intercept_matrix(self, input_value: MatrixRepresentation, method: Callable, *args, **kwargs) -> MatrixRepresentation:
        result: MatrixRepresentation | Any = method(self, *args, matrix=input_value.matrix, **kwargs)

        matrix: MatrixRepresentation

        if isinstance(result, MatrixRepresentation):
            matrix = result

            matrix.rows = input_value.rows
            matrix.columns = input_value.columns

            if override:
                attr, value = override
                setattr(matrix, attr, value)
        elif isinstance(result, ndarray):
            matrix = MatrixRepresentation(result, input_value.rows, input_value.columns)
        else:
            # Non-matrix output: keep the input matrix and, if a meta key was
            # given, store the result as metadata.
            matrix = MatrixRepresentation(input_value.matrix, input_value.rows, input_value.columns)
            if meta:
                matrix.meta[meta] = result

        return matrix

    return accepts_generic(
        type_checker=is_matrix_representation,
        input_arg="matrix",
        accepts=accepts,
        intercept=intercept,
        interceptor=lambda self, input_value, method, *args, **kwargs: intercept_matrix(self, input_value, method, *args, **kwargs),
        input_type=MatrixRepresentation,
        custom_error_message=custom_error_message
    )

def matrix_to_dataframe(matrix: MatrixRepresentation, **kwargs) -> DataFrame:
    """
    Converts a MatrixRepresentation to a pandas DataFrame.

    :param matrix: A MatrixRepresentation instance.
    :param kwargs: Additional keyword arguments to pass to pandas' DataFrame.

    :return: A pandas DataFrame with the corresponding data and labels.
    """

    # Densify sparse matrices before handing them to pandas.
    if isinstance(matrix.matrix, (csr_matrix, csc_matrix)):
        matrix.matrix = matrix.matrix.toarray()

    return pd.DataFrame(
        matrix.matrix,
        index=matrix.rows,
        columns=matrix.columns,
        **kwargs
    )

def dataframe_to_matrix(dataframe: DataFrame) -> MatrixRepresentation:
    """
    Converts a pandas DataFrame to a MatrixRepresentation.

    :param dataframe: A pandas DataFrame.

    :return: A MatrixRepresentation with matrix data and index/column labels.
    """

    return MatrixRepresentation(
        matrix=dataframe.to_numpy(),
        rows=list(dataframe.index),
        columns=list(dataframe.columns)
    )
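A round-trip sketch of the two conversion helpers:

import numpy as np
import pandas as pd
from dstk.matrix_base import matrix_to_dataframe, dataframe_to_matrix

df = pd.DataFrame(np.eye(2), index=["cat", "dog"], columns=["cat", "dog"])

# DataFrame -> MatrixRepresentation: labels move into rows/columns.
rep = dataframe_to_matrix(df)
assert rep.rows == ["cat", "dog"]

# MatrixRepresentation -> DataFrame: labels are reattached (a sparse
# matrix would be densified first).
df_again = matrix_to_dataframe(rep)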
dstk/pipeline_tools.py
ADDED
@@ -0,0 +1,27 @@
from .workflow_tools import WorkflowBuilder, WorkflowManager

from typing import Any, Callable

class PipelineBuilder:
    """
    Automates the execution of a sequence of workflows on WorkflowBuilder subclasses or callables.

    :param workflows: A list of WorkflowBuilder subclasses, each representing a workflow to execute, or functions to hook into the pipeline.
    """

    def __init__(self, workflows: list[WorkflowBuilder | Callable]):
        """
        Initializes PipelineBuilder with the given attributes.
        """

        self.workflows: list[WorkflowBuilder | Callable] = workflows

    def __call__(self, **kwargs) -> Any:
        # Copy the list so that popping the entry workflow does not mutate
        # self.workflows, which would make the pipeline single-use.
        workflows: list[WorkflowBuilder | Callable] = list(self.workflows)
        entry_workflow: WorkflowBuilder | Callable = workflows.pop(0)
        result: Any = entry_workflow(**kwargs)

        for workflow in workflows:
            result = workflow(result)

        return result
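A hypothetical sketch chaining two plain functions (the first workflow receives the keyword arguments; each later one receives the previous result positionally):

from dstk.pipeline_tools import PipelineBuilder

def tokenize(text: str) -> list[str]:
    return text.lower().split()

def drop_short(tokens: list[str]) -> list[str]:
    return [token for token in tokens if len(token) > 2]

pipeline = PipelineBuilder([tokenize, drop_short])
print(pipeline(text="The cat sat on the mat"))  # ['the', 'cat', 'sat', 'the', 'mat']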