PyPI - LanguageStatisticsLibPy - Versions diffs - 1.0.3__py3-none-any.whl - Mend

LanguageStatisticsLibPy 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

languagestatisticslibpy/Bigrams.py +124 -0
languagestatisticslibpy/Grams.py +134 -0
languagestatisticslibpy/GramsType.py +38 -0
languagestatisticslibpy/Hexagrams.py +130 -0
languagestatisticslibpy/LanguageStatistics.py +313 -0
languagestatisticslibpy/LanguageStatisticsFile.py +108 -0
languagestatisticslibpy/Node.py +89 -0
languagestatisticslibpy/Pentagrams.py +128 -0
languagestatisticslibpy/Tetragrams.py +125 -0
languagestatisticslibpy/Trigrams.py +125 -0
languagestatisticslibpy/Unigrams.py +110 -0
languagestatisticslibpy/WordTree.py +162 -0
languagestatisticslibpy/__init__.py +0 -0
languagestatisticslibpy/test1.py +26 -0
languagestatisticslibpy/test2.py +80 -0
languagestatisticslibpy-1.0.3.dist-info/LICENSE +201 -0
languagestatisticslibpy-1.0.3.dist-info/METADATA +124 -0
languagestatisticslibpy-1.0.3.dist-info/RECORD +19 -0
languagestatisticslibpy-1.0.3.dist-info/WHEEL +4 -0

languagestatisticslibpy/Bigrams.py ADDED Viewed

@@ -0,0 +1,124 @@
+'''
+   Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+'''
+import numpy as np
+import os
+from languagestatisticslibpy.Grams import Grams
+from languagestatisticslibpy.GramsType import GramsType
+from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
+class Bigrams(Grams):
+    def __init__(self, language, language_statistics_directory, use_spaces=False):
+        """
+        Initializes the Bigrams class by calling the parent class (Grams) initializer.
+        Parameters:
+        - language (str): The language of the bigram statistics.
+        - language_statistics_directory (str): Path to the directory containing language statistics files.
+        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
+        """
+        super().__init__(language, language_statistics_directory, use_spaces)
+    def load_gz(self, filename, language_statistics_directory):
+        """
+        Loads a gzip-compressed file containing bigram frequencies.
+        Parameters:
+        - filename (str): The name of the file to load.
+        - language_statistics_directory (str): The directory where the statistics file is located.
+        Sets:
+        - self.frequencies (np.ndarray): A 2D array of bigram frequencies.
+        - self.alphabet (list): The alphabet used in the statistics file.
+        - self.max_value (float): The maximum value in the frequencies array, or -∞ if the array is empty.
+        """
+        file_path = os.path.join(language_statistics_directory, filename)
+        language_statistics_file = LanguageStatisticsFile(file_path)
+        self.frequencies = language_statistics_file.load_frequencies(2)
+        self.alphabet = language_statistics_file.alphabet
+        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
+    def calculate_cost(self, text):
+        """
+        Calculates the cost of a given text based on bigram frequencies.
+        Parameters:
+        - text (str): The text to analyze.
+        Returns:
+        - float: The average cost of bigrams in the text. Returns 0 if the text length is less than 2.
+        Notes:
+        - Skips bigrams containing characters outside the defined alphabet.
+        - If `add_letter_indices` is defined, modifies indices of the characters before computing the cost.
+        """
+        if len(text) < 2:
+            return 0
+        value = 0
+        alphabet_length = len(self.alphabet)
+        end = len(text) - 1
+        for i in range(end):
+            a = text[i]
+            b = text[i + 1]
+            if self.add_letter_indices:
+                a += self.add_letter_indices[a]
+                b += self.add_letter_indices[b]
+            if a >= alphabet_length or b >= alphabet_length or a < 0 or b < 0:
+                continue
+            value += self.frequencies[a, b]
+        return value / end
+    def gram_size(self):
+        """
+        Returns the size of the grams being analyzed (bigrams in this case).
+        Returns:
+        - int: The size of the grams (always 2 for bigrams).
+        """
+        return 2
+    def grams_type(self):
+        """
+        Returns the type of grams being analyzed.
+        Returns:
+        - GramsType: An enum value representing the type of grams (GramsType.Bigrams).
+        """
+        return GramsType.Bigrams
+    def normalize(self, max_value):
+        """
+        Normalizes the bigram frequencies based on the provided maximum value.
+        Parameters:
+        - max_value (float): The maximum value used for normalization.
+        Notes:
+        - Adjusts all frequencies proportionally to the new maximum value.
+        - Updates `self.max_value` to the new maximum after normalization.
+        """
+        super().normalize(max_value)
+        adjust_value = self.max_value * max_value
+        for a in range(len(self.alphabet)):
+            for b in range(len(self.alphabet)):
+                self.frequencies[a, b] = adjust_value / self.frequencies[a, b]
+        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

languagestatisticslibpy/Grams.py ADDED Viewed

@@ -0,0 +1,134 @@
+'''
+   Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+'''
+from abc import ABC, abstractmethod
+class Grams(ABC):
+    def __init__(self, language, language_statistics_directory, use_spaces):
+        """
+        Initializes the Grams superclass.
+        Parameters:
+        - language (str): The language of the n-gram statistics.
+        - language_statistics_directory (str): Path to the directory containing language statistics files.
+        - use_spaces (bool): Whether to include spaces in the analysis.
+        Initializes:
+        - self.max_value (float): The maximum value of the frequencies, set during file loading.
+        - self.is_normalized (bool): Tracks whether the frequencies have been normalized.
+        - self.alphabet (list): The alphabet used in the statistics file.
+        - self.add_letter_indices (list): Adjustment indices for characters when reducing the alphabet.
+        Raises:
+        - Exception: If the specified language statistics file is not found.
+        """
+        self.max_value = None
+        self.is_normalized = False
+        self.alphabet = None
+        self.add_letter_indices = None
+        # Construct the filename based on language and space usage.
+        filename = f"{language}-{self.gram_size()}gram-nocs{'-sp' if use_spaces else ''}.gz"
+        try:
+            # Attempt to load the gzipped language statistics file.
+            self.load_gz(filename, language_statistics_directory)
+        except FileNotFoundError as e:
+            raise Exception(f"Did not find the specified language statistics file for language={language} and use_spaces={use_spaces}: {filename}") from e
+    @abstractmethod
+    def calculate_cost(self, text):
+        """
+        Abstract method to calculate the cost of a given text based on n-gram frequencies.
+        Parameters:
+        - text (str): The text to analyze.
+        Returns:
+        - float: The calculated cost of the text.
+        """
+        ...
+    @abstractmethod
+    def gram_size(self):
+        """
+        Abstract method to return the size of the grams (e.g., 1 for unigrams, 2 for bigrams).
+        Returns:
+        - int: The size of the grams.
+        """
+        ...
+    @abstractmethod
+    def grams_type(self):
+        """
+        Abstract method to return the type of grams (e.g., GramsType.Unigrams).
+        Returns:
+        - GramsType: An enum representing the type of grams.
+        """
+        ...
+    @abstractmethod
+    def load_gz(self, filename, language_statistics_directory):
+        """
+        Abstract method to load a gzipped file containing n-gram frequencies.
+        Parameters:
+        - filename (str): The name of the file to load.
+        - language_statistics_directory (str): The directory where the statistics file is located.
+        Raises:
+        - FileNotFoundError: If the file does not exist.
+        """
+        ...
+    def reduce_alphabet(self, new_alphabet):
+        """
+        Reduces the current alphabet to a new, smaller alphabet.
+        Parameters:
+        - new_alphabet (list): The reduced alphabet.
+        Notes:
+        - If the new alphabet matches the original alphabet in size, no changes are made.
+        - Updates `self.add_letter_indices` to reflect adjustments for the reduced alphabet.
+        """
+        if len(new_alphabet) == len(self.alphabet):
+            self.add_letter_indices = None
+            return
+        self.add_letter_indices = [0] * len(new_alphabet)
+        add_value = 0
+        for i, letter in enumerate(new_alphabet):
+            if letter not in self.alphabet:
+                add_value += 1
+            self.add_letter_indices[i] = add_value
+    def normalize(self, max_value):
+        """
+        Normalizes the n-gram frequencies to a specified maximum value.
+        Parameters:
+        - max_value (float): The maximum value for normalization.
+        Raises:
+        - Exception: If the frequencies have already been normalized.
+        Notes:
+        - Sets `self.is_normalized` to True after normalization.
+        """
+        if self.is_normalized:
+            raise Exception("This Gram object has already been normalized!")
+        self.is_normalized = True

languagestatisticslibpy/GramsType.py ADDED Viewed

@@ -0,0 +1,38 @@
+'''
+   Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+'''
+from enum import Enum
+class GramsType(Enum):
+    """
+    Enum representing the types of n-grams.
+    Attributes:
+    - Undefined (int): Represents an invalid or uninitialized type (value: 0).
+    - Unigrams (int): Represents 1-grams, single characters (value: 1).
+    - Bigrams (int): Represents 2-grams, pairs of characters (value: 2).
+    - Trigrams (int): Represents 3-grams, triplets of characters (value: 3).
+    - Tetragrams (int): Represents 4-grams, quadruplets of characters (value: 4).
+    - Pentagrams (int): Represents 5-grams, quintuplets of characters (value: 5).
+    - Hexagrams (int): Represents 6-grams, sextuplets of characters (value: 6).
+    """
+    Undefined = 0       # Invalid or uninitialized type
+    Unigrams = 1        # 1-grams (single characters)
+    Bigrams = 2         # 2-grams (pairs of characters)
+    Trigrams = 3        # 3-grams (triplets of characters)
+    Tetragrams = 4      # 4-grams (quadruplets of characters)
+    Pentagrams = 5      # 5-grams (quintuplets of characters)
+    Hexagrams = 6       # 6-grams (sextuplets of characters)

languagestatisticslibpy/Hexagrams.py ADDED Viewed

@@ -0,0 +1,130 @@
+'''
+   Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+'''
+import numpy as np
+import os
+from languagestatisticslibpy.Grams import Grams
+from languagestatisticslibpy.GramsType import GramsType
+from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
+class Hexagrams(Grams):
+    def __init__(self, language, language_statistics_directory, use_spaces=False):
+        """
+        Initializes the Hexagrams class by calling the parent class (Grams) initializer.
+        Parameters:
+        - language (str): The language of the hexagram statistics.
+        - language_statistics_directory (str): Path to the directory containing language statistics files.
+        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
+        """
+        super().__init__(language, language_statistics_directory, use_spaces)
+    def load_gz(self, filename, language_statistics_directory):
+        """
+        Loads a gzip-compressed file containing hexagram frequencies.
+        Parameters:
+        - filename (str): The name of the file to load.
+        - language_statistics_directory (str): The directory where the statistics file is located.
+        Sets:
+        - self.frequencies (np.ndarray): A 6D array of hexagram frequencies.
+        - self.alphabet (list): The alphabet used in the statistics file.
+        - self.max_value (float): The maximum value in the frequencies array, or -∞ if the array is empty.
+        """
+        file_path = os.path.join(language_statistics_directory, filename)
+        language_statistics_file = LanguageStatisticsFile(file_path)
+        self.frequencies = language_statistics_file.load_frequencies(6)
+        self.alphabet = language_statistics_file.alphabet
+        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
+    def calculate_cost(self, text):
+        """
+        Calculates the cost of a given text based on hexagram frequencies.
+        Parameters:
+        - text (str): The text to analyze.
+        Returns:
+        - float: The average cost of hexagrams in the text. Returns 0.0 if the text length is less than 6.
+        Notes:
+        - Skips hexagrams containing characters outside the defined alphabet.
+        - If `add_letter_indices` is defined, modifies the index of the characters before computing the cost.
+        """
+        if len(text) < 6:
+            return 0.0
+        value = 0.0
+        alphabet_length = len(self.alphabet)
+        end = len(text) - 5
+        for i in range(end):
+            a, b, c, d, e, f = text[i:i+6]
+            if self.add_letter_indices:
+                a += self.add_letter_indices.get(a, 0)
+                b += self.add_letter_indices.get(b, 0)
+                c += self.add_letter_indices.get(c, 0)
+                d += self.add_letter_indices.get(d, 0)
+                e += self.add_letter_indices.get(e, 0)
+                f += self.add_letter_indices.get(f, 0)
+            if 0 <= a < alphabet_length and 0 <= b < alphabet_length and \
+               0 <= c < alphabet_length and 0 <= d < alphabet_length and \
+               0 <= e < alphabet_length and 0 <= f < alphabet_length:
+                value += self.frequencies[a, b, c, d, e, f]
+        return value / end
+    def gram_size(self):
+        """
+        Returns the size of the grams being analyzed (hexagrams in this case).
+        Returns:
+        - int: The size of the grams (always 6 for hexagrams).
+        """
+        return 6
+    def grams_type(self):
+        """
+        Returns the type of grams being analyzed.
+        Returns:
+        - GramsType: An enum value representing the type of grams (GramsType.Hexagrams).
+        """
+        return GramsType.Hexagrams
+    def normalize(self, max_value):
+        """
+        Normalizes the hexagram frequencies based on the provided maximum value.
+        Parameters:
+        - max_value (float): The maximum value used for normalization.
+        Notes:
+        - Adjusts all frequencies proportionally to the new maximum value.
+        - Updates `self.max_value` to the new maximum after normalization.
+        """
+        super().normalize(max_value)
+        adjust_value = self.max_value * max_value
+        for a in range(len(self.alphabet)):
+            for b in range(len(self.alphabet)):
+                for c in range(len(self.alphabet)):
+                    for d in range(len(self.alphabet)):
+                        for e in range(len(self.alphabet)):
+                            for f in range(len(self.alphabet)):
+                                self.frequencies[a, b, c, d, e, f] = adjust_value / self.frequencies[a, b, c, d, e, f]
+        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')