LanguageStatisticsLibPy 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,125 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
class Tetragrams(Grams):
    """Tetragram (4-gram) statistics of a language, used to score how closely a
    numeric text resembles natural language text."""

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Tetragrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the tetragram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing tetragram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 4D array of tetragram frequencies.
        - self.alphabet (list): The alphabet used in the statistics file.
        - self.max_value (float): The maximum value in the frequencies array, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(4)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on tetragram frequencies.

        Parameters:
        - text (sequence of int): The text, already mapped into number space.

        Returns:
        - float: The average cost of tetragrams in the text. Returns 0.0 if the text length is less than 4.

        Notes:
        - Skips tetragrams containing symbols outside the defined alphabet.
        - If `add_letter_indices` is defined, the symbol indices are shifted before the lookup.
        """
        if len(text) < 4:
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)
        end = len(text) - 3

        for i in range(end):
            # Index directly instead of slice-unpacking: avoids creating a new
            # 4-element sequence on every iteration of this hot loop.
            a = text[i]
            b = text[i + 1]
            c = text[i + 2]
            d = text[i + 3]

            if self.add_letter_indices:
                a += self.add_letter_indices.get(a, 0)
                b += self.add_letter_indices.get(b, 0)
                c += self.add_letter_indices.get(c, 0)
                d += self.add_letter_indices.get(d, 0)

            if 0 <= a < alphabet_length and 0 <= b < alphabet_length and \
               0 <= c < alphabet_length and 0 <= d < alphabet_length:
                value += self.frequencies[a, b, c, d]

        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (tetragrams in this case).

        Returns:
        - int: The size of the grams (always 4 for tetragrams).
        """
        return 4

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Tetragrams).
        """
        return GramsType.Tetragrams

    def normalize(self, max_value):
        """
        Normalizes the tetragram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Adjusts all frequencies proportionally to the new maximum value.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        # Vectorized replacement of the original O(|alphabet|^4) nested Python
        # loops: numpy performs the identical elementwise division
        # adjust_value / frequencies in a single C-level pass.
        self.frequencies = adjust_value / self.frequencies
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
@@ -0,0 +1,125 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
class Trigrams(Grams):
    """Trigram (3-gram) statistics of a language, used to score how closely a
    numeric text resembles natural language text."""

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Trigrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the trigram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing trigram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 3D array of trigram frequencies.
        - self.alphabet (list): The alphabet used in the statistics file.
        - self.max_value (float): The maximum value in the frequencies array, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(3)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on trigram frequencies.

        Parameters:
        - text (sequence of int): The text, already mapped into number space.

        Returns:
        - float: The average cost of trigrams in the text. Returns 0.0 if the text length is less than 3.

        Notes:
        - Skips trigrams containing symbols outside the defined alphabet.
        - If `add_letter_indices` is defined, the symbol indices are shifted before the lookup.
        """
        if len(text) < 3:
            # Float for consistency with the Tetragrams implementation.
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)
        end = len(text) - 2

        for i in range(end):
            a = text[i]
            b = text[i + 1]
            c = text[i + 2]

            if self.add_letter_indices:
                a += self.add_letter_indices.get(a, 0)
                b += self.add_letter_indices.get(b, 0)
                c += self.add_letter_indices.get(c, 0)

            # Chained-comparison bounds check, same style as Tetragrams.
            if 0 <= a < alphabet_length and 0 <= b < alphabet_length and \
               0 <= c < alphabet_length:
                value += self.frequencies[a, b, c]

        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (trigrams in this case).

        Returns:
        - int: The size of the grams (always 3 for trigrams).
        """
        return 3

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Trigrams).
        """
        return GramsType.Trigrams

    def normalize(self, max_value):
        """
        Normalizes the trigram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Adjusts all frequencies proportionally to the new maximum value.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        # Vectorized replacement of the original O(|alphabet|^3) nested Python
        # loops: numpy performs the identical elementwise division in one pass.
        self.frequencies = adjust_value / self.frequencies
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
@@ -0,0 +1,110 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
class Unigrams(Grams):
    """Unigram (single-symbol) statistics of a language, used to score how
    closely a numeric text resembles natural language text."""

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Unigrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the unigram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing unigram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 1D array of unigram frequencies.
        - self.alphabet (list): The alphabet used in the statistics file.
        - self.max_value (float): The maximum value in the frequencies array, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(1)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on unigram frequencies.

        Parameters:
        - text (sequence of int): The text, already mapped into number space.

        Returns:
        - float: The average cost of unigrams in the text. Returns 0.0 if the text is empty.

        Notes:
        - Skips symbols that are outside the defined alphabet.
        - If `add_letter_indices` is defined, the symbol index is shifted before the lookup.
        """
        if len(text) == 0:
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)  # hoisted out of the loop
        for symbol in text:
            if self.add_letter_indices:
                symbol += self.add_letter_indices.get(symbol, 0)
            if 0 <= symbol < alphabet_length:
                value += self.frequencies[symbol]
        return value / len(text)

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (unigrams in this case).

        Returns:
        - int: The size of the grams (always 1 for unigrams).
        """
        return 1

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Unigrams).
        """
        return GramsType.Unigrams

    def normalize(self, max_value):
        """
        Normalizes the unigram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Adjusts all frequencies proportionally to the new maximum value.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        # Vectorized replacement of the original per-element Python loop.
        self.frequencies = adjust_value / self.frequencies
        # Bug fix: the original omitted this update even though its docstring
        # promised it, and the Trigrams/Tetragrams siblings both perform it.
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
@@ -0,0 +1,162 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ from io import BufferedReader
17
+ from collections import deque
18
+ from languagestatisticslibpy.Node import Node
19
+
20
class WordTree(Node):
    """
    Represents a tree data structure for storing words and efficiently querying them.

    Inherits:
    - Node: The base class for tree nodes, where each node represents a character.

    Attributes:
    - stored_words (int): The number of words stored in the tree.
    - language_code (str): The language code for the words stored in the tree.
    - alphabet (str): The alphabet used in the stored words.
    """

    def __init__(self):
        """
        Initializes an empty WordTree.

        Initializes:
        - stored_words (int): Set to 0, as no words are initially stored.
        - language_code (str): Empty, to be set during deserialization.
        - alphabet (str): Empty, to be set during deserialization.
        """
        super().__init__()
        self.stored_words = 0
        self.language_code = ''
        self.alphabet = ''

    @staticmethod
    def deserialize(reader: BufferedReader):
        """
        Deserializes a WordTree from a binary file.

        Parameters:
        - reader (BufferedReader): A binary file reader containing the serialized WordTree.

        Returns:
        - WordTree: The deserialized WordTree object.

        Raises:
        - Exception: If the file format is invalid or the magic number does not match.

        Process:
        1. Reads the file header and validates the magic number.
        2. Reads the language code and alphabet (NUL-terminated strings).
        3. Reads the number of stored words.
        4. Constructs the WordTree structure by iterating through the file's serialized data.

        NOTE(review): the byte-at-a-time .decode('utf-8') calls below can only
        handle single-byte (ASCII) characters; a multi-byte UTF-8 character in
        the file would raise UnicodeDecodeError — TODO confirm the CT2DIC
        format is ASCII-only.
        """
        tree = WordTree()

        # Load word tree header
        magic_no = reader.read(6).decode('utf-8')
        if magic_no != "CT2DIC":
            raise Exception("File does not start with the expected magic number for word tree.")

        # Read NUL-terminated language code
        tree.language_code = ''
        char = reader.read(1).decode('utf-8')
        while char != '\0':
            tree.language_code += char
            char = reader.read(1).decode('utf-8')

        # Read NUL-terminated alphabet
        tree.alphabet = ''
        char = reader.read(1).decode('utf-8')
        while char != '\0':
            tree.alphabet += char
            char = reader.read(1).decode('utf-8')

        # Read number of stored words (little-endian uint32)
        tree.stored_words = int.from_bytes(reader.read(4), 'little')

        # Load word tree data structure.
        # NOTE(review): stored_words is read from the header above AND
        # incremented once per word-end marker below, which looks like it
        # double-counts — TODO verify against the CT2 serialization format.
        stack = deque([tree])
        byte = reader.read(1)
        while byte:
            char = byte.decode('utf-8')
            if char == Node.WordEndSymbol:
                stack[-1].word_ends_here = True
                tree.stored_words += 1
            elif char == Node.TerminationSymbol:
                stack.pop()
            else:
                new_node = Node(char)
                stack[-1].child_nodes.append(new_node)
                stack.append(new_node)
            byte = reader.read(1)

        return tree

    def contains_word(self, word):
        """
        Checks whether a given word exists in the WordTree.

        Parameters:
        - word (str): The word to search for (matched case-insensitively).

        Returns:
        - bool: True if the word is stored in the tree, False otherwise.

        Process:
        1. Converts the word to uppercase for case-insensitive comparison.
        2. Traverses the tree to find the sequence of characters in the word.
        3. Returns False if any character is missing in the tree structure.
        """
        word = word.upper()
        current_node = self
        for char in word:
            for child_node in current_node.child_nodes:
                if child_node.value == char:
                    current_node = child_node
                    break
            else:
                # No child matched this character.
                return False
        # Bug fix: the original returned True for any *prefix* of a stored
        # word; a word is only contained if a word actually ends at this node.
        return current_node.word_ends_here

    def to_list(self):
        """
        Converts all words stored in the WordTree into a list.

        Returns:
        - list: A list of all words stored in the tree.

        Process:
        1. Traverses the tree depth-first, maintaining the current character prefix.
        2. Adds a word to the list whenever a node marks the end of a word.
        """
        list_of_words = []
        prefix = []

        def collect(node):
            # Append/pop around the recursion instead of copying the whole
            # prefix for every child (the original copied the deque per call).
            prefix.append(node.value)
            if node.word_ends_here:
                list_of_words.append(''.join(prefix))
            for child_node in node.child_nodes:
                collect(child_node)
            prefix.pop()

        for node in self.child_nodes:
            collect(node)

        return list_of_words
File without changes
@@ -0,0 +1,26 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+
16
+ Usage: python3 test1.py
17
+ test1.py is a minimal working example (MWE) which just needs the
18
+ package LanguageStatisticsLibPy to be installed.
19
+ '''
20
+
21
+ from languagestatisticslibpy.LanguageStatistics import LanguageStatistics as LS
22
+
23
# Map the sample text into the number space of the English alphabet, then
# compute and print its index of coincidence (IoC).
numbers = LS.map_text_into_number_space("HELLOWORD", LS.alphabets['en'])
ioc = LS.calculate_ioc(numbers)

print(ioc)
@@ -0,0 +1,80 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+
16
+ Usage: python3 test2.py
17
+ test2.py just needs the package LanguageStatisticsLibPy AND all the language
18
+ statistics (n-grams) and dictionary files to be installed on your computer.
19
+ '''
20
+
21
+ from languagestatisticslibpy.LanguageStatistics import LanguageStatistics as LS
22
+ from datetime import datetime
23
+
24
# Change this path to the folder where the CrypTool-2 language statistics and
# dictionary files are stored, e.g. the folder "LanguageStatistics" in the standard
# CrypTool-2 installation folder if you have installed CrypTool 2 on Windows.
# Sample directory:
# ct2_language_statistics_folder = "C:\\Program Files\\CrypTool 2\\LanguageStatistics" # Windows
# ct2_language_statistics_folder = "/home/be/tmp/LanguageStatisticsLibPy_PIP-Test/LSLP/" # Linux
ct2_language_statistics_folder = "/Users/be/Documents/Python/LanguageStatisticsLibPy_PIP-Test/LSLP" # Mac (Note: gz file not found if path starts with ~)


# Exercise the cost calculation of every gram class except hexagrams.
for size in range(1, 6):

    # Report which n-gram size is being tested.
    print("Grams size:", size)

    # Load the English statistics for this gram size and time the load.
    t0 = datetime.now()
    grams = LS.create_grams_by_size(size, "en", ct2_language_statistics_folder, False)
    print("\tGrams loaded in", (datetime.now() - t0))

    # Normalize the grams and time the normalization.
    t0 = datetime.now()
    grams.normalize(1000000.0)
    print("\tGrams normalized in", (datetime.now() - t0))

    # Map a sample text into the number space of the grams, score it,
    # and map it back for display.
    numbers = LS.map_text_into_number_space("HELLOWORLDTHISISATEST", grams.alphabet)
    cost = grams.calculate_cost(numbers)
    text = LS.map_numbers_into_text_space(numbers, grams.alphabet)
    print("\tText:", text)
    print("\tCost value:", cost)

# Test the word tree.
# Hint: the word tree works with strings instead of number arrays.
print("Loading word tree")
t0 = datetime.now()
tree = LS.load_word_tree("en", ct2_language_statistics_folder)
print("\tWord tree loaded", (datetime.now() - t0))
print("\tTotal number of words in tree", tree.stored_words)

# Look up a few candidate words in the tree.
for word in ("Hello", "World", "HelloWorld"):
    print("Word:", word)
    print("\tContains word:", tree.contains_word(word))