LanguageStatisticsLibPy 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
+
23
class Bigrams(Grams):
    """
    Bigram (2-gram) language statistics backed by a two-axis frequency table.

    The table is indexed as `self.frequencies[a, b]`, where `a` and `b` are
    integer positions of two consecutive symbols in `self.alphabet`.
    """

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Bigrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the bigram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing bigram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies: The table returned by `load_frequencies(2)`
          (presumably a 2D np.ndarray -- it is used with `.size` and `[a, b]` indexing).
        - self.alphabet: The alphabet reported by the statistics file.
        - self.max_value (float): The maximum value in the frequencies array, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(2)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on bigram frequencies.

        Parameters:
        - text (sequence of int): The text to analyze, given as symbol indices
          (the elements are compared with the alphabet length and used to index
          the frequency table, so they must be integers, not characters).

        Returns:
        - float: The summed frequency values of all counted bigrams divided by the
          number of bigram positions (`len(text) - 1`). Returns 0 if the text
          length is less than 2.

        Notes:
        - Skips bigrams containing indices outside the defined alphabet.
        - If `add_letter_indices` is set, each index is shifted by its table entry
          before the lookup. Indices that fall outside that table are skipped too.
        """
        if len(text) < 2:
            return 0

        value = 0
        alphabet_length = len(self.alphabet)
        offsets = self.add_letter_indices
        end = len(text) - 1

        for i in range(end):
            a = text[i]
            b = text[i + 1]

            if offsets:
                # Range-check BEFORE the lookup: a negative index would otherwise
                # wrap around to the end of the list and be counted as a valid
                # symbol, and an index past the end would raise IndexError.
                table_length = len(offsets)
                if not (0 <= a < table_length and 0 <= b < table_length):
                    continue
                a += offsets[a]
                b += offsets[b]

            if a >= alphabet_length or b >= alphabet_length or a < 0 or b < 0:
                continue
            value += self.frequencies[a, b]

        # Skipped bigrams still count in the divisor, as before.
        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (bigrams in this case).

        Returns:
        - int: The size of the grams (always 2 for bigrams).
        """
        return 2

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Bigrams).
        """
        return GramsType.Bigrams

    def normalize(self, max_value):
        """
        Normalizes the bigram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Raises:
        - Exception: If this object has already been normalized (raised by `Grams.normalize`).

        Notes:
        - Every entry `x` in the alphabet-sized part of the table is replaced by
          `(self.max_value * max_value) / x`.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        size = len(self.alphabet)
        # One vectorised division over the alphabet-sized block replaces the
        # element-by-element Python loops; the slice is a view, so the
        # assignment writes straight into self.frequencies.
        block = self.frequencies[:size, :size]
        block[...] = adjust_value / block
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
124
+
@@ -0,0 +1,134 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ from abc import ABC, abstractmethod
17
+
18
class Grams(ABC):
    """
    Abstract base class for n-gram language statistics.

    Subclasses supply the gram size, the loader for the statistics file and the
    cost function; this class builds the statistics file name, tracks the
    normalization state and maintains the reduced-alphabet offset table.
    """

    def __init__(self, language, language_statistics_directory, use_spaces):
        """
        Initializes the Grams superclass.

        Parameters:
        - language (str): The language of the n-gram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis.

        Initializes:
        - self.max_value: None here; expected to be set by `load_gz`.
        - self.is_normalized (bool): Tracks whether the frequencies have been normalized.
        - self.alphabet: None here; expected to be set by `load_gz`.
        - self.add_letter_indices (list or None): Per-index offsets set by `reduce_alphabet`.

        Raises:
        - Exception: If `load_gz` raises FileNotFoundError for the statistics file.
        """
        self.max_value = None
        self.is_normalized = False
        self.alphabet = None
        self.add_letter_indices = None

        # File name pattern: <language>-<n>gram-nocs[-sp].gz
        filename = f"{language}-{self.gram_size()}gram-nocs{'-sp' if use_spaces else ''}.gz"
        try:
            # Attempt to load the gzipped language statistics file.
            self.load_gz(filename, language_statistics_directory)
        except FileNotFoundError as e:
            raise Exception(f"Did not find the specified language statistics file for language={language} and use_spaces={use_spaces}: {filename}") from e

    @abstractmethod
    def calculate_cost(self, text):
        """
        Abstract method to calculate the cost of a given text based on n-gram frequencies.

        Parameters:
        - text: The text to analyze.

        Returns:
        - float: The calculated cost of the text.
        """
        ...

    @abstractmethod
    def gram_size(self):
        """
        Abstract method to return the size of the grams (e.g., 1 for unigrams, 2 for bigrams).

        Returns:
        - int: The size of the grams.
        """
        ...

    @abstractmethod
    def grams_type(self):
        """
        Abstract method to return the type of grams (e.g., GramsType.Unigrams).

        Returns:
        - GramsType: An enum representing the type of grams.
        """
        ...

    @abstractmethod
    def load_gz(self, filename, language_statistics_directory):
        """
        Abstract method to load a gzipped file containing n-gram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Raises:
        - FileNotFoundError: If the file does not exist.
        """
        ...

    def reduce_alphabet(self, new_alphabet):
        """
        Reduces the current alphabet to a new, smaller alphabet.

        Parameters:
        - new_alphabet (sequence): The reduced alphabet.

        Notes:
        - If the new alphabet matches the original alphabet in size, no offsets
          are needed and `self.add_letter_indices` is reset to None.
        - Otherwise `self.add_letter_indices[i]` is the amount to add to position
          `i` of the reduced alphabet to obtain that letter's position in
          `self.alphabet`.
        - A letter that does not occur in `self.alphabet` is mapped one past the
          end of the full alphabet, i.e. to an out-of-range position.
        """
        if len(new_alphabet) == len(self.alphabet):
            self.add_letter_indices = None
            return

        # First position of every letter in the full alphabet.
        full_positions = {}
        for position, letter in enumerate(self.alphabet):
            full_positions.setdefault(letter, position)

        # Counting only letters that are missing from the full alphabet (as was
        # done before) leaves every offset at 0 for a genuine subset, so reduced
        # indices were never translated. The offset must be the distance between
        # the letter's two positions.
        out_of_range = len(self.alphabet)
        self.add_letter_indices = [
            full_positions.get(letter, out_of_range) - reduced_position
            for reduced_position, letter in enumerate(new_alphabet)
        ]

    def normalize(self, max_value):
        """
        Marks the n-gram frequencies as normalized.

        The base implementation only guards against normalizing twice; it does
        not touch any frequency data itself.

        Parameters:
        - max_value (float): The maximum value for normalization (unused here).

        Raises:
        - Exception: If the frequencies have already been normalized.

        Notes:
        - Sets `self.is_normalized` to True after normalization.
        """
        if self.is_normalized:
            raise Exception("This Gram object has already been normalized!")
        self.is_normalized = True
@@ -0,0 +1,38 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ from enum import Enum
17
+
18
class GramsType(Enum):
    """
    Enumeration of the n-gram orders known to the library.

    Every member's value is the number of characters in one gram, with 0
    reserved for the invalid / uninitialized case:

    - Undefined  -> 0 (invalid or uninitialized type)
    - Unigrams   -> 1 (single characters)
    - Bigrams    -> 2 (pairs of characters)
    - Trigrams   -> 3 (triplets of characters)
    - Tetragrams -> 4 (quadruplets of characters)
    - Pentagrams -> 5 (quintuplets of characters)
    - Hexagrams  -> 6 (sextuplets of characters)
    """

    # Members are listed in ascending gram size, so unpacking a range assigns
    # each one its own gram size as value.
    (
        Undefined,
        Unigrams,
        Bigrams,
        Trigrams,
        Tetragrams,
        Pentagrams,
        Hexagrams,
    ) = range(7)
@@ -0,0 +1,130 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
class Hexagrams(Grams):
    """
    Hexagram (6-gram) language statistics backed by a six-axis frequency table.

    The table is indexed as `self.frequencies[a, b, c, d, e, f]`, where the six
    indices are integer positions of consecutive symbols in `self.alphabet`.
    """

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Hexagrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the hexagram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing hexagram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies: The table returned by `load_frequencies(6)`
          (presumably a 6D np.ndarray -- it is used with `.size` and six-index lookups).
        - self.alphabet: The alphabet reported by the statistics file.
        - self.max_value (float): The maximum value in the frequencies array, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(6)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on hexagram frequencies.

        Parameters:
        - text (sequence of int): The text to analyze, given as symbol indices
          (the elements are range-checked against the alphabet length and used
          to index the frequency table, so they must be integers).

        Returns:
        - float: The summed frequency values of all counted hexagrams divided by
          the number of hexagram positions (`len(text) - 5`). Returns 0.0 if the
          text length is less than 6.

        Notes:
        - Skips hexagrams containing indices outside the defined alphabet.
        - If `add_letter_indices` is set, each index is shifted by its table entry
          before the lookup. Indices that fall outside that table are skipped too.
        """
        if len(text) < 6:
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)
        offsets = self.add_letter_indices
        end = len(text) - 5

        for i in range(end):
            window = tuple(text[i:i + 6])

            if offsets:
                # The offsets are a list (see Grams.reduce_alphabet), so they
                # are indexed, not read with dict-style .get(). Range-check
                # first: a negative index would wrap around to the end of the
                # list and an index past the end would raise IndexError.
                table_length = len(offsets)
                if not all(0 <= index < table_length for index in window):
                    continue
                window = tuple(index + offsets[index] for index in window)

            if all(0 <= index < alphabet_length for index in window):
                value += self.frequencies[window]

        # Skipped hexagrams still count in the divisor, as before.
        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (hexagrams in this case).

        Returns:
        - int: The size of the grams (always 6 for hexagrams).
        """
        return 6

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Hexagrams).
        """
        return GramsType.Hexagrams

    def normalize(self, max_value):
        """
        Normalizes the hexagram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Raises:
        - Exception: If this object has already been normalized (raised by `Grams.normalize`).

        Notes:
        - Every entry `x` in the alphabet-sized part of the table is replaced by
          `(self.max_value * max_value) / x`.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        size = len(self.alphabet)
        # Six nested Python loops mean len(alphabet)**6 interpreter-level
        # iterations. Divide one first-axis plane at a time instead: the work
        # runs inside NumPy while the temporary stays one plane in size.
        block = self.frequencies[:size, :size, :size, :size, :size, :size]
        for plane in block:
            # `plane` is a view, so this writes straight into self.frequencies.
            plane[...] = adjust_value / plane
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')