LanguageStatisticsLibPy 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,313 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import os
17
+ import gzip
18
+ from enum import Enum
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.Unigrams import Unigrams
21
+ from languagestatisticslibpy.Bigrams import Bigrams
22
+ from languagestatisticslibpy.Trigrams import Trigrams
23
+ from languagestatisticslibpy.Tetragrams import Tetragrams
24
+ from languagestatisticslibpy.Pentagrams import Pentagrams
25
+ from languagestatisticslibpy.Hexagrams import Hexagrams
26
+ from languagestatisticslibpy.WordTree import WordTree
27
+
28
class HandlingOfUnknownSymbols(Enum):
    """
    Strategy for dealing with symbols that are not part of the target alphabet
    when mapping between text space and number space.

    Members:
    - REMOVE: silently drop unknown symbols from the output.
    - REPLACE: substitute unknown symbols with a caller-supplied placeholder
      (a character when mapping to text, a number when mapping to numbers).
    """
    REMOVE = 0
    REPLACE = 1
38
+
39
+
40
class LanguageStatistics:
    """
    Provides utilities for handling language-related statistics, such as unigram frequencies,
    alphabet definitions, and mapping between text and number spaces.

    Attributes:
    - supported_languages_codes (list): ISO language codes for supported languages.
    - supported_languages (list): Language names, index-aligned with `supported_languages_codes`.
    - unigrams (dict): Maps language codes to per-letter relative frequencies,
      index-aligned with that language's alphabet.
    - alphabets (dict): Maps language codes to their (uppercase) alphabets.

    NOTE(review): the tables are not fully consistent with each other —
    `unigrams` has entries for languages not in `supported_languages_codes`
    ("eo", "da", "is", "fi") and lacks "hu" and "el", which are listed as
    supported; `alphabets` does not cover every `unigrams` language either.
    Confirm whether this asymmetry is intentional.
    """

    # Position in this list is the numeric language id used by
    # language_code() / language_id() below.
    supported_languages_codes = [
        "en", "de", "es", "fr", "it", "hu", "ru", "cs", "el", "la", "nl", "sv", "pt", "pl", "tr"
    ]

    # Human-readable names, index-aligned with supported_languages_codes.
    supported_languages = [
        "English", "German", "Spanish", "French", "Italian", "Hungarian",
        "Russian", "Czech", "Greek", "Latin", "Dutch", "Swedish",
        "Portuguese", "Polish", "Turkish"
    ]

    # Relative letter frequencies per language; index i corresponds to the
    # i-th letter of that language's alphabet.
    unigrams = {
        # Source: Wikipedia
        "en": [ 0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, 0.06094, 0.06966, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, 0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, 0.00978, 0.0236, 0.0015, 0.01974, 0.00074], # English
        "fr": [ 0.07636, 0.00901, 0.0326, 0.03669, 0.14715, 0.01066, 0.00866, 0.00737, 0.07529, 0.00613, 0.00049, 0.05456, 0.02968, 0.07095, 0.05796, 0.02521, 0.01362, 0.06693, 0.07948, 0.07244, 0.06311, 0.01838, 0.00074, 0.00427, 0.00128, 0.00326], # French
        "de": [ 0.06516, 0.01886, 0.02732, 0.05076, 0.16396, 0.01656, 0.03009, 0.04577, 0.0655, 0.00268, 0.01417, 0.03437, 0.02534, 0.09776, 0.02594, 0.0067, 0.00018, 0.07003, 0.0727, 0.06154, 0.04166, 0.00846, 0.01921, 0.00034, 0.00039, 0.01134], # German
        "es": [ 0.11525, 0.02215, 0.04019, 0.0501, 0.12181, 0.00692, 0.01768, 0.00703, 0.06247, 0.00493, 0.00011, 0.04967, 0.03157, 0.06712, 0.08683, 0.0251, 0.00877, 0.06871, 0.07977, 0.04632, 0.02927, 0.01138, 0.00017, 0.00215, 0.01008, 0.00467], # Spanish
        "pt": [ 0.14634, 0.01043, 0.03882, 0.04992, 0.1257, 0.01023, 0.01303, 0.00781, 0.06186, 0.00397, 0.00015, 0.02779, 0.04738, 0.04446, 0.09735, 0.02523, 0.01204, 0.0653, 0.06805, 0.04336, 0.03639, 0.01575, 0.00037, 0.00253, 6e-005, 0.0047], # Portuguese
        "eo": [ 0.12117, 0.0098, 0.00776, 0.03044, 0.08995, 0.01037, 0.01171, 0.00384, 0.10012, 0.03501, 0.04163, 0.06104, 0.02994, 0.07955, 0.08779, 0.02755, 0, 0.05914, 0.06092, 0.05276, 0.03183, 0.01904, 0, 0, 0, 0.00494], # Esperanto
        "it": [ 0.11745, 0.00927, 0.04501, 0.03736, 0.11792, 0.01153, 0.01644, 0.00636, 0.10143, 0.00011, 9e-005, 0.0651, 0.02512, 0.06883, 0.09832, 0.03056, 0.00505, 0.06367, 0.04981, 0.05623, 0.03011, 0.02097, 0.00033, 3e-005, 0.0002, 0.01181], # Italian
        "tr": [ 0.1292, 0.02844, 0.01463, 0.05206, 0.09912, 0.00461, 0.01253, 0.01212, 0.096, 0.00034, 0.05683, 0.05922, 0.03752, 0.07987, 0.02976, 0.00886, 0, 0.07722, 0.03014, 0.03314, 0.03235, 0.00959, 0, 0, 0.03336, 0.015], # Turkish
        "sv": [ 0.09383, 0.01535, 0.01486, 0.04702, 0.10149, 0.02027, 0.02862, 0.0209, 0.05817, 0.00614, 0.0314, 0.05275, 0.03471, 0.08542, 0.04482, 0.01839, 0.0002, 0.08431, 0.0659, 0.07691, 0.01919, 0.02415, 0.00142, 0.00159, 0.00708, 0.0007], # Swedish
        "pl": [ 0.10503, 0.0174, 0.03895, 0.03725, 0.07352, 0.00143, 0.01731, 0.01015, 0.08328, 0.01836, 0.02753, 0.02564, 0.02515, 0.06237, 0.06667, 0.02445, 0, 0.05243, 0.05224, 0.02475, 0.02062, 0.00012, 0.05813, 4e-005, 0.03206, 0.04852], # Polish
        "nl": [ 0.07486, 0.01584, 0.01242, 0.05933, 0.1891, 0.00805, 0.03403, 0.0238, 0.06499, 0.0146, 0.02248, 0.03568, 0.02213, 0.10032, 0.06063, 0.0157, 9e-005, 0.06411, 0.0373, 0.0679, 0.0199, 0.0285, 0.0152, 0.00036, 0.00035, 0.0139], # Dutch
        "da": [ 0.06025, 0.02, 0.00565, 0.05858, 0.15453, 0.02406, 0.04077, 0.01621, 0.06, 0.0073, 0.03395, 0.05229, 0.03237, 0.0724, 0.04636, 0.01756, 7e-005, 0.08956, 0.05805, 0.06862, 0.01979, 0.02332, 0.00069, 0.00028, 0.00698, 0.00034], # Danish
        "is": [ 0.1011, 0.01043, 0, 0.01575, 0.06418, 0.03013, 0.04241, 0.01871, 0.07578, 0.01144, 0.03314, 0.04532, 0.04041, 0.07711, 0.02166, 0.00789, 0, 0.08581, 0.0563, 0.04953, 0.04562, 0.02437, 0, 0.00046, 0.009, 0], # Icelandic
        "fi": [ 0.12217, 0.00281, 0.00281, 0.01043, 0.07968, 0.00194, 0.00392, 0.01851, 0.10817, 0.02042, 0.04973, 0.05761, 0.03202, 0.08826, 0.05614, 0.01842, 0.00013, 0.02872, 0.07862, 0.0875, 0.05008, 0.0225, 0.00094, 0.00031, 0.01745, 0.00051], # Finnish
        "cs": [ 0.08421, 0.00822, 0.0074, 0.03475, 0.07562, 0.00084, 0.00092, 0.01356, 0.06073, 0.01433, 0.02894, 0.03802, 0.02446, 0.06468, 0.06695, 0.01906, 1e-005, 0.04799, 0.05212, 0.05727, 0.0216, 0.05344, 0.00016, 0.00027, 0.01043, 0.01503], # Czech
        # Source: http://practicalcryptography.com/cryptanalysis/letter-frequencies-various-languages/
        "ru": [ 0.0804, 0.0155, 0.0475, 0.0188, 0.0295, 0.0821, 0.0022, 0.008, 0.0161, 0.0798, 0.0136, 0.0349, 0.0432, 0.0311, 0.0672, 0.1061, 0.0282, 0.0538, 0.0571, 0.0583, 0.0228, 0.0041, 0.0102, 0.0058, 0.0123, 0.0055, 0.0034, 0.0003, 0.0191, 0.0139, 0.0031, 0.0063, 0.02 ], # Russian
        # Source: https://everything2.com/title/Letter+frequency+in+several+languages
        "la": [ 0.072, 0.012, 0.033, 0.017, 0.092, 0.009, 0.014, 0.005, 0.101, 0, 0, 0.021, 0.034, 0.06, 0.044, 0.03, 0.013, 0.068, 0.068, 0.072, 0.074, 0.007, 0, 0.006, 0, 0 ], # Latin
    }

    # Uppercase alphabet per language code; letter order matches the
    # corresponding unigrams entry.
    alphabets = {
        "en": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # English
        "de": "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜß", # German
        "fr": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # French
        "es": "ABCDEFGHIJKLMNOPQRSTUVWXYZÑ", # Spanish
        "it": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Italian
        "hu": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Hungarian
        "ru": "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", # Russian
        "cs": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Czech ("cs" is Czech per supported_languages; the old comment said Slovak)
        "la": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Latin
        "el": "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ", # Greek
        "nl": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Dutch
        "sv": "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ", # Swedish
        "pt": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Portuguese
        "pl": "AĄBCĆDEĘFGHIJKLŁMNŃOÓPQRSŚTUVWXYZŹŻ", # Polish
        "tr": "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ", # Turkish
    }
102
+
103
+ @staticmethod
104
+ def language_code(language_id):
105
+ """
106
+ Retrieves the ISO language code for a given language ID.
107
+
108
+ Parameters:
109
+ - language_id (int): The index of the language in `supported_languages_codes`.
110
+
111
+ Returns:
112
+ - str: The language code, or an empty string if the index is invalid.
113
+ """
114
+ try:
115
+ return LanguageStatistics.supported_languages_codes[language_id]
116
+ except IndexError:
117
+ return ""
118
+
119
+ @staticmethod
120
+ def language_id(language_code):
121
+ """
122
+ Retrieves the ID for a given language code.
123
+
124
+ Parameters:
125
+ - language_code (str): The language code.
126
+
127
+ Returns:
128
+ - int: The index of the language in `supported_languages_codes`.
129
+ """
130
+ return LanguageStatistics.supported_languages_codes.index(language_code.lower())
131
+
132
+ @staticmethod
133
+ def create_grams(language_code, language_statistics_directory, grams_type, use_spaces):
134
+ """
135
+ Creates a grams object of the specified type.
136
+
137
+ Parameters:
138
+ - language_code (str): The language code.
139
+ - language_statistics_directory (str): Path to the language statistics directory.
140
+ - grams_type (GramsType): The type of grams to create.
141
+ - use_spaces (bool): Whether to include spaces in the analysis.
142
+
143
+ Returns:
144
+ - Grams: The created grams object.
145
+
146
+ Raises:
147
+ - ValueError: If the grams type is unsupported.
148
+ """
149
+ if grams_type == GramsType.Unigrams:
150
+ return Unigrams(language_code, language_statistics_directory, use_spaces)
151
+ elif grams_type == GramsType.Bigrams:
152
+ return Bigrams(language_code, language_statistics_directory, use_spaces)
153
+ elif grams_type == GramsType.Trigrams:
154
+ return Trigrams(language_code, language_statistics_directory, use_spaces)
155
+ elif grams_type == GramsType.Tetragrams:
156
+ return Tetragrams(language_code, language_statistics_directory, use_spaces)
157
+ elif grams_type == GramsType.Pentagrams:
158
+ return Pentagrams(language_code, language_statistics_directory, use_spaces)
159
+ elif grams_type == GramsType.Hexagrams:
160
+ return Hexagrams(language_code, language_statistics_directory, use_spaces)
161
+ else:
162
+ raise ValueError(f"Unsupported grams type: {grams_type}")
163
+
164
+ @staticmethod
165
+ def create_grams_by_size(grams_size, language, language_statistics_directory, use_spaces=False):
166
+ """
167
+ Creates a grams object for the specified size (e.g., unigrams, bigrams, etc.).
168
+
169
+ Parameters:
170
+ - grams_size (int): The size of the grams to create (e.g., 1 for unigrams, 2 for bigrams).
171
+ - language (str): The language code.
172
+ - language_statistics_directory (str): Path to the language statistics directory.
173
+ - use_spaces (bool): Whether to include spaces in the analysis (default: False).
174
+
175
+ Returns:
176
+ - Grams: The created grams object of the specified size.
177
+
178
+ Raises:
179
+ - ValueError: If the grams size is not supported.
180
+ """
181
+ grams_type = LanguageStatistics.get_grams_type_by_length(grams_size)
182
+ return LanguageStatistics.create_grams(language, language_statistics_directory, grams_type, use_spaces)
183
+
184
+ @staticmethod
185
+ def get_grams_type_by_length(length):
186
+ """
187
+ Retrieves the GramsType corresponding to a specific length.
188
+
189
+ Parameters:
190
+ - length (int): The size of the grams (e.g., 1 for unigrams, 2 for bigrams).
191
+
192
+ Returns:
193
+ - GramsType: The corresponding GramsType for the given length.
194
+
195
+ Raises:
196
+ - ValueError: If the length does not correspond to a valid GramsType.
197
+ """
198
+ try:
199
+ return GramsType(length)
200
+ except ValueError:
201
+ raise ValueError(f"No GramsType found for length: {length}")
202
+
203
+
204
+ @staticmethod
205
+ def alphabet(language, use_spaces=False):
206
+ """
207
+ Retrieves the alphabet for a given language.
208
+
209
+ Parameters:
210
+ - language (str): The language code.
211
+ - use_spaces (bool): Whether to include spaces in the alphabet.
212
+
213
+ Returns:
214
+ - str: The alphabet for the language, or None if not found.
215
+ """
216
+ alphabet = LanguageStatistics.alphabets.get(language, None)
217
+ if use_spaces and alphabet:
218
+ return alphabet + " "
219
+ return alphabet
220
+
221
+ @staticmethod
222
+ def calculate_ioc(plaintext):
223
+ """
224
+ Calculates the Index of Coincidence (IoC) for a given plaintext.
225
+
226
+ Parameters:
227
+ - plaintext (str): The input text.
228
+
229
+ Returns:
230
+ - float: The IoC of the text.
231
+ """
232
+ count_chars = {}
233
+ for c in plaintext:
234
+ count_chars[c] = count_chars.get(c, 0) + 1
235
+ value = sum(cnt * (cnt - 1) for cnt in count_chars.values())
236
+ N = len(plaintext)
237
+ if N <= 1:
238
+ return 0
239
+ return value / (N * (N - 1))
240
+
241
+ @staticmethod
242
+ def map_numbers_into_text_space(numbers, alphabet, handling=HandlingOfUnknownSymbols.REMOVE, replace_character='?'):
243
+ """
244
+ Maps a list of numbers into text using a given alphabet.
245
+
246
+ Parameters:
247
+ - numbers (list): List of numbers to map.
248
+ - alphabet (str): The alphabet for mapping.
249
+ - handling (HandlingOfUnknownSymbols): How to handle unknown numbers.
250
+ - replace_character (str): Replacement character for unknown numbers.
251
+
252
+ Returns:
253
+ - str: The resulting string.
254
+ """
255
+ string = ""
256
+ if handling == HandlingOfUnknownSymbols.REMOVE:
257
+ for n in numbers:
258
+ if 0 <= n < len(alphabet):
259
+ string += alphabet[n]
260
+ elif handling == HandlingOfUnknownSymbols.REPLACE:
261
+ for n in numbers:
262
+ if 0 <= n < len(alphabet):
263
+ string += alphabet[n]
264
+ else:
265
+ string += replace_character
266
+ else:
267
+ raise ValueError(f"Invalid handling mode: {handling}")
268
+ return string
269
+
270
+ @staticmethod
271
+ def map_text_into_number_space(text, alphabet, handling=HandlingOfUnknownSymbols.REMOVE, replace_number=-1):
272
+ """
273
+ Maps text into a list of numbers using a given alphabet.
274
+
275
+ Parameters:
276
+ - text (str): The input text.
277
+ - alphabet (str): The alphabet for mapping.
278
+ - handling (HandlingOfUnknownSymbols): How to handle unknown characters.
279
+ - replace_number (int): Replacement number for unknown characters.
280
+
281
+ Returns:
282
+ - list: The resulting list of numbers.
283
+ """
284
+ numlist = []
285
+ if handling == HandlingOfUnknownSymbols.REMOVE:
286
+ for c in text:
287
+ if c in alphabet:
288
+ numlist.append(alphabet.index(c))
289
+ elif handling == HandlingOfUnknownSymbols.REPLACE:
290
+ for c in text:
291
+ if c in alphabet:
292
+ numlist.append(alphabet.index(c))
293
+ else:
294
+ numlist.append(replace_number)
295
+ else:
296
+ raise ValueError(f"Invalid handling mode: {handling}")
297
+ return numlist
298
+
299
+ @staticmethod
300
+ def load_word_tree(language_code, language_statistics_directory):
301
+ """
302
+ Loads a WordTree for a specific language.
303
+
304
+ Parameters:
305
+ - language_code (str): The language code.
306
+ - language_statistics_directory (str): Path to the language statistics directory.
307
+
308
+ Returns:
309
+ - WordTree: The loaded WordTree object.
310
+ """
311
+ filename = os.path.join(language_statistics_directory, f"Dictionary_{language_code}.dic")
312
+ with gzip.open(filename, 'rb') as filestream:
313
+ return WordTree.deserialize(filestream)
@@ -0,0 +1,108 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import struct
17
+ import numpy as np
18
+ import gzip
19
+
20
+ import gzip
21
+ import struct
22
+ import numpy as np
23
+
24
class LanguageStatisticsFile:
    """
    Loader for gzip-compressed CrypTool language-statistics files.

    Binary layout after gzip decompression (little-endian):
    1. 4-byte magic number "CTLS".
    2. 1-byte length prefix + UTF-8 language code.
    3. 32-bit signed gram length.
    4. 1-byte length prefix + UTF-8 alphabet.
    5. len(alphabet) ** gram_length 32-bit floats (the frequencies).

    Attributes:
    - FILE_FORMAT_MAGIC_NUMBER (str): Expected magic number of a valid file.
    - file_path (str): Path to the statistics file.
    - alphabet (str): Alphabet read from the file (set by `load_frequencies`).
    - language_code (str): Language code read from the file (set by `load_frequencies`).
    """

    FILE_FORMAT_MAGIC_NUMBER = "CTLS"

    def __init__(self, file_path):
        """
        Initializes the loader; nothing is read until `load_frequencies` is called.

        Parameters:
        - file_path (str): The path to the language statistics file.
        """
        self.file_path = file_path
        self.alphabet = ''
        self.language_code = ''

    def load_frequencies(self, array_dimensions):
        """
        Loads the frequency data from the language statistics file.

        Parameters:
        - array_dimensions (int): Expected dimensionality of the frequency
          array (e.g., 1 for unigrams, 2 for bigrams).

        Returns:
        - np.ndarray: A writable float32 array with `array_dimensions` axes,
          each of length len(alphabet).

        Raises:
        - Exception: If the magic number is wrong, the gram length does not
          match `array_dimensions`, or the frequency section is truncated.
        """
        with gzip.open(self.file_path, 'rb') as file:
            # Validate the file format by checking the magic number.
            magic_number = file.read(4).decode('utf-8')
            if magic_number != self.FILE_FORMAT_MAGIC_NUMBER:
                raise Exception("File does not start with the expected magic number for language statistics.")

            # Read the language code (length-prefixed string).
            language_code_length = file.read(1)[0]
            self.language_code = file.read(language_code_length).decode('utf-8')

            # Read the gram length (32-bit signed integer) and ensure it
            # matches the dimensionality the caller expects.
            gram_length = struct.unpack('<i', file.read(4))[0]
            if gram_length != array_dimensions:
                raise Exception("Gram length of statistics file differs from required dimensions of frequency array.")

            # Read the alphabet (length-prefixed string). The length is kept
            # as an instance attribute for backward compatibility.
            self.alphabet_length = file.read(1)[0]
            self.alphabet = file.read(self.alphabet_length).decode('utf-8')

            # Read the frequency payload: alphabet_length ** gram_length
            # 32-bit floats. Fail loudly on a truncated file instead of
            # letting numpy raise a confusing buffer-size error later.
            frequency_entries = self.alphabet_length ** gram_length
            expected_bytes = frequency_entries * 4  # 4 bytes per float32
            frequency_data = file.read(expected_bytes)
            if len(frequency_data) != expected_bytes:
                raise Exception("Language statistics file is truncated: not enough frequency data.")

            # reshape to (n,) * dims also covers the 1-D case, so no branch
            # is needed; .copy() makes the array writable (np.frombuffer
            # returns a read-only view of the bytes).
            shape = (self.alphabet_length,) * array_dimensions
            return np.frombuffer(frequency_data, dtype=np.float32).reshape(shape).copy()
@@ -0,0 +1,89 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+
17
+ class Node:
18
+ """
19
+ Represents a single node in a tree structure for storing characters.
20
+
21
+ Attributes:
22
+ - WordEndSymbol (str): A constant indicating the end of a word in the tree (default: `chr(1)`).
23
+ - TerminationSymbol (str): A constant indicating the end of a tree branch (default: `chr(0)`).
24
+ - value (str or None): The character or value stored in this node.
25
+ - word_ends_here (bool): Whether this node marks the end of a word.
26
+ - child_nodes (list): A list of child nodes connected to this node.
27
+ """
28
+
29
+ WordEndSymbol = chr(1) # Constant for the symbol indicating the end of a word
30
+ TerminationSymbol = chr(0) # Constant for the symbol indicating the end of the tree
31
+
32
+ def __init__(self, value=None):
33
+ """
34
+ Initializes a Node object.
35
+
36
+ Parameters:
37
+ - value (str or None): The character or value to be stored in this node (default: None).
38
+
39
+ Initializes:
40
+ - self.value (str or None): The value of this node.
41
+ - self.word_ends_here (bool): Set to False initially, indicating that no word ends here.
42
+ - self.child_nodes (list): An empty list to hold child nodes.
43
+ """
44
+ self.value = value
45
+ self.word_ends_here = False
46
+ self.child_nodes = []
47
+
48
+ def __eq__(self, other):
49
+ """
50
+ Compares two Node objects for equality.
51
+
52
+ Parameters:
53
+ - other (Node): The node to compare with.
54
+
55
+ Returns:
56
+ - bool: True if both nodes are equal, False otherwise.
57
+
58
+ Notes:
59
+ - Two nodes are considered equal if:
60
+ - Their values are the same.
61
+ - Their `word_ends_here` flags are the same.
62
+ - Their child nodes are identical in value and order.
63
+ """
64
+ if not isinstance(other, Node):
65
+ return False
66
+ if self.value != other.value or self.word_ends_here != other.word_ends_here:
67
+ return False
68
+ if len(self.child_nodes) != len(other.child_nodes):
69
+ return False
70
+ for i in range(len(self.child_nodes)):
71
+ if self.child_nodes[i] != other.child_nodes[i]:
72
+ return False
73
+ return True
74
+
75
+ def __hash__(self):
76
+ """
77
+ Calculates a hash value for the Node.
78
+
79
+ Returns:
80
+ - int: A hash value based on the node's value, `word_ends_here` status, and child nodes.
81
+
82
+ Notes:
83
+ - This method allows the node to be used in hash-based collections (e.g., sets, dictionaries).
84
+ """
85
+ hash_value = hash(self.value)
86
+ hash_value = hash_value * 31 + hash(self.word_ends_here)
87
+ for child_node in self.child_nodes:
88
+ hash_value = hash_value * 31 + hash(child_node)
89
+ return hash_value
@@ -0,0 +1,128 @@
1
+ '''
2
+ Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ '''
16
+ import numpy as np
17
+ import os
18
+ from languagestatisticslibpy.Grams import Grams
19
+ from languagestatisticslibpy.GramsType import GramsType
20
+ from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
21
+
22
class Pentagrams(Grams):
    """
    Pentagram (5-gram) language statistics, loaded from a CrypTool
    language-statistics file and used for scoring candidate plaintexts.
    """

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Pentagrams class via the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the pentagram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing pentagram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 5D array of pentagram frequencies.
        - self.alphabet (str): The alphabet read from the statistics file.
        - self.max_value (float): Maximum frequency, or -inf for an empty array.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(5)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the average pentagram cost of a text given in number space.

        Parameters:
        - text (sequence of int): Symbol indices to analyze.

        Returns:
        - float: Sum of in-alphabet pentagram frequencies divided by the
          number of pentagram positions; 0.0 if fewer than 5 symbols.

        Notes:
        - Pentagrams containing any out-of-alphabet index are skipped
          (they contribute 0 but still count toward the divisor).
        - `add_letter_indices` (inherited from Grams) is applied as an index
          offset per symbol when set — presumably remapping of special
          symbols; TODO confirm against Grams.
        """
        if len(text) < 5:
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)
        end = len(text) - 4
        # Hoist attribute lookups out of the hot loop.
        add = self.add_letter_indices
        frequencies = self.frequencies

        for i in range(end):
            a, b, c, d, e = text[i:i + 5]

            if add:
                a += add.get(a, 0)
                b += add.get(b, 0)
                c += add.get(c, 0)
                d += add.get(d, 0)
                e += add.get(e, 0)

            if 0 <= a < alphabet_length and 0 <= b < alphabet_length and \
               0 <= c < alphabet_length and 0 <= d < alphabet_length and \
               0 <= e < alphabet_length:
                value += frequencies[a, b, c, d, e]

        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed.

        Returns:
        - int: Always 5 for pentagrams.
        """
        return 5

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: GramsType.Pentagrams.
        """
        return GramsType.Pentagrams

    def normalize(self, max_value):
        """
        Normalizes the pentagram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Replaces each frequency f with (self.max_value * max_value) / f,
          then updates `self.max_value` from the rescaled array.
        - The division is done with a single in-place numpy ufunc call, which
          is equivalent to the naive five-level nested Python loop (each cell
          is read once and overwritten once) but runs in native code instead
          of O(len(alphabet)**5) Python iterations; identity and dtype of
          `self.frequencies` are preserved via `out=`.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        np.divide(adjust_value, self.frequencies, out=self.frequencies)
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')