LanguageStatisticsLibPy 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- languagestatisticslibpy/Bigrams.py +124 -0
- languagestatisticslibpy/Grams.py +134 -0
- languagestatisticslibpy/GramsType.py +38 -0
- languagestatisticslibpy/Hexagrams.py +130 -0
- languagestatisticslibpy/LanguageStatistics.py +313 -0
- languagestatisticslibpy/LanguageStatisticsFile.py +108 -0
- languagestatisticslibpy/Node.py +89 -0
- languagestatisticslibpy/Pentagrams.py +128 -0
- languagestatisticslibpy/Tetragrams.py +125 -0
- languagestatisticslibpy/Trigrams.py +125 -0
- languagestatisticslibpy/Unigrams.py +110 -0
- languagestatisticslibpy/WordTree.py +162 -0
- languagestatisticslibpy/__init__.py +0 -0
- languagestatisticslibpy/test1.py +26 -0
- languagestatisticslibpy/test2.py +80 -0
- languagestatisticslibpy-1.0.3.dist-info/LICENSE +201 -0
- languagestatisticslibpy-1.0.3.dist-info/METADATA +124 -0
- languagestatisticslibpy-1.0.3.dist-info/RECORD +19 -0
- languagestatisticslibpy-1.0.3.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
import os
|
|
17
|
+
import gzip
|
|
18
|
+
from enum import Enum
|
|
19
|
+
from languagestatisticslibpy.GramsType import GramsType
|
|
20
|
+
from languagestatisticslibpy.Unigrams import Unigrams
|
|
21
|
+
from languagestatisticslibpy.Bigrams import Bigrams
|
|
22
|
+
from languagestatisticslibpy.Trigrams import Trigrams
|
|
23
|
+
from languagestatisticslibpy.Tetragrams import Tetragrams
|
|
24
|
+
from languagestatisticslibpy.Pentagrams import Pentagrams
|
|
25
|
+
from languagestatisticslibpy.Hexagrams import Hexagrams
|
|
26
|
+
from languagestatisticslibpy.WordTree import WordTree
|
|
27
|
+
|
|
28
|
+
class HandlingOfUnknownSymbols(Enum):
    """
    Enum to specify how to handle unknown symbols during text or number mapping.

    Attributes:
    - REMOVE (int): Remove unknown symbols.
    - REPLACE (int): Replace unknown symbols with a specific character or number.
    """
    REMOVE = 0
    REPLACE = 1


class LanguageStatistics:
    """
    Provides utilities for handling language-related statistics, such as unigram
    frequencies, alphabet definitions, and mapping between text and number spaces.

    Attributes:
    - supported_languages_codes (list): ISO language codes for supported languages.
    - supported_languages (list): Language names matching `supported_languages_codes` by index.
    - unigrams (dict): Language code -> list of unigram (letter) frequencies.
    - alphabets (dict): Language code -> alphabet string.
    """

    supported_languages_codes = [
        "en", "de", "es", "fr", "it", "hu", "ru", "cs", "el", "la", "nl", "sv", "pt", "pl", "tr"
    ]

    supported_languages = [
        "English", "German", "Spanish", "French", "Italian", "Hungarian",
        "Russian", "Czech", "Greek", "Latin", "Dutch", "Swedish",
        "Portuguese", "Polish", "Turkish"
    ]

    # Relative letter frequencies per language, in the order of that language's alphabet.
    unigrams = {
        # Source: Wikipedia
        "en": [ 0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, 0.06094, 0.06966, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, 0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, 0.00978, 0.0236, 0.0015, 0.01974, 0.00074], # English
        "fr": [ 0.07636, 0.00901, 0.0326, 0.03669, 0.14715, 0.01066, 0.00866, 0.00737, 0.07529, 0.00613, 0.00049, 0.05456, 0.02968, 0.07095, 0.05796, 0.02521, 0.01362, 0.06693, 0.07948, 0.07244, 0.06311, 0.01838, 0.00074, 0.00427, 0.00128, 0.00326], # French
        "de": [ 0.06516, 0.01886, 0.02732, 0.05076, 0.16396, 0.01656, 0.03009, 0.04577, 0.0655, 0.00268, 0.01417, 0.03437, 0.02534, 0.09776, 0.02594, 0.0067, 0.00018, 0.07003, 0.0727, 0.06154, 0.04166, 0.00846, 0.01921, 0.00034, 0.00039, 0.01134], # German
        "es": [ 0.11525, 0.02215, 0.04019, 0.0501, 0.12181, 0.00692, 0.01768, 0.00703, 0.06247, 0.00493, 0.00011, 0.04967, 0.03157, 0.06712, 0.08683, 0.0251, 0.00877, 0.06871, 0.07977, 0.04632, 0.02927, 0.01138, 0.00017, 0.00215, 0.01008, 0.00467], # Spanish
        "pt": [ 0.14634, 0.01043, 0.03882, 0.04992, 0.1257, 0.01023, 0.01303, 0.00781, 0.06186, 0.00397, 0.00015, 0.02779, 0.04738, 0.04446, 0.09735, 0.02523, 0.01204, 0.0653, 0.06805, 0.04336, 0.03639, 0.01575, 0.00037, 0.00253, 6e-005, 0.0047], # Portuguese
        "eo": [ 0.12117, 0.0098, 0.00776, 0.03044, 0.08995, 0.01037, 0.01171, 0.00384, 0.10012, 0.03501, 0.04163, 0.06104, 0.02994, 0.07955, 0.08779, 0.02755, 0, 0.05914, 0.06092, 0.05276, 0.03183, 0.01904, 0, 0, 0, 0.00494], # Esperanto
        "it": [ 0.11745, 0.00927, 0.04501, 0.03736, 0.11792, 0.01153, 0.01644, 0.00636, 0.10143, 0.00011, 9e-005, 0.0651, 0.02512, 0.06883, 0.09832, 0.03056, 0.00505, 0.06367, 0.04981, 0.05623, 0.03011, 0.02097, 0.00033, 3e-005, 0.0002, 0.01181], # Italian
        "tr": [ 0.1292, 0.02844, 0.01463, 0.05206, 0.09912, 0.00461, 0.01253, 0.01212, 0.096, 0.00034, 0.05683, 0.05922, 0.03752, 0.07987, 0.02976, 0.00886, 0, 0.07722, 0.03014, 0.03314, 0.03235, 0.00959, 0, 0, 0.03336, 0.015], # Turkish
        "sv": [ 0.09383, 0.01535, 0.01486, 0.04702, 0.10149, 0.02027, 0.02862, 0.0209, 0.05817, 0.00614, 0.0314, 0.05275, 0.03471, 0.08542, 0.04482, 0.01839, 0.0002, 0.08431, 0.0659, 0.07691, 0.01919, 0.02415, 0.00142, 0.00159, 0.00708, 0.0007], # Swedish
        "pl": [ 0.10503, 0.0174, 0.03895, 0.03725, 0.07352, 0.00143, 0.01731, 0.01015, 0.08328, 0.01836, 0.02753, 0.02564, 0.02515, 0.06237, 0.06667, 0.02445, 0, 0.05243, 0.05224, 0.02475, 0.02062, 0.00012, 0.05813, 4e-005, 0.03206, 0.04852], # Polish
        "nl": [ 0.07486, 0.01584, 0.01242, 0.05933, 0.1891, 0.00805, 0.03403, 0.0238, 0.06499, 0.0146, 0.02248, 0.03568, 0.02213, 0.10032, 0.06063, 0.0157, 9e-005, 0.06411, 0.0373, 0.0679, 0.0199, 0.0285, 0.0152, 0.00036, 0.00035, 0.0139], # Dutch
        "da": [ 0.06025, 0.02, 0.00565, 0.05858, 0.15453, 0.02406, 0.04077, 0.01621, 0.06, 0.0073, 0.03395, 0.05229, 0.03237, 0.0724, 0.04636, 0.01756, 7e-005, 0.08956, 0.05805, 0.06862, 0.01979, 0.02332, 0.00069, 0.00028, 0.00698, 0.00034], # Danish
        "is": [ 0.1011, 0.01043, 0, 0.01575, 0.06418, 0.03013, 0.04241, 0.01871, 0.07578, 0.01144, 0.03314, 0.04532, 0.04041, 0.07711, 0.02166, 0.00789, 0, 0.08581, 0.0563, 0.04953, 0.04562, 0.02437, 0, 0.00046, 0.009, 0], # Icelandic
        "fi": [ 0.12217, 0.00281, 0.00281, 0.01043, 0.07968, 0.00194, 0.00392, 0.01851, 0.10817, 0.02042, 0.04973, 0.05761, 0.03202, 0.08826, 0.05614, 0.01842, 0.00013, 0.02872, 0.07862, 0.0875, 0.05008, 0.0225, 0.00094, 0.00031, 0.01745, 0.00051], # Finnish
        "cs": [ 0.08421, 0.00822, 0.0074, 0.03475, 0.07562, 0.00084, 0.00092, 0.01356, 0.06073, 0.01433, 0.02894, 0.03802, 0.02446, 0.06468, 0.06695, 0.01906, 1e-005, 0.04799, 0.05212, 0.05727, 0.0216, 0.05344, 0.00016, 0.00027, 0.01043, 0.01503], # Czech
        # Source: http://practicalcryptography.com/cryptanalysis/letter-frequencies-various-languages/
        "ru": [ 0.0804, 0.0155, 0.0475, 0.0188, 0.0295, 0.0821, 0.0022, 0.008, 0.0161, 0.0798, 0.0136, 0.0349, 0.0432, 0.0311, 0.0672, 0.1061, 0.0282, 0.0538, 0.0571, 0.0583, 0.0228, 0.0041, 0.0102, 0.0058, 0.0123, 0.0055, 0.0034, 0.0003, 0.0191, 0.0139, 0.0031, 0.0063, 0.02 ], # Russian
        # Source: https://everything2.com/title/Letter+frequency+in+several+languages
        "la": [ 0.072, 0.012, 0.033, 0.017, 0.092, 0.009, 0.014, 0.005, 0.101, 0, 0, 0.021, 0.034, 0.06, 0.044, 0.03, 0.013, 0.068, 0.068, 0.072, 0.074, 0.007, 0, 0.006, 0, 0 ], # Latin
    }

    alphabets = {
        "en": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # English
        "de": "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜß", # German
        "fr": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # French
        "es": "ABCDEFGHIJKLMNOPQRSTUVWXYZÑ", # Spanish
        "it": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Italian
        "hu": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Hungarian
        "ru": "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ", # Russian
        "cs": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Czech (comment previously mislabeled this entry "Slovak")
        "la": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Latin
        "el": "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ", # Greek
        "nl": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Dutch
        "sv": "ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ", # Swedish
        "pt": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", # Portuguese
        "pl": "AĄBCĆDEĘFGHIJKLŁMNŃOÓPQRSŚTUVWXYZŹŻ", # Polish
        "tr": "ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZ", # Turkish
    }

    @staticmethod
    def language_code(language_id):
        """
        Retrieves the ISO language code for a given language ID.

        Parameters:
        - language_id (int): The index of the language in `supported_languages_codes`.

        Returns:
        - str: The language code, or an empty string if the index is invalid.
        """
        # Explicit bounds check: the previous try/except IndexError silently
        # accepted negative ids through Python's wrap-around indexing
        # (e.g. -1 returned "tr" instead of "").
        codes = LanguageStatistics.supported_languages_codes
        if 0 <= language_id < len(codes):
            return codes[language_id]
        return ""

    @staticmethod
    def language_id(language_code):
        """
        Retrieves the ID for a given language code.

        Parameters:
        - language_code (str): The language code (case-insensitive).

        Returns:
        - int: The index of the language in `supported_languages_codes`.

        Raises:
        - ValueError: If the language code is not supported.
        """
        return LanguageStatistics.supported_languages_codes.index(language_code.lower())

    @staticmethod
    def create_grams(language_code, language_statistics_directory, grams_type, use_spaces):
        """
        Creates a grams object of the specified type.

        Parameters:
        - language_code (str): The language code.
        - language_statistics_directory (str): Path to the language statistics directory.
        - grams_type (GramsType): The type of grams to create.
        - use_spaces (bool): Whether to include spaces in the analysis.

        Returns:
        - Grams: The created grams object.

        Raises:
        - ValueError: If the grams type is unsupported.
        """
        # Dispatch table instead of an if/elif ladder; same constructors, same error.
        constructors = {
            GramsType.Unigrams: Unigrams,
            GramsType.Bigrams: Bigrams,
            GramsType.Trigrams: Trigrams,
            GramsType.Tetragrams: Tetragrams,
            GramsType.Pentagrams: Pentagrams,
            GramsType.Hexagrams: Hexagrams,
        }
        try:
            constructor = constructors[grams_type]
        except KeyError:
            raise ValueError(f"Unsupported grams type: {grams_type}") from None
        return constructor(language_code, language_statistics_directory, use_spaces)

    @staticmethod
    def create_grams_by_size(grams_size, language, language_statistics_directory, use_spaces=False):
        """
        Creates a grams object for the specified size (e.g., unigrams, bigrams, etc.).

        Parameters:
        - grams_size (int): The size of the grams to create (e.g., 1 for unigrams, 2 for bigrams).
        - language (str): The language code.
        - language_statistics_directory (str): Path to the language statistics directory.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).

        Returns:
        - Grams: The created grams object of the specified size.

        Raises:
        - ValueError: If the grams size is not supported.
        """
        grams_type = LanguageStatistics.get_grams_type_by_length(grams_size)
        return LanguageStatistics.create_grams(language, language_statistics_directory, grams_type, use_spaces)

    @staticmethod
    def get_grams_type_by_length(length):
        """
        Retrieves the GramsType corresponding to a specific length.

        Parameters:
        - length (int): The size of the grams (e.g., 1 for unigrams, 2 for bigrams).

        Returns:
        - GramsType: The corresponding GramsType for the given length.

        Raises:
        - ValueError: If the length does not correspond to a valid GramsType.
        """
        try:
            # GramsType values are the gram lengths themselves, so the enum
            # constructor is the lookup.
            return GramsType(length)
        except ValueError:
            raise ValueError(f"No GramsType found for length: {length}") from None

    @staticmethod
    def alphabet(language, use_spaces=False):
        """
        Retrieves the alphabet for a given language.

        Parameters:
        - language (str): The language code.
        - use_spaces (bool): Whether to append a space character to the alphabet.

        Returns:
        - str: The alphabet for the language, or None if the language is unknown.
        """
        alphabet = LanguageStatistics.alphabets.get(language, None)
        if use_spaces and alphabet:
            return alphabet + " "
        return alphabet

    @staticmethod
    def calculate_ioc(plaintext):
        """
        Calculates the Index of Coincidence (IoC) for a given plaintext.

        Parameters:
        - plaintext (str): The input text.

        Returns:
        - float: The IoC of the text, i.e. sum(c_i * (c_i - 1)) / (N * (N - 1))
          over the symbol counts c_i. Returns 0 for texts shorter than 2 symbols.
        """
        N = len(plaintext)
        # Guard first: the IoC denominator is zero for N <= 1, and counting
        # would be wasted work.
        if N <= 1:
            return 0
        count_chars = {}
        for c in plaintext:
            count_chars[c] = count_chars.get(c, 0) + 1
        value = sum(cnt * (cnt - 1) for cnt in count_chars.values())
        return value / (N * (N - 1))

    @staticmethod
    def map_numbers_into_text_space(numbers, alphabet, handling=HandlingOfUnknownSymbols.REMOVE, replace_character='?'):
        """
        Maps a list of numbers into text using a given alphabet.

        Parameters:
        - numbers (list): List of numbers to map (indices into `alphabet`).
        - alphabet (str): The alphabet for mapping.
        - handling (HandlingOfUnknownSymbols): How to handle out-of-range numbers.
        - replace_character (str): Replacement character for out-of-range numbers
          (used with HandlingOfUnknownSymbols.REPLACE).

        Returns:
        - str: The resulting string.

        Raises:
        - ValueError: If the handling mode is invalid.
        """
        # Collect into a list and join once; the previous `string +=` loop was
        # quadratic in the worst case.
        chars = []
        alphabet_length = len(alphabet)
        if handling == HandlingOfUnknownSymbols.REMOVE:
            for n in numbers:
                if 0 <= n < alphabet_length:
                    chars.append(alphabet[n])
        elif handling == HandlingOfUnknownSymbols.REPLACE:
            for n in numbers:
                if 0 <= n < alphabet_length:
                    chars.append(alphabet[n])
                else:
                    chars.append(replace_character)
        else:
            raise ValueError(f"Invalid handling mode: {handling}")
        return "".join(chars)

    @staticmethod
    def map_text_into_number_space(text, alphabet, handling=HandlingOfUnknownSymbols.REMOVE, replace_number=-1):
        """
        Maps text into a list of numbers using a given alphabet.

        Parameters:
        - text (str): The input text.
        - alphabet (str): The alphabet for mapping.
        - handling (HandlingOfUnknownSymbols): How to handle unknown characters.
        - replace_number (int): Replacement number for unknown characters
          (used with HandlingOfUnknownSymbols.REPLACE).

        Returns:
        - list: The resulting list of numbers.

        Raises:
        - ValueError: If the handling mode is invalid.
        """
        # Precompute a char -> index map once instead of scanning the alphabet
        # twice per character (`in` + `.index`). setdefault keeps the FIRST
        # occurrence's index, matching str.index for duplicated symbols.
        index_of = {}
        for i, symbol in enumerate(alphabet):
            index_of.setdefault(symbol, i)

        numlist = []
        if handling == HandlingOfUnknownSymbols.REMOVE:
            for c in text:
                if c in index_of:
                    numlist.append(index_of[c])
        elif handling == HandlingOfUnknownSymbols.REPLACE:
            for c in text:
                numlist.append(index_of.get(c, replace_number))
        else:
            raise ValueError(f"Invalid handling mode: {handling}")
        return numlist

    @staticmethod
    def load_word_tree(language_code, language_statistics_directory):
        """
        Loads a WordTree for a specific language.

        Parameters:
        - language_code (str): The language code.
        - language_statistics_directory (str): Path to the language statistics directory.

        Returns:
        - WordTree: The loaded WordTree object.
        """
        # Dictionary files are gzip-compressed and named "Dictionary_<code>.dic".
        filename = os.path.join(language_statistics_directory, f"Dictionary_{language_code}.dic")
        with gzip.open(filename, 'rb') as filestream:
            return WordTree.deserialize(filestream)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
import struct
|
|
17
|
+
import numpy as np
|
|
18
|
+
import gzip
|
|
19
|
+
|
|
20
|
+
import gzip
|
|
21
|
+
import struct
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
class LanguageStatisticsFile:
    """
    Handles loading of n-gram language statistics from a gzip-compressed file.

    Attributes:
    - FILE_FORMAT_MAGIC_NUMBER (str): The expected magic number identifying valid files.
    - file_path (str): The path to the language statistics file.
    - alphabet (str): The alphabet read from the file (set by load_frequencies).
    - language_code (str): The language code read from the file (set by load_frequencies).
    """

    # First four bytes of every valid statistics file.
    FILE_FORMAT_MAGIC_NUMBER = "CTLS"

    def __init__(self, file_path):
        """
        Initializes the LanguageStatisticsFile.

        Parameters:
        - file_path (str): The path to the language statistics file.

        Initializes:
        - self.file_path (str): Stores the path to the file.
        - self.alphabet (str): Initially empty, set after file loading.
        - self.language_code (str): Initially empty, set after file loading.
        """
        self.file_path = file_path
        self.alphabet = ''
        self.language_code = ''

    def load_frequencies(self, array_dimensions):
        """
        Loads the frequency data from the language statistics file.

        File layout (little-endian):
        magic "CTLS" | 1-byte language-code length | language code (utf-8) |
        int32 gram length | 1-byte alphabet length | alphabet (utf-8) |
        alphabet_length ** gram_length float32 values.

        Parameters:
        - array_dimensions (int): The dimensionality of the frequency array
          (e.g., 1 for unigrams, 2 for bigrams).

        Returns:
        - np.ndarray: A writable numpy array of shape
          (alphabet_length,) * array_dimensions containing the frequency data.

        Raises:
        - Exception: If the magic number is wrong, the gram length does not
          match `array_dimensions`, or the frequency payload is truncated.
        """
        with gzip.open(self.file_path, 'rb') as file:
            # Validate the file format by checking the magic number.
            magic_number = file.read(4).decode('utf-8')
            if magic_number != self.FILE_FORMAT_MAGIC_NUMBER:
                raise Exception("File does not start with the expected magic number for language statistics.")

            # Read the language code (length-prefixed string).
            language_code_length_bytes = file.read(1)[0]
            self.language_code = file.read(language_code_length_bytes).decode('utf-8')

            # Read the gram length (32-bit signed integer, little-endian).
            gram_length = struct.unpack('<i', file.read(4))[0]
            if gram_length != array_dimensions:
                raise Exception("Gram length of statistics file differs from required dimensions of frequency array.")

            # Read the alphabet (length-prefixed string).
            # NOTE(review): the prefix counts BYTES, not decoded characters;
            # for non-ASCII alphabets these differ — verify against the writer.
            self.alphabet_length = file.read(1)[0]
            self.alphabet = file.read(self.alphabet_length).decode('utf-8')

            # Read the frequency data (float32 array) and verify completeness.
            frequency_entries = self.alphabet_length ** gram_length
            expected_bytes = frequency_entries * 4  # 4 bytes per float32
            frequency_data = file.read(expected_bytes)
            if len(frequency_data) != expected_bytes:
                # Previously a truncated file surfaced as a confusing numpy
                # error; fail with a clear message instead.
                raise Exception("Language statistics file is truncated: incomplete frequency data.")

            # Reshape into the correct dimensionality (a no-op shape for 1-D)
            # and copy so callers get a writable array detached from the buffer.
            shape = (self.alphabet_length,) * array_dimensions
            return np.frombuffer(frequency_data, dtype=np.float32).reshape(shape).copy()
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
|
|
17
|
+
class Node:
    """
    A single node of a character tree used to store words.

    Attributes:
    - WordEndSymbol (str): Marker meaning "a word ends at this node" (chr(1)).
    - TerminationSymbol (str): Marker meaning "end of a tree branch" (chr(0)).
    - value (str or None): The character (or value) held by this node.
    - word_ends_here (bool): True when a complete word terminates at this node.
    - child_nodes (list): Ordered child Node objects.
    """

    WordEndSymbol = chr(1)       # marks the end of a word during (de)serialization
    TerminationSymbol = chr(0)   # marks the end of a branch during (de)serialization

    def __init__(self, value=None):
        """
        Creates a node holding `value`, with no children and no word ending here.

        Parameters:
        - value (str or None): The character or value stored in this node.
        """
        self.value = value
        self.word_ends_here = False
        self.child_nodes = []

    def __eq__(self, other):
        """
        Structural equality: two nodes match when their values, word-end flags,
        and entire child subtrees (in order) are identical.

        Parameters:
        - other (Node): The node to compare with.

        Returns:
        - bool: True if both nodes (and their subtrees) are equal.
        """
        if not isinstance(other, Node):
            return False
        payload_matches = (self.value == other.value
                           and self.word_ends_here == other.word_ends_here)
        if not payload_matches:
            return False
        if len(self.child_nodes) != len(other.child_nodes):
            return False
        # Children compare recursively, position by position.
        return all(mine == theirs
                   for mine, theirs in zip(self.child_nodes, other.child_nodes))

    def __hash__(self):
        """
        Hash consistent with __eq__, folding value, word-end flag, and all
        child hashes with the classic multiply-by-31 accumulator.

        Returns:
        - int: The combined hash of this node and its subtree.
        """
        combined = hash(self.value)
        combined = combined * 31 + hash(self.word_ends_here)
        for child in self.child_nodes:
            combined = combined * 31 + hash(child)
        return combined
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
import numpy as np
|
|
17
|
+
import os
|
|
18
|
+
from languagestatisticslibpy.Grams import Grams
|
|
19
|
+
from languagestatisticslibpy.GramsType import GramsType
|
|
20
|
+
from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
|
|
21
|
+
|
|
22
|
+
class Pentagrams(Grams):
    """Pentagram (5-gram) language statistics backed by a 5-D frequency array."""

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """
        Initializes the Pentagrams class by calling the parent class (Grams) initializer.

        Parameters:
        - language (str): The language of the pentagram statistics.
        - language_statistics_directory (str): Path to the directory containing language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """
        Loads a gzip-compressed file containing pentagram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 5-D array of pentagram frequencies.
        - self.alphabet (str): The alphabet used in the statistics file.
        - self.max_value (float): The maximum frequency, or -inf if the array is empty.
        """
        file_path = os.path.join(language_statistics_directory, filename)
        language_statistics_file = LanguageStatisticsFile(file_path)
        self.frequencies = language_statistics_file.load_frequencies(5)
        self.alphabet = language_statistics_file.alphabet
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')

    def calculate_cost(self, text):
        """
        Calculates the cost of a given text based on pentagram frequencies.

        Parameters:
        - text: The text to analyze, as a sequence of alphabet indices (ints).

        Returns:
        - float: The average cost of pentagrams in the text. Returns 0.0 if the
          text length is less than 5.

        Notes:
        - Pentagrams containing indices outside the alphabet are skipped.
        - If `add_letter_indices` is set, each index is shifted by its mapped
          offset before the frequency lookup.
        """
        if len(text) < 5:
            return 0.0

        value = 0.0
        alphabet_length = len(self.alphabet)
        end = len(text) - 4  # number of pentagram windows

        for i in range(end):
            a, b, c, d, e = text[i:i + 5]

            if self.add_letter_indices:
                a += self.add_letter_indices.get(a, 0)
                b += self.add_letter_indices.get(b, 0)
                c += self.add_letter_indices.get(c, 0)
                d += self.add_letter_indices.get(d, 0)
                e += self.add_letter_indices.get(e, 0)

            if 0 <= a < alphabet_length and 0 <= b < alphabet_length and \
                    0 <= c < alphabet_length and 0 <= d < alphabet_length and \
                    0 <= e < alphabet_length:
                value += self.frequencies[a, b, c, d, e]

        return value / end

    def gram_size(self):
        """
        Returns the size of the grams being analyzed (pentagrams in this case).

        Returns:
        - int: The size of the grams (always 5 for pentagrams).
        """
        return 5

    def grams_type(self):
        """
        Returns the type of grams being analyzed.

        Returns:
        - GramsType: An enum value representing the type of grams (GramsType.Pentagrams).
        """
        return GramsType.Pentagrams

    def normalize(self, max_value):
        """
        Normalizes the pentagram frequencies based on the provided maximum value.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Replaces every frequency f with (old_max * max_value) / f.
        - Updates `self.max_value` to the new maximum after normalization.
        """
        super().normalize(max_value)
        adjust_value = self.max_value * max_value
        # Defect fixed: this was five nested Python loops dividing one cell at
        # a time — O(|alphabet|^5) interpreter iterations. A single vectorized
        # element-wise division produces the identical result: the division is
        # performed in float64 and then stored back into the float32 array,
        # matching the per-element semantics of the original loop.
        self.frequencies[...] = adjust_value / self.frequencies.astype(np.float64)
        self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
|