LanguageStatisticsLibPy 1.0.3__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- languagestatisticslibpy/LanguageStatisticsFile.py +48 -40
- languagestatisticslibpy/Tetragrams.py +125 -125
- languagestatisticslibpy/Trigrams.py +125 -125
- languagestatisticslibpy/Unigrams.py +110 -110
- languagestatisticslibpy/WordTree.py +161 -161
- languagestatisticslibpy/test1.py +26 -26
- languagestatisticslibpy/test2.py +80 -80
- {languagestatisticslibpy-1.0.3.dist-info → languagestatisticslibpy-1.0.4.dist-info}/METADATA +4 -2
- languagestatisticslibpy-1.0.4.dist-info/RECORD +19 -0
- {languagestatisticslibpy-1.0.3.dist-info → languagestatisticslibpy-1.0.4.dist-info}/WHEEL +1 -1
- languagestatisticslibpy-1.0.3.dist-info/RECORD +0 -19
- {languagestatisticslibpy-1.0.3.dist-info → languagestatisticslibpy-1.0.4.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,110 +1,110 @@
|
|
|
1
|
-
'''
|
|
2
|
-
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
'''
|
|
16
|
-
import numpy as np
|
|
17
|
-
import os
|
|
18
|
-
from languagestatisticslibpy.Grams import Grams
|
|
19
|
-
from languagestatisticslibpy.GramsType import GramsType
|
|
20
|
-
from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
|
|
21
|
-
|
|
22
|
-
class Unigrams(Grams):
|
|
23
|
-
def __init__(self, language, language_statistics_directory, use_spaces=False):
|
|
24
|
-
"""
|
|
25
|
-
Initializes the Unigrams class by calling the parent class (Grams) initializer.
|
|
26
|
-
|
|
27
|
-
Parameters:
|
|
28
|
-
- language (str): The language of the unigram statistics.
|
|
29
|
-
- language_statistics_directory (str): Path to the directory containing language statistics files.
|
|
30
|
-
- use_spaces (bool): Whether to include spaces in the analysis (default: False).
|
|
31
|
-
"""
|
|
32
|
-
super().__init__(language, language_statistics_directory, use_spaces)
|
|
33
|
-
|
|
34
|
-
def load_gz(self, filename, language_statistics_directory):
|
|
35
|
-
"""
|
|
36
|
-
Loads a gzip-compressed file containing unigram frequencies.
|
|
37
|
-
|
|
38
|
-
Parameters:
|
|
39
|
-
- filename (str): The name of the file to load.
|
|
40
|
-
- language_statistics_directory (str): The directory where the statistics file is located.
|
|
41
|
-
|
|
42
|
-
Sets:
|
|
43
|
-
- self.frequencies (np.ndarray): A 1D array of unigram frequencies.
|
|
44
|
-
- self.alphabet (list): The alphabet used in the statistics file.
|
|
45
|
-
- self.max_value (float): The maximum value in the frequencies array, or -∞ if the array is empty.
|
|
46
|
-
"""
|
|
47
|
-
file_path = os.path.join(language_statistics_directory, filename)
|
|
48
|
-
language_statistics_file = LanguageStatisticsFile(file_path)
|
|
49
|
-
self.frequencies = language_statistics_file.load_frequencies(1)
|
|
50
|
-
self.alphabet = language_statistics_file.alphabet
|
|
51
|
-
self.max_value = np.max(self.frequencies) if self.frequencies.size > 0 else float('-inf')
|
|
52
|
-
|
|
53
|
-
def calculate_cost(self, text):
|
|
54
|
-
"""
|
|
55
|
-
Calculates the cost of a given text based on unigram frequencies.
|
|
56
|
-
|
|
57
|
-
Parameters:
|
|
58
|
-
- text (str): The text to analyze.
|
|
59
|
-
|
|
60
|
-
Returns:
|
|
61
|
-
- float: The average cost of unigrams in the text. Returns 0.0 if the text is empty.
|
|
62
|
-
|
|
63
|
-
Notes:
|
|
64
|
-
- Skips characters that are outside the defined alphabet.
|
|
65
|
-
- If `add_letter_indices` is defined, modifies the index of the character before computing the cost.
|
|
66
|
-
"""
|
|
67
|
-
if len(text) == 0:
|
|
68
|
-
return 0.0
|
|
69
|
-
|
|
70
|
-
value = 0.0
|
|
71
|
-
for i in text:
|
|
72
|
-
if self.add_letter_indices:
|
|
73
|
-
i += self.add_letter_indices.get(i, 0)
|
|
74
|
-
if 0 <= i < len(self.alphabet):
|
|
75
|
-
value += self.frequencies[i]
|
|
76
|
-
return value / len(text)
|
|
77
|
-
|
|
78
|
-
def gram_size(self):
|
|
79
|
-
"""
|
|
80
|
-
Returns the size of the grams being analyzed (unigrams in this case).
|
|
81
|
-
|
|
82
|
-
Returns:
|
|
83
|
-
- int: The size of the grams (always 1 for unigrams).
|
|
84
|
-
"""
|
|
85
|
-
return 1
|
|
86
|
-
|
|
87
|
-
def grams_type(self):
|
|
88
|
-
"""
|
|
89
|
-
Returns the type of grams being analyzed.
|
|
90
|
-
|
|
91
|
-
Returns:
|
|
92
|
-
- GramsType: An enum value representing the type of grams (GramsType.Unigrams).
|
|
93
|
-
"""
|
|
94
|
-
return GramsType.Unigrams
|
|
95
|
-
|
|
96
|
-
def normalize(self, max_value):
|
|
97
|
-
"""
|
|
98
|
-
Normalizes the unigram frequencies based on the provided maximum value.
|
|
99
|
-
|
|
100
|
-
Parameters:
|
|
101
|
-
- max_value (float): The maximum value used for normalization.
|
|
102
|
-
|
|
103
|
-
Notes:
|
|
104
|
-
- Adjusts all frequencies proportionally to the new maximum value.
|
|
105
|
-
- Updates `self.max_value` to the new maximum after normalization.
|
|
106
|
-
"""
|
|
107
|
-
super().normalize(max_value)
|
|
108
|
-
adjust_value = self.max_value * max_value
|
|
109
|
-
for a in range(len(self.alphabet)):
|
|
110
|
-
self.frequencies[a] = adjust_value / self.frequencies[a]
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
import numpy as np
|
|
17
|
+
import os
|
|
18
|
+
from languagestatisticslibpy.Grams import Grams
|
|
19
|
+
from languagestatisticslibpy.GramsType import GramsType
|
|
20
|
+
from languagestatisticslibpy.LanguageStatisticsFile import LanguageStatisticsFile
|
|
21
|
+
|
|
22
|
+
class Unigrams(Grams):
    """Unigram (single-letter) statistics for a natural language.

    Wraps a one-dimensional frequency table loaded from a CrypTool
    language-statistics file and scores texts by their average
    unigram frequency value.
    """

    def __init__(self, language, language_statistics_directory, use_spaces=False):
        """Initialize unigram statistics by delegating to the Grams base class.

        Parameters:
        - language (str): The language of the unigram statistics.
        - language_statistics_directory (str): Path to the directory containing
          language statistics files.
        - use_spaces (bool): Whether to include spaces in the analysis
          (default: False).
        """
        super().__init__(language, language_statistics_directory, use_spaces)

    def load_gz(self, filename, language_statistics_directory):
        """Load a gzip-compressed file containing unigram frequencies.

        Parameters:
        - filename (str): The name of the file to load.
        - language_statistics_directory (str): The directory where the
          statistics file is located.

        Sets:
        - self.frequencies (np.ndarray): A 1D array of unigram frequencies.
        - self.alphabet (list): The alphabet used in the statistics file.
        - self.max_value (float): The maximum frequency value, or -inf when
          the loaded array is empty.
        """
        statistics_file = LanguageStatisticsFile(
            os.path.join(language_statistics_directory, filename))
        self.frequencies = statistics_file.load_frequencies(1)
        self.alphabet = statistics_file.alphabet
        if self.frequencies.size > 0:
            self.max_value = np.max(self.frequencies)
        else:
            # Empty table: use -inf so any real value later compares larger.
            self.max_value = float('-inf')

    def calculate_cost(self, text):
        """Calculate the cost of a given text based on unigram frequencies.

        Parameters:
        - text: The text to analyze, as a sequence of alphabet indices
          (presumably produced by map_text_into_number_space — confirm
          against callers).

        Returns:
        - float: The average frequency value of the in-alphabet symbols,
          averaged over the FULL text length. Returns 0.0 for empty input.

        Notes:
        - Indices outside the alphabet contribute nothing to the sum but
          still count toward the divisor.
        - When `add_letter_indices` is set, each index is shifted by its
          mapped offset before the lookup.
        """
        text_length = len(text)
        if text_length == 0:
            return 0.0
        alphabet_size = len(self.alphabet)
        total = 0.0
        for symbol_index in text:
            if self.add_letter_indices:
                symbol_index += self.add_letter_indices.get(symbol_index, 0)
            if 0 <= symbol_index < alphabet_size:
                total += self.frequencies[symbol_index]
        return total / text_length

    def gram_size(self):
        """Return the n-gram size analyzed by this class.

        Returns:
        - int: Always 1 for unigrams.
        """
        return 1

    def grams_type(self):
        """Return the type of grams being analyzed.

        Returns:
        - GramsType: Always GramsType.Unigrams.
        """
        return GramsType.Unigrams

    def normalize(self, max_value):
        """Normalize the unigram frequencies against the given maximum.

        Parameters:
        - max_value (float): The maximum value used for normalization.

        Notes:
        - Delegates to Grams.normalize first (which updates self.max_value),
          then rescales each entry to (self.max_value * max_value) / entry.
        - NOTE(review): the per-entry division INVERTS each frequency rather
          than scaling it proportionally; this mirrors the released
          implementation — confirm the intent upstream before changing.
        """
        super().normalize(max_value)
        scale = self.max_value * max_value
        for position in range(len(self.alphabet)):
            self.frequencies[position] = scale / self.frequencies[position]
|
|
@@ -1,162 +1,162 @@
|
|
|
1
|
-
'''
|
|
2
|
-
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
'''
|
|
16
|
-
from io import BufferedReader
|
|
17
|
-
from collections import deque
|
|
18
|
-
from languagestatisticslibpy.Node import Node
|
|
19
|
-
|
|
20
|
-
class WordTree(Node):
|
|
21
|
-
"""
|
|
22
|
-
Represents a tree data structure for storing words and efficiently querying them.
|
|
23
|
-
|
|
24
|
-
Inherits:
|
|
25
|
-
- Node: The base class for tree nodes, where each node represents a character.
|
|
26
|
-
|
|
27
|
-
Attributes:
|
|
28
|
-
- stored_words (int): The number of words stored in the tree.
|
|
29
|
-
- language_code (str): The language code for the words stored in the tree.
|
|
30
|
-
- alphabet (str): The alphabet used in the stored words.
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
def __init__(self):
|
|
34
|
-
"""
|
|
35
|
-
Initializes an empty WordTree.
|
|
36
|
-
|
|
37
|
-
Initializes:
|
|
38
|
-
- stored_words (int): Set to 0, as no words are initially stored.
|
|
39
|
-
- language_code (str): Empty, to be set during deserialization.
|
|
40
|
-
- alphabet (str): Empty, to be set during deserialization.
|
|
41
|
-
"""
|
|
42
|
-
super().__init__()
|
|
43
|
-
self.stored_words = 0
|
|
44
|
-
self.language_code = ''
|
|
45
|
-
self.alphabet = ''
|
|
46
|
-
|
|
47
|
-
@staticmethod
|
|
48
|
-
def deserialize(reader: BufferedReader):
|
|
49
|
-
"""
|
|
50
|
-
Deserializes a WordTree from a binary file.
|
|
51
|
-
|
|
52
|
-
Parameters:
|
|
53
|
-
- reader (BufferedReader): A binary file reader containing the serialized WordTree.
|
|
54
|
-
|
|
55
|
-
Returns:
|
|
56
|
-
- WordTree: The deserialized WordTree object.
|
|
57
|
-
|
|
58
|
-
Raises:
|
|
59
|
-
- Exception: If the file format is invalid or the magic number does not match.
|
|
60
|
-
|
|
61
|
-
Process:
|
|
62
|
-
1. Reads the file header and validates the magic number.
|
|
63
|
-
2. Reads the language code and alphabet.
|
|
64
|
-
3. Reads the number of stored words.
|
|
65
|
-
4. Constructs the WordTree structure by iterating through the file's serialized data.
|
|
66
|
-
"""
|
|
67
|
-
tree = WordTree()
|
|
68
|
-
|
|
69
|
-
# Load word tree header
|
|
70
|
-
magic_no = reader.read(6).decode('utf-8')
|
|
71
|
-
if magic_no != "CT2DIC":
|
|
72
|
-
raise Exception("File does not start with the expected magic number for word tree.")
|
|
73
|
-
|
|
74
|
-
# Read language code
|
|
75
|
-
tree.language_code = ''
|
|
76
|
-
char = reader.read(1).decode('utf-8')
|
|
77
|
-
while char != '\0':
|
|
78
|
-
tree.language_code += char
|
|
79
|
-
char = reader.read(1).decode('utf-8')
|
|
80
|
-
|
|
81
|
-
# Read alphabet
|
|
82
|
-
tree.alphabet = ''
|
|
83
|
-
char = reader.read(1).decode('utf-8')
|
|
84
|
-
while char != '\0':
|
|
85
|
-
tree.alphabet += char
|
|
86
|
-
char = reader.read(1).decode('utf-8')
|
|
87
|
-
|
|
88
|
-
# Read number of stored words
|
|
89
|
-
tree.stored_words = int.from_bytes(reader.read(4), 'little')
|
|
90
|
-
|
|
91
|
-
# Load word tree data structure
|
|
92
|
-
stack = deque([tree])
|
|
93
|
-
byte = reader.read(1)
|
|
94
|
-
while byte:
|
|
95
|
-
char = byte.decode('utf-8')
|
|
96
|
-
if char == Node.WordEndSymbol:
|
|
97
|
-
stack[-1].word_ends_here = True
|
|
98
|
-
tree.stored_words += 1
|
|
99
|
-
elif char == Node.TerminationSymbol:
|
|
100
|
-
stack.pop()
|
|
101
|
-
else:
|
|
102
|
-
new_node = Node(char)
|
|
103
|
-
stack[-1].child_nodes.append(new_node)
|
|
104
|
-
stack.append(new_node)
|
|
105
|
-
byte = reader.read(1)
|
|
106
|
-
|
|
107
|
-
return tree
|
|
108
|
-
|
|
109
|
-
def contains_word(self, word):
|
|
110
|
-
"""
|
|
111
|
-
Checks whether a given word exists in the WordTree.
|
|
112
|
-
|
|
113
|
-
Parameters:
|
|
114
|
-
- word (str): The word to search for.
|
|
115
|
-
|
|
116
|
-
Returns:
|
|
117
|
-
- bool: True if the word exists in the tree, False otherwise.
|
|
118
|
-
|
|
119
|
-
Process:
|
|
120
|
-
1. Converts the word to uppercase for case-insensitive comparison.
|
|
121
|
-
2. Traverses the tree to find the sequence of characters in the word.
|
|
122
|
-
3. Returns False if any character is missing in the tree structure.
|
|
123
|
-
"""
|
|
124
|
-
word = word.upper()
|
|
125
|
-
current_node = self
|
|
126
|
-
for char in word:
|
|
127
|
-
found_node = None
|
|
128
|
-
for child_node in current_node.child_nodes:
|
|
129
|
-
if child_node.value == char:
|
|
130
|
-
current_node = child_node
|
|
131
|
-
found_node = True
|
|
132
|
-
break
|
|
133
|
-
if not found_node:
|
|
134
|
-
return False
|
|
135
|
-
return True
|
|
136
|
-
|
|
137
|
-
def to_list(self):
|
|
138
|
-
"""
|
|
139
|
-
Converts all words stored in the WordTree into a list.
|
|
140
|
-
|
|
141
|
-
Returns:
|
|
142
|
-
- list: A list of all words stored in the tree.
|
|
143
|
-
|
|
144
|
-
Process:
|
|
145
|
-
1. Traverses the tree using a stack to collect characters.
|
|
146
|
-
2. Adds a word to the list whenever a node marks the end of a word.
|
|
147
|
-
"""
|
|
148
|
-
list_of_words = []
|
|
149
|
-
stack = deque()
|
|
150
|
-
|
|
151
|
-
def add_node_to_list(node, stack):
|
|
152
|
-
stack.append(node.value)
|
|
153
|
-
if node.word_ends_here:
|
|
154
|
-
list_of_words.append(''.join(stack))
|
|
155
|
-
for child_node in node.child_nodes:
|
|
156
|
-
add_node_to_list(child_node, deque(stack))
|
|
157
|
-
stack.pop()
|
|
158
|
-
|
|
159
|
-
for node in self.child_nodes:
|
|
160
|
-
add_node_to_list(node, stack)
|
|
161
|
-
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
'''
|
|
16
|
+
from io import BufferedReader
|
|
17
|
+
from collections import deque
|
|
18
|
+
from languagestatisticslibpy.Node import Node
|
|
19
|
+
|
|
20
|
+
class WordTree(Node):
    """Prefix tree (trie) of words supporting lookup and enumeration.

    Inherits:
    - Node: The base class for tree nodes, where each node represents a
      character.

    Attributes:
    - stored_words (int): The number of words stored in the tree.
    - language_code (str): The language code for the words stored in the tree.
    - alphabet (str): The alphabet used in the stored words.
    """

    def __init__(self):
        """Create an empty WordTree.

        stored_words starts at 0; language_code and alphabet start empty and
        are populated during deserialization.
        """
        super().__init__()
        self.stored_words = 0
        self.language_code = ''
        self.alphabet = ''

    @staticmethod
    def deserialize(reader: BufferedReader):
        """Rebuild a WordTree from its binary serialization.

        Parameters:
        - reader (BufferedReader): A binary file reader positioned at the
          start of a serialized WordTree.

        Returns:
        - WordTree: The deserialized tree.

        Raises:
        - Exception: If the magic number at the start of the file does not
          match "CT2DIC".

        Format (as read below):
        1. 6-byte magic number "CT2DIC".
        2. NUL-terminated language code, then NUL-terminated alphabet.
        3. Little-endian 4-byte stored-word count.
        4. Depth-first tree body: a plain character opens a child node,
           Node.WordEndSymbol flags a complete word, Node.TerminationSymbol
           closes the current node.
        """

        def read_zstring():
            # Collect characters until the NUL terminator.
            # NOTE(review): bytes are decoded one at a time as UTF-8, so a
            # multi-byte character would raise — presumably the header is
            # ASCII-only; confirm against the serializer.
            pieces = []
            ch = reader.read(1).decode('utf-8')
            while ch != '\0':
                pieces.append(ch)
                ch = reader.read(1).decode('utf-8')
            return ''.join(pieces)

        tree = WordTree()

        # Validate the file header's magic number.
        if reader.read(6).decode('utf-8') != "CT2DIC":
            raise Exception("File does not start with the expected magic number for word tree.")

        tree.language_code = read_zstring()
        tree.alphabet = read_zstring()

        # Stored word count from the header (little-endian uint32).
        # NOTE(review): the loop below ALSO increments stored_words per word,
        # so the final value is header-count plus word-count — this looks
        # like a double count; confirm against the writer before relying on it.
        tree.stored_words = int.from_bytes(reader.read(4), 'little')

        # Rebuild the trie: the stack tracks the path from the root to the
        # node currently being filled.
        path = deque([tree])
        byte = reader.read(1)
        while byte:
            symbol = byte.decode('utf-8')
            if symbol == Node.WordEndSymbol:
                path[-1].word_ends_here = True
                tree.stored_words += 1
            elif symbol == Node.TerminationSymbol:
                path.pop()
            else:
                child = Node(symbol)
                path[-1].child_nodes.append(child)
                path.append(child)
            byte = reader.read(1)

        return tree

    def contains_word(self, word):
        """Check whether *word* can be traced through the tree.

        Parameters:
        - word (str): The word to search for; compared case-insensitively
          (upper-cased before lookup).

        Returns:
        - bool: True if every character of the word is found along one path
          from the root, False otherwise.

        NOTE(review): the final node's word_ends_here flag is not checked,
        so prefixes of stored words also return True — confirm this is the
        intended contract before treating it as exact-word membership.
        """
        node = self
        for letter in word.upper():
            for child in node.child_nodes:
                if child.value == letter:
                    node = child
                    break
            else:
                # No child matched this character anywhere under the
                # current node.
                return False
        return True

    def to_list(self):
        """Return every word stored in the tree as a list of strings.

        Returns:
        - list: All words stored in the tree.

        Traverses the trie depth-first, accumulating characters on a stack
        and emitting a word whenever a node is flagged as a word ending.
        """
        collected_words = []

        def collect(node, prefix):
            prefix.append(node.value)
            if node.word_ends_here:
                collected_words.append(''.join(prefix))
            for child in node.child_nodes:
                # Each child gets its own copy of the prefix stack.
                collect(child, deque(prefix))
            prefix.pop()

        for child in self.child_nodes:
            collect(child, deque())

        return collected_words
|
languagestatisticslibpy/test1.py
CHANGED
|
@@ -1,26 +1,26 @@
|
|
|
1
|
-
'''
|
|
2
|
-
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
-
|
|
4
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
you may not use this file except in compliance with the License.
|
|
6
|
-
You may obtain a copy of the License at
|
|
7
|
-
|
|
8
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
|
|
10
|
-
Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
See the License for the specific language governing permissions and
|
|
14
|
-
limitations under the License.
|
|
15
|
-
|
|
16
|
-
Usage: python3 test1.py
|
|
17
|
-
test1.py is a minimal working example (MWE) which just needs the
|
|
18
|
-
package LanguageStatisticsLibPy to be installed.
|
|
19
|
-
'''
|
|
20
|
-
|
|
21
|
-
from languagestatisticslibpy.LanguageStatistics import LanguageStatistics as LS
|
|
22
|
-
|
|
23
|
-
plaintext = LS.map_text_into_number_space("HELLOWORD", LS.alphabets['en'])
|
|
24
|
-
ioc = LS.calculate_ioc(plaintext)
|
|
25
|
-
|
|
26
|
-
print(ioc)
|
|
1
|
+
'''
|
|
2
|
+
Copyright 2024 Nils Kopal, Bernhard Esslinger, CrypTool Team
|
|
3
|
+
|
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
you may not use this file except in compliance with the License.
|
|
6
|
+
You may obtain a copy of the License at
|
|
7
|
+
|
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
|
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
See the License for the specific language governing permissions and
|
|
14
|
+
limitations under the License.
|
|
15
|
+
|
|
16
|
+
Usage: python3 test1.py
|
|
17
|
+
test1.py is a minimal working example (MWE) which just needs the
|
|
18
|
+
package LanguageStatisticsLibPy to be installed.
|
|
19
|
+
'''
|
|
20
|
+
|
|
21
|
+
from languagestatisticslibpy.LanguageStatistics import LanguageStatistics as LS

# Map the sample text into the numeric alphabet space for English, then
# compute and print its index of coincidence (IoC).
# NOTE(review): "HELLOWORD" (missing the second L of "WORLD") is reproduced
# as-is from the released example — presumably intentional sample text.
number_sequence = LS.map_text_into_number_space("HELLOWORD", LS.alphabets['en'])
index_of_coincidence = LS.calculate_ioc(number_sequence)

print(index_of_coincidence)
|