grzegorz 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- grzegorz/__main__.py +22 -71
- grzegorz/anki_integration.py +18 -28
- grzegorz/fetcher.py +20 -50
- grzegorz/generator.py +36 -45
- grzegorz/io.py +59 -0
- grzegorz/subcommands.py +161 -0
- grzegorz/test.py +10 -10
- grzegorz/word.py +1 -49
- grzegorz/wordlist.py +30 -33
- {grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/METADATA +4 -3
- grzegorz-0.6.1.dist-info/RECORD +16 -0
- {grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/WHEEL +1 -1
- grzegorz-0.5.0.dist-info/RECORD +0 -14
- {grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/entry_points.txt +0 -0
- {grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info/licenses}/LICENSE +0 -0
- {grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/top_level.txt +0 -0
grzegorz/__main__.py
CHANGED
|
@@ -13,13 +13,8 @@
|
|
|
13
13
|
# You should have received a copy of the GNU General Public License along with
|
|
14
14
|
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
15
|
|
|
16
|
-
from grzegorz.
|
|
17
|
-
from grzegorz.generator import (MinPairGenerator)
|
|
18
|
-
from grzegorz.word import (Word)
|
|
19
|
-
from grzegorz.anki_integration import makedeck
|
|
20
|
-
from grzegorz.wordlist import (wordlist, print_languages_list)
|
|
16
|
+
from grzegorz.subcommands import *
|
|
21
17
|
|
|
22
|
-
from os import remove
|
|
23
18
|
import argparse
|
|
24
19
|
|
|
25
20
|
# Why does it have to be this complicated?
|
|
@@ -55,9 +50,9 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
55
50
|
help='Build an Anki deck for a language automatically')
|
|
56
51
|
parser_fullmake.add_argument('language',
|
|
57
52
|
type=str)
|
|
58
|
-
parser_fullmake.add_argument('
|
|
59
|
-
type=
|
|
60
|
-
help='number of words to
|
|
53
|
+
parser_fullmake.add_argument('bounds',
|
|
54
|
+
type=str,
|
|
55
|
+
help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
61
56
|
parser_fullmake.add_argument('--clean',
|
|
62
57
|
dest='clean',
|
|
63
58
|
action='store_true',
|
|
@@ -70,9 +65,9 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
70
65
|
parser_wordlist.add_argument('language',
|
|
71
66
|
type=str,
|
|
72
67
|
help='language of the wordlist')
|
|
73
|
-
parser_wordlist.add_argument('
|
|
74
|
-
type=
|
|
75
|
-
help='number of words to keep')
|
|
68
|
+
parser_wordlist.add_argument('bounds',
|
|
69
|
+
type=str,
|
|
70
|
+
help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
76
71
|
parser_wordlist.add_argument('outfile',
|
|
77
72
|
type=str,
|
|
78
73
|
help='path where the wordlist should be stored')
|
|
@@ -91,6 +86,11 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
91
86
|
action='store_true',
|
|
92
87
|
default=False,
|
|
93
88
|
help='Save the words for which no IPA was found in the output file (default: don\'t)')
|
|
89
|
+
parser_fetchipa.add_argument('--numproc',
|
|
90
|
+
type=int,
|
|
91
|
+
dest='numproc',
|
|
92
|
+
default=20,
|
|
93
|
+
help='Number of concurrent processes to handle the wordlist; default: 20')
|
|
94
94
|
|
|
95
95
|
# 'generate' subcommand
|
|
96
96
|
parser_generate = subparsers.add_parser('generate',
|
|
@@ -138,39 +138,6 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
138
138
|
|
|
139
139
|
return parser
|
|
140
140
|
|
|
141
|
-
def fullmake(language: str, numwords: int, clean: bool) -> None:
|
|
142
|
-
"""
|
|
143
|
-
Practically: wrap all commands into one. If `clean` is True, then
|
|
144
|
-
temporary files created by this function are removed.
|
|
145
|
-
"""
|
|
146
|
-
optimise = True
|
|
147
|
-
keep_phonemes = True
|
|
148
|
-
keep_chronemes = True
|
|
149
|
-
keep_stress = True
|
|
150
|
-
|
|
151
|
-
wordlist_file = language + "-wordlist.txt"
|
|
152
|
-
ipa_json = language + "-ipa.json"
|
|
153
|
-
minpairs_file = language + "-minpairs.json"
|
|
154
|
-
makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
|
|
155
|
-
|
|
156
|
-
if wordlist(language, numwords, wordlist_file) == 1:
|
|
157
|
-
exit(1)
|
|
158
|
-
fetchipa(wordlist_file, ipa_json, False)
|
|
159
|
-
g = MinPairGenerator(
|
|
160
|
-
optimise,
|
|
161
|
-
keep_phonemes,
|
|
162
|
-
keep_chronemes,
|
|
163
|
-
keep_stress,
|
|
164
|
-
)
|
|
165
|
-
g.generate(ipa_json, minpairs_file)
|
|
166
|
-
makedeck(minpairs_file, makedeck_file)
|
|
167
|
-
|
|
168
|
-
if clean:
|
|
169
|
-
print("Removing temporary files...")
|
|
170
|
-
remove(wordlist_file)
|
|
171
|
-
remove(ipa_json)
|
|
172
|
-
remove(minpairs_file)
|
|
173
|
-
|
|
174
141
|
def main() -> None:
|
|
175
142
|
parser = create_argparser()
|
|
176
143
|
args = parser.parse_args()
|
|
@@ -180,17 +147,14 @@ def main() -> None:
|
|
|
180
147
|
match cmd:
|
|
181
148
|
case 'fullmake':
|
|
182
149
|
clean = args.clean
|
|
183
|
-
|
|
150
|
+
bounds = args.bounds
|
|
184
151
|
language = args.language.lower()
|
|
185
|
-
fullmake(language,
|
|
152
|
+
fullmake(language, bounds, clean)
|
|
186
153
|
case 'wordlist':
|
|
187
|
-
|
|
188
|
-
numwords = args.numwords
|
|
189
|
-
language = args.language.lower()
|
|
190
|
-
status = wordlist(language, numwords, outfile)
|
|
154
|
+
status = wordlist_command(args.language.lower(), args.bounds, args.outfile)
|
|
191
155
|
exit(status)
|
|
192
156
|
case 'fetchipa':
|
|
193
|
-
fetchipa(args.infile, args.outfile, args.keep_failed)
|
|
157
|
+
fetchipa(args.infile, args.outfile, args.keep_failed, args.numproc)
|
|
194
158
|
case 'generate':
|
|
195
159
|
infile = args.infile
|
|
196
160
|
outfile = args.outfile
|
|
@@ -199,29 +163,16 @@ def main() -> None:
|
|
|
199
163
|
no_chronemes = args.no_chronemes;
|
|
200
164
|
no_stress = args.no_stress;
|
|
201
165
|
filter_file_path = args.path
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
not no_phonemes,
|
|
205
|
-
not no_chronemes,
|
|
206
|
-
not no_stress
|
|
207
|
-
)
|
|
208
|
-
if filter_file_path is not None:
|
|
209
|
-
g.set_filter_pairs_from_file(filter_file_path)
|
|
210
|
-
g.generate(infile, outfile)
|
|
166
|
+
generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
|
|
167
|
+
no_stress, filter_file_path)
|
|
211
168
|
case 'makedeck':
|
|
212
|
-
infile
|
|
213
|
-
outfile = args.outfile
|
|
214
|
-
makedeck(infile, outfile)
|
|
169
|
+
makedeck(args.infile, args.outfile)
|
|
215
170
|
case 'analyse':
|
|
216
|
-
|
|
171
|
+
print_analysis(args.ipa)
|
|
217
172
|
case 'check':
|
|
218
|
-
|
|
219
|
-
word2 = Word("", args.ipa_second)
|
|
220
|
-
generator = MinPairGenerator(False, True, True, True)
|
|
221
|
-
if not generator.print_human_readable_check(word1, word2):
|
|
222
|
-
exit(1)
|
|
173
|
+
print_minpair_check(args.ipa_first, args.ipa_second)
|
|
223
174
|
case 'list-languages':
|
|
224
|
-
|
|
175
|
+
list_languages()
|
|
225
176
|
case _:
|
|
226
177
|
parser.print_help()
|
|
227
178
|
|
grzegorz/anki_integration.py
CHANGED
|
@@ -13,11 +13,10 @@
|
|
|
13
13
|
# You should have received a copy of the GNU General Public License along with
|
|
14
14
|
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
15
|
|
|
16
|
-
from grzegorz.word import
|
|
16
|
+
from grzegorz.word import WordPair
|
|
17
17
|
|
|
18
18
|
import genanki
|
|
19
19
|
from genanki import Note, Deck
|
|
20
|
-
import json
|
|
21
20
|
|
|
22
21
|
"""The model used for the flashcards is rather simple"""
|
|
23
22
|
grzegorz_minpair_model = genanki.Model(
|
|
@@ -44,7 +43,7 @@ grzegorz_minpair_model = genanki.Model(
|
|
|
44
43
|
<br>
|
|
45
44
|
|
|
46
45
|
<div class="minpair">
|
|
47
|
-
<div id="
|
|
46
|
+
<div id="correct-word" class="word">{{Word 1 text}}<br>{{Word 1 IPA}}</div>
|
|
48
47
|
<div class="center"><i>or</i></div>
|
|
49
48
|
<div class="word">{{Word 2 text}}<br>{{Word 2 IPA}}</div>
|
|
50
49
|
</div>""",
|
|
@@ -121,50 +120,41 @@ You heard: <div class="word">{{Word 2 text}}</div>
|
|
|
121
120
|
}""",
|
|
122
121
|
)
|
|
123
122
|
|
|
124
|
-
def
|
|
125
|
-
"""
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
minpairs = list(map(MinPair.from_dict, dict))
|
|
129
|
-
notes = list(map(minpair_to_anki_note, minpairs))
|
|
130
|
-
deck = notes_to_deck(notes)
|
|
131
|
-
export_deck(deck, outfile)
|
|
123
|
+
def minpairs_to_deck(minpairs: list[WordPair]) -> Deck:
|
|
124
|
+
"""Turn a list of minimal pairs into an Anki deck"""
|
|
125
|
+
notes = [minpair_to_anki_note(mp) for mp in minpairs]
|
|
126
|
+
return notes_to_deck(notes)
|
|
132
127
|
|
|
133
|
-
|
|
128
|
+
def export_deck(deck: Deck, outfile: str) -> None:
|
|
129
|
+
"""Package the given deck and write it to a file"""
|
|
130
|
+
genanki.Package(deck).write_to_file(outfile)
|
|
134
131
|
|
|
135
|
-
def minpair_to_anki_note(minpair:
|
|
132
|
+
def minpair_to_anki_note(minpair: WordPair) -> Note:
|
|
136
133
|
"""
|
|
137
|
-
Given a minimal pair, create an Anki note from it, with grzegorz_minpair_model
|
|
138
|
-
as its
|
|
134
|
+
Given a minimal pair, create an Anki note from it, with `grzegorz_minpair_model`
|
|
135
|
+
as its template.
|
|
139
136
|
"""
|
|
140
|
-
first = minpair.first
|
|
141
|
-
last = minpair.last
|
|
142
137
|
note = genanki.Note(
|
|
143
138
|
model=grzegorz_minpair_model,
|
|
144
139
|
fields=[
|
|
145
|
-
|
|
140
|
+
minpair[0].text,
|
|
146
141
|
'',
|
|
147
|
-
|
|
148
|
-
|
|
142
|
+
minpair[0].ipa,
|
|
143
|
+
minpair[1].text,
|
|
149
144
|
'',
|
|
150
|
-
|
|
145
|
+
minpair[1].ipa,
|
|
151
146
|
]
|
|
152
147
|
)
|
|
153
148
|
return note
|
|
154
149
|
|
|
155
150
|
def notes_to_deck(notes: list[Note]) -> Deck:
|
|
156
151
|
"""
|
|
157
|
-
|
|
152
|
+
Put the `Note`s into a `Deck` called "grzegorz's minimal pairs"
|
|
158
153
|
"""
|
|
159
154
|
deck = genanki.Deck(
|
|
160
|
-
1597757363,
|
|
155
|
+
1597757363, # deck ID, randomly generated but hardcoded
|
|
161
156
|
"grzegorz's minimal pairs",
|
|
162
157
|
)
|
|
163
158
|
for note in notes:
|
|
164
159
|
deck.add_note(note)
|
|
165
160
|
return deck
|
|
166
|
-
|
|
167
|
-
def export_deck(deck: Deck, outfile: str) -> None:
|
|
168
|
-
"""Package the given deck and write it to a file"""
|
|
169
|
-
genanki.Package(deck).write_to_file(outfile)
|
|
170
|
-
print('Done! Now import', outfile, 'in your Anki')
|
grzegorz/fetcher.py
CHANGED
|
@@ -13,67 +13,37 @@
|
|
|
13
13
|
# You should have received a copy of the GNU General Public License along with
|
|
14
14
|
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
15
|
|
|
16
|
-
from grzegorz.word import Word
|
|
16
|
+
from grzegorz.word import Word
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
from
|
|
20
|
-
from functools import partial
|
|
21
|
-
from tqdm import tqdm
|
|
22
|
-
import json
|
|
18
|
+
import requests
|
|
19
|
+
from bs4 import BeautifulSoup
|
|
23
20
|
import re
|
|
24
21
|
|
|
25
|
-
def fetchipa(infile: str, outfile: str, keep_failed: bool) -> None:
|
|
26
|
-
"""
|
|
27
|
-
Given an input file containing a list of words separated, fetch the IPAs and
|
|
28
|
-
create a JSON file with their IPA spellings matched to their text
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
# For speed reasons, we use parallelism
|
|
32
|
-
numproc = 10 * cpu_count()
|
|
33
|
-
|
|
34
|
-
contents = readfile(infile).splitlines()
|
|
35
|
-
language = contents.pop(0)
|
|
36
|
-
words = [line for line in contents if line]
|
|
37
|
-
wds = []
|
|
38
|
-
numwords = len(words)
|
|
39
|
-
|
|
40
|
-
print("Fetching IPA spellings for", numwords, language, "words...")
|
|
41
|
-
if numwords > 500:
|
|
42
|
-
print("If you cancel, all progress will be lost!")
|
|
43
|
-
with Pool(numproc) as p:
|
|
44
|
-
for x in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
|
|
45
|
-
words), total=numwords):
|
|
46
|
-
wds.append(x)
|
|
47
|
-
|
|
48
|
-
# Don't keep entries with no IPA pronunciation
|
|
49
|
-
if not keep_failed:
|
|
50
|
-
wds = [w for w in wds if w.ipa]
|
|
51
|
-
|
|
52
|
-
jsonlog = json.dumps([Word.obj_dict(word) for word in wds])
|
|
53
|
-
writefile(outfile, jsonlog)
|
|
54
|
-
|
|
55
22
|
### HELPER FUNCTIONS ###
|
|
56
23
|
|
|
57
24
|
def get_ipa_for_word(word: str, language: str) -> Word:
|
|
58
25
|
"""
|
|
59
|
-
Look
|
|
26
|
+
Look for the IPA transliteration of the given word in the specified language
|
|
27
|
+
and return a `Word` binding it to the letters. If no transcription was
|
|
28
|
+
found, then the `ipa` field of the result is empty.
|
|
60
29
|
"""
|
|
61
|
-
|
|
62
|
-
|
|
30
|
+
language = language.capitalize()
|
|
31
|
+
url = f"https://en.wiktionary.org/wiki/{word}"
|
|
32
|
+
webpage = requests.get(url)
|
|
33
|
+
soup= BeautifulSoup(webpage.text, "html.parser")
|
|
34
|
+
pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
|
|
35
|
+
|
|
63
36
|
ipa = ""
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
if ipa[0] == '-':
|
|
71
|
-
ipa = ""
|
|
72
|
-
except (IndexError, AttributeError, KeyError) as _:
|
|
73
|
-
pass
|
|
37
|
+
# maybe blindly choosing the first IPA transliteration is not the wisest
|
|
38
|
+
# choice in the world?
|
|
39
|
+
if len(pronunciations):
|
|
40
|
+
first_entry = pronunciations[0].find("span", {"class": "IPA"})
|
|
41
|
+
if first_entry is not None:
|
|
42
|
+
ipa = first_entry.text
|
|
74
43
|
|
|
75
44
|
return Word(word, ipa)
|
|
76
45
|
|
|
77
46
|
def first_ipa_pronunciation(ipa_str: str) -> str:
|
|
78
47
|
"""Find the first IPA spelling in the given string"""
|
|
79
|
-
|
|
48
|
+
result = re.findall(r"[/\[].*?[/\]]", ipa_str)
|
|
49
|
+
return result[0] if len(result) else ""
|
grzegorz/generator.py
CHANGED
|
@@ -13,11 +13,11 @@
|
|
|
13
13
|
# You should have received a copy of the GNU General Public License along with
|
|
14
14
|
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
15
|
|
|
16
|
-
from grzegorz.word import (Word,
|
|
16
|
+
from grzegorz.word import (Word, WordPair,
|
|
17
17
|
PHONEME_MINPAIR, CHRONEME_MINPAIR, STRESS_MINPAIR,
|
|
18
18
|
NOT_MINPAIR)
|
|
19
|
+
from grzegorz.io import readfile
|
|
19
20
|
|
|
20
|
-
import json
|
|
21
21
|
from tqdm import tqdm
|
|
22
22
|
from itertools import chain, combinations
|
|
23
23
|
|
|
@@ -46,42 +46,31 @@ class MinPairGenerator:
|
|
|
46
46
|
lists_of_phonemes.append(line.replace(" ", "").split(","))
|
|
47
47
|
self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
|
|
48
48
|
|
|
49
|
-
def generate(self,
|
|
49
|
+
def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
|
|
50
50
|
"""
|
|
51
|
-
|
|
52
|
-
a file `outfile` with all the minimal pairs found, in JSON format
|
|
51
|
+
Generate minimal pairs from the given parameters
|
|
53
52
|
"""
|
|
54
|
-
jsonstr = readfile(infile)
|
|
55
|
-
words = json.loads(jsonstr, object_hook=Word.from_dict)
|
|
56
53
|
minpairs = []
|
|
57
54
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
if not self.keep_chronemes:
|
|
64
|
-
print("Generator: chroneme contrasts will be ignored")
|
|
65
|
-
if not self.keep_stress:
|
|
66
|
-
print("Generator: syllable stress contrasts will be ignored")
|
|
67
|
-
|
|
68
|
-
for i in tqdm(range(0,len(words))):
|
|
69
|
-
for j in range(i+1,len(words)):
|
|
70
|
-
pair = MinPair(words[i], words[j])
|
|
55
|
+
progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
|
|
56
|
+
for i in range(0, len(words)):
|
|
57
|
+
words_after = range(i+1, len(words))
|
|
58
|
+
for j in words_after:
|
|
59
|
+
pair = (words[i], words[j])
|
|
71
60
|
if self.check_minpair(pair):
|
|
72
61
|
minpairs.append(pair)
|
|
62
|
+
progress_bar.update(len(words_after))
|
|
63
|
+
progress_bar.close()
|
|
73
64
|
|
|
74
|
-
|
|
75
|
-
writefile(outfile, json_out)
|
|
76
|
-
print('Done! Generated', len(minpairs), 'minimal pairs')
|
|
65
|
+
return minpairs
|
|
77
66
|
|
|
78
|
-
def check_minpair(self, pair:
|
|
67
|
+
def check_minpair(self, pair: WordPair) -> int:
|
|
79
68
|
"""
|
|
80
|
-
If the given pair
|
|
81
|
-
return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR
|
|
69
|
+
If the given pair is not a minpair, return NOT_MINPAIR; otherwise,
|
|
70
|
+
return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR or STRESS_MINPAIR
|
|
82
71
|
"""
|
|
83
72
|
# Skip empty entries
|
|
84
|
-
if not pair.
|
|
73
|
+
if not pair[0].phonology or not pair[1].phonology:
|
|
85
74
|
return False
|
|
86
75
|
# A minimal pair is kept if it has an interesting difference.
|
|
87
76
|
if self.keep_phonemes and self.check_phoneme_contrast(pair):
|
|
@@ -107,7 +96,7 @@ class MinPairGenerator:
|
|
|
107
96
|
print("")
|
|
108
97
|
word2.print_human_readable()
|
|
109
98
|
print("")
|
|
110
|
-
verdict = self.check_minpair(
|
|
99
|
+
verdict = self.check_minpair((word1, word2))
|
|
111
100
|
if verdict == PHONEME_MINPAIR:
|
|
112
101
|
print("minimal pair based on phoneme difference")
|
|
113
102
|
elif verdict == CHRONEME_MINPAIR:
|
|
@@ -118,9 +107,11 @@ class MinPairGenerator:
|
|
|
118
107
|
print("not minimal pair")
|
|
119
108
|
return verdict
|
|
120
109
|
|
|
121
|
-
def check_phoneme_contrast(self, pair:
|
|
122
|
-
|
|
123
|
-
|
|
110
|
+
def check_phoneme_contrast(self, pair: WordPair) -> bool:
|
|
111
|
+
"""Check if the two Words form a minimal pair based on a phoneme
|
|
112
|
+
difference"""
|
|
113
|
+
first = pair[0].phonology
|
|
114
|
+
last = pair[1].phonology
|
|
124
115
|
|
|
125
116
|
# we have to work with same number of syllables
|
|
126
117
|
if len(first) != len(last):
|
|
@@ -141,9 +132,11 @@ class MinPairGenerator:
|
|
|
141
132
|
|
|
142
133
|
return (not self.optimise or self.check_optimised_phone_pair(diffs[0][0], diffs[0][1]))
|
|
143
134
|
|
|
144
|
-
def check_chroneme_contrast(self, pair:
|
|
145
|
-
|
|
146
|
-
|
|
135
|
+
def check_chroneme_contrast(self, pair: WordPair) -> bool:
|
|
136
|
+
"""Check if the two `Word`s form a minimal pair based on a sound length
|
|
137
|
+
difference (i.e. a different chroneme)"""
|
|
138
|
+
first = pair[0].phonology
|
|
139
|
+
last = pair[1].phonology
|
|
147
140
|
|
|
148
141
|
# we have to work with same number of syllables
|
|
149
142
|
if len(first) != len(last):
|
|
@@ -166,9 +159,11 @@ class MinPairGenerator:
|
|
|
166
159
|
|
|
167
160
|
return chroneme_diffs >= 1
|
|
168
161
|
|
|
169
|
-
def check_stress_contrast(self, pair:
|
|
170
|
-
|
|
171
|
-
|
|
162
|
+
def check_stress_contrast(self, pair: WordPair) -> bool:
|
|
163
|
+
"""Check if the two `Word`s form a minimal pair based on different
|
|
164
|
+
placement of syllable stress, all sounds being the same"""
|
|
165
|
+
first = pair[0].phonology
|
|
166
|
+
last = pair[1].phonology
|
|
172
167
|
|
|
173
168
|
# we have to work with same number of syllables
|
|
174
169
|
if len(first) != len(last):
|
|
@@ -186,9 +181,9 @@ class MinPairGenerator:
|
|
|
186
181
|
|
|
187
182
|
### Helper functions ###
|
|
188
183
|
|
|
189
|
-
def
|
|
190
|
-
"""
|
|
191
|
-
return
|
|
184
|
+
def flatten(lst: list[list]) -> set[list]:
|
|
185
|
+
"""Return the set of all elements belonging to the sublists of the list"""
|
|
186
|
+
return set(chain(*lst))
|
|
192
187
|
|
|
193
188
|
def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
|
|
194
189
|
"""
|
|
@@ -206,16 +201,12 @@ def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
|
|
|
206
201
|
pairs = chain.from_iterable(combinations(s, r) for r in range(2, 2+1))
|
|
207
202
|
return list(pairs)
|
|
208
203
|
|
|
209
|
-
def flatten(lst: list[list]) -> set[list]:
|
|
210
|
-
"""Return the set of all elements belonging to the sublists of the list"""
|
|
211
|
-
return set(chain(*lst))
|
|
212
|
-
|
|
213
204
|
def phoneme_lists_to_phoneme_pairs(phoneme_lists: list[list[str]]) -> set[list]:
|
|
214
205
|
"""
|
|
215
206
|
Given a list of lists of phonemes, return the combined set of all phoneme
|
|
216
207
|
pairs made from every individual list.
|
|
217
208
|
"""
|
|
218
|
-
return flatten(list
|
|
209
|
+
return flatten([phoneme_list_to_pairs(list) for list in phoneme_lists])
|
|
219
210
|
|
|
220
211
|
### CONSTANTS ###
|
|
221
212
|
|
grzegorz/io.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright (c) 2022 xylous <xylous.e@gmail.com>
|
|
2
|
+
#
|
|
3
|
+
# This file is part of grzegorz.
|
|
4
|
+
# grzegorz is free software: you can redistribute it and/or modify it under the
|
|
5
|
+
# terms of the GNU General Public License as published by the Free Software
|
|
6
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
|
7
|
+
# version.
|
|
8
|
+
#
|
|
9
|
+
# grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
10
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
11
|
+
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
12
|
+
#
|
|
13
|
+
# You should have received a copy of the GNU General Public License along with
|
|
14
|
+
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
|
+
|
|
16
|
+
from grzegorz.word import (Word, WordPair)
|
|
17
|
+
|
|
18
|
+
from typing import Callable, TypeVar
|
|
19
|
+
|
|
20
|
+
T = TypeVar('T')
|
|
21
|
+
|
|
22
|
+
def readfile(path: str) -> str:
|
|
23
|
+
"""Return the contents of a file"""
|
|
24
|
+
with open(path, 'r', encoding='utf-8') as f:
|
|
25
|
+
return f.read()
|
|
26
|
+
|
|
27
|
+
def writefile(path: str, text: str) -> None:
|
|
28
|
+
"""Write `text` to the given path"""
|
|
29
|
+
with open(path, 'w', encoding='utf-8') as f:
|
|
30
|
+
f.write(text)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# JSON has several disadvantages, alongside being too verbose for our purposes.
|
|
34
|
+
# Running multiple threads, like `fetchipa()` does, would make it tricky to
|
|
35
|
+
# add new data to the file. On the other hand, using plain text and a thread
|
|
36
|
+
# mutex allows us to directly append new lines.
|
|
37
|
+
|
|
38
|
+
GRZEGORZ_WORD_FORMAT_SEPARATOR = ", "
|
|
39
|
+
GRZEGORZ_MINPAIR_FORMAT_SEPARATOR = " -- "
|
|
40
|
+
|
|
41
|
+
def encode_word(word: Word) -> str:
|
|
42
|
+
return word.text + GRZEGORZ_WORD_FORMAT_SEPARATOR + word.ipa
|
|
43
|
+
|
|
44
|
+
def encode_minpair(pair: WordPair) -> str:
|
|
45
|
+
return encode_word(pair[0]) + GRZEGORZ_MINPAIR_FORMAT_SEPARATOR + encode_word(pair[1])
|
|
46
|
+
|
|
47
|
+
def decode_word(s: str) -> Word:
|
|
48
|
+
spl = s.split(GRZEGORZ_WORD_FORMAT_SEPARATOR)
|
|
49
|
+
return Word(spl[0], spl[1])
|
|
50
|
+
|
|
51
|
+
def decode_minpair(s: str) -> WordPair:
|
|
52
|
+
spl = s.split(GRZEGORZ_MINPAIR_FORMAT_SEPARATOR)
|
|
53
|
+
return (decode_word(spl[0]), decode_word(spl[1]))
|
|
54
|
+
|
|
55
|
+
def encode_format(hook: Callable[[T], str], input: list[T]) -> str:
|
|
56
|
+
return "\n".join([hook(elem) for elem in input])
|
|
57
|
+
|
|
58
|
+
def decode_format(hook: Callable[[str], T], input: str) -> list[T]:
|
|
59
|
+
return [hook(line) for line in input.splitlines()]
|
grzegorz/subcommands.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Copyright (c) 2023 xylous <xylous.e@gmail.com>
|
|
2
|
+
#
|
|
3
|
+
# This file is part of grzegorz.
|
|
4
|
+
# grzegorz is free software: you can redistribute it and/or modify it under the
|
|
5
|
+
# terms of the GNU General Public License as published by the Free Software
|
|
6
|
+
# Foundation, either version 3 of the License, or (at your option) any later
|
|
7
|
+
# version.
|
|
8
|
+
#
|
|
9
|
+
# grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
10
|
+
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
11
|
+
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
12
|
+
#
|
|
13
|
+
# You should have received a copy of the GNU General Public License along with
|
|
14
|
+
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
|
+
|
|
16
|
+
from grzegorz.fetcher import get_ipa_for_word
|
|
17
|
+
from grzegorz.generator import (MinPairGenerator)
|
|
18
|
+
from grzegorz.anki_integration import (minpairs_to_deck, export_deck)
|
|
19
|
+
from grzegorz.wordlist import (wordlist, print_languages_list, valid_lang)
|
|
20
|
+
from grzegorz.word import Word
|
|
21
|
+
from grzegorz.io import *
|
|
22
|
+
|
|
23
|
+
from os import (remove, linesep)
|
|
24
|
+
from multiprocessing import Pool
|
|
25
|
+
from threading import Lock
|
|
26
|
+
from functools import partial
|
|
27
|
+
from tqdm import tqdm
|
|
28
|
+
|
|
29
|
+
def fullmake(language: str, bounds: str, clean: bool) -> None:
|
|
30
|
+
"""
|
|
31
|
+
Practically: wrap all commands into one. If `clean` is True, then
|
|
32
|
+
temporary files created by this function are removed.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
wordlist_file = language + "-wordlist.txt"
|
|
36
|
+
ipa_file = language + "-ipa.txt"
|
|
37
|
+
minpairs_file = language + "-minpairs.txt"
|
|
38
|
+
makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
|
|
39
|
+
|
|
40
|
+
if wordlist_command(language, bounds, wordlist_file) == 1:
|
|
41
|
+
exit(1)
|
|
42
|
+
fetchipa(wordlist_file, ipa_file, False, 20)
|
|
43
|
+
generate_command(ipa_file, minpairs_file, False, False, False, False)
|
|
44
|
+
makedeck(minpairs_file, makedeck_file)
|
|
45
|
+
|
|
46
|
+
if clean:
|
|
47
|
+
print("Removing temporary files...")
|
|
48
|
+
remove(wordlist_file)
|
|
49
|
+
remove(ipa_file)
|
|
50
|
+
remove(minpairs_file)
|
|
51
|
+
|
|
52
|
+
def list_languages() -> None:
|
|
53
|
+
print_languages_list()
|
|
54
|
+
|
|
55
|
+
def print_analysis(ipa: str) -> None:
|
|
56
|
+
Word("", ipa).print_human_readable()
|
|
57
|
+
|
|
58
|
+
def print_minpair_check(ipa1: str, ipa2: str) -> None:
|
|
59
|
+
word1 = Word("", ipa2)
|
|
60
|
+
word2 = Word("", ipa1)
|
|
61
|
+
generator = MinPairGenerator(False, True, True, True)
|
|
62
|
+
if not generator.print_human_readable_check(word1, word2):
|
|
63
|
+
exit(1)
|
|
64
|
+
|
|
65
|
+
def wordlist_command(language: str, bounds: str, outfile: str) -> int:
|
|
66
|
+
"""
|
|
67
|
+
Fetch the word list delimited by `bounds` and put it into `outfile` for the given
|
|
68
|
+
language, if it's valid
|
|
69
|
+
If the operation failed, then return 1, otherwise return 0
|
|
70
|
+
"""
|
|
71
|
+
spl = bounds.split(":")
|
|
72
|
+
if bounds.isnumeric():
|
|
73
|
+
lowerbound = 0
|
|
74
|
+
upperbound = int(bounds)
|
|
75
|
+
elif spl[0].isnumeric() and spl[1].isnumeric():
|
|
76
|
+
lowerbound = int(spl[0])
|
|
77
|
+
upperbound = int(spl[1])
|
|
78
|
+
else:
|
|
79
|
+
print("Error: can't recognise bounds. Only positive integers are allowed before and after the ':'")
|
|
80
|
+
return 1
|
|
81
|
+
|
|
82
|
+
if lowerbound > upperbound:
|
|
83
|
+
print("Error: lower bound is bigger than upper bound; abort")
|
|
84
|
+
return 1
|
|
85
|
+
|
|
86
|
+
if not valid_lang(language):
|
|
87
|
+
print(language, "Error: that is not a language for which a wordlist can be fetched", sep='')
|
|
88
|
+
return 1
|
|
89
|
+
|
|
90
|
+
raw_words = wordlist(language, upperbound, lowerbound)
|
|
91
|
+
if raw_words:
|
|
92
|
+
writefile(outfile, '\n'.join(raw_words))
|
|
93
|
+
print("Fetched", upperbound - lowerbound, language, "words into", outfile)
|
|
94
|
+
return 0
|
|
95
|
+
else:
|
|
96
|
+
return 1
|
|
97
|
+
|
|
98
|
+
def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
|
|
99
|
+
"""
|
|
100
|
+
Given an input file containing a newline-separated list of words, fetch the IPAs and
|
|
101
|
+
create a text file with their IPA spellings matched to their text
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
# Ensure that we're processing the data with at least one worker process
|
|
105
|
+
if numproc < 1:
|
|
106
|
+
numproc = 1
|
|
107
|
+
|
|
108
|
+
wordlist = readfile(infile).splitlines()
|
|
109
|
+
|
|
110
|
+
language = wordlist.pop(0)
|
|
111
|
+
words = [line for line in wordlist if line]
|
|
112
|
+
numwords = len(words)
|
|
113
|
+
|
|
114
|
+
print("NOTE:",
|
|
115
|
+
" Words are appended progressively to the file, so progress won't be lost.",
|
|
116
|
+
" However, you won't be able to read the file while the program is running.",
|
|
117
|
+
sep=linesep)
|
|
118
|
+
|
|
119
|
+
print("Fetching IPA spellings for", numwords, language, "words...")
|
|
120
|
+
with open(outfile, "a", encoding='utf-8') as handle:
|
|
121
|
+
with Pool(numproc) as p:
|
|
122
|
+
for fetched_word in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
|
|
123
|
+
words), total=numwords):
|
|
124
|
+
if keep_failed or fetched_word.ipa != "":
|
|
125
|
+
encoded = encode_word(fetched_word) + "\n"
|
|
126
|
+
with Lock():
|
|
127
|
+
handle.write(encoded)
|
|
128
|
+
|
|
129
|
+
def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
|
|
130
|
+
no_stress, filter_file_path=None) -> None:
|
|
131
|
+
words = decode_format(decode_word, readfile(infile))
|
|
132
|
+
g = MinPairGenerator(
|
|
133
|
+
not nooptimise,
|
|
134
|
+
not no_phonemes,
|
|
135
|
+
not no_chronemes,
|
|
136
|
+
not no_stress
|
|
137
|
+
)
|
|
138
|
+
if filter_file_path is not None:
|
|
139
|
+
g.set_filter_pairs_from_file(filter_file_path)
|
|
140
|
+
|
|
141
|
+
if no_phonemes and not no_chronemes and not no_stress:
|
|
142
|
+
print("Generator: skipping all contrasts means no minimal pairs will be generated; abort")
|
|
143
|
+
return
|
|
144
|
+
if no_phonemes:
|
|
145
|
+
print("Generator: phoneme contrasts will be ignored")
|
|
146
|
+
if no_chronemes:
|
|
147
|
+
print("Generator: chroneme contrasts will be ignored")
|
|
148
|
+
if no_stress:
|
|
149
|
+
print("Generator: syllable stress contrasts will be ignored")
|
|
150
|
+
|
|
151
|
+
print('Generating minimal pairs from:', len(words), 'words')
|
|
152
|
+
minpairs = g.generate(words, False)
|
|
153
|
+
writefile(outfile, encode_format(encode_minpair, minpairs))
|
|
154
|
+
print('Done! Generated', len(minpairs), 'minimal pairs')
|
|
155
|
+
|
|
156
|
+
def makedeck(infile: str, outfile: str) -> None:
|
|
157
|
+
"""Create an Anki deck given a file full of minimal pairs"""
|
|
158
|
+
minpairs = decode_format(decode_minpair, readfile(infile))
|
|
159
|
+
deck = minpairs_to_deck(minpairs)
|
|
160
|
+
export_deck(deck, outfile)
|
|
161
|
+
print('Done! Now import', outfile, 'in your Anki')
|
grzegorz/test.py
CHANGED
|
@@ -49,52 +49,52 @@ class GeneratorTests(unittest.TestCase):
|
|
|
49
49
|
def test_phoneme_contrast_r_and_m_not_optimised(self):
|
|
50
50
|
w1 = Word("", "/barˈbaz/")
|
|
51
51
|
w2 = Word("", "/bamˈbaz/")
|
|
52
|
-
self.assertTrue(g.check_phoneme_contrast(
|
|
52
|
+
self.assertTrue(g.check_phoneme_contrast((w1, w2)))
|
|
53
53
|
|
|
54
54
|
def test_phoneme_contrast_with_chroneme_difference(self):
|
|
55
55
|
w1 = Word("", "/barˈbaz/")
|
|
56
56
|
w2 = Word("", "/bar:ˈbaz/")
|
|
57
|
-
self.assertFalse(g.check_phoneme_contrast(
|
|
57
|
+
self.assertFalse(g.check_phoneme_contrast((w1, w2)))
|
|
58
58
|
|
|
59
59
|
def test_chroneme_contrast(self):
|
|
60
60
|
w1 = Word("", "/barˈbaz/")
|
|
61
61
|
w2 = Word("", "/bar:ˈbaz/")
|
|
62
|
-
self.assertTrue(g.check_chroneme_contrast(
|
|
62
|
+
self.assertTrue(g.check_chroneme_contrast((w1, w2)))
|
|
63
63
|
|
|
64
64
|
def test_chroneme_contrast_two_diffs(self):
|
|
65
65
|
w1 = Word("", "/barˈbaz/")
|
|
66
66
|
w2 = Word("", "/bar:ˈba:z/")
|
|
67
|
-
self.assertTrue(g.check_chroneme_contrast(
|
|
67
|
+
self.assertTrue(g.check_chroneme_contrast((w1, w2)))
|
|
68
68
|
|
|
69
69
|
def test_syllable_stress_contrast_two_syllable(self):
|
|
70
70
|
w1 = Word("", "/barˈbaz/")
|
|
71
71
|
w2 = Word("", "/bar.baz/")
|
|
72
|
-
self.assertTrue(g.check_stress_contrast(
|
|
72
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
73
73
|
|
|
74
74
|
def test_syllable_stress_contrast_three_syllables_1(self):
|
|
75
75
|
w1 = Word("", "/barˈbaz.do/")
|
|
76
76
|
w2 = Word("", "/bar.baz.do/")
|
|
77
|
-
self.assertTrue(g.check_stress_contrast(
|
|
77
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
78
78
|
|
|
79
79
|
def test_syllable_stress_contrast_three_syllables_2(self):
|
|
80
80
|
w1 = Word("", "/barˈbaz.do/")
|
|
81
81
|
w2 = Word("", "/bar.bazˈdo/")
|
|
82
|
-
self.assertTrue(g.check_stress_contrast(
|
|
82
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
83
83
|
|
|
84
84
|
def test_syllable_stress_contrast_three_syllables_3(self):
|
|
85
85
|
w1 = Word("", "/barˈbaz.do/")
|
|
86
86
|
w2 = Word("", "/barˌbazˈdo/")
|
|
87
|
-
self.assertTrue(g.check_stress_contrast(
|
|
87
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
88
88
|
|
|
89
89
|
def test_syllable_stress_contrast_three_syllables_4(self):
|
|
90
90
|
w1 = Word("", "/bar.baz.do/")
|
|
91
91
|
w2 = Word("", "/barˌbazˈdo/")
|
|
92
|
-
self.assertTrue(g.check_stress_contrast(
|
|
92
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
93
93
|
|
|
94
94
|
def test_syllable_stress_contrast_four_syllables_1(self):
|
|
95
95
|
w1 = Word("", "/bar.baz.do.man/")
|
|
96
96
|
w2 = Word("", "/barˌbazˈdo.man/")
|
|
97
|
-
self.assertTrue(g.check_stress_contrast(
|
|
97
|
+
self.assertTrue(g.check_stress_contrast((w1, w2)))
|
|
98
98
|
|
|
99
99
|
if __name__ == '__main__':
|
|
100
100
|
unittest.main()
|
grzegorz/word.py
CHANGED
|
@@ -126,58 +126,10 @@ class Word:
|
|
|
126
126
|
|
|
127
127
|
return syllables
|
|
128
128
|
|
|
129
|
-
|
|
130
|
-
def obj_dict(word):
|
|
131
|
-
"""Return this class as a dictionary"""
|
|
132
|
-
dict = word.__dict__
|
|
133
|
-
# this might fail since the dictionary is mutated, and the same Word
|
|
134
|
-
# might be converted more than one time
|
|
135
|
-
try:
|
|
136
|
-
# We don't need to know about the sounds of the word; those can be
|
|
137
|
-
# computed
|
|
138
|
-
dict.pop('phonology')
|
|
139
|
-
except KeyError:
|
|
140
|
-
pass
|
|
141
|
-
return dict
|
|
142
|
-
|
|
143
|
-
@staticmethod
|
|
144
|
-
def from_dict(dict) -> 'Word':
|
|
145
|
-
"""Deserialise this class from JSON"""
|
|
146
|
-
return Word(dict['text'], dict['ipa'])
|
|
147
|
-
|
|
148
|
-
class MinPair:
|
|
149
|
-
"""Two words in a pair. Voilà c'est tout."""
|
|
150
|
-
def __init__(self, first: Word, last: Word) -> None:
|
|
151
|
-
self.first = first;
|
|
152
|
-
self.last = last;
|
|
153
|
-
|
|
154
|
-
@staticmethod
|
|
155
|
-
def obj_dict(obj: 'MinPair'):
|
|
156
|
-
"""Return this class as a dictionary"""
|
|
157
|
-
dict = obj.__dict__;
|
|
158
|
-
dict['first'] = Word.obj_dict(dict['first']);
|
|
159
|
-
dict['last'] = Word.obj_dict(dict['last']);
|
|
160
|
-
return dict
|
|
161
|
-
|
|
162
|
-
@staticmethod
|
|
163
|
-
def from_dict(dict) -> 'MinPair':
|
|
164
|
-
"""Construct this class from a dictionary"""
|
|
165
|
-
word1 = Word.from_dict(dict['first'])
|
|
166
|
-
word2 = Word.from_dict(dict['last'])
|
|
167
|
-
return MinPair(word1, word2)
|
|
129
|
+
WordPair = tuple[Word, Word]
|
|
168
130
|
|
|
169
131
|
### Helper functions ###
|
|
170
132
|
|
|
171
|
-
def readfile(path: str) -> str:
|
|
172
|
-
"""Return the contents of a file"""
|
|
173
|
-
with open(path, 'r', encoding='utf-8') as f:
|
|
174
|
-
return f.read()
|
|
175
|
-
|
|
176
|
-
def writefile(path: str, text: str) -> None:
|
|
177
|
-
"""Write `text` to the given path"""
|
|
178
|
-
with open(path, 'w', encoding='utf-8') as f:
|
|
179
|
-
f.write(text)
|
|
180
|
-
|
|
181
133
|
def parse_ipa_characters(ipa: str) -> list[str]:
|
|
182
134
|
""" Given an IPA transliteration, return all the IPA characters in it """
|
|
183
135
|
# Remove any any forward slashes, square brackets or round parentheses that
|
grzegorz/wordlist.py
CHANGED
|
@@ -13,11 +13,8 @@
|
|
|
13
13
|
# You should have received a copy of the GNU General Public License along with
|
|
14
14
|
# grzegorz. If not, see <https://www.gnu.org/licenses/>.
|
|
15
15
|
|
|
16
|
-
from grzegorz.word import writefile
|
|
17
|
-
|
|
18
16
|
import requests
|
|
19
17
|
|
|
20
|
-
"""List of languages for which word lists can be fetched"""
|
|
21
18
|
VALID_LANGUAGES = [
|
|
22
19
|
# Germanic languages
|
|
23
20
|
('english', 'en'),
|
|
@@ -62,28 +59,32 @@ VALID_LANGUAGES = [
|
|
|
62
59
|
('chinese', 'zh'),
|
|
63
60
|
('japanese', 'ja'),
|
|
64
61
|
]
|
|
62
|
+
"""
|
|
63
|
+
List of languages for which word lists can be fetched, in tuple format, with the
|
|
64
|
+
first element being the language full name and the second element being the
|
|
65
|
+
language code
|
|
66
|
+
"""
|
|
65
67
|
|
|
66
|
-
"""This is where all the lists are fetched from"""
|
|
67
68
|
RESOURCES_REPO_LINK = 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016'
|
|
69
|
+
"""All the wordlists are fetched from here"""
|
|
70
|
+
|
|
68
71
|
|
|
69
|
-
def wordlist(lang,
|
|
72
|
+
def wordlist(lang: str, upperbound: int, lowerbound: int = 0) -> list[str]:
|
|
70
73
|
"""
|
|
71
|
-
|
|
72
|
-
language
|
|
73
|
-
If
|
|
74
|
+
Return the most common words that are between index `lowerbound` and
|
|
75
|
+
`upperbound` in the given language. Note that the first element is always
|
|
76
|
+
the language name. If it isn't present, then the language is invalid.
|
|
74
77
|
"""
|
|
75
|
-
language = lang_name(lang)
|
|
76
78
|
if not valid_lang(lang):
|
|
77
|
-
|
|
78
|
-
|
|
79
|
+
return []
|
|
80
|
+
|
|
81
|
+
language = lang_name(lang)
|
|
79
82
|
link = wordlist_link_for_lang(lang)
|
|
80
|
-
words_kept_slice = slice(
|
|
83
|
+
words_kept_slice = slice(lowerbound, upperbound)
|
|
81
84
|
raw_words = fetch_contents(link).splitlines()[words_kept_slice]
|
|
82
|
-
raw_words =
|
|
85
|
+
raw_words = [line.split()[0] for line in raw_words]
|
|
83
86
|
raw_words.insert(0, language)
|
|
84
|
-
|
|
85
|
-
print("Fetched", numwords, language, "words into", outfile)
|
|
86
|
-
return 0
|
|
87
|
+
return raw_words
|
|
87
88
|
|
|
88
89
|
def print_languages_list() -> None:
|
|
89
90
|
for (lang, code) in sorted(VALID_LANGUAGES, key=lambda pair: pair[1]):
|
|
@@ -91,44 +92,40 @@ def print_languages_list() -> None:
|
|
|
91
92
|
|
|
92
93
|
### HELPER FUNCTIONS ###
|
|
93
94
|
|
|
94
|
-
def valid_lang(lang):
|
|
95
|
-
"""
|
|
95
|
+
def valid_lang(lang: str) -> bool:
|
|
96
|
+
"""Check if `wordlist()` can fetch a wordlist for the given language or
|
|
97
|
+
language code"""
|
|
96
98
|
for pair in VALID_LANGUAGES:
|
|
97
99
|
if lang in pair:
|
|
98
100
|
return True
|
|
99
101
|
return False
|
|
100
102
|
|
|
101
|
-
def lang_code(lang):
|
|
102
|
-
"""Given a language, return its language code
|
|
103
|
+
def lang_code(lang: str) -> str:
|
|
104
|
+
"""Given a language, return its language code, provided it's in the
|
|
105
|
+
`VALID_LANGUAGES` property"""
|
|
103
106
|
for pair in VALID_LANGUAGES:
|
|
104
107
|
if lang in pair:
|
|
105
108
|
_, code = pair
|
|
106
109
|
return code
|
|
107
110
|
return ''
|
|
108
111
|
|
|
109
|
-
def lang_name(lang):
|
|
110
|
-
"""Given a language, return its language fullname
|
|
112
|
+
def lang_name(lang: str) -> str:
|
|
113
|
+
"""Given a language, return its language fullname, provided it's in the
|
|
114
|
+
`VALID_LANGUAGES` property"""
|
|
111
115
|
for pair in VALID_LANGUAGES:
|
|
112
116
|
if lang in pair:
|
|
113
117
|
name, _ = pair
|
|
114
118
|
return name
|
|
115
119
|
return ''
|
|
116
120
|
|
|
117
|
-
def wordlist_link_for_lang(lang):
|
|
118
|
-
"""Return the link to the wordlist for the given language
|
|
121
|
+
def wordlist_link_for_lang(lang: str):
|
|
122
|
+
"""Return the link to the wordlist for the given language, provided it is
|
|
123
|
+
valid"""
|
|
119
124
|
code = lang_code(lang)
|
|
120
125
|
link = RESOURCES_REPO_LINK + "/" + code + "/" + code + "_50k.txt"
|
|
121
126
|
return link
|
|
122
127
|
|
|
123
|
-
def fetch_contents(link):
|
|
128
|
+
def fetch_contents(link: str):
|
|
124
129
|
"""Return the string containing the webpage at `link`"""
|
|
125
130
|
res = requests.get(link)
|
|
126
131
|
return res.text
|
|
127
|
-
|
|
128
|
-
def format_line(line):
|
|
129
|
-
"""
|
|
130
|
-
The format of the list we fetched is not perfect: we need to keep only the
|
|
131
|
-
first word on every line
|
|
132
|
-
"""
|
|
133
|
-
first_word = line.split()[0]
|
|
134
|
-
return first_word
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: grzegorz
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: Minimal pair generator and phonetics tool
|
|
5
5
|
Home-page: https://github.com/xylous/grzegorz
|
|
6
6
|
Author: xylous
|
|
@@ -16,10 +16,11 @@ Classifier: Topic :: Text Processing :: Linguistic
|
|
|
16
16
|
Requires-Python: >=3.10
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
-
Requires-Dist:
|
|
19
|
+
Requires-Dist: beautifulsoup4
|
|
20
20
|
Requires-Dist: tqdm
|
|
21
21
|
Requires-Dist: requests
|
|
22
22
|
Requires-Dist: genanki
|
|
23
|
+
Dynamic: license-file
|
|
23
24
|
|
|
24
25
|
# grzegorz
|
|
25
26
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
grzegorz/__main__.py,sha256=jYpB9FB0uuJwrpxBpU-d13vIw1vhNg2kYX4yC_-UWrg,7133
|
|
3
|
+
grzegorz/anki_integration.py,sha256=eMFdFNd0NsqLxX23NtlEzinhGMCecEFyoklfFkMqQOk,3933
|
|
4
|
+
grzegorz/fetcher.py,sha256=oAZRDZVqH93HgLFFffJ-dl6Qc83aD43ZuNVK9boy7F0,1902
|
|
5
|
+
grzegorz/generator.py,sha256=oCz9TKg9wPN3VIGGa2H8L2Ex4Uf2_gX_XFrlxiB4RSw,9320
|
|
6
|
+
grzegorz/io.py,sha256=JM2pOKgECmnVxCZplgRt1gEiyYWXUn_Z6OanmGSaab0,2221
|
|
7
|
+
grzegorz/subcommands.py,sha256=QQQX1LraTi9Lfo28N1s4G1j-j_z4HtiUsAYsVNyt5FI,6101
|
|
8
|
+
grzegorz/test.py,sha256=znHJFiV0Q1qP0kJYtoweMTNqJH1eX9ZHWFZedOJIuGo,3866
|
|
9
|
+
grzegorz/word.py,sha256=bXNTq_sjrn7CTOWBGkKdQXky_j0c-OzxhhgJWDh0BR0,7899
|
|
10
|
+
grzegorz/wordlist.py,sha256=SqKkZoyY8Ol1vp8Rt0PeNWtxL8ND3qE_yWGl5yiKZ_M,4058
|
|
11
|
+
grzegorz-0.6.1.dist-info/licenses/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
|
|
12
|
+
grzegorz-0.6.1.dist-info/METADATA,sha256=sZkAm0W5qIsh8fcWOpiy-cV6L64p5RsTWm06VYKbf5U,3980
|
|
13
|
+
grzegorz-0.6.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
|
|
14
|
+
grzegorz-0.6.1.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
|
|
15
|
+
grzegorz-0.6.1.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
|
|
16
|
+
grzegorz-0.6.1.dist-info/RECORD,,
|
grzegorz-0.5.0.dist-info/RECORD
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
grzegorz/__main__.py,sha256=7duY2W-u9q1gbiZrzsq8Ev_kEScv_mqm1VcRQlhzv48,8531
|
|
3
|
-
grzegorz/anki_integration.py,sha256=W1Y9Uomx3W87FfLUTYX4s97ehI6gzbbfxrXZlEGnHRY,4156
|
|
4
|
-
grzegorz/fetcher.py,sha256=kpZ0dZjxLo31IUKHYkv0M2QUGChm0DjkL_CAdiILv0w,2823
|
|
5
|
-
grzegorz/generator.py,sha256=DS2U36YtFWxDLvgva7rrr_O-H59wf_mzuDxtYE75s9Y,9808
|
|
6
|
-
grzegorz/test.py,sha256=8lwwtimbmFEH7vJnRKerK0tvZy1ozwVWO_HglASkXe0,3950
|
|
7
|
-
grzegorz/word.py,sha256=wivk5Zs37Xx2rertpNz-Ui67Wrh6JLPMbNTQJDnZNzQ,9448
|
|
8
|
-
grzegorz/wordlist.py,sha256=ALpDy15uUhO74bVFInJ7VJfgJdTDeCijTgwwGgSHdUs,4024
|
|
9
|
-
grzegorz-0.5.0.dist-info/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
|
|
10
|
-
grzegorz-0.5.0.dist-info/METADATA,sha256=lhGR2w4N3D42rt5TUWk5ksYx04Jkzk_XmZMKXaaaEFE,3960
|
|
11
|
-
grzegorz-0.5.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
|
|
12
|
-
grzegorz-0.5.0.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
|
|
13
|
-
grzegorz-0.5.0.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
|
|
14
|
-
grzegorz-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|