grzegorz 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
grzegorz/__main__.py CHANGED
@@ -13,13 +13,8 @@
13
13
  # You should have received a copy of the GNU General Public License along with
14
14
  # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
15
 
16
- from grzegorz.fetcher import fetchipa
17
- from grzegorz.generator import (MinPairGenerator)
18
- from grzegorz.word import (Word)
19
- from grzegorz.anki_integration import makedeck
20
- from grzegorz.wordlist import (wordlist, print_languages_list)
16
+ from grzegorz.subcommands import *
21
17
 
22
- from os import remove
23
18
  import argparse
24
19
 
25
20
  # Why does it have to be this complicated?
@@ -55,9 +50,9 @@ def create_argparser() -> argparse.ArgumentParser:
55
50
  help='Build an Anki deck for a language automatically')
56
51
  parser_fullmake.add_argument('language',
57
52
  type=str)
58
- parser_fullmake.add_argument('numwords',
59
- type=int,
60
- help='number of words to sample')
53
+ parser_fullmake.add_argument('bounds',
54
+ type=str,
55
+ help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
61
56
  parser_fullmake.add_argument('--clean',
62
57
  dest='clean',
63
58
  action='store_true',
@@ -70,9 +65,9 @@ def create_argparser() -> argparse.ArgumentParser:
70
65
  parser_wordlist.add_argument('language',
71
66
  type=str,
72
67
  help='language of the wordlist')
73
- parser_wordlist.add_argument('numwords',
74
- type=int,
75
- help='number of words to keep')
68
+ parser_wordlist.add_argument('bounds',
69
+ type=str,
70
+ help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
76
71
  parser_wordlist.add_argument('outfile',
77
72
  type=str,
78
73
  help='path where the wordlist should be stored')
@@ -91,6 +86,11 @@ def create_argparser() -> argparse.ArgumentParser:
91
86
  action='store_true',
92
87
  default=False,
93
88
  help='Save the words for which no IPA was found in the output file (default: don\'t)')
89
+ parser_fetchipa.add_argument('--numproc',
90
+ type=int,
91
+ dest='numproc',
92
+ default=20,
93
+ help='Number of concurrent processes to handle the wordlist; default: 20')
94
94
 
95
95
  # 'generate' subcommand
96
96
  parser_generate = subparsers.add_parser('generate',
@@ -138,39 +138,6 @@ def create_argparser() -> argparse.ArgumentParser:
138
138
 
139
139
  return parser
140
140
 
141
- def fullmake(language: str, numwords: int, clean: bool) -> None:
142
- """
143
- Practically: wrap all commands into one. If `clean` is True, then
144
- temporary files created by this function are removed.
145
- """
146
- optimise = True
147
- keep_phonemes = True
148
- keep_chronemes = True
149
- keep_stress = True
150
-
151
- wordlist_file = language + "-wordlist.txt"
152
- ipa_json = language + "-ipa.json"
153
- minpairs_file = language + "-minpairs.json"
154
- makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
155
-
156
- if wordlist(language, numwords, wordlist_file) == 1:
157
- exit(1)
158
- fetchipa(wordlist_file, ipa_json, False)
159
- g = MinPairGenerator(
160
- optimise,
161
- keep_phonemes,
162
- keep_chronemes,
163
- keep_stress,
164
- )
165
- g.generate(ipa_json, minpairs_file)
166
- makedeck(minpairs_file, makedeck_file)
167
-
168
- if clean:
169
- print("Removing temporary files...")
170
- remove(wordlist_file)
171
- remove(ipa_json)
172
- remove(minpairs_file)
173
-
174
141
  def main() -> None:
175
142
  parser = create_argparser()
176
143
  args = parser.parse_args()
@@ -180,17 +147,14 @@ def main() -> None:
180
147
  match cmd:
181
148
  case 'fullmake':
182
149
  clean = args.clean
183
- numwords = args.numwords
150
+ bounds = args.bounds
184
151
  language = args.language.lower()
185
- fullmake(language, numwords, clean)
152
+ fullmake(language, bounds, clean)
186
153
  case 'wordlist':
187
- outfile = args.outfile
188
- numwords = args.numwords
189
- language = args.language.lower()
190
- status = wordlist(language, numwords, outfile)
154
+ status = wordlist_command(args.language.lower(), args.bounds, args.outfile)
191
155
  exit(status)
192
156
  case 'fetchipa':
193
- fetchipa(args.infile, args.outfile, args.keep_failed)
157
+ fetchipa(args.infile, args.outfile, args.keep_failed, args.numproc)
194
158
  case 'generate':
195
159
  infile = args.infile
196
160
  outfile = args.outfile
@@ -199,29 +163,16 @@ def main() -> None:
199
163
  no_chronemes = args.no_chronemes;
200
164
  no_stress = args.no_stress;
201
165
  filter_file_path = args.path
202
- g = MinPairGenerator(
203
- not nooptimise,
204
- not no_phonemes,
205
- not no_chronemes,
206
- not no_stress
207
- )
208
- if filter_file_path is not None:
209
- g.set_filter_pairs_from_file(filter_file_path)
210
- g.generate(infile, outfile)
166
+ generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
167
+ no_stress, filter_file_path)
211
168
  case 'makedeck':
212
- infile = args.infile
213
- outfile = args.outfile
214
- makedeck(infile, outfile)
169
+ makedeck(args.infile, args.outfile)
215
170
  case 'analyse':
216
- Word("", args.ipa).print_human_readable()
171
+ print_analysis(args.ipa)
217
172
  case 'check':
218
- word1 = Word("", args.ipa_first)
219
- word2 = Word("", args.ipa_second)
220
- generator = MinPairGenerator(False, True, True, True)
221
- if not generator.print_human_readable_check(word1, word2):
222
- exit(1)
173
+ print_minpair_check(args.ipa_first, args.ipa_second)
223
174
  case 'list-languages':
224
- print_languages_list()
175
+ list_languages()
225
176
  case _:
226
177
  parser.print_help()
227
178
 
@@ -13,11 +13,10 @@
13
13
  # You should have received a copy of the GNU General Public License along with
14
14
  # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
15
 
16
- from grzegorz.word import MinPair, readfile
16
+ from grzegorz.word import WordPair
17
17
 
18
18
  import genanki
19
19
  from genanki import Note, Deck
20
- import json
21
20
 
22
21
  """The model used for the flashcards is rather simple"""
23
22
  grzegorz_minpair_model = genanki.Model(
@@ -44,7 +43,7 @@ grzegorz_minpair_model = genanki.Model(
44
43
  <br>
45
44
 
46
45
  <div class="minpair">
47
- <div id="corret-word" class="word">{{Word 1 text}}<br>{{Word 1 IPA}}</div>
46
+ <div id="correct-word" class="word">{{Word 1 text}}<br>{{Word 1 IPA}}</div>
48
47
  <div class="center"><i>or</i></div>
49
48
  <div class="word">{{Word 2 text}}<br>{{Word 2 IPA}}</div>
50
49
  </div>""",
@@ -121,50 +120,41 @@ You heard: <div class="word">{{Word 2 text}}</div>
121
120
  }""",
122
121
  )
123
122
 
124
- def makedeck(infile: str, outfile: str) -> None:
125
- """Create an Anki deck given a file full of minimal pairs"""
126
- json_str = readfile(infile)
127
- dict = json.loads(json_str)
128
- minpairs = list(map(MinPair.from_dict, dict))
129
- notes = list(map(minpair_to_anki_note, minpairs))
130
- deck = notes_to_deck(notes)
131
- export_deck(deck, outfile)
123
+ def minpairs_to_deck(minpairs: list[WordPair]) -> Deck:
124
+ """Turn a list of minimal pairs into an Anki deck"""
125
+ notes = [minpair_to_anki_note(mp) for mp in minpairs]
126
+ return notes_to_deck(notes)
132
127
 
133
- ### HELPER FUNCTIONS ###
128
+ def export_deck(deck: Deck, outfile: str) -> None:
129
+ """Package the given deck and write it to a file"""
130
+ genanki.Package(deck).write_to_file(outfile)
134
131
 
135
- def minpair_to_anki_note(minpair: MinPair) -> Note:
132
+ def minpair_to_anki_note(minpair: WordPair) -> Note:
136
133
  """
137
- Given a minimal pair, create an Anki note from it, with grzegorz_minpair_model
138
- as its model.
134
+ Given a minimal pair, create an Anki note from it, with `grzegorz_minpair_model`
135
+ as its template.
139
136
  """
140
- first = minpair.first
141
- last = minpair.last
142
137
  note = genanki.Note(
143
138
  model=grzegorz_minpair_model,
144
139
  fields=[
145
- first.text,
140
+ minpair[0].text,
146
141
  '',
147
- first.ipa,
148
- last.text,
142
+ minpair[0].ipa,
143
+ minpair[1].text,
149
144
  '',
150
- last.ipa,
145
+ minpair[1].ipa,
151
146
  ]
152
147
  )
153
148
  return note
154
149
 
155
150
  def notes_to_deck(notes: list[Note]) -> Deck:
156
151
  """
157
- Add a list of notes into a deck called "grzegorz's minimal pairs"
152
+ Put the `Note`s into a `Deck` called "grzegorz's minimal pairs"
158
153
  """
159
154
  deck = genanki.Deck(
160
- 1597757363,
155
+ 1597757363, # deck ID, randomly generated but hardcoded
161
156
  "grzegorz's minimal pairs",
162
157
  )
163
158
  for note in notes:
164
159
  deck.add_note(note)
165
160
  return deck
166
-
167
- def export_deck(deck: Deck, outfile: str) -> None:
168
- """Package the given deck and write it to a file"""
169
- genanki.Package(deck).write_to_file(outfile)
170
- print('Done! Now import', outfile, 'in your Anki')
grzegorz/fetcher.py CHANGED
@@ -13,67 +13,37 @@
13
13
  # You should have received a copy of the GNU General Public License along with
14
14
  # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
15
 
16
- from grzegorz.word import Word, readfile, writefile
16
+ from grzegorz.word import Word
17
17
 
18
- from wiktionaryparser import WiktionaryParser
19
- from multiprocessing import Pool, cpu_count
20
- from functools import partial
21
- from tqdm import tqdm
22
- import json
18
+ import requests
19
+ from bs4 import BeautifulSoup
23
20
  import re
24
21
 
25
- def fetchipa(infile: str, outfile: str, keep_failed: bool) -> None:
26
- """
27
- Given an input file containing a list of words separated, fetch the IPAs and
28
- create a JSON file with their IPA spellings matched to their text
29
- """
30
-
31
- # For speed reasons, we use parallelism
32
- numproc = 10 * cpu_count()
33
-
34
- contents = readfile(infile).splitlines()
35
- language = contents.pop(0)
36
- words = [line for line in contents if line]
37
- wds = []
38
- numwords = len(words)
39
-
40
- print("Fetching IPA spellings for", numwords, language, "words...")
41
- if numwords > 500:
42
- print("If you cancel, all progress will be lost!")
43
- with Pool(numproc) as p:
44
- for x in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
45
- words), total=numwords):
46
- wds.append(x)
47
-
48
- # Don't keep entries with no IPA pronunciation
49
- if not keep_failed:
50
- wds = [w for w in wds if w.ipa]
51
-
52
- jsonlog = json.dumps([Word.obj_dict(word) for word in wds])
53
- writefile(outfile, jsonlog)
54
-
55
22
  ### HELPER FUNCTIONS ###
56
23
 
57
24
  def get_ipa_for_word(word: str, language: str) -> Word:
58
25
  """
59
- Look on the English Wiktionary for the IPA of the given word
26
+ Look for the IPA transliteration of the given word in the specified language
27
+ and return a `Word` binding it to the letters. If no transcription was
28
+ found, then the `ipa` field of the result is empty.
60
29
  """
61
- parser = WiktionaryParser()
62
- parser.set_default_language(language)
30
+ language = language.capitalize()
31
+ url = f"https://en.wiktionary.org/wiki/{word}"
32
+ webpage = requests.get(url)
33
+ soup= BeautifulSoup(webpage.text, "html.parser")
34
+ pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
35
+
63
36
  ipa = ""
64
- # If we get no result, skip.
65
- try:
66
- ipa = first_ipa_pronunciation(parser.fetch(word)[0]['pronunciations']['text'][0])
67
- # Not all words have their IPAs on wiktionary, but they might have a
68
- # "Rhymes" section (try German wordlists). If we did fetch a rhyme,
69
- # don't add it as a valid IPA
70
- if ipa[0] == '-':
71
- ipa = ""
72
- except (IndexError, AttributeError, KeyError) as _:
73
- pass
37
+ # maybe blindly choosing the first IPA transliteration is not the wisest
38
+ # choice in the world?
39
+ if len(pronunciations):
40
+ first_entry = pronunciations[0].find("span", {"class": "IPA"})
41
+ if first_entry is not None:
42
+ ipa = first_entry.text
74
43
 
75
44
  return Word(word, ipa)
76
45
 
77
46
  def first_ipa_pronunciation(ipa_str: str) -> str:
78
47
  """Find the first IPA spelling in the given string"""
79
- return re.findall(r"[/\[].*?[/\]]", ipa_str)[0]
48
+ result = re.findall(r"[/\[].*?[/\]]", ipa_str)
49
+ return result[0] if len(result) else ""
grzegorz/generator.py CHANGED
@@ -13,11 +13,11 @@
13
13
  # You should have received a copy of the GNU General Public License along with
14
14
  # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
15
 
16
- from grzegorz.word import (Word, MinPair, readfile, writefile,
16
+ from grzegorz.word import (Word, WordPair,
17
17
  PHONEME_MINPAIR, CHRONEME_MINPAIR, STRESS_MINPAIR,
18
18
  NOT_MINPAIR)
19
+ from grzegorz.io import readfile
19
20
 
20
- import json
21
21
  from tqdm import tqdm
22
22
  from itertools import chain, combinations
23
23
 
@@ -46,42 +46,31 @@ class MinPairGenerator:
46
46
  lists_of_phonemes.append(line.replace(" ", "").split(","))
47
47
  self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
48
48
 
49
- def generate(self, infile: str, outfile: str) -> None:
49
+ def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
50
50
  """
51
- Given the path to a file containing JSON data about serialised `Word`s, create
52
- a file `outfile` with all the minimal pairs found, in JSON format
51
+ Generate minimal pairs from the given parameters
53
52
  """
54
- jsonstr = readfile(infile)
55
- words = json.loads(jsonstr, object_hook=Word.from_dict)
56
53
  minpairs = []
57
54
 
58
- if not self.keep_phonemes and not self.keep_chronemes and not self.keep_stress:
59
- print("Generator: skipping all contrasts means no minimal pairs will be generated; abort")
60
- return
61
- if not self.keep_phonemes:
62
- print("Generator: phoneme contrasts will be ignored")
63
- if not self.keep_chronemes:
64
- print("Generator: chroneme contrasts will be ignored")
65
- if not self.keep_stress:
66
- print("Generator: syllable stress contrasts will be ignored")
67
-
68
- for i in tqdm(range(0,len(words))):
69
- for j in range(i+1,len(words)):
70
- pair = MinPair(words[i], words[j])
55
+ progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
56
+ for i in range(0, len(words)):
57
+ words_after = range(i+1, len(words))
58
+ for j in words_after:
59
+ pair = (words[i], words[j])
71
60
  if self.check_minpair(pair):
72
61
  minpairs.append(pair)
62
+ progress_bar.update(len(words_after))
63
+ progress_bar.close()
73
64
 
74
- json_out = json.dumps([MinPair.obj_dict(pair) for pair in minpairs])
75
- writefile(outfile, json_out)
76
- print('Done! Generated', len(minpairs), 'minimal pairs')
65
+ return minpairs
77
66
 
78
- def check_minpair(self, pair: MinPair) -> int:
67
+ def check_minpair(self, pair: WordPair) -> int:
79
68
  """
80
- If the given pair si not a minpair, return NOT_MINPAIR; otherwise,
81
- return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR and STRESS_MINPAIR
69
+ If the given pair is not a minpair, return NOT_MINPAIR; otherwise,
70
+ return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR or STRESS_MINPAIR
82
71
  """
83
72
  # Skip empty entries
84
- if not pair.first.phonology or not pair.last.phonology:
73
+ if not pair[0].phonology or not pair[1].phonology:
85
74
  return False
86
75
  # A minimal pair is kept if it has an interesting difference.
87
76
  if self.keep_phonemes and self.check_phoneme_contrast(pair):
@@ -107,7 +96,7 @@ class MinPairGenerator:
107
96
  print("")
108
97
  word2.print_human_readable()
109
98
  print("")
110
- verdict = self.check_minpair(MinPair(word1, word2))
99
+ verdict = self.check_minpair((word1, word2))
111
100
  if verdict == PHONEME_MINPAIR:
112
101
  print("minimal pair based on phoneme difference")
113
102
  elif verdict == CHRONEME_MINPAIR:
@@ -118,9 +107,11 @@ class MinPairGenerator:
118
107
  print("not minimal pair")
119
108
  return verdict
120
109
 
121
- def check_phoneme_contrast(self, pair: MinPair) -> bool:
122
- first = pair.first.phonology
123
- last = pair.last.phonology
110
+ def check_phoneme_contrast(self, pair: WordPair) -> bool:
111
+ """Check if the two Words form a minimal pair based on a phoneme
112
+ difference"""
113
+ first = pair[0].phonology
114
+ last = pair[1].phonology
124
115
 
125
116
  # we have to work with same number of syllables
126
117
  if len(first) != len(last):
@@ -141,9 +132,11 @@ class MinPairGenerator:
141
132
 
142
133
  return (not self.optimise or self.check_optimised_phone_pair(diffs[0][0], diffs[0][1]))
143
134
 
144
- def check_chroneme_contrast(self, pair: MinPair) -> bool:
145
- first = pair.first.phonology
146
- last = pair.last.phonology
135
+ def check_chroneme_contrast(self, pair: WordPair) -> bool:
136
+ """Check if the two `Word`s form a minimal pair based on a sound length
137
+ difference (i.e. a different chroneme)"""
138
+ first = pair[0].phonology
139
+ last = pair[1].phonology
147
140
 
148
141
  # we have to work with same number of syllables
149
142
  if len(first) != len(last):
@@ -166,9 +159,11 @@ class MinPairGenerator:
166
159
 
167
160
  return chroneme_diffs >= 1
168
161
 
169
- def check_stress_contrast(self, pair: MinPair) -> bool:
170
- first = pair.first.phonology
171
- last = pair.last.phonology
162
+ def check_stress_contrast(self, pair: WordPair) -> bool:
163
+ """Check if the two `Word`s form a minimal pair based on different
164
+ placing of syllable stress, all sounds being the same"""
165
+ first = pair[0].phonology
166
+ last = pair[1].phonology
172
167
 
173
168
  # we have to work with same number of syllables
174
169
  if len(first) != len(last):
@@ -186,9 +181,9 @@ class MinPairGenerator:
186
181
 
187
182
  ### Helper functions ###
188
183
 
189
- def differences(A: list, B: list) -> list:
190
- """Given two lists, return pairs of elements that differ at the same index"""
191
- return [(a, b) for a, b in zip(A, B) if a != b]
184
+ def flatten(lst: list[list]) -> set[list]:
185
+ """Return the set of all elements belonging to the sublists of the list"""
186
+ return set(chain(*lst))
192
187
 
193
188
  def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
194
189
  """
@@ -206,16 +201,12 @@ def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
206
201
  pairs = chain.from_iterable(combinations(s, r) for r in range(2, 2+1))
207
202
  return list(pairs)
208
203
 
209
- def flatten(lst: list[list]) -> set[list]:
210
- """Return the set of all elements belonging to the sublists of the list"""
211
- return set(chain(*lst))
212
-
213
204
  def phoneme_lists_to_phoneme_pairs(phoneme_lists: list[list[str]]) -> set[list]:
214
205
  """
215
206
  Given a list of lists of phonemes, return the combined set of all phoneme
216
207
  pairs made from every individual list.
217
208
  """
218
- return flatten(list(map(phoneme_list_to_pairs, phoneme_lists)))
209
+ return flatten([phoneme_list_to_pairs(list) for list in phoneme_lists])
219
210
 
220
211
  ### CONSTANTS ###
221
212
 
grzegorz/io.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright (c) 2022 xylous <xylous.e@gmail.com>
2
+ #
3
+ # This file is part of grzegorz.
4
+ # grzegorz is free software: you can redistribute it and/or modify it under the
5
+ # terms of the GNU General Public License as published by the Free Software
6
+ # Foundation, either version 3 of the License, or (at your option) any later
7
+ # version.
8
+ #
9
+ # grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
10
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License along with
14
+ # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ from grzegorz.word import (Word, WordPair)
17
+
18
+ from typing import Callable, TypeVar
19
+
20
+ T = TypeVar('T')
21
+
22
+ def readfile(path: str) -> str:
23
+ """Return the contents of a file"""
24
+ with open(path, 'r', encoding='utf-8') as f:
25
+ return f.read()
26
+
27
+ def writefile(path: str, text: str) -> None:
28
+ """Write `text` to the given path"""
29
+ with open(path, 'w', encoding='utf-8') as f:
30
+ f.write(text)
31
+
32
+
33
+ # JSON has several disadvantages, alongside being too verbose for our purposes.
34
+ # Running multiple threads, like `fetchipa()` does, would make it tricky to
35
+ # add new data to the file. On the other hand, using plain text and a thread
36
+ # mutex allows us to directly append new lines.
37
+
38
+ GRZEGORZ_WORD_FORMAT_SEPARATOR = ", "
39
+ GRZEGORZ_MINPAIR_FORMAT_SEPARATOR = " -- "
40
+
41
+ def encode_word(word: Word) -> str:
42
+ return word.text + GRZEGORZ_WORD_FORMAT_SEPARATOR + word.ipa
43
+
44
+ def encode_minpair(pair: WordPair) -> str:
45
+ return encode_word(pair[0]) + GRZEGORZ_MINPAIR_FORMAT_SEPARATOR + encode_word(pair[1])
46
+
47
+ def decode_word(s: str) -> Word:
48
+ spl = s.split(GRZEGORZ_WORD_FORMAT_SEPARATOR)
49
+ return Word(spl[0], spl[1])
50
+
51
+ def decode_minpair(s: str) -> WordPair:
52
+ spl = s.split(GRZEGORZ_MINPAIR_FORMAT_SEPARATOR)
53
+ return (decode_word(spl[0]), decode_word(spl[1]))
54
+
55
+ def encode_format(hook: Callable[[T], str], input: list[T]) -> str:
56
+ return "\n".join([hook(elem) for elem in input])
57
+
58
+ def decode_format(hook: Callable[[str], T], input: str) -> list[T]:
59
+ return [hook(line) for line in input.splitlines()]
@@ -0,0 +1,161 @@
1
+ # Copyright (c) 2023 xylous <xylous.e@gmail.com>
2
+ #
3
+ # This file is part of grzegorz.
4
+ # grzegorz is free software: you can redistribute it and/or modify it under the
5
+ # terms of the GNU General Public License as published by the Free Software
6
+ # Foundation, either version 3 of the License, or (at your option) any later
7
+ # version.
8
+ #
9
+ # grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
10
+ # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11
+ # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License along with
14
+ # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
+
16
+ from grzegorz.fetcher import get_ipa_for_word
17
+ from grzegorz.generator import (MinPairGenerator)
18
+ from grzegorz.anki_integration import (minpairs_to_deck, export_deck)
19
+ from grzegorz.wordlist import (wordlist, print_languages_list, valid_lang)
20
+ from grzegorz.word import Word
21
+ from grzegorz.io import *
22
+
23
+ from os import (remove, linesep)
24
+ from multiprocessing import Pool
25
+ from threading import Lock
26
+ from functools import partial
27
+ from tqdm import tqdm
28
+
29
+ def fullmake(language: str, bounds: str, clean: bool) -> None:
30
+ """
31
+ Practically: wrap all commands into one. If `clean` is True, then
32
+ temporary files created by this function are removed.
33
+ """
34
+
35
+ wordlist_file = language + "-wordlist.txt"
36
+ ipa_file = language + "-ipa.txt"
37
+ minpairs_file = language + "-minpairs.txt"
38
+ makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
39
+
40
+ if wordlist_command(language, bounds, wordlist_file) == 1:
41
+ exit(1)
42
+ fetchipa(wordlist_file, ipa_file, False, 20)
43
+ generate_command(ipa_file, minpairs_file, False, False, False, False)
44
+ makedeck(minpairs_file, makedeck_file)
45
+
46
+ if clean:
47
+ print("Removing temporary files...")
48
+ remove(wordlist_file)
49
+ remove(ipa_file)
50
+ remove(minpairs_file)
51
+
52
+ def list_languages() -> None:
53
+ print_languages_list()
54
+
55
+ def print_analysis(ipa: str) -> None:
56
+ Word("", ipa).print_human_readable()
57
+
58
+ def print_minpair_check(ipa1: str, ipa2: str) -> None:
59
+ word1 = Word("", ipa2)
60
+ word2 = Word("", ipa1)
61
+ generator = MinPairGenerator(False, True, True, True)
62
+ if not generator.print_human_readable_check(word1, word2):
63
+ exit(1)
64
+
65
+ def wordlist_command(language: str, bounds: str, outfile: str) -> int:
66
+ """
67
+ Fetch a word list within `bounds` and put it into `outfile` for the given
68
+ language, if it's valid
69
+ If the operation failed, then return 1, otherwise return 0
70
+ """
71
+ spl = bounds.split(":")
72
+ if bounds.isnumeric():
73
+ lowerbound = 0
74
+ upperbound = int(bounds)
75
+ elif spl[0].isnumeric() and spl[1].isnumeric():
76
+ lowerbound = int(spl[0])
77
+ upperbound = int(spl[1])
78
+ else:
79
+ print("Error: can't recognise bounds. Only positive integers are allowed before and after the ':'")
80
+ return 1
81
+
82
+ if lowerbound > upperbound:
83
+ print("Error: lower bound is bigger than upper bound; abort")
84
+ return 1
85
+
86
+ if not valid_lang(language):
87
+ print(language, "Error: that is not a language for which a wordlist can be fetched", sep='')
88
+ return 1
89
+
90
+ raw_words = wordlist(language, upperbound, lowerbound)
91
+ if raw_words:
92
+ writefile(outfile, '\n'.join(raw_words))
93
+ print("Fetched", upperbound - lowerbound, language, "words into", outfile)
94
+ return 0
95
+ else:
96
+ return 1
97
+
98
+ def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
99
+ """
100
+ Given an input file containing a list of words, fetch the IPAs and
101
+ create a text file with their IPA spellings matched to their text
102
+ """
103
+
104
+ # Ensure that we're processing the data with at least one thread
105
+ if numproc < 1:
106
+ numproc = 1
107
+
108
+ wordlist = readfile(infile).splitlines()
109
+
110
+ language = wordlist.pop(0)
111
+ words = [line for line in wordlist if line]
112
+ numwords = len(words)
113
+
114
+ print("NOTE:",
115
+ " Words are appended progressively to the file, so progress won't be lost.",
116
+ " However, you won't be able to read the file while the program is running.",
117
+ sep=linesep)
118
+
119
+ print("Fetching IPA spellings for", numwords, language, "words...")
120
+ with open(outfile, "a", encoding='utf-8') as handle:
121
+ with Pool(numproc) as p:
122
+ for fetched_word in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
123
+ words), total=numwords):
124
+ if keep_failed or fetched_word.ipa != "":
125
+ encoded = encode_word(fetched_word) + "\n"
126
+ with Lock():
127
+ handle.write(encoded)
128
+
129
+ def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
130
+ no_stress, filter_file_path=None) -> None:
131
+ words = decode_format(decode_word, readfile(infile))
132
+ g = MinPairGenerator(
133
+ not nooptimise,
134
+ not no_phonemes,
135
+ not no_chronemes,
136
+ not no_stress
137
+ )
138
+ if filter_file_path is not None:
139
+ g.set_filter_pairs_from_file(filter_file_path)
140
+
141
+ if no_phonemes and not no_chronemes and not no_stress:
142
+ print("Generator: skipping all contrasts means no minimal pairs will be generated; abort")
143
+ return
144
+ if no_phonemes:
145
+ print("Generator: phoneme contrasts will be ignored")
146
+ if no_chronemes:
147
+ print("Generator: chroneme contrasts will be ignored")
148
+ if no_stress:
149
+ print("Generator: syllable stress contrasts will be ignored")
150
+
151
+ print('Generating minimal pairs from:', len(words), 'words')
152
+ minpairs = g.generate(words, False)
153
+ writefile(outfile, encode_format(encode_minpair, minpairs))
154
+ print('Done! Generated', len(minpairs), 'minimal pairs')
155
+
156
+ def makedeck(infile: str, outfile: str) -> None:
157
+ """Create an Anki deck given a file full of minimal pairs"""
158
+ minpairs = decode_format(decode_minpair, readfile(infile))
159
+ deck = minpairs_to_deck(minpairs)
160
+ export_deck(deck, outfile)
161
+ print('Done! Now import', outfile, 'in your Anki')
grzegorz/test.py CHANGED
@@ -49,52 +49,52 @@ class GeneratorTests(unittest.TestCase):
49
49
  def test_phoneme_contrast_r_and_m_not_optimised(self):
50
50
  w1 = Word("", "/barˈbaz/")
51
51
  w2 = Word("", "/bamˈbaz/")
52
- self.assertTrue(g.check_phoneme_contrast(MinPair(w1, w2), False))
52
+ self.assertTrue(g.check_phoneme_contrast((w1, w2)))
53
53
 
54
54
  def test_phoneme_contrast_with_chroneme_difference(self):
55
55
  w1 = Word("", "/barˈbaz/")
56
56
  w2 = Word("", "/bar:ˈbaz/")
57
- self.assertFalse(g.check_phoneme_contrast(MinPair(w1, w2), False))
57
+ self.assertFalse(g.check_phoneme_contrast((w1, w2)))
58
58
 
59
59
  def test_chroneme_contrast(self):
60
60
  w1 = Word("", "/barˈbaz/")
61
61
  w2 = Word("", "/bar:ˈbaz/")
62
- self.assertTrue(g.check_chroneme_contrast(MinPair(w1, w2)))
62
+ self.assertTrue(g.check_chroneme_contrast((w1, w2)))
63
63
 
64
64
  def test_chroneme_contrast_two_diffs(self):
65
65
  w1 = Word("", "/barˈbaz/")
66
66
  w2 = Word("", "/bar:ˈba:z/")
67
- self.assertTrue(g.check_chroneme_contrast(MinPair(w1, w2)))
67
+ self.assertTrue(g.check_chroneme_contrast((w1, w2)))
68
68
 
69
69
  def test_syllable_stress_contrast_two_syllable(self):
70
70
  w1 = Word("", "/barˈbaz/")
71
71
  w2 = Word("", "/bar.baz/")
72
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
72
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
73
73
 
74
74
  def test_syllable_stress_contrast_three_syllables_1(self):
75
75
  w1 = Word("", "/barˈbaz.do/")
76
76
  w2 = Word("", "/bar.baz.do/")
77
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
77
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
78
78
 
79
79
  def test_syllable_stress_contrast_three_syllables_2(self):
80
80
  w1 = Word("", "/barˈbaz.do/")
81
81
  w2 = Word("", "/bar.bazˈdo/")
82
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
82
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
83
83
 
84
84
  def test_syllable_stress_contrast_three_syllables_3(self):
85
85
  w1 = Word("", "/barˈbaz.do/")
86
86
  w2 = Word("", "/barˌbazˈdo/")
87
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
87
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
88
88
 
89
89
  def test_syllable_stress_contrast_three_syllables_4(self):
90
90
  w1 = Word("", "/bar.baz.do/")
91
91
  w2 = Word("", "/barˌbazˈdo/")
92
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
92
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
93
93
 
94
94
  def test_syllable_stress_contrast_four_syllables_1(self):
95
95
  w1 = Word("", "/bar.baz.do.man/")
96
96
  w2 = Word("", "/barˌbazˈdo.man/")
97
- self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
97
+ self.assertTrue(g.check_stress_contrast((w1, w2)))
98
98
 
99
99
  if __name__ == '__main__':
100
100
  unittest.main()
grzegorz/word.py CHANGED
@@ -126,58 +126,10 @@ class Word:
126
126
 
127
127
  return syllables
128
128
 
129
- @staticmethod
130
- def obj_dict(word):
131
- """Return this class as a dictionary"""
132
- dict = word.__dict__
133
- # this might fail since the dictionary is mutated, and the same Word
134
- # might be converted more than one time
135
- try:
136
- # We don't need to know about the sounds of the word; those can be
137
- # computed
138
- dict.pop('phonology')
139
- except KeyError:
140
- pass
141
- return dict
142
-
143
- @staticmethod
144
- def from_dict(dict) -> 'Word':
145
- """Deserialise this class from JSON"""
146
- return Word(dict['text'], dict['ipa'])
147
-
148
- class MinPair:
149
- """Two words in a pair. Voilà c'est tout."""
150
- def __init__(self, first: Word, last: Word) -> None:
151
- self.first = first;
152
- self.last = last;
153
-
154
- @staticmethod
155
- def obj_dict(obj: 'MinPair'):
156
- """Return this class as a dictionary"""
157
- dict = obj.__dict__;
158
- dict['first'] = Word.obj_dict(dict['first']);
159
- dict['last'] = Word.obj_dict(dict['last']);
160
- return dict
161
-
162
- @staticmethod
163
- def from_dict(dict) -> 'MinPair':
164
- """Construct this class from a dictionary"""
165
- word1 = Word.from_dict(dict['first'])
166
- word2 = Word.from_dict(dict['last'])
167
- return MinPair(word1, word2)
129
+ WordPair = tuple[Word, Word]
168
130
 
169
131
  ### Helper functions ###
170
132
 
171
- def readfile(path: str) -> str:
172
- """Return the contents of a file"""
173
- with open(path, 'r', encoding='utf-8') as f:
174
- return f.read()
175
-
176
- def writefile(path: str, text: str) -> None:
177
- """Write `text` to the given path"""
178
- with open(path, 'w', encoding='utf-8') as f:
179
- f.write(text)
180
-
181
133
  def parse_ipa_characters(ipa: str) -> list[str]:
182
134
  """ Given an IPA transliteration, return all the IPA characters in it """
183
135
  # Remove any any forward slashes, square brackets or round parentheses that
grzegorz/wordlist.py CHANGED
@@ -13,11 +13,8 @@
13
13
  # You should have received a copy of the GNU General Public License along with
14
14
  # grzegorz. If not, see <https://www.gnu.org/licenses/>.
15
15
 
16
- from grzegorz.word import writefile
17
-
18
16
  import requests
19
17
 
20
- """List of languages for which word lists can be fetched"""
21
18
  VALID_LANGUAGES = [
22
19
  # Germanic languages
23
20
  ('english', 'en'),
@@ -62,28 +59,32 @@ VALID_LANGUAGES = [
62
59
  ('chinese', 'zh'),
63
60
  ('japanese', 'ja'),
64
61
  ]
62
+ """
63
+ List of languages for which word lists can be fetched, in tuple format, with the
64
+ first element being the language full name and the second element being the
65
+ language code
66
+ """
65
67
 
66
- """This is where all the lists are fetched from"""
67
68
  RESOURCES_REPO_LINK = 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016'
69
+ """All the wordlists are fetched from here"""
70
+
68
71
 
69
- def wordlist(lang, numwords, outfile) -> int:
72
+ def wordlist(lang: str, upperbound: int, lowerbound: int = 0) -> list[str]:
70
73
  """
71
- Fetch a word list of `numwords` and put it into `outfile` for the given
72
- language, if it's valid
73
- If the operation failed, then return 1, otherwise return 0
74
+ Return the most common words that are between index `lowerbound` and
75
+ `upperbound` in the given language. Note that the first element is always
76
+ the language name. If it isn't present, then the language is invalid.
74
77
  """
75
- language = lang_name(lang)
76
78
  if not valid_lang(lang):
77
- print(lang, "? I can't fetch a wordlist for that", sep='')
78
- return 1
79
+ return []
80
+
81
+ language = lang_name(lang)
79
82
  link = wordlist_link_for_lang(lang)
80
- words_kept_slice = slice(0, numwords)
83
+ words_kept_slice = slice(lowerbound, upperbound)
81
84
  raw_words = fetch_contents(link).splitlines()[words_kept_slice]
82
- raw_words = list(map(format_line, raw_words))
85
+ raw_words = [line.split()[0] for line in raw_words]
83
86
  raw_words.insert(0, language)
84
- writefile(outfile, '\n'.join(raw_words))
85
- print("Fetched", numwords, language, "words into", outfile)
86
- return 0
87
+ return raw_words
87
88
 
88
89
  def print_languages_list() -> None:
89
90
  for (lang, code) in sorted(VALID_LANGUAGES, key=lambda pair: pair[1]):
@@ -91,44 +92,40 @@ def print_languages_list() -> None:
91
92
 
92
93
  ### HELPER FUNCTIONS ###
93
94
 
94
- def valid_lang(lang):
95
- """We only accept languages that are on the list"""
95
+ def valid_lang(lang: str) -> bool:
96
+ """Check if `wordlist()` can fetch a wordlist for the given language or
97
+ language code"""
96
98
  for pair in VALID_LANGUAGES:
97
99
  if lang in pair:
98
100
  return True
99
101
  return False
100
102
 
101
- def lang_code(lang):
102
- """Given a language, return its language code"""
103
+ def lang_code(lang: str) -> str:
104
+ """Given a language, return its language code, provided it's in the
105
+ `VALID_LANGUAGES` property"""
103
106
  for pair in VALID_LANGUAGES:
104
107
  if lang in pair:
105
108
  _, code = pair
106
109
  return code
107
110
  return ''
108
111
 
109
- def lang_name(lang):
110
- """Given a language, return its language fullname"""
112
+ def lang_name(lang: str) -> str:
113
+ """Given a language, return its language fullname, provided it's in the
114
+ `VALID_LANGUAGES` property"""
111
115
  for pair in VALID_LANGUAGES:
112
116
  if lang in pair:
113
117
  name, _ = pair
114
118
  return name
115
119
  return ''
116
120
 
117
- def wordlist_link_for_lang(lang):
118
- """Return the link to the wordlist for the given language"""
121
+ def wordlist_link_for_lang(lang: str):
122
+ """Return the link to the wordlist for the given language, provided it is
123
+ valid"""
119
124
  code = lang_code(lang)
120
125
  link = RESOURCES_REPO_LINK + "/" + code + "/" + code + "_50k.txt"
121
126
  return link
122
127
 
123
- def fetch_contents(link):
128
+ def fetch_contents(link: str):
124
129
  """Return the string containing the webpage at `link`"""
125
130
  res = requests.get(link)
126
131
  return res.text
127
-
128
- def format_line(line):
129
- """
130
- The format of the list we fetched is not perfect: we need to keep only the
131
- first word on every line
132
- """
133
- first_word = line.split()[0]
134
- return first_word
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: grzegorz
3
- Version: 0.5.0
3
+ Version: 0.6.1
4
4
  Summary: Minimal pair generator and phonetics tool
5
5
  Home-page: https://github.com/xylous/grzegorz
6
6
  Author: xylous
@@ -16,10 +16,11 @@ Classifier: Topic :: Text Processing :: Linguistic
16
16
  Requires-Python: >=3.10
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: wiktionaryparser
19
+ Requires-Dist: beautifulsoup4
20
20
  Requires-Dist: tqdm
21
21
  Requires-Dist: requests
22
22
  Requires-Dist: genanki
23
+ Dynamic: license-file
23
24
 
24
25
  # grzegorz
25
26
 
@@ -0,0 +1,16 @@
1
+ grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ grzegorz/__main__.py,sha256=jYpB9FB0uuJwrpxBpU-d13vIw1vhNg2kYX4yC_-UWrg,7133
3
+ grzegorz/anki_integration.py,sha256=eMFdFNd0NsqLxX23NtlEzinhGMCecEFyoklfFkMqQOk,3933
4
+ grzegorz/fetcher.py,sha256=oAZRDZVqH93HgLFFffJ-dl6Qc83aD43ZuNVK9boy7F0,1902
5
+ grzegorz/generator.py,sha256=oCz9TKg9wPN3VIGGa2H8L2Ex4Uf2_gX_XFrlxiB4RSw,9320
6
+ grzegorz/io.py,sha256=JM2pOKgECmnVxCZplgRt1gEiyYWXUn_Z6OanmGSaab0,2221
7
+ grzegorz/subcommands.py,sha256=QQQX1LraTi9Lfo28N1s4G1j-j_z4HtiUsAYsVNyt5FI,6101
8
+ grzegorz/test.py,sha256=znHJFiV0Q1qP0kJYtoweMTNqJH1eX9ZHWFZedOJIuGo,3866
9
+ grzegorz/word.py,sha256=bXNTq_sjrn7CTOWBGkKdQXky_j0c-OzxhhgJWDh0BR0,7899
10
+ grzegorz/wordlist.py,sha256=SqKkZoyY8Ol1vp8Rt0PeNWtxL8ND3qE_yWGl5yiKZ_M,4058
11
+ grzegorz-0.6.1.dist-info/licenses/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
12
+ grzegorz-0.6.1.dist-info/METADATA,sha256=sZkAm0W5qIsh8fcWOpiy-cV6L64p5RsTWm06VYKbf5U,3980
13
+ grzegorz-0.6.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
14
+ grzegorz-0.6.1.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
15
+ grzegorz-0.6.1.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
16
+ grzegorz-0.6.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.40.0)
2
+ Generator: setuptools (80.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,14 +0,0 @@
1
- grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- grzegorz/__main__.py,sha256=7duY2W-u9q1gbiZrzsq8Ev_kEScv_mqm1VcRQlhzv48,8531
3
- grzegorz/anki_integration.py,sha256=W1Y9Uomx3W87FfLUTYX4s97ehI6gzbbfxrXZlEGnHRY,4156
4
- grzegorz/fetcher.py,sha256=kpZ0dZjxLo31IUKHYkv0M2QUGChm0DjkL_CAdiILv0w,2823
5
- grzegorz/generator.py,sha256=DS2U36YtFWxDLvgva7rrr_O-H59wf_mzuDxtYE75s9Y,9808
6
- grzegorz/test.py,sha256=8lwwtimbmFEH7vJnRKerK0tvZy1ozwVWO_HglASkXe0,3950
7
- grzegorz/word.py,sha256=wivk5Zs37Xx2rertpNz-Ui67Wrh6JLPMbNTQJDnZNzQ,9448
8
- grzegorz/wordlist.py,sha256=ALpDy15uUhO74bVFInJ7VJfgJdTDeCijTgwwGgSHdUs,4024
9
- grzegorz-0.5.0.dist-info/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
10
- grzegorz-0.5.0.dist-info/METADATA,sha256=lhGR2w4N3D42rt5TUWk5ksYx04Jkzk_XmZMKXaaaEFE,3960
11
- grzegorz-0.5.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
12
- grzegorz-0.5.0.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
13
- grzegorz-0.5.0.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
14
- grzegorz-0.5.0.dist-info/RECORD,,