PyPI - grzegorz - Versions diffs - 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl - Mend

grzegorz-0.5.0-py3-none-any.whl → grzegorz-0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

grzegorz/__main__.py +22 -71
grzegorz/anki_integration.py +18 -28
grzegorz/fetcher.py +20 -50
grzegorz/generator.py +36 -45
grzegorz/io.py +59 -0
grzegorz/subcommands.py +161 -0
grzegorz/test.py +10 -10
grzegorz/word.py +1 -49
grzegorz/wordlist.py +30 -33
{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/METADATA +4 -3
grzegorz-0.6.1.dist-info/RECORD +16 -0
{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/WHEEL +1 -1
grzegorz-0.5.0.dist-info/RECORD +0 -14
{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/entry_points.txt +0 -0
{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info/licenses}/LICENSE +0 -0
{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/top_level.txt +0 -0

grzegorz/__main__.py CHANGED Viewed

@@ -13,13 +13,8 @@
 # You should have received a copy of the GNU General Public License along with
 # grzegorz.  If not, see <https://www.gnu.org/licenses/>.
-from grzegorz.fetcher import fetchipa
-from grzegorz.generator import (MinPairGenerator)
-from grzegorz.word import (Word)
-from grzegorz.anki_integration import makedeck
-from grzegorz.wordlist import (wordlist, print_languages_list)
+from grzegorz.subcommands import *
-from os import remove
 import argparse
 # Why does it have to be this complicated?
@@ -55,9 +50,9 @@ def create_argparser() -> argparse.ArgumentParser:
             help='Build an Anki deck for a language automatically')
     parser_fullmake.add_argument('language',
             type=str)
-    parser_fullmake.add_argument('numwords',
-            type=int,
-            help='number of words to sample')
+    parser_fullmake.add_argument('bounds',
+            type=str,
+            help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
     parser_fullmake.add_argument('--clean',
             dest='clean',
             action='store_true',
@@ -70,9 +65,9 @@ def create_argparser() -> argparse.ArgumentParser:
     parser_wordlist.add_argument('language',
             type=str,
             help='language of the wordlist')
-    parser_wordlist.add_argument('numwords',
-            type=int,
-            help='number of words to keep')
+    parser_wordlist.add_argument('bounds',
+            type=str,
+            help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
     parser_wordlist.add_argument('outfile',
             type=str,
             help='path where the wordlist should be stored')
@@ -91,6 +86,11 @@ def create_argparser() -> argparse.ArgumentParser:
             action='store_true',
             default=False,
             help='Save the words for which no IPA was found in the output file (default: don\'t)')
+    parser_fetchipa.add_argument('--numproc',
+            type=int,
+            dest='numproc',
+            default=20,
+            help='Number of concurrent processes to handle the wordlist; default: 20')
     # 'generate' subcommand
     parser_generate = subparsers.add_parser('generate',
@@ -138,39 +138,6 @@ def create_argparser() -> argparse.ArgumentParser:
     return parser
-def fullmake(language: str, numwords: int, clean: bool) -> None:
-    """
-    Practically: wrap all commands into one. If `clean` is True, then
-    temporary files created by this function are removed.
-    """
-    optimise = True
-    keep_phonemes = True
-    keep_chronemes = True
-    keep_stress = True
-    wordlist_file = language + "-wordlist.txt"
-    ipa_json = language + "-ipa.json"
-    minpairs_file = language + "-minpairs.json"
-    makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
-    if wordlist(language, numwords, wordlist_file) == 1:
-        exit(1)
-    fetchipa(wordlist_file, ipa_json, False)
-    g = MinPairGenerator(
-        optimise,
-        keep_phonemes,
-        keep_chronemes,
-        keep_stress,
-    )
-    g.generate(ipa_json, minpairs_file)
-    makedeck(minpairs_file, makedeck_file)
-    if clean:
-        print("Removing temporary files...")
-        remove(wordlist_file)
-        remove(ipa_json)
-        remove(minpairs_file)
 def main() -> None:
     parser = create_argparser()
     args = parser.parse_args()
@@ -180,17 +147,14 @@ def main() -> None:
     match cmd:
         case 'fullmake':
             clean = args.clean
-            numwords = args.numwords
+            bounds = args.bounds
             language = args.language.lower()
-            fullmake(language, numwords, clean)
+            fullmake(language, bounds, clean)
         case 'wordlist':
-            outfile = args.outfile
-            numwords = args.numwords
-            language = args.language.lower()
-            status = wordlist(language, numwords, outfile)
+            status = wordlist_command(args.language.lower(), args.bounds, args.outfile)
             exit(status)
         case 'fetchipa':
-            fetchipa(args.infile, args.outfile, args.keep_failed)
+            fetchipa(args.infile, args.outfile, args.keep_failed, args.numproc)
         case 'generate':
             infile = args.infile
             outfile = args.outfile
@@ -199,29 +163,16 @@ def main() -> None:
             no_chronemes = args.no_chronemes;
             no_stress = args.no_stress;
             filter_file_path = args.path
-            g = MinPairGenerator(
-                not nooptimise,
-                not no_phonemes,
-                not no_chronemes,
-                not no_stress
-            )
-            if filter_file_path is not None:
-                g.set_filter_pairs_from_file(filter_file_path)
-            g.generate(infile, outfile)
+            generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
+                     no_stress, filter_file_path)
         case 'makedeck':
-            infile = args.infile
-            outfile = args.outfile
-            makedeck(infile, outfile)
+            makedeck(args.infile, args.outfile)
         case 'analyse':
-            Word("", args.ipa).print_human_readable()
+            print_analysis(args.ipa)
         case 'check':
-            word1 = Word("", args.ipa_first)
-            word2 = Word("", args.ipa_second)
-            generator = MinPairGenerator(False, True, True, True)
-            if not generator.print_human_readable_check(word1, word2):
-                exit(1)
+            print_minpair_check(args.ipa_first, args.ipa_second)
         case 'list-languages':
-            print_languages_list()
+            list_languages()
         case _:
             parser.print_help()

grzegorz/anki_integration.py CHANGED Viewed

@@ -13,11 +13,10 @@
 # You should have received a copy of the GNU General Public License along with
 # grzegorz.  If not, see <https://www.gnu.org/licenses/>.
-from grzegorz.word import MinPair, readfile
+from grzegorz.word import WordPair
 import genanki
 from genanki import Note, Deck
-import json
 """The model used for the flashcards is rather simple"""
 grzegorz_minpair_model = genanki.Model(
@@ -44,7 +43,7 @@ grzegorz_minpair_model = genanki.Model(
 <br>
 <div class="minpair">
-<div id="corret-word" class="word">{{Word 1 text}}<br>{{Word 1 IPA}}</div>
+<div id="correct-word" class="word">{{Word 1 text}}<br>{{Word 1 IPA}}</div>
 <div class="center"><i>or</i></div>
 <div class="word">{{Word 2 text}}<br>{{Word 2 IPA}}</div>
 </div>""",
@@ -121,50 +120,41 @@ You heard: <div class="word">{{Word 2 text}}</div>
 }""",
 )
-def makedeck(infile: str, outfile: str) -> None:
-    """Create an Anki deck given a file full of minimal pairs"""
-    json_str = readfile(infile)
-    dict = json.loads(json_str)
-    minpairs = list(map(MinPair.from_dict, dict))
-    notes = list(map(minpair_to_anki_note, minpairs))
-    deck = notes_to_deck(notes)
-    export_deck(deck, outfile)
+def minpairs_to_deck(minpairs: list[WordPair]) -> Deck:
+    """Turn a list of minimal pairs into an Anki deck"""
+    notes = [minpair_to_anki_note(mp) for mp in minpairs]
+    return notes_to_deck(notes)
-### HELPER FUNCTIONS ###
+def export_deck(deck: Deck, outfile: str) -> None:
+    """Package the given deck and write it to a file"""
+    genanki.Package(deck).write_to_file(outfile)
-def minpair_to_anki_note(minpair: MinPair) -> Note:
+def minpair_to_anki_note(minpair: WordPair) -> Note:
     """
-    Given a minimal pair, create an Anki note from it, with grzegorz_minpair_model
-    as its model.
+    Given a minimal pair, create an Anki note from it, with `grzegorz_minpair_model`
+    as its template.
     """
-    first = minpair.first
-    last = minpair.last
     note = genanki.Note(
         model=grzegorz_minpair_model,
         fields=[
-            first.text,
+            minpair[0].text,
             '',
-            first.ipa,
-            last.text,
+            minpair[0].ipa,
+            minpair[1].text,
             '',
-            last.ipa,
+            minpair[1].ipa,
         ]
     )
     return note
 def notes_to_deck(notes: list[Note]) -> Deck:
     """
-    Add a list of notes into a deck called "grzegorz's minimal pairs"
+    Put the `Note`s into a `Deck` called "grzegorz's minimal pairs"
     """
     deck = genanki.Deck(
-        1597757363,
+        1597757363, # deck ID, randomly generated but hardcoded
         "grzegorz's minimal pairs",
     )
     for note in notes:
         deck.add_note(note)
     return deck
-def export_deck(deck: Deck, outfile: str) -> None:
-    """Package the given deck and write it to a file"""
-    genanki.Package(deck).write_to_file(outfile)
-    print('Done! Now import', outfile, 'in your Anki')

grzegorz/fetcher.py CHANGED Viewed

@@ -13,67 +13,37 @@
 # You should have received a copy of the GNU General Public License along with
 # grzegorz.  If not, see <https://www.gnu.org/licenses/>.
-from grzegorz.word import Word, readfile, writefile
+from grzegorz.word import Word
-from wiktionaryparser import WiktionaryParser
-from multiprocessing import Pool, cpu_count
-from functools import partial
-from tqdm import tqdm
-import json
+import requests
+from bs4 import BeautifulSoup
 import re
-def fetchipa(infile: str, outfile: str, keep_failed: bool) -> None:
-    """
-    Given an input file containing a list of words separated, fetch the IPAs and
-    create a JSON file with their IPA spellings matched to their text
-    """
-    # For speed reasons, we use parallelism
-    numproc = 10 * cpu_count()
-    contents = readfile(infile).splitlines()
-    language = contents.pop(0)
-    words = [line for line in contents if line]
-    wds = []
-    numwords = len(words)
-    print("Fetching IPA spellings for", numwords, language, "words...")
-    if numwords > 500:
-        print("If you cancel, all progress will be lost!")
-    with Pool(numproc) as p:
-        for x in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
-            words), total=numwords):
-            wds.append(x)
-    # Don't keep entries with no IPA pronunciation
-    if not keep_failed:
-        wds = [w for w in wds if w.ipa]
-    jsonlog = json.dumps([Word.obj_dict(word) for word in wds])
-    writefile(outfile, jsonlog)
 ### HELPER FUNCTIONS ###
 def get_ipa_for_word(word: str, language: str) -> Word:
     """
-    Look on the English Wiktionary for the IPA of the given word
+    Look for the IPA transliteration of the given word in the specified language
+    and return a `Word` binding it to the letters. If no transcription was
+    found, then the `ipa` field of the result is empty.
     """
-    parser = WiktionaryParser()
-    parser.set_default_language(language)
+    language = language.capitalize()
+    url = f"https://en.wiktionary.org/wiki/{word}"
+    webpage = requests.get(url)
+    soup= BeautifulSoup(webpage.text, "html.parser")
+    pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
     ipa = ""
-    # If we get no result, skip.
-    try:
-        ipa = first_ipa_pronunciation(parser.fetch(word)[0]['pronunciations']['text'][0])
-        # Not all words have their IPAs on wiktionary, but they might have a
-        # "Rhymes" section (try German wordlists). If we did fetch a rhyme,
-        # don't add it as a valid IPA
-        if ipa[0] == '-':
-            ipa = ""
-    except (IndexError, AttributeError, KeyError) as _:
-        pass
+    # maybe blindly choosing the first IPA transliteration is not the wisest
+    # choice in the world?
+    if len(pronunciations):
+        first_entry = pronunciations[0].find("span", {"class": "IPA"})
+        if first_entry is not None:
+            ipa = first_entry.text
     return Word(word, ipa)
 def first_ipa_pronunciation(ipa_str: str) -> str:
     """Find the first IPA spelling in the given string"""
-    return re.findall(r"[/\[].*?[/\]]", ipa_str)[0]
+    result = re.findall(r"[/\[].*?[/\]]", ipa_str)
+    return result[0] if len(result) else ""

grzegorz/generator.py CHANGED Viewed

@@ -13,11 +13,11 @@
 # You should have received a copy of the GNU General Public License along with
 # grzegorz.  If not, see <https://www.gnu.org/licenses/>.
-from grzegorz.word import (Word, MinPair, readfile, writefile,
+from grzegorz.word import (Word, WordPair,
                            PHONEME_MINPAIR, CHRONEME_MINPAIR, STRESS_MINPAIR,
                            NOT_MINPAIR)
+from grzegorz.io import readfile
-import json
 from tqdm import tqdm
 from itertools import chain, combinations
@@ -46,42 +46,31 @@ class MinPairGenerator:
                 lists_of_phonemes.append(line.replace(" ", "").split(","))
         self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
-    def generate(self, infile: str, outfile: str) -> None:
+    def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
         """
-        Given the path to a file containing JSON data about serialised `Word`s, create
-        a file `outfile` with all the minimal pairs found, in JSON format
+        Generate minimal pairs from the given parameters
         """
-        jsonstr = readfile(infile)
-        words = json.loads(jsonstr, object_hook=Word.from_dict)
         minpairs = []
-        if not self.keep_phonemes and not self.keep_chronemes and not self.keep_stress:
-            print("Generator: skipping all contrasts means no minimal pairs will be generated; abort")
-            return
-        if not self.keep_phonemes:
-            print("Generator: phoneme contrasts will be ignored")
-        if not self.keep_chronemes:
-            print("Generator: chroneme contrasts will be ignored")
-        if not self.keep_stress:
-            print("Generator: syllable stress contrasts will be ignored")
-        for i in tqdm(range(0,len(words))):
-            for j in range(i+1,len(words)):
-                pair = MinPair(words[i], words[j])
+        progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
+        for i in range(0, len(words)):
+            words_after = range(i+1, len(words))
+            for j in words_after:
+                pair = (words[i], words[j])
                 if self.check_minpair(pair):
                     minpairs.append(pair)
+            progress_bar.update(len(words_after))
+        progress_bar.close()
-        json_out = json.dumps([MinPair.obj_dict(pair) for pair in minpairs])
-        writefile(outfile, json_out)
-        print('Done! Generated', len(minpairs), 'minimal pairs')
+        return minpairs
-    def check_minpair(self, pair: MinPair) -> int:
+    def check_minpair(self, pair: WordPair) -> int:
         """
-        If the given pair si not a minpair, return NOT_MINPAIR; otherwise,
-        return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR and STRESS_MINPAIR
+        If the given pair is not a minpair, return NOT_MINPAIR; otherwise,
+        return, per case, PHONEME_MINPAIR, CHRONEME_MINPAIR or STRESS_MINPAIR
         """
         # Skip empty entries
-        if not pair.first.phonology or not pair.last.phonology:
+        if not pair[0].phonology or not pair[1].phonology:
             return False
         # A minimal pair is kept if it has an interesting difference.
         if self.keep_phonemes and self.check_phoneme_contrast(pair):
@@ -107,7 +96,7 @@ class MinPairGenerator:
         print("")
         word2.print_human_readable()
         print("")
-        verdict = self.check_minpair(MinPair(word1, word2))
+        verdict = self.check_minpair((word1, word2))
         if verdict == PHONEME_MINPAIR:
             print("minimal pair based on phoneme difference")
         elif verdict == CHRONEME_MINPAIR:
@@ -118,9 +107,11 @@ class MinPairGenerator:
             print("not minimal pair")
         return verdict
-    def check_phoneme_contrast(self, pair: MinPair) -> bool:
-        first = pair.first.phonology
-        last = pair.last.phonology
+    def check_phoneme_contrast(self, pair: WordPair) -> bool:
+        """Check if the two Words form a minimal pair based on a phoneme
+        difference"""
+        first = pair[0].phonology
+        last = pair[1].phonology
         # we have to work with same number of syllables
         if len(first) != len(last):
@@ -141,9 +132,11 @@ class MinPairGenerator:
         return (not self.optimise or self.check_optimised_phone_pair(diffs[0][0], diffs[0][1]))
-    def check_chroneme_contrast(self, pair: MinPair) -> bool:
-        first = pair.first.phonology
-        last = pair.last.phonology
+    def check_chroneme_contrast(self, pair: WordPair) -> bool:
+        """Check if the two `Word`s form a minimal pair based on a sound length
+        difference (i.e. a different chroneme)"""
+        first = pair[0].phonology
+        last = pair[1].phonology
         # we have to work with same number of syllables
         if len(first) != len(last):
@@ -166,9 +159,11 @@ class MinPairGenerator:
         return chroneme_diffs >= 1
-    def check_stress_contrast(self, pair: MinPair) -> bool:
-        first = pair.first.phonology
-        last = pair.last.phonology
+    def check_stress_contrast(self, pair: WordPair) -> bool:
+        """Check if the two `Word`s form a minimal pair based on different
+        placing of syllable stress, all sounds being the same"""
+        first = pair[0].phonology
+        last = pair[1].phonology
         # we have to work with same number of syllables
         if len(first) != len(last):
@@ -186,9 +181,9 @@ class MinPairGenerator:
 ### Helper functions ###
-def differences(A: list, B: list) -> list:
-    """Given two lists, return pairs of elements that differ at the same index"""
-    return [(a, b) for a, b in zip(A, B) if a != b]
+def flatten(lst: list[list]) -> set[list]:
+    """Return the set of all elements belonging to the sublists of the list"""
+    return set(chain(*lst))
 def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
     """
@@ -206,16 +201,12 @@ def phoneme_list_to_pairs(phoneme_list: list[str]) -> list[tuple[str]]:
     pairs = chain.from_iterable(combinations(s, r) for r in range(2, 2+1))
     return list(pairs)
-def flatten(lst: list[list]) -> set[list]:
-    """Return the set of all elements belonging to the sublists of the list"""
-    return set(chain(*lst))
 def phoneme_lists_to_phoneme_pairs(phoneme_lists: list[list[str]]) -> set[list]:
     """
     Given a list of lists of phonemes, return the combined set of all phoneme
     pairs made from every individual list.
     """
-    return flatten(list(map(phoneme_list_to_pairs, phoneme_lists)))
+    return flatten([phoneme_list_to_pairs(list) for list in phoneme_lists])
 ### CONSTANTS ###

grzegorz/io.py ADDED Viewed

@@ -0,0 +1,59 @@
+# Copyright (c) 2022 xylous <xylous.e@gmail.com>
+#
+# This file is part of grzegorz.
+# grzegorz is free software: you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# grzegorz.  If not, see <https://www.gnu.org/licenses/>.
+from grzegorz.word import (Word, WordPair)
+from typing import Callable, TypeVar
+T = TypeVar('T')
+def readfile(path: str) -> str:
+    """Return the contents of a file"""
+    with open(path, 'r', encoding='utf-8') as f:
+        return f.read()
+def writefile(path: str, text: str) -> None:
+    """Write `text` to the given path"""
+    with open(path, 'w', encoding='utf-8') as f:
+        f.write(text)
+# JSON has several disadvantages, alongside being too verbose for our purposes.
+# Running multiple threads, like `fetchipa()` does, would make it tricky to
+# add new data to the file. On the other hand, using plain text and a thread
+# mutex allows us to directly append new lines.
+GRZEGORZ_WORD_FORMAT_SEPARATOR = ", "
+GRZEGORZ_MINPAIR_FORMAT_SEPARATOR = " -- "
+def encode_word(word: Word) -> str:
+    return word.text + GRZEGORZ_WORD_FORMAT_SEPARATOR + word.ipa
+def encode_minpair(pair: WordPair) -> str:
+    return encode_word(pair[0]) + GRZEGORZ_MINPAIR_FORMAT_SEPARATOR + encode_word(pair[1])
+def decode_word(s: str) -> Word:
+    spl = s.split(GRZEGORZ_WORD_FORMAT_SEPARATOR)
+    return Word(spl[0], spl[1])
+def decode_minpair(s: str) -> WordPair:
+    spl = s.split(GRZEGORZ_MINPAIR_FORMAT_SEPARATOR)
+    return (decode_word(spl[0]), decode_word(spl[1]))
+def encode_format(hook: Callable[[T], str], input: list[T]) -> str:
+    return "\n".join([hook(elem) for elem in input])
+def decode_format(hook: Callable[[str], T], input: str) -> list[T]:
+    return [hook(line) for line in input.splitlines()]

grzegorz/subcommands.py ADDED Viewed

@@ -0,0 +1,161 @@
+# Copyright (c) 2023 xylous <xylous.e@gmail.com>
+#
+# This file is part of grzegorz.
+# grzegorz is free software: you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# grzegorz is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# grzegorz.  If not, see <https://www.gnu.org/licenses/>.
+from grzegorz.fetcher import get_ipa_for_word
+from grzegorz.generator import (MinPairGenerator)
+from grzegorz.anki_integration import (minpairs_to_deck, export_deck)
+from grzegorz.wordlist import (wordlist, print_languages_list, valid_lang)
+from grzegorz.word import Word
+from grzegorz.io import *
+from os import (remove, linesep)
+from multiprocessing import Pool
+from threading import Lock
+from functools import partial
+from tqdm import tqdm
+def fullmake(language: str, bounds: str, clean: bool) -> None:
+    """
+    Practically: wrap all commands into one. If `clean` is True, then
+    temporary files created by this function are removed.
+    """
+    wordlist_file = language + "-wordlist.txt"
+    ipa_file = language + "-ipa.txt"
+    minpairs_file = language + "-minpairs.txt"
+    makedeck_file = "grzegorz-" + language + "-minpairs.apkg"
+    if wordlist_command(language, bounds, wordlist_file) == 1:
+        exit(1)
+    fetchipa(wordlist_file, ipa_file, False, 20)
+    generate_command(ipa_file, minpairs_file, False, False, False, False)
+    makedeck(minpairs_file, makedeck_file)
+    if clean:
+        print("Removing temporary files...")
+        remove(wordlist_file)
+        remove(ipa_file)
+        remove(minpairs_file)
+def list_languages() -> None:
+    print_languages_list()
+def print_analysis(ipa: str) -> None:
+    Word("", ipa).print_human_readable()
+def print_minpair_check(ipa1: str, ipa2: str) -> None:
+    word1 = Word("", ipa2)
+    word2 = Word("", ipa1)
+    generator = MinPairGenerator(False, True, True, True)
+    if not generator.print_human_readable_check(word1, word2):
+        exit(1)
+def wordlist_command(language: str, bounds: str, outfile: str) -> int:
+    """
+    Fetch the words selected by `bounds` (either a count or a "lower:upper"
+    range) and put them into `outfile` for the given language, if it's valid
+    If the operation failed, then return 1, otherwise return 0
+    """
+    spl = bounds.split(":")
+    if bounds.isnumeric():
+        lowerbound = 0
+        upperbound = int(bounds)
+    elif spl[0].isnumeric() and spl[1].isnumeric():
+        lowerbound = int(spl[0])
+        upperbound = int(spl[1])
+    else:
+        print("Error: can't recognise bounds. Only positive integers are allowed before and after the ':'")
+        return 1
+    if lowerbound > upperbound:
+        print("Error: lower bound is bigger than upper bound; abort")
+        return 1
+    if not valid_lang(language):
+        print(language, "Error: that is not a language for which a wordlist can be fetched", sep='')
+        return 1
+    raw_words = wordlist(language, upperbound, lowerbound)
+    if raw_words:
+        writefile(outfile, '\n'.join(raw_words))
+        print("Fetched", upperbound - lowerbound, language, "words into", outfile)
+        return 0
+    else:
+        return 1
+def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
+    """
+    Given an input file with the language on its first line followed by one word
+    per line, fetch the IPAs and create a text file with their IPA spellings
+    matched to their text
+    """
+    # Ensure that we're processing the data with at least one process
+    if numproc < 1:
+        numproc = 1
+    wordlist = readfile(infile).splitlines()
+    language = wordlist.pop(0)
+    words = [line for line in wordlist if line]
+    numwords = len(words)
+    print("NOTE:",
+            "  Words are appended progressively to the file, so progress won't be lost.",
+            "  However, you won't be able to read the file while the program is running.",
+            sep=linesep)
+    print("Fetching IPA spellings for", numwords, language, "words...")
+    with open(outfile, "a", encoding='utf-8') as handle:
+        with Pool(numproc) as p:
+            for fetched_word in tqdm(p.imap_unordered(partial(get_ipa_for_word, language=language),
+                words), total=numwords):
+                if keep_failed or fetched_word.ipa != "":
+                    encoded = encode_word(fetched_word) + "\n"
+                    with Lock():
+                        handle.write(encoded)
+def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
+                     no_stress, filter_file_path=None) -> None:
+    words = decode_format(decode_word, readfile(infile))
+    g = MinPairGenerator(
+        not nooptimise,
+        not no_phonemes,
+        not no_chronemes,
+        not no_stress
+    )
+    if filter_file_path is not None:
+        g.set_filter_pairs_from_file(filter_file_path)
+    if no_phonemes and not no_chronemes and not no_stress:
+        print("Generator: skipping all contrasts means no minimal pairs will be generated; abort")
+        return
+    if no_phonemes:
+        print("Generator: phoneme contrasts will be ignored")
+    if no_chronemes:
+        print("Generator: chroneme contrasts will be ignored")
+    if no_stress:
+        print("Generator: syllable stress contrasts will be ignored")
+    print('Generating minimal pairs from:', len(words), 'words')
+    minpairs = g.generate(words, False)
+    writefile(outfile, encode_format(encode_minpair, minpairs))
+    print('Done! Generated', len(minpairs), 'minimal pairs')
+def makedeck(infile: str, outfile: str) -> None:
+    """Create an Anki deck given a file full of minimal pairs"""
+    minpairs = decode_format(decode_minpair, readfile(infile))
+    deck = minpairs_to_deck(minpairs)
+    export_deck(deck, outfile)
+    print('Done! Now import', outfile, 'in your Anki')

grzegorz/test.py CHANGED Viewed

@@ -49,52 +49,52 @@ class GeneratorTests(unittest.TestCase):
     def test_phoneme_contrast_r_and_m_not_optimised(self):
         w1 = Word("", "/barˈbaz/")
         w2 = Word("", "/bamˈbaz/")
-        self.assertTrue(g.check_phoneme_contrast(MinPair(w1, w2), False))
+        self.assertTrue(g.check_phoneme_contrast((w1, w2)))
     def test_phoneme_contrast_with_chroneme_difference(self):
         w1 = Word("", "/barˈbaz/")
         w2 = Word("", "/bar:ˈbaz/")
-        self.assertFalse(g.check_phoneme_contrast(MinPair(w1, w2), False))
+        self.assertFalse(g.check_phoneme_contrast((w1, w2)))
     def test_chroneme_contrast(self):
         w1 = Word("", "/barˈbaz/")
         w2 = Word("", "/bar:ˈbaz/")
-        self.assertTrue(g.check_chroneme_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_chroneme_contrast((w1, w2)))
     def test_chroneme_contrast_two_diffs(self):
         w1 = Word("", "/barˈbaz/")
         w2 = Word("", "/bar:ˈba:z/")
-        self.assertTrue(g.check_chroneme_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_chroneme_contrast((w1, w2)))
     def test_syllable_stress_contrast_two_syllable(self):
         w1 = Word("", "/barˈbaz/")
         w2 = Word("", "/bar.baz/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
     def test_syllable_stress_contrast_three_syllables_1(self):
         w1 = Word("", "/barˈbaz.do/")
         w2 = Word("", "/bar.baz.do/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
     def test_syllable_stress_contrast_three_syllables_2(self):
         w1 = Word("", "/barˈbaz.do/")
         w2 = Word("", "/bar.bazˈdo/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
     def test_syllable_stress_contrast_three_syllables_3(self):
         w1 = Word("", "/barˈbaz.do/")
         w2 = Word("", "/barˌbazˈdo/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
     def test_syllable_stress_contrast_three_syllables_4(self):
         w1 = Word("", "/bar.baz.do/")
         w2 = Word("", "/barˌbazˈdo/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
     def test_syllable_stress_contrast_four_syllables_1(self):
         w1 = Word("", "/bar.baz.do.man/")
         w2 = Word("", "/barˌbazˈdo.man/")
-        self.assertTrue(g.check_stress_contrast(MinPair(w1, w2)))
+        self.assertTrue(g.check_stress_contrast((w1, w2)))
 if __name__ == '__main__':
     unittest.main()

grzegorz/word.py CHANGED Viewed

@@ -126,58 +126,10 @@ class Word:
         return syllables
-    @staticmethod
-    def obj_dict(word):
-        """Return this class as a dictionary"""
-        dict = word.__dict__
-        # this might fail since the dictionary is mutated, and the same Word
-        # might be converted more than one time
-        try:
-            # We don't need to know about the sounds of the word; those can be
-            # computed
-            dict.pop('phonology')
-        except KeyError:
-            pass
-        return dict
-    @staticmethod
-    def from_dict(dict) -> 'Word':
-        """Deserialise this class from JSON"""
-        return Word(dict['text'], dict['ipa'])
-class MinPair:
-    """Two words in a pair. Voilà c'est tout."""
-    def __init__(self, first: Word, last: Word) -> None:
-        self.first = first;
-        self.last = last;
-    @staticmethod
-    def obj_dict(obj: 'MinPair'):
-        """Return this class as a dictionary"""
-        dict = obj.__dict__;
-        dict['first'] = Word.obj_dict(dict['first']);
-        dict['last'] = Word.obj_dict(dict['last']);
-        return dict
-    @staticmethod
-    def from_dict(dict) -> 'MinPair':
-        """Construct this class from a dictionary"""
-        word1 = Word.from_dict(dict['first'])
-        word2 = Word.from_dict(dict['last'])
-        return MinPair(word1, word2)
+WordPair = tuple[Word, Word]
 ### Helper functions ###
-def readfile(path: str) -> str:
-    """Return the contents of a file"""
-    with open(path, 'r', encoding='utf-8') as f:
-        return f.read()
-def writefile(path: str, text: str) -> None:
-    """Write `text` to the given path"""
-    with open(path, 'w', encoding='utf-8') as f:
-        f.write(text)
 def parse_ipa_characters(ipa: str) -> list[str]:
     """ Given an IPA transliteration, return all the IPA characters in it """
     # Remove any forward slashes, square brackets or round parentheses that

grzegorz/wordlist.py CHANGED Viewed

@@ -13,11 +13,8 @@
 # You should have received a copy of the GNU General Public License along with
 # grzegorz.  If not, see <https://www.gnu.org/licenses/>.
-from grzegorz.word import writefile
 import requests
-"""List of languages for which word lists can be fetched"""
 VALID_LANGUAGES = [
     # Germanic languages
     ('english', 'en'),
@@ -62,28 +59,32 @@ VALID_LANGUAGES = [
     ('chinese', 'zh'),
     ('japanese', 'ja'),
 ]
+"""
+List of languages for which word lists can be fetched, in tuple format, with the
+first element being the language full name and the second element being the
+language code
+"""
-"""This is where all the lists are fetched from"""
 RESOURCES_REPO_LINK = 'https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016'
+"""All the wordlists are fetched from here"""
-def wordlist(lang, numwords, outfile) -> int:
+def wordlist(lang: str, upperbound: int, lowerbound: int = 0) -> list[str]:
     """
-    Fetch a word list of `numwords` and put it into `outfile` for the given
-    language, if it's valid
-    If the operation failed, then return 1, otherwise return 0
+    Return the most common words that are between index `lowerbound` and
+    `upperbound` in the given language. Note that the first element is always
+    the language name. If it isn't present, then the language is invalid.
     """
-    language = lang_name(lang)
     if not valid_lang(lang):
-        print(lang, "? I can't fetch a wordlist for that", sep='')
-        return 1
+        return []
+    language = lang_name(lang)
     link = wordlist_link_for_lang(lang)
-    words_kept_slice = slice(0, numwords)
+    words_kept_slice = slice(lowerbound, upperbound)
     raw_words = fetch_contents(link).splitlines()[words_kept_slice]
-    raw_words = list(map(format_line, raw_words))
+    raw_words = [line.split()[0] for line in raw_words]
     raw_words.insert(0, language)
-    writefile(outfile, '\n'.join(raw_words))
-    print("Fetched", numwords, language, "words into", outfile)
-    return 0
+    return raw_words
 def print_languages_list() -> None:
     for (lang, code) in sorted(VALID_LANGUAGES, key=lambda pair: pair[1]):
@@ -91,44 +92,40 @@ def print_languages_list() -> None:
 ### HELPER FUNCTIONS ###
-def valid_lang(lang):
-    """We only accept languages that are on the list"""
+def valid_lang(lang: str) -> bool:
+    """Check if `wordlist()` can fetch a wordlist for the given language or
+    language code"""
     for pair in VALID_LANGUAGES:
         if lang in pair:
             return True
     return False
-def lang_code(lang):
-    """Given a language, return its language code"""
+def lang_code(lang: str) -> str:
+    """Given a language, return its language code, provided it's in the
+    `VALID_LANGUAGES` property"""
     for pair in VALID_LANGUAGES:
         if lang in pair:
             _, code = pair
             return code
     return ''
-def lang_name(lang):
-    """Given a language, return its language fullname"""
+def lang_name(lang: str) -> str:
+    """Given a language, return its language fullname, provided it's in the
+    `VALID_LANGUAGES` property"""
     for pair in VALID_LANGUAGES:
         if lang in pair:
             name, _ = pair
             return name
     return ''
-def wordlist_link_for_lang(lang):
-    """Return the link to the wordlist for the given language"""
+def wordlist_link_for_lang(lang: str):
+    """Return the link to the wordlist for the given language, provided it is
+    valid"""
     code = lang_code(lang)
     link = RESOURCES_REPO_LINK + "/" + code + "/" + code + "_50k.txt"
     return link
-def fetch_contents(link):
+def fetch_contents(link: str):
     """Return the string containing the webpage at `link`"""
     res = requests.get(link)
     return res.text
-def format_line(line):
-    """
-    The format of the list we fetched is not perfect: we need to keep only the
-    first word on every line
-    """
-    first_word = line.split()[0]
-    return first_word

{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: grzegorz
-Version: 0.5.0
+Version: 0.6.1
 Summary: Minimal pair generator and phonetics tool
 Home-page: https://github.com/xylous/grzegorz
 Author: xylous
@@ -16,10 +16,11 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: wiktionaryparser
+Requires-Dist: beautifulsoup4
 Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: genanki
+Dynamic: license-file
 # grzegorz

grzegorz-0.6.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+grzegorz/__main__.py,sha256=jYpB9FB0uuJwrpxBpU-d13vIw1vhNg2kYX4yC_-UWrg,7133
+grzegorz/anki_integration.py,sha256=eMFdFNd0NsqLxX23NtlEzinhGMCecEFyoklfFkMqQOk,3933
+grzegorz/fetcher.py,sha256=oAZRDZVqH93HgLFFffJ-dl6Qc83aD43ZuNVK9boy7F0,1902
+grzegorz/generator.py,sha256=oCz9TKg9wPN3VIGGa2H8L2Ex4Uf2_gX_XFrlxiB4RSw,9320
+grzegorz/io.py,sha256=JM2pOKgECmnVxCZplgRt1gEiyYWXUn_Z6OanmGSaab0,2221
+grzegorz/subcommands.py,sha256=QQQX1LraTi9Lfo28N1s4G1j-j_z4HtiUsAYsVNyt5FI,6101
+grzegorz/test.py,sha256=znHJFiV0Q1qP0kJYtoweMTNqJH1eX9ZHWFZedOJIuGo,3866
+grzegorz/word.py,sha256=bXNTq_sjrn7CTOWBGkKdQXky_j0c-OzxhhgJWDh0BR0,7899
+grzegorz/wordlist.py,sha256=SqKkZoyY8Ol1vp8Rt0PeNWtxL8ND3qE_yWGl5yiKZ_M,4058
+grzegorz-0.6.1.dist-info/licenses/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
+grzegorz-0.6.1.dist-info/METADATA,sha256=sZkAm0W5qIsh8fcWOpiy-cV6L64p5RsTWm06VYKbf5U,3980
+grzegorz-0.6.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
+grzegorz-0.6.1.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
+grzegorz-0.6.1.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
+grzegorz-0.6.1.dist-info/RECORD,,

{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.40.0)
+Generator: setuptools (80.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

grzegorz-0.5.0.dist-info/RECORD DELETED Viewed

@@ -1,14 +0,0 @@
-grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-grzegorz/__main__.py,sha256=7duY2W-u9q1gbiZrzsq8Ev_kEScv_mqm1VcRQlhzv48,8531
-grzegorz/anki_integration.py,sha256=W1Y9Uomx3W87FfLUTYX4s97ehI6gzbbfxrXZlEGnHRY,4156
-grzegorz/fetcher.py,sha256=kpZ0dZjxLo31IUKHYkv0M2QUGChm0DjkL_CAdiILv0w,2823
-grzegorz/generator.py,sha256=DS2U36YtFWxDLvgva7rrr_O-H59wf_mzuDxtYE75s9Y,9808
-grzegorz/test.py,sha256=8lwwtimbmFEH7vJnRKerK0tvZy1ozwVWO_HglASkXe0,3950
-grzegorz/word.py,sha256=wivk5Zs37Xx2rertpNz-Ui67Wrh6JLPMbNTQJDnZNzQ,9448
-grzegorz/wordlist.py,sha256=ALpDy15uUhO74bVFInJ7VJfgJdTDeCijTgwwGgSHdUs,4024
-grzegorz-0.5.0.dist-info/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
-grzegorz-0.5.0.dist-info/METADATA,sha256=lhGR2w4N3D42rt5TUWk5ksYx04Jkzk_XmZMKXaaaEFE,3960
-grzegorz-0.5.0.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-grzegorz-0.5.0.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
-grzegorz-0.5.0.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
-grzegorz-0.5.0.dist-info/RECORD,,

{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

{grzegorz-0.5.0.dist-info → grzegorz-0.6.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

grzegorz 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

grzegorz 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl