PyPI - grzegorz - Versions diffs - 0.6.0__tar.gz → 0.6.2__tar.gz - Mend

grzegorz 0.6.0 (tar.gz) → 0.6.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

{grzegorz-0.6.0 → grzegorz-0.6.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: grzegorz
-Version: 0.6.0
+Version: 0.6.2
 Summary: Minimal pair generator and phonetics tool
 Home-page: https://github.com/xylous/grzegorz
 Author: xylous
@@ -16,6 +16,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: beautifulsoup4
+Requires-Dist: tqdm
+Requires-Dist: requests
+Requires-Dist: genanki
+Requires-Dist: fake-useragent
+Dynamic: license-file
 # grzegorz

{grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/__main__.py RENAMED Viewed

@@ -26,20 +26,17 @@ def create_argparser() -> argparse.ArgumentParser:
     # 'analyse' subcommand
     parser_analyse = subparsers.add_parser('analyse',
-            help='Print the result of phonologically parsing of the given IPA transcription')
+            help='Parse the given IPA transcription')
     parser_analyse.add_argument('ipa',
-            type=str,
-            help="IPA transcription")
+            type=str)
     # 'check' subcommand
     parser_check = subparsers.add_parser('check',
-            help='Check if the two given IPAs can form minimal pair')
-    parser_check .add_argument('ipa_first',
-            type=str,
-            help="first IPA transcription")
-    parser_check .add_argument('ipa_second',
-            type=str,
-            help="second IPA transcription")
+            help='Check if the two given IPAs can form a minimal pair')
+    parser_check.add_argument('ipa_first',
+            type=str)
+    parser_check.add_argument('ipa_second',
+            type=str)
     # 'list-languages' subcommand
     subparsers.add_parser('list-languages',
@@ -47,12 +44,12 @@ def create_argparser() -> argparse.ArgumentParser:
     # 'fullmake' command
     parser_fullmake = subparsers.add_parser('fullmake',
-            help='Build an Anki deck for a language automatically')
+            help=f'Build an Anki deck for a language (equivalent of \'wordlist\', \'fetchipa\', \'generate\', \'makedeck\')')
     parser_fullmake.add_argument('language',
             type=str)
     parser_fullmake.add_argument('bounds',
             type=str,
-            help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
+            help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
     parser_fullmake.add_argument('--clean',
             dest='clean',
             action='store_true',
@@ -61,31 +58,30 @@ def create_argparser() -> argparse.ArgumentParser:
     # 'wordlist' command
     parser_wordlist = subparsers.add_parser('wordlist',
-            help='Fetch the word list for a given language, containing a certain number of words')
+            help='Get the specified number of words from a frequency wordlist in the given language')
     parser_wordlist.add_argument('language',
             type=str,
             help='language of the wordlist')
     parser_wordlist.add_argument('bounds',
             type=str,
-            help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
+            help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
     parser_wordlist.add_argument('outfile',
             type=str,
             help='path where the wordlist should be stored')
     # 'fetchipa' subcommand
     parser_fetchipa = subparsers.add_parser('fetchipa',
-            help='Fetch all IPA pronunciations for the words into a JSON file')
+            help='Fetch IPA pronunciations for words in a wordlist')
     parser_fetchipa.add_argument('infile',
             type=str,
-            help='file containing the list of words')
+            help='wordlist output file')
     parser_fetchipa.add_argument('outfile',
-            type=str,
-            help='output file (JSON)')
+            type=str)
     parser_fetchipa.add_argument('--keep-failed',
             dest='keep_failed',
             action='store_true',
             default=False,
-            help='Save the words for which no IPA was found in the output file (default: don\'t)')
+            help='In the output file, keep the words with no found IPA (default: don\'t)')
     parser_fetchipa.add_argument('--numproc',
             type=int,
             dest='numproc',
@@ -94,10 +90,10 @@ def create_argparser() -> argparse.ArgumentParser:
     # 'generate' subcommand
     parser_generate = subparsers.add_parser('generate',
-            help='Create minimal pairs, given a JSON input file')
+            help='Find minimal pairs based on the output file of \'fetchipa\'')
     parser_generate.add_argument('infile',
             type=str,
-            help='JSON file created by fetchipa')
+            help='file created by fetchipa')
     parser_generate.add_argument('outfile',
             type=str,
             help='path where the created minimal pairs will be stored')
@@ -105,7 +101,7 @@ def create_argparser() -> argparse.ArgumentParser:
             action='store_true',
             default=False,
             dest="nooptimise",
-            help="generate all possible minimal pairs (default: optimise)")
+            help="generate all possible minimal pairs (default: similar sounds)")
     parser_generate.add_argument('--no-phonemes',
             action='store_true',
             default=False,
@@ -124,17 +120,17 @@ def create_argparser() -> argparse.ArgumentParser:
     parser_generate.add_argument('-f', '--filter-file',
             type=str,
             dest="path",
-            help="path to the file whose contents determine the phones to keep when optimising")
+            help="path to file with rules for desired phoneme differences")
     # 'makedeck' subcommand
     parser_makedeck = subparsers.add_parser('makedeck',
-            help='Create an Anki deck package containing all minimal pairs')
+            help='Create an Anki deck package file from the output of the \'generate\' command')
     parser_makedeck.add_argument('infile',
             type=str,
-            help="Output file of 'generate'")
+            help="output file of 'generate'")
     parser_makedeck.add_argument('outfile',
             type=str,
-            help="Output file; note that it should ideally have the .apkg extension")
+            help="(.apkg extension)")
     return parser

{grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/fetcher.py RENAMED Viewed

@@ -15,35 +15,48 @@
 from grzegorz.word import Word
-from wiktionaryparser import WiktionaryParser
+import requests
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
 import re
 ### HELPER FUNCTIONS ###
 def get_ipa_for_word(word: str, language: str) -> Word:
     """
     Look for the IPA transliteration of the given word in the specified language
     and return a `Word` binding it to the letters. If no transcription was
     found, then the `ipa` field of the result is empty.
     """
-    parser = WiktionaryParser()
-    parser.set_default_language(language)
+    language = language.capitalize()
+    language = "Serbo-Croatian" if language in ["Croatian", "Serbian"] else language
+    url = f"https://en.wiktionary.org/wiki/{word}"
+    # wiktionary blocks requests with no/standard user-agent
+    # use a random one to bypass that
+    ua = UserAgent()
+    headers = {"User-Agent": ua.random}
+    webpage = requests.get(url, headers=headers)
+    soup = BeautifulSoup(webpage.text, "html.parser")
+    pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
     ipa = ""
-    fetched = parser.fetch(word)
-    if len(fetched):
-        first_entry = fetched[0]
-        pronunciations = first_entry.get('pronunciations')
-        text = pronunciations.get('text')
-        if len(text):
-            ipa = first_ipa_pronunciation(text[0])
-    # Not all words have their IPAs on wiktionary, but they might have a
-    # "Rhymes" section (many German words do, for example). If we did fetch a
-    # rhyme, don't add it as a valid IPA
-    if len(ipa) and ipa[0] == '-':
-        ipa = ""
+    # maybe blindly choosing the first IPA transliteration is not the wisest
+    # choice in the world?
+    if len(pronunciations):
+        first_entry = pronunciations[0].find("span", {"class": "IPA"})
+        if first_entry is not None:
+            ipa = first_entry.text
+    # in German, nouns are capitalized, but the wordlist we're using might not
+    # respect that. This accounts for that, but likely reduces performance for
+    # words without any wiktionary entry.
+    if language == "German" and ipa == "" and word != word.capitalize():
+        return get_ipa_for_word(word.capitalize(), language)
     return Word(word, ipa)
 def first_ipa_pronunciation(ipa_str: str) -> str:
     """Find the first IPA spelling in the given string"""
     result = re.findall(r"[/\[].*?[/\]]", ipa_str)

{grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/generator.py RENAMED Viewed

@@ -46,17 +46,21 @@ class MinPairGenerator:
                 lists_of_phonemes.append(line.replace(" ", "").split(","))
         self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
-    def generate(self, words: list[Word]) -> list[WordPair]:
+    def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
         """
         Generate minimal pairs from the given parameters
         """
         minpairs = []
-        for i in tqdm(range(0,len(words))):
-            for j in range(i+1,len(words)):
+        progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
+        for i in range(0, len(words)):
+            words_after = range(i+1, len(words))
+            for j in words_after:
                 pair = (words[i], words[j])
                 if self.check_minpair(pair):
                     minpairs.append(pair)
+            progress_bar.update(len(words_after))
+        progress_bar.close()
         return minpairs

{grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/subcommands.py RENAMED Viewed

@@ -95,7 +95,7 @@ def wordlist_command(language: str, bounds: str, outfile: str) -> int:
     else:
         return 1
-def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 10) -> None:
+def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
     """
     Given an input file containing a list of words separated, fetch the IPAs and
     create a text file with their IPA spellings matched to their text
@@ -148,7 +148,8 @@ def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
     if no_stress:
         print("Generator: syllable stress contrasts will be ignored")
-    minpairs = g.generate(words)
+    print('Generating minimal pairs from:', len(words), 'words')
+    minpairs = g.generate(words, False)
     writefile(outfile, encode_format(encode_minpair, minpairs))
     print('Done! Generated', len(minpairs), 'minimal pairs')

{grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: grzegorz
-Version: 0.6.0
+Version: 0.6.2
 Summary: Minimal pair generator and phonetics tool
 Home-page: https://github.com/xylous/grzegorz
 Author: xylous
@@ -16,6 +16,12 @@ Classifier: Topic :: Text Processing :: Linguistic
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: beautifulsoup4
+Requires-Dist: tqdm
+Requires-Dist: requests
+Requires-Dist: genanki
+Requires-Dist: fake-useragent
+Dynamic: license-file
 # grzegorz

grzegorz-0.6.2/grzegorz.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,5 @@
+beautifulsoup4
+tqdm
+requests
+genanki
+fake-useragent

{grzegorz-0.6.0 → grzegorz-0.6.2}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = grzegorz
-version = 0.6.0
+version = 0.6.2
 author = xylous
 author_email = xylous.e@gmail.com
 description = Minimal pair generator and phonetics tool
@@ -23,10 +23,11 @@ packages =
 	grzegorz
 python_requires = >=3.10
 install_requires =
-	wiktionaryparser
+	beautifulsoup4
 	tqdm
 	requests
 	genanki
+	fake-useragent
 [options.entry_points]
 console_scripts =