grzegorz 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {grzegorz-0.6.0 → grzegorz-0.6.2}/PKG-INFO +8 -2
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/__main__.py +22 -26
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/fetcher.py +29 -16
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/generator.py +7 -3
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/subcommands.py +3 -2
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/PKG-INFO +8 -2
- grzegorz-0.6.2/grzegorz.egg-info/requires.txt +5 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/setup.cfg +3 -2
- grzegorz-0.6.0/grzegorz.egg-info/requires.txt +0 -4
- {grzegorz-0.6.0 → grzegorz-0.6.2}/LICENSE +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/README.md +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/__init__.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/anki_integration.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/io.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/test.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/word.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz/wordlist.py +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/SOURCES.txt +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/dependency_links.txt +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/entry_points.txt +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/grzegorz.egg-info/top_level.txt +0 -0
- {grzegorz-0.6.0 → grzegorz-0.6.2}/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: grzegorz
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: Minimal pair generator and phonetics tool
|
|
5
5
|
Home-page: https://github.com/xylous/grzegorz
|
|
6
6
|
Author: xylous
|
|
@@ -16,6 +16,12 @@ Classifier: Topic :: Text Processing :: Linguistic
|
|
|
16
16
|
Requires-Python: >=3.10
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: beautifulsoup4
|
|
20
|
+
Requires-Dist: tqdm
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: genanki
|
|
23
|
+
Requires-Dist: fake-useragent
|
|
24
|
+
Dynamic: license-file
|
|
19
25
|
|
|
20
26
|
# grzegorz
|
|
21
27
|
|
|
@@ -26,20 +26,17 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
26
26
|
|
|
27
27
|
# 'analyse' subcommand
|
|
28
28
|
parser_analyse = subparsers.add_parser('analyse',
|
|
29
|
-
help='
|
|
29
|
+
help='Parse the given IPA transcription')
|
|
30
30
|
parser_analyse.add_argument('ipa',
|
|
31
|
-
type=str
|
|
32
|
-
help="IPA transcription")
|
|
31
|
+
type=str)
|
|
33
32
|
|
|
34
33
|
# 'check' subcommand
|
|
35
34
|
parser_check = subparsers.add_parser('check',
|
|
36
|
-
help='Check if the two given IPAs can form minimal pair')
|
|
37
|
-
parser_check
|
|
38
|
-
type=str
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
type=str,
|
|
42
|
-
help="second IPA transcription")
|
|
35
|
+
help='Check if the two given IPAs can form a minimal pair')
|
|
36
|
+
parser_check.add_argument('ipa_first',
|
|
37
|
+
type=str)
|
|
38
|
+
parser_check.add_argument('ipa_second',
|
|
39
|
+
type=str)
|
|
43
40
|
|
|
44
41
|
# 'list-languages' subcommand
|
|
45
42
|
subparsers.add_parser('list-languages',
|
|
@@ -47,12 +44,12 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
47
44
|
|
|
48
45
|
# 'fullmake' command
|
|
49
46
|
parser_fullmake = subparsers.add_parser('fullmake',
|
|
50
|
-
help='Build an Anki deck for a language
|
|
47
|
+
help=f'Build an Anki deck for a language (equivalent of \'wordlist\', \'fetchipa\', \'generate\', \'makedeck\')')
|
|
51
48
|
parser_fullmake.add_argument('language',
|
|
52
49
|
type=str)
|
|
53
50
|
parser_fullmake.add_argument('bounds',
|
|
54
51
|
type=str,
|
|
55
|
-
help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
52
|
+
help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
56
53
|
parser_fullmake.add_argument('--clean',
|
|
57
54
|
dest='clean',
|
|
58
55
|
action='store_true',
|
|
@@ -61,31 +58,30 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
61
58
|
|
|
62
59
|
# 'wordlist' command
|
|
63
60
|
parser_wordlist = subparsers.add_parser('wordlist',
|
|
64
|
-
help='
|
|
61
|
+
help='Get the specified number of words from a frequency wordlist in the given language')
|
|
65
62
|
parser_wordlist.add_argument('language',
|
|
66
63
|
type=str,
|
|
67
64
|
help='language of the wordlist')
|
|
68
65
|
parser_wordlist.add_argument('bounds',
|
|
69
66
|
type=str,
|
|
70
|
-
help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
67
|
+
help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
|
|
71
68
|
parser_wordlist.add_argument('outfile',
|
|
72
69
|
type=str,
|
|
73
70
|
help='path where the wordlist should be stored')
|
|
74
71
|
|
|
75
72
|
# 'fetchipa' subcommand
|
|
76
73
|
parser_fetchipa = subparsers.add_parser('fetchipa',
|
|
77
|
-
help='Fetch
|
|
74
|
+
help='Fetch IPA pronunciations for words in a wordlist')
|
|
78
75
|
parser_fetchipa.add_argument('infile',
|
|
79
76
|
type=str,
|
|
80
|
-
help='
|
|
77
|
+
help='wordlist output file')
|
|
81
78
|
parser_fetchipa.add_argument('outfile',
|
|
82
|
-
type=str
|
|
83
|
-
help='output file (JSON)')
|
|
79
|
+
type=str)
|
|
84
80
|
parser_fetchipa.add_argument('--keep-failed',
|
|
85
81
|
dest='keep_failed',
|
|
86
82
|
action='store_true',
|
|
87
83
|
default=False,
|
|
88
|
-
help='
|
|
84
|
+
help='In the output file, keep the words with no found IPA (default: don\'t)')
|
|
89
85
|
parser_fetchipa.add_argument('--numproc',
|
|
90
86
|
type=int,
|
|
91
87
|
dest='numproc',
|
|
@@ -94,10 +90,10 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
94
90
|
|
|
95
91
|
# 'generate' subcommand
|
|
96
92
|
parser_generate = subparsers.add_parser('generate',
|
|
97
|
-
help='
|
|
93
|
+
help='Find minimal pairs based on the output file of \'fetchipa\'')
|
|
98
94
|
parser_generate.add_argument('infile',
|
|
99
95
|
type=str,
|
|
100
|
-
help='
|
|
96
|
+
help='file created by fetchipa')
|
|
101
97
|
parser_generate.add_argument('outfile',
|
|
102
98
|
type=str,
|
|
103
99
|
help='path where the created minimal pairs will be stored')
|
|
@@ -105,7 +101,7 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
105
101
|
action='store_true',
|
|
106
102
|
default=False,
|
|
107
103
|
dest="nooptimise",
|
|
108
|
-
help="generate all possible minimal pairs (default:
|
|
104
|
+
help="generate all possible minimal pairs (default: similar sounds)")
|
|
109
105
|
parser_generate.add_argument('--no-phonemes',
|
|
110
106
|
action='store_true',
|
|
111
107
|
default=False,
|
|
@@ -124,17 +120,17 @@ def create_argparser() -> argparse.ArgumentParser:
|
|
|
124
120
|
parser_generate.add_argument('-f', '--filter-file',
|
|
125
121
|
type=str,
|
|
126
122
|
dest="path",
|
|
127
|
-
help="path to
|
|
123
|
+
help="path to file with rules for desired phoneme differences")
|
|
128
124
|
|
|
129
125
|
# 'makedeck' subcommand
|
|
130
126
|
parser_makedeck = subparsers.add_parser('makedeck',
|
|
131
|
-
help='Create an Anki deck package
|
|
127
|
+
help='Create an Anki deck package file from the output of the \'generate\' command')
|
|
132
128
|
parser_makedeck.add_argument('infile',
|
|
133
129
|
type=str,
|
|
134
|
-
help="
|
|
130
|
+
help="output file of 'generate'")
|
|
135
131
|
parser_makedeck.add_argument('outfile',
|
|
136
132
|
type=str,
|
|
137
|
-
help="
|
|
133
|
+
help="(.apkg extension)")
|
|
138
134
|
|
|
139
135
|
return parser
|
|
140
136
|
|
|
@@ -15,35 +15,48 @@
|
|
|
15
15
|
|
|
16
16
|
from grzegorz.word import Word
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
import requests
|
|
19
|
+
from bs4 import BeautifulSoup
|
|
20
|
+
from fake_useragent import UserAgent
|
|
19
21
|
import re
|
|
20
22
|
|
|
21
23
|
### HELPER FUNCTIONS ###
|
|
22
|
-
|
|
23
24
|
def get_ipa_for_word(word: str, language: str) -> Word:
|
|
24
25
|
"""
|
|
25
26
|
Look for the IPA transliteration of the given word in the specified language
|
|
26
27
|
and return a `Word` binding it to the letters. If no transcription was
|
|
27
28
|
found, then the `ipa` field of the result is empty.
|
|
28
29
|
"""
|
|
29
|
-
|
|
30
|
-
|
|
30
|
+
language = language.capitalize()
|
|
31
|
+
language = "Serbo-Croatian" if language in ["Croatian", "Serbian"] else language
|
|
32
|
+
url = f"https://en.wiktionary.org/wiki/{word}"
|
|
33
|
+
|
|
34
|
+
# wiktionary blocks requests with no/standard user-agent
|
|
35
|
+
# use a random one to bypass that
|
|
36
|
+
ua = UserAgent()
|
|
37
|
+
headers = {"User-Agent": ua.random}
|
|
38
|
+
|
|
39
|
+
webpage = requests.get(url, headers=headers)
|
|
40
|
+
soup = BeautifulSoup(webpage.text, "html.parser")
|
|
41
|
+
pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
|
|
42
|
+
|
|
31
43
|
ipa = ""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
42
|
-
if
|
|
43
|
-
|
|
44
|
+
# maybe blindly choosing the first IPA transliteration is not the wisest
|
|
45
|
+
# choice in the world?
|
|
46
|
+
if len(pronunciations):
|
|
47
|
+
first_entry = pronunciations[0].find("span", {"class": "IPA"})
|
|
48
|
+
if first_entry is not None:
|
|
49
|
+
ipa = first_entry.text
|
|
50
|
+
|
|
51
|
+
# in German, nouns are capitalized, but the wordlist we're using might not
|
|
52
|
+
# respect that. This accounts for that, but likely reduces performance for
|
|
53
|
+
# words without any wiktionary entry.
|
|
54
|
+
if language == "German" and ipa == "" and word != word.capitalize():
|
|
55
|
+
return get_ipa_for_word(word.capitalize(), language)
|
|
44
56
|
|
|
45
57
|
return Word(word, ipa)
|
|
46
58
|
|
|
59
|
+
|
|
47
60
|
def first_ipa_pronunciation(ipa_str: str) -> str:
|
|
48
61
|
"""Find the first IPA spelling in the given string"""
|
|
49
62
|
result = re.findall(r"[/\[].*?[/\]]", ipa_str)
|
|
@@ -46,17 +46,21 @@ class MinPairGenerator:
|
|
|
46
46
|
lists_of_phonemes.append(line.replace(" ", "").split(","))
|
|
47
47
|
self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
|
|
48
48
|
|
|
49
|
-
def generate(self, words: list[Word]) -> list[WordPair]:
|
|
49
|
+
def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
|
|
50
50
|
"""
|
|
51
51
|
Generate minimal pairs from the given parameters
|
|
52
52
|
"""
|
|
53
53
|
minpairs = []
|
|
54
54
|
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
|
|
56
|
+
for i in range(0, len(words)):
|
|
57
|
+
words_after = range(i+1, len(words))
|
|
58
|
+
for j in words_after:
|
|
57
59
|
pair = (words[i], words[j])
|
|
58
60
|
if self.check_minpair(pair):
|
|
59
61
|
minpairs.append(pair)
|
|
62
|
+
progress_bar.update(len(words_after))
|
|
63
|
+
progress_bar.close()
|
|
60
64
|
|
|
61
65
|
return minpairs
|
|
62
66
|
|
|
@@ -95,7 +95,7 @@ def wordlist_command(language: str, bounds: str, outfile: str) -> int:
|
|
|
95
95
|
else:
|
|
96
96
|
return 1
|
|
97
97
|
|
|
98
|
-
def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int =
|
|
98
|
+
def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
|
|
99
99
|
"""
|
|
100
100
|
Given an input file containing a list of words separated, fetch the IPAs and
|
|
101
101
|
create a text file with their IPA spellings matched to their text
|
|
@@ -148,7 +148,8 @@ def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
|
|
|
148
148
|
if no_stress:
|
|
149
149
|
print("Generator: syllable stress contrasts will be ignored")
|
|
150
150
|
|
|
151
|
-
|
|
151
|
+
print('Generating minimal pairs from:', len(words), 'words')
|
|
152
|
+
minpairs = g.generate(words, False)
|
|
152
153
|
writefile(outfile, encode_format(encode_minpair, minpairs))
|
|
153
154
|
print('Done! Generated', len(minpairs), 'minimal pairs')
|
|
154
155
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: grzegorz
|
|
3
|
-
Version: 0.6.
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: Minimal pair generator and phonetics tool
|
|
5
5
|
Home-page: https://github.com/xylous/grzegorz
|
|
6
6
|
Author: xylous
|
|
@@ -16,6 +16,12 @@ Classifier: Topic :: Text Processing :: Linguistic
|
|
|
16
16
|
Requires-Python: >=3.10
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
+
Requires-Dist: beautifulsoup4
|
|
20
|
+
Requires-Dist: tqdm
|
|
21
|
+
Requires-Dist: requests
|
|
22
|
+
Requires-Dist: genanki
|
|
23
|
+
Requires-Dist: fake-useragent
|
|
24
|
+
Dynamic: license-file
|
|
19
25
|
|
|
20
26
|
# grzegorz
|
|
21
27
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = grzegorz
|
|
3
|
-
version = 0.6.
|
|
3
|
+
version = 0.6.2
|
|
4
4
|
author = xylous
|
|
5
5
|
author_email = xylous.e@gmail.com
|
|
6
6
|
description = Minimal pair generator and phonetics tool
|
|
@@ -23,10 +23,11 @@ packages =
|
|
|
23
23
|
grzegorz
|
|
24
24
|
python_requires = >=3.10
|
|
25
25
|
install_requires =
|
|
26
|
-
|
|
26
|
+
beautifulsoup4
|
|
27
27
|
tqdm
|
|
28
28
|
requests
|
|
29
29
|
genanki
|
|
30
|
+
fake-useragent
|
|
30
31
|
|
|
31
32
|
[options.entry_points]
|
|
32
33
|
console_scripts =
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|