grzegorz 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
grzegorz/__main__.py CHANGED
@@ -26,20 +26,17 @@ def create_argparser() -> argparse.ArgumentParser:
26
26
 
27
27
  # 'analyse' subcommand
28
28
  parser_analyse = subparsers.add_parser('analyse',
29
- help='Print the result of phonologically parsing of the given IPA transcription')
29
+ help='Parse the given IPA transcription')
30
30
  parser_analyse.add_argument('ipa',
31
- type=str,
32
- help="IPA transcription")
31
+ type=str)
33
32
 
34
33
  # 'check' subcommand
35
34
  parser_check = subparsers.add_parser('check',
36
- help='Check if the two given IPAs can form minimal pair')
37
- parser_check .add_argument('ipa_first',
38
- type=str,
39
- help="first IPA transcription")
40
- parser_check .add_argument('ipa_second',
41
- type=str,
42
- help="second IPA transcription")
35
+ help='Check if the two given IPAs can form a minimal pair')
36
+ parser_check.add_argument('ipa_first',
37
+ type=str)
38
+ parser_check.add_argument('ipa_second',
39
+ type=str)
43
40
 
44
41
  # 'list-languages' subcommand
45
42
  subparsers.add_parser('list-languages',
@@ -47,12 +44,12 @@ def create_argparser() -> argparse.ArgumentParser:
47
44
 
48
45
  # 'fullmake' command
49
46
  parser_fullmake = subparsers.add_parser('fullmake',
50
- help='Build an Anki deck for a language automatically')
47
+ help=f'Build an Anki deck for a language (equivalent of \'wordlist\', \'fetchipa\', \'generate\', \'makedeck\')')
51
48
  parser_fullmake.add_argument('language',
52
49
  type=str)
53
50
  parser_fullmake.add_argument('bounds',
54
51
  type=str,
55
- help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
52
+ help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
56
53
  parser_fullmake.add_argument('--clean',
57
54
  dest='clean',
58
55
  action='store_true',
@@ -61,31 +58,30 @@ def create_argparser() -> argparse.ArgumentParser:
61
58
 
62
59
  # 'wordlist' command
63
60
  parser_wordlist = subparsers.add_parser('wordlist',
64
- help='Fetch the word list for a given language, containing a certain number of words')
61
+ help='Get the specified number of words from a frequency wordlist in the given language')
65
62
  parser_wordlist.add_argument('language',
66
63
  type=str,
67
64
  help='language of the wordlist')
68
65
  parser_wordlist.add_argument('bounds',
69
66
  type=str,
70
- help='number of words to keep; alternatively, the range of words to keep, e.g. "1500:3000"')
67
+ help='number of words to keep, e.g. "5000"; alternatively, the range of words to keep, e.g. "1500:3000"')
71
68
  parser_wordlist.add_argument('outfile',
72
69
  type=str,
73
70
  help='path where the wordlist should be stored')
74
71
 
75
72
  # 'fetchipa' subcommand
76
73
  parser_fetchipa = subparsers.add_parser('fetchipa',
77
- help='Fetch all IPA pronunciations for the words into a JSON file')
74
+ help='Fetch IPA pronunciations for words in a wordlist')
78
75
  parser_fetchipa.add_argument('infile',
79
76
  type=str,
80
- help='file containing the list of words')
77
+ help='wordlist output file')
81
78
  parser_fetchipa.add_argument('outfile',
82
- type=str,
83
- help='output file (JSON)')
79
+ type=str)
84
80
  parser_fetchipa.add_argument('--keep-failed',
85
81
  dest='keep_failed',
86
82
  action='store_true',
87
83
  default=False,
88
- help='Save the words for which no IPA was found in the output file (default: don\'t)')
84
+ help='In the output file, keep the words with no found IPA (default: don\'t)')
89
85
  parser_fetchipa.add_argument('--numproc',
90
86
  type=int,
91
87
  dest='numproc',
@@ -94,10 +90,10 @@ def create_argparser() -> argparse.ArgumentParser:
94
90
 
95
91
  # 'generate' subcommand
96
92
  parser_generate = subparsers.add_parser('generate',
97
- help='Create minimal pairs, given a JSON input file')
93
+ help='Find minimal pairs based on the output file of \'fetchipa\'')
98
94
  parser_generate.add_argument('infile',
99
95
  type=str,
100
- help='JSON file created by fetchipa')
96
+ help='file created by fetchipa')
101
97
  parser_generate.add_argument('outfile',
102
98
  type=str,
103
99
  help='path where the created minimal pairs will be stored')
@@ -105,7 +101,7 @@ def create_argparser() -> argparse.ArgumentParser:
105
101
  action='store_true',
106
102
  default=False,
107
103
  dest="nooptimise",
108
- help="generate all possible minimal pairs (default: optimise)")
104
+ help="generate all possible minimal pairs (default: similar sounds)")
109
105
  parser_generate.add_argument('--no-phonemes',
110
106
  action='store_true',
111
107
  default=False,
@@ -124,17 +120,17 @@ def create_argparser() -> argparse.ArgumentParser:
124
120
  parser_generate.add_argument('-f', '--filter-file',
125
121
  type=str,
126
122
  dest="path",
127
- help="path to the file whose contents determine the phones to keep when optimising")
123
+ help="path to file with rules for desired phoneme differences")
128
124
 
129
125
  # 'makedeck' subcommand
130
126
  parser_makedeck = subparsers.add_parser('makedeck',
131
- help='Create an Anki deck package containing all minimal pairs')
127
+ help='Create an Anki deck package file from the output of the \'generate\' command')
132
128
  parser_makedeck.add_argument('infile',
133
129
  type=str,
134
- help="Output file of 'generate'")
130
+ help="output file of 'generate'")
135
131
  parser_makedeck.add_argument('outfile',
136
132
  type=str,
137
- help="Output file; note that it should ideally have the .apkg extension")
133
+ help="(.apkg extension)")
138
134
 
139
135
  return parser
140
136
 
grzegorz/fetcher.py CHANGED
@@ -15,35 +15,48 @@
15
15
 
16
16
  from grzegorz.word import Word
17
17
 
18
- from wiktionaryparser import WiktionaryParser
18
+ import requests
19
+ from bs4 import BeautifulSoup
20
+ from fake_useragent import UserAgent
19
21
  import re
20
22
 
21
23
  ### HELPER FUNCTIONS ###
22
-
23
24
  def get_ipa_for_word(word: str, language: str) -> Word:
24
25
  """
25
26
  Look for the IPA transliteration of the given word in the specified language
26
27
  and return a `Word` binding it to the letters. If no transcription was
27
28
  found, then the `ipa` field of the result is empty.
28
29
  """
29
- parser = WiktionaryParser()
30
- parser.set_default_language(language)
30
+ language = language.capitalize()
31
+ language = "Serbo-Croatian" if language in ["Croatian", "Serbian"] else language
32
+ url = f"https://en.wiktionary.org/wiki/{word}"
33
+
34
+ # wiktionary blocks requests with no/standard user-agent
35
+ # use a random one to bypass that
36
+ ua = UserAgent()
37
+ headers = {"User-Agent": ua.random}
38
+
39
+ webpage = requests.get(url, headers=headers)
40
+ soup = BeautifulSoup(webpage.text, "html.parser")
41
+ pronunciations= soup.select(f'li:has(sup:has(a[href="/wiki/Appendix:{language}_pronunciation"]))' )
42
+
31
43
  ipa = ""
32
- fetched = parser.fetch(word)
33
- if len(fetched):
34
- first_entry = fetched[0]
35
- pronunciations = first_entry.get('pronunciations')
36
- text = pronunciations.get('text')
37
- if len(text):
38
- ipa = first_ipa_pronunciation(text[0])
39
- # Not all words have their IPAs on wiktionary, but they might have a
40
- # "Rhymes" section (many German words do, for example). If we did fetch a
41
- # rhyme, don't add it as a valid IPA
42
- if len(ipa) and ipa[0] == '-':
43
- ipa = ""
44
+ # maybe blindly choosing the first IPA transliteration is not the wisest
45
+ # choice in the world?
46
+ if len(pronunciations):
47
+ first_entry = pronunciations[0].find("span", {"class": "IPA"})
48
+ if first_entry is not None:
49
+ ipa = first_entry.text
50
+
51
+ # in German, nouns are capitalized, but the wordlist we're using might not
52
+ # respect that. This accounts for that, but likely reduces performance for
53
+ # words without any wiktionary entry.
54
+ if language == "German" and ipa == "" and word != word.capitalize():
55
+ return get_ipa_for_word(word.capitalize(), language)
44
56
 
45
57
  return Word(word, ipa)
46
58
 
59
+
47
60
  def first_ipa_pronunciation(ipa_str: str) -> str:
48
61
  """Find the first IPA spelling in the given string"""
49
62
  result = re.findall(r"[/\[].*?[/\]]", ipa_str)
grzegorz/generator.py CHANGED
@@ -46,17 +46,21 @@ class MinPairGenerator:
46
46
  lists_of_phonemes.append(line.replace(" ", "").split(","))
47
47
  self.filter_pairs = phoneme_lists_to_phoneme_pairs(lists_of_phonemes)
48
48
 
49
- def generate(self, words: list[Word]) -> list[WordPair]:
49
+ def generate(self, words: list[Word], silent: bool = True) -> list[WordPair]:
50
50
  """
51
51
  Generate minimal pairs from the given parameters
52
52
  """
53
53
  minpairs = []
54
54
 
55
- for i in tqdm(range(0,len(words))):
56
- for j in range(i+1,len(words)):
55
+ progress_bar = tqdm(total=int(len(words) * (len(words) - 1) / 2), disable=silent)
56
+ for i in range(0, len(words)):
57
+ words_after = range(i+1, len(words))
58
+ for j in words_after:
57
59
  pair = (words[i], words[j])
58
60
  if self.check_minpair(pair):
59
61
  minpairs.append(pair)
62
+ progress_bar.update(len(words_after))
63
+ progress_bar.close()
60
64
 
61
65
  return minpairs
62
66
 
grzegorz/subcommands.py CHANGED
@@ -95,7 +95,7 @@ def wordlist_command(language: str, bounds: str, outfile: str) -> int:
95
95
  else:
96
96
  return 1
97
97
 
98
- def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 10) -> None:
98
+ def fetchipa(infile: str, outfile: str, keep_failed: bool, numproc: int = 20) -> None:
99
99
  """
100
100
  Given an input file containing a list of words separated, fetch the IPAs and
101
101
  create a text file with their IPA spellings matched to their text
@@ -148,7 +148,8 @@ def generate_command(infile, outfile, nooptimise, no_phonemes, no_chronemes,
148
148
  if no_stress:
149
149
  print("Generator: syllable stress contrasts will be ignored")
150
150
 
151
- minpairs = g.generate(words)
151
+ print('Generating minimal pairs from:', len(words), 'words')
152
+ minpairs = g.generate(words, False)
152
153
  writefile(outfile, encode_format(encode_minpair, minpairs))
153
154
  print('Done! Generated', len(minpairs), 'minimal pairs')
154
155
 
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: grzegorz
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Minimal pair generator and phonetics tool
5
5
  Home-page: https://github.com/xylous/grzegorz
6
6
  Author: xylous
@@ -16,10 +16,12 @@ Classifier: Topic :: Text Processing :: Linguistic
16
16
  Requires-Python: >=3.10
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: wiktionaryparser
19
+ Requires-Dist: beautifulsoup4
20
20
  Requires-Dist: tqdm
21
21
  Requires-Dist: requests
22
22
  Requires-Dist: genanki
23
+ Requires-Dist: fake-useragent
24
+ Dynamic: license-file
23
25
 
24
26
  # grzegorz
25
27
 
@@ -0,0 +1,16 @@
1
+ grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ grzegorz/__main__.py,sha256=9QRQlG0zL9V42g706-I3pM4Zmddfqgp31NO-TqFQEcQ,6948
3
+ grzegorz/anki_integration.py,sha256=eMFdFNd0NsqLxX23NtlEzinhGMCecEFyoklfFkMqQOk,3933
4
+ grzegorz/fetcher.py,sha256=mDFQybXC_XJ2h9BMqk_rKPSlyp8wSu33PQ-U9MO2cSU,2539
5
+ grzegorz/generator.py,sha256=oCz9TKg9wPN3VIGGa2H8L2Ex4Uf2_gX_XFrlxiB4RSw,9320
6
+ grzegorz/io.py,sha256=JM2pOKgECmnVxCZplgRt1gEiyYWXUn_Z6OanmGSaab0,2221
7
+ grzegorz/subcommands.py,sha256=QQQX1LraTi9Lfo28N1s4G1j-j_z4HtiUsAYsVNyt5FI,6101
8
+ grzegorz/test.py,sha256=znHJFiV0Q1qP0kJYtoweMTNqJH1eX9ZHWFZedOJIuGo,3866
9
+ grzegorz/word.py,sha256=bXNTq_sjrn7CTOWBGkKdQXky_j0c-OzxhhgJWDh0BR0,7899
10
+ grzegorz/wordlist.py,sha256=SqKkZoyY8Ol1vp8Rt0PeNWtxL8ND3qE_yWGl5yiKZ_M,4058
11
+ grzegorz-0.6.2.dist-info/licenses/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
12
+ grzegorz-0.6.2.dist-info/METADATA,sha256=0CHuOOyM4N33IheYIgC-aDiKYsFShaZPgCUAqHZzFjk,4010
13
+ grzegorz-0.6.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
14
+ grzegorz-0.6.2.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
15
+ grzegorz-0.6.2.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
16
+ grzegorz-0.6.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.41.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,16 +0,0 @@
1
- grzegorz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- grzegorz/__main__.py,sha256=jYpB9FB0uuJwrpxBpU-d13vIw1vhNg2kYX4yC_-UWrg,7133
3
- grzegorz/anki_integration.py,sha256=eMFdFNd0NsqLxX23NtlEzinhGMCecEFyoklfFkMqQOk,3933
4
- grzegorz/fetcher.py,sha256=quIpygekLCtXDS5Yx_2eHkJUeDaJDMdNcqI2P5HPxos,1929
5
- grzegorz/generator.py,sha256=IQyqY_dPUHmkaCcWnuN5QCQfvRiCS9B_8oOLuiNs2dM,9096
6
- grzegorz/io.py,sha256=JM2pOKgECmnVxCZplgRt1gEiyYWXUn_Z6OanmGSaab0,2221
7
- grzegorz/subcommands.py,sha256=uO0StbCG081rXPp71DMm--anb2pyA2QHNBCwDEwh6vk,6029
8
- grzegorz/test.py,sha256=znHJFiV0Q1qP0kJYtoweMTNqJH1eX9ZHWFZedOJIuGo,3866
9
- grzegorz/word.py,sha256=bXNTq_sjrn7CTOWBGkKdQXky_j0c-OzxhhgJWDh0BR0,7899
10
- grzegorz/wordlist.py,sha256=SqKkZoyY8Ol1vp8Rt0PeNWtxL8ND3qE_yWGl5yiKZ_M,4058
11
- grzegorz-0.6.0.dist-info/LICENSE,sha256=STF0KkBB_RpcXwp43xCvRIKKe_4V-zrq1lU1OsTgapY,35148
12
- grzegorz-0.6.0.dist-info/METADATA,sha256=IbAHh2s0xdwqEhLehK8SAImnm14mHO3k8nTnNfyOGtg,3960
13
- grzegorz-0.6.0.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
14
- grzegorz-0.6.0.dist-info/entry_points.txt,sha256=rZ-JLt-sbS1rZ5YwodMyf9o80C6sN4AfuSCb0sFNVJ8,52
15
- grzegorz-0.6.0.dist-info/top_level.txt,sha256=W2SodvLxGhkJfWfNhDO0Vh7prBehEXdE9sHWJ1mZXTA,9
16
- grzegorz-0.6.0.dist-info/RECORD,,