wiktionary-de-parser 0.12.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,254 @@
1
+ # Changelog
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.12.7] - 2024-12-31
8
+ ### Changed
9
+ - Update dependencies
10
+ - Improved meanings parsing (experimental)
11
+
12
+ ## [0.12.5] - 2024-12-30
13
+ ### Changed
14
+ - Update dependencies
15
+ - Improved meanings parsing (experimental)
16
+
17
+ ## [0.12.1] - 2024-12-30
18
+ ### Added
19
+ - Parse meanings and add a "meanings" field to the output when the `include_meanings` parameter is True in the `parse_entry` call.
20
+
21
+ ## [0.12.0] - 2024-07-29
22
+ ### Changed
23
+ - Update value for xml property "xsi:schemaLocation" to "http://www.mediawiki.org/xml/export-0.11/"
24
+
25
+ ## [0.11.5] - 2024-02-10
26
+ ### Removed
27
+ - "wikicode" field from page model (not used)
28
+
29
+ ### Fixed
30
+ - Performance improvements
31
+
32
+ ## [0.11.4] - 2024-02-09
33
+ ### Changed
34
+ - Add dict comprehension to improve performance
35
+
36
+ ## [0.11.3] - 2024-02-09
37
+ ### Changed
38
+ - Add None type to language
39
+ - Small improvements
40
+
41
+ ## [0.11.2] - 2024-02-09
42
+ ### Changed
43
+ - Rename "syllables" to "hyphenation"
44
+
45
+ ## [0.11.1] - 2024-02-05
46
+ ### Changed
47
+ - Allow specifying the path to the dump file in the `WiktionaryDump` class
48
+
49
+ ## [0.11.0] - 2024-02-04
50
+ ### Changed
51
+ - Update dependencies
52
+ - Refactor internally to use `pydantic` models
53
+
54
+ ### Removed
55
+ - Remove `Record` class
56
+ - Remove config options
57
+
58
+ ### Added
59
+ - Add pydantic models
60
+ - Add WiktionaryParser and WiktionaryDump classes
61
+
62
+ ## [0.10.1] - 2024-01-29
63
+ ### Changed
64
+ - pass parsed wikitext internally to extraction methods
65
+ - update tests
66
+
67
+ ## [0.10.0] - 2024-01-29
68
+ ### Changed
69
+ - Update dependencies
70
+ - Use dataclasses instead of dicts internally
71
+ ### Added
72
+ - Add "page_id" and "index" field to output (if a page contains multiple entries, the index indicates the position of the word in the page)
73
+ - Add tests for POS and language parsing
74
+ ### Removed
75
+ - __BREAKING__: Removed the ability to load custom methods from outside the package. The same can be achieved by setting the "wiki_text" field in the config dict and parsing the Wikitext manually.
76
+
77
+ ## [0.9.5] - 2022-07-26
78
+ ### Fixed
79
+ - Make sure "title" is of base string type (not `etree._ElementUnicodeResult`)
80
+
81
+ ## [0.9.4] - 2022-07-18
82
+ ### Changed
83
+ - Improve typing
84
+
85
+ ## [0.9.3] - 2022-07-18
86
+ ### Fixed
87
+ - Fix type errors
88
+ ### Added
89
+ - Add method to parse rhymes
90
+ - Add tests for rhymes parsing
91
+
92
+ ## [0.9.2] - 2022-07-17
93
+ ### Fixed
94
+ - Improve lemma parsing
95
+ ### Added
96
+ - Add tests for lemma parsing
97
+
98
+ ## [0.9.1] - 2022-07-15
99
+ ### Fixed
100
+ - Make config dict keys optional
101
+
102
+ ## [0.9.0] - 2022-07-13
103
+ ### Added
104
+ - Add development instructions to README.md
105
+ - Add tests for syllable parsing
106
+ - Add tests for IPA parsing
107
+ - Add VSCode launch.json
108
+ - Add config dict
109
+ - Add config option to optionally include wikitext in output (disabled by default)
110
+ ### Changed
111
+ - Update dependencies
112
+ - Replace Autopep with black
113
+ - Ignore inflected forms, regional slang, Austrian/Swiss dialect etc. when parsing IPA-templates from now on
114
+ - `ignored_prefixes` is now part of a config dict
115
+ ### Fixed
116
+ - Improve syllable parsing
117
+ - Improve IPA parsing
118
+ ### Removed
119
+ - `pyphen` as fallback for syllables parsing
120
+
121
+
122
+ ## [0.8.9] - 2021-11-13
123
+ ### Changed
124
+ - Change repository and package name from `wiktionary_de_parser` to `wiktionary-de-parser`
125
+
126
+ ## [0.8.8] - 2021-11-12
127
+ ### Changed
128
+ - Make `lemma` and `inflected` fields required fields
129
+
130
+ ## [0.8.7] - 2021-11-12
131
+ ### Changed
132
+ - Removed typing_extensions again
133
+
134
+ ## [0.8.6] - 2021-11-12
135
+ ### Added
136
+ - Added typing_extensions
137
+
138
+ ## [0.8.5] - 2021-11-12
139
+ ### Added
140
+ - More type hints
141
+
142
+ ## [0.8.4] - 2021-11-12
143
+ ### Added
144
+ - Type hint for iterable (Record)
145
+ ### Changed
146
+ - removed None type dict entries in flexion parsing result
147
+ ### Fixed
148
+ - minor flexion parsing improvements
149
+
150
+ ## [0.8.3] - 2021-11-12
151
+ ### Changed
152
+ - Converted repository to [Poetry](https://python-poetry.org/) project
153
+ - Renamed `langCode` to `lang_code`
154
+ ### Added
155
+ - Started to implement tests and type hints
156
+ ### Fixed
157
+ - Updated regular expression and improved flexion parsing
158
+
159
+ ## [0.8.1] - 2020-07-10
160
+ ### Fixed
161
+ - improve dash parsing in table values
162
+
163
+ ## [0.8.0] - 2019-12-01
164
+ ### Fixed
165
+ - `MANIFEST.in` added langcode files
166
+
167
+ ## [0.7.9] - 2019-12-01
168
+ ### Fixed
169
+ - `syllables.py` improved syllables parsing
170
+ ### Added
171
+ - `language.py` added field `langCode` (providing ISO639-1 language code)
172
+ ### Changed
173
+ - `language.py` renamed field `language` to `lang`
174
+ - `README.md` updated readme
175
+
176
+ ## [0.7.8] - 2019-12-01
177
+ ### Fixed
178
+ - `ipa.py` IPA parsing improvement
179
+
180
+ ## [0.7.7] - 2019-07-16
181
+ ### Fixed
182
+ - `pos.py` added 'Deklinierte Form' as POS (can be Substantiv, Adjektiv, Artikel, Pronomen)
183
+
184
+ ## [0.7.6] - 2019-07-13
185
+ ### Fixed
186
+ - `ipa.py` Match correct paragraph in WikiText for parsing IPA
187
+
188
+ ## [0.7.5] - 2019-07-13
189
+ ### Fixed
190
+ - `syllables.py` Improved syllables parsing
191
+
192
+ ## [0.7.4] - 2019-07-13
193
+ ### Changed
194
+ - `ipa.py` Make IPA field a `list` (support multiple IPA transcriptions for one word)
195
+
196
+ ### Fixed
197
+ - `ipa.py` Improved IPA parsing
198
+
199
+ ## [0.7.3] - 2019-05-29
200
+ ### Fixed
201
+ - `pos.py` Prevent duplicate POS names
202
+
203
+ ## [0.7.2] - 2019-05-29
204
+ ### Fixed
205
+ - `pos.py` Toponym was a Dict key, when Template 'Deutsch Toponym Übersicht' was present (should be nested noun value)
206
+
207
+ ## [0.7.1] - 2019-05-27
208
+ ### Added
209
+ - [Python package](https://pypi.org) support
210
+
211
+ ### Changed
212
+ - repository structure
213
+ - README.md
214
+
215
+ ## [0.6.6] - 2019-04-14
216
+ ### Added
217
+ - allow 'Genus 1' - 'Genus 4' in flexion dictionary
218
+ - added `inflected` field to indicate whether entry is for inflected word
219
+
220
+ ### Changed
221
+ - put 'Genus' back to flexion dictionary
222
+
223
+ ### Fixed
224
+ - strip values in `lemma.py`, `language.py`, `ipa.py`
225
+
226
+ ## [0.6.5] - 2019-04-14
227
+ ### Added
228
+ - accept `Vorlage-Test` in regex pattern in `pos.py` & `language.py`
229
+ - accept `Merkspruch` in `pos.py`
230
+
231
+ ### Fixed
232
+ - improved regex for section splitting
233
+ - improved regex for POS matching
234
+ - fix missing POS names when there is a POS template
235
+
236
+ ### Removed
237
+ - language codes
238
+
239
+ ## [0.6.0] - 2019-04-12
240
+ ### Added
241
+ - loading custom methods via `custom_methods` argument in class constructor and `load_methods` function
242
+ - Changelog.md (this file)
243
+
244
+ ### Changed
245
+ - load all files from `methods` folder and initialize them as extraction methods
246
+ - extraction methods must return a `Dict()` now
247
+ - `flexion.py`: returns 'genus' and flexion info separately
248
+
249
+ ### Removed
250
+ - `method_names` in `__init__.py`
251
+
252
+ ## [0.5.0] - 2019-04-11
253
+ ### Added
254
+ - initial release
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019 Gregor Weichbrodt
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,155 @@
1
+ Metadata-Version: 2.1
2
+ Name: wiktionary-de-parser
3
+ Version: 0.12.7
4
+ Summary: Extracts data from German Wiktionary dump files.
5
+ Home-page: https://github.com/gambolputty/wiktionary-de-parser
6
+ License: MIT
7
+ Keywords: wiktionary,xml,parser,data-extraction,german,nlp
8
+ Author: Gregor Weichbrodt
9
+ Author-email: gregorweichbrodt@gmail.com
10
+ Requires-Python: >=3.11,<4.0
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Natural Language :: German
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.7
18
+ Classifier: Topic :: Text Processing :: Markup :: XML
19
+ Requires-Dist: black (>=24.1.1,<25.0.0)
20
+ Requires-Dist: lxml (>=5.1.0,<6.0.0)
21
+ Requires-Dist: mwparserfromhell (>=0.6.6,<0.7.0)
22
+ Requires-Dist: pydantic (>=2.6.0,<3.0.0)
23
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
24
+ Requires-Dist: tqdm (>=4.66.1,<5.0.0)
25
+ Requires-Dist: wikitextparser (>=0.56.2,<0.57.0)
26
+ Project-URL: Bug Tracker, https://github.com/gambolputty/wiktionary-de-parser/issues
27
+ Project-URL: Repository, https://github.com/gambolputty/wiktionary-de-parser
28
+ Description-Content-Type: text/markdown
29
+
30
+ # wiktionary-de-parser
31
+
32
+ A Python module to extract data from German Wiktionary XML files (for Python 3.11+).
33
+
34
+ ## Features
35
+
36
+ - Extracts _IPA transcriptions_, _hyphenation_, _language_, _part of speech_ information (basic), _genus_ and _flexion tables_ of a word.
37
+ - Yields per entry, not per page (a page can have multiple entries/ words can have different meanings)
38
+
39
+ ## Installation
40
+
41
+ `pip install wiktionary-de-parser`
42
+
43
+ Or with [Poetry](https://python-poetry.org/):
44
+
45
+ `poetry add wiktionary-de-parser`
46
+
47
+ ## Usage
48
+
49
+ ### Loading the XML dump file
50
+ ```python
51
+ from wiktionary_de_parser import WiktionaryParser
52
+ from wiktionary_de_parser.dump_processor import WiktionaryDump
53
+
54
+ # To download the dump file, specify the directory where the
55
+ # dump file should be stored.
56
+ dump = WiktionaryDump(dump_dir_path="directory-of-dump-file")
57
+
58
+ # This will download "dewiktionary-latest-pages-articles-multistream.xml.bz2" to
59
+ # the directory specified in `dump_dir_path`.
60
+ dump.download_dump()
61
+
62
+ # Alternatively you can specify a different dump file to download.
63
+ dump = WiktionaryDump(
64
+ dump_dir_path="directory-of-dump-file",
65
+ dump_download_url="url-to-dump-file.xml.bz2",
66
+ )
67
+ dump.download_dump()
68
+
69
+ # If you already have the dump file locally, specify the path to the file.
70
+ dump = WiktionaryDump(dump_file_path="path-to-dump-file.xml.bz2")
71
+ dump.download_dump()
72
+ ```
73
+
74
+ ### Parsing the dump file
75
+ ```python
76
+ from pprint import pprint
77
+ from wiktionary_de_parser import WiktionaryParser
78
+
79
+ # ... (see above)
80
+
81
+ parser = WiktionaryParser()
82
+
83
+ for page in dump.pages():
84
+ # Skip redirects
85
+ if page.redirect_to:
86
+ continue
87
+
88
+ if page.name == "Abend":
89
+ # Parse all entries for "Abend"
90
+ for entry in parser.entries_from_page(page):
91
+ results = parser.parse_entry(entry)
92
+ pprint(results)
93
+ break
94
+ ```
95
+
96
+ ## Output
97
+ All page entries for "Abend":
98
+
99
+ ```python
100
+ ParsedWiktionaryPageEntry(
101
+ name="Abend",
102
+ hyphenation=["Abend"],
103
+ flexion={
104
+ "Genus": "m",
105
+ "Nominativ Singular": "Abend",
106
+ "Nominativ Plural": "Abende",
107
+ "Genitiv Singular": "Abends",
108
+ "Genitiv Plural": "Abende",
109
+ "Dativ Singular": "Abend",
110
+ "Dativ Plural": "Abenden",
111
+ "Akkusativ Singular": "Abend",
112
+ "Akkusativ Plural": "Abende",
113
+ },
114
+ ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
115
+ language=Language(lang="Deutsch", lang_code="de"),
116
+ lemma=Lemma(lemma="Abend", inflected=False),
117
+ pos={"Substantiv": []},
118
+ rhymes=["aːbn̩t"],
119
+ )
120
+ ParsedWiktionaryPageEntry(
121
+ name="Abend",
122
+ hyphenation=["Abend"],
123
+ flexion=None,
124
+ ipa=["ˈaːbn̩t"],
125
+ language=Language(lang="Deutsch", lang_code="de"),
126
+ lemma=Lemma(lemma="Abend", inflected=False),
127
+ pos={"Substantiv": ["Nachname"]},
128
+ rhymes=["aːbn̩t"],
129
+ )
130
+ ParsedWiktionaryPageEntry(
131
+ name="Abend",
132
+ hyphenation=["Abend"],
133
+ flexion=None,
134
+ ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
135
+ language=Language(lang="Deutsch", lang_code="de"),
136
+ lemma=Lemma(lemma="Abend", inflected=False),
137
+ pos={"Substantiv": ["Toponym"]},
138
+ rhymes=["aːbn̩t"],
139
+ )
140
+
141
+ ```
142
+
143
+ ## Development
144
+ This project uses [Poetry](https://python-poetry.org/).
145
+
146
+ 1. Install [Poetry](https://python-poetry.org/).
147
+ 2. Clone this repository
148
+ 3. Run `poetry install` inside of the project folder to install dependencies.
149
+ 4. There is a `notebook.ipynb` to test the parser.
150
+ 5. Run `poetry run pytest` to run tests.
151
+
152
+ ## License
153
+
154
+ [MIT](https://github.com/gambolputty/wiktionary-de-parser/blob/master/LICENSE.md) © Gregor Weichbrodt
155
+
@@ -0,0 +1,125 @@
1
+ # wiktionary-de-parser
2
+
3
+ A Python module to extract data from German Wiktionary XML files (for Python 3.11+).
4
+
5
+ ## Features
6
+
7
+ - Extracts _IPA transcriptions_, _hyphenation_, _language_, _part of speech_ information (basic), _genus_ and _flexion tables_ of a word.
8
+ - Yields per entry, not per page (a page can have multiple entries/ words can have different meanings)
9
+
10
+ ## Installation
11
+
12
+ `pip install wiktionary-de-parser`
13
+
14
+ Or with [Poetry](https://python-poetry.org/):
15
+
16
+ `poetry add wiktionary-de-parser`
17
+
18
+ ## Usage
19
+
20
+ ### Loading the XML dump file
21
+ ```python
22
+ from wiktionary_de_parser import WiktionaryParser
23
+ from wiktionary_de_parser.dump_processor import WiktionaryDump
24
+
25
+ # To download the dump file, specify the directory where the
26
+ # dump file should be stored.
27
+ dump = WiktionaryDump(dump_dir_path="directory-of-dump-file")
28
+
29
+ # This will download "dewiktionary-latest-pages-articles-multistream.xml.bz2" to
30
+ # the directory specified in `dump_dir_path`.
31
+ dump.download_dump()
32
+
33
+ # Alternatively you can specify a different dump file to download.
34
+ dump = WiktionaryDump(
35
+ dump_dir_path="directory-of-dump-file",
36
+ dump_download_url="url-to-dump-file.xml.bz2",
37
+ )
38
+ dump.download_dump()
39
+
40
+ # If you already have the dump file locally, specify the path to the file.
41
+ dump = WiktionaryDump(dump_file_path="path-to-dump-file.xml.bz2")
42
+ dump.download_dump()
43
+ ```
44
+
45
+ ### Parsing the dump file
46
+ ```python
47
+ from pprint import pprint
48
+ from wiktionary_de_parser import WiktionaryParser
49
+
50
+ # ... (see above)
51
+
52
+ parser = WiktionaryParser()
53
+
54
+ for page in dump.pages():
55
+ # Skip redirects
56
+ if page.redirect_to:
57
+ continue
58
+
59
+ if page.name == "Abend":
60
+ # Parse all entries for "Abend"
61
+ for entry in parser.entries_from_page(page):
62
+ results = parser.parse_entry(entry)
63
+ pprint(results)
64
+ break
65
+ ```
66
+
67
+ ## Output
68
+ All page entries for "Abend":
69
+
70
+ ```python
71
+ ParsedWiktionaryPageEntry(
72
+ name="Abend",
73
+ hyphenation=["Abend"],
74
+ flexion={
75
+ "Genus": "m",
76
+ "Nominativ Singular": "Abend",
77
+ "Nominativ Plural": "Abende",
78
+ "Genitiv Singular": "Abends",
79
+ "Genitiv Plural": "Abende",
80
+ "Dativ Singular": "Abend",
81
+ "Dativ Plural": "Abenden",
82
+ "Akkusativ Singular": "Abend",
83
+ "Akkusativ Plural": "Abende",
84
+ },
85
+ ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
86
+ language=Language(lang="Deutsch", lang_code="de"),
87
+ lemma=Lemma(lemma="Abend", inflected=False),
88
+ pos={"Substantiv": []},
89
+ rhymes=["aːbn̩t"],
90
+ )
91
+ ParsedWiktionaryPageEntry(
92
+ name="Abend",
93
+ hyphenation=["Abend"],
94
+ flexion=None,
95
+ ipa=["ˈaːbn̩t"],
96
+ language=Language(lang="Deutsch", lang_code="de"),
97
+ lemma=Lemma(lemma="Abend", inflected=False),
98
+ pos={"Substantiv": ["Nachname"]},
99
+ rhymes=["aːbn̩t"],
100
+ )
101
+ ParsedWiktionaryPageEntry(
102
+ name="Abend",
103
+ hyphenation=["Abend"],
104
+ flexion=None,
105
+ ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
106
+ language=Language(lang="Deutsch", lang_code="de"),
107
+ lemma=Lemma(lemma="Abend", inflected=False),
108
+ pos={"Substantiv": ["Toponym"]},
109
+ rhymes=["aːbn̩t"],
110
+ )
111
+
112
+ ```
113
+
114
+ ## Development
115
+ This project uses [Poetry](https://python-poetry.org/).
116
+
117
+ 1. Install [Poetry](https://python-poetry.org/).
118
+ 2. Clone this repository
119
+ 3. Run `poetry install` inside of the project folder to install dependencies.
120
+ 4. There is a `notebook.ipynb` to test the parser.
121
+ 5. Run `poetry run pytest` to run tests.
122
+
123
+ ## License
124
+
125
+ [MIT](https://github.com/gambolputty/wiktionary-de-parser/blob/master/LICENSE.md) © Gregor Weichbrodt
@@ -0,0 +1,46 @@
1
+ [tool.poetry]
2
+ name = "wiktionary-de-parser"
3
+ version = "0.12.7"
4
+ description = "Extracts data from German Wiktionary dump files."
5
+ authors = ["Gregor Weichbrodt <gregorweichbrodt@gmail.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/gambolputty/wiktionary-de-parser"
9
+ repository = "https://github.com/gambolputty/wiktionary-de-parser"
10
+ keywords = ["wiktionary", "xml", "parser", "data-extraction", "german", "nlp"]
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3.7",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Natural Language :: German",
16
+ "Topic :: Text Processing :: Markup :: XML"
17
+ ]
18
+ include = [
19
+ "CHANGELOG.md",
20
+ "LICENSE.txt",
21
+ ]
22
+
23
+ [tool.poetry.urls]
24
+ "Bug Tracker" = "https://github.com/gambolputty/wiktionary-de-parser/issues"
25
+
26
+ [tool.poetry.dependencies]
27
+ python = "^3.11"
28
+ lxml = "^5.1.0"
29
+ mwparserfromhell = "^0.6.6"
30
+ black = "^24.1.1"
31
+ pydantic = "^2.6.0"
32
+ requests = "^2.31.0"
33
+ tqdm = "^4.66.1"
34
+ wikitextparser = "^0.56.2"
35
+
36
+ [tool.poetry.dev-dependencies]
37
+ pytest = "^7.1.2"
38
+ black = "*"
39
+
40
+ [tool.poetry.group.dev.dependencies]
41
+ ipykernel = "^6.29.0"
42
+ ipywidgets = "^8.1.5"
43
+
44
+ [build-system]
45
+ requires = ["poetry-core>=1.0.0"]
46
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,98 @@
1
+ import importlib.util
2
+ import inspect
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Type
6
+
7
+ from wiktionary_de_parser.models import (
8
+ ParsedWiktionaryPageEntry,
9
+ WiktionaryPage,
10
+ WiktionaryPageEntry,
11
+ )
12
+ from wiktionary_de_parser.parser import Parser
13
+
14
+
15
class WiktionaryParser:
    """Split Wiktionary pages into entries and run every parser on them.

    On construction, the parser classes found in the ``parser`` directory
    next to this file are collected; ``parse_entry`` later instantiates
    each of them for a given entry.
    """

    # Parser subclasses discovered by ``find_parser_classes``.
    parser_classes: list[Type[Parser]]

    def __init__(self):
        self.parser_classes = self.find_parser_classes()

    @staticmethod
    def find_parser_classes():
        """Collect all ``Parser`` subclasses defined in the ``parser`` directory.

        Every ``*.py`` file (except ``__init__.py``) located directly in the
        ``parser`` directory beside this file is loaded as a module, and
        each class in it that subclasses ``Parser`` (but is not ``Parser``
        itself) is collected.

        Returns:
            A list of the discovered ``Parser`` subclasses.

        Raises:
            Exception: If no import spec or loader can be created for a file.
        """
        parser_dir = Path(__file__).parent / "parser"
        found: list[Type[Parser]] = []

        for candidate in parser_dir.iterdir():
            # Only plain Python source files are considered.
            if not candidate.is_file():
                continue
            if not candidate.name.endswith(".py"):
                continue
            if candidate.name == "__init__.py":
                continue

            # The module name is the file name without the ".py" suffix.
            spec = importlib.util.spec_from_file_location(candidate.stem, candidate)
            if not spec or not spec.loader:
                raise Exception(f"Could not load {candidate}")

            loaded = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(loaded)

            found.extend(
                member
                for _, member in inspect.getmembers(loaded)
                if inspect.isclass(member)
                and issubclass(member, Parser)
                and (member != Parser)
            )

        return found

    def entries_from_page(self, page: WiktionaryPage):
        """Yield one ``WiktionaryPageEntry`` per word entry found on ``page``.

        A single page can hold several word entries, for example:
        - https://de.wiktionary.org/wiki/instrument
        - https://de.wiktionary.org/wiki/Becken

        An entry starts at ``=== {{Wortart`` and extends up to the next line
        that begins with ``== `` or ``=== `` (some pages have no ``==``
        heading at all). Nothing is yielded when the page has no wikitext.

        Args:
            page: The page whose wikitext is split into entries.

        Yields:
            ``WiktionaryPageEntry`` objects carrying the page, the entry's
            zero-based position on the page, and the entry's wikitext.
        """
        if not page.wikitext:
            return

        entry_pattern = r"(=== {{Wortart(?:[\w\W](?!^===? ))+)"
        matches: list[str] = re.findall(entry_pattern, page.wikitext, re.MULTILINE)

        for position, entry_text in enumerate(matches):
            yield WiktionaryPageEntry(
                page=page,
                index=position,
                wikitext=entry_text,
            )

    def parse_entry(
        self,
        wiktionary_entry: WiktionaryPageEntry,
        include_meanings: bool = False,
    ):
        """Run all discovered parsers on a single page entry.

        Args:
            wiktionary_entry: The entry to parse.
            include_meanings: When False (the default), the parser whose
                ``name`` is ``"meanings"`` is left out of the result.

        Returns:
            A ``ParsedWiktionaryPageEntry`` built from each parser's result,
            keyed by the parser's ``name``, plus the page name under ``name``.
        """
        results = {}

        for parser_class in self.parser_classes:
            instance = parser_class(wiktionary_entry)
            if not instance:
                continue
            # The meanings parser only runs when explicitly requested.
            if not include_meanings and instance.name == "meanings":
                continue
            field = instance.name
            results[field] = instance.run()

        # Add the page name
        results["name"] = wiktionary_entry.page.name

        return ParsedWiktionaryPageEntry(**results)