wiktionary-de-parser 0.12.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wiktionary_de_parser-0.12.7/CHANGELOG.md +254 -0
- wiktionary_de_parser-0.12.7/LICENSE.txt +21 -0
- wiktionary_de_parser-0.12.7/PKG-INFO +155 -0
- wiktionary_de_parser-0.12.7/README.md +125 -0
- wiktionary_de_parser-0.12.7/pyproject.toml +46 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/__init__.py +98 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/assets/sprachcodes_iso639-1.txt +186 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/config.py +4 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/dump_processor/__init__.py +155 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/models.py +56 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/__init__.py +28 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_flexion.py +86 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_hyphenation.py +84 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_ipa.py +106 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_language.py +50 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_lemma.py +60 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_meanings.py +630 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_pos.py +218 -0
- wiktionary_de_parser-0.12.7/wiktionary_de_parser/parser/parse_rhymes.py +70 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
All notable changes to this project will be documented in this file.
|
|
3
|
+
|
|
4
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
5
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.12.7] - 2024-12-31
|
|
8
|
+
### Changed
|
|
9
|
+
- Update dependencies
|
|
10
|
+
- Improved meanings parsing (experimental)
|
|
11
|
+
|
|
12
|
+
## [0.12.5] - 2024-12-30
|
|
13
|
+
### Changed
|
|
14
|
+
- Update dependencies
|
|
15
|
+
- Improved meanings parsing (experimental)
|
|
16
|
+
|
|
17
|
+
## [0.12.1] - 2024-12-30
|
|
18
|
+
### Added
|
|
19
|
+
- Parse meanings and add a "meanings" field to the output when the `include_meanings` parameter is True in the `parse_entry` call.
|
|
20
|
+
|
|
21
|
+
## [0.12.0] - 2024-07-29
|
|
22
|
+
### Changed
|
|
23
|
+
- Update value for xml property "xsi:schemaLocation" to "http://www.mediawiki.org/xml/export-0.11/"
|
|
24
|
+
|
|
25
|
+
## [0.11.5] - 2024-02-10
|
|
26
|
+
### Removed
|
|
27
|
+
- "wikicode" field from page model (not used)
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
- Performance improvements
|
|
31
|
+
|
|
32
|
+
## [0.11.4] - 2024-02-09
|
|
33
|
+
### Changed
|
|
34
|
+
- Add dict comprehension to improve performance
|
|
35
|
+
|
|
36
|
+
## [0.11.3] - 2024-02-09
|
|
37
|
+
### Changed
|
|
38
|
+
- Add None type to language
|
|
39
|
+
- Small improvements
|
|
40
|
+
|
|
41
|
+
## [0.11.2] - 2024-02-09
|
|
42
|
+
### Changed
|
|
43
|
+
- Rename "syllables" to "hyphenation"
|
|
44
|
+
|
|
45
|
+
## [0.11.1] - 2024-02-05
|
|
46
|
+
### Changed
|
|
47
|
+
- Allow to specify path to dump file in `WiktionaryDump` class
|
|
48
|
+
|
|
49
|
+
## [0.11.0] - 2024-02-04
|
|
50
|
+
### Changed
|
|
51
|
+
- Update dependencies
|
|
52
|
+
- Refactor internally to use `pydantic` models
|
|
53
|
+
|
|
54
|
+
### Removed
|
|
55
|
+
- Remove `Record` class
|
|
56
|
+
- Remove config options
|
|
57
|
+
|
|
58
|
+
### Added
|
|
59
|
+
- Add pydantic models
|
|
60
|
+
- Add WiktionaryParser and WiktionaryDump classes
|
|
61
|
+
|
|
62
|
+
## [0.10.1] - 2024-01-29
|
|
63
|
+
### Changed
|
|
64
|
+
- pass parsed wikitext internally to extraction methods
|
|
65
|
+
- update tests
|
|
66
|
+
|
|
67
|
+
## [0.10.0] - 2024-01-29
|
|
68
|
+
### Changed
|
|
69
|
+
- Update dependencies
|
|
70
|
+
- Use dataclasses instead of dicts internally
|
|
71
|
+
### Added
|
|
72
|
+
- Add "page_id" and "index" field to output (if a page contains multiple entries, the index indicates the position of the word in the page)
|
|
73
|
+
- Add tests for POS and language parsing
|
|
74
|
+
### Removed
|
|
75
|
+
- __BREAKING__: Removed the ability to load custom methods from outside the package. The same can be achieved by setting the "wiki_text" field in the config dict and parsing the Wikitext manually.
|
|
76
|
+
|
|
77
|
+
## [0.9.5] - 2022-07-26
|
|
78
|
+
### Fixed
|
|
79
|
+
- Make sure "title" is of base string type (not `etree._ElementUnicodeResult`)
|
|
80
|
+
|
|
81
|
+
## [0.9.4] - 2022-07-18
|
|
82
|
+
### Changed
|
|
83
|
+
- Improve typing
|
|
84
|
+
|
|
85
|
+
## [0.9.3] - 2022-07-18
|
|
86
|
+
### Fixed
|
|
87
|
+
- Fix type errors
|
|
88
|
+
### Added
|
|
89
|
+
- Add method to parse rhymes
|
|
90
|
+
- Add tests for rhymes parsing
|
|
91
|
+
|
|
92
|
+
## [0.9.2] - 2022-07-17
|
|
93
|
+
### Fixed
|
|
94
|
+
- Improve lemma parsing
|
|
95
|
+
### Added
|
|
96
|
+
- Add tests for lemma parsing
|
|
97
|
+
|
|
98
|
+
## [0.9.1] - 2022-07-15
|
|
99
|
+
### Fixed
|
|
100
|
+
- Make config dict keys optional
|
|
101
|
+
|
|
102
|
+
## [0.9.0] - 2022-07-13
|
|
103
|
+
### Added
|
|
104
|
+
- Add development instructions to README.md
|
|
105
|
+
- Add tests for syllable parsing
|
|
106
|
+
- Add tests for IPA parsing
|
|
107
|
+
- Add VSCode launch.json
|
|
108
|
+
- Add config dict
|
|
109
|
+
- Add config option to optionally include wikitext in output (disabled by default)
|
|
110
|
+
### Changed
|
|
111
|
+
- Update dependencies
|
|
112
|
+
- Replace Autopep with black
|
|
113
|
+
- Ignore inflected forms, regional slang, Austrian/Swiss dialect etc. when parsing IPA-templates from now on
|
|
114
|
+
- `ignored_prefixes` is now part of a config dict
|
|
115
|
+
### Fixed
|
|
116
|
+
- Improve syllable parsing
|
|
117
|
+
- Improve IPA parsing
|
|
118
|
+
### Removed
|
|
119
|
+
- `pyphen` as fallback for syllables parsing
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
## [0.8.9] - 2021-11-13
|
|
123
|
+
### Changed
|
|
124
|
+
- Change repository and package name from `wiktionary_de_parser` to `wiktionary-de-parser`
|
|
125
|
+
|
|
126
|
+
## [0.8.8] - 2021-11-12
|
|
127
|
+
### Changed
|
|
128
|
+
- Make `lemma` and `inflected` fields required fields
|
|
129
|
+
|
|
130
|
+
## [0.8.7] - 2021-11-12
|
|
131
|
+
### Changed
|
|
132
|
+
- Removed typing_extensions again
|
|
133
|
+
|
|
134
|
+
## [0.8.6] - 2021-11-12
|
|
135
|
+
### Added
|
|
136
|
+
- Added typing_extensions
|
|
137
|
+
|
|
138
|
+
## [0.8.5] - 2021-11-12
|
|
139
|
+
### Added
|
|
140
|
+
- More type hints
|
|
141
|
+
|
|
142
|
+
## [0.8.4] - 2021-11-12
|
|
143
|
+
### Added
|
|
144
|
+
- Type hint for iterable (Record)
|
|
145
|
+
### Changed
|
|
146
|
+
- removed None type dict entries in flexion parsing result
|
|
147
|
+
### Fixed
|
|
148
|
+
- minor flexion parsing improvements
|
|
149
|
+
|
|
150
|
+
## [0.8.3] - 2021-11-12
|
|
151
|
+
### Changed
|
|
152
|
+
- Converted repository to [Poetry](https://python-poetry.org/) project
|
|
153
|
+
- Renamed `langCode` to `lang_code`
|
|
154
|
+
### Added
|
|
155
|
+
- Started to implement tests and type hints
|
|
156
|
+
### Fixed
|
|
157
|
+
- Updated regular expression and improved flexion parsing
|
|
158
|
+
|
|
159
|
+
## [0.8.1] - 2020-07-10
|
|
160
|
+
### Fixed
|
|
161
|
+
- improve dash parsing in table values
|
|
162
|
+
|
|
163
|
+
## [0.8.0] - 2019-12-01
|
|
164
|
+
### Fixed
|
|
165
|
+
- `MANIFEST.in` added langcode files
|
|
166
|
+
|
|
167
|
+
## [0.7.9] - 2019-12-01
|
|
168
|
+
### Fixed
|
|
169
|
+
- `syllables.py` improved syllables parsing
|
|
170
|
+
### Added
|
|
171
|
+
- `language.py` added field `langCode` (providing ISO639-1 language code)
|
|
172
|
+
### Changed
|
|
173
|
+
- `language.py` renamed field `language` to `lang`
|
|
174
|
+
- `README.md` updated readme
|
|
175
|
+
|
|
176
|
+
## [0.7.8] - 2019-12-01
|
|
177
|
+
### Fixed
|
|
178
|
+
- `ipa.py` IPA parsing improvement
|
|
179
|
+
|
|
180
|
+
## [0.7.7] - 2019-07-16
|
|
181
|
+
### Fixed
|
|
182
|
+
- `pos.py` added 'Deklinierte Form' as POS (can be Substantiv, Adjektiv, Artikel, Pronomen)
|
|
183
|
+
|
|
184
|
+
## [0.7.6] - 2019-07-13
|
|
185
|
+
### Fixed
|
|
186
|
+
- `ipa.py` Match correct paragraph in WikiText for parsing IPA
|
|
187
|
+
|
|
188
|
+
## [0.7.5] - 2019-07-13
|
|
189
|
+
### Fixed
|
|
190
|
+
- `syllables.py` Improved syllables parsing
|
|
191
|
+
|
|
192
|
+
## [0.7.4] - 2019-07-13
|
|
193
|
+
### Changed
|
|
194
|
+
- `ipa.py` Make IPA field a `list` (support multiple IPA transcriptions for one word)
|
|
195
|
+
|
|
196
|
+
### Fixed
|
|
197
|
+
- `ipa.py` Improved IPA parsing
|
|
198
|
+
|
|
199
|
+
## [0.7.3] - 2019-05-29
|
|
200
|
+
### Fixed
|
|
201
|
+
- `pos.py` Prevent duplicate POS names
|
|
202
|
+
|
|
203
|
+
## [0.7.2] - 2019-05-29
|
|
204
|
+
### Fixed
|
|
205
|
+
- `pos.py` Toponym was a Dict key, when Template 'Deutsch Toponym Übersicht' was present (should be nested noun value)
|
|
206
|
+
|
|
207
|
+
## [0.7.1] - 2019-05-27
|
|
208
|
+
### Added
|
|
209
|
+
- [Python package](https://pypi.org) support
|
|
210
|
+
|
|
211
|
+
### Changed
|
|
212
|
+
- repository structure
|
|
213
|
+
- README.md
|
|
214
|
+
|
|
215
|
+
## [0.6.6] - 2019-04-14
|
|
216
|
+
### Added
|
|
217
|
+
- allow 'Genus 1' - 'Genus 4' in flexion dictionary
|
|
218
|
+
- added `inflected` field to indicate whether entry is for inflected word
|
|
219
|
+
|
|
220
|
+
### Changed
|
|
221
|
+
- put 'Genus' back into the flexion dictionary
|
|
222
|
+
|
|
223
|
+
### Fixed
|
|
224
|
+
- strip values in `lemma.py`, `language.py`, `ipa.py`
|
|
225
|
+
|
|
226
|
+
## [0.6.5] - 2019-04-14
|
|
227
|
+
### Added
|
|
228
|
+
- accept `Vorlage-Test` in regex pattern in `pos.py` & `language.py`
|
|
229
|
+
- accept `Merkspruch` in `pos.py`
|
|
230
|
+
|
|
231
|
+
### Fixed
|
|
232
|
+
- improved regex for section splitting
|
|
233
|
+
- improved regex for POS matching
|
|
234
|
+
- fix missing POS names when there is a POS template
|
|
235
|
+
|
|
236
|
+
### Removed
|
|
237
|
+
- language codes
|
|
238
|
+
|
|
239
|
+
## [0.6.0] - 2019-04-12
|
|
240
|
+
### Added
|
|
241
|
+
- loading custom methods via `custom_methods` argument in class constructor and `load_methods` function
|
|
242
|
+
- Changelog.md (this file)
|
|
243
|
+
|
|
244
|
+
### Changed
|
|
245
|
+
- load all files from `methods` folder and initialize them as extraction methods
|
|
246
|
+
- extraction methods must return a `Dict()` now
|
|
247
|
+
- `flexion.py`: returns 'genus' and flexion info separately
|
|
248
|
+
|
|
249
|
+
### Removed
|
|
250
|
+
- `method_names` in `__init__.py`
|
|
251
|
+
|
|
252
|
+
## [0.5.0] - 2019-04-11
|
|
253
|
+
### Added
|
|
254
|
+
- initial release
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019 Gregor Weichbrodt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: wiktionary-de-parser
|
|
3
|
+
Version: 0.12.7
|
|
4
|
+
Summary: Extracts data from German Wiktionary dump files.
|
|
5
|
+
Home-page: https://github.com/gambolputty/wiktionary-de-parser
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: wiktionary,xml,parser,data-extraction,german,nlp
|
|
8
|
+
Author: Gregor Weichbrodt
|
|
9
|
+
Author-email: gregorweichbrodt@gmail.com
|
|
10
|
+
Requires-Python: >=3.11,<4.0
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Natural Language :: German
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
19
|
+
Requires-Dist: black (>=24.1.1,<25.0.0)
|
|
20
|
+
Requires-Dist: lxml (>=5.1.0,<6.0.0)
|
|
21
|
+
Requires-Dist: mwparserfromhell (>=0.6.6,<0.7.0)
|
|
22
|
+
Requires-Dist: pydantic (>=2.6.0,<3.0.0)
|
|
23
|
+
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
24
|
+
Requires-Dist: tqdm (>=4.66.1,<5.0.0)
|
|
25
|
+
Requires-Dist: wikitextparser (>=0.56.2,<0.57.0)
|
|
26
|
+
Project-URL: Bug Tracker, https://github.com/gambolputty/wiktionary-de-parser/issues
|
|
27
|
+
Project-URL: Repository, https://github.com/gambolputty/wiktionary-de-parser
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# wiktionary-de-parser
|
|
31
|
+
|
|
32
|
+
A Python module to extract data from German Wiktionary XML files (for Python 3.11+).
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- Extracts _IPA transcriptions_, _hyphenation_, _language_, _part of speech_ information (basic), _genus_ and _flexion tables_ of a word.
|
|
37
|
+
- Yields per entry, not per page (a page can have multiple entries, and a word can have different meanings)
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
`pip install wiktionary-de-parser`
|
|
42
|
+
|
|
43
|
+
Or with [Poetry](https://python-poetry.org/):
|
|
44
|
+
|
|
45
|
+
`poetry add wiktionary-de-parser`
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
### Loading the XML dump file
|
|
50
|
+
```python
|
|
51
|
+
from wiktionary_de_parser import WiktionaryParser
|
|
52
|
+
from wiktionary_de_parser.dump_processor import WiktionaryDump
|
|
53
|
+
|
|
54
|
+
# To download the dump file, specify the directory where the
|
|
55
|
+
# dump file should be stored.
|
|
56
|
+
dump = WiktionaryDump(dump_dir_path="directory-of-dump-file")
|
|
57
|
+
|
|
58
|
+
# This will download "dewiktionary-latest-pages-articles-multistream.xml.bz2" to
|
|
59
|
+
# the directory specified in `dump_dir_path`.
|
|
60
|
+
dump.download_dump()
|
|
61
|
+
|
|
62
|
+
# Alternatively you can specify a different dump file to download.
|
|
63
|
+
dump = WiktionaryDump(
|
|
64
|
+
dump_dir_path="directory-of-dump-file",
|
|
65
|
+
dump_download_url="url-to-dump-file.xml.bz2",
|
|
66
|
+
)
|
|
67
|
+
dump.download_dump()
|
|
68
|
+
|
|
69
|
+
# If you already have the dump file locally, specify the path to the file.
|
|
70
|
+
dump = WiktionaryDump(dump_file_path="path-to-dump-file.xml.bz2")
|
|
71
|
+
dump.download_dump()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Parsing the dump file
|
|
75
|
+
```python
|
|
76
|
+
from pprint import pprint
|
|
77
|
+
from wiktionary_de_parser import WiktionaryParser
|
|
78
|
+
|
|
79
|
+
# ... (see above)
|
|
80
|
+
|
|
81
|
+
parser = WiktionaryParser()
|
|
82
|
+
|
|
83
|
+
for page in dump.pages():
|
|
84
|
+
# Skip redirects
|
|
85
|
+
if page.redirect_to:
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
if page.name == "Abend":
|
|
89
|
+
# Parse all entries for "Abend"
|
|
90
|
+
for entry in parser.entries_from_page(page):
|
|
91
|
+
results = parser.parse_entry(entry)
|
|
92
|
+
pprint(results)
|
|
93
|
+
break
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Output
|
|
97
|
+
All page entries for "Abend":
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
ParsedWiktionaryPageEntry(
|
|
101
|
+
name="Abend",
|
|
102
|
+
hyphenation=["Abend"],
|
|
103
|
+
flexion={
|
|
104
|
+
"Genus": "m",
|
|
105
|
+
"Nominativ Singular": "Abend",
|
|
106
|
+
"Nominativ Plural": "Abende",
|
|
107
|
+
"Genitiv Singular": "Abends",
|
|
108
|
+
"Genitiv Plural": "Abende",
|
|
109
|
+
"Dativ Singular": "Abend",
|
|
110
|
+
"Dativ Plural": "Abenden",
|
|
111
|
+
"Akkusativ Singular": "Abend",
|
|
112
|
+
"Akkusativ Plural": "Abende",
|
|
113
|
+
},
|
|
114
|
+
ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
|
|
115
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
116
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
117
|
+
pos={"Substantiv": []},
|
|
118
|
+
rhymes=["aːbn̩t"],
|
|
119
|
+
)
|
|
120
|
+
ParsedWiktionaryPageEntry(
|
|
121
|
+
name="Abend",
|
|
122
|
+
hyphenation=["Abend"],
|
|
123
|
+
flexion=None,
|
|
124
|
+
ipa=["ˈaːbn̩t"],
|
|
125
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
126
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
127
|
+
pos={"Substantiv": ["Nachname"]},
|
|
128
|
+
rhymes=["aːbn̩t"],
|
|
129
|
+
)
|
|
130
|
+
ParsedWiktionaryPageEntry(
|
|
131
|
+
name="Abend",
|
|
132
|
+
hyphenation=["Abend"],
|
|
133
|
+
flexion=None,
|
|
134
|
+
ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
|
|
135
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
136
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
137
|
+
pos={"Substantiv": ["Toponym"]},
|
|
138
|
+
rhymes=["aːbn̩t"],
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Development
|
|
144
|
+
This project uses [Poetry](https://python-poetry.org/).
|
|
145
|
+
|
|
146
|
+
1. Install [Poetry](https://python-poetry.org/).
|
|
147
|
+
2. Clone this repository
|
|
148
|
+
3. Run `poetry install` inside of the project folder to install dependencies.
|
|
149
|
+
4. There is a `notebook.ipynb` to test the parser.
|
|
150
|
+
5. Run `poetry run pytest` to run tests.
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
[MIT](https://github.com/gambolputty/wiktionary-de-parser/blob/master/LICENSE.md) © Gregor Weichbrodt
|
|
155
|
+
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# wiktionary-de-parser
|
|
2
|
+
|
|
3
|
+
A Python module to extract data from German Wiktionary XML files (for Python 3.11+).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Extracts _IPA transcriptions_, _hyphenation_, _language_, _part of speech_ information (basic), _genus_ and _flexion tables_ of a word.
|
|
8
|
+
- Yields per entry, not per page (a page can have multiple entries, and a word can have different meanings)
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
`pip install wiktionary-de-parser`
|
|
13
|
+
|
|
14
|
+
Or with [Poetry](https://python-poetry.org/):
|
|
15
|
+
|
|
16
|
+
`poetry add wiktionary-de-parser`
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
### Loading the XML dump file
|
|
21
|
+
```python
|
|
22
|
+
from wiktionary_de_parser import WiktionaryParser
|
|
23
|
+
from wiktionary_de_parser.dump_processor import WiktionaryDump
|
|
24
|
+
|
|
25
|
+
# To download the dump file, specify the directory where the
|
|
26
|
+
# dump file should be stored.
|
|
27
|
+
dump = WiktionaryDump(dump_dir_path="directory-of-dump-file")
|
|
28
|
+
|
|
29
|
+
# This will download "dewiktionary-latest-pages-articles-multistream.xml.bz2" to
|
|
30
|
+
# the directory specified in `dump_dir_path`.
|
|
31
|
+
dump.download_dump()
|
|
32
|
+
|
|
33
|
+
# Alternatively you can specify a different dump file to download.
|
|
34
|
+
dump = WiktionaryDump(
|
|
35
|
+
dump_dir_path="directory-of-dump-file",
|
|
36
|
+
dump_download_url="url-to-dump-file.xml.bz2",
|
|
37
|
+
)
|
|
38
|
+
dump.download_dump()
|
|
39
|
+
|
|
40
|
+
# If you already have the dump file locally, specify the path to the file.
|
|
41
|
+
dump = WiktionaryDump(dump_file_path="path-to-dump-file.xml.bz2")
|
|
42
|
+
dump.download_dump()
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Parsing the dump file
|
|
46
|
+
```python
|
|
47
|
+
from pprint import pprint
|
|
48
|
+
from wiktionary_de_parser import WiktionaryParser
|
|
49
|
+
|
|
50
|
+
# ... (see above)
|
|
51
|
+
|
|
52
|
+
parser = WiktionaryParser()
|
|
53
|
+
|
|
54
|
+
for page in dump.pages():
|
|
55
|
+
# Skip redirects
|
|
56
|
+
if page.redirect_to:
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
if page.name == "Abend":
|
|
60
|
+
# Parse all entries for "Abend"
|
|
61
|
+
for entry in parser.entries_from_page(page):
|
|
62
|
+
results = parser.parse_entry(entry)
|
|
63
|
+
pprint(results)
|
|
64
|
+
break
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Output
|
|
68
|
+
All page entries for "Abend":
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
ParsedWiktionaryPageEntry(
|
|
72
|
+
name="Abend",
|
|
73
|
+
hyphenation=["Abend"],
|
|
74
|
+
flexion={
|
|
75
|
+
"Genus": "m",
|
|
76
|
+
"Nominativ Singular": "Abend",
|
|
77
|
+
"Nominativ Plural": "Abende",
|
|
78
|
+
"Genitiv Singular": "Abends",
|
|
79
|
+
"Genitiv Plural": "Abende",
|
|
80
|
+
"Dativ Singular": "Abend",
|
|
81
|
+
"Dativ Plural": "Abenden",
|
|
82
|
+
"Akkusativ Singular": "Abend",
|
|
83
|
+
"Akkusativ Plural": "Abende",
|
|
84
|
+
},
|
|
85
|
+
ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
|
|
86
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
87
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
88
|
+
pos={"Substantiv": []},
|
|
89
|
+
rhymes=["aːbn̩t"],
|
|
90
|
+
)
|
|
91
|
+
ParsedWiktionaryPageEntry(
|
|
92
|
+
name="Abend",
|
|
93
|
+
hyphenation=["Abend"],
|
|
94
|
+
flexion=None,
|
|
95
|
+
ipa=["ˈaːbn̩t"],
|
|
96
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
97
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
98
|
+
pos={"Substantiv": ["Nachname"]},
|
|
99
|
+
rhymes=["aːbn̩t"],
|
|
100
|
+
)
|
|
101
|
+
ParsedWiktionaryPageEntry(
|
|
102
|
+
name="Abend",
|
|
103
|
+
hyphenation=["Abend"],
|
|
104
|
+
flexion=None,
|
|
105
|
+
ipa=["ˈaːbn̩t", "ˈaːbm̩t"],
|
|
106
|
+
language=Language(lang="Deutsch", lang_code="de"),
|
|
107
|
+
lemma=Lemma(lemma="Abend", inflected=False),
|
|
108
|
+
pos={"Substantiv": ["Toponym"]},
|
|
109
|
+
rhymes=["aːbn̩t"],
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Development
|
|
115
|
+
This project uses [Poetry](https://python-poetry.org/).
|
|
116
|
+
|
|
117
|
+
1. Install [Poetry](https://python-poetry.org/).
|
|
118
|
+
2. Clone this repository
|
|
119
|
+
3. Run `poetry install` inside of the project folder to install dependencies.
|
|
120
|
+
4. There is a `notebook.ipynb` to test the parser.
|
|
121
|
+
5. Run `poetry run pytest` to run tests.
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
[MIT](https://github.com/gambolputty/wiktionary-de-parser/blob/master/LICENSE.md) © Gregor Weichbrodt
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "wiktionary-de-parser"
|
|
3
|
+
version = "0.12.7"
|
|
4
|
+
description = "Extracts data from German Wiktionary dump files."
|
|
5
|
+
authors = ["Gregor Weichbrodt <gregorweichbrodt@gmail.com>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
homepage = "https://github.com/gambolputty/wiktionary-de-parser"
|
|
9
|
+
repository = "https://github.com/gambolputty/wiktionary-de-parser"
|
|
10
|
+
keywords = ["wiktionary", "xml", "parser", "data-extraction", "german", "nlp"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3.7",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Natural Language :: German",
|
|
16
|
+
"Topic :: Text Processing :: Markup :: XML"
|
|
17
|
+
]
|
|
18
|
+
include = [
|
|
19
|
+
"CHANGELOG.md",
|
|
20
|
+
"LICENSE.txt",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.poetry.urls]
|
|
24
|
+
"Bug Tracker" = "https://github.com/gambolputty/wiktionary-de-parser/issues"
|
|
25
|
+
|
|
26
|
+
[tool.poetry.dependencies]
|
|
27
|
+
python = "^3.11"
|
|
28
|
+
lxml = "^5.1.0"
|
|
29
|
+
mwparserfromhell = "^0.6.6"
|
|
30
|
+
black = "^24.1.1"
|
|
31
|
+
pydantic = "^2.6.0"
|
|
32
|
+
requests = "^2.31.0"
|
|
33
|
+
tqdm = "^4.66.1"
|
|
34
|
+
wikitextparser = "^0.56.2"
|
|
35
|
+
|
|
36
|
+
[tool.poetry.dev-dependencies]
|
|
37
|
+
pytest = "^7.1.2"
|
|
38
|
+
black = "*"
|
|
39
|
+
|
|
40
|
+
[tool.poetry.group.dev.dependencies]
|
|
41
|
+
ipykernel = "^6.29.0"
|
|
42
|
+
ipywidgets = "^8.1.5"
|
|
43
|
+
|
|
44
|
+
[build-system]
|
|
45
|
+
requires = ["poetry-core>=1.0.0"]
|
|
46
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import inspect
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Type
|
|
6
|
+
|
|
7
|
+
from wiktionary_de_parser.models import (
|
|
8
|
+
ParsedWiktionaryPageEntry,
|
|
9
|
+
WiktionaryPage,
|
|
10
|
+
WiktionaryPageEntry,
|
|
11
|
+
)
|
|
12
|
+
from wiktionary_de_parser.parser import Parser
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class WiktionaryParser:
    """
    Splits Wiktionary pages into entries and parses those entries.

    On construction, the ``Parser`` subclasses found in the modules of the
    ``parser`` directory next to this file are collected. ``parse_entry``
    instantiates each of them for an entry and merges their results.
    """

    parser_classes: list[Type[Parser]]

    def __init__(self):
        self.parser_classes = self.find_parser_classes()

    @staticmethod
    def find_parser_classes():
        """
        Collect all ``Parser`` subclasses from the ``parser`` directory.

        Every ``*.py`` file located directly in that directory, except
        ``__init__.py``, is loaded as a module. All classes among the module
        members that are subclasses of ``Parser`` (but not ``Parser``
        itself) are returned as a list.

        Raises ``Exception`` if no import spec or loader can be obtained
        for one of the files.
        """
        parser_dir = Path(__file__).parent / "parser"
        found: list[Type[Parser]] = []

        for child in parser_dir.iterdir():
            if not child.is_file():
                continue
            if not child.name.endswith(".py"):
                continue
            if child.name == "__init__.py":
                continue

            # The file stem (file name without the ".py" ending) is used as
            # the module name.
            spec = importlib.util.spec_from_file_location(child.stem, child)
            if not (spec and spec.loader):
                raise Exception(f"Could not load {child}")

            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

            found.extend(
                member
                for _, member in inspect.getmembers(module, inspect.isclass)
                if issubclass(member, Parser) and member != Parser
            )

        return found

    def entries_from_page(self, page: WiktionaryPage):
        """
        Yield one ``WiktionaryPageEntry`` per word entry found in ``page``.

        A single page can contain several word entries, for example:
        - https://de.wiktionary.org/wiki/instrument

        New entries begin at "==" and "===" (sometimes there is no "==").
        Compare:
        - https://de.wiktionary.org/wiki/instrument
        - https://de.wiktionary.org/wiki/Becken

        Each entry starts at a "=== {{Wortart" heading and ends before the
        next line that begins with "== " or "=== ". The yielded entries carry
        their zero-based position on the page as ``index``. Nothing is
        yielded for a page without wikitext.
        """
        if not page.wikitext:
            return

        entry_pattern = re.compile(
            r"(=== {{Wortart(?:[\w\W](?!^===? ))+)", re.MULTILINE
        )

        for position, chunk in enumerate(entry_pattern.findall(page.wikitext)):
            yield WiktionaryPageEntry(
                page=page,
                index=position,
                wikitext=chunk,
            )

    def parse_entry(
        self,
        wiktionary_entry: WiktionaryPageEntry,
        include_meanings: bool = False,
    ):
        """
        Parse a single entry of a page.

        Every collected parser class is instantiated with the entry and run;
        the result is stored under the parser's ``name``. The parser named
        "meanings" is only run when ``include_meanings`` is true. The page
        name is added under the key "name", and everything is passed as
        keyword arguments to ``ParsedWiktionaryPageEntry``.
        """
        results = {}

        for parser_class in self.parser_classes:
            instance = parser_class(wiktionary_entry)
            if not instance:
                continue
            if include_meanings or instance.name != "meanings":
                # Read the key before running the parser, matching the
                # key-then-value evaluation order of a dict comprehension.
                key = instance.name
                results[key] = instance.run()

        # Add the page name
        results["name"] = wiktionary_entry.page.name

        return ParsedWiktionaryPageEntry(**results)
|