wikitextparser 0.56.4__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikitextparser-1.0.0/PKG-INFO +413 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/README.rst +20 -1
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/pyproject.toml +24 -16
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/__init__.py +1 -1
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_template.py +1 -1
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_wikitext.py +8 -8
- {wikitextparser-0.56.4 → wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info}/PKG-INFO +396 -394
- wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info/SOURCES.txt +23 -0
- wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info/dependency_links.txt +1 -0
- wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info/requires.txt +10 -0
- wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info/top_level.txt +1 -0
- wikitextparser-1.0.0/wikitextparser/wikitextparser.egg-info/zip-safe +1 -0
- wikitextparser-0.56.4/.coveragerc +0 -4
- wikitextparser-0.56.4/.github/workflows/tests.yml +0 -42
- wikitextparser-0.56.4/.gitignore +0 -1
- wikitextparser-0.56.4/.readthedocs.yaml +0 -25
- wikitextparser-0.56.4/.vscode/settings.json +0 -7
- wikitextparser-0.56.4/CHANGELOG.rst +0 -558
- wikitextparser-0.56.4/LICENSE.md +0 -674
- wikitextparser-0.56.4/docs/CHANGELOG.rst +0 -3
- wikitextparser-0.56.4/docs/Makefile +0 -20
- wikitextparser-0.56.4/docs/README.rst +0 -5
- wikitextparser-0.56.4/docs/conf.py +0 -189
- wikitextparser-0.56.4/docs/index.rst +0 -152
- wikitextparser-0.56.4/docs/make.bat +0 -36
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_argument.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_cell.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_comment_bold_italic.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_config.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_externallink.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_parameter.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_parser_function.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_section.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_spans.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_table.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_tag.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_wikilink.py +0 -0
- {wikitextparser-0.56.4 → wikitextparser-1.0.0}/wikitextparser/_wikilist.py +0 -0
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: wikitextparser
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A simple parsing tool for MediaWiki's wikitext markup.
|
|
5
|
+
Keywords: MediaWiki,wikitext,parser
|
|
6
|
+
Author: 5j9
|
|
7
|
+
Author-email: 5j9 <5j9@users.noreply.github.com>
|
|
8
|
+
License: GNU General Public License v3 (GPLv3)
|
|
9
|
+
Classifier: Programming Language :: Python
|
|
10
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
11
|
+
Classifier: Topic :: Text Processing
|
|
12
|
+
Requires-Dist: regex>=2022.9.11
|
|
13
|
+
Requires-Dist: wcwidth
|
|
14
|
+
Requires-Dist: coverage ; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest ; extra == 'tests'
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Project-URL: Homepage, https://github.com/5j9/wikitextparser
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Provides-Extra: tests
|
|
20
|
+
Description-Content-Type: text/x-rst
|
|
21
|
+
|
|
22
|
+
.. image:: https://github.com/5j9/wikitextparser/actions/workflows/tests.yml/badge.svg
|
|
23
|
+
:target: https://github.com/5j9/wikitextparser/actions/workflows/tests.yml
|
|
24
|
+
.. image:: https://codecov.io/github/5j9/wikitextparser/coverage.svg?branch=master
|
|
25
|
+
:target: https://codecov.io/github/5j9/wikitextparser
|
|
26
|
+
.. image:: https://readthedocs.org/projects/wikitextparser/badge/?version=latest
|
|
27
|
+
:target: http://wikitextparser.readthedocs.io/en/latest/?badge=latest
|
|
28
|
+
|
|
29
|
+
==============
|
|
30
|
+
WikiTextParser
|
|
31
|
+
==============
|
|
32
|
+
.. Quick Start Guide
|
|
33
|
+
|
|
34
|
+
A simple to use WikiText parsing library for `MediaWiki <https://www.mediawiki.org/wiki/MediaWiki>`_.
|
|
35
|
+
|
|
36
|
+
The purpose is to allow users to easily extract and/or manipulate templates, template parameters, parser functions, tables, external links, wikilinks, lists, etc. found in wikitexts.
|
|
37
|
+
|
|
38
|
+
.. contents:: Table of Contents
|
|
39
|
+
|
|
40
|
+
Installation
|
|
41
|
+
============
|
|
42
|
+
|
|
43
|
+
- Python 3.8+ is required
|
|
44
|
+
- ``pip install wikitextparser``
|
|
45
|
+
|
|
46
|
+
Usage
|
|
47
|
+
=====
|
|
48
|
+
|
|
49
|
+
.. code:: python
|
|
50
|
+
|
|
51
|
+
>>> import wikitextparser as wtp
|
|
52
|
+
|
|
53
|
+
WikiTextParser can detect sections, parser functions, templates, wiki links, external links, arguments, tables, wiki lists, and comments in your wikitext. The following sections are a quick overview of some of these functionalities.
|
|
54
|
+
|
|
55
|
+
You may also want to have a look at the test modules for more examples and probable pitfalls (expected failures).
|
|
56
|
+
|
|
57
|
+
Templates
|
|
58
|
+
---------
|
|
59
|
+
|
|
60
|
+
.. code:: python
|
|
61
|
+
|
|
62
|
+
>>> parsed = wtp.parse("{{text|value1{{text|value2}}}}")
|
|
63
|
+
>>> parsed.templates
|
|
64
|
+
[Template('{{text|value1{{text|value2}}}}'), Template('{{text|value2}}')]
|
|
65
|
+
>>> parsed.templates[0].arguments
|
|
66
|
+
[Argument("|value1{{text|value2}}")]
|
|
67
|
+
>>> parsed.templates[0].arguments[0].value = 'value3'
|
|
68
|
+
>>> print(parsed)
|
|
69
|
+
{{text|value3}}
|
|
70
|
+
|
|
71
|
+
The ``pformat`` method returns a pretty-print formatted string for templates:
|
|
72
|
+
|
|
73
|
+
.. code:: python
|
|
74
|
+
|
|
75
|
+
>>> parsed = wtp.parse('{{t1 |b=b|c=c| d={{t2|e=e|f=f}} }}')
|
|
76
|
+
>>> t1, t2 = parsed.templates
|
|
77
|
+
>>> print(t2.pformat())
|
|
78
|
+
{{t2
|
|
79
|
+
| e = e
|
|
80
|
+
| f = f
|
|
81
|
+
}}
|
|
82
|
+
>>> print(t1.pformat())
|
|
83
|
+
{{t1
|
|
84
|
+
| b = b
|
|
85
|
+
| c = c
|
|
86
|
+
| d = {{t2
|
|
87
|
+
| e = e
|
|
88
|
+
| f = f
|
|
89
|
+
}}
|
|
90
|
+
}}
|
|
91
|
+
|
|
92
|
+
``Template.rm_dup_args_safe`` and ``Template.rm_first_of_dup_args`` methods can be used to clean-up `pages using duplicate arguments in template calls <https://en.wikipedia.org/wiki/Category:Pages_using_duplicate_arguments_in_template_calls>`_:
|
|
93
|
+
|
|
94
|
+
.. code:: python
|
|
95
|
+
|
|
96
|
+
>>> t = wtp.Template('{{t|a=a|a=b|a=a}}')
|
|
97
|
+
>>> t.rm_dup_args_safe()
|
|
98
|
+
>>> t
|
|
99
|
+
Template('{{t|a=b|a=a}}')
|
|
100
|
+
>>> t = wtp.Template('{{t|a=a|a=b|a=a}}')
|
|
101
|
+
>>> t.rm_first_of_dup_args()
|
|
102
|
+
>>> t
|
|
103
|
+
Template('{{t|a=a}}')
|
|
104
|
+
|
|
105
|
+
Template parameters:
|
|
106
|
+
|
|
107
|
+
.. code:: python
|
|
108
|
+
|
|
109
|
+
>>> param = wtp.parse('{{{a|b}}}').parameters[0]
|
|
110
|
+
>>> param.name
|
|
111
|
+
'a'
|
|
112
|
+
>>> param.default
|
|
113
|
+
'b'
|
|
114
|
+
>>> param.default = 'c'
|
|
115
|
+
>>> param
|
|
116
|
+
Parameter('{{{a|c}}}')
|
|
117
|
+
>>> param.append_default('d')
|
|
118
|
+
>>> param
|
|
119
|
+
Parameter('{{{a|{{{d|c}}}}}}')
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
WikiLinks
|
|
123
|
+
---------
|
|
124
|
+
|
|
125
|
+
.. code:: python
|
|
126
|
+
|
|
127
|
+
>>> wl = wtp.parse('... [[title#fragmet|text]] ...').wikilinks[0]
|
|
128
|
+
>>> wl.title = 'new_title'
|
|
129
|
+
>>> wl.fragment = 'new_fragmet'
|
|
130
|
+
>>> wl.text = 'X'
|
|
131
|
+
>>> wl
|
|
132
|
+
WikiLink('[[new_title#new_fragmet|X]]')
|
|
133
|
+
>>> del wl.text
|
|
134
|
+
>>> wl
|
|
135
|
+
WikiLink('[[new_title#new_fragmet]]')
|
|
136
|
+
|
|
137
|
+
All WikiLink properties support get, set, and delete operations. Categories are special cases of WikiLinks, in that they are prefixed with the category namespace, which is case insensitive and may be internationalized:
|
|
138
|
+
|
|
139
|
+
.. code:: python
|
|
140
|
+
|
|
141
|
+
>>> parsed = wtp.parse("""
|
|
142
|
+
[[Category:Foo]]
|
|
143
|
+
[[Κατηγορία:Bar]]
|
|
144
|
+
[[Other link]]
|
|
145
|
+
""")
|
|
146
|
+
>>> categories = [
|
|
147
|
+
wl
|
|
148
|
+
for wl
|
|
149
|
+
in parsed.wikilinks
|
|
150
|
+
if wl.title.partition(':')[0]
|
|
151
|
+
.strip()
|
|
152
|
+
.lower()
|
|
153
|
+
in ["category", "κατηγορία"]
|
|
154
|
+
]
|
|
155
|
+
>>> categories
|
|
156
|
+
[WikiLink('[[Category:Foo]]'), WikiLink('[[Κατηγορία:Bar]]')]
|
|
157
|
+
|
|
158
|
+
Sections
|
|
159
|
+
--------
|
|
160
|
+
|
|
161
|
+
.. code:: python
|
|
162
|
+
|
|
163
|
+
>>> parsed = wtp.parse("""
|
|
164
|
+
... == h2 ==
|
|
165
|
+
... t2
|
|
166
|
+
... === h3 ===
|
|
167
|
+
... t3
|
|
168
|
+
... === h3 ===
|
|
169
|
+
... t3
|
|
170
|
+
... == h22 ==
|
|
171
|
+
... t22
|
|
172
|
+
... {{text|value3}}
|
|
173
|
+
... [[Z|X]]
|
|
174
|
+
... """)
|
|
175
|
+
>>> parsed.sections
|
|
176
|
+
[Section('\n'),
|
|
177
|
+
Section('== h2 ==\nt2\n=== h3 ===\nt3\n=== h3 ===\nt3\n'),
|
|
178
|
+
Section('=== h3 ===\nt3\n'),
|
|
179
|
+
Section('=== h3 ===\nt3\n'),
|
|
180
|
+
Section('== h22 ==\nt22\n{{text|value3}}\n[[Z|X]]\n')]
|
|
181
|
+
>>> parsed.sections[1].title = 'newtitle'
|
|
182
|
+
>>> print(parsed)
|
|
183
|
+
|
|
184
|
+
==newtitle==
|
|
185
|
+
t2
|
|
186
|
+
=== h3 ===
|
|
187
|
+
t3
|
|
188
|
+
=== h3 ===
|
|
189
|
+
t3
|
|
190
|
+
== h22 ==
|
|
191
|
+
t22
|
|
192
|
+
{{text|value3}}
|
|
193
|
+
[[Z|X]]
|
|
194
|
+
>>> del parsed.sections[1].title
|
|
195
|
+
>>> print(parsed)
|
|
196
|
+
|
|
197
|
+
t2
|
|
198
|
+
=== h3 ===
|
|
199
|
+
t3
|
|
200
|
+
=== h3 ===
|
|
201
|
+
t3
|
|
202
|
+
== h22 ==
|
|
203
|
+
t22
|
|
204
|
+
{{text|value3}}
|
|
205
|
+
[[Z|X]]
|
|
206
|
+
|
|
207
|
+
Tables
|
|
208
|
+
------
|
|
209
|
+
|
|
210
|
+
Extracting cell values of a table:
|
|
211
|
+
|
|
212
|
+
.. code:: python
|
|
213
|
+
|
|
214
|
+
>>> p = wtp.parse("""{|
|
|
215
|
+
... | Orange || Apple || more
|
|
216
|
+
... |-
|
|
217
|
+
... | Bread || Pie || more
|
|
218
|
+
... |-
|
|
219
|
+
... | Butter || Ice cream || and more
|
|
220
|
+
... |}""")
|
|
221
|
+
>>> p.tables[0].data()
|
|
222
|
+
[['Orange', 'Apple', 'more'],
|
|
223
|
+
['Bread', 'Pie', 'more'],
|
|
224
|
+
['Butter', 'Ice cream', 'and more']]
|
|
225
|
+
|
|
226
|
+
By default, values are arranged according to ``colspan`` and ``rowspan`` attributes:
|
|
227
|
+
|
|
228
|
+
.. code:: python
|
|
229
|
+
|
|
230
|
+
>>> t = wtp.Table("""{| class="wikitable sortable"
|
|
231
|
+
... |-
|
|
232
|
+
... ! a !! b !! c
|
|
233
|
+
... |-
|
|
234
|
+
... !colspan = "2" | d || e
|
|
235
|
+
... |-
|
|
236
|
+
... |}""")
|
|
237
|
+
>>> t.data()
|
|
238
|
+
[['a', 'b', 'c'], ['d', 'd', 'e']]
|
|
239
|
+
>>> t.data(span=False)
|
|
240
|
+
[['a', 'b', 'c'], ['d', 'e']]
|
|
241
|
+
|
|
242
|
+
Calling the ``cells`` method of a ``Table`` returns table cells as ``Cell`` objects. Cell objects provide methods for getting or setting each cell's attributes or values individually:
|
|
243
|
+
|
|
244
|
+
.. code:: python
|
|
245
|
+
|
|
246
|
+
>>> cell = t.cells(row=1, column=1)
|
|
247
|
+
>>> cell.attrs
|
|
248
|
+
{'colspan': '2'}
|
|
249
|
+
>>> cell.set('colspan', '3')
|
|
250
|
+
>>> print(t)
|
|
251
|
+
{| class="wikitable sortable"
|
|
252
|
+
|-
|
|
253
|
+
! a !! b !! c
|
|
254
|
+
|-
|
|
255
|
+
!colspan = "3" | d || e
|
|
256
|
+
|-
|
|
257
|
+
|}
|
|
258
|
+
|
|
259
|
+
HTML attributes of Table, Cell, and Tag objects are accessible via
|
|
260
|
+
``get_attr``, ``set_attr``, ``has_attr``, and ``del_attr`` methods.
|
|
261
|
+
|
|
262
|
+
Lists
|
|
263
|
+
-----
|
|
264
|
+
|
|
265
|
+
The ``get_lists`` method provides access to lists within the wikitext.
|
|
266
|
+
|
|
267
|
+
.. code:: python
|
|
268
|
+
|
|
269
|
+
>>> parsed = wtp.parse(
|
|
270
|
+
... 'text\n'
|
|
271
|
+
... '* list item a\n'
|
|
272
|
+
... '* list item b\n'
|
|
273
|
+
... '** sub-list of b\n'
|
|
274
|
+
... '* list item c\n'
|
|
275
|
+
... '** sub-list of b\n'
|
|
276
|
+
... 'text'
|
|
277
|
+
... )
|
|
278
|
+
>>> wikilist = parsed.get_lists()[0]
|
|
279
|
+
>>> wikilist.items
|
|
280
|
+
[' list item a', ' list item b', ' list item c']
|
|
281
|
+
|
|
282
|
+
The ``sublists`` method can be used to get all sub-lists of the current list or just sub-lists of specific items:
|
|
283
|
+
|
|
284
|
+
.. code:: python
|
|
285
|
+
|
|
286
|
+
>>> wikilist.sublists()
|
|
287
|
+
[WikiList('** sub-list of b\n'), WikiList('** sub-list of b\n')]
|
|
288
|
+
>>> wikilist.sublists(1)[0].items
|
|
289
|
+
[' sub-list of b']
|
|
290
|
+
|
|
291
|
+
It also has an optional ``pattern`` argument that works similarly to ``lists``, except that the current list pattern will be automatically added to it as a prefix:
|
|
292
|
+
|
|
293
|
+
.. code:: python
|
|
294
|
+
|
|
295
|
+
>>> wikilist = wtp.WikiList('#a\n#b\n##ba\n#*bb\n#:bc\n#c', '\#')
|
|
296
|
+
>>> wikilist.sublists()
|
|
297
|
+
[WikiList('##ba\n'), WikiList('#*bb\n'), WikiList('#:bc\n')]
|
|
298
|
+
>>> wikilist.sublists(pattern='\*')
|
|
299
|
+
[WikiList('#*bb\n')]
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
Convert one type of list to another using the convert method. Specifying the starting pattern of the desired lists can facilitate finding them and improves the performance:
|
|
303
|
+
|
|
304
|
+
.. code:: python
|
|
305
|
+
|
|
306
|
+
>>> wl = wtp.WikiList(
|
|
307
|
+
... ':*A1\n:*#B1\n:*#B2\n:*:continuing A1\n:*A2',
|
|
308
|
+
... pattern=':\*'
|
|
309
|
+
... )
|
|
310
|
+
>>> print(wl)
|
|
311
|
+
:*A1
|
|
312
|
+
:*#B1
|
|
313
|
+
:*#B2
|
|
314
|
+
:*:continuing A1
|
|
315
|
+
:*A2
|
|
316
|
+
>>> wl.convert('#')
|
|
317
|
+
>>> print(wl)
|
|
318
|
+
#A1
|
|
319
|
+
##B1
|
|
320
|
+
##B2
|
|
321
|
+
#:continuing A1
|
|
322
|
+
#A2
|
|
323
|
+
|
|
324
|
+
Tags
|
|
325
|
+
----
|
|
326
|
+
|
|
327
|
+
Accessing HTML tags:
|
|
328
|
+
|
|
329
|
+
.. code:: python
|
|
330
|
+
|
|
331
|
+
>>> p = wtp.parse('text<ref name="c">citation</ref>\n<references/>')
|
|
332
|
+
>>> ref, references = p.get_tags()
|
|
333
|
+
>>> ref.name = 'X'
|
|
334
|
+
>>> ref
|
|
335
|
+
Tag('<X name="c">citation</X>')
|
|
336
|
+
>>> references
|
|
337
|
+
Tag('<references/>')
|
|
338
|
+
|
|
339
|
+
WikiTextParser is able to handle common usages of HTML and extension tags. However it is not a fully-fledged HTML parser and may fail on edge cases or malformed HTML input. Please open an issue on github if you encounter bugs.
|
|
340
|
+
|
|
341
|
+
Miscellaneous
|
|
342
|
+
-------------
|
|
343
|
+
``parent`` and ``ancestors`` methods can be used to access a node's parent or ancestors respectively:
|
|
344
|
+
|
|
345
|
+
.. code:: python
|
|
346
|
+
|
|
347
|
+
>>> template_d = parse("{{a|{{b|{{c|{{d}}}}}}}}").templates[3]
|
|
348
|
+
>>> template_d.ancestors()
|
|
349
|
+
[Template('{{c|{{d}}}}'),
|
|
350
|
+
Template('{{b|{{c|{{d}}}}}}'),
|
|
351
|
+
Template('{{a|{{b|{{c|{{d}}}}}}}}')]
|
|
352
|
+
>>> template_d.parent()
|
|
353
|
+
Template('{{c|{{d}}}}')
|
|
354
|
+
>>> _.parent()
|
|
355
|
+
Template('{{b|{{c|{{d}}}}}}')
|
|
356
|
+
>>> _.parent()
|
|
357
|
+
Template('{{a|{{b|{{c|{{d}}}}}}}}')
|
|
358
|
+
>>> _.parent() # Returns None
|
|
359
|
+
|
|
360
|
+
Use the optional ``type_`` argument if looking for ancestors of a specific type:
|
|
361
|
+
|
|
362
|
+
.. code:: python
|
|
363
|
+
|
|
364
|
+
>>> parsed = parse('{{a|{{#if:{{b{{c<!---->}}}}}}}}')
|
|
365
|
+
>>> comment = parsed.comments[0]
|
|
366
|
+
>>> comment.ancestors(type_='ParserFunction')
|
|
367
|
+
[ParserFunction('{{#if:{{b{{c<!---->}}}}}}')]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
To delete/remove any object from its parents use ``del object[:]`` or ``del object.string``.
|
|
371
|
+
|
|
372
|
+
The ``remove_markup`` function or ``plain_text`` method can be used to remove wiki markup:
|
|
373
|
+
|
|
374
|
+
.. code:: python
|
|
375
|
+
|
|
376
|
+
>>> from wikitextparser import remove_markup, parse
|
|
377
|
+
>>> s = "'''a'''<!--comment--> [[b|c]] [[d]]"
|
|
378
|
+
>>> remove_markup(s)
|
|
379
|
+
'a c d'
|
|
380
|
+
>>> parse(s).plain_text()
|
|
381
|
+
'a c d'
|
|
382
|
+
|
|
383
|
+
Compared with mwparserfromhell
|
|
384
|
+
==============================
|
|
385
|
+
|
|
386
|
+
`mwparserfromhell <https://github.com/earwig/mwparserfromhell>`_ is a mature and widely used library with nearly the same purposes as ``wikitextparser``. The main reason leading me to create ``wikitextparser`` was that ``mwparserfromhell`` could not parse wikitext in certain situations that I needed it for. See mwparserfromhell's issues `40 <https://github.com/earwig/mwparserfromhell/issues/40>`_, `42 <https://github.com/earwig/mwparserfromhell/issues/42>`_, `88 <https://github.com/earwig/mwparserfromhell/issues/88>`_, and other related issues. In many of those situations ``wikitextparser`` may be able to give you more acceptable results.
|
|
387
|
+
|
|
388
|
+
Also note that ``wikitextparser`` is still using 0.x.y version `meaning <https://semver.org/>`_ that the API is not stable and may change in the future versions.
|
|
389
|
+
|
|
390
|
+
The tokenizer in ``mwparserfromhell`` is written in C. Tokenization in ``wikitextparser`` is mostly done using the ``regex`` library which is also in C.
|
|
391
|
+
I have not rigorously compared the two libraries in terms of performance, i.e. execution time and memory usage. In my limited experience, ``wikitextparser`` has decent performance in realistic cases and should be able to compete and may even have slight performance benefits in some situations.
|
|
392
|
+
|
|
393
|
+
If you have had a chance to compare these libraries in terms of performance or capabilities please share your experience by opening an issue on github.
|
|
394
|
+
|
|
395
|
+
Some of the unique features of ``wikitextparser`` are: Providing access to individual cells of each table, pretty-printing templates, a WikiList class with rudimentary methods to work with `lists <https://www.mediawiki.org/wiki/Help:Lists>`_, and a few other functions.
|
|
396
|
+
|
|
397
|
+
Known issues and limitations
|
|
398
|
+
============================
|
|
399
|
+
|
|
400
|
+
* The contents of templates/parameters are not known to offline parsers. For example an offline parser cannot know if the markup ``[[{{z|a}}]]`` should be treated as wikilink or not, it depends on the inner-workings of the ``{{z}}`` template. In these situations ``wikitextparser`` tries to use a best guess. ``[[{{z|a}}]]`` is treated as a wikilink (why else would anyone call a template inside wikilink markup, and even if it is not a wikilink, usually no harm is done).
|
|
401
|
+
* Localized namespace names are unknown, so for example ``[[File:...]]`` links are treated as normal wikilinks. ``mwparserfromhell`` has a similar issue, see `#87 <https://github.com/earwig/mwparserfromhell/issues/87>`_ and `#136 <https://github.com/earwig/mwparserfromhell/issues/136>`_. As a workaround, `Pywikibot <https://www.mediawiki.org/wiki/Manual:Pywikibot>`_ can be used for determining the namespace.
|
|
402
|
+
* `Linktrails <https://www.mediawiki.org/wiki/Help:Links>`_ are language-dependent and are not supported. `Also not supported by mwparserfromhell <https://github.com/earwig/mwparserfromhell/issues/82>`_. However given the trail pattern and knowing that ``wikilink.span[1]`` is the ending position of a wikilink, it is possible to compute a WikiLink's linktrail.
|
|
403
|
+
* Templates adjacent to external links are never considered part of the link. In reality, this depends on the contents of the template. Example: ``parse('http://example.com{{dead link}}').external_links[0].url == 'http://example.com'``
|
|
404
|
+
* List of valid `extension tags <https://www.mediawiki.org/wiki/Parser_extension_tags>`_ depends on the extensions installed on the wiki. The ``tags`` method currently only supports the ones on English Wikipedia. A configuration option might be added in the future to address this issue.
|
|
405
|
+
* ``wikitextparser`` currently does not provide an `ast.walk <https://docs.python.org/3/library/ast.html#ast.walk>`_-like method yielding all descendant nodes.
|
|
406
|
+
* `Parser functions <https://www.mediawiki.org/wiki/Help:Extension:ParserFunctions>`_ and `magic words <https://www.mediawiki.org/wiki/Help:Magic_words>`_ are not evaluated.
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
Credits
|
|
410
|
+
=======
|
|
411
|
+
* `python <https://www.python.org/>`_
|
|
412
|
+
* `regex <https://github.com/mrabarnett/mrab-regex>`_
|
|
413
|
+
* `wcwidth <https://github.com/jquast/wcwidth>`_
|
|
@@ -113,7 +113,26 @@ WikiLinks
|
|
|
113
113
|
>>> wl
|
|
114
114
|
WikiLink('[[new_title#new_fragmet]]')
|
|
115
115
|
|
|
116
|
-
All WikiLink properties support get, set, and delete operations.
|
|
116
|
+
All WikiLink properties support get, set, and delete operations. Categories are special cases of WikiLinks, in that they are prefixed with the category namespace, which is case insensitive and may be internationalized:
|
|
117
|
+
|
|
118
|
+
.. code:: python
|
|
119
|
+
|
|
120
|
+
>>> parsed = wtp.parse("""
|
|
121
|
+
[[Category:Foo]]
|
|
122
|
+
[[Κατηγορία:Bar]]
|
|
123
|
+
[[Other link]]
|
|
124
|
+
""")
|
|
125
|
+
>>> categories = [
|
|
126
|
+
wl
|
|
127
|
+
for wl
|
|
128
|
+
in parsed.wikilinks
|
|
129
|
+
if wl.title.partition(':')[0]
|
|
130
|
+
.strip()
|
|
131
|
+
.lower()
|
|
132
|
+
in ["category", "κατηγορία"]
|
|
133
|
+
]
|
|
134
|
+
>>> categories
|
|
135
|
+
[WikiLink('[[Category:Foo]]'), WikiLink('[[Κατηγορία:Bar]]')]
|
|
117
136
|
|
|
118
137
|
Sections
|
|
119
138
|
--------
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[build-system]
|
|
2
|
-
requires = ['
|
|
3
|
-
build-backend = '
|
|
2
|
+
requires = ['uv_build>=0.8.3,<0.9.0']
|
|
3
|
+
build-backend = 'uv_build'
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "wikitextparser"
|
|
@@ -24,9 +24,7 @@ dependencies = [
|
|
|
24
24
|
"regex >= 2022.9.11",
|
|
25
25
|
"wcwidth",
|
|
26
26
|
]
|
|
27
|
-
|
|
28
|
-
"version",
|
|
29
|
-
]
|
|
27
|
+
version = "1.0.0"
|
|
30
28
|
|
|
31
29
|
[project.license]
|
|
32
30
|
text = "GNU General Public License v3 (GPLv3)"
|
|
@@ -42,25 +40,24 @@ tests = [
|
|
|
42
40
|
"pytest",
|
|
43
41
|
]
|
|
44
42
|
|
|
45
|
-
[tool.flit.sdist]
|
|
46
|
-
exclude = ['tests/', 'doc/', 'dev/']
|
|
47
|
-
|
|
48
43
|
[tool.ruff]
|
|
49
44
|
line-length = 79
|
|
50
45
|
format.quote-style = 'single'
|
|
51
46
|
lint.isort.combine-as-imports = true
|
|
52
47
|
lint.extend-select = [
|
|
53
48
|
'W605', # invalid-escape-sequence
|
|
54
|
-
'FA',
|
|
55
|
-
'I',
|
|
56
|
-
'UP',
|
|
57
|
-
'RUF',
|
|
49
|
+
'FA', # flake8-future-annotations
|
|
50
|
+
'I', # isort
|
|
51
|
+
'UP', # pyupgrade
|
|
52
|
+
'RUF', # Ruff-specific rules (RUF)
|
|
58
53
|
]
|
|
59
54
|
lint.ignore = [
|
|
60
|
-
'E721',
|
|
61
|
-
'RUF001',
|
|
62
|
-
'RUF002',
|
|
63
|
-
'RUF003',
|
|
55
|
+
'E721', # Do not compare types, use `isinstance()`
|
|
56
|
+
'RUF001', # ambiguous-unicode-character-string
|
|
57
|
+
'RUF002', # ambiguous-unicode-character-docstring
|
|
58
|
+
'RUF003', # ambiguous-unicode-character-comment
|
|
59
|
+
'RUF012', # mutable-class-default
|
|
60
|
+
'RUF059', # Unpacked variable never used
|
|
64
61
|
]
|
|
65
62
|
|
|
66
63
|
[tool.pytest.ini_options]
|
|
@@ -78,3 +75,14 @@ reportInvalidStringEscapeSequence = false
|
|
|
78
75
|
reportConstantRedefinition = 'error'
|
|
79
76
|
reportTypeCommentUsage = 'warning'
|
|
80
77
|
reportUnnecessaryComparison = 'warning'
|
|
78
|
+
venvPath = "."
|
|
79
|
+
venv = ".venv"
|
|
80
|
+
|
|
81
|
+
[tool.uv.build-backend]
|
|
82
|
+
module-root = ""
|
|
83
|
+
module-name = "wikitextparser"
|
|
84
|
+
|
|
85
|
+
[dependency-groups]
|
|
86
|
+
dev = [
|
|
87
|
+
"pytest>=8.3.5",
|
|
88
|
+
]
|
|
@@ -221,7 +221,7 @@ class Template(SubWikiTextWithArgs):
|
|
|
221
221
|
pre_name_ws_mode = mode(before_names)
|
|
222
222
|
name_length_mode = mode(name_lengths)
|
|
223
223
|
post_value_ws_mode = mode(
|
|
224
|
-
[SPACE_AFTER_SEARCH(self.string)[0]
|
|
224
|
+
[SPACE_AFTER_SEARCH(self.string)[0], *after_values[1:]] # type: ignore
|
|
225
225
|
)
|
|
226
226
|
pre_value_ws_mode = mode(before_values)
|
|
227
227
|
else:
|
|
@@ -57,7 +57,7 @@ BRACKET_EXTERNAL_LINK_SCHEMES = regex_pattern(
|
|
|
57
57
|
BRACKET_EXTERNAL_LINK_URL = (
|
|
58
58
|
BRACKET_EXTERNAL_LINK_SCHEMES + EXTERNAL_LINK_URL_TAIL
|
|
59
59
|
)
|
|
60
|
-
BRACKET_EXTERNAL_LINK = rb'\[' + BRACKET_EXTERNAL_LINK_URL + rb'[^\]\n]*+\]'
|
|
60
|
+
BRACKET_EXTERNAL_LINK = rb'\[' + BRACKET_EXTERNAL_LINK_URL + rb'[^\]\r\n]*+\]'
|
|
61
61
|
EXTERNAL_LINK = (
|
|
62
62
|
rb'(?>' + BARE_EXTERNAL_LINK + rb'|' + BRACKET_EXTERNAL_LINK + rb')'
|
|
63
63
|
)
|
|
@@ -67,8 +67,8 @@ INVALID_EL_TPP_CHRS_SUB = rc( # the [:-4] slice allows \[ and \]
|
|
|
67
67
|
).sub
|
|
68
68
|
|
|
69
69
|
# Sections
|
|
70
|
-
SECTION_HEADING = rb'^(?<equals>={1,6})[^\n]+?(?P=equals)[ \t]
|
|
71
|
-
SUB_SECTION = rb'(?:^(?P=equals)=[^\n]+?(?P=equals)=[ \t]
|
|
70
|
+
SECTION_HEADING = rb'^(?<equals>={1,6})[^\r\n]+?(?P=equals)[ \t]*+\r?+$'
|
|
71
|
+
SUB_SECTION = rb'(?:^(?P=equals)=[^\r\n]+?(?P=equals)=[ \t]*+\r?+$.*?)*'
|
|
72
72
|
LEAD_SECTION = rb'(?<section>(?<equals>).*?)'
|
|
73
73
|
SECTIONS_FULLMATCH = rc(
|
|
74
74
|
LEAD_SECTION
|
|
@@ -100,7 +100,7 @@ TABLE_FINDITER = rc(
|
|
|
100
100
|
(?!^\ *+\{\|).
|
|
101
101
|
)*?
|
|
102
102
|
# Table-end
|
|
103
|
-
\n\s*+
|
|
103
|
+
\r?\n\s*+
|
|
104
104
|
(?> \|} | \Z )
|
|
105
105
|
""",
|
|
106
106
|
DOTALL | MULTILINE | VERBOSE,
|
|
@@ -113,7 +113,7 @@ BOLD_FINDITER = rc(
|
|
|
113
113
|
# start token
|
|
114
114
|
'\0*+'\0*+'
|
|
115
115
|
# content
|
|
116
|
-
(\0*+[^'\n]++.*?)
|
|
116
|
+
(\0*+[^'\r\n]++.*?)
|
|
117
117
|
# end token
|
|
118
118
|
(?:'\0*+'\0*+'|$)
|
|
119
119
|
""",
|
|
@@ -125,7 +125,7 @@ ITALIC_FINDITER = rc(
|
|
|
125
125
|
# start token
|
|
126
126
|
'\0*+'
|
|
127
127
|
# content
|
|
128
|
-
(\0*+[^'\n]++.*?)
|
|
128
|
+
(\0*+[^'\r\n]++.*?)
|
|
129
129
|
# end token
|
|
130
130
|
(?:'\0*+'|$)
|
|
131
131
|
""",
|
|
@@ -1369,7 +1369,7 @@ class WikiText:
|
|
|
1369
1369
|
type_to_spans = self._type_to_spans
|
|
1370
1370
|
lststr = self._lststr
|
|
1371
1371
|
shadow_copy = self._shadow[:]
|
|
1372
|
-
ss,
|
|
1372
|
+
ss, _se, _, _ = self._span_data
|
|
1373
1373
|
spans = type_to_spans.setdefault('Table', [])
|
|
1374
1374
|
spans_append = spans.append
|
|
1375
1375
|
skip_self_span = self._type == 'Table'
|
|
@@ -1535,7 +1535,7 @@ class WikiText:
|
|
|
1535
1535
|
span_tuple_to_span_get = {(s[0], s[1]): s for s in spans}.get
|
|
1536
1536
|
spans_append = spans.append
|
|
1537
1537
|
for start_match in reversed_start_matches:
|
|
1538
|
-
if start_match[0].rstrip(b' \t\n>')[-1] == 47: # ord('/') == 47
|
|
1538
|
+
if start_match[0].rstrip(b' \t\r\n>')[-1] == 47: # ord('/') == 47
|
|
1539
1539
|
# Self-closing tag. Don't look for the end tag.
|
|
1540
1540
|
# todo: some self-closing tags actually should be treated
|
|
1541
1541
|
# as start tag in HTML5, see:
|