wikicorpus 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikicorpus-0.1.0/PKG-INFO +287 -0
- wikicorpus-0.1.0/README.md +266 -0
- wikicorpus-0.1.0/pyproject.toml +52 -0
- wikicorpus-0.1.0/tests/__init__.py +0 -0
- wikicorpus-0.1.0/tests/test_aligner.py +107 -0
- wikicorpus-0.1.0/tests/test_filter.py +191 -0
- wikicorpus-0.1.0/tests/test_retriever.py +104 -0
- wikicorpus-0.1.0/tests/test_writer.py +128 -0
- wikicorpus-0.1.0/wikicorpus/__init__.py +5 -0
- wikicorpus-0.1.0/wikicorpus/aligner.py +176 -0
- wikicorpus-0.1.0/wikicorpus/cli.py +470 -0
- wikicorpus-0.1.0/wikicorpus/filter.py +216 -0
- wikicorpus-0.1.0/wikicorpus/retriever.py +257 -0
- wikicorpus-0.1.0/wikicorpus/writer.py +147 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wikicorpus
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Retrieve and filter Wikipedia articles to build a corpus for knowledge extraction
|
|
5
|
+
Author-email: Andrea Schimmenti <andschimmenti@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: corpus,knowledge-extraction,nlp,text-mining,wikidata,wikipedia
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
16
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: nltk
|
|
19
|
+
Requires-Dist: requests
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# wikicorpus
|
|
23
|
+
|
|
24
|
+
## Abstract
|
|
25
|
+
|
|
26
|
+
wikicorpus is a Python command-line tool for constructing text corpora from
|
|
27
|
+
Wikipedia categories, designed to support knowledge extraction and natural
|
|
28
|
+
language processing research. Given one or more Wikipedia category names, the
|
|
29
|
+
tool retrieves all member articles, aligns each article with its Wikidata
|
|
30
|
+
entity, filters sections by configurable interpretive-content keywords, and
|
|
31
|
+
extracts candidate sentences containing epistemic verbs (verbs that signal
|
|
32
|
+
belief, attribution, argumentation, or uncertainty). All filtering parameters
|
|
33
|
+
are runtime-configurable: researchers can supply custom verb lexicons and
|
|
34
|
+
section-keyword lists without modifying source code, making the tool
|
|
35
|
+
reproducible across different domains and corpora. Output is written to a
|
|
36
|
+
structured directory tree with per-article files and a root index, suitable
|
|
37
|
+
for downstream parsing, annotation, or training pipelines.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -e .
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Python 3.10 or later is required. The only runtime dependencies are
|
|
48
|
+
`requests` and `nltk`; NLTK corpora are downloaded automatically on first
|
|
49
|
+
use of the sentence-filtering step.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Quick-start examples
|
|
54
|
+
|
|
55
|
+
### Retrieve all articles in a single category with no filtering
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
wikicorpus --categories "Painting forgeries" --output ./corpus
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Retrieve two categories, keep only articles with a Wikidata entity, and require that the entity has the creator property (P170)
|
|
62
|
+

|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
wikicorpus \
|
|
66
|
+
--categories "Painting forgeries | Document forgeries" \
|
|
67
|
+
--output ./corpus \
|
|
68
|
+
--filter-wikidata \
|
|
69
|
+
--wikidata-properties "P170!" \
|
|
70
|
+
--verbose
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Full pipeline with interpretive-section and sentence filtering, plus custom section keywords and additional verb lemmas
|
|
74
|
+

|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
wikicorpus \
|
|
78
|
+
--categories-file categories.txt \
|
|
79
|
+
--output ./corpus \
|
|
80
|
+
--filter-wikidata \
|
|
81
|
+
--wikidata-properties "P170! | P571?" \
|
|
82
|
+
--filter-sections \
|
|
83
|
+
--add-sections "analysis | methodology" \
|
|
84
|
+
--filter-sentences \
|
|
85
|
+
--add-verbs "posit | allege | maintain" \
|
|
86
|
+
--verbose
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## CLI reference
|
|
92
|
+
|
|
93
|
+
| Flag | Type / default | Description |
|
|
94
|
+
|------|----------------|-------------|
|
|
95
|
+
| `--categories` | string / — | Quoted string of Wikipedia category names separated by ` \| `. At least one of `--categories` or `--categories-file` is required. |
|
|
96
|
+
| `--categories-file` | path / — | Path to a plain-text file with one category name per line. Lines beginning with `#` and blank lines are ignored. |
|
|
97
|
+
| `--output` | path / `./corpus` | Root output directory. Created if it does not exist. |
|
|
98
|
+
| `--filter-wikidata` | flag / off | Skip articles with no matching Wikidata entity. When `--wikidata-properties` is also given, additionally enforce the property constraints. |
|
|
99
|
+
| `--wikidata-properties` | string / — | Pipe-separated Wikidata property IDs. Suffix each ID with `!` (mandatory: all must be present) or `?` (optional: at least one must be present). Bare IDs are treated as mandatory. Example: `"P170! | P571?"`. Has no filtering effect unless `--filter-wikidata` is also set. |
|
|
100
|
+
| `--filter-sections` | flag / off | Skip articles that contain no interpretive sections (as determined by the active section-keyword list). |
|
|
101
|
+
| `--filter-sentences` | flag / off | Skip articles that contain no candidate epistemic sentences. |
|
|
102
|
+
| `--verbs-file` | path / — | Path to a JSON file mapping verb-category names to lists of verb lemmas. Merged into the built-in categories: existing categories receive extra verbs; new category names are created. See format specification below. |
|
|
103
|
+
| `--add-verbs` | string / — | Pipe-separated verb lemmas to add to the `custom` verb category on top of whatever `--verbs-file` provides. Example: `"allege \| posit \| maintain"`. |
|
|
104
|
+
| `--sections-file` | path / — | Path to a JSON file containing an array of section-header keywords. Appended to the built-in list; duplicates are ignored. See format specification below. |
|
|
105
|
+
| `--add-sections` | string / — | Pipe-separated section-header keywords to add on top of `--sections-file` and the built-in list. Example: `"analysis \| interpretation"`. |
|
|
106
|
+
| `--verbose` | flag / off | Enable INFO-level progress messages written to stderr. |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## File format specifications
|
|
111
|
+
|
|
112
|
+
### `--verbs-file` (JSON object)
|
|
113
|
+
|
|
114
|
+
A JSON object mapping verb-category names (strings) to lists of verb lemmas
|
|
115
|
+
(strings). Category names that already exist in the built-in list receive the
|
|
116
|
+
extra verbs appended with duplicates removed. New category names are created.
|
|
117
|
+
|
|
118
|
+
```json
|
|
119
|
+
{
|
|
120
|
+
"revision": ["posit", "maintain"],
|
|
121
|
+
"custom": ["allege", "impute"]
|
|
122
|
+
}
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Built-in verb categories and their default lemmas:
|
|
126
|
+
|
|
127
|
+
| Category | Default lemmas |
|
|
128
|
+
|----------|---------------|
|
|
129
|
+
| `argumentation` | argue, dispute, contend, refute, contest, challenge, oppose |
|
|
130
|
+
| `assertion` | claim, state, declare, assert, report, attribute, ascribe |
|
|
131
|
+
| `epistemic_uncertainty` | believe, think, suppose, assume, suspect, doubt, question, suggest |
|
|
132
|
+
| `inference` | conclude, deduce, infer, imply, indicate, derive, propose |
|
|
133
|
+
| `revision` | revise, reassign, reattribute, reconsider, overturn, correct, update |
|
|
134
|
+
|
|
135
|
+
### `--sections-file` (JSON array)
|
|
136
|
+
|
|
137
|
+
A JSON array of lowercase keyword strings. A section is considered
|
|
138
|
+
interpretive if any keyword appears as a substring of its header
|
|
139
|
+
(case-insensitive). Keywords are appended to the built-in list; duplicates
|
|
140
|
+
are ignored.
|
|
141
|
+
|
|
142
|
+
```json
|
|
143
|
+
["analysis", "interpretation", "methodology", "significance"]
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Built-in interpretive keywords: `attribution`, `provenance`, `dating`,
|
|
147
|
+
`controversy`, `authenticity`, `authorship`, `historiography`, `reception`,
|
|
148
|
+
`debate`, `forgery`, `misattribution`, `reattribution`.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Output structure
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
<output>/
|
|
156
|
+
├── index.json
|
|
157
|
+
└── <Category_Name>_<hash>/
|
|
158
|
+
└── <Article_Title>_<hash>/
|
|
159
|
+
├── full_text.txt
|
|
160
|
+
├── sections.json
|
|
161
|
+
├── wikidata.json
|
|
162
|
+
└── candidates.json
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Directory names are derived from the category and article title by replacing
|
|
166
|
+
spaces with underscores, removing characters outside `[a-zA-Z0-9_-]`, and
|
|
167
|
+
appending an 8-character MD5 hash of the original name to guarantee
|
|
168
|
+
uniqueness.
|
|
169
|
+
|
|
170
|
+
### Per-article files
|
|
171
|
+
|
|
172
|
+
| File | Content |
|
|
173
|
+
|------|---------|
|
|
174
|
+
| `full_text.txt` | Plain-text extract of the full article as returned by the Wikipedia API. |
|
|
175
|
+
| `sections.json` | JSON array of section objects, each with `"header"` (string) and `"text"` (string) keys. |
|
|
176
|
+
| `wikidata.json` | JSON object with `"qid"`, `"labels"`, and `"properties"` keys, or `null` if no Wikidata entity was found. |
|
|
177
|
+
| `candidates.json` | JSON array of candidate-sentence objects (see below). |
|
|
178
|
+
|
|
179
|
+
### `index.json` fields
|
|
180
|
+
|
|
181
|
+
The root `index.json` contains one object per saved article:
|
|
182
|
+
|
|
183
|
+
| Field | Type | Description |
|
|
184
|
+
|-------|------|-------------|
|
|
185
|
+
| `title` | string | Wikipedia article title. |
|
|
186
|
+
| `url` | string | Full URL of the Wikipedia article. |
|
|
187
|
+
| `category` | string | Category label as supplied on the command line. |
|
|
188
|
+
| `qid` | string or null | Wikidata item identifier (e.g. `"Q12418"`), or `null`. |
|
|
189
|
+
| `has_target_properties` | boolean | Whether the entity satisfies the property filter. Always `true` when no property spec is given. |
|
|
190
|
+
| `candidate_sentence_count` | integer | Number of candidate sentences extracted from the article. |
|
|
191
|
+
| `interpretive_section_count` | integer | Number of interpretive sections found. |
|
|
192
|
+
|
|
193
|
+
### `candidates.json` fields
|
|
194
|
+
|
|
195
|
+
Each entry in `candidates.json` describes one candidate sentence:
|
|
196
|
+
|
|
197
|
+
| Field | Type | Description |
|
|
198
|
+
|-------|------|-------------|
|
|
199
|
+
| `sentence` | string | The full sentence text. |
|
|
200
|
+
| `section_header` | string | Header of the section containing the sentence. |
|
|
201
|
+
| `verb_category` | string | Name of the first matched verb category (e.g. `"assertion"`). |
|
|
202
|
+
| `matched_verbs` | array of strings | All matched verb lemmas found in the sentence. |
|
|
203
|
+
| `negated` | boolean | `true` if a negation word (`not`, `never`, `no`) immediately precedes any matched verb. |
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Pipeline description
|
|
208
|
+
|
|
209
|
+
The pipeline has four stages:
|
|
210
|
+
|
|
211
|
+
1. **Retrieve** — `get_category_articles` queries the Wikipedia
|
|
212
|
+
`categorymembers` API to list all articles in each requested category.
|
|
213
|
+
`get_article_content` fetches the plain-text extract and internal links
|
|
214
|
+
for each article. Section boundaries are detected by wiki-markup headers
|
|
215
|
+
(`== Header ==`) and returned as a structured list.
|
|
216
|
+
|
|
217
|
+
2. **Align** — `get_wikidata_entity` queries the Wikidata API for the entity
|
|
218
|
+
linked to each Wikipedia article. If `--filter-wikidata` is active,
|
|
219
|
+
articles with no entity (or whose entity does not satisfy the property
|
|
220
|
+
spec) are dropped. The entity's QID, labels, and property values are
|
|
221
|
+
stored alongside the article data.
|
|
222
|
+
|
|
223
|
+
3. **Filter** — `get_interpretive_sections` selects sections whose headers
|
|
224
|
+
match at least one keyword from the active keyword list. If
|
|
225
|
+
`--filter-sections` is active, articles with no matching sections are
|
|
226
|
+
dropped. `get_candidate_sentences` tokenises, POS-tags, and lemmatises
|
|
227
|
+
sentences within interpretive sections (falling back to all sections if
|
|
228
|
+
none are found), then retains only those containing at least one
|
|
229
|
+
epistemic verb from the active verb-category dictionary. If
|
|
230
|
+
`--filter-sentences` is active, articles with no candidate sentences are
|
|
231
|
+
dropped.
|
|
232
|
+
|
|
233
|
+
4. **Write** — `save_article` creates the per-article directory and writes
|
|
234
|
+
the four output files. `save_index` writes the root `index.json`
|
|
235
|
+
summarising every saved article.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Reproducibility
|
|
240
|
+
|
|
241
|
+
All filtering parameters are runtime-configurable through CLI flags and JSON
|
|
242
|
+
configuration files. The tool contains no hard-coded domain assumptions: the
|
|
243
|
+
built-in keyword and verb lists are defaults that can be extended or replaced
|
|
244
|
+
without modifying source code. To reproduce a corpus exactly, record:
|
|
245
|
+
|
|
246
|
+
- The exact `--categories` or `--categories-file` input.
|
|
247
|
+
- The `--wikidata-properties` spec string.
|
|
248
|
+
- The contents of any `--verbs-file` or `--sections-file` used.
|
|
249
|
+
- The exact set of `--add-verbs` and `--add-sections` tokens.
|
|
250
|
+
- The Wikipedia API snapshot date (the API returns the current live version
|
|
251
|
+
of articles; consider archiving `full_text.txt` for long-term
|
|
252
|
+
reproducibility).
|
|
253
|
+
|
|
254
|
+
The filtering logic is deterministic given fixed inputs: no randomness is
|
|
255
|
+
introduced at any stage.
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Summary output
|
|
260
|
+
|
|
261
|
+
After the pipeline completes, a summary is printed to stdout:
|
|
262
|
+
|
|
263
|
+
```
|
|
264
|
+
Categories processed : 2
|
|
265
|
+
Articles retrieved : 147
|
|
266
|
+
After Wikidata filter: 132
|
|
267
|
+
Wikidata properties : P170! | P571?
|
|
268
|
+
After section filter : 89
|
|
269
|
+
Section keywords : 12 active (+2 custom: analysis, interpretation)
|
|
270
|
+
After sentence filter: 61
|
|
271
|
+
Verb categories : argumentation(7), assertion(7), epistemic_uncertainty(8), inference(7), revision(7)
|
|
272
|
+
Candidate sentences : 430
|
|
273
|
+
Output directory : /absolute/path/to/corpus
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
| Field | Description |
|
|
277
|
+
|-------|-------------|
|
|
278
|
+
| Categories processed | Number of category names processed. |
|
|
279
|
+
| Articles retrieved | Total articles found across all categories. |
|
|
280
|
+
| After Wikidata filter | Articles remaining after the Wikidata filter (equals retrieved count if `--filter-wikidata` is off). |
|
|
281
|
+
| Wikidata properties | Active property spec, or `none (all entities retained)`. |
|
|
282
|
+
| After section filter | Articles remaining after the section filter. |
|
|
283
|
+
| Section keywords | Count of active keywords and any custom additions. |
|
|
284
|
+
| After sentence filter | Articles remaining after the sentence filter. |
|
|
285
|
+
| Verb categories | Each active category name and the number of lemmas it contains. |
|
|
286
|
+
| Candidate sentences | Total candidate sentences across all saved articles. |
|
|
287
|
+
| Output directory | Absolute path to the output root. |
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# wikicorpus
|
|
2
|
+
|
|
3
|
+
## Abstract
|
|
4
|
+
|
|
5
|
+
wikicorpus is a Python command-line tool for constructing text corpora from
|
|
6
|
+
Wikipedia categories, designed to support knowledge extraction and natural
|
|
7
|
+
language processing research. Given one or more Wikipedia category names, the
|
|
8
|
+
tool retrieves all member articles, aligns each article with its Wikidata
|
|
9
|
+
entity, filters sections by configurable interpretive-content keywords, and
|
|
10
|
+
extracts candidate sentences containing epistemic verbs (verbs that signal
|
|
11
|
+
belief, attribution, argumentation, or uncertainty). All filtering parameters
|
|
12
|
+
are runtime-configurable: researchers can supply custom verb lexicons and
|
|
13
|
+
section-keyword lists without modifying source code, making the tool
|
|
14
|
+
reproducible across different domains and corpora. Output is written to a
|
|
15
|
+
structured directory tree with per-article files and a root index, suitable
|
|
16
|
+
for downstream parsing, annotation, or training pipelines.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Python 3.10 or later is required. The only runtime dependencies are
|
|
27
|
+
`requests` and `nltk`; NLTK corpora are downloaded automatically on first
|
|
28
|
+
use of the sentence-filtering step.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Quick-start examples
|
|
33
|
+
|
|
34
|
+
### Retrieve all articles in a single category with no filtering
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
wikicorpus --categories "Painting forgeries" --output ./corpus
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Retrieve two categories, keep only articles with a Wikidata entity, and require that the entity has the creator property (P170)
|
|
41
|
+

|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
wikicorpus \
|
|
45
|
+
--categories "Painting forgeries | Document forgeries" \
|
|
46
|
+
--output ./corpus \
|
|
47
|
+
--filter-wikidata \
|
|
48
|
+
--wikidata-properties "P170!" \
|
|
49
|
+
--verbose
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Full pipeline with interpretive-section and sentence filtering, plus custom section keywords and additional verb lemmas
|
|
53
|
+

|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
wikicorpus \
|
|
57
|
+
--categories-file categories.txt \
|
|
58
|
+
--output ./corpus \
|
|
59
|
+
--filter-wikidata \
|
|
60
|
+
--wikidata-properties "P170! | P571?" \
|
|
61
|
+
--filter-sections \
|
|
62
|
+
--add-sections "analysis | methodology" \
|
|
63
|
+
--filter-sentences \
|
|
64
|
+
--add-verbs "posit | allege | maintain" \
|
|
65
|
+
--verbose
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## CLI reference
|
|
71
|
+
|
|
72
|
+
| Flag | Type / default | Description |
|
|
73
|
+
|------|----------------|-------------|
|
|
74
|
+
| `--categories` | string / — | Quoted string of Wikipedia category names separated by ` \| `. At least one of `--categories` or `--categories-file` is required. |
|
|
75
|
+
| `--categories-file` | path / — | Path to a plain-text file with one category name per line. Lines beginning with `#` and blank lines are ignored. |
|
|
76
|
+
| `--output` | path / `./corpus` | Root output directory. Created if it does not exist. |
|
|
77
|
+
| `--filter-wikidata` | flag / off | Skip articles with no matching Wikidata entity. When `--wikidata-properties` is also given, additionally enforce the property constraints. |
|
|
78
|
+
| `--wikidata-properties` | string / — | Pipe-separated Wikidata property IDs. Suffix each ID with `!` (mandatory: all must be present) or `?` (optional: at least one must be present). Bare IDs are treated as mandatory. Example: `"P170! | P571?"`. Has no filtering effect unless `--filter-wikidata` is also set. |
|
|
79
|
+
| `--filter-sections` | flag / off | Skip articles that contain no interpretive sections (as determined by the active section-keyword list). |
|
|
80
|
+
| `--filter-sentences` | flag / off | Skip articles that contain no candidate epistemic sentences. |
|
|
81
|
+
| `--verbs-file` | path / — | Path to a JSON file mapping verb-category names to lists of verb lemmas. Merged into the built-in categories: existing categories receive extra verbs; new category names are created. See format specification below. |
|
|
82
|
+
| `--add-verbs` | string / — | Pipe-separated verb lemmas to add to the `custom` verb category on top of whatever `--verbs-file` provides. Example: `"allege \| posit \| maintain"`. |
|
|
83
|
+
| `--sections-file` | path / — | Path to a JSON file containing an array of section-header keywords. Appended to the built-in list; duplicates are ignored. See format specification below. |
|
|
84
|
+
| `--add-sections` | string / — | Pipe-separated section-header keywords to add on top of `--sections-file` and the built-in list. Example: `"analysis \| interpretation"`. |
|
|
85
|
+
| `--verbose` | flag / off | Enable INFO-level progress messages written to stderr. |
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## File format specifications
|
|
90
|
+
|
|
91
|
+
### `--verbs-file` (JSON object)
|
|
92
|
+
|
|
93
|
+
A JSON object mapping verb-category names (strings) to lists of verb lemmas
|
|
94
|
+
(strings). Category names that already exist in the built-in list receive the
|
|
95
|
+
extra verbs appended with duplicates removed. New category names are created.
|
|
96
|
+
|
|
97
|
+
```json
|
|
98
|
+
{
|
|
99
|
+
"revision": ["posit", "maintain"],
|
|
100
|
+
"custom": ["allege", "impute"]
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Built-in verb categories and their default lemmas:
|
|
105
|
+
|
|
106
|
+
| Category | Default lemmas |
|
|
107
|
+
|----------|---------------|
|
|
108
|
+
| `argumentation` | argue, dispute, contend, refute, contest, challenge, oppose |
|
|
109
|
+
| `assertion` | claim, state, declare, assert, report, attribute, ascribe |
|
|
110
|
+
| `epistemic_uncertainty` | believe, think, suppose, assume, suspect, doubt, question, suggest |
|
|
111
|
+
| `inference` | conclude, deduce, infer, imply, indicate, derive, propose |
|
|
112
|
+
| `revision` | revise, reassign, reattribute, reconsider, overturn, correct, update |
|
|
113
|
+
|
|
114
|
+
### `--sections-file` (JSON array)
|
|
115
|
+
|
|
116
|
+
A JSON array of lowercase keyword strings. A section is considered
|
|
117
|
+
interpretive if any keyword appears as a substring of its header
|
|
118
|
+
(case-insensitive). Keywords are appended to the built-in list; duplicates
|
|
119
|
+
are ignored.
|
|
120
|
+
|
|
121
|
+
```json
|
|
122
|
+
["analysis", "interpretation", "methodology", "significance"]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Built-in interpretive keywords: `attribution`, `provenance`, `dating`,
|
|
126
|
+
`controversy`, `authenticity`, `authorship`, `historiography`, `reception`,
|
|
127
|
+
`debate`, `forgery`, `misattribution`, `reattribution`.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Output structure
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
<output>/
|
|
135
|
+
├── index.json
|
|
136
|
+
└── <Category_Name>_<hash>/
|
|
137
|
+
└── <Article_Title>_<hash>/
|
|
138
|
+
├── full_text.txt
|
|
139
|
+
├── sections.json
|
|
140
|
+
├── wikidata.json
|
|
141
|
+
└── candidates.json
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Directory names are derived from the category and article title by replacing
|
|
145
|
+
spaces with underscores, removing characters outside `[a-zA-Z0-9_-]`, and
|
|
146
|
+
appending an 8-character MD5 hash of the original name to guarantee
|
|
147
|
+
uniqueness.
|
|
148
|
+
|
|
149
|
+
### Per-article files
|
|
150
|
+
|
|
151
|
+
| File | Content |
|
|
152
|
+
|------|---------|
|
|
153
|
+
| `full_text.txt` | Plain-text extract of the full article as returned by the Wikipedia API. |
|
|
154
|
+
| `sections.json` | JSON array of section objects, each with `"header"` (string) and `"text"` (string) keys. |
|
|
155
|
+
| `wikidata.json` | JSON object with `"qid"`, `"labels"`, and `"properties"` keys, or `null` if no Wikidata entity was found. |
|
|
156
|
+
| `candidates.json` | JSON array of candidate-sentence objects (see below). |
|
|
157
|
+
|
|
158
|
+
### `index.json` fields
|
|
159
|
+
|
|
160
|
+
The root `index.json` contains one object per saved article:
|
|
161
|
+
|
|
162
|
+
| Field | Type | Description |
|
|
163
|
+
|-------|------|-------------|
|
|
164
|
+
| `title` | string | Wikipedia article title. |
|
|
165
|
+
| `url` | string | Full URL of the Wikipedia article. |
|
|
166
|
+
| `category` | string | Category label as supplied on the command line. |
|
|
167
|
+
| `qid` | string or null | Wikidata item identifier (e.g. `"Q12418"`), or `null`. |
|
|
168
|
+
| `has_target_properties` | boolean | Whether the entity satisfies the property filter. Always `true` when no property spec is given. |
|
|
169
|
+
| `candidate_sentence_count` | integer | Number of candidate sentences extracted from the article. |
|
|
170
|
+
| `interpretive_section_count` | integer | Number of interpretive sections found. |
|
|
171
|
+
|
|
172
|
+
### `candidates.json` fields
|
|
173
|
+
|
|
174
|
+
Each entry in `candidates.json` describes one candidate sentence:
|
|
175
|
+
|
|
176
|
+
| Field | Type | Description |
|
|
177
|
+
|-------|------|-------------|
|
|
178
|
+
| `sentence` | string | The full sentence text. |
|
|
179
|
+
| `section_header` | string | Header of the section containing the sentence. |
|
|
180
|
+
| `verb_category` | string | Name of the first matched verb category (e.g. `"assertion"`). |
|
|
181
|
+
| `matched_verbs` | array of strings | All matched verb lemmas found in the sentence. |
|
|
182
|
+
| `negated` | boolean | `true` if a negation word (`not`, `never`, `no`) immediately precedes any matched verb. |
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Pipeline description
|
|
187
|
+
|
|
188
|
+
The pipeline has four stages:
|
|
189
|
+
|
|
190
|
+
1. **Retrieve** — `get_category_articles` queries the Wikipedia
|
|
191
|
+
`categorymembers` API to list all articles in each requested category.
|
|
192
|
+
`get_article_content` fetches the plain-text extract and internal links
|
|
193
|
+
for each article. Section boundaries are detected by wiki-markup headers
|
|
194
|
+
(`== Header ==`) and returned as a structured list.
|
|
195
|
+
|
|
196
|
+
2. **Align** — `get_wikidata_entity` queries the Wikidata API for the entity
|
|
197
|
+
linked to each Wikipedia article. If `--filter-wikidata` is active,
|
|
198
|
+
articles with no entity (or whose entity does not satisfy the property
|
|
199
|
+
spec) are dropped. The entity's QID, labels, and property values are
|
|
200
|
+
stored alongside the article data.
|
|
201
|
+
|
|
202
|
+
3. **Filter** — `get_interpretive_sections` selects sections whose headers
|
|
203
|
+
match at least one keyword from the active keyword list. If
|
|
204
|
+
`--filter-sections` is active, articles with no matching sections are
|
|
205
|
+
dropped. `get_candidate_sentences` tokenises, POS-tags, and lemmatises
|
|
206
|
+
sentences within interpretive sections (falling back to all sections if
|
|
207
|
+
none are found), then retains only those containing at least one
|
|
208
|
+
epistemic verb from the active verb-category dictionary. If
|
|
209
|
+
`--filter-sentences` is active, articles with no candidate sentences are
|
|
210
|
+
dropped.
|
|
211
|
+
|
|
212
|
+
4. **Write** — `save_article` creates the per-article directory and writes
|
|
213
|
+
the four output files. `save_index` writes the root `index.json`
|
|
214
|
+
summarising every saved article.
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Reproducibility
|
|
219
|
+
|
|
220
|
+
All filtering parameters are runtime-configurable through CLI flags and JSON
|
|
221
|
+
configuration files. The tool contains no hard-coded domain assumptions: the
|
|
222
|
+
built-in keyword and verb lists are defaults that can be extended or replaced
|
|
223
|
+
without modifying source code. To reproduce a corpus exactly, record:
|
|
224
|
+
|
|
225
|
+
- The exact `--categories` or `--categories-file` input.
|
|
226
|
+
- The `--wikidata-properties` spec string.
|
|
227
|
+
- The contents of any `--verbs-file` or `--sections-file` used.
|
|
228
|
+
- The exact set of `--add-verbs` and `--add-sections` tokens.
|
|
229
|
+
- The Wikipedia API snapshot date (the API returns the current live version
|
|
230
|
+
of articles; consider archiving `full_text.txt` for long-term
|
|
231
|
+
reproducibility).
|
|
232
|
+
|
|
233
|
+
The filtering logic is deterministic given fixed inputs: no randomness is
|
|
234
|
+
introduced at any stage.
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Summary output
|
|
239
|
+
|
|
240
|
+
After the pipeline completes, a summary is printed to stdout:
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
Categories processed : 2
|
|
244
|
+
Articles retrieved : 147
|
|
245
|
+
After Wikidata filter: 132
|
|
246
|
+
Wikidata properties : P170! | P571?
|
|
247
|
+
After section filter : 89
|
|
248
|
+
Section keywords : 12 active (+2 custom: analysis, interpretation)
|
|
249
|
+
After sentence filter: 61
|
|
250
|
+
Verb categories : argumentation(7), assertion(7), epistemic_uncertainty(8), inference(7), revision(7)
|
|
251
|
+
Candidate sentences : 430
|
|
252
|
+
Output directory : /absolute/path/to/corpus
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
| Field | Description |
|
|
256
|
+
|-------|-------------|
|
|
257
|
+
| Categories processed | Number of category names processed. |
|
|
258
|
+
| Articles retrieved | Total articles found across all categories. |
|
|
259
|
+
| After Wikidata filter | Articles remaining after the Wikidata filter (equals retrieved count if `--filter-wikidata` is off). |
|
|
260
|
+
| Wikidata properties | Active property spec, or `none (all entities retained)`. |
|
|
261
|
+
| After section filter | Articles remaining after the section filter. |
|
|
262
|
+
| Section keywords | Count of active keywords and any custom additions. |
|
|
263
|
+
| After sentence filter | Articles remaining after the sentence filter. |
|
|
264
|
+
| Verb categories | Each active category name and the number of lemmas it contains. |
|
|
265
|
+
| Candidate sentences | Total candidate sentences across all saved articles. |
|
|
266
|
+
| Output directory | Absolute path to the output root. |
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "wikicorpus"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Retrieve and filter Wikipedia articles to build a corpus for knowledge extraction"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Andrea Schimmenti", email = "andschimmenti@gmail.com" },
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"wikipedia",
|
|
17
|
+
"corpus",
|
|
18
|
+
"nlp",
|
|
19
|
+
"knowledge-extraction",
|
|
20
|
+
"wikidata",
|
|
21
|
+
"text-mining",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 3 - Alpha",
|
|
25
|
+
"Intended Audience :: Science/Research",
|
|
26
|
+
"License :: OSI Approved :: MIT License",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.10",
|
|
29
|
+
"Programming Language :: Python :: 3.11",
|
|
30
|
+
"Programming Language :: Python :: 3.12",
|
|
31
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
32
|
+
"Topic :: Text Processing :: Linguistic",
|
|
33
|
+
]
|
|
34
|
+
dependencies = [
|
|
35
|
+
"requests",
|
|
36
|
+
"nltk",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
wikicorpus = "wikicorpus.cli:main"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["wikicorpus"]
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.sdist]
|
|
46
|
+
exclude = [
|
|
47
|
+
".claude/",
|
|
48
|
+
"issues.md",
|
|
49
|
+
"corpus/",
|
|
50
|
+
"corpus_test/",
|
|
51
|
+
"dist/",
|
|
52
|
+
]
|
|
File without changes
|