wikicorpus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ Metadata-Version: 2.4
2
+ Name: wikicorpus
3
+ Version: 0.1.0
4
+ Summary: Retrieve and filter Wikipedia articles to build a corpus for knowledge extraction
5
+ Author-email: Andrea Schimmenti <andschimmenti@gmail.com>
6
+ License: MIT
7
+ Keywords: corpus,knowledge-extraction,nlp,text-mining,wikidata,wikipedia
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: nltk
19
+ Requires-Dist: requests
20
+ Description-Content-Type: text/markdown
21
+
22
+ # wikicorpus
23
+
24
+ ## Abstract
25
+
26
+ wikicorpus is a Python command-line tool for constructing text corpora from
27
+ Wikipedia categories, designed to support knowledge extraction and natural
28
+ language processing research. Given one or more Wikipedia category names, the
29
+ tool retrieves all member articles, aligns each article with its Wikidata
30
+ entity, filters sections by configurable interpretive-content keywords, and
31
+ extracts candidate sentences containing epistemic verbs (verbs that signal
32
+ belief, attribution, argumentation, or uncertainty). All filtering parameters
33
+ are runtime-configurable: researchers can supply custom verb lexicons and
34
+ section-keyword lists without modifying source code, making the tool
35
+ reproducible across different domains and corpora. Output is written to a
36
+ structured directory tree with per-article files and a root index, suitable
37
+ for downstream parsing, annotation, or training pipelines.
38
+
39
+ ---
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install -e .
45
+ ```
46
+
47
+ Python 3.10 or later is required. The only runtime dependencies are
48
+ `requests` and `nltk`; NLTK corpora are downloaded automatically on first
49
+ use of the sentence-filtering step.
50
+
51
+ ---
52
+
53
+ ## Quick-start examples
54
+
55
+ ### Retrieve all articles in a single category with no filtering
56
+
57
+ ```bash
58
+ wikicorpus --categories "Painting forgeries" --output ./corpus
59
+ ```
60
+
61
+ ### Retrieve two categories, keeping only articles with a Wikidata entity
62
+ This example also requires that the entity has the creator property (P170):
63
+
64
+ ```bash
65
+ wikicorpus \
66
+ --categories "Painting forgeries | Document forgeries" \
67
+ --output ./corpus \
68
+ --filter-wikidata \
69
+ --wikidata-properties "P170!" \
70
+ --verbose
71
+ ```
72
+
73
+ ### Full pipeline with interpretive-section and sentence filtering
74
+ This example also adds custom section keywords and additional verb lemmas:
75
+
76
+ ```bash
77
+ wikicorpus \
78
+ --categories-file categories.txt \
79
+ --output ./corpus \
80
+ --filter-wikidata \
81
+ --wikidata-properties "P170! | P571?" \
82
+ --filter-sections \
83
+ --add-sections "analysis | methodology" \
84
+ --filter-sentences \
85
+ --add-verbs "posit | allege | maintain" \
86
+ --verbose
87
+ ```
88
+
89
+ ---
90
+
91
+ ## CLI reference
92
+
93
+ | Flag | Type / default | Description |
94
+ |------|----------------|-------------|
95
+ | `--categories` | string / — | Quoted string of Wikipedia category names separated by ` \| `. At least one of `--categories` or `--categories-file` is required. |
96
+ | `--categories-file` | path / — | Path to a plain-text file with one category name per line. Lines beginning with `#` and blank lines are ignored. |
97
+ | `--output` | path / `./corpus` | Root output directory. Created if it does not exist. |
98
+ | `--filter-wikidata` | flag / off | Skip articles with no matching Wikidata entity. When `--wikidata-properties` is also given, additionally enforce the property constraints. |
99
+ | `--wikidata-properties` | string / — | Pipe-separated Wikidata property IDs. Suffix each ID with `!` (mandatory: all must be present) or `?` (optional: at least one must be present). Bare IDs are treated as mandatory. Example: `"P170! | P571?"`. Has no filtering effect unless `--filter-wikidata` is also set. |
100
+ | `--filter-sections` | flag / off | Skip articles that contain no interpretive sections (as determined by the active section-keyword list). |
101
+ | `--filter-sentences` | flag / off | Skip articles that contain no candidate epistemic sentences. |
102
+ | `--verbs-file` | path / — | Path to a JSON file mapping verb-category names to lists of verb lemmas. Merged into the built-in categories: existing categories receive extra verbs; new category names are created. See format specification below. |
103
+ | `--add-verbs` | string / — | Pipe-separated verb lemmas to add to the `custom` verb category on top of whatever `--verbs-file` provides. Example: `"allege \| posit \| maintain"`. |
104
+ | `--sections-file` | path / — | Path to a JSON file containing an array of section-header keywords. Appended to the built-in list; duplicates are ignored. See format specification below. |
105
+ | `--add-sections` | string / — | Pipe-separated section-header keywords to add on top of `--sections-file` and the built-in list. Example: `"analysis \| interpretation"`. |
106
+ | `--verbose` | flag / off | Enable INFO-level progress messages written to stderr. |
107
+
108
+ ---
109
+
110
+ ## File format specifications
111
+
112
+ ### `--verbs-file` (JSON object)
113
+
114
+ A JSON object mapping verb-category names (strings) to lists of verb lemmas
115
+ (strings). Category names that already exist in the built-in list receive the
116
+ extra verbs appended with duplicates removed. New category names are created.
117
+
118
+ ```json
119
+ {
120
+ "revision": ["posit", "maintain"],
121
+ "custom": ["allege", "impute"]
122
+ }
123
+ ```
124
+
125
+ Built-in verb categories and their default lemmas:
126
+
127
+ | Category | Default lemmas |
128
+ |----------|---------------|
129
+ | `argumentation` | argue, dispute, contend, refute, contest, challenge, oppose |
130
+ | `assertion` | claim, state, declare, assert, report, attribute, ascribe |
131
+ | `epistemic_uncertainty` | believe, think, suppose, assume, suspect, doubt, question, suggest |
132
+ | `inference` | conclude, deduce, infer, imply, indicate, derive, propose |
133
+ | `revision` | revise, reassign, reattribute, reconsider, overturn, correct, update |
134
+
135
+ ### `--sections-file` (JSON array)
136
+
137
+ A JSON array of lowercase keyword strings. A section is considered
138
+ interpretive if any keyword appears as a substring of its header
139
+ (case-insensitive). Keywords are appended to the built-in list; duplicates
140
+ are ignored.
141
+
142
+ ```json
143
+ ["analysis", "interpretation", "methodology", "significance"]
144
+ ```
145
+
146
+ Built-in interpretive keywords: `attribution`, `provenance`, `dating`,
147
+ `controversy`, `authenticity`, `authorship`, `historiography`, `reception`,
148
+ `debate`, `forgery`, `misattribution`, `reattribution`.
149
+
150
+ ---
151
+
152
+ ## Output structure
153
+
154
+ ```
155
+ <output>/
156
+ ├── index.json
157
+ └── <Category_Name>_<hash>/
158
+ └── <Article_Title>_<hash>/
159
+ ├── full_text.txt
160
+ ├── sections.json
161
+ ├── wikidata.json
162
+ └── candidates.json
163
+ ```
164
+
165
+ Directory names are derived from the category and article title by replacing
166
+ spaces with underscores, removing characters outside `[a-zA-Z0-9_-]`, and
167
+ appending an 8-character MD5 hash of the original name to guarantee
168
+ uniqueness.
169
+
170
+ ### Per-article files
171
+
172
+ | File | Content |
173
+ |------|---------|
174
+ | `full_text.txt` | Plain-text extract of the full article as returned by the Wikipedia API. |
175
+ | `sections.json` | JSON array of section objects, each with `"header"` (string) and `"text"` (string) keys. |
176
+ | `wikidata.json` | JSON object with `"qid"`, `"labels"`, and `"properties"` keys, or `null` if no Wikidata entity was found. |
177
+ | `candidates.json` | JSON array of candidate-sentence objects (see below). |
178
+
179
+ ### `index.json` fields
180
+
181
+ The root `index.json` contains one object per saved article:
182
+
183
+ | Field | Type | Description |
184
+ |-------|------|-------------|
185
+ | `title` | string | Wikipedia article title. |
186
+ | `url` | string | Full URL of the Wikipedia article. |
187
+ | `category` | string | Category label as supplied on the command line. |
188
+ | `qid` | string or null | Wikidata item identifier (e.g. `"Q12418"`), or `null`. |
189
+ | `has_target_properties` | boolean | Whether the entity satisfies the property filter. Always `true` when no property spec is given. |
190
+ | `candidate_sentence_count` | integer | Number of candidate sentences extracted from the article. |
191
+ | `interpretive_section_count` | integer | Number of interpretive sections found. |
192
+
193
+ ### `candidates.json` fields
194
+
195
+ Each entry in `candidates.json` describes one candidate sentence:
196
+
197
+ | Field | Type | Description |
198
+ |-------|------|-------------|
199
+ | `sentence` | string | The full sentence text. |
200
+ | `section_header` | string | Header of the section containing the sentence. |
201
+ | `verb_category` | string | Name of the first matched verb category (e.g. `"assertion"`). |
202
+ | `matched_verbs` | array of strings | All matched verb lemmas found in the sentence. |
203
+ | `negated` | boolean | `true` if a negation word (`not`, `never`, `no`) immediately precedes any matched verb. |
204
+
205
+ ---
206
+
207
+ ## Pipeline description
208
+
209
+ The pipeline has four stages:
210
+
211
+ 1. **Retrieve** — `get_category_articles` queries the Wikipedia
212
+ `categorymembers` API to list all articles in each requested category.
213
+ `get_article_content` fetches the plain-text extract and internal links
214
+ for each article. Section boundaries are detected by wiki-markup headers
215
+ (`== Header ==`) and returned as a structured list.
216
+
217
+ 2. **Align** — `get_wikidata_entity` queries the Wikidata API for the entity
218
+ linked to each Wikipedia article. If `--filter-wikidata` is active,
219
+ articles with no entity (or whose entity does not satisfy the property
220
+ spec) are dropped. The entity's QID, labels, and property values are
221
+ stored alongside the article data.
222
+
223
+ 3. **Filter** — `get_interpretive_sections` selects sections whose headers
224
+ match at least one keyword from the active keyword list. If
225
+ `--filter-sections` is active, articles with no matching sections are
226
+ dropped. `get_candidate_sentences` tokenises, POS-tags, and lemmatises
227
+ sentences within interpretive sections (falling back to all sections if
228
+ none are found), then retains only those containing at least one
229
+ epistemic verb from the active verb-category dictionary. If
230
+ `--filter-sentences` is active, articles with no candidate sentences are
231
+ dropped.
232
+
233
+ 4. **Write** — `save_article` creates the per-article directory and writes
234
+ the four output files. `save_index` writes the root `index.json`
235
+ summarising every saved article.
236
+
237
+ ---
238
+
239
+ ## Reproducibility
240
+
241
+ All filtering parameters are runtime-configurable through CLI flags and JSON
242
+ configuration files. The tool contains no hard-coded domain assumptions: the
243
+ built-in keyword and verb lists are defaults that can be extended or replaced
244
+ without modifying source code. To reproduce a corpus exactly, record:
245
+
246
+ - The exact `--categories` or `--categories-file` input.
247
+ - The `--wikidata-properties` spec string.
248
+ - The contents of any `--verbs-file` or `--sections-file` used.
249
+ - The exact set of `--add-verbs` and `--add-sections` tokens.
250
+ - The Wikipedia API snapshot date (the API returns the current live version
251
+ of articles; consider archiving `full_text.txt` for long-term
252
+ reproducibility).
253
+
254
+ The filtering logic is deterministic given fixed inputs: no randomness is
255
+ introduced at any stage.
256
+
257
+ ---
258
+
259
+ ## Summary output
260
+
261
+ After the pipeline completes, a summary is printed to stdout:
262
+
263
+ ```
264
+ Categories processed : 2
265
+ Articles retrieved : 147
266
+ After Wikidata filter: 132
267
+ Wikidata properties : P170! | P571?
268
+ After section filter : 89
269
+ Section keywords : 12 active (+2 custom: analysis, interpretation)
270
+ After sentence filter: 61
271
+ Verb categories : argumentation(7), assertion(7), epistemic_uncertainty(8), inference(7), revision(7)
272
+ Candidate sentences : 430
273
+ Output directory : /absolute/path/to/corpus
274
+ ```
275
+
276
+ | Field | Description |
277
+ |-------|-------------|
278
+ | Categories processed | Number of category names processed. |
279
+ | Articles retrieved | Total articles found across all categories. |
280
+ | After Wikidata filter | Articles remaining after the Wikidata filter (equals retrieved count if `--filter-wikidata` is off). |
281
+ | Wikidata properties | Active property spec, or `none (all entities retained)`. |
282
+ | After section filter | Articles remaining after the section filter. |
283
+ | Section keywords | Count of active keywords and any custom additions. |
284
+ | After sentence filter | Articles remaining after the sentence filter. |
285
+ | Verb categories | Each active category name and the number of lemmas it contains. |
286
+ | Candidate sentences | Total candidate sentences across all saved articles. |
287
+ | Output directory | Absolute path to the output root. |
@@ -0,0 +1,266 @@
1
+ # wikicorpus
2
+
3
+ ## Abstract
4
+
5
+ wikicorpus is a Python command-line tool for constructing text corpora from
6
+ Wikipedia categories, designed to support knowledge extraction and natural
7
+ language processing research. Given one or more Wikipedia category names, the
8
+ tool retrieves all member articles, aligns each article with its Wikidata
9
+ entity, filters sections by configurable interpretive-content keywords, and
10
+ extracts candidate sentences containing epistemic verbs (verbs that signal
11
+ belief, attribution, argumentation, or uncertainty). All filtering parameters
12
+ are runtime-configurable: researchers can supply custom verb lexicons and
13
+ section-keyword lists without modifying source code, making the tool
14
+ reproducible across different domains and corpora. Output is written to a
15
+ structured directory tree with per-article files and a root index, suitable
16
+ for downstream parsing, annotation, or training pipelines.
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ pip install -e .
24
+ ```
25
+
26
+ Python 3.10 or later is required. The only runtime dependencies are
27
+ `requests` and `nltk`; NLTK corpora are downloaded automatically on first
28
+ use of the sentence-filtering step.
29
+
30
+ ---
31
+
32
+ ## Quick-start examples
33
+
34
+ ### Retrieve all articles in a single category with no filtering
35
+
36
+ ```bash
37
+ wikicorpus --categories "Painting forgeries" --output ./corpus
38
+ ```
39
+
40
+ ### Retrieve two categories, keeping only articles with a Wikidata entity
41
+ This example also requires that the entity has the creator property (P170):
42
+
43
+ ```bash
44
+ wikicorpus \
45
+ --categories "Painting forgeries | Document forgeries" \
46
+ --output ./corpus \
47
+ --filter-wikidata \
48
+ --wikidata-properties "P170!" \
49
+ --verbose
50
+ ```
51
+
52
+ ### Full pipeline with interpretive-section and sentence filtering
53
+ This example also adds custom section keywords and additional verb lemmas:
54
+
55
+ ```bash
56
+ wikicorpus \
57
+ --categories-file categories.txt \
58
+ --output ./corpus \
59
+ --filter-wikidata \
60
+ --wikidata-properties "P170! | P571?" \
61
+ --filter-sections \
62
+ --add-sections "analysis | methodology" \
63
+ --filter-sentences \
64
+ --add-verbs "posit | allege | maintain" \
65
+ --verbose
66
+ ```
67
+
68
+ ---
69
+
70
+ ## CLI reference
71
+
72
+ | Flag | Type / default | Description |
73
+ |------|----------------|-------------|
74
+ | `--categories` | string / — | Quoted string of Wikipedia category names separated by ` \| `. At least one of `--categories` or `--categories-file` is required. |
75
+ | `--categories-file` | path / — | Path to a plain-text file with one category name per line. Lines beginning with `#` and blank lines are ignored. |
76
+ | `--output` | path / `./corpus` | Root output directory. Created if it does not exist. |
77
+ | `--filter-wikidata` | flag / off | Skip articles with no matching Wikidata entity. When `--wikidata-properties` is also given, additionally enforce the property constraints. |
78
+ | `--wikidata-properties` | string / — | Pipe-separated Wikidata property IDs. Suffix each ID with `!` (mandatory: all must be present) or `?` (optional: at least one must be present). Bare IDs are treated as mandatory. Example: `"P170! | P571?"`. Has no filtering effect unless `--filter-wikidata` is also set. |
79
+ | `--filter-sections` | flag / off | Skip articles that contain no interpretive sections (as determined by the active section-keyword list). |
80
+ | `--filter-sentences` | flag / off | Skip articles that contain no candidate epistemic sentences. |
81
+ | `--verbs-file` | path / — | Path to a JSON file mapping verb-category names to lists of verb lemmas. Merged into the built-in categories: existing categories receive extra verbs; new category names are created. See format specification below. |
82
+ | `--add-verbs` | string / — | Pipe-separated verb lemmas to add to the `custom` verb category on top of whatever `--verbs-file` provides. Example: `"allege \| posit \| maintain"`. |
83
+ | `--sections-file` | path / — | Path to a JSON file containing an array of section-header keywords. Appended to the built-in list; duplicates are ignored. See format specification below. |
84
+ | `--add-sections` | string / — | Pipe-separated section-header keywords to add on top of `--sections-file` and the built-in list. Example: `"analysis \| interpretation"`. |
85
+ | `--verbose` | flag / off | Enable INFO-level progress messages written to stderr. |
86
+
87
+ ---
88
+
89
+ ## File format specifications
90
+
91
+ ### `--verbs-file` (JSON object)
92
+
93
+ A JSON object mapping verb-category names (strings) to lists of verb lemmas
94
+ (strings). Category names that already exist in the built-in list receive the
95
+ extra verbs appended with duplicates removed. New category names are created.
96
+
97
+ ```json
98
+ {
99
+ "revision": ["posit", "maintain"],
100
+ "custom": ["allege", "impute"]
101
+ }
102
+ ```
103
+
104
+ Built-in verb categories and their default lemmas:
105
+
106
+ | Category | Default lemmas |
107
+ |----------|---------------|
108
+ | `argumentation` | argue, dispute, contend, refute, contest, challenge, oppose |
109
+ | `assertion` | claim, state, declare, assert, report, attribute, ascribe |
110
+ | `epistemic_uncertainty` | believe, think, suppose, assume, suspect, doubt, question, suggest |
111
+ | `inference` | conclude, deduce, infer, imply, indicate, derive, propose |
112
+ | `revision` | revise, reassign, reattribute, reconsider, overturn, correct, update |
113
+
114
+ ### `--sections-file` (JSON array)
115
+
116
+ A JSON array of lowercase keyword strings. A section is considered
117
+ interpretive if any keyword appears as a substring of its header
118
+ (case-insensitive). Keywords are appended to the built-in list; duplicates
119
+ are ignored.
120
+
121
+ ```json
122
+ ["analysis", "interpretation", "methodology", "significance"]
123
+ ```
124
+
125
+ Built-in interpretive keywords: `attribution`, `provenance`, `dating`,
126
+ `controversy`, `authenticity`, `authorship`, `historiography`, `reception`,
127
+ `debate`, `forgery`, `misattribution`, `reattribution`.
128
+
129
+ ---
130
+
131
+ ## Output structure
132
+
133
+ ```
134
+ <output>/
135
+ ├── index.json
136
+ └── <Category_Name>_<hash>/
137
+ └── <Article_Title>_<hash>/
138
+ ├── full_text.txt
139
+ ├── sections.json
140
+ ├── wikidata.json
141
+ └── candidates.json
142
+ ```
143
+
144
+ Directory names are derived from the category and article title by replacing
145
+ spaces with underscores, removing characters outside `[a-zA-Z0-9_-]`, and
146
+ appending an 8-character MD5 hash of the original name to guarantee
147
+ uniqueness.
148
+
149
+ ### Per-article files
150
+
151
+ | File | Content |
152
+ |------|---------|
153
+ | `full_text.txt` | Plain-text extract of the full article as returned by the Wikipedia API. |
154
+ | `sections.json` | JSON array of section objects, each with `"header"` (string) and `"text"` (string) keys. |
155
+ | `wikidata.json` | JSON object with `"qid"`, `"labels"`, and `"properties"` keys, or `null` if no Wikidata entity was found. |
156
+ | `candidates.json` | JSON array of candidate-sentence objects (see below). |
157
+
158
+ ### `index.json` fields
159
+
160
+ The root `index.json` contains one object per saved article:
161
+
162
+ | Field | Type | Description |
163
+ |-------|------|-------------|
164
+ | `title` | string | Wikipedia article title. |
165
+ | `url` | string | Full URL of the Wikipedia article. |
166
+ | `category` | string | Category label as supplied on the command line. |
167
+ | `qid` | string or null | Wikidata item identifier (e.g. `"Q12418"`), or `null`. |
168
+ | `has_target_properties` | boolean | Whether the entity satisfies the property filter. Always `true` when no property spec is given. |
169
+ | `candidate_sentence_count` | integer | Number of candidate sentences extracted from the article. |
170
+ | `interpretive_section_count` | integer | Number of interpretive sections found. |
171
+
172
+ ### `candidates.json` fields
173
+
174
+ Each entry in `candidates.json` describes one candidate sentence:
175
+
176
+ | Field | Type | Description |
177
+ |-------|------|-------------|
178
+ | `sentence` | string | The full sentence text. |
179
+ | `section_header` | string | Header of the section containing the sentence. |
180
+ | `verb_category` | string | Name of the first matched verb category (e.g. `"assertion"`). |
181
+ | `matched_verbs` | array of strings | All matched verb lemmas found in the sentence. |
182
+ | `negated` | boolean | `true` if a negation word (`not`, `never`, `no`) immediately precedes any matched verb. |
183
+
184
+ ---
185
+
186
+ ## Pipeline description
187
+
188
+ The pipeline has four stages:
189
+
190
+ 1. **Retrieve** — `get_category_articles` queries the Wikipedia
191
+ `categorymembers` API to list all articles in each requested category.
192
+ `get_article_content` fetches the plain-text extract and internal links
193
+ for each article. Section boundaries are detected by wiki-markup headers
194
+ (`== Header ==`) and returned as a structured list.
195
+
196
+ 2. **Align** — `get_wikidata_entity` queries the Wikidata API for the entity
197
+ linked to each Wikipedia article. If `--filter-wikidata` is active,
198
+ articles with no entity (or whose entity does not satisfy the property
199
+ spec) are dropped. The entity's QID, labels, and property values are
200
+ stored alongside the article data.
201
+
202
+ 3. **Filter** — `get_interpretive_sections` selects sections whose headers
203
+ match at least one keyword from the active keyword list. If
204
+ `--filter-sections` is active, articles with no matching sections are
205
+ dropped. `get_candidate_sentences` tokenises, POS-tags, and lemmatises
206
+ sentences within interpretive sections (falling back to all sections if
207
+ none are found), then retains only those containing at least one
208
+ epistemic verb from the active verb-category dictionary. If
209
+ `--filter-sentences` is active, articles with no candidate sentences are
210
+ dropped.
211
+
212
+ 4. **Write** — `save_article` creates the per-article directory and writes
213
+ the four output files. `save_index` writes the root `index.json`
214
+ summarising every saved article.
215
+
216
+ ---
217
+
218
+ ## Reproducibility
219
+
220
+ All filtering parameters are runtime-configurable through CLI flags and JSON
221
+ configuration files. The tool contains no hard-coded domain assumptions: the
222
+ built-in keyword and verb lists are defaults that can be extended or replaced
223
+ without modifying source code. To reproduce a corpus exactly, record:
224
+
225
+ - The exact `--categories` or `--categories-file` input.
226
+ - The `--wikidata-properties` spec string.
227
+ - The contents of any `--verbs-file` or `--sections-file` used.
228
+ - The exact set of `--add-verbs` and `--add-sections` tokens.
229
+ - The Wikipedia API snapshot date (the API returns the current live version
230
+ of articles; consider archiving `full_text.txt` for long-term
231
+ reproducibility).
232
+
233
+ The filtering logic is deterministic given fixed inputs: no randomness is
234
+ introduced at any stage.
235
+
236
+ ---
237
+
238
+ ## Summary output
239
+
240
+ After the pipeline completes, a summary is printed to stdout:
241
+
242
+ ```
243
+ Categories processed : 2
244
+ Articles retrieved : 147
245
+ After Wikidata filter: 132
246
+ Wikidata properties : P170! | P571?
247
+ After section filter : 89
248
+ Section keywords : 12 active (+2 custom: analysis, interpretation)
249
+ After sentence filter: 61
250
+ Verb categories : argumentation(7), assertion(7), epistemic_uncertainty(8), inference(7), revision(7)
251
+ Candidate sentences : 430
252
+ Output directory : /absolute/path/to/corpus
253
+ ```
254
+
255
+ | Field | Description |
256
+ |-------|-------------|
257
+ | Categories processed | Number of category names processed. |
258
+ | Articles retrieved | Total articles found across all categories. |
259
+ | After Wikidata filter | Articles remaining after the Wikidata filter (equals retrieved count if `--filter-wikidata` is off). |
260
+ | Wikidata properties | Active property spec, or `none (all entities retained)`. |
261
+ | After section filter | Articles remaining after the section filter. |
262
+ | Section keywords | Count of active keywords and any custom additions. |
263
+ | After sentence filter | Articles remaining after the sentence filter. |
264
+ | Verb categories | Each active category name and the number of lemmas it contains. |
265
+ | Candidate sentences | Total candidate sentences across all saved articles. |
266
+ | Output directory | Absolute path to the output root. |
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "wikicorpus"
7
+ version = "0.1.0"
8
+ description = "Retrieve and filter Wikipedia articles to build a corpus for knowledge extraction"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Andrea Schimmenti", email = "andschimmenti@gmail.com" },
14
+ ]
15
+ keywords = [
16
+ "wikipedia",
17
+ "corpus",
18
+ "nlp",
19
+ "knowledge-extraction",
20
+ "wikidata",
21
+ "text-mining",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 3 - Alpha",
25
+ "Intended Audience :: Science/Research",
26
+ "License :: OSI Approved :: MIT License",
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Topic :: Scientific/Engineering :: Information Analysis",
32
+ "Topic :: Text Processing :: Linguistic",
33
+ ]
34
+ dependencies = [
35
+ "requests",
36
+ "nltk",
37
+ ]
38
+
39
+ [project.scripts]
40
+ wikicorpus = "wikicorpus.cli:main"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["wikicorpus"]
44
+
45
+ [tool.hatch.build.targets.sdist]
46
+ exclude = [
47
+ ".claude/",
48
+ "issues.md",
49
+ "corpus/",
50
+ "corpus_test/",
51
+ "dist/",
52
+ ]
File without changes