wp2txt 2.1.0 → 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +27 -20
- data/README_ja.md +11 -4
- data/lib/wp2txt/extractor.rb +2 -2
- data/lib/wp2txt/section_extractor.rb +36 -15
- data/lib/wp2txt/version.rb +1 -1
- data/spec/multistream_spec.rb +14 -14
- data/spec/section_extractor_spec.rb +70 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 464bf436280592e916e565d24cacdbde13c925ceaa9b390a2b36d4835053a323
|
|
4
|
+
data.tar.gz: 205a30ed5e9193d974f3a93d04f42ca55f9a241c0215a2755567c949776a9c3b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d6cf5dbef0e429802a5f66450e43598b94615c92fb144bc6630d44469ba73030002ec539049babd56271ef9c50d84ac22d7b90172e47d112b78676b68aca3324
|
|
7
|
+
data.tar.gz: 04c368cb623116823bd1a036fe760f38bb1bce78fa3ba6d15ea1db13f00ad7224a1e67b3904a5e1a9b1f329bcc7a8805255ad811ddd2925299c2d58a5968a8ac
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [2.1.1] - 2026-02-21
|
|
9
|
+
|
|
10
|
+
- **Bidirectional alias matching**: Section extraction now supports reverse alias lookup - specifying an alias name (e.g., "Synopsis") as target matches the canonical heading ("Plot") and vice versa
|
|
11
|
+
- **Expanded default section aliases**: Increased from 2 to 12 alias groups covering common English Wikipedia sections (Plot, Reception, References, Bibliography, Awards, Legacy, Early life, Career, etc.)
|
|
12
|
+
- **Config forwarding fix**: `--pre`, `--ref`, `--expand-templates`, and `--metadata-only` options now correctly forwarded in `--articles` and `--from-category` modes
|
|
13
|
+
|
|
8
14
|
## [2.1.0] - 2026-02-19
|
|
9
15
|
|
|
10
16
|
- **SQLite-based caching infrastructure**: New high-performance caching using SQLite for faster startup and repeated operations:
|
data/README.md
CHANGED
|
@@ -10,14 +10,14 @@ English | [日本語](README_ja.md)
|
|
|
10
10
|
# Install
|
|
11
11
|
gem install wp2txt
|
|
12
12
|
|
|
13
|
-
# Extract text from
|
|
14
|
-
wp2txt --lang=
|
|
13
|
+
# Extract text from English Wikipedia (auto-download)
|
|
14
|
+
wp2txt --lang=en -o ./output
|
|
15
15
|
|
|
16
16
|
# Extract specific articles
|
|
17
|
-
wp2txt --lang=
|
|
17
|
+
wp2txt --lang=en --articles="Tokyo,Kyoto" -o ./articles
|
|
18
18
|
|
|
19
19
|
# Extract articles from a category
|
|
20
|
-
wp2txt --lang=
|
|
20
|
+
wp2txt --lang=en --from-category="Cities in Japan" -o ./cities
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
## About
|
|
@@ -80,27 +80,27 @@ The `wp2txt` command is available inside the container. Use `/data` for input/ou
|
|
|
80
80
|
|
|
81
81
|
### Auto-download and process (Recommended)
|
|
82
82
|
|
|
83
|
-
$ wp2txt --lang=
|
|
83
|
+
$ wp2txt --lang=en -o ./text
|
|
84
84
|
|
|
85
|
-
This automatically downloads the
|
|
85
|
+
This automatically downloads the English Wikipedia dump and extracts plain text. Downloads are cached in `~/.wp2txt/cache/`.
|
|
86
86
|
|
|
87
87
|
### Extract specific articles by title
|
|
88
88
|
|
|
89
|
-
$ wp2txt --lang=
|
|
89
|
+
$ wp2txt --lang=en --articles="Cognitive linguistics,Generative grammar" -o ./articles
|
|
90
90
|
|
|
91
91
|
Only the index file and necessary data streams are downloaded, making it much faster than processing the full dump.
|
|
92
92
|
|
|
93
93
|
### Extract articles from a category
|
|
94
94
|
|
|
95
|
-
$ wp2txt --lang=
|
|
95
|
+
$ wp2txt --lang=en --from-category="Cities in Japan" -o ./cities
|
|
96
96
|
|
|
97
97
|
Include subcategories with `--depth`:
|
|
98
98
|
|
|
99
|
-
$ wp2txt --lang=
|
|
99
|
+
$ wp2txt --lang=en --from-category="Cities in Japan" --depth=2 -o ./cities
|
|
100
100
|
|
|
101
101
|
Preview without downloading (shows article counts):
|
|
102
102
|
|
|
103
|
-
$ wp2txt --lang=
|
|
103
|
+
$ wp2txt --lang=en --from-category="Cities in Japan" --dry-run
|
|
104
104
|
|
|
105
105
|
### Process local dump file
|
|
106
106
|
|
|
@@ -109,22 +109,29 @@ Preview without downloading (shows article counts):
|
|
|
109
109
|
### Other extraction modes
|
|
110
110
|
|
|
111
111
|
# Category info only (title + categories)
|
|
112
|
-
$ wp2txt -g --lang=
|
|
112
|
+
$ wp2txt -g --lang=en -o ./category
|
|
113
113
|
|
|
114
114
|
# Summary only (title + categories + opening paragraphs)
|
|
115
|
-
$ wp2txt -s --lang=
|
|
115
|
+
$ wp2txt -s --lang=en -o ./summary
|
|
116
116
|
|
|
117
117
|
# Metadata only (title + section headings + categories)
|
|
118
|
-
$ wp2txt -M --lang=
|
|
118
|
+
$ wp2txt -M --lang=en --format json -o ./metadata
|
|
119
119
|
|
|
120
|
-
# Extract specific sections
|
|
121
|
-
|
|
120
|
+
# Extract specific sections from particular articles (fast)
|
|
121
|
+
# Section names are case-insensitive; alias matching is enabled by default
|
|
122
|
+
$ wp2txt --lang=en --articles="Tokyo" --sections="summary,history,geography" --format json -o ./sections
|
|
122
123
|
|
|
123
|
-
#
|
|
124
|
-
$ wp2txt --lang=
|
|
124
|
+
# Extract specific sections from a category (moderate)
|
|
125
|
+
$ wp2txt --lang=en --from-category="Cities in Japan" --sections="summary,history" --format json -o ./sections
|
|
126
|
+
|
|
127
|
+
# Extract specific sections from full dump (slow - processes all articles)
|
|
128
|
+
$ wp2txt --lang=en --sections="summary,plot,reception" --format json -o ./sections
|
|
129
|
+
|
|
130
|
+
# Section heading statistics (useful for discovering section names before extraction)
|
|
131
|
+
$ wp2txt --lang=en --section-stats -o ./stats
|
|
125
132
|
|
|
126
133
|
# JSON/JSONL output
|
|
127
|
-
$ wp2txt --format json --lang=
|
|
134
|
+
$ wp2txt --format json --lang=en -o ./json
|
|
128
135
|
|
|
129
136
|
## Sample Output
|
|
130
137
|
|
|
@@ -156,7 +163,7 @@ For redirect articles:
|
|
|
156
163
|
|
|
157
164
|
$ wp2txt --cache-status # Show cache status
|
|
158
165
|
$ wp2txt --cache-clear # Clear all cache
|
|
159
|
-
$ wp2txt --cache-clear --lang=
|
|
166
|
+
$ wp2txt --cache-clear --lang=en # Clear cache for English only
|
|
160
167
|
$ wp2txt --update-cache # Force fresh download
|
|
161
168
|
|
|
162
169
|
When cache exceeds the expiry period (default: 30 days), wp2txt displays a warning but allows using cached data.
|
|
@@ -260,7 +267,7 @@ Supported: `{{cite book}}`, `{{cite web}}`, `{{cite news}}`, `{{cite journal}}`,
|
|
|
260
267
|
-M, --metadata-only Extract only title, headings, and categories
|
|
261
268
|
|
|
262
269
|
Section extraction:
|
|
263
|
-
-S, --sections=<s> Extract specific sections (comma-separated)
|
|
270
|
+
-S, --sections=<s> Extract specific sections (comma-separated, case-insensitive)
|
|
264
271
|
--section-output=<s> Output mode: structured or combined (default: structured)
|
|
265
272
|
--min-section-length=<i> Minimum section length in characters (default: 0)
|
|
266
273
|
--skip-empty Skip articles with no matching sections
|
data/README_ja.md
CHANGED
|
@@ -117,10 +117,17 @@ docker run -it -v /path/to/localdata:/data yohasebe/wp2txt
|
|
|
117
117
|
# メタデータのみ(タイトル + セクション見出し + カテゴリ)
|
|
118
118
|
$ wp2txt -M --lang=ja --format json -o ./metadata
|
|
119
119
|
|
|
120
|
-
#
|
|
121
|
-
|
|
120
|
+
# 特定記事から特定セクションを抽出(高速)
|
|
121
|
+
# セクション名は大文字小文字を区別しません。エイリアスマッチングもデフォルトで有効です
|
|
122
|
+
$ wp2txt --lang=ja --articles="東京" --sections="summary,概要,歴史" --format json -o ./sections
|
|
122
123
|
|
|
123
|
-
#
|
|
124
|
+
# カテゴリ内の記事から特定セクションを抽出(中速)
|
|
125
|
+
$ wp2txt --lang=ja --from-category="日本の都市" --sections="summary,概要,歴史" --format json -o ./sections
|
|
126
|
+
|
|
127
|
+
# フルダンプから特定セクションを抽出(低速 - 全記事を処理)
|
|
128
|
+
$ wp2txt --lang=ja --sections="summary,概要,歴史,関連項目" --format json -o ./sections
|
|
129
|
+
|
|
130
|
+
# セクション見出しの統計(抽出前のセクション名の調査に便利)
|
|
124
131
|
$ wp2txt --lang=ja --section-stats -o ./stats
|
|
125
132
|
|
|
126
133
|
# JSON/JSONL出力
|
|
@@ -260,7 +267,7 @@ CATEGORIES: カテゴリ1, カテゴリ2, カテゴリ3
|
|
|
260
267
|
-M, --metadata-only タイトル、見出し、カテゴリのみ抽出
|
|
261
268
|
|
|
262
269
|
セクション抽出:
|
|
263
|
-
-S, --sections=<s>
|
|
270
|
+
-S, --sections=<s> 特定セクションを抽出(カンマ区切り、大文字小文字区別なし)
|
|
264
271
|
--section-output=<s> 出力モード: structured または combined(デフォルト: structured)
|
|
265
272
|
--min-section-length=<i> 最小セクション長(文字数)(デフォルト: 0)
|
|
266
273
|
--skip-empty 該当セクションのない記事をスキップ
|
data/lib/wp2txt/extractor.rb
CHANGED
|
@@ -253,8 +253,8 @@ module Wp2txt
|
|
|
253
253
|
bz2_gem: opts[:bz2_gem]
|
|
254
254
|
}
|
|
255
255
|
|
|
256
|
-
%i[title list heading table redirect multiline category category_only
|
|
257
|
-
summary_only marker extract_citations].each do |opt|
|
|
256
|
+
%i[title list heading table pre ref redirect multiline category category_only
|
|
257
|
+
summary_only metadata_only marker extract_citations expand_templates].each do |opt|
|
|
258
258
|
config[opt] = opts[opt]
|
|
259
259
|
end
|
|
260
260
|
|
|
data/lib/wp2txt/section_extractor.rb
CHANGED
|
@@ -11,9 +11,21 @@ module Wp2txt
|
|
|
11
11
|
SUMMARY_KEY = "summary"
|
|
12
12
|
|
|
13
13
|
# Default section aliases (canonical name => array of aliases)
|
|
14
|
+
# These cover common variations found across English Wikipedia articles.
|
|
15
|
+
# Users can add custom aliases via --alias-file for other languages or domains.
|
|
14
16
|
DEFAULT_ALIASES = {
|
|
15
|
-
"Plot" => ["Synopsis"],
|
|
16
|
-
"Reception" => ["Critical reception"]
|
|
17
|
+
"Plot" => ["Synopsis", "Plot summary", "Story"],
|
|
18
|
+
"Reception" => ["Critical reception", "Reviews", "Critical response"],
|
|
19
|
+
"References" => ["Notes", "Footnotes", "Citations", "Notes and references"],
|
|
20
|
+
"External links" => ["External sources"],
|
|
21
|
+
"See also" => ["Related articles", "Related pages"],
|
|
22
|
+
"Bibliography" => ["Works", "Publications", "Selected works", "Selected bibliography"],
|
|
23
|
+
"Awards" => ["Awards and nominations", "Honors", "Accolades"],
|
|
24
|
+
"Legacy" => ["Impact", "Influence", "Cultural impact", "Cultural legacy"],
|
|
25
|
+
"Early life" => ["Early life and education", "Childhood", "Early years"],
|
|
26
|
+
"Career" => ["Professional career"],
|
|
27
|
+
"Filmography" => ["Films"],
|
|
28
|
+
"Discography" => ["Discography and videography"]
|
|
17
29
|
}.freeze
|
|
18
30
|
|
|
19
31
|
# Track which actual headings matched which requested sections
|
|
@@ -230,19 +242,22 @@ module Wp2txt
|
|
|
230
242
|
end
|
|
231
243
|
|
|
232
244
|
# Find canonical name for a heading (handles aliases)
|
|
245
|
+
# Supports bidirectional alias matching:
|
|
246
|
+
# - Target is canonical name, heading matches an alias (e.g., target="plot" matches "Synopsis")
|
|
247
|
+
# - Target is an alias, heading matches canonical or another alias in the same group
|
|
248
|
+
# (e.g., target="synopsis" matches "Plot" or "Plot summary")
|
|
233
249
|
# @param heading [String] The actual heading text from the article
|
|
234
250
|
# @param record_match [Boolean] Whether to record the match for tracking
|
|
235
|
-
# @return [String, nil] The
|
|
251
|
+
# @return [String, nil] The target section name as specified by the user, or nil
|
|
236
252
|
def find_canonical_name(heading, record_match: true)
|
|
237
253
|
return nil if heading.nil? || heading.empty?
|
|
238
254
|
return nil if @targets.nil?
|
|
239
255
|
|
|
240
256
|
heading_lower = heading.downcase.strip
|
|
241
257
|
|
|
242
|
-
# Direct match
|
|
258
|
+
# Direct match (target name == heading name)
|
|
243
259
|
@targets.each do |target|
|
|
244
260
|
if target.downcase == heading_lower
|
|
245
|
-
# Record direct match (only if heading differs in case)
|
|
246
261
|
if @track_matches && record_match && target != heading
|
|
247
262
|
@matched_sections[target] = heading
|
|
248
263
|
end
|
|
@@ -250,20 +265,26 @@ module Wp2txt
|
|
|
250
265
|
end
|
|
251
266
|
end
|
|
252
267
|
|
|
253
|
-
# Alias match
|
|
254
268
|
return nil unless @use_aliases
|
|
255
269
|
|
|
256
|
-
@
|
|
257
|
-
|
|
270
|
+
@targets.each do |target|
|
|
271
|
+
target_lower = target.downcase
|
|
258
272
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
273
|
+
@aliases.each do |canonical, alias_list|
|
|
274
|
+
canonical_lower = canonical.downcase
|
|
275
|
+
aliases_lower = alias_list.map(&:downcase)
|
|
276
|
+
all_names = [canonical_lower] + aliases_lower
|
|
277
|
+
|
|
278
|
+
# Check if this target belongs to this alias group
|
|
279
|
+
next unless all_names.include?(target_lower)
|
|
280
|
+
|
|
281
|
+
# Check if the heading matches any name in the same group
|
|
282
|
+
if all_names.include?(heading_lower) && target_lower != heading_lower
|
|
283
|
+
if @track_matches && record_match
|
|
284
|
+
@matched_sections[target] = heading
|
|
285
|
+
end
|
|
286
|
+
return target
|
|
265
287
|
end
|
|
266
|
-
return target
|
|
267
288
|
end
|
|
268
289
|
end
|
|
269
290
|
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/multistream_spec.rb
CHANGED
|
@@ -239,13 +239,13 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
239
239
|
|
|
240
240
|
describe "#initialize" do
|
|
241
241
|
it "loads the index file" do
|
|
242
|
-
index = described_class.new(index_path)
|
|
242
|
+
index = described_class.new(index_path, cache_dir: temp_dir)
|
|
243
243
|
expect(index.size).to eq(4)
|
|
244
244
|
end
|
|
245
245
|
end
|
|
246
246
|
|
|
247
247
|
describe "#find_by_title" do
|
|
248
|
-
let(:index) { described_class.new(index_path) }
|
|
248
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
249
249
|
|
|
250
250
|
it "finds article by exact title" do
|
|
251
251
|
result = index.find_by_title("Article One")
|
|
@@ -268,7 +268,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
268
268
|
end
|
|
269
269
|
|
|
270
270
|
describe "#find_by_id" do
|
|
271
|
-
let(:index) { described_class.new(index_path) }
|
|
271
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
272
272
|
|
|
273
273
|
it "finds article by page ID" do
|
|
274
274
|
result = index.find_by_id(2)
|
|
@@ -283,7 +283,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
283
283
|
end
|
|
284
284
|
|
|
285
285
|
describe "#articles_in_stream" do
|
|
286
|
-
let(:index) { described_class.new(index_path) }
|
|
286
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
287
287
|
|
|
288
288
|
it "returns articles at given byte offset" do
|
|
289
289
|
articles = index.articles_in_stream(100)
|
|
@@ -298,7 +298,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
298
298
|
end
|
|
299
299
|
|
|
300
300
|
describe "#stream_offset_for" do
|
|
301
|
-
let(:index) { described_class.new(index_path) }
|
|
301
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
302
302
|
|
|
303
303
|
it "returns byte offset for article" do
|
|
304
304
|
offset = index.stream_offset_for("Article Three")
|
|
@@ -312,7 +312,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
312
312
|
end
|
|
313
313
|
|
|
314
314
|
describe "#random_articles" do
|
|
315
|
-
let(:index) { described_class.new(index_path) }
|
|
315
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
316
316
|
|
|
317
317
|
it "returns requested number of random articles" do
|
|
318
318
|
articles = index.random_articles(2)
|
|
@@ -326,7 +326,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
326
326
|
end
|
|
327
327
|
|
|
328
328
|
describe "#first_articles" do
|
|
329
|
-
let(:index) { described_class.new(index_path) }
|
|
329
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
330
330
|
|
|
331
331
|
it "returns first N articles" do
|
|
332
332
|
articles = index.first_articles(2)
|
|
@@ -335,7 +335,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
335
335
|
end
|
|
336
336
|
|
|
337
337
|
describe "#stream_offsets" do
|
|
338
|
-
let(:index) { described_class.new(index_path) }
|
|
338
|
+
let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
|
|
339
339
|
|
|
340
340
|
it "returns unique sorted offsets" do
|
|
341
341
|
offsets = index.stream_offsets
|
|
@@ -447,7 +447,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
447
447
|
|
|
448
448
|
describe "#initialize" do
|
|
449
449
|
it "creates reader with paths" do
|
|
450
|
-
reader = described_class.new(multistream_path, index_path)
|
|
450
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
451
451
|
expect(reader.multistream_path).to eq(multistream_path)
|
|
452
452
|
expect(reader.index).to be_a(Wp2txt::MultistreamIndex)
|
|
453
453
|
end
|
|
@@ -456,7 +456,7 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
456
456
|
describe "#extract_article" do
|
|
457
457
|
it "returns nil for non-existent article" do
|
|
458
458
|
# Without actual bz2 file, can't extract, but should handle gracefully
|
|
459
|
-
reader = described_class.new(multistream_path, index_path)
|
|
459
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
460
460
|
# Will return nil because file doesn't exist
|
|
461
461
|
expect { reader.extract_article("Non Existent") }.not_to raise_error
|
|
462
462
|
end
|
|
@@ -464,13 +464,13 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
464
464
|
|
|
465
465
|
describe "#extract_articles_parallel" do
|
|
466
466
|
it "handles empty titles array" do
|
|
467
|
-
reader = described_class.new(multistream_path, index_path)
|
|
467
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
468
468
|
result = reader.extract_articles_parallel([], num_processes: 2)
|
|
469
469
|
expect(result).to eq({})
|
|
470
470
|
end
|
|
471
471
|
|
|
472
472
|
it "handles titles not in index" do
|
|
473
|
-
reader = described_class.new(multistream_path, index_path)
|
|
473
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
474
474
|
result = reader.extract_articles_parallel(["Non Existent"], num_processes: 2)
|
|
475
475
|
expect(result).to eq({})
|
|
476
476
|
end
|
|
@@ -478,13 +478,13 @@ RSpec.describe "Wp2txt Multistream" do
|
|
|
478
478
|
|
|
479
479
|
describe "#each_article_parallel" do
|
|
480
480
|
it "returns an enumerator when no block given" do
|
|
481
|
-
reader = described_class.new(multistream_path, index_path)
|
|
481
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
482
482
|
result = reader.each_article_parallel([], num_processes: 2)
|
|
483
483
|
expect(result).to be_an(Enumerator)
|
|
484
484
|
end
|
|
485
485
|
|
|
486
486
|
it "handles empty entries array" do
|
|
487
|
-
reader = described_class.new(multistream_path, index_path)
|
|
487
|
+
reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
|
|
488
488
|
pages = []
|
|
489
489
|
reader.each_article_parallel([], num_processes: 2) { |page| pages << page }
|
|
490
490
|
expect(pages).to eq([])
|
|
data/spec/section_extractor_spec.rb
CHANGED
|
@@ -161,6 +161,76 @@ RSpec.describe Wp2txt::SectionExtractor do
|
|
|
161
161
|
end
|
|
162
162
|
end
|
|
163
163
|
|
|
164
|
+
describe "bidirectional alias matching" do
|
|
165
|
+
let(:wiki_with_plot) do
|
|
166
|
+
<<~WIKI
|
|
167
|
+
Summary.
|
|
168
|
+
|
|
169
|
+
== Plot ==
|
|
170
|
+
The story begins...
|
|
171
|
+
WIKI
|
|
172
|
+
end
|
|
173
|
+
let(:plot_article) { Wp2txt::Article.new(wiki_with_plot, "Film") }
|
|
174
|
+
|
|
175
|
+
let(:wiki_with_synopsis) do
|
|
176
|
+
<<~WIKI
|
|
177
|
+
Summary.
|
|
178
|
+
|
|
179
|
+
== Synopsis ==
|
|
180
|
+
The story follows...
|
|
181
|
+
WIKI
|
|
182
|
+
end
|
|
183
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
|
|
184
|
+
|
|
185
|
+
let(:wiki_with_reviews) do
|
|
186
|
+
<<~WIKI
|
|
187
|
+
Summary.
|
|
188
|
+
|
|
189
|
+
== Reviews ==
|
|
190
|
+
Critics praised...
|
|
191
|
+
WIKI
|
|
192
|
+
end
|
|
193
|
+
let(:reviews_article) { Wp2txt::Article.new(wiki_with_reviews, "Album") }
|
|
194
|
+
|
|
195
|
+
context "when target is canonical name" do
|
|
196
|
+
let(:extractor) { described_class.new(["Plot"]) }
|
|
197
|
+
|
|
198
|
+
it "matches alias heading (Synopsis)" do
|
|
199
|
+
sections = extractor.extract_sections(synopsis_article)
|
|
200
|
+
expect(sections["Plot"]).to include("story follows")
|
|
201
|
+
end
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
context "when target is an alias name" do
|
|
205
|
+
let(:extractor) { described_class.new(["Synopsis"]) }
|
|
206
|
+
|
|
207
|
+
it "matches canonical heading (Plot)" do
|
|
208
|
+
sections = extractor.extract_sections(plot_article)
|
|
209
|
+
expect(sections["Synopsis"]).to include("story begins")
|
|
210
|
+
end
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
context "when target is one alias and heading is another alias in the same group" do
|
|
214
|
+
let(:extractor) { described_class.new(["Reviews"]) }
|
|
215
|
+
|
|
216
|
+
it "matches Critical reception heading via shared alias group" do
|
|
217
|
+
wiki = "== Critical reception ==\nWell received."
|
|
218
|
+
art = Wp2txt::Article.new(wiki, "Work")
|
|
219
|
+
sections = extractor.extract_sections(art)
|
|
220
|
+
expect(sections["Reviews"]).to include("Well received")
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
context "when aliases are disabled" do
|
|
225
|
+
let(:extractor) { described_class.new(["Synopsis"], use_aliases: false) }
|
|
226
|
+
|
|
227
|
+
it "does not match canonical name (Plot)" do
|
|
228
|
+
sections = extractor.extract_sections(plot_article)
|
|
229
|
+
expect(sections["Synopsis"]).to be_nil
|
|
230
|
+
end
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
164
234
|
describe "case-insensitive matching" do
|
|
165
235
|
let(:extractor) { described_class.new(["early life", "CAREER"]) }
|
|
166
236
|
|