wp2txt 2.1.0 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6542679abdbb9ac3e8c00581ce7c82b583c742ef0425f9f1ccd3eab619598c1b
4
- data.tar.gz: d822011ec24cd6d512cb9725880b4780daec5b9ce401caafbae9e8df5e8593a5
3
+ metadata.gz: 464bf436280592e916e565d24cacdbde13c925ceaa9b390a2b36d4835053a323
4
+ data.tar.gz: 205a30ed5e9193d974f3a93d04f42ca55f9a241c0215a2755567c949776a9c3b
5
5
  SHA512:
6
- metadata.gz: 8ffad99cceab4797a03e857203ebe0cd4f5df8e592d30920c975cb89e1d079709356810466bec32027b8be8de026138c1c579ffe97f0da47d6a90d799ba60222
7
- data.tar.gz: 312f68371040f86384cb2bd01a68e178f2adbd97c3bdb71ec75ccaf7cf8a47c3fec733ca3874abcb16d30776222c313b1b04f393a24b62f119dad0217c35ed3c
6
+ metadata.gz: d6cf5dbef0e429802a5f66450e43598b94615c92fb144bc6630d44469ba73030002ec539049babd56271ef9c50d84ac22d7b90172e47d112b78676b68aca3324
7
+ data.tar.gz: 04c368cb623116823bd1a036fe760f38bb1bce78fa3ba6d15ea1db13f00ad7224a1e67b3904a5e1a9b1f329bcc7a8805255ad811ddd2925299c2d58a5968a8ac
data/CHANGELOG.md CHANGED
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.1.1] - 2026-02-21
9
+
10
+ - **Bidirectional alias matching**: Section extraction now supports reverse alias lookup: specifying an alias name (e.g., "Synopsis") as the target matches the canonical heading ("Plot"), and vice versa
11
+ - **Expanded default section aliases**: Increased from 2 to 12 alias groups covering common English Wikipedia sections (Plot, Reception, References, Bibliography, Awards, Legacy, Early life, Career, etc.)
12
+ - **Config forwarding fix**: `--pre`, `--ref`, `--expand-templates`, and `--metadata-only` options are now correctly forwarded in `--articles` and `--from-category` modes
13
+
8
14
  ## [2.1.0] - 2026-02-19
9
15
 
10
16
  - **SQLite-based caching infrastructure**: New high-performance caching using SQLite for faster startup and repeated operations:
data/README.md CHANGED
@@ -10,14 +10,14 @@ English | [日本語](README_ja.md)
10
10
  # Install
11
11
  gem install wp2txt
12
12
 
13
- # Extract text from Japanese Wikipedia (auto-download)
14
- wp2txt --lang=ja -o ./output
13
+ # Extract text from English Wikipedia (auto-download)
14
+ wp2txt --lang=en -o ./output
15
15
 
16
16
  # Extract specific articles
17
- wp2txt --lang=ja --articles="東京,京都" -o ./articles
17
+ wp2txt --lang=en --articles="Tokyo,Kyoto" -o ./articles
18
18
 
19
19
  # Extract articles from a category
20
- wp2txt --lang=ja --from-category="日本の都市" -o ./cities
20
+ wp2txt --lang=en --from-category="Cities in Japan" -o ./cities
21
21
  ```
22
22
 
23
23
  ## About
@@ -80,27 +80,27 @@ The `wp2txt` command is available inside the container. Use `/data` for input/ou
80
80
 
81
81
  ### Auto-download and process (Recommended)
82
82
 
83
- $ wp2txt --lang=ja -o ./text
83
+ $ wp2txt --lang=en -o ./text
84
84
 
85
- This automatically downloads the Japanese Wikipedia dump and extracts plain text. Downloads are cached in `~/.wp2txt/cache/`.
85
+ This automatically downloads the English Wikipedia dump and extracts plain text. Downloads are cached in `~/.wp2txt/cache/`.
86
86
 
87
87
  ### Extract specific articles by title
88
88
 
89
- $ wp2txt --lang=ja --articles="認知言語学,生成文法" -o ./articles
89
+ $ wp2txt --lang=en --articles="Cognitive linguistics,Generative grammar" -o ./articles
90
90
 
91
91
  Only the index file and necessary data streams are downloaded, making it much faster than processing the full dump.
92
92
 
93
93
  ### Extract articles from a category
94
94
 
95
- $ wp2txt --lang=ja --from-category="日本の都市" -o ./cities
95
+ $ wp2txt --lang=en --from-category="Cities in Japan" -o ./cities
96
96
 
97
97
  Include subcategories with `--depth`:
98
98
 
99
- $ wp2txt --lang=ja --from-category="日本の都市" --depth=2 -o ./cities
99
+ $ wp2txt --lang=en --from-category="Cities in Japan" --depth=2 -o ./cities
100
100
 
101
101
  Preview without downloading (shows article counts):
102
102
 
103
- $ wp2txt --lang=ja --from-category="日本の都市" --dry-run
103
+ $ wp2txt --lang=en --from-category="Cities in Japan" --dry-run
104
104
 
105
105
  ### Process local dump file
106
106
 
@@ -109,22 +109,29 @@ Preview without downloading (shows article counts):
109
109
  ### Other extraction modes
110
110
 
111
111
  # Category info only (title + categories)
112
- $ wp2txt -g --lang=ja -o ./category
112
+ $ wp2txt -g --lang=en -o ./category
113
113
 
114
114
  # Summary only (title + categories + opening paragraphs)
115
- $ wp2txt -s --lang=ja -o ./summary
115
+ $ wp2txt -s --lang=en -o ./summary
116
116
 
117
117
  # Metadata only (title + section headings + categories)
118
- $ wp2txt -M --lang=ja --format json -o ./metadata
118
+ $ wp2txt -M --lang=en --format json -o ./metadata
119
119
 
120
- # Extract specific sections (comma-separated, 'summary' for lead text)
121
- $ wp2txt --lang=en --sections="summary,Plot,Reception" --format json -o ./sections
120
+ # Extract specific sections from particular articles (fast)
121
+ # Section names are case-insensitive; alias matching is enabled by default
122
+ $ wp2txt --lang=en --articles="Tokyo" --sections="summary,history,geography" --format json -o ./sections
122
123
 
123
- # Section heading statistics
124
- $ wp2txt --lang=ja --section-stats -o ./stats
124
+ # Extract specific sections from a category (moderate)
125
+ $ wp2txt --lang=en --from-category="Cities in Japan" --sections="summary,history" --format json -o ./sections
126
+
127
+ # Extract specific sections from full dump (slow - processes all articles)
128
+ $ wp2txt --lang=en --sections="summary,plot,reception" --format json -o ./sections
129
+
130
+ # Section heading statistics (useful for discovering section names before extraction)
131
+ $ wp2txt --lang=en --section-stats -o ./stats
125
132
 
126
133
  # JSON/JSONL output
127
- $ wp2txt --format json --lang=ja -o ./json
134
+ $ wp2txt --format json --lang=en -o ./json
128
135
 
129
136
  ## Sample Output
130
137
 
@@ -156,7 +163,7 @@ For redirect articles:
156
163
 
157
164
  $ wp2txt --cache-status # Show cache status
158
165
  $ wp2txt --cache-clear # Clear all cache
159
- $ wp2txt --cache-clear --lang=ja # Clear cache for Japanese only
166
+ $ wp2txt --cache-clear --lang=en # Clear cache for English only
160
167
  $ wp2txt --update-cache # Force fresh download
161
168
 
162
169
  When cache exceeds the expiry period (default: 30 days), wp2txt displays a warning but allows using cached data.
@@ -260,7 +267,7 @@ Supported: `{{cite book}}`, `{{cite web}}`, `{{cite news}}`, `{{cite journal}}`,
260
267
  -M, --metadata-only Extract only title, headings, and categories
261
268
 
262
269
  Section extraction:
263
- -S, --sections=<s> Extract specific sections (comma-separated)
270
+ -S, --sections=<s> Extract specific sections (comma-separated, case-insensitive)
264
271
  --section-output=<s> Output mode: structured or combined (default: structured)
265
272
  --min-section-length=<i> Minimum section length in characters (default: 0)
266
273
  --skip-empty Skip articles with no matching sections
data/README_ja.md CHANGED
@@ -117,10 +117,17 @@ docker run -it -v /path/to/localdata:/data yohasebe/wp2txt
117
117
  # メタデータのみ(タイトル + セクション見出し + カテゴリ)
118
118
  $ wp2txt -M --lang=ja --format json -o ./metadata
119
119
 
120
- # 特定セクションを抽出(カンマ区切り、'summary'で冒頭テキスト)
121
- $ wp2txt --lang=ja --sections="概要,歴史,関連項目" --format json -o ./sections
120
+ # 特定記事から特定セクションを抽出(高速)
121
+ # セクション名は大文字小文字を区別しません。エイリアスマッチングもデフォルトで有効です
122
+ $ wp2txt --lang=ja --articles="東京" --sections="summary,概要,歴史" --format json -o ./sections
122
123
 
123
- # セクション見出しの統計
124
+ # カテゴリ内の記事から特定セクションを抽出(中速)
125
+ $ wp2txt --lang=ja --from-category="日本の都市" --sections="summary,概要,歴史" --format json -o ./sections
126
+
127
+ # フルダンプから特定セクションを抽出(低速 - 全記事を処理)
128
+ $ wp2txt --lang=ja --sections="summary,概要,歴史,関連項目" --format json -o ./sections
129
+
130
+ # セクション見出しの統計(抽出前のセクション名の調査に便利)
124
131
  $ wp2txt --lang=ja --section-stats -o ./stats
125
132
 
126
133
  # JSON/JSONL出力
@@ -260,7 +267,7 @@ CATEGORIES: カテゴリ1, カテゴリ2, カテゴリ3
260
267
  -M, --metadata-only タイトル、見出し、カテゴリのみ抽出
261
268
 
262
269
  セクション抽出:
263
- -S, --sections=<s> 特定セクションを抽出(カンマ区切り)
270
+ -S, --sections=<s> 特定セクションを抽出(カンマ区切り、大文字小文字区別なし)
264
271
  --section-output=<s> 出力モード: structured または combined(デフォルト: structured)
265
272
  --min-section-length=<i> 最小セクション長(文字数)(デフォルト: 0)
266
273
  --skip-empty 該当セクションのない記事をスキップ
@@ -253,8 +253,8 @@ module Wp2txt
253
253
  bz2_gem: opts[:bz2_gem]
254
254
  }
255
255
 
256
- %i[title list heading table redirect multiline category category_only
257
- summary_only marker extract_citations].each do |opt|
256
+ %i[title list heading table pre ref redirect multiline category category_only
257
+ summary_only metadata_only marker extract_citations expand_templates].each do |opt|
258
258
  config[opt] = opts[opt]
259
259
  end
260
260
 
@@ -11,9 +11,21 @@ module Wp2txt
11
11
  SUMMARY_KEY = "summary"
12
12
 
13
13
  # Default section aliases (canonical name => array of aliases)
14
+ # These cover common variations found across English Wikipedia articles.
15
+ # Users can add custom aliases via --alias-file for other languages or domains.
14
16
  DEFAULT_ALIASES = {
15
- "Plot" => ["Synopsis"],
16
- "Reception" => ["Critical reception"]
17
+ "Plot" => ["Synopsis", "Plot summary", "Story"],
18
+ "Reception" => ["Critical reception", "Reviews", "Critical response"],
19
+ "References" => ["Notes", "Footnotes", "Citations", "Notes and references"],
20
+ "External links" => ["External sources"],
21
+ "See also" => ["Related articles", "Related pages"],
22
+ "Bibliography" => ["Works", "Publications", "Selected works", "Selected bibliography"],
23
+ "Awards" => ["Awards and nominations", "Honors", "Accolades"],
24
+ "Legacy" => ["Impact", "Influence", "Cultural impact", "Cultural legacy"],
25
+ "Early life" => ["Early life and education", "Childhood", "Early years"],
26
+ "Career" => ["Professional career"],
27
+ "Filmography" => ["Films"],
28
+ "Discography" => ["Discography and videography"]
17
29
  }.freeze
18
30
 
19
31
  # Track which actual headings matched which requested sections
@@ -230,19 +242,22 @@ module Wp2txt
230
242
  end
231
243
 
232
244
  # Find canonical name for a heading (handles aliases)
245
+ # Supports bidirectional alias matching:
246
+ # - Target is canonical name, heading matches an alias (e.g., target="plot" matches "Synopsis")
247
+ # - Target is an alias, heading matches canonical or another alias in the same group
248
+ # (e.g., target="synopsis" matches "Plot" or "Plot summary")
233
249
  # @param heading [String] The actual heading text from the article
234
250
  # @param record_match [Boolean] Whether to record the match for tracking
235
- # @return [String, nil] The canonical (requested) section name, or nil
251
+ # @return [String, nil] The target section name as specified by the user, or nil
236
252
  def find_canonical_name(heading, record_match: true)
237
253
  return nil if heading.nil? || heading.empty?
238
254
  return nil if @targets.nil?
239
255
 
240
256
  heading_lower = heading.downcase.strip
241
257
 
242
- # Direct match
258
+ # Direct match (target name == heading name)
243
259
  @targets.each do |target|
244
260
  if target.downcase == heading_lower
245
- # Record direct match (only if heading differs in case)
246
261
  if @track_matches && record_match && target != heading
247
262
  @matched_sections[target] = heading
248
263
  end
@@ -250,20 +265,26 @@ module Wp2txt
250
265
  end
251
266
  end
252
267
 
253
- # Alias match
254
268
  return nil unless @use_aliases
255
269
 
256
- @aliases.each do |canonical, alias_list|
257
- next unless @targets.any? { |t| t.downcase == canonical.downcase }
270
+ @targets.each do |target|
271
+ target_lower = target.downcase
258
272
 
259
- if alias_list.any? { |a| a.downcase == heading_lower }
260
- # Return the target that matches canonical
261
- target = @targets.find { |t| t.downcase == canonical.downcase }
262
- # Record alias match
263
- if @track_matches && record_match && target
264
- @matched_sections[target] = heading
273
+ @aliases.each do |canonical, alias_list|
274
+ canonical_lower = canonical.downcase
275
+ aliases_lower = alias_list.map(&:downcase)
276
+ all_names = [canonical_lower] + aliases_lower
277
+
278
+ # Check if this target belongs to this alias group
279
+ next unless all_names.include?(target_lower)
280
+
281
+ # Check if the heading matches any name in the same group
282
+ if all_names.include?(heading_lower) && target_lower != heading_lower
283
+ if @track_matches && record_match
284
+ @matched_sections[target] = heading
285
+ end
286
+ return target
265
287
  end
266
- return target
267
288
  end
268
289
  end
269
290
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Wp2txt
4
- VERSION = "2.1.0"
4
+ VERSION = "2.1.1"
5
5
  end
@@ -239,13 +239,13 @@ RSpec.describe "Wp2txt Multistream" do
239
239
 
240
240
  describe "#initialize" do
241
241
  it "loads the index file" do
242
- index = described_class.new(index_path)
242
+ index = described_class.new(index_path, cache_dir: temp_dir)
243
243
  expect(index.size).to eq(4)
244
244
  end
245
245
  end
246
246
 
247
247
  describe "#find_by_title" do
248
- let(:index) { described_class.new(index_path) }
248
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
249
249
 
250
250
  it "finds article by exact title" do
251
251
  result = index.find_by_title("Article One")
@@ -268,7 +268,7 @@ RSpec.describe "Wp2txt Multistream" do
268
268
  end
269
269
 
270
270
  describe "#find_by_id" do
271
- let(:index) { described_class.new(index_path) }
271
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
272
272
 
273
273
  it "finds article by page ID" do
274
274
  result = index.find_by_id(2)
@@ -283,7 +283,7 @@ RSpec.describe "Wp2txt Multistream" do
283
283
  end
284
284
 
285
285
  describe "#articles_in_stream" do
286
- let(:index) { described_class.new(index_path) }
286
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
287
287
 
288
288
  it "returns articles at given byte offset" do
289
289
  articles = index.articles_in_stream(100)
@@ -298,7 +298,7 @@ RSpec.describe "Wp2txt Multistream" do
298
298
  end
299
299
 
300
300
  describe "#stream_offset_for" do
301
- let(:index) { described_class.new(index_path) }
301
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
302
302
 
303
303
  it "returns byte offset for article" do
304
304
  offset = index.stream_offset_for("Article Three")
@@ -312,7 +312,7 @@ RSpec.describe "Wp2txt Multistream" do
312
312
  end
313
313
 
314
314
  describe "#random_articles" do
315
- let(:index) { described_class.new(index_path) }
315
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
316
316
 
317
317
  it "returns requested number of random articles" do
318
318
  articles = index.random_articles(2)
@@ -326,7 +326,7 @@ RSpec.describe "Wp2txt Multistream" do
326
326
  end
327
327
 
328
328
  describe "#first_articles" do
329
- let(:index) { described_class.new(index_path) }
329
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
330
330
 
331
331
  it "returns first N articles" do
332
332
  articles = index.first_articles(2)
@@ -335,7 +335,7 @@ RSpec.describe "Wp2txt Multistream" do
335
335
  end
336
336
 
337
337
  describe "#stream_offsets" do
338
- let(:index) { described_class.new(index_path) }
338
+ let(:index) { described_class.new(index_path, cache_dir: temp_dir) }
339
339
 
340
340
  it "returns unique sorted offsets" do
341
341
  offsets = index.stream_offsets
@@ -447,7 +447,7 @@ RSpec.describe "Wp2txt Multistream" do
447
447
 
448
448
  describe "#initialize" do
449
449
  it "creates reader with paths" do
450
- reader = described_class.new(multistream_path, index_path)
450
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
451
451
  expect(reader.multistream_path).to eq(multistream_path)
452
452
  expect(reader.index).to be_a(Wp2txt::MultistreamIndex)
453
453
  end
@@ -456,7 +456,7 @@ RSpec.describe "Wp2txt Multistream" do
456
456
  describe "#extract_article" do
457
457
  it "returns nil for non-existent article" do
458
458
  # Without actual bz2 file, can't extract, but should handle gracefully
459
- reader = described_class.new(multistream_path, index_path)
459
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
460
460
  # Will return nil because file doesn't exist
461
461
  expect { reader.extract_article("Non Existent") }.not_to raise_error
462
462
  end
@@ -464,13 +464,13 @@ RSpec.describe "Wp2txt Multistream" do
464
464
 
465
465
  describe "#extract_articles_parallel" do
466
466
  it "handles empty titles array" do
467
- reader = described_class.new(multistream_path, index_path)
467
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
468
468
  result = reader.extract_articles_parallel([], num_processes: 2)
469
469
  expect(result).to eq({})
470
470
  end
471
471
 
472
472
  it "handles titles not in index" do
473
- reader = described_class.new(multistream_path, index_path)
473
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
474
474
  result = reader.extract_articles_parallel(["Non Existent"], num_processes: 2)
475
475
  expect(result).to eq({})
476
476
  end
@@ -478,13 +478,13 @@ RSpec.describe "Wp2txt Multistream" do
478
478
 
479
479
  describe "#each_article_parallel" do
480
480
  it "returns an enumerator when no block given" do
481
- reader = described_class.new(multistream_path, index_path)
481
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
482
482
  result = reader.each_article_parallel([], num_processes: 2)
483
483
  expect(result).to be_an(Enumerator)
484
484
  end
485
485
 
486
486
  it "handles empty entries array" do
487
- reader = described_class.new(multistream_path, index_path)
487
+ reader = described_class.new(multistream_path, index_path, cache_dir: temp_dir)
488
488
  pages = []
489
489
  reader.each_article_parallel([], num_processes: 2) { |page| pages << page }
490
490
  expect(pages).to eq([])
@@ -161,6 +161,76 @@ RSpec.describe Wp2txt::SectionExtractor do
161
161
  end
162
162
  end
163
163
 
164
+ describe "bidirectional alias matching" do
165
+ let(:wiki_with_plot) do
166
+ <<~WIKI
167
+ Summary.
168
+
169
+ == Plot ==
170
+ The story begins...
171
+ WIKI
172
+ end
173
+ let(:plot_article) { Wp2txt::Article.new(wiki_with_plot, "Film") }
174
+
175
+ let(:wiki_with_synopsis) do
176
+ <<~WIKI
177
+ Summary.
178
+
179
+ == Synopsis ==
180
+ The story follows...
181
+ WIKI
182
+ end
183
+ let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Movie") }
184
+
185
+ let(:wiki_with_reviews) do
186
+ <<~WIKI
187
+ Summary.
188
+
189
+ == Reviews ==
190
+ Critics praised...
191
+ WIKI
192
+ end
193
+ let(:reviews_article) { Wp2txt::Article.new(wiki_with_reviews, "Album") }
194
+
195
+ context "when target is canonical name" do
196
+ let(:extractor) { described_class.new(["Plot"]) }
197
+
198
+ it "matches alias heading (Synopsis)" do
199
+ sections = extractor.extract_sections(synopsis_article)
200
+ expect(sections["Plot"]).to include("story follows")
201
+ end
202
+ end
203
+
204
+ context "when target is an alias name" do
205
+ let(:extractor) { described_class.new(["Synopsis"]) }
206
+
207
+ it "matches canonical heading (Plot)" do
208
+ sections = extractor.extract_sections(plot_article)
209
+ expect(sections["Synopsis"]).to include("story begins")
210
+ end
211
+ end
212
+
213
+ context "when target is one alias and heading is another alias in the same group" do
214
+ let(:extractor) { described_class.new(["Reviews"]) }
215
+
216
+ it "matches Critical reception heading via shared alias group" do
217
+ wiki = "== Critical reception ==\nWell received."
218
+ art = Wp2txt::Article.new(wiki, "Work")
219
+ sections = extractor.extract_sections(art)
220
+ expect(sections["Reviews"]).to include("Well received")
221
+ end
222
+ end
223
+
224
+ context "when aliases are disabled" do
225
+ let(:extractor) { described_class.new(["Synopsis"], use_aliases: false) }
226
+
227
+ it "does not match canonical name (Plot)" do
228
+ sections = extractor.extract_sections(plot_article)
229
+ expect(sections["Synopsis"]).to be_nil
230
+ end
231
+ end
232
+ end
233
+
164
234
  describe "case-insensitive matching" do
165
235
  let(:extractor) { described_class.new(["early life", "CAREER"]) }
166
236
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe