wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6542679abdbb9ac3e8c00581ce7c82b583c742ef0425f9f1ccd3eab619598c1b
|
|
4
|
+
data.tar.gz: d822011ec24cd6d512cb9725880b4780daec5b9ce401caafbae9e8df5e8593a5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8ffad99cceab4797a03e857203ebe0cd4f5df8e592d30920c975cb89e1d079709356810466bec32027b8be8de026138c1c579ffe97f0da47d6a90d799ba60222
|
|
7
|
+
data.tar.gz: 312f68371040f86384cb2bd01a68e178f2adbd97c3bdb71ec75ccaf7cf8a47c3fec733ca3874abcb16d30776222c313b1b04f393a24b62f119dad0217c35ed3c
|
data/.dockerignore
CHANGED
|
@@ -1,8 +1,20 @@
|
|
|
1
1
|
.git
|
|
2
2
|
.github
|
|
3
|
+
.claude
|
|
3
4
|
image
|
|
4
5
|
pkg
|
|
5
6
|
spec
|
|
7
|
+
coverage
|
|
8
|
+
tmp
|
|
9
|
+
benchmark_results
|
|
10
|
+
data/output_samples
|
|
11
|
+
scripts
|
|
6
12
|
.dockerignore
|
|
7
13
|
.gitignore
|
|
14
|
+
.solargraph.yml
|
|
15
|
+
.rubocop.yml
|
|
8
16
|
Gemfile.lock
|
|
17
|
+
CLAUDE.md
|
|
18
|
+
DEVELOPMENT.md
|
|
19
|
+
DEVELOPMENT_ja.md
|
|
20
|
+
*.gem
|
data/.github/workflows/ci.yml
CHANGED
|
@@ -12,25 +12,25 @@ jobs:
|
|
|
12
12
|
strategy:
|
|
13
13
|
fail-fast: false
|
|
14
14
|
matrix:
|
|
15
|
-
ruby: [
|
|
15
|
+
ruby: ['3.1', '3.2', '3.3']
|
|
16
|
+
include:
|
|
17
|
+
- ruby: 'head'
|
|
18
|
+
experimental: true
|
|
16
19
|
|
|
17
20
|
runs-on: ubuntu-latest
|
|
18
|
-
name: Ruby ${{matrix.ruby}}
|
|
19
|
-
|
|
21
|
+
name: Ruby ${{ matrix.ruby }}
|
|
22
|
+
continue-on-error: ${{ matrix.experimental || false }}
|
|
20
23
|
|
|
21
24
|
steps:
|
|
22
|
-
- uses: actions/checkout@
|
|
25
|
+
- uses: actions/checkout@v4
|
|
23
26
|
|
|
24
|
-
- name:
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
- name: Install rspec
|
|
31
|
-
run: gem install rspec
|
|
27
|
+
- name: Set up Ruby
|
|
28
|
+
uses: ruby/setup-ruby@v1
|
|
29
|
+
with:
|
|
30
|
+
ruby-version: ${{ matrix.ruby }}
|
|
31
|
+
bundler-cache: true
|
|
32
32
|
|
|
33
33
|
- name: Run tests
|
|
34
|
-
run: rspec
|
|
34
|
+
run: bundle exec rspec
|
|
35
35
|
|
|
36
36
|
|
data/.gitignore
CHANGED
|
@@ -19,3 +19,17 @@ tmp
|
|
|
19
19
|
*.bak
|
|
20
20
|
*.~
|
|
21
21
|
tags
|
|
22
|
+
|
|
23
|
+
# Claude Code development files
|
|
24
|
+
CLAUDE.md
|
|
25
|
+
|
|
26
|
+
# Error logs
|
|
27
|
+
error_log.txt
|
|
28
|
+
spec/examples.txt
|
|
29
|
+
IMPROVEMENTS.md
|
|
30
|
+
|
|
31
|
+
# Generated data
|
|
32
|
+
benchmark_results/
|
|
33
|
+
|
|
34
|
+
# Developer-specific Ruby version
|
|
35
|
+
.ruby-version
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [2.1.0] - 2026-02-19
|
|
9
|
+
|
|
10
|
+
- **SQLite-based caching infrastructure**: New high-performance caching using SQLite for faster startup and repeated operations:
|
|
11
|
+
- `GlobalDataCache`: Caches parsed JSON data files (templates, MediaWiki aliases, HTML entities)
|
|
12
|
+
- Eliminates ~500KB JSON parsing overhead on each startup
|
|
13
|
+
- Validates cache against source file modification time and size
|
|
14
|
+
- Location: `~/.wp2txt/cache/global_data.sqlite3`
|
|
15
|
+
- `CategoryCache`: Caches Wikipedia category hierarchy from API
|
|
16
|
+
- Stores category members (pages and subcategories) in SQLite tables
|
|
17
|
+
- Supports recursive tree traversal and bulk page retrieval
|
|
18
|
+
- Per-language cache files: `~/.wp2txt/cache/categories_en.sqlite3`
|
|
19
|
+
- Configurable expiry (default: 7 days)
|
|
20
|
+
- `IndexCache`: Caches parsed multistream index (already existed, now with SQLite3 2.x compatibility)
|
|
21
|
+
- Reduces index parsing from ~10 minutes to seconds on subsequent runs
|
|
22
|
+
- All caches use WAL mode for concurrent read access during parallel processing
|
|
23
|
+
|
|
24
|
+
- **Ractor parallel processing (Ruby 4.0+)**: New `--ractor` option for thread-based parallelism:
|
|
25
|
+
- Requires Ruby 4.0 or later for stable operation
|
|
26
|
+
- Uses map-join-value pattern for reliable Ractor orchestration
|
|
27
|
+
- ~2x speedup compared to sequential processing
|
|
28
|
+
- Lower memory footprint than process-based parallelism (Parallel gem)
|
|
29
|
+
- Automatic fallback to sequential processing on Ruby 3.x
|
|
30
|
+
- Performance: the Parallel gem (~3x) remains faster, while Ractor (~2x) uses less memory
|
|
31
|
+
|
|
32
|
+
- **Template expansion**: New `--expand-templates` (`-E`) option expands common templates to readable text:
|
|
33
|
+
- Date templates: `{{birth date|1990|5|15}}` → "May 15, 1990"
|
|
34
|
+
- Convert templates: `{{convert|100|km|mi}}` → "100 km (62 mi)"
|
|
35
|
+
- Coordinate templates: `{{coord|35|41|N|139|41|E}}` → "35°41′N 139°41′E"
|
|
36
|
+
- Language templates: `{{lang|ja|日本語}}` → "日本語"
|
|
37
|
+
- Quote templates: `{{blockquote|text}}` → "text"
|
|
38
|
+
- And 20+ more template types
|
|
39
|
+
- **Enabled by default** - use `--no-expand-templates` to disable
|
|
40
|
+
- Parser functions support: `{{#if:}}`, `{{#switch:}}`, `{{#ifeq:}}`, `{{#expr:}}`
|
|
41
|
+
- Magic words support: `{{PAGENAME}}`, `{{CURRENTYEAR}}`, `{{NAMESPACE}}`
|
|
42
|
+
|
|
43
|
+
- **Removed legacy test data**: Deleted obsolete static test files:
|
|
44
|
+
- `data/testdata_en.bz2` (2.8MB, from 2022)
|
|
45
|
+
- `data/testdata_ja.bz2` (2.6MB, from 2022)
|
|
46
|
+
- `data/output_samples/` directory (~20MB)
|
|
47
|
+
- Tests now use live Wikipedia data with caching
|
|
48
|
+
|
|
49
|
+
- **Incremental dump downloads**: Smart handling of partial dump files when downloading full dumps:
|
|
50
|
+
- Detects existing partial downloads and offers to resume (download only remaining data)
|
|
51
|
+
- Validates dump dates - if dates match, can resume; if outdated, offers choices
|
|
52
|
+
- User options: resume download, download fresh, keep old partial, or use old as-is
|
|
53
|
+
- Automatic bz2 validation before and after incremental download
|
|
54
|
+
- Falls back to full download if server doesn't support HTTP Range headers
|
|
55
|
+
|
|
56
|
+
- **bz2 file validation**: New `Bz2Validator` module detects corrupt or invalid bz2 files before processing:
|
|
57
|
+
- Validates magic bytes (`BZ`), version byte (`h`), and block size (`1`-`9`)
|
|
58
|
+
- Optional decompression test to verify file integrity
|
|
59
|
+
- `StreamProcessor` validates bz2 files by default (configurable via `validate_bz2: false`)
|
|
60
|
+
- Detailed error types: `not_found`, `too_small`, `invalid_magic`, `invalid_version`, `invalid_block_size`, `decompression_failed`
|
|
61
|
+
|
|
62
|
+
- **Memory monitoring**: New `MemoryMonitor` module for adaptive resource management:
|
|
63
|
+
- Cross-platform memory detection (Linux, macOS, Windows)
|
|
64
|
+
- Adaptive buffer sizing based on available memory
|
|
65
|
+
- Memory statistics: `current_memory_usage`, `available_memory`, `memory_usage_percent`
|
|
66
|
+
- Automatic garbage collection when memory is low
|
|
67
|
+
|
|
68
|
+
- **Parallel article extraction**: `MultistreamReader` now supports parallel processing:
|
|
69
|
+
- `extract_articles_parallel(titles, num_processes: 4)` - Extract multiple articles in parallel
|
|
70
|
+
- `each_article_parallel(entries, num_processes: 4)` - Iterate with parallel processing
|
|
71
|
+
- Automatically groups articles by stream offset to minimize bz2 decompression overhead
|
|
72
|
+
|
|
73
|
+
- **Performance optimizations**:
|
|
74
|
+
- Pre-compiled 14 additional regex patterns for text cleanup
|
|
75
|
+
- Consolidated gsub chains (3 fewer calls per cleanup operation)
|
|
76
|
+
- Adaptive buffer sizing in `StreamProcessor` based on system memory
|
|
77
|
+
|
|
78
|
+
- **Cache staleness warnings**: Cache status now shows age and staleness information:
|
|
79
|
+
- Displays cache date and age (e.g., "2025-01-05 - 4 days ago")
|
|
80
|
+
- Warns when cache exceeds configured `dump_expiry_days` (default: 30 days)
|
|
81
|
+
- New `--update-cache` (`-U`) option to force refresh of cached dump files
|
|
82
|
+
- Users can choose to use stale cache or force update
|
|
83
|
+
|
|
84
|
+
- **Category-based extraction**: New `--from-category` option extracts all articles from a Wikipedia category:
|
|
85
|
+
- `wp2txt --lang=ja --from-category="日本の都市" -o ./output` extracts all articles in the category
|
|
86
|
+
- `--depth` option for subcategory recursion (e.g., `--depth=2` includes 2 levels of subcategories)
|
|
87
|
+
- `--dry-run` for preview mode (shows article counts without downloading)
|
|
88
|
+
- `--yes` to skip confirmation prompt for automation
|
|
89
|
+
- Circular reference prevention for category hierarchies
|
|
90
|
+
- Rate limiting for Wikipedia API requests
|
|
91
|
+
|
|
92
|
+
- **Configuration file**: New `--config-init` option creates persistent configuration:
|
|
93
|
+
- Settings stored in `~/.wp2txt/config.yml`
|
|
94
|
+
- Configurable: `dump_expiry_days`, `category_expiry_days`, `cache.directory`
|
|
95
|
+
- Default output format and subcategory depth
|
|
96
|
+
- CLI options override config file settings
|
|
97
|
+
|
|
98
|
+
- **Deprecated `--markers=none`**: Complete removal of special content is now deprecated
|
|
99
|
+
- Removing inline content (e.g., math formulas) makes surrounding text nonsensical
|
|
100
|
+
- `--markers=none` now shows a warning and behaves like `--markers=all`
|
|
101
|
+
- Use `--markers=math,code` to show only specific marker types
|
|
102
|
+
|
|
103
|
+
- **CLI option validation**: Extraction modes are now mutually exclusive with clear error messages:
|
|
104
|
+
- `--category-only`, `--summary-only`, `--metadata-only` cannot be combined
|
|
105
|
+
- `--sections` cannot be used with extraction modes
|
|
106
|
+
- `--section-stats` cannot be combined with extraction modes or `--sections`
|
|
107
|
+
|
|
108
|
+
- **Network retry with exponential backoff**: HTTP requests now retry on transient errors:
|
|
109
|
+
- Retries up to 3 times with exponential backoff (2, 4, 8 seconds)
|
|
110
|
+
- Handles timeouts, connection resets, and DNS failures
|
|
111
|
+
- CategoryFetcher API requests now log failures instead of silently returning nil
|
|
112
|
+
|
|
113
|
+
- **Disk full error handling**: OutputWriter now handles `Errno::ENOSPC` gracefully:
|
|
114
|
+
- Raises `Wp2txt::FileIOError` with descriptive message on disk full or I/O errors
|
|
115
|
+
- Properly closes file handles before raising
|
|
116
|
+
|
|
117
|
+
- **File rotation at article boundaries**: OutputWriter `write_from_file` now rotates output files only at blank lines (article boundaries):
|
|
118
|
+
- Prevents articles from being split across output files
|
|
119
|
+
- Eliminates UTF-8 character corruption at file boundaries (e.g., a 3-byte Japanese character split across two output files)
|
|
120
|
+
- Uses line-by-line reading (`each_line` with `"r:UTF-8"`) instead of fixed-size byte chunks
|
|
121
|
+
- Verified with full Japanese Wikipedia (1.49M articles) and English Wikipedia (24.2 GB) dumps
|
|
122
|
+
|
|
123
|
+
- **HTTP timeout consistency**: All HTTP methods in `DumpManager` now use `DEFAULT_HTTP_TIMEOUT`:
|
|
124
|
+
- Added `open_timeout`/`read_timeout` to `download_incremental`, `get_remote_file_size`, `download_file_with_progress`, `download_file_range`
|
|
125
|
+
- Previously these methods had no timeout, risking indefinite hangs on network issues
|
|
126
|
+
|
|
127
|
+
- **Security: Command injection prevention**: All `IO.popen` calls now use array form:
|
|
128
|
+
- Fixed unsafe string interpolation in `wp2txt.rb`, `stream_processor.rb`, `bz2_validator.rb`, `memory_monitor.rb`
|
|
129
|
+
- Prevents shell metacharacter interpretation in file paths
|
|
130
|
+
|
|
131
|
+
- **Security: SSL certificate verification**: Restored proper TLS certificate validation:
|
|
132
|
+
- Removed `verify_callback` that unconditionally returned `true` (7 locations in `multistream.rb`)
|
|
133
|
+
- `VERIFY_PEER` now performs actual certificate verification
|
|
134
|
+
|
|
135
|
+
- **Security: Temp file handling**: `file_mod` now uses `Tempfile` instead of hardcoded `"temp"` filename:
|
|
136
|
+
- Prevents predictable file names and potential race conditions
|
|
137
|
+
- Temp files created in same directory as target file
|
|
138
|
+
|
|
139
|
+
- **CLI option fixes**:
|
|
140
|
+
- Added missing `--table` option (keep wiki table content)
|
|
141
|
+
- Added missing `--multiline` option (keep multi-line templates)
|
|
142
|
+
- Added missing `--pre` option (keep preformatted text blocks)
|
|
143
|
+
- Fixed `--ref` option not being transferred to processing config
|
|
144
|
+
- Reference removal is now conditional (respects `--ref` flag)
|
|
145
|
+
|
|
146
|
+
- **Ractor turbo mode warning**: Shows explicit warning when `--ractor` is used with turbo mode (unsupported combination)
|
|
147
|
+
|
|
148
|
+
- **Constants extraction**: Replaced magic numbers with named constants:
|
|
149
|
+
- `DEFAULT_HTTP_TIMEOUT`, `DEFAULT_PROGRESS_INTERVAL`, `INDEX_PROGRESS_THRESHOLD`
|
|
150
|
+
- `DEFAULT_TOP_N_SECTIONS`, `RESUME_METADATA_MAX_AGE_DAYS`, `MAX_HTTP_RETRIES`
|
|
151
|
+
|
|
152
|
+
- **Marker classification**: Markers now categorized as inline or block
|
|
153
|
+
- **Inline markers** (`[MATH]`, `[CODE]`, `[CHEM]`, `[IPA]`): Content that appears mid-sentence; removal would break grammar
|
|
154
|
+
- **Block markers** (`[TABLE]`, `[CODEBLOCK]`, `[INFOBOX]`, etc.): Standalone content that can be safely removed
|
|
155
|
+
- New `[CODEBLOCK]` marker for `<syntaxhighlight>`, `<source>`, `<pre>` tags (block-level code)
|
|
156
|
+
- `[CODE]` marker now only applies to inline `<code>` tags
|
|
157
|
+
|
|
158
|
+
## [2.0.0] - 2026-01-08
|
|
159
|
+
|
|
160
|
+
### Added
|
|
161
|
+
|
|
162
|
+
- **Auto-download mode**: New `--lang` option automatically downloads Wikipedia dumps:
|
|
163
|
+
- `wp2txt --lang=ja -o ./output` downloads and processes Japanese Wikipedia
|
|
164
|
+
- Downloads cached to `~/.wp2txt/cache/` for reuse
|
|
165
|
+
- Supports any Wikipedia language code (en, ja, de, fr, zh, etc.)
|
|
166
|
+
|
|
167
|
+
- **Article extraction**: New `--articles` option extracts specific articles by title:
|
|
168
|
+
- `wp2txt --lang=en --articles="Tokyo,Kyoto,Osaka" -o ./articles`
|
|
169
|
+
- Only downloads index + needed data streams (efficient partial download)
|
|
170
|
+
- O(1) hash lookup for article search
|
|
171
|
+
|
|
172
|
+
- **Cache management**: New options to manage downloaded dumps:
|
|
173
|
+
- `--cache-status` - Show cache status for all languages
|
|
174
|
+
- `--cache-clear` - Clear all cache
|
|
175
|
+
- `--cache-clear --lang=ja` - Clear cache for specific language
|
|
176
|
+
- `--cache-dir` - Custom cache directory
|
|
177
|
+
|
|
178
|
+
- **Content type markers**: New `--markers` option marks special content:
|
|
179
|
+
- Supported types: `[MATH]`, `[CODE]`, `[CHEM]`, `[TABLE]`, `[SCORE]`, `[TIMELINE]`, `[GRAPH]`, `[IPA]`, `[INFOBOX]`, `[NAVBOX]`, `[GALLERY]`, `[SIDEBAR]`, `[MAPFRAME]`, `[IMAGEMAP]`, `[REFERENCES]`
|
|
180
|
+
- `--markers=all` (default) - Enable all markers
|
|
181
|
+
- `--markers=none` - Disable markers (content removed)
|
|
182
|
+
- `--markers=math,code` - Enable specific markers only
|
|
183
|
+
|
|
184
|
+
- **Citation extraction**: New `--extract-citations` (`-C`) option for formatted bibliography output:
|
|
185
|
+
- Extracts author, title, and year from `{{cite book}}`, `{{cite web}}`, `{{Citation}}` templates
|
|
186
|
+
- Formats citations as "Author. \"Title\". Year."
|
|
187
|
+
- Available via CLI (`--extract-citations`) and Ruby API (`extract_citations: true`)
|
|
188
|
+
|
|
189
|
+
- **Multistream support**: New classes for efficient Wikipedia dump processing:
|
|
190
|
+
- `MultistreamIndex` - Parse multistream index files
|
|
191
|
+
- `MultistreamReader` - Extract articles from multistream dumps
|
|
192
|
+
- `DumpManager` - Download and cache dump files
|
|
193
|
+
- Enables targeted article extraction without downloading full dump
|
|
194
|
+
|
|
195
|
+
- **Validation framework**: New rake tasks for validating Wikipedia dump processing:
|
|
196
|
+
- `testdata:prepare[lang,level]` - Download and cache test data
|
|
197
|
+
- `validate:run[lang,level]` - Run validation on cached data
|
|
198
|
+
- `validate:full[lang]` - Full dump validation
|
|
199
|
+
|
|
200
|
+
- **HTML entity management**: Comprehensive entity support from authoritative sources:
|
|
201
|
+
- 2125 entities from WHATWG HTML specification (`html_entities.json`)
|
|
202
|
+
- Wikipedia-specific entities (`wikipedia_entities.json`): `∶`, `‐`, `&nbso;`
|
|
203
|
+
- New script `scripts/fetch_html_entities.rb` to update from WHATWG
|
|
204
|
+
- Replaces hardcoded entity list with data-driven approach
|
|
205
|
+
|
|
206
|
+
- **MediaWiki data auto-generation**: Magic words and namespace aliases fetched from all Wikipedia APIs:
|
|
207
|
+
- New script `scripts/fetch_mediawiki_data.rb` queries 350+ Wikipedia language editions
|
|
208
|
+
- Data stored in `lib/wp2txt/data/mediawiki_aliases.json`
|
|
209
|
+
- 176 redirect keywords, 231 category aliases, 313 file aliases
|
|
210
|
+
- Run `ruby scripts/fetch_mediawiki_data.rb` to update
|
|
211
|
+
|
|
212
|
+
- **JSON/JSONL output format**: New `--format json` option outputs articles as JSONL (one JSON object per line) with `title`, `categories`, `text`, and `redirect` fields. Ideal for data pipelines and machine learning workflows.
|
|
213
|
+
|
|
214
|
+
- **Streaming processing**: Complete rewrite of the processing architecture:
|
|
215
|
+
- No longer creates intermediate XML files
|
|
216
|
+
- Directly streams from bz2 compressed files
|
|
217
|
+
- Reduced disk I/O and storage requirements
|
|
218
|
+
- New `StreamProcessor` and `OutputWriter` classes for modular design
|
|
219
|
+
|
|
220
|
+
- **Regex cache**: Dynamic regex patterns are now cached to avoid repeated compilation
|
|
221
|
+
|
|
222
|
+
- **Multilingual category support**: Added support for category namespaces in 30+ languages (European, Cyrillic, Asian, Middle Eastern)
|
|
223
|
+
|
|
224
|
+
- **Multilingual redirect support**: Added support for redirect keywords in 25+ languages
|
|
225
|
+
|
|
226
|
+
- **Comprehensive test suite**: 395 tests covering:
|
|
227
|
+
- Unicode handling (CJK, Cyrillic, Arabic, emoji)
|
|
228
|
+
- Edge cases (deeply nested templates, malformed markup)
|
|
229
|
+
- Multilingual category and redirect extraction
|
|
230
|
+
- Text processing utilities
|
|
231
|
+
- Integration tests with real Wikipedia content
|
|
232
|
+
|
|
233
|
+
- **SimpleCov integration**: Added code coverage reporting for development
|
|
234
|
+
|
|
235
|
+
- **Ruby 4.0 compatibility**: Full support for Ruby 4.0
|
|
236
|
+
|
|
237
|
+
### Changed
|
|
238
|
+
|
|
239
|
+
- **Performance improvements**:
|
|
240
|
+
- `format_wiki`: Reduced intermediate string allocations by using `gsub!` for in-place modifications
|
|
241
|
+
- `cleanup`: Optimized with `gsub!` to reduce memory allocations
|
|
242
|
+
- `remove_complex`, `make_reference`: Optimized with `gsub!`
|
|
243
|
+
- Category deduplication: Changed from O(n²) to O(n) by calling `uniq!` once at the end instead of on every line
|
|
244
|
+
- `correct_separator`: Uses `tr` instead of `gsub` for single character replacement
|
|
245
|
+
- `remove_inbetween`: Dynamic regex patterns are now cached
|
|
246
|
+
|
|
247
|
+
- **BREAKING**: `REMOVE_HR_REGEX` now matches 4 or more hyphens (previously 3+) to align with MediaWiki specification where `----` is the minimum for horizontal rules
|
|
248
|
+
|
|
249
|
+
- **`chrref_to_utf` function**: Completely rewritten to support all Unicode codepoints (U+0001 to U+10FFFF), including:
|
|
250
|
+
- Supplementary plane characters (emoji, CJK Extension B, etc.)
|
|
251
|
+
- Proper handling of invalid codepoints (returns empty string)
|
|
252
|
+
|
|
253
|
+
- **`convert_characters` function**: Now uses `String#scrub` for safe handling of invalid UTF-8 sequences instead of calling `exit`
|
|
254
|
+
|
|
255
|
+
- **`command_exist?` function**: Updated to use `IO.popen` instead of `open("| ...")` for Ruby 4.0 compatibility
|
|
256
|
+
|
|
257
|
+
### Fixed
|
|
258
|
+
|
|
259
|
+
- **Unicode BMP limitation**: Fixed `chrref_to_utf` to correctly convert character references beyond the Basic Multilingual Plane (U+FFFF). Previously, references to emoji such as `&#x1F600;` (😀) would produce invalid characters.
|
|
260
|
+
|
|
261
|
+
- **Encoding error crash**: Fixed `convert_characters` which previously called `exit` on encoding errors, now gracefully handles invalid byte sequences using `scrub`
|
|
262
|
+
|
|
263
|
+
- **Horizontal rule detection**: Fixed `REMOVE_HR_REGEX` to correctly match MediaWiki horizontal rules (4+ hyphens)
|
|
264
|
+
|
|
265
|
+
- **Heading regex**: Fixed `IN_HEADING_REGEX` to allow trailing whitespace after closing equal signs
|
|
266
|
+
|
|
267
|
+
- **Ruby 4.0 compatibility**: Fixed `open("| which cmd")` pattern which no longer works in Ruby 4.0
|
|
268
|
+
|
|
269
|
+
### Deprecated
|
|
270
|
+
|
|
271
|
+
- **`--convert` / `-c` option**: No longer needed as streaming processing always converts
|
|
272
|
+
- **`--del-interfile` / `-x` option**: No longer needed as intermediate files are no longer created
|
|
273
|
+
|
|
274
|
+
### Removed
|
|
275
|
+
|
|
276
|
+
- **Intermediate XML file creation**: The `Splitter` class no longer creates intermediate XML files; processing is now fully streamed
|
|
277
|
+
|
|
278
|
+
### Security
|
|
279
|
+
|
|
280
|
+
- None
|
|
281
|
+
|
|
282
|
+
## [1.0.2] - Previous releases
|
|
283
|
+
|
|
284
|
+
See git history for changes prior to 2.0.0.
|