wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/cli.rb
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "optimist"
|
|
4
|
+
require_relative "version"
|
|
5
|
+
require_relative "multistream"
|
|
6
|
+
require_relative "config"
|
|
7
|
+
require_relative "memory_monitor"
|
|
8
|
+
|
|
9
|
+
module Wp2txt
|
|
10
|
+
# CLI option parsing and validation
|
|
11
|
+
module CLI
|
|
12
|
+
class << self
|
|
13
|
+
# Number of parallel worker processes to use on this machine.
#
# The calculation itself lives in MemoryMonitor; this method only
# forwards to MemoryMonitor.optimal_processes so CLI callers have a
# single entry point.
#
# @return [Integer] Optimal number of parallel processes
def max_processors
  MemoryMonitor.optimal_processes
end
|
|
19
|
+
|
|
20
|
+
# Load the configuration file and remember it for later #config calls.
#
# @param config_path [String, nil] Path to a configuration file; when it is
#   nil (or false) Config.default_path is used instead
# @return [Config] Configuration object (also stored in @config)
def load_config(config_path = nil)
  @config = Config.load(config_path || Config.default_path)
end
|
|
26
|
+
|
|
27
|
+
# Current configuration, loaded on first access.
#
# When nothing has been loaded yet, load_config is called without
# arguments and its result is cached in @config.
#
# @return [Config] Configuration object
def config
  @config = load_config unless @config
  @config
end
|
|
32
|
+
|
|
33
|
+
# Declare every command line option, parse +args+, and validate the result.
#
# The configuration is fetched (via #config) before parsing so that it can
# supply the defaults for --depth, --format and --cache-dir. The parsed
# options are handed to validate_options! before being returned.
#
# NOTE(review): options are declared in their original order; that order
# presumably drives the generated help text, so do not reshuffle them.
#
# @param args [Array<String>] Command line arguments
# @return [Hash] Parsed options
def parse_options(args)
  base_config = config

  Optimist.options(args) do
    version Wp2txt::VERSION
    banner <<~BANNER
      WP2TXT extracts plain text data from Wikipedia dump files.

      Usage:
      wp2txt --lang=ja [options] # Auto-download and process
      wp2txt --input=FILE [options] # Process local file
      wp2txt --lang=en --from-category="Cities" -o ./output # Extract category
      wp2txt --lang=en --from-category="Cities" --dry-run # Preview only
      wp2txt --cache-status # Show cache status
      wp2txt --cache-clear [--lang=CODE] # Clear cache
      wp2txt --config-init # Create default config file

      Options:
    BANNER

    # -- Input source (--input or --lang; not needed for cache/config operations)
    opt :input, "Path to compressed file (bz2) or XML file", type: String, short: "-i"
    opt :lang, "Wikipedia language code (e.g., ja, en, de) for auto-download", type: String, short: "-L"
    opt :articles,
        "Specific article titles to extract (comma-separated, requires --lang)",
        type: String, short: "-A"
    opt :from_category, "Extract articles from Wikipedia category (requires --lang)", type: String, short: "-G"
    opt :depth,
        "Subcategory recursion depth for --from-category (0 = no recursion)",
        default: base_config.default_depth, type: Integer, short: "-D"
    opt :yes, "Skip confirmation prompt for category extraction", default: false, short: "-y"
    opt :dry_run, "Preview category extraction without downloading", default: false
    opt :update_cache, "Force refresh of cached dump files", default: false, short: "-U"

    # -- Output destination and format
    opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
    opt :format, "Output format: text or json (JSONL)", default: base_config.default_format, short: "-j"

    # -- Cache and configuration management
    opt :cache_dir, "Cache directory for downloaded dumps", default: base_config.cache_directory, type: String
    opt :cache_status, "Show cache status and exit", default: false
    opt :cache_clear, "Clear cache and exit", default: false
    opt :config_init, "Create default configuration file (~/.wp2txt/config.yml)", default: false
    opt :config_path, "Path to configuration file", type: String

    # -- What to extract from each article
    opt :category, "Show article category information", default: true, short: "-a"
    opt :category_only, "Extract only article title and categories", default: false, short: "-g"
    opt :summary_only, "Extract only title, categories, and summary", default: false, short: "-s"
    opt :metadata_only,
        "Extract only title, section headings, and categories (for analysis)",
        default: false, short: "-M"

    # -- Section extraction
    opt :sections,
        "Extract specific sections (comma-separated, 'summary' for lead text)",
        type: String, short: "-S"
    opt :section_output, "Section output mode: 'structured' (default) or 'combined'", default: "structured"
    opt :min_section_length,
        "Minimum section length in characters (shorter sections become null)",
        default: 0, type: Integer
    opt :skip_empty, "Skip articles with no matching sections", default: false
    opt :alias_file, "Custom section alias definitions file (YAML format)", type: String
    opt :no_section_aliases, "Disable section alias matching (exact match only)", default: false
    opt :section_stats, "Collect and output section heading statistics (JSON)", default: false
    opt :show_matched_sections,
        "Include matched_sections field in JSON output (shows actual headings)",
        default: false

    # -- Processing and text clean-up switches
    opt :file_size,
        "Approximate size (in MB) of each output file (0 for single file)",
        default: 10, short: "-f"
    opt :num_procs,
        "Number of parallel processes (auto-detected based on CPU/memory)",
        type: Integer, short: "-n"
    opt :title, "Keep page titles in output", default: true, short: "-t"
    opt :heading, "Keep section titles in output", default: true, short: "-d"
    opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
    opt :table, "Keep wiki table content in output", default: false
    opt :pre, "Keep preformatted text blocks in output", default: false, short: "-p"
    opt :ref, "Keep reference notations [ref]...[/ref]", default: false, short: "-r"
    opt :multiline, "Keep multi-line templates in output", default: false
    opt :redirect, "Show redirect destination", default: false, short: "-e"
    opt :marker, "Show symbols prefixed to list items", default: true, short: "-m"
    opt :markers,
        "Content type markers (math,code,chem,table,score,timeline,graph,ipa or 'all')",
        default: "all", short: "-k"
    opt :extract_citations, "Extract formatted citations instead of removing them", default: false, short: "-C"
    opt :expand_templates,
        "Expand common templates (birth date, convert, etc.) to readable text",
        default: true, short: "-E"
    opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of system command", default: false, short: "-b"
    opt :ractor,
        "Use Ractor for parallel processing (Ruby 4.0+, streaming mode only)",
        default: false, short: "-R"
    opt :no_turbo, "Disable turbo mode (use streaming instead, saves disk space)", default: false

    # -- Console output control
    opt :quiet, "Suppress progress output (only show errors and final result)", default: false, short: "-q"
    opt :no_color, "Disable colored output (also respects NO_COLOR env variable)", default: false

    # -- Deprecated switches, still declared so they are accepted on the command line
    opt :convert, "[DEPRECATED] No longer needed", default: true, short: "-c"
    opt :del_interfile, "[DEPRECATED] Intermediate files no longer created", default: false, short: "-x"
  end.tap { |parsed| validate_options!(parsed) }
end
|
|
172
|
+
|
|
173
|
+
# Validate parsed options, aborting through Optimist.die on the first
# violation found.
#
# Checks run in a fixed order: input source, --articles / --from-category
# combinations, --depth, --dry-run / --yes, paths and formats, the alias
# file, and finally the mutually exclusive extraction modes. Nothing is
# checked when a cache or config operation was requested.
#
# @param opts [Hash] Parsed options (as produced by parse_options)
# @return [nil]
def validate_options!(opts)
  # Cache and config operations don't need input/lang
  return if opts[:cache_status] || opts[:cache_clear] || opts[:config_init]

  # Input source: exactly one of --input / --lang
  Optimist.die "Either --input or --lang is required" if opts[:input].nil? && opts[:lang].nil?
  Optimist.die "Cannot specify both --input and --lang" if opts[:input] && opts[:lang]

  # --articles and --from-category both need --lang and exclude each other
  Optimist.die "--articles requires --lang" if opts[:articles] && opts[:lang].nil?
  Optimist.die "--articles cannot be used with --input" if opts[:articles] && opts[:input]
  Optimist.die "--from-category requires --lang" if opts[:from_category] && opts[:lang].nil?
  Optimist.die "--from-category cannot be used with --input" if opts[:from_category] && opts[:input]
  Optimist.die "--from-category cannot be used with --articles" if opts[:from_category] && opts[:articles]

  # --depth must be >= 0; large values are allowed but flagged
  Optimist.die :depth, "must be 0 or greater" if opts[:depth] < 0
  warn "Warning: --depth > 3 may result in a very large number of articles" if opts[:depth] > 3

  # --dry-run and --yes only make sense with --from-category
  Optimist.die "--dry-run requires --from-category" if opts[:dry_run] && opts[:from_category].nil?
  Optimist.die "--yes requires --from-category" if opts[:yes] && opts[:from_category].nil?

  Optimist.die :input, "file does not exist" if opts[:input] && !File.exist?(opts[:input])
  Optimist.die :lang, "invalid language code format" if opts[:lang] && !valid_language_code?(opts[:lang])

  # Must be an actual directory: a regular file at this path would pass a
  # plain existence check even though the error message promises a directory.
  Optimist.die :output_dir, "directory does not exist" unless File.directory?(opts[:output_dir])

  output_format = opts[:format].to_s.downcase
  Optimist.die :format, "must be 'text' or 'json'" unless %w[text json].include?(output_format)
  Optimist.die :file_size, "must be 0 or larger" if opts[:file_size] < 0

  # --alias-file must exist and be loadable as safe YAML
  if opts[:alias_file]
    Optimist.die :alias_file, "file does not exist" unless File.exist?(opts[:alias_file])

    begin
      require "yaml" # loaded lazily: only needed when --alias-file is given
      YAML.safe_load(File.read(opts[:alias_file]), permitted_classes: [Symbol])
    rescue Psych::SyntaxError => e
      Optimist.die :alias_file, "invalid YAML syntax: #{e.message}"
    rescue Psych::Exception => e
      # e.g. Psych::DisallowedClass / Psych::BadAlias raised by safe_load
      Optimist.die :alias_file, "unsupported YAML content: #{e.message}"
    rescue SystemCallError => e
      # e.g. the path is a directory or is not readable
      Optimist.die :alias_file, "cannot be read: #{e.message}"
    end
  end

  # Extraction modes are mutually exclusive
  extraction_modes = []
  extraction_modes << "--category-only" if opts[:category_only]
  extraction_modes << "--summary-only" if opts[:summary_only]
  extraction_modes << "--metadata-only" if opts[:metadata_only]
  if extraction_modes.size > 1
    Optimist.die "#{extraction_modes.join(', ')} cannot be combined (choose one extraction mode)"
  end

  # --sections conflicts with extraction modes
  if opts[:sections] && extraction_modes.any?
    Optimist.die "--sections cannot be used with #{extraction_modes.first}"
  end

  # --section-stats is a standalone mode
  if opts[:section_stats]
    Optimist.die "--section-stats cannot be used with --sections" if opts[:sections]
    if extraction_modes.any?
      Optimist.die "--section-stats cannot be used with #{extraction_modes.first}"
    end
  end

  # --show-matched-sections only works with JSON format
  if opts[:show_matched_sections] && output_format != "json"
    Optimist.die "--show-matched-sections requires --format json"
  end
end
|
|
298
|
+
|
|
299
|
+
# Split a comma-separated list of article titles into an array.
#
# Surrounding whitespace is removed from every title and blank entries
# (for example from "A,,B" or a trailing comma) are dropped.
#
# @param articles_str [String, nil] Comma-separated article titles
# @return [Array<String>] Titles in their original order; empty when the
#   input is nil or an empty string
def parse_article_list(articles_str)
  nothing_given = articles_str.nil? || articles_str.empty?
  return [] if nothing_given

  articles_str.split(",").each_with_object([]) do |raw_title, titles|
    title = raw_title.strip
    titles << title unless title.empty?
  end
end
|
|
304
|
+
|
|
305
|
+
# Check whether a string has the shape of a Wikipedia language code.
#
# Accepted form: a primary subtag of 2-10 lowercase ASCII letters,
# optionally followed by up to two hyphen-separated subtags of 1-10
# lowercase letters each (e.g., en, ja, simple, zh-yue, zh-min-nan,
# be-x-old). Only the format is checked, not whether such a wiki exists.
#
# @param code [String, nil] Candidate language code
# @return [Boolean] true when the code matches the accepted form
def valid_language_code?(code)
  return false if code.nil? || code.empty?

  # \A...\z anchor the whole string so embedded newlines, path separators
  # or other characters can never slip through.
  code.match?(/\A[a-z]{2,10}(?:-[a-z]{1,10}){0,2}\z/)
end
|
|
312
|
+
|
|
313
|
+
# Built-in default location of the download cache.
#
# This is the Config::DEFAULT_CACHE_DIR constant itself; it does not look
# at any loaded configuration or at --cache-dir.
#
# @return [Object] Value of Config::DEFAULT_CACHE_DIR (presumably a path String)
def default_cache_dir
  Config::DEFAULT_CACHE_DIR
end
|
|
317
|
+
end
|
|
318
|
+
end
|
|
319
|
+
end
|