wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/bin/wp2txt
CHANGED
|
@@ -2,191 +2,893 @@
|
|
|
2
2
|
|
|
3
3
|
# frozen_string_literal: true
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
# Enable YJIT for better performance (Ruby 3.3+)
|
|
6
|
+
RubyVM::YJIT.enable if defined?(RubyVM::YJIT) && RubyVM::YJIT.respond_to?(:enable)
|
|
7
7
|
|
|
8
8
|
require_relative "../lib/wp2txt"
|
|
9
9
|
require_relative "../lib/wp2txt/utils"
|
|
10
10
|
require_relative "../lib/wp2txt/version"
|
|
11
|
+
require_relative "../lib/wp2txt/cli"
|
|
12
|
+
require_relative "../lib/wp2txt/multistream"
|
|
13
|
+
require_relative "../lib/wp2txt/cli_ui"
|
|
14
|
+
require_relative "../lib/wp2txt/formatter"
|
|
15
|
+
require_relative "../lib/wp2txt/extractor"
|
|
16
|
+
require_relative "../lib/wp2txt/ractor_worker"
|
|
11
17
|
|
|
12
18
|
require "etc"
|
|
19
|
+
require "json"
|
|
13
20
|
require "optimist"
|
|
14
21
|
require "parallel"
|
|
15
22
|
require "pastel"
|
|
16
23
|
require "tty-spinner"
|
|
24
|
+
require "tty-progressbar"
|
|
17
25
|
|
|
18
26
|
class WpApp
|
|
19
27
|
include Wp2txt
|
|
28
|
+
include Wp2txt::CliUI
|
|
29
|
+
include Wp2txt::Formatter
|
|
30
|
+
include Wp2txt::Extractor
|
|
20
31
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
opt :category, "Show article category information", default: true, short: "-a"
|
|
35
|
-
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
|
36
|
-
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
|
37
|
-
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
|
38
|
-
opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
|
|
39
|
-
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
|
40
|
-
opt :title, "Keep page titles in output", default: true, short: "-t"
|
|
41
|
-
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
|
42
|
-
opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
|
|
43
|
-
opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
|
|
44
|
-
opt :redirect, "Show redirect destination", default: false, short: "-e"
|
|
45
|
-
opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
|
|
46
|
-
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
|
50
|
-
Optimist.die :input, "must exist" unless File.exist?(opts[:input])
|
|
51
|
-
Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
|
52
|
-
|
|
53
|
-
pastel = Pastel.new
|
|
54
|
-
|
|
55
|
-
input_file = opts[:input]
|
|
56
|
-
output_dir = opts[:output_dir]
|
|
57
|
-
tfile_size = opts[:file_size]
|
|
58
|
-
num_processors = Etc.nprocessors
|
|
59
|
-
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
|
|
60
|
-
opts[:num_procs]
|
|
61
|
-
else
|
|
62
|
-
minus2 = num_processors - 2
|
|
63
|
-
minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
|
|
64
|
-
end
|
|
65
|
-
num_processes = 1 if num_processes < 1
|
|
66
|
-
|
|
67
|
-
convert = opts[:convert]
|
|
68
|
-
strip_tmarker = opts[:marker] ? false : true
|
|
69
|
-
opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
|
|
70
|
-
|
|
71
|
-
config = {}
|
|
72
|
-
opt_array.each do |opt|
|
|
73
|
-
config[opt] = opts[opt]
|
|
74
|
-
end
|
|
32
|
+
# Debug mode flag
|
|
33
|
+
DEBUG_MODE = false
|
|
34
|
+
|
|
35
|
+
def initialize
|
|
36
|
+
@pastel = Pastel.new
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
# Calculate the number of processes to be used for parallel processing
|
|
42
|
+
# Uses MemoryMonitor to determine optimal parallelism based on CPU and memory
|
|
43
|
+
def calculate_num_processes(opts)
|
|
44
|
+
optimal = Wp2txt::MemoryMonitor.optimal_processes
|
|
75
45
|
|
|
76
|
-
if
|
|
77
|
-
|
|
46
|
+
if opts[:num_procs]
|
|
47
|
+
# User specified a value - use it if reasonable
|
|
48
|
+
requested = opts[:num_procs].to_i
|
|
49
|
+
max_allowed = Etc.nprocessors
|
|
50
|
+
[requested, max_allowed, 1].max == requested ? requested : optimal
|
|
78
51
|
else
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
52
|
+
optimal
|
|
53
|
+
end.tap { |n| n = 1 if n < 1 }
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Process articles using turbo mode (split-first architecture from v1.x)
|
|
57
|
+
# This splits the bz2 file into XML chunks first, then processes in parallel
|
|
58
|
+
# Much faster for large dumps due to parallel decompression benefit
|
|
59
|
+
def process_with_turbo(input_path, output_dir, config)
|
|
60
|
+
require "tmpdir"
|
|
61
|
+
require "fileutils"
|
|
62
|
+
|
|
63
|
+
num_processes = config[:num_procs]
|
|
64
|
+
file_size_mb = config[:file_size]
|
|
65
|
+
format = config[:format]
|
|
66
|
+
bz2_gem = config[:bz2_gem]
|
|
67
|
+
|
|
68
|
+
# Determine base name for output files
|
|
69
|
+
base_name = File.basename(input_path, ".*")
|
|
70
|
+
base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
|
|
71
|
+
|
|
72
|
+
# Get input file size for display
|
|
73
|
+
input_size = File.size(input_path) rescue 0
|
|
74
|
+
input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
|
|
75
|
+
|
|
76
|
+
print_mode_banner("Turbo Mode Processing", {
|
|
77
|
+
"Input" => File.basename(input_path),
|
|
78
|
+
"Size" => input_size_str,
|
|
79
|
+
"Format" => format.to_s,
|
|
80
|
+
"CPU cores" => num_processes.to_s,
|
|
81
|
+
"Mode" => "Split-first (parallel decompression)"
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
time_start = Time.now
|
|
85
|
+
|
|
86
|
+
# Create temp directory for split XML files
|
|
87
|
+
temp_dir = Dir.mktmpdir("wp2txt_turbo_")
|
|
88
|
+
puts pastel.cyan("Phase 1: Splitting bz2 file into XML chunks...")
|
|
89
|
+
puts pastel.dim(" Temp directory: #{temp_dir}")
|
|
90
|
+
puts
|
|
91
|
+
|
|
92
|
+
begin
|
|
93
|
+
# Phase 1: Split bz2 into XML files using Splitter
|
|
94
|
+
# Split into 10MB chunks for good parallelism
|
|
95
|
+
$stdout.sync = true
|
|
96
|
+
splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
|
|
97
|
+
# Progress callback - called every 5 seconds
|
|
98
|
+
size_str = format_size(bytes_read)
|
|
99
|
+
elapsed = Time.now - time_start
|
|
100
|
+
rate = bytes_read / elapsed / 1024 / 1024 # MB/s
|
|
101
|
+
puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files created",
|
|
102
|
+
Time.now.strftime("%H:%M:%S"),
|
|
103
|
+
size_str,
|
|
104
|
+
rate,
|
|
105
|
+
file_count))
|
|
106
|
+
end
|
|
107
|
+
splitter.split_file
|
|
108
|
+
xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
|
|
109
|
+
|
|
110
|
+
split_time = Time.now - time_start
|
|
111
|
+
final_size = splitter.size_read || 0
|
|
112
|
+
puts
|
|
113
|
+
puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} decompressed (#{format_duration(split_time)})")
|
|
114
|
+
puts
|
|
115
|
+
|
|
116
|
+
# Phase 2: Process XML files in parallel and write output directly
|
|
117
|
+
puts pastel.cyan("Phase 2: Processing XML files in parallel...")
|
|
118
|
+
puts pastel.dim(" Using #{num_processes} parallel processes")
|
|
119
|
+
puts
|
|
120
|
+
|
|
121
|
+
strip_tmarker = !config[:marker]
|
|
122
|
+
|
|
123
|
+
# Each parallel process writes to its own temp output file
|
|
124
|
+
# This avoids memory accumulation and enables streaming output
|
|
125
|
+
$stdout.sync = true
|
|
126
|
+
processed_count = 0
|
|
127
|
+
last_report_time = Time.now
|
|
128
|
+
temp_output_dir = File.join(temp_dir, "output")
|
|
129
|
+
FileUtils.mkdir_p(temp_output_dir)
|
|
130
|
+
|
|
131
|
+
# Process XML files in parallel - each writes its own output
|
|
132
|
+
article_counts = Parallel.map(
|
|
133
|
+
xml_files.each_with_index.to_a,
|
|
134
|
+
in_processes: num_processes,
|
|
135
|
+
finish: lambda { |_item, _index, _result|
|
|
136
|
+
processed_count += 1
|
|
137
|
+
now = Time.now
|
|
138
|
+
if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
|
|
139
|
+
last_report_time = now
|
|
140
|
+
percent = (processed_count.to_f / xml_files.size * 100).round(1)
|
|
141
|
+
elapsed = now - time_start
|
|
142
|
+
rate = processed_count / elapsed
|
|
143
|
+
remaining = xml_files.size - processed_count
|
|
144
|
+
eta = remaining > 0 && rate > 0 ? remaining / rate : 0
|
|
145
|
+
puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
|
|
146
|
+
processed_count, xml_files.size,
|
|
147
|
+
percent, rate,
|
|
148
|
+
format_duration(eta)))
|
|
149
|
+
end
|
|
150
|
+
}
|
|
151
|
+
) do |xml_file, idx|
|
|
152
|
+
# Each process writes directly to its own temp file
|
|
153
|
+
temp_output_file = File.join(temp_output_dir, "part_#{idx.to_s.rjust(5, '0')}.txt")
|
|
154
|
+
process_xml_file_and_write(xml_file, temp_output_file, config, strip_tmarker, format)
|
|
155
|
+
end
|
|
120
156
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
157
|
+
total_articles = article_counts.sum
|
|
158
|
+
|
|
159
|
+
# Phase 3: Merge temp output files into final output (streaming)
|
|
160
|
+
puts
|
|
161
|
+
puts pastel.cyan("Merging output files...")
|
|
162
|
+
|
|
163
|
+
temp_files = Dir.glob(File.join(temp_output_dir, "part_*.txt")).sort
|
|
164
|
+
writer = OutputWriter.new(
|
|
165
|
+
output_dir: output_dir,
|
|
166
|
+
base_name: base_name,
|
|
167
|
+
format: format,
|
|
168
|
+
file_size_mb: file_size_mb
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
temp_files.each do |temp_file|
|
|
172
|
+
next if File.size(temp_file).zero?
|
|
173
|
+
# Stream copy instead of loading entire file into memory
|
|
174
|
+
writer.write_from_file(temp_file)
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
output_files = writer.close
|
|
178
|
+
|
|
179
|
+
time_elapsed = Time.now - time_start
|
|
180
|
+
puts
|
|
181
|
+
puts pastel.green("#{ICONS[:success]} Processing complete!")
|
|
182
|
+
|
|
183
|
+
print_summary("Turbo Processing Complete", {
|
|
184
|
+
"XML files processed" => xml_files.size.to_s,
|
|
185
|
+
"Articles" => total_articles.to_s,
|
|
186
|
+
"Output files" => output_files.size.to_s,
|
|
187
|
+
"Time" => format_duration(time_elapsed)
|
|
188
|
+
}, status: :success)
|
|
189
|
+
|
|
190
|
+
puts
|
|
191
|
+
puts pastel.dim("Output files:")
|
|
192
|
+
output_files.each { |f| print_list_item(f, status: :success) }
|
|
193
|
+
ensure
|
|
194
|
+
# Cleanup temp directory
|
|
195
|
+
FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
# Regex patterns for fast XML extraction (avoid full DOM parsing)
|
|
200
|
+
TITLE_REGEX = %r{<title>([^<]*)</title>}m
|
|
201
|
+
TEXT_REGEX = %r{<text[^>]*>(.*)$}m
|
|
202
|
+
TEXT_END_REGEX = %r{</text>}
|
|
203
|
+
|
|
204
|
+
# Process a single XML file and write directly to output file
|
|
205
|
+
# Returns the number of articles processed
|
|
206
|
+
def process_xml_file_and_write(xml_file, output_file, config, strip_tmarker, format)
|
|
207
|
+
article_count = 0
|
|
208
|
+
runner = Runner.new(xml_file, File.dirname(xml_file), strip_tmarker, false)
|
|
209
|
+
|
|
210
|
+
File.open(output_file, "w") do |out|
|
|
211
|
+
while (page_xml = runner.get_page)
|
|
212
|
+
begin
|
|
213
|
+
# Fast regex extraction instead of full Nokogiri DOM parsing
|
|
214
|
+
title_match = TITLE_REGEX.match(page_xml)
|
|
215
|
+
next unless title_match
|
|
216
|
+
|
|
217
|
+
title = title_match[1]
|
|
218
|
+
next if title.nil? || title.empty? || title.include?(":")
|
|
219
|
+
|
|
220
|
+
# Extract text content
|
|
221
|
+
text_match = TEXT_REGEX.match(page_xml)
|
|
222
|
+
next unless text_match
|
|
223
|
+
|
|
224
|
+
# Find end of text and extract content
|
|
225
|
+
text_start = text_match.begin(1)
|
|
226
|
+
text_end_match = TEXT_END_REGEX.match(page_xml, text_start)
|
|
227
|
+
next unless text_end_match
|
|
228
|
+
|
|
229
|
+
text = page_xml[text_start...text_end_match.begin(0)]
|
|
230
|
+
next if text.nil? || text.empty?
|
|
231
|
+
|
|
232
|
+
# Decode XML entities
|
|
233
|
+
text = text.gsub("&lt;", "<").gsub("&gt;", ">").gsub("&amp;", "&").gsub("&quot;", '"')
|
|
234
|
+
|
|
235
|
+
# Remove HTML comments
|
|
236
|
+
text.gsub!(/<!--(.*?)-->/m) do |content|
|
|
237
|
+
num_of_newlines = content.count("\n")
|
|
238
|
+
num_of_newlines.zero? ? +"" : "\n" * num_of_newlines
|
|
178
239
|
end
|
|
240
|
+
|
|
241
|
+
next if redirect_page?(text)
|
|
242
|
+
|
|
243
|
+
article = Article.new(text, title, strip_tmarker)
|
|
244
|
+
result = format_article(article, config)
|
|
245
|
+
next unless result
|
|
246
|
+
|
|
247
|
+
# Write directly to file
|
|
248
|
+
if format == :json
|
|
249
|
+
out.puts(result.to_json)
|
|
250
|
+
else
|
|
251
|
+
out.puts(result)
|
|
252
|
+
end
|
|
253
|
+
article_count += 1
|
|
254
|
+
rescue StandardError
|
|
255
|
+
next
|
|
179
256
|
end
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
article_count
|
|
261
|
+
end
|
|
180
262
|
|
|
181
|
-
|
|
182
|
-
|
|
263
|
+
# Fast redirect detection (same as in stream_processor)
|
|
264
|
+
def redirect_page?(text)
|
|
265
|
+
return false if text.nil? || text.empty?
|
|
266
|
+
first_part = text[0, 200]
|
|
267
|
+
return false unless first_part
|
|
268
|
+
stripped = first_part.lstrip
|
|
269
|
+
return false unless stripped.start_with?("#", "＃")
|
|
270
|
+
stripped.include?("[[")
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
# Process articles using streaming (new architecture)
|
|
274
|
+
def process_stream(input_path, output_dir, config)
|
|
275
|
+
num_processes = config[:num_procs]
|
|
276
|
+
file_size_mb = config[:file_size]
|
|
277
|
+
format = config[:format]
|
|
278
|
+
bz2_gem = config[:bz2_gem]
|
|
279
|
+
|
|
280
|
+
# Determine base name for output files
|
|
281
|
+
base_name = File.basename(input_path, ".*")
|
|
282
|
+
base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
|
|
283
|
+
|
|
284
|
+
# Create stream processor
|
|
285
|
+
stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
|
|
286
|
+
|
|
287
|
+
# Create output writer
|
|
288
|
+
writer = OutputWriter.new(
|
|
289
|
+
output_dir: output_dir,
|
|
290
|
+
base_name: base_name,
|
|
291
|
+
format: format,
|
|
292
|
+
file_size_mb: file_size_mb
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Collect pages for parallel processing
|
|
296
|
+
pages = []
|
|
297
|
+
page_count = 0
|
|
298
|
+
|
|
299
|
+
# Determine parallelism mode
|
|
300
|
+
use_ractor = config[:use_ractor] && Wp2txt::RactorWorker.available?
|
|
301
|
+
parallel_mode = use_ractor ? "Ractor (experimental)" : "Parallel (processes)"
|
|
302
|
+
|
|
303
|
+
# Show warning for experimental Ractor mode
|
|
304
|
+
if config[:use_ractor]
|
|
305
|
+
if use_ractor
|
|
306
|
+
print_warning("Ractor mode is experimental and may be unstable.")
|
|
307
|
+
puts pastel.yellow(" If processing hangs, restart without --ractor option.") unless quiet?
|
|
308
|
+
else
|
|
309
|
+
print_warning("Ractor not available on this Ruby version. Using Parallel gem.")
|
|
310
|
+
end
|
|
311
|
+
end
|
|
312
|
+
|
|
313
|
+
# Get input file size for progress estimation
|
|
314
|
+
input_size = File.size(input_path) rescue 0
|
|
315
|
+
input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
|
|
316
|
+
|
|
317
|
+
# Estimate total articles for ETA calculation
|
|
318
|
+
estimated_total = estimate_total_articles(input_path)
|
|
319
|
+
estimated_total_str = estimated_total ? "~#{(estimated_total / 1_000_000.0).round(1)}M" : "unknown"
|
|
320
|
+
|
|
321
|
+
print_mode_banner("Full Dump Processing", {
|
|
322
|
+
"Input" => File.basename(input_path),
|
|
323
|
+
"Size" => input_size_str,
|
|
324
|
+
"Articles (est.)" => estimated_total_str,
|
|
325
|
+
"Format" => format.to_s,
|
|
326
|
+
"CPU cores" => num_processes.to_s,
|
|
327
|
+
"Parallel" => parallel_mode,
|
|
328
|
+
"Skip redirects" => "yes"
|
|
329
|
+
})
|
|
330
|
+
|
|
331
|
+
# Ensure output is not buffered (important for piped output)
|
|
332
|
+
$stdout.sync = true
|
|
333
|
+
|
|
334
|
+
time_start = Time.now
|
|
335
|
+
last_progress_time = time_start
|
|
336
|
+
last_progress_count = 0
|
|
337
|
+
batch_count = 0
|
|
338
|
+
|
|
339
|
+
# Progress reporting interval (seconds)
|
|
340
|
+
progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
|
|
341
|
+
|
|
342
|
+
# Process in batches for memory efficiency
|
|
343
|
+
batch_size = num_processes * 100
|
|
344
|
+
strip_tmarker = !config[:marker]
|
|
345
|
+
|
|
346
|
+
# Show initial progress message
|
|
347
|
+
puts pastel.cyan("Processing started at #{time_start.strftime('%H:%M:%S')}")
|
|
348
|
+
if estimated_total
|
|
349
|
+
puts pastel.dim("Progress updates every #{progress_interval} seconds (with ETA)...")
|
|
350
|
+
else
|
|
351
|
+
puts pastel.dim("Progress updates every #{progress_interval} seconds...")
|
|
352
|
+
end
|
|
353
|
+
puts
|
|
354
|
+
|
|
355
|
+
stream.each_page do |title, text|
|
|
356
|
+
pages << [title, text]
|
|
357
|
+
page_count += 1
|
|
358
|
+
|
|
359
|
+
# Process batch when full
|
|
360
|
+
next unless pages.size >= batch_size
|
|
361
|
+
|
|
362
|
+
process_batch(pages, writer, config, strip_tmarker, num_processes)
|
|
363
|
+
pages.clear
|
|
364
|
+
batch_count += 1
|
|
365
|
+
|
|
366
|
+
# Show progress every N seconds
|
|
367
|
+
now = Time.now
|
|
368
|
+
elapsed_since_update = now - last_progress_time
|
|
369
|
+
if elapsed_since_update >= progress_interval
|
|
370
|
+
elapsed_total = now - time_start
|
|
371
|
+
articles_per_sec = (page_count - last_progress_count) / elapsed_since_update
|
|
372
|
+
output_count = writer.file_count rescue batch_count
|
|
373
|
+
|
|
374
|
+
# Calculate ETA
|
|
375
|
+
eta_seconds = calculate_eta(page_count, estimated_total, elapsed_total)
|
|
376
|
+
eta_str = format_eta(eta_seconds)
|
|
377
|
+
|
|
378
|
+
# Calculate progress percentage if total is known
|
|
379
|
+
if estimated_total && estimated_total > 0
|
|
380
|
+
percent = (page_count.to_f / estimated_total * 100).round(1)
|
|
381
|
+
progress_line = format(
|
|
382
|
+
" [%s] %s articles (%s%%) | %s/sec | %s files | Elapsed: %s | ETA: %s",
|
|
383
|
+
now.strftime("%H:%M:%S"),
|
|
384
|
+
page_count.to_s.rjust(8),
|
|
385
|
+
percent.to_s.rjust(5),
|
|
386
|
+
articles_per_sec.round(1).to_s.rjust(6),
|
|
387
|
+
output_count.to_s.rjust(4),
|
|
388
|
+
format_duration(elapsed_total),
|
|
389
|
+
eta_str
|
|
390
|
+
)
|
|
183
391
|
else
|
|
184
|
-
|
|
392
|
+
progress_line = format(
|
|
393
|
+
" [%s] %s articles | %s/sec | %s files | Elapsed: %s",
|
|
394
|
+
now.strftime("%H:%M:%S"),
|
|
395
|
+
page_count.to_s.rjust(8),
|
|
396
|
+
articles_per_sec.round(1).to_s.rjust(6),
|
|
397
|
+
output_count.to_s.rjust(4),
|
|
398
|
+
format_duration(elapsed_total)
|
|
399
|
+
)
|
|
185
400
|
end
|
|
401
|
+
puts pastel.dim(progress_line)
|
|
402
|
+
|
|
403
|
+
last_progress_time = now
|
|
404
|
+
last_progress_count = page_count
|
|
405
|
+
end
|
|
406
|
+
end
|
|
407
|
+
|
|
408
|
+
# Process remaining pages
|
|
409
|
+
process_batch(pages, writer, config, strip_tmarker, num_processes) unless pages.empty?
|
|
410
|
+
|
|
411
|
+
# Close output
|
|
412
|
+
output_files = writer.close
|
|
413
|
+
|
|
414
|
+
# Get redirect skip count
|
|
415
|
+
redirects_skipped = stream.redirects_skipped
|
|
416
|
+
|
|
417
|
+
time_elapsed = Time.now - time_start
|
|
418
|
+
puts
|
|
419
|
+
puts pastel.green("#{ICONS[:success]} Processing complete!")
|
|
420
|
+
|
|
421
|
+
# Summary
|
|
422
|
+
summary_data = {
|
|
423
|
+
"Articles" => page_count.to_s,
|
|
424
|
+
"Output files" => output_files.size.to_s,
|
|
425
|
+
"Time" => format_duration(time_elapsed)
|
|
426
|
+
}
|
|
427
|
+
summary_data["Redirects skipped"] = redirects_skipped.to_s if redirects_skipped > 0
|
|
428
|
+
|
|
429
|
+
print_summary("Processing Complete", summary_data, status: :success)
|
|
430
|
+
|
|
431
|
+
puts
|
|
432
|
+
puts pastel.dim("Output files:")
|
|
433
|
+
output_files.each { |f| print_list_item(f, status: :success) }
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
# Process a batch of pages in parallel
|
|
437
|
+
# Uses Ractor for true parallelism when enabled, otherwise falls back to Parallel gem
|
|
438
|
+
def process_batch(pages, writer, config, strip_tmarker, num_processes)
|
|
439
|
+
results = if config[:use_ractor] && Wp2txt::RactorWorker.available?
|
|
440
|
+
# Use Ractor-based parallel processing (true parallelism)
|
|
441
|
+
Wp2txt::RactorWorker.process_articles(
|
|
442
|
+
pages,
|
|
443
|
+
config: config,
|
|
444
|
+
strip_tmarker: strip_tmarker,
|
|
445
|
+
num_workers: num_processes
|
|
446
|
+
)
|
|
447
|
+
else
|
|
448
|
+
# Fall back to Parallel gem (process-based parallelism)
|
|
449
|
+
Parallel.map(pages, in_processes: num_processes) do |title, text|
|
|
450
|
+
article = Article.new(text, title, strip_tmarker)
|
|
451
|
+
format_article(article, config)
|
|
452
|
+
end
|
|
453
|
+
end
|
|
454
|
+
|
|
455
|
+
results.each do |result|
|
|
456
|
+
writer.write(result) if result
|
|
457
|
+
end
|
|
458
|
+
end
|
|
459
|
+
|
|
460
|
+
# Process section statistics mode
|
|
461
|
+
# Collects section heading statistics and outputs JSON to stdout
|
|
462
|
+
def process_section_stats(input_path, config)
|
|
463
|
+
require_relative "../lib/wp2txt/section_extractor"
|
|
464
|
+
|
|
465
|
+
bz2_gem = config[:bz2_gem]
|
|
466
|
+
no_turbo = config[:no_turbo]
|
|
467
|
+
num_processes = config[:num_procs]
|
|
468
|
+
|
|
469
|
+
# Use turbo mode for bz2 files unless disabled
|
|
470
|
+
if input_path.end_with?(".bz2") && !no_turbo
|
|
471
|
+
process_section_stats_turbo(input_path, bz2_gem, num_processes)
|
|
472
|
+
else
|
|
473
|
+
process_section_stats_stream(input_path, bz2_gem)
|
|
474
|
+
end
|
|
475
|
+
end
|
|
476
|
+
|
|
477
|
+
def process_section_stats_stream(input_path, bz2_gem)
|
|
478
|
+
print_mode_banner("Section Statistics", {
|
|
479
|
+
"Input" => File.basename(input_path),
|
|
480
|
+
"Mode" => "Statistics collection (streaming)"
|
|
481
|
+
})
|
|
482
|
+
|
|
483
|
+
puts pastel.cyan("Collecting section statistics...")
|
|
484
|
+
puts pastel.dim("This may take a while for large dumps.")
|
|
485
|
+
puts
|
|
486
|
+
|
|
487
|
+
# Create stream processor and stats collector
|
|
488
|
+
stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
|
|
489
|
+
collector = Wp2txt::SectionStatsCollector.new
|
|
490
|
+
|
|
491
|
+
time_start = Time.now
|
|
492
|
+
last_progress_time = time_start
|
|
493
|
+
progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
|
|
494
|
+
|
|
495
|
+
# Process pages without full text processing (just extract headings)
|
|
496
|
+
stream.each_page do |title, text|
|
|
497
|
+
# Create minimal article just for heading extraction
|
|
498
|
+
article = Article.new(text, title, false)
|
|
499
|
+
collector.process(article)
|
|
500
|
+
|
|
501
|
+
# Show progress periodically
|
|
502
|
+
now = Time.now
|
|
503
|
+
if now - last_progress_time >= progress_interval
|
|
504
|
+
elapsed = now - time_start
|
|
505
|
+
rate = collector.total_articles / elapsed
|
|
506
|
+
puts pastel.dim(format(" [%s] %d articles processed (%.1f/sec)",
|
|
507
|
+
now.strftime("%H:%M:%S"),
|
|
508
|
+
collector.total_articles,
|
|
509
|
+
rate))
|
|
510
|
+
last_progress_time = now
|
|
511
|
+
end
|
|
512
|
+
end
|
|
513
|
+
|
|
514
|
+
output_section_stats_result(collector, time_start)
|
|
515
|
+
end
|
|
516
|
+
|
|
517
|
+
def process_section_stats_turbo(input_path, bz2_gem, num_processes)
|
|
518
|
+
require "tmpdir"
|
|
519
|
+
require "fileutils"
|
|
520
|
+
|
|
521
|
+
print_mode_banner("Section Statistics (Turbo)", {
|
|
522
|
+
"Input" => File.basename(input_path),
|
|
523
|
+
"Mode" => "Statistics collection (parallel)",
|
|
524
|
+
"CPU cores" => num_processes.to_s
|
|
525
|
+
})
|
|
526
|
+
|
|
527
|
+
time_start = Time.now
|
|
528
|
+
|
|
529
|
+
# Create temp directory for split XML files
|
|
530
|
+
temp_dir = Dir.mktmpdir("wp2txt_stats_")
|
|
531
|
+
puts pastel.cyan("Phase 1: Splitting bz2 file...")
|
|
532
|
+
puts pastel.dim(" Temp directory: #{temp_dir}")
|
|
533
|
+
puts
|
|
534
|
+
|
|
535
|
+
begin
|
|
536
|
+
# Phase 1: Split bz2 into XML files
|
|
537
|
+
$stdout.sync = true
|
|
538
|
+
splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
|
|
539
|
+
size_str = format_size(bytes_read)
|
|
540
|
+
elapsed = Time.now - time_start
|
|
541
|
+
rate = bytes_read / elapsed / 1024 / 1024
|
|
542
|
+
puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files",
|
|
543
|
+
Time.now.strftime("%H:%M:%S"),
|
|
544
|
+
size_str, rate, file_count))
|
|
545
|
+
end
|
|
546
|
+
splitter.split_file
|
|
547
|
+
xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
|
|
548
|
+
|
|
549
|
+
split_time = Time.now - time_start
|
|
550
|
+
final_size = splitter.size_read || 0
|
|
551
|
+
puts
|
|
552
|
+
puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} (#{format_duration(split_time)})")
|
|
553
|
+
puts
|
|
554
|
+
|
|
555
|
+
# Phase 2: Process XML files in parallel
|
|
556
|
+
puts pastel.cyan("Phase 2: Collecting statistics in parallel...")
|
|
557
|
+
puts
|
|
558
|
+
|
|
559
|
+
processed_count = 0
|
|
560
|
+
last_report_time = Time.now
|
|
561
|
+
|
|
562
|
+
# Process XML files in parallel and collect stats
|
|
563
|
+
partial_results = Parallel.map(
|
|
564
|
+
xml_files,
|
|
565
|
+
in_processes: num_processes,
|
|
566
|
+
finish: lambda { |_item, _index, _result|
|
|
567
|
+
processed_count += 1
|
|
568
|
+
now = Time.now
|
|
569
|
+
if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
|
|
570
|
+
last_report_time = now
|
|
571
|
+
percent = (processed_count.to_f / xml_files.size * 100).round(1)
|
|
572
|
+
elapsed = now - time_start
|
|
573
|
+
rate = processed_count / elapsed
|
|
574
|
+
remaining = xml_files.size - processed_count
|
|
575
|
+
eta = remaining > 0 && rate > 0 ? remaining / rate : 0
|
|
576
|
+
puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
|
|
577
|
+
processed_count, xml_files.size,
|
|
578
|
+
percent, rate, format_duration(eta)))
|
|
579
|
+
end
|
|
580
|
+
}
|
|
581
|
+
) do |xml_file|
|
|
582
|
+
process_xml_file_for_stats(xml_file)
|
|
583
|
+
end
|
|
584
|
+
|
|
585
|
+
# Merge all partial results
|
|
586
|
+
puts
|
|
587
|
+
puts pastel.cyan("Merging results...")
|
|
588
|
+
|
|
589
|
+
collector = Wp2txt::SectionStatsCollector.new
|
|
590
|
+
partial_results.each { |result| collector.merge(result) }
|
|
591
|
+
|
|
592
|
+
output_section_stats_result(collector, time_start)
|
|
593
|
+
ensure
|
|
594
|
+
FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
|
|
595
|
+
end
|
|
596
|
+
end
|
|
597
|
+
|
|
598
|
+
# Process a single XML file for section stats (used by turbo mode).
#
# Scans every <page> element in +xml_file+ with lightweight regexes
# (instead of a full XML parser), builds an Article for each main-namespace,
# non-redirect page, and feeds it into a local SectionStatsCollector.
#
# @param xml_file [String] path to a split XML chunk produced in Phase 1
# @return [Hash] mergeable stats (see SectionStatsCollector#to_mergeable_hash)
def process_xml_file_for_stats(xml_file)
  collector = Wp2txt::SectionStatsCollector.new
  runner = Runner.new(xml_file, File.dirname(xml_file), false, false)

  while (page_xml = runner.get_page)
    begin
      # Fast regex extraction
      title_match = TITLE_REGEX.match(page_xml)
      next unless title_match

      title = title_match[1]
      # Skip empty titles and namespaced pages (e.g. "Category:", "Talk:")
      next if title.nil? || title.empty? || title.include?(":")

      text_match = TEXT_REGEX.match(page_xml)
      next unless text_match

      text_start = text_match.begin(1)
      text_end_match = TEXT_END_REGEX.match(page_xml, text_start)
      next unless text_end_match

      text = page_xml[text_start...text_end_match.begin(0)]
      next if text.nil? || text.empty?

      # Decode the XML character entities used by MediaWiki dump <text> bodies.
      # "&amp;" is decoded LAST so that double-escaped input such as
      # "&amp;quot;" yields "&quot;" (one level) rather than '"' (two levels).
      text = text.gsub("&lt;", "<").gsub("&gt;", ">").gsub("&quot;", '"').gsub("&amp;", "&")
      next if redirect_page?(text)

      article = Article.new(text, title, false)
      collector.process(article)
    rescue StandardError
      # Best effort: a single malformed page must not abort the whole file.
      next
    end
  end

  collector.to_mergeable_hash
end
|
|
634
|
+
|
|
635
|
+
# Print the final section-statistics report.
#
# Human-readable summary lines go to STDERR so that STDOUT carries
# nothing but the JSON payload (safe to pipe into other tools).
#
# @param collector [Wp2txt::SectionStatsCollector] merged statistics
# @param time_start [Time] wall-clock start of the whole run
# @return [Integer] EXIT_SUCCESS
def output_section_stats_result(collector, time_start)
  elapsed = Time.now - time_start

  puts
  puts pastel.green("#{ICONS[:success]} Statistics collection complete!")
  puts

  # Print summary to stderr so JSON goes to stdout cleanly
  summary_lines = [
    "Total articles: #{collector.total_articles}",
    "Unique sections: #{collector.section_counts.size}",
    "Time: #{format_duration(elapsed)}"
  ]
  summary_lines.each { |line| $stderr.puts pastel.dim(line) }
  $stderr.puts

  # Output JSON to stdout
  puts collector.to_json(top_n: Wp2txt::DEFAULT_TOP_N_SECTIONS)

  EXIT_SUCCESS
end
|
|
653
|
+
|
|
654
|
+
# Parse --markers option value
# "all" -> true (all markers enabled)
# "none" -> DEPRECATED (now treated as "all" with warning)
# "math,code,chem" -> [:math, :code, :chem]
def parse_markers_option(value)
  normalized = value.to_s.downcase.strip

  case normalized
  when "all", "true", ""
    true
  when "none", "false"
    # Deprecation warning - none/false no longer removes content completely
    puts @pastel.yellow("Warning: --markers=none is deprecated and will be removed in a future version.")
    puts @pastel.yellow(" Complete removal of special content can make surrounding text nonsensical.")
    puts @pastel.yellow(" Using --markers=all instead. Markers will be shown for all special content.")
    puts
    true # Treat as "all" instead of removing content
  else
    # Parse comma-separated list, keeping only recognized marker types.
    requested = value.split(",").map { |token| token.strip.downcase.to_sym }
    requested.select { |token| Wp2txt::MARKER_TYPES.include?(token) }
  end
end
|
|
676
|
+
|
|
677
|
+
public
|
|
678
|
+
|
|
679
|
+
# Main execution method
# @return [Integer] Exit code (0=success, 1=error, 2=partial)
def run
  # Parse command line options using CLI module
  opts = Wp2txt::CLI.parse_options(ARGV)

  # Configure UI settings (color, quiet mode)
  configure_ui(no_color: opts[:no_color], quiet: opts[:quiet])
  reset_pastel! # Reset pastel to apply color settings
  @pastel = pastel # Reinitialize with new settings

  # Handle config-init
  if opts[:config_init]
    init_config
    return EXIT_SUCCESS
  end

  # Handle cache operations
  if opts[:cache_status]
    show_cache_status(opts[:cache_dir])
    return EXIT_SUCCESS
  end

  if opts[:cache_clear]
    clear_cache(opts[:cache_dir], opts[:lang])
    return EXIT_SUCCESS
  end

  # Determine input source
  if opts[:from_category] && opts[:lang]
    # Category extraction mode
    return extract_category_articles(opts)
  end

  if opts[:articles] && opts[:lang]
    # Article extraction mode
    return extract_specific_articles(opts)
  end

  # With --lang the dump is downloaded (or reused from cache); otherwise the
  # user-supplied --input path is used as-is.
  input_path = if opts[:lang]
                 download_dump(opts[:lang], opts[:cache_dir])
               else
                 opts[:input]
               end

  # Validate format option
  format = opts[:format].to_s.downcase.to_sym

  # Show deprecation warnings
  if opts[:convert_given] || opts[:del_interfile_given]
    print_warning("--convert and --del-interfile options are deprecated and will be ignored.")
    puts pastel.yellow(" Intermediate files are no longer created in v2.0+") unless quiet?
  end

  num_processes = calculate_num_processes(opts)

  # Build configuration hash from options
  config = {
    format: format,
    num_procs: num_processes,
    file_size: opts[:file_size],
    bz2_gem: opts[:bz2_gem],
    use_ractor: opts[:ractor],
    no_turbo: opts[:no_turbo]
  }

  # Copy pass-through options verbatim from parsed opts into the config hash.
  %i[title list heading table pre ref redirect multiline category category_only
     summary_only metadata_only marker extract_citations expand_templates
     section_output min_section_length skip_empty
     alias_file no_section_aliases section_stats show_matched_sections].each do |opt|
    config[opt] = opts[opt]
  end

  # Parse sections option (comma-separated string to array)
  if opts[:sections]
    config[:sections] = opts[:sections].split(",").map(&:strip).reject(&:empty?)
  end

  # Parse markers option
  config[:markers] = parse_markers_option(opts[:markers])

  # Handle section-stats mode (standalone, outputs to stdout)
  if opts[:section_stats]
    return process_section_stats(input_path, config)
  end

  # Process input - turbo mode is default for bz2 files (faster parallel decompression)
  # Use --no-turbo to disable (saves disk space but much slower)
  if input_path.end_with?(".bz2") && !opts[:no_turbo]
    if config[:use_ractor]
      # Turbo and Ractor modes are mutually exclusive; warn and continue with turbo.
      puts pastel.yellow("Note: --ractor is not supported with turbo mode. Using parallel gem instead.")
      puts pastel.yellow(" Use --no-turbo to enable Ractor-based processing.")
      puts
    end
    process_with_turbo(input_path, opts[:output_dir], config)
  else
    process_stream(input_path, opts[:output_dir], config)
  end

  EXIT_SUCCESS
end
|
|
780
|
+
|
|
781
|
+
# Show cache status
#
# Prints one entry per cached language: index/multistream sizes, freshness
# icon, and dump date. Prints a placeholder message when nothing is cached.
#
# @param cache_dir [String] directory holding cached dumps
def show_cache_status(cache_dir)
  print_mode_banner("Cache Status", { "Directory" => cache_dir })

  entries = Wp2txt::DumpManager.all_cache_status(cache_dir)

  if entries.empty?
    print_info_message("No cached dumps found.")
    return
  end

  # Render a byte count, or a dimmed placeholder when the file is absent.
  size_label = lambda do |bytes|
    bytes > 0 ? format_size(bytes) : pastel.dim("not downloaded")
  end

  entries.each do |lang, info|
    if info[:error]
      print_list_item("#{lang}: Error - #{info[:error]}", status: :error)
      next
    end

    icon = info[:fresh] ? :success : :warning

    puts pastel.bold(lang.to_s.upcase)
    print_list_item("Index: #{size_label.call(info[:index_size])}", status: icon)
    print_list_item("Multistream: #{size_label.call(info[:multistream_size])}", status: icon)
    print_info("Date", info[:dump_date] || "unknown", indent: 1)
    puts
  end
end
|
|
808
|
+
|
|
809
|
+
# Clear cache
#
# Removes cached dump files, either for a single language or for all
# languages, showing a spinner while the deletion runs.
#
# @param cache_dir [String] directory holding cached dumps
# @param lang [String, nil] language code, or nil to clear everything
def clear_cache(cache_dir, lang = nil)
  message = lang ? "Clearing cache for #{lang}..." : "Clearing all cache..."
  spinner = create_spinner(message)
  spinner.auto_spin

  if lang
    Wp2txt::DumpManager.new(lang, cache_dir: cache_dir).clear_cache!
  else
    Wp2txt::DumpManager.clear_all_cache!(cache_dir)
  end

  spinner.success(pastel.green("Done!"))
  print_success(lang ? "Cache cleared for #{lang}." : "All cache cleared.")
end
|
|
826
|
+
|
|
827
|
+
# Initialize configuration file
#
# Creates the default config file, asking for confirmation before
# overwriting an existing one, then lists the available settings.
def init_config
  path = Wp2txt::Config.default_path

  if File.exist?(path)
    print_warning("Configuration file already exists: #{path}")

    unless confirm?("Overwrite?")
      puts "Cancelled."
      return
    end
  end

  Wp2txt::Config.create_default(path, force: true)
  print_success("Configuration file created: #{path}")
  puts
  puts pastel.dim("Available settings:")

  [
    "cache.dump_expiry_days - Days before dump cache expires (default: 30)",
    "cache.category_expiry_days - Days before category cache expires (default: 7)",
    "cache.directory - Cache directory location",
    "defaults.format - Default output format (text/json)",
    "defaults.depth - Default subcategory recursion depth"
  ].each { |setting| print_list_item(setting) }
end
|
|
850
|
+
|
|
851
|
+
# Download dump for a language
#
# Checks for the latest dump date, then downloads (or reuses from cache)
# the index and multistream files for +lang+.
#
# @param lang [String] Wikipedia language code (e.g. "en")
# @param cache_dir [String] directory for cached downloads
# @return [String] path to the cached multistream .bz2 file
def download_dump(lang, cache_dir)
  settings = Wp2txt::CLI.config

  print_mode_banner("Auto-Download", {
    "Language" => lang,
    "Cache" => cache_dir
  })

  dump_manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: settings.dump_expiry_days
  )

  # Check for latest dump
  spinner = create_spinner("Checking for latest dump...")
  spinner.auto_spin
  latest_date = dump_manager.latest_dump_date
  spinner.success(pastel.green(latest_date))

  # Download index and multistream
  print_header("Downloading files")
  dump_manager.download_index
  dump_manager.download_multistream

  print_success("Download complete!")

  # Return path to multistream file
  dump_manager.cached_multistream_path
end
|
|
882
|
+
end
|
|
883
|
+
|
|
884
|
+
# Handle Ctrl+C gracefully
Signal.trap("INT") do
  # Show cursor (in case it was hidden by spinner/progress bar)
  # "\e[?25h" is the ANSI escape sequence that re-enables the terminal cursor.
  print "\e[?25h"
  puts "\n\nInterrupted by user."
  # Exit with the generic error code so wrapping scripts can detect the abort.
  exit Wp2txt::CliUI::EXIT_ERROR
end
|
|
191
891
|
|
|
192
|
-
|
|
892
|
+
# Create new instance and run with proper exit code
exit_code = WpApp.new.run
# Fall back to EXIT_SUCCESS when #run returns nil.
# NOTE(review): assumes a nil return means success — confirm no failure path
# in WpApp#run can return nil.
exit(exit_code || Wp2txt::CliUI::EXIT_SUCCESS)
|