wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/bin/wp2txt CHANGED
@@ -2,191 +2,893 @@
2
2
 
3
3
  # frozen_string_literal: true
4
4
 
5
- DEBUG_MODE = false
6
- MAX_PROCESSORS = 8
5
+ # Enable YJIT for better performance (Ruby 3.3+)
6
+ RubyVM::YJIT.enable if defined?(RubyVM::YJIT) && RubyVM::YJIT.respond_to?(:enable)
7
7
 
8
8
  require_relative "../lib/wp2txt"
9
9
  require_relative "../lib/wp2txt/utils"
10
10
  require_relative "../lib/wp2txt/version"
11
+ require_relative "../lib/wp2txt/cli"
12
+ require_relative "../lib/wp2txt/multistream"
13
+ require_relative "../lib/wp2txt/cli_ui"
14
+ require_relative "../lib/wp2txt/formatter"
15
+ require_relative "../lib/wp2txt/extractor"
16
+ require_relative "../lib/wp2txt/ractor_worker"
11
17
 
12
18
  require "etc"
19
+ require "json"
13
20
  require "optimist"
14
21
  require "parallel"
15
22
  require "pastel"
16
23
  require "tty-spinner"
24
+ require "tty-progressbar"
17
25
 
18
26
  class WpApp
19
27
  include Wp2txt
28
+ include Wp2txt::CliUI
29
+ include Wp2txt::Formatter
30
+ include Wp2txt::Extractor
20
31
 
21
- def run
22
- opts = Optimist.options do
23
- version VERSION
24
- banner <<~BANNER
25
- WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
26
-
27
- Usage: wp2txt [options]
28
- where [options] are:
29
- BANNER
30
-
31
- opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
32
- opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
33
- opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
34
- opt :category, "Show article category information", default: true, short: "-a"
35
- opt :category_only, "Extract only article title and categories", default: false, short: "-g"
36
- opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
37
- opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
38
- opt :num_procs, "Number of proccesses (up to #{MAX_PROCESSORS}) to be run concurrently (default: max num of CPU cores minus two)", type: Integer, short: "-n"
39
- opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
40
- opt :title, "Keep page titles in output", default: true, short: "-t"
41
- opt :heading, "Keep section titles in output", default: true, short: "-d"
42
- opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
43
- opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
44
- opt :redirect, "Show redirect destination", default: false, short: "-e"
45
- opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
46
- opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
47
- end
48
-
49
- Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
50
- Optimist.die :input, "must exist" unless File.exist?(opts[:input])
51
- Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
52
-
53
- pastel = Pastel.new
54
-
55
- input_file = opts[:input]
56
- output_dir = opts[:output_dir]
57
- tfile_size = opts[:file_size]
58
- num_processors = Etc.nprocessors
59
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors && opts[:num_procs].to_i <= MAX_PROCESSORS
60
- opts[:num_procs]
61
- else
62
- minus2 = num_processors - 2
63
- minus2 < MAX_PROCESSORS ? minus2 : MAX_PROCESSORS
64
- end
65
- num_processes = 1 if num_processes < 1
66
-
67
- convert = opts[:convert]
68
- strip_tmarker = opts[:marker] ? false : true
69
- opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
70
-
71
- config = {}
72
- opt_array.each do |opt|
73
- config[opt] = opts[opt]
74
- end
32
+ # Debug mode flag
33
+ DEBUG_MODE = false
34
+
35
+ def initialize
36
+ @pastel = Pastel.new
37
+ end
38
+
39
+ private
40
+
41
+ # Calculate the number of processes to be used for parallel processing
42
+ # Uses MemoryMonitor to determine optimal parallelism based on CPU and memory
43
+ def calculate_num_processes(opts)
44
+ optimal = Wp2txt::MemoryMonitor.optimal_processes
75
45
 
76
- if File.ftype(input_file) == "directory"
77
- input_files = Dir.glob("#{input_file}/*.xml")
46
+ if opts[:num_procs]
47
+ # User specified a value - use it if reasonable
48
+ requested = opts[:num_procs].to_i
49
+ max_allowed = Etc.nprocessors
50
+ [requested, max_allowed, 1].max == requested ? requested : optimal
78
51
  else
79
- puts ""
80
- puts pastel.green.bold("Preprocessing")
81
- puts "Decompressing and splitting the original dump file."
82
- puts pastel.underline("This may take a while. Please be patient!")
83
-
84
- time_start = Time.now.to_i
85
- wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
86
- spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
87
- spinner.auto_spin
88
- wpsplitter.split_file
89
- time_finish = Time.now.to_i
90
-
91
- spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
92
- puts pastel.blue.bold("Complete!")
93
- exit unless convert
94
- input_files = Dir.glob("#{output_dir}/*.xml")
95
- end
96
-
97
- puts ""
98
- puts pastel.red.bold("Converting")
99
- puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
100
- puts "Number of CPU cores being used: " + pastel.bold(num_processes.to_s)
101
-
102
- Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
103
- wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
104
- wpconv.extract_text do |article|
105
- article.title = format_wiki(article.title, config)
106
-
107
- if config[:category_only]
108
- title = "#{article.title}\t"
109
- contents = article.categories.join(", ")
110
- contents << "\n"
111
- elsif config[:category] && !article.categories.empty?
112
- title = "\n[[#{article.title}]]\n\n"
113
- contents = +"\nCATEGORIES: "
114
- contents << article.categories.join(", ")
115
- contents << "\n\n"
116
- else
117
- title = "\n[[#{article.title}]]\n\n"
118
- contents = +""
119
- end
52
+ optimal
53
+ end.tap { |n| n = 1 if n < 1 }
54
+ end
55
+
56
+ # Process articles using turbo mode (split-first architecture from v1.x)
57
+ # This splits the bz2 file into XML chunks first, then processes in parallel
58
+ # Much faster for large dumps due to parallel decompression benefit
59
+ def process_with_turbo(input_path, output_dir, config)
60
+ require "tmpdir"
61
+ require "fileutils"
62
+
63
+ num_processes = config[:num_procs]
64
+ file_size_mb = config[:file_size]
65
+ format = config[:format]
66
+ bz2_gem = config[:bz2_gem]
67
+
68
+ # Determine base name for output files
69
+ base_name = File.basename(input_path, ".*")
70
+ base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
71
+
72
+ # Get input file size for display
73
+ input_size = File.size(input_path) rescue 0
74
+ input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
75
+
76
+ print_mode_banner("Turbo Mode Processing", {
77
+ "Input" => File.basename(input_path),
78
+ "Size" => input_size_str,
79
+ "Format" => format.to_s,
80
+ "CPU cores" => num_processes.to_s,
81
+ "Mode" => "Split-first (parallel decompression)"
82
+ })
83
+
84
+ time_start = Time.now
85
+
86
+ # Create temp directory for split XML files
87
+ temp_dir = Dir.mktmpdir("wp2txt_turbo_")
88
+ puts pastel.cyan("Phase 1: Splitting bz2 file into XML chunks...")
89
+ puts pastel.dim(" Temp directory: #{temp_dir}")
90
+ puts
91
+
92
+ begin
93
+ # Phase 1: Split bz2 into XML files using Splitter
94
+ # Split into 10MB chunks for good parallelism
95
+ $stdout.sync = true
96
+ splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
97
+ # Progress callback - called every 5 seconds
98
+ size_str = format_size(bytes_read)
99
+ elapsed = Time.now - time_start
100
+ rate = bytes_read / elapsed / 1024 / 1024 # MB/s
101
+ puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files created",
102
+ Time.now.strftime("%H:%M:%S"),
103
+ size_str,
104
+ rate,
105
+ file_count))
106
+ end
107
+ splitter.split_file
108
+ xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
109
+
110
+ split_time = Time.now - time_start
111
+ final_size = splitter.size_read || 0
112
+ puts
113
+ puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} decompressed (#{format_duration(split_time)})")
114
+ puts
115
+
116
+ # Phase 2: Process XML files in parallel and write output directly
117
+ puts pastel.cyan("Phase 2: Processing XML files in parallel...")
118
+ puts pastel.dim(" Using #{num_processes} parallel processes")
119
+ puts
120
+
121
+ strip_tmarker = !config[:marker]
122
+
123
+ # Each parallel process writes to its own temp output file
124
+ # This avoids memory accumulation and enables streaming output
125
+ $stdout.sync = true
126
+ processed_count = 0
127
+ last_report_time = Time.now
128
+ temp_output_dir = File.join(temp_dir, "output")
129
+ FileUtils.mkdir_p(temp_output_dir)
130
+
131
+ # Process XML files in parallel - each writes its own output
132
+ article_counts = Parallel.map(
133
+ xml_files.each_with_index.to_a,
134
+ in_processes: num_processes,
135
+ finish: lambda { |_item, _index, _result|
136
+ processed_count += 1
137
+ now = Time.now
138
+ if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
139
+ last_report_time = now
140
+ percent = (processed_count.to_f / xml_files.size * 100).round(1)
141
+ elapsed = now - time_start
142
+ rate = processed_count / elapsed
143
+ remaining = xml_files.size - processed_count
144
+ eta = remaining > 0 && rate > 0 ? remaining / rate : 0
145
+ puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
146
+ processed_count, xml_files.size,
147
+ percent, rate,
148
+ format_duration(eta)))
149
+ end
150
+ }
151
+ ) do |xml_file, idx|
152
+ # Each process writes directly to its own temp file
153
+ temp_output_file = File.join(temp_output_dir, "part_#{idx.to_s.rjust(5, '0')}.txt")
154
+ process_xml_file_and_write(xml_file, temp_output_file, config, strip_tmarker, format)
155
+ end
120
156
 
121
- unless config[:category_only]
122
- article.elements.each do |e|
123
- case e.first
124
- when :mw_heading
125
- break if config[:summary_only]
126
- next unless config[:heading]
127
-
128
- e[-1] = format_wiki(e.last, config)
129
- line = e.last
130
- line << "+HEADING+" if DEBUG_MODE
131
- when :mw_paragraph
132
- e[-1] = format_wiki(e.last, config)
133
- line = e.last + "\n"
134
- line << "+PARAGRAPH+" if DEBUG_MODE
135
- when :mw_table, :mw_htable
136
- next unless config[:table]
137
-
138
- line = e.last
139
- line << "+TABLE+" if DEBUG_MODE
140
- when :mw_pre
141
- next unless config[:pre]
142
-
143
- line = e.last
144
- line << "+PRE+" if DEBUG_MODE
145
- when :mw_quote
146
- line = e.last
147
- line << "+QUOTE+" if DEBUG_MODE
148
- when :mw_unordered, :mw_ordered, :mw_definition
149
- next unless config[:list]
150
-
151
- line = e.last
152
- line << "+LIST+" if DEBUG_MODE
153
- when :mw_ml_template
154
- next unless config[:multiline]
155
-
156
- line = e.last
157
- line << "+MLTEMPLATE+" if DEBUG_MODE
158
- when :mw_redirect
159
- next unless config[:redirect]
160
-
161
- line = e.last
162
- line << "+REDIRECT+" if DEBUG_MODE
163
- line << "\n\n"
164
- when :mw_isolated_template
165
- next unless config[:multiline]
166
-
167
- line = e.last
168
- line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
169
- when :mw_isolated_tag
170
- next
171
- else
172
- next unless DEBUG_MODE
173
-
174
- line = e.last
175
- line << "+OTHER+"
176
- end
177
- contents << line << "\n"
157
+ total_articles = article_counts.sum
158
+
159
+ # Phase 3: Merge temp output files into final output (streaming)
160
+ puts
161
+ puts pastel.cyan("Merging output files...")
162
+
163
+ temp_files = Dir.glob(File.join(temp_output_dir, "part_*.txt")).sort
164
+ writer = OutputWriter.new(
165
+ output_dir: output_dir,
166
+ base_name: base_name,
167
+ format: format,
168
+ file_size_mb: file_size_mb
169
+ )
170
+
171
+ temp_files.each do |temp_file|
172
+ next if File.size(temp_file).zero?
173
+ # Stream copy instead of loading entire file into memory
174
+ writer.write_from_file(temp_file)
175
+ end
176
+
177
+ output_files = writer.close
178
+
179
+ time_elapsed = Time.now - time_start
180
+ puts
181
+ puts pastel.green("#{ICONS[:success]} Processing complete!")
182
+
183
+ print_summary("Turbo Processing Complete", {
184
+ "XML files processed" => xml_files.size.to_s,
185
+ "Articles" => total_articles.to_s,
186
+ "Output files" => output_files.size.to_s,
187
+ "Time" => format_duration(time_elapsed)
188
+ }, status: :success)
189
+
190
+ puts
191
+ puts pastel.dim("Output files:")
192
+ output_files.each { |f| print_list_item(f, status: :success) }
193
+ ensure
194
+ # Cleanup temp directory
195
+ FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
196
+ end
197
+ end
198
+
199
+ # Regex patterns for fast XML extraction (avoid full DOM parsing)
200
+ TITLE_REGEX = %r{<title>([^<]*)</title>}m
201
+ TEXT_REGEX = %r{<text[^>]*>(.*)$}m
202
+ TEXT_END_REGEX = %r{</text>}
203
+
204
+ # Process a single XML file and write directly to output file
205
+ # Returns the number of articles processed
206
+ def process_xml_file_and_write(xml_file, output_file, config, strip_tmarker, format)
207
+ article_count = 0
208
+ runner = Runner.new(xml_file, File.dirname(xml_file), strip_tmarker, false)
209
+
210
+ File.open(output_file, "w") do |out|
211
+ while (page_xml = runner.get_page)
212
+ begin
213
+ # Fast regex extraction instead of full Nokogiri DOM parsing
214
+ title_match = TITLE_REGEX.match(page_xml)
215
+ next unless title_match
216
+
217
+ title = title_match[1]
218
+ next if title.nil? || title.empty? || title.include?(":")
219
+
220
+ # Extract text content
221
+ text_match = TEXT_REGEX.match(page_xml)
222
+ next unless text_match
223
+
224
+ # Find end of text and extract content
225
+ text_start = text_match.begin(1)
226
+ text_end_match = TEXT_END_REGEX.match(page_xml, text_start)
227
+ next unless text_end_match
228
+
229
+ text = page_xml[text_start...text_end_match.begin(0)]
230
+ next if text.nil? || text.empty?
231
+
232
+ # Decode XML entities
233
+ text = text.gsub("&lt;", "<").gsub("&gt;", ">").gsub("&amp;", "&").gsub("&quot;", '"')
234
+
235
+ # Remove HTML comments
236
+ text.gsub!(/<!--(.*?)-->/m) do |content|
237
+ num_of_newlines = content.count("\n")
238
+ num_of_newlines.zero? ? +"" : "\n" * num_of_newlines
178
239
  end
240
+
241
+ next if redirect_page?(text)
242
+
243
+ article = Article.new(text, title, strip_tmarker)
244
+ result = format_article(article, config)
245
+ next unless result
246
+
247
+ # Write directly to file
248
+ if format == :json
249
+ out.puts(result.to_json)
250
+ else
251
+ out.puts(result)
252
+ end
253
+ article_count += 1
254
+ rescue StandardError
255
+ next
179
256
  end
257
+ end
258
+ end
259
+
260
+ article_count
261
+ end
180
262
 
181
- if /\A[\s ]*\z/m =~ contents
182
- ""
263
+ # Fast redirect detection (same as in stream_processor)
264
+ def redirect_page?(text)
265
+ return false if text.nil? || text.empty?
266
+ first_part = text[0, 200]
267
+ return false unless first_part
268
+ stripped = first_part.lstrip
269
+ return false unless stripped.start_with?("#", "#")
270
+ stripped.include?("[[")
271
+ end
272
+
273
+ # Process articles using streaming (new architecture)
274
+ def process_stream(input_path, output_dir, config)
275
+ num_processes = config[:num_procs]
276
+ file_size_mb = config[:file_size]
277
+ format = config[:format]
278
+ bz2_gem = config[:bz2_gem]
279
+
280
+ # Determine base name for output files
281
+ base_name = File.basename(input_path, ".*")
282
+ base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
283
+
284
+ # Create stream processor
285
+ stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
286
+
287
+ # Create output writer
288
+ writer = OutputWriter.new(
289
+ output_dir: output_dir,
290
+ base_name: base_name,
291
+ format: format,
292
+ file_size_mb: file_size_mb
293
+ )
294
+
295
+ # Collect pages for parallel processing
296
+ pages = []
297
+ page_count = 0
298
+
299
+ # Determine parallelism mode
300
+ use_ractor = config[:use_ractor] && Wp2txt::RactorWorker.available?
301
+ parallel_mode = use_ractor ? "Ractor (experimental)" : "Parallel (processes)"
302
+
303
+ # Show warning for experimental Ractor mode
304
+ if config[:use_ractor]
305
+ if use_ractor
306
+ print_warning("Ractor mode is experimental and may be unstable.")
307
+ puts pastel.yellow(" If processing hangs, restart without --ractor option.") unless quiet?
308
+ else
309
+ print_warning("Ractor not available on this Ruby version. Using Parallel gem.")
310
+ end
311
+ end
312
+
313
+ # Get input file size for progress estimation
314
+ input_size = File.size(input_path) rescue 0
315
+ input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
316
+
317
+ # Estimate total articles for ETA calculation
318
+ estimated_total = estimate_total_articles(input_path)
319
+ estimated_total_str = estimated_total ? "~#{(estimated_total / 1_000_000.0).round(1)}M" : "unknown"
320
+
321
+ print_mode_banner("Full Dump Processing", {
322
+ "Input" => File.basename(input_path),
323
+ "Size" => input_size_str,
324
+ "Articles (est.)" => estimated_total_str,
325
+ "Format" => format.to_s,
326
+ "CPU cores" => num_processes.to_s,
327
+ "Parallel" => parallel_mode,
328
+ "Skip redirects" => "yes"
329
+ })
330
+
331
+ # Ensure output is not buffered (important for piped output)
332
+ $stdout.sync = true
333
+
334
+ time_start = Time.now
335
+ last_progress_time = time_start
336
+ last_progress_count = 0
337
+ batch_count = 0
338
+
339
+ # Progress reporting interval (seconds)
340
+ progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
341
+
342
+ # Process in batches for memory efficiency
343
+ batch_size = num_processes * 100
344
+ strip_tmarker = !config[:marker]
345
+
346
+ # Show initial progress message
347
+ puts pastel.cyan("Processing started at #{time_start.strftime('%H:%M:%S')}")
348
+ if estimated_total
349
+ puts pastel.dim("Progress updates every #{progress_interval} seconds (with ETA)...")
350
+ else
351
+ puts pastel.dim("Progress updates every #{progress_interval} seconds...")
352
+ end
353
+ puts
354
+
355
+ stream.each_page do |title, text|
356
+ pages << [title, text]
357
+ page_count += 1
358
+
359
+ # Process batch when full
360
+ next unless pages.size >= batch_size
361
+
362
+ process_batch(pages, writer, config, strip_tmarker, num_processes)
363
+ pages.clear
364
+ batch_count += 1
365
+
366
+ # Show progress every N seconds
367
+ now = Time.now
368
+ elapsed_since_update = now - last_progress_time
369
+ if elapsed_since_update >= progress_interval
370
+ elapsed_total = now - time_start
371
+ articles_per_sec = (page_count - last_progress_count) / elapsed_since_update
372
+ output_count = writer.file_count rescue batch_count
373
+
374
+ # Calculate ETA
375
+ eta_seconds = calculate_eta(page_count, estimated_total, elapsed_total)
376
+ eta_str = format_eta(eta_seconds)
377
+
378
+ # Calculate progress percentage if total is known
379
+ if estimated_total && estimated_total > 0
380
+ percent = (page_count.to_f / estimated_total * 100).round(1)
381
+ progress_line = format(
382
+ " [%s] %s articles (%s%%) | %s/sec | %s files | Elapsed: %s | ETA: %s",
383
+ now.strftime("%H:%M:%S"),
384
+ page_count.to_s.rjust(8),
385
+ percent.to_s.rjust(5),
386
+ articles_per_sec.round(1).to_s.rjust(6),
387
+ output_count.to_s.rjust(4),
388
+ format_duration(elapsed_total),
389
+ eta_str
390
+ )
183
391
  else
184
- config[:title] ? title << contents : contents
392
+ progress_line = format(
393
+ " [%s] %s articles | %s/sec | %s files | Elapsed: %s",
394
+ now.strftime("%H:%M:%S"),
395
+ page_count.to_s.rjust(8),
396
+ articles_per_sec.round(1).to_s.rjust(6),
397
+ output_count.to_s.rjust(4),
398
+ format_duration(elapsed_total)
399
+ )
185
400
  end
401
+ puts pastel.dim(progress_line)
402
+
403
+ last_progress_time = now
404
+ last_progress_count = page_count
405
+ end
406
+ end
407
+
408
+ # Process remaining pages
409
+ process_batch(pages, writer, config, strip_tmarker, num_processes) unless pages.empty?
410
+
411
+ # Close output
412
+ output_files = writer.close
413
+
414
+ # Get redirect skip count
415
+ redirects_skipped = stream.redirects_skipped
416
+
417
+ time_elapsed = Time.now - time_start
418
+ puts
419
+ puts pastel.green("#{ICONS[:success]} Processing complete!")
420
+
421
+ # Summary
422
+ summary_data = {
423
+ "Articles" => page_count.to_s,
424
+ "Output files" => output_files.size.to_s,
425
+ "Time" => format_duration(time_elapsed)
426
+ }
427
+ summary_data["Redirects skipped"] = redirects_skipped.to_s if redirects_skipped > 0
428
+
429
+ print_summary("Processing Complete", summary_data, status: :success)
430
+
431
+ puts
432
+ puts pastel.dim("Output files:")
433
+ output_files.each { |f| print_list_item(f, status: :success) }
434
+ end
435
+
436
+ # Process a batch of pages in parallel
437
+ # Uses Ractor for true parallelism when enabled, otherwise falls back to Parallel gem
438
+ def process_batch(pages, writer, config, strip_tmarker, num_processes)
439
+ results = if config[:use_ractor] && Wp2txt::RactorWorker.available?
440
+ # Use Ractor-based parallel processing (true parallelism)
441
+ Wp2txt::RactorWorker.process_articles(
442
+ pages,
443
+ config: config,
444
+ strip_tmarker: strip_tmarker,
445
+ num_workers: num_processes
446
+ )
447
+ else
448
+ # Fall back to Parallel gem (process-based parallelism)
449
+ Parallel.map(pages, in_processes: num_processes) do |title, text|
450
+ article = Article.new(text, title, strip_tmarker)
451
+ format_article(article, config)
452
+ end
453
+ end
454
+
455
+ results.each do |result|
456
+ writer.write(result) if result
457
+ end
458
+ end
459
+
460
+ # Process section statistics mode
461
+ # Collects section heading statistics and outputs JSON to stdout
462
+ def process_section_stats(input_path, config)
463
+ require_relative "../lib/wp2txt/section_extractor"
464
+
465
+ bz2_gem = config[:bz2_gem]
466
+ no_turbo = config[:no_turbo]
467
+ num_processes = config[:num_procs]
468
+
469
+ # Use turbo mode for bz2 files unless disabled
470
+ if input_path.end_with?(".bz2") && !no_turbo
471
+ process_section_stats_turbo(input_path, bz2_gem, num_processes)
472
+ else
473
+ process_section_stats_stream(input_path, bz2_gem)
474
+ end
475
+ end
476
+
477
+ def process_section_stats_stream(input_path, bz2_gem)
478
+ print_mode_banner("Section Statistics", {
479
+ "Input" => File.basename(input_path),
480
+ "Mode" => "Statistics collection (streaming)"
481
+ })
482
+
483
+ puts pastel.cyan("Collecting section statistics...")
484
+ puts pastel.dim("This may take a while for large dumps.")
485
+ puts
486
+
487
+ # Create stream processor and stats collector
488
+ stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
489
+ collector = Wp2txt::SectionStatsCollector.new
490
+
491
+ time_start = Time.now
492
+ last_progress_time = time_start
493
+ progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
494
+
495
+ # Process pages without full text processing (just extract headings)
496
+ stream.each_page do |title, text|
497
+ # Create minimal article just for heading extraction
498
+ article = Article.new(text, title, false)
499
+ collector.process(article)
500
+
501
+ # Show progress periodically
502
+ now = Time.now
503
+ if now - last_progress_time >= progress_interval
504
+ elapsed = now - time_start
505
+ rate = collector.total_articles / elapsed
506
+ puts pastel.dim(format(" [%s] %d articles processed (%.1f/sec)",
507
+ now.strftime("%H:%M:%S"),
508
+ collector.total_articles,
509
+ rate))
510
+ last_progress_time = now
511
+ end
512
+ end
513
+
514
+ output_section_stats_result(collector, time_start)
515
+ end
516
+
517
+ def process_section_stats_turbo(input_path, bz2_gem, num_processes)
518
+ require "tmpdir"
519
+ require "fileutils"
520
+
521
+ print_mode_banner("Section Statistics (Turbo)", {
522
+ "Input" => File.basename(input_path),
523
+ "Mode" => "Statistics collection (parallel)",
524
+ "CPU cores" => num_processes.to_s
525
+ })
526
+
527
+ time_start = Time.now
528
+
529
+ # Create temp directory for split XML files
530
+ temp_dir = Dir.mktmpdir("wp2txt_stats_")
531
+ puts pastel.cyan("Phase 1: Splitting bz2 file...")
532
+ puts pastel.dim(" Temp directory: #{temp_dir}")
533
+ puts
534
+
535
+ begin
536
+ # Phase 1: Split bz2 into XML files
537
+ $stdout.sync = true
538
+ splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
539
+ size_str = format_size(bytes_read)
540
+ elapsed = Time.now - time_start
541
+ rate = bytes_read / elapsed / 1024 / 1024
542
+ puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files",
543
+ Time.now.strftime("%H:%M:%S"),
544
+ size_str, rate, file_count))
545
+ end
546
+ splitter.split_file
547
+ xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
548
+
549
+ split_time = Time.now - time_start
550
+ final_size = splitter.size_read || 0
551
+ puts
552
+ puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} (#{format_duration(split_time)})")
553
+ puts
554
+
555
+ # Phase 2: Process XML files in parallel
556
+ puts pastel.cyan("Phase 2: Collecting statistics in parallel...")
557
+ puts
558
+
559
+ processed_count = 0
560
+ last_report_time = Time.now
561
+
562
+ # Process XML files in parallel and collect stats
563
+ partial_results = Parallel.map(
564
+ xml_files,
565
+ in_processes: num_processes,
566
+ finish: lambda { |_item, _index, _result|
567
+ processed_count += 1
568
+ now = Time.now
569
+ if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
570
+ last_report_time = now
571
+ percent = (processed_count.to_f / xml_files.size * 100).round(1)
572
+ elapsed = now - time_start
573
+ rate = processed_count / elapsed
574
+ remaining = xml_files.size - processed_count
575
+ eta = remaining > 0 && rate > 0 ? remaining / rate : 0
576
+ puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
577
+ processed_count, xml_files.size,
578
+ percent, rate, format_duration(eta)))
579
+ end
580
+ }
581
+ ) do |xml_file|
582
+ process_xml_file_for_stats(xml_file)
583
+ end
584
+
585
+ # Merge all partial results
586
+ puts
587
+ puts pastel.cyan("Merging results...")
588
+
589
+ collector = Wp2txt::SectionStatsCollector.new
590
+ partial_results.each { |result| collector.merge(result) }
591
+
592
+ output_section_stats_result(collector, time_start)
593
+ ensure
594
+ FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
595
+ end
596
+ end
597
+
598
# Worker-side statistics pass over one split XML file (used by turbo mode).
# Streams every page from the file, feeds plain main-namespace articles into
# a local collector, and returns the collector as a plain mergeable hash so
# results can cross process boundaries.
def process_xml_file_for_stats(xml_file)
  stats = Wp2txt::SectionStatsCollector.new
  reader = Runner.new(xml_file, File.dirname(xml_file), false, false)

  loop do
    page_xml = reader.get_page
    break unless page_xml

    begin
      collect_page_section_stats(page_xml, stats)
    rescue StandardError
      # A single malformed page must not abort the whole worker.
      next
    end
  end

  stats.to_mergeable_hash
end

# Extract one page's title/body via fast regexes and feed the article to the
# collector. Returns silently when the page is not a plain article (namespaced
# title, missing/empty body, or a redirect).
def collect_page_section_stats(page_xml, stats)
  matched_title = TITLE_REGEX.match(page_xml)
  return unless matched_title

  page_title = matched_title[1]
  # Titles containing ":" are treated as namespaced (File:, Category:, ...).
  return if page_title.nil? || page_title.empty? || page_title.include?(":")

  opening = TEXT_REGEX.match(page_xml)
  return unless opening

  body_begin = opening.begin(1)
  closing = TEXT_END_REGEX.match(page_xml, body_begin)
  return unless closing

  body = page_xml[body_begin...closing.begin(0)]
  return if body.nil? || body.empty?

  # Undo the XML escaping applied inside <text> elements.
  body = body.gsub("&lt;", "<").gsub("&gt;", ">").gsub("&amp;", "&").gsub("&quot;", '"')
  return if redirect_page?(body)

  stats.process(Article.new(body, page_title, false))
end
634
+
635
# Emit the section-stats run result: a human-readable summary on stderr and
# the JSON report on stdout, then return the success exit code.
def output_section_stats_result(collector, time_start)
  elapsed = Time.now - time_start

  puts
  puts pastel.green("#{ICONS[:success]} Statistics collection complete!")
  puts

  # Human-readable summary goes to stderr so stdout stays pure JSON.
  summary = [
    "Total articles: #{collector.total_articles}",
    "Unique sections: #{collector.section_counts.size}",
    "Time: #{format_duration(elapsed)}"
  ]
  summary.each { |line| $stderr.puts pastel.dim(line) }
  $stderr.puts

  # Machine-readable report goes to stdout.
  puts collector.to_json(top_n: Wp2txt::DEFAULT_TOP_N_SECTIONS)

  EXIT_SUCCESS
end
653
+
654
# Parse the --markers option value.
#
#   "all" / "true" / "" -> true (all markers enabled)
#   "none" / "false"    -> DEPRECATED: warns and behaves like "all"
#   "math,code,chem"    -> [:math, :code, :chem] (unrecognized names dropped)
#
# @param value [String, nil] raw option value (nil is treated as "")
# @return [true, Array<Symbol>] true to enable all markers, otherwise the
#   subset of recognized marker type symbols
def parse_markers_option(value)
  case value.to_s.downcase.strip
  when "all", "true", ""
    true
  when "none", "false"
    # Deprecation warning - none/false no longer removes content completely.
    # Use the `pastel` accessor (not @pastel directly) for consistency with
    # the rest of the class and to pick up lazy initialization.
    puts pastel.yellow("Warning: --markers=none is deprecated and will be removed in a future version.")
    puts pastel.yellow(" Complete removal of special content can make surrounding text nonsensical.")
    puts pastel.yellow(" Using --markers=all instead. Markers will be shown for all special content.")
    puts
    true # Treat as "all" instead of removing content
  else
    # Parse comma-separated list, keeping only recognized marker types
    value.split(",").map { |m| m.strip.downcase.to_sym }.select do |m|
      Wp2txt::MARKER_TYPES.include?(m)
    end
  end
end
676
+
677
public

# Main execution method: parses CLI options, configures the UI, then
# dispatches to one of the operating modes (config init, cache ops,
# category/article extraction, section stats, or full dump conversion).
# Dispatch order matters: UI configuration must happen before any banner
# is printed, and input resolution must happen before section-stats mode.
# @return [Integer] Exit code (0=success, 1=error, 2=partial)
def run
  # Parse command line options using CLI module
  opts = Wp2txt::CLI.parse_options(ARGV)

  # Configure UI settings (color, quiet mode)
  configure_ui(no_color: opts[:no_color], quiet: opts[:quiet])
  reset_pastel! # Reset pastel to apply color settings
  @pastel = pastel # Reinitialize with new settings

  # Handle config-init (standalone mode; exits immediately)
  if opts[:config_init]
    init_config
    return EXIT_SUCCESS
  end

  # Handle cache operations (standalone modes; exit immediately)
  if opts[:cache_status]
    show_cache_status(opts[:cache_dir])
    return EXIT_SUCCESS
  end

  if opts[:cache_clear]
    clear_cache(opts[:cache_dir], opts[:lang])
    return EXIT_SUCCESS
  end

  # Determine input source: both extraction modes require --lang
  if opts[:from_category] && opts[:lang]
    # Category extraction mode
    return extract_category_articles(opts)
  end

  if opts[:articles] && opts[:lang]
    # Article extraction mode
    return extract_specific_articles(opts)
  end

  # With --lang the dump is auto-downloaded; otherwise use the explicit input
  input_path = if opts[:lang]
                 download_dump(opts[:lang], opts[:cache_dir])
               else
                 opts[:input]
               end

  # Normalize the format option to a lowercase symbol.
  # NOTE(review): no validation against a list of supported formats happens
  # here — unknown values pass through unchanged.
  format = opts[:format].to_s.downcase.to_sym

  # Show deprecation warnings
  if opts[:convert_given] || opts[:del_interfile_given]
    print_warning("--convert and --del-interfile options are deprecated and will be ignored.")
    puts pastel.yellow(" Intermediate files are no longer created in v2.0+") unless quiet?
  end

  num_processes = calculate_num_processes(opts)

  # Build configuration hash from options
  config = {
    format: format,
    num_procs: num_processes,
    file_size: opts[:file_size],
    bz2_gem: opts[:bz2_gem],
    use_ractor: opts[:ractor],
    no_turbo: opts[:no_turbo]
  }

  # Pass-through options copied verbatim from opts into config
  %i[title list heading table pre ref redirect multiline category category_only
     summary_only metadata_only marker extract_citations expand_templates
     section_output min_section_length skip_empty
     alias_file no_section_aliases section_stats show_matched_sections].each do |opt|
    config[opt] = opts[opt]
  end

  # Parse sections option (comma-separated string to array)
  if opts[:sections]
    config[:sections] = opts[:sections].split(",").map(&:strip).reject(&:empty?)
  end

  # Parse markers option ("all"/"none"/comma list -> true or Array<Symbol>)
  config[:markers] = parse_markers_option(opts[:markers])

  # Handle section-stats mode (standalone, outputs to stdout)
  if opts[:section_stats]
    return process_section_stats(input_path, config)
  end

  # Process input - turbo mode is default for bz2 files (faster parallel decompression)
  # Use --no-turbo to disable (saves disk space but much slower)
  if input_path.end_with?(".bz2") && !opts[:no_turbo]
    if config[:use_ractor]
      # Ractor and turbo are mutually exclusive; turbo wins by default
      puts pastel.yellow("Note: --ractor is not supported with turbo mode. Using parallel gem instead.")
      puts pastel.yellow(" Use --no-turbo to enable Ractor-based processing.")
      puts
    end
    process_with_turbo(input_path, opts[:output_dir], config)
  else
    process_stream(input_path, opts[:output_dir], config)
  end

  EXIT_SUCCESS
end
780
+
781
# Display cache status (file sizes, freshness, dump date) for every
# language found under the given cache directory.
def show_cache_status(cache_dir)
  print_mode_banner("Cache Status", { "Directory" => cache_dir })

  report = Wp2txt::DumpManager.all_cache_status(cache_dir)

  if report.empty?
    print_info_message("No cached dumps found.")
    return
  end

  report.each do |lang, details|
    if details[:error]
      print_list_item("#{lang}: Error - #{details[:error]}", status: :error)
    else
      print_language_cache_entry(lang, details)
    end
  end
end

# Render one language's cache entry: index/multistream sizes with a
# freshness icon, followed by the dump date.
def print_language_cache_entry(lang, details)
  index_size = details[:index_size] > 0 ? format_size(details[:index_size]) : pastel.dim("not downloaded")
  multistream_size = details[:multistream_size] > 0 ? format_size(details[:multistream_size]) : pastel.dim("not downloaded")
  freshness_icon = details[:fresh] ? :success : :warning

  puts pastel.bold(lang.to_s.upcase)
  print_list_item("Index: #{index_size}", status: freshness_icon)
  print_list_item("Multistream: #{multistream_size}", status: freshness_icon)
  print_info("Date", details[:dump_date] || "unknown", indent: 1)
  puts
end
808
+
809
# Clear cached dump files: only the given language when lang is provided,
# otherwise the entire cache directory.
def clear_cache(cache_dir, lang = nil)
  if lang
    run_with_spinner("Clearing cache for #{lang}...") do
      Wp2txt::DumpManager.new(lang, cache_dir: cache_dir).clear_cache!
    end
    print_success("Cache cleared for #{lang}.")
  else
    run_with_spinner("Clearing all cache...") do
      Wp2txt::DumpManager.clear_all_cache!(cache_dir)
    end
    print_success("All cache cleared.")
  end
end

# Show a spinner with the given message while the block runs, then mark it
# as finished successfully.
def run_with_spinner(message)
  spinner = create_spinner(message)
  spinner.auto_spin
  yield
  spinner.success(pastel.green("Done!"))
end
826
+
827
# Create a default configuration file at the standard location, asking for
# confirmation before overwriting an existing one.
def init_config
  config_path = Wp2txt::Config.default_path

  if File.exist?(config_path)
    print_warning("Configuration file already exists: #{config_path}")

    unless confirm?("Overwrite?")
      puts "Cancelled."
      return
    end
  end

  Wp2txt::Config.create_default(config_path, force: true)
  print_success("Configuration file created: #{config_path}")
  puts
  puts pastel.dim("Available settings:")

  # Short reference of the keys users are most likely to tweak.
  settings_help = [
    "cache.dump_expiry_days - Days before dump cache expires (default: 30)",
    "cache.category_expiry_days - Days before category cache expires (default: 7)",
    "cache.directory - Cache directory location",
    "defaults.format - Default output format (text/json)",
    "defaults.depth - Default subcategory recursion depth"
  ]
  settings_help.each { |line| print_list_item(line) }
end
850
+
851
# Make sure the multistream dump for the given language is present in the
# local cache (downloading index + multistream as needed) and return the
# path to the cached multistream file.
def download_dump(lang, cache_dir)
  app_config = Wp2txt::CLI.config

  print_mode_banner("Auto-Download", {
    "Language" => lang,
    "Cache" => cache_dir
  })

  dump_manager = Wp2txt::DumpManager.new(
    lang,
    cache_dir: cache_dir,
    dump_expiry_days: app_config.dump_expiry_days
  )

  # Check for latest dump; the spinner's success label shows the dump date.
  check_spinner = create_spinner("Checking for latest dump...")
  check_spinner.auto_spin
  check_spinner.success(pastel.green(dump_manager.latest_dump_date))

  # Download index and multistream files
  print_header("Downloading files")
  dump_manager.download_index
  dump_manager.download_multistream

  print_success("Download complete!")

  # Return path to multistream file
  dump_manager.cached_multistream_path
end
882
+ end
883
+
884
# Handle Ctrl+C gracefully
Signal.trap("INT") do
  # Show cursor (in case it was hidden by spinner/progress bar);
  # "\e[?25h" is the ANSI "cursor visible" escape sequence.
  print "\e[?25h"
  puts "\n\nInterrupted by user."
  # Exit with the error code so callers can tell an interrupt from success.
  exit Wp2txt::CliUI::EXIT_ERROR
end

# Create new instance and run with proper exit code.
# run returns an Integer exit code; fall back to EXIT_SUCCESS if it
# returns nil (e.g. a code path that does not return a code explicitly).
exit_code = WpApp.new.run
exit(exit_code || Wp2txt::CliUI::EXIT_SUCCESS)