wp2txt 1.1.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +261 -121
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -159
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/bin/wp2txt CHANGED
@@ -2,189 +2,893 @@
2
2
 
3
3
  # frozen_string_literal: true
4
4
 
5
- DEBUG_MODE = false
5
+ # Enable YJIT for better performance (Ruby 3.3+)
6
+ RubyVM::YJIT.enable if defined?(RubyVM::YJIT) && RubyVM::YJIT.respond_to?(:enable)
6
7
 
7
8
  require_relative "../lib/wp2txt"
8
9
  require_relative "../lib/wp2txt/utils"
9
10
  require_relative "../lib/wp2txt/version"
11
+ require_relative "../lib/wp2txt/cli"
12
+ require_relative "../lib/wp2txt/multistream"
13
+ require_relative "../lib/wp2txt/cli_ui"
14
+ require_relative "../lib/wp2txt/formatter"
15
+ require_relative "../lib/wp2txt/extractor"
16
+ require_relative "../lib/wp2txt/ractor_worker"
10
17
 
11
18
  require "etc"
19
+ require "json"
12
20
  require "optimist"
13
21
  require "parallel"
14
22
  require "pastel"
15
23
  require "tty-spinner"
24
+ require "tty-progressbar"
16
25
 
17
26
  class WpApp
18
27
  include Wp2txt
28
+ include Wp2txt::CliUI
29
+ include Wp2txt::Formatter
30
+ include Wp2txt::Extractor
19
31
 
20
- def run
21
- opts = Optimist.options do
22
- version VERSION
23
- banner <<~BANNER
24
- WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
25
-
26
- Usage: wp2txt [options]
27
- where [options] are:
28
- BANNER
29
-
30
- opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
31
- opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
32
- opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
33
- opt :category, "Show article category information", default: true, short: "-a"
34
- opt :category_only, "Extract only article title and categories", default: false, short: "-g"
35
- opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
36
- opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
37
- opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
38
- opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
39
- opt :title, "Keep page titles in output", default: true, short: "-t"
40
- opt :heading, "Keep section titles in output", default: true, short: "-d"
41
- opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
42
- opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
43
- opt :redirect, "Show redirect destination", default: false, short: "-e"
44
- opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
45
- opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
46
- end
47
-
48
- Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
49
- Optimist.die :input, "must exist" unless File.exist?(opts[:input])
50
- Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
51
-
52
- pastel = Pastel.new
53
-
54
- input_file = opts[:input]
55
- output_dir = opts[:output_dir]
56
- tfile_size = opts[:file_size]
57
- num_processors = Etc.nprocessors
58
- num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
59
- opts[:num_procs]
60
- else
61
- num_processors - 2
62
- end
63
- num_processes = 1 if num_processes < 1
64
-
65
- convert = opts[:convert]
66
- strip_tmarker = opts[:marker] ? false : true
67
- opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
68
-
69
- config = {}
70
- opt_array.each do |opt|
71
- config[opt] = opts[opt]
72
- end
32
+ # Debug mode flag
33
+ DEBUG_MODE = false
34
+
35
+ def initialize
36
+ @pastel = Pastel.new
37
+ end
38
+
39
+ private
40
+
41
+ # Calculate the number of processes to be used for parallel processing
42
+ # Uses MemoryMonitor to determine optimal parallelism based on CPU and memory
43
+ def calculate_num_processes(opts)
44
+ optimal = Wp2txt::MemoryMonitor.optimal_processes
73
45
 
74
- if File.ftype(input_file) == "directory"
75
- input_files = Dir.glob("#{input_file}/*.xml")
46
+ if opts[:num_procs]
47
+ # User specified a value - use it if reasonable
48
+ requested = opts[:num_procs].to_i
49
+ max_allowed = Etc.nprocessors
50
+ [requested, max_allowed, 1].max == requested ? requested : optimal
76
51
  else
77
- puts ""
78
- puts pastel.green.bold("Preprocessing")
79
- puts "Decompressing and splitting the original dump file."
80
- puts pastel.underline("This may take a while. Please be patient!")
81
-
82
- time_start = Time.now.to_i
83
- wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
84
- spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
85
- spinner.auto_spin
86
- wpsplitter.split_file
87
- time_finish = Time.now.to_i
88
-
89
- spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
90
- puts pastel.blue.bold("Complete!")
91
- exit unless convert
92
- input_files = Dir.glob("#{output_dir}/*.xml")
93
- end
94
-
95
- puts ""
96
- puts pastel.red.bold("Converting")
97
- puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
98
- puts "Number of CPU cores being used: " + pastel.bold(num_processes.to_s)
99
-
100
- Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
101
- wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
102
- wpconv.extract_text do |article|
103
- article.title = format_wiki(article.title, config)
104
-
105
- if config[:category_only]
106
- title = "#{article.title}\t"
107
- contents = article.categories.join(", ")
108
- contents << "\n"
109
- elsif config[:category] && !article.categories.empty?
110
- title = "\n[[#{article.title}]]\n\n"
111
- contents = +"\nCATEGORIES: "
112
- contents << article.categories.join(", ")
113
- contents << "\n\n"
114
- else
115
- title = "\n[[#{article.title}]]\n\n"
116
- contents = +""
117
- end
52
+ optimal
53
+ end.tap { |n| n = 1 if n < 1 }
54
+ end
55
+
56
+ # Process articles using turbo mode (split-first architecture from v1.x)
57
+ # This splits the bz2 file into XML chunks first, then processes in parallel
58
+ # Much faster for large dumps due to parallel decompression benefit
59
+ def process_with_turbo(input_path, output_dir, config)
60
+ require "tmpdir"
61
+ require "fileutils"
62
+
63
+ num_processes = config[:num_procs]
64
+ file_size_mb = config[:file_size]
65
+ format = config[:format]
66
+ bz2_gem = config[:bz2_gem]
67
+
68
+ # Determine base name for output files
69
+ base_name = File.basename(input_path, ".*")
70
+ base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
71
+
72
+ # Get input file size for display
73
+ input_size = File.size(input_path) rescue 0
74
+ input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
75
+
76
+ print_mode_banner("Turbo Mode Processing", {
77
+ "Input" => File.basename(input_path),
78
+ "Size" => input_size_str,
79
+ "Format" => format.to_s,
80
+ "CPU cores" => num_processes.to_s,
81
+ "Mode" => "Split-first (parallel decompression)"
82
+ })
83
+
84
+ time_start = Time.now
85
+
86
+ # Create temp directory for split XML files
87
+ temp_dir = Dir.mktmpdir("wp2txt_turbo_")
88
+ puts pastel.cyan("Phase 1: Splitting bz2 file into XML chunks...")
89
+ puts pastel.dim(" Temp directory: #{temp_dir}")
90
+ puts
91
+
92
+ begin
93
+ # Phase 1: Split bz2 into XML files using Splitter
94
+ # Split into 10MB chunks for good parallelism
95
+ $stdout.sync = true
96
+ splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
97
+ # Progress callback - called every 5 seconds
98
+ size_str = format_size(bytes_read)
99
+ elapsed = Time.now - time_start
100
+ rate = bytes_read / elapsed / 1024 / 1024 # MB/s
101
+ puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files created",
102
+ Time.now.strftime("%H:%M:%S"),
103
+ size_str,
104
+ rate,
105
+ file_count))
106
+ end
107
+ splitter.split_file
108
+ xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
109
+
110
+ split_time = Time.now - time_start
111
+ final_size = splitter.size_read || 0
112
+ puts
113
+ puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} decompressed (#{format_duration(split_time)})")
114
+ puts
115
+
116
+ # Phase 2: Process XML files in parallel and write output directly
117
+ puts pastel.cyan("Phase 2: Processing XML files in parallel...")
118
+ puts pastel.dim(" Using #{num_processes} parallel processes")
119
+ puts
120
+
121
+ strip_tmarker = !config[:marker]
122
+
123
+ # Each parallel process writes to its own temp output file
124
+ # This avoids memory accumulation and enables streaming output
125
+ $stdout.sync = true
126
+ processed_count = 0
127
+ last_report_time = Time.now
128
+ temp_output_dir = File.join(temp_dir, "output")
129
+ FileUtils.mkdir_p(temp_output_dir)
130
+
131
+ # Process XML files in parallel - each writes its own output
132
+ article_counts = Parallel.map(
133
+ xml_files.each_with_index.to_a,
134
+ in_processes: num_processes,
135
+ finish: lambda { |_item, _index, _result|
136
+ processed_count += 1
137
+ now = Time.now
138
+ if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
139
+ last_report_time = now
140
+ percent = (processed_count.to_f / xml_files.size * 100).round(1)
141
+ elapsed = now - time_start
142
+ rate = processed_count / elapsed
143
+ remaining = xml_files.size - processed_count
144
+ eta = remaining > 0 && rate > 0 ? remaining / rate : 0
145
+ puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
146
+ processed_count, xml_files.size,
147
+ percent, rate,
148
+ format_duration(eta)))
149
+ end
150
+ }
151
+ ) do |xml_file, idx|
152
+ # Each process writes directly to its own temp file
153
+ temp_output_file = File.join(temp_output_dir, "part_#{idx.to_s.rjust(5, '0')}.txt")
154
+ process_xml_file_and_write(xml_file, temp_output_file, config, strip_tmarker, format)
155
+ end
118
156
 
119
- unless config[:category_only]
120
- article.elements.each do |e|
121
- case e.first
122
- when :mw_heading
123
- break if config[:summary_only]
124
- next unless config[:heading]
125
-
126
- e[-1] = format_wiki(e.last, config)
127
- line = e.last
128
- line << "+HEADING+" if DEBUG_MODE
129
- when :mw_paragraph
130
- e[-1] = format_wiki(e.last, config)
131
- line = e.last + "\n"
132
- line << "+PARAGRAPH+" if DEBUG_MODE
133
- when :mw_table, :mw_htable
134
- next unless config[:table]
135
-
136
- line = e.last
137
- line << "+TABLE+" if DEBUG_MODE
138
- when :mw_pre
139
- next unless config[:pre]
140
-
141
- line = e.last
142
- line << "+PRE+" if DEBUG_MODE
143
- when :mw_quote
144
- line = e.last
145
- line << "+QUOTE+" if DEBUG_MODE
146
- when :mw_unordered, :mw_ordered, :mw_definition
147
- next unless config[:list]
148
-
149
- line = e.last
150
- line << "+LIST+" if DEBUG_MODE
151
- when :mw_ml_template
152
- next unless config[:multiline]
153
-
154
- line = e.last
155
- line << "+MLTEMPLATE+" if DEBUG_MODE
156
- when :mw_redirect
157
- next unless config[:redirect]
158
-
159
- line = e.last
160
- line << "+REDIRECT+" if DEBUG_MODE
161
- line << "\n\n"
162
- when :mw_isolated_template
163
- next unless config[:multiline]
164
-
165
- line = e.last
166
- line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
167
- when :mw_isolated_tag
168
- next
169
- else
170
- next unless DEBUG_MODE
171
-
172
- line = e.last
173
- line << "+OTHER+"
174
- end
175
- contents << line << "\n"
157
+ total_articles = article_counts.sum
158
+
159
+ # Phase 3: Merge temp output files into final output (streaming)
160
+ puts
161
+ puts pastel.cyan("Merging output files...")
162
+
163
+ temp_files = Dir.glob(File.join(temp_output_dir, "part_*.txt")).sort
164
+ writer = OutputWriter.new(
165
+ output_dir: output_dir,
166
+ base_name: base_name,
167
+ format: format,
168
+ file_size_mb: file_size_mb
169
+ )
170
+
171
+ temp_files.each do |temp_file|
172
+ next if File.size(temp_file).zero?
173
+ # Stream copy instead of loading entire file into memory
174
+ writer.write_from_file(temp_file)
175
+ end
176
+
177
+ output_files = writer.close
178
+
179
+ time_elapsed = Time.now - time_start
180
+ puts
181
+ puts pastel.green("#{ICONS[:success]} Processing complete!")
182
+
183
+ print_summary("Turbo Processing Complete", {
184
+ "XML files processed" => xml_files.size.to_s,
185
+ "Articles" => total_articles.to_s,
186
+ "Output files" => output_files.size.to_s,
187
+ "Time" => format_duration(time_elapsed)
188
+ }, status: :success)
189
+
190
+ puts
191
+ puts pastel.dim("Output files:")
192
+ output_files.each { |f| print_list_item(f, status: :success) }
193
+ ensure
194
+ # Cleanup temp directory
195
+ FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
196
+ end
197
+ end
198
+
199
+ # Regex patterns for fast XML extraction (avoid full DOM parsing)
200
+ TITLE_REGEX = %r{<title>([^<]*)</title>}m
201
+ TEXT_REGEX = %r{<text[^>]*>(.*)$}m
202
+ TEXT_END_REGEX = %r{</text>}
203
+
204
+ # Process a single XML file and write directly to output file
205
+ # Returns the number of articles processed
206
+ def process_xml_file_and_write(xml_file, output_file, config, strip_tmarker, format)
207
+ article_count = 0
208
+ runner = Runner.new(xml_file, File.dirname(xml_file), strip_tmarker, false)
209
+
210
+ File.open(output_file, "w") do |out|
211
+ while (page_xml = runner.get_page)
212
+ begin
213
+ # Fast regex extraction instead of full Nokogiri DOM parsing
214
+ title_match = TITLE_REGEX.match(page_xml)
215
+ next unless title_match
216
+
217
+ title = title_match[1]
218
+ next if title.nil? || title.empty? || title.include?(":")
219
+
220
+ # Extract text content
221
+ text_match = TEXT_REGEX.match(page_xml)
222
+ next unless text_match
223
+
224
+ # Find end of text and extract content
225
+ text_start = text_match.begin(1)
226
+ text_end_match = TEXT_END_REGEX.match(page_xml, text_start)
227
+ next unless text_end_match
228
+
229
+ text = page_xml[text_start...text_end_match.begin(0)]
230
+ next if text.nil? || text.empty?
231
+
232
+ # Decode XML entities
233
+ text = text.gsub("&lt;", "<").gsub("&gt;", ">").gsub("&amp;", "&").gsub("&quot;", '"')
234
+
235
+ # Remove HTML comments
236
+ text.gsub!(/<!--(.*?)-->/m) do |content|
237
+ num_of_newlines = content.count("\n")
238
+ num_of_newlines.zero? ? +"" : "\n" * num_of_newlines
176
239
  end
240
+
241
+ next if redirect_page?(text)
242
+
243
+ article = Article.new(text, title, strip_tmarker)
244
+ result = format_article(article, config)
245
+ next unless result
246
+
247
+ # Write directly to file
248
+ if format == :json
249
+ out.puts(result.to_json)
250
+ else
251
+ out.puts(result)
252
+ end
253
+ article_count += 1
254
+ rescue StandardError
255
+ next
177
256
  end
257
+ end
258
+ end
259
+
260
+ article_count
261
+ end
178
262
 
179
- if /\A[\s ]*\z/m =~ contents
180
- ""
263
+ # Fast redirect detection (same as in stream_processor)
264
+ def redirect_page?(text)
265
+ return false if text.nil? || text.empty?
266
+ first_part = text[0, 200]
267
+ return false unless first_part
268
+ stripped = first_part.lstrip
269
+ return false unless stripped.start_with?("#", "#")
270
+ stripped.include?("[[")
271
+ end
272
+
273
+ # Process articles using streaming (new architecture)
274
+ def process_stream(input_path, output_dir, config)
275
+ num_processes = config[:num_procs]
276
+ file_size_mb = config[:file_size]
277
+ format = config[:format]
278
+ bz2_gem = config[:bz2_gem]
279
+
280
+ # Determine base name for output files
281
+ base_name = File.basename(input_path, ".*")
282
+ base_name = base_name.sub(/\.xml$/, "") # Handle .xml.bz2
283
+
284
+ # Create stream processor
285
+ stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
286
+
287
+ # Create output writer
288
+ writer = OutputWriter.new(
289
+ output_dir: output_dir,
290
+ base_name: base_name,
291
+ format: format,
292
+ file_size_mb: file_size_mb
293
+ )
294
+
295
+ # Collect pages for parallel processing
296
+ pages = []
297
+ page_count = 0
298
+
299
+ # Determine parallelism mode
300
+ use_ractor = config[:use_ractor] && Wp2txt::RactorWorker.available?
301
+ parallel_mode = use_ractor ? "Ractor (experimental)" : "Parallel (processes)"
302
+
303
+ # Show warning for experimental Ractor mode
304
+ if config[:use_ractor]
305
+ if use_ractor
306
+ print_warning("Ractor mode is experimental and may be unstable.")
307
+ puts pastel.yellow(" If processing hangs, restart without --ractor option.") unless quiet?
308
+ else
309
+ print_warning("Ractor not available on this Ruby version. Using Parallel gem.")
310
+ end
311
+ end
312
+
313
+ # Get input file size for progress estimation
314
+ input_size = File.size(input_path) rescue 0
315
+ input_size_str = input_size > 0 ? format_size(input_size) : "unknown"
316
+
317
+ # Estimate total articles for ETA calculation
318
+ estimated_total = estimate_total_articles(input_path)
319
+ estimated_total_str = estimated_total ? "~#{(estimated_total / 1_000_000.0).round(1)}M" : "unknown"
320
+
321
+ print_mode_banner("Full Dump Processing", {
322
+ "Input" => File.basename(input_path),
323
+ "Size" => input_size_str,
324
+ "Articles (est.)" => estimated_total_str,
325
+ "Format" => format.to_s,
326
+ "CPU cores" => num_processes.to_s,
327
+ "Parallel" => parallel_mode,
328
+ "Skip redirects" => "yes"
329
+ })
330
+
331
+ # Ensure output is not buffered (important for piped output)
332
+ $stdout.sync = true
333
+
334
+ time_start = Time.now
335
+ last_progress_time = time_start
336
+ last_progress_count = 0
337
+ batch_count = 0
338
+
339
+ # Progress reporting interval (seconds)
340
+ progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
341
+
342
+ # Process in batches for memory efficiency
343
+ batch_size = num_processes * 100
344
+ strip_tmarker = !config[:marker]
345
+
346
+ # Show initial progress message
347
+ puts pastel.cyan("Processing started at #{time_start.strftime('%H:%M:%S')}")
348
+ if estimated_total
349
+ puts pastel.dim("Progress updates every #{progress_interval} seconds (with ETA)...")
350
+ else
351
+ puts pastel.dim("Progress updates every #{progress_interval} seconds...")
352
+ end
353
+ puts
354
+
355
+ stream.each_page do |title, text|
356
+ pages << [title, text]
357
+ page_count += 1
358
+
359
+ # Process batch when full
360
+ next unless pages.size >= batch_size
361
+
362
+ process_batch(pages, writer, config, strip_tmarker, num_processes)
363
+ pages.clear
364
+ batch_count += 1
365
+
366
+ # Show progress every N seconds
367
+ now = Time.now
368
+ elapsed_since_update = now - last_progress_time
369
+ if elapsed_since_update >= progress_interval
370
+ elapsed_total = now - time_start
371
+ articles_per_sec = (page_count - last_progress_count) / elapsed_since_update
372
+ output_count = writer.file_count rescue batch_count
373
+
374
+ # Calculate ETA
375
+ eta_seconds = calculate_eta(page_count, estimated_total, elapsed_total)
376
+ eta_str = format_eta(eta_seconds)
377
+
378
+ # Calculate progress percentage if total is known
379
+ if estimated_total && estimated_total > 0
380
+ percent = (page_count.to_f / estimated_total * 100).round(1)
381
+ progress_line = format(
382
+ " [%s] %s articles (%s%%) | %s/sec | %s files | Elapsed: %s | ETA: %s",
383
+ now.strftime("%H:%M:%S"),
384
+ page_count.to_s.rjust(8),
385
+ percent.to_s.rjust(5),
386
+ articles_per_sec.round(1).to_s.rjust(6),
387
+ output_count.to_s.rjust(4),
388
+ format_duration(elapsed_total),
389
+ eta_str
390
+ )
181
391
  else
182
- config[:title] ? title << contents : contents
392
+ progress_line = format(
393
+ " [%s] %s articles | %s/sec | %s files | Elapsed: %s",
394
+ now.strftime("%H:%M:%S"),
395
+ page_count.to_s.rjust(8),
396
+ articles_per_sec.round(1).to_s.rjust(6),
397
+ output_count.to_s.rjust(4),
398
+ format_duration(elapsed_total)
399
+ )
183
400
  end
401
+ puts pastel.dim(progress_line)
402
+
403
+ last_progress_time = now
404
+ last_progress_count = page_count
405
+ end
406
+ end
407
+
408
+ # Process remaining pages
409
+ process_batch(pages, writer, config, strip_tmarker, num_processes) unless pages.empty?
410
+
411
+ # Close output
412
+ output_files = writer.close
413
+
414
+ # Get redirect skip count
415
+ redirects_skipped = stream.redirects_skipped
416
+
417
+ time_elapsed = Time.now - time_start
418
+ puts
419
+ puts pastel.green("#{ICONS[:success]} Processing complete!")
420
+
421
+ # Summary
422
+ summary_data = {
423
+ "Articles" => page_count.to_s,
424
+ "Output files" => output_files.size.to_s,
425
+ "Time" => format_duration(time_elapsed)
426
+ }
427
+ summary_data["Redirects skipped"] = redirects_skipped.to_s if redirects_skipped > 0
428
+
429
+ print_summary("Processing Complete", summary_data, status: :success)
430
+
431
+ puts
432
+ puts pastel.dim("Output files:")
433
+ output_files.each { |f| print_list_item(f, status: :success) }
434
+ end
435
+
436
+ # Process a batch of pages in parallel
437
+ # Uses Ractor for true parallelism when enabled, otherwise falls back to Parallel gem
438
+ def process_batch(pages, writer, config, strip_tmarker, num_processes)
439
+ results = if config[:use_ractor] && Wp2txt::RactorWorker.available?
440
+ # Use Ractor-based parallel processing (true parallelism)
441
+ Wp2txt::RactorWorker.process_articles(
442
+ pages,
443
+ config: config,
444
+ strip_tmarker: strip_tmarker,
445
+ num_workers: num_processes
446
+ )
447
+ else
448
+ # Fall back to Parallel gem (process-based parallelism)
449
+ Parallel.map(pages, in_processes: num_processes) do |title, text|
450
+ article = Article.new(text, title, strip_tmarker)
451
+ format_article(article, config)
452
+ end
453
+ end
454
+
455
+ results.each do |result|
456
+ writer.write(result) if result
457
+ end
458
+ end
459
+
460
+ # Process section statistics mode
461
+ # Collects section heading statistics and outputs JSON to stdout
462
+ def process_section_stats(input_path, config)
463
+ require_relative "../lib/wp2txt/section_extractor"
464
+
465
+ bz2_gem = config[:bz2_gem]
466
+ no_turbo = config[:no_turbo]
467
+ num_processes = config[:num_procs]
468
+
469
+ # Use turbo mode for bz2 files unless disabled
470
+ if input_path.end_with?(".bz2") && !no_turbo
471
+ process_section_stats_turbo(input_path, bz2_gem, num_processes)
472
+ else
473
+ process_section_stats_stream(input_path, bz2_gem)
474
+ end
475
+ end
476
+
477
+ def process_section_stats_stream(input_path, bz2_gem)
478
+ print_mode_banner("Section Statistics", {
479
+ "Input" => File.basename(input_path),
480
+ "Mode" => "Statistics collection (streaming)"
481
+ })
482
+
483
+ puts pastel.cyan("Collecting section statistics...")
484
+ puts pastel.dim("This may take a while for large dumps.")
485
+ puts
486
+
487
+ # Create stream processor and stats collector
488
+ stream = StreamProcessor.new(input_path, bz2_gem: bz2_gem)
489
+ collector = Wp2txt::SectionStatsCollector.new
490
+
491
+ time_start = Time.now
492
+ last_progress_time = time_start
493
+ progress_interval = Wp2txt::DEFAULT_PROGRESS_INTERVAL
494
+
495
+ # Process pages without full text processing (just extract headings)
496
+ stream.each_page do |title, text|
497
+ # Create minimal article just for heading extraction
498
+ article = Article.new(text, title, false)
499
+ collector.process(article)
500
+
501
+ # Show progress periodically
502
+ now = Time.now
503
+ if now - last_progress_time >= progress_interval
504
+ elapsed = now - time_start
505
+ rate = collector.total_articles / elapsed
506
+ puts pastel.dim(format(" [%s] %d articles processed (%.1f/sec)",
507
+ now.strftime("%H:%M:%S"),
508
+ collector.total_articles,
509
+ rate))
510
+ last_progress_time = now
511
+ end
512
+ end
513
+
514
+ output_section_stats_result(collector, time_start)
515
+ end
516
+
517
+ def process_section_stats_turbo(input_path, bz2_gem, num_processes)
518
+ require "tmpdir"
519
+ require "fileutils"
520
+
521
+ print_mode_banner("Section Statistics (Turbo)", {
522
+ "Input" => File.basename(input_path),
523
+ "Mode" => "Statistics collection (parallel)",
524
+ "CPU cores" => num_processes.to_s
525
+ })
526
+
527
+ time_start = Time.now
528
+
529
+ # Create temp directory for split XML files
530
+ temp_dir = Dir.mktmpdir("wp2txt_stats_")
531
+ puts pastel.cyan("Phase 1: Splitting bz2 file...")
532
+ puts pastel.dim(" Temp directory: #{temp_dir}")
533
+ puts
534
+
535
+ begin
536
+ # Phase 1: Split bz2 into XML files
537
+ $stdout.sync = true
538
+ splitter = Splitter.new(input_path, temp_dir, 10, bz2_gem) do |bytes_read, file_count|
539
+ size_str = format_size(bytes_read)
540
+ elapsed = Time.now - time_start
541
+ rate = bytes_read / elapsed / 1024 / 1024
542
+ puts pastel.dim(format(" [%s] Decompressed: %s | %.1f MB/s | %d XML files",
543
+ Time.now.strftime("%H:%M:%S"),
544
+ size_str, rate, file_count))
545
+ end
546
+ splitter.split_file
547
+ xml_files = Dir.glob(File.join(temp_dir, "*.xml")).sort
548
+
549
+ split_time = Time.now - time_start
550
+ final_size = splitter.size_read || 0
551
+ puts
552
+ puts pastel.green("#{ICONS[:success]} Split complete: #{xml_files.size} XML files, #{format_size(final_size)} (#{format_duration(split_time)})")
553
+ puts
554
+
555
+ # Phase 2: Process XML files in parallel
556
+ puts pastel.cyan("Phase 2: Collecting statistics in parallel...")
557
+ puts
558
+
559
+ processed_count = 0
560
+ last_report_time = Time.now
561
+
562
+ # Process XML files in parallel and collect stats
563
+ partial_results = Parallel.map(
564
+ xml_files,
565
+ in_processes: num_processes,
566
+ finish: lambda { |_item, _index, _result|
567
+ processed_count += 1
568
+ now = Time.now
569
+ if now - last_report_time >= Wp2txt::DEFAULT_PROGRESS_INTERVAL || processed_count == xml_files.size
570
+ last_report_time = now
571
+ percent = (processed_count.to_f / xml_files.size * 100).round(1)
572
+ elapsed = now - time_start
573
+ rate = processed_count / elapsed
574
+ remaining = xml_files.size - processed_count
575
+ eta = remaining > 0 && rate > 0 ? remaining / rate : 0
576
+ puts pastel.dim(format(" [%d/%d] %.1f%% | %.1f files/sec | ETA: %s",
577
+ processed_count, xml_files.size,
578
+ percent, rate, format_duration(eta)))
579
+ end
580
+ }
581
+ ) do |xml_file|
582
+ process_xml_file_for_stats(xml_file)
583
+ end
584
+
585
+ # Merge all partial results
586
+ puts
587
+ puts pastel.cyan("Merging results...")
588
+
589
+ collector = Wp2txt::SectionStatsCollector.new
590
+ partial_results.each { |result| collector.merge(result) }
591
+
592
+ output_section_stats_result(collector, time_start)
593
+ ensure
594
+ FileUtils.rm_rf(temp_dir) if File.exist?(temp_dir)
595
+ end
596
+ end
597
+
598
# XML character entities decoded before the wikitext is analysed.
# Used as a gsub replacement table so every entity is decoded exactly once.
SECTION_STATS_XML_ENTITIES = {
  "&lt;" => "<",
  "&gt;" => ">",
  "&amp;" => "&",
  "&quot;" => '"'
}.freeze

# Process a single XML file for section stats (used by turbo mode).
#
# Pages are read one at a time through a Runner; title and text are pulled
# out with the TITLE_REGEX / TEXT_REGEX / TEXT_END_REGEX patterns rather than
# a full XML parse. Each usable page is wrapped in an Article and fed to a
# fresh SectionStatsCollector.
#
# Skipped pages: no title match, empty title, title containing ":"
# (presumably non-article namespaces - confirm against the dump format),
# missing or empty text, redirects, and any page that raises while being
# processed.
#
# @param xml_file [String] path of the XML file to scan
# @return [Object] whatever SectionStatsCollector#to_mergeable_hash returns
def process_xml_file_for_stats(xml_file)
  collector = Wp2txt::SectionStatsCollector.new
  runner = Runner.new(xml_file, File.dirname(xml_file), false, false)

  while (page_xml = runner.get_page)
    begin
      # Fast regex extraction
      title_match = TITLE_REGEX.match(page_xml)
      next unless title_match

      title = title_match[1]
      next if title.nil? || title.empty? || title.include?(":")

      text_match = TEXT_REGEX.match(page_xml)
      next unless text_match

      text_start = text_match.begin(1)
      text_end_match = TEXT_END_REGEX.match(page_xml, text_start)
      next unless text_end_match

      text = page_xml[text_start...text_end_match.begin(0)]
      next if text.nil? || text.empty?

      # Decode in a single pass. Chained gsubs that handle "&amp;" before
      # "&quot;" turn an escaped "&amp;quot;" into a bare quote instead of
      # the literal text "&quot;".
      text = text.gsub(/&(?:lt|gt|amp|quot);/, SECTION_STATS_XML_ENTITIES)
      next if redirect_page?(text)

      article = Article.new(text, title, false)
      collector.process(article)
    rescue StandardError
      # Best effort: one malformed page must not abort the whole file.
      next
    end
  end

  collector.to_mergeable_hash
end
634
+
635
# Emit the result of a section-statistics run.
#
# All human-readable output (completion banner and summary) is written to
# stderr; stdout receives only the JSON document, so this method's own output
# stays machine-readable when stdout is redirected to a file or a pipe.
#
# @param collector [Wp2txt::SectionStatsCollector] merged statistics; must
#   respond to total_articles, section_counts and to_json(top_n:)
# @param time_start [Time] moment the run started, used for the elapsed time
# @return [Integer] EXIT_SUCCESS
def output_section_stats_result(collector, time_start)
  time_elapsed = Time.now - time_start

  # Status banner and summary go to stderr so JSON goes to stdout cleanly
  $stderr.puts
  $stderr.puts pastel.green("#{ICONS[:success]} Statistics collection complete!")
  $stderr.puts
  $stderr.puts pastel.dim("Total articles: #{collector.total_articles}")
  $stderr.puts pastel.dim("Unique sections: #{collector.section_counts.size}")
  $stderr.puts pastel.dim("Time: #{format_duration(time_elapsed)}")
  $stderr.puts

  # Output JSON to stdout
  puts collector.to_json(top_n: Wp2txt::DEFAULT_TOP_N_SECTIONS)

  EXIT_SUCCESS
end
653
+
654
# Parse --markers option value
#   "all"            -> true (all markers enabled)
#   "none"           -> DEPRECATED (now treated as "all" with warning)
#   "math,code,chem" -> [:math, :code, :chem]
#
# The value is converted to a String once, so nil, booleans and Symbols are
# accepted as well as Strings. Names in a comma-separated list are trimmed
# and lower-cased; names not listed in Wp2txt::MARKER_TYPES are dropped.
#
# @param value [String, Symbol, Boolean, nil] raw option value
# @return [true, Array<Symbol>] true when all markers are enabled, otherwise
#   the recognised marker types (possibly empty)
def parse_markers_option(value)
  normalized = value.to_s.downcase.strip

  case normalized
  when "all", "true", ""
    true
  when "none", "false"
    # Deprecation warning - none/false no longer removes content completely
    puts pastel.yellow("Warning: --markers=none is deprecated and will be removed in a future version.")
    puts pastel.yellow(" Complete removal of special content can make surrounding text nonsensical.")
    puts pastel.yellow(" Using --markers=all instead. Markers will be shown for all special content.")
    puts
    true # Treat as "all" instead of removing content
  else
    # Parse comma-separated list from the normalised string, not the raw
    # value, so non-String input does not raise on #split.
    normalized.split(",").map { |name| name.strip.to_sym }.select do |marker|
      Wp2txt::MARKER_TYPES.include?(marker)
    end
  end
end
676
+
677
+ public
678
+
679
# Main execution method.
#
# Parses ARGV, applies the UI settings and then dispatches to one mode:
# config initialisation, cache status, cache clearing, category extraction,
# article extraction, section statistics, or conversion of a dump file.
#
# @return [Integer] Exit code (0=success, 1=error, 2=partial)
def run
  opts = Wp2txt::CLI.parse_options(ARGV)

  # Apply color/quiet preferences, then rebuild pastel so they take effect
  configure_ui(no_color: opts[:no_color], quiet: opts[:quiet])
  reset_pastel!
  @pastel = pastel

  # Maintenance commands: each one finishes the run on its own
  if opts[:config_init]
    init_config
    return EXIT_SUCCESS
  elsif opts[:cache_status]
    show_cache_status(opts[:cache_dir])
    return EXIT_SUCCESS
  elsif opts[:cache_clear]
    clear_cache(opts[:cache_dir], opts[:lang])
    return EXIT_SUCCESS
  end

  # Targeted extraction modes are only entered when a language is given
  return extract_category_articles(opts) if opts[:from_category] && opts[:lang]
  return extract_specific_articles(opts) if opts[:articles] && opts[:lang]

  # Input is either an auto-downloaded dump or the path given by the user
  input_path = opts[:lang] ? download_dump(opts[:lang], opts[:cache_dir]) : opts[:input]

  # Normalise the format option to a lower-case symbol
  output_format = opts[:format].to_s.downcase.to_sym

  # Show deprecation warnings
  if opts[:convert_given] || opts[:del_interfile_given]
    print_warning("--convert and --del-interfile options are deprecated and will be ignored.")
    puts pastel.yellow(" Intermediate files are no longer created in v2.0+") unless quiet?
  end

  # Build configuration hash from options
  config = {
    format: output_format,
    num_procs: calculate_num_processes(opts),
    file_size: opts[:file_size],
    bz2_gem: opts[:bz2_gem],
    use_ractor: opts[:ractor],
    no_turbo: opts[:no_turbo]
  }

  # Options copied verbatim under the same key
  passthrough_keys = %i[title list heading table pre ref redirect multiline category category_only
                        summary_only metadata_only marker extract_citations expand_templates
                        section_output min_section_length skip_empty
                        alias_file no_section_aliases section_stats show_matched_sections]
  passthrough_keys.each_with_object(config) { |key, cfg| cfg[key] = opts[key] }

  # Sections arrive as a comma-separated string; store them as an array
  requested_sections = opts[:sections]
  config[:sections] = requested_sections.split(",").map(&:strip).reject(&:empty?) if requested_sections

  config[:markers] = parse_markers_option(opts[:markers])

  # Section-stats mode is standalone and outputs to stdout
  return process_section_stats(input_path, config) if opts[:section_stats]

  # Turbo mode is the default for bz2 files (faster parallel decompression).
  # Use --no-turbo to disable (saves disk space but much slower).
  use_turbo = input_path.end_with?(".bz2") && !opts[:no_turbo]
  if use_turbo
    if config[:use_ractor]
      puts pastel.yellow("Note: --ractor is not supported with turbo mode. Using parallel gem instead.")
      puts pastel.yellow(" Use --no-turbo to enable Ractor-based processing.")
      puts
    end
    process_with_turbo(input_path, opts[:output_dir], config)
  else
    process_stream(input_path, opts[:output_dir], config)
  end

  EXIT_SUCCESS
end
780
+
781
# Show cache status.
#
# Prints a banner for the cache directory, then one entry per cached
# language: either its error message, or the index and multistream sizes
# together with the dump date.
#
# @param cache_dir [String] cache directory to report on
def show_cache_status(cache_dir)
  print_mode_banner("Cache Status", { "Directory" => cache_dir })

  status = Wp2txt::DumpManager.all_cache_status(cache_dir)

  if status.empty?
    print_info_message("No cached dumps found.")
    return
  end

  # A size of zero is reported as "not downloaded"
  size_label = ->(bytes) { bytes > 0 ? format_size(bytes) : pastel.dim("not downloaded") }

  status.each do |lang, info|
    if info[:error]
      print_list_item("#{lang}: Error - #{info[:error]}", status: :error)
      next
    end

    index_label = size_label.call(info[:index_size])
    multistream_label = size_label.call(info[:multistream_size])
    freshness = info[:fresh] ? :success : :warning

    puts pastel.bold(lang.to_s.upcase)
    print_list_item("Index: #{index_label}", status: freshness)
    print_list_item("Multistream: #{multistream_label}", status: freshness)
    print_info("Date", info[:dump_date] || "unknown", indent: 1)
    puts
  end
end
808
+
809
# Clear cache.
#
# With a language, only that language's cache is cleared through a
# DumpManager instance; without one, the whole cache directory is cleared.
# A spinner is shown while the work runs and a success line afterwards.
#
# @param cache_dir [String] cache directory
# @param lang [String, nil] language to clear, or nil for everything
def clear_cache(cache_dir, lang = nil)
  spinner_text, done_text =
    if lang
      ["Clearing cache for #{lang}...", "Cache cleared for #{lang}."]
    else
      ["Clearing all cache...", "All cache cleared."]
    end

  spinner = create_spinner(spinner_text)
  spinner.auto_spin

  if lang
    Wp2txt::DumpManager.new(lang, cache_dir: cache_dir).clear_cache!
  else
    Wp2txt::DumpManager.clear_all_cache!(cache_dir)
  end

  spinner.success(pastel.green("Done!"))
  print_success(done_text)
end
826
+
827
# Initialize configuration file.
#
# Writes the default configuration to Wp2txt::Config.default_path. When a
# file already exists there, the user is asked to confirm the overwrite and
# nothing is written if they decline. Afterwards the available settings are
# listed.
def init_config
  target = Wp2txt::Config.default_path

  already_present = File.exist?(target)
  print_warning("Configuration file already exists: #{target}") if already_present

  if already_present && !confirm?("Overwrite?")
    puts "Cancelled."
    return
  end

  Wp2txt::Config.create_default(target, force: true)
  print_success("Configuration file created: #{target}")
  puts
  puts pastel.dim("Available settings:")

  setting_lines = [
    "cache.dump_expiry_days - Days before dump cache expires (default: 30)",
    "cache.category_expiry_days - Days before category cache expires (default: 7)",
    "cache.directory - Cache directory location",
    "defaults.format - Default output format (text/json)",
    "defaults.depth - Default subcategory recursion depth"
  ]
  # The value of the final print_list_item call is the method's result
  setting_lines.map { |line| print_list_item(line) }.last
end
850
+
851
# Download dump for a language.
#
# Shows a banner, looks up the latest dump date, then downloads the index
# and the multistream file through a DumpManager configured with the
# application's dump expiry setting.
#
# @param lang [String] language code of the dump
# @param cache_dir [String] directory the files are cached in
# @return [Object] the value of DumpManager#cached_multistream_path
def download_dump(lang, cache_dir)
  app_config = Wp2txt::CLI.config

  banner_details = { "Language" => lang, "Cache" => cache_dir }
  print_mode_banner("Auto-Download", banner_details)

  manager = Wp2txt::DumpManager.new(lang, cache_dir: cache_dir, dump_expiry_days: app_config.dump_expiry_days)

  # Check for latest dump; the date is shown as the spinner's success text
  spinner = create_spinner("Checking for latest dump...")
  spinner.auto_spin
  latest_date = manager.latest_dump_date
  spinner.success(pastel.green(latest_date))

  # Download index and multistream
  print_header("Downloading files")
  manager.download_index
  manager.download_multistream

  print_success("Download complete!")

  # Hand the multistream path back to the caller
  manager.cached_multistream_path
end
882
+ end
883
+
884
# Handle Ctrl+C gracefully: re-show the cursor (a spinner or progress bar may
# have hidden it), tell the user, and exit with the error status.
trap("INT") do
  $stdout.print "\e[?25h"
  $stdout.puts "\n\nInterrupted by user."
  exit Wp2txt::CliUI::EXIT_ERROR
end

# Run the application and propagate its exit code; a nil result counts as success
status = WpApp.new.run
exit(status || Wp2txt::CliUI::EXIT_SUCCESS)