wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,354 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require_relative "constants"
5
+
6
+ module Wp2txt
7
+ # SectionExtractor handles extraction of sections from Wikipedia articles
8
+ # Supports both metadata extraction (headings only) and content extraction
9
+ class SectionExtractor
10
+ # Reserved keyword for the lead section (text before first heading)
11
+ SUMMARY_KEY = "summary"
12
+
13
+ # Default section aliases (canonical name => array of aliases)
14
+ DEFAULT_ALIASES = {
15
+ "Plot" => ["Synopsis"],
16
+ "Reception" => ["Critical reception"]
17
+ }.freeze
18
+
19
+ # Track which actual headings matched which requested sections
20
+ attr_reader :matched_sections
21
+
22
+ # @param target_sections [Array<String>, nil] List of section names to extract (nil = all)
23
+ # @param options [Hash] Extraction options
24
+ # @option options [Integer] :min_length Minimum section length (default: 0)
25
+ # @option options [Boolean] :skip_empty Skip articles with no matching sections (default: false)
26
+ # @option options [Hash] :aliases Custom section aliases (merged with defaults)
27
+ # @option options [String] :alias_file Path to YAML file with custom aliases
28
+ # @option options [Boolean] :use_aliases Enable alias matching (default: true)
29
+ # @option options [Boolean] :track_matches Track which headings matched (default: false)
30
+ def initialize(target_sections = nil, options = {})
31
+ @targets = normalize_targets(target_sections)
32
+ @min_length = options[:min_length] || 0
33
+ @skip_empty = options[:skip_empty] || false
34
+ @use_aliases = options.fetch(:use_aliases, true)
35
+ @track_matches = options[:track_matches] || false
36
+ @matched_sections = {}
37
+ @aliases = build_aliases(options[:aliases], options[:alias_file])
38
+ end
39
+
40
+ # Load aliases from YAML file
41
+ # @param file_path [String] Path to YAML file
42
+ # @return [Hash] Aliases hash (canonical => [aliases])
43
+ def self.load_aliases_from_file(file_path)
44
+ return {} unless file_path && File.exist?(file_path)
45
+
46
+ data = YAML.safe_load(File.read(file_path), permitted_classes: [Symbol])
47
+ return {} unless data.is_a?(Hash)
48
+
49
+ # Normalize: ensure values are arrays
50
+ data.transform_values { |v| Array(v) }
51
+ rescue Psych::SyntaxError, Errno::ENOENT
52
+ {}
53
+ end
54
+
55
+ # Extract section headings from article (for --metadata-only)
56
+ # @param article [Article] The article to extract from
57
+ # @return [Array<String>] List of section heading names
58
+ def extract_headings(article)
59
+ headings = []
60
+ article.elements.each do |element|
61
+ next unless element[0] == :mw_heading
62
+
63
+ heading_text = clean_heading_text(element[1])
64
+ headings << heading_text unless heading_text.empty?
65
+ end
66
+ headings
67
+ end
68
+
69
+ # Extract section headings with levels (for detailed analysis)
70
+ # @param article [Article] The article to extract from
71
+ # @return [Array<Hash>] List of {name:, level:} hashes
72
+ def extract_headings_with_levels(article)
73
+ headings = []
74
+ article.elements.each do |element|
75
+ next unless element[0] == :mw_heading
76
+
77
+ heading_text = clean_heading_text(element[1])
78
+ level = element[2] || 2
79
+ headings << { name: heading_text, level: level } unless heading_text.empty?
80
+ end
81
+ headings
82
+ end
83
+
84
+ # Extract summary (lead section) from article
85
+ # @param article [Article] The article to extract from
86
+ # @param config [Hash] Formatting configuration
87
+ # @return [String, nil] The summary text or nil if empty
88
+ def extract_summary(article, config = {})
89
+ contents = +""
90
+ article.elements.each do |element|
91
+ # Stop at first heading
92
+ break if element[0] == :mw_heading
93
+
94
+ # Skip non-content elements
95
+ next if %i[mw_blank mw_redirect mw_comment].include?(element[0])
96
+
97
+ content = element[1].to_s
98
+ contents << content
99
+ end
100
+
101
+ result = contents.strip
102
+ result.empty? ? nil : result
103
+ end
104
+
105
+ # Extract specified sections from article
106
+ # @param article [Article] The article to extract from
107
+ # @param config [Hash] Formatting configuration
108
+ # @return [Hash] Section name => content (nil if not found)
109
+ def extract_sections(article, config = {})
110
+ return {} if @targets.nil? || @targets.empty?
111
+
112
+ # Reset matched sections for this article
113
+ @matched_sections = {}
114
+
115
+ result = {}
116
+
117
+ # Initialize all targets with nil
118
+ @targets.each { |t| result[t] = nil }
119
+
120
+ # Handle summary separately
121
+ if @targets.include?(SUMMARY_KEY)
122
+ summary = extract_summary(article, config)
123
+ result[SUMMARY_KEY] = apply_min_length_filter(summary)
124
+ end
125
+
126
+ # Extract other sections
127
+ current_section = nil
128
+ current_level = nil
129
+ buffer = +""
130
+
131
+ article.elements.each do |element|
132
+ type = element[0]
133
+ content = element[1]
134
+ level = element[2]
135
+
136
+ if type == :mw_heading
137
+ # Save previous section if it was a target
138
+ if current_section
139
+ canonical = find_canonical_name(current_section)
140
+ if canonical && canonical != SUMMARY_KEY
141
+ result[canonical] = apply_min_length_filter(buffer.strip)
142
+ end
143
+ end
144
+
145
+ # Check if this heading is a target
146
+ heading_text = clean_heading_text(content)
147
+ canonical = find_canonical_name(heading_text)
148
+
149
+ if canonical && canonical != SUMMARY_KEY
150
+ current_section = heading_text
151
+ current_level = level || 2
152
+ buffer = +""
153
+ elsif current_level && (level.nil? || level <= current_level)
154
+ # Same or higher level heading ends current section
155
+ current_section = nil
156
+ current_level = nil
157
+ buffer = +""
158
+ end
159
+ elsif current_section
160
+ # Accumulate content for current section
161
+ buffer << content.to_s
162
+ end
163
+ end
164
+
165
+ # Save final section
166
+ if current_section
167
+ canonical = find_canonical_name(current_section)
168
+ if canonical && canonical != SUMMARY_KEY
169
+ result[canonical] = apply_min_length_filter(buffer.strip)
170
+ end
171
+ end
172
+
173
+ result
174
+ end
175
+
176
+ # Check if article has any matching sections
177
+ # @param article [Article] The article to check
178
+ # @return [Boolean] true if at least one target section exists
179
+ def has_matching_sections?(article)
180
+ return true if @targets.nil? || @targets.empty?
181
+
182
+ # Check summary
183
+ if @targets.include?(SUMMARY_KEY)
184
+ summary = extract_summary(article)
185
+ return true if summary && !summary.empty?
186
+ end
187
+
188
+ # Check headings (don't record matches during check)
189
+ headings = extract_headings(article)
190
+ headings.any? { |h| find_canonical_name(h, record_match: false) }
191
+ end
192
+
193
+ # Check if extraction should be skipped for this article
194
+ # @param article [Article] The article to check
195
+ # @return [Boolean] true if article should be skipped
196
+ def should_skip?(article)
197
+ return false unless @skip_empty
198
+ !has_matching_sections?(article)
199
+ end
200
+
201
+ private
202
+
203
+ # Normalize target section names
204
+ def normalize_targets(targets)
205
+ return nil if targets.nil?
206
+
207
+ Array(targets).map { |t| t.to_s.strip }.reject(&:empty?)
208
+ end
209
+
210
+ # Build aliases hash from options, file, and defaults
211
+ def build_aliases(custom_aliases, alias_file = nil)
212
+ return {} unless @use_aliases
213
+
214
+ aliases = DEFAULT_ALIASES.dup
215
+
216
+ # Load from file if specified
217
+ if alias_file
218
+ file_aliases = self.class.load_aliases_from_file(alias_file)
219
+ aliases.merge!(file_aliases)
220
+ end
221
+
222
+ # Merge inline custom aliases
223
+ aliases.merge!(custom_aliases) if custom_aliases.is_a?(Hash)
224
+ aliases
225
+ end
226
+
227
+ # Clean heading text by removing = markers and whitespace
228
+ def clean_heading_text(text)
229
+ text.to_s.gsub(/^[\s\n]*=+\s*/, "").gsub(/\s*=+[\s\n]*$/, "").strip
230
+ end
231
+
232
+ # Find canonical name for a heading (handles aliases)
233
+ # @param heading [String] The actual heading text from the article
234
+ # @param record_match [Boolean] Whether to record the match for tracking
235
+ # @return [String, nil] The canonical (requested) section name, or nil
236
+ def find_canonical_name(heading, record_match: true)
237
+ return nil if heading.nil? || heading.empty?
238
+ return nil if @targets.nil?
239
+
240
+ heading_lower = heading.downcase.strip
241
+
242
+ # Direct match
243
+ @targets.each do |target|
244
+ if target.downcase == heading_lower
245
+ # Record direct match (only if heading differs in case)
246
+ if @track_matches && record_match && target != heading
247
+ @matched_sections[target] = heading
248
+ end
249
+ return target
250
+ end
251
+ end
252
+
253
+ # Alias match
254
+ return nil unless @use_aliases
255
+
256
+ @aliases.each do |canonical, alias_list|
257
+ next unless @targets.any? { |t| t.downcase == canonical.downcase }
258
+
259
+ if alias_list.any? { |a| a.downcase == heading_lower }
260
+ # Return the target that matches canonical
261
+ target = @targets.find { |t| t.downcase == canonical.downcase }
262
+ # Record alias match
263
+ if @track_matches && record_match && target
264
+ @matched_sections[target] = heading
265
+ end
266
+ return target
267
+ end
268
+ end
269
+
270
+ nil
271
+ end
272
+
273
+ # Apply minimum length filter
274
+ def apply_min_length_filter(text)
275
+ return nil if text.nil?
276
+ return nil if @min_length > 0 && text.length < @min_length
277
+
278
+ text
279
+ end
280
+ end
281
+
282
+ # Collects section heading statistics across multiple articles
283
+ # Used for --section-stats mode
284
+ class SectionStatsCollector
285
+ attr_reader :total_articles, :section_counts
286
+
287
+ def initialize
288
+ @total_articles = 0
289
+ @section_counts = Hash.new(0)
290
+ @extractor = SectionExtractor.new
291
+ end
292
+
293
+ # Process an article and collect section heading statistics
294
+ # @param article [Article] The article to process
295
+ def process(article)
296
+ @total_articles += 1
297
+ headings = @extractor.extract_headings(article)
298
+ headings.each { |h| @section_counts[h] += 1 }
299
+ end
300
+
301
+ # Get top N sections by count
302
+ # @param n [Integer] Number of sections to return (default: 50)
303
+ # @return [Array<Hash>] Array of {name:, count:} hashes
304
+ def top_sections(n = 50)
305
+ @section_counts
306
+ .sort_by { |_name, count| -count }
307
+ .first(n)
308
+ .map { |name, count| { "name" => name, "count" => count } }
309
+ end
310
+
311
+ # Generate statistics output as a hash
312
+ # @param top_n [Integer] Number of top sections to include
313
+ # @return [Hash] Statistics hash
314
+ def to_hash(top_n: DEFAULT_TOP_N_SECTIONS)
315
+ {
316
+ "total_articles" => @total_articles,
317
+ "section_counts" => @section_counts.sort_by { |_k, v| -v }.to_h,
318
+ "top_sections" => top_sections(top_n)
319
+ }
320
+ end
321
+
322
+ # Generate JSON output
323
+ # @param top_n [Integer] Number of top sections to include
324
+ # @return [String] JSON string
325
+ def to_json(top_n: DEFAULT_TOP_N_SECTIONS)
326
+ require "json"
327
+ JSON.pretty_generate(to_hash(top_n: top_n))
328
+ end
329
+
330
+ # Merge another collector's results into this one
331
+ # Used for combining results from parallel processing
332
+ # @param other [SectionStatsCollector, Hash] Another collector or hash with results
333
+ def merge(other)
334
+ if other.is_a?(SectionStatsCollector)
335
+ @total_articles += other.total_articles
336
+ other.section_counts.each { |name, count| @section_counts[name] += count }
337
+ elsif other.is_a?(Hash)
338
+ @total_articles += other[:total_articles] || other["total_articles"] || 0
339
+ counts = other[:section_counts] || other["section_counts"] || {}
340
+ counts.each { |name, count| @section_counts[name] += count }
341
+ end
342
+ self
343
+ end
344
+
345
+ # Export current state as a hash (for parallel processing)
346
+ # @return [Hash] Hash with total_articles and section_counts
347
+ def to_mergeable_hash
348
+ {
349
+ total_articles: @total_articles,
350
+ section_counts: @section_counts.dup
351
+ }
352
+ end
353
+ end
354
+ end
@@ -0,0 +1,271 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require "stringio"
5
+ require_relative "constants"
6
+ require_relative "memory_monitor"
7
+ require_relative "bz2_validator"
8
+
9
+ module Wp2txt
10
+ # StreamProcessor handles streaming decompression and XML parsing
11
+ # without creating intermediate files
12
+ class StreamProcessor
13
+ include Wp2txt
14
+
15
+ # Buffer size bounds from constants
16
+ MIN_BUFFER_SIZE = Wp2txt::MIN_BUFFER_SIZE
17
+ MAX_BUFFER_SIZE = Wp2txt::MAX_BUFFER_SIZE
18
+ DEFAULT_BUFFER_SIZE = Wp2txt::DEFAULT_BUFFER_SIZE
19
+
20
+ attr_reader :buffer_size, :pages_processed, :bytes_read, :redirects_skipped
21
+
22
+ def initialize(input_path, bz2_gem: false, adaptive_buffer: true, validate_bz2: true, skip_redirects: true)
23
+ @input_path = input_path
24
+ @bz2_gem = bz2_gem
25
+ @buffer = +""
26
+ @file_pointer = nil
27
+ @adaptive_buffer = adaptive_buffer
28
+ @buffer_size = adaptive_buffer ? calculate_optimal_buffer_size : DEFAULT_BUFFER_SIZE
29
+ @pages_processed = 0
30
+ @bytes_read = 0
31
+ @validate_bz2 = validate_bz2
32
+ @skip_redirects = skip_redirects
33
+ @redirects_skipped = 0
34
+ end
35
+
36
+ # Validate bz2 file before processing
37
+ # @param quick [Boolean] Use quick validation (header only) vs full validation
38
+ # @return [Bz2Validator::ValidationResult] Validation result
39
+ # @raise [Wp2txt::FileIOError] If validation fails and raise_on_error is true
40
+ def validate_input(quick: false, raise_on_error: false)
41
+ return nil unless @input_path.end_with?(".bz2")
42
+
43
+ result = quick ? Bz2Validator.validate_quick(@input_path) : Bz2Validator.validate(@input_path)
44
+
45
+ if !result.valid? && raise_on_error
46
+ raise Wp2txt::FileIOError, "Invalid bz2 file: #{result.message}"
47
+ end
48
+
49
+ result
50
+ end
51
+
52
+ # Calculate optimal buffer size based on available memory
53
+ def calculate_optimal_buffer_size
54
+ MemoryMonitor.optimal_buffer_size
55
+ rescue StandardError
56
+ DEFAULT_BUFFER_SIZE
57
+ end
58
+
59
+ # Get current memory statistics
60
+ def memory_stats
61
+ MemoryMonitor.memory_stats
62
+ end
63
+
64
+ # Iterate over each page in the input
65
+ # Yields [title, text] for each page
66
+ def each_page
67
+ return enum_for(:each_page) unless block_given?
68
+
69
+ if File.directory?(@input_path)
70
+ # Process XML files in directory
71
+ Dir.glob(File.join(@input_path, "*.xml")).sort.each do |xml_file|
72
+ process_xml_file(xml_file) { |title, text| yield title, text }
73
+ end
74
+ elsif @input_path.end_with?(".bz2")
75
+ # Process bz2 compressed file with streaming
76
+ process_bz2_stream { |title, text| yield title, text }
77
+ elsif @input_path.end_with?(".xml")
78
+ # Process single XML file
79
+ process_xml_file(@input_path) { |title, text| yield title, text }
80
+ else
81
+ raise ArgumentError, "Unsupported input format: #{@input_path}"
82
+ end
83
+ end
84
+
85
+ # Get processing statistics (public API for monitoring)
86
+ def stats
87
+ {
88
+ pages_processed: @pages_processed,
89
+ redirects_skipped: @redirects_skipped,
90
+ bytes_read: @bytes_read,
91
+ buffer_size: @buffer_size,
92
+ current_buffer_length: @buffer.bytesize,
93
+ memory: memory_stats
94
+ }
95
+ end
96
+
97
+ private
98
+
99
+ # Process a single XML file
100
+ def process_xml_file(xml_file)
101
+ @buffer = +""
102
+ @file_pointer = File.open(xml_file, "r:UTF-8")
103
+
104
+ while (page = extract_next_page)
105
+ result = parse_page_xml(page)
106
+ yield result if result
107
+ end
108
+
109
+ @file_pointer.close
110
+ end
111
+
112
+ # Process bz2 stream directly without intermediate files
113
+ def process_bz2_stream
114
+ # Validate bz2 file before processing (if enabled)
115
+ if @validate_bz2
116
+ validation = validate_input(quick: false)
117
+ unless validation.nil? || validation.valid?
118
+ raise Wp2txt::FileIOError, "Cannot process corrupted bz2 file: #{validation.message}"
119
+ end
120
+ end
121
+
122
+ @buffer = +""
123
+ @file_pointer = open_bz2_stream
124
+
125
+ while (page = extract_next_page)
126
+ result = parse_page_xml(page)
127
+ yield result if result
128
+ end
129
+
130
+ @file_pointer.close
131
+ rescue Errno::EPIPE
132
+ # Ignore broken pipe (can happen if we stop reading early)
133
+ end
134
+
135
+ # Open bz2 stream using external command or gem
136
+ def open_bz2_stream
137
+ if @bz2_gem
138
+ require "bzip2-ruby"
139
+ Bzip2::Reader.new(File.open(@input_path, "rb"))
140
+ elsif Gem.win_platform?
141
+ IO.popen(["bunzip2.exe", "-c", @input_path], "rb")
142
+ else
143
+ bzpath = find_bzip2_command
144
+ raise "No bzip2 decompression command found" unless bzpath
145
+ IO.popen([bzpath, "-c", "-d", @input_path], "rb")
146
+ end
147
+ end
148
+
149
+ # Find available bzip2 command
150
+ def find_bzip2_command
151
+ %w[lbzip2 pbzip2 bzip2].each do |cmd|
152
+ path = IO.popen(["which", cmd], err: File::NULL, &:read).strip
153
+ return path unless path.empty?
154
+ end
155
+ nil
156
+ end
157
+
158
+ # Fill buffer from file pointer
159
+ def fill_buffer
160
+ chunk = @file_pointer.read(@buffer_size)
161
+ return false unless chunk
162
+
163
+ @bytes_read += chunk.bytesize
164
+
165
+ # Handle encoding for bz2 streams
166
+ chunk = chunk.force_encoding("UTF-8")
167
+ chunk = chunk.scrub("")
168
+ @buffer << chunk
169
+
170
+ # Adaptive buffer adjustment: if memory is low, reduce buffer size
171
+ if @adaptive_buffer && MemoryMonitor.memory_low?
172
+ new_size = [@buffer_size / 2, MIN_BUFFER_SIZE].max
173
+ @buffer_size = new_size if new_size != @buffer_size
174
+ end
175
+
176
+ true
177
+ end
178
+
179
+ # Extract next <page>...</page> from buffer
180
+ def extract_next_page
181
+ loop do
182
+ # Look for complete page in buffer
183
+ start_idx = @buffer.index("<page>")
184
+ if start_idx
185
+ end_idx = @buffer.index("</page>", start_idx)
186
+ if end_idx
187
+ # Extract the complete page
188
+ page_end = end_idx + "</page>".length
189
+ page = @buffer[start_idx...page_end]
190
+ @buffer = @buffer[page_end..]
191
+ return page
192
+ end
193
+ end
194
+
195
+ # Need more data
196
+ break unless fill_buffer
197
+ end
198
+
199
+ # Check for remaining page in buffer (end of file)
200
+ start_idx = @buffer.index("<page>")
201
+ return nil unless start_idx
202
+
203
+ end_idx = @buffer.index("</page>", start_idx)
204
+ return nil unless end_idx
205
+
206
+ page_end = end_idx + "</page>".length
207
+ page = @buffer[start_idx...page_end]
208
+ @buffer = @buffer[page_end..]
209
+ page
210
+ end
211
+
212
+ # Parse page XML and extract title and text
213
+ def parse_page_xml(page_xml)
214
+ # Wrap in minimal mediawiki element for parsing
215
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/">'
216
+ xml = xmlns + page_xml + "</mediawiki>"
217
+
218
+ doc = Nokogiri::XML(xml, nil, "UTF-8")
219
+ text_node = doc.xpath("//xmlns:text").first
220
+ return nil unless text_node
221
+
222
+ title_node = text_node.parent.parent.at_css("title")
223
+ return nil unless title_node
224
+
225
+ title = title_node.content
226
+ # Skip special pages (containing colon in title like "Wikipedia:", "File:", etc.)
227
+ return nil if title.include?(":")
228
+
229
+ text = text_node.content
230
+
231
+ # Early redirect detection and skip (before expensive processing)
232
+ # Redirects start with # or # followed by redirect keyword and [[target]]
233
+ if @skip_redirects && redirect_page?(text)
234
+ @redirects_skipped += 1
235
+ return nil
236
+ end
237
+
238
+ # Remove HTML comments while preserving newline count
239
+ text = text.gsub(/<!--(.*?)-->/m) do |content|
240
+ num_newlines = content.count("\n")
241
+ num_newlines.zero? ? "" : "\n" * num_newlines
242
+ end
243
+
244
+ @pages_processed += 1
245
+ [title, text]
246
+ rescue Nokogiri::XML::SyntaxError
247
+ # Skip malformed XML
248
+ nil
249
+ end
250
+
251
+ # Fast redirect detection using heuristic check
252
+ # Checks if text starts with redirect pattern without full regex evaluation
253
+ # @param text [String] The page text content
254
+ # @return [Boolean] true if page appears to be a redirect
255
+ def redirect_page?(text)
256
+ return false if text.nil? || text.empty?
257
+
258
+ # Check first 200 characters for redirect pattern
259
+ # Redirects are always at the start: #REDIRECT [[Target]] or #転送 [[ターゲット]]
260
+ first_part = text[0, 200]
261
+ return false unless first_part
262
+
263
+ # Quick check: must start with # or # (after optional whitespace)
264
+ stripped = first_part.lstrip
265
+ return false unless stripped.start_with?("#", "#")
266
+
267
+ # Must contain [[ which indicates the redirect target
268
+ stripped.include?("[[")
269
+ end
270
+ end
271
+ end