wp2txt 1.1.3 → 2.1.0
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/section_extractor.rb (new file)
@@ -0,0 +1,354 @@
+# frozen_string_literal: true
+
+require "yaml"
+require_relative "constants"
+
+module Wp2txt
+  # SectionExtractor handles extraction of sections from Wikipedia articles
+  # Supports both metadata extraction (headings only) and content extraction
+  class SectionExtractor
+    # Reserved keyword for the lead section (text before first heading)
+    SUMMARY_KEY = "summary"
+
+    # Default section aliases (canonical name => array of aliases)
+    DEFAULT_ALIASES = {
+      "Plot" => ["Synopsis"],
+      "Reception" => ["Critical reception"]
+    }.freeze
+
+    # Track which actual headings matched which requested sections
+    attr_reader :matched_sections
+
+    # @param target_sections [Array<String>, nil] List of section names to extract (nil = all)
+    # @param options [Hash] Extraction options
+    # @option options [Integer] :min_length Minimum section length (default: 0)
+    # @option options [Boolean] :skip_empty Skip articles with no matching sections (default: false)
+    # @option options [Hash] :aliases Custom section aliases (merged with defaults)
+    # @option options [String] :alias_file Path to YAML file with custom aliases
+    # @option options [Boolean] :use_aliases Enable alias matching (default: true)
+    # @option options [Boolean] :track_matches Track which headings matched (default: false)
+    def initialize(target_sections = nil, options = {})
+      @targets = normalize_targets(target_sections)
+      @min_length = options[:min_length] || 0
+      @skip_empty = options[:skip_empty] || false
+      @use_aliases = options.fetch(:use_aliases, true)
+      @track_matches = options[:track_matches] || false
+      @matched_sections = {}
+      @aliases = build_aliases(options[:aliases], options[:alias_file])
+    end
+
+    # Load aliases from YAML file
+    # @param file_path [String] Path to YAML file
+    # @return [Hash] Aliases hash (canonical => [aliases])
+    def self.load_aliases_from_file(file_path)
+      return {} unless file_path && File.exist?(file_path)
+
+      data = YAML.safe_load(File.read(file_path), permitted_classes: [Symbol])
+      return {} unless data.is_a?(Hash)
+
+      # Normalize: ensure values are arrays
+      data.transform_values { |v| Array(v) }
+    rescue Psych::SyntaxError, Errno::ENOENT
+      {}
+    end
+
+    # Extract section headings from article (for --metadata-only)
+    # @param article [Article] The article to extract from
+    # @return [Array<String>] List of section heading names
+    def extract_headings(article)
+      headings = []
+      article.elements.each do |element|
+        next unless element[0] == :mw_heading
+
+        heading_text = clean_heading_text(element[1])
+        headings << heading_text unless heading_text.empty?
+      end
+      headings
+    end
+
+    # Extract section headings with levels (for detailed analysis)
+    # @param article [Article] The article to extract from
+    # @return [Array<Hash>] List of {name:, level:} hashes
+    def extract_headings_with_levels(article)
+      headings = []
+      article.elements.each do |element|
+        next unless element[0] == :mw_heading
+
+        heading_text = clean_heading_text(element[1])
+        level = element[2] || 2
+        headings << { name: heading_text, level: level } unless heading_text.empty?
+      end
+      headings
+    end
+
+    # Extract summary (lead section) from article
+    # @param article [Article] The article to extract from
+    # @param config [Hash] Formatting configuration
+    # @return [String, nil] The summary text or nil if empty
+    def extract_summary(article, config = {})
+      contents = +""
+      article.elements.each do |element|
+        # Stop at first heading
+        break if element[0] == :mw_heading
+
+        # Skip non-content elements
+        next if %i[mw_blank mw_redirect mw_comment].include?(element[0])
+
+        content = element[1].to_s
+        contents << content
+      end
+
+      result = contents.strip
+      result.empty? ? nil : result
+    end
+
+    # Extract specified sections from article
+    # @param article [Article] The article to extract from
+    # @param config [Hash] Formatting configuration
+    # @return [Hash] Section name => content (nil if not found)
+    def extract_sections(article, config = {})
+      return {} if @targets.nil? || @targets.empty?
+
+      # Reset matched sections for this article
+      @matched_sections = {}
+
+      result = {}
+
+      # Initialize all targets with nil
+      @targets.each { |t| result[t] = nil }
+
+      # Handle summary separately
+      if @targets.include?(SUMMARY_KEY)
+        summary = extract_summary(article, config)
+        result[SUMMARY_KEY] = apply_min_length_filter(summary)
+      end
+
+      # Extract other sections
+      current_section = nil
+      current_level = nil
+      buffer = +""
+
+      article.elements.each do |element|
+        type = element[0]
+        content = element[1]
+        level = element[2]
+
+        if type == :mw_heading
+          # Save previous section if it was a target
+          if current_section
+            canonical = find_canonical_name(current_section)
+            if canonical && canonical != SUMMARY_KEY
+              result[canonical] = apply_min_length_filter(buffer.strip)
+            end
+          end
+
+          # Check if this heading is a target
+          heading_text = clean_heading_text(content)
+          canonical = find_canonical_name(heading_text)
+
+          if canonical && canonical != SUMMARY_KEY
+            current_section = heading_text
+            current_level = level || 2
+            buffer = +""
+          elsif current_level && (level.nil? || level <= current_level)
+            # Same or higher level heading ends current section
+            current_section = nil
+            current_level = nil
+            buffer = +""
+          end
+        elsif current_section
+          # Accumulate content for current section
+          buffer << content.to_s
+        end
+      end
+
+      # Save final section
+      if current_section
+        canonical = find_canonical_name(current_section)
+        if canonical && canonical != SUMMARY_KEY
+          result[canonical] = apply_min_length_filter(buffer.strip)
+        end
+      end
+
+      result
+    end
+
+    # Check if article has any matching sections
+    # @param article [Article] The article to check
+    # @return [Boolean] true if at least one target section exists
+    def has_matching_sections?(article)
+      return true if @targets.nil? || @targets.empty?
+
+      # Check summary
+      if @targets.include?(SUMMARY_KEY)
+        summary = extract_summary(article)
+        return true if summary && !summary.empty?
+      end
+
+      # Check headings (don't record matches during check)
+      headings = extract_headings(article)
+      headings.any? { |h| find_canonical_name(h, record_match: false) }
+    end
+
+    # Check if extraction should be skipped for this article
+    # @param article [Article] The article to check
+    # @return [Boolean] true if article should be skipped
+    def should_skip?(article)
+      return false unless @skip_empty
+      !has_matching_sections?(article)
+    end
+
+    private
+
+    # Normalize target section names
+    def normalize_targets(targets)
+      return nil if targets.nil?
+
+      Array(targets).map { |t| t.to_s.strip }.reject(&:empty?)
+    end
+
+    # Build aliases hash from options, file, and defaults
+    def build_aliases(custom_aliases, alias_file = nil)
+      return {} unless @use_aliases
+
+      aliases = DEFAULT_ALIASES.dup
+
+      # Load from file if specified
+      if alias_file
+        file_aliases = self.class.load_aliases_from_file(alias_file)
+        aliases.merge!(file_aliases)
+      end
+
+      # Merge inline custom aliases
+      aliases.merge!(custom_aliases) if custom_aliases.is_a?(Hash)
+      aliases
+    end
+
+    # Clean heading text by removing = markers and whitespace
+    def clean_heading_text(text)
+      text.to_s.gsub(/^[\s\n]*=+\s*/, "").gsub(/\s*=+[\s\n]*$/, "").strip
+    end
+
+    # Find canonical name for a heading (handles aliases)
+    # @param heading [String] The actual heading text from the article
+    # @param record_match [Boolean] Whether to record the match for tracking
+    # @return [String, nil] The canonical (requested) section name, or nil
+    def find_canonical_name(heading, record_match: true)
+      return nil if heading.nil? || heading.empty?
+      return nil if @targets.nil?
+
+      heading_lower = heading.downcase.strip
+
+      # Direct match
+      @targets.each do |target|
+        if target.downcase == heading_lower
+          # Record direct match (only if heading differs in case)
+          if @track_matches && record_match && target != heading
+            @matched_sections[target] = heading
+          end
+          return target
+        end
+      end
+
+      # Alias match
+      return nil unless @use_aliases
+
+      @aliases.each do |canonical, alias_list|
+        next unless @targets.any? { |t| t.downcase == canonical.downcase }
+
+        if alias_list.any? { |a| a.downcase == heading_lower }
+          # Return the target that matches canonical
+          target = @targets.find { |t| t.downcase == canonical.downcase }
+          # Record alias match
+          if @track_matches && record_match && target
+            @matched_sections[target] = heading
+          end
+          return target
+        end
+      end
+
+      nil
+    end
+
+    # Apply minimum length filter
+    def apply_min_length_filter(text)
+      return nil if text.nil?
+      return nil if @min_length > 0 && text.length < @min_length
+
+      text
+    end
+  end
+
+  # Collects section heading statistics across multiple articles
+  # Used for --section-stats mode
+  class SectionStatsCollector
+    attr_reader :total_articles, :section_counts
+
+    def initialize
+      @total_articles = 0
+      @section_counts = Hash.new(0)
+      @extractor = SectionExtractor.new
+    end
+
+    # Process an article and collect section heading statistics
+    # @param article [Article] The article to process
+    def process(article)
+      @total_articles += 1
+      headings = @extractor.extract_headings(article)
+      headings.each { |h| @section_counts[h] += 1 }
+    end
+
+    # Get top N sections by count
+    # @param n [Integer] Number of sections to return (default: 50)
+    # @return [Array<Hash>] Array of {name:, count:} hashes
+    def top_sections(n = 50)
+      @section_counts
+        .sort_by { |_name, count| -count }
+        .first(n)
+        .map { |name, count| { "name" => name, "count" => count } }
+    end
+
+    # Generate statistics output as a hash
+    # @param top_n [Integer] Number of top sections to include
+    # @return [Hash] Statistics hash
+    def to_hash(top_n: DEFAULT_TOP_N_SECTIONS)
+      {
+        "total_articles" => @total_articles,
+        "section_counts" => @section_counts.sort_by { |_k, v| -v }.to_h,
+        "top_sections" => top_sections(top_n)
+      }
+    end
+
+    # Generate JSON output
+    # @param top_n [Integer] Number of top sections to include
+    # @return [String] JSON string
+    def to_json(top_n: DEFAULT_TOP_N_SECTIONS)
+      require "json"
+      JSON.pretty_generate(to_hash(top_n: top_n))
+    end
+
+    # Merge another collector's results into this one
+    # Used for combining results from parallel processing
+    # @param other [SectionStatsCollector, Hash] Another collector or hash with results
+    def merge(other)
+      if other.is_a?(SectionStatsCollector)
+        @total_articles += other.total_articles
+        other.section_counts.each { |name, count| @section_counts[name] += count }
+      elsif other.is_a?(Hash)
+        @total_articles += other[:total_articles] || other["total_articles"] || 0
+        counts = other[:section_counts] || other["section_counts"] || {}
+        counts.each { |name, count| @section_counts[name] += count }
+      end
+      self
+    end
+
+    # Export current state as a hash (for parallel processing)
+    # @return [Hash] Hash with total_articles and section_counts
+    def to_mergeable_hash
+      {
+        total_articles: @total_articles,
+        section_counts: @section_counts.dup
+      }
+    end
+  end
+end
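
For orientation, here is a minimal usage sketch of the new extractor. It is hypothetical rather than taken from the gem's docs or tests: it assumes wp2txt 2.1.0 is installed and loadable via require "wp2txt/section_extractor", and it substitutes a stub for Wp2txt::Article, which the extractor only reads through #elements as [type, content, level] tuples.

# Hypothetical usage sketch -- not from the gem's documentation or tests.
require "wp2txt/section_extractor" # assumed require path within the gem

# Stand-in for Wp2txt::Article, for illustration only: the extractor
# reads nothing but #elements, an array of [type, content, level] tuples.
StubArticle = Struct.new(:elements)

article = StubArticle.new(
  [
    [:mw_paragraph, "Lead paragraph.\n", nil],
    [:mw_heading, "== Synopsis ==", 2],
    [:mw_paragraph, "Plot text under an aliased heading.\n", nil],
    [:mw_heading, "== Cast ==", 2],
    [:mw_paragraph, "Not requested, so ignored.\n", nil]
  ]
)

extractor = Wp2txt::SectionExtractor.new(
  ["summary", "Plot"],               # "summary" is the reserved lead-section key
  min_length: 5, track_matches: true
)

extractor.extract_sections(article)
# => {"summary"=>"Lead paragraph.", "Plot"=>"Plot text under an aliased heading."}
extractor.matched_sections
# => {"Plot"=>"Synopsis"}

Note how the "Synopsis" heading resolves to the requested "Plot" key through DEFAULT_ALIASES, and how track_matches: true records which actual heading satisfied each request.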
data/lib/wp2txt/stream_processor.rb (new file)
@@ -0,0 +1,271 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require "stringio"
+require_relative "constants"
+require_relative "memory_monitor"
+require_relative "bz2_validator"
+
+module Wp2txt
+  # StreamProcessor handles streaming decompression and XML parsing
+  # without creating intermediate files
+  class StreamProcessor
+    include Wp2txt
+
+    # Buffer size bounds from constants
+    MIN_BUFFER_SIZE = Wp2txt::MIN_BUFFER_SIZE
+    MAX_BUFFER_SIZE = Wp2txt::MAX_BUFFER_SIZE
+    DEFAULT_BUFFER_SIZE = Wp2txt::DEFAULT_BUFFER_SIZE
+
+    attr_reader :buffer_size, :pages_processed, :bytes_read, :redirects_skipped
+
+    def initialize(input_path, bz2_gem: false, adaptive_buffer: true, validate_bz2: true, skip_redirects: true)
+      @input_path = input_path
+      @bz2_gem = bz2_gem
+      @buffer = +""
+      @file_pointer = nil
+      @adaptive_buffer = adaptive_buffer
+      @buffer_size = adaptive_buffer ? calculate_optimal_buffer_size : DEFAULT_BUFFER_SIZE
+      @pages_processed = 0
+      @bytes_read = 0
+      @validate_bz2 = validate_bz2
+      @skip_redirects = skip_redirects
+      @redirects_skipped = 0
+    end
+
+    # Validate bz2 file before processing
+    # @param quick [Boolean] Use quick validation (header only) vs full validation
+    # @return [Bz2Validator::ValidationResult] Validation result
+    # @raise [Wp2txt::FileIOError] If validation fails and raise_on_error is true
+    def validate_input(quick: false, raise_on_error: false)
+      return nil unless @input_path.end_with?(".bz2")
+
+      result = quick ? Bz2Validator.validate_quick(@input_path) : Bz2Validator.validate(@input_path)
+
+      if !result.valid? && raise_on_error
+        raise Wp2txt::FileIOError, "Invalid bz2 file: #{result.message}"
+      end
+
+      result
+    end
+
+    # Calculate optimal buffer size based on available memory
+    def calculate_optimal_buffer_size
+      MemoryMonitor.optimal_buffer_size
+    rescue StandardError
+      DEFAULT_BUFFER_SIZE
+    end
+
+    # Get current memory statistics
+    def memory_stats
+      MemoryMonitor.memory_stats
+    end
+
+    # Iterate over each page in the input
+    # Yields [title, text] for each page
+    def each_page
+      return enum_for(:each_page) unless block_given?
+
+      if File.directory?(@input_path)
+        # Process XML files in directory
+        Dir.glob(File.join(@input_path, "*.xml")).sort.each do |xml_file|
+          process_xml_file(xml_file) { |title, text| yield title, text }
+        end
+      elsif @input_path.end_with?(".bz2")
+        # Process bz2 compressed file with streaming
+        process_bz2_stream { |title, text| yield title, text }
+      elsif @input_path.end_with?(".xml")
+        # Process single XML file
+        process_xml_file(@input_path) { |title, text| yield title, text }
+      else
+        raise ArgumentError, "Unsupported input format: #{@input_path}"
+      end
+    end
+
+    # Get processing statistics (public API for monitoring)
+    def stats
+      {
+        pages_processed: @pages_processed,
+        redirects_skipped: @redirects_skipped,
+        bytes_read: @bytes_read,
+        buffer_size: @buffer_size,
+        current_buffer_length: @buffer.bytesize,
+        memory: memory_stats
+      }
+    end
+
+    private
+
+    # Process a single XML file
+    def process_xml_file(xml_file)
+      @buffer = +""
+      @file_pointer = File.open(xml_file, "r:UTF-8")
+
+      while (page = extract_next_page)
+        result = parse_page_xml(page)
+        yield result if result
+      end
+
+      @file_pointer.close
+    end
+
+    # Process bz2 stream directly without intermediate files
+    def process_bz2_stream
+      # Validate bz2 file before processing (if enabled)
+      if @validate_bz2
+        validation = validate_input(quick: false)
+        unless validation.nil? || validation.valid?
+          raise Wp2txt::FileIOError, "Cannot process corrupted bz2 file: #{validation.message}"
+        end
+      end
+
+      @buffer = +""
+      @file_pointer = open_bz2_stream
+
+      while (page = extract_next_page)
+        result = parse_page_xml(page)
+        yield result if result
+      end
+
+      @file_pointer.close
+    rescue Errno::EPIPE
+      # Ignore broken pipe (can happen if we stop reading early)
+    end
+
+    # Open bz2 stream using external command or gem
+    def open_bz2_stream
+      if @bz2_gem
+        require "bzip2-ruby"
+        Bzip2::Reader.new(File.open(@input_path, "rb"))
+      elsif Gem.win_platform?
+        IO.popen(["bunzip2.exe", "-c", @input_path], "rb")
+      else
+        bzpath = find_bzip2_command
+        raise "No bzip2 decompression command found" unless bzpath
+        IO.popen([bzpath, "-c", "-d", @input_path], "rb")
+      end
+    end
+
+    # Find available bzip2 command
+    def find_bzip2_command
+      %w[lbzip2 pbzip2 bzip2].each do |cmd|
+        path = IO.popen(["which", cmd], err: File::NULL, &:read).strip
+        return path unless path.empty?
+      end
+      nil
+    end
+
+    # Fill buffer from file pointer
+    def fill_buffer
+      chunk = @file_pointer.read(@buffer_size)
+      return false unless chunk
+
+      @bytes_read += chunk.bytesize
+
+      # Handle encoding for bz2 streams
+      chunk = chunk.force_encoding("UTF-8")
+      chunk = chunk.scrub("")
+      @buffer << chunk
+
+      # Adaptive buffer adjustment: if memory is low, reduce buffer size
+      if @adaptive_buffer && MemoryMonitor.memory_low?
+        new_size = [@buffer_size / 2, MIN_BUFFER_SIZE].max
+        @buffer_size = new_size if new_size != @buffer_size
+      end
+
+      true
+    end
+
+    # Extract next <page>...</page> from buffer
+    def extract_next_page
+      loop do
+        # Look for complete page in buffer
+        start_idx = @buffer.index("<page>")
+        if start_idx
+          end_idx = @buffer.index("</page>", start_idx)
+          if end_idx
+            # Extract the complete page
+            page_end = end_idx + "</page>".length
+            page = @buffer[start_idx...page_end]
+            @buffer = @buffer[page_end..]
+            return page
+          end
+        end
+
+        # Need more data
+        break unless fill_buffer
+      end
+
+      # Check for remaining page in buffer (end of file)
+      start_idx = @buffer.index("<page>")
+      return nil unless start_idx
+
+      end_idx = @buffer.index("</page>", start_idx)
+      return nil unless end_idx
+
+      page_end = end_idx + "</page>".length
+      page = @buffer[start_idx...page_end]
+      @buffer = @buffer[page_end..]
+      page
+    end
+
+    # Parse page XML and extract title and text
+    def parse_page_xml(page_xml)
+      # Wrap in minimal mediawiki element for parsing
+      xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/">'
+      xml = xmlns + page_xml + "</mediawiki>"
+
+      doc = Nokogiri::XML(xml, nil, "UTF-8")
+      text_node = doc.xpath("//xmlns:text").first
+      return nil unless text_node
+
+      title_node = text_node.parent.parent.at_css("title")
+      return nil unless title_node
+
+      title = title_node.content
+      # Skip special pages (containing colon in title like "Wikipedia:", "File:", etc.)
+      return nil if title.include?(":")
+
+      text = text_node.content
+
+      # Early redirect detection and skip (before expensive processing)
+      # Redirects start with # or ＃ followed by redirect keyword and [[target]]
+      if @skip_redirects && redirect_page?(text)
+        @redirects_skipped += 1
+        return nil
+      end
+
+      # Remove HTML comments while preserving newline count
+      text = text.gsub(/<!--(.*?)-->/m) do |content|
+        num_newlines = content.count("\n")
+        num_newlines.zero? ? "" : "\n" * num_newlines
+      end
+
+      @pages_processed += 1
+      [title, text]
+    rescue Nokogiri::XML::SyntaxError
+      # Skip malformed XML
+      nil
+    end
+
+    # Fast redirect detection using heuristic check
+    # Checks if text starts with redirect pattern without full regex evaluation
+    # @param text [String] The page text content
+    # @return [Boolean] true if page appears to be a redirect
+    def redirect_page?(text)
+      return false if text.nil? || text.empty?
+
+      # Check first 200 characters for redirect pattern
+      # Redirects are always at the start: #REDIRECT [[Target]] or #転送 [[ターゲット]]
+      first_part = text[0, 200]
+      return false unless first_part
+
+      # Quick check: must start with # or ＃ (after optional whitespace)
+      stripped = first_part.lstrip
+      return false unless stripped.start_with?("#", "＃")
+
+      # Must contain [[ which indicates the redirect target
+      stripped.include?("[[")
+    end
+  end
+end
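
A similar hypothetical sketch for the stream processor. The dump filename and require path are assumptions, not part of the package; decompression shells out to lbzip2, pbzip2, or bzip2 unless bz2_gem: true is passed.

# Hypothetical usage sketch -- not from the gem's documentation or tests.
require "wp2txt/stream_processor" # assumed require path within the gem

processor = Wp2txt::StreamProcessor.new(
  "enwiki-latest-pages-articles.xml.bz2", # example path, not shipped with the gem
  adaptive_buffer: true,  # halve the read buffer when MemoryMonitor reports low memory
  skip_redirects: true    # count and drop redirect pages before XML parsing gets expensive
)

# each_page yields [title, text] for every regular page; redirects and
# titles containing ":" (Wikipedia:, File:, ...) are filtered out upstream.
# Without a block it returns an Enumerator, so early termination works:
processor.each_page.first(3).each do |title, text|
  puts "#{title}: #{text.bytesize} bytes of wikitext"
end

p processor.stats # pages_processed, redirects_skipped, bytes_read, buffer_size, ...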