wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "etc"
|
|
4
|
+
|
|
5
|
+
module Wp2txt
  # Ractor-based parallel processing for Wikipedia article conversion
  #
  # Ractor allows true parallelism by bypassing Ruby's GVL (Global VM Lock),
  # enabling significant speedups for CPU-intensive text processing.
  #
  # REQUIREMENTS: Ruby 4.0+ (Ractor API stabilized in Ruby 4.0)
  # For Ruby 3.x, the Parallel gem is used instead (process-based parallelism).
  #
  # Performance: Typically 1.5-2x speedup with 4 workers on multi-core systems.
  #
  # Usage:
  #   pages = [["Title1", "wiki text..."], ["Title2", "wiki text..."]]
  #   results = RactorWorker.process_articles(pages, config: config)
  #
  module RactorWorker
    # Minimum Ruby version required for stable Ractor support
    MINIMUM_RUBY_VERSION = "4.0"

    # Registry of available operations
    OPERATIONS = %i[process_article double fib].freeze

    module_function

    # Check if Ractor is available and usable.
    # The probe runs once; its outcome is memoized on the module.
    # @return [Boolean] true if Ractor can be used
    def available?
      return @available if defined?(@available)

      @available = check_ractor_available
    end

    # Internal method to check Ractor availability.
    # Spawns a trivial Ractor to confirm the join/value API actually works.
    # @return [Boolean] true if Ractor can be used
    def check_ractor_available
      return false unless ruby_version_sufficient?
      return false unless defined?(Ractor)

      probe = Ractor.new { 1 + 1 }
      probe.join
      probe.value == 2
    rescue StandardError
      false
    end

    # Check if Ruby version meets minimum requirement
    # @return [Boolean] true if RUBY_VERSION >= MINIMUM_RUBY_VERSION
    def ruby_version_sufficient?
      Gem::Version.new(RUBY_VERSION) >= Gem::Version.new(MINIMUM_RUBY_VERSION)
    end

    # Process articles in parallel using Ractor (main entry point)
    # @param pages [Array<Array>] Array of [title, text] pairs
    # @param config [Hash] Configuration options for formatting
    # @param strip_tmarker [Boolean] Whether to strip list markers
    # @param num_workers [Integer] Number of parallel Ractors (optional)
    # @return [Array<String>] Formatted article results
    def process_articles(pages, config:, strip_tmarker: false, num_workers: nil)
      items = pages.map { |title, text| [title, text, strip_tmarker] }

      parallel_process(
        items,
        operation: :process_article,
        config: config,
        num_workers: num_workers
      )
    end

    # Process items in parallel using map-join-value pattern (Ruby 4.0+)
    # @param items [Array] Items to process
    # @param operation [Symbol] Operation to perform (:process_article, :double, :fib)
    # @param config [Hash] Configuration to pass to each operation
    # @param num_workers [Integer] Max concurrent Ractors (default: optimal_workers)
    # @return [Array] Results from processing (in original order)
    # @raise [RuntimeError] if the operation is not one of OPERATIONS and
    #   there is at least one item to process
    def parallel_process(items, operation:, config: {}, num_workers: nil)
      # Fall back to sequential if Ractor not available or single item
      unless available? && items.size > 1
        return items.map { |item| process_single(item, operation, config) }
      end

      # Fail fast with the same error the sequential path raises. Inside a
      # Ractor the error would be swallowed and every result would be nil.
      raise "Unknown operation: #{operation}" unless OPERATIONS.include?(operation)

      batch_size = [num_workers || optimal_workers, 1].max

      # Freeze config for sharing across Ractors
      frozen_config = deep_freeze(config.dup)

      # Process in batches to limit concurrent Ractors
      items.each_slice(batch_size).flat_map do |batch|
        process_batch(batch, operation, frozen_config)
      end
    rescue Ractor::Error => e
      warn "Ractor error (#{e.message}), falling back to sequential processing"
      items.map { |item| process_single(item, operation, config) }
    end

    # Process a batch using map-join-value pattern (Ruby 4.0 API)
    # @param items [Array] Items to process in this batch
    # @param operation [Symbol] Operation to perform
    # @param frozen_config [Hash] Frozen configuration hash
    # @return [Array] Results in original order; nil for items whose
    #   processing raised a StandardError
    def process_batch(items, operation, frozen_config)
      # Create one Ractor per item. The block cannot capture outer locals,
      # so everything it needs is passed in as arguments, and the shared
      # dispatch in process_single is reached through the module constant.
      ractors = items.map do |item|
        Ractor.new(item, frozen_config, operation) do |work_item, cfg, op|
          Wp2txt::RactorWorker.process_single(work_item, op, cfg)
        rescue StandardError
          nil # Return nil on error
        end
      end

      # `value` blocks until each Ractor finishes; iterating in creation
      # order keeps results aligned with the input order.
      ractors.map(&:value)
    end

    # Process a single item (used for sequential processing and as the body
    # of each worker Ractor)
    # @param item [Object] Item to process
    # @param operation [Symbol] Operation to perform
    # @param config [Hash] Configuration options
    # @return [Object] Processing result
    # @raise [RuntimeError] if the operation is unknown
    def process_single(item, operation, config)
      case operation
      when :process_article
        # Loaded lazily so the arithmetic operations work without the
        # article-processing files.
        require_relative "utils"
        require_relative "regex"
        require_relative "article"
        require_relative "formatter"

        title, text, strip_tmarker = item
        formatter = Object.new
        formatter.extend(Wp2txt)
        formatter.extend(Wp2txt::Formatter)
        article = Wp2txt::Article.new(text, title, strip_tmarker)
        formatter.format_article(article, config)
      when :double
        item * 2
      when :fib
        fib = ->(n) { n <= 1 ? n : fib.call(n - 1) + fib.call(n - 2) }
        fib.call(item)
      else
        raise "Unknown operation: #{operation}"
      end
    end

    # Calculate optimal number of workers based on CPU cores
    # @return [Integer] Recommended concurrency level
    def optimal_workers
      cores = Etc.nprocessors
      case cores
      when 1..4 then cores
      when 5..8 then cores - 1
      else (cores * 0.8).to_i
      end
    end

    # Deep freeze an object for Ractor sharing.
    # Hashes and Arrays are rebuilt as new frozen containers and unfrozen
    # Strings are duplicated, so the caller's containers and strings are not
    # frozen. Any other object is frozen in place.
    # @param obj [Object] Object to freeze
    # @return [Object] The frozen object
    def deep_freeze(obj)
      case obj
      when Hash
        obj.transform_keys { |k| deep_freeze(k) }
           .transform_values { |v| deep_freeze(v) }
           .freeze
      when Array
        obj.map { |v| deep_freeze(v) }.freeze
      when String
        obj.frozen? ? obj : obj.dup.freeze
      when Symbol, Integer, Float, TrueClass, FalseClass, NilClass
        obj
      else
        begin
          obj.freeze
        rescue StandardError
          # Best effort: hand back objects whose freeze fails unchanged
          obj
        end
      end
    end
  end
end
|
data/lib/wp2txt/regex.rb
CHANGED
|
@@ -1,8 +1,145 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "htmlentities"
|
|
4
|
+
require "json"
|
|
5
|
+
require_relative "global_data_cache"
|
|
6
|
+
|
|
7
|
+
# Make HTMLEntities gem Ractor-shareable
|
|
8
|
+
# This must be done before any Ractor tries to use HTMLEntities
|
|
9
|
+
if defined?(Ractor) && Ractor.respond_to?(:make_shareable)
  begin
    # Walk every constant of the HTMLEntities gem and make the ones that are
    # not yet shareable usable from non-main Ractors.
    HTMLEntities.constants.each do |const_name|
      begin
        value = HTMLEntities.const_get(const_name)
        Ractor.make_shareable(value) unless Ractor.shareable?(value)
      rescue Ractor::IsolationError, FrozenError, TypeError
        # This constant can't be made shareable; move on to the next one
      end
    end
  rescue StandardError
    # Shareable setup is best effort; any other failure is ignored
  end
end
|
|
4
23
|
|
|
5
24
|
module Wp2txt
|
|
25
|
+
# Data file paths
|
|
26
|
+
MEDIAWIKI_DATA_PATH = File.join(__dir__, "data", "mediawiki_aliases.json")
|
|
27
|
+
HTML_ENTITIES_PATH = File.join(__dir__, "data", "html_entities.json")
|
|
28
|
+
WIKIPEDIA_ENTITIES_PATH = File.join(__dir__, "data", "wikipedia_entities.json")
|
|
29
|
+
TEMPLATE_DATA_PATH = File.join(__dir__, "data", "template_aliases.json")
|
|
30
|
+
|
|
31
|
+
# Load MediaWiki aliases from data file (generated by scripts/fetch_mediawiki_data.rb)
|
|
32
|
+
# Uses SQLite cache for faster subsequent loads
|
|
33
|
+
def self.load_mediawiki_data
  # Memoized: the data is resolved once per process.
  return @mediawiki_data if @mediawiki_data

  # Try SQLite cache first
  cached = GlobalDataCache.load(:mediawiki)
  return @mediawiki_data = cached if cached

  @mediawiki_data =
    if File.exist?(MEDIAWIKI_DATA_PATH)
      # Read explicitly as UTF-8: the aliases are multilingual, and the
      # default external encoding follows the process locale (US-ASCII
      # under LANG=C), which must not influence how the file is decoded.
      parsed = JSON.parse(File.read(MEDIAWIKI_DATA_PATH, encoding: "UTF-8"))
      # Save to cache for future use
      GlobalDataCache.save(:mediawiki, parsed)
      parsed
    else
      # Fallback to minimal defaults if data file is missing
      {
        "magic_words" => { "redirect" => ["REDIRECT"] },
        "namespaces" => { "category" => ["Category"], "file" => ["File", "Image"] }
      }
    end
end
|
|
57
|
+
|
|
58
|
+
# Load template aliases from data file (generated by scripts/fetch_template_data.rb)
|
|
59
|
+
# Uses SQLite cache for faster subsequent loads
|
|
60
|
+
def self.load_template_data
  # Memoized: the data is resolved once per process.
  return @template_data if @template_data

  # Try SQLite cache first
  cached = GlobalDataCache.load(:template)
  return @template_data = cached if cached

  @template_data =
    if File.exist?(TEMPLATE_DATA_PATH)
      # Read explicitly as UTF-8 so decoding does not depend on the process
      # locale (the default external encoding is US-ASCII under LANG=C).
      parsed = JSON.parse(File.read(TEMPLATE_DATA_PATH, encoding: "UTF-8"))
      # Save to cache for future use
      GlobalDataCache.save(:template, parsed)
      parsed
    else
      # Fallback to minimal defaults if data file is missing
      {
        "remove_templates" => ["reflist", "notelist", "main", "see also", "portal"],
        "authority_control" => ["authority control", "normdaten"],
        "cleanup_remnants" => %w[clear clearleft clearright]
      }
    end
end
|
|
85
|
+
|
|
86
|
+
# Build regex pattern from template list (escapes special chars, joins with |)
|
|
87
|
+
def self.build_template_regex(templates, options = {})
  # templates: list of template names, matched literally and case-insensitively.
  # options[:anchor_start]        - anchor at string start, allowing leading whitespace
  # options[:require_pipe_or_end] - require "|" or end of line after the name
  # Returns a Regexp, or nil when there is no usable name.
  return nil if templates.nil? || templates.empty?

  # Drop nil/blank names: an empty alternative would match any text at all,
  # and Regexp.escape rejects nil.
  names = templates.map(&:to_s).reject { |name| name.strip.empty? }
  return nil if names.empty?

  alternation = "(?:" + names.map { |name| Regexp.escape(name) }.join("|") + ")"
  pattern = options[:anchor_start] ? '\A\s*' + alternation : alternation
  pattern += '\s*(?:\||$)' if options[:require_pipe_or_end]
  Regexp.new(pattern, Regexp::IGNORECASE)
end
|
|
101
|
+
|
|
102
|
+
# Load HTML entities from WHATWG data file (generated by scripts/fetch_html_entities.rb)
|
|
103
|
+
# Uses SQLite cache for faster subsequent loads
|
|
104
|
+
def self.load_html_entities
  # Memoized: the combined table is resolved once per process.
  return @html_entities if @html_entities

  # Try SQLite cache first (combined entities)
  cached = GlobalDataCache.load(:html_entities_combined)
  return @html_entities = cached if cached

  # WHATWG entities first, then Wikipedia-specific ones, so the latter
  # override/supplement the former. Missing files are skipped.
  combined = {}
  [HTML_ENTITIES_PATH, WIKIPEDIA_ENTITIES_PATH].each do |path|
    next unless File.exist?(path)

    # Read explicitly as UTF-8 so decoding does not depend on the process
    # locale (the default external encoding is US-ASCII under LANG=C).
    data = JSON.parse(File.read(path, encoding: "UTF-8"))
    combined.merge!(data["entities"] || {})
  end

  # Save combined entities to cache
  GlobalDataCache.save(:html_entities_combined, combined) unless combined.empty?

  # Memoize only after both files loaded, so a parse error is not followed
  # by later calls silently returning a partial table.
  @html_entities = combined
end
|
|
133
|
+
|
|
134
|
+
# Build regex for extra entities not handled by HTMLEntities gem
|
|
135
|
+
def self.build_extra_entities_regex
  # Returns a Regexp with one capture group that matches any known entity
  # key, or nil when no entities are loaded.
  entities = load_html_entities
  return nil if entities.empty?

  # Alternation takes the first alternative that matches, so longer keys go
  # first: "&amp;" must be tried before a legacy "&amp" key, otherwise the
  # ";" is left behind.
  keys = entities.keys.sort_by { |key| -key.length }
  Regexp.new("(" + keys.map { |key| Regexp.escape(key) }.join("|") + ")")
end
|
|
6
143
|
###################################################
|
|
7
144
|
# variables to save resource for generating regexps
|
|
8
145
|
# those with a trailing number 1 represent opening tag/markup
|
|
@@ -14,9 +151,22 @@ module Wp2txt
|
|
|
14
151
|
# Basic HTML entities paired with the characters they decode to. The left
# list must hold the entity names; pairing the characters with themselves
# would make the table an identity mapping.
ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
# Entity name => decoded character
HTML_HASH = Hash[*ENTITIES.flatten]
# Matches any entity name in HTML_HASH; capture group 1 is the matched name
HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
|
|
154
|
+
|
|
155
|
+
# Extra HTML entities loaded from JSON files (WHATWG + Wikipedia-specific)
|
|
156
|
+
# These supplement the HTMLEntities gem which only covers ~250 basic entities
|
|
157
|
+
# Data sources:
|
|
158
|
+
# - lib/wp2txt/data/html_entities.json (2000+ WHATWG entities)
|
|
159
|
+
# - lib/wp2txt/data/wikipedia_entities.json (Wikipedia-specific)
|
|
160
|
+
EXTRA_ENTITIES = load_html_entities.freeze
|
|
161
|
+
EXTRA_ENTITIES_REGEX = build_extra_entities_regex
|
|
162
|
+
|
|
163
|
+
# Legacy alias for backward compatibility
|
|
164
|
+
MATH_ENTITIES = EXTRA_ENTITIES
|
|
165
|
+
MATH_ENTITIES_REGEX = EXTRA_ENTITIES_REGEX
|
|
17
166
|
ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
|
|
18
167
|
ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
|
|
19
|
-
|
|
168
|
+
# Match lines starting with [[ that don't end with ]] (handles inner links)
|
|
169
|
+
ML_LINK_ONSET_REGEX = Regexp.new('^\[\[(?!.*\]\]\s*$).*$')
|
|
20
170
|
ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
|
|
21
171
|
ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
|
22
172
|
ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
|
@@ -30,10 +180,12 @@ module Wp2txt
|
|
|
30
180
|
IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
|
|
31
181
|
IN_MATH_REGEX1 = Regexp.new('<math.*?>')
|
|
32
182
|
IN_MATH_REGEX2 = Regexp.new('<\/math>')
|
|
33
|
-
IN_HEADING_REGEX = Regexp.new('
|
|
183
|
+
IN_HEADING_REGEX = Regexp.new('^=+.*?=+\s*$')
|
|
34
184
|
IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
|
|
35
185
|
IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
|
|
36
186
|
IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
|
|
187
|
+
# HTML comments (can span multiple lines)
|
|
188
|
+
HTML_COMMENT_REGEX = Regexp.new('<!--.*?-->', Regexp::MULTILINE)
|
|
37
189
|
IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
|
|
38
190
|
IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
|
|
39
191
|
IN_UNORDERED_REGEX = Regexp.new('^\*')
|
|
@@ -41,17 +193,25 @@ module Wp2txt
|
|
|
41
193
|
IN_PRE_REGEX = Regexp.new('^ ')
|
|
42
194
|
IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
|
|
43
195
|
BLANK_LINE_REGEX = Regexp.new('^\s*$')
|
|
44
|
-
|
|
196
|
+
|
|
197
|
+
# Multilingual redirect keyword support
|
|
198
|
+
# Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
|
|
199
|
+
REDIRECT_KEYWORDS = load_mediawiki_data.dig("magic_words", "redirect")&.join("|") || "REDIRECT"
|
|
200
|
+
REDIRECT_REGEX = Regexp.new('[##](?:' + REDIRECT_KEYWORDS + ')\s*:?\s*\[\[([^\]]+)\]\]', Regexp::IGNORECASE)
|
|
45
201
|
REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
|
|
46
|
-
|
|
202
|
+
|
|
203
|
+
# Legacy generic pattern for double-underscore directives
|
|
204
|
+
# Note: Data-driven REMOVE_DIRECTIVES_REGEX is defined later (after DOUBLE_UNDERSCORE_MAGIC_REGEX)
|
|
205
|
+
REMOVE_DIRECTIVES_REGEX_GENERIC = Regexp.new("\_\_[^\_]*\_\_")
|
|
206
|
+
|
|
47
207
|
REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
|
48
208
|
CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
|
49
209
|
MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
|
|
50
|
-
REMOVE_HR_REGEX = Regexp.new('^\s
|
|
51
|
-
MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
|
|
52
|
-
MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
|
|
53
|
-
MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
|
|
54
|
-
MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
|
|
210
|
+
REMOVE_HR_REGEX = Regexp.new('^\s*\-{4,}\s*$')
|
|
211
|
+
MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>', Regexp::IGNORECASE)
|
|
212
|
+
MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>', Regexp::IGNORECASE)
|
|
213
|
+
MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>', Regexp::IGNORECASE)
|
|
214
|
+
MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>', Regexp::IGNORECASE)
|
|
55
215
|
FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
|
56
216
|
HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
|
|
57
217
|
HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
|
|
@@ -60,15 +220,44 @@ module Wp2txt
|
|
|
60
220
|
DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
|
|
61
221
|
ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
|
|
62
222
|
|
|
63
|
-
|
|
64
|
-
|
|
223
|
+
# Multilingual category namespace support
|
|
224
|
+
# Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
|
|
225
|
+
CATEGORY_NAMESPACES = load_mediawiki_data.dig("namespaces", "category")&.join("|") || "Category"
|
|
226
|
+
|
|
227
|
+
# Multilingual file namespace support (for image/file links)
|
|
228
|
+
FILE_NAMESPACES = load_mediawiki_data.dig("namespaces", "file")&.join("|") || "File|Image"
|
|
229
|
+
FILE_NAMESPACES_REGEX = Regexp.new('\A\s*(?:' + FILE_NAMESPACES + ')\s*:', Regexp::IGNORECASE)
|
|
230
|
+
|
|
231
|
+
# Multilingual category namespace support (for filtering out category links from body text)
|
|
232
|
+
CATEGORY_NAMESPACE_REGEX = Regexp.new('\A\s*(?:' + CATEGORY_NAMESPACES + ')\s*:', Regexp::IGNORECASE)
|
|
233
|
+
|
|
234
|
+
# Image parameters (multilingual) - used for filtering out non-caption parts of File/Image links
|
|
235
|
+
# Combines: img_thumbnail, img_framed, img_frameless, img_border, img_right, img_left, img_center, img_none,
|
|
236
|
+
# img_upright, img_baseline, img_sub, img_super, img_top, img_text_top, img_middle, img_bottom, img_text_bottom
|
|
237
|
+
IMAGE_PARAM_KEYS = %w[img_thumbnail img_framed img_frameless img_border img_right img_left img_center img_none
|
|
238
|
+
img_upright img_baseline img_sub img_super img_top img_text_top img_middle img_bottom img_text_bottom].freeze
|
|
239
|
+
IMAGE_PARAMS = IMAGE_PARAM_KEYS.flat_map { |k| load_mediawiki_data.dig("magic_words", k) || [] }.uniq.join("|")
|
|
240
|
+
IMAGE_PARAMS_REGEX = IMAGE_PARAMS.empty? ? nil : Regexp.new('\A(' + IMAGE_PARAMS + ')\z', Regexp::IGNORECASE)
|
|
241
|
+
|
|
242
|
+
# Cleanup regex patterns using dynamic file namespaces
|
|
243
|
+
# For lines like "Image:file.jpg|thumb|...|caption" (gallery/imagemap remnants)
|
|
244
|
+
CLEANUP_FILE_LINE_REGEX = Regexp.new('^(?:' + FILE_NAMESPACES + '):[^\n]+\|[^\n]+$', Regexp::IGNORECASE | Regexp::MULTILINE)
|
|
245
|
+
# For incomplete File/Image links (opened but not closed)
|
|
246
|
+
CLEANUP_FILE_INCOMPLETE_REGEX = Regexp.new('\[\[(?:' + FILE_NAMESPACES + '):[^\]]*\|?\s*$', Regexp::IGNORECASE | Regexp::MULTILINE)
|
|
247
|
+
|
|
248
|
+
# Category regex - captures category name without sortkey
|
|
249
|
+
# [[Category:Name|sortkey]] -> captures only "Name" (not "Name|sortkey")
|
|
250
|
+
# The (?:[^\|\]\}]*) captures the category name up to | or ] or }
|
|
251
|
+
CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_NAMESPACES + ')\s*:([^\|\]\}]+)[\|\]\}]', Regexp::IGNORECASE)
|
|
65
252
|
|
|
66
253
|
ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
|
67
254
|
UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
|
|
68
255
|
|
|
69
256
|
REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
|
70
257
|
REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
|
|
71
|
-
|
|
258
|
+
|
|
259
|
+
# Note: TYPE_CODE_REGEX removed (was unused dead code)
|
|
260
|
+
# Template type detection is now handled by data-driven patterns in template_aliases.json
|
|
72
261
|
|
|
73
262
|
SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
|
|
74
263
|
DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
|
|
@@ -76,6 +265,7 @@ module Wp2txt
|
|
|
76
265
|
DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
|
|
77
266
|
CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
|
|
78
267
|
|
|
268
|
+
SELF_CLOSING_TAG_REGEX = Regexp.new('<[^<>]+/>')
|
|
79
269
|
COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
|
|
80
270
|
COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
|
|
81
271
|
COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
|
|
@@ -90,4 +280,198 @@ module Wp2txt
|
|
|
90
280
|
CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
|
|
91
281
|
CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
|
|
92
282
|
CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
|
|
283
|
+
|
|
284
|
+
# Additional cleanup patterns (pre-compiled once at load time for performance).
# In Ruby, ^ and $ always anchor at line boundaries; Regexp::MULTILINE only
# lets "." match a newline, so it has no effect on patterns without a ".".

# Mixed whitespace between newlines: \n<spaces/tabs>\n<spaces/tabs>\n+ -> \n\n
CLEANUP_MIXED_WHITESPACE_REGEX = Regexp.new('\n[ \t]*\n[ \t]*\n+')

# Multiple consecutive spaces (not at line start) -> single space.
# Group 1 is the non-newline character that precedes the run of spaces.
CLEANUP_MULTIPLE_SPACES_REGEX = Regexp.new('([^\n]) {2,}')

# Empty parentheses, either ASCII "()" or Japanese full-width (U+FF08 / U+FF09),
# optionally with whitespace inside - combined for single-pass removal.
# The full-width pair is written with \u escapes so that it cannot be mistaken
# for (or normalised into) a regex group, which would match bare whitespace
# and the empty string.
CLEANUP_EMPTY_PARENS_REGEX = Regexp.new('\(\s*\)|\uFF08\s*\uFF09')

# Multiple pipes (table remnants)
CLEANUP_MULTIPLE_PIPES_REGEX = Regexp.new('\|\|+')

# Trailing pipe at end of line
CLEANUP_TRAILING_PIPE_REGEX = Regexp.new('\|\s*$')

# Lines that are just pipe + content (table rows)
CLEANUP_PIPE_LINE_REGEX = Regexp.new('^\s*\|[^|]*$\n?', Regexp::MULTILINE)

# Lines with multiple pipe-separated key=value pairs (infobox remnants)
CLEANUP_KEY_VALUE_LINE_REGEX = Regexp.new('^\s*\|?\w+=[\w\s-]+(?:\|\w+=[\w\s-]+)+\s*$', Regexp::MULTILINE)

# Orphaned closing brackets: text followed by ]] at start of line or after
# whitespace. Group 1 is the text in front of the ]].
CLEANUP_ORPHANED_CLOSE_REGEX = Regexp.new('(?:^|(?<=\s))([^|\[\]\n]+)\]\]')

# Orphaned opening wiki brackets not closed on same line
CLEANUP_ORPHANED_OPEN_REGEX = Regexp.new('\[\[[^\[\]\n]*$')

# Standalone ]] on its own line
CLEANUP_STANDALONE_CLOSE_REGEX = Regexp.new('^\s*\]\]\s*$', Regexp::MULTILINE)

# Combined pattern for orphaned brackets (both open and standalone close) - single pass removal
CLEANUP_ORPHANED_BRACKETS_REGEX = Regexp.new('\[\[[^\[\]\n]*$|^\s*\]\]\s*$', Regexp::MULTILINE)

# ]] preceded by pipe without matching [[ (orphaned from broken links).
# Group 1 is the character before the pipe, group 2 the text between | and ]].
CLEANUP_PIPE_CLOSE_REGEX = Regexp.new('([^|\[\]\n])\|([^|\[\]\n]+)\]\](?!\])')

# Multiple blank lines (final cleanup)
CLEANUP_MULTI_BLANK_REGEX = Regexp.new('\n{3,}')

# Imagemap coordinate remnants (rect, poly, circle, default with coordinates)
IMAGEMAP_COORD_REGEX = Regexp.new('^(?:rect|poly|circle|default)\s+[\d\s]+.*$', Regexp::IGNORECASE)
|
|
313
|
+
|
|
314
|
+
# =========================================================================
|
|
315
|
+
# Multilingual cleanup patterns (language-agnostic)
|
|
316
|
+
# =========================================================================
|
|
317
|
+
|
|
318
|
+
# MediaWiki magic words (universal across all wikis)
|
|
319
|
+
# DEFAULTSORT, DISPLAYTITLE, etc. - loaded from mediawiki_aliases.json for multilingual support
|
|
320
|
+
DEFAULTSORT_KEYWORDS = load_mediawiki_data.dig("magic_words", "defaultsort")&.join("|") || "DEFAULTSORT"
|
|
321
|
+
DISPLAYTITLE_KEYWORDS = load_mediawiki_data.dig("magic_words", "displaytitle")&.join("|") || "DISPLAYTITLE"
|
|
322
|
+
|
|
323
|
+
# Match bare magic words on their own line: DEFAULTSORT:value or デフォルトソート:value
|
|
324
|
+
MAGIC_WORD_LINE_REGEX = Regexp.new('^(?:' + DEFAULTSORT_KEYWORDS + '|' + DISPLAYTITLE_KEYWORDS + ')[^\n]*$', Regexp::IGNORECASE)
|
|
325
|
+
|
|
326
|
+
# Match magic word template format: {{DEFAULTSORT:value}} or {{デフォルトソート:value}}
|
|
327
|
+
MAGIC_WORD_TEMPLATE_REGEX = Regexp.new('\{\{\s*(?:' + DEFAULTSORT_KEYWORDS + '|' + DISPLAYTITLE_KEYWORDS + ')[^\}]*\}\}', Regexp::IGNORECASE)
|
|
328
|
+
|
|
329
|
+
# Double-underscore magic words: __NOTOC__, __TOC__, __FORCETOC__, __NOEDITSECTION__, etc.
|
|
330
|
+
# Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
|
|
331
|
+
# Contains 1198 multilingual aliases for behavior switches
|
|
332
|
+
DOUBLE_UNDERSCORE_PATTERNS = load_mediawiki_data.dig("magic_words", "double_underscore") || []
|
|
333
|
+
# Regex matching any double-underscore behavior switch.
# With alias data present, the pattern is a case-insensitive alternation of the
# literal (regex-escaped) aliases; without data it falls back to a generic
# __UPPERCASE__ pattern.
DOUBLE_UNDERSCORE_MAGIC_REGEX =
  if DOUBLE_UNDERSCORE_PATTERNS.empty?
    # Fallback to basic pattern
    Regexp.new('__[A-Z]+__')
  else
    # Escape every alias so it is matched literally, then join into one alternation
    escaped_switches = DOUBLE_UNDERSCORE_PATTERNS.map { |switch| Regexp.escape(switch) }
    Regexp.new("(?:#{escaped_switches.join('|')})", Regexp::IGNORECASE)
  end
|
|
340
|
+
|
|
341
|
+
# Data-driven pattern for removing double-underscore behavior switches from text
|
|
342
|
+
# Uses the comprehensive multilingual magic word list (1198 aliases)
|
|
343
|
+
# Falls back to generic pattern if data file is empty
|
|
344
|
+
# Chooses the directive-removal regex: the generic pattern when no alias data
# was loaded, otherwise the alias-based DOUBLE_UNDERSCORE_MAGIC_REGEX.
REMOVE_DIRECTIVES_REGEX =
  if DOUBLE_UNDERSCORE_PATTERNS.empty?
    REMOVE_DIRECTIVES_REGEX_GENERIC
  else
    DOUBLE_UNDERSCORE_MAGIC_REGEX
  end
|
|
347
|
+
|
|
348
|
+
# Interwiki links: :en:Article, :fr:Article, :de:Article, etc.
|
|
349
|
+
# Removes the prefix but keeps the article name
|
|
350
|
+
INTERWIKI_PREFIX_REGEX = Regexp.new(':([a-z]{2,3}):(?=[^\s\]]+)')
|
|
351
|
+
|
|
352
|
+
# Authority control and metadata templates (standalone lines)
|
|
353
|
+
# These are template names that appear alone on a line after processing
|
|
354
|
+
# Data source: template_aliases.json (authority_control category)
|
|
355
|
+
AUTHORITY_CONTROL_TEMPLATES = load_template_data["authority_control"] || []
|
|
356
|
+
# Regex matching a line that consists solely of an authority-control template
# name (surrounding whitespace allowed), case-insensitively. Group 1 is the name.
AUTHORITY_CONTROL_REGEX = begin
  authority_names =
    if AUTHORITY_CONTROL_TEMPLATES.empty?
      # Fallback to basic pattern
      'Normdaten|Authority\s*control|Persondata|VIAF|LCCN|GND'
    else
      # Names from the data file are matched literally
      AUTHORITY_CONTROL_TEMPLATES.map { |name| Regexp.escape(name) }.join("|")
    end

  Regexp.new('^\s*(' + authority_names + ')\s*$', Regexp::MULTILINE | Regexp::IGNORECASE)
end
|
|
366
|
+
|
|
367
|
+
# Cleanup remnants - template names that appear as artifacts after processing
|
|
368
|
+
# Data source: template_aliases.json (cleanup_remnants category)
|
|
369
|
+
CLEANUP_REMNANTS_TEMPLATES = load_template_data["cleanup_remnants"] || []
|
|
370
|
+
# Regex matching a line that consists solely of a leftover template name
# (surrounding whitespace allowed), case-insensitively. Group 1 is the name.
CLEANUP_REMNANTS_REGEX = begin
  remnant_names =
    if CLEANUP_REMNANTS_TEMPLATES.empty?
      # Fallback to basic pattern
      'Clear|Clearleft|Clearright|notelist\d*'
    else
      literal_names = CLEANUP_REMNANTS_TEMPLATES.map { |name| Regexp.escape(name) }
      # Also match notelist with numbers (notelist2, notelist3, etc.)
      (literal_names << 'notelist\d+').join("|")
    end

  Regexp.new('^\s*(' + remnant_names + ')\s*$', Regexp::MULTILINE | Regexp::IGNORECASE)
end
|
|
379
|
+
|
|
380
|
+
# Category line patterns for all Wikipedia languages
|
|
381
|
+
# Loaded from mediawiki_aliases.json for complete multilingual support (230+ languages)
|
|
382
|
+
# Note: Must NOT match "CATEGORIES:" (our summary line)
|
|
383
|
+
# Matches a whole line of the form "<category namespace>:<text>", optionally
# preceded by whitespace and a "*" list marker. The negative lookahead keeps
# lines starting with "CATEGORIES" from matching.
CATEGORY_LINE_REGEX = begin
  line_head = '^\s*\*?\s*(?!CATEGORIES)(?:'
  line_tail = '):[^\n]+$'
  Regexp.new(line_head + CATEGORY_NAMESPACES + line_tail, Regexp::MULTILINE | Regexp::IGNORECASE)
end
|
|
387
|
+
|
|
388
|
+
# Wikimedia sister project markers (standalone lines)
|
|
389
|
+
# Data source: MediaWiki API (siteinfo interwikimap) via scripts/fetch_mediawiki_data.rb
|
|
390
|
+
# Contains 546 sister project prefixes from all Wikipedia language editions
|
|
391
|
+
SISTER_PROJECTS = load_mediawiki_data.dig("interwiki", "sister_projects") || []
|
|
392
|
+
# Filter to only keep known Wikimedia project names (not language codes)
|
|
393
|
+
WIKIMEDIA_PROJECT_NAMES = %w[
|
|
394
|
+
wikibooks wikiversity wikisource wikiquote wikinews wiktionary
|
|
395
|
+
wikivoyage wikispecies wikidata commons meta mediawiki
|
|
396
|
+
mediawikiwiki species oldwikisource wikifunctions school
|
|
397
|
+
].freeze
|
|
398
|
+
# Matches a line that starts (after optional whitespace) with a Wikimedia
# project name followed by either ":" or the end of the line, ignoring case.
# Group 1 is the project name.
WIKIMEDIA_PROJECT_REGEX = begin
  # Entries from the data file whose lower-cased form is a known project name
  known_in_data = SISTER_PROJECTS.select { |prefix| WIKIMEDIA_PROJECT_NAMES.include?(prefix.downcase) }

  # Known names always come first; the array union drops duplicates
  alternatives = (WIKIMEDIA_PROJECT_NAMES | known_in_data).map { |name| Regexp.escape(name) }

  # Common variations: the alternate "Wikimedia Commons" form and the
  # Commons category template
  alternatives.push('Wikimedia\s*Commons', 'Commons\s*cat(?:egory)?')

  Regexp.new(
    '^\s*(' + alternatives.join("|") + ')(?::|$)',
    Regexp::MULTILINE | Regexp::IGNORECASE
  )
end
|
|
412
|
+
|
|
413
|
+
# Lines that are just a single asterisk (list marker without content)
|
|
414
|
+
LONE_ASTERISK_REGEX = Regexp.new('^\s*\*\s*$', Regexp::MULTILINE)
|
|
415
|
+
|
|
416
|
+
# =========================================================================
|
|
417
|
+
# Non-article namespace prefixes (for validation filtering)
|
|
418
|
+
# =========================================================================
|
|
419
|
+
# These are namespaces that should be excluded from article validation
|
|
420
|
+
# as they contain templates, portals, help pages, etc. not encyclopedia content
|
|
421
|
+
#
|
|
422
|
+
# Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
|
|
423
|
+
# Contains 6083 namespace aliases from 351 Wikipedia language editions
|
|
424
|
+
NON_ARTICLE_NAMESPACES = (load_mediawiki_data.dig("namespaces", "non_article") || []).freeze
|
|
425
|
+
|
|
426
|
+
# Build regex for matching non-article titles
|
|
427
|
+
# Matches "Namespace:Title" where Namespace is in the list
|
|
428
|
+
# Matches a title that begins (after optional whitespace) with a non-article
# namespace followed by a colon, case-insensitively. Group 1 is the namespace.
NON_ARTICLE_NAMESPACE_REGEX = begin
  namespace_names =
    if NON_ARTICLE_NAMESPACES.empty?
      # Fallback to basic English namespaces
      'Wikipedia|Template|Portal|Help|Category|File|Image|User|Talk|Module|Draft|MediaWiki'
    else
      NON_ARTICLE_NAMESPACES.map { |namespace| Regexp.escape(namespace) }.join("|")
    end

  Regexp.new('\A\s*(' + namespace_names + ')\s*:', Regexp::IGNORECASE)
end
|
|
440
|
+
|
|
441
|
+
# Helper method to check if a title is an article page (not a special namespace)
|
|
442
|
+
# Reports whether +title+ refers to an article page.
#
# @param title [String, nil] page title to classify
# @return [Boolean] true when the title is nil, empty, or does not start with
#   a namespace matched by NON_ARTICLE_NAMESPACE_REGEX; false otherwise
def self.article_page?(title)
  untitled = title.nil? || title.empty?
  untitled || !NON_ARTICLE_NAMESPACE_REGEX.match?(title)
end
|
|
446
|
+
|
|
447
|
+
# =========================================================================
|
|
448
|
+
# Make constants Ractor-shareable for parallel processing
|
|
449
|
+
# =========================================================================
|
|
450
|
+
# This allows Ractor workers to access these constants without isolation errors.
|
|
451
|
+
# Every constant not listed in RACTOR_SHAREABLE_EXCLUDES (Regexp, String, Array, ...) is passed to Ractor.make_shareable.
|
|
452
|
+
|
|
453
|
+
# Constants that should NOT be made Ractor-shareable
|
|
454
|
+
# (they require mutable state or are already shareable)
|
|
455
|
+
RACTOR_SHAREABLE_EXCLUDES = %i[HTML_DECODER RACTOR_SHAREABLE_EXCLUDES].freeze
|
|
456
|
+
|
|
457
|
+
# Passes each constant defined directly on this module to
# Ractor.make_shareable, which deep-freezes the value.
#
# Constants named in RACTOR_SHAREABLE_EXCLUDES and values that are already
# shareable are skipped. Does nothing when Ractor is unavailable.
#
# @return [Array<Symbol>, nil] the visited constant names, or nil when Ractor
#   (or Ractor.make_shareable) is not available
def self.make_constants_ractor_shareable!
  return unless defined?(Ractor) && Ractor.respond_to?(:make_shareable)

  constants(false).each do |const_name|
    next if RACTOR_SHAREABLE_EXCLUDES.include?(const_name)

    value = const_get(const_name)
    next if Ractor.shareable?(value)

    begin
      Ractor.make_shareable(value)
    rescue Ractor::Error, FrozenError, TypeError
      # Some constants can't be made shareable, skip them.
      # Ractor::Error is rescued (not just its subclass Ractor::IsolationError)
      # because make_shareable raises the base class for objects it cannot
      # share; letting that escape would abort loading of the whole module.
    end
  end
end
|
|
473
|
+
|
|
474
|
+
# Make constants shareable when this module is loaded
|
|
475
|
+
# Excludes constants that require mutable state (like HTML_DECODER)
|
|
476
|
+
make_constants_ractor_shareable!
|
|
93
477
|
end
|