wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,215 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "etc"
4
+
5
module Wp2txt
  # Ractor-based parallel processing for Wikipedia article conversion
  #
  # Ractor allows true parallelism by bypassing Ruby's GVL (Global VM Lock),
  # enabling significant speedups for CPU-intensive text processing.
  #
  # REQUIREMENTS: Ruby 4.0+ (Ractor API stabilized in Ruby 4.0)
  # For Ruby 3.x, the Parallel gem is used instead (process-based parallelism).
  #
  # Performance: Typically 1.5-2x speedup with 4 workers on multi-core systems.
  #
  # Usage:
  #   pages = [["Title1", "wiki text..."], ["Title2", "wiki text..."]]
  #   results = RactorWorker.process_articles(pages, config: config)
  #
  module RactorWorker
    # Minimum Ruby version required for stable Ractor support
    MINIMUM_RUBY_VERSION = "4.0"

    # Registry of available operations (checked before any Ractor is spawned)
    OPERATIONS = %i[process_article double fib].freeze

    module_function

    # Check if Ractor is available and usable.
    # The probe result is memoized, so a Ractor is spawned at most once.
    # @return [Boolean] true if Ractor can be used
    def available?
      return @available if defined?(@available)

      @available = check_ractor_available
    end

    # Internal method to check Ractor availability.
    # Runs a trivial Ractor and verifies its value; any StandardError raised
    # while doing so is treated as "not available".
    # @return [Boolean] true if Ractor can be used
    def check_ractor_available
      return false unless ruby_version_sufficient?
      return false unless defined?(Ractor)

      # Smoke-test the join/value API used by process_batch
      probe = Ractor.new { 1 + 1 }
      probe.join
      probe.value == 2
    rescue StandardError
      false
    end

    # Check if Ruby version meets minimum requirement
    # @return [Boolean] true if RUBY_VERSION >= MINIMUM_RUBY_VERSION
    def ruby_version_sufficient?
      Gem::Version.new(RUBY_VERSION) >= Gem::Version.new(MINIMUM_RUBY_VERSION)
    end

    # Process articles in parallel using Ractor (main entry point)
    # @param pages [Array<Array>] Array of [title, text] pairs
    # @param config [Hash] Configuration options for formatting
    # @param strip_tmarker [Boolean] Whether to strip list markers
    # @param num_workers [Integer] Number of parallel Ractors (optional)
    # @return [Array] Formatted article results, in the order of +pages+
    #   (an entry is nil when that article failed inside a Ractor)
    def process_articles(pages, config:, strip_tmarker: false, num_workers: nil)
      items = pages.map { |title, text| [title, text, strip_tmarker] }

      parallel_process(
        items,
        operation: :process_article,
        config: config,
        num_workers: num_workers
      )
    end

    # Process items in parallel using map-join-value pattern (Ruby 4.0+)
    #
    # Falls back to sequential processing when Ractor is unavailable, when
    # there is at most one item, or when a Ractor::Error is raised.
    # In the sequential path exceptions propagate to the caller; in the
    # parallel path a StandardError raised for one item turns that item's
    # result into nil.
    #
    # @param items [Array] Items to process
    # @param operation [Symbol] Operation to perform (:process_article, :double, :fib)
    # @param config [Hash] Configuration to pass to each operation
    # @param num_workers [Integer] Max concurrent Ractors (default: optimal_workers)
    # @return [Array] Results from processing (in original order)
    # @raise [RuntimeError] if +operation+ is not one of OPERATIONS
    def parallel_process(items, operation:, config: {}, num_workers: nil)
      batch_size = [num_workers || optimal_workers, 1].max

      # Fall back to sequential if Ractor not available or single item
      unless available? && items.size > 1
        return items.map { |item| process_single(item, operation, config) }
      end

      # Reject unknown operations up front. Inside a Ractor the same error
      # would be swallowed and every result would silently become nil.
      raise "Unknown operation: #{operation}" unless OPERATIONS.include?(operation)

      # Freeze config for sharing across Ractors
      frozen_config = deep_freeze(config.dup)

      # Process in batches to limit concurrent Ractors
      results = []
      items.each_slice(batch_size) do |batch|
        results.concat(process_batch(batch, operation, frozen_config))
      end
      results
    rescue Ractor::Error => e
      warn "Ractor error (#{e.message}), falling back to sequential processing"
      items.map { |item| process_single(item, operation, config) }
    end

    # Process a batch using map-join-value pattern (Ruby 4.0 API)
    # Spawns one Ractor per item and waits for all of them.
    # @param items [Array] Items to process in this batch
    # @param operation [Symbol] Operation to perform
    # @param frozen_config [Hash] Frozen configuration hash
    # @return [Array] Results in original order (nil for items that raised)
    def process_batch(items, operation, frozen_config)
      ractors = items.map.with_index do |item, idx|
        Ractor.new(item, frozen_config, operation, idx) do |work_item, cfg, op, position|
          # Reuse the sequential implementation so both paths share a single
          # definition of every operation.
          result = begin
            Wp2txt::RactorWorker.process_single(work_item, op, cfg)
          rescue StandardError
            nil # Return nil on error
          end
          [position, result] # Return index and result for ordering
        end
      end

      # Wait for all Ractors to complete and collect results
      collected = Array.new(items.size)
      ractors.each do |ractor|
        ractor.join
        position, result = ractor.value
        collected[position] = result
      end

      collected
    end

    # Process a single item (used for sequential processing and, via
    # process_batch, inside each Ractor)
    # @param item [Object] Item to process
    # @param operation [Symbol] Operation to perform
    # @param config [Hash] Configuration options
    # @return [Object] Processing result
    # @raise [RuntimeError] if +operation+ is unknown
    def process_single(item, operation, config)
      case operation
      when :process_article
        # Loaded lazily so the arithmetic operations work without the
        # formatting stack.
        require_relative "utils"
        require_relative "regex"
        require_relative "article"
        require_relative "formatter"

        title, text, strip_tmarker = item
        formatter = Object.new
        formatter.extend(Wp2txt)
        formatter.extend(Wp2txt::Formatter)
        article = Wp2txt::Article.new(text, title, strip_tmarker)
        formatter.format_article(article, config)
      when :double
        item * 2
      when :fib
        fib = ->(n) { n <= 1 ? n : fib.call(n - 1) + fib.call(n - 2) }
        fib.call(item)
      else
        raise "Unknown operation: #{operation}"
      end
    end

    # Calculate optimal number of workers based on CPU cores
    # (all cores up to 4, one less for 5-8, 80% of the cores above that)
    # @return [Integer] Recommended concurrency level
    def optimal_workers
      cores = Etc.nprocessors
      case cores
      when 1..4 then cores
      when 5..8 then cores - 1
      else (cores * 0.8).to_i
      end
    end

    # Deep freeze an object for Ractor sharing
    #
    # Hashes, Arrays and unfrozen Strings are copied before freezing, so the
    # caller's containers stay mutable. Any other object is frozen in place.
    # @param obj [Object] Object to freeze
    # @return [Object] The frozen object (or +obj+ itself if freezing failed)
    def deep_freeze(obj)
      case obj
      when Hash
        obj.transform_keys { |k| deep_freeze(k) }
           .transform_values { |v| deep_freeze(v) }
           .freeze
      when Array
        obj.map { |v| deep_freeze(v) }.freeze
      when String
        obj.frozen? ? obj : obj.dup.freeze
      when Symbol, Integer, Float, TrueClass, FalseClass, NilClass
        obj
      else
        begin
          obj.freeze
        rescue StandardError
          obj # best effort: hand back the object unfrozen
        end
      end
    end
  end
end
data/lib/wp2txt/regex.rb CHANGED
@@ -1,8 +1,145 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "htmlentities"
4
+ require "json"
5
+ require_relative "global_data_cache"
6
+
7
+ # Make HTMLEntities gem Ractor-shareable
8
+ # This must be done before any Ractor tries to use HTMLEntities
9
# Best-effort: deep-freeze every constant of the HTMLEntities gem so that
# Ractors can read them. Constants that cannot be made shareable are skipped
# one by one; any other failure aborts the setup silently.
if defined?(Ractor) && Ractor.respond_to?(:make_shareable)
  begin
    HTMLEntities.constants.each do |name|
      value = HTMLEntities.const_get(name)
      Ractor.make_shareable(value) unless Ractor.shareable?(value)
    rescue Ractor::IsolationError, FrozenError, TypeError
      # This constant can't be made shareable; move on to the next one
      next
    end
  rescue StandardError
    # Ignore errors during shareable setup
    nil
  end
end
4
23
 
5
24
  module Wp2txt
25
+ # Data file paths
26
+ MEDIAWIKI_DATA_PATH = File.join(__dir__, "data", "mediawiki_aliases.json")
27
+ HTML_ENTITIES_PATH = File.join(__dir__, "data", "html_entities.json")
28
+ WIKIPEDIA_ENTITIES_PATH = File.join(__dir__, "data", "wikipedia_entities.json")
29
+ TEMPLATE_DATA_PATH = File.join(__dir__, "data", "template_aliases.json")
30
+
31
# Load MediaWiki aliases from data file (generated by scripts/fetch_mediawiki_data.rb)
#
# Lookup order: in-memory memo, then GlobalDataCache, then the JSON file at
# MEDIAWIKI_DATA_PATH, then a minimal built-in default. Data parsed from the
# JSON file is written back to GlobalDataCache; the built-in default is not.
#
# @return [Hash] parsed alias data (string keys such as "magic_words", "namespaces")
def self.load_mediawiki_data
  return @mediawiki_data if @mediawiki_data

  # Try the cache first
  cached = GlobalDataCache.load(:mediawiki)
  return @mediawiki_data = cached if cached

  if File.exist?(MEDIAWIKI_DATA_PATH)
    # Read explicitly as UTF-8: the file holds aliases in many scripts and the
    # process default external encoding may be US-ASCII (e.g. LANG=C).
    @mediawiki_data = JSON.parse(File.read(MEDIAWIKI_DATA_PATH, encoding: "UTF-8"))
    # Save to cache for future use
    GlobalDataCache.save(:mediawiki, @mediawiki_data)
  else
    # Fallback to minimal defaults if data file is missing
    @mediawiki_data = {
      "magic_words" => { "redirect" => ["REDIRECT"] },
      "namespaces" => { "category" => ["Category"], "file" => ["File", "Image"] }
    }
  end
  @mediawiki_data
end
57
+
58
# Load template aliases from data file (generated by scripts/fetch_template_data.rb)
#
# Lookup order: in-memory memo, then GlobalDataCache, then the JSON file at
# TEMPLATE_DATA_PATH, then a minimal built-in default. Data parsed from the
# JSON file is written back to GlobalDataCache; the built-in default is not.
#
# @return [Hash] template name lists keyed by category (string keys)
def self.load_template_data
  return @template_data if @template_data

  # Try the cache first
  cached = GlobalDataCache.load(:template)
  return @template_data = cached if cached

  if File.exist?(TEMPLATE_DATA_PATH)
    # Read explicitly as UTF-8 so parsing does not depend on the process
    # default external encoding (which may be US-ASCII, e.g. LANG=C).
    @template_data = JSON.parse(File.read(TEMPLATE_DATA_PATH, encoding: "UTF-8"))
    # Save to cache for future use
    GlobalDataCache.save(:template, @template_data)
  else
    # Fallback to minimal defaults if data file is missing
    @template_data = {
      "remove_templates" => %w[reflist notelist main see\ also portal],
      "authority_control" => %w[authority\ control normdaten],
      "cleanup_remnants" => %w[clear clearleft clearright]
    }
  end
  @template_data
end
85
+
86
# Build a case-insensitive regex from a template list (escapes special chars, joins with |)
#
# nil and empty entries are dropped: an empty alternative would match every
# input. A single name may be passed instead of a list.
#
# @param templates [Array<String>, String, nil] template names
# @param options [Hash] :anchor_start prefixes the pattern with \A\s*;
#   :require_pipe_or_end requires optional whitespace followed by "|" or end of line
# @return [Regexp, nil] nil when no usable template name is given
def self.build_template_regex(templates, options = {})
  names = Array(templates).compact.map(&:to_s).reject(&:empty?)
  return nil if names.empty?

  alternation = names.map { |name| Regexp.escape(name) }.join("|")
  prefix = options[:anchor_start] ? '\A\s*(?:' : '(?:'
  suffix = options[:require_pipe_or_end] ? ')\s*(?:\||$)' : ')'
  Regexp.new(prefix + alternation + suffix, Regexp::IGNORECASE)
end
101
+
102
# Load HTML entities from WHATWG data file (generated by scripts/fetch_html_entities.rb)
#
# Lookup order: in-memory memo, then GlobalDataCache (combined entities),
# then the JSON files. The WHATWG file is merged first and the
# Wikipedia-specific file second, so Wikipedia entries override WHATWG ones.
# A non-empty result is written back to GlobalDataCache.
#
# @return [Hash] entity name => replacement text (empty when no data file exists)
def self.load_html_entities
  return @html_entities if @html_entities

  # Try the cache first (combined entities)
  cached = GlobalDataCache.load(:html_entities_combined)
  return @html_entities = cached if cached

  @html_entities = {}

  # Later files override earlier ones
  [HTML_ENTITIES_PATH, WIKIPEDIA_ENTITIES_PATH].each do |path|
    next unless File.exist?(path)

    # Read explicitly as UTF-8: replacement values are non-ASCII and the
    # process default external encoding may be US-ASCII (e.g. LANG=C).
    data = JSON.parse(File.read(path, encoding: "UTF-8"))
    @html_entities.merge!(data["entities"] || {})
  end

  # Save combined entities to cache
  GlobalDataCache.save(:html_entities_combined, @html_entities) unless @html_entities.empty?

  @html_entities
end
133
+
134
# Build regex for extra entities not handled by HTMLEntities gem
#
# Keys are ordered longest first: regex alternation takes the first
# alternative that matches, so a key that is a prefix of another one
# (e.g. "&not" vs "&notin;") must come after the longer key.
#
# @return [Regexp, nil] a single capturing alternation of all entity keys,
#   or nil when no entities are loaded
def self.build_extra_entities_regex
  entities = load_html_entities
  return nil if entities.empty?

  # Ties are broken alphabetically to keep the pattern deterministic
  ordered_keys = entities.keys.sort_by { |key| [-key.length, key] }
  Regexp.new("(" + ordered_keys.map { |key| Regexp.escape(key) }.join("|") + ")")
end
6
143
  ###################################################
7
144
  # variables to save resource for generating regexps
8
145
  # those with a trailing number 1 represent opening tag/markup
@@ -14,9 +151,22 @@ module Wp2txt
14
151
  ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
15
152
  HTML_HASH = Hash[*ENTITIES.flatten]
16
153
  HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
154
+
155
+ # Extra HTML entities loaded from JSON files (WHATWG + Wikipedia-specific)
156
+ # These supplement the HTMLEntities gem which only covers ~250 basic entities
157
+ # Data sources:
158
+ # - lib/wp2txt/data/html_entities.json (2000+ WHATWG entities)
159
+ # - lib/wp2txt/data/wikipedia_entities.json (Wikipedia-specific)
160
+ EXTRA_ENTITIES = load_html_entities.freeze
161
+ EXTRA_ENTITIES_REGEX = build_extra_entities_regex
162
+
163
+ # Legacy alias for backward compatibility
164
+ MATH_ENTITIES = EXTRA_ENTITIES
165
+ MATH_ENTITIES_REGEX = EXTRA_ENTITIES_REGEX
17
166
  ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
18
167
  ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
19
- ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
168
+ # Match lines starting with [[ that don't end with ]] (handles inner links)
169
+ ML_LINK_ONSET_REGEX = Regexp.new('^\[\[(?!.*\]\]\s*$).*$')
20
170
  ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
21
171
  ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
22
172
  ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
@@ -30,10 +180,12 @@ module Wp2txt
30
180
  IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
31
181
  IN_MATH_REGEX1 = Regexp.new('<math.*?>')
32
182
  IN_MATH_REGEX2 = Regexp.new('<\/math>')
33
- IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
183
+ IN_HEADING_REGEX = Regexp.new('^=+.*?=+\s*$')
34
184
  IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
35
185
  IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
36
186
  IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
187
+ # HTML comments (can span multiple lines)
188
+ HTML_COMMENT_REGEX = Regexp.new('<!--.*?-->', Regexp::MULTILINE)
37
189
  IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
38
190
  IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
39
191
  IN_UNORDERED_REGEX = Regexp.new('^\*')
@@ -41,17 +193,25 @@ module Wp2txt
41
193
  IN_PRE_REGEX = Regexp.new('^ ')
42
194
  IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
43
195
  BLANK_LINE_REGEX = Regexp.new('^\s*$')
44
- REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
196
+
197
+ # Multilingual redirect keyword support
198
+ # Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
199
+ REDIRECT_KEYWORDS = load_mediawiki_data.dig("magic_words", "redirect")&.join("|") || "REDIRECT"
200
# A redirect starts with an ASCII "#" or a full-width "＃", followed by a
# redirect keyword, an optional colon, and the [[target]] link (captured).
REDIRECT_REGEX = Regexp.new('[#＃](?:' + REDIRECT_KEYWORDS + ')\s*:?\s*\[\[([^\]]+)\]\]', Regexp::IGNORECASE)
45
201
  REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
46
- REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
202
+
203
+ # Legacy generic pattern for double-underscore directives
204
+ # Note: Data-driven REMOVE_DIRECTIVES_REGEX is defined later (after DOUBLE_UNDERSCORE_MAGIC_REGEX)
205
+ REMOVE_DIRECTIVES_REGEX_GENERIC = Regexp.new("\_\_[^\_]*\_\_")
206
+
47
207
  REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
48
208
  CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
49
209
  MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
50
- REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
51
- MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
52
- MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
53
- MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
54
- MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
210
+ REMOVE_HR_REGEX = Regexp.new('^\s*\-{4,}\s*$')
211
+ MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>', Regexp::IGNORECASE)
212
+ MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>', Regexp::IGNORECASE)
213
+ MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>', Regexp::IGNORECASE)
214
+ MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>', Regexp::IGNORECASE)
55
215
  FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
56
216
  HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
57
217
  HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
@@ -60,15 +220,44 @@ module Wp2txt
60
220
  DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
61
221
  ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
62
222
 
63
- CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
64
- CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
223
+ # Multilingual category namespace support
224
+ # Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
225
+ CATEGORY_NAMESPACES = load_mediawiki_data.dig("namespaces", "category")&.join("|") || "Category"
226
+
227
+ # Multilingual file namespace support (for image/file links)
228
+ FILE_NAMESPACES = load_mediawiki_data.dig("namespaces", "file")&.join("|") || "File|Image"
229
+ FILE_NAMESPACES_REGEX = Regexp.new('\A\s*(?:' + FILE_NAMESPACES + ')\s*:', Regexp::IGNORECASE)
230
+
231
+ # Multilingual category namespace support (for filtering out category links from body text)
232
+ CATEGORY_NAMESPACE_REGEX = Regexp.new('\A\s*(?:' + CATEGORY_NAMESPACES + ')\s*:', Regexp::IGNORECASE)
233
+
234
+ # Image parameters (multilingual) - used for filtering out non-caption parts of File/Image links
235
+ # Combines: img_thumbnail, img_framed, img_frameless, img_border, img_right, img_left, img_center, img_none,
236
+ # img_upright, img_baseline, img_sub, img_super, img_top, img_text_top, img_middle, img_bottom, img_text_bottom
237
+ IMAGE_PARAM_KEYS = %w[img_thumbnail img_framed img_frameless img_border img_right img_left img_center img_none
238
+ img_upright img_baseline img_sub img_super img_top img_text_top img_middle img_bottom img_text_bottom].freeze
239
+ IMAGE_PARAMS = IMAGE_PARAM_KEYS.flat_map { |k| load_mediawiki_data.dig("magic_words", k) || [] }.uniq.join("|")
240
+ IMAGE_PARAMS_REGEX = IMAGE_PARAMS.empty? ? nil : Regexp.new('\A(' + IMAGE_PARAMS + ')\z', Regexp::IGNORECASE)
241
+
242
+ # Cleanup regex patterns using dynamic file namespaces
243
+ # For lines like "Image:file.jpg|thumb|...|caption" (gallery/imagemap remnants)
244
+ CLEANUP_FILE_LINE_REGEX = Regexp.new('^(?:' + FILE_NAMESPACES + '):[^\n]+\|[^\n]+$', Regexp::IGNORECASE | Regexp::MULTILINE)
245
+ # For incomplete File/Image links (opened but not closed)
246
+ CLEANUP_FILE_INCOMPLETE_REGEX = Regexp.new('\[\[(?:' + FILE_NAMESPACES + '):[^\]]*\|?\s*$', Regexp::IGNORECASE | Regexp::MULTILINE)
247
+
248
+ # Category regex - captures category name without sortkey
249
+ # [[Category:Name|sortkey]] -> captures only "Name" (not "Name|sortkey")
250
+ # The ([^\|\]\}]+) group captures the category name up to (not including) the first | or ] or }
251
+ CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_NAMESPACES + ')\s*:([^\|\]\}]+)[\|\]\}]', Regexp::IGNORECASE)
65
252
 
66
253
  ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
67
254
  UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
68
255
 
69
256
  REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
70
257
  REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
71
- TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
258
+
259
+ # Note: TYPE_CODE_REGEX removed (was unused dead code)
260
+ # Template type detection is now handled by data-driven patterns in template_aliases.json
72
261
 
73
262
  SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
74
263
  DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
@@ -76,6 +265,7 @@ module Wp2txt
76
265
  DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
77
266
  CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
78
267
 
268
+ SELF_CLOSING_TAG_REGEX = Regexp.new('<[^<>]+/>')
79
269
  COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
80
270
  COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
81
271
  COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
@@ -90,4 +280,198 @@ module Wp2txt
90
280
  CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
91
281
  CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
92
282
  CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
283
+
284
+ # Additional cleanup patterns (pre-compiled once as constants instead of being rebuilt per call)
285
+ # Mixed whitespace between newlines: \n<spaces/tabs>\n<spaces/tabs>\n+ -> \n\n (the replacement itself is applied by the caller)
286
+ CLEANUP_MIXED_WHITESPACE_REGEX = Regexp.new('\n[ \t]*\n[ \t]*\n+')
287
+ # Multiple consecutive spaces (not at line start) -> single space; group 1 is the non-newline character before the run, so a replacement can put it back
288
+ CLEANUP_MULTIPLE_SPACES_REGEX = Regexp.new('([^\n]) {2,}')
289
# Empty parentheses (ASCII and Japanese) - combined for single-pass removal.
#   ASCII:      "(" optional whitespace ")"
#   full-width: "（" optional whitespace "）"  (U+FF08 / U+FF09)
# The second alternative has to spell out the full-width characters: an
# unescaped ASCII "(\s*)" is only a capture group around \s*, which matches
# any run of whitespace (and the empty string) rather than a pair of
# parentheses, so a gsub with it would strip ordinary spacing from the text.
CLEANUP_EMPTY_PARENS_REGEX = Regexp.new('\(\s*\)|（\s*）')
291
+ # Multiple pipes (table remnants): a run of two or more consecutive "|" characters
292
+ CLEANUP_MULTIPLE_PIPES_REGEX = Regexp.new('\|\|+')
293
+ # Trailing pipe at end of line, together with any whitespace that follows it
294
+ CLEANUP_TRAILING_PIPE_REGEX = Regexp.new('\|\s*$')
295
# Lines that are just pipe + content (table rows): optional leading
# whitespace, a "|", then text with no further "|" up to the end of that same
# line; the line's own trailing newline is consumed as well.
# The content class excludes "\n" on purpose. A bare [^|] also matches
# newlines, so the match would run on past its own line and swallow every
# following line up to the next "|" (or the end of the text).
CLEANUP_PIPE_LINE_REGEX = Regexp.new('^\s*\|[^|\n]*$\n?', Regexp::MULTILINE)
297
+ # Lines with multiple pipe-separated key=value pairs (infobox remnants): at least two pairs; values may hold word characters, whitespace and hyphens
298
+ CLEANUP_KEY_VALUE_LINE_REGEX = Regexp.new('^\s*\|?\w+=[\w\s-]+(?:\|\w+=[\w\s-]+)+\s*$', Regexp::MULTILINE)
299
+ # Orphaned closing brackets: text free of | [ ] that starts at line start or after whitespace and is followed by ]]; group 1 is that text
300
+ CLEANUP_ORPHANED_CLOSE_REGEX = Regexp.new('(?:^|(?<=\s))([^|\[\]\n]+)\]\]')
301
+ # Orphaned opening wiki brackets not closed on same line: "[[" followed by no further bracket up to the line end
302
+ CLEANUP_ORPHANED_OPEN_REGEX = Regexp.new('\[\[[^\[\]\n]*$')
303
+ # Standalone ]] on its own line (only whitespace around it)
304
+ CLEANUP_STANDALONE_CLOSE_REGEX = Regexp.new('^\s*\]\]\s*$', Regexp::MULTILINE)
305
+ # Combined pattern for orphaned brackets (the two patterns above joined with "|") - single pass removal
306
+ CLEANUP_ORPHANED_BRACKETS_REGEX = Regexp.new('\[\[[^\[\]\n]*$|^\s*\]\]\s*$', Regexp::MULTILINE)
307
+ # ]] preceded by pipe without matching [[ (orphaned from broken links); group 1 is the character before the pipe, group 2 the text between the pipe and ]]
308
+ CLEANUP_PIPE_CLOSE_REGEX = Regexp.new('([^|\[\]\n])\|([^|\[\]\n]+)\]\](?!\])')
309
+ # Multiple blank lines (final cleanup): three or more consecutive newlines
310
+ CLEANUP_MULTI_BLANK_REGEX = Regexp.new('\n{3,}')
311
# Imagemap coordinate remnants: a line that starts with a shape keyword
# (rect, poly, circle, default; case-insensitive), followed by blanks and a
# run of digits/blanks, then anything up to the end of that line.
# Only spaces and tabs are accepted as separators. \s would also match a
# line break, letting the separator run cross into later lines: a line that
# consists solely of e.g. "Circle", followed by a blank line, would then drag
# the next line of real text into the match.
IMAGEMAP_COORD_REGEX = Regexp.new('^(?:rect|poly|circle|default)[ \t]+[\d \t]+.*$', Regexp::IGNORECASE)
313
+
314
+ # =========================================================================
315
+ # Multilingual cleanup patterns (language-agnostic)
316
+ # =========================================================================
317
+
318
# MediaWiki magic words (universal across all wikis)
# DEFAULTSORT, DISPLAYTITLE, etc. - loaded from mediawiki_aliases.json for multilingual support
# Each *_KEYWORDS constant is a "|"-joined alternation that is spliced into
# the regexes below. The English keyword is used when the alias list is
# missing OR empty: joining an empty list gives "", which is truthy, so a
# plain `|| "DEFAULTSORT"` fallback would be skipped and the regexes would
# receive an empty alternative.
DEFAULTSORT_KEYWORDS = begin
  aliases = load_mediawiki_data.dig("magic_words", "defaultsort") || []
  aliases.empty? ? "DEFAULTSORT" : aliases.join("|")
end
DISPLAYTITLE_KEYWORDS = begin
  aliases = load_mediawiki_data.dig("magic_words", "displaytitle") || []
  aliases.empty? ? "DISPLAYTITLE" : aliases.join("|")
end

# Match bare magic words on their own line: DEFAULTSORT:value or デフォルトソート:value
# (any line that begins with one of the keywords, through the end of that line)
MAGIC_WORD_LINE_REGEX = Regexp.new('^(?:' + DEFAULTSORT_KEYWORDS + '|' + DISPLAYTITLE_KEYWORDS + ')[^\n]*$', Regexp::IGNORECASE)

# Match magic word template format: {{DEFAULTSORT:value}} or {{デフォルトソート:value}}
# (whitespace may follow the opening braces; the body may not contain "}")
MAGIC_WORD_TEMPLATE_REGEX = Regexp.new('\{\{\s*(?:' + DEFAULTSORT_KEYWORDS + '|' + DISPLAYTITLE_KEYWORDS + ')[^\}]*\}\}', Regexp::IGNORECASE)
328
+
329
+ # Double-underscore magic words: __NOTOC__, __TOC__, __FORCETOC__, __NOEDITSECTION__, etc.
330
+ # Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
331
+ # Contains 1198 multilingual aliases for behavior switches
332
+ DOUBLE_UNDERSCORE_PATTERNS = load_mediawiki_data.dig("magic_words", "double_underscore") || []
333
+ DOUBLE_UNDERSCORE_MAGIC_REGEX = if DOUBLE_UNDERSCORE_PATTERNS.empty?
334
+ Regexp.new('__[A-Z]+__') # Fallback to basic pattern (ASCII uppercase letters only, case-sensitive)
335
+ else
336
+ # Build alternation pattern from actual magic word aliases (each alias regex-escaped, matched case-insensitively)
337
+ pattern = DOUBLE_UNDERSCORE_PATTERNS.map { |p| Regexp.escape(p) }.join("|")
338
+ Regexp.new('(?:' + pattern + ')', Regexp::IGNORECASE)
339
+ end
340
+
341
+ # Data-driven pattern for removing double-underscore behavior switches from text
342
+ # Uses the comprehensive multilingual magic word list (1198 aliases)
343
+ # Falls back to REMOVE_DIRECTIVES_REGEX_GENERIC (defined outside this section) if the alias list is empty
344
+ REMOVE_DIRECTIVES_REGEX = DOUBLE_UNDERSCORE_PATTERNS.empty? ?
345
+ REMOVE_DIRECTIVES_REGEX_GENERIC :
346
+ DOUBLE_UNDERSCORE_MAGIC_REGEX
347
+
348
+ # Interwiki links: :en:Article, :fr:Article, :de:Article, etc.
349
+ # Matches only the ":xx:" prefix (2-3 lowercase letters, group 1); the lookahead requires a following character that is neither whitespace nor "]" but leaves the article name unmatched, so deleting the match keeps the name
350
+ INTERWIKI_PREFIX_REGEX = Regexp.new(':([a-z]{2,3}):(?=[^\s\]]+)')
351
+
352
+ # Authority control and metadata templates (standalone lines)
353
+ # These are template names that appear alone on a line after processing (surrounding whitespace allowed, case-insensitive; group 1 is the name)
354
+ # Data source: template_aliases.json (authority_control category)
355
+ AUTHORITY_CONTROL_TEMPLATES = load_template_data["authority_control"] || []
356
+ AUTHORITY_CONTROL_REGEX = if AUTHORITY_CONTROL_TEMPLATES.empty?
357
+ # Fallback to basic pattern (a fixed handful of names) when the data provides no entries
358
+ Regexp.new(
359
+ '^\s*(Normdaten|Authority\s*control|Persondata|VIAF|LCCN|GND)\s*$',
360
+ Regexp::MULTILINE | Regexp::IGNORECASE
361
+ )
362
+ else
363
+ pattern = AUTHORITY_CONTROL_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
364
+ Regexp.new('^\s*(' + pattern + ')\s*$', Regexp::MULTILINE | Regexp::IGNORECASE)
365
+ end
366
+
367
+ # Cleanup remnants - template names that appear as artifacts after processing (alone on a line, case-insensitive; group 1 is the name)
368
+ # Data source: template_aliases.json (cleanup_remnants category)
369
+ CLEANUP_REMNANTS_TEMPLATES = load_template_data["cleanup_remnants"] || []
370
+ CLEANUP_REMNANTS_REGEX = if CLEANUP_REMNANTS_TEMPLATES.empty?
371
+ # Fallback to basic pattern (here notelist\d* also covers a bare "notelist")
372
+ Regexp.new('^\s*(Clear|Clearleft|Clearright|notelist\d*)\s*$', Regexp::MULTILINE | Regexp::IGNORECASE)
373
+ else
374
+ pattern = CLEANUP_REMNANTS_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
375
+ # Also match notelist with numbers (notelist2, notelist3, etc.); \d+ needs at least one digit, so a bare "notelist" matches only if the data list contains it
376
+ pattern += '|notelist\d+'
377
+ Regexp.new('^\s*(' + pattern + ')\s*$', Regexp::MULTILINE | Regexp::IGNORECASE)
378
+ end
379
+
380
+ # Category line patterns for all Wikipedia languages: optional whitespace and "*" list marker, a category namespace, ":" and the rest of the line
381
+ # Loaded from mediawiki_aliases.json for complete multilingual support (230+ languages)
382
+ # Note: Must NOT match "CATEGORIES:" (our summary line) - enforced by the (?!CATEGORIES) lookahead, which is case-insensitive like the rest of the pattern
383
+ CATEGORY_LINE_REGEX = Regexp.new(
384
+ '^\s*\*?\s*(?!CATEGORIES)(?:' + CATEGORY_NAMESPACES + '):[^\n]+$',
385
+ Regexp::MULTILINE | Regexp::IGNORECASE
386
+ )
387
+
388
+ # Wikimedia sister project markers: a project name at the start of a line, followed by ":" or the end of the line
389
+ # Data source: MediaWiki API (siteinfo interwikimap) via scripts/fetch_mediawiki_data.rb
390
+ # Contains 546 sister project prefixes from all Wikipedia language editions
391
+ SISTER_PROJECTS = load_mediawiki_data.dig("interwiki", "sister_projects") || []
392
+ # Allow-list of known Wikimedia project names (not language codes); data entries are filtered against it below
393
+ WIKIMEDIA_PROJECT_NAMES = %w[
394
+ wikibooks wikiversity wikisource wikiquote wikinews wiktionary
395
+ wikivoyage wikispecies wikidata commons meta mediawiki
396
+ mediawikiwiki species oldwikisource wikifunctions school
397
+ ].freeze
398
+ WIKIMEDIA_PROJECT_REGEX = begin
399
+ # Keep only data entries whose lowercase form is in the allow-list (so the data can add case variants, not new names)
400
+ projects_from_data = SISTER_PROJECTS.select { |p| WIKIMEDIA_PROJECT_NAMES.include?(p.downcase) }
401
+ # Always include all known project names (ensures complete coverage even when the data list is empty)
402
+ all_projects = (WIKIMEDIA_PROJECT_NAMES + projects_from_data).uniq
403
+ # Escape the names, then append hand-written variations as raw regex fragments
404
+ pattern_parts = all_projects.map { |p| Regexp.escape(p) }
405
+ pattern_parts << 'Wikimedia\s*Commons' # Common alternate form
406
+ pattern_parts << 'Commons\s*cat(?:egory)?' # Commons category template
407
+ Regexp.new(
408
+ '^\s*(' + pattern_parts.join("|") + ')(?::|$)',
409
+ Regexp::MULTILINE | Regexp::IGNORECASE
410
+ )
411
+ end
412
+
413
+ # Lines that are just a single asterisk (list marker without content), with only whitespace around it
414
+ LONE_ASTERISK_REGEX = Regexp.new('^\s*\*\s*$', Regexp::MULTILINE)
415
+
416
+ # =========================================================================
417
+ # Non-article namespace prefixes (for validation filtering)
418
+ # =========================================================================
419
+ # These are namespaces that should be excluded from article validation
420
+ # as they contain templates, portals, help pages, etc. not encyclopedia content
421
+ #
422
+ # Data source: MediaWiki API (siteinfo) via scripts/fetch_mediawiki_data.rb
423
+ # Contains 6083 namespace aliases from 351 Wikipedia language editions
424
+ NON_ARTICLE_NAMESPACES = (load_mediawiki_data.dig("namespaces", "non_article") || []).freeze
425
+
426
+ # Build regex for matching non-article titles
427
+ # Matches "Namespace:Title" where Namespace is in the list (anchored at the start of the title, case-insensitive; whitespace is allowed before the namespace and before the colon; group 1 is the namespace)
428
+ NON_ARTICLE_NAMESPACE_REGEX = if NON_ARTICLE_NAMESPACES.empty?
429
+ # Fallback to basic English namespaces when the data list is empty
430
+ Regexp.new(
431
+ '\A\s*(Wikipedia|Template|Portal|Help|Category|File|Image|User|Talk|Module|Draft|MediaWiki)\s*:',
432
+ Regexp::IGNORECASE
433
+ )
434
+ else
435
+ Regexp.new(
436
+ '\A\s*(' + NON_ARTICLE_NAMESPACES.map { |ns| Regexp.escape(ns) }.join("|") + ')\s*:',
437
+ Regexp::IGNORECASE
438
+ )
439
+ end
440
+
441
# Helper method to check if a title is an article page (not a special namespace).
#
# @param title [String, nil] page title, e.g. "Ruby" or "Template:Infobox"
# @return [Boolean] false when the title starts with a namespace prefix
#   matched by NON_ARTICLE_NAMESPACE_REGEX, true otherwise; a nil or empty
#   title is treated as an article
def self.article_page?(title)
  return true if title.nil? || title.empty?

  # match? answers the yes/no question without building a MatchData object
  # or setting $~, both of which =~ does on every call.
  !title.match?(NON_ARTICLE_NAMESPACE_REGEX)
end
446
+
447
+ # =========================================================================
448
+ # Make constants Ractor-shareable for parallel processing
449
+ # =========================================================================
450
+ # This allows Ractor workers to access these constants without isolation errors.
451
+ # All Regexp and frozen String/Array constants are made shareable.
452
+
453
# Constants that should NOT be made Ractor-shareable
# (they require mutable state or are already shareable)
RACTOR_SHAREABLE_EXCLUDES = %i[HTML_DECODER RACTOR_SHAREABLE_EXCLUDES].freeze

# Passes every constant defined directly on the receiver (inherited ones are
# not listed) to Ractor.make_shareable, skipping the names in
# RACTOR_SHAREABLE_EXCLUDES and values that are already shareable.
#
# Does nothing when Ractor, or Ractor.make_shareable, is unavailable.
# Best effort by design: a constant that cannot be made shareable is left
# alone and processing continues with the next one.
#
# @return [Array<Symbol>, nil] the constant names visited, or nil when
#   Ractor support is missing (callers are not expected to use the value)
def self.make_constants_ractor_shareable!
  return unless defined?(Ractor) && Ractor.respond_to?(:make_shareable)

  constants(false).each do |const_name|
    next if RACTOR_SHAREABLE_EXCLUDES.include?(const_name)

    value = const_get(const_name)
    next if Ractor.shareable?(value)

    begin
      Ractor.make_shareable(value)
    rescue Ractor::Error, FrozenError, TypeError
      # Some constants can't be made shareable, skip them.
      # Ractor::Error is rescued rather than only Ractor::IsolationError:
      # it is the parent class, and make_shareable raises the plain parent
      # for some unshareable objects, which would otherwise abort the load.
    end
  end
end
473
+
474
+ # Make constants shareable when this module is loaded (runs once, where this line is evaluated, so only constants already defined at that point are processed)
475
+ # Excludes constants that require mutable state (like HTML_DECODER), as listed in RACTOR_SHAREABLE_EXCLUDES
476
+ make_constants_ractor_shareable!
93
477
  end