wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/lib/wp2txt/utils.rb CHANGED
@@ -1,134 +1,399 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require "strscan"
4
- require "find"
4
+ require_relative "constants"
5
5
  require_relative "regex"
6
+ require_relative "text_processing"
7
+ require_relative "file_utils"
8
+ require_relative "magic_words"
9
+ require_relative "template_expander"
10
+ require_relative "parser_functions"
6
11
 
7
12
  module Wp2txt
8
- def convert_characters(text, has_retried = false)
9
- text << ""
10
- text = chrref_to_utf(text)
11
- text = special_chr(text)
12
- text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
13
- rescue StandardError # detect invalid byte sequence in UTF-8
14
- if has_retried
15
- puts "invalid byte sequence detected"
16
- puts "******************************"
17
- File.open("error_log.txt", "w") do |f|
18
- f.write text
19
- end
20
- exit
13
+ # Main wiki formatting utilities: format_wiki, markers, templates, links
14
+
15
+ # Marker types for special content
16
MARKER_TYPES = %i[math code chem table score timeline graph ipa infobox navbox gallery sidebar mapframe imagemap references codeblock].freeze

# Inline markers: removing these can break surrounding text
INLINE_MARKERS = %i[math chem ipa code].freeze

# Block markers: these are standalone and can be safely removed
BLOCK_MARKERS = %i[table score timeline graph infobox navbox gallery sidebar mapframe imagemap references codeblock].freeze

# Default: all markers enabled
DEFAULT_MARKERS = MARKER_TYPES.dup.freeze

# Recursively freezes a nested Hash/Array structure and returns it.
# A plain `.freeze` on the outer hash leaves the per-marker hashes and their
# arrays mutable; a constant must be deeply frozen before a non-main Ractor
# may read it, and the marker pass runs inside the regex-transform step.
deep_freeze = lambda do |obj|
  case obj
  when Hash then obj.each_value(&deep_freeze)
  when Array then obj.each(&deep_freeze)
  end
  obj.freeze
end

# Regex patterns for marker detection, keyed by marker type.
# Each entry may carry:
#   tags:             regexes matching complete HTML-style elements
#   templates:        regexes matching the opening of a {{template}}
#   paired_templates: start regex + closing template name ({{refbegin}}...{{refend}})
#   wiki_table:       true when {|...|} wiki tables belong to this marker
MARKER_PATTERNS = deep_freeze.call({
  # MATH: <math>...</math>, {{math|...}}, {{mvar|...}}
  math: {
    tags: [/<math[^>]*>.*?<\/math>/mi],
    templates: [/\{\{(?:math|mvar)\s*\|/i]
  },
  # CODE: <code>...</code> (inline only)
  code: {
    tags: [
      /<code[^>]*>.*?<\/code>/mi
    ],
    templates: []
  },
  # CODEBLOCK: <syntaxhighlight>...</syntaxhighlight>, <source>...</source>, <pre>...</pre> (block)
  codeblock: {
    tags: [
      /<syntaxhighlight[^>]*>.*?<\/syntaxhighlight>/mi,
      /<source[^>]*>.*?<\/source>/mi,
      /<pre[^>]*>.*?<\/pre>/mi
    ],
    templates: []
  },
  # CHEM: <chem>...</chem>, {{chem|...}}, {{ce|...}}
  chem: {
    tags: [/<chem[^>]*>.*?<\/chem>/mi],
    templates: [/\{\{(?:chem|ce)\s*\|/i]
  },
  # TABLE: {|...|}, <table>...</table>
  table: {
    tags: [/<table[^>]*>.*?<\/table>/mi],
    wiki_table: true
  },
  # SCORE: <score>...</score>
  score: {
    tags: [/<score[^>]*>.*?<\/score>/mi],
    templates: []
  },
  # TIMELINE: <timeline>...</timeline>
  timeline: {
    tags: [/<timeline[^>]*>.*?<\/timeline>/mi],
    templates: []
  },
  # GRAPH: <graph>...</graph>
  graph: {
    tags: [/<graph[^>]*>.*?<\/graph>/mi],
    templates: []
  },
  # IPA: {{IPA|...}}, {{IPAc-en|...}}, etc.
  ipa: {
    tags: [],
    templates: [/\{\{IPA[c]?(?:-[a-z]{2,3})?\s*\|/i]
  },
  # INFOBOX: {{Infobox ...}}
  infobox: {
    tags: [],
    templates: [/\{\{[Ii]nfobox\s*/]
  },
  # NAVBOX: {{Navbox ...}}
  navbox: {
    tags: [],
    templates: [/\{\{[Nn]avbox\s*/]
  },
  # GALLERY: <gallery>...</gallery>
  gallery: {
    tags: [/<gallery[^>]*>.*?<\/gallery>/mi],
    templates: []
  },
  # SIDEBAR: {{Sidebar ...}}
  sidebar: {
    tags: [],
    templates: [/\{\{[Ss]idebar\s*/]
  },
  # MAPFRAME: <mapframe>...</mapframe>
  mapframe: {
    tags: [/<mapframe[^>]*>.*?<\/mapframe>/mi],
    templates: []
  },
  # IMAGEMAP: <imagemap>...</imagemap>
  imagemap: {
    tags: [/<imagemap[^>]*>.*?<\/imagemap>/mi],
    templates: []
  },
  # REFERENCES: {{reflist}}, {{refbegin}}...{{refend}}, <references/>
  references: {
    tags: [
      /<references\s*\/>/mi,
      /<references[^>]*>.*?<\/references>/mi
    ],
    templates: [/\{\{[Rr]eflist\s*/],
    paired_templates: [{ start: /\{\{[Rr]efbegin/i, end_name: "refend" }]
  }
})
120
+
121
# Convert raw wiki markup into plain text.
#
# Pipeline: optional magic-word expansion, optional parser-function and
# template expansion, the regex-based transform, HTML entity decoding, and
# finally conversion of marker placeholders into their [MARKER] form.
#
# text   - wiki markup (anything responding to to_s)
# config - options hash; keys read here are :title, :namespace, :dump_date
#          and :expand_templates. The whole hash is also forwarded to
#          format_wiki_regex_transform.
#
# Returns the formatted text.
def format_wiki(text, config = {})
  # Unary plus yields a mutable string without copying when already unfrozen
  output = +text.to_s

  # Remember whether the *original* text had templates; the expensive
  # magic-word and parser-function passes are skipped when it did not
  had_templates = output.include?("{{")
  title = config[:title]

  # Magic words ({{PAGENAME}}, {{CURRENTYEAR}}, {{lc:...}}, ...) need a title
  if title && had_templates
    output = MagicWordExpander
             .new(title, namespace: config[:namespace] || "", dump_date: config[:dump_date])
             .expand(output)
  end

  if config[:expand_templates]
    # Parser functions: {{#if:...}}, {{#switch:...}}, {{#expr:...}}, ...
    if had_templates && output.include?("{{#")
      output = ParserFunctions.new(reference_date: config[:dump_date]).evaluate(output)
    end

    # Common templates ({{birth date|...}}, {{convert|...}}, ...) - re-check
    # the current text, since the earlier passes may have changed it
    if output.include?("{{")
      output = TemplateExpander.new(reference_date: config[:dump_date]).expand(output)
    end
  end

  # CPU-intensive regex processing (can be parallelized with Ractor)
  transformed = format_wiki_regex_transform(output, config)

  # HTML entity decoding (e.g., &Oslash; -> Ø) uses the HTMLEntities gem and
  # therefore stays outside the Ractor-safe step
  decoded = special_chr(transformed)

  # Placeholders become the final [MARKER] strings last
  finalize_markers(decoded)
end
168
+
169
+ # CPU-intensive regex transformations - Ractor-safe (no external gem dependencies)
170
+ # This is the part that benefits from parallel processing
171
# Regex-driven part of the formatting pipeline.
#
# text   - wiki markup (anything responding to to_s)
# config - options hash; keys read here:
#          :markers           true/nil (all), false (none) or an Array of types
#          :extract_citations when truthy, the :references marker is not applied
#                             in the early marker pass
#          :ref               keep [ref]...[/ref] markers when truthy
#          :inline            skip inline-template correction/removal when truthy
#          :table             skip table removal when truthy
#
# Returns the transformed text.
def format_wiki_regex_transform(text, config = {})
  working = +text.to_s

  active_markers = parse_markers_config(config.fetch(:markers, true))
  citations = config.fetch(:extract_citations, false)

  # Markers go first so their content is replaced before anything else
  # rewrites it; references are held back when citations are extracted
  early_markers = citations ? active_markers - [:references] : active_markers
  working = apply_markers(working, early_markers)

  # Link handling is bracketed by nowiki escaping/unescaping
  working = remove_complex(working)
  working = escape_nowiki(working)
  working = process_interwiki_links(working)
  working = process_external_links(working)
  working = unescape_nowiki(working)

  # Simple substitutions are applied in place
  working.gsub!(REMOVE_DIRECTIVES_REGEX, "")
  working.gsub!(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
  working.gsub!(MNDASH_REGEX, "–")
  working.gsub!(REMOVE_HR_REGEX, "")
  working.gsub!(REMOVE_TAG_REGEX, "")

  # [ref]...[/ref] markers survive only with the --ref option
  working = remove_ref(working) unless config[:ref]

  unless config[:inline]
    working = correct_inline_template(working, active_markers, citations)
    working = remove_templates(working)
  end

  working = remove_table(working, active_markers) unless config[:table]
  working
end
207
+
208
+ # Parse markers configuration
209
+ # true or nil: all markers enabled
210
+ # false: no markers
211
+ # Array: only specified markers
212
+ def parse_markers_config(config)
213
+ case config
214
+ when true, nil
215
+ DEFAULT_MARKERS.dup
216
+ when false
217
+ []
218
+ when Array
219
+ config.map(&:to_sym) & MARKER_TYPES
21
220
  else
22
- text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
23
- text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
24
- convert_characters(text, true)
221
+ DEFAULT_MARKERS.dup
25
222
  end
26
223
  end
27
224
 
28
- def format_wiki(text, config = {})
29
- text = remove_complex(text)
30
- text = escape_nowiki(text)
31
- text = process_interwiki_links(text)
32
- text = process_external_links(text)
33
- text = unescape_nowiki(text)
34
- text = remove_directive(text)
35
- text = remove_emphasis(text)
36
- text = mndash(text)
37
- text = remove_hr(text)
38
- text = remove_tag(text)
39
- text = correct_inline_template(text) unless config[:inline]
40
- text = remove_templates(text) unless config[:inline]
41
- text = remove_table(text) unless config[:table]
42
- text
225
+ # Placeholder format for markers (to avoid conflicts with bracket processing)
226
+ # These get converted to [MARKER] at the end of format_wiki
227
def marker_placeholder(type)
  # Upper-cased marker name wrapped in double guillemets: ««NAME»»
  label = type.to_s.upcase
  "\u00AB\u00AB" + label + "\u00BB\u00BB"
end
44
230
 
45
- def cleanup(text)
46
- text = text.gsub(CLEANUP_REGEX_01) { "" }
47
- text = text.gsub(CLEANUP_REGEX_02) { "" }
48
- text = text.gsub(CLEANUP_REGEX_03) { "" }
49
- text = text.gsub(CLEANUP_REGEX_04) { "" }
50
- text = text.gsub(CLEANUP_REGEX_05) { "" }
51
- text = text.gsub(CLEANUP_REGEX_06) { "" }
52
- text = text.gsub(CLEANUP_REGEX_07) { "" }
53
- text = text.gsub(CLEANUP_REGEX_08) { "\n\n" }
54
- text = text.strip
55
- text << "\n\n"
231
+ # Convert marker placeholders to final [MARKER] format
232
def finalize_markers(str)
  # Walk every known marker type and rewrite its placeholder as [TYPE],
  # mutating a single working string in place
  MARKER_TYPES.each_with_object(+str.to_s) do |kind, text|
    text.gsub!(marker_placeholder(kind), "[#{kind.to_s.upcase}]")
  end
end
57
241
 
58
- #################### parser for nested structure ####################
59
-
60
- def process_nested_structure(scanner, left, right, &block)
61
- buffer = +""
62
- begin
63
- regex = if left == "[" && right == "]"
64
- SINGLE_SQUARE_BRACKET_REGEX
65
- elsif left == "[[" && right == "]]"
66
- DOUBLE_SQUARE_BRACKET_REGEX
67
- elsif left == "{" && right == "}"
68
- SINGLE_CURLY_BRACKET_REGEX
69
- elsif left == "{{" && right == "}}"
70
- DOUBLE_CURLY_BRACKET_REGEX
71
- elsif left == "{|" && right == "|}"
72
- CURLY_SQUARE_BRACKET_REGEX
73
- else
74
- Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
75
- end
76
- while (str = scanner.scan_until(regex))
77
- case scanner[1]
78
- when left
79
- buffer << str
80
- has_left = true
81
- when right
82
- if has_left
83
- buffer = buffer[0...-left.size]
84
- contents = block.call(str[0...-left.size])
85
- buffer << contents
86
- break
87
- else
88
- buffer << str
89
- end
242
+ # Apply marker replacements for enabled marker types
243
+ # When markers are disabled, content is removed (not marked)
244
+ def apply_markers(str, enabled_markers)
245
+ result = +str.to_s
246
+
247
+ MARKER_PATTERNS.each do |marker_type, patterns|
248
+ placeholder = marker_placeholder(marker_type)
249
+ should_mark = enabled_markers.include?(marker_type)
250
+
251
+ # Process HTML-style tags
252
+ patterns[:tags]&.each do |tag_regex|
253
+ if should_mark
254
+ result.gsub!(tag_regex, placeholder)
255
+ else
256
+ # Remove content when marker is not enabled
257
+ result.gsub!(tag_regex, "")
90
258
  end
91
259
  end
92
- buffer << scanner.rest
93
260
 
94
- return buffer if buffer == scanner.string
261
+ # Process wiki tables specially (need nested handling)
262
+ if patterns[:wiki_table] && result.include?("{|")
263
+ if should_mark
264
+ result = replace_wiki_table_with_marker(result, placeholder)
265
+ end
266
+ # If not marking, remove_table will handle it later
267
+ end
268
+
269
+ # Process template-based markers (Infobox, Navbox, Sidebar)
270
+ patterns[:templates]&.each do |template_regex|
271
+ result = replace_template_with_marker(result, template_regex, placeholder, should_mark)
272
+ end
95
273
 
96
- scanner.string = buffer
97
- process_nested_structure(scanner, left, right, &block) || ""
98
- rescue StandardError
99
- scanner.string
274
+ # Process paired templates (refbegin...refend)
275
+ patterns[:paired_templates]&.each do |pair|
276
+ result = replace_paired_templates_with_marker(result, pair[:start], pair[:end_name], placeholder, should_mark)
277
+ end
100
278
  end
279
+
280
+ result
101
281
  end
102
282
 
103
- #################### methods used from format_wiki ####################
104
- def escape_nowiki(str)
105
- if @nowikis
106
- @nowikis.clear
107
- else
108
- @nowikis = {}
109
- end
110
- str.gsub(ESCAPE_NOWIKI_REGEX) do
111
- nowiki = $1
112
- nowiki_id = nowiki.object_id
113
- @nowikis[nowiki_id] = nowiki
114
- "<nowiki-#{nowiki_id}>"
283
+ # Replace paired templates like {{refbegin}}...{{refend}} with marker
284
+ # When should_mark is false, skip processing entirely (don't remove content)
285
+ # This allows extract_citations to process the inner templates
286
def replace_paired_templates_with_marker(str, start_pattern, end_name, placeholder, should_mark)
  # Nothing is touched when the marker is disabled, so later stages still
  # see the templates between the opening and closing pair
  return str unless should_mark

  text = +str.to_s
  closer = /\{\{#{Regexp.escape(end_name)}\s*\}\}/i

  # Each pass swaps the first opener..closer span for the placeholder
  while (opening = text.match(start_pattern))
    from = opening.begin(0)

    # The closing template must appear at or after the opener
    closing = text.match(closer, from)
    break if closing.nil?

    text = text[0...from] + placeholder + text[closing.end(0)..]
  end

  text
end
117
308
 
118
- def unescape_nowiki(str)
119
- str.gsub(UNESCAPE_NOWIKI_REGEX) do
120
- obj_id = $1.to_i
121
- @nowikis[obj_id]
309
+ # Replace templates matching pattern with marker (handles nested braces)
310
# Replace (or remove) every template whose opening matches +pattern+.
#
# str         - text to process (anything responding to to_s)
# pattern     - regex matching the start of a template, e.g. /\{\{[Ii]nfobox\s*/
# placeholder - string substituted for the whole template when marking
# should_mark - true: substitute the placeholder; false: delete the template
#
# The end of a template is found by counting "{{" / "}}" pairs from the match
# position, so nested templates are consumed together with their parent.
# An opener that never closes is left untouched, and the search resumes just
# after it, so later well-formed templates are still handled.
#
# Returns the processed string.
def replace_template_with_marker(str, pattern, placeholder, should_mark)
  result = +str.to_s
  brace_pair = /\{\{|\}\}/
  replacement = should_mark ? placeholder : ""

  # Lowest index at which a new match may start. It only moves forward when
  # an unclosed opener is skipped, which also guarantees termination.
  search_from = 0

  while (match = result.match(pattern, search_from))
    start_pos = match.begin(0)
    depth = 0
    pos = start_pos
    template_end = nil

    # Hop from brace pair to brace pair instead of inspecting every character
    while (brace = result.match(brace_pair, pos))
      pos = brace.end(0)
      if brace[0] == "{{"
        depth += 1
      else
        depth -= 1
        if depth.zero?
          template_end = pos
          break
        end
      end
    end

    if template_end
      result = result[0...start_pos] + replacement + result[template_end..]
    else
      # Unclosed template: keep it as-is and continue searching after it
      search_from = start_pos + 1
    end
  end

  result
end
124
352
 
353
+ # Replace wiki tables {|...|} with marker
354
def replace_wiki_table_with_marker(str, placeholder)
  # Only texts that contain a table opener need the nested scan;
  # every {|...|} structure is then swapped for the placeholder
  if str.include?("{|")
    process_nested_single_pass(str, "{|", "|}") { placeholder }
  else
    str
  end
end
358
+
359
+ #################### link processing ####################
360
+
361
+ # File/Image namespace and parameter regexes are now defined in regex.rb
362
+ # FILE_NAMESPACES_REGEX - matches file namespace prefixes (313 aliases from 350+ languages)
363
+ # IMAGE_PARAMS_REGEX - matches image parameters like thumb, right, left, etc.
364
+
125
365
  def process_interwiki_links(str)
126
- scanner = StringScanner.new(str)
127
- process_nested_structure(scanner, "[[", "]]") do |contents|
128
- parts = contents.split("|")
129
- case parts.size
130
- when 1
131
- parts.first || ""
366
+ # Early exit if no links present
367
+ return str unless str.include?("[[")
368
+
369
+ process_nested_single_pass(str, "[[", "]]") do |contents|
370
+ # Use -1 to preserve trailing empty strings (for pipe trick detection)
371
+ parts = contents.split("|", -1)
372
+ first_part = parts.first || ""
373
+
374
+ # Category links should be removed entirely (categories are extracted separately)
375
+ if CATEGORY_NAMESPACE_REGEX.match?(first_part)
376
+ ""
377
+ elsif FILE_NAMESPACES_REGEX.match?(first_part)
378
+ # For File/Image links, extract caption (last non-parameter part)
379
+ # Normalize newlines to pipes (handles malformed markup with newlines instead of pipes)
380
+ normalized = contents.gsub(/\n/, "|")
381
+ parts = normalized.split("|", -1)
382
+ # Skip parts that look like parameters (contain =, or are size specs like 200px)
383
+ if parts.size > 1
384
+ caption = parts[1..].reverse.find do |p|
385
+ stripped = p.strip
386
+ !stripped.empty? && !stripped.include?("=") && !stripped.match?(/\A\d+px\z/i) && !(IMAGE_PARAMS_REGEX && IMAGE_PARAMS_REGEX.match?(stripped))
387
+ end
388
+ caption&.strip || ""
389
+ else
390
+ ""
391
+ end
392
+ elsif parts.size == 1
393
+ first_part
394
+ elsif parts.size == 2 && parts[1].strip.empty?
395
+ # Pipe trick: [[Namespace:Page|]] or [[Page (disambiguation)|]]
396
+ apply_pipe_trick(first_part)
132
397
  else
133
398
  parts.shift
134
399
  parts.join("|")
@@ -136,9 +401,25 @@ module Wp2txt
136
401
  end
137
402
  end
138
403
 
404
+ # MediaWiki pipe trick: extracts display text from link target
405
+ # [[Wikipedia:著作権|]] → 著作権
406
+ # [[東京 (曖昧さ回避)|]] → 東京
407
def apply_pipe_trick(target)
  # Drop an optional leading colon and the first prefix up to (and including)
  # the first colon: "Wikipedia:Foo" and ":Category:Foo" both become "Foo".
  # Only the first prefix is removed, so "a:b:c" becomes "b:c".
  result = target.sub(/\A(?::?[^:]+:|:)/, "")

  # Remove a trailing parenthetical such as " (disambiguation)"
  without_paren = result.sub(/\s*\([^)]+\)\s*\z/, "")

  # The comma rule ("Boston, Massachusetts" -> "Boston") applies only when no
  # parenthetical was stripped; otherwise a title such as
  # "Yours, Mine and Ours (1968 film)" would be cut down to "Yours".
  result = if without_paren == result
             result.sub(/\s*,.*\z/, "")
           else
             without_paren
           end

  result.strip
end
417
+
139
418
  def process_external_links(str)
140
- scanner = StringScanner.new(str)
141
- process_nested_structure(scanner, "[", "]") do |contents|
419
+ # Early exit if no external links present
420
+ return str unless str.include?("[")
421
+
422
+ process_nested_single_pass(str, "[", "]") do |contents|
142
423
  if /\A\s.+\s\z/ =~ contents
143
424
  " (#{contents.strip}) "
144
425
  else
@@ -153,217 +434,295 @@ module Wp2txt
153
434
  end
154
435
  end
155
436
 
156
- #################### methods used from format_article ####################
437
+ #################### template processing ####################
157
438
 
158
439
  def remove_templates(str)
159
- scanner1 = StringScanner.new(str)
160
- result = process_nested_structure(scanner1, "{{", "}}") do
161
- ""
162
- end
163
- scanner2 = StringScanner.new(result)
164
- process_nested_structure(scanner2, "{", "}") do
165
- ""
166
- end
440
+ # Early exit if no templates present
441
+ return str unless str.include?("{{")
442
+
443
+ result = process_nested_single_pass(str, "{{", "}}") { "" }
444
+
445
+ # Handle single brace templates (less common)
446
+ return result unless result.include?("{")
447
+ process_nested_single_pass(result, "{", "}") { "" }
167
448
  end
168
449
 
169
- def remove_table(str)
170
- scanner = StringScanner.new(str)
171
- process_nested_structure(scanner, "{|", "|}") do
172
- ""
450
def remove_table(str, enabled_markers = [])
  # Leave the text alone when it has no table opener, or when the :table
  # marker is enabled (the marker pass has then already replaced the tables)
  untouched = !str.include?("{|") || enabled_markers.include?(:table)
  return str if untouched

  # Otherwise strip every {|...|} structure, nested ones included
  process_nested_single_pass(str, "{|", "|}") { "" }
end
175
462
 
176
- def special_chr(str)
177
- HTML_DECODER.decode(str)
463
+ # Citation templates that can be extracted
464
+ # Data source: template_aliases.json (citation_templates category)
465
+ CITATION_TEMPLATES = Wp2txt.load_template_data["citation_templates"] || []
466
+ CITATION_TEMPLATE_REGEX = if CITATION_TEMPLATES.empty?
467
+ # Fallback to basic pattern
468
+ /\A\s*(?:cite\s*(?:web|book|news|journal)|citation)\s*(?:\||$)/i
469
+ else
470
+ pattern = CITATION_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
471
+ Regexp.new('\A\s*(?:' + pattern + ')\s*(?:\||$)', Regexp::IGNORECASE)
178
472
  end
179
473
 
180
- def remove_inbetween(str, tagset = ["<", ">"])
181
- tagsets = Regexp.quote(tagset.uniq.join(""))
182
- regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
183
- str.gsub(regex, "")
474
+ # Templates that should be completely removed (references, navigation, but NOT citations when extracting)
475
+ # Data source: template_aliases.json (remove_templates category)
476
+ REMOVE_TEMPLATES = Wp2txt.load_template_data["remove_templates"] || []
477
+ REMOVE_TEMPLATES_REGEX = if REMOVE_TEMPLATES.empty?
478
+ # Fallback to basic pattern
479
+ /\A\s*(?:sfn|efn|refn|reflist|notelist|main|see\s*also|portal)\s*(?:\||$)/i
480
+ else
481
+ pattern = REMOVE_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
482
+ Regexp.new('\A\s*(?:' + pattern + ')\s*(?:\||$)', Regexp::IGNORECASE)
184
483
  end
185
484
 
186
- def remove_tag(str)
187
- str.gsub(REMOVE_TAG_REGEX, "")
485
+ # Flag templates to remove
486
+ # Data source: template_aliases.json (flag_templates category)
487
+ FLAG_TEMPLATES = Wp2txt.load_template_data["flag_templates"] || []
488
+ FLAG_TEMPLATE_REGEX = if FLAG_TEMPLATES.empty?
489
+ /\A\s*(?:flag|flagicon|flagcountry)\s*(?:\||$)/i
490
+ else
491
+ pattern = FLAG_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
492
+ Regexp.new('\A\s*(?:' + pattern + ')\s*(?:\||$)', Regexp::IGNORECASE)
188
493
  end
189
494
 
190
- def remove_directive(str)
191
- str.gsub(REMOVE_DIRECTIVES_REGEX, "")
495
+ # Formatting templates (extract content)
496
+ # Data source: template_aliases.json (formatting_templates category)
497
+ FORMATTING_TEMPLATES = Wp2txt.load_template_data["formatting_templates"] || []
498
+ FORMATTING_TEMPLATE_REGEX = if FORMATTING_TEMPLATES.empty?
499
+ /\A\s*(?:small|smaller|large|larger|nowrap|nbsp)\s*(?:\||$)/i
500
+ else
501
+ pattern = FORMATTING_TEMPLATES.map { |t| Regexp.escape(t) }.join("|")
502
+ Regexp.new('\A\s*(?:' + pattern + ')\s*(?:\||$)', Regexp::IGNORECASE)
192
503
  end
193
504
 
194
- def remove_emphasis(str)
195
- str.gsub(REMOVE_EMPHASIS_REGEX) do
196
- $2
197
- end
198
- end
505
+ # Ruby text templates (読み仮名 equivalent across languages)
506
+ # Data source: template_aliases.json (ruby_text_templates category)
507
+ RUBY_TEXT_TEMPLATES = Wp2txt.load_template_data["ruby_text_templates"] || []
199
508
 
200
- def chrref_to_utf(num_str)
201
- num_str.gsub(CHRREF_TO_UTF_REGEX) do
202
- ch = if $1 == "x"
203
- $2.to_i(16)
204
- else
205
- $2.to_i
206
- end
207
- hi = ch >> 8
208
- lo = ch & 0xff
209
- u = +"\377\376" << lo.chr << hi.chr
210
- u.encode("UTF-8", "UTF-16")
211
- end
212
- rescue StandardError
213
- num_str
214
- end
509
+ # Interwiki link templates (仮リンク equivalent across languages)
510
+ # Data source: template_aliases.json (interwiki_link_templates category)
511
+ INTERWIKI_LINK_TEMPLATES = Wp2txt.load_template_data["interwiki_link_templates"] || []
215
512
 
216
- def mndash(str)
217
- str.gsub(MNDASH_REGEX, "–")
218
- end
513
+ # Mixed script templates (nihongo equivalent across languages)
514
+ # Data source: template_aliases.json (mixed_script_templates category)
515
+ MIXED_SCRIPT_TEMPLATES = Wp2txt.load_template_data["mixed_script_templates"] || []
219
516
 
220
- def remove_hr(str)
221
- str.gsub(REMOVE_HR_REGEX, "")
222
- end
517
+ # Convert templates
518
+ # Data source: template_aliases.json (convert_templates category)
519
+ CONVERT_TEMPLATES = Wp2txt.load_template_data["convert_templates"] || []
223
520
 
224
- def remove_ref(str)
225
- str.gsub(FORMAT_REF_REGEX) { "" }
226
- end
521
+ # Country code templates (2-3 letter codes that represent flags)
522
+ COUNTRY_CODE_REGEX = /\A[A-Z]{2,3}\z/
227
523
 
228
- def remove_html(str)
229
- res = +str.dup
230
- res.gsub!(%r{<[^<>]+/>}) { "" }
231
- ["div", "gallery", "timeline", "noinclude"].each do |tag|
232
- scanner = StringScanner.new(res)
233
- result = process_nested_structure(scanner, "<#{tag}", "#{tag}>") do
234
- ""
524
+ # Extract formatted citation from template parameters
525
+ def format_citation(contents)
526
+ params = {}
527
+ contents.split("|").each do |part|
528
+ if part.include?("=")
529
+ key, value = part.split("=", 2)
530
+ params[key.strip.downcase] = value&.strip
235
531
  end
236
- res.replace(result)
237
532
  end
238
- res
239
- end
240
533
 
241
- def remove_complex(str)
242
- str = str.gsub(COMPLEX_REGEX_01) { "《#{$1}》" }
243
- str = str.gsub(COMPLEX_REGEX_02) { "" }
244
- str = str.gsub(COMPLEX_REGEX_03) { "" }
245
- str = str.gsub(COMPLEX_REGEX_04) { "" }
246
- str.gsub(COMPLEX_REGEX_05) { "" }
534
+ # Extract author (last name, or author field)
535
+ author = params["last"] || params["last1"] || params["author"] || params["author1"] || ""
536
+ first = params["first"] || params["first1"] || ""
537
+ author = "#{author}, #{first}" if !author.empty? && !first.empty?
538
+
539
+ # Extract title
540
+ title = params["title"] || ""
541
+
542
+ # Extract year/date
543
+ year = params["year"] || ""
544
+ if year.empty? && params["date"]
545
+ # Extract year from date like "2021-05-15"
546
+ year = params["date"][0, 4] if params["date"] =~ /^\d{4}/
547
+ end
548
+
549
+ # Format: "Author. Title. Year." or partial if fields missing
550
+ parts = []
551
+ parts << author unless author.empty?
552
+ parts << "\"#{title}\"" unless title.empty?
553
+ parts << year unless year.empty?
554
+
555
+ parts.empty? ? "" : parts.join(". ") + "."
247
556
  end
248
557
 
249
- def make_reference(str)
250
- str = str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
251
- str = str.gsub(MAKE_REFERENCE_REGEX_B) { "" }
252
- str = str.gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
253
- str.gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
558
+ # Helper to check if template name matches any in a list (case-insensitive)
559
def template_matches?(name, template_list)
  # A missing or empty list can never match
  return false if template_list.nil? || template_list.empty?

  # Compare on the trimmed, lower-cased name; stop at the first hit
  wanted = name.to_s.strip.downcase
  template_list.each do |candidate|
    return true if candidate.downcase == wanted
  end
  false
end
255
564
 
256
# Render the {{...}} templates found in +str+ as plain text.
#
# The text is handed to process_nested_single_pass with "{{" / "}}" as the
# delimiters; the block below receives a template's inner text and returns
# its replacement. The branches are tried in order, and the dedicated
# handlers (IPA, lang, math, chem, ...) come before the data-driven list and
# regex checks so that they win when a name is also present in those lists.
#
# @param str [String] wiki markup to process
# @param enabled_markers [Array<Symbol>] marker kinds (:ipa, :math, :chem)
#   that are replaced by marker_placeholder instead of their first parameter
# @param extract_citations [Boolean] when truthy, citation templates are
#   rendered with format_citation; otherwise they become ""
# @return [String] +str+ unchanged when it contains no "{{", otherwise
#   whatever process_nested_single_pass returns
def correct_inline_template(str, enabled_markers = [], extract_citations = false)
  # Fast path: nothing to do when no template opener is present.
  return str unless str.include?("{{")

  process_nested_single_pass(str, "{{", "}}") do |contents|
    parts = contents.split("|")

    # Pipe-separated field at +index+, stripped; "" when the field is absent.
    field = ->(index) { (parts[index] || "").to_s.strip }

    # Placeholder when the marker kind is enabled, else the first parameter.
    marker_or_text = lambda do |kind|
      enabled_markers.include?(kind) ? marker_placeholder(kind) : field.call(1)
    end

    template_name = field.call(0)
    lowered = template_name.downcase

    # ---- Dedicated handlers (must precede the generic, data-driven ones) ----
    if lowered == "ipa" || lowered.start_with?("ipa-", "ipac-")
      # {{IPA|...}}, {{IPA-xx|...}}, {{IPAc-xx|...}}
      marker_or_text.call(:ipa)
    elsif %w[lang fontsize].include?(lowered)
      # {{lang|code|text}} / {{fontsize|size|text}}: prefer the second parameter
      field.call(parts.size >= 3 ? 2 : 1)
    elsif lowered.start_with?("lang-")
      # {{lang-xx|text}}
      field.call(1)
    elsif lowered == "langwithname"
      # {{langwithname|code|name|text}}: third parameter, else the last field
      parts.size >= 4 ? field.call(3) : (parts.last || "").to_s.strip
    elsif %w[math mvar].include?(lowered)
      # Mathematical notation
      marker_or_text.call(:math)
    elsif %w[chem ce].include?(lowered)
      # Chemical formulas
      marker_or_text.call(:chem)

    # ---- Data-driven matching (lists / regexes defined elsewhere) ----
    elsif CITATION_TEMPLATE_REGEX.match?(contents)
      extract_citations ? format_citation(contents) : ""
    elsif REMOVE_TEMPLATES_REGEX.match?(contents) ||
          FLAG_TEMPLATE_REGEX.match?(contents) ||
          COUNTRY_CODE_REGEX.match?(template_name)
      # Navigation/reference templates and flag templates are dropped.
      ""
    elsif template_matches?(template_name, RUBY_TEXT_TEMPLATES)
      # Ruby text (e.g. 読み仮名, ruby): base text plus optional reading
      base = field.call(1)
      reading = field.call(2)
      reading.empty? ? base : "#{base}(#{reading})"
    elsif template_matches?(template_name, INTERWIKI_LINK_TEMPLATES)
      # Interwiki links (e.g. 仮リンク, ill): first parameter is the display text
      field.call(1)
    elsif template_matches?(template_name, MIXED_SCRIPT_TEMPLATES)
      if lowered.start_with?("nihongo")
        # text, then whichever of kanji / romaji are present, in that order
        text = field.call(1)
        extras = [field.call(2), field.call(3)].reject(&:empty?)
        extras.empty? ? text : "#{text} (#{extras.join(', ')})"
      elsif %w[transl transliteration].include?(lowered)
        # {{transl|lang|text}} -> text, falling back to the first parameter
        (parts[2] || parts[1] || "").to_s.strip
      else
        field.call(1)
      end
    elsif template_matches?(template_name, CONVERT_TEMPLATES)
      # Number followed by its unit when a unit is given
      amount = field.call(1)
      unit = field.call(2)
      unit.empty? ? amount : "#{amount} #{unit}"
    elsif FORMATTING_TEMPLATE_REGEX.match?(contents)
      # nbsp stands for a space; other formatting templates keep their content
      lowered == "nbsp" ? " " : field.call(1)
    else
      # Anything unrecognised: best-effort content extraction
      extract_template_content(parts)
    end
  end
end
285
680
 
286
- #################### file related utilities ####################
287
-
288
- # collect filenames recursively
289
- def collect_files(str, regex = nil)
290
- regex ||= //
291
- text_array = []
292
- Find.find(str) do |f|
293
- text_array << f if regex =~ f
681
# Pick the most meaningful piece of text out of a split template.
#
# +parts+ is the template's inner text split on "|", so parts[0] is the
# template name. Selection order:
#   1. no parts at all        -> ""
#   2. only the template name -> the name, stripped
#   3. first non-blank field that contains no "=" (a positional argument)
#   4. first non-blank value of a "key=value" field (text after the first "=")
#   5. otherwise              -> ""
#
# @param parts [Array<String, nil>] pipe-separated template fields
# @return [String] the extracted, stripped text (possibly empty)
def extract_template_content(parts)
  return "" if parts.empty?
  return parts[0].to_s.strip if parts.size == 1

  # Everything after the template name; nil fields are ignored.
  args = parts[1..].compact
  named, positional = args.partition { |arg| arg.include?("=") }

  first_positional = positional.map(&:strip).find { |text| !text.empty? }
  return first_positional if first_positional

  # Only named parameters carry content: use the first non-blank value.
  first_value = named.map { |arg| arg.partition("=").last.strip }
                     .find { |text| !text.empty? }
  first_value || ""
end
313
706
 
314
- # modify files under a directry (recursive)
315
- def batch_file_mod(dir_path)
316
- if FileTest.directory?(dir_path)
317
- collect_files(dir_path).each do |file|
318
- yield file if FileTest.file?(file)
319
- end
320
- elsif FileTest.file?(dir_path)
321
- yield dir_path
322
- end
323
- end
707
+ # =========================================================================
708
+ # Make constants Ractor-shareable for parallel processing
709
+ # =========================================================================
710
+ module_function
324
711
 
325
- # take care of difference of separators among environments
326
- def correct_separator(input)
327
- case input
328
- when String
329
- if RUBY_PLATFORM.index("win32")
330
- input.gsub("/", "\\")
331
- else
332
- input.gsub("\\", "/")
333
- end
334
- when Array
335
- ret_array = []
336
- input.each do |item|
337
- ret_array << correct_separator(item)
338
- end
339
- ret_array
340
- end
341
- end
712
# Make this module's own constants Ractor-shareable, on a best-effort basis.
#
# Does nothing when Ractor (or Ractor.make_shareable) is unavailable.
# Constants that are already shareable are left alone; constants that cannot
# be made shareable are skipped rather than aborting.
#
# @return [void]
def self.make_constants_ractor_shareable!
  return unless defined?(Ractor) && Ractor.respond_to?(:make_shareable)

  # constants(false): only constants defined directly here, not inherited ones.
  constants(false).each do |const_name|
    value = const_get(const_name)
    next if Ractor.shareable?(value)

    begin
      Ractor.make_shareable(value)
    rescue Ractor::Error, FrozenError, TypeError
      # Ractor::Error (parent of Ractor::IsolationError) is what
      # make_shareable raises for objects it cannot share; skip those.
    end
  end
end
357
726
 
358
- # convert int of seconds to string in the format 00:00:00
359
- def sec_to_str(int)
360
- unless int
361
- str = "--:--:--"
362
- return str
363
- end
364
- h = int / 3600
365
- m = (int - h * 3600) / 60
366
- s = int % 60
367
- format("%02d:%02d:%02d", h, m, s)
368
- end
727
+ make_constants_ractor_shareable!
369
728
  end