wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/utils.rb
CHANGED
|
@@ -1,134 +1,399 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "strscan"
|
|
4
|
-
|
|
4
|
+
require_relative "constants"
|
|
5
5
|
require_relative "regex"
|
|
6
|
+
require_relative "text_processing"
|
|
7
|
+
require_relative "file_utils"
|
|
8
|
+
require_relative "magic_words"
|
|
9
|
+
require_relative "template_expander"
|
|
10
|
+
require_relative "parser_functions"
|
|
6
11
|
|
|
7
12
|
module Wp2txt
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
13
|
+
# Main wiki formatting utilities: format_wiki, markers, templates, links
|
|
14
|
+
|
|
15
|
+
# Marker types for special content
|
|
16
|
+
# Every marker category that special content can be collapsed into.
MARKER_TYPES = %i[
  math code chem table score timeline graph ipa
  infobox navbox gallery sidebar mapframe imagemap references codeblock
].freeze

# Markers that occur in the middle of running text; dropping them outright
# can leave the surrounding sentence broken.
INLINE_MARKERS = %i[math chem ipa code].freeze

# Markers for standalone constructs, which can be dropped without damaging
# the text around them.
BLOCK_MARKERS = %i[
  table score timeline graph infobox navbox
  gallery sidebar mapframe imagemap references codeblock
].freeze

# By default every marker type is enabled (a separate frozen array).
DEFAULT_MARKERS = [*MARKER_TYPES].freeze
|
|
26
|
+
|
|
27
|
+
# Regex patterns for marker detection
|
|
28
|
+
# Detection patterns per marker type. Each entry may carry:
#   tags:             regexes matching a whole HTML-style element (replaced with gsub)
#   templates:        regexes matching the *opening* of a {{template}}; the matching
#                     close is located by brace counting
#   wiki_table:       true when {|...|} wiki tables belong to this marker
#   paired_templates: { start:, end_name: } pairs such as {{refbegin}}...{{refend}}
MARKER_PATTERNS = {
  # MATH: <math>...</math>, {{math|...}}, {{mvar|...}}
  math: {
    tags: [/<math[^>]*>.*?<\/math>/mi],
    templates: [/\{\{(?:math|mvar)\s*\|/i]
  },
  # CODE: <code>...</code> (inline only)
  code: {
    tags: [/<code[^>]*>.*?<\/code>/mi],
    templates: []
  },
  # CODEBLOCK: <syntaxhighlight>...</syntaxhighlight>, <source>...</source>, <pre>...</pre> (block)
  codeblock: {
    tags: [
      /<syntaxhighlight[^>]*>.*?<\/syntaxhighlight>/mi,
      /<source[^>]*>.*?<\/source>/mi,
      /<pre[^>]*>.*?<\/pre>/mi
    ],
    templates: []
  },
  # CHEM: <chem>...</chem>, {{chem|...}}, {{ce|...}}
  chem: {
    tags: [/<chem[^>]*>.*?<\/chem>/mi],
    templates: [/\{\{(?:chem|ce)\s*\|/i]
  },
  # TABLE: {|...|}, <table>...</table>
  table: {
    tags: [/<table[^>]*>.*?<\/table>/mi],
    wiki_table: true
  },
  # SCORE: <score>...</score>
  score: {
    tags: [/<score[^>]*>.*?<\/score>/mi],
    templates: []
  },
  # TIMELINE: <timeline>...</timeline>
  timeline: {
    tags: [/<timeline[^>]*>.*?<\/timeline>/mi],
    templates: []
  },
  # GRAPH: <graph>...</graph>
  graph: {
    tags: [/<graph[^>]*>.*?<\/graph>/mi],
    templates: []
  },
  # IPA: {{IPA|...}}, {{IPAc-en|...}}, etc.
  ipa: {
    tags: [],
    templates: [/\{\{IPA[c]?(?:-[a-z]{2,3})?\s*\|/i]
  },
  # INFOBOX: {{Infobox ...}}
  infobox: {
    tags: [],
    templates: [/\{\{[Ii]nfobox\s*/]
  },
  # NAVBOX: {{Navbox ...}}
  navbox: {
    tags: [],
    templates: [/\{\{[Nn]avbox\s*/]
  },
  # GALLERY: <gallery>...</gallery>
  gallery: {
    tags: [/<gallery[^>]*>.*?<\/gallery>/mi],
    templates: []
  },
  # SIDEBAR: {{Sidebar ...}}
  sidebar: {
    tags: [],
    templates: [/\{\{[Ss]idebar\s*/]
  },
  # MAPFRAME: <mapframe>...</mapframe>
  mapframe: {
    tags: [/<mapframe[^>]*>.*?<\/mapframe>/mi],
    templates: []
  },
  # IMAGEMAP: <imagemap>...</imagemap>
  imagemap: {
    tags: [/<imagemap[^>]*>.*?<\/imagemap>/mi],
    templates: []
  },
  # REFERENCES: {{reflist}}, {{refbegin}}...{{refend}}, <references/>
  references: {
    tags: [
      # Self-closing form, with or without attributes (<references group="n" />).
      # It is listed first so that a self-closing tag is consumed on its own;
      # otherwise the paired pattern below would treat it as an opening tag and
      # swallow everything up to a later </references>.
      /<references[^>]*\/>/mi,
      /<references[^>]*>.*?<\/references>/mi
    ],
    templates: [/\{\{[Rr]eflist\s*/],
    paired_templates: [{ start: /\{\{[Rr]efbegin/i, end_name: "refend" }]
  }
}.freeze
|
|
120
|
+
|
|
121
|
+
# Convert wiki markup into plain text.
#
# text   - the raw wiki text (anything responding to to_s; nil becomes "")
# config - options hash; keys read here are :title, :namespace, :dump_date and
#          :expand_templates, and the whole hash is forwarded to
#          format_wiki_regex_transform
#
# Returns the formatted string.
def format_wiki(text, config = {})
  # String#+@ yields an unfrozen string; it only duplicates when the receiver
  # is frozen, so this avoids an allocation for already-mutable input.
  output = +text.to_s

  # Sampled once up front: lets the expansion stages below be skipped cheaply.
  has_templates = output.include?("{{")

  # Magic words ({{PAGENAME}}, {{CURRENTYEAR}}, {{lc:...}}, ...) need the page title.
  if config[:title] && has_templates
    output = MagicWordExpander.new(
      config[:title],
      namespace: config[:namespace] || "",
      dump_date: config[:dump_date]
    ).expand(output)
  end

  # Parser functions ({{#if:...}}, {{#switch:...}}, {{#expr:...}}, ...) — opt-in.
  if config[:expand_templates] && has_templates && output.include?("{{#")
    output = ParserFunctions.new(reference_date: config[:dump_date]).evaluate(output)
  end

  # Common templates ({{birth date|...}}, {{convert|...}}, ...) — opt-in, and only
  # when templates are still present after the previous stages.
  if config[:expand_templates] && output.include?("{{")
    output = TemplateExpander.new(reference_date: config[:dump_date]).expand(output)
  end

  # CPU-intensive regex stage (the part that can be parallelized with Ractor).
  transformed = format_wiki_regex_transform(output, config)

  # Decode HTML entities into the characters they stand for. Per the original
  # note this relies on the HTMLEntities gem and must run outside a Ractor.
  decoded = special_chr(transformed)

  # Turn the internal marker placeholders into the final [MARKER] form.
  finalize_markers(decoded)
end
|
|
168
|
+
|
|
169
|
+
# CPU-intensive regex transformations - Ractor-safe (no external gem dependencies)
|
|
170
|
+
# This is the part that benefits from parallel processing
|
|
171
|
+
# Regex-driven part of the formatting pipeline.
#
# text   - wiki text (to_s is applied)
# config - options hash; keys read: :markers (default true), :extract_citations
#          (default false), :ref, :inline, :table
#
# Returns the transformed string. Marker placeholders are still in their
# internal form; converting them to [MARKER] happens in finalize_markers.
def format_wiki_regex_transform(text, config = {})
  source = +text.to_s

  enabled_markers = parse_markers_config(config.fetch(:markers, true))
  extract_citations = config.fetch(:extract_citations, false)

  # Markers go first so the marked content is still intact when it is replaced.
  # When citations are being extracted, the references marker is held back.
  markers_to_apply = extract_citations ? enabled_markers - [:references] : enabled_markers

  result = apply_markers(source, markers_to_apply)
           .then { |s| remove_complex(s) }
           .then { |s| escape_nowiki(s) }
           .then { |s| process_interwiki_links(s) }
           .then { |s| process_external_links(s) }
           .then { |s| unescape_nowiki(s) }

  # Simple substitutions are done in place on the pipeline result.
  result.gsub!(REMOVE_DIRECTIVES_REGEX, "")
  result.gsub!(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
  result.gsub!(MNDASH_REGEX, "–")
  result.gsub!(REMOVE_HR_REGEX, "")
  result.gsub!(REMOVE_TAG_REGEX, "")

  # [ref]...[/ref] markers are dropped unless the :ref option is set.
  result = remove_ref(result) unless config[:ref]

  # Template handling is skipped entirely when :inline is set.
  unless config[:inline]
    result = correct_inline_template(result, enabled_markers, extract_citations)
    result = remove_templates(result)
  end

  result = remove_table(result, enabled_markers) unless config[:table]
  result
end
|
|
207
|
+
|
|
208
|
+
# Parse markers configuration
|
|
209
|
+
# true or nil: all markers enabled
|
|
210
|
+
# false: no markers
|
|
211
|
+
# Array: only specified markers
|
|
212
|
+
# Resolve the :markers option into the list of enabled marker symbols.
#
#   false  -> no markers
#   Array  -> the listed entries (converted with to_sym) that are known marker types
#   other  -> all default markers (this covers true and nil)
#
# Returns a new Array of Symbols.
def parse_markers_config(config)
  return [] if config.equal?(false)
  return config.map(&:to_sym) & MARKER_TYPES if config.is_a?(Array)

  DEFAULT_MARKERS.dup
end
|
|
27
224
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
text = process_external_links(text)
|
|
33
|
-
text = unescape_nowiki(text)
|
|
34
|
-
text = remove_directive(text)
|
|
35
|
-
text = remove_emphasis(text)
|
|
36
|
-
text = mndash(text)
|
|
37
|
-
text = remove_hr(text)
|
|
38
|
-
text = remove_tag(text)
|
|
39
|
-
text = correct_inline_template(text) unless config[:inline]
|
|
40
|
-
text = remove_templates(text) unless config[:inline]
|
|
41
|
-
text = remove_table(text) unless config[:table]
|
|
42
|
-
text
|
|
225
|
+
# Placeholder format for markers (to avoid conflicts with bracket processing)
|
|
226
|
+
# These get converted to [MARKER] at the end of format_wiki
|
|
227
|
+
# Internal stand-in for a marker: the upper-cased type name wrapped in doubled
# guillemets («« »»), so that bracket processing does not touch it.
#
# type - marker type (Symbol or String)
#
# Returns the placeholder String.
def marker_placeholder(type)
  format("\u00AB\u00AB%s\u00BB\u00BB", type.to_s.upcase)
end
|
|
44
230
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
text = text.strip
|
|
55
|
-
text << "\n\n"
|
|
231
|
+
# Convert marker placeholders to final [MARKER] format
|
|
232
|
+
# Convert every marker placeholder in +str+ to its final "[TYPE]" form.
#
# str - text containing placeholders produced by marker_placeholder
#       (to_s is applied, so nil yields "")
#
# Returns a new String; the argument is never modified. All placeholder kinds
# are substituted in one pass: the hash form of gsub looks the matched
# placeholder up and inserts its replacement.
def finalize_markers(str)
  replacements = MARKER_TYPES.to_h do |marker_type|
    [marker_placeholder(marker_type), "[#{marker_type.to_s.upcase}]"]
  end

  str.to_s.gsub(Regexp.union(replacements.keys), replacements)
end
|
|
57
241
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
|
|
75
|
-
end
|
|
76
|
-
while (str = scanner.scan_until(regex))
|
|
77
|
-
case scanner[1]
|
|
78
|
-
when left
|
|
79
|
-
buffer << str
|
|
80
|
-
has_left = true
|
|
81
|
-
when right
|
|
82
|
-
if has_left
|
|
83
|
-
buffer = buffer[0...-left.size]
|
|
84
|
-
contents = block.call(str[0...-left.size])
|
|
85
|
-
buffer << contents
|
|
86
|
-
break
|
|
87
|
-
else
|
|
88
|
-
buffer << str
|
|
89
|
-
end
|
|
242
|
+
# Apply marker replacements for enabled marker types
|
|
243
|
+
# When markers are disabled, content is removed (not marked)
|
|
244
|
+
# Replace special content with marker placeholders.
#
# str             - wiki text (to_s is applied)
# enabled_markers - Array of marker type symbols that should be marked
#
# For every entry of MARKER_PATTERNS:
#   * tag matches become the placeholder when the type is enabled, and are
#     deleted when it is not;
#   * wiki tables ({|...|}) are replaced only when the type is enabled
#     (otherwise they are left for remove_table to handle later);
#   * template and paired-template matches are delegated to the respective
#     helpers together with the should_mark flag.
#
# Returns the processed String. Works on a private copy: String#+@ would hand
# back the caller's own object for unfrozen input, and the in-place gsub! calls
# below would then rewrite the caller's string.
def apply_markers(str, enabled_markers)
  result = str.to_s.dup

  MARKER_PATTERNS.each do |marker_type, patterns|
    placeholder = marker_placeholder(marker_type)
    should_mark = enabled_markers.include?(marker_type)

    # HTML-style tags: mark, or remove when the marker is disabled.
    tag_replacement = should_mark ? placeholder : ""
    patterns[:tags]&.each { |tag_regex| result.gsub!(tag_regex, tag_replacement) }

    # Wiki tables need nested handling, so they go through a dedicated helper.
    if patterns[:wiki_table] && should_mark && result.include?("{|")
      result = replace_wiki_table_with_marker(result, placeholder)
    end

    # Template-based markers (Infobox, Navbox, Sidebar, ...).
    patterns[:templates]&.each do |template_regex|
      result = replace_template_with_marker(result, template_regex, placeholder, should_mark)
    end

    # Paired templates ({{refbegin}}...{{refend}}).
    patterns[:paired_templates]&.each do |pair|
      result = replace_paired_templates_with_marker(result, pair[:start], pair[:end_name], placeholder, should_mark)
    end
  end

  result
end
|
|
102
282
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
283
|
+
# Replace paired templates like {{refbegin}}...{{refend}} with marker
|
|
284
|
+
# When should_mark is false, skip processing entirely (don't remove content)
|
|
285
|
+
# This allows extract_citations to process the inner templates
|
|
286
|
+
# Collapse each "{{start...}} ... {{end_name}}" span into the placeholder.
#
# str           - text to process
# start_pattern - Regexp matching the opening template
# end_name      - name of the closing template (matched case-insensitively,
#                 optional whitespace before the closing braces)
# placeholder   - replacement text
# should_mark   - when false the input is returned untouched, leaving the
#                 inner content available to later processing
#
# Returns the processed String. Processing stops at the first opening
# template that has no closing template after it.
def replace_paired_templates_with_marker(str, start_pattern, end_name, placeholder, should_mark)
  return str unless should_mark

  closer = /\{\{#{Regexp.escape(end_name)}\s*\}\}/i
  text = +str.to_s

  while (opening = text.match(start_pattern))
    from = opening.begin(0)
    closing = text.match(closer, from)
    break unless closing

    text = "#{text[0...from]}#{placeholder}#{text[closing.end(0)..]}"
  end

  text
end
|
|
117
308
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
309
|
+
# Replace templates matching pattern with marker (handles nested braces)
|
|
310
|
+
# Replace every template whose opening matches +pattern+ (nested braces are
# handled by counting "{{" / "}}" pairs).
#
# str         - text to process (to_s is applied)
# pattern     - Regexp matching the start of the template, e.g. /\{\{[Ii]nfobox\s*/
# placeholder - text inserted in place of the template when should_mark is true
# should_mark - when false the template is removed without a replacement
#
# Returns the processed String. If a matched template is never closed,
# processing stops there and the remaining text is left as is.
def replace_template_with_marker(str, pattern, placeholder, should_mark)
  result = +str.to_s
  replacement = should_mark ? placeholder : ""
  brace_pair = /\{\{|\}\}/

  loop do
    match = result.match(pattern)
    break unless match

    start_pos = match.begin(0)
    template_end = nil
    depth = 0
    pos = start_pos

    # Jump from one brace pair to the next instead of inspecting (and
    # allocating a substring for) every single character.
    while (pos = result.index(brace_pair, pos))
      if result[pos] == "{"
        depth += 1
        pos += 2
      else
        depth -= 1
        pos += 2
        if depth.zero?
          template_end = pos
          break
        end
      end
    end

    # Unclosed template: stop, otherwise the same match would be found forever.
    break unless template_end

    result = result[0...start_pos] + replacement + result[template_end..]
  end

  result
end
|
|
124
352
|
|
|
353
|
+
# Replace wiki tables {|...|} with marker
|
|
354
|
+
# Swap each {|...|} wiki table for the placeholder. Text without a table
# opener is returned unchanged without invoking the nested-pair processor.
def replace_wiki_table_with_marker(str, placeholder)
  if str.include?("{|")
    process_nested_single_pass(str, "{|", "|}") { placeholder }
  else
    str
  end
end
|
|
358
|
+
|
|
359
|
+
#################### link processing ####################
|
|
360
|
+
|
|
361
|
+
# File/Image namespace and parameter regexes are now defined in regex.rb
|
|
362
|
+
# FILE_NAMESPACES_REGEX - matches file namespace prefixes (313 aliases from 350+ languages)
|
|
363
|
+
# IMAGE_PARAMS_REGEX - matches image parameters like thumb, right, left, etc.
|
|
364
|
+
|
|
125
365
|
def process_interwiki_links(str)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
366
|
+
# Early exit if no links present
|
|
367
|
+
return str unless str.include?("[[")
|
|
368
|
+
|
|
369
|
+
process_nested_single_pass(str, "[[", "]]") do |contents|
|
|
370
|
+
# Use -1 to preserve trailing empty strings (for pipe trick detection)
|
|
371
|
+
parts = contents.split("|", -1)
|
|
372
|
+
first_part = parts.first || ""
|
|
373
|
+
|
|
374
|
+
# Category links should be removed entirely (categories are extracted separately)
|
|
375
|
+
if CATEGORY_NAMESPACE_REGEX.match?(first_part)
|
|
376
|
+
""
|
|
377
|
+
elsif FILE_NAMESPACES_REGEX.match?(first_part)
|
|
378
|
+
# For File/Image links, extract caption (last non-parameter part)
|
|
379
|
+
# Normalize newlines to pipes (handles malformed markup with newlines instead of pipes)
|
|
380
|
+
normalized = contents.gsub(/\n/, "|")
|
|
381
|
+
parts = normalized.split("|", -1)
|
|
382
|
+
# Skip parts that look like parameters (contain =, or are size specs like 200px)
|
|
383
|
+
if parts.size > 1
|
|
384
|
+
caption = parts[1..].reverse.find do |p|
|
|
385
|
+
stripped = p.strip
|
|
386
|
+
!stripped.empty? && !stripped.include?("=") && !stripped.match?(/\A\d+px\z/i) && !(IMAGE_PARAMS_REGEX && IMAGE_PARAMS_REGEX.match?(stripped))
|
|
387
|
+
end
|
|
388
|
+
caption&.strip || ""
|
|
389
|
+
else
|
|
390
|
+
""
|
|
391
|
+
end
|
|
392
|
+
elsif parts.size == 1
|
|
393
|
+
first_part
|
|
394
|
+
elsif parts.size == 2 && parts[1].strip.empty?
|
|
395
|
+
# Pipe trick: [[Namespace:Page|]] or [[Page (disambiguation)|]]
|
|
396
|
+
apply_pipe_trick(first_part)
|
|
132
397
|
else
|
|
133
398
|
parts.shift
|
|
134
399
|
parts.join("|")
|
|
@@ -136,9 +401,25 @@ module Wp2txt
|
|
|
136
401
|
end
|
|
137
402
|
end
|
|
138
403
|
|
|
404
|
+
# MediaWiki pipe trick: extracts display text from link target
|
|
405
|
+
# [[Wikipedia:著作権|]] → 著作権
|
|
406
|
+
# [[東京 (曖昧さ回避)|]] → 東京
|
|
407
|
+
# Derive the display text for a pipe-trick link ([[Target|]]).
#
#   "Wikipedia:著作権"                  -> "著作権"
#   "東京 (曖昧さ回避)"                  -> "東京"
#   "Boston, Massachusetts"            -> "Boston"
#   "Yours, Mine and Ours (1968 film)" -> "Yours, Mine and Ours"
#
# target - the link target (the part before the pipe)
#
# Returns the display String.
def apply_pipe_trick(target)
  # Drop the leading prefix: everything up to and including the FIRST colon.
  title = target.include?(":") ? target.sub(/\A[^:]+:/, "") : target

  # A trailing parenthetical (disambiguation) is removed. When one was present
  # the comma rule must not be applied as well, otherwise a title such as
  # "Yours, Mine and Ours (1968 film)" would be cut down to "Yours".
  without_parenthetical = title.sub(/\s*\([^)]+\)\s*\z/, "")
  return without_parenthetical.strip unless without_parenthetical == title

  # No parenthetical: drop a comma and whatever follows it.
  title.sub(/\s*,.*\z/, "").strip
end
|
|
417
|
+
|
|
139
418
|
def process_external_links(str)
|
|
140
|
-
|
|
141
|
-
|
|
419
|
+
# Early exit if no external links present
|
|
420
|
+
return str unless str.include?("[")
|
|
421
|
+
|
|
422
|
+
process_nested_single_pass(str, "[", "]") do |contents|
|
|
142
423
|
if /\A\s.+\s\z/ =~ contents
|
|
143
424
|
" (#{contents.strip}) "
|
|
144
425
|
else
|
|
@@ -153,217 +434,295 @@ module Wp2txt
|
|
|
153
434
|
end
|
|
154
435
|
end
|
|
155
436
|
|
|
156
|
-
####################
|
|
437
|
+
#################### template processing ####################
|
|
157
438
|
|
|
158
439
|
# Strip {{...}} templates and then any remaining single-brace {...} groups.
# Text without "{{" is returned unchanged (the single-brace pass is only
# attempted after a double-brace pass has run).
def remove_templates(str)
  return str unless str.include?("{{")

  without_double = process_nested_single_pass(str, "{{", "}}") { "" }

  # Single-brace groups are less common; only pay for the pass if one exists.
  if without_double.include?("{")
    process_nested_single_pass(without_double, "{", "}") { "" }
  else
    without_double
  end
end
|
|
168
449
|
|
|
169
|
-
def remove_table(str)
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
450
|
+
# Remove {|...|} wiki tables.
#
# str             - text to process
# enabled_markers - enabled marker types; when :table is among them the text
#                   is returned as is, because table handling was already done
#                   at the marker stage
#
# Returns the input itself when there is nothing to do, otherwise the text
# with tables removed.
def remove_table(str, enabled_markers = [])
  return str if !str.include?("{|") || enabled_markers.include?(:table)

  process_nested_single_pass(str, "{|", "|}") { "" }
end
|
|
175
462
|
|
|
176
|
-
|
|
177
|
-
|
|
463
|
+
# Citation templates that can be extracted
|
|
464
|
+
# Data source: template_aliases.json (citation_templates category)
|
|
465
|
+
# Builds the matcher used for a list of template names: anchored at the start
# of the template contents, case-insensitive, and requiring the name to be
# followed by a pipe or an end of line. An empty list yields +fallback+.
template_name_regex = lambda do |names, fallback|
  next fallback if names.empty?

  alternatives = names.map { |t| Regexp.escape(t) }.join("|")
  Regexp.new('\A\s*(?:' + alternatives + ')\s*(?:\||$)', Regexp::IGNORECASE)
end

# Citation templates that can be extracted.
# Data source: template_aliases.json (citation_templates category)
CITATION_TEMPLATES = Wp2txt.load_template_data["citation_templates"] || []
CITATION_TEMPLATE_REGEX = template_name_regex.call(
  CITATION_TEMPLATES,
  /\A\s*(?:cite\s*(?:web|book|news|journal)|citation)\s*(?:\||$)/i
)

# Templates that are removed completely (references, navigation — but NOT
# citations while citations are being extracted).
# Data source: template_aliases.json (remove_templates category)
REMOVE_TEMPLATES = Wp2txt.load_template_data["remove_templates"] || []
REMOVE_TEMPLATES_REGEX = template_name_regex.call(
  REMOVE_TEMPLATES,
  /\A\s*(?:sfn|efn|refn|reflist|notelist|main|see\s*also|portal)\s*(?:\||$)/i
)

# Flag templates to remove.
# Data source: template_aliases.json (flag_templates category)
FLAG_TEMPLATES = Wp2txt.load_template_data["flag_templates"] || []
FLAG_TEMPLATE_REGEX = template_name_regex.call(
  FLAG_TEMPLATES,
  /\A\s*(?:flag|flagicon|flagcountry)\s*(?:\||$)/i
)

# Formatting templates (their content is extracted).
# Data source: template_aliases.json (formatting_templates category)
FORMATTING_TEMPLATES = Wp2txt.load_template_data["formatting_templates"] || []
FORMATTING_TEMPLATE_REGEX = template_name_regex.call(
  FORMATTING_TEMPLATES,
  /\A\s*(?:small|smaller|large|larger|nowrap|nbsp)\s*(?:\||$)/i
)
|
|
193
504
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
end
|
|
198
|
-
end
|
|
505
|
+
# Ruby text templates (読み仮名 equivalent across languages)
|
|
506
|
+
# Data source: template_aliases.json (ruby_text_templates category)
|
|
507
|
+
RUBY_TEXT_TEMPLATES = Wp2txt.load_template_data["ruby_text_templates"] || []
|
|
199
508
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
$2.to_i(16)
|
|
204
|
-
else
|
|
205
|
-
$2.to_i
|
|
206
|
-
end
|
|
207
|
-
hi = ch >> 8
|
|
208
|
-
lo = ch & 0xff
|
|
209
|
-
u = +"\377\376" << lo.chr << hi.chr
|
|
210
|
-
u.encode("UTF-8", "UTF-16")
|
|
211
|
-
end
|
|
212
|
-
rescue StandardError
|
|
213
|
-
num_str
|
|
214
|
-
end
|
|
509
|
+
# Interwiki link templates (仮リンク equivalent across languages)
|
|
510
|
+
# Data source: template_aliases.json (interwiki_link_templates category)
|
|
511
|
+
INTERWIKI_LINK_TEMPLATES = Wp2txt.load_template_data["interwiki_link_templates"] || []
|
|
215
512
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
513
|
+
# Mixed script templates (nihongo equivalent across languages)
|
|
514
|
+
# Data source: template_aliases.json (mixed_script_templates category)
|
|
515
|
+
MIXED_SCRIPT_TEMPLATES = Wp2txt.load_template_data["mixed_script_templates"] || []
|
|
219
516
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
517
|
+
# Convert templates
|
|
518
|
+
# Data source: template_aliases.json (convert_templates category)
|
|
519
|
+
CONVERT_TEMPLATES = Wp2txt.load_template_data["convert_templates"] || []
|
|
223
520
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
end
|
|
521
|
+
# Country code templates (2-3 letter codes that represent flags)
|
|
522
|
+
COUNTRY_CODE_REGEX = /\A[A-Z]{2,3}\z/
|
|
227
523
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
524
|
+
# Extract formatted citation from template parameters
|
|
525
|
+
# Build a short citation string from the contents of a citation template.
#
# contents - the template contents without the surrounding braces, e.g.
#            "cite web|last=Smith|first=John|title=Example|date=2021-05-15"
#
# Only "key=value" parts are considered; keys are stripped and lower-cased,
# values are stripped. Blank values are treated as absent, so an empty
# "last=" does not hide a filled-in "author=".
#
# Returns 'Author. "Title". Year.' built from whichever of the three fields
# are available, or "" when none is.
def format_citation(contents)
  params = {}
  contents.split("|").each do |part|
    next unless part.include?("=")

    key, value = part.split("=", 2)
    params[key.strip.downcase] = value&.strip
  end

  # First non-blank value among the given keys. A plain `a || b` chain is not
  # enough here because an empty string is truthy in Ruby.
  first_present = lambda do |*keys|
    keys.map { |k| params[k] }.find { |v| v && !v.empty? } || ""
  end

  # Author: family name (or author field), optionally followed by given name.
  author = first_present.call("last", "last1", "author", "author1")
  first = first_present.call("first", "first1")
  author = "#{author}, #{first}" unless author.empty? || first.empty?

  title = first_present.call("title")

  # Year: explicit year, else taken from the date. A leading four-digit run
  # ("2021-05-15") wins; otherwise the first standalone four-digit number is
  # used so that dates like "15 May 2021" still contribute their year.
  year = first_present.call("year")
  if year.empty?
    date = params["date"].to_s
    year = date[/\A\d{4}/] || date[/\b\d{4}\b/] || ""
  end

  parts = []
  parts << author unless author.empty?
  parts << "\"#{title}\"" unless title.empty?
  parts << year unless year.empty?

  parts.empty? ? "" : parts.join(". ") + "."
end
|
|
248
557
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
558
|
+
# Helper to check if template name matches any in a list (case-insensitive)
|
|
559
|
+
# Case-insensitive membership test for a template name.
#
# name          - template name (to_s and strip are applied before comparing)
# template_list - list of known names; nil or empty never matches
#
# Returns true when some list entry equals the name ignoring case.
def template_matches?(name, template_list)
  return false if template_list.nil? || template_list.empty?

  needle = name.to_s.strip.downcase
  template_list.each do |candidate|
    return true if candidate.downcase == needle
  end
  false
end
|
|
255
564
|
|
|
256
|
-
def correct_inline_template(str)
|
|
257
|
-
|
|
258
|
-
|
|
565
|
+
# Rewrites every {{...}} template found in +str+ as plain text.
#
# Each template body is split on "|" and handed to the first rule that
# matches: handlers keyed on the template name come first, then the
# data-driven constant lists/regexes, and finally extract_template_content.
#
# str               - text that may contain templates
# enabled_markers   - marker symbols (:ipa, :math, :chem); when one is listed,
#                     the corresponding template is replaced by
#                     marker_placeholder instead of its first argument
# extract_citations - when true, citation templates are rendered through
#                     format_citation; otherwise they are dropped
#
# Returns str itself when it contains no "{{", otherwise the result of
# process_nested_single_pass.
def correct_inline_template(str, enabled_markers = [], extract_citations = false)
  # Nothing to rewrite when the text has no template opener.
  return str unless str.include?("{{")

  process_nested_single_pass(str, "{{", "}}") do |contents|
    parts = contents.split("|")

    # Positional field, whitespace-trimmed; a missing field reads as "".
    field = ->(index) { parts[index].to_s.strip }
    # Marker placeholder when that marker is enabled, else the first argument.
    marker_or_first = lambda do |kind|
      enabled_markers.include?(kind) ? marker_placeholder(kind) : field.call(1)
    end

    template_name = field.call(0)
    template_name_lower = template_name.downcase

    # -----------------------------------------------------------------------
    # Name-specific handlers. These run before the data-driven lists because
    # the mixed-script list also contains IPA and lang.
    # -----------------------------------------------------------------------

    # {{IPA|...}}, {{IPA-xx|...}}, {{IPAc-xx|...}}
    if template_name_lower == "ipa" || template_name_lower.start_with?("ipa-", "ipac-")
      marker_or_first.call(:ipa)
    # {{lang|code|text}} and {{fontsize|size|text}}: prefer the second
    # argument, fall back to the first when only one was given.
    elsif template_name_lower == "lang" || template_name_lower == "fontsize"
      field.call(parts.size >= 3 ? 2 : 1)
    # {{lang-xx|text}}
    elsif template_name_lower.start_with?("lang-")
      field.call(1)
    # {{langwithname|code|name|text}}: third argument, else whatever is last.
    elsif template_name_lower == "langwithname"
      (parts.size >= 4 ? parts[3] : parts.last).to_s.strip
    # {{math|...}} / {{mvar|...}}
    elsif template_name_lower == "math" || template_name_lower == "mvar"
      marker_or_first.call(:math)
    # {{chem|...}} / {{ce|...}}
    elsif template_name_lower == "chem" || template_name_lower == "ce"
      marker_or_first.call(:chem)

    # -----------------------------------------------------------------------
    # Data-driven handlers (constant lists and regexes defined elsewhere).
    # -----------------------------------------------------------------------

    # Citation templates: formatted on request, otherwise removed.
    elsif CITATION_TEMPLATE_REGEX.match?(contents)
      extract_citations ? format_citation(contents) : ""
    # Navigation/reference templates are removed outright.
    elsif REMOVE_TEMPLATES_REGEX.match?(contents)
      ""
    # Flag templates and country-code template names are removed as well.
    elsif FLAG_TEMPLATE_REGEX.match?(contents) || COUNTRY_CODE_REGEX.match?(template_name)
      ""
    # Ruby-text templates: base text followed by its reading in parentheses.
    elsif template_matches?(template_name, RUBY_TEXT_TEMPLATES)
      base = field.call(1)
      reading = field.call(2)
      reading.empty? ? base : "#{base}（#{reading}）"
    # Interwiki link templates: the first argument is the display text.
    elsif template_matches?(template_name, INTERWIKI_LINK_TEMPLATES)
      field.call(1)
    # Mixed-script templates: layout depends on the concrete template.
    elsif template_matches?(template_name, MIXED_SCRIPT_TEMPLATES)
      if template_name_lower.start_with?("nihongo")
        # text, then whichever of kanji/romaji are present, comma-separated.
        headword = field.call(1)
        glosses = [field.call(2), field.call(3)].reject(&:empty?)
        glosses.empty? ? headword : "#{headword} (#{glosses.join(', ')})"
      elsif template_name_lower == "transl" || template_name_lower == "transliteration"
        # {{transl|lang|text}} -> text; with a single argument use that one.
        (parts[2] || parts[1]).to_s.strip
      else
        field.call(1)
      end
    # Convert templates: "<number> <unit>", or just the number.
    elsif template_matches?(template_name, CONVERT_TEMPLATES)
      amount = field.call(1)
      unit = field.call(2)
      unit.empty? ? amount : "#{amount} #{unit}"
    # Formatting templates (small, nowrap, nbsp, ...): keep the wrapped text;
    # nbsp has no content and becomes a single space character.
    elsif FORMATTING_TEMPLATE_REGEX.match?(contents)
      template_name_lower == "nbsp" ? " " : field.call(1)
    # Anything else: generic content extraction.
    else
      extract_template_content(parts)
    end
  end
end
|
|
285
680
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
681
|
+
# Extract meaningful content from template parts
|
|
682
|
+
# Picks the most meaningful piece of text out of a split template.
#
# parts - the template body split on "|"; parts[0] is the template name
#
# Returns, in order of preference:
#   * ""                               when parts is empty
#   * the stripped template name       when there are no arguments
#   * the first non-blank positional argument (one without "=")
#   * the first non-blank value of a named argument ("key=value")
#   * ""                               when nothing qualifies
def extract_template_content(parts)
  return "" if parts.empty?
  return parts[0].to_s.strip if parts.size == 1

  # Arguments only (template name skipped); nil entries carry no content.
  args = parts[1..].compact

  # Positional arguments win over named ones.
  args.each do |arg|
    next if arg.include?("=")

    text = arg.strip
    return text unless text.empty?
  end

  # Every usable argument is named: fall back to the first non-blank value.
  # Splitting with a limit of 2 keeps any further "=" inside the value.
  args.each do |arg|
    next unless arg.include?("=")

    value = arg.split("=", 2).last.strip
    return value unless value.empty?
  end

  ""
end
|
|
313
706
|
|
|
314
|
-
#
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
yield file if FileTest.file?(file)
|
|
319
|
-
end
|
|
320
|
-
elsif FileTest.file?(dir_path)
|
|
321
|
-
yield dir_path
|
|
322
|
-
end
|
|
323
|
-
end
|
|
707
|
+
# =========================================================================
|
|
708
|
+
# Make constants Ractor-shareable for parallel processing
|
|
709
|
+
# =========================================================================
|
|
710
|
+
module_function
|
|
324
711
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
case input
|
|
328
|
-
when String
|
|
329
|
-
if RUBY_PLATFORM.index("win32")
|
|
330
|
-
input.gsub("/", "\\")
|
|
331
|
-
else
|
|
332
|
-
input.gsub("\\", "/")
|
|
333
|
-
end
|
|
334
|
-
when Array
|
|
335
|
-
ret_array = []
|
|
336
|
-
input.each do |item|
|
|
337
|
-
ret_array << correct_separator(item)
|
|
338
|
-
end
|
|
339
|
-
ret_array
|
|
340
|
-
end
|
|
341
|
-
end
|
|
712
|
+
# Makes this module's own constants Ractor-shareable so they can be read from
# worker Ractors.
#
# Does nothing on Rubies without Ractor.make_shareable. Only constants defined
# directly on the receiver (constants(false)) are visited, and values that are
# already shareable are left alone.
#
# Ractor.make_shareable raises Ractor::Error for objects that cannot be shared
# (Ractor::IsolationError is a subclass of it), so Ractor::Error is rescued to
# honour the "skip what can't be shared" intent instead of aborting the load.
def self.make_constants_ractor_shareable!
  return unless defined?(Ractor) && Ractor.respond_to?(:make_shareable)

  constants(false).each do |const_name|
    const = const_get(const_name)
    next if Ractor.shareable?(const)

    begin
      Ractor.make_shareable(const)
    rescue Ractor::Error, FrozenError, TypeError
      # Some constants can't be made shareable, skip them
    end
  end
end
|
|
357
726
|
|
|
358
|
-
|
|
359
|
-
def sec_to_str(int)
|
|
360
|
-
unless int
|
|
361
|
-
str = "--:--:--"
|
|
362
|
-
return str
|
|
363
|
-
end
|
|
364
|
-
h = int / 3600
|
|
365
|
-
m = (int - h * 3600) / 60
|
|
366
|
-
s = int % 60
|
|
367
|
-
format("%02d:%02d:%02d", h, m, s)
|
|
368
|
-
end
|
|
727
|
+
make_constants_ractor_shareable!
|
|
369
728
|
end
|