wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
|
|
5
|
+
RSpec.describe "Wp2txt Text Processing" do
|
|
6
|
+
include Wp2txt
|
|
7
|
+
|
|
8
|
+
describe "convert_characters" do
|
|
9
|
+
it "handles valid UTF-8 text" do
|
|
10
|
+
result = convert_characters("Hello World")
|
|
11
|
+
expect(result).to eq("Hello World")
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
it "handles Unicode text" do
|
|
15
|
+
result = convert_characters("日本語テキスト")
|
|
16
|
+
expect(result).to eq("日本語テキスト")
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
it "converts HTML entities" do
|
|
20
|
+
result = convert_characters("Hello &amp; World")
|
|
21
|
+
expect(result).to eq("Hello & World")
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it "handles nil input" do
|
|
25
|
+
result = convert_characters(nil)
|
|
26
|
+
expect(result).to eq("")
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it "handles numeric character references" do
|
|
30
|
+
result = convert_characters("&#65;&#66;&#67;")
|
|
31
|
+
expect(result).to eq("ABC")
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
describe "special_chr" do
|
|
36
|
+
it "decodes HTML entities" do
|
|
37
|
+
result = special_chr("&amp; &lt; &gt;")
|
|
38
|
+
expect(result).to eq("& < >")
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it "decodes special quotes" do
|
|
42
|
+
result = special_chr("&ldquo;text&rdquo;")
|
|
43
|
+
expect(result).to include("text")
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
describe "chrref_to_utf" do
|
|
48
|
+
it "converts decimal character references" do
|
|
49
|
+
result = chrref_to_utf("&#65;")
|
|
50
|
+
expect(result).to eq("A")
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
it "converts hex character references" do
|
|
54
|
+
result = chrref_to_utf("&#x41;")
|
|
55
|
+
expect(result).to eq("A")
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it "handles Japanese characters" do
|
|
59
|
+
result = chrref_to_utf("&#x3042;")
|
|
60
|
+
expect(result).to eq("あ")
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it "handles invalid codepoints" do
|
|
64
|
+
result = chrref_to_utf("�")
|
|
65
|
+
expect(result).to eq("")
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
it "preserves non-reference text" do
|
|
69
|
+
result = chrref_to_utf("normal text")
|
|
70
|
+
expect(result).to eq("normal text")
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
describe "mndash" do
|
|
75
|
+
it "converts ndash template" do
|
|
76
|
+
result = mndash("1990{{ndash}}2000")
|
|
77
|
+
# The implementation wraps the dash in braces
|
|
78
|
+
expect(result).to include("–")
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it "handles mdash" do
|
|
82
|
+
result = mndash("text{{mdash}}more")
|
|
83
|
+
expect(result).to include("–")
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
it "preserves text without dashes" do
|
|
87
|
+
result = mndash("normal text")
|
|
88
|
+
expect(result).to eq("normal text")
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
describe "process_nested_structure" do
|
|
93
|
+
it "processes simple nested brackets" do
|
|
94
|
+
result = process_nested_structure("[[test]]", "[[", "]]") do |content|
|
|
95
|
+
content.upcase
|
|
96
|
+
end
|
|
97
|
+
expect(result).to eq("TEST")
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
it "processes multiple nested levels" do
|
|
101
|
+
result = process_nested_structure("[[outer [[inner]]]]", "[[", "]]") do |content|
|
|
102
|
+
"[#{content}]"
|
|
103
|
+
end
|
|
104
|
+
# The algorithm processes innermost first, then outer
|
|
105
|
+
expect(result).to include("[inner]")
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
it "handles empty content" do
|
|
109
|
+
result = process_nested_structure("[[]]", "[[", "]]") do |_content|
|
|
110
|
+
"empty"
|
|
111
|
+
end
|
|
112
|
+
expect(result).to eq("empty")
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
it "preserves text without brackets" do
|
|
116
|
+
result = process_nested_structure("no brackets here", "[[", "]]") do |_content|
|
|
117
|
+
"replaced"
|
|
118
|
+
end
|
|
119
|
+
expect(result).to eq("no brackets here")
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
it "handles curly braces" do
|
|
123
|
+
result = process_nested_structure("{{template}}", "{{", "}}") do |content|
|
|
124
|
+
"T:#{content}"
|
|
125
|
+
end
|
|
126
|
+
expect(result).to eq("T:template")
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
describe "escape_nowiki and unescape_nowiki" do
|
|
131
|
+
it "escapes and unescapes nowiki tags" do
|
|
132
|
+
original = "text <nowiki>[[preserved]]</nowiki> more"
|
|
133
|
+
escaped = escape_nowiki(original)
|
|
134
|
+
expect(escaped).not_to include("[[preserved]]")
|
|
135
|
+
expect(escaped).to include("<nowiki-")
|
|
136
|
+
|
|
137
|
+
unescaped = unescape_nowiki(escaped)
|
|
138
|
+
expect(unescaped).to include("[[preserved]]")
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
it "handles multiple nowiki tags" do
|
|
142
|
+
original = "<nowiki>a</nowiki> and <nowiki>b</nowiki>"
|
|
143
|
+
escaped = escape_nowiki(original)
|
|
144
|
+
expect(escaped.scan(/<nowiki-\d+>/).size).to eq(2)
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
describe "cleanup" do
|
|
149
|
+
it "removes excessive newlines" do
|
|
150
|
+
result = cleanup("text\n\n\n\n\nmore")
|
|
151
|
+
expect(result.count("\n")).to be <= 4 # max 2 consecutive + trailing
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
it "removes empty parentheses" do
|
|
155
|
+
result = cleanup("text () more")
|
|
156
|
+
expect(result).not_to include("()")
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it "removes empty Japanese parentheses" do
|
|
160
|
+
result = cleanup("text（）more")
|
|
161
|
+
expect(result).not_to include("（）")
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
it "adds trailing newlines" do
|
|
165
|
+
result = cleanup("text")
|
|
166
|
+
expect(result).to end_with("\n\n")
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
it "strips leading/trailing whitespace" do
|
|
170
|
+
result = cleanup(" text ")
|
|
171
|
+
expect(result).to start_with("text")
|
|
172
|
+
end
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
describe "remove_html" do
|
|
176
|
+
it "removes HTML comments" do
|
|
177
|
+
result = remove_html("before <!-- comment --> after")
|
|
178
|
+
expect(result).to include("before")
|
|
179
|
+
expect(result).to include("after")
|
|
180
|
+
expect(result).not_to include("comment")
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
it "removes self-closing tags" do
|
|
184
|
+
result = remove_html("text<br/>more")
|
|
185
|
+
expect(result).to eq("textmore")
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
it "removes gallery tags" do
|
|
189
|
+
result = remove_html("<gallery>image.jpg</gallery>")
|
|
190
|
+
expect(result).not_to include("image.jpg")
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
it "handles nested div tags" do
|
|
194
|
+
result = remove_html("<div><div>inner</div></div>outside")
|
|
195
|
+
expect(result).to eq("outside")
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
describe "remove_complex" do
|
|
200
|
+
it "converts ruby annotations" do
|
|
201
|
+
# Ruby annotation: {{Ruby|漢字|かんじ}} style patterns
|
|
202
|
+
result = remove_complex("text{{Ruby|漢字|かんじ}}more")
|
|
203
|
+
# Should convert to 《》 format
|
|
204
|
+
expect(result).to include("漢字")
|
|
205
|
+
end
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
describe "remove_inbetween" do
|
|
209
|
+
it "removes content between angle brackets" do
|
|
210
|
+
result = remove_inbetween("before <tag> after")
|
|
211
|
+
expect(result).to eq("before after")
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
it "removes multiple occurrences" do
|
|
215
|
+
result = remove_inbetween("a<1>b<2>c")
|
|
216
|
+
expect(result).to eq("abc")
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
it "uses custom tagset" do
|
|
220
|
+
result = remove_inbetween("before [content] after", ["[", "]"])
|
|
221
|
+
expect(result).to eq("before after")
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
describe "remove_tag" do
|
|
226
|
+
it "removes HTML tags" do
|
|
227
|
+
result = remove_tag("<p>content</p>")
|
|
228
|
+
expect(result).to eq("content")
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
it "removes inline tags" do
|
|
232
|
+
result = remove_tag("<b>bold</b> and <i>italic</i>")
|
|
233
|
+
expect(result).to eq("bold and italic")
|
|
234
|
+
end
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
describe "remove_directive" do
|
|
238
|
+
it "removes behavior switches" do
|
|
239
|
+
result = remove_directive("__NOTOC__text")
|
|
240
|
+
expect(result).to eq("text")
|
|
241
|
+
end
|
|
242
|
+
|
|
243
|
+
it "removes TOC directive" do
|
|
244
|
+
result = remove_directive("before__TOC__after")
|
|
245
|
+
expect(result).to eq("beforeafter")
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
describe "remove_emphasis" do
|
|
250
|
+
it "removes bold markup" do
|
|
251
|
+
result = remove_emphasis("'''bold''' text")
|
|
252
|
+
expect(result).to include("bold")
|
|
253
|
+
expect(result).not_to include("'''")
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
it "removes italic markup" do
|
|
257
|
+
result = remove_emphasis("''italic'' text")
|
|
258
|
+
expect(result).to include("italic")
|
|
259
|
+
expect(result).not_to include("''")
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
it "removes bold-italic markup" do
|
|
263
|
+
result = remove_emphasis("'''''both''''' text")
|
|
264
|
+
expect(result).to include("both")
|
|
265
|
+
expect(result).not_to include("'''''")
|
|
266
|
+
end
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
describe "remove_hr" do
|
|
270
|
+
it "removes horizontal rules" do
|
|
271
|
+
result = remove_hr("before\n----\nafter")
|
|
272
|
+
expect(result).not_to include("----")
|
|
273
|
+
end
|
|
274
|
+
|
|
275
|
+
it "removes longer rules" do
|
|
276
|
+
result = remove_hr("text\n------\nmore")
|
|
277
|
+
expect(result).not_to include("------")
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
describe "remove_ref" do
|
|
282
|
+
# remove_ref removes [ref]...[/ref] markers (not HTML <ref> tags)
|
|
283
|
+
# Use make_reference first to convert <ref> to [ref]
|
|
284
|
+
it "removes [ref] marker tags" do
|
|
285
|
+
result = remove_ref("text[ref]citation[/ref]more")
|
|
286
|
+
expect(result).to eq("textmore")
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
it "removes multiple [ref] markers" do
|
|
290
|
+
result = remove_ref("a[ref]1[/ref]b[ref]2[/ref]c")
|
|
291
|
+
expect(result).to eq("abc")
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
it "preserves text without markers" do
|
|
295
|
+
result = remove_ref("text without references")
|
|
296
|
+
expect(result).to eq("text without references")
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
|
|
300
|
+
describe "make_reference" do
|
|
301
|
+
it "converts reference tags to markers" do
|
|
302
|
+
result = make_reference("text<ref>citation</ref>more")
|
|
303
|
+
expect(result).to include("[ref]")
|
|
304
|
+
expect(result).to include("[/ref]")
|
|
305
|
+
end
|
|
306
|
+
|
|
307
|
+
it "handles multiple references" do
|
|
308
|
+
result = make_reference("a<ref>1</ref>b<ref>2</ref>c")
|
|
309
|
+
expect(result.scan("[ref]").size).to eq(2)
|
|
310
|
+
end
|
|
311
|
+
end
|
|
312
|
+
end
|
data/spec/utils_spec.rb
CHANGED
|
@@ -1,19 +1,10 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "spec_helper"
|
|
4
|
-
require_relative "../lib/wp2txt"
|
|
5
|
-
require_relative "../lib/wp2txt/article"
|
|
6
|
-
require_relative "../lib/wp2txt/utils"
|
|
7
|
-
|
|
8
|
-
describe "Wp2txt" do
|
|
9
|
-
it "contains mediawiki-format related functions:" do
|
|
10
|
-
end
|
|
11
4
|
|
|
5
|
+
RSpec.describe "Wp2txt Utils" do
|
|
12
6
|
include Wp2txt
|
|
13
7
|
|
|
14
|
-
before do
|
|
15
|
-
end
|
|
16
|
-
|
|
17
8
|
describe "process_nested_structure" do
|
|
18
9
|
it "parse nested structure replacing str in the format specified" do
|
|
19
10
|
str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
|
|
@@ -77,9 +68,18 @@ describe "Wp2txt" do
|
|
|
77
68
|
end
|
|
78
69
|
|
|
79
70
|
describe "remove_hr" do
|
|
80
|
-
it "removes horizontal lines" do
|
|
81
|
-
|
|
82
|
-
|
|
71
|
+
it "removes horizontal lines with 4+ hyphens" do
|
|
72
|
+
# MediaWiki requires 4+ hyphens for horizontal rules
|
|
73
|
+
# The hyphens are removed but newlines around them are preserved
|
|
74
|
+
str_before = "text\n----\nmore"
|
|
75
|
+
str_after = "text\n\nmore"
|
|
76
|
+
expect(remove_hr(str_before)).to eq str_after
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
it "does not remove lines with fewer than 4 hyphens" do
|
|
80
|
+
# Lines with fewer than 4 hyphens should be preserved
|
|
81
|
+
str_before = "text\n--\n---\nmore"
|
|
82
|
+
str_after = "text\n--\n---\nmore"
|
|
83
83
|
expect(remove_hr(str_before)).to eq str_after
|
|
84
84
|
end
|
|
85
85
|
end
|
|
@@ -97,11 +97,27 @@ describe "Wp2txt" do
|
|
|
97
97
|
end
|
|
98
98
|
|
|
99
99
|
describe "remove_directive" do
|
|
100
|
-
it "removes
|
|
101
|
-
|
|
100
|
+
it "removes MediaWiki magic words" do
|
|
101
|
+
# Use actual MediaWiki behavior switches (loaded from mediawiki_aliases.json)
|
|
102
|
+
str_before = "__NOTOC__\n __TOC__"
|
|
102
103
|
str_after = "\n "
|
|
103
104
|
expect(remove_directive(str_before)).to eq str_after
|
|
104
105
|
end
|
|
106
|
+
|
|
107
|
+
it "removes multilingual magic words" do
|
|
108
|
+
# Japanese/German/other language magic words should also be removed
|
|
109
|
+
str_before = "__KEIN_INHALTSVERZEICHNIS__\n__目次非表示__"
|
|
110
|
+
str_after = "\n"
|
|
111
|
+
expect(remove_directive(str_before)).to eq str_after
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
it "preserves non-magic-word patterns" do
|
|
115
|
+
# Arbitrary __something__ patterns that aren't valid magic words should be preserved
|
|
116
|
+
# (This is the expected behavior with data-driven approach)
|
|
117
|
+
str_before = "__custom_marker__"
|
|
118
|
+
# With data-driven approach, unknown patterns are NOT removed
|
|
119
|
+
expect(remove_directive(str_before)).to eq str_before
|
|
120
|
+
end
|
|
105
121
|
end
|
|
106
122
|
|
|
107
123
|
describe "remove_emphasis" do
|
|
@@ -144,6 +160,55 @@ describe "Wp2txt" do
|
|
|
144
160
|
expect(c2).to eq "b|c"
|
|
145
161
|
expect(d2).to eq "[ɲ], /J/"
|
|
146
162
|
end
|
|
163
|
+
|
|
164
|
+
it "handles pipe trick (empty display text)" do
|
|
165
|
+
# Namespace prefix removal
|
|
166
|
+
expect(process_interwiki_links("[[Wikipedia:著作権|]]")).to eq "著作権"
|
|
167
|
+
expect(process_interwiki_links("[[Help:Contents|]]")).to eq "Contents"
|
|
168
|
+
|
|
169
|
+
# Disambiguation suffix removal
|
|
170
|
+
expect(process_interwiki_links("[[Tokyo (disambiguation)|]]")).to eq "Tokyo"
|
|
171
|
+
expect(process_interwiki_links("[[Mercury (planet)|]]")).to eq "Mercury"
|
|
172
|
+
|
|
173
|
+
# Comma suffix removal
|
|
174
|
+
expect(process_interwiki_links("[[Paris, Texas|]]")).to eq "Paris"
|
|
175
|
+
expect(process_interwiki_links("[[San Francisco, California|]]")).to eq "San Francisco"
|
|
176
|
+
|
|
177
|
+
# Combined: namespace and disambiguation
|
|
178
|
+
expect(process_interwiki_links("[[Wikipedia:Manual of Style (dates)|]]")).to eq "Manual of Style"
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
it "handles interwiki links" do
|
|
182
|
+
expect(process_interwiki_links("[[Wikisource:日本国憲法]]")).to eq "Wikisource:日本国憲法"
|
|
183
|
+
expect(process_interwiki_links("[[s:日本国憲法|日本国憲法]]")).to eq "日本国憲法"
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
describe "apply_pipe_trick" do
|
|
188
|
+
it "removes namespace prefix" do
|
|
189
|
+
expect(apply_pipe_trick("Wikipedia:Manual of Style")).to eq "Manual of Style"
|
|
190
|
+
expect(apply_pipe_trick("Help:Contents")).to eq "Contents"
|
|
191
|
+
expect(apply_pipe_trick("カテゴリ:日本")).to eq "日本"
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
it "removes disambiguation parenthetical" do
|
|
195
|
+
expect(apply_pipe_trick("Mercury (planet)")).to eq "Mercury"
|
|
196
|
+
expect(apply_pipe_trick("東京 (曖昧さ回避)")).to eq "東京"
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
it "removes comma and following text" do
|
|
200
|
+
expect(apply_pipe_trick("Paris, Texas")).to eq "Paris"
|
|
201
|
+
expect(apply_pipe_trick("San Francisco, California")).to eq "San Francisco"
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it "handles combined cases" do
|
|
205
|
+
expect(apply_pipe_trick("Wikipedia:Manual of Style (dates)")).to eq "Manual of Style"
|
|
206
|
+
end
|
|
207
|
+
|
|
208
|
+
it "returns original if no transformation needed" do
|
|
209
|
+
expect(apply_pipe_trick("Simple")).to eq "Simple"
|
|
210
|
+
expect(apply_pipe_trick("東京")).to eq "東京"
|
|
211
|
+
end
|
|
147
212
|
end
|
|
148
213
|
|
|
149
214
|
describe "process_external_links" do
|
|
@@ -162,8 +227,9 @@ describe "Wp2txt" do
|
|
|
162
227
|
|
|
163
228
|
describe "correct_inline_template" do
|
|
164
229
|
it "removes brackets and leaving some text" do
|
|
230
|
+
# Flag/country templates should be removed entirely
|
|
165
231
|
str_before1 = "{{MedalCountry | {{JPN}} }}"
|
|
166
|
-
str_after1 = "
|
|
232
|
+
str_after1 = ""
|
|
167
233
|
expect(correct_inline_template(str_before1)).to eq str_after1
|
|
168
234
|
|
|
169
235
|
str_before2 = "{{lang|en|Japan}}"
|
|
@@ -182,5 +248,118 @@ describe "Wp2txt" do
|
|
|
182
248
|
str_after5 = "日本人に多く見受けられる"
|
|
183
249
|
expect(correct_inline_template(str_before5)).to eq str_after5
|
|
184
250
|
end
|
|
251
|
+
|
|
252
|
+
it "removes citation templates entirely" do
|
|
253
|
+
expect(correct_inline_template("{{cite web|url=http://example.com|title=Test}}")).to eq ""
|
|
254
|
+
expect(correct_inline_template("{{cite book|title=Book|author=Author}}")).to eq ""
|
|
255
|
+
expect(correct_inline_template("{{sfn|Smith|2020|p=123}}")).to eq ""
|
|
256
|
+
end
|
|
257
|
+
|
|
258
|
+
it "extracts content from language templates" do
|
|
259
|
+
expect(correct_inline_template("{{lang-en|Hello}}")).to eq "Hello"
|
|
260
|
+
expect(correct_inline_template("{{langwithname|en|English|Hello World}}")).to eq "Hello World"
|
|
261
|
+
expect(correct_inline_template("{{IPA|/həˈloʊ/}}")).to eq "/həˈloʊ/"
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it "formats nihongo template correctly" do
|
|
265
|
+
expect(correct_inline_template("{{nihongo|Tokyo|東京|Tōkyō}}")).to eq "Tokyo (東京, Tōkyō)"
|
|
266
|
+
expect(correct_inline_template("{{nihongo|Tokyo|東京}}")).to eq "Tokyo (東京)"
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
it "handles convert template" do
|
|
270
|
+
expect(correct_inline_template("{{convert|100|km|mi}}")).to eq "100 km"
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
it "removes flag templates" do
|
|
274
|
+
expect(correct_inline_template("{{flagicon|Japan}}")).to eq ""
|
|
275
|
+
expect(correct_inline_template("{{JPN}}")).to eq ""
|
|
276
|
+
expect(correct_inline_template("{{USA}}")).to eq ""
|
|
277
|
+
end
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
describe "parse_markers_config" do
|
|
281
|
+
it "returns default markers for true" do
|
|
282
|
+
result = parse_markers_config(true)
|
|
283
|
+
expect(result).to be_an(Array)
|
|
284
|
+
expect(result).not_to be_empty
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
it "returns empty array for false" do
|
|
288
|
+
result = parse_markers_config(false)
|
|
289
|
+
expect(result).to eq([])
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
it "filters array to valid marker types" do
|
|
293
|
+
result = parse_markers_config([:math, :code, :invalid_type])
|
|
294
|
+
expect(result).to include(:math)
|
|
295
|
+
expect(result).to include(:code)
|
|
296
|
+
expect(result).not_to include(:invalid_type)
|
|
297
|
+
end
|
|
298
|
+
|
|
299
|
+
it "returns default markers for unexpected input" do
|
|
300
|
+
result = parse_markers_config("unexpected string")
|
|
301
|
+
expect(result).to be_an(Array)
|
|
302
|
+
expect(result).not_to be_empty
|
|
303
|
+
end
|
|
304
|
+
|
|
305
|
+
it "returns default markers for nil" do
|
|
306
|
+
result = parse_markers_config(nil)
|
|
307
|
+
expect(result).to be_an(Array)
|
|
308
|
+
end
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
describe "process_interwiki_links" do
|
|
312
|
+
it "removes category links" do
|
|
313
|
+
result = process_interwiki_links("[[Category:Test]]")
|
|
314
|
+
expect(result).to eq("")
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
it "removes category links in Japanese" do
|
|
318
|
+
result = process_interwiki_links("[[カテゴリ:テスト]]")
|
|
319
|
+
expect(result).to eq("")
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
it "extracts caption from file links" do
|
|
323
|
+
result = process_interwiki_links("[[File:Image.jpg|thumb|200px|A caption]]")
|
|
324
|
+
expect(result).to include("caption")
|
|
325
|
+
end
|
|
326
|
+
|
|
327
|
+
it "handles file links without caption" do
|
|
328
|
+
result = process_interwiki_links("[[File:Image.jpg]]")
|
|
329
|
+
expect(result).to eq("")
|
|
330
|
+
end
|
|
331
|
+
|
|
332
|
+
it "handles pipe trick" do
|
|
333
|
+
result = process_interwiki_links("[[Tokyo (city)|]]")
|
|
334
|
+
expect(result).to eq("Tokyo")
|
|
335
|
+
end
|
|
336
|
+
|
|
337
|
+
it "handles simple links" do
|
|
338
|
+
result = process_interwiki_links("[[Simple Link]]")
|
|
339
|
+
expect(result).to eq("Simple Link")
|
|
340
|
+
end
|
|
341
|
+
|
|
342
|
+
it "handles links with display text" do
|
|
343
|
+
result = process_interwiki_links("[[Target|Display Text]]")
|
|
344
|
+
expect(result).to eq("Display Text")
|
|
345
|
+
end
|
|
346
|
+
end
|
|
347
|
+
|
|
348
|
+
describe "marker_placeholder" do
|
|
349
|
+
it "creates placeholder with marker type" do
|
|
350
|
+
result = marker_placeholder(:math)
|
|
351
|
+
expect(result).to include("MATH")
|
|
352
|
+
expect(result).to include("««")
|
|
353
|
+
expect(result).to include("»»")
|
|
354
|
+
end
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
describe "finalize_markers" do
|
|
358
|
+
it "converts placeholders to final format" do
|
|
359
|
+
placeholder = marker_placeholder(:math)
|
|
360
|
+
result = finalize_markers("text #{placeholder} more")
|
|
361
|
+
expect(result).to include("[MATH]")
|
|
362
|
+
expect(result).not_to include("««")
|
|
363
|
+
end
|
|
185
364
|
end
|
|
186
365
|
end
|