wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,312 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe "Wp2txt Text Processing" do
6
+ include Wp2txt
7
+
8
+ describe "convert_characters" do
9
+ it "handles valid UTF-8 text" do
10
+ result = convert_characters("Hello World")
11
+ expect(result).to eq("Hello World")
12
+ end
13
+
14
+ it "handles Unicode text" do
15
+ result = convert_characters("日本語テキスト")
16
+ expect(result).to eq("日本語テキスト")
17
+ end
18
+
19
+ it "converts HTML entities" do
20
+ result = convert_characters("Hello &amp; World")
21
+ expect(result).to eq("Hello & World")
22
+ end
23
+
24
+ it "handles nil input" do
25
+ result = convert_characters(nil)
26
+ expect(result).to eq("")
27
+ end
28
+
29
+ it "handles numeric character references" do
30
+ result = convert_characters("&#65;&#66;&#67;")
31
+ expect(result).to eq("ABC")
32
+ end
33
+ end
34
+
35
+ describe "special_chr" do
36
+ it "decodes HTML entities" do
37
+ result = special_chr("&amp; &lt; &gt;")
38
+ expect(result).to eq("& < >")
39
+ end
40
+
41
+ it "decodes special quotes" do
42
+ result = special_chr("&ldquo;text&rdquo;")
43
+ expect(result).to include("text")
44
+ end
45
+ end
46
+
47
+ describe "chrref_to_utf" do
48
+ it "converts decimal character references" do
49
+ result = chrref_to_utf("&#65;")
50
+ expect(result).to eq("A")
51
+ end
52
+
53
+ it "converts hex character references" do
54
+ result = chrref_to_utf("&#x41;")
55
+ expect(result).to eq("A")
56
+ end
57
+
58
+ it "handles Japanese characters" do
59
+ result = chrref_to_utf("&#12354;")
60
+ expect(result).to eq("あ")
61
+ end
62
+
63
+ it "handles invalid codepoints" do
64
+ result = chrref_to_utf("&#0;")
65
+ expect(result).to eq("")
66
+ end
67
+
68
+ it "preserves non-reference text" do
69
+ result = chrref_to_utf("normal text")
70
+ expect(result).to eq("normal text")
71
+ end
72
+ end
73
+
74
+ describe "mndash" do
75
+ it "converts ndash template" do
76
+ result = mndash("1990{{ndash}}2000")
77
+ # The implementation wraps the dash in braces
78
+ expect(result).to include("–")
79
+ end
80
+
81
+ it "handles mdash" do
82
+ result = mndash("text{{mdash}}more")
83
+ expect(result).to include("–")
84
+ end
85
+
86
+ it "preserves text without dashes" do
87
+ result = mndash("normal text")
88
+ expect(result).to eq("normal text")
89
+ end
90
+ end
91
+
92
+ describe "process_nested_structure" do
93
+ it "processes simple nested brackets" do
94
+ result = process_nested_structure("[[test]]", "[[", "]]") do |content|
95
+ content.upcase
96
+ end
97
+ expect(result).to eq("TEST")
98
+ end
99
+
100
+ it "processes multiple nested levels" do
101
+ result = process_nested_structure("[[outer [[inner]]]]", "[[", "]]") do |content|
102
+ "[#{content}]"
103
+ end
104
+ # The algorithm processes innermost first, then outer
105
+ expect(result).to include("[inner]")
106
+ end
107
+
108
+ it "handles empty content" do
109
+ result = process_nested_structure("[[]]", "[[", "]]") do |_content|
110
+ "empty"
111
+ end
112
+ expect(result).to eq("empty")
113
+ end
114
+
115
+ it "preserves text without brackets" do
116
+ result = process_nested_structure("no brackets here", "[[", "]]") do |_content|
117
+ "replaced"
118
+ end
119
+ expect(result).to eq("no brackets here")
120
+ end
121
+
122
+ it "handles curly braces" do
123
+ result = process_nested_structure("{{template}}", "{{", "}}") do |content|
124
+ "T:#{content}"
125
+ end
126
+ expect(result).to eq("T:template")
127
+ end
128
+ end
129
+
130
+ describe "escape_nowiki and unescape_nowiki" do
131
+ it "escapes and unescapes nowiki tags" do
132
+ original = "text <nowiki>[[preserved]]</nowiki> more"
133
+ escaped = escape_nowiki(original)
134
+ expect(escaped).not_to include("[[preserved]]")
135
+ expect(escaped).to include("<nowiki-")
136
+
137
+ unescaped = unescape_nowiki(escaped)
138
+ expect(unescaped).to include("[[preserved]]")
139
+ end
140
+
141
+ it "handles multiple nowiki tags" do
142
+ original = "<nowiki>a</nowiki> and <nowiki>b</nowiki>"
143
+ escaped = escape_nowiki(original)
144
+ expect(escaped.scan(/<nowiki-\d+>/).size).to eq(2)
145
+ end
146
+ end
147
+
148
+ describe "cleanup" do
149
+ it "removes excessive newlines" do
150
+ result = cleanup("text\n\n\n\n\nmore")
151
+ expect(result.count("\n")).to be <= 4 # max 2 consecutive + trailing
152
+ end
153
+
154
+ it "removes empty parentheses" do
155
+ result = cleanup("text () more")
156
+ expect(result).not_to include("()")
157
+ end
158
+
159
+ it "removes empty Japanese parentheses" do
160
+ result = cleanup("text（）more")
161
+ expect(result).not_to include("（）")
162
+ end
163
+
164
+ it "adds trailing newlines" do
165
+ result = cleanup("text")
166
+ expect(result).to end_with("\n\n")
167
+ end
168
+
169
+ it "strips leading/trailing whitespace" do
170
+ result = cleanup(" text ")
171
+ expect(result).to start_with("text")
172
+ end
173
+ end
174
+
175
+ describe "remove_html" do
176
+ it "removes HTML comments" do
177
+ result = remove_html("before <!-- comment --> after")
178
+ expect(result).to include("before")
179
+ expect(result).to include("after")
180
+ expect(result).not_to include("comment")
181
+ end
182
+
183
+ it "removes self-closing tags" do
184
+ result = remove_html("text<br/>more")
185
+ expect(result).to eq("textmore")
186
+ end
187
+
188
+ it "removes gallery tags" do
189
+ result = remove_html("<gallery>image.jpg</gallery>")
190
+ expect(result).not_to include("image.jpg")
191
+ end
192
+
193
+ it "handles nested div tags" do
194
+ result = remove_html("<div><div>inner</div></div>outside")
195
+ expect(result).to eq("outside")
196
+ end
197
+ end
198
+
199
+ describe "remove_complex" do
200
+ it "converts ruby annotations" do
201
+ # Ruby annotation: {{Ruby|漢字|かんじ}} style patterns
202
+ result = remove_complex("text{{Ruby|漢字|かんじ}}more")
203
+ # Should convert to 《》 format
204
+ expect(result).to include("漢字")
205
+ end
206
+ end
207
+
208
+ describe "remove_inbetween" do
209
+ it "removes content between angle brackets" do
210
+ result = remove_inbetween("before <tag> after")
211
+ expect(result).to eq("before after")
212
+ end
213
+
214
+ it "removes multiple occurrences" do
215
+ result = remove_inbetween("a<1>b<2>c")
216
+ expect(result).to eq("abc")
217
+ end
218
+
219
+ it "uses custom tagset" do
220
+ result = remove_inbetween("before [content] after", ["[", "]"])
221
+ expect(result).to eq("before after")
222
+ end
223
+ end
224
+
225
+ describe "remove_tag" do
226
+ it "removes HTML tags" do
227
+ result = remove_tag("<p>content</p>")
228
+ expect(result).to eq("content")
229
+ end
230
+
231
+ it "removes inline tags" do
232
+ result = remove_tag("<b>bold</b> and <i>italic</i>")
233
+ expect(result).to eq("bold and italic")
234
+ end
235
+ end
236
+
237
+ describe "remove_directive" do
238
+ it "removes behavior switches" do
239
+ result = remove_directive("__NOTOC__text")
240
+ expect(result).to eq("text")
241
+ end
242
+
243
+ it "removes TOC directive" do
244
+ result = remove_directive("before__TOC__after")
245
+ expect(result).to eq("beforeafter")
246
+ end
247
+ end
248
+
249
+ describe "remove_emphasis" do
250
+ it "removes bold markup" do
251
+ result = remove_emphasis("'''bold''' text")
252
+ expect(result).to include("bold")
253
+ expect(result).not_to include("'''")
254
+ end
255
+
256
+ it "removes italic markup" do
257
+ result = remove_emphasis("''italic'' text")
258
+ expect(result).to include("italic")
259
+ expect(result).not_to include("''")
260
+ end
261
+
262
+ it "removes bold-italic markup" do
263
+ result = remove_emphasis("'''''both''''' text")
264
+ expect(result).to include("both")
265
+ expect(result).not_to include("'''''")
266
+ end
267
+ end
268
+
269
+ describe "remove_hr" do
270
+ it "removes horizontal rules" do
271
+ result = remove_hr("before\n----\nafter")
272
+ expect(result).not_to include("----")
273
+ end
274
+
275
+ it "removes longer rules" do
276
+ result = remove_hr("text\n------\nmore")
277
+ expect(result).not_to include("------")
278
+ end
279
+ end
280
+
281
+ describe "remove_ref" do
282
+ # remove_ref removes [ref]...[/ref] markers (not HTML <ref> tags)
283
+ # Use make_reference first to convert <ref> to [ref]
284
+ it "removes [ref] marker tags" do
285
+ result = remove_ref("text[ref]citation[/ref]more")
286
+ expect(result).to eq("textmore")
287
+ end
288
+
289
+ it "removes multiple [ref] markers" do
290
+ result = remove_ref("a[ref]1[/ref]b[ref]2[/ref]c")
291
+ expect(result).to eq("abc")
292
+ end
293
+
294
+ it "preserves text without markers" do
295
+ result = remove_ref("text without references")
296
+ expect(result).to eq("text without references")
297
+ end
298
+ end
299
+
300
+ describe "make_reference" do
301
+ it "converts reference tags to markers" do
302
+ result = make_reference("text<ref>citation</ref>more")
303
+ expect(result).to include("[ref]")
304
+ expect(result).to include("[/ref]")
305
+ end
306
+
307
+ it "handles multiple references" do
308
+ result = make_reference("a<ref>1</ref>b<ref>2</ref>c")
309
+ expect(result.scan("[ref]").size).to eq(2)
310
+ end
311
+ end
312
+ end
data/spec/utils_spec.rb CHANGED
@@ -1,19 +1,10 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "spec_helper"
4
- require_relative "../lib/wp2txt"
5
- require_relative "../lib/wp2txt/article"
6
- require_relative "../lib/wp2txt/utils"
7
-
8
- describe "Wp2txt" do
9
- it "contains mediawiki-format related functions:" do
10
- end
11
4
 
5
+ RSpec.describe "Wp2txt Utils" do
12
6
  include Wp2txt
13
7
 
14
- before do
15
- end
16
-
17
8
  describe "process_nested_structure" do
18
9
  it "parse nested structure replacing str in the format specified" do
19
10
  str_before1 = "[[ab[[cde[[alfa]]]]fg]]"
@@ -77,9 +68,18 @@ describe "Wp2txt" do
77
68
  end
78
69
 
79
70
  describe "remove_hr" do
80
- it "removes horizontal lines" do
81
- str_before = "\n----\n--\n--\n"
82
- str_after = "\n\n"
71
+ it "removes horizontal lines with 4+ hyphens" do
72
+ # MediaWiki requires 4+ hyphens for horizontal rules
73
+ # The hyphens are removed but newlines around them are preserved
74
+ str_before = "text\n----\nmore"
75
+ str_after = "text\n\nmore"
76
+ expect(remove_hr(str_before)).to eq str_after
77
+ end
78
+
79
+ it "does not remove lines with fewer than 4 hyphens" do
80
+ # Lines with fewer than 4 hyphens should be preserved
81
+ str_before = "text\n--\n---\nmore"
82
+ str_after = "text\n--\n---\nmore"
83
83
  expect(remove_hr(str_before)).to eq str_after
84
84
  end
85
85
  end
@@ -97,11 +97,27 @@ describe "Wp2txt" do
97
97
  end
98
98
 
99
99
  describe "remove_directive" do
100
- it "removes directive" do
101
- str_before = "__abc__\n __def__"
100
+ it "removes MediaWiki magic words" do
101
+ # Use actual MediaWiki behavior switches (loaded from mediawiki_aliases.json)
102
+ str_before = "__NOTOC__\n __TOC__"
102
103
  str_after = "\n "
103
104
  expect(remove_directive(str_before)).to eq str_after
104
105
  end
106
+
107
+ it "removes multilingual magic words" do
108
+ # Japanese/German/other language magic words should also be removed
109
+ str_before = "__KEIN_INHALTSVERZEICHNIS__\n__目次非表示__"
110
+ str_after = "\n"
111
+ expect(remove_directive(str_before)).to eq str_after
112
+ end
113
+
114
+ it "preserves non-magic-word patterns" do
115
+ # Arbitrary __something__ patterns that aren't valid magic words should be preserved
116
+ # (This is the expected behavior with data-driven approach)
117
+ str_before = "__custom_marker__"
118
+ # With data-driven approach, unknown patterns are NOT removed
119
+ expect(remove_directive(str_before)).to eq str_before
120
+ end
105
121
  end
106
122
 
107
123
  describe "remove_emphasis" do
@@ -144,6 +160,55 @@ describe "Wp2txt" do
144
160
  expect(c2).to eq "b|c"
145
161
  expect(d2).to eq "[ɲ], /J/"
146
162
  end
163
+
164
+ it "handles pipe trick (empty display text)" do
165
+ # Namespace prefix removal
166
+ expect(process_interwiki_links("[[Wikipedia:著作権|]]")).to eq "著作権"
167
+ expect(process_interwiki_links("[[Help:Contents|]]")).to eq "Contents"
168
+
169
+ # Disambiguation suffix removal
170
+ expect(process_interwiki_links("[[Tokyo (disambiguation)|]]")).to eq "Tokyo"
171
+ expect(process_interwiki_links("[[Mercury (planet)|]]")).to eq "Mercury"
172
+
173
+ # Comma suffix removal
174
+ expect(process_interwiki_links("[[Paris, Texas|]]")).to eq "Paris"
175
+ expect(process_interwiki_links("[[San Francisco, California|]]")).to eq "San Francisco"
176
+
177
+ # Combined: namespace and disambiguation
178
+ expect(process_interwiki_links("[[Wikipedia:Manual of Style (dates)|]]")).to eq "Manual of Style"
179
+ end
180
+
181
+ it "handles interwiki links" do
182
+ expect(process_interwiki_links("[[Wikisource:日本国憲法]]")).to eq "Wikisource:日本国憲法"
183
+ expect(process_interwiki_links("[[s:日本国憲法|日本国憲法]]")).to eq "日本国憲法"
184
+ end
185
+ end
186
+
187
+ describe "apply_pipe_trick" do
188
+ it "removes namespace prefix" do
189
+ expect(apply_pipe_trick("Wikipedia:Manual of Style")).to eq "Manual of Style"
190
+ expect(apply_pipe_trick("Help:Contents")).to eq "Contents"
191
+ expect(apply_pipe_trick("カテゴリ:日本")).to eq "日本"
192
+ end
193
+
194
+ it "removes disambiguation parenthetical" do
195
+ expect(apply_pipe_trick("Mercury (planet)")).to eq "Mercury"
196
+ expect(apply_pipe_trick("東京 (曖昧さ回避)")).to eq "東京"
197
+ end
198
+
199
+ it "removes comma and following text" do
200
+ expect(apply_pipe_trick("Paris, Texas")).to eq "Paris"
201
+ expect(apply_pipe_trick("San Francisco, California")).to eq "San Francisco"
202
+ end
203
+
204
+ it "handles combined cases" do
205
+ expect(apply_pipe_trick("Wikipedia:Manual of Style (dates)")).to eq "Manual of Style"
206
+ end
207
+
208
+ it "returns original if no transformation needed" do
209
+ expect(apply_pipe_trick("Simple")).to eq "Simple"
210
+ expect(apply_pipe_trick("東京")).to eq "東京"
211
+ end
147
212
  end
148
213
 
149
214
  describe "process_external_links" do
@@ -162,8 +227,9 @@ describe "Wp2txt" do
162
227
 
163
228
  describe "correct_inline_template" do
164
229
  it "removes brackets and leaving some text" do
230
+ # Flag/country templates should be removed entirely
165
231
  str_before1 = "{{MedalCountry | {{JPN}} }}"
166
- str_after1 = "JPN"
232
+ str_after1 = ""
167
233
  expect(correct_inline_template(str_before1)).to eq str_after1
168
234
 
169
235
  str_before2 = "{{lang|en|Japan}}"
@@ -182,5 +248,118 @@ describe "Wp2txt" do
182
248
  str_after5 = "日本人に多く見受けられる"
183
249
  expect(correct_inline_template(str_before5)).to eq str_after5
184
250
  end
251
+
252
+ it "removes citation templates entirely" do
253
+ expect(correct_inline_template("{{cite web|url=http://example.com|title=Test}}")).to eq ""
254
+ expect(correct_inline_template("{{cite book|title=Book|author=Author}}")).to eq ""
255
+ expect(correct_inline_template("{{sfn|Smith|2020|p=123}}")).to eq ""
256
+ end
257
+
258
+ it "extracts content from language templates" do
259
+ expect(correct_inline_template("{{lang-en|Hello}}")).to eq "Hello"
260
+ expect(correct_inline_template("{{langwithname|en|English|Hello World}}")).to eq "Hello World"
261
+ expect(correct_inline_template("{{IPA|/həˈloʊ/}}")).to eq "/həˈloʊ/"
262
+ end
263
+
264
+ it "formats nihongo template correctly" do
265
+ expect(correct_inline_template("{{nihongo|Tokyo|東京|Tōkyō}}")).to eq "Tokyo (東京, Tōkyō)"
266
+ expect(correct_inline_template("{{nihongo|Tokyo|東京}}")).to eq "Tokyo (東京)"
267
+ end
268
+
269
+ it "handles convert template" do
270
+ expect(correct_inline_template("{{convert|100|km|mi}}")).to eq "100 km"
271
+ end
272
+
273
+ it "removes flag templates" do
274
+ expect(correct_inline_template("{{flagicon|Japan}}")).to eq ""
275
+ expect(correct_inline_template("{{JPN}}")).to eq ""
276
+ expect(correct_inline_template("{{USA}}")).to eq ""
277
+ end
278
+ end
279
+
280
+ describe "parse_markers_config" do
281
+ it "returns default markers for true" do
282
+ result = parse_markers_config(true)
283
+ expect(result).to be_an(Array)
284
+ expect(result).not_to be_empty
285
+ end
286
+
287
+ it "returns empty array for false" do
288
+ result = parse_markers_config(false)
289
+ expect(result).to eq([])
290
+ end
291
+
292
+ it "filters array to valid marker types" do
293
+ result = parse_markers_config([:math, :code, :invalid_type])
294
+ expect(result).to include(:math)
295
+ expect(result).to include(:code)
296
+ expect(result).not_to include(:invalid_type)
297
+ end
298
+
299
+ it "returns default markers for unexpected input" do
300
+ result = parse_markers_config("unexpected string")
301
+ expect(result).to be_an(Array)
302
+ expect(result).not_to be_empty
303
+ end
304
+
305
+ it "returns default markers for nil" do
306
+ result = parse_markers_config(nil)
307
+ expect(result).to be_an(Array)
308
+ end
309
+ end
310
+
311
+ describe "process_interwiki_links" do
312
+ it "removes category links" do
313
+ result = process_interwiki_links("[[Category:Test]]")
314
+ expect(result).to eq("")
315
+ end
316
+
317
+ it "removes category links in Japanese" do
318
+ result = process_interwiki_links("[[カテゴリ:テスト]]")
319
+ expect(result).to eq("")
320
+ end
321
+
322
+ it "extracts caption from file links" do
323
+ result = process_interwiki_links("[[File:Image.jpg|thumb|200px|A caption]]")
324
+ expect(result).to include("caption")
325
+ end
326
+
327
+ it "handles file links without caption" do
328
+ result = process_interwiki_links("[[File:Image.jpg]]")
329
+ expect(result).to eq("")
330
+ end
331
+
332
+ it "handles pipe trick" do
333
+ result = process_interwiki_links("[[Tokyo (city)|]]")
334
+ expect(result).to eq("Tokyo")
335
+ end
336
+
337
+ it "handles simple links" do
338
+ result = process_interwiki_links("[[Simple Link]]")
339
+ expect(result).to eq("Simple Link")
340
+ end
341
+
342
+ it "handles links with display text" do
343
+ result = process_interwiki_links("[[Target|Display Text]]")
344
+ expect(result).to eq("Display Text")
345
+ end
346
+ end
347
+
348
+ describe "marker_placeholder" do
349
+ it "creates placeholder with marker type" do
350
+ result = marker_placeholder(:math)
351
+ expect(result).to include("MATH")
352
+ expect(result).to include("««")
353
+ expect(result).to include("»»")
354
+ end
355
+ end
356
+
357
+ describe "finalize_markers" do
358
+ it "converts placeholders to final format" do
359
+ placeholder = marker_placeholder(:math)
360
+ result = finalize_markers("text #{placeholder} more")
361
+ expect(result).to include("[MATH]")
362
+ expect(result).not_to include("««")
363
+ end
185
364
  end
186
365
  end