wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,472 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe Wp2txt::TemplateExpander do
6
+ let(:expander) { described_class.new }
7
+ # Fixed reference date for age calculations
8
+ let(:reference_date) { Time.new(2024, 6, 15) }
9
+ let(:expander_with_date) { described_class.new(reference_date: reference_date) }
10
+
11
+ describe "date templates" do
12
+ describe "{{birth date}}" do
13
+ it "expands {{birth date|1990|5|15}} to formatted date" do
14
+ expect(expander.expand("{{birth date|1990|5|15}}")).to eq("May 15, 1990")
15
+ end
16
+
17
+ it "expands {{Birth date|1990|5|15}} (case-insensitive)" do
18
+ expect(expander.expand("{{Birth date|1990|5|15}}")).to eq("May 15, 1990")
19
+ end
20
+
21
+ it "handles single-digit day" do
22
+ expect(expander.expand("{{birth date|1990|5|5}}")).to eq("May 5, 1990")
23
+ end
24
+
25
+ it "handles different months" do
26
+ expect(expander.expand("{{birth date|2000|12|25}}")).to eq("December 25, 2000")
27
+ end
28
+
29
+ it "handles df=yes parameter (day first)" do
30
+ expect(expander.expand("{{birth date|1990|5|15|df=yes}}")).to eq("15 May 1990")
31
+ end
32
+
33
+ it "handles mf=yes parameter (month first, default)" do
34
+ expect(expander.expand("{{birth date|1990|5|15|mf=yes}}")).to eq("May 15, 1990")
35
+ end
36
+ end
37
+
38
+ describe "{{birth date and age}}" do
39
+ it "expands with calculated age" do
40
+ result = expander_with_date.expand("{{birth date and age|1990|5|15}}")
41
+ expect(result).to eq("May 15, 1990 (age 34)")
42
+ end
43
+
44
+ it "calculates age correctly when birthday hasn't occurred yet" do
45
+ result = expander_with_date.expand("{{birth date and age|1990|12|25}}")
46
+ expect(result).to eq("December 25, 1990 (age 33)")
47
+ end
48
+
49
+ it "handles df=yes parameter" do
50
+ result = expander_with_date.expand("{{birth date and age|1990|5|15|df=yes}}")
51
+ expect(result).to eq("15 May 1990 (age 34)")
52
+ end
53
+ end
54
+
55
+ describe "{{death date}}" do
56
+ it "expands {{death date|2020|3|1}} to formatted date" do
57
+ expect(expander.expand("{{death date|2020|3|1}}")).to eq("March 1, 2020")
58
+ end
59
+
60
+ it "handles df=yes parameter" do
61
+ expect(expander.expand("{{death date|2020|3|1|df=yes}}")).to eq("1 March 2020")
62
+ end
63
+ end
64
+
65
+ describe "{{death date and age}}" do
66
+ it "expands with age at death" do
67
+ result = expander.expand("{{death date and age|2020|3|1|1950|6|15}}")
68
+ expect(result).to eq("March 1, 2020 (aged 69)")
69
+ end
70
+
71
+ it "handles df=yes parameter" do
72
+ result = expander.expand("{{death date and age|2020|3|1|1950|6|15|df=yes}}")
73
+ expect(result).to eq("1 March 2020 (aged 69)")
74
+ end
75
+ end
76
+
77
+ describe "{{start date}}" do
78
+ it "expands to formatted date" do
79
+ expect(expander.expand("{{start date|2024|1|1}}")).to eq("January 1, 2024")
80
+ end
81
+
82
+ it "handles df=yes parameter" do
83
+ expect(expander.expand("{{start date|2024|1|1|df=yes}}")).to eq("1 January 2024")
84
+ end
85
+ end
86
+
87
+ describe "{{end date}}" do
88
+ it "expands to formatted date" do
89
+ expect(expander.expand("{{end date|2024|12|31}}")).to eq("December 31, 2024")
90
+ end
91
+ end
92
+
93
+ describe "{{date}}" do
94
+ it "expands simple date" do
95
+ expect(expander.expand("{{date|2024|6|15}}")).to eq("June 15, 2024")
96
+ end
97
+
98
+ it "handles year and month only" do
99
+ expect(expander.expand("{{date|2024|6}}")).to eq("June 2024")
100
+ end
101
+
102
+ it "handles year only" do
103
+ expect(expander.expand("{{date|2024}}")).to eq("2024")
104
+ end
105
+ end
106
+ end
107
+
108
+ describe "age templates" do
109
+ describe "{{age}}" do
110
+ it "calculates age from birth date" do
111
+ result = expander_with_date.expand("{{age|1990|5|15}}")
112
+ expect(result).to eq("34")
113
+ end
114
+
115
+ it "returns correct age when birthday hasn't occurred" do
116
+ result = expander_with_date.expand("{{age|1990|12|25}}")
117
+ expect(result).to eq("33")
118
+ end
119
+ end
120
+
121
+ describe "{{age in years}}" do
122
+ it "calculates age between two dates" do
123
+ result = expander.expand("{{age in years|1950|6|15|2020|3|1}}")
124
+ expect(result).to eq("69")
125
+ end
126
+ end
127
+
128
+ describe "{{age in days}}" do
129
+ it "calculates days between dates" do
130
+ result = expander.expand("{{age in days|2024|1|1|2024|1|10}}")
131
+ expect(result).to eq("9")
132
+ end
133
+ end
134
+ end
135
+
136
+ describe "convert templates" do
137
+ describe "length conversions" do
138
+ it "converts km to mi" do
139
+ result = expander.expand("{{convert|100|km|mi}}")
140
+ expect(result).to match(/100 km \(6[0-9](\.[0-9])? mi\)/)
141
+ end
142
+
143
+ it "converts mi to km" do
144
+ result = expander.expand("{{convert|100|mi|km}}")
145
+ expect(result).to match(/100 mi \(16[0-9](\.[0-9])? km\)/)
146
+ end
147
+
148
+ it "converts m to ft" do
149
+ result = expander.expand("{{convert|100|m|ft}}")
150
+ expect(result).to match(/100 m \(32[0-9](\.[0-9])? ft\)/)
151
+ end
152
+
153
+ it "converts ft to m" do
154
+ result = expander.expand("{{convert|100|ft|m}}")
155
+ expect(result).to match(/100 ft \(30(\.[0-9])? m\)/)
156
+ end
157
+
158
+ it "converts cm to in" do
159
+ result = expander.expand("{{convert|100|cm|in}}")
160
+ expect(result).to match(/100 cm \(39(\.[0-9])? in\)/)
161
+ end
162
+
163
+ it "converts in to cm" do
164
+ result = expander.expand("{{convert|10|in|cm}}")
165
+ expect(result).to match(/10 in \(25(\.[0-9])? cm\)/)
166
+ end
167
+ end
168
+
169
+ describe "weight conversions" do
170
+ it "converts kg to lb" do
171
+ result = expander.expand("{{convert|100|kg|lb}}")
172
+ expect(result).to match(/100 kg \(22[0-9](\.[0-9])? lb\)/)
173
+ end
174
+
175
+ it "converts lb to kg" do
176
+ result = expander.expand("{{convert|100|lb|kg}}")
177
+ expect(result).to match(/100 lb \(4[0-9](\.[0-9])? kg\)/)
178
+ end
179
+
180
+ it "converts g to oz" do
181
+ result = expander.expand("{{convert|100|g|oz}}")
182
+ expect(result).to match(/100 g \(3\.[0-9] oz\)/)
183
+ end
184
+ end
185
+
186
+ describe "temperature conversions" do
187
+ it "converts °C to °F" do
188
+ expect(expander.expand("{{convert|0|°C|°F}}")).to eq("0 °C (32 °F)")
189
+ end
190
+
191
+ it "converts C to F (without degree symbol)" do
192
+ expect(expander.expand("{{convert|100|C|F}}")).to eq("100 °C (212 °F)")
193
+ end
194
+
195
+ it "converts °F to °C" do
196
+ expect(expander.expand("{{convert|32|°F|°C}}")).to eq("32 °F (0 °C)")
197
+ end
198
+
199
+ it "converts F to C (without degree symbol)" do
200
+ expect(expander.expand("{{convert|212|F|C}}")).to eq("212 °F (100 °C)")
201
+ end
202
+ end
203
+
204
+ describe "area conversions" do
205
+ it "converts km2 to sqmi" do
206
+ result = expander.expand("{{convert|100|km2|sqmi}}")
207
+ expect(result).to match(/100 km² \(3[0-9](\.[0-9])? sq mi\)/)
208
+ end
209
+
210
+ it "converts sqmi to km2" do
211
+ result = expander.expand("{{convert|100|sqmi|km2}}")
212
+ expect(result).to match(/100 sq mi \(25[0-9](\.[0-9])? km²\)/)
213
+ end
214
+
215
+ it "converts ha to acre" do
216
+ result = expander.expand("{{convert|100|ha|acre}}")
217
+ expect(result).to match(/100 ha \(24[0-9](\.[0-9])? acres\)/)
218
+ end
219
+ end
220
+
221
+ describe "speed conversions" do
222
+ it "converts km/h to mph" do
223
+ result = expander.expand("{{convert|100|km/h|mph}}")
224
+ expect(result).to match(/100 km\/h \(6[0-9](\.[0-9])? mph\)/)
225
+ end
226
+
227
+ it "converts mph to km/h" do
228
+ result = expander.expand("{{convert|60|mph|km/h}}")
229
+ expect(result).to match(/60 mph \(9[0-9](\.[0-9])? km\/h\)/)
230
+ end
231
+ end
232
+
233
+ describe "edge cases" do
234
+ it "handles decimal values" do
235
+ result = expander.expand("{{convert|3.5|km|mi}}")
236
+ expect(result).to match(/3\.5 km \(2\.[0-9] mi\)/)
237
+ end
238
+
239
+ it "handles unknown units gracefully" do
240
+ expect(expander.expand("{{convert|100|foo|bar}}")).to eq("100 foo")
241
+ end
242
+
243
+ it "handles abbr=on parameter" do
244
+ result = expander.expand("{{convert|100|km|mi|abbr=on}}")
245
+ expect(result).to match(/100 km \(6[0-9](\.[0-9])? mi\)/)
246
+ end
247
+ end
248
+ end
249
+
250
+ describe "common templates" do
251
+ describe "{{circa}}" do
252
+ it "expands to c. prefix" do
253
+ expect(expander.expand("{{circa|1500}}")).to eq("c. 1500")
254
+ end
255
+
256
+ it "handles range" do
257
+ expect(expander.expand("{{circa|1500|1550}}")).to eq("c. 1500 – c. 1550")
258
+ end
259
+ end
260
+
261
+ describe "{{floruit}}" do
262
+ it "expands single year" do
263
+ expect(expander.expand("{{floruit|1500}}")).to eq("fl. 1500")
264
+ end
265
+
266
+ it "expands year range" do
267
+ expect(expander.expand("{{floruit|1500|1550}}")).to eq("fl. 1500–1550")
268
+ end
269
+ end
270
+
271
+ describe "{{reign}}" do
272
+ it "expands reign years" do
273
+ expect(expander.expand("{{reign|1500|1550}}")).to eq("r. 1500–1550")
274
+ end
275
+ end
276
+
277
+ describe "{{marriage}}" do
278
+ it "expands simple marriage" do
279
+ expect(expander.expand("{{marriage|Jane Doe|1990}}")).to eq("Jane Doe (m. 1990)")
280
+ end
281
+
282
+ it "expands marriage with end" do
283
+ expect(expander.expand("{{marriage|Jane Doe|1990|2020}}")).to eq("Jane Doe (m. 1990; div. 2020)")
284
+ end
285
+
286
+ it "handles widowed end reason" do
287
+ expect(expander.expand("{{marriage|Jane Doe|1990|2020|reason=widowed}}")).to eq("Jane Doe (m. 1990; wid. 2020)")
288
+ end
289
+
290
+ it "handles died end reason" do
291
+ expect(expander.expand("{{marriage|Jane Doe|1990|2020|reason=died}}")).to eq("Jane Doe (m. 1990; d. 2020)")
292
+ end
293
+ end
294
+
295
+ describe "{{played years}}" do
296
+ it "expands playing career span" do
297
+ expect(expander.expand("{{played years|2000|2020}}")).to eq("2000–2020")
298
+ end
299
+ end
300
+
301
+ describe "{{age in years and days}}" do
302
+ it "formats age with years and days" do
303
+ result = expander.expand("{{age in years and days|1990|1|1|2024|6|15}}")
304
+ expect(result).to match(/34 years, \d+ days/)
305
+ end
306
+ end
307
+
308
+ describe "{{time ago}}" do
309
+ it "formats time since date" do
310
+ result = expander_with_date.expand("{{time ago|2024|1|1}}")
311
+ expect(result).to match(/\d+ months ago/)
312
+ end
313
+ end
314
+ end
315
+
316
+ describe "formatting preservation" do
317
+ it "preserves text around templates" do
318
+ result = expander.expand("Born on {{birth date|1990|5|15}} in Tokyo")
319
+ expect(result).to eq("Born on May 15, 1990 in Tokyo")
320
+ end
321
+
322
+ it "handles multiple templates in one string" do
323
+ result = expander.expand("{{birth date|1990|5|15}} – {{death date|2020|3|1}}")
324
+ expect(result).to eq("May 15, 1990 – March 1, 2020")
325
+ end
326
+
327
+ it "handles nested templates" do
328
+ # NOTE(review): the input has a single, non-nested template; this only checks a template preceded by plain text
329
+ result = expander.expand("Born {{circa|1500}}")
330
+ expect(result).to eq("Born c. 1500")
331
+ end
332
+ end
333
+
334
+ describe "unknown templates" do
335
+ it "returns empty for unknown templates" do
336
+ expect(expander.expand("{{unknown template|foo|bar}}")).to eq("")
337
+ end
338
+
339
+ it "can be configured to preserve unknown templates" do
340
+ exp = described_class.new(preserve_unknown: true)
341
+ expect(exp.expand("{{unknown|foo}}")).to eq("{{unknown|foo}}")
342
+ end
343
+ end
344
+
345
+ describe "coordinate templates" do
346
+ describe "{{coord}}" do
347
+ it "expands decimal coordinates" do
348
+ result = expander.expand("{{coord|40.7128|N|74.0060|W}}")
349
+ expect(result).to match(/40\.7128°\s*N.*74\.0060°\s*W/i)
350
+ end
351
+
352
+ it "expands DMS coordinates" do
353
+ result = expander.expand("{{coord|40|42|46|N|74|0|22|W}}")
354
+ expect(result).to match(/40°42['′]46["″]?\s*N.*74°0['′]22["″]?\s*W/i)
355
+ end
356
+
357
+ it "expands coordinates with display parameter" do
358
+ result = expander.expand("{{coord|51.5074|N|0.1278|W|display=title}}")
359
+ expect(result).to include("51.5074")
360
+ end
361
+
362
+ it "handles simple lat/lon format" do
363
+ result = expander.expand("{{coord|35.6762|139.6503}}")
364
+ expect(result).to include("35.6762")
365
+ expect(result).to include("139.6503")
366
+ end
367
+ end
368
+ end
369
+
370
+ describe "language templates" do
371
+ describe "{{lang}}" do
372
+ it "expands basic lang template" do
373
+ expect(expander.expand("{{lang|fr|Bonjour}}")).to eq("Bonjour")
374
+ end
375
+
376
+ it "expands with literal translation" do
377
+ result = expander.expand("{{lang|la|Carpe diem|lit=seize the day}}")
378
+ expect(result).to include("Carpe diem")
379
+ expect(result).to include("seize the day")
380
+ end
381
+ end
382
+
383
+ describe "{{lang-xx}}" do
384
+ it "expands lang-fr template" do
385
+ result = expander.expand("{{lang-fr|Bonjour}}")
386
+ expect(result).to match(/French.*Bonjour/i)
387
+ end
388
+
389
+ it "expands lang-de template" do
390
+ result = expander.expand("{{lang-de|Guten Tag}}")
391
+ expect(result).to match(/German.*Guten Tag/i)
392
+ end
393
+
394
+ it "expands lang-ja template" do
395
+ result = expander.expand("{{lang-ja|こんにちは}}")
396
+ expect(result).to match(/Japanese.*こんにちは/i)
397
+ end
398
+
399
+ it "expands lang-la template with literal" do
400
+ result = expander.expand("{{lang-la|Veni, vidi, vici|lit=I came, I saw, I conquered}}")
401
+ expect(result).to include("Latin")
402
+ expect(result).to include("Veni, vidi, vici")
403
+ expect(result).to include("I came, I saw, I conquered")
404
+ end
405
+ end
406
+
407
+ describe "{{transl}}" do
408
+ it "expands transliteration template" do
409
+ result = expander.expand("{{transl|ru|Moskva}}")
410
+ expect(result).to eq("Moskva")
411
+ end
412
+ end
413
+
414
+ describe "{{nihongo}}" do
415
+ it "expands nihongo template" do
416
+ result = expander.expand("{{nihongo|Tokyo|東京|Tōkyō}}")
417
+ expect(result).to include("Tokyo")
418
+ expect(result).to include("東京")
419
+ expect(result).to include("Tōkyō")
420
+ end
421
+
422
+ it "handles nihongo without romaji" do
423
+ result = expander.expand("{{nihongo|Tokyo|東京}}")
424
+ expect(result).to include("Tokyo")
425
+ expect(result).to include("東京")
426
+ end
427
+ end
428
+ end
429
+
430
+ describe "formatting templates" do
431
+ describe "{{nowrap}}" do
432
+ it "preserves text" do
433
+ expect(expander.expand("{{nowrap|100 km}}")).to eq("100 km")
434
+ end
435
+ end
436
+
437
+ describe "{{small}}" do
438
+ it "preserves text" do
439
+ expect(expander.expand("{{small|tiny text}}")).to eq("tiny text")
440
+ end
441
+ end
442
+
443
+ describe "{{em}}" do
444
+ it "preserves text (emphasis)" do
445
+ expect(expander.expand("{{em|important}}")).to eq("important")
446
+ end
447
+ end
448
+
449
+ describe "{{abbr}}" do
450
+ it "returns abbreviation" do
451
+ expect(expander.expand("{{abbr|HTML|Hypertext Markup Language}}")).to eq("HTML")
452
+ end
453
+ end
454
+ end
455
+
456
+ describe "integration with format_wiki" do
457
+ include Wp2txt
458
+
459
+ it "expands templates during format_wiki processing" do
460
+ input = "He was born on {{birth date|1990|5|15}}."
461
+ result = format_wiki(input, title: "Test", expand_templates: true)
462
+ expect(result).to include("May 15, 1990")
463
+ end
464
+
465
+ it "expands convert templates" do
466
+ input = "The mountain is {{convert|8848|m|ft}} tall."
467
+ result = format_wiki(input, title: "Test", expand_templates: true)
468
+ expect(result).to include("8848 m")
469
+ expect(result).to include("ft")
470
+ end
471
+ end
472
+ end
@@ -0,0 +1,217 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe "Template Processing (Data-Driven)" do
6
+ include Wp2txt
7
+
8
# Strips the outer template delimiters from wikitext template markup so the
# remaining "name|args" text can be matched against the template regexes.
#
# Only a "{{" at the very start and a "}}" at the very end of the string are
# removed. Per-line anchors (^ / $) would also strip braces at line boundaries
# inside a multi-line template body, corrupting nested templates.
#
# @param str [String] template markup such as "{{main|Article}}"
# @return [String] a new string without the leading "{{" and trailing "}}";
#   input lacking either delimiter is returned with that side unchanged
def template_content(str)
  str.delete_prefix("{{").delete_suffix("}}")
end
12
+
13
+ describe "REMOVE_TEMPLATES_REGEX" do
14
+ it "is loaded from template_aliases.json" do
15
+ # Verify the constant exists and is a Regexp
16
+ expect(Wp2txt::REMOVE_TEMPLATES_REGEX).to be_a(Regexp)
17
+ end
18
+
19
+ it "matches English navigation templates" do
20
+ %w[sfn efn refn reflist notelist main portal].each do |template|
21
+ content = "#{template}|content"
22
+ expect(content).to match(Wp2txt::REMOVE_TEMPLATES_REGEX), "Expected '#{template}' to match"
23
+ end
24
+ end
25
+
26
+ it "matches hatnote templates" do
27
+ %w[about redirect distinguish further details].each do |template|
28
+ content = "#{template}|content"
29
+ expect(content).to match(Wp2txt::REMOVE_TEMPLATES_REGEX), "Expected '#{template}' to match"
30
+ end
31
+ end
32
+
33
+ it "matches Japanese navigation templates" do
34
+ expect("脚注ヘルプ").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
35
+ expect("関連項目|記事").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
36
+ end
37
+
38
+ it "matches German navigation templates" do
39
+ expect("Hauptartikel|Artikel").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
40
+ expect("Siehe auch|Artikel").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
41
+ end
42
+
43
+ it "matches French navigation templates" do
44
+ expect("Article principal|Article").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
45
+ expect("Voir aussi|Article").to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
46
+ end
47
+
48
+ it "does not match citation templates" do
49
+ expect("cite web|url=...").not_to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
50
+ expect("cite book|title=...").not_to match(Wp2txt::REMOVE_TEMPLATES_REGEX)
51
+ end
52
+ end
53
+
54
+ describe "AUTHORITY_CONTROL_REGEX" do
55
+ it "is loaded from template_aliases.json" do
56
+ expect(Wp2txt::AUTHORITY_CONTROL_REGEX).to be_a(Regexp)
57
+ end
58
+
59
+ it "matches English authority control" do
60
+ expect("Authority control").to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
61
+ end
62
+
63
+ it "matches German Normdaten" do
64
+ expect("Normdaten").to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
65
+ end
66
+
67
+ it "matches identifier templates" do
68
+ %w[VIAF LCCN GND ISNI ORCID].each do |id|
69
+ expect(id).to match(Wp2txt::AUTHORITY_CONTROL_REGEX)
70
+ end
71
+ end
72
+ end
73
+
74
+ describe "CLEANUP_REMNANTS_REGEX" do
75
+ it "is loaded from template_aliases.json" do
76
+ expect(Wp2txt::CLEANUP_REMNANTS_REGEX).to be_a(Regexp)
77
+ end
78
+
79
+ it "matches layout templates" do
80
+ %w[Clear Clearleft Clearright].each do |template|
81
+ expect(template).to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
82
+ end
83
+ end
84
+
85
+ it "matches notelist variants" do
86
+ expect("notelist").to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
87
+ expect("notelist2").to match(Wp2txt::CLEANUP_REMNANTS_REGEX)
88
+ end
89
+ end
90
+
91
+ describe "correct_inline_template" do
92
+ # Ruby text templates (読み仮名 equivalent)
93
+ describe "ruby text templates" do
94
+ it "handles Japanese 読み仮名 template" do
95
+ result = correct_inline_template("{{読み仮名|漢字|かんじ}}")
96
+ expect(result).to eq("漢字(かんじ)")
97
+ end
98
+
99
+ it "handles English ruby template" do
100
+ result = correct_inline_template("{{ruby|漢字|かんじ}}")
101
+ expect(result).to include("漢字")
102
+ end
103
+ end
104
+
105
+ # Interwiki link templates (仮リンク equivalent)
106
+ describe "interwiki link templates" do
107
+ it "handles Japanese 仮リンク template" do
108
+ result = correct_inline_template("{{仮リンク|表示名|en|English Article}}")
109
+ expect(result).to eq("表示名")
110
+ end
111
+
112
+ it "handles English ill template" do
113
+ result = correct_inline_template("{{ill|Display|ja|日本語記事}}")
114
+ expect(result).to eq("Display")
115
+ end
116
+
117
+ it "handles interlanguage link template" do
118
+ result = correct_inline_template("{{interlanguage link|Display|de|Deutscher Artikel}}")
119
+ expect(result).to eq("Display")
120
+ end
121
+ end
122
+
123
+ # Mixed script templates (nihongo equivalent)
124
+ describe "mixed script templates" do
125
+ it "handles nihongo template with all parts" do
126
+ result = correct_inline_template("{{nihongo|Tokyo|東京|Tōkyō}}")
127
+ expect(result).to eq("Tokyo (東京, Tōkyō)")
128
+ end
129
+
130
+ it "handles nihongo template with only kanji" do
131
+ result = correct_inline_template("{{nihongo|Tokyo|東京}}")
132
+ expect(result).to eq("Tokyo (東京)")
133
+ end
134
+
135
+ it "handles transl template" do
136
+ result = correct_inline_template("{{transl|ja|tōkyō}}")
137
+ expect(result).to eq("tōkyō")
138
+ end
139
+ end
140
+
141
+ # Convert templates
142
+ describe "convert templates" do
143
+ it "handles convert template" do
144
+ result = correct_inline_template("{{convert|100|km}}")
145
+ expect(result).to eq("100 km")
146
+ end
147
+
148
+ it "handles Japanese 単位変換 template" do
149
+ result = correct_inline_template("{{単位変換|100|km}}")
150
+ expect(result).to eq("100 km")
151
+ end
152
+ end
153
+
154
+ # Flag templates
155
+ describe "flag templates" do
156
+ it "removes flag templates" do
157
+ result = correct_inline_template("{{flag|Japan}}")
158
+ expect(result).to eq("")
159
+ end
160
+
161
+ it "removes flagicon templates" do
162
+ result = correct_inline_template("{{flagicon|USA}}")
163
+ expect(result).to eq("")
164
+ end
165
+
166
+ it "removes country code templates" do
167
+ result = correct_inline_template("{{JPN}}")
168
+ expect(result).to eq("")
169
+ end
170
+ end
171
+
172
+ # Formatting templates
173
+ describe "formatting templates" do
174
+ it "extracts content from small template" do
175
+ result = correct_inline_template("{{small|text}}")
176
+ expect(result).to eq("text")
177
+ end
178
+
179
+ it "extracts content from nowrap template" do
180
+ result = correct_inline_template("{{nowrap|text here}}")
181
+ expect(result).to eq("text here")
182
+ end
183
+
184
+ it "handles nbsp template" do
185
+ result = correct_inline_template("before{{nbsp}}after")
186
+ expect(result).to eq("before after")
187
+ end
188
+ end
189
+ end
190
+
191
+ describe "cleanup" do
192
+ it "removes authority control remnants" do
193
+ text = "Article content\n\nAuthority control\n\n"
194
+ result = cleanup(text)
195
+ expect(result).not_to include("Authority control")
196
+ end
197
+
198
+ it "removes Normdaten remnants" do
199
+ text = "Article content\n\nNormdaten\n\n"
200
+ result = cleanup(text)
201
+ expect(result).not_to include("Normdaten")
202
+ end
203
+
204
+ it "removes cleanup remnants like Clearleft" do
205
+ text = "Content\n\nClearleft\n\nMore content"
206
+ result = cleanup(text)
207
+ expect(result).not_to include("Clearleft")
208
+ end
209
+
210
+ it "removes sister project markers" do
211
+ text = "Content\n\nCommons:\n\nWiktionary:\n\n"
212
+ result = cleanup(text)
213
+ expect(result).not_to include("Commons:")
214
+ expect(result).not_to include("Wiktionary:")
215
+ end
216
+ end
217
+ end