wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,543 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "fixtures/samples"
5
+
6
+ RSpec.describe "Integration Tests" do
7
+ include Wp2txt
8
+
9
+ # Use let blocks to avoid constant redefinition warnings
10
+ let(:japanese_article) { Wp2txt::TestSamples::JAPANESE_ARTICLE }
11
+ let(:russian_article) { Wp2txt::TestSamples::RUSSIAN_ARTICLE }
12
+ let(:arabic_article) { Wp2txt::TestSamples::ARABIC_ARTICLE }
13
+ let(:deeply_nested) { Wp2txt::TestSamples::DEEPLY_NESTED }
14
+ let(:malformed_markup) { Wp2txt::TestSamples::MALFORMED_MARKUP }
15
+ let(:table_content) { Wp2txt::TestSamples::TABLE_CONTENT }
16
+ let(:multiline_link) { Wp2txt::TestSamples::MULTILINE_LINK }
17
+
18
describe "Full article processing" do
  # Biography-style fixture covering the main block-level constructs: an
  # infobox template, a bold lead sentence, headings at two levels, bulleted
  # and numbered lists, <ref> tags, an external link and category links.
  let(:sample_article) do
    <<~WIKI
      {{Infobox person
      |name = Test Person
      |birth_date = {{Birth date|1990|1|15}}
      }}
      '''Test Person''' (born January 15, 1990) is a [[scientist]].

      == Early Life ==
      Born in [[Tokyo]], [[Japan]].

      == Career ==
      * Started at [[Company A]]
      * Moved to [[Company B]]

      === Publications ===
      # First paper (2010)
      # Second paper (2015)

      == References ==
      <ref>Citation 1</ref>
      <ref name="ref2">Citation 2</ref>

      == External Links ==
      * [http://example.com Official website]

      [[Category:Scientists]]
      [[Category:1990 births]]
    WIKI
  end

  # Article built from the fixture; evaluated lazily, once per example.
  let(:parsed) { Wp2txt::Article.new(sample_article, "Test Person") }

  it "extracts clean text with correct structure" do
    element_types = parsed.elements.map(&:first)

    # All four element kinds must be present; order is not checked.
    expect(element_types).to include(:mw_heading, :mw_paragraph, :mw_unordered, :mw_ordered)
  end

  it "extracts categories correctly" do
    expect(parsed.categories.flatten).to include("Scientists")
  end
end
66
+
67
describe "Unicode handling" do
  # The three fixtures are sample articles supplied by Wp2txt::TestSamples;
  # each example only checks that parsing yields at least one element.
  it "handles CJK characters in articles" do
    expect(Wp2txt::Article.new(japanese_article).elements).not_to be_empty
  end

  it "handles Cyrillic characters" do
    expect(Wp2txt::Article.new(russian_article).elements).not_to be_empty
  end

  it "handles Arabic characters" do
    expect(Wp2txt::Article.new(arabic_article).elements).not_to be_empty
  end

  # Regression guard for the BMP-only limitation of chrref_to_utf. A result
  # truncated to the Basic Multilingual Plane would still be validly encoded,
  # so the exact code point (U+1F600, grinning face) is pinned as well.
  it "handles emoji character references" do
    result = chrref_to_utf("&#x1F600;")

    expect(result).to eq "\u{1F600}"
    expect(result.valid_encoding?).to be true
  end
end
91
+
92
describe "Edge cases" do
  it "handles deeply nested templates without hanging" do
    # Use the monotonic clock: wall-clock time (Time.now) can jump when the
    # system clock is adjusted, which would make the duration check flaky.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    expect { Wp2txt::Article.new(deeply_nested) }.not_to raise_error
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

    expect(elapsed).to be < 5 # Should complete in under 5 seconds
  end

  it "handles malformed markup gracefully" do
    expect { Wp2txt::Article.new(malformed_markup) }.not_to raise_error
  end

  it "handles multi-line links" do
    expect { Wp2txt::Article.new(multiline_link) }.not_to raise_error
  end

  it "handles table content" do
    element_types = Wp2txt::Article.new(table_content).elements.map(&:first)

    expect(element_types).to include(:mw_table)
  end
end
114
+
115
describe "Text processing utilities" do
  describe "chrref_to_utf" do
    it "converts basic ASCII character references" do
      { "&#65;" => "A", "&#97;" => "a" }.each do |reference, expected|
        expect(chrref_to_utf(reference)).to eq expected
      end
    end

    it "converts hexadecimal character references" do
      { "&#x41;" => "A", "&#x61;" => "a" }.each do |reference, expected|
        expect(chrref_to_utf(reference)).to eq expected
      end
    end

    it "converts BMP characters" do
      # Musical note U+266A, in hexadecimal and decimal form
      ["&#x266A;", "&#9834;"].each do |reference|
        expect(chrref_to_utf(reference)).to eq "\u266A"
      end
    end

    # Regression guard for the BMP limitation: code points above U+FFFF
    # must be converted too.
    it "converts supplementary plane characters (emoji)" do
      # Grinning face U+1F600, in hexadecimal and decimal form
      ["&#x1F600;", "&#128512;"].each do |reference|
        expect(chrref_to_utf(reference)).to eq "\u{1F600}"
      end
    end
  end

  describe "convert_characters" do
    it "handles valid UTF-8 content" do
      text = "Hello 世界 こんにちは"
      converted = convert_characters(text)

      expect(converted).to eq text
      expect(converted.valid_encoding?).to be true
    end

    it "handles invalid UTF-8 sequences" do
      # \xC0 can never appear in well-formed UTF-8
      converted = convert_characters("Hello\xC0World")

      expect(converted.valid_encoding?).to be true
      expect(converted.encoding.name).to eq "UTF-8"
    end
  end

  describe "special_chr" do
    it "converts common HTML entities" do
      # &nbsp; converts to U+00A0 (non-breaking space), not regular space
      {
        "&nbsp;" => "\u00A0",
        "&lt;" => "<",
        "&gt;" => ">",
        "&amp;" => "&"
      }.each do |entity, expected|
        expect(special_chr(entity)).to eq expected
      end
    end

    it "converts Wikipedia-specific entities" do
      {
        "&ratio;" => "∶",
        "&dash;" => "–",
        "&nbso;" => " " # Common typo for &nbsp;
      }.each do |entity, expected|
        expect(special_chr(entity)).to eq expected
      end
    end

    it "converts mathematical entities" do
      {
        "&alpha;" => "α",
        "&beta;" => "β",
        "&infin;" => "∞",
        "&sum;" => "∑"
      }.each do |entity, expected|
        expect(special_chr(entity)).to eq expected
      end
    end
  end
end
181
+
182
+ describe "Process nested structure" do
183
+ describe "process_nested_structure" do
184
+ it "processes simple nested brackets" do
185
+ scanner = StringScanner.new("[[test]]")
186
+ result = process_nested_structure(scanner, "[[", "]]") { |c| "<#{c}>" }
187
+ expect(result).to eq "<test>"
188
+ end
189
+
190
+ it "processes nested templates" do
191
+ scanner = StringScanner.new("{{outer}}")
192
+ result = process_nested_structure(scanner, "{{", "}}") { |c| "[#{c}]" }
193
+ expect(result).to eq "[outer]"
194
+ end
195
+
196
+ # This test exposes the state leakage bug
197
+ it "handles consecutive calls without state leakage" do
198
+ scanner1 = StringScanner.new("[[first]]")
199
+ result1 = process_nested_structure(scanner1, "[[", "]]") { |c| "<#{c}>" }
200
+ expect(result1).to eq "<first>"
201
+
202
+ # Second call should not be affected by first call's state
203
+ scanner2 = StringScanner.new("plain text")
204
+ result2 = process_nested_structure(scanner2, "[[", "]]") { |c| "<#{c}>" }
205
+ expect(result2).to eq "plain text"
206
+ end
207
+
208
+ it "handles table brackets" do
209
+ scanner = StringScanner.new("{|content|}")
210
+ result = process_nested_structure(scanner, "{|", "|}") { |c| "[#{c}]" }
211
+ expect(result).to eq "[content]"
212
+ end
213
+ end
214
+ end
215
+
216
+ # === Additional Edge Case Tests for v2.0.0 ===
217
+
218
describe "Additional edge cases" do
  let(:special_title_article) { Wp2txt::TestSamples::SPECIAL_TITLE_ARTICLE }
  let(:very_deeply_nested) { Wp2txt::TestSamples::VERY_DEEPLY_NESTED }
  let(:mixed_content) { Wp2txt::TestSamples::MIXED_CONTENT }
  let(:complex_links) { Wp2txt::TestSamples::COMPLEX_LINKS }
  let(:consecutive_templates) { Wp2txt::TestSamples::CONSECUTIVE_TEMPLATES }
  let(:html_entities_mixed) { Wp2txt::TestSamples::HTML_ENTITIES_MIXED }
  let(:horizontal_rules) { Wp2txt::TestSamples::HORIZONTAL_RULES }
  let(:complex_headings) { Wp2txt::TestSamples::COMPLEX_HEADINGS }
  # NOTE(review): no example below reads this fixture; "handles redirect
  # variations" uses inline strings instead. Confirm whether it should.
  let(:redirect_variations) { Wp2txt::TestSamples::REDIRECT_VARIATIONS }

  it "handles special characters in article content" do
    # Parse once and reuse the result instead of building the article twice.
    article = nil
    expect { article = Wp2txt::Article.new(special_title_article) }.not_to raise_error
    expect(article.elements).not_to be_empty
  end

  it "handles very deeply nested templates (10 levels)" do
    # Monotonic clock: immune to system clock adjustments during the run.
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    expect { Wp2txt::Article.new(very_deeply_nested) }.not_to raise_error
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

    expect(elapsed).to be < 5 # Should complete quickly
  end

  it "handles mixed multilingual content with emoji" do
    article = nil
    expect { article = Wp2txt::Article.new(mixed_content) }.not_to raise_error
    expect(article.elements).not_to be_empty
  end

  it "handles complex wikilinks" do
    expect { Wp2txt::Article.new(complex_links) }.not_to raise_error
  end

  it "handles consecutive templates" do
    expect { Wp2txt::Article.new(consecutive_templates) }.not_to raise_error
  end

  it "handles HTML entities mixed with character references" do
    expect { Wp2txt::Article.new(html_entities_mixed) }.not_to raise_error
  end

  it "handles horizontal rules correctly (only 4+ hyphens)" do
    # Only checks that parsing produces elements; the hyphen-count rule
    # itself is covered by the remove_hr examples.
    article = Wp2txt::Article.new(horizontal_rules)
    expect(article.elements).not_to be_empty
  end

  it "handles complex headings with formatting" do
    element_types = Wp2txt::Article.new(complex_headings).elements.map(&:first)

    expect(element_types.count(:mw_heading)).to be >= 4
  end

  it "handles redirect variations" do
    # Upper- and lower-case spellings of the redirect keyword
    ["#REDIRECT [[Target]]", "#redirect [[lowercase]]"].each do |redirect|
      element_types = Wp2txt::Article.new(redirect).elements.map(&:first)

      expect(element_types).to include(:mw_redirect)
    end
  end
end
281
+
282
describe "Multilingual category extraction" do
  # Parses +wikitext+ as an untitled article and returns its categories as a
  # flat array.
  def categories_in(wikitext)
    Wp2txt::Article.new(wikitext).categories.flatten
  end

  it "extracts Japanese categories" do
    expect(categories_in("[[カテゴリ:テスト]][[カテゴリ:例]]")).to include("テスト")
  end

  it "extracts Chinese categories" do
    # Simplified and traditional namespace spellings; at least one must match
    expect(categories_in("[[分类:测试]][[分類:範例]]").size).to be >= 1
  end

  it "extracts German categories" do
    expect(categories_in("[[Kategorie:Test]]")).to include("Test")
  end

  it "extracts French categories" do
    expect(categories_in("[[Catégorie:Test]]")).to include("Test")
  end

  it "extracts Russian categories" do
    expect(categories_in("[[Категория:Тест]]")).to include("Тест")
  end

  it "extracts mixed language categories from one article" do
    wikitext = "[[Category:English]][[カテゴリ:日本語]][[分类:中文]]"

    expect(categories_in(wikitext).size).to be >= 2
  end
end
320
+
321
describe "Emoji and supplementary plane character handling" do
  it "converts emoji character references correctly" do
    converted = chrref_to_utf("&#x1F600;")

    expect(converted).to eq "😀"
    expect(converted.valid_encoding?).to be true
  end

  it "converts multiple emoji in text" do
    converted = chrref_to_utf("Hello &#x1F600; World &#x1F4BB;!")

    expect(converted).to include("😀")
    expect(converted).to include("💻")
  end

  it "handles CJK Extension B characters" do
    # U+20000 is 𠀀 (CJK Extension B); it must come back as one character
    converted = chrref_to_utf("&#x20000;")

    expect(converted.valid_encoding?).to be true
    expect(converted.length).to eq 1
  end

  it "handles invalid codepoints gracefully" do
    # U+110000 is beyond Unicode max, so the reference is dropped entirely
    expect(chrref_to_utf("&#x110000;")).to eq ""
  end
end
347
+
348
describe "Horizontal rule processing" do
  it "removes lines with 4+ hyphens" do
    expect(remove_hr("text\n----\nmore")).not_to include("----")
  end

  it "preserves lines with fewer than 4 hyphens" do
    # Runs of two and three hyphens are not horizontal rules
    kept = remove_hr("text\n--\nmore\n---\nend")

    expect(kept).to include("--")
    expect(kept).to include("---")
  end

  it "removes very long horizontal rules" do
    long_rule = "-" * 20

    expect(remove_hr("text\n#{long_rule}\nmore")).not_to include(long_rule)
  end
end
365
+
366
describe "Full article output format" do
  # Short article with a lead paragraph, two level-2 headings and two
  # category links.
  let(:wiki_with_categories) do
    <<~WIKI
      '''Test Person''' is a [[scientist]] who studies [[physics]].

      == Early Life ==
      Born in [[Tokyo]], [[Japan]].

      == Career ==
      Worked at [[University]].

      [[Category:Scientists]]
      [[Category:Physicists]]
    WIKI
  end

  # Parsed fixture and its paragraph elements, built lazily per example.
  let(:article) { Wp2txt::Article.new(wiki_with_categories, "Test Person") }
  let(:paragraphs) { article.elements.select { |element| element.first == :mw_paragraph } }

  it "extracts both body text and categories from articles" do
    # Body content is present
    expect(paragraphs).not_to be_empty

    # The intro sentence lives in the first paragraph
    intro = paragraphs.first.last
    expect(intro).to include("scientist")

    # Both categories are picked up
    found_categories = article.categories.flatten
    expect(found_categories).to include("Scientists")
    expect(found_categories).to include("Physicists")

    # One heading element per section
    heading_elements = article.elements.select { |element| element.first == :mw_heading }
    expect(heading_elements.size).to eq 2
  end

  it "format_wiki removes markup but preserves text content" do
    plain = format_wiki(paragraphs.first.last)

    # Link targets survive as plain words
    expect(plain).to include("scientist")
    expect(plain).to include("physics")

    # Link brackets and bold quotes are gone
    expect(plain).not_to include("[[")
    expect(plain).not_to include("]]")
    expect(plain).not_to include("'''")
  end

  it "cleanup produces valid output" do
    # Contains an empty ref pair and runs of two and three blank lines
    messy = <<~TEXT
      [[Title]]

      Some text here.

      [ref][/ref]


      More text.



      Final text.
    TEXT

    tidy = cleanup(messy)

    # Empty refs are dropped
    expect(tidy).not_to include("[ref][/ref]")

    # Runs of blank lines are collapsed
    expect(tidy).not_to include("\n\n\n")

    # Real content is kept
    expect(tidy).to include("Some text")
    expect(tidy).to include("More text")
  end
end
450
+
451
describe "Performance optimizations" do
  it "regex_cache stores dynamically created patterns for remove_inbetween" do
    # Start from an empty cache so the entry checked below is the one just added
    Wp2txt.regex_cache.clear

    # A custom tagset makes remove_inbetween build its pattern through the cache
    remove_inbetween("<tag>content</tag>", ["<tag>", "</tag>"])

    cache = Wp2txt.regex_cache
    expect(cache).not_to be_empty
    expect(cache.keys.first).to include("inbetween")
  end

  it "processes articles without creating excessive intermediate strings" do
    # 100 links, 100 bold spans and 100 plain words on one line.
    # NOTE(review): allocations are not measured here; the example only checks
    # that parsing and formatting succeed.
    large_text = "[[link]] " * 100 + "'''bold''' " * 100 + "text " * 100
    article = Wp2txt::Article.new(large_text, "Large Article")

    # Parsing yields at least one element
    expect(article.elements).not_to be_empty

    # Every paragraph formats to a validly encoded String
    article.elements.each do |type, content|
      next unless type == :mw_paragraph

      formatted = format_wiki(content)
      expect(formatted).to be_a(String)
      expect(formatted.valid_encoding?).to be true
    end
  end
end
481
+
482
describe "HTML Entity Management" do
  describe "Wp2txt.load_html_entities" do
    # Fresh load for every example.
    let(:entities) { Wp2txt.load_html_entities }

    it "loads entities from JSON files" do
      expect(entities).to be_a(Hash)
      expect(entities.size).to be > 2000
    end

    it "includes WHATWG standard entities" do
      {
        "&alpha;" => "α",
        "&AElig;" => "Æ",
        "&copy;" => "©",
        "&nbsp;" => "\u00A0"
      }.each do |entity, expected|
        expect(entities[entity]).to eq expected
      end
    end

    it "includes Wikipedia-specific entities" do
      {
        "&ratio;" => "∶",
        "&dash;" => "–",
        "&nbso;" => " "
      }.each do |entity, expected|
        expect(entities[entity]).to eq expected
      end
    end
  end

  describe "EXTRA_ENTITIES constant" do
    it "is frozen to prevent modification" do
      expect(Wp2txt::EXTRA_ENTITIES).to be_frozen
    end

    it "contains comprehensive entity coverage" do
      # WHATWG entities plus the Wikipedia-specific ones exceed 2000 in total
      expect(Wp2txt::EXTRA_ENTITIES.size).to be > 2000
    end
  end

  describe "EXTRA_ENTITIES_REGEX" do
    let(:entity_pattern) { Wp2txt::EXTRA_ENTITIES_REGEX }

    it "matches entity patterns" do
      ["&alpha;", "&ratio;", "&AElig;"].each do |entity|
        expect(entity).to match(entity_pattern)
      end
    end

    it "captures entity name in match" do
      found = "text &alpha; more".match(entity_pattern)

      expect(found).not_to be_nil
      # Group 1 holds the whole entity, ampersand and semicolon included
      expect(found[1]).to eq "&alpha;"
    end
  end

  describe "backward compatibility" do
    # The MATH_* names must keep resolving to the EXTRA_* values.
    it "MATH_ENTITIES is aliased to EXTRA_ENTITIES" do
      expect(Wp2txt::MATH_ENTITIES).to eq Wp2txt::EXTRA_ENTITIES
    end

    it "MATH_ENTITIES_REGEX is aliased to EXTRA_ENTITIES_REGEX" do
      expect(Wp2txt::MATH_ENTITIES_REGEX).to eq Wp2txt::EXTRA_ENTITIES_REGEX
    end
  end
end
543
+ end