wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,678 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt"
5
+ require_relative "../lib/wp2txt/utils"
6
+
7
# Simulates the CLI application so that output options can be tested
# without running the executable.
#
# The class renders an article to a String according to a configuration Hash
# keyed by option name (see DEFAULT_CONFIG). Wiki markup is cleaned with
# +format_wiki+, which is not defined here and is presumably supplied by the
# included Wp2txt module.
class CLITestApp
  include Wp2txt

  # Default configuration matching bin/wp2txt defaults
  DEFAULT_CONFIG = {
    title: true,
    heading: true,
    list: false,
    table: false,
    redirect: false,
    category: true,
    category_only: false,
    summary_only: false,
    marker: true,
    extract_citations: false
  }.freeze

  # Returns an unfrozen copy of DEFAULT_CONFIG that callers can merge
  # overrides into.
  def self.default_config
    DEFAULT_CONFIG.dup
  end

  # Renders +article+ according to +config+ and returns the resulting String.
  #
  # The article's title is replaced by its +format_wiki+ result first, so the
  # article object is mutated. Layout selection, in priority order:
  # 1. +:category_only+ set          -> title/categories line only
  # 2. +:category+ set and the article has categories -> body + CATEGORIES
  # 3. otherwise                     -> body only
  def format_article(article, config)
    article.title = format_wiki(article.title, config)

    if config[:category_only]
      format_category_only(article)
    elsif config[:category] && !article.categories.empty?
      format_with_categories(article, config)
    else
      format_full_article(article, config)
    end
  end

  # Returns "<title>\t<cat1>, <cat2>...\n" for +article+.
  def format_category_only(article)
    category_list = article.categories.join(", ")
    "#{article.title}\t#{category_list}\n"
  end

  # Returns the rendered body followed by a "CATEGORIES: ..." trailer,
  # prefixed with the title header when +config[:title]+ is truthy.
  def format_with_categories(article, config)
    contents = render_body(article, config)
    contents << "\nCATEGORIES: " << article.categories.join(", ") << "\n\n"
    with_optional_title(article, contents, config)
  end

  # Returns the rendered body, prefixed with the title header when
  # +config[:title]+ is truthy.
  def format_full_article(article, config)
    with_optional_title(article, render_body(article, config), config)
  end

  # Converts one +[type, content]+ element into an output line.
  #
  # Returns a String, or +nil+ when the element is suppressed by +config+ or
  # its type is not rendered at all (isolated templates, isolated tags and any
  # type not listed below).
  def process_element(element, config)
    type, content = element
    case type
    when :mw_heading
      # Headings are dropped both in summary-only mode and when disabled.
      return nil if config[:summary_only] || !config[:heading]

      format_wiki(content, config) + "\n"
    when :mw_paragraph
      format_wiki(content, config) + "\n"
    when :mw_table, :mw_htable
      # Tables are emitted raw (no format_wiki pass).
      content + "\n" if config[:table]
    when :mw_unordered, :mw_ordered, :mw_definition
      # List items are emitted raw (no format_wiki pass).
      content + "\n" if config[:list]
    when :mw_redirect
      content + "\n\n" if config[:redirect]
    end
    # No matching branch (or a falsy option above) evaluates to nil.
  end

  private

  # Concatenates the non-nil results of #process_element for every element of
  # +article+ into a new mutable String.
  def render_body(article, config)
    article.elements.each_with_object(+"") do |element, body|
      line = process_element(element, config)
      body << line if line
    end
  end

  # Prepends the "[[title]]" header to +contents+ when +config[:title]+ is
  # truthy; otherwise returns +contents+ unchanged.
  def with_optional_title(article, contents, config)
    return contents unless config[:title]

    "\n[[#{article.title}]]\n\n" + contents
  end
end
107
+
108
# Exercises each output option of CLITestApp against a shared sample article,
# first one option at a time and then in combination.
RSpec.describe "CLI Options" do
  let(:app) { CLITestApp.new }

  # Standard test article with various elements
  let(:full_article_wiki) do
    <<~WIKI
      '''Test Article''' is about [[testing]] software.

      == Introduction ==
      This is the introduction paragraph.

      == Features ==
      The features section.

      * Feature one
      * Feature two
      # Step one
      # Step two

      {| class="wikitable"
      |-
      | Cell 1 || Cell 2
      |}

      == See Also ==
      Related content here.

      [[Category:Software]]
      [[Category:Testing]]
    WIKI
  end

  let(:redirect_wiki) { "#REDIRECT [[Target Page]]" }

  let(:article) { Wp2txt::Article.new(full_article_wiki, "Test Article") }
  let(:redirect_article) { Wp2txt::Article.new(redirect_wiki, "Redirect Source") }

  # Formats +target+ with the default configuration merged with +overrides+
  # and returns the resulting string.
  def output_for(target, **overrides)
    app.format_article(target, CLITestApp.default_config.merge(overrides))
  end

  describe "Default configuration values" do
    let(:defaults) { CLITestApp.default_config }

    # One example per option, named "<option> defaults to <value>".
    {
      title: true,
      heading: true,
      list: false,
      table: false,
      redirect: false,
      category: true,
      category_only: false,
      summary_only: false,
      marker: true,
      extract_citations: false
    }.each do |option, expected|
      it "#{option} defaults to #{expected}" do
        expect(defaults[option]).to be expected
      end
    end
  end

  describe "--title / -t option" do
    context "when title is true (default)" do
      it "includes article title in output" do
        expect(output_for(article)).to include("[[Test Article]]")
      end
    end

    context "when title is false" do
      let(:result) { output_for(article, title: false) }

      it "excludes article title from output" do
        expect(result).not_to include("[[Test Article]]")
      end

      it "still includes body content" do
        expect(result).to include("testing")
      end
    end
  end

  describe "--heading / -d option" do
    context "when heading is true (default)" do
      it "includes section headings in output" do
        expect(output_for(article)).to include("Introduction", "Features", "See Also")
      end
    end

    context "when heading is false" do
      let(:result) { output_for(article, heading: false) }

      it "excludes section headings from output" do
        expect(result).not_to include("Introduction", "Features")
      end

      it "still includes paragraph content" do
        expect(result).to include("introduction paragraph")
      end
    end
  end

  describe "--list / -l option" do
    context "when list is false (default)" do
      it "excludes list items from output" do
        expect(output_for(article)).not_to include("Feature one", "Step one")
      end
    end

    context "when list is true" do
      let(:result) { output_for(article, list: true) }

      it "includes unordered list items" do
        expect(result).to include("Feature one", "Feature two")
      end

      it "includes ordered list items" do
        expect(result).to include("Step one", "Step two")
      end
    end
  end

  describe "--table option" do
    context "when table is false (default)" do
      it "excludes table content from output" do
        # Table raw content should not appear
        expect(output_for(article)).not_to include("Cell 1")
      end
    end

    context "when table is true" do
      it "includes table content in output" do
        expect(output_for(article, table: true)).to include("Cell 1")
      end
    end
  end

  describe "--redirect / -e option" do
    context "when redirect is false (default)" do
      it "excludes redirect information" do
        result = output_for(redirect_article, category: false)

        expect(result).not_to include("REDIRECT", "Target Page")
      end
    end

    context "when redirect is true" do
      it "includes redirect information" do
        result = output_for(redirect_article, redirect: true, category: false)

        expect(result).to include("REDIRECT")
      end
    end
  end

  describe "--category / -a option" do
    context "when category is true (default)" do
      let(:result) { output_for(article) }

      it "includes categories in output" do
        expect(result).to include("CATEGORIES:", "Software", "Testing")
      end

      it "also includes body text" do
        expect(result).to include("testing", "introduction paragraph")
      end
    end

    context "when category is false" do
      let(:result) { output_for(article, category: false) }

      it "excludes categories section from output" do
        expect(result).not_to include("CATEGORIES:")
      end

      it "still includes body text" do
        expect(result).to include("testing")
      end
    end
  end

  describe "--category-only / -g option" do
    context "when category_only is false (default)" do
      it "includes full article content" do
        expect(output_for(article)).to include("testing", "Introduction")
      end
    end

    context "when category_only is true" do
      let(:result) { output_for(article, category_only: true) }

      it "outputs only title and categories" do
        expect(result).to include("Test Article", "Software", "Testing")
      end

      it "excludes body text" do
        expect(result).not_to include("introduction paragraph", "Features")
      end

      it "uses tab-separated format" do
        expect(result).to include("\t")
      end
    end
  end

  describe "--summary-only / -s option" do
    context "when summary_only is false (default)" do
      it "includes all headings" do
        expect(output_for(article)).to include("Introduction", "Features", "See Also")
      end
    end

    context "when summary_only is true" do
      let(:result) { output_for(article, summary_only: true) }

      it "excludes section headings" do
        expect(result).not_to include("Introduction", "Features")
      end

      it "includes first paragraph (summary)" do
        expect(result).to include("testing")
      end

      it "includes categories if category option is true" do
        expect(result).to include("CATEGORIES:")
      end
    end
  end

  describe "Option combinations" do
    it "category + title both false outputs only body" do
      result = output_for(article, category: false, title: false)

      expect(result).not_to include("[[Test Article]]", "CATEGORIES:")
      expect(result).to include("testing")
    end

    it "summary_only + category outputs summary with categories" do
      result = output_for(article, summary_only: true, category: true)

      expect(result).to include("testing", "CATEGORIES:")
      expect(result).not_to include("Introduction")
    end

    it "heading false + list true shows lists but not headings" do
      result = output_for(article, heading: false, list: true)

      expect(result).not_to include("Introduction")
      expect(result).to include("Feature one")
    end

    it "all content options enabled shows everything" do
      result = output_for(article, heading: true, list: true, table: true, redirect: true)

      expect(result).to include("Introduction", "Feature one", "Cell 1")
    end

    it "category_only takes precedence over other content options" do
      result = output_for(article, category_only: true, heading: true, list: true)

      # Should only have title and categories
      expect(result).to include("Test Article", "Software")
      expect(result).not_to include("Introduction", "Feature one")
    end
  end

  describe "Edge cases" do
    it "handles article with no categories when category is true" do
      plain_article = Wp2txt::Article.new("'''Simple''' article without categories.", "Simple")

      result = output_for(plain_article)

      # Should use format_full_article (no CATEGORIES section)
      expect(result).to include("[[Simple]]", "article without categories")
      expect(result).not_to include("CATEGORIES:")
    end

    it "handles empty article" do
      blank_article = Wp2txt::Article.new("", "Empty")

      expect(output_for(blank_article, category: false)).to include("[[Empty]]")
    end

    it "handles article with only categories" do
      tagged_article = Wp2txt::Article.new("[[Category:Test]][[Category:Example]]", "Categories Only")

      expect(output_for(tagged_article)).to include("CATEGORIES:", "Test")
    end

    it "handles article with deeply nested markup" do
      markup = "{{outer|{{inner|{{deep|content}}}}}} and [[link|[[nested]]]]"
      nested_article = Wp2txt::Article.new(markup, "Nested")

      # Should not raise error
      expect { output_for(nested_article, category: false) }.not_to raise_error
    end

    it "handles article with special characters in title" do
      titled_article = Wp2txt::Article.new("Content here.", "C++ Programming")

      expect(output_for(titled_article, category: false)).to include("C++ Programming")
    end

    it "handles Unicode content correctly" do
      markup = "'''日本語記事''' は [[テスト]] です。\n[[カテゴリ:日本語]]"
      japanese_article = Wp2txt::Article.new(markup, "日本語")

      result = output_for(japanese_article)

      expect(result).to include("日本語")
      expect(result.valid_encoding?).to be true
    end
  end

  describe "--extract-citations option" do
    include Wp2txt

    # Test with inline citations in paragraph text
    let(:inline_citation_wiki) do
      <<~WIKI
        '''Test Article''' is about testing.

        The main source is {{cite book |last=Smith |first=John |title=The Book Title |year=2020}}.

        Another reference: {{cite web |title=Web Page |url=http://example.com |date=2021-05-15}}.
      WIKI
    end

    let(:inline_citation_article) { Wp2txt::Article.new(inline_citation_wiki, "Test Article") }

    context "when extract_citations is false (default)" do
      it "removes citations from text" do
        result = output_for(inline_citation_article)

        expect(result).not_to include("Smith", "The Book Title")
        expect(result).to include("The main source is")
      end
    end

    context "when extract_citations is true" do
      let(:result) { output_for(inline_citation_article, extract_citations: true) }

      it "extracts formatted citations" do
        expect(result).to include("Smith", "The Book Title", "2020")
      end

      it "extracts multiple citations" do
        expect(result).to include("Smith", "Web Page")
      end
    end

    # Test format_wiki directly for [REFERENCES] marker
    describe "format_wiki with references" do
      it "replaces {{reflist}} with [REFERENCES] marker by default" do
        expect(format_wiki("Text\n{{reflist}}")).to include("[REFERENCES]")
      end

      it "replaces {{refbegin}}...{{refend}} with [REFERENCES] marker by default" do
        expect(format_wiki("{{refbegin}}\n* Citation\n{{refend}}")).to include("[REFERENCES]")
      end

      it "extracts citations when extract_citations is true" do
        markup = "{{cite book |last=Author |title=Book |year=2020}}"

        expect(format_wiki(markup, extract_citations: true)).to include("Author", "Book")
      end
    end
  end
end
619
+
620
# Checks that Wp2txt::Article tags a minimal wikitext sample with each
# expected element type.
RSpec.describe "Article element type coverage" do
  include Wp2txt

  describe "All element types are parsed correctly" do
    # Element type => smallest wikitext snippet expected to produce it.
    samples = {
      mw_heading: "== Heading ==",
      mw_paragraph: "Simple paragraph text.",
      mw_unordered: "* List item",
      mw_ordered: "# Numbered item",
      mw_definition: "; Term\n: Definition",
      mw_table: "{| class=\"wikitable\"\n|-\n| Cell\n|}",
      mw_redirect: "#REDIRECT [[Target]]",
      mw_blank: "Text\n\nMore text",
      mw_isolated_template: "{{Stub}}"
    }

    samples.each do |element_type, wikitext|
      it "detects :#{element_type}" do
        parsed = Wp2txt::Article.new(wikitext, "Test")
        detected = parsed.elements.map(&:first)

        expect(detected).to include(element_type)
      end
    end
  end
end