wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/spec/cli_spec.rb ADDED
@@ -0,0 +1,876 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt/cli"
5
+ require "tmpdir"
6
+
7
+ # Load the CLI app class for testing
8
+ require_relative "../lib/wp2txt"
9
+ require_relative "../lib/wp2txt/utils"
10
+
11
+ RSpec.describe Wp2txt::CLI do
12
+ describe ".parse_options" do
13
context "with --from-category option" do
  # Only --from-category is supplied; parsing is expected to exit.
  it "requires --lang" do
    argv = ["--from-category=Test"]

    suppress_stderr do
      expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
    end
  end

  it "cannot be used with --input" do
    Dir.mktmpdir do |dir|
      # Placeholder file so that --input points at a path that exists.
      dummy_file = File.join(dir, "test.bz2")
      File.write(dummy_file, "test")
      argv = ["--from-category=Test", "--lang=en", "--input=#{dummy_file}", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "cannot be used with --articles" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "--articles=Article1", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "accepts valid options" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Japanese cities", "--lang=en", "--depth=2", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:from_category]).to eq("Japanese cities")
      expect(opts[:lang]).to eq("en")
      expect(opts[:depth]).to eq(2)
    end
  end
end
71
+
72
context "with --depth option" do
  it "defaults to 0" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:depth]).to eq(0)
    end
  end

  it "rejects negative values" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "--depth=-1", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  # stderr is deliberately not suppressed here: the warning text is the assertion.
  it "warns when depth > 3" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "--depth=4", "-o", dir]

      expect { described_class.parse_options(argv) }
        .to output(/Warning.*depth.*3/i).to_stderr
    end
  end
end
113
+
114
context "with --dry-run option" do
  # --dry-run without --from-category is expected to make parsing exit.
  it "requires --from-category" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--dry-run", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "works with --from-category" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "--dry-run", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:dry_run]).to be(true)
    end
  end
end
142
+
143
context "with --yes option" do
  # --yes without --from-category is expected to make parsing exit.
  it "requires --from-category" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--yes", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "works with --from-category" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "--yes", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:yes]).to be(true)
    end
  end
end
171
+
172
context "with --update-cache option" do
  it "defaults to false" do
    Dir.mktmpdir do |dir|
      argv = ["--from-category=Test", "--lang=en", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:update_cache]).to be(false)
    end
  end

  # Long and short spellings of the flag share one example body.
  [
    ["can be set to true", "--update-cache"],
    ["accepts short form -U", "-U"]
  ].each do |description, flag|
    it description do
      Dir.mktmpdir do |dir|
        argv = ["--from-category=Test", "--lang=en", flag, "-o", dir]
        opts = described_class.parse_options(argv)

        expect(opts[:update_cache]).to be(true)
      end
    end
  end
end
211
+
212
context "extraction mode mutual exclusion" do
  # Each entry is [example description, flags placed between --lang=en and -o].
  # Every combination listed here is expected to make parsing exit.
  [
    ["rejects --category-only with --summary-only",
     ["--category-only", "--summary-only"]],
    ["rejects --category-only with --metadata-only",
     ["--category-only", "--metadata-only"]],
    ["rejects --summary-only with --metadata-only",
     ["--summary-only", "--metadata-only"]],
    ["rejects all three extraction modes combined",
     ["--category-only", "--summary-only", "--metadata-only"]],
    ["rejects --sections with --category-only",
     ["--category-only", "--sections=Plot"]],
    ["rejects --sections with --summary-only",
     ["--summary-only", "--sections=Plot"]],
    ["rejects --sections with --metadata-only",
     ["--metadata-only", "--sections=Plot"]],
    ["rejects --section-stats with --category-only",
     ["--section-stats", "--category-only"]],
    ["rejects --section-stats with --summary-only",
     ["--section-stats", "--summary-only"]]
  ].each do |description, flags|
    it description do
      Dir.mktmpdir do |dir|
        argv = ["--lang=en", *flags, "-o", dir]

        suppress_stderr do
          expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
        end
      end
    end
  end

  it "allows single extraction mode" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--category-only", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:category_only]).to be(true)
    end
  end
end
360
+
361
context "with content filtering options" do
  # Each entry is [option key, flag used to enable it, description of the "enabled" example].
  # For every option a "default" example and an "enabled" example are generated, in that order.
  [
    [:table, "--table", "enables --table option"],
    [:pre, "-p", "enables --pre option with short form -p"],
    [:multiline, "--multiline", "enables --multiline option"]
  ].each do |key, flag, enabled_description|
    it "parses --#{key} option (defaults to false)" do
      Dir.mktmpdir do |dir|
        argv = ["--lang=en", "-o", dir]
        opts = described_class.parse_options(argv)

        expect(opts[key]).to be(false)
      end
    end

    it enabled_description do
      Dir.mktmpdir do |dir|
        argv = ["--lang=en", flag, "-o", dir]
        opts = described_class.parse_options(argv)

        expect(opts[key]).to be(true)
      end
    end
  end

  it "allows combining content filtering options" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--table", "--pre", "--multiline", "--list", "-o", dir]
      opts = described_class.parse_options(argv)

      %i[table pre multiline list].each do |key|
        expect(opts[key]).to be(true)
      end
    end
  end
end
442
+
443
context "with section extraction options" do
  # The value of --sections is compared as the raw comma-separated string.
  it "parses --sections option" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=summary,Plot,Reception", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:sections]).to eq("summary,Plot,Reception")
    end
  end

  it "parses --no-section-aliases option" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--no-section-aliases", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:no_section_aliases]).to be(true)
    end
  end

  it "parses --show-matched-sections option" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--show-matched-sections", "--format=json", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:show_matched_sections]).to be(true)
    end
  end

  it "rejects --show-matched-sections without JSON format" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--show-matched-sections", "--format=text", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "parses --section-stats option" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--section-stats", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:section_stats]).to be(true)
    end
  end

  it "rejects --section-stats with --sections" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--section-stats", "--sections=Plot", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "rejects --section-stats with --metadata-only" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--section-stats", "--metadata-only", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end
end
541
+
542
context "with --alias-file option" do
  # temp_dir holds the alias file; it is created lazily and removed after each example.
  let(:temp_dir) { Dir.mktmpdir }
  let(:alias_file) { File.join(temp_dir, "aliases.yml") }

  after { FileUtils.remove_entry(temp_dir) }

  it "parses --alias-file option" do
    File.write(alias_file, "Plot:\n - Synopsis\n")

    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--alias-file=#{alias_file}", "-o", dir]
      opts = described_class.parse_options(argv)

      expect(opts[:alias_file]).to eq(alias_file)
    end
  end

  it "rejects non-existent alias file" do
    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--alias-file=/nonexistent/file.yml", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end

  it "rejects invalid YAML alias file" do
    # The file exists, but its content is deliberately malformed.
    File.write(alias_file, "invalid: yaml: {{")

    Dir.mktmpdir do |dir|
      argv = ["--lang=en", "--sections=Plot", "--alias-file=#{alias_file}", "-o", dir]

      suppress_stderr do
        expect { described_class.parse_options(argv) }.to raise_error(SystemExit)
      end
    end
  end
end
593
+ end
594
+ end
595
+
596
+ # Test the WpApp class methods
597
# Spec-local object that mixes in Wp2txt and defines its own formatting
# methods, which the examples in this file call directly.
# NOTE(review): presumably these methods mirror the real WpApp formatting
# code — confirm they are kept in sync with it.
class TestWpApp
  include Wp2txt

  # Formats +article+ according to +config+ and returns the resulting String.
  #
  # The article's title is first replaced by its format_wiki result. Then:
  # * config[:category_only] set -> title and categories only;
  # * config[:category] set and the article has categories -> body plus a
  #   CATEGORIES line;
  # * otherwise -> body only.
  def format_article(article, config)
    article.title = format_wiki(article.title, config)

    return format_category_only(article) if config[:category_only]
    return format_full_article(article, config) unless config[:category]
    return format_full_article(article, config) if article.categories.empty?

    format_with_categories(article, config)
  end

  # Returns one line: the title, a tab, the comma-separated categories, a newline.
  def format_category_only(article)
    heading = "#{article.title}\t"
    category_line = article.categories.join(", ") << "\n"
    heading + category_line
  end

  # Returns the rendered elements followed by a "CATEGORIES: ..." line.
  # The "[[title]]" heading is prepended only when config[:title] is set.
  def format_with_categories(article, config)
    heading = "\n[[#{article.title}]]\n\n"

    body = article.elements.each_with_object(+"") do |element, buffer|
      rendered = process_element(element, config)
      buffer << rendered if rendered
    end
    body << "\nCATEGORIES: " << article.categories.join(", ") << "\n\n"

    return body unless config[:title]

    heading + body
  end

  # Returns the rendered elements, with the "[[title]]" heading prepended
  # only when config[:title] is set.
  def format_full_article(article, config)
    heading = "\n[[#{article.title}]]\n\n"

    body = article.elements.each_with_object(+"") do |element, buffer|
      rendered = process_element(element, config)
      buffer << rendered if rendered
    end

    return body unless config[:title]

    heading + body
  end

  # Renders a single [type, content] element.
  #
  # Returns the element's text with a trailing newline (two for redirects),
  # or nil when the element type is unknown or switched off in +config+.
  # Only headings and paragraphs are passed through format_wiki.
  def process_element(element, config)
    kind, text = element

    case kind
    when :mw_heading
      return nil if config[:summary_only] || !config[:heading]

      format_wiki(text, config) + "\n"
    when :mw_paragraph
      format_wiki(text, config) + "\n"
    when :mw_table, :mw_htable
      config[:table] ? text + "\n" : nil
    when :mw_unordered, :mw_ordered, :mw_definition
      config[:list] ? text + "\n" : nil
    when :mw_redirect
      config[:redirect] ? text + "\n\n" : nil
    end
  end
end
676
+
677
RSpec.describe "CLI format_article" do
  let(:app) { TestWpApp.new }

  # Lead paragraph, two sections and two category links.
  let(:sample_wiki) do
    <<~WIKI
      '''Test Article''' is about [[testing]].

      == Section One ==
      This is paragraph one.

      == Section Two ==
      This is paragraph two.

      [[Category:Testing]]
      [[Category:Examples]]
    WIKI
  end

  let(:article) { Wp2txt::Article.new(sample_wiki, "Test Article") }

  # Baseline configuration; individual examples override single keys via merge.
  let(:default_config) do
    {
      title: true,
      heading: true,
      list: false,
      table: false,
      redirect: false,
      category: true,
      category_only: false,
      summary_only: false
    }
  end

  describe "format_with_categories" do
    it "includes both body text and categories" do
      output = app.format_article(article, default_config)

      # Title
      expect(output).to include("[[Test Article]]")

      # Body text
      expect(output).to include("is about", "Section One", "paragraph one")

      # Categories
      expect(output).to include("CATEGORIES:", "Testing", "Examples")
    end

    it "places categories after body text" do
      output = app.format_article(article, default_config)

      body_position = output.index("paragraph")
      categories_position = output.index("CATEGORIES:")

      expect(categories_position).to be > body_position
    end
  end

  describe "format_category_only" do
    it "outputs only title and categories without body" do
      output = app.format_article(article, default_config.merge(category_only: true))

      # Title and categories are present
      expect(output).to include("Test Article", "Testing")

      # Body text is absent
      expect(output).not_to include("paragraph")
      expect(output).not_to include("Section One")
    end
  end

  describe "format_full_article without categories" do
    it "outputs body without categories section when article has no categories" do
      plain_article = Wp2txt::Article.new("'''Simple''' article with no categories.", "Simple")

      output = app.format_article(plain_article, default_config)

      expect(output).to include("[[Simple]]", "article with no categories")
      expect(output).not_to include("CATEGORIES:")
    end
  end

  describe "summary_only mode" do
    it "excludes headings when summary_only is true" do
      output = app.format_article(article, default_config.merge(summary_only: true))

      # The first paragraph is kept
      expect(output).to include("is about")

      # Section headings are dropped
      expect(output).not_to include("Section One")
      expect(output).not_to include("Section Two")
    end
  end

  describe "heading option" do
    it "excludes headings when heading is false" do
      output = app.format_article(article, default_config.merge(heading: false))

      # Paragraph content is kept
      expect(output).to include("is about")

      # Headings are dropped
      expect(output).not_to include("Section One")
    end
  end

  describe "redirect handling" do
    let(:redirect_wiki) { "#REDIRECT [[Target Article]]" }
    let(:redirect_article) { Wp2txt::Article.new(redirect_wiki, "Redirect Test") }

    it "excludes redirect by default" do
      output = app.format_article(redirect_article, default_config)

      expect(output).not_to include("REDIRECT")
      expect(output).not_to include("Target Article")
    end

    it "includes redirect when redirect option is true" do
      overrides = { redirect: true, category: false }
      output = app.format_article(redirect_article, default_config.merge(overrides))

      expect(output).to include("REDIRECT")
    end
  end
end
810
+
811
RSpec.describe "End-to-end article processing" do
  include Wp2txt

  # Wikitext containing a template, inline markup, headings, a list, a table,
  # a <ref> tag and two category links.
  let(:complex_article) do
    <<~WIKI
      {{Infobox
      |name = Test
      }}
      '''Complex Article''' is a [[test]] with '''bold''' and ''italic''.

      == History ==
      The history section with a [[link|display text]].

      == Features ==
      * Feature one
      * Feature two

      {| class="wikitable"
      |-
      | Cell 1 || Cell 2
      |}

      == References ==
      <ref>Citation</ref>

      [[Category:Complex]]
      [[Category:Test Articles]]
    WIKI
  end

  it "correctly processes complex articles with categories" do
    parsed = Wp2txt::Article.new(complex_article, "Complex Article")

    # Elements were produced
    expect(parsed.elements).not_to be_empty

    # Both category names are present once the category list is flattened
    category_names = parsed.categories.flatten
    expect(category_names).to include("Complex")
    expect(category_names).to include("Test Articles")

    # Heading and paragraph element types both occur
    element_types = parsed.elements.map(&:first)
    expect(element_types).to include(:mw_heading, :mw_paragraph)
  end

  it "extracts body text correctly through format_wiki" do
    parsed = Wp2txt::Article.new(complex_article, "Complex Article")

    # Collect the paragraph elements
    paragraphs = parsed.elements.select { |element| element.first == :mw_paragraph }
    expect(paragraphs).not_to be_empty

    # Run the first paragraph's content through format_wiki
    formatted = format_wiki(paragraphs.first.last)

    # The text survives...
    expect(formatted).to include("Complex Article", "test")

    # ...while the raw wiki markup does not
    expect(formatted).not_to include("'''")
    expect(formatted).not_to include("[[")
  end
end