wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,504 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require "webmock/rspec"
5
+ require "tmpdir"
6
+ require "fileutils"
7
+
8
+ RSpec.describe Wp2txt::CategoryFetcher do
9
+ let(:lang) { "en" }
10
+ let(:category) { "Japanese cities" }
11
+ let(:fetcher) { described_class.new(lang, category) }
12
+
13
+ before do
14
+ WebMock.enable!
15
+ WebMock.disable_net_connect!
16
+ end
17
+
18
+ after do
19
+ WebMock.disable!
20
+ end
21
+
22
+ describe "#initialize" do
23
+ it "accepts language and category name" do
24
+ expect(fetcher.lang).to eq "en"
25
+ expect(fetcher.category).to eq "Japanese cities"
26
+ end
27
+
28
+ it "normalizes category name by removing Category: prefix" do
29
+ f = described_class.new("en", "Category:Test Category")
30
+ expect(f.category).to eq "Test Category"
31
+ end
32
+
33
+ it "handles lowercase category prefix" do
34
+ f = described_class.new("en", "category:Another Test")
35
+ expect(f.category).to eq "Another Test"
36
+ end
37
+
38
+ it "trims whitespace from category name" do
39
+ f = described_class.new("en", " Test Category ")
40
+ expect(f.category).to eq "Test Category"
41
+ end
42
+
43
+ it "defaults max_depth to 0" do
44
+ expect(fetcher.max_depth).to eq 0
45
+ end
46
+
47
+ it "accepts custom max_depth" do
48
+ f = described_class.new("ja", "Test", max_depth: 3)
49
+ expect(f.max_depth).to eq 3
50
+ end
51
+ end
52
+
53
+ describe "#fetch_preview" do
54
+ it "returns statistics without full article list" do
55
+ stub_request(:get, /en\.wikipedia\.org/)
56
+ .to_return(
57
+ status: 200,
58
+ body: {
59
+ query: {
60
+ categorymembers: [
61
+ { ns: 0, title: "Tokyo" },
62
+ { ns: 0, title: "Osaka" },
63
+ { ns: 0, title: "Kyoto" }
64
+ ]
65
+ }
66
+ }.to_json
67
+ )
68
+
69
+ preview = fetcher.fetch_preview
70
+
71
+ expect(preview[:category]).to eq "Japanese cities"
72
+ expect(preview[:depth]).to eq 0
73
+ expect(preview[:total_articles]).to eq 3
74
+ expect(preview[:subcategories]).to be_an(Array)
75
+ end
76
+
77
+ it "includes subcategory statistics when depth > 0" do
78
+ f = described_class.new("en", "Japanese cities", max_depth: 1)
79
+
80
+ # Parent category
81
+ stub_request(:get, /cmtitle=Category:Japanese%20cities/)
82
+ .to_return(
83
+ status: 200,
84
+ body: {
85
+ query: {
86
+ categorymembers: [
87
+ { ns: 0, title: "Tokyo" },
88
+ { ns: 14, title: "Category:Cities in Kanto" }
89
+ ]
90
+ }
91
+ }.to_json
92
+ )
93
+
94
+ # Subcategory
95
+ stub_request(:get, /cmtitle=Category:Cities%20in%20Kanto/)
96
+ .to_return(
97
+ status: 200,
98
+ body: {
99
+ query: {
100
+ categorymembers: [
101
+ { ns: 0, title: "Yokohama" },
102
+ { ns: 0, title: "Chiba" }
103
+ ]
104
+ }
105
+ }.to_json
106
+ )
107
+
108
+ preview = f.fetch_preview
109
+
110
+ expect(preview[:total_articles]).to eq 3
111
+ expect(preview[:total_subcategories]).to eq 1
112
+ expect(preview[:subcategories].size).to eq 2
113
+ end
114
+ end
115
+
116
+ describe "#fetch_articles" do
117
+ it "fetches articles from single page response" do
118
+ stub_request(:get, /en\.wikipedia\.org/)
119
+ .to_return(
120
+ status: 200,
121
+ body: {
122
+ query: {
123
+ categorymembers: [
124
+ { ns: 0, title: "Tokyo" },
125
+ { ns: 0, title: "Osaka" },
126
+ { ns: 0, title: "Kyoto" }
127
+ ]
128
+ }
129
+ }.to_json
130
+ )
131
+
132
+ articles = fetcher.fetch_articles
133
+ expect(articles).to contain_exactly("Tokyo", "Osaka", "Kyoto")
134
+ end
135
+
136
+ it "handles pagination with cmcontinue token" do
137
+ # First request
138
+ stub_request(:get, /en\.wikipedia\.org/)
139
+ .with(query: hash_excluding("cmcontinue"))
140
+ .to_return(
141
+ status: 200,
142
+ body: {
143
+ query: {
144
+ categorymembers: [
145
+ { ns: 0, title: "Article1" },
146
+ { ns: 0, title: "Article2" }
147
+ ]
148
+ },
149
+ continue: { cmcontinue: "page2token" }
150
+ }.to_json
151
+ )
152
+
153
+ # Second request with continue token
154
+ stub_request(:get, /en\.wikipedia\.org/)
155
+ .with(query: hash_including("cmcontinue" => "page2token"))
156
+ .to_return(
157
+ status: 200,
158
+ body: {
159
+ query: {
160
+ categorymembers: [
161
+ { ns: 0, title: "Article3" }
162
+ ]
163
+ }
164
+ }.to_json
165
+ )
166
+
167
+ articles = fetcher.fetch_articles
168
+ expect(articles.size).to eq 3
169
+ expect(articles).to include("Article1", "Article2", "Article3")
170
+ end
171
+
172
+ it "returns unique articles when duplicates exist" do
173
+ stub_request(:get, /en\.wikipedia\.org/)
174
+ .to_return(
175
+ status: 200,
176
+ body: {
177
+ query: {
178
+ categorymembers: [
179
+ { ns: 0, title: "Tokyo" },
180
+ { ns: 0, title: "Tokyo" },
181
+ { ns: 0, title: "Osaka" }
182
+ ]
183
+ }
184
+ }.to_json
185
+ )
186
+
187
+ articles = fetcher.fetch_articles
188
+ expect(articles).to contain_exactly("Tokyo", "Osaka")
189
+ end
190
+
191
+ it "returns empty array for non-existent category" do
192
+ stub_request(:get, /en\.wikipedia\.org/)
193
+ .to_return(
194
+ status: 200,
195
+ body: { query: { categorymembers: [] } }.to_json
196
+ )
197
+
198
+ articles = fetcher.fetch_articles
199
+ expect(articles).to be_empty
200
+ end
201
+
202
+ it "handles API errors gracefully" do
203
+ stub_request(:get, /en\.wikipedia\.org/)
204
+ .to_return(status: 500)
205
+
206
+ articles = fetcher.fetch_articles
207
+ expect(articles).to be_empty
208
+ end
209
+
210
+ it "handles network timeout gracefully" do
211
+ stub_request(:get, /en\.wikipedia\.org/)
212
+ .to_timeout
213
+
214
+ articles = fetcher.fetch_articles
215
+ expect(articles).to be_empty
216
+ end
217
+
218
+ it "handles malformed JSON response gracefully" do
219
+ stub_request(:get, /en\.wikipedia\.org/)
220
+ .to_return(
221
+ status: 200,
222
+ body: "not valid json"
223
+ )
224
+
225
+ articles = fetcher.fetch_articles
226
+ expect(articles).to be_empty
227
+ end
228
+ end
229
+
230
+ describe "subcategory recursion" do
231
+ it "does not recurse into subcategories when max_depth is 0" do
232
+ stub_request(:get, /cmtitle=Category:Japanese%20cities/)
233
+ .to_return(
234
+ status: 200,
235
+ body: {
236
+ query: {
237
+ categorymembers: [
238
+ { ns: 0, title: "Tokyo" },
239
+ { ns: 14, title: "Category:Cities in Kanto" }
240
+ ]
241
+ }
242
+ }.to_json
243
+ )
244
+
245
+ articles = fetcher.fetch_articles
246
+ expect(articles).to eq ["Tokyo"]
247
+ expect(WebMock).not_to have_requested(:get, /cmtitle=Category:Cities%20in%20Kanto/)
248
+ end
249
+
250
+ it "recurses into subcategories when max_depth > 0" do
251
+ f = described_class.new("en", "Japanese cities", max_depth: 1)
252
+
253
+ # Parent category
254
+ stub_request(:get, /cmtitle=Category:Japanese%20cities/)
255
+ .to_return(
256
+ status: 200,
257
+ body: {
258
+ query: {
259
+ categorymembers: [
260
+ { ns: 0, title: "Tokyo" },
261
+ { ns: 14, title: "Category:Cities in Kanto" }
262
+ ]
263
+ }
264
+ }.to_json
265
+ )
266
+
267
+ # Subcategory
268
+ stub_request(:get, /cmtitle=Category:Cities%20in%20Kanto/)
269
+ .to_return(
270
+ status: 200,
271
+ body: {
272
+ query: {
273
+ categorymembers: [
274
+ { ns: 0, title: "Yokohama" },
275
+ { ns: 0, title: "Chiba" }
276
+ ]
277
+ }
278
+ }.to_json
279
+ )
280
+
281
+ articles = f.fetch_articles
282
+ expect(articles).to contain_exactly("Tokyo", "Yokohama", "Chiba")
283
+ end
284
+
285
+ it "prevents infinite loops with circular category references" do
286
+ f = described_class.new("en", "Category A", max_depth: 5)
287
+
288
+ stub_request(:get, /cmtitle=Category:Category%20A/)
289
+ .to_return(
290
+ status: 200,
291
+ body: {
292
+ query: {
293
+ categorymembers: [
294
+ { ns: 0, title: "Article1" },
295
+ { ns: 14, title: "Category:Category B" }
296
+ ]
297
+ }
298
+ }.to_json
299
+ )
300
+
301
+ stub_request(:get, /cmtitle=Category:Category%20B/)
302
+ .to_return(
303
+ status: 200,
304
+ body: {
305
+ query: {
306
+ categorymembers: [
307
+ { ns: 0, title: "Article2" },
308
+ { ns: 14, title: "Category:Category A" }
309
+ ]
310
+ }
311
+ }.to_json
312
+ )
313
+
314
+ # Should complete without infinite loop
315
+ expect { f.fetch_articles }.not_to raise_error
316
+ articles = f.fetch_articles
317
+ expect(articles).to include("Article1", "Article2")
318
+ end
319
+
320
+ it "respects max_depth limit" do
321
+ f = described_class.new("en", "Root", max_depth: 1)
322
+
323
+ # Root
324
+ stub_request(:get, /cmtitle=Category:Root/)
325
+ .to_return(
326
+ status: 200,
327
+ body: {
328
+ query: {
329
+ categorymembers: [
330
+ { ns: 14, title: "Category:Level1" }
331
+ ]
332
+ }
333
+ }.to_json
334
+ )
335
+
336
+ # Level 1
337
+ stub_request(:get, /cmtitle=Category:Level1/)
338
+ .to_return(
339
+ status: 200,
340
+ body: {
341
+ query: {
342
+ categorymembers: [
343
+ { ns: 0, title: "Article1" },
344
+ { ns: 14, title: "Category:Level2" }
345
+ ]
346
+ }
347
+ }.to_json
348
+ )
349
+
350
+ # Level 2 should not be called
351
+ stub_request(:get, /cmtitle=Category:Level2/)
352
+ .to_return(
353
+ status: 200,
354
+ body: {
355
+ query: { categorymembers: [{ ns: 0, title: "Article2" }] }
356
+ }.to_json
357
+ )
358
+
359
+ articles = f.fetch_articles
360
+ expect(articles).to eq ["Article1"]
361
+ expect(WebMock).not_to have_requested(:get, /cmtitle=Category:Level2/)
362
+ end
363
+ end
364
+
365
+ describe "caching" do
366
+ let(:cache_dir) { Dir.mktmpdir("wp2txt_test_") }
367
+
368
+ after do
369
+ FileUtils.rm_rf(cache_dir)
370
+ end
371
+
372
+ it "caches category members to SQLite database" do
373
+ f = described_class.new("en", "Test Category")
374
+ f.enable_cache(cache_dir)
375
+
376
+ stub_request(:get, /en\.wikipedia\.org/)
377
+ .to_return(
378
+ status: 200,
379
+ body: {
380
+ query: {
381
+ categorymembers: [{ ns: 0, title: "Cached Article" }]
382
+ }
383
+ }.to_json
384
+ )
385
+
386
+ f.fetch_articles
387
+
388
+ # SQLite cache file should exist
389
+ cache_files = Dir.glob(File.join(cache_dir, "categories_*.sqlite3"))
390
+ expect(cache_files.size).to eq 1
391
+ end
392
+
393
+ it "uses cached data on subsequent calls" do
394
+ f = described_class.new("en", "Test Category")
395
+ f.enable_cache(cache_dir)
396
+
397
+ # Pre-populate cache using CategoryCache
398
+ cache = Wp2txt::CategoryCache.new("en", cache_dir: cache_dir)
399
+ cache.save("Test Category", ["Cached Article"], [])
400
+ cache.close
401
+
402
+ # Should not make API request
403
+ articles = f.fetch_articles
404
+ expect(articles).to eq ["Cached Article"]
405
+ expect(WebMock).not_to have_requested(:get, /wikipedia\.org/)
406
+ end
407
+
408
+ it "ignores stale cache" do
409
+ f = described_class.new("en", "Test Category")
410
+ f.enable_cache(cache_dir)
411
+
412
+ # Pre-populate cache using CategoryCache
413
+ cache = Wp2txt::CategoryCache.new("en", cache_dir: cache_dir, expiry_days: 7)
414
+ cache.save("Test Category", ["Old Article"], [])
415
+
416
+ # Manually update cached_at to make it old (8 days ago)
417
+ cache.instance_variable_get(:@db).execute(
418
+ "UPDATE categories SET cached_at = ? WHERE name = ?",
419
+ [Time.now.to_i - (8 * 24 * 3600), "Test Category"]
420
+ )
421
+ cache.close
422
+
423
+ stub_request(:get, /en\.wikipedia\.org/)
424
+ .to_return(
425
+ status: 200,
426
+ body: {
427
+ query: {
428
+ categorymembers: [{ ns: 0, title: "Fresh Article" }]
429
+ }
430
+ }.to_json
431
+ )
432
+
433
+ articles = f.fetch_articles
434
+ expect(articles).to eq ["Fresh Article"]
435
+ end
436
+ end
437
+
438
+ describe "special characters in category names" do
439
+ it "handles spaces in category names" do
440
+ f = described_class.new("en", "Japanese cities")
441
+
442
+ stub_request(:get, /cmtitle=Category:Japanese%20cities/)
443
+ .to_return(
444
+ status: 200,
445
+ body: { query: { categorymembers: [{ ns: 0, title: "Tokyo" }] } }.to_json
446
+ )
447
+
448
+ articles = f.fetch_articles
449
+ expect(articles).to eq ["Tokyo"]
450
+ end
451
+
452
+ it "handles Unicode category names" do
453
+ f = described_class.new("ja", "日本の都市")
454
+
455
+ stub_request(:get, /ja\.wikipedia\.org/)
456
+ .to_return(
457
+ status: 200,
458
+ body: { query: { categorymembers: [{ ns: 0, title: "東京" }] } }.to_json
459
+ )
460
+
461
+ articles = f.fetch_articles
462
+ expect(articles).to eq ["東京"]
463
+ end
464
+
465
+ it "handles special characters in category names" do
466
+ f = described_class.new("en", "Rock & Roll")
467
+
468
+ stub_request(:get, /cmtitle=Category:Rock%20%26%20Roll/)
469
+ .to_return(
470
+ status: 200,
471
+ body: { query: { categorymembers: [{ ns: 0, title: "Elvis" }] } }.to_json
472
+ )
473
+
474
+ articles = f.fetch_articles
475
+ expect(articles).to eq ["Elvis"]
476
+ end
477
+ end
478
+
479
+ describe "multilingual support" do
480
+ it "works with Japanese Wikipedia" do
481
+ f = described_class.new("ja", "日本の都市")
482
+
483
+ stub_request(:get, /ja\.wikipedia\.org/)
484
+ .to_return(
485
+ status: 200,
486
+ body: { query: { categorymembers: [{ ns: 0, title: "東京" }] } }.to_json
487
+ )
488
+
489
+ expect(f.fetch_articles).to eq ["東京"]
490
+ end
491
+
492
+ it "works with German Wikipedia" do
493
+ f = described_class.new("de", "Stadt in Deutschland")
494
+
495
+ stub_request(:get, /de\.wikipedia\.org/)
496
+ .to_return(
497
+ status: 200,
498
+ body: { query: { categorymembers: [{ ns: 0, title: "Berlin" }] } }.to_json
499
+ )
500
+
501
+ expect(f.fetch_articles).to eq ["Berlin"]
502
+ end
503
+ end
504
+ end
@@ -0,0 +1,197 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "../lib/wp2txt/utils"
5
+
6
+ RSpec.describe "Wp2txt Cleanup" do
7
+ include Wp2txt
8
+
9
+ describe "MediaWiki magic words" do
10
+ it "removes DEFAULTSORT lines" do
11
+ input = "Some text\nDEFAULTSORT:にんちけんこかく\nMore text"
12
+ result = cleanup(input)
13
+ expect(result).not_to include("DEFAULTSORT")
14
+ expect(result).to include("Some text")
15
+ expect(result).to include("More text")
16
+ end
17
+
18
+ it "removes DISPLAYTITLE lines" do
19
+ input = "Some text\nDISPLAYTITLE:Custom Title\nMore text"
20
+ result = cleanup(input)
21
+ expect(result).not_to include("DISPLAYTITLE")
22
+ end
23
+
24
+ it "removes __NOTOC__ and similar" do
25
+ input = "Some text\n__NOTOC__\n__TOC__\n__FORCETOC__\nMore text"
26
+ result = cleanup(input)
27
+ expect(result).not_to include("__NOTOC__")
28
+ expect(result).not_to include("__TOC__")
29
+ expect(result).not_to include("__FORCETOC__")
30
+ end
31
+
32
+ it "removes __NOEDITSECTION__" do
33
+ input = "Some text\n__NOEDITSECTION__\nMore text"
34
+ result = cleanup(input)
35
+ expect(result).not_to include("__NOEDITSECTION__")
36
+ end
37
+ end
38
+
39
+ describe "Interwiki links" do
40
+ it "removes :en: prefixed links" do
41
+ input = "See :en:Force dynamics for more"
42
+ result = cleanup(input)
43
+ expect(result).to include("Force dynamics")
44
+ expect(result).not_to include(":en:")
45
+ end
46
+
47
+ it "removes :fr: prefixed links" do
48
+ input = "See :fr:Société de Linguistique de Paris"
49
+ result = cleanup(input)
50
+ expect(result).to include("Société de Linguistique de Paris")
51
+ expect(result).not_to include(":fr:")
52
+ end
53
+
54
+ it "removes :de: prefixed links" do
55
+ input = "Related: :de:Sprachwissenschaft"
56
+ result = cleanup(input)
57
+ expect(result).to include("Sprachwissenschaft")
58
+ expect(result).not_to include(":de:")
59
+ end
60
+
61
+ it "handles multiple interwiki links" do
62
+ input = "See :en:Article1 and :fr:Article2 for details"
63
+ result = cleanup(input)
64
+ expect(result).to include("Article1")
65
+ expect(result).to include("Article2")
66
+ expect(result).not_to match(/:[a-z]{2}:/)
67
+ end
68
+ end
69
+
70
+ describe "Authority control templates" do
71
+ it "removes Normdaten line" do
72
+ input = "Some text\nNormdaten\nMore text"
73
+ result = cleanup(input)
74
+ expect(result).not_to include("Normdaten")
75
+ end
76
+
77
+ it "removes Authority control line" do
78
+ input = "Some text\nAuthority control\nMore text"
79
+ result = cleanup(input)
80
+ expect(result).not_to include("Authority control")
81
+ end
82
+
83
+ it "removes Persondata line" do
84
+ input = "Some text\nPersondata\nMore text"
85
+ result = cleanup(input)
86
+ expect(result).not_to include("Persondata")
87
+ end
88
+ end
89
+
90
+ describe "Category line cleanup" do
91
+ it "removes standalone Category: lines (English)" do
92
+ input = "Text\nCategory:Linguistics\nCategory:Science\nMore"
93
+ result = cleanup(input)
94
+ expect(result).not_to match(/^Category:/)
95
+ end
96
+
97
+ it "removes standalone カテゴリ lines (Japanese)" do
98
+ input = "Text\nカテゴリ:言語学\nMore"
99
+ result = cleanup(input)
100
+ expect(result).not_to match(/^カテゴリ:/)
101
+ end
102
+
103
+ it "removes standalone Kategorie lines (German)" do
104
+ input = "Text\nKategorie:Sprachwissenschaft\nMore"
105
+ result = cleanup(input)
106
+ expect(result).not_to match(/^Kategorie:/)
107
+ end
108
+
109
+ it "removes standalone Catégorie lines (French)" do
110
+ input = "Text\nCatégorie:Linguistique\nMore"
111
+ result = cleanup(input)
112
+ expect(result).not_to match(/^Catégorie:/)
113
+ end
114
+
115
+ it "removes Category lines with asterisk prefix" do
116
+ input = "Text\n*\nCategory:Main\nMore"
117
+ result = cleanup(input)
118
+ expect(result).not_to match(/^Category:/)
119
+ end
120
+
121
+ it "preserves CATEGORIES summary line" do
122
+ input = "Text\nCATEGORIES: Foo, Bar, Baz\nMore"
123
+ result = cleanup(input)
124
+ expect(result).to include("CATEGORIES: Foo, Bar, Baz")
125
+ end
126
+ end
127
+
128
+ describe "Template artifact cleanup" do
129
+ it "removes stub template markers" do
130
+ # Common stub patterns across languages
131
+ input = "Text\n節スタブ\nMore"
132
+ result = cleanup(input)
133
+ # This might be Japanese-specific, but the pattern should be general
134
+ end
135
+
136
+ it "removes reference help markers" do
137
+ input = "Text\n脚注ヘルプ\nMore"
138
+ result = cleanup(input)
139
+ # Japanese-specific, need general approach
140
+ end
141
+
142
+ it "removes lines that are just asterisk + single word" do
143
+ input = "Text\n*和書\n*洋書\nMore"
144
+ result = cleanup(input)
145
+ # Pattern: ^\*[^\s\*]+$ (single word after asterisk)
146
+ end
147
+
148
+ it "removes Wikibooks/Wikiversity markers" do
149
+ input = "Text\nWikibooks\nSchool:言語学\nMore"
150
+ result = cleanup(input)
151
+ expect(result).not_to match(/^Wikibooks$/)
152
+ expect(result).not_to match(/^School:/)
153
+ end
154
+
155
+ it "removes commons/wikimedia markers" do
156
+ input = "Text\nCommons\nWikimedia Commons\nMore"
157
+ result = cleanup(input)
158
+ expect(result).not_to match(/^Commons$/)
159
+ end
160
+ end
161
+
162
+ describe "Combined cleanup" do
163
+ it "cleans up a realistic Wikipedia article footer" do
164
+ input = <<~TEXT
165
+ This is the main content.
166
+
167
+ == References ==
168
+ 脚注ヘルプ
169
+
170
+ == External links ==
171
+ Wikibooks
172
+ School:言語学
173
+
174
+ Normdaten
175
+ DEFAULTSORT:けんこかく
176
+ Category:言語学
177
+ Category:人文科学
178
+ *
179
+
180
+ CATEGORIES: 言語学, 人文科学
181
+ TEXT
182
+
183
+ result = cleanup(input)
184
+
185
+ expect(result).to include("This is the main content")
186
+ expect(result).to include("== References ==")
187
+ expect(result).to include("== External links ==")
188
+ expect(result).to include("CATEGORIES: 言語学, 人文科学")
189
+
190
+ expect(result).not_to include("Normdaten")
191
+ expect(result).not_to include("DEFAULTSORT")
192
+ expect(result).not_to match(/^Category:/)
193
+ expect(result).not_to match(/^Wikibooks$/)
194
+ expect(result).not_to match(/^School:/)
195
+ end
196
+ end
197
+ end