wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,402 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+ require_relative "fixtures/samples"
5
+
6
+ RSpec.describe Wp2txt::Article do
7
+ # Use let blocks for lazy evaluation to avoid triggering bugs at load time
8
+ let(:english_article) { Wp2txt::TestSamples::ENGLISH_ARTICLE }
9
+ let(:japanese_article) { Wp2txt::TestSamples::JAPANESE_ARTICLE }
10
+ let(:german_article) { Wp2txt::TestSamples::GERMAN_ARTICLE }
11
+ let(:french_article) { Wp2txt::TestSamples::FRENCH_ARTICLE }
12
+ let(:chinese_article) { Wp2txt::TestSamples::CHINESE_ARTICLE }
13
+ let(:russian_article) { Wp2txt::TestSamples::RUSSIAN_ARTICLE }
14
+ let(:korean_article) { Wp2txt::TestSamples::KOREAN_ARTICLE }
15
+ let(:arabic_article) { Wp2txt::TestSamples::ARABIC_ARTICLE }
16
+ let(:emoji_content) { Wp2txt::TestSamples::EMOJI_CONTENT }
17
+ let(:deeply_nested) { Wp2txt::TestSamples::DEEPLY_NESTED }
18
+ let(:malformed_markup) { Wp2txt::TestSamples::MALFORMED_MARKUP }
19
+ let(:nested_templates) { Wp2txt::TestSamples::NESTED_TEMPLATES }
20
+ let(:table_content) { Wp2txt::TestSamples::TABLE_CONTENT }
21
+ let(:reference_content) { Wp2txt::TestSamples::REFERENCE_CONTENT }
22
+ let(:multiline_link) { Wp2txt::TestSamples::MULTILINE_LINK }
23
+
24
+ describe "#parse" do
25
+ it "classifies headings correctly" do
26
+ article = Wp2txt::Article.new("== Heading ==\nParagraph text")
27
+ types = article.elements.map(&:first)
28
+ expect(types).to include(:mw_heading)
29
+ expect(types).to include(:mw_paragraph)
30
+ end
31
+
32
+ it "classifies unordered lists" do
33
+ article = Wp2txt::Article.new("* Item 1\n* Item 2\n* Item 3")
34
+ types = article.elements.map(&:first)
35
+ expect(types.count(:mw_unordered)).to eq 3
36
+ end
37
+
38
+ it "classifies ordered lists" do
39
+ article = Wp2txt::Article.new("# First\n# Second\n# Third")
40
+ types = article.elements.map(&:first)
41
+ expect(types.count(:mw_ordered)).to eq 3
42
+ end
43
+
44
+ it "classifies definition lists" do
45
+ article = Wp2txt::Article.new("; Term\n: Definition")
46
+ types = article.elements.map(&:first)
47
+ expect(types).to include(:mw_definition)
48
+ end
49
+
50
+ it "classifies blank lines" do
51
+ article = Wp2txt::Article.new("Text\n\nMore text")
52
+ types = article.elements.map(&:first)
53
+ expect(types).to include(:mw_blank)
54
+ end
55
+
56
+ it "handles multi-line templates" do
57
+ article = Wp2txt::Article.new(nested_templates)
58
+ types = article.elements.map(&:first)
59
+ expect(types).to include(:mw_ml_template)
60
+ end
61
+
62
+ it "handles table content" do
63
+ article = Wp2txt::Article.new(table_content)
64
+ types = article.elements.map(&:first)
65
+ expect(types).to include(:mw_table)
66
+ end
67
+
68
+ it "detects redirects" do
69
+ article = Wp2txt::Article.new("#REDIRECT [[Other Page]]")
70
+ types = article.elements.map(&:first)
71
+ expect(types).to include(:mw_redirect)
72
+ end
73
+ end
74
+
75
+ describe "#categories" do
76
+ it "extracts English categories" do
77
+ article = Wp2txt::Article.new(english_article)
78
+ categories = article.categories.flatten
79
+ expect(categories).to include("Tests")
80
+ end
81
+
82
+ # Disabled variant that reads the Japanese category from the shared fixture.
83
+ # Inline multilingual category checks are in the "multilingual categories" group below.
84
+ # it "extracts Japanese categories" do
85
+ # article = Wp2txt::Article.new(japanese_article)
86
+ # categories = article.categories.flatten
87
+ # expect(categories).to include("テスト")
88
+ # end
89
+
90
+ it "extracts multiple categories from one article" do
91
+ article = Wp2txt::Article.new(english_article)
92
+ categories = article.categories.flatten
93
+ expect(categories.size).to be >= 1
94
+ end
95
+ end
96
+
97
+ describe "edge cases" do
98
+ it "handles malformed markup gracefully" do
99
+ # This test exposes the exit bug in convert_characters
100
+ expect { Wp2txt::Article.new(malformed_markup) }.not_to raise_error
101
+ end
102
+
103
+ it "handles deeply nested templates" do
104
+ # This test exposes the exit bug in convert_characters
105
+ expect { Wp2txt::Article.new(deeply_nested) }.not_to raise_error
106
+ end
107
+
108
+ it "handles empty input" do
109
+ article = Wp2txt::Article.new("")
110
+ expect(article.elements).to be_empty
111
+ end
112
+
113
+ it "handles whitespace-only input" do
114
+ article = Wp2txt::Article.new(" \n \n ")
115
+ expect { article }.not_to raise_error
116
+ end
117
+ end
118
+
119
+ describe "title handling" do
120
+ it "stores the article title" do
121
+ article = Wp2txt::Article.new("Content", "Test Title")
122
+ expect(article.title).to eq "Test Title"
123
+ end
124
+
125
+ it "strips whitespace from title" do
126
+ article = Wp2txt::Article.new("Content", " Title ")
127
+ expect(article.title).to eq "Title"
128
+ end
129
+ end
130
+
131
+ describe "multilingual content" do
132
+ it "handles Japanese content" do
133
+ expect { Wp2txt::Article.new(japanese_article) }.not_to raise_error
134
+ end
135
+
136
+ it "handles German content" do
137
+ expect { Wp2txt::Article.new(german_article) }.not_to raise_error
138
+ end
139
+
140
+ it "handles Chinese content" do
141
+ expect { Wp2txt::Article.new(chinese_article) }.not_to raise_error
142
+ end
143
+
144
+ it "handles Russian content" do
145
+ expect { Wp2txt::Article.new(russian_article) }.not_to raise_error
146
+ end
147
+
148
+ it "handles Korean content" do
149
+ expect { Wp2txt::Article.new(korean_article) }.not_to raise_error
150
+ end
151
+
152
+ it "handles Arabic content" do
153
+ expect { Wp2txt::Article.new(arabic_article) }.not_to raise_error
154
+ end
155
+ end
156
+
157
+ describe "multiline structures" do
158
+ it "handles multiline templates" do
159
+ wiki = "{{Infobox\n|name = Test\n|value = 123\n}}"
160
+ article = Wp2txt::Article.new(wiki)
161
+ types = article.elements.map(&:first)
162
+ expect(types).to include(:mw_ml_template)
163
+ end
164
+
165
+ it "extracts content after closing }} on same line" do
166
+ wiki = "{{Template\n|param = value\n}}Following paragraph text."
167
+ article = Wp2txt::Article.new(wiki)
168
+ types = article.elements.map(&:first)
169
+ expect(types).to include(:mw_ml_template)
170
+ expect(types).to include(:mw_paragraph)
171
+ # Check that the paragraph content is extracted
172
+ paragraph = article.elements.find { |t, _| t == :mw_paragraph }
173
+ expect(paragraph.last).to include("Following paragraph text")
174
+ end
175
+
176
+ it "handles nested braces in multiline templates" do
177
+ wiki = "{{Outer\n|inner = {{nested}}\n}}After template."
178
+ article = Wp2txt::Article.new(wiki)
179
+ types = article.elements.map(&:first)
180
+ expect(types).to include(:mw_ml_template)
181
+ paragraph = article.elements.find { |t, _| t == :mw_paragraph }
182
+ expect(paragraph.last).to include("After template")
183
+ end
184
+
185
+ it "handles multiline links" do
186
+ wiki = "[[File:Image.jpg|thumb|Description\nthat spans\nmultiple lines]]"
187
+ article = Wp2txt::Article.new(wiki)
188
+ types = article.elements.map(&:first)
189
+ expect(types).to include(:mw_ml_link)
190
+ end
191
+
192
+ it "handles source code blocks" do
193
+ wiki = "<source lang=\"ruby\">\ndef hello\n puts 'world'\nend\n</source>"
194
+ article = Wp2txt::Article.new(wiki)
195
+ types = article.elements.map(&:first)
196
+ expect(types).to include(:mw_source)
197
+ end
198
+
199
+ it "handles multiline source blocks starting mid-content" do
200
+ # Source block that starts in middle of content
201
+ wiki = "text before\n<source lang=\"ruby\">\ncode here\n</source>\ntext after"
202
+ article = Wp2txt::Article.new(wiki)
203
+ types = article.elements.map(&:first)
204
+ expect(types).to include(:mw_source)
205
+ end
206
+
207
+ it "handles math blocks" do
208
+ wiki = "<math>\nx = \\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}\n</math>"
209
+ article = Wp2txt::Article.new(wiki)
210
+ types = article.elements.map(&:first)
211
+ expect(types).to include(:mw_math)
212
+ end
213
+
214
+ it "handles single-line math blocks with content" do
215
+ wiki = "formula: <math>E = mc^2</math> explained"
216
+ article = Wp2txt::Article.new(wiki)
217
+ types = article.elements.map(&:first)
218
+ expect(types).to include(:mw_math)
219
+ end
220
+
221
+ it "handles inputbox blocks" do
222
+ wiki = "<inputbox>\ntype=search\nwidth=30\n</inputbox>"
223
+ article = Wp2txt::Article.new(wiki)
224
+ types = article.elements.map(&:first)
225
+ expect(types).to include(:mw_inputbox)
226
+ end
227
+
228
+ it "handles single-line inputbox with content" do
229
+ wiki = "search: <inputbox>type=search</inputbox> here"
230
+ article = Wp2txt::Article.new(wiki)
231
+ types = article.elements.map(&:first)
232
+ expect(types).to include(:mw_inputbox)
233
+ end
234
+
235
+ it "handles HTML tables" do
236
+ wiki = "<table>\n<tr><td>Cell</td></tr>\n</table>"
237
+ article = Wp2txt::Article.new(wiki)
238
+ types = article.elements.map(&:first)
239
+ expect(types).to include(:mw_htable)
240
+ end
241
+
242
+ it "handles single-line HTML tables with content" do
243
+ wiki = "data: <table><tr><td>x</td></tr></table> end"
244
+ article = Wp2txt::Article.new(wiki)
245
+ types = article.elements.map(&:first)
246
+ expect(types).to include(:mw_htable)
247
+ end
248
+ end
249
+
250
+ describe "pre-formatted text" do
251
+ it "classifies pre-formatted text" do
252
+ article = Wp2txt::Article.new(" preformatted text")
253
+ types = article.elements.map(&:first)
254
+ expect(types).to include(:mw_pre)
255
+ end
256
+ end
257
+
258
+ describe "strip_tmarker option" do
259
+ it "strips list markers when enabled" do
260
+ article = Wp2txt::Article.new("* List item", "", true)
261
+ content = article.elements.find { |e| e.first == :mw_unordered }&.last
262
+ expect(content).not_to start_with("*")
263
+ end
264
+
265
+ it "preserves list markers when disabled" do
266
+ article = Wp2txt::Article.new("* List item", "", false)
267
+ content = article.elements.find { |e| e.first == :mw_unordered }&.last
268
+ expect(content).to start_with("*")
269
+ end
270
+
271
+ it "strips definition markers when enabled" do
272
+ article = Wp2txt::Article.new(": Definition", "", true)
273
+ content = article.elements.find { |e| e.first == :mw_definition }&.last
274
+ expect(content).not_to start_with(":")
275
+ end
276
+
277
+ it "strips pre markers when enabled" do
278
+ article = Wp2txt::Article.new(" preformatted", "", true)
279
+ content = article.elements.find { |e| e.first == :mw_pre }&.last
280
+ # Pre marker is the leading space; when stripped, content should not have it
281
+ expect(content&.strip).to eq("preformatted")
282
+ end
283
+
284
+ it "strips ordered list markers when enabled" do
285
+ article = Wp2txt::Article.new("# Numbered", "", true)
286
+ content = article.elements.find { |e| e.first == :mw_ordered }&.last
287
+ expect(content).not_to start_with("#")
288
+ end
289
+ end
290
+
291
+ describe "isolated elements" do
292
+ it "detects isolated templates" do
293
+ article = Wp2txt::Article.new("{{stub}}")
294
+ types = article.elements.map(&:first)
295
+ expect(types).to include(:mw_isolated_template)
296
+ end
297
+
298
+ it "detects isolated tags with content" do
299
+ # ISOLATED_TAG_REGEX matches tags with content between them
300
+ # Using <span> which is not removed by remove_html
301
+ article = Wp2txt::Article.new("<span>content</span>")
302
+ types = article.elements.map(&:first)
303
+ expect(types).to include(:mw_isolated_tag)
304
+ end
305
+ end
306
+
307
+ describe "link handling" do
308
+ it "detects standalone link lines" do
309
+ article = Wp2txt::Article.new("[[Link Target]]")
310
+ types = article.elements.map(&:first)
311
+ expect(types).to include(:mw_link)
312
+ end
313
+ end
314
+
315
+ describe "multilingual redirects" do
316
+ it "detects German redirect" do
317
+ article = Wp2txt::Article.new("#WEITERLEITUNG [[Ziel]]")
318
+ types = article.elements.map(&:first)
319
+ expect(types).to include(:mw_redirect)
320
+ end
321
+
322
+ it "detects French redirect" do
323
+ article = Wp2txt::Article.new("#REDIRECTION [[Cible]]")
324
+ types = article.elements.map(&:first)
325
+ expect(types).to include(:mw_redirect)
326
+ end
327
+
328
+ it "detects Japanese redirect" do
329
+ article = Wp2txt::Article.new("#転送 [[転送先]]")
330
+ types = article.elements.map(&:first)
331
+ expect(types).to include(:mw_redirect)
332
+ end
333
+
334
+ it "detects Russian redirect" do
335
+ article = Wp2txt::Article.new("#ПЕРЕНАПРАВЛЕНИЕ [[Цель]]")
336
+ types = article.elements.map(&:first)
337
+ expect(types).to include(:mw_redirect)
338
+ end
339
+
340
+ it "detects Chinese redirect" do
341
+ article = Wp2txt::Article.new("#重定向 [[目标]]")
342
+ types = article.elements.map(&:first)
343
+ expect(types).to include(:mw_redirect)
344
+ end
345
+
346
+ it "detects Japanese alternative redirect (リダイレクト)" do
347
+ article = Wp2txt::Article.new("#リダイレクト [[転送先]]")
348
+ types = article.elements.map(&:first)
349
+ expect(types).to include(:mw_redirect)
350
+ end
351
+
352
+ it "detects Russian abbreviated redirect (перенапр)" do
353
+ article = Wp2txt::Article.new("#перенапр [[Цель]]")
354
+ types = article.elements.map(&:first)
355
+ expect(types).to include(:mw_redirect)
356
+ end
357
+
358
+ it "detects Hindi redirect (पुनर्प्रेषित)" do
359
+ article = Wp2txt::Article.new("#पुनर्प्रेषित [[लक्ष्य]]")
360
+ types = article.elements.map(&:first)
361
+ expect(types).to include(:mw_redirect)
362
+ end
363
+ end
364
+
365
+ describe "multilingual categories" do
366
+ it "extracts Japanese categories" do
367
+ article = Wp2txt::Article.new("[[カテゴリ:テスト]]")
368
+ categories = article.categories.flatten
369
+ expect(categories).to include("テスト")
370
+ end
371
+
372
+ it "extracts German categories" do
373
+ article = Wp2txt::Article.new("[[Kategorie:Test]]")
374
+ categories = article.categories.flatten
375
+ expect(categories).to include("Test")
376
+ end
377
+
378
+ it "extracts French categories" do
379
+ article = Wp2txt::Article.new("[[Catégorie:Test]]")
380
+ categories = article.categories.flatten
381
+ expect(categories).to include("Test")
382
+ end
383
+
384
+ it "extracts Russian categories" do
385
+ article = Wp2txt::Article.new("[[Категория:Тест]]")
386
+ categories = article.categories.flatten
387
+ expect(categories).to include("Тест")
388
+ end
389
+
390
+ it "extracts Chinese simplified categories" do
391
+ article = Wp2txt::Article.new("[[分类:测试]]")
392
+ categories = article.categories.flatten
393
+ expect(categories).to include("测试")
394
+ end
395
+
396
+ it "extracts Chinese traditional categories" do
397
+ article = Wp2txt::Article.new("[[分類:測試]]")
398
+ categories = article.categories.flatten
399
+ expect(categories).to include("測試")
400
+ end
401
+ end
402
+ end