wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,476 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "spec_helper"
4
+
5
+ RSpec.describe "Wp2txt Markers" do
6
+ include Wp2txt
7
+
8
+ describe "marker classification constants" do
9
+ it "defines INLINE_MARKERS" do
10
+ expect(Wp2txt::INLINE_MARKERS).to be_a(Array)
11
+ expect(Wp2txt::INLINE_MARKERS).to include(:math, :chem, :ipa, :code)
12
+ end
13
+
14
+ it "defines BLOCK_MARKERS" do
15
+ expect(Wp2txt::BLOCK_MARKERS).to be_a(Array)
16
+ expect(Wp2txt::BLOCK_MARKERS).to include(:table, :infobox, :navbox, :codeblock)
17
+ end
18
+
19
+ it "has no overlap between inline and block markers" do
20
+ overlap = Wp2txt::INLINE_MARKERS & Wp2txt::BLOCK_MARKERS
21
+ expect(overlap).to be_empty
22
+ end
23
+
24
+ it "includes all marker types in either inline or block" do
25
+ all_classified = Wp2txt::INLINE_MARKERS + Wp2txt::BLOCK_MARKERS
26
+ expect(all_classified.sort).to eq(Wp2txt::MARKER_TYPES.sort)
27
+ end
28
+ end
29
+
30
+ # Default behavior: markers are ON
31
+ describe "marker replacement (default: enabled)" do
32
+ describe "MATH marker" do
33
+ it "replaces <math> tags with [MATH]" do
34
+ input = "The equation <math>E = mc^2</math> is famous."
35
+ result = format_wiki(input)
36
+ expect(result).to include("[MATH]")
37
+ expect(result).not_to include("<math>")
38
+ expect(result).not_to include("E = mc^2")
39
+ end
40
+
41
+ it "replaces {{math}} templates with [MATH]" do
42
+ input = "The formula {{math|x^2 + y^2 = z^2}} is well known."
43
+ result = format_wiki(input)
44
+ expect(result).to include("[MATH]")
45
+ expect(result).not_to include("{{math")
46
+ end
47
+
48
+ it "replaces {{mvar}} templates with [MATH]" do
49
+ input = "Let {{mvar|n}} be an integer."
50
+ result = format_wiki(input)
51
+ expect(result).to include("[MATH]")
52
+ end
53
+ end
54
+
55
+ describe "CODE marker (inline)" do
56
+ it "replaces <code> tags with [CODE]" do
57
+ input = "Use <code>printf()</code> to print."
58
+ result = format_wiki(input)
59
+ expect(result).to include("[CODE]")
60
+ expect(result).not_to include("<code>")
61
+ end
62
+
63
+ it "handles inline code in sentence" do
64
+ input = "The variable <code>x</code> and <code>y</code> are integers."
65
+ result = format_wiki(input)
66
+ expect(result).to include("[CODE]")
67
+ expect(result).to include("are integers")
68
+ end
69
+ end
70
+
71
+ describe "CODEBLOCK marker (block)" do
72
+ it "replaces <syntaxhighlight> tags with [CODEBLOCK]" do
73
+ input = "<syntaxhighlight lang=\"python\">def hello():\n print('Hello')</syntaxhighlight>"
74
+ result = format_wiki(input)
75
+ expect(result).to include("[CODEBLOCK]")
76
+ expect(result).not_to include("<syntaxhighlight")
77
+ expect(result).not_to include("[CODE]")
78
+ end
79
+
80
+ it "replaces <source> tags with [CODEBLOCK]" do
81
+ input = "<source lang=\"ruby\">puts 'hello'</source>"
82
+ result = format_wiki(input)
83
+ expect(result).to include("[CODEBLOCK]")
84
+ expect(result).not_to include("<source")
85
+ expect(result).not_to include("[CODE]")
86
+ end
87
+
88
+ it "replaces <pre> tags with [CODEBLOCK]" do
89
+ input = "<pre>some preformatted code</pre>"
90
+ result = format_wiki(input)
91
+ expect(result).to include("[CODEBLOCK]")
92
+ expect(result).not_to include("<pre>")
93
+ expect(result).not_to include("[CODE]")
94
+ end
95
+
96
+ it "handles multiple codeblocks" do
97
+ input = "<syntaxhighlight>code1</syntaxhighlight>\n\n<source>code2</source>"
98
+ result = format_wiki(input)
99
+ expect(result.scan("[CODEBLOCK]").count).to eq(2)
100
+ end
101
+ end
102
+
103
+ describe "CHEM marker" do
104
+ it "replaces <chem> tags with [CHEM]" do
105
+ input = "Water is <chem>H2O</chem>."
106
+ result = format_wiki(input)
107
+ expect(result).to include("[CHEM]")
108
+ expect(result).not_to include("<chem>")
109
+ end
110
+
111
+ it "replaces {{chem}} templates with [CHEM]" do
112
+ input = "The reaction produces {{chem|CO|2}}."
113
+ result = format_wiki(input)
114
+ expect(result).to include("[CHEM]")
115
+ end
116
+
117
+ it "replaces {{ce}} templates with [CHEM]" do
118
+ input = "Salt is {{ce|NaCl}}."
119
+ result = format_wiki(input)
120
+ expect(result).to include("[CHEM]")
121
+ end
122
+ end
123
+
124
+ describe "TABLE marker" do
125
+ it "replaces wiki tables with [TABLE]" do
126
+ input = "Data:\n{| class=\"wikitable\"\n|-\n! Header\n|-\n| Cell\n|}\nMore text."
127
+ result = format_wiki(input)
128
+ expect(result).to include("[TABLE]")
129
+ expect(result).not_to include("{|")
130
+ expect(result).not_to include("|}")
131
+ end
132
+
133
+ it "replaces <table> tags with [TABLE]" do
134
+ input = "Data: <table><tr><td>Cell</td></tr></table> more."
135
+ result = format_wiki(input)
136
+ expect(result).to include("[TABLE]")
137
+ expect(result).not_to include("<table>")
138
+ end
139
+ end
140
+
141
+ describe "SCORE marker" do
142
+ it "replaces <score> tags with [SCORE]" do
143
+ input = "The melody: <score>\\relative c' { c d e f g }</score>"
144
+ result = format_wiki(input)
145
+ expect(result).to include("[SCORE]")
146
+ expect(result).not_to include("<score>")
147
+ end
148
+ end
149
+
150
+ describe "TIMELINE marker" do
151
+ it "replaces <timeline> tags with [TIMELINE]" do
152
+ input = "History:\n<timeline>\nImageSize = width:800\n</timeline>\nEnd."
153
+ result = format_wiki(input)
154
+ expect(result).to include("[TIMELINE]")
155
+ expect(result).not_to include("<timeline>")
156
+ end
157
+ end
158
+
159
+ describe "GRAPH marker" do
160
+ it "replaces <graph> tags with [GRAPH]" do
161
+ input = "Chart: <graph>{\"data\": []}</graph> shown above."
162
+ result = format_wiki(input)
163
+ expect(result).to include("[GRAPH]")
164
+ expect(result).not_to include("<graph>")
165
+ end
166
+ end
167
+
168
+ describe "IPA marker" do
169
+ it "replaces {{IPA}} templates with [IPA]" do
170
+ input = "Pronounced {{IPA|/həˈloʊ/}}."
171
+ result = format_wiki(input)
172
+ expect(result).to include("[IPA]")
173
+ end
174
+
175
+ it "replaces {{IPAc-en}} templates with [IPA]" do
176
+ input = "Say {{IPAc-en|ˈ|h|ɛ|l|oʊ}}."
177
+ result = format_wiki(input)
178
+ expect(result).to include("[IPA]")
179
+ end
180
+ end
181
+
182
+ describe "INFOBOX marker" do
183
+ it "replaces {{Infobox}} templates with [INFOBOX]" do
184
+ input = "{{Infobox person\n|name = John\n|birth_date = 1990\n}}\nJohn is a person."
185
+ result = format_wiki(input)
186
+ expect(result).to include("[INFOBOX]")
187
+ expect(result).not_to include("{{Infobox")
188
+ expect(result).not_to include("name = John")
189
+ end
190
+
191
+ it "handles nested templates in infobox" do
192
+ input = "{{Infobox country\n|name = {{flag|Japan}}\n|capital = Tokyo\n}}"
193
+ result = format_wiki(input)
194
+ expect(result).to include("[INFOBOX]")
195
+ expect(result).not_to include("{{Infobox")
196
+ end
197
+ end
198
+
199
+ describe "NAVBOX marker" do
200
+ it "replaces {{Navbox}} templates with [NAVBOX]" do
201
+ input = "Text\n{{Navbox\n|title = Navigation\n|list1 = Item1\n}}"
202
+ result = format_wiki(input)
203
+ expect(result).to include("[NAVBOX]")
204
+ expect(result).not_to include("{{Navbox")
205
+ end
206
+ end
207
+
208
+ describe "GALLERY marker" do
209
+ it "replaces <gallery> tags with [GALLERY]" do
210
+ input = "Images:\n<gallery>\nFile:Test.jpg|Caption\nFile:Test2.jpg|Caption2\n</gallery>"
211
+ result = format_wiki(input)
212
+ expect(result).to include("[GALLERY]")
213
+ expect(result).not_to include("<gallery>")
214
+ end
215
+ end
216
+
217
+ describe "SIDEBAR marker" do
218
+ it "replaces {{Sidebar}} templates with [SIDEBAR]" do
219
+ input = "{{Sidebar\n|title = Test\n|content = Text\n}}"
220
+ result = format_wiki(input)
221
+ expect(result).to include("[SIDEBAR]")
222
+ expect(result).not_to include("{{Sidebar")
223
+ end
224
+ end
225
+
226
+ describe "MAPFRAME marker" do
227
+ it "replaces <mapframe> tags with [MAPFRAME]" do
228
+ input = "Map: <mapframe latitude=\"51.5\" longitude=\"-0.1\">data</mapframe>"
229
+ result = format_wiki(input)
230
+ expect(result).to include("[MAPFRAME]")
231
+ expect(result).not_to include("<mapframe")
232
+ end
233
+ end
234
+
235
+ describe "IMAGEMAP marker" do
236
+ it "replaces <imagemap> tags with [IMAGEMAP]" do
237
+ input = "<imagemap>\nImage:Test.png|100px\nrect 0 0 100 100 [[Link]]\n</imagemap>"
238
+ result = format_wiki(input)
239
+ expect(result).to include("[IMAGEMAP]")
240
+ expect(result).not_to include("<imagemap>")
241
+ end
242
+ end
243
+
244
+ describe "REFERENCES marker" do
245
+ it "replaces {{reflist}} templates with [REFERENCES]" do
246
+ input = "Text with citations.\n\n== References ==\n{{reflist}}"
247
+ result = format_wiki(input)
248
+ expect(result).to include("[REFERENCES]")
249
+ expect(result).not_to include("{{reflist")
250
+ end
251
+
252
+ it "replaces {{Reflist}} with parameters with [REFERENCES]" do
253
+ input = "== References ==\n{{Reflist|30em}}"
254
+ result = format_wiki(input)
255
+ expect(result).to include("[REFERENCES]")
256
+ end
257
+
258
+ it "replaces <references/> self-closing tag with [REFERENCES]" do
259
+ input = "== References ==\n<references/>"
260
+ result = format_wiki(input)
261
+ expect(result).to include("[REFERENCES]")
262
+ expect(result).not_to include("<references")
263
+ end
264
+
265
+ it "replaces <references>...</references> tag with [REFERENCES]" do
266
+ input = "== References ==\n<references>\n<ref name=\"test\">Content</ref>\n</references>"
267
+ result = format_wiki(input)
268
+ expect(result).to include("[REFERENCES]")
269
+ expect(result).not_to include("<references>")
270
+ end
271
+
272
+ it "replaces {{refbegin}}...{{refend}} blocks with [REFERENCES]" do
273
+ input = "== Bibliography ==\n{{refbegin}}\n* Book one\n* Book two\n{{refend}}"
274
+ result = format_wiki(input)
275
+ expect(result).to include("[REFERENCES]")
276
+ expect(result).not_to include("{{refbegin")
277
+ expect(result).not_to include("{{refend")
278
+ expect(result).not_to include("Book one")
279
+ end
280
+
281
+ it "handles {{refbegin}} with parameters" do
282
+ input = "{{refbegin|30em|indent=yes}}\n* Citation\n{{refend}}"
283
+ result = format_wiki(input)
284
+ expect(result).to include("[REFERENCES]")
285
+ expect(result).not_to include("Citation")
286
+ end
287
+ end
288
+
289
+ describe "Citation extraction (extract_citations option)" do
290
+ it "extracts author, title, year from {{cite book}}" do
291
+ input = "{{cite book |last=Smith |first=John |title=The Book Title |year=2020}}"
292
+ result = format_wiki(input, extract_citations: true)
293
+ expect(result).to include("Smith")
294
+ expect(result).to include("The Book Title")
295
+ expect(result).to include("2020")
296
+ end
297
+
298
+ it "extracts from {{cite web}}" do
299
+ input = "{{cite web |title=Web Page Title |url=http://example.com |date=2021-05-15}}"
300
+ result = format_wiki(input, extract_citations: true)
301
+ expect(result).to include("Web Page Title")
302
+ expect(result).to include("2021")
303
+ end
304
+
305
+ it "extracts from {{cite news}}" do
306
+ input = "{{cite news |last=Reporter |title=News Article |newspaper=Daily News |date=2022-03-20}}"
307
+ result = format_wiki(input, extract_citations: true)
308
+ expect(result).to include("Reporter")
309
+ expect(result).to include("News Article")
310
+ expect(result).to include("2022")
311
+ end
312
+
313
+ it "extracts from {{cite journal}}" do
314
+ input = "{{cite journal |last=Scientist |title=Research Paper |journal=Nature |year=2023}}"
315
+ result = format_wiki(input, extract_citations: true)
316
+ expect(result).to include("Scientist")
317
+ expect(result).to include("Research Paper")
318
+ expect(result).to include("2023")
319
+ end
320
+
321
+ it "extracts from {{Citation}}" do
322
+ input = "{{Citation |last=Doe |first=Jane |title=Article Title |year=2019 |publisher=Publisher Name}}"
323
+ result = format_wiki(input, extract_citations: true)
324
+ expect(result).to include("Doe")
325
+ expect(result).to include("Article Title")
326
+ expect(result).to include("2019")
327
+ end
328
+
329
+ it "handles multiple citations" do
330
+ input = "* {{cite book |last=Author1 |title=Book One |year=2001}}\n* {{cite book |last=Author2 |title=Book Two |year=2002}}"
331
+ result = format_wiki(input, extract_citations: true)
332
+ expect(result).to include("Author1")
333
+ expect(result).to include("Book One")
334
+ expect(result).to include("Author2")
335
+ expect(result).to include("Book Two")
336
+ end
337
+
338
+ it "extracts citations from refbegin/refend blocks" do
339
+ input = "{{refbegin}}\n* {{cite book |last=Smith |title=Book Title |year=2020}}\n{{refend}}"
340
+ result = format_wiki(input, extract_citations: true)
341
+ expect(result).to include("Smith")
342
+ expect(result).to include("Book Title")
343
+ expect(result).not_to include("{{refbegin")
344
+ expect(result).not_to include("{{refend")
345
+ end
346
+
347
+ it "removes citations when extract_citations is false (default)" do
348
+ input = "Text. {{cite book |last=Smith |title=Book |year=2020}}"
349
+ result = format_wiki(input)
350
+ expect(result).not_to include("Smith")
351
+ expect(result).not_to include("Book")
352
+ end
353
+
354
+ it "handles citations with only title" do
355
+ input = "{{cite web |title=Untitled Page |url=http://example.com}}"
356
+ result = format_wiki(input, extract_citations: true)
357
+ expect(result).to include("Untitled Page")
358
+ end
359
+
360
+ it "handles author1/first1 format" do
361
+ input = "{{cite book |last1=Primary |first1=Author |title=Multi-Author Book |year=2021}}"
362
+ result = format_wiki(input, extract_citations: true)
363
+ expect(result).to include("Primary")
364
+ expect(result).to include("Multi-Author Book")
365
+ end
366
+ end
367
+ end
368
+
369
+ # Markers can be disabled
370
+ describe "marker replacement (disabled)" do
371
+ it "removes content without markers when markers disabled" do
372
+ input = "The equation <math>E = mc^2</math> is famous."
373
+ result = format_wiki(input, markers: false)
374
+ expect(result).not_to include("[MATH]")
375
+ expect(result).not_to include("<math>")
376
+ expect(result).not_to include("E = mc^2")
377
+ end
378
+
379
+ it "removes all marker types when disabled" do
380
+ input = "<code>x</code> <chem>H2O</chem> <score>notes</score>"
381
+ result = format_wiki(input, markers: false)
382
+ expect(result).not_to include("[CODE]")
383
+ expect(result).not_to include("[CHEM]")
384
+ expect(result).not_to include("[SCORE]")
385
+ end
386
+
387
+ it "removes codeblock when disabled" do
388
+ input = "<syntaxhighlight lang=\"python\">print('hello')</syntaxhighlight>"
389
+ result = format_wiki(input, markers: false)
390
+ expect(result).not_to include("[CODEBLOCK]")
391
+ expect(result).not_to include("<syntaxhighlight")
392
+ end
393
+
394
+ it "removes infobox when markers disabled" do
395
+ input = "{{Infobox person\n|name = John\n}}\nText."
396
+ result = format_wiki(input, markers: false)
397
+ expect(result).not_to include("[INFOBOX]")
398
+ expect(result).not_to include("{{Infobox")
399
+ expect(result).to include("Text")
400
+ end
401
+
402
+ it "removes navbox when markers disabled" do
403
+ input = "Text.\n{{Navbox\n|title = Nav\n}}"
404
+ result = format_wiki(input, markers: false)
405
+ expect(result).not_to include("[NAVBOX]")
406
+ expect(result).not_to include("{{Navbox")
407
+ end
408
+
409
+ it "removes gallery when markers disabled" do
410
+ input = "<gallery>\nFile:Test.jpg\n</gallery>"
411
+ result = format_wiki(input, markers: false)
412
+ expect(result).not_to include("[GALLERY]")
413
+ expect(result).not_to include("<gallery>")
414
+ end
415
+
416
+ it "removes references when markers disabled" do
417
+ input = "Text.\n{{reflist}}"
418
+ result = format_wiki(input, markers: false)
419
+ expect(result).not_to include("[REFERENCES]")
420
+ expect(result).not_to include("{{reflist")
421
+ end
422
+ end
423
+
424
+ # Selective markers
425
+ describe "selective marker replacement" do
426
+ it "enables only specified markers" do
427
+ input = "<math>x</math> and <code>y</code>"
428
+ result = format_wiki(input, markers: [:math])
429
+ expect(result).to include("[MATH]")
430
+ expect(result).not_to include("[CODE]")
431
+ expect(result).not_to include("<code>")
432
+ end
433
+
434
+ it "accepts array of marker symbols" do
435
+ input = "<math>x</math> <code>y</code> <chem>H2O</chem>"
436
+ result = format_wiki(input, markers: [:math, :code])
437
+ expect(result).to include("[MATH]")
438
+ expect(result).to include("[CODE]")
439
+ expect(result).not_to include("[CHEM]")
440
+ end
441
+
442
+ it "distinguishes code and codeblock markers" do
443
+ input = "<code>inline</code>\n<syntaxhighlight>block</syntaxhighlight>"
444
+ result = format_wiki(input, markers: [:code])
445
+ expect(result).to include("[CODE]")
446
+ expect(result).not_to include("[CODEBLOCK]")
447
+ expect(result).not_to include("block")
448
+ end
449
+
450
+ it "enables codeblock independently from code" do
451
+ input = "<code>inline</code>\n<syntaxhighlight>block</syntaxhighlight>"
452
+ result = format_wiki(input, markers: [:codeblock])
453
+ expect(result).to include("[CODEBLOCK]")
454
+ expect(result).not_to include("[CODE]")
455
+ expect(result).not_to include("inline")
456
+ end
457
+ end
458
+
459
+ # Multiple markers in one text
460
+ describe "multiple markers" do
461
+ it "handles multiple marker types in same text" do
462
+ input = "Formula <math>E=mc^2</math>, code <code>x=1</code>, and water <chem>H2O</chem>."
463
+ result = format_wiki(input)
464
+ expect(result).to include("[MATH]")
465
+ expect(result).to include("[CODE]")
466
+ expect(result).to include("[CHEM]")
467
+ end
468
+
469
+ it "handles nested content correctly" do
470
+ input = "{| class=\"wikitable\"\n|-\n| <math>x^2</math>\n|}"
471
+ result = format_wiki(input)
472
+ expect(result).to include("[TABLE]")
473
+ # The <math> inside the table is consumed together with the table, so this example only asserts the [TABLE] marker.
474
+ end
475
+ end
476
+ end
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "wp2txt/memory_monitor"
5
+
6
+ RSpec.describe Wp2txt::MemoryMonitor do
7
+ describe ".current_memory_usage" do
8
+ it "returns a non-negative integer" do
9
+ usage = described_class.current_memory_usage
10
+ expect(usage).to be_a(Integer)
11
+ expect(usage).to be >= 0
12
+ end
13
+ end
14
+
15
+ describe ".total_system_memory" do
16
+ it "returns a positive integer" do
17
+ total = described_class.total_system_memory
18
+ expect(total).to be_a(Integer)
19
+ expect(total).to be > 0
20
+ end
21
+
22
+ it "returns at least 1 GB (reasonable minimum for running tests)" do
23
+ total = described_class.total_system_memory
24
+ one_gb = 1024 * 1024 * 1024
25
+ expect(total).to be >= one_gb
26
+ end
27
+ end
28
+
29
+ describe ".available_memory" do
30
+ it "returns a positive integer" do
31
+ available = described_class.available_memory
32
+ expect(available).to be_a(Integer)
33
+ expect(available).to be > 0
34
+ end
35
+
36
+ it "is less than or equal to total system memory" do
37
+ available = described_class.available_memory
38
+ total = described_class.total_system_memory
39
+ expect(available).to be <= total
40
+ end
41
+ end
42
+
43
+ describe ".memory_usage_percent" do
44
+ it "returns a float between 0 and 100" do
45
+ percent = described_class.memory_usage_percent
46
+ expect(percent).to be_a(Float)
47
+ expect(percent).to be >= 0
48
+ expect(percent).to be <= 100
49
+ end
50
+ end
51
+
52
+ describe ".memory_low?" do
53
+ it "returns a boolean" do
54
+ result = described_class.memory_low?
55
+ expect([true, false]).to include(result)
56
+ end
57
+ end
58
+
59
+ describe ".optimal_buffer_size" do
60
+ it "returns an integer within bounds" do
61
+ size = described_class.optimal_buffer_size
62
+ expect(size).to be_a(Integer)
63
+ expect(size).to be >= described_class::MIN_BUFFER_SIZE
64
+ expect(size).to be <= described_class::MAX_BUFFER_SIZE
65
+ end
66
+
67
+ it "returns a multiple of 1 MB" do
68
+ size = described_class.optimal_buffer_size
69
+ one_mb = 1_048_576
70
+ expect(size % one_mb).to eq(0)
71
+ end
72
+ end
73
+
74
+ describe ".memory_stats" do
75
+ it "returns a hash with expected keys" do
76
+ stats = described_class.memory_stats
77
+ expect(stats).to be_a(Hash)
78
+ expect(stats).to have_key(:current_usage_mb)
79
+ expect(stats).to have_key(:total_system_mb)
80
+ expect(stats).to have_key(:available_mb)
81
+ expect(stats).to have_key(:usage_percent)
82
+ expect(stats).to have_key(:recommended_buffer_mb)
83
+ expect(stats).to have_key(:low_memory)
84
+ end
85
+
86
+ it "returns numeric values for memory metrics" do
87
+ stats = described_class.memory_stats
88
+ expect(stats[:current_usage_mb]).to be_a(Numeric)
89
+ expect(stats[:total_system_mb]).to be_a(Numeric)
90
+ expect(stats[:available_mb]).to be_a(Numeric)
91
+ expect(stats[:usage_percent]).to be_a(Numeric)
92
+ expect(stats[:recommended_buffer_mb]).to be_a(Numeric)
93
+ end
94
+ end
95
+
96
+ describe ".format_memory" do
97
+ it "formats bytes" do
98
+ expect(described_class.format_memory(500)).to eq("500 B")
99
+ end
100
+
101
+ it "formats kilobytes" do
102
+ expect(described_class.format_memory(2048)).to eq("2.0 KB")
103
+ end
104
+
105
+ it "formats megabytes" do
106
+ expect(described_class.format_memory(5_242_880)).to eq("5.0 MB")
107
+ end
108
+
109
+ it "formats gigabytes" do
110
+ expect(described_class.format_memory(2_147_483_648)).to eq("2.0 GB")
111
+ end
112
+ end
113
+
114
+ describe ".gc_if_needed" do
115
+ it "returns a boolean" do
116
+ result = described_class.gc_if_needed
117
+ expect([true, false]).to include(result)
118
+ end
119
+ end
120
+
121
+ describe "constants" do
122
+ it "has reasonable threshold values" do
123
+ expect(described_class::LOW_MEMORY_THRESHOLD_MB).to be > 0
124
+ expect(described_class::HIGH_MEMORY_THRESHOLD_MB).to be > described_class::LOW_MEMORY_THRESHOLD_MB
125
+ expect(described_class::TARGET_MEMORY_USAGE_PERCENT).to be_between(50, 90)
126
+ end
127
+
128
+ it "has reasonable buffer size bounds" do
129
+ expect(described_class::MIN_BUFFER_SIZE).to be > 0
130
+ expect(described_class::MAX_BUFFER_SIZE).to be > described_class::MIN_BUFFER_SIZE
131
+ expect(described_class::DEFAULT_BUFFER_SIZE).to be >= described_class::MIN_BUFFER_SIZE
132
+ expect(described_class::DEFAULT_BUFFER_SIZE).to be <= described_class::MAX_BUFFER_SIZE
133
+ end
134
+
135
+ it "has reasonable memory per process value" do
136
+ expect(described_class::MEMORY_PER_PROCESS_MB).to be_between(100, 1000)
137
+ end
138
+ end
139
+
140
+ describe ".optimal_processes" do
141
+ it "returns a positive integer" do
142
+ result = described_class.optimal_processes
143
+ expect(result).to be_a(Integer)
144
+ expect(result).to be >= 1
145
+ end
146
+
147
+ it "returns a value less than or equal to CPU cores" do
148
+ result = described_class.optimal_processes
149
+ cores = Etc.nprocessors
150
+ expect(result).to be <= cores
151
+ end
152
+
153
+ it "accepts custom memory_per_process_mb parameter" do
154
+ # A very high per-process memory requirement must never yield more processes than a low one (equal is allowed)
155
+ high_mem = described_class.optimal_processes(memory_per_process_mb: 10_000)
156
+ low_mem = described_class.optimal_processes(memory_per_process_mb: 100)
157
+ expect(high_mem).to be <= low_mem
158
+ end
159
+
160
+ it "returns at least 1 even with extreme memory constraints" do
161
+ result = described_class.optimal_processes(memory_per_process_mb: 1_000_000)
162
+ expect(result).to be >= 1
163
+ end
164
+ end
165
+
166
+ describe ".parallel_processing_info" do
167
+ it "returns a hash with expected keys" do
168
+ info = described_class.parallel_processing_info
169
+ expect(info).to be_a(Hash)
170
+ expect(info).to have_key(:cpu_cores)
171
+ expect(info).to have_key(:available_memory_mb)
172
+ expect(info).to have_key(:memory_per_process_mb)
173
+ expect(info).to have_key(:optimal_processes)
174
+ expect(info).to have_key(:max_by_cpu)
175
+ expect(info).to have_key(:max_by_memory)
176
+ end
177
+
178
+ it "returns consistent values" do
179
+ info = described_class.parallel_processing_info
180
+ expect(info[:cpu_cores]).to eq(Etc.nprocessors)
181
+ expect(info[:memory_per_process_mb]).to eq(described_class::MEMORY_PER_PROCESS_MB)
182
+ expect(info[:optimal_processes]).to eq(described_class.optimal_processes)
183
+ end
184
+
185
+ it "returns positive values for all numeric fields" do
186
+ info = described_class.parallel_processing_info
187
+ expect(info[:cpu_cores]).to be > 0
188
+ expect(info[:available_memory_mb]).to be > 0
189
+ expect(info[:optimal_processes]).to be > 0
190
+ end
191
+ end
192
+ end