wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,455 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+
5
+ RSpec.describe Wp2txt::ParserFunctions do
6
+ let(:parser) { described_class.new }
7
+
8
+ describe "#if" do
9
+ it "returns then-value when condition is non-empty" do
10
+ expect(parser.evaluate("{{#if:yes|true|false}}")).to eq("true")
11
+ end
12
+
13
+ it "returns else-value when condition is empty" do
14
+ expect(parser.evaluate("{{#if:|true|false}}")).to eq("false")
15
+ end
16
+
17
+ it "returns else-value when condition is whitespace only" do
18
+ expect(parser.evaluate("{{#if: |true|false}}")).to eq("false")
19
+ end
20
+
21
+ it "returns then-value with text condition" do
22
+ expect(parser.evaluate("{{#if:something|yes|no}}")).to eq("yes")
23
+ end
24
+
25
+ it "returns empty when no else-value and condition is empty" do
26
+ expect(parser.evaluate("{{#if:|true}}")).to eq("")
27
+ end
28
+
29
+ it "handles nested #if" do
30
+ expect(parser.evaluate("{{#if:x|{{#if:y|inner|}}|outer}}")).to eq("inner")
31
+ end
32
+ end
33
+
34
+ describe "#ifeq" do
35
+ it "returns then-value when strings are equal" do
36
+ expect(parser.evaluate("{{#ifeq:foo|foo|equal|not equal}}")).to eq("equal")
37
+ end
38
+
39
+ it "returns else-value when strings differ" do
40
+ expect(parser.evaluate("{{#ifeq:foo|bar|equal|not equal}}")).to eq("not equal")
41
+ end
42
+
43
+ it "handles numeric comparison" do
44
+ expect(parser.evaluate("{{#ifeq:01|1|equal|not equal}}")).to eq("equal")
45
+ end
46
+
47
+ it "handles case-sensitive comparison" do
48
+ expect(parser.evaluate("{{#ifeq:Foo|foo|equal|not equal}}")).to eq("not equal")
49
+ end
50
+
51
+ it "trims whitespace in comparison" do
52
+ expect(parser.evaluate("{{#ifeq: foo |foo|equal|not equal}}")).to eq("equal")
53
+ end
54
+
55
+ it "returns empty when no else-value and not equal" do
56
+ expect(parser.evaluate("{{#ifeq:a|b|equal}}")).to eq("")
57
+ end
58
+ end
59
+
60
+ describe "#switch" do
61
+ it "returns matching case value" do
62
+ expect(parser.evaluate("{{#switch:b|a=first|b=second|c=third}}")).to eq("second")
63
+ end
64
+
65
+ it "returns default value when no match" do
66
+ expect(parser.evaluate("{{#switch:x|a=first|b=second|#default=none}}")).to eq("none")
67
+ end
68
+
69
+ it "returns last unnamed value as default" do
70
+ expect(parser.evaluate("{{#switch:x|a=first|b=second|fallback}}")).to eq("fallback")
71
+ end
72
+
73
+ it "returns empty when no match and no default" do
74
+ expect(parser.evaluate("{{#switch:x|a=first|b=second}}")).to eq("")
75
+ end
76
+
77
+ it "handles fall-through cases" do
78
+ expect(parser.evaluate("{{#switch:b|a|b|c=result}}")).to eq("result")
79
+ end
80
+
81
+ it "handles numeric matching" do
82
+ expect(parser.evaluate("{{#switch:2|1=one|2=two|3=three}}")).to eq("two")
83
+ end
84
+
85
+ it "trims whitespace in comparisons" do
86
+ expect(parser.evaluate("{{#switch: b |a=first| b =second}}")).to eq("second")
87
+ end
88
+ end
89
+
90
+ describe "#ifexpr" do
91
+ it "returns then-value when expression is non-zero" do
92
+ expect(parser.evaluate("{{#ifexpr:1|yes|no}}")).to eq("yes")
93
+ end
94
+
95
+ it "returns else-value when expression is zero" do
96
+ expect(parser.evaluate("{{#ifexpr:0|yes|no}}")).to eq("no")
97
+ end
98
+
99
+ it "evaluates simple arithmetic" do
100
+ expect(parser.evaluate("{{#ifexpr:2+2=4|yes|no}}")).to eq("yes")
101
+ end
102
+
103
+ it "evaluates comparison operators" do
104
+ expect(parser.evaluate("{{#ifexpr:5>3|yes|no}}")).to eq("yes")
105
+ end
106
+
107
+ it "handles negative results" do
108
+ expect(parser.evaluate("{{#ifexpr:3-5|yes|no}}")).to eq("yes")
109
+ end
110
+ end
111
+
112
+ describe "#expr" do
113
+ it "evaluates addition" do
114
+ expect(parser.evaluate("{{#expr:2+3}}")).to eq("5")
115
+ end
116
+
117
+ it "evaluates subtraction" do
118
+ expect(parser.evaluate("{{#expr:10-3}}")).to eq("7")
119
+ end
120
+
121
+ it "evaluates multiplication" do
122
+ expect(parser.evaluate("{{#expr:4*5}}")).to eq("20")
123
+ end
124
+
125
+ it "evaluates division" do
126
+ expect(parser.evaluate("{{#expr:20/4}}")).to eq("5")
127
+ end
128
+
129
+ it "evaluates modulo" do
130
+ expect(parser.evaluate("{{#expr:17 mod 5}}")).to eq("2")
131
+ end
132
+
133
+ it "evaluates parentheses" do
134
+ expect(parser.evaluate("{{#expr:(2+3)*4}}")).to eq("20")
135
+ end
136
+
137
+ it "evaluates power" do
138
+ expect(parser.evaluate("{{#expr:2^3}}")).to eq("8")
139
+ end
140
+
141
+ it "handles decimal results" do
142
+ result = parser.evaluate("{{#expr:10/3}}")
143
+ expect(result.to_f).to be_within(0.01).of(3.33)
144
+ end
145
+
146
+ it "handles comparison operators returning 1 or 0" do
147
+ expect(parser.evaluate("{{#expr:5>3}}")).to eq("1")
148
+ expect(parser.evaluate("{{#expr:5<3}}")).to eq("0")
149
+ end
150
+
151
+ it "handles equality comparison" do
152
+ expect(parser.evaluate("{{#expr:5=5}}")).to eq("1")
153
+ expect(parser.evaluate("{{#expr:5=6}}")).to eq("0")
154
+ end
155
+
156
+ it "handles and/or operators" do
157
+ expect(parser.evaluate("{{#expr:1 and 1}}")).to eq("1")
158
+ expect(parser.evaluate("{{#expr:1 and 0}}")).to eq("0")
159
+ expect(parser.evaluate("{{#expr:0 or 1}}")).to eq("1")
160
+ end
161
+
162
+ it "handles unary minus" do
163
+ expect(parser.evaluate("{{#expr:-5}}")).to eq("-5")
164
+ end
165
+
166
+ it "returns empty string for invalid expressions" do
167
+ expect(parser.evaluate("{{#expr:invalid}}")).to eq("")
168
+ end
169
+ end
170
+
171
+ describe "#len" do
172
+ it "returns string length" do
173
+ expect(parser.evaluate("{{#len:hello}}")).to eq("5")
174
+ end
175
+
176
+ it "counts unicode characters" do
177
+ expect(parser.evaluate("{{#len:日本語}}")).to eq("3")
178
+ end
179
+
180
+ it "returns 0 for empty string" do
181
+ expect(parser.evaluate("{{#len:}}")).to eq("0")
182
+ end
183
+ end
184
+
185
+ describe "#pos" do
186
+ it "returns position of substring" do
187
+ expect(parser.evaluate("{{#pos:hello|l}}")).to eq("2")
188
+ end
189
+
190
+ it "returns empty when not found" do
191
+ expect(parser.evaluate("{{#pos:hello|x}}")).to eq("")
192
+ end
193
+
194
+ it "returns position of first occurrence" do
195
+ expect(parser.evaluate("{{#pos:hello|l}}")).to eq("2")
196
+ end
197
+ end
198
+
199
+ describe "#sub" do
200
+ it "extracts substring from start" do
201
+ expect(parser.evaluate("{{#sub:hello|0|3}}")).to eq("hel")
202
+ end
203
+
204
+ it "extracts substring from position" do
205
+ expect(parser.evaluate("{{#sub:hello|2|3}}")).to eq("llo")
206
+ end
207
+
208
+ it "handles negative start (from end)" do
209
+ expect(parser.evaluate("{{#sub:hello|-2}}")).to eq("lo")
210
+ end
211
+
212
+ it "handles length beyond string" do
213
+ expect(parser.evaluate("{{#sub:hello|0|100}}")).to eq("hello")
214
+ end
215
+ end
216
+
217
+ describe "#replace" do
218
+ it "replaces substring" do
219
+ expect(parser.evaluate("{{#replace:hello world|world|universe}}")).to eq("hello universe")
220
+ end
221
+
222
+ it "replaces all occurrences" do
223
+ expect(parser.evaluate("{{#replace:ababa|a|x}}")).to eq("xbxbx")
224
+ end
225
+
226
+ it "handles empty replacement" do
227
+ expect(parser.evaluate("{{#replace:hello|l|}}")).to eq("heo")
228
+ end
229
+ end
230
+
231
+ describe "#titleparts" do
232
+ it "extracts first part of title" do
233
+ expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|1}}")).to eq("Talk:Foo")
234
+ end
235
+
236
+ it "extracts multiple parts" do
237
+ expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|2}}")).to eq("Talk:Foo/Bar")
238
+ end
239
+
240
+ it "extracts from offset" do
241
+ expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|1|1}}")).to eq("Bar")
242
+ end
243
+
244
+ it "handles negative count (from end)" do
245
+ expect(parser.evaluate("{{#titleparts:Talk:Foo/Bar/Baz|-1}}")).to eq("Talk:Foo/Bar")
246
+ end
247
+ end
248
+
249
+ describe "#time" do
250
+ let(:parser_with_date) { described_class.new(reference_date: Time.new(2024, 6, 15, 10, 30, 45)) }
251
+
252
+ it "formats year" do
253
+ expect(parser_with_date.evaluate("{{#time:Y}}")).to eq("2024")
254
+ end
255
+
256
+ it "formats month name" do
257
+ expect(parser_with_date.evaluate("{{#time:F}}")).to eq("June")
258
+ end
259
+
260
+ it "formats day" do
261
+ expect(parser_with_date.evaluate("{{#time:j}}")).to eq("15")
262
+ end
263
+
264
+ it "formats full date" do
265
+ expect(parser_with_date.evaluate("{{#time:Y-m-d}}")).to eq("2024-06-15")
266
+ end
267
+
268
+ it "parses input date" do
269
+ expect(parser.evaluate("{{#time:Y|2020-05-15}}")).to eq("2020")
270
+ end
271
+ end
272
+
273
+ describe "integration with template_expander" do
274
+ include Wp2txt
275
+
276
+ it "expands parser functions in format_wiki" do
277
+ input = "Result: {{#if:yes|shown|hidden}}"
278
+ result = format_wiki(input, title: "Test", expand_templates: true)
279
+ expect(result).to include("Result: shown")
280
+ end
281
+
282
+ it "expands a standalone #switch in format_wiki" do
283
+ input = "{{#switch:2|1=one|2=two|3=three}}"
284
+ result = format_wiki(input, title: "Test", expand_templates: true)
285
+ expect(result).to eq("two")
286
+ end
287
+
288
+ it "handles nested parser functions and templates" do
289
+ input = "{{#if:yes|{{circa|1500}}|unknown}}"
290
+ result = format_wiki(input, title: "Test", expand_templates: true)
291
+ expect(result).to eq("c. 1500")
292
+ end
293
+ end
294
+
295
+ describe "edge cases" do
296
+ it "handles malformed parser function gracefully" do
297
+ expect(parser.evaluate("{{#if:}}")).to eq("")
298
+ end
299
+
300
+ it "handles unknown parser function" do
301
+ expect(parser.evaluate("{{#unknown:foo|bar}}")).to eq("")
302
+ end
303
+
304
+ it "handles deeply nested functions" do
305
+ result = parser.evaluate("{{#if:x|{{#ifeq:a|a|{{#switch:1|1=deep}}|}}|}}")
306
+ expect(result).to eq("deep")
307
+ end
308
+
309
+ it "preserves text around parser functions" do
310
+ expect(parser.evaluate("Before {{#if:x|middle|}} after")).to eq("Before middle after")
311
+ end
312
+ end
313
+
314
+ # New parser functions for WikiExtractor parity
315
+ describe "#iferror" do
316
+ it "returns then-value when input contains error class" do
317
+ expect(parser.evaluate("{{#iferror:<span class=\"error\">Error</span>|error found|no error}}")).to eq("error found")
318
+ end
319
+
320
+ it "returns else-value when input is normal" do
321
+ expect(parser.evaluate("{{#iferror:normal text|error|no error}}")).to eq("no error")
322
+ end
323
+
324
+ it "returns empty when no else-value and no error" do
325
+ expect(parser.evaluate("{{#iferror:normal text|error}}")).to eq("")
326
+ end
327
+
328
+ it "returns input when no then-value and no error" do
329
+ expect(parser.evaluate("{{#iferror:normal text}}")).to eq("normal text")
330
+ end
331
+ end
332
+
333
+ describe "#rpos" do
334
+ it "returns position of last occurrence" do
335
+ expect(parser.evaluate("{{#rpos:abcabc|b}}")).to eq("4")
336
+ end
337
+
338
+ it "returns -1 when not found" do
339
+ expect(parser.evaluate("{{#rpos:hello|x}}")).to eq("-1")
340
+ end
341
+
342
+ it "returns the last position for adjacent repeated characters" do
343
+ expect(parser.evaluate("{{#rpos:hello|l}}")).to eq("3")
344
+ end
345
+ end
346
+
347
+ describe "#count" do
348
+ it "counts occurrences of substring" do
349
+ expect(parser.evaluate("{{#count:abcabc|a}}")).to eq("2")
350
+ end
351
+
352
+ it "returns 0 when not found" do
353
+ expect(parser.evaluate("{{#count:hello|x}}")).to eq("0")
354
+ end
355
+
356
+ it "counts non-overlapping occurrences" do
357
+ expect(parser.evaluate("{{#count:aaaa|aa}}")).to eq("2")
358
+ end
359
+ end
360
+
361
+ describe "#explode" do
362
+ it "splits and returns nth element" do
363
+ expect(parser.evaluate("{{#explode:a,b,c|,|1}}")).to eq("b")
364
+ end
365
+
366
+ it "returns first element by default" do
367
+ expect(parser.evaluate("{{#explode:a-b-c|-}}")).to eq("a")
368
+ end
369
+
370
+ it "handles negative index (from end)" do
371
+ expect(parser.evaluate("{{#explode:a,b,c|,|-1}}")).to eq("c")
372
+ end
373
+
374
+ it "returns empty for out of bounds" do
375
+ expect(parser.evaluate("{{#explode:a,b|,|5}}")).to eq("")
376
+ end
377
+ end
378
+
379
+ describe "#urldecode" do
380
+ it "decodes URL-encoded string" do
381
+ expect(parser.evaluate("{{#urldecode:Hello%20World}}")).to eq("Hello World")
382
+ end
383
+
384
+ it "decodes special characters" do
385
+ expect(parser.evaluate("{{#urldecode:%26%3D%3F}}")).to eq("&=?")
386
+ end
387
+
388
+ it "handles already decoded string" do
389
+ expect(parser.evaluate("{{#urldecode:hello}}")).to eq("hello")
390
+ end
391
+ end
392
+
393
+ describe "#urlencode" do
394
+ it "encodes string for URL" do
395
+ expect(parser.evaluate("{{#urlencode:Hello World}}")).to eq("Hello%20World")
396
+ end
397
+
398
+ it "encodes special characters" do
399
+ expect(parser.evaluate("{{#urlencode:a&b=c}}")).to eq("a%26b%3Dc")
400
+ end
401
+ end
402
+
403
+ describe "#padleft" do
404
+ it "pads string on left" do
405
+ expect(parser.evaluate("{{#padleft:7|3|0}}")).to eq("007")
406
+ end
407
+
408
+ it "does not truncate if already longer" do
409
+ expect(parser.evaluate("{{#padleft:hello|3|x}}")).to eq("hello")
410
+ end
411
+
412
+ it "uses space as default padding" do
413
+ expect(parser.evaluate("{{#padleft:a|3}}")).to eq(" a")
414
+ end
415
+ end
416
+
417
+ describe "#padright" do
418
+ it "pads string on right" do
419
+ expect(parser.evaluate("{{#padright:7|3|0}}")).to eq("700")
420
+ end
421
+
422
+ it "does not truncate if already longer" do
423
+ expect(parser.evaluate("{{#padright:hello|3|x}}")).to eq("hello")
424
+ end
425
+ end
426
+
427
+ describe "enhanced #time" do
428
+ let(:parser_with_date) { described_class.new(reference_date: Time.new(2024, 6, 15, 14, 30, 45)) }
429
+
430
+ it "formats 12-hour time" do
431
+ expect(parser_with_date.evaluate("{{#time:g:i a}}")).to eq("2:30 pm")
432
+ end
433
+
434
+ it "formats ISO week number" do
435
+ expect(parser_with_date.evaluate("{{#time:W}}")).to eq("24")
436
+ end
437
+
438
+ it "formats day of week" do
439
+ expect(parser_with_date.evaluate("{{#time:l}}")).to eq("Saturday")
440
+ end
441
+
442
+ it "formats short day of week" do
443
+ expect(parser_with_date.evaluate("{{#time:D}}")).to eq("Sat")
444
+ end
445
+
446
+ it "formats ordinal day suffix" do
447
+ expect(parser_with_date.evaluate("{{#time:jS}}")).to eq("15th")
448
+ end
449
+
450
+ it "formats timezone" do
451
+ result = parser_with_date.evaluate("{{#time:T}}")
452
+ expect(result).not_to be_empty
453
+ end
454
+ end
455
+ end
@@ -0,0 +1,197 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "wp2txt/ractor_worker"
5
+
6
+ RSpec.describe Wp2txt::RactorWorker do
7
+ describe "MINIMUM_RUBY_VERSION" do
8
+ it "is set to 4.0" do
9
+ expect(described_class::MINIMUM_RUBY_VERSION).to eq("4.0")
10
+ end
11
+ end
12
+
13
+ describe "OPERATIONS" do
14
+ it "includes expected operations" do
15
+ expect(described_class::OPERATIONS).to include(:process_article)
16
+ expect(described_class::OPERATIONS).to include(:double)
17
+ expect(described_class::OPERATIONS).to include(:fib)
18
+ end
19
+
20
+ it "does not include removed operations" do
21
+ expect(described_class::OPERATIONS).not_to include(:regex_transform)
22
+ expect(described_class::OPERATIONS).not_to include(:format_wiki)
23
+ end
24
+ end
25
+
26
+ describe ".ruby_version_sufficient?" do
27
+ it "returns boolean based on Ruby version" do
28
+ if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("4.0")
29
+ expect(described_class.ruby_version_sufficient?).to be false
30
+ else
31
+ expect(described_class.ruby_version_sufficient?).to be true
32
+ end
33
+ end
34
+ end
35
+
36
+ describe ".available?" do
37
+ it "returns a boolean" do
38
+ result = described_class.available?
39
+ expect([true, false]).to include(result)
40
+ end
41
+
42
+ it "returns a consistent result across repeated calls" do
43
+ result1 = described_class.available?
44
+ result2 = described_class.available?
45
+ expect(result1).to eq(result2)
46
+ end
47
+
48
+ it "returns false on Ruby < 4.0" do
49
+ if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("4.0")
50
+ if described_class.instance_variable_defined?(:@available)
51
+ described_class.remove_instance_variable(:@available)
52
+ end
53
+ expect(described_class.available?).to be false
54
+ end
55
+ end
56
+ end
57
+
58
+ describe ".optimal_workers" do
59
+ it "returns a positive integer" do
60
+ result = described_class.optimal_workers
61
+ expect(result).to be_a(Integer)
62
+ expect(result).to be >= 1
63
+ end
64
+
65
+ it "does not exceed CPU count" do
66
+ result = described_class.optimal_workers
67
+ expect(result).to be <= Etc.nprocessors
68
+ end
69
+ end
70
+
71
+ describe ".deep_freeze" do
72
+ it "freezes a hash" do
73
+ hash = { a: 1, b: "hello" }
74
+ frozen = described_class.deep_freeze(hash)
75
+ expect(frozen).to be_frozen
76
+ expect(frozen[:b]).to be_frozen
77
+ end
78
+
79
+ it "freezes nested structures" do
80
+ nested = { a: [1, 2, { b: "c" }] }
81
+ frozen = described_class.deep_freeze(nested)
82
+ expect(frozen).to be_frozen
83
+ expect(frozen[:a]).to be_frozen
84
+ expect(frozen[:a][2]).to be_frozen
85
+ end
86
+
87
+ it "handles already frozen objects" do
88
+ str = "hello".freeze
89
+ expect { described_class.deep_freeze(str) }.not_to raise_error
90
+ end
91
+ end
92
+
93
+ describe ".process_single" do
94
+ it "processes :double operation" do
95
+ result = described_class.process_single(5, :double, {})
96
+ expect(result).to eq(10)
97
+ end
98
+
99
+ it "processes :fib operation" do
100
+ result = described_class.process_single(10, :fib, {})
101
+ expect(result).to eq(55)
102
+ end
103
+
104
+ it "raises error for unknown operation" do
105
+ expect {
106
+ described_class.process_single(1, :unknown_op, {})
107
+ }.to raise_error(/Unknown operation/)
108
+ end
109
+ end
110
+
111
+ describe ".parallel_process" do
112
+ context "with simple operations" do
113
+ it "processes items with :double operation" do
114
+ items = [1, 2, 3, 4, 5]
115
+ results = described_class.parallel_process(
116
+ items,
117
+ operation: :double,
118
+ config: {},
119
+ num_workers: 2
120
+ )
121
+ expect(results).to eq([2, 4, 6, 8, 10])
122
+ end
123
+
124
+ it "returns empty array for empty input" do
125
+ results = described_class.parallel_process(
126
+ [],
127
+ operation: :double,
128
+ config: {}
129
+ )
130
+ expect(results).to eq([])
131
+ end
132
+
133
+ it "handles single item (falls back to sequential)" do
134
+ results = described_class.parallel_process(
135
+ [5],
136
+ operation: :double,
137
+ config: {}
138
+ )
139
+ expect(results).to eq([10])
140
+ end
141
+
142
+ it "preserves result order" do
143
+ items = [5, 3, 7, 1, 9]
144
+ results = described_class.parallel_process(
145
+ items,
146
+ operation: :double,
147
+ config: {},
148
+ num_workers: 4
149
+ )
150
+ expect(results).to eq([10, 6, 14, 2, 18])
151
+ end
152
+ end
153
+
154
+ context "with process_article operation" do
155
+ let(:config) { { format: :text, title: true, heading: true, category: true } }
156
+
157
+ it "processes multiple articles" do
158
+ items = [
159
+ ["Article1", "Text one. [[Category:C1]]", false],
160
+ ["Article2", "Text two. [[Category:C2]]", false]
161
+ ]
162
+ results = described_class.parallel_process(
163
+ items,
164
+ operation: :process_article,
165
+ config: config,
166
+ num_workers: 2
167
+ )
168
+ expect(results.size).to eq(2)
169
+ expect(results.compact.size).to eq(2)
170
+ expect(results[0]).to include("Article1")
171
+ expect(results[1]).to include("Article2")
172
+ end
173
+ end
174
+ end
175
+
176
+ describe ".process_articles" do
177
+ let(:config) { { format: :text, title: true, heading: true, category: true } }
178
+
179
+ it "processes pages as [title, text] pairs" do
180
+ pages = [
181
+ ["Test1", "Content one. [[Category:Cat]]"],
182
+ ["Test2", "Content two. [[Category:Cat]]"]
183
+ ]
184
+ results = described_class.process_articles(pages, config: config, num_workers: 2)
185
+ expect(results.size).to eq(2)
186
+ expect(results.compact.size).to eq(2)
187
+ end
188
+
189
+ it "includes article titles in output" do
190
+ pages = [
191
+ ["MyTitle", "Some content here."]
192
+ ]
193
+ results = described_class.process_articles(pages, config: config)
194
+ expect(results.first).to include("MyTitle")
195
+ end
196
+ end
197
+ end