wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/spec/multistream_spec.rb ADDED
@@ -0,0 +1,690 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+ require "fileutils"
6
+ require "zlib"
7
+ require "webmock/rspec"
8
+
9
+ RSpec.describe "Wp2txt Multistream" do
10
+ before do
11
+ # Ensure WebMock is enabled (may be disabled by other specs)
12
+ WebMock.enable!
13
+ # Allow localhost connections, stub external
14
+ WebMock.disable_net_connect!(allow_localhost: true)
15
+
16
+ # Stub Wikipedia dump listing page
17
+ stub_request(:get, %r{dumps\.wikimedia\.org})
18
+ .to_return(status: 200, body: '<a href="20260101/">20260101/</a>')
19
+ end
20
+
21
+ after do
22
+ WebMock.allow_net_connect!
23
+ end
24
+ describe Wp2txt::DumpManager do
25
+ let(:temp_dir) { Dir.mktmpdir }
26
+ let(:manager) { described_class.new("en", cache_dir: temp_dir) }
27
+
28
+ after { FileUtils.remove_entry(temp_dir) }
29
+
30
+ describe "#format_size" do
31
+ it "formats bytes" do
32
+ expect(manager.send(:format_size, 500)).to eq("500 B")
33
+ end
34
+
35
+ it "formats kilobytes" do
36
+ expect(manager.send(:format_size, 2048)).to eq("2.0 KB")
37
+ end
38
+
39
+ it "formats megabytes" do
40
+ expect(manager.send(:format_size, 5_242_880)).to eq("5.0 MB")
41
+ end
42
+
43
+ it "formats gigabytes" do
44
+ expect(manager.send(:format_size, 2_147_483_648)).to eq("2.0 GB")
45
+ end
46
+ end
47
+
48
+ describe "#cached_index_path" do
49
+ it "returns correct path format" do
50
+ path = manager.cached_index_path
51
+ expect(path).to include("enwiki")
52
+ expect(path).to include("index")
53
+ end
54
+ end
55
+
56
+ describe "#cached_multistream_path" do
57
+ it "returns correct path format" do
58
+ path = manager.cached_multistream_path
59
+ expect(path).to include("enwiki")
60
+ expect(path).to end_with(".xml.bz2")
61
+ end
62
+ end
63
+
64
+ describe "#cache_fresh?" do
65
+ context "when cache does not exist" do
66
+ it "returns false" do
67
+ expect(manager.cache_fresh?).to be false
68
+ end
69
+ end
70
+
71
+ context "when cache exists and is fresh" do
72
+ before do
73
+ FileUtils.mkdir_p(File.dirname(manager.cached_index_path))
74
+ File.write(manager.cached_index_path, "test")
75
+ end
76
+
77
+ it "returns true" do
78
+ expect(manager.cache_fresh?(30)).to be true
79
+ end
80
+ end
81
+ end
82
+
83
+ describe "#cache_stale?" do
84
+ context "when cache does not exist" do
85
+ it "returns true" do
86
+ expect(manager.cache_stale?).to be true
87
+ end
88
+ end
89
+ end
90
+
91
+ describe "#cache_age_days" do
92
+ context "when cache does not exist" do
93
+ it "returns nil" do
94
+ expect(manager.cache_age_days).to be_nil
95
+ end
96
+ end
97
+
98
+ context "when cache exists" do
99
+ before do
100
+ FileUtils.mkdir_p(File.dirname(manager.cached_index_path))
101
+ File.write(manager.cached_index_path, "test")
102
+ end
103
+
104
+ it "returns age in days" do
105
+ age = manager.cache_age_days
106
+ expect(age).to be_a(Float)
107
+ expect(age).to be >= 0
108
+ expect(age).to be < 1
109
+ end
110
+ end
111
+ end
112
+
113
+ describe "#cache_mtime" do
114
+ context "when cache does not exist" do
115
+ it "returns nil" do
116
+ expect(manager.cache_mtime).to be_nil
117
+ end
118
+ end
119
+
120
+ context "when cache exists" do
121
+ before do
122
+ FileUtils.mkdir_p(File.dirname(manager.cached_index_path))
123
+ File.write(manager.cached_index_path, "test")
124
+ end
125
+
126
+ it "returns Time object" do
127
+ expect(manager.cache_mtime).to be_a(Time)
128
+ end
129
+ end
130
+ end
131
+
132
+ describe "#cache_status" do
133
+ context "when cache is empty" do
134
+ it "returns status hash with zero sizes" do
135
+ status = manager.cache_status
136
+ expect(status[:index_size]).to eq(0)
137
+ expect(status[:multistream_size]).to eq(0)
138
+ expect(status[:fresh]).to be false
139
+ end
140
+ end
141
+ end
142
+
143
+ describe "#clear_cache!" do
144
+ it "does not raise error when no cache exists" do
145
+ expect { manager.clear_cache! }.not_to raise_error
146
+ end
147
+ end
148
+
149
+ describe ".all_cache_status" do
150
+ it "returns hash of all cached languages" do
151
+ status = described_class.all_cache_status(temp_dir)
152
+ expect(status).to be_a(Hash)
153
+ end
154
+ end
155
+
156
+ describe "#find_suitable_partial_cache" do
157
+ context "when no partial cache exists" do
158
+ it "returns nil" do
159
+ expect(manager.find_suitable_partial_cache(100)).to be_nil
160
+ end
161
+ end
162
+ end
163
+
164
+ describe "resumable download support" do
165
+ let(:test_file_path) { File.join(temp_dir, "test_download.bz2") }
166
+ let(:test_url) { "https://example.com/test.bz2" }
167
+
168
+ describe "#download_meta_path" do
169
+ it "returns path with .wp2txt_download suffix" do
170
+ path = manager.send(:download_meta_path, test_file_path)
171
+ expect(path).to eq("#{test_file_path}.wp2txt_download")
172
+ end
173
+ end
174
+
175
+ describe "#save_download_meta and #load_download_meta" do
176
+ let(:remote_info) do
177
+ {
178
+ size: 1_000_000,
179
+ etag: '"abc123"',
180
+ last_modified: "Wed, 01 Jan 2026 00:00:00 GMT"
181
+ }
182
+ end
183
+
184
+ it "saves and loads metadata correctly" do
185
+ manager.send(:save_download_meta, test_file_path, test_url, remote_info)
186
+ loaded = manager.send(:load_download_meta, test_file_path)
187
+
188
+ expect(loaded[:url]).to eq(test_url)
189
+ expect(loaded[:size]).to eq(1_000_000)
190
+ expect(loaded[:etag]).to eq('"abc123"')
191
+ expect(loaded[:last_modified]).to eq("Wed, 01 Jan 2026 00:00:00 GMT")
192
+ expect(loaded[:started_at]).not_to be_nil
193
+ end
194
+ end
195
+
196
+ describe "#cleanup_download_meta" do
197
+ it "removes metadata file" do
198
+ meta_path = manager.send(:download_meta_path, test_file_path)
199
+ File.write(meta_path, "{}")
200
+
201
+ expect(File.exist?(meta_path)).to be true
202
+ manager.send(:cleanup_download_meta, test_file_path)
203
+ expect(File.exist?(meta_path)).to be false
204
+ end
205
+ end
206
+
207
+ describe "#load_download_meta" do
208
+ it "returns nil when file does not exist" do
209
+ result = manager.send(:load_download_meta, "/nonexistent/path")
210
+ expect(result).to be_nil
211
+ end
212
+
213
+ it "returns nil for invalid JSON" do
214
+ meta_path = manager.send(:download_meta_path, test_file_path)
215
+ File.write(meta_path, "invalid json {{{")
216
+
217
+ result = manager.send(:load_download_meta, test_file_path)
218
+ expect(result).to be_nil
219
+ end
220
+ end
221
+ end
222
+ end
223
+
224
+ describe Wp2txt::MultistreamIndex do
225
+ let(:temp_dir) { Dir.mktmpdir }
226
+ let(:index_path) { File.join(temp_dir, "test-index.txt") }
227
+
228
+ after { FileUtils.remove_entry(temp_dir) }
229
+
230
+ before do
231
+ # Create a minimal index file
232
+ File.write(index_path, <<~INDEX)
233
+ 100:1:Article One
234
+ 100:2:Article Two
235
+ 200:3:Article Three
236
+ 200:4:日本語記事
237
+ INDEX
238
+ end
239
+
240
+ describe "#initialize" do
241
+ it "loads the index file" do
242
+ index = described_class.new(index_path)
243
+ expect(index.size).to eq(4)
244
+ end
245
+ end
246
+
247
+ describe "#find_by_title" do
248
+ let(:index) { described_class.new(index_path) }
249
+
250
+ it "finds article by exact title" do
251
+ result = index.find_by_title("Article One")
252
+ expect(result).not_to be_nil
253
+ expect(result[:title]).to eq("Article One")
254
+ expect(result[:offset]).to eq(100)
255
+ expect(result[:page_id]).to eq(1)
256
+ end
257
+
258
+ it "finds Japanese article" do
259
+ result = index.find_by_title("日本語記事")
260
+ expect(result).not_to be_nil
261
+ expect(result[:title]).to eq("日本語記事")
262
+ end
263
+
264
+ it "returns nil for non-existent title" do
265
+ result = index.find_by_title("Non Existent")
266
+ expect(result).to be_nil
267
+ end
268
+ end
269
+
270
+ describe "#find_by_id" do
271
+ let(:index) { described_class.new(index_path) }
272
+
273
+ it "finds article by page ID" do
274
+ result = index.find_by_id(2)
275
+ expect(result).not_to be_nil
276
+ expect(result[:title]).to eq("Article Two")
277
+ end
278
+
279
+ it "returns nil for non-existent ID" do
280
+ result = index.find_by_id(999)
281
+ expect(result).to be_nil
282
+ end
283
+ end
284
+
285
+ describe "#articles_in_stream" do
286
+ let(:index) { described_class.new(index_path) }
287
+
288
+ it "returns articles at given byte offset" do
289
+ articles = index.articles_in_stream(100)
290
+ expect(articles.size).to eq(2)
291
+ expect(articles.map { |a| a[:title] }).to include("Article One", "Article Two")
292
+ end
293
+
294
+ it "returns empty array for non-existent offset" do
295
+ articles = index.articles_in_stream(999)
296
+ expect(articles).to eq([])
297
+ end
298
+ end
299
+
300
+ describe "#stream_offset_for" do
301
+ let(:index) { described_class.new(index_path) }
302
+
303
+ it "returns byte offset for article" do
304
+ offset = index.stream_offset_for("Article Three")
305
+ expect(offset).to eq(200)
306
+ end
307
+
308
+ it "returns nil for non-existent title" do
309
+ offset = index.stream_offset_for("Non Existent")
310
+ expect(offset).to be_nil
311
+ end
312
+ end
313
+
314
+ describe "#random_articles" do
315
+ let(:index) { described_class.new(index_path) }
316
+
317
+ it "returns requested number of random articles" do
318
+ articles = index.random_articles(2)
319
+ expect(articles.size).to eq(2)
320
+ end
321
+
322
+ it "returns all articles if count exceeds size" do
323
+ articles = index.random_articles(100)
324
+ expect(articles.size).to eq(4)
325
+ end
326
+ end
327
+
328
+ describe "#first_articles" do
329
+ let(:index) { described_class.new(index_path) }
330
+
331
+ it "returns first N articles" do
332
+ articles = index.first_articles(2)
333
+ expect(articles.size).to eq(2)
334
+ end
335
+ end
336
+
337
+ describe "#stream_offsets" do
338
+ let(:index) { described_class.new(index_path) }
339
+
340
+ it "returns unique sorted offsets" do
341
+ offsets = index.stream_offsets
342
+ expect(offsets).to eq([100, 200])
343
+ end
344
+ end
345
+ end
346
+
347
+ describe Wp2txt::CategoryFetcher do
348
+ let(:fetcher) { described_class.new("en", "Test Category") }
349
+
350
+ describe "#initialize" do
351
+ it "normalizes category name" do
352
+ fetcher = described_class.new("en", "test_category")
353
+ # Category name should be normalized (underscores to spaces)
354
+ expect(fetcher.instance_variable_get(:@category)).to include("test")
355
+ end
356
+
357
+ it "sets default max_depth to 0" do
358
+ expect(fetcher.instance_variable_get(:@max_depth)).to eq(0)
359
+ end
360
+
361
+ it "accepts custom max_depth" do
362
+ fetcher = described_class.new("en", "Test", max_depth: 2)
363
+ expect(fetcher.instance_variable_get(:@max_depth)).to eq(2)
364
+ end
365
+
366
+ it "strips Category: prefix" do
367
+ fetcher = described_class.new("en", "Category:Test")
368
+ expect(fetcher.instance_variable_get(:@category)).to eq("Test")
369
+ end
370
+
371
+ it "accepts different languages" do
372
+ fetcher = described_class.new("ja", "テスト")
373
+ expect(fetcher.instance_variable_get(:@lang)).to eq("ja")
374
+ end
375
+
376
+ it "accepts custom cache_expiry_days" do
377
+ fetcher = described_class.new("en", "Test", cache_expiry_days: 14)
378
+ expect(fetcher.instance_variable_get(:@cache_expiry_days)).to eq(14)
379
+ end
380
+ end
381
+
382
+ describe "#enable_cache" do
383
+ it "sets cache directory" do
384
+ fetcher.enable_cache("/tmp/test_cache")
385
+ expect(fetcher.instance_variable_get(:@cache_dir)).to eq("/tmp/test_cache")
386
+ end
387
+ end
388
+
389
+ describe "cache operations" do
390
+ let(:temp_cache) { Dir.mktmpdir }
391
+ let(:fetcher_with_cache) do
392
+ f = described_class.new("en", "Test Category")
393
+ f.enable_cache(temp_cache)
394
+ f
395
+ end
396
+
397
+ after { FileUtils.rm_rf(temp_cache) if File.exist?(temp_cache) }
398
+
399
+ it "creates CategoryCache when cache enabled" do
400
+ expect(fetcher_with_cache.cache).to be_a(Wp2txt::CategoryCache)
401
+ end
402
+
403
+ it "returns nil for cache when cache disabled" do
404
+ fetcher_no_cache = described_class.new("en", "Test")
405
+ expect(fetcher_no_cache.cache).to be_nil
406
+ end
407
+
408
+ it "uses SQLite-based cache file" do
409
+ fetcher_with_cache.cache
410
+ cache_files = Dir.glob(File.join(temp_cache, "categories_*.sqlite3"))
411
+ expect(cache_files.size).to eq 1
412
+ end
413
+
414
+ it "saves and loads from cache" do
415
+ category = "Cache_Test"
416
+ members = { pages: ["Article1", "Article2"], subcats: ["SubCat1"] }
417
+
418
+ fetcher_with_cache.send(:save_to_cache, category, members)
419
+ loaded = fetcher_with_cache.send(:load_from_cache, category)
420
+
421
+ expect(loaded[:pages]).to contain_exactly("Article1", "Article2")
422
+ expect(loaded[:subcats]).to contain_exactly("SubCat1")
423
+ end
424
+
425
+ it "returns nil for non-existent cache" do
426
+ result = fetcher_with_cache.send(:load_from_cache, "NonExistent")
427
+ expect(result).to be_nil
428
+ end
429
+ end
430
+ end
431
+
432
+ describe Wp2txt::MultistreamReader do
433
+ let(:temp_dir) { Dir.mktmpdir }
434
+ let(:index_path) { File.join(temp_dir, "test-index.txt") }
435
+ let(:multistream_path) { File.join(temp_dir, "test-multistream.xml.bz2") }
436
+
437
+ after { FileUtils.remove_entry(temp_dir) }
438
+
439
+ before do
440
+ # Create a minimal index file
441
+ File.write(index_path, <<~INDEX)
442
+ 100:1:Article One
443
+ 100:2:Article Two
444
+ 200:3:Article Three
445
+ INDEX
446
+ end
447
+
448
+ describe "#initialize" do
449
+ it "creates reader with paths" do
450
+ reader = described_class.new(multistream_path, index_path)
451
+ expect(reader.multistream_path).to eq(multistream_path)
452
+ expect(reader.index).to be_a(Wp2txt::MultistreamIndex)
453
+ end
454
+ end
455
+
456
+ describe "#extract_article" do
457
+ it "returns nil for non-existent article" do
458
+ # Without actual bz2 file, can't extract, but should handle gracefully
459
+ reader = described_class.new(multistream_path, index_path)
460
+ # Will return nil because file doesn't exist
461
+ expect { reader.extract_article("Non Existent") }.not_to raise_error
462
+ end
463
+ end
464
+
465
+ describe "#extract_articles_parallel" do
466
+ it "handles empty titles array" do
467
+ reader = described_class.new(multistream_path, index_path)
468
+ result = reader.extract_articles_parallel([], num_processes: 2)
469
+ expect(result).to eq({})
470
+ end
471
+
472
+ it "handles titles not in index" do
473
+ reader = described_class.new(multistream_path, index_path)
474
+ result = reader.extract_articles_parallel(["Non Existent"], num_processes: 2)
475
+ expect(result).to eq({})
476
+ end
477
+ end
478
+
479
+ describe "#each_article_parallel" do
480
+ it "returns an enumerator when no block given" do
481
+ reader = described_class.new(multistream_path, index_path)
482
+ result = reader.each_article_parallel([], num_processes: 2)
483
+ expect(result).to be_an(Enumerator)
484
+ end
485
+
486
+ it "handles empty entries array" do
487
+ reader = described_class.new(multistream_path, index_path)
488
+ pages = []
489
+ reader.each_article_parallel([], num_processes: 2) { |page| pages << page }
490
+ expect(pages).to eq([])
491
+ end
492
+ end
493
+ end
494
+
495
+ describe "Wp2txt.ssl_safe_get" do
496
+ it "creates HTTP request with SSL verification callback" do
497
+ # Test the structure of ssl_safe_get
498
+ uri = URI("https://example.com/test")
499
+
500
+ # Mock Net::HTTP to verify configuration
501
+ http_mock = instance_double(Net::HTTP)
502
+ allow(Net::HTTP).to receive(:new).and_return(http_mock)
503
+ allow(http_mock).to receive(:use_ssl=)
504
+ allow(http_mock).to receive(:use_ssl?).and_return(true)
505
+ allow(http_mock).to receive(:open_timeout=)
506
+ allow(http_mock).to receive(:read_timeout=)
507
+ allow(http_mock).to receive(:verify_mode=)
508
+ allow(http_mock).to receive(:verify_callback=)
509
+ allow(http_mock).to receive(:request).and_return(Net::HTTPSuccess.new("1.1", "200", "OK"))
510
+
511
+ expect { Wp2txt.ssl_safe_get(uri) }.not_to raise_error
512
+ end
513
+ end
514
+
515
+ describe Wp2txt::DumpManager do
516
+ describe ".default_cache_dir" do
517
+ it "returns default cache directory path" do
518
+ path = described_class.default_cache_dir
519
+ expect(path).to include(".wp2txt/cache")
520
+ end
521
+ end
522
+
523
+ describe ".clear_all_cache!" do
524
+ let(:temp_cache) { Dir.mktmpdir }
525
+
526
+ after { FileUtils.rm_rf(temp_cache) if File.exist?(temp_cache) }
527
+
528
+ it "does not raise error when cache does not exist" do
529
+ expect { described_class.clear_all_cache!("/nonexistent/path") }.not_to raise_error
530
+ end
531
+
532
+ it "removes existing cache directory" do
533
+ FileUtils.mkdir_p(File.join(temp_cache, "subdir"))
534
+ File.write(File.join(temp_cache, "test.txt"), "content")
535
+
536
+ described_class.clear_all_cache!(temp_cache)
537
+
538
+ expect(File.exist?(temp_cache)).to be false
539
+ end
540
+ end
541
+
542
+ describe "#cached_partial_multistream_path" do
543
+ let(:temp_dir) { Dir.mktmpdir }
544
+ let(:manager) { described_class.new("en", cache_dir: temp_dir) }
545
+
546
+ after { FileUtils.remove_entry(temp_dir) }
547
+
548
+ it "includes stream count in filename" do
549
+ path = manager.cached_partial_multistream_path(1000)
550
+ expect(path).to include("1000streams")
551
+ expect(path).to end_with(".xml.bz2")
552
+ end
553
+ end
554
+
555
+ describe "#find_any_partial_cache" do
556
+ let(:temp_dir) { Dir.mktmpdir }
557
+ let(:manager) { described_class.new("en", cache_dir: temp_dir) }
558
+
559
+ after { FileUtils.remove_entry(temp_dir) }
560
+
561
+ context "when no partial exists" do
562
+ it "returns nil" do
563
+ expect(manager.find_any_partial_cache).to be_nil
564
+ end
565
+ end
566
+
567
+ context "when partial dumps exist" do
568
+ before do
569
+ # Create fake partial dump files
570
+ File.write(File.join(temp_dir, "enwiki-20260101-multistream-100streams.xml.bz2"), "BZh9" + "x" * 100)
571
+ File.write(File.join(temp_dir, "enwiki-20260101-multistream-500streams.xml.bz2"), "BZh9" + "x" * 500)
572
+ end
573
+
574
+ it "returns the largest partial by stream count" do
575
+ result = manager.find_any_partial_cache
576
+ expect(result).not_to be_nil
577
+ expect(result[:stream_count]).to eq(500)
578
+ expect(result[:dump_date]).to eq("20260101")
579
+ end
580
+
581
+ it "includes file size and mtime" do
582
+ result = manager.find_any_partial_cache
583
+ expect(result[:size]).to be > 0
584
+ expect(result[:mtime]).to be_a(Time)
585
+ end
586
+ end
587
+
588
+ context "with partials from different dates" do
589
+ before do
590
+ File.write(File.join(temp_dir, "enwiki-20260101-multistream-100streams.xml.bz2"), "BZh9" + "x" * 100)
591
+ File.write(File.join(temp_dir, "enwiki-20260201-multistream-50streams.xml.bz2"), "BZh9" + "x" * 50)
592
+ end
593
+
594
+ it "returns the largest regardless of date" do
595
+ result = manager.find_any_partial_cache
596
+ expect(result[:stream_count]).to eq(100)
597
+ expect(result[:dump_date]).to eq("20260101")
598
+ end
599
+ end
600
+ end
601
+
602
+ describe "#can_resume_from_partial?" do
603
+ let(:temp_dir) { Dir.mktmpdir }
604
+ let(:manager) { described_class.new("en", cache_dir: temp_dir) }
605
+
606
+ after { FileUtils.remove_entry(temp_dir) }
607
+
608
+ context "when partial_info is nil" do
609
+ it "returns not possible with :no_partial reason" do
610
+ result = manager.can_resume_from_partial?(nil)
611
+ expect(result[:possible]).to be false
612
+ expect(result[:reason]).to eq(:no_partial)
613
+ end
614
+ end
615
+
616
+ context "when dump dates don't match" do
617
+ let(:partial_info) do
618
+ {
619
+ path: File.join(temp_dir, "enwiki-20250101-multistream-100streams.xml.bz2"),
620
+ dump_date: "20250101",
621
+ stream_count: 100,
622
+ size: 1000
623
+ }
624
+ end
625
+
626
+ before do
627
+ # Create the file
628
+ File.write(partial_info[:path], "BZh9" + "x" * 100)
629
+ # Stub the latest_dump_date to return a different date
630
+ allow(manager).to receive(:latest_dump_date).and_return("20260101")
631
+ end
632
+
633
+ it "returns not possible with :date_mismatch reason" do
634
+ result = manager.can_resume_from_partial?(partial_info)
635
+ expect(result[:possible]).to be false
636
+ expect(result[:reason]).to eq(:date_mismatch)
637
+ expect(result[:partial_date]).to eq("20250101")
638
+ expect(result[:latest_date]).to eq("20260101")
639
+ end
640
+ end
641
+
642
+ context "when partial file is invalid" do
643
+ let(:partial_info) do
644
+ {
645
+ path: File.join(temp_dir, "enwiki-20260101-multistream-100streams.xml.bz2"),
646
+ dump_date: "20260101",
647
+ stream_count: 100,
648
+ size: 1000
649
+ }
650
+ end
651
+
652
+ before do
653
+ # Create an invalid bz2 file (wrong magic bytes)
654
+ File.write(partial_info[:path], "XXXX" + "x" * 100)
655
+ allow(manager).to receive(:latest_dump_date).and_return("20260101")
656
+ end
657
+
658
+ it "returns not possible with :invalid_partial reason" do
659
+ result = manager.can_resume_from_partial?(partial_info)
660
+ expect(result[:possible]).to be false
661
+ expect(result[:reason]).to eq(:invalid_partial)
662
+ end
663
+ end
664
+ end
665
+
666
+ describe "#get_remote_file_size" do
667
+ let(:temp_dir) { Dir.mktmpdir }
668
+ let(:manager) { described_class.new("en", cache_dir: temp_dir) }
669
+
670
+ after { FileUtils.remove_entry(temp_dir) }
671
+
672
+ it "returns file size from Content-Length header" do
673
+ stub_request(:head, %r{dumps\.wikimedia\.org})
674
+ .to_return(status: 200, headers: { "Content-Length" => "12345678" })
675
+
676
+ allow(manager).to receive(:latest_dump_date).and_return("20260101")
677
+ size = manager.send(:get_remote_file_size, "https://dumps.wikimedia.org/enwiki/20260101/test.xml.bz2")
678
+ expect(size).to eq(12_345_678)
679
+ end
680
+
681
+ it "returns 0 when Content-Length is missing" do
682
+ stub_request(:head, %r{dumps\.wikimedia\.org})
683
+ .to_return(status: 200, headers: {})
684
+
685
+ size = manager.send(:get_remote_file_size, "https://dumps.wikimedia.org/test.xml.bz2")
686
+ expect(size).to eq(0)
687
+ end
688
+ end
689
+ end
690
+ end