wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tempfile"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "zlib"
|
|
7
|
+
require "webmock/rspec"
|
|
8
|
+
|
|
9
|
+
# Specs for the multistream support code: Wp2txt::DumpManager,
# Wp2txt::MultistreamIndex, Wp2txt::CategoryFetcher, Wp2txt::MultistreamReader
# and the Wp2txt.ssl_safe_get helper.
#
# External HTTP is blocked through WebMock for every example; only the
# dumps.wikimedia.org listing page is stubbed by default.
RSpec.describe "Wp2txt Multistream" do
  before do
    # Ensure WebMock is enabled (may be disabled by other specs)
    WebMock.enable!
    # Allow localhost connections, stub external
    WebMock.disable_net_connect!(allow_localhost: true)

    # Stub Wikipedia dump listing page
    stub_request(:get, %r{dumps\.wikimedia\.org})
      .to_return(status: 200, body: '<a href="20260101/">20260101/</a>')
  end

  after do
    # Re-open the network so specs that run afterwards are not affected.
    WebMock.allow_net_connect!
  end

  # Cache bookkeeping and download-metadata helpers, run against a throwaway
  # cache directory.
  describe Wp2txt::DumpManager do
    let(:temp_dir) { Dir.mktmpdir }
    let(:manager) { described_class.new("en", cache_dir: temp_dir) }

    after { FileUtils.remove_entry(temp_dir) }

    # Writes a placeholder index file at the manager's cached index path so
    # the "cache exists" contexts have something to inspect.
    def create_cached_index
      FileUtils.mkdir_p(File.dirname(manager.cached_index_path))
      File.write(manager.cached_index_path, "test")
    end

    describe "#format_size" do
      it "formats bytes" do
        expect(manager.send(:format_size, 500)).to eq("500 B")
      end

      it "formats kilobytes" do
        expect(manager.send(:format_size, 2048)).to eq("2.0 KB")
      end

      it "formats megabytes" do
        expect(manager.send(:format_size, 5_242_880)).to eq("5.0 MB")
      end

      it "formats gigabytes" do
        expect(manager.send(:format_size, 2_147_483_648)).to eq("2.0 GB")
      end
    end

    describe "#cached_index_path" do
      it "returns correct path format" do
        path = manager.cached_index_path
        expect(path).to include("enwiki")
        expect(path).to include("index")
      end
    end

    describe "#cached_multistream_path" do
      it "returns correct path format" do
        path = manager.cached_multistream_path
        expect(path).to include("enwiki")
        expect(path).to end_with(".xml.bz2")
      end
    end

    describe "#cache_fresh?" do
      context "when cache does not exist" do
        it "returns false" do
          expect(manager.cache_fresh?).to be false
        end
      end

      context "when cache exists and is fresh" do
        before { create_cached_index }

        it "returns true" do
          expect(manager.cache_fresh?(30)).to be true
        end
      end
    end

    describe "#cache_stale?" do
      context "when cache does not exist" do
        it "returns true" do
          expect(manager.cache_stale?).to be true
        end
      end
    end

    describe "#cache_age_days" do
      context "when cache does not exist" do
        it "returns nil" do
          expect(manager.cache_age_days).to be_nil
        end
      end

      context "when cache exists" do
        before { create_cached_index }

        it "returns age in days" do
          # The file was written moments ago, so the age must be a fraction of a day.
          age = manager.cache_age_days
          expect(age).to be_a(Float)
          expect(age).to be >= 0
          expect(age).to be < 1
        end
      end
    end

    describe "#cache_mtime" do
      context "when cache does not exist" do
        it "returns nil" do
          expect(manager.cache_mtime).to be_nil
        end
      end

      context "when cache exists" do
        before { create_cached_index }

        it "returns Time object" do
          expect(manager.cache_mtime).to be_a(Time)
        end
      end
    end

    describe "#cache_status" do
      context "when cache is empty" do
        it "returns status hash with zero sizes" do
          status = manager.cache_status
          expect(status[:index_size]).to eq(0)
          expect(status[:multistream_size]).to eq(0)
          expect(status[:fresh]).to be false
        end
      end
    end

    describe "#clear_cache!" do
      it "does not raise error when no cache exists" do
        expect { manager.clear_cache! }.not_to raise_error
      end
    end

    describe ".all_cache_status" do
      it "returns hash of all cached languages" do
        status = described_class.all_cache_status(temp_dir)
        expect(status).to be_a(Hash)
      end
    end

    describe "#find_suitable_partial_cache" do
      context "when no partial cache exists" do
        it "returns nil" do
          expect(manager.find_suitable_partial_cache(100)).to be_nil
        end
      end
    end

    describe "resumable download support" do
      let(:test_file_path) { File.join(temp_dir, "test_download.bz2") }
      let(:test_url) { "https://example.com/test.bz2" }

      describe "#download_meta_path" do
        it "returns path with .wp2txt_download suffix" do
          path = manager.send(:download_meta_path, test_file_path)
          expect(path).to eq("#{test_file_path}.wp2txt_download")
        end
      end

      describe "#save_download_meta and #load_download_meta" do
        let(:remote_info) do
          {
            size: 1_000_000,
            etag: '"abc123"',
            # 1 January 2026 falls on a Thursday; keep the HTTP-date well formed.
            last_modified: "Thu, 01 Jan 2026 00:00:00 GMT"
          }
        end

        it "saves and loads metadata correctly" do
          manager.send(:save_download_meta, test_file_path, test_url, remote_info)
          loaded = manager.send(:load_download_meta, test_file_path)

          expect(loaded[:url]).to eq(test_url)
          expect(loaded[:size]).to eq(remote_info[:size])
          expect(loaded[:etag]).to eq(remote_info[:etag])
          expect(loaded[:last_modified]).to eq(remote_info[:last_modified])
          expect(loaded[:started_at]).not_to be_nil
        end
      end

      describe "#cleanup_download_meta" do
        it "removes metadata file" do
          meta_path = manager.send(:download_meta_path, test_file_path)
          File.write(meta_path, "{}")

          expect(File.exist?(meta_path)).to be true
          manager.send(:cleanup_download_meta, test_file_path)
          expect(File.exist?(meta_path)).to be false
        end
      end

      describe "#load_download_meta" do
        it "returns nil when file does not exist" do
          result = manager.send(:load_download_meta, "/nonexistent/path")
          expect(result).to be_nil
        end

        it "returns nil for invalid JSON" do
          meta_path = manager.send(:download_meta_path, test_file_path)
          File.write(meta_path, "invalid json {{{")

          result = manager.send(:load_download_meta, test_file_path)
          expect(result).to be_nil
        end
      end
    end
  end

  # Index lookups against a small hand-written index file. Each fixture line
  # is "offset:page_id:title", which is how the expectations below read it.
  describe Wp2txt::MultistreamIndex do
    let(:temp_dir) { Dir.mktmpdir }
    let(:index_path) { File.join(temp_dir, "test-index.txt") }
    let(:index) { described_class.new(index_path) }

    before do
      # Create a minimal index file
      File.write(index_path, <<~INDEX)
        100:1:Article One
        100:2:Article Two
        200:3:Article Three
        200:4:日本語記事
      INDEX
    end

    after { FileUtils.remove_entry(temp_dir) }

    describe "#initialize" do
      it "loads the index file" do
        expect(index.size).to eq(4)
      end
    end

    describe "#find_by_title" do
      it "finds article by exact title" do
        result = index.find_by_title("Article One")
        expect(result).not_to be_nil
        expect(result[:title]).to eq("Article One")
        expect(result[:offset]).to eq(100)
        expect(result[:page_id]).to eq(1)
      end

      it "finds Japanese article" do
        result = index.find_by_title("日本語記事")
        expect(result).not_to be_nil
        expect(result[:title]).to eq("日本語記事")
      end

      it "returns nil for non-existent title" do
        result = index.find_by_title("Non Existent")
        expect(result).to be_nil
      end
    end

    describe "#find_by_id" do
      it "finds article by page ID" do
        result = index.find_by_id(2)
        expect(result).not_to be_nil
        expect(result[:title]).to eq("Article Two")
      end

      it "returns nil for non-existent ID" do
        result = index.find_by_id(999)
        expect(result).to be_nil
      end
    end

    describe "#articles_in_stream" do
      it "returns articles at given byte offset" do
        articles = index.articles_in_stream(100)
        expect(articles.size).to eq(2)
        expect(articles.map { |a| a[:title] }).to include("Article One", "Article Two")
      end

      it "returns empty array for non-existent offset" do
        articles = index.articles_in_stream(999)
        expect(articles).to eq([])
      end
    end

    describe "#stream_offset_for" do
      it "returns byte offset for article" do
        offset = index.stream_offset_for("Article Three")
        expect(offset).to eq(200)
      end

      it "returns nil for non-existent title" do
        offset = index.stream_offset_for("Non Existent")
        expect(offset).to be_nil
      end
    end

    describe "#random_articles" do
      it "returns requested number of random articles" do
        articles = index.random_articles(2)
        expect(articles.size).to eq(2)
      end

      it "returns all articles if count exceeds size" do
        articles = index.random_articles(100)
        expect(articles.size).to eq(4)
      end
    end

    describe "#first_articles" do
      it "returns first N articles" do
        articles = index.first_articles(2)
        expect(articles.size).to eq(2)
      end
    end

    describe "#stream_offsets" do
      it "returns unique sorted offsets" do
        offsets = index.stream_offsets
        expect(offsets).to eq([100, 200])
      end
    end
  end

  # Constructor options and cache round-trips. No category API request is
  # issued by these examples.
  describe Wp2txt::CategoryFetcher do
    let(:fetcher) { described_class.new("en", "Test Category") }

    describe "#initialize" do
      it "normalizes category name" do
        fetcher = described_class.new("en", "test_category")
        # Category name should be normalized (underscores to spaces)
        # NOTE(review): this only checks that "test" survives; the exact
        # normalized form is not pinned here -- confirm and tighten if desired.
        expect(fetcher.instance_variable_get(:@category)).to include("test")
      end

      it "sets default max_depth to 0" do
        expect(fetcher.instance_variable_get(:@max_depth)).to eq(0)
      end

      it "accepts custom max_depth" do
        fetcher = described_class.new("en", "Test", max_depth: 2)
        expect(fetcher.instance_variable_get(:@max_depth)).to eq(2)
      end

      it "strips Category: prefix" do
        fetcher = described_class.new("en", "Category:Test")
        expect(fetcher.instance_variable_get(:@category)).to eq("Test")
      end

      it "accepts different languages" do
        fetcher = described_class.new("ja", "テスト")
        expect(fetcher.instance_variable_get(:@lang)).to eq("ja")
      end

      it "accepts custom cache_expiry_days" do
        fetcher = described_class.new("en", "Test", cache_expiry_days: 14)
        expect(fetcher.instance_variable_get(:@cache_expiry_days)).to eq(14)
      end
    end

    describe "#enable_cache" do
      it "sets cache directory" do
        # Use a throwaway directory instead of a fixed shared /tmp path so the
        # example cannot collide with, or leave files behind for, other runs.
        Dir.mktmpdir do |cache_dir|
          fetcher.enable_cache(cache_dir)
          expect(fetcher.instance_variable_get(:@cache_dir)).to eq(cache_dir)
        end
      end
    end

    describe "cache operations" do
      let(:temp_cache) { Dir.mktmpdir }
      let(:fetcher_with_cache) do
        described_class.new("en", "Test Category").tap { |f| f.enable_cache(temp_cache) }
      end

      # rm_rf already tolerates a missing path, so no existence guard is needed.
      after { FileUtils.rm_rf(temp_cache) }

      it "creates CategoryCache when cache enabled" do
        expect(fetcher_with_cache.cache).to be_a(Wp2txt::CategoryCache)
      end

      it "returns nil for cache when cache disabled" do
        fetcher_no_cache = described_class.new("en", "Test")
        expect(fetcher_no_cache.cache).to be_nil
      end

      it "uses SQLite-based cache file" do
        # Touch the cache accessor first, then look for the database file.
        fetcher_with_cache.cache
        cache_files = Dir.glob(File.join(temp_cache, "categories_*.sqlite3"))
        expect(cache_files.size).to eq 1
      end

      it "saves and loads from cache" do
        category = "Cache_Test"
        members = { pages: ["Article1", "Article2"], subcats: ["SubCat1"] }

        fetcher_with_cache.send(:save_to_cache, category, members)
        loaded = fetcher_with_cache.send(:load_from_cache, category)

        expect(loaded[:pages]).to contain_exactly("Article1", "Article2")
        expect(loaded[:subcats]).to contain_exactly("SubCat1")
      end

      it "returns nil for non-existent cache" do
        result = fetcher_with_cache.send(:load_from_cache, "NonExistent")
        expect(result).to be_nil
      end
    end
  end

  # Reader behaviour when only the index exists; the multistream .bz2 file is
  # intentionally never created in these examples.
  describe Wp2txt::MultistreamReader do
    let(:temp_dir) { Dir.mktmpdir }
    let(:index_path) { File.join(temp_dir, "test-index.txt") }
    let(:multistream_path) { File.join(temp_dir, "test-multistream.xml.bz2") }
    let(:reader) { described_class.new(multistream_path, index_path) }

    before do
      # Create a minimal index file
      File.write(index_path, <<~INDEX)
        100:1:Article One
        100:2:Article Two
        200:3:Article Three
      INDEX
    end

    after { FileUtils.remove_entry(temp_dir) }

    describe "#initialize" do
      it "creates reader with paths" do
        expect(reader.multistream_path).to eq(multistream_path)
        expect(reader.index).to be_a(Wp2txt::MultistreamIndex)
      end
    end

    describe "#extract_article" do
      it "returns nil for non-existent article" do
        # Without actual bz2 file, can't extract, but should handle gracefully.
        # NOTE(review): the description promises nil, but only the absence of
        # an exception is asserted -- confirm the return value and tighten.
        expect { reader.extract_article("Non Existent") }.not_to raise_error
      end
    end

    describe "#extract_articles_parallel" do
      it "handles empty titles array" do
        result = reader.extract_articles_parallel([], num_processes: 2)
        expect(result).to eq({})
      end

      it "handles titles not in index" do
        result = reader.extract_articles_parallel(["Non Existent"], num_processes: 2)
        expect(result).to eq({})
      end
    end

    describe "#each_article_parallel" do
      it "returns an enumerator when no block given" do
        result = reader.each_article_parallel([], num_processes: 2)
        expect(result).to be_an(Enumerator)
      end

      it "handles empty entries array" do
        pages = []
        reader.each_article_parallel([], num_processes: 2) { |page| pages << page }
        expect(pages).to eq([])
      end
    end
  end

  describe "Wp2txt.ssl_safe_get" do
    it "creates HTTP request with SSL verification callback" do
      # Test the structure of ssl_safe_get
      uri = URI("https://example.com/test")

      # Mock Net::HTTP to verify configuration; every setter the helper may
      # call is allowed so the call can run without opening a connection.
      http_mock = instance_double(Net::HTTP)
      allow(Net::HTTP).to receive(:new).and_return(http_mock)
      allow(http_mock).to receive(:use_ssl=)
      allow(http_mock).to receive(:use_ssl?).and_return(true)
      allow(http_mock).to receive(:open_timeout=)
      allow(http_mock).to receive(:read_timeout=)
      allow(http_mock).to receive(:verify_mode=)
      allow(http_mock).to receive(:verify_callback=)
      allow(http_mock).to receive(:request).and_return(Net::HTTPSuccess.new("1.1", "200", "OK"))

      expect { Wp2txt.ssl_safe_get(uri) }.not_to raise_error
    end
  end

  # Class-level cache helpers plus partial-download discovery and resume checks.
  describe Wp2txt::DumpManager do
    # Shared by every instance-level group below. The class-level examples do
    # not reference them; for those the `after` hook just creates and removes
    # an empty directory.
    let(:temp_dir) { Dir.mktmpdir }
    let(:manager) { described_class.new("en", cache_dir: temp_dir) }

    after { FileUtils.remove_entry(temp_dir) }

    # Writes a fake partial dump named like
    # "enwiki-<date>-multistream-<n>streams.xml.bz2" into temp_dir. The
    # payload starts with "BZh9" and is padded with one "x" per stream.
    def write_partial(dump_date, stream_count)
      name = "enwiki-#{dump_date}-multistream-#{stream_count}streams.xml.bz2"
      File.write(File.join(temp_dir, name), "BZh9" + "x" * stream_count)
    end

    describe ".default_cache_dir" do
      it "returns default cache directory path" do
        path = described_class.default_cache_dir
        expect(path).to include(".wp2txt/cache")
      end
    end

    describe ".clear_all_cache!" do
      let(:temp_cache) { Dir.mktmpdir }

      # rm_rf already tolerates a missing path, so no existence guard is needed.
      after { FileUtils.rm_rf(temp_cache) }

      it "does not raise error when cache does not exist" do
        expect { described_class.clear_all_cache!("/nonexistent/path") }.not_to raise_error
      end

      it "removes existing cache directory" do
        FileUtils.mkdir_p(File.join(temp_cache, "subdir"))
        File.write(File.join(temp_cache, "test.txt"), "content")

        described_class.clear_all_cache!(temp_cache)

        expect(File.exist?(temp_cache)).to be false
      end
    end

    describe "#cached_partial_multistream_path" do
      it "includes stream count in filename" do
        path = manager.cached_partial_multistream_path(1000)
        expect(path).to include("1000streams")
        expect(path).to end_with(".xml.bz2")
      end
    end

    describe "#find_any_partial_cache" do
      context "when no partial exists" do
        it "returns nil" do
          expect(manager.find_any_partial_cache).to be_nil
        end
      end

      context "when partial dumps exist" do
        before do
          # Create fake partial dump files
          write_partial("20260101", 100)
          write_partial("20260101", 500)
        end

        it "returns the largest partial by stream count" do
          result = manager.find_any_partial_cache
          expect(result).not_to be_nil
          expect(result[:stream_count]).to eq(500)
          expect(result[:dump_date]).to eq("20260101")
        end

        it "includes file size and mtime" do
          result = manager.find_any_partial_cache
          expect(result[:size]).to be > 0
          expect(result[:mtime]).to be_a(Time)
        end
      end

      context "with partials from different dates" do
        before do
          write_partial("20260101", 100)
          write_partial("20260201", 50)
        end

        it "returns the largest regardless of date" do
          # The older dump has more streams, so it wins over the newer one.
          result = manager.find_any_partial_cache
          expect(result[:stream_count]).to eq(100)
          expect(result[:dump_date]).to eq("20260101")
        end
      end
    end

    describe "#can_resume_from_partial?" do
      context "when partial_info is nil" do
        it "returns not possible with :no_partial reason" do
          result = manager.can_resume_from_partial?(nil)
          expect(result[:possible]).to be false
          expect(result[:reason]).to eq(:no_partial)
        end
      end

      context "when dump dates don't match" do
        let(:partial_info) do
          {
            path: File.join(temp_dir, "enwiki-20250101-multistream-100streams.xml.bz2"),
            dump_date: "20250101",
            stream_count: 100,
            size: 1000
          }
        end

        before do
          # Create the file
          File.write(partial_info[:path], "BZh9" + "x" * 100)
          # Stub the latest_dump_date to return a different date
          allow(manager).to receive(:latest_dump_date).and_return("20260101")
        end

        it "returns not possible with :date_mismatch reason" do
          result = manager.can_resume_from_partial?(partial_info)
          expect(result[:possible]).to be false
          expect(result[:reason]).to eq(:date_mismatch)
          expect(result[:partial_date]).to eq("20250101")
          expect(result[:latest_date]).to eq("20260101")
        end
      end

      context "when partial file is invalid" do
        let(:partial_info) do
          {
            path: File.join(temp_dir, "enwiki-20260101-multistream-100streams.xml.bz2"),
            dump_date: "20260101",
            stream_count: 100,
            size: 1000
          }
        end

        before do
          # Create an invalid bz2 file (wrong magic bytes)
          File.write(partial_info[:path], "XXXX" + "x" * 100)
          # Dates match here, so only the file contents can cause the rejection.
          allow(manager).to receive(:latest_dump_date).and_return("20260101")
        end

        it "returns not possible with :invalid_partial reason" do
          result = manager.can_resume_from_partial?(partial_info)
          expect(result[:possible]).to be false
          expect(result[:reason]).to eq(:invalid_partial)
        end
      end
    end

    describe "#get_remote_file_size" do
      it "returns file size from Content-Length header" do
        stub_request(:head, %r{dumps\.wikimedia\.org})
          .to_return(status: 200, headers: { "Content-Length" => "12345678" })

        allow(manager).to receive(:latest_dump_date).and_return("20260101")
        size = manager.send(:get_remote_file_size, "https://dumps.wikimedia.org/enwiki/20260101/test.xml.bz2")
        expect(size).to eq(12_345_678)
      end

      it "returns 0 when Content-Length is missing" do
        stub_request(:head, %r{dumps\.wikimedia\.org})
          .to_return(status: 200, headers: {})

        size = manager.send(:get_remote_file_size, "https://dumps.wikimedia.org/test.xml.bz2")
        expect(size).to eq(0)
      end
    end
  end
end
|