wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative "spec_helper"
require "tmpdir"
require_relative "../lib/wp2txt/multistream"
require_relative "../lib/wp2txt/cli"

# Specs around the automatic dump download feature: the cache bookkeeping
# exposed by Wp2txt::DumpManager, the option handling in Wp2txt::CLI, and the
# parsing of comma-separated article lists.
RSpec.describe "Wp2txt Auto Download" do
  include Wp2txt

  describe "DumpManager" do
    # Scratch cache location, unique per test process. Nothing here creates it
    # up front; examples that need files on disk create them explicitly.
    let(:cache_dir) { File.join(Dir.tmpdir, "wp2txt_test_cache_#{Process.pid}") }

    after { FileUtils.rm_rf(cache_dir) if File.exist?(cache_dir) }

    # Builds a manager for the :ja wiki rooted at the scratch cache directory.
    # Extra keyword options are forwarded to the constructor unchanged.
    def build_manager(**extra)
      Wp2txt::DumpManager.new(:ja, cache_dir: cache_dir, **extra)
    end

    # Like build_manager, but with latest_dump_date stubbed to a fixed value
    # so the example avoids a network call.
    def offline_manager(**extra)
      build_manager(**extra).tap do |mgr|
        allow(mgr).to receive(:latest_dump_date).and_return("20260101")
      end
    end

    # Writes a placeholder index file at the manager's cached index path and
    # returns that path.
    def write_fake_index(mgr)
      mgr.cached_index_path.tap do |path|
        FileUtils.mkdir_p(File.dirname(path))
        File.write(path, "test")
      end
    end

    describe ".default_cache_dir" do
      it "returns ~/.wp2txt/cache by default" do
        expect(Wp2txt::DumpManager.default_cache_dir).to eq(File.expand_path("~/.wp2txt/cache"))
      end
    end

    describe "#initialize" do
      it "accepts custom cache directory" do
        expect(build_manager.cache_dir).to eq(cache_dir)
      end

      it "uses default cache directory when not specified" do
        default_manager = Wp2txt::DumpManager.new(:ja)
        expect(default_manager.cache_dir).to eq(Wp2txt::DumpManager.default_cache_dir)
      end
    end

    describe "#cache_status" do
      # Status of a manager whose cache directory has not been populated.
      let(:status) { offline_manager.cache_status }

      it "returns status hash with expected keys" do
        %i[lang cache_dir index_exists multistream_exists age_days mtime expiry_days].each do |key|
          expect(status).to have_key(key)
        end
        expect(status[:lang]).to eq(:ja)
      end

      it "returns false for index_exists when cache is empty" do
        expect(status[:index_exists]).to be false
        expect(status[:multistream_exists]).to be false
      end
    end

    describe "#cache_age_days" do
      it "returns nil when cache does not exist" do
        expect(offline_manager.cache_age_days).to be_nil
      end

      it "returns age in days when cache exists" do
        mgr = offline_manager
        write_fake_index(mgr)

        age = mgr.cache_age_days
        expect(age).to be_a(Float)
        expect(age).to be >= 0
        expect(age).to be < 1 # Just created
      end
    end

    describe "#cache_mtime" do
      it "returns nil when cache does not exist" do
        expect(offline_manager.cache_mtime).to be_nil
      end

      it "returns modification time when cache exists" do
        mgr = offline_manager
        write_fake_index(mgr)

        expect(mgr.cache_mtime).to be_a(Time)
      end
    end

    describe "#cache_stale?" do
      it "returns true when cache does not exist" do
        expect(offline_manager.cache_stale?).to be true
      end

      it "returns false when cache is fresh" do
        mgr = offline_manager(dump_expiry_days: 30)
        write_fake_index(mgr) # just created = fresh

        expect(mgr.cache_stale?).to be false
      end

      it "returns true when cache is older than expiry days" do
        mgr = offline_manager(dump_expiry_days: 1)
        index_path = write_fake_index(mgr)

        # Backdate the file to two days ago, beyond the one-day expiry.
        two_days_ago = Time.now - (2 * 86400)
        File.utime(two_days_ago, two_days_ago, index_path)

        expect(mgr.cache_stale?).to be true
      end
    end

    describe "#clear_cache!" do
      it "removes language-specific cache directory" do
        mgr = build_manager
        lang_dir = File.join(cache_dir, "jawiki")
        FileUtils.mkdir_p(lang_dir)
        File.write(File.join(lang_dir, "test.txt"), "test")

        # The directory must exist beforehand and be gone afterwards.
        expect { mgr.clear_cache! }.to change { File.exist?(lang_dir) }.from(true).to(false)
      end
    end

    describe ".clear_all_cache!" do
      it "removes entire cache directory" do
        ["jawiki", "enwiki"].each { |wiki| FileUtils.mkdir_p(File.join(cache_dir, wiki)) }
        File.write(File.join(cache_dir, "jawiki", "test.txt"), "test")

        Wp2txt::DumpManager.clear_all_cache!(cache_dir)
        expect(File.exist?(cache_dir)).to be false
      end
    end
  end

  describe "Wp2txt::CLI" do
    describe ".valid_language_code?" do
      it "accepts valid 2-letter codes" do
        ["ja", "en", "zh", "de"].each do |code|
          expect(Wp2txt::CLI.valid_language_code?(code)).to be true
        end
      end

      it "accepts valid longer codes" do
        expect(Wp2txt::CLI.valid_language_code?("simple")).to be true
      end

      it "accepts hyphenated codes" do
        expect(Wp2txt::CLI.valid_language_code?("zh-yue")).to be true
      end

      it "rejects invalid codes" do
        ["INVALID", "123", "", nil].each do |code|
          expect(Wp2txt::CLI.valid_language_code?(code)).to be false
        end
      end
    end

    describe ".default_cache_dir" do
      it "returns ~/.wp2txt/cache" do
        expect(Wp2txt::CLI.default_cache_dir).to eq(File.expand_path("~/.wp2txt/cache"))
      end
    end

    describe ".parse_options" do
      # Scratch directory, created before and removed after every example.
      let(:cache_dir) { File.join(Dir.tmpdir, "wp2txt_cli_test_#{Process.pid}") }
      let(:cache_dir_flag) { "--cache-dir=#{cache_dir}" }

      before { FileUtils.mkdir_p(cache_dir) }

      after { FileUtils.rm_rf(cache_dir) if File.exist?(cache_dir) }

      # Runs the CLI option parser on the given command-line arguments.
      def parse(*argv)
        Wp2txt::CLI.parse_options(argv)
      end

      # Expects parsing of argv to end in SystemExit. The call is wrapped in
      # suppress_stderr, a helper defined outside this file (presumably in
      # spec_helper).
      def expect_rejected(*argv)
        suppress_stderr do
          expect { parse(*argv) }.to raise_error(SystemExit)
        end
      end

      # Creates a minimal XML file inside cache_dir and returns its path.
      def write_dummy_input
        File.join(cache_dir, "test.xml").tap { |path| File.write(path, "<test/>") }
      end

      it "accepts --lang option" do
        expect(parse("--lang=ja", cache_dir_flag)[:lang]).to eq("ja")
      end

      it "accepts --cache-dir option" do
        expect(parse("--lang=ja", cache_dir_flag)[:cache_dir]).to eq(cache_dir)
      end

      it "accepts --cache-status option" do
        expect(parse("--cache-status", cache_dir_flag)[:cache_status]).to be true
      end

      it "accepts --cache-clear option" do
        expect(parse("--cache-clear", cache_dir_flag)[:cache_clear]).to be true
      end

      it "allows --cache-status without --input or --lang" do
        parsed = parse("--cache-status", cache_dir_flag)

        expect(parsed[:cache_status]).to be true
        expect(parsed[:input]).to be_nil
        expect(parsed[:lang]).to be_nil
      end

      it "allows --cache-clear without --input or --lang" do
        expect(parse("--cache-clear", cache_dir_flag)[:cache_clear]).to be true
      end

      context "input source validation" do
        it "requires either --input or --lang for normal operation" do
          expect_rejected("--output-dir=#{cache_dir}")
        end

        it "rejects both --input and --lang together" do
          # The dummy input file is written before the parser is invoked.
          expect_rejected("--input=#{write_dummy_input}", "--lang=ja")
        end
      end

      context "--articles option" do
        it "accepts --articles with --lang" do
          expect(parse("--lang=ja", "--articles=認知言語学")[:articles]).to eq("認知言語学")
        end

        it "accepts multiple articles separated by comma" do
          parsed = parse("--lang=ja", "--articles=認知言語学,生成文法,言語学")
          expect(parsed[:articles]).to eq("認知言語学,生成文法,言語学")
        end

        it "requires --lang when --articles is specified" do
          expect_rejected("--articles=Test")
        end

        it "rejects --articles with --input" do
          expect_rejected("--input=#{write_dummy_input}", "--articles=Test")
        end
      end
    end
  end

  describe "Article extraction" do
    describe "Wp2txt::CLI.parse_article_list" do
      it "parses single article" do
        expect(Wp2txt::CLI.parse_article_list("認知言語学")).to eq(["認知言語学"])
      end

      it "parses multiple articles" do
        titles = Wp2txt::CLI.parse_article_list("認知言語学,生成文法,言語学")
        expect(titles).to eq(["認知言語学", "生成文法", "言語学"])
      end

      it "trims whitespace" do
        titles = Wp2txt::CLI.parse_article_list(" 認知言語学 , 生成文法 ")
        expect(titles).to eq(["認知言語学", "生成文法"])
      end

      it "returns empty array for nil" do
        expect(Wp2txt::CLI.parse_article_list(nil)).to eq([])
      end
    end
  end
end
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "spec_helper"
require "tempfile"
require "fileutils"
require "wp2txt/bz2_validator"

# Specs for Wp2txt::Bz2Validator. The fixtures are not real bzip2 archives:
# each is a 4-byte header ("BZ" magic, "h" version, block-size digit) followed
# by filler bytes, which is all the header-level checks below look at.
RSpec.describe Wp2txt::Bz2Validator do
  # Fresh scratch directory per example, removed again in the after hook.
  let(:temp_dir) { Dir.mktmpdir }

  after { FileUtils.rm_rf(temp_dir) }

  describe ".validate" do
    context "with non-existent file" do
      it "returns not_found error" do
        result = described_class.validate("/nonexistent/file.bz2")
        expect(result.valid?).to be false
        expect(result.error_type).to eq(:not_found)
      end
    end

    context "with too small file" do
      let(:small_file) { File.join(temp_dir, "small.bz2") }

      # Header only, no payload at all.
      before { File.write(small_file, "BZh9") }

      it "returns too_small error" do
        result = described_class.validate(small_file)
        expect(result.valid?).to be false
        expect(result.error_type).to eq(:too_small)
      end
    end

    context "with invalid magic bytes" do
      let(:invalid_file) { File.join(temp_dir, "invalid.bz2") }

      # Starts with "XX" instead of the "BZ" magic.
      before { File.write(invalid_file, "XX" + ("x" * 100)) }

      it "returns invalid_magic error" do
        result = described_class.validate(invalid_file)
        expect(result.valid?).to be false
        expect(result.error_type).to eq(:invalid_magic)
      end
    end

    context "with invalid version byte" do
      let(:invalid_version) { File.join(temp_dir, "bad_version.bz2") }

      # Correct magic, but "x" where the "h" version byte belongs.
      before { File.write(invalid_version, "BZx9" + ("x" * 100)) }

      it "returns invalid_version error" do
        result = described_class.validate(invalid_version)
        expect(result.valid?).to be false
        expect(result.error_type).to eq(:invalid_version)
      end
    end

    context "with invalid block size" do
      let(:invalid_block) { File.join(temp_dir, "bad_block.bz2") }

      # Block-size digit "0" lies outside the accepted "1".."9" range.
      before { File.write(invalid_block, "BZh0" + ("x" * 100)) }

      it "returns invalid_block_size error" do
        result = described_class.validate(invalid_block)
        expect(result.valid?).to be false
        expect(result.error_type).to eq(:invalid_block_size)
      end
    end
  end

  describe ".validate_quick" do
    context "with valid header" do
      let(:valid_header_file) { File.join(temp_dir, "valid_header.bz2") }

      before { File.write(valid_header_file, "BZh9" + ("x" * 100)) }

      it "returns valid for correct header" do
        result = described_class.validate_quick(valid_header_file)
        expect(result.valid?).to be true
      end
    end

    context "with invalid header" do
      let(:invalid_file) { File.join(temp_dir, "invalid.bz2") }

      before { File.write(invalid_file, "XXXX" + ("x" * 100)) }

      it "returns invalid" do
        result = described_class.validate_quick(invalid_file)
        expect(result.valid?).to be false
      end
    end
  end

  describe ".validate_magic_bytes" do
    context "with valid bz2 header" do
      let(:valid_file) { File.join(temp_dir, "valid.bz2") }

      before { File.write(valid_file, "BZh9" + ("x" * 100)) }

      it "returns valid result" do
        result = described_class.validate_magic_bytes(valid_file)
        expect(result.valid?).to be true
        expect(result.details[:version]).to eq("h")
        expect(result.details[:block_size]).to eq(9)
      end
    end

    context "with different block sizes" do
      # One generated example per block-size digit.
      (1..9).each do |size|
        it "accepts block size #{size}" do
          file = File.join(temp_dir, "block#{size}.bz2")
          File.write(file, "BZh#{size}" + ("x" * 100))
          result = described_class.validate_magic_bytes(file)
          expect(result.valid?).to be true
        end
      end
    end
  end

  describe ".find_bzip2_command" do
    it "returns a string path or nil" do
      result = described_class.find_bzip2_command
      # Compound matcher rather than a hand-rolled boolean, so that a failure
      # reports the unexpected value instead of "expected true, got false".
      expect(result).to be_nil.or be_a(String)
    end
  end

  describe ".file_info" do
    context "with valid bz2 header" do
      let(:valid_file) { File.join(temp_dir, "info_test.bz2") }

      before { File.write(valid_file, "BZh5" + ("data" * 100)) }

      it "returns file information hash" do
        info = described_class.file_info(valid_file)
        expect(info).to be_a(Hash)
        expect(info[:path]).to eq(valid_file)
        expect(info[:size]).to be > 0
        expect(info[:valid_header]).to be true
        expect(info[:version]).to eq("h")
        expect(info[:block_size]).to eq(5)
        expect(info[:mtime]).to be_a(Time)
      end
    end

    context "with non-existent file" do
      it "returns nil" do
        info = described_class.file_info("/nonexistent/file.bz2")
        expect(info).to be_nil
      end
    end
  end

  describe "ValidationResult" do
    describe "#valid?" do
      it "returns true when valid is true" do
        result = described_class::ValidationResult.new(valid: true)
        expect(result.valid?).to be true
      end

      it "returns false when valid is false" do
        result = described_class::ValidationResult.new(valid: false)
        expect(result.valid?).to be false
      end
    end

    describe "#to_s" do
      it "returns success message for valid result" do
        result = described_class::ValidationResult.new(valid: true)
        expect(result.to_s).to eq("Valid bz2 file")
      end

      it "returns error message for invalid result" do
        result = described_class::ValidationResult.new(valid: false, message: "Test error")
        expect(result.to_s).to eq("Invalid: Test error")
      end
    end
  end

  describe "constants" do
    it "has correct BZ2 magic bytes" do
      expect(described_class::BZ2_MAGIC).to eq("BZ")
    end

    it "has correct BZ2 version" do
      expect(described_class::BZ2_VERSION).to eq("h")
    end

    it "has valid block size range" do
      expect(described_class::BZ2_BLOCK_SIZES).to eq(("1".."9").to_a)
    end
  end
end
|