wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
# frozen_string_literal: true

require_relative "spec_helper"
require "tmpdir"
require "fileutils"

# Specs for Wp2txt::IndexCache — a SQLite-backed cache that maps article
# titles/page IDs to byte offsets within a multistream dump, keyed to a
# specific source index file. Each example runs against a throwaway temp
# directory so no state leaks between examples.
RSpec.describe Wp2txt::IndexCache do
  let(:cache_dir) { Dir.mktmpdir("wp2txt_index_cache_test_") }
  let(:source_file) { File.join(cache_dir, "test-index.txt") }
  let(:cache) { described_class.new(source_file, cache_dir: cache_dir) }

  before do
    # Create a dummy source file; the cache validates itself against this
    # file's size/mtime, so it must exist before most examples run.
    File.write(source_file, "test content")
  end

  after do
    FileUtils.rm_rf(cache_dir)
  end

  describe "#initialize" do
    it "builds cache path from source file" do
      # Cache filename is derived from the source basename with a .sqlite3 ext.
      expect(cache.cache_path).to include("test")
      expect(cache.cache_path).to end_with(".sqlite3")
    end

    it "stores source path" do
      expect(cache.source_path).to eq source_file
    end
  end

  describe "#valid?" do
    it "returns false when cache does not exist" do
      expect(cache.valid?).to be false
    end

    it "returns false when source file does not exist" do
      FileUtils.rm_f(source_file)
      expect(cache.valid?).to be false
    end

    context "with saved cache" do
      before do
        entries = {
          "Article 1" => { offset: 1000, page_id: 1, title: "Article 1" },
          "Article 2" => { offset: 2000, page_id: 2, title: "Article 2" }
        }
        cache.save(entries, [0, 1000, 2000])
      end

      it "returns true for valid cache" do
        expect(cache.valid?).to be true
      end

      it "returns false when source file changes" do
        # Rewrite the source with different (longer) content, then push its
        # mtime forward explicitly. Using File.utime instead of sleep makes
        # invalidation deterministic even on filesystems with coarse
        # (1-second) mtime granularity, and keeps the example fast.
        File.write(source_file, "modified content that is longer")
        future = Time.now + 2
        File.utime(future, future, source_file)
        expect(cache.valid?).to be false
      end
    end
  end

  describe "#save and #load" do
    let(:entries) do
      {
        "Article A" => { offset: 100, page_id: 1, title: "Article A" },
        "Article B" => { offset: 200, page_id: 2, title: "Article B" },
        "Article C" => { offset: 300, page_id: 3, title: "Article C" }
      }
    end
    let(:stream_offsets) { [0, 100, 200, 300] }

    it "saves and loads entries" do
      cache.save(entries, stream_offsets)

      loaded = cache.load
      expect(loaded[:entries_by_title].size).to eq 3
      expect(loaded[:entries_by_title]["Article A"][:offset]).to eq 100
      expect(loaded[:entries_by_title]["Article B"][:page_id]).to eq 2
    end

    it "saves and loads stream offsets" do
      cache.save(entries, stream_offsets)

      loaded = cache.load
      expect(loaded[:stream_offsets]).to eq stream_offsets
    end

    it "loads entries by ID" do
      cache.save(entries, stream_offsets)

      # The cache exposes a second lookup keyed by page_id.
      loaded = cache.load
      expect(loaded[:entries_by_id][1][:title]).to eq "Article A"
      expect(loaded[:entries_by_id][2][:title]).to eq "Article B"
    end

    it "returns nil when cache is invalid" do
      # Nothing has been saved yet, so #load must signal a miss with nil.
      expect(cache.load).to be_nil
    end

    it "handles large number of entries" do
      large_entries = {}
      10_000.times do |i|
        large_entries["Article #{i}"] = { offset: i * 1000, page_id: i, title: "Article #{i}" }
      end

      cache.save(large_entries, [0])
      loaded = cache.load
      expect(loaded[:entries_by_title].size).to eq 10_000
    end

    it "handles Unicode titles" do
      # Non-ASCII titles (CJK, Cyrillic, Arabic) must round-trip intact.
      unicode_entries = {
        "東京" => { offset: 100, page_id: 1, title: "東京" },
        "Москва" => { offset: 200, page_id: 2, title: "Москва" },
        "القاهرة" => { offset: 300, page_id: 3, title: "القاهرة" }
      }

      cache.save(unicode_entries, [0])
      loaded = cache.load
      expect(loaded[:entries_by_title]["東京"][:offset]).to eq 100
      expect(loaded[:entries_by_title]["Москва"][:offset]).to eq 200
    end
  end

  describe "#find_by_titles" do
    before do
      entries = {
        "Article 1" => { offset: 100, page_id: 1, title: "Article 1" },
        "Article 2" => { offset: 200, page_id: 2, title: "Article 2" },
        "Article 3" => { offset: 300, page_id: 3, title: "Article 3" }
      }
      cache.save(entries, [0])
    end

    it "finds existing titles" do
      results = cache.find_by_titles(["Article 1", "Article 3"])
      expect(results.size).to eq 2
      expect(results["Article 1"][:offset]).to eq 100
      expect(results["Article 3"][:offset]).to eq 300
    end

    it "ignores non-existent titles" do
      # Missing titles are silently dropped rather than raising or mapping to nil.
      results = cache.find_by_titles(["Article 1", "Nonexistent"])
      expect(results.size).to eq 1
      expect(results).to have_key("Article 1")
      expect(results).not_to have_key("Nonexistent")
    end

    it "returns empty hash for empty input" do
      expect(cache.find_by_titles([])).to eq({})
    end

    it "returns empty hash when cache is invalid" do
      cache.clear!
      expect(cache.find_by_titles(["Article 1"])).to eq({})
    end
  end

  describe "#stats" do
    it "returns cache statistics" do
      entries = {
        "Article 1" => { offset: 100, page_id: 1, title: "Article 1" },
        "Article 2" => { offset: 200, page_id: 2, title: "Article 2" }
      }
      cache.save(entries, [0, 100, 200])

      stats = cache.stats
      expect(stats[:cache_path]).to eq cache.cache_path
      expect(stats[:entry_count]).to eq 2
      expect(stats[:stream_count]).to eq 3
      expect(stats[:cache_size]).to be > 0
    end

    it "returns nil when cache does not exist" do
      expect(cache.stats).to be_nil
    end
  end

  describe "#clear!" do
    it "removes cache file" do
      entries = { "Test" => { offset: 100, page_id: 1, title: "Test" } }
      cache.save(entries, [0])

      expect(File.exist?(cache.cache_path)).to be true
      cache.clear!
      expect(File.exist?(cache.cache_path)).to be false
    end
  end

  describe "concurrent access" do
    it "handles multiple readers" do
      entries = { "Test" => { offset: 100, page_id: 1, title: "Test" } }
      cache.save(entries, [0])

      # Simulate multiple readers: each thread opens its own IndexCache
      # instance against the same file; Thread#value joins and re-raises
      # any error from inside the thread.
      results = 3.times.map do
        Thread.new do
          c = described_class.new(source_file, cache_dir: cache_dir)
          c.load
        end
      end.map(&:value)

      results.each do |result|
        expect(result[:entries_by_title]).to have_key("Test")
      end
    end
  end
end