wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,226 @@
# frozen_string_literal: true

require_relative "spec_helper"
require "tmpdir"
require "fileutils"

# Behavioural specs for Wp2txt::CategoryCache: construction, save/get
# round-trips, recursive page collection, tree rendering, statistics,
# clearing, expiry cleanup and per-language isolation.
#
# Every example works against a fresh temporary directory that is removed
# afterwards, so examples never see each other's cached data.
RSpec.describe Wp2txt::CategoryCache do
  let(:cache_dir) { Dir.mktmpdir("wp2txt_category_cache_test_") }
  let(:cache) { described_class.new("en", cache_dir: cache_dir) }

  after do
    begin
      cache.close
    ensure
      # Remove the temp directory even when closing the cache raises.
      FileUtils.rm_rf(cache_dir)
    end
  end

  # Opens an additional cache inside the example's temp directory, yields it,
  # and closes it even if an expectation inside the block fails.
  #
  # @param lang [String] language code handed to the cache constructor
  # @param options [Hash] extra keyword arguments for the constructor
  # @yieldparam extra_cache [Wp2txt::CategoryCache]
  def with_cache(lang, **options)
    extra_cache = described_class.new(lang, cache_dir: cache_dir, **options)
    yield extra_cache
  ensure
    extra_cache&.close
  end

  describe "#initialize" do
    it "creates cache file" do
      expect(File.exist?(cache.cache_path)).to be true
    end

    it "sets language" do
      expect(cache.lang).to eq "en"
    end

    it "uses default expiry days" do
      expect(cache.expiry_days).to eq Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
    end

    it "accepts custom expiry days" do
      with_cache("ja", expiry_days: 14) do |custom_cache|
        expect(custom_cache.expiry_days).to eq 14
      end
    end
  end

  describe "#save and #get" do
    it "saves and retrieves category data" do
      pages = ["Article 1", "Article 2", "Article 3"]
      subcats = ["Subcategory A", "Subcategory B"]

      cache.save("Test Category", pages, subcats)
      data = cache.get("Test Category")

      expect(data[:pages]).to eq pages
      expect(data[:subcats]).to eq subcats
    end

    it "handles empty pages" do
      cache.save("Empty Pages", [], ["Subcat"])
      data = cache.get("Empty Pages")

      expect(data[:pages]).to eq []
      expect(data[:subcats]).to eq ["Subcat"]
    end

    it "handles empty subcategories" do
      cache.save("No Subcats", ["Article"], [])
      data = cache.get("No Subcats")

      expect(data[:pages]).to eq ["Article"]
      expect(data[:subcats]).to eq []
    end

    it "handles Unicode category names" do
      cache.save("日本の都市", ["東京", "大阪"], ["関東の都市"])
      data = cache.get("日本の都市")

      expect(data[:pages]).to contain_exactly("東京", "大阪")
      expect(data[:subcats]).to contain_exactly("関東の都市")
    end

    it "normalizes category name by removing Category: prefix" do
      # Saved with the namespace prefix, looked up without it.
      cache.save("Category:Test", ["Article"], [])
      data = cache.get("Test")

      expect(data[:pages]).to eq ["Article"]
    end

    it "returns nil for non-existent category" do
      expect(cache.get("Nonexistent")).to be_nil
    end
  end

  describe "#cached?" do
    it "returns true for cached category" do
      cache.save("Cached", ["Article"], [])
      expect(cache.cached?("Cached")).to be true
    end

    it "returns false for non-cached category" do
      expect(cache.cached?("Not Cached")).to be false
    end
  end

  describe "#get_all_pages" do
    # Fixture hierarchy:
    #   Root (Article1, Article2)
    #   |- Child1 (Article3)
    #   |  `- Grandchild (Article6)
    #   `- Child2 (Article4, Article5)
    before do
      cache.save("Root", ["Article1", "Article2"], ["Child1", "Child2"])
      cache.save("Child1", ["Article3"], ["Grandchild"])
      cache.save("Child2", ["Article4", "Article5"], [])
      cache.save("Grandchild", ["Article6"], [])
    end

    it "returns only direct pages with max_depth 0" do
      pages = cache.get_all_pages("Root", max_depth: 0)
      expect(pages).to contain_exactly("Article1", "Article2")
    end

    it "includes subcategory pages with max_depth 1" do
      pages = cache.get_all_pages("Root", max_depth: 1)
      expect(pages).to contain_exactly("Article1", "Article2", "Article3", "Article4", "Article5")
    end

    it "includes all nested pages with sufficient depth" do
      pages = cache.get_all_pages("Root", max_depth: 2)
      expect(pages).to contain_exactly("Article1", "Article2", "Article3", "Article4", "Article5", "Article6")
    end

    it "returns unique pages (no duplicates)" do
      # "SharedArticle" is listed under both the parent and its child.
      cache.save("DupParent", ["SharedArticle"], ["DupChild"])
      cache.save("DupChild", ["SharedArticle", "UniqueArticle"], [])

      pages = cache.get_all_pages("DupParent", max_depth: 1)
      expect(pages.count("SharedArticle")).to eq 1
    end

    it "handles circular references" do
      # The two categories list each other as subcategories; max_depth is
      # far larger than the cycle length.
      cache.save("CircularA", ["A1"], ["CircularB"])
      cache.save("CircularB", ["B1"], ["CircularA"])

      pages = cache.get_all_pages("CircularA", max_depth: 10)
      expect(pages).to contain_exactly("A1", "B1")
    end

    it "returns empty array for non-existent category" do
      expect(cache.get_all_pages("Nonexistent")).to eq []
    end
  end

  describe "#get_tree" do
    before do
      cache.save("Root", ["A1"], ["Child"])
      cache.save("Child", ["C1", "C2"], [])
    end

    it "returns tree structure" do
      tree = cache.get_tree("Root", max_depth: 1)

      expect(tree[:name]).to eq "Root"
      expect(tree[:cached]).to be true
      expect(tree[:page_count]).to eq 1
      expect(tree[:children].size).to eq 1
      expect(tree[:children].first[:name]).to eq "Child"
    end

    it "limits depth" do
      # "Deep" has "Root" as a subcategory, but depth 0 must not descend.
      cache.save("Deep", ["D1"], ["Root"])

      tree = cache.get_tree("Deep", max_depth: 0)
      expect(tree[:children]).to be_empty
    end
  end

  describe "#stats" do
    it "returns cache statistics" do
      cache.save("Cat1", ["A1", "A2"], ["Sub1"])
      cache.save("Cat2", ["A3"], [])

      stats = cache.stats

      expect(stats[:lang]).to eq "en"
      expect(stats[:total_categories]).to eq 2
      expect(stats[:total_pages]).to eq 3
      expect(stats[:total_relations]).to eq 1
      expect(stats[:cache_size]).to be > 0
    end
  end

  describe "#clear!" do
    it "removes all cached data" do
      cache.save("Test", ["Article"], [])
      cache.clear!

      expect(cache.cached?("Test")).to be false
    end
  end

  describe "#cleanup_expired!" do
    it "removes expired entries" do
      # Save a category
      cache.save("Old", ["Article"], [])

      # Back-date the entry's cached_at directly in the database.
      # The spec reaches into @db because it uses no public API for this.
      cache.instance_variable_get(:@db).execute(
        "UPDATE categories SET cached_at = ? WHERE name = ?",
        [Time.now.to_i - (30 * 24 * 3600), "Old"] # 30 days ago
      )

      # Save a fresh category
      cache.save("Fresh", ["NewArticle"], [])

      # Cleanup with the default expiry.
      # NOTE(review): this relies on Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
      # being shorter than the 30 days used above - confirm.
      removed = cache.cleanup_expired!

      expect(removed).to eq 1
      expect(cache.cached?("Old")).to be false
      expect(cache.cached?("Fresh")).to be true
    end
  end

  describe "per-language isolation" do
    it "creates separate cache per language" do
      with_cache("en") do |en_cache|
        with_cache("ja") do |ja_cache|
          # Same category name, different language caches.
          en_cache.save("Cities", ["New York"], [])
          ja_cache.save("Cities", ["東京"], [])

          expect(en_cache.get("Cities")[:pages]).to eq ["New York"]
          expect(ja_cache.get("Cities")[:pages]).to eq ["東京"]
        end
      end
    end
  end
end