wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require "webmock/rspec"
|
|
5
|
+
require "tmpdir"
|
|
6
|
+
require "fileutils"
|
|
7
|
+
|
|
8
|
+
# Specs for Wp2txt::CategoryFetcher. Every HTTP request is intercepted by
# WebMock, so no example touches the network.
RSpec.describe Wp2txt::CategoryFetcher do
  let(:lang) { "en" }
  let(:category) { "Japanese cities" }
  let(:fetcher) { described_class.new(lang, category) }

  # Builds a categorymembers entry in the article namespace (ns 0).
  def article_member(title)
    { ns: 0, title: title }
  end

  # Builds a categorymembers entry in the category namespace (ns 14).
  def subcategory_member(title)
    { ns: 14, title: title }
  end

  # Builds the WebMock response hash for a successful categorymembers query.
  # When +cmcontinue+ is given, a continuation token is appended so the
  # response looks like one page of a paginated result.
  def api_response(members, cmcontinue: nil)
    payload = { query: { categorymembers: members } }
    payload[:continue] = { cmcontinue: cmcontinue } if cmcontinue
    { status: 200, body: payload.to_json }
  end

  # Stubs every GET request matching +url_pattern+ to return +members+.
  def stub_category(url_pattern, members)
    stub_request(:get, url_pattern).to_return(api_response(members))
  end

  before do
    WebMock.enable!
    WebMock.disable_net_connect!
  end

  after do
    WebMock.disable!
  end

  describe "#initialize" do
    it "accepts language and category name" do
      expect(fetcher.lang).to eq "en"
      expect(fetcher.category).to eq "Japanese cities"
    end

    it "normalizes category name by removing Category: prefix" do
      f = described_class.new("en", "Category:Test Category")
      expect(f.category).to eq "Test Category"
    end

    it "handles lowercase category prefix" do
      f = described_class.new("en", "category:Another Test")
      expect(f.category).to eq "Another Test"
    end

    it "trims whitespace from category name" do
      f = described_class.new("en", " Test Category ")
      expect(f.category).to eq "Test Category"
    end

    it "defaults max_depth to 0" do
      expect(fetcher.max_depth).to eq 0
    end

    it "accepts custom max_depth" do
      f = described_class.new("ja", "Test", max_depth: 3)
      expect(f.max_depth).to eq 3
    end
  end

  describe "#fetch_preview" do
    it "returns statistics without full article list" do
      stub_category(
        /en\.wikipedia\.org/,
        [article_member("Tokyo"), article_member("Osaka"), article_member("Kyoto")]
      )

      preview = fetcher.fetch_preview

      expect(preview[:category]).to eq "Japanese cities"
      expect(preview[:depth]).to eq 0
      expect(preview[:total_articles]).to eq 3
      expect(preview[:subcategories]).to be_an(Array)
    end

    it "includes subcategory statistics when depth > 0" do
      f = described_class.new("en", "Japanese cities", max_depth: 1)

      # Parent category
      stub_category(
        /cmtitle=Category:Japanese%20cities/,
        [article_member("Tokyo"), subcategory_member("Category:Cities in Kanto")]
      )

      # Subcategory
      stub_category(
        /cmtitle=Category:Cities%20in%20Kanto/,
        [article_member("Yokohama"), article_member("Chiba")]
      )

      preview = f.fetch_preview

      expect(preview[:total_articles]).to eq 3
      expect(preview[:total_subcategories]).to eq 1
      expect(preview[:subcategories].size).to eq 2
    end
  end

  describe "#fetch_articles" do
    it "fetches articles from single page response" do
      stub_category(
        /en\.wikipedia\.org/,
        [article_member("Tokyo"), article_member("Osaka"), article_member("Kyoto")]
      )

      articles = fetcher.fetch_articles
      expect(articles).to contain_exactly("Tokyo", "Osaka", "Kyoto")
    end

    it "handles pagination with cmcontinue token" do
      # First request: no continuation token in the query string.
      stub_request(:get, /en\.wikipedia\.org/)
        .with(query: hash_excluding("cmcontinue"))
        .to_return(
          api_response(
            [article_member("Article1"), article_member("Article2")],
            cmcontinue: "page2token"
          )
        )

      # Second request: carries the token returned by the first page.
      stub_request(:get, /en\.wikipedia\.org/)
        .with(query: hash_including("cmcontinue" => "page2token"))
        .to_return(api_response([article_member("Article3")]))

      articles = fetcher.fetch_articles
      expect(articles).to contain_exactly("Article1", "Article2", "Article3")
    end

    it "returns unique articles when duplicates exist" do
      stub_category(
        /en\.wikipedia\.org/,
        [article_member("Tokyo"), article_member("Tokyo"), article_member("Osaka")]
      )

      articles = fetcher.fetch_articles
      expect(articles).to contain_exactly("Tokyo", "Osaka")
    end

    it "returns empty array for non-existent category" do
      stub_category(/en\.wikipedia\.org/, [])

      articles = fetcher.fetch_articles
      expect(articles).to be_empty
    end

    it "handles API errors gracefully" do
      stub_request(:get, /en\.wikipedia\.org/)
        .to_return(status: 500)

      articles = fetcher.fetch_articles
      expect(articles).to be_empty
    end

    it "handles network timeout gracefully" do
      stub_request(:get, /en\.wikipedia\.org/)
        .to_timeout

      articles = fetcher.fetch_articles
      expect(articles).to be_empty
    end

    it "handles malformed JSON response gracefully" do
      stub_request(:get, /en\.wikipedia\.org/)
        .to_return(
          status: 200,
          body: "not valid json"
        )

      articles = fetcher.fetch_articles
      expect(articles).to be_empty
    end
  end

  describe "subcategory recursion" do
    it "does not recurse into subcategories when max_depth is 0" do
      stub_category(
        /cmtitle=Category:Japanese%20cities/,
        [article_member("Tokyo"), subcategory_member("Category:Cities in Kanto")]
      )

      articles = fetcher.fetch_articles
      expect(articles).to eq ["Tokyo"]
      expect(WebMock).not_to have_requested(:get, /cmtitle=Category:Cities%20in%20Kanto/)
    end

    it "recurses into subcategories when max_depth > 0" do
      f = described_class.new("en", "Japanese cities", max_depth: 1)

      # Parent category
      stub_category(
        /cmtitle=Category:Japanese%20cities/,
        [article_member("Tokyo"), subcategory_member("Category:Cities in Kanto")]
      )

      # Subcategory
      stub_category(
        /cmtitle=Category:Cities%20in%20Kanto/,
        [article_member("Yokohama"), article_member("Chiba")]
      )

      articles = f.fetch_articles
      expect(articles).to contain_exactly("Tokyo", "Yokohama", "Chiba")
    end

    it "prevents infinite loops with circular category references" do
      f = described_class.new("en", "Category A", max_depth: 5)

      # Category A lists Category B as a subcategory...
      stub_category(
        /cmtitle=Category:Category%20A/,
        [article_member("Article1"), subcategory_member("Category:Category B")]
      )

      # ...and Category B points straight back at Category A.
      stub_category(
        /cmtitle=Category:Category%20B/,
        [article_member("Article2"), subcategory_member("Category:Category A")]
      )

      # Fetch once and check that same result: the call must finish without
      # raising and must have collected the articles from both categories.
      articles = nil
      expect { articles = f.fetch_articles }.not_to raise_error
      expect(articles).to include("Article1", "Article2")
    end

    it "respects max_depth limit" do
      f = described_class.new("en", "Root", max_depth: 1)

      # Root
      stub_category(/cmtitle=Category:Root/, [subcategory_member("Category:Level1")])

      # Level 1
      stub_category(
        /cmtitle=Category:Level1/,
        [article_member("Article1"), subcategory_member("Category:Level2")]
      )

      # Level 2 is stubbed only so that an unexpected request would not
      # error out; the assertion below verifies it is never requested.
      stub_category(/cmtitle=Category:Level2/, [article_member("Article2")])

      articles = f.fetch_articles
      expect(articles).to eq ["Article1"]
      expect(WebMock).not_to have_requested(:get, /cmtitle=Category:Level2/)
    end
  end

  describe "caching" do
    # A fresh temporary directory per example, removed again afterwards.
    let(:cache_dir) { Dir.mktmpdir("wp2txt_test_") }

    after do
      FileUtils.rm_rf(cache_dir)
    end

    it "caches category members to SQLite database" do
      f = described_class.new("en", "Test Category")
      f.enable_cache(cache_dir)

      stub_category(/en\.wikipedia\.org/, [article_member("Cached Article")])

      f.fetch_articles

      # SQLite cache file should exist
      cache_files = Dir.glob(File.join(cache_dir, "categories_*.sqlite3"))
      expect(cache_files.size).to eq 1
    end

    it "uses cached data on subsequent calls" do
      f = described_class.new("en", "Test Category")
      f.enable_cache(cache_dir)

      # Pre-populate cache using CategoryCache
      cache = Wp2txt::CategoryCache.new("en", cache_dir: cache_dir)
      cache.save("Test Category", ["Cached Article"], [])
      cache.close

      # Should not make API request
      articles = f.fetch_articles
      expect(articles).to eq ["Cached Article"]
      expect(WebMock).not_to have_requested(:get, /wikipedia\.org/)
    end

    it "ignores stale cache" do
      f = described_class.new("en", "Test Category")
      f.enable_cache(cache_dir)

      # Pre-populate cache using CategoryCache
      cache = Wp2txt::CategoryCache.new("en", cache_dir: cache_dir, expiry_days: 7)
      cache.save("Test Category", ["Old Article"], [])

      # Backdate cached_at to 8 days ago, one day past the 7-day expiry
      # configured above. NOTE(review): this reaches into the cache's @db
      # instance variable because no public API for it is visible here.
      cache.instance_variable_get(:@db).execute(
        "UPDATE categories SET cached_at = ? WHERE name = ?",
        [Time.now.to_i - (8 * 24 * 3600), "Test Category"]
      )
      cache.close

      stub_category(/en\.wikipedia\.org/, [article_member("Fresh Article")])

      articles = f.fetch_articles
      expect(articles).to eq ["Fresh Article"]
    end
  end

  describe "special characters in category names" do
    it "handles spaces in category names" do
      f = described_class.new("en", "Japanese cities")

      stub_category(/cmtitle=Category:Japanese%20cities/, [article_member("Tokyo")])

      articles = f.fetch_articles
      expect(articles).to eq ["Tokyo"]
    end

    it "handles Unicode category names" do
      f = described_class.new("ja", "日本の都市")

      stub_category(/ja\.wikipedia\.org/, [article_member("東京")])

      articles = f.fetch_articles
      expect(articles).to eq ["東京"]
    end

    it "handles special characters in category names" do
      f = described_class.new("en", "Rock & Roll")

      # The ampersand must arrive percent-encoded (%26) in the query string.
      stub_category(/cmtitle=Category:Rock%20%26%20Roll/, [article_member("Elvis")])

      articles = f.fetch_articles
      expect(articles).to eq ["Elvis"]
    end
  end

  describe "multilingual support" do
    it "works with Japanese Wikipedia" do
      f = described_class.new("ja", "日本の都市")

      stub_category(/ja\.wikipedia\.org/, [article_member("東京")])

      expect(f.fetch_articles).to eq ["東京"]
    end

    it "works with German Wikipedia" do
      f = described_class.new("de", "Stadt in Deutschland")

      stub_category(/de\.wikipedia\.org/, [article_member("Berlin")])

      expect(f.fetch_articles).to eq ["Berlin"]
    end
  end
end
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require_relative "../lib/wp2txt/utils"
|
|
5
|
+
|
|
6
|
+
# Specs for the `cleanup` text post-processing step, called here through the
# Wp2txt module mixed into the example group. Each example feeds `cleanup` a
# small text fragment and checks which lines survive.
RSpec.describe "Wp2txt Cleanup" do
  include Wp2txt

  describe "MediaWiki magic words" do
    it "removes DEFAULTSORT lines" do
      input = "Some text\nDEFAULTSORT:にんちけんこかく\nMore text"
      result = cleanup(input)
      expect(result).not_to include("DEFAULTSORT")
      expect(result).to include("Some text")
      expect(result).to include("More text")
    end

    it "removes DISPLAYTITLE lines" do
      input = "Some text\nDISPLAYTITLE:Custom Title\nMore text"
      result = cleanup(input)
      expect(result).not_to include("DISPLAYTITLE")
    end

    it "removes __NOTOC__ and similar" do
      input = "Some text\n__NOTOC__\n__TOC__\n__FORCETOC__\nMore text"
      result = cleanup(input)
      expect(result).not_to include("__NOTOC__")
      expect(result).not_to include("__TOC__")
      expect(result).not_to include("__FORCETOC__")
    end

    it "removes __NOEDITSECTION__" do
      input = "Some text\n__NOEDITSECTION__\nMore text"
      result = cleanup(input)
      expect(result).not_to include("__NOEDITSECTION__")
    end
  end

  describe "Interwiki links" do
    it "removes :en: prefixed links" do
      input = "See :en:Force dynamics for more"
      result = cleanup(input)
      expect(result).to include("Force dynamics")
      expect(result).not_to include(":en:")
    end

    it "removes :fr: prefixed links" do
      input = "See :fr:Société de Linguistique de Paris"
      result = cleanup(input)
      expect(result).to include("Société de Linguistique de Paris")
      expect(result).not_to include(":fr:")
    end

    it "removes :de: prefixed links" do
      input = "Related: :de:Sprachwissenschaft"
      result = cleanup(input)
      expect(result).to include("Sprachwissenschaft")
      expect(result).not_to include(":de:")
    end

    it "handles multiple interwiki links" do
      input = "See :en:Article1 and :fr:Article2 for details"
      result = cleanup(input)
      expect(result).to include("Article1")
      expect(result).to include("Article2")
      expect(result).not_to match(/:[a-z]{2}:/)
    end
  end

  describe "Authority control templates" do
    it "removes Normdaten line" do
      input = "Some text\nNormdaten\nMore text"
      result = cleanup(input)
      expect(result).not_to include("Normdaten")
    end

    it "removes Authority control line" do
      input = "Some text\nAuthority control\nMore text"
      result = cleanup(input)
      expect(result).not_to include("Authority control")
    end

    it "removes Persondata line" do
      input = "Some text\nPersondata\nMore text"
      result = cleanup(input)
      expect(result).not_to include("Persondata")
    end
  end

  describe "Category line cleanup" do
    # The ^ anchors below are deliberate: in Ruby they match at the start of
    # any line, which is exactly what "no standalone category line" means.
    it "removes standalone Category: lines (English)" do
      input = "Text\nCategory:Linguistics\nCategory:Science\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Category:/)
    end

    it "removes standalone カテゴリ lines (Japanese)" do
      input = "Text\nカテゴリ:言語学\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^カテゴリ:/)
    end

    it "removes standalone Kategorie lines (German)" do
      input = "Text\nKategorie:Sprachwissenschaft\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Kategorie:/)
    end

    it "removes standalone Catégorie lines (French)" do
      input = "Text\nCatégorie:Linguistique\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Catégorie:/)
    end

    it "removes Category lines with asterisk prefix" do
      input = "Text\n*\nCategory:Main\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Category:/)
    end

    it "preserves CATEGORIES summary line" do
      input = "Text\nCATEGORIES: Foo, Bar, Baz\nMore"
      result = cleanup(input)
      expect(result).to include("CATEGORIES: Foo, Bar, Baz")
    end
  end

  describe "Template artifact cleanup" do
    # The first three examples below have no agreed expectation yet. They
    # still run `cleanup` as a smoke check, then skip explicitly so they are
    # reported as pending rather than passing without asserting anything.

    it "removes stub template markers" do
      # Common stub patterns across languages
      input = "Text\n節スタブ\nMore"
      result = cleanup(input)
      expect(result).to be_a(String)
      # This might be Japanese-specific, but the pattern should be general
      skip "removal of stub markers is not asserted yet: no language-independent pattern is defined"
    end

    it "removes reference help markers" do
      input = "Text\n脚注ヘルプ\nMore"
      result = cleanup(input)
      expect(result).to be_a(String)
      # Japanese-specific, need general approach
      skip "removal of reference help markers is not asserted yet: no language-independent pattern is defined"
    end

    it "removes lines that are just asterisk + single word" do
      input = "Text\n*和書\n*洋書\nMore"
      result = cleanup(input)
      expect(result).to be_a(String)
      # Pattern: ^\*[^\s\*]+$ (single word after asterisk)
      skip "removal of asterisk + single-word lines is not asserted yet: expected output is undecided"
    end

    it "removes Wikibooks/Wikiversity markers" do
      input = "Text\nWikibooks\nSchool:言語学\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Wikibooks$/)
      expect(result).not_to match(/^School:/)
    end

    it "removes commons/wikimedia markers" do
      input = "Text\nCommons\nWikimedia Commons\nMore"
      result = cleanup(input)
      expect(result).not_to match(/^Commons$/)
    end
  end

  describe "Combined cleanup" do
    it "cleans up a realistic Wikipedia article footer" do
      input = <<~TEXT
        This is the main content.

        == References ==
        脚注ヘルプ

        == External links ==
        Wikibooks
        School:言語学

        Normdaten
        DEFAULTSORT:けんこかく
        Category:言語学
        Category:人文科学
        *

        CATEGORIES: 言語学, 人文科学
      TEXT

      result = cleanup(input)

      # Real content, headings and the summary line must survive...
      expect(result).to include("This is the main content")
      expect(result).to include("== References ==")
      expect(result).to include("== External links ==")
      expect(result).to include("CATEGORIES: 言語学, 人文科学")

      # ...while the template and magic-word residue must be gone.
      expect(result).not_to include("Normdaten")
      expect(result).not_to include("DEFAULTSORT")
      expect(result).not_to match(/^Category:/)
      expect(result).not_to match(/^Wikibooks$/)
      expect(result).not_to match(/^School:/)
    end
  end
end
|