wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "spec_helper"
|
|
4
|
+
require "tmpdir"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require_relative "../lib/wp2txt/article"
|
|
7
|
+
require_relative "../lib/wp2txt/formatter"
|
|
8
|
+
|
|
9
|
+
RSpec.describe "Formatter section extraction" do
|
|
10
|
+
include Wp2txt::Formatter
|
|
11
|
+
include Wp2txt
|
|
12
|
+
|
|
13
|
+
let(:sample_wiki_text) do
|
|
14
|
+
<<~WIKI
|
|
15
|
+
'''The Godfather''' is a 1972 American crime film.
|
|
16
|
+
|
|
17
|
+
== Plot ==
|
|
18
|
+
The story of the Corleone crime family.
|
|
19
|
+
|
|
20
|
+
== Cast ==
|
|
21
|
+
* Marlon Brando as Vito Corleone
|
|
22
|
+
* Al Pacino as Michael Corleone
|
|
23
|
+
|
|
24
|
+
== Reception ==
|
|
25
|
+
The film received critical acclaim.
|
|
26
|
+
|
|
27
|
+
=== Awards ===
|
|
28
|
+
Won three Academy Awards.
|
|
29
|
+
|
|
30
|
+
[[Category:1972 films]]
|
|
31
|
+
[[Category:Crime films]]
|
|
32
|
+
WIKI
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
let(:article) { Wp2txt::Article.new(sample_wiki_text, "The Godfather") }
|
|
36
|
+
|
|
37
|
+
describe "format_article with sections" do
|
|
38
|
+
context "structured JSON output" do
|
|
39
|
+
let(:config) do
|
|
40
|
+
{
|
|
41
|
+
format: :json,
|
|
42
|
+
sections: ["summary", "Plot", "Reception"],
|
|
43
|
+
category: true
|
|
44
|
+
}
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "returns sections object with each section" do
|
|
48
|
+
result = format_article(article, config)
|
|
49
|
+
expect(result).to be_a(Hash)
|
|
50
|
+
expect(result["sections"]).to be_a(Hash)
|
|
51
|
+
expect(result["sections"].keys).to include("summary", "Plot", "Reception")
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it "includes summary text" do
|
|
55
|
+
result = format_article(article, config)
|
|
56
|
+
expect(result["sections"]["summary"]).to include("1972 American crime film")
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it "includes section content" do
|
|
60
|
+
result = format_article(article, config)
|
|
61
|
+
expect(result["sections"]["Plot"]).to include("Corleone crime family")
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it "includes subsections in parent section" do
|
|
65
|
+
result = format_article(article, config)
|
|
66
|
+
expect(result["sections"]["Reception"]).to include("Academy Awards")
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
it "includes categories" do
|
|
70
|
+
result = format_article(article, config)
|
|
71
|
+
expect(result["categories"]).to include("1972 films", "Crime films")
|
|
72
|
+
end
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
context "combined JSON output" do
|
|
76
|
+
let(:config) do
|
|
77
|
+
{
|
|
78
|
+
format: :json,
|
|
79
|
+
sections: ["summary", "Plot"],
|
|
80
|
+
section_output: "combined",
|
|
81
|
+
category: true
|
|
82
|
+
}
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it "returns concatenated text" do
|
|
86
|
+
result = format_article(article, config)
|
|
87
|
+
expect(result["text"]).to include("crime film")
|
|
88
|
+
expect(result["text"]).to include("Corleone")
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
it "includes sections_included array" do
|
|
92
|
+
result = format_article(article, config)
|
|
93
|
+
expect(result["sections_included"]).to eq(["summary", "Plot"])
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
context "structured text output" do
|
|
98
|
+
let(:config) do
|
|
99
|
+
{
|
|
100
|
+
format: :text,
|
|
101
|
+
sections: ["summary", "Plot", "Cast"],
|
|
102
|
+
category: true
|
|
103
|
+
}
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
it "includes TITLE header" do
|
|
107
|
+
result = format_article(article, config)
|
|
108
|
+
expect(result).to include("TITLE: The Godfather")
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
it "includes SECTION labels" do
|
|
112
|
+
result = format_article(article, config)
|
|
113
|
+
expect(result).to include("SECTION [summary]:")
|
|
114
|
+
expect(result).to include("SECTION [Plot]:")
|
|
115
|
+
expect(result).to include("SECTION [Cast]:")
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
it "includes CATEGORIES footer" do
|
|
119
|
+
result = format_article(article, config)
|
|
120
|
+
expect(result).to include("CATEGORIES: 1972 films, Crime films")
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
context "combined text output" do
|
|
125
|
+
let(:config) do
|
|
126
|
+
{
|
|
127
|
+
format: :text,
|
|
128
|
+
sections: ["summary", "Plot"],
|
|
129
|
+
section_output: "combined",
|
|
130
|
+
category: true
|
|
131
|
+
}
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
it "includes SECTIONS header listing included sections" do
|
|
135
|
+
result = format_article(article, config)
|
|
136
|
+
expect(result).to include("SECTIONS: summary, Plot")
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it "includes concatenated content" do
|
|
140
|
+
result = format_article(article, config)
|
|
141
|
+
expect(result).to include("crime film")
|
|
142
|
+
expect(result).to include("Corleone")
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
context "with non-existent sections" do
|
|
147
|
+
let(:config) do
|
|
148
|
+
{
|
|
149
|
+
format: :json,
|
|
150
|
+
sections: ["summary", "Gameplay", "Plot"],
|
|
151
|
+
category: true
|
|
152
|
+
}
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
it "returns nil for non-existent sections" do
|
|
156
|
+
result = format_article(article, config)
|
|
157
|
+
expect(result["sections"]["Gameplay"]).to be_nil
|
|
158
|
+
expect(result["sections"]["Plot"]).not_to be_nil
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
context "with min_section_length filter" do
|
|
163
|
+
let(:config) do
|
|
164
|
+
{
|
|
165
|
+
format: :json,
|
|
166
|
+
sections: ["summary", "Plot"],
|
|
167
|
+
min_section_length: 100,
|
|
168
|
+
category: true
|
|
169
|
+
}
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it "filters out short sections" do
|
|
173
|
+
result = format_article(article, config)
|
|
174
|
+
# Summary is short in this test
|
|
175
|
+
expect(result["sections"]["summary"]).to be_nil
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
context "with skip_empty option" do
|
|
180
|
+
let(:no_match_config) do
|
|
181
|
+
{
|
|
182
|
+
format: :json,
|
|
183
|
+
sections: ["Gameplay", "Soundtrack"],
|
|
184
|
+
skip_empty: true,
|
|
185
|
+
category: true
|
|
186
|
+
}
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
it "returns nil for articles with no matching sections" do
|
|
190
|
+
result = format_article(article, no_match_config)
|
|
191
|
+
expect(result).to be_nil
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
describe "summary_only refactoring" do
|
|
197
|
+
let(:config) do
|
|
198
|
+
{
|
|
199
|
+
format: :json,
|
|
200
|
+
summary_only: true,
|
|
201
|
+
category: true
|
|
202
|
+
}
|
|
203
|
+
end
|
|
204
|
+
|
|
205
|
+
it "extracts only summary" do
|
|
206
|
+
result = format_article(article, config)
|
|
207
|
+
expect(result["text"]).to include("crime film")
|
|
208
|
+
expect(result["text"]).not_to include("Corleone")
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
it "uses combined output mode" do
|
|
212
|
+
result = format_article(article, config)
|
|
213
|
+
expect(result["sections_included"]).to eq(["summary"])
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
describe "alias matching in extraction" do
|
|
218
|
+
let(:wiki_with_synopsis) do
|
|
219
|
+
<<~WIKI
|
|
220
|
+
A movie summary.
|
|
221
|
+
|
|
222
|
+
== Synopsis ==
|
|
223
|
+
The story follows the main character.
|
|
224
|
+
|
|
225
|
+
[[Category:Films]]
|
|
226
|
+
WIKI
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
|
|
230
|
+
|
|
231
|
+
let(:config) do
|
|
232
|
+
{
|
|
233
|
+
format: :json,
|
|
234
|
+
sections: ["summary", "Plot"],
|
|
235
|
+
category: true
|
|
236
|
+
}
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
it "matches Synopsis as alias for Plot" do
|
|
240
|
+
result = format_article(synopsis_article, config)
|
|
241
|
+
expect(result["sections"]["Plot"]).to include("main character")
|
|
242
|
+
end
|
|
243
|
+
end
|
|
244
|
+
|
|
245
|
+
describe "show_matched_sections option" do
|
|
246
|
+
let(:wiki_with_synopsis) do
|
|
247
|
+
<<~WIKI
|
|
248
|
+
A movie summary.
|
|
249
|
+
|
|
250
|
+
== Synopsis ==
|
|
251
|
+
The story follows the main character.
|
|
252
|
+
|
|
253
|
+
[[Category:Films]]
|
|
254
|
+
WIKI
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
|
|
258
|
+
|
|
259
|
+
context "when enabled" do
|
|
260
|
+
let(:config) do
|
|
261
|
+
{
|
|
262
|
+
format: :json,
|
|
263
|
+
sections: ["summary", "Plot"],
|
|
264
|
+
show_matched_sections: true,
|
|
265
|
+
category: true
|
|
266
|
+
}
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
it "includes matched_sections field" do
|
|
270
|
+
result = format_article(synopsis_article, config)
|
|
271
|
+
expect(result["matched_sections"]).to be_a(Hash)
|
|
272
|
+
expect(result["matched_sections"]["Plot"]).to eq("Synopsis")
|
|
273
|
+
end
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
context "when disabled (default)" do
|
|
277
|
+
let(:config) do
|
|
278
|
+
{
|
|
279
|
+
format: :json,
|
|
280
|
+
sections: ["summary", "Plot"],
|
|
281
|
+
show_matched_sections: false,
|
|
282
|
+
category: true
|
|
283
|
+
}
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
it "does not include matched_sections field" do
|
|
287
|
+
result = format_article(synopsis_article, config)
|
|
288
|
+
expect(result).not_to have_key("matched_sections")
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
context "with combined output mode" do
|
|
293
|
+
let(:config) do
|
|
294
|
+
{
|
|
295
|
+
format: :json,
|
|
296
|
+
sections: ["summary", "Plot"],
|
|
297
|
+
section_output: "combined",
|
|
298
|
+
show_matched_sections: true,
|
|
299
|
+
category: true
|
|
300
|
+
}
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it "includes matched_sections in combined output" do
|
|
304
|
+
result = format_article(synopsis_article, config)
|
|
305
|
+
expect(result["matched_sections"]["Plot"]).to eq("Synopsis")
|
|
306
|
+
end
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
describe "no_section_aliases option" do
|
|
311
|
+
let(:wiki_with_synopsis) do
|
|
312
|
+
<<~WIKI
|
|
313
|
+
A movie summary.
|
|
314
|
+
|
|
315
|
+
== Synopsis ==
|
|
316
|
+
The story follows the main character.
|
|
317
|
+
|
|
318
|
+
[[Category:Films]]
|
|
319
|
+
WIKI
|
|
320
|
+
end
|
|
321
|
+
|
|
322
|
+
let(:synopsis_article) { Wp2txt::Article.new(wiki_with_synopsis, "Test Movie") }
|
|
323
|
+
|
|
324
|
+
context "when aliases are disabled" do
|
|
325
|
+
let(:config) do
|
|
326
|
+
{
|
|
327
|
+
format: :json,
|
|
328
|
+
sections: ["summary", "Plot"],
|
|
329
|
+
no_section_aliases: true,
|
|
330
|
+
category: true
|
|
331
|
+
}
|
|
332
|
+
end
|
|
333
|
+
|
|
334
|
+
it "does not match Synopsis as Plot" do
|
|
335
|
+
result = format_article(synopsis_article, config)
|
|
336
|
+
expect(result["sections"]["Plot"]).to be_nil
|
|
337
|
+
end
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
|
|
341
|
+
describe "alias_file option" do
|
|
342
|
+
let(:temp_dir) { Dir.mktmpdir }
|
|
343
|
+
let(:alias_file) { File.join(temp_dir, "custom_aliases.yml") }
|
|
344
|
+
|
|
345
|
+
after { FileUtils.remove_entry(temp_dir) }
|
|
346
|
+
|
|
347
|
+
let(:wiki_with_story) do
|
|
348
|
+
<<~WIKI
|
|
349
|
+
A summary.
|
|
350
|
+
|
|
351
|
+
== Storyline ==
|
|
352
|
+
The narrative unfolds.
|
|
353
|
+
|
|
354
|
+
[[Category:Films]]
|
|
355
|
+
WIKI
|
|
356
|
+
end
|
|
357
|
+
|
|
358
|
+
let(:story_article) { Wp2txt::Article.new(wiki_with_story, "Story Film") }
|
|
359
|
+
|
|
360
|
+
before do
|
|
361
|
+
File.write(alias_file, <<~YAML)
|
|
362
|
+
Plot:
|
|
363
|
+
- Storyline
|
|
364
|
+
- Narrative
|
|
365
|
+
YAML
|
|
366
|
+
end
|
|
367
|
+
|
|
368
|
+
let(:config) do
|
|
369
|
+
{
|
|
370
|
+
format: :json,
|
|
371
|
+
sections: ["Plot"],
|
|
372
|
+
alias_file: alias_file,
|
|
373
|
+
category: true
|
|
374
|
+
}
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
it "uses custom aliases from file" do
|
|
378
|
+
result = format_article(story_article, config)
|
|
379
|
+
expect(result["sections"]["Plot"]).to include("narrative unfolds")
|
|
380
|
+
end
|
|
381
|
+
end
|
|
382
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "spec_helper"
|
|
4
|
+
require "tmpdir"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
|
|
7
|
+
RSpec.describe Wp2txt::GlobalDataCache do
|
|
8
|
+
let(:cache_dir) { Dir.mktmpdir("wp2txt_global_cache_test_") }
|
|
9
|
+
|
|
10
|
+
before do
|
|
11
|
+
described_class.configure(cache_dir: cache_dir, enabled: true)
|
|
12
|
+
described_class.clear!
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
after do
|
|
16
|
+
described_class.clear!
|
|
17
|
+
FileUtils.rm_rf(cache_dir)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
describe ".configure" do
|
|
21
|
+
it "sets cache directory" do
|
|
22
|
+
described_class.configure(cache_dir: "/tmp/custom")
|
|
23
|
+
expect(described_class.cache_path).to eq "/tmp/custom/global_data.sqlite3"
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it "can disable caching" do
|
|
27
|
+
described_class.configure(enabled: false)
|
|
28
|
+
expect(described_class.enabled).to be false
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
describe ".cache_path" do
|
|
33
|
+
it "returns path to SQLite database" do
|
|
34
|
+
expect(described_class.cache_path).to end_with("global_data.sqlite3")
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
describe ".save and .load" do
|
|
39
|
+
it "saves and loads data" do
|
|
40
|
+
test_data = { "key1" => "value1", "nested" => { "a" => 1 } }
|
|
41
|
+
described_class.save(:test_category, test_data)
|
|
42
|
+
|
|
43
|
+
loaded = described_class.load(:test_category)
|
|
44
|
+
expect(loaded).to eq test_data
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
it "returns nil for non-existent category" do
|
|
48
|
+
expect(described_class.load(:nonexistent)).to be_nil
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "handles empty hash" do
|
|
52
|
+
described_class.save(:empty, {})
|
|
53
|
+
expect(described_class.load(:empty)).to eq({})
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
it "handles arrays in data" do
|
|
57
|
+
test_data = { "list" => [1, 2, 3], "strings" => %w[a b c] }
|
|
58
|
+
described_class.save(:with_arrays, test_data)
|
|
59
|
+
|
|
60
|
+
loaded = described_class.load(:with_arrays)
|
|
61
|
+
expect(loaded["list"]).to eq [1, 2, 3]
|
|
62
|
+
expect(loaded["strings"]).to eq %w[a b c]
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
describe ".load_all" do
|
|
67
|
+
it "loads all cached categories" do
|
|
68
|
+
described_class.save(:cat1, { "a" => 1 })
|
|
69
|
+
described_class.save(:cat2, { "b" => 2 })
|
|
70
|
+
|
|
71
|
+
all = described_class.load_all
|
|
72
|
+
expect(all[:cat1]).to eq({ "a" => 1 })
|
|
73
|
+
expect(all[:cat2]).to eq({ "b" => 2 })
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
it "returns empty hash when cache is empty" do
|
|
77
|
+
expect(described_class.load_all).to eq({})
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
describe ".save_all" do
|
|
82
|
+
it "saves multiple categories at once" do
|
|
83
|
+
data = {
|
|
84
|
+
cat1: { "x" => 1 },
|
|
85
|
+
cat2: { "y" => 2 }
|
|
86
|
+
}
|
|
87
|
+
described_class.save_all(data)
|
|
88
|
+
|
|
89
|
+
expect(described_class.load(:cat1)).to eq({ "x" => 1 })
|
|
90
|
+
expect(described_class.load(:cat2)).to eq({ "y" => 2 })
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
describe ".clear!" do
|
|
95
|
+
it "removes the cache file" do
|
|
96
|
+
described_class.save(:test, { "data" => true })
|
|
97
|
+
expect(File.exist?(described_class.cache_path)).to be true
|
|
98
|
+
|
|
99
|
+
described_class.clear!
|
|
100
|
+
expect(File.exist?(described_class.cache_path)).to be false
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
describe ".stats" do
|
|
105
|
+
it "returns cache statistics" do
|
|
106
|
+
described_class.save(:test, { "data" => "value" })
|
|
107
|
+
|
|
108
|
+
stats = described_class.stats
|
|
109
|
+
expect(stats[:cache_path]).to eq described_class.cache_path
|
|
110
|
+
expect(stats[:cache_size]).to be > 0
|
|
111
|
+
expect(stats[:categories]).to be_an(Array)
|
|
112
|
+
expect(stats[:categories].first[:category]).to eq "test"
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
it "returns nil when cache doesn't exist" do
|
|
116
|
+
described_class.clear!
|
|
117
|
+
expect(described_class.stats).to be_nil
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
describe "caching disabled" do
|
|
122
|
+
before do
|
|
123
|
+
described_class.configure(cache_dir: cache_dir, enabled: false)
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
it "does not save data when disabled" do
|
|
127
|
+
described_class.save(:test, { "data" => true })
|
|
128
|
+
expect(File.exist?(described_class.cache_path)).to be false
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
it "returns nil when loading with cache disabled" do
|
|
132
|
+
# Enable temporarily to save
|
|
133
|
+
described_class.configure(cache_dir: cache_dir, enabled: true)
|
|
134
|
+
described_class.save(:test, { "data" => true })
|
|
135
|
+
|
|
136
|
+
# Disable and try to load
|
|
137
|
+
described_class.configure(cache_dir: cache_dir, enabled: false)
|
|
138
|
+
expect(described_class.load(:test)).to be_nil
|
|
139
|
+
end
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
describe "integration with real data files" do
|
|
143
|
+
before do
|
|
144
|
+
described_class.configure(cache_dir: cache_dir, enabled: true)
|
|
145
|
+
described_class.clear!
|
|
146
|
+
# Clear cached instance variables
|
|
147
|
+
Wp2txt.instance_variable_set(:@mediawiki_data, nil)
|
|
148
|
+
Wp2txt.instance_variable_set(:@template_data, nil)
|
|
149
|
+
Wp2txt.instance_variable_set(:@html_entities, nil)
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it "caches mediawiki data" do
|
|
153
|
+
# First load - from JSON
|
|
154
|
+
data1 = Wp2txt.load_mediawiki_data
|
|
155
|
+
expect(data1).to be_a(Hash)
|
|
156
|
+
expect(data1).to have_key("magic_words")
|
|
157
|
+
|
|
158
|
+
# Clear instance variable to force reload
|
|
159
|
+
Wp2txt.instance_variable_set(:@mediawiki_data, nil)
|
|
160
|
+
|
|
161
|
+
# Second load - from cache
|
|
162
|
+
data2 = Wp2txt.load_mediawiki_data
|
|
163
|
+
expect(data2).to eq data1
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
it "caches template data" do
|
|
167
|
+
data1 = Wp2txt.load_template_data
|
|
168
|
+
expect(data1).to be_a(Hash)
|
|
169
|
+
|
|
170
|
+
Wp2txt.instance_variable_set(:@template_data, nil)
|
|
171
|
+
|
|
172
|
+
data2 = Wp2txt.load_template_data
|
|
173
|
+
expect(data2).to eq data1
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it "caches html entities" do
|
|
177
|
+
data1 = Wp2txt.load_html_entities
|
|
178
|
+
expect(data1).to be_a(Hash)
|
|
179
|
+
|
|
180
|
+
Wp2txt.instance_variable_set(:@html_entities, nil)
|
|
181
|
+
|
|
182
|
+
data2 = Wp2txt.load_html_entities
|
|
183
|
+
expect(data2).to eq data1
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
end
|