wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "utils"
|
|
4
|
+
require_relative "regex"
|
|
5
|
+
require_relative "section_extractor"
|
|
6
|
+
|
|
7
|
+
module Wp2txt
|
|
8
|
+
# Article formatting utilities for WpApp
|
|
9
|
+
module Formatter
|
|
10
|
+
# Debug mode flag (inherited from including class if defined)
|
|
11
|
+
# Returns the value of DEBUG_MODE when a constant of that name can be
# resolved from this method, and false when no such constant is defined.
# NOTE(review): resolution follows Ruby's constant lookup starting at this
# module — confirm that is where the host application defines DEBUG_MODE.
#
# @return [Object, false] the DEBUG_MODE value, or false
def formatter_debug_mode
  return false unless defined?(DEBUG_MODE)

  DEBUG_MODE
end
|
|
14
|
+
|
|
15
|
+
# Format article based on configuration and output format
|
|
16
|
+
# Entry point: rewrites article.title through format_wiki, then dispatches to
# the formatter selected by the config flags, checked in this order:
# :metadata_only, :summary_only, a non-empty :sections list, and finally
# :format (:json or anything else for text).
#
# Every downstream formatter receives a copy of config that also carries
# :title => the title as it was before format_wiki touched it.
#
# @param article [#title, #title=] article to format (its title is mutated)
# @param config [Hash] formatting options
# @return [Object] whatever the selected formatter returns
def format_article(article, config)
  raw_title = article.title.dup
  article.title = format_wiki(article.title, config)
  titled_config = config.merge(title: raw_title)

  return format_metadata_only(article, titled_config) if config[:metadata_only]

  if config[:summary_only]
    # Summary extraction is expressed as a section extraction of the summary key.
    summary_request = { sections: [SectionExtractor::SUMMARY_KEY], section_output: "combined" }
    return format_with_sections(article, titled_config.merge(summary_request))
  end

  requested = config[:sections]
  return format_with_sections(article, titled_config) if requested && !requested.empty?

  return format_article_json(article, titled_config) if config[:format] == :json

  format_article_text(article, titled_config)
end
|
|
49
|
+
|
|
50
|
+
# Format article with specific section extraction
|
|
51
|
+
# Extracts the sections named in config[:sections] and renders them.
#
# A SectionExtractor is built from the config; when its should_skip? accepts
# the article, nil is returned. Otherwise every non-nil section body is passed
# through format_wiki and then cleanup, and the result is handed to one of the
# four section formatters chosen by config[:format] (:json or text) and
# config[:section_output] ("combined", or "structured" when unset).
#
# @param article [Object] article handed to the extractor and formatters
# @param config [Hash] formatting options
# @return [Hash, String, nil] formatter output, or nil when the article is skipped
def format_with_sections(article, config)
  extractor_options = {
    min_length: config[:min_section_length] || 0,
    skip_empty: config[:skip_empty] || false,
    use_aliases: !config[:no_section_aliases],
    alias_file: config[:alias_file],
    track_matches: config[:show_matched_sections] || false
  }
  extractor = SectionExtractor.new(config[:sections], **extractor_options)

  return nil if extractor.should_skip?(article)

  sections = extractor.extract_sections(article, config)
  matched = extractor.matched_sections

  # nil marks a requested section that was not found; keep it as nil.
  sections.transform_values! do |body|
    body.nil? ? nil : cleanup(format_wiki(body, config))
  end

  combined = (config[:section_output] || "structured") == "combined"

  if config[:format] == :json
    return format_sections_combined_json(article, sections, config, matched) if combined

    format_sections_structured_json(article, sections, config, matched)
  elsif combined
    format_sections_combined_text(article, sections, config)
  else
    format_sections_structured_text(article, sections, config)
  end
end
|
|
89
|
+
|
|
90
|
+
# Format sections as structured JSON (each section as separate field)
|
|
91
|
+
# Builds the structured JSON hash: the sections hash is embedded as-is under
# "sections", so each section stays a separate field.
#
# "categories" is added only when config[:category] is set, and
# "matched_sections" only when config[:show_matched_sections] is set and the
# given matched_sections is non-empty.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => content (nil allowed)
# @param config [Hash]
# @param matched_sections [Hash, nil]
# @return [Hash]
def format_sections_structured_json(article, sections, config, matched_sections = {})
  payload = { "title" => article.title, "sections" => sections }
  payload["categories"] = article.categories.flatten if config[:category]

  report_matches = config[:show_matched_sections] && matched_sections && !matched_sections.empty?
  payload["matched_sections"] = matched_sections if report_matches

  payload
end
|
|
103
|
+
|
|
104
|
+
# Format sections as combined JSON (all sections concatenated)
|
|
105
|
+
# Builds the combined JSON hash: all sections that have non-empty content are
# joined with a blank line into "text", and their names are listed under
# "sections_included".
#
# "categories" is added only when config[:category] is set, and
# "matched_sections" only when config[:show_matched_sections] is set and the
# given matched_sections is non-empty.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => content (nil allowed)
# @param config [Hash]
# @param matched_sections [Hash, nil]
# @return [Hash]
def format_sections_combined_json(article, sections, config, matched_sections = {})
  # Drop sections that were not found (nil) or came out empty.
  present = sections.reject { |_name, body| !body || body.empty? }

  payload = {
    "title" => article.title,
    "text" => present.values.join("\n\n"),
    "sections_included" => present.keys
  }
  payload["categories"] = article.categories.flatten if config[:category]

  report_matches = config[:show_matched_sections] && matched_sections && !matched_sections.empty?
  payload["matched_sections"] = matched_sections if report_matches

  payload
end
|
|
121
|
+
|
|
122
|
+
# Format sections as structured text
|
|
123
|
+
# Renders sections as plain text: a "TITLE:" header, then one
# "SECTION [name]:" block per entry (a nil body is reported as "(not found)"),
# then an optional "CATEGORIES:" line when config[:category] is set and the
# article has categories, and a final newline.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => content (nil allowed)
# @param config [Hash]
# @return [String]
def format_sections_structured_text(article, sections, config)
  parts = ["TITLE: #{article.title}\n\n"]

  sections.each do |name, body|
    parts << (body.nil? ? "SECTION [#{name}]: (not found)\n\n" : "SECTION [#{name}]:\n#{body}\n\n")
  end

  show_categories = config[:category] && !article.categories.empty?
  parts << "CATEGORIES: #{article.categories.flatten.join(', ')}\n" if show_categories

  parts << "\n"
  parts.join
end
|
|
141
|
+
|
|
142
|
+
# Format sections as combined text
|
|
143
|
+
# Renders sections as one plain-text body: a "TITLE:" line, a "SECTIONS:" line
# naming the sections that had non-empty content, those contents joined by
# blank lines, an optional "CATEGORIES:" line when config[:category] is set and
# the article has categories, and a final newline.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => content (nil allowed)
# @param config [Hash]
# @return [String]
def format_sections_combined_text(article, sections, config)
  # Drop sections that were not found (nil) or came out empty.
  present = sections.reject { |_name, body| !body || body.empty? }

  rendered = +"TITLE: #{article.title}\n"
  rendered << "SECTIONS: #{present.keys.join(', ')}\n\n"
  rendered << present.values.join("\n\n") << "\n\n"

  show_categories = config[:category] && !article.categories.empty?
  rendered << "CATEGORIES: #{article.categories.flatten.join(', ')}\n" if show_categories

  rendered << "\n"
end
|
|
159
|
+
|
|
160
|
+
# Format article with metadata only (title, section headings, categories)
|
|
161
|
+
# Used for analyzing section distribution across Wikipedia dumps
|
|
162
|
+
# Metadata-only output: asks a default SectionExtractor for the article's
# headings and renders title, headings and categories as a JSON hash when
# config[:format] is :json, or as a text line otherwise.
#
# @param article [Object]
# @param config [Hash] only :format is read here
# @return [Hash, String]
def format_metadata_only(article, config)
  headings = SectionExtractor.new.extract_headings(article)

  return format_metadata_only_json(article, headings) if config[:format] == :json

  format_metadata_only_text(article, headings)
end
|
|
172
|
+
|
|
173
|
+
# Format metadata as JSON
|
|
174
|
+
# Metadata as a hash with the string keys "title", "sections" and
# "categories" (the article's categories, flattened).
#
# @param article [#title, #categories]
# @param sections [Array] section headings, stored unchanged
# @return [Hash]
def format_metadata_only_json(article, sections)
  metadata = { "title" => article.title }
  metadata["sections"] = sections
  metadata["categories"] = article.categories.flatten
  metadata
end
|
|
181
|
+
|
|
182
|
+
# Format metadata as TSV text
|
|
183
|
+
# Format: Title<TAB>Section1|Section2|...<TAB>Category1,Category2,...
|
|
184
|
+
# Metadata as one tab-separated line terminated by a newline:
# title, headings joined with "|", flattened categories joined with ",".
#
# @param article [#title, #categories]
# @param sections [Array<String>] section headings
# @return [String]
def format_metadata_only_text(article, sections)
  format(
    "%s\t%s\t%s\n",
    article.title,
    sections.join("|"),
    article.categories.flatten.join(",")
  )
end
|
|
191
|
+
|
|
192
|
+
# Format article as JSON hash
|
|
193
|
+
# Full-article JSON hash with the keys "title", "categories", "text" and
# "redirect", always in that order.
#
# "categories" is the flattened category list, or nil unless config[:category]
# is set. "text" is the stripped result of build_text_content, or nil when
# config[:category_only] is set. "redirect" is whatever extract_redirect
# returns for the article.
#
# @param article [#title, #categories]
# @param config [Hash]
# @return [Hash]
def format_article_json(article, config)
  {
    "title" => article.title,
    "categories" => (article.categories.flatten if config[:category]),
    "text" => (build_text_content(article, config).strip unless config[:category_only]),
    "redirect" => extract_redirect(article)
  }
end
|
|
217
|
+
|
|
218
|
+
# Extract redirect target from article if it's a redirect
|
|
219
|
+
# Scans the article's elements for the first :mw_redirect element whose
# content matches REDIRECT_REGEX and returns that match's first capture group.
#
# @param article [#elements] elements are [type, content] pairs
# @return [String, nil] the captured target, or nil when nothing matched
def extract_redirect(article)
  article.elements.each do |kind, body|
    hit = kind == :mw_redirect && body.match(REDIRECT_REGEX)
    return hit[1] if hit
  end

  nil
end
|
|
228
|
+
|
|
229
|
+
# Format article as text string
|
|
230
|
+
# Text-format dispatcher. config[:category_only] wins and yields the
# category-only line; otherwise config[:category] together with a non-empty
# category list yields body text plus categories; everything else yields the
# plain full article.
#
# @param article [#categories]
# @param config [Hash]
# @return [String]
def format_article_text(article, config)
  return format_category_only(article) if config[:category_only]
  return format_with_categories(article, config) if config[:category] && !article.categories.empty?

  format_full_article(article, config)
end
|
|
239
|
+
|
|
240
|
+
# Build text content from article elements
|
|
241
|
+
# Runs every article element through process_element, concatenates the
# non-nil results in order, and returns that text after passing it to cleanup.
#
# @param article [#elements]
# @param config [Hash] forwarded to process_element
# @return [Object] the value returned by cleanup
def build_text_content(article, config)
  raw = article.elements.each_with_object(+"") do |element, buffer|
    rendered = process_element(element, config)
    buffer << rendered if rendered
  end

  cleanup(raw)
end
|
|
250
|
+
|
|
251
|
+
# Format article with only category information (text format)
|
|
252
|
+
# Category-only text line: the title, a tab, the categories joined with ", ",
# and a trailing newline.
#
# @param article [#title, #categories]
# @return [String]
def format_category_only(article)
  format("%s\t%s\n", article.title, article.categories.join(", "))
end
|
|
258
|
+
|
|
259
|
+
# Format article with categories (includes body text)
|
|
260
|
+
# Body text followed by a "CATEGORIES:" line. The category line is appended in
# place to the string returned by build_text_content. When config[:title] is
# set, a "[[title]]" header is put in front of the result.
#
# @param article [#title, #categories]
# @param config [Hash]
# @return [String]
def format_with_categories(article, config)
  body = build_text_content(article, config)
  body << "\nCATEGORIES: " << article.categories.join(", ") << "\n\n"

  return body unless config[:title]

  "\n[[#{article.title}]]\n\n" + body
end
|
|
271
|
+
|
|
272
|
+
# Format full article content
|
|
273
|
+
# Plain full-article text: the result of build_text_content, preceded by a
# "[[title]]" header when config[:title] is set.
#
# @param article [#title]
# @param config [Hash]
# @return [String]
def format_full_article(article, config)
  body = build_text_content(article, config)
  return body unless config[:title]

  "\n[[#{article.title}]]\n\n" + body
end
|
|
279
|
+
|
|
280
|
+
# Process individual element of the article
|
|
281
|
+
# Turns one [type, content] element into its output line, or nil when the
# element is suppressed.
#
# Per type:
# - headings need config[:heading] and are dropped under config[:summary_only];
#   headings, paragraphs and both link kinds go through format_wiki first
# - links whose formatted text is blank are dropped
# - tables, pre blocks, lists, multi-line/isolated templates and redirects are
#   emitted only when config[:table], [:pre], [:list], [:multiline] or
#   [:redirect] respectively is set
# - quotes are always emitted; isolated tags never are
# - any other type is emitted only in debug mode
#
# In debug mode (formatter_debug_mode) a "+TAG+" marker naming the element
# kind is appended to the content. Every emitted line ends with "\n", except
# redirects which end with "\n\n".
#
# @param element [Array] a [type, content] pair
# @param config [Hash]
# @return [String, nil]
def process_element(element, config)
  type, content = element
  debug_mode = formatter_debug_mode

  # Appends the debug marker (debug mode only) and then the line terminator.
  finish = lambda do |text, marker, terminator = "\n"|
    text += marker if debug_mode
    text + terminator
  end

  case type
  when :mw_heading
    return nil if config[:summary_only] || !config[:heading]

    finish.call(format_wiki(content, config), "+HEADING+")
  when :mw_paragraph
    finish.call(format_wiki(content, config), "+PARAGRAPH+")
  when :mw_table, :mw_htable
    finish.call(content, "+TABLE+") if config[:table]
  when :mw_pre
    finish.call(content, "+PRE+") if config[:pre]
  when :mw_quote
    finish.call(content, "+QUOTE+")
  when :mw_unordered, :mw_ordered, :mw_definition
    finish.call(content, "+LIST+") if config[:list]
  when :mw_ml_template
    finish.call(content, "+MLTEMPLATE+") if config[:multiline]
  when :mw_link, :mw_ml_link
    text = format_wiki(content, config)
    marker = type == :mw_link ? "+LINK+" : "+MLLINK+"
    finish.call(text, marker) unless text.strip.empty?
  when :mw_redirect
    finish.call(content, "+REDIRECT+", "\n\n") if config[:redirect]
  when :mw_isolated_template
    finish.call(content, "+ISOLATED_TEMPLATE+") if config[:multiline]
  when :mw_isolated_tag
    nil
  else
    finish.call(content, "+OTHER+") if debug_mode
  end
end
|
|
351
|
+
end
|
|
352
|
+
end
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sqlite3"
|
|
4
|
+
require "json"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require "digest"
|
|
7
|
+
|
|
8
|
+
module Wp2txt
|
|
9
|
+
# SQLite-based cache for global data files (templates, mediawiki aliases, entities)
|
|
10
|
+
# Dramatically speeds up startup by avoiding JSON parsing overhead
|
|
11
|
+
class GlobalDataCache
|
|
12
|
+
# Version stamped into each category's metadata on save; a cached category
# whose stored cache_version differs is reported as invalid.
CACHE_VERSION = 1

# Cache directory used when none has been configured.
DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

# Cacheable data category => name of the JSON file (inside data_dir) that it is
# validated against. html_entities_combined is intentionally absent because it
# has no source file of its own; see DERIVED_SOURCES.
DATA_SOURCES = {
  mediawiki: "mediawiki_aliases.json",
  template: "template_aliases.json",
  html_entities: "html_entities.json",
  wikipedia_entities: "wikipedia_entities.json",
  language_metadata: "language_metadata.json",
  language_tiers: "language_tiers.json"
}.freeze

# Derived category => the source categories it is combined from. A derived
# category counts as valid when all of its source categories are valid.
DERIVED_SOURCES = {
  html_entities_combined: %i[html_entities wikipedia_entities]
}.freeze
|
|
31
|
+
|
|
32
|
+
class << self
|
|
33
|
+
attr_accessor :cache_dir, :enabled
|
|
34
|
+
|
|
35
|
+
# Sets the cache directory and the enabled flag. A nil (or false) cache_dir
# falls back to DEFAULT_CACHE_DIR. Called with no arguments at the end of the
# class body, so the defaults are in effect as soon as the class is loaded.
#
# @param cache_dir [String, nil] directory that will hold the cache database
# @param enabled [Boolean] whether the cache may be read and written
# @return [Boolean] the enabled value that was stored
def configure(cache_dir: nil, enabled: true)
  @cache_dir = cache_dir
  @cache_dir ||= DEFAULT_CACHE_DIR
  @enabled = enabled
end
|
|
39
|
+
|
|
40
|
+
# Path of the SQLite database file, "global_data.sqlite3" inside the cache
# directory. Falls back to (and remembers) DEFAULT_CACHE_DIR when no cache
# directory has been set yet.
#
# @return [String]
def cache_path
  @cache_dir = DEFAULT_CACHE_DIR unless @cache_dir

  File.join(@cache_dir, "global_data.sqlite3")
end
|
|
44
|
+
|
|
45
|
+
# Path of the "data" directory located beside this source file; the file names
# in DATA_SOURCES are resolved against it.
#
# @return [String]
def data_dir
  here = __dir__
  File.join(here, "data")
end
|
|
48
|
+
|
|
49
|
+
# Check if cache is valid for all source files
|
|
50
|
+
# Whether the cache is usable for every entry of DATA_SOURCES.
#
# False when caching is disabled or the database file is missing. Source files
# that do not exist on disk are ignored. For each existing source file the
# stored metadata must be present, carry the current CACHE_VERSION, and match
# the file's mtime (whole seconds) and size. Any SQLite3::Exception is treated
# as "not valid".
#
# @return [Boolean]
def cache_valid?
  return false unless @enabled && File.exist?(cache_path)

  db = open_db
  DATA_SOURCES.all? do |category, filename|
    source_path = File.join(data_dir, filename)
    next true unless File.exist?(source_path)

    meta = load_metadata(db, category)
    next false unless meta && meta[:cache_version].to_i == CACHE_VERSION

    # Metadata values are stored as text, hence the to_i on the cached side.
    stat = File.stat(source_path)
    meta[:source_mtime].to_i == stat.mtime.to_i && meta[:source_size].to_i == stat.size
  end
rescue SQLite3::Exception
  false
ensure
  db&.close
end
|
|
78
|
+
|
|
79
|
+
# Check if a specific category's cache is valid
|
|
80
|
+
# Whether the cached copy of one category can be trusted.
#
# - false when caching is disabled or the database file is missing
# - a derived category (key of DERIVED_SOURCES) is valid when each of its
#   source categories is valid
# - a category unknown to DATA_SOURCES is valid when a row for it exists in
#   global_data
# - a known category is valid when its source file is missing on disk, or when
#   the stored metadata has the current CACHE_VERSION and matches the file's
#   mtime (whole seconds) and size
#
# Any SQLite3::Exception is treated as "not valid".
#
# @param category [Symbol]
# @return [Boolean]
def category_valid?(category)
  return false unless @enabled
  return false unless File.exist?(cache_path)

  if DERIVED_SOURCES.key?(category)
    return DERIVED_SOURCES[category].all? { |source_category| category_valid?(source_category) }
  end

  filename = DATA_SOURCES[category]
  db = open_db

  # No source file to compare against: presence of a cached row is all we can check.
  unless filename
    return !db.get_first_row("SELECT 1 FROM global_data WHERE category = ?", [category.to_s]).nil?
  end

  source_path = File.join(data_dir, filename)
  return true unless File.exist?(source_path)

  meta = load_metadata(db, category)
  return false unless meta && meta[:cache_version].to_i == CACHE_VERSION

  stat = File.stat(source_path)
  meta[:source_mtime].to_i == stat.mtime.to_i && meta[:source_size].to_i == stat.size
rescue SQLite3::Exception
  false
ensure
  db&.close
end
|
|
124
|
+
|
|
125
|
+
# Load data from cache
|
|
126
|
+
# @param category [Symbol] Data category (:mediawiki, :template, etc.)
|
|
127
|
+
# @return [Hash, nil] Parsed data or nil if not cached or invalid
|
|
128
|
+
# Reads one category from the cache and parses its JSON payload.
#
# Returns nil when caching is disabled, the database file is missing, the
# category fails category_valid?, no row is stored for it, or reading/parsing
# raises SQLite3::Exception or JSON::ParserError.
#
# @param category [Symbol] data category (:mediawiki, :template, ...)
# @return [Object, nil] the parsed JSON value, or nil
def load(category)
  return nil unless @enabled && File.exist?(cache_path) && category_valid?(category)

  db = open_db
  row = db.get_first_row("SELECT data FROM global_data WHERE category = ?", [category.to_s])
  row ? JSON.parse(row[0]) : nil
rescue SQLite3::Exception, JSON::ParserError
  nil
ensure
  db&.close
end
|
|
148
|
+
|
|
149
|
+
# Save data to cache
|
|
150
|
+
# @param category [Symbol] Data category
|
|
151
|
+
# @param data [Hash] Data to cache
|
|
152
|
+
# Stores one category as JSON in global_data (replacing any earlier row) and
# records the fingerprint (path, mtime, size, CACHE_VERSION) of its source
# file(s) in the metadata table. Does nothing when caching is disabled.
# A SQLite3::Exception is reported with warn instead of being raised.
#
# Which fingerprints are written:
# - derived category: one per source category listed in DERIVED_SOURCES,
#   stored under the *source* category's name
# - regular category: its own, when DATA_SOURCES names a file that exists
# - anything else: none
#
# NOTE(review): for a derived category only the derived row is written, yet
# the source categories' metadata is refreshed; category_valid? compares
# metadata only, so a source category's older row could then be accepted.
# Confirm callers always re-save the source categories as well.
#
# @param category [Symbol] data category
# @param data [Object] JSON-serializable data to cache
def save(category, data)
  return unless @enabled

  FileUtils.mkdir_p(File.dirname(cache_path))

  db = open_db
  create_schema(db)

  db.execute(
    "INSERT OR REPLACE INTO global_data (category, data, updated_at) VALUES (?, ?, ?)",
    [category.to_s, JSON.generate(data), Time.now.to_i]
  )

  # Fingerprints the source file behind one category, if there is such a file.
  stamp = lambda do |source_category|
    filename = DATA_SOURCES[source_category]
    next unless filename

    source_path = File.join(data_dir, filename)
    next unless File.exist?(source_path)

    stat = File.stat(source_path)
    save_metadata(db, source_category,
                  source_path: source_path,
                  source_mtime: stat.mtime.to_i,
                  source_size: stat.size,
                  cache_version: CACHE_VERSION)
  end

  if DERIVED_SOURCES.key?(category)
    DERIVED_SOURCES[category].each(&stamp)
  else
    stamp.call(category)
  end
rescue SQLite3::Exception => e
  warn "GlobalDataCache: Failed to save #{category}: #{e.message}"
ensure
  db&.close
end
|
|
205
|
+
|
|
206
|
+
# Load all data categories at once (more efficient)
|
|
207
|
+
# @return [Hash] { category => data }
|
|
208
|
+
# Reads every row of global_data in one query and parses each payload.
# Unlike load, no per-category validity check is made here.
#
# Returns an empty hash when caching is disabled, the database file is
# missing, or reading/parsing raises SQLite3::Exception or JSON::ParserError
# (so a single malformed row discards the whole result).
#
# @return [Hash{Symbol => Object}] category => parsed data
def load_all
  return {} unless @enabled && File.exist?(cache_path)

  db = open_db
  rows = db.execute("SELECT category, data FROM global_data")
  rows.each_with_object({}) do |(name, json), loaded|
    loaded[name.to_sym] = JSON.parse(json)
  end
rescue SQLite3::Exception, JSON::ParserError
  {}
ensure
  db&.close
end
|
|
226
|
+
|
|
227
|
+
# Save all data categories at once
|
|
228
|
+
# @param data_hash [Hash] { category => data }
|
|
229
|
+
# Stores several categories in one transaction. Does nothing when caching is
# disabled.
#
# Each entry replaces its row in global_data; entries whose category is listed
# in DATA_SOURCES and whose source file exists also get their fingerprint
# (path, mtime, size, CACHE_VERSION) written to the metadata table. Categories
# that are not keys of DATA_SOURCES get no metadata here.
#
# The writes run inside SQLite3::Database#transaction, which commits when the
# block finishes and rolls back if it raises, so either every entry is stored
# or none is. A SQLite3::Exception is reported with warn instead of being
# raised; other exceptions (e.g. from JSON.generate) propagate after the
# rollback.
#
# @param data_hash [Hash{Symbol => Object}] category => JSON-serializable data
def save_all(data_hash)
  return unless @enabled

  FileUtils.mkdir_p(File.dirname(cache_path))

  begin
    db = open_db
    create_schema(db)

    # One timestamp for the whole batch: the rows are written together.
    saved_at = Time.now.to_i

    db.transaction do
      data_hash.each do |category, data|
        db.execute(
          "INSERT OR REPLACE INTO global_data (category, data, updated_at) VALUES (?, ?, ?)",
          [category.to_s, JSON.generate(data), saved_at]
        )

        # Only known data sources have a file to fingerprint.
        filename = DATA_SOURCES[category]
        next unless filename

        source_path = File.join(data_dir, filename)
        next unless File.exist?(source_path)

        source_stat = File.stat(source_path)
        save_metadata(db, category,
                      source_path: source_path,
                      source_mtime: source_stat.mtime.to_i,
                      source_size: source_stat.size,
                      cache_version: CACHE_VERSION)
      end
    end
  rescue SQLite3::Exception => e
    warn "GlobalDataCache: Failed to save all: #{e.message}"
  ensure
    db&.close
  end
end
|
|
270
|
+
|
|
271
|
+
# Clear cache
|
|
272
|
+
# Deletes the cache database together with its SQLite sidecar files.
#
# open_db switches the database to WAL journaling, and in that mode SQLite
# keeps a write-ahead log ("<db>-wal") and a shared-memory index ("<db>-shm")
# next to the main file. Removing only the main file could leave those behind
# to be picked up by the next database created at the same path, so all three
# are removed. Files that do not exist are ignored.
def clear!
  db_file = cache_path
  FileUtils.rm_f([db_file, "#{db_file}-wal", "#{db_file}-shm"])
end
|
|
275
|
+
|
|
276
|
+
# Get cache statistics
|
|
277
|
+
# Summary of what the cache currently holds.
#
# @return [Hash, nil] nil when the database file is missing or the query
#   raises SQLite3::Exception; otherwise a hash with
#   :cache_path (String), :cache_size (file size in bytes) and :categories,
#   an array of { category:, data_size:, updated_at: } hashes where data_size
#   is SQL LENGTH(data) and updated_at is a Time (nil when no timestamp is
#   stored)
def stats
  return nil unless File.exist?(cache_path)

  db = open_db
  rows = db.execute("SELECT category, LENGTH(data), updated_at FROM global_data")

  {
    cache_path: cache_path,
    cache_size: File.size(cache_path),
    categories: rows.map do |name, length, stamp|
      { category: name, data_size: length, updated_at: stamp ? Time.at(stamp) : nil }
    end
  }
rescue SQLite3::Exception
  nil
ensure
  db&.close
end
|
|
301
|
+
|
|
302
|
+
private
|
|
303
|
+
|
|
304
|
+
# Opens the cache database at cache_path and applies the connection settings
# used throughout this class: WAL journaling and synchronous = NORMAL.
#
# If one of the PRAGMA statements raises, the handle is closed before the
# error is re-raised: on that path the handle is never returned, so the
# callers' `ensure db&.close` would have nothing to close.
#
# @return [SQLite3::Database] an open handle; the caller must close it
# @raise [SQLite3::Exception] when opening or configuring the database fails
def open_db
  db = SQLite3::Database.new(cache_path)

  begin
    db.execute("PRAGMA journal_mode = WAL")
    db.execute("PRAGMA synchronous = NORMAL")
  rescue SQLite3::Exception
    db.close
    raise
  end

  db
end
|
|
310
|
+
|
|
311
|
+
# Creates the two cache tables if they do not exist yet:
# - global_data: one row per category holding its payload and an update timestamp
# - metadata: key/value pairs per category (primary key is category + key)
#
# @param db [SQLite3::Database] open handle
def create_schema(db)
  data_table = <<~SQL
    CREATE TABLE IF NOT EXISTS global_data (
      category TEXT PRIMARY KEY,
      data TEXT NOT NULL,
      updated_at INTEGER
    )
  SQL

  metadata_table = <<~SQL
    CREATE TABLE IF NOT EXISTS metadata (
      category TEXT,
      key TEXT,
      value TEXT,
      PRIMARY KEY (category, key)
    )
  SQL

  db.execute(data_table)
  db.execute(metadata_table)
end
|
|
329
|
+
|
|
330
|
+
# Writes each key/value pair of hash into the metadata table for the given
# category, replacing any existing pair with the same key. Category, key and
# value are all stored as strings.
#
# @param db [SQLite3::Database] open handle
# @param category [Symbol, String]
# @param hash [Hash] metadata to store
# @return [Hash] the hash that was passed in
def save_metadata(db, category, hash)
  owner = category.to_s

  hash.each_pair do |key, value|
    db.execute(
      "INSERT OR REPLACE INTO metadata (category, key, value) VALUES (?, ?, ?)",
      [owner, key.to_s, value.to_s]
    )
  end
end
|
|
338
|
+
|
|
339
|
+
# Reads all metadata pairs stored for a category.
#
# @param db [SQLite3::Database] open handle
# @param category [Symbol, String]
# @return [Hash{Symbol => String}, nil] key => stored value, or nil when the
#   category has no metadata or the query raises SQLite3::Exception
def load_metadata(db, category)
  rows = db.execute("SELECT key, value FROM metadata WHERE category = ?", [category.to_s])
  found = rows.to_h { |key, value| [key.to_sym, value] }

  found unless found.empty?
rescue SQLite3::Exception
  nil
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
# Initialize with default settings
|
|
351
|
+
configure
|
|
352
|
+
end
|
|
353
|
+
end
|