wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -0,0 +1,352 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "utils"
4
+ require_relative "regex"
5
+ require_relative "section_extractor"
6
+
7
+ module Wp2txt
8
+ # Article formatting utilities for WpApp
9
+ module Formatter
10
# Reports whether debug markers should be appended to formatted output.
# Yields the DEBUG_MODE constant when one is visible from here, otherwise false.
def formatter_debug_mode
  return false unless defined?(DEBUG_MODE)

  DEBUG_MODE
end
14
+
15
# Formats one article according to +config+ and dispatches to the matching
# formatter (metadata-only, section extraction, JSON, or plain text).
#
# Side effect: article.title is replaced by its format_wiki-processed form.
# A copy of the unprocessed title is handed to the downstream formatters as
# config[:title] so magic words in the content can be expanded.
#
# NOTE(review): merging :title replaces whatever value config[:title] held
# before, so downstream `config[:title] ? ... : ...` checks see the title
# string rather than a caller-supplied flag -- confirm this is intended.
#
# @param article [#title, #title=] article being formatted
# @param config [Hash] formatting options
# @return [Object] whatever the selected formatter returns
def format_article(article, config)
  raw_title = article.title.dup
  article.title = format_wiki(article.title, config)
  titled_config = config.merge(title: raw_title)

  # Metadata-only mode: title + section headings + categories.
  return format_metadata_only(article, titled_config) if config[:metadata_only]

  # Summary-only is expressed as a one-section extraction in combined mode.
  if config[:summary_only]
    return format_with_sections(
      article,
      titled_config.merge(sections: [SectionExtractor::SUMMARY_KEY], section_output: "combined")
    )
  end

  # Explicit section extraction (--sections option).
  requested = config[:sections]
  return format_with_sections(article, titled_config) if requested && !requested.empty?

  return format_article_json(article, titled_config) if config[:format] == :json

  format_article_text(article, titled_config)
end
49
+
50
# Extracts the sections requested in config[:sections] and renders them.
#
# Each non-nil section body is passed through format_wiki and cleanup.
# The output shape depends on config[:format] (:json or text) and
# config[:section_output] ("combined" or, by default, "structured").
#
# @param article [Object] article handed to the SectionExtractor
# @param config [Hash] formatting options
# @return [Object, nil] formatted result, or nil when the extractor says the
#   article should be skipped
def format_with_sections(article, config)
  extractor = SectionExtractor.new(
    config[:sections],
    min_length: config[:min_section_length] || 0,
    skip_empty: config[:skip_empty] || false,
    use_aliases: !config[:no_section_aliases],
    alias_file: config[:alias_file],
    track_matches: config[:show_matched_sections] || false
  )
  return nil if extractor.should_skip?(article)

  sections = extractor.extract_sections(article, config)
  matched = extractor.matched_sections

  # Missing sections stay nil; present ones are converted to clean text.
  sections.transform_values! do |body|
    body.nil? ? nil : cleanup(format_wiki(body, config))
  end

  combined = (config[:section_output] || "structured") == "combined"
  if config[:format] == :json
    if combined
      format_sections_combined_json(article, sections, config, matched)
    else
      format_sections_structured_json(article, sections, config, matched)
    end
  elsif combined
    format_sections_combined_text(article, sections, config)
  else
    format_sections_structured_text(article, sections, config)
  end
end
89
+
90
# Builds the structured JSON representation: every requested section is kept
# as its own entry under "sections".
#
# @param article [#title, #categories]
# @param sections [Hash] section name => text (or nil)
# @param config [Hash] reads :category and :show_matched_sections
# @param matched_sections [Hash, nil] match-tracking info from the extractor
# @return [Hash] "title", "sections", plus "categories" when config[:category]
#   is set and "matched_sections" when tracking is on and non-empty
def format_sections_structured_json(article, sections, config, matched_sections = {})
  payload = { "title" => article.title, "sections" => sections }
  payload["categories"] = article.categories.flatten if config[:category]

  report_matches = config[:show_matched_sections] && matched_sections && !matched_sections.empty?
  payload["matched_sections"] = matched_sections if report_matches

  payload
end
103
+
104
# Builds the combined JSON representation: all non-empty sections are joined
# into one "text" value, separated by blank lines.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => text (or nil)
# @param config [Hash] reads :category and :show_matched_sections
# @param matched_sections [Hash, nil] match-tracking info from the extractor
# @return [Hash] "title", "text", "sections_included", plus optional
#   "categories" and "matched_sections"
def format_sections_combined_json(article, sections, config, matched_sections = {})
  # Sections that were not found (nil) or came out empty are left out.
  non_empty = sections.select { |_name, body| body && !body.empty? }

  payload = {
    "title" => article.title,
    "text" => non_empty.values.join("\n\n"),
    "sections_included" => non_empty.keys
  }
  payload["categories"] = article.categories.flatten if config[:category]

  report_matches = config[:show_matched_sections] && matched_sections && !matched_sections.empty?
  payload["matched_sections"] = matched_sections if report_matches

  payload
end
121
+
122
# Renders the structured text form: a TITLE line followed by one
# "SECTION [name]:" entry per requested section ("(not found)" for nil
# bodies) and, when requested and available, a CATEGORIES line.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => text (or nil)
# @param config [Hash] reads :category
# @return [String]
def format_sections_structured_text(article, sections, config)
  text = +"TITLE: #{article.title}\n\n"

  sections.each_pair do |heading, body|
    text << (body.nil? ? "SECTION [#{heading}]: (not found)\n\n" : "SECTION [#{heading}]:\n#{body}\n\n")
  end

  wants_categories = config[:category] && !article.categories.empty?
  text << "CATEGORIES: #{article.categories.flatten.join(', ')}\n" if wants_categories

  # Trailing blank line separates this article from the next one.
  text << "\n"
end
141
+
142
# Renders the combined text form: TITLE and SECTIONS header lines, then all
# non-empty section bodies joined by blank lines, then an optional
# CATEGORIES line.
#
# @param article [#title, #categories]
# @param sections [Hash] section name => text (or nil)
# @param config [Hash] reads :category
# @return [String]
def format_sections_combined_text(article, sections, config)
  # Sections that were not found (nil) or came out empty are left out.
  non_empty = sections.select { |_name, body| body && !body.empty? }

  text = +"TITLE: #{article.title}\n"
  text << "SECTIONS: #{non_empty.keys.join(', ')}\n\n"
  text << non_empty.values.join("\n\n")
  text << "\n\n"

  wants_categories = config[:category] && !article.categories.empty?
  text << "CATEGORIES: #{article.categories.flatten.join(', ')}\n" if wants_categories

  text << "\n"
end
159
+
160
# Produces a metadata-only record (title, section headings, categories)
# for an article, e.g. for analysing section distribution across dumps.
#
# @param article [Object] article handed to SectionExtractor#extract_headings
# @param config [Hash] reads :format (:json selects the Hash form)
# @return [Hash, String] JSON-ready Hash or a TSV line
def format_metadata_only(article, config)
  headings = SectionExtractor.new.extract_headings(article)
  return format_metadata_only_json(article, headings) if config[:format] == :json

  format_metadata_only_text(article, headings)
end
172
+
173
# Builds the JSON-ready metadata record for an article.
#
# @param article [#title, #categories]
# @param sections [Array] section headings
# @return [Hash] "title", "sections" and flattened "categories"
def format_metadata_only_json(article, sections)
  record = {}
  record["title"] = article.title
  record["sections"] = sections
  record["categories"] = article.categories.flatten
  record
end
181
+
182
# Builds one TSV line of metadata for an article.
# Layout: Title<TAB>Section1|Section2|...<TAB>Category1,Category2,...
#
# @param article [#title, #categories]
# @param sections [Array] section headings
# @return [String] newline-terminated TSV record
def format_metadata_only_text(article, sections)
  heading_column = sections.join("|")
  category_column = article.categories.flatten.join(",")

  "#{article.title}\t#{heading_column}\t#{category_column}\n"
end
191
+
192
# Builds the JSON-ready Hash for a whole article.
#
# @param article [#title, #categories, #elements]
# @param config [Hash] reads :category and :category_only
# @return [Hash] keys "title", "categories", "text", "redirect" (in that
#   order); "categories" is nil unless config[:category], "text" is nil when
#   config[:category_only], "redirect" is nil for non-redirect articles
def format_article_json(article, config)
  categories = config[:category] ? article.categories.flatten : nil
  body = config[:category_only] ? nil : build_text_content(article, config).strip

  {
    "title" => article.title,
    "categories" => categories,
    "text" => body,
    "redirect" => extract_redirect(article)
  }
end
217
+
218
# Finds the redirect target of an article, if it has one.
#
# Scans article.elements ([type, content] pairs) and returns the first
# capture group of REDIRECT_REGEX for the first :mw_redirect element whose
# content matches.
#
# @param article [#elements]
# @return [String, nil] redirect target, or nil when none is found
def extract_redirect(article)
  article.elements.each do |kind, body|
    next unless kind == :mw_redirect

    found = body.match(REDIRECT_REGEX)
    return found[1] if found
  end

  nil
end
228
+
229
# Picks the plain-text renderer for an article:
# categories only, body plus categories, or body alone.
#
# @param article [#categories]
# @param config [Hash] reads :category_only and :category
# @return [String]
def format_article_text(article, config)
  return format_category_only(article) if config[:category_only]

  with_categories = config[:category] && !article.categories.empty?
  return format_with_categories(article, config) if with_categories

  format_full_article(article, config)
end
239
+
240
# Concatenates the rendered form of every article element and runs the
# result through cleanup. Elements for which process_element returns nil
# contribute nothing.
#
# @param article [#elements]
# @param config [Hash] formatting options forwarded to process_element
# @return [Object] whatever cleanup returns for the concatenated text
def build_text_content(article, config)
  raw = article.elements.each_with_object(+"") do |element, buffer|
    rendered = process_element(element, config)
    buffer << rendered if rendered
  end

  cleanup(raw)
end
250
+
251
# Renders the category-only text record: title, a TAB, the categories
# separated by ", ", and a trailing newline.
#
# @param article [#title, #categories]
# @return [String]
def format_category_only(article)
  category_list = article.categories.join(", ")
  category_list << "\n"

  "#{article.title}\t" + category_list
end
258
+
259
# Renders the article body followed by a "CATEGORIES: ..." trailer, and
# prefixes the "[[title]]" header when config[:title] is truthy.
#
# The output is assembled into a new string. The text returned by
# build_text_content is never appended to in place, so a frozen return value
# (this file uses frozen string literals) cannot raise FrozenError and the
# helper's result is not modified behind its back.
#
# @param article [#title, #categories, #elements]
# @param config [Hash] reads :title; forwarded to build_text_content
# @return [String]
def format_with_categories(article, config)
  body = build_text_content(article, config)

  # Array#join flattens nested category arrays with the same separator.
  contents = +"#{body}\nCATEGORIES: #{article.categories.join(', ')}\n\n"

  config[:title] ? "\n[[#{article.title}]]\n\n" + contents : contents
end
271
+
272
# Renders the article body, prefixed with the "[[title]]" header when
# config[:title] is truthy.
#
# @param article [#title, #elements]
# @param config [Hash] reads :title; forwarded to build_text_content
# @return [String]
def format_full_article(article, config)
  body = build_text_content(article, config)
  return body unless config[:title]

  "\n[[#{article.title}]]\n\n" + body
end
279
+
280
# Renders a single article element.
#
# @param element [Array] a [type, content] pair
# @param config [Hash] output switches (:heading, :summary_only, :table,
#   :pre, :list, :multiline, :redirect); also forwarded to format_wiki
# @return [String, nil] the element text followed by a newline (two for
#   redirects), or nil when the element type is disabled, is an isolated
#   tag, renders to blank link text, or is unknown outside debug mode.
#   In debug mode a "+TYPE+" marker is appended before the newline.
def process_element(element, config)
  kind, text = element
  debugging = formatter_debug_mode

  # Appends the debug marker (debug mode only) and the line terminator.
  finish = lambda do |body, marker, terminator = "\n"|
    body += marker if debugging
    body + terminator
  end

  case kind
  when :mw_heading
    return nil if config[:summary_only] || !config[:heading]

    finish.call(format_wiki(text, config), "+HEADING+")
  when :mw_paragraph
    finish.call(format_wiki(text, config), "+PARAGRAPH+")
  when :mw_table, :mw_htable
    config[:table] ? finish.call(text, "+TABLE+") : nil
  when :mw_pre
    config[:pre] ? finish.call(text, "+PRE+") : nil
  when :mw_quote
    finish.call(text, "+QUOTE+")
  when :mw_unordered, :mw_ordered, :mw_definition
    config[:list] ? finish.call(text, "+LIST+") : nil
  when :mw_ml_template
    config[:multiline] ? finish.call(text, "+MLTEMPLATE+") : nil
  when :mw_link, :mw_ml_link
    # Links are converted first; ones that end up blank are dropped.
    converted = format_wiki(text, config)
    return nil if converted.strip.empty?

    finish.call(converted, kind == :mw_link ? "+LINK+" : "+MLLINK+")
  when :mw_redirect
    config[:redirect] ? finish.call(text, "+REDIRECT+", "\n\n") : nil
  when :mw_isolated_template
    config[:multiline] ? finish.call(text, "+ISOLATED_TEMPLATE+") : nil
  when :mw_isolated_tag
    nil
  else
    # Unknown element types are only surfaced while debugging.
    debugging ? finish.call(text, "+OTHER+") : nil
  end
end
351
+ end
352
+ end
@@ -0,0 +1,353 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sqlite3"
4
+ require "json"
5
+ require "fileutils"
6
+ require "digest"
7
+
8
+ module Wp2txt
9
+ # SQLite-based cache for global data files (templates, mediawiki aliases, entities)
10
+ # Dramatically speeds up startup by avoiding JSON parsing overhead
11
+ class GlobalDataCache
12
# Version stamp stored with each category's metadata; validity checks
# compare the stored value against this constant.
CACHE_VERSION = 1

# Directory used for the cache database when none is configured.
DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

# Cacheable data categories mapped to their source file names (resolved
# relative to data_dir). html_entities_combined is intentionally absent:
# it has no file of its own and is listed in DERIVED_SOURCES instead.
DATA_SOURCES = {
  mediawiki: "mediawiki_aliases.json",
  template: "template_aliases.json",
  html_entities: "html_entities.json",
  wikipedia_entities: "wikipedia_entities.json",
  language_metadata: "language_metadata.json",
  language_tiers: "language_tiers.json"
}.freeze

# Derived categories mapped to the source categories they are built from.
# A derived category is validated by checking each of those sources.
DERIVED_SOURCES = {
  html_entities_combined: %i[html_entities wikipedia_entities]
}.freeze
31
+
32
+ class << self
33
+ attr_accessor :cache_dir, :enabled
34
+
35
# Sets the cache directory and the enabled flag.
#
# @param cache_dir [String, nil] directory for the cache database;
#   DEFAULT_CACHE_DIR is used when nil
# @param enabled [Boolean] whether the cache may be read and written
# @return [Boolean] the value given for +enabled+
def configure(cache_dir: nil, enabled: true)
  chosen_dir = cache_dir || DEFAULT_CACHE_DIR
  @cache_dir = chosen_dir
  @enabled = enabled
end
39
+
40
# Full path of the SQLite cache file inside the configured cache directory.
# Falls back to (and remembers) DEFAULT_CACHE_DIR when no directory is set.
#
# @return [String]
def cache_path
  @cache_dir = DEFAULT_CACHE_DIR unless @cache_dir

  File.join(@cache_dir, "global_data.sqlite3")
end
44
+
45
# Directory holding the bundled source data files: the "data" folder next
# to this source file.
#
# @return [String]
def data_dir
  here = __dir__
  File.join(here, "data")
end
48
+
49
# Checks whether the cache is usable for every entry in DATA_SOURCES.
#
# A source whose file does not exist is skipped. For each remaining source
# the stored metadata must exist, carry the current CACHE_VERSION, and match
# the source file's mtime (whole seconds) and size.
#
# @return [Boolean] false when the cache is disabled, the cache file is
#   missing, any source is stale, or SQLite raises
def cache_valid?
  return false unless @enabled && File.exist?(cache_path)

  db = open_db
  DATA_SOURCES.all? do |category, filename|
    source_path = File.join(data_dir, filename)
    next true unless File.exist?(source_path)

    meta = load_metadata(db, category)
    next false unless meta && meta[:cache_version].to_i == CACHE_VERSION

    # Metadata values are stored as text, hence the to_i conversions.
    stat = File.stat(source_path)
    meta[:source_mtime].to_i == stat.mtime.to_i && meta[:source_size].to_i == stat.size
  end
rescue SQLite3::Exception
  false
ensure
  db&.close
end
78
+
79
# Checks whether the cached copy of one category can be used.
#
# * Derived categories (DERIVED_SOURCES) are valid when all of their source
#   categories are valid.
# * Categories without an entry in DATA_SOURCES are valid when a row for
#   them exists in global_data.
# * Known sources are valid when their source file is missing, or when the
#   stored metadata has the current CACHE_VERSION and matches the source
#   file's mtime (whole seconds) and size.
#
# @param category [Symbol] data category
# @return [Boolean] false when the cache is disabled, the cache file is
#   missing, the category is stale, or SQLite raises
def category_valid?(category)
  return false unless @enabled && File.exist?(cache_path)

  if DERIVED_SOURCES.key?(category)
    return DERIVED_SOURCES[category].all? { |source| category_valid?(source) }
  end

  filename = DATA_SOURCES[category]
  db = open_db

  unless filename
    # Nothing to compare against: presence in the cache is all we can check.
    row = db.get_first_row("SELECT 1 FROM global_data WHERE category = ?", [category.to_s])
    return !row.nil?
  end

  source_path = File.join(data_dir, filename)
  return true unless File.exist?(source_path)

  meta = load_metadata(db, category)
  return false unless meta && meta[:cache_version].to_i == CACHE_VERSION

  stat = File.stat(source_path)
  meta[:source_mtime].to_i == stat.mtime.to_i && meta[:source_size].to_i == stat.size
rescue SQLite3::Exception
  false
ensure
  db&.close
end
124
+
125
# Reads one category from the cache.
#
# @param category [Symbol] data category (:mediawiki, :template, etc.)
# @return [Object, nil] the parsed JSON payload, or nil when the cache is
#   disabled, the cache file is missing, the category is not valid, no row
#   exists, or reading/parsing fails
def load(category)
  return nil unless @enabled && File.exist?(cache_path) && category_valid?(category)

  db = open_db
  row = db.get_first_row("SELECT data FROM global_data WHERE category = ?", [category.to_s])
  return nil unless row

  JSON.parse(row[0])
rescue SQLite3::Exception, JSON::ParserError
  nil
ensure
  db&.close
end
148
+
149
# Writes one category to the cache and records metadata about its source
# file(s) so later validity checks can detect changes.
#
# For a derived category the metadata is recorded for each of its source
# categories; otherwise it is recorded for the category itself. Metadata is
# only written when the category has a DATA_SOURCES entry whose file exists.
# SQLite failures are reported with +warn+ and otherwise ignored.
#
# @param category [Symbol] data category
# @param data [Hash] data to cache (stored as JSON)
def save(category, data)
  return unless @enabled

  FileUtils.mkdir_p(File.dirname(cache_path))

  db = open_db
  create_schema(db)

  db.execute(
    "INSERT OR REPLACE INTO global_data (category, data, updated_at) VALUES (?, ?, ?)",
    [category.to_s, JSON.generate(data), Time.now.to_i]
  )

  # Stores path/mtime/size/version of one source category's file, if any.
  record_source = lambda do |source_category|
    filename = DATA_SOURCES[source_category]
    next unless filename

    source_path = File.join(data_dir, filename)
    next unless File.exist?(source_path)

    stat = File.stat(source_path)
    save_metadata(db, source_category,
      source_path: source_path,
      source_mtime: stat.mtime.to_i,
      source_size: stat.size,
      cache_version: CACHE_VERSION
    )
  end

  if DERIVED_SOURCES.key?(category)
    DERIVED_SOURCES[category].each { |source_category| record_source.call(source_category) }
  else
    record_source.call(category)
  end
rescue SQLite3::Exception => e
  warn "GlobalDataCache: Failed to save #{category}: #{e.message}"
ensure
  db&.close
end
205
+
206
# Reads every cached category in a single query.
#
# @return [Hash] { category (Symbol) => parsed data }; empty when the cache
#   is disabled, the cache file is missing, or reading/parsing fails
def load_all
  return {} unless @enabled && File.exist?(cache_path)

  db = open_db
  loaded = {}
  db.execute("SELECT category, data FROM global_data") do |name, payload|
    loaded[name.to_sym] = JSON.parse(payload)
  end
  loaded
rescue SQLite3::Exception, JSON::ParserError
  {}
ensure
  db&.close
end
226
+
227
# Writes several categories to the cache inside one transaction and records
# metadata about their source files so later validity checks can detect
# changes.
#
# Derived categories (DERIVED_SOURCES) are handled the same way as in
# +save+: metadata is recorded for each of their source categories. Without
# that, a derived category written through this method could never pass
# +category_valid?+, which validates derived data via its sources' metadata.
#
# SQLite failures roll the transaction back and are reported with +warn+.
#
# @param data_hash [Hash] { category => data }
def save_all(data_hash)
  return unless @enabled

  FileUtils.mkdir_p(File.dirname(cache_path))

  begin
    db = open_db
    create_schema(db)

    db.execute("BEGIN TRANSACTION")

    data_hash.each do |category, data|
      db.execute(
        "INSERT OR REPLACE INTO global_data (category, data, updated_at) VALUES (?, ?, ?)",
        [category.to_s, JSON.generate(data), Time.now.to_i]
      )

      # A derived category is tracked through its sources; any other
      # category is tracked through its own source file (if it has one).
      tracked = DERIVED_SOURCES.key?(category) ? DERIVED_SOURCES[category] : [category]
      tracked.each do |source_category|
        filename = DATA_SOURCES[source_category]
        next unless filename

        source_path = File.join(data_dir, filename)
        next unless File.exist?(source_path)

        source_stat = File.stat(source_path)
        save_metadata(db, source_category,
          source_path: source_path,
          source_mtime: source_stat.mtime.to_i,
          source_size: source_stat.size,
          cache_version: CACHE_VERSION
        )
      end
    end

    db.execute("COMMIT")
  rescue SQLite3::Exception => e
    begin
      db&.execute("ROLLBACK")
    rescue SQLite3::Exception
      # No transaction was open (e.g. BEGIN itself failed); nothing to undo.
    end
    warn "GlobalDataCache: Failed to save all: #{e.message}"
  ensure
    db&.close
  end
end
270
+
271
# Deletes the cache database file. A missing file is not an error
# (FileUtils.rm_f ignores it).
def clear!
  target = cache_path
  FileUtils.rm_f(target)
end
275
+
276
# Summarises the cache contents.
#
# @return [Hash, nil] { cache_path:, cache_size:, categories: [...] } where
#   each category entry has :category, :data_size (LENGTH of the stored
#   data) and :updated_at (Time, or nil when no timestamp is stored);
#   nil when the cache file is missing or SQLite raises
def stats
  return nil unless File.exist?(cache_path)

  db = open_db
  rows = db.execute("SELECT category, LENGTH(data), updated_at FROM global_data")

  {
    cache_path: cache_path,
    cache_size: File.size(cache_path),
    categories: rows.map do |name, size, stamp|
      { category: name, data_size: size, updated_at: stamp ? Time.at(stamp) : nil }
    end
  }
rescue SQLite3::Exception
  nil
ensure
  db&.close
end
301
+
302
+ private
303
+
304
# Opens the cache database at cache_path and applies the connection
# pragmas (WAL journal mode, NORMAL synchronous level).
#
# @return [SQLite3::Database] open handle; the caller must close it
def open_db
  SQLite3::Database.new(cache_path).tap do |db|
    ["PRAGMA journal_mode = WAL", "PRAGMA synchronous = NORMAL"].each do |pragma|
      db.execute(pragma)
    end
  end
end
310
+
311
# Creates the two cache tables if they do not exist yet:
# global_data (one JSON payload per category) and metadata
# (key/value pairs per category).
#
# @param db [SQLite3::Database] open database handle
# @return [Object] result of the last CREATE TABLE statement
def create_schema(db)
  global_data_ddl = <<~SQL
    CREATE TABLE IF NOT EXISTS global_data (
      category TEXT PRIMARY KEY,
      data TEXT NOT NULL,
      updated_at INTEGER
    )
  SQL

  metadata_ddl = <<~SQL
    CREATE TABLE IF NOT EXISTS metadata (
      category TEXT,
      key TEXT,
      value TEXT,
      PRIMARY KEY (category, key)
    )
  SQL

  db.execute(global_data_ddl)
  db.execute(metadata_ddl)
end
329
+
330
# Stores (or replaces) metadata entries for a category, one row per key.
# Category, keys and values are all written as strings.
#
# @param db [SQLite3::Database] open database handle
# @param category [Symbol, String] category the metadata belongs to
# @param hash [Hash] metadata key => value
# @return [Hash] the +hash+ argument
def save_metadata(db, category, hash)
  upsert = "INSERT OR REPLACE INTO metadata (category, key, value) VALUES (?, ?, ?)"

  hash.each do |key, value|
    db.execute(upsert, [category.to_s, key.to_s, value.to_s])
  end
end
338
+
339
# Reads all metadata entries stored for a category.
#
# @param db [SQLite3::Database] open database handle
# @param category [Symbol, String] category to look up
# @return [Hash, nil] { key (Symbol) => value } or nil when no entries
#   exist or SQLite raises
def load_metadata(db, category)
  rows = db.execute("SELECT key, value FROM metadata WHERE category = ?", [category.to_s])
  meta = rows.to_h { |key, value| [key.to_sym, value] }

  meta.empty? ? nil : meta
rescue SQLite3::Exception
  nil
end
348
+ end
349
+
350
+ # Initialize with default settings
351
+ configure
352
+ end
353
+ end