wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96):
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
@@ -35,7 +35,6 @@ module Wp2txt
35
35
  text = text.gsub(/\|\n\n+/m) { "|\n" }
36
36
  text = remove_html(text)
37
37
  text = make_reference(text)
38
- text = remove_ref(text)
39
38
  parse text
40
39
  end
41
40
 
@@ -43,23 +42,97 @@ module Wp2txt
43
42
  [tpx, text]
44
43
  end
45
44
 
45
# Build the parser element that represents a section heading.
#
# @param text [String] heading text, stored exactly as given (with or without = markers)
# @param level [Integer] heading depth (2 for ==, 3 for ===, etc.)
# @return [Array] three-item array of the form [:mw_heading, text, level]
def create_heading_element(text, level)
  element = [:mw_heading]
  # Array#push returns the receiver, so this is also the return value.
  element.push(text, level)
end
52
+
53
# Determine the heading level from the run of = signs that opens a line.
#
# @param line [String] the heading line (e.g., "== Heading ==")
# @return [Integer] number of leading = signs, or 2 when the line does not
#   start with any
def extract_heading_level(line)
  markers = line[/^(=+)/, 1]
  markers ? markers.length : 2
end
60
+
61
# Strip the = markers (and the whitespace next to them) from a heading line.
#
# @param line [String] the heading line
# @return [String] the heading text with leading/trailing = runs removed and
#   surrounding whitespace trimmed
def extract_heading_text(line)
  without_onset = line.gsub(/^=+\s*/, "")
  without_coda = without_onset.gsub(/\s*=+$/, "")
  without_coda.strip
end
67
+
68
# Tell whether a line opens more [[ link brackets than it closes.
#
# Occurrences are counted without overlap; the line is "unbalanced" only
# when the [[ count strictly exceeds the ]] count, which the caller takes
# as the start of a multi-line link.
#
# @param line [String] the line to inspect
# @return [Boolean] true when there are more [[ than ]]
def has_unbalanced_link_brackets?(line)
  openers = line.scan("[[").length
  closers = line.scan("]]").length
  closers < openers
end
75
+
76
# Consume one line while inside a multi-line template, tracking brace depth.
#
# Walks the line left to right, adding 1 to @brace_depth for every "{{" and
# subtracting 1 for every "}}". Scanning stops at the first "}}" that brings
# the depth to exactly 0. The consumed text is appended to the text of the
# most recent entry in @elements.
#
# @param line [String] the current source line
# @return [String, nil] the text following the closing "}}" when the
#   template ends on this line (possibly just "\n" or ""), or nil when the
#   template is still open
def process_ml_template_line(line)
  cursor = 0
  template_end = nil

  while cursor < line.length
    next_open = line.index("{{", cursor)
    next_close = line.index("}}", cursor)
    break unless next_open || next_close

    if next_open && (next_close.nil? || next_open < next_close)
      @brace_depth += 1
      cursor = next_open + 2
      next
    end

    # Not an opener first, so a closer must exist at this point.
    @brace_depth -= 1
    cursor = next_close + 2
    next unless @brace_depth.zero?

    template_end = cursor
    break
  end

  buffer = @elements.last.last
  if template_end.nil?
    # Template continues on the following line.
    buffer << line
    return nil
  end

  buffer << line[0...template_end]
  line[template_end..]
end
113
+
46
114
  def parse(source)
47
115
  @elements = []
48
116
  @categories = []
49
117
  mode = nil
118
+ @brace_depth = 0
50
119
  source.each_line do |line|
120
+ # Collect categories without deduplicating on each line (O(n²) → O(n))
51
121
  matched = line.scan(CATEGORY_REGEX)
52
- if matched && !matched.empty?
53
- @categories += matched
54
- @categories.uniq!
55
- end
122
+ @categories.concat(matched) if matched && !matched.empty?
56
123
 
57
124
  case mode
58
125
  when :mw_ml_template
59
- scanner = StringScanner.new(line)
60
- str = process_nested_structure(scanner, "{{", "}}") { "" }
61
- mode = nil if ML_TEMPLATE_END_REGEX =~ str
62
- @elements.last.last << line
126
+ # Track brace depth to find where template actually ends
127
+ remaining = process_ml_template_line(line)
128
+ if remaining
129
+ # Template closed, remaining content needs to be processed
130
+ mode = nil
131
+ # Process remaining content if any
132
+ unless remaining.strip.empty?
133
+ @elements << create_element(:mw_paragraph, "\n" + remaining)
134
+ end
135
+ end
63
136
  next
64
137
  when :mw_ml_link
65
138
  scanner = StringScanner.new(line)
@@ -99,16 +172,26 @@ module Wp2txt
99
172
  when REDIRECT_REGEX
100
173
  @elements << create_element(:mw_redirect, line)
101
174
  when IN_HEADING_REGEX
102
- line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
103
- @elements << create_element(:mw_heading, "\n" + line + "\n")
175
+ level = extract_heading_level(line)
176
+ # Keep original format for backward compatibility, but also store level
177
+ formatted_line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
178
+ @elements << create_heading_element("\n" + formatted_line + "\n", level)
104
179
  when IN_INPUTBOX_REGEX
105
180
  @elements << create_element(:mw_inputbox, line)
106
181
  when ML_TEMPLATE_ONSET_REGEX
107
182
  @elements << create_element(:mw_ml_template, line)
108
183
  mode = :mw_ml_template
184
+ # Count initial braces: count {{ minus }} in this line
185
+ @brace_depth = line.scan(/\{\{/).size - line.scan(/\}\}/).size
109
186
  when ML_LINK_ONSET_REGEX
110
- @elements << create_element(:mw_ml_link, line)
111
- mode = :mw_ml_link
187
+ # Only treat as multi-line link if brackets are actually unbalanced
188
+ if has_unbalanced_link_brackets?(line)
189
+ @elements << create_element(:mw_ml_link, line)
190
+ mode = :mw_ml_link
191
+ else
192
+ # Brackets are balanced, treat as paragraph
193
+ @elements << create_element(:mw_paragraph, "\n" + line)
194
+ end
112
195
  when IN_INPUTBOX_REGEX1
113
196
  mode = :mw_inputbox
114
197
  @elements << create_element(:mw_inputbox, line)
@@ -148,6 +231,8 @@ module Wp2txt
148
231
  @elements << create_element(:mw_paragraph, "\n" + line)
149
232
  end
150
233
  end
234
+ # Deduplicate categories once at the end (O(n) instead of O(n²))
235
+ @categories.uniq!
151
236
  @elements
152
237
  end
153
238
  end
@@ -0,0 +1,239 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "constants"
4
+
5
module Wp2txt
  # Validates bz2 files for corruption and integrity so that broken files are
  # detected early, before processing starts.
  #
  # The checks report problems through a ValidationResult (valid flag,
  # error_type symbol, human-readable message, details hash) rather than by
  # raising for unreadable or malformed files.
  module Bz2Validator
    # Bz2 magic bytes: "BZ" followed by version ('h') and block size ('1'-'9')
    BZ2_MAGIC = "BZ".freeze
    BZ2_VERSION = "h".freeze
    BZ2_BLOCK_SIZES = ("1".."9").to_a.freeze

    # Minimum valid bz2 file size (header + minimal compressed data)
    MIN_BZ2_SIZE = 14

    # Number of decompressed bytes read by the decompression test
    TEST_CHUNK_SIZE = 1_048_576 # 1 MB

    # Outcome of a validation step.
    ValidationResult = Struct.new(:valid, :error_type, :message, :details, keyword_init: true) do
      # @return [Boolean] whether the check passed
      def valid?
        valid
      end

      # @return [String] short human-readable summary
      def to_s
        valid ? "Valid bz2 file" : "Invalid: #{message}"
      end
    end

    module_function

    # Perform full validation of a bz2 file: existence, minimum size, header
    # bytes and (optionally) a trial decompression of the first chunk.
    #
    # @param path [String] Path to bz2 file
    # @param test_decompress [Boolean] Whether to test decompression (slower but more thorough)
    # @return [ValidationResult] Result of the first failing check, or a valid result
    def validate(path, test_decompress: true)
      unless File.exist?(path)
        return ValidationResult.new(
          valid: false,
          error_type: :not_found,
          message: "File not found",
          details: { path: path }
        )
      end

      file_size = File.size(path)
      if file_size < MIN_BZ2_SIZE
        return ValidationResult.new(
          valid: false,
          error_type: :too_small,
          message: "File too small to be valid bz2 (#{file_size} bytes)",
          details: { size: file_size, minimum: MIN_BZ2_SIZE }
        )
      end

      magic_result = validate_magic_bytes(path)
      return magic_result unless magic_result.valid?

      if test_decompress
        decompress_result = test_decompression(path)
        return decompress_result unless decompress_result.valid?
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Valid bz2 file",
        details: { size: file_size, path: path }
      )
    end

    # Quick validation (existence, size and magic bytes; no decompression test)
    #
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] Validation result
    def validate_quick(path)
      validate(path, test_decompress: false)
    end

    # Validate the four-byte bz2 header ("BZ", 'h', block-size digit).
    #
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] Validation result; read failures (missing
    #   file, permission problems, directories, ...) yield :read_error
    def validate_magic_bytes(path)
      # binread returns nil for an empty file; normalise to "" so the checks
      # below report an invalid header instead of raising NoMethodError.
      header = File.binread(path, 4).to_s

      unless header[0, 2] == BZ2_MAGIC
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_magic,
          message: "Invalid bz2 header (expected 'BZ', got '#{header[0, 2].inspect}')",
          details: { expected: BZ2_MAGIC, actual: header[0, 2] }
        )
      end

      unless header[2] == BZ2_VERSION
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_version,
          message: "Invalid bz2 version byte (expected 'h', got '#{header[2].inspect}')",
          details: { expected: BZ2_VERSION, actual: header[2] }
        )
      end

      unless BZ2_BLOCK_SIZES.include?(header[3])
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_block_size,
          message: "Invalid bz2 block size (expected '1'-'9', got '#{header[3].inspect}')",
          details: { expected: BZ2_BLOCK_SIZES, actual: header[3] }
        )
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Valid bz2 header",
        details: { version: header[2], block_size: header[3].to_i }
      )
    rescue IOError, SystemCallError => e
      # SystemCallError covers every Errno::* (ENOENT, EACCES, EISDIR, ...).
      ValidationResult.new(
        valid: false,
        error_type: :read_error,
        message: "Cannot read file: #{e.message}",
        details: { error: e.class.name }
      )
    end

    # Test decompression of the first chunk using an external bzip2 tool.
    #
    # Only the first TEST_CHUNK_SIZE decompressed bytes are read, so damage
    # further into the file is not detected here. When no tool is found the
    # test is skipped and reported as valid.
    #
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] Validation result
    def test_decompression(path)
      bzcat_cmd = find_bzip2_command
      unless bzcat_cmd
        return ValidationResult.new(
          valid: true,
          error_type: nil,
          message: "Skipped decompression test (no bzip2 command)",
          details: { skipped: true }
        )
      end

      # stderr is discarded: if it were merged into stdout, the tool's error
      # text would be mistaken for decompressed data and a failed run would
      # never look like "no output".
      output = IO.popen([bzcat_cmd, "-c", "-d", path], "rb", err: File::NULL) do |io|
        io.read(TEST_CHUNK_SIZE)
      end

      # exitstatus is nil when the child was terminated by a signal, which
      # can happen because the pipe is closed after a partial read.
      exit_status = $?.exitstatus

      if exit_status != 0 && (output.nil? || output.empty?)
        return ValidationResult.new(
          valid: false,
          error_type: :decompression_failed,
          message: "Decompression failed (corrupted data or truncated file)",
          details: { exit_status: exit_status }
        )
      end

      # Check if output looks like XML (Wikipedia dumps are XML)
      if output && output.bytesize > 0
        sample = output[0, 1000].to_s.scrub("")
        unless sample.include?("<") && sample.include?(">")
          return ValidationResult.new(
            valid: false,
            error_type: :invalid_content,
            message: "Decompressed content does not appear to be XML",
            details: { sample_size: output.bytesize }
          )
        end
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Decompression test passed",
        details: { bytes_tested: output&.bytesize || 0 }
      )
    rescue Errno::EPIPE
      # Broken pipe is OK - we only read partial output
      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Decompression test passed (partial read)",
        details: {}
      )
    rescue IOError, Errno::ENOENT, Errno::EACCES => e
      ValidationResult.new(
        valid: false,
        error_type: :decompression_error,
        message: "Decompression error: #{e.message}",
        details: { error: e.class.name }
      )
    end

    # Find an available bzip2 decompression command by asking `which` for
    # each candidate in order of preference.
    #
    # @return [String, nil] Path to the first command found, or nil when none
    #   is found or `which` itself cannot be run
    def find_bzip2_command
      %w[lbzip2 pbzip2 bzip2 bzcat].each do |cmd|
        path = IO.popen(["which", cmd], err: File::NULL, &:read).to_s.strip
        return path unless path.empty?
      end
      nil
    rescue SystemCallError
      # `which` is not available on this system.
      nil
    end

    # Get bz2 file information
    #
    # @param path [String] Path to bz2 file
    # @return [Hash, nil] File information, or nil when the file is missing
    #   or cannot be read
    def file_info(path)
      return nil unless File.exist?(path)

      # Empty files make binread return nil; treat that as an empty header.
      header = File.binread(path, 4).to_s
      {
        path: path,
        size: File.size(path),
        size_formatted: Wp2txt.format_file_size(File.size(path)),
        valid_header: header[0, 2] == BZ2_MAGIC,
        version: header[2],
        block_size: header[3]&.to_i,
        mtime: File.mtime(path)
      }
    rescue IOError, SystemCallError
      nil
    end
  end
end
@@ -0,0 +1,313 @@
1
+ # frozen_string_literal: true
2
+
3
require "fileutils"
require "json"
require "set"
require "sqlite3"
require_relative "constants"
7
+
8
module Wp2txt
  # SQLite-based cache for Wikipedia category hierarchy and members.
  #
  # One database file per language is kept under the cache directory. Each
  # cached category stores its article titles, its direct subcategories and
  # the time it was saved; entries older than +expiry_days+ count as missing.
  # Category names are stored without the "Category:" prefix.
  class CategoryCache
    CACHE_VERSION = 1
    DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

    attr_reader :lang, :cache_path, :expiry_days

    # @param lang [String, Symbol] language code used in the cache file name
    # @param cache_dir [String, nil] directory for the database (default: DEFAULT_CACHE_DIR)
    # @param expiry_days [Integer, nil] freshness window in days
    #   (default: DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS)
    def initialize(lang, cache_dir: nil, expiry_days: nil)
      @lang = lang.to_s
      @cache_dir = cache_dir || DEFAULT_CACHE_DIR
      @expiry_days = expiry_days || DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
      @cache_path = File.join(@cache_dir, "categories_#{@lang}.sqlite3")
      @db = nil
      ensure_schema
    end

    # Check if a category is cached and fresh
    # @param category_name [String] Category name (with or without "Category:" prefix)
    # @return [Boolean] false also when the database cannot be read
    def cached?(category_name)
      open_db
      row = @db.get_first_row(
        "SELECT cached_at FROM categories WHERE name = ?",
        [normalize_name(category_name)]
      )
      return false unless row

      cached_at = row[0]
      return false unless cached_at

      # Fresh when saved within the last @expiry_days days
      Time.at(cached_at) > Time.now - (@expiry_days * SECONDS_PER_DAY)
    rescue SQLite3::Exception
      false
    end

    # Get category data from cache
    # @param category_name [String] Category name
    # @return [Hash, nil] { pages: [...], subcats: [...] } or nil if not
    #   cached, expired, or unreadable
    def get(category_name)
      return nil unless cached?(category_name)

      name = normalize_name(category_name)
      open_db

      pages = @db.execute(
        "SELECT page_title FROM category_pages WHERE category_name = ?",
        [name]
      ).map { |row| row[0] }

      subcats = @db.execute(
        "SELECT child_name FROM category_hierarchy WHERE parent_name = ?",
        [name]
      ).map { |row| row[0] }

      { pages: pages, subcats: subcats }
    rescue SQLite3::Exception
      nil
    end

    # Save category data to cache, replacing anything stored for it before.
    #
    # Failures are reported with a warning instead of being raised; the
    # transaction is rolled back so no partial data is left behind.
    #
    # @param category_name [String] Category name
    # @param pages [Array<String>] Article titles in this category
    # @param subcats [Array<String>] Subcategory names
    def save(category_name, pages, subcats)
      name = normalize_name(category_name)
      # Both tables have composite primary keys, so a repeated title would
      # abort the whole save; deduplicate up front.
      page_titles = pages.uniq
      child_names = subcats.map { |subcat| normalize_name(subcat) }.uniq
      open_db

      # The block form commits on success and rolls back when anything raises.
      @db.transaction do
        @db.execute(
          "INSERT OR REPLACE INTO categories (name, page_count, subcat_count, cached_at) VALUES (?, ?, ?, ?)",
          [name, page_titles.size, child_names.size, Time.now.to_i]
        )

        # Clear old pages and hierarchy
        @db.execute("DELETE FROM category_pages WHERE category_name = ?", [name])
        @db.execute("DELETE FROM category_hierarchy WHERE parent_name = ?", [name])

        # The block form of prepare closes the statement even on error.
        unless page_titles.empty?
          @db.prepare("INSERT INTO category_pages (category_name, page_title) VALUES (?, ?)") do |stmt|
            page_titles.each { |page| stmt.execute([name, page]) }
          end
        end

        unless child_names.empty?
          @db.prepare("INSERT INTO category_hierarchy (parent_name, child_name) VALUES (?, ?)") do |stmt|
            child_names.each { |child| stmt.execute([name, child]) }
          end
        end
      end
    rescue SQLite3::Exception => e
      warn "CategoryCache: Failed to save #{category_name}: #{e.message}"
    end

    # Get all pages in a category tree.
    #
    # The tree is walked level by level so every category is expanded at its
    # shallowest distance from the root. (A depth-first walk with one shared
    # visited set can reach a category first through a longer path, with no
    # depth left, and then skip its subcategories.)
    #
    # @param category_name [String] Root category name
    # @param max_depth [Integer] Maximum recursion depth (0 = no recursion)
    # @param visited [Set] Already visited categories (for cycle detection);
    #   names visited by this call are added to it
    # @return [Array<String>] All article titles, without duplicates
    def get_all_pages(category_name, max_depth: 0, visited: nil)
      visited ||= Set.new
      pages = []
      level = [normalize_name(category_name)]
      remaining_depth = max_depth

      until level.empty?
        next_level = []
        level.each do |name|
          next unless visited.add?(name)

          data = get(name)
          next unless data

          pages.concat(data[:pages])
          next_level.concat(data[:subcats].map { |subcat| normalize_name(subcat) })
        end

        break unless remaining_depth > 0

        remaining_depth -= 1
        level = next_level
      end

      pages.uniq
    end

    # Get category tree structure
    # @param category_name [String] Root category name
    # @param max_depth [Integer] Maximum recursion depth
    # @return [Hash] Tree structure with category info
    def get_tree(category_name, max_depth: 0)
      build_tree(category_name, max_depth, Set.new)
    end

    # Get statistics for all cached categories
    # @return [Hash] Statistics, or { lang:, error: } when the database
    #   cannot be read
    def stats
      open_db

      total_categories = @db.get_first_value("SELECT COUNT(*) FROM categories")
      total_pages = @db.get_first_value("SELECT COUNT(*) FROM category_pages")
      total_relations = @db.get_first_value("SELECT COUNT(*) FROM category_hierarchy")

      oldest_cache = @db.get_first_value("SELECT MIN(cached_at) FROM categories")
      newest_cache = @db.get_first_value("SELECT MAX(cached_at) FROM categories")

      {
        lang: @lang,
        cache_path: @cache_path,
        cache_size: File.exist?(@cache_path) ? File.size(@cache_path) : 0,
        total_categories: total_categories || 0,
        total_pages: total_pages || 0,
        total_relations: total_relations || 0,
        oldest_cache: oldest_cache ? Time.at(oldest_cache) : nil,
        newest_cache: newest_cache ? Time.at(newest_cache) : nil,
        expiry_days: @expiry_days
      }
    rescue SQLite3::Exception
      { lang: @lang, error: "Failed to read stats" }
    end

    # Clear all cached data by deleting the database file and recreating an
    # empty schema.
    def clear!
      close_db
      # The database runs in WAL mode; remove the sidecar files too so a
      # stale write-ahead log is never paired with the fresh database.
      FileUtils.rm_f([@cache_path, "#{@cache_path}-wal", "#{@cache_path}-shm"])
      ensure_schema
    end

    # Clear expired entries
    # @return [Integer] number of categories removed (0 on database error)
    def cleanup_expired!
      open_db
      cutoff = Time.now.to_i - (@expiry_days * SECONDS_PER_DAY)
      expired = []

      @db.transaction do
        expired = @db.execute("SELECT name FROM categories WHERE cached_at < ?", [cutoff]).map { |row| row[0] }

        expired.each do |name|
          @db.execute("DELETE FROM category_pages WHERE category_name = ?", [name])
          @db.execute("DELETE FROM category_hierarchy WHERE parent_name = ?", [name])
          @db.execute("DELETE FROM categories WHERE name = ?", [name])
        end
      end

      expired.size
    rescue SQLite3::Exception
      0
    end

    # Close database connection
    def close
      close_db
    end

    private

    # Reduce a category title to the stored form: surrounding whitespace
    # trimmed and one leading "Category:"/"category:" prefix removed.
    # The prefix is anchored to the start of the string (not of any line) and
    # whitespace is trimmed first so " Category:Foo" normalises to "Foo".
    def normalize_name(name)
      name.to_s.strip.sub(/\A[Cc]ategory:/, "").strip
    end

    # Recursively build the tree hash for get_tree.
    # NOTE(review): this walk is depth-first with a shared visited set, so a
    # category reachable by several paths appears only where it is first
    # encountered, possibly with less remaining depth than its shortest path
    # would allow.
    def build_tree(category_name, max_depth, visited)
      name = normalize_name(category_name)
      return nil if visited.include?(name)

      visited << name
      data = get(name)

      result = {
        name: name,
        cached: !data.nil?,
        page_count: data ? data[:pages].size : 0,
        children: []
      }

      if data && max_depth > 0
        data[:subcats].each do |subcat|
          child = build_tree(subcat, max_depth - 1, visited)
          result[:children] << child if child
        end
      end

      result
    end

    # Open the database lazily (creating the cache directory if needed) and
    # apply the connection pragmas.
    def open_db
      return if @db

      FileUtils.mkdir_p(File.dirname(@cache_path))
      @db = SQLite3::Database.new(@cache_path)
      @db.execute("PRAGMA journal_mode = WAL")
      @db.execute("PRAGMA synchronous = NORMAL")
      @db.execute("PRAGMA cache_size = -16000") # 16MB cache
    end

    def close_db
      @db&.close
      @db = nil
    end

    # Create tables and indexes when missing and record the cache version.
    # Schema failures are reported with a warning rather than raised.
    def ensure_schema
      open_db

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS categories (
          name TEXT PRIMARY KEY,
          page_count INTEGER DEFAULT 0,
          subcat_count INTEGER DEFAULT 0,
          cached_at INTEGER
        )
      SQL

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS category_pages (
          category_name TEXT NOT NULL,
          page_title TEXT NOT NULL,
          PRIMARY KEY (category_name, page_title)
        )
      SQL

      @db.execute("CREATE INDEX IF NOT EXISTS idx_category_pages_category ON category_pages(category_name)")

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS category_hierarchy (
          parent_name TEXT NOT NULL,
          child_name TEXT NOT NULL,
          PRIMARY KEY (parent_name, child_name)
        )
      SQL

      @db.execute("CREATE INDEX IF NOT EXISTS idx_hierarchy_parent ON category_hierarchy(parent_name)")
      @db.execute("CREATE INDEX IF NOT EXISTS idx_hierarchy_child ON category_hierarchy(child_name)")

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS metadata (
          key TEXT PRIMARY KEY,
          value TEXT
        )
      SQL

      # Store cache version
      @db.execute(
        "INSERT OR REPLACE INTO metadata (key, value) VALUES ('cache_version', ?)",
        [CACHE_VERSION.to_s]
      )
    rescue SQLite3::Exception => e
      warn "CategoryCache: Failed to create schema: #{e.message}"
    end
  end
end