wp2txt 1.1.3 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/article.rb
CHANGED
|
@@ -35,7 +35,6 @@ module Wp2txt
|
|
|
35
35
|
text = text.gsub(/\|\n\n+/m) { "|\n" }
|
|
36
36
|
text = remove_html(text)
|
|
37
37
|
text = make_reference(text)
|
|
38
|
-
text = remove_ref(text)
|
|
39
38
|
parse text
|
|
40
39
|
end
|
|
41
40
|
|
|
@@ -43,23 +42,97 @@ module Wp2txt
|
|
|
43
42
|
[tpx, text]
|
|
44
43
|
end
|
|
45
44
|
|
|
45
|
+
# Build the element that represents a section heading.
# @param text [String] heading text (with or without the surrounding = markers)
# @param level [Integer] heading depth (2 for ==, 3 for ===, and so on)
# @return [Array] three-item array of the form [:mw_heading, text, level]
def create_heading_element(text, level)
  [:mw_heading].push(text, level)
end
|
|
52
|
+
|
|
53
|
+
# Determine how deep a heading is by counting its leading = characters.
# @param line [String] a heading line such as "== Heading =="
# @return [Integer] number of leading = signs; 2 when the line has none
def extract_heading_level(line)
  markers = line[/^(=+)/, 1]
  markers ? markers.length : 2
end
|
|
60
|
+
|
|
61
|
+
# Strip the = markers (and the whitespace next to them) from a heading line.
# @param line [String] a heading line such as "== Heading =="
# @return [String] the bare heading text, trimmed of surrounding whitespace
def extract_heading_text(line)
  without_onset = line.gsub(/^=+\s*/, "")
  without_coda = without_onset.gsub(/\s*=+$/, "")
  without_coda.strip
end
|
|
67
|
+
|
|
68
|
+
# Tell whether a line opens more [[ links than it closes with ]].
# A surplus of [[ indicates a link that continues on a following line.
# @param line [String] the line to inspect
# @return [Boolean] true when [[ occurs more often than ]]
def has_unbalanced_link_brackets?(line)
  line.scan("]]").size < line.scan("[[").size
end
|
|
75
|
+
|
|
76
|
+
# Consume one line while inside a multi-line template, tracking {{ }} nesting.
#
# Walks the line left to right, adding one to @brace_depth for every "{{" and
# subtracting one for every "}}". The text belonging to the template is
# appended to the string held by the last entry of @elements.
#
# The template is treated as closed as soon as the depth drops to zero or
# below. Checking only for exactly zero would let a stray "}}" push the depth
# negative, after which the template could never close and every later line
# would be absorbed into it.
#
# @param line [String] the current source line
# @return [String, nil] the text following the closing "}}" when the template
#   ends on this line (possibly just "\n" or ""), or nil when it continues
def process_ml_template_line(line)
  close_pos = nil
  pos = 0

  # Each iteration jumps to the next brace pair, whichever kind comes first.
  while (idx = line.index(/\{\{|\}\}/, pos))
    pos = idx + 2
    if line[idx] == "{"
      @brace_depth += 1
      next
    end

    @brace_depth -= 1
    next if @brace_depth.positive?

    @brace_depth = 0
    close_pos = pos
    break
  end

  if close_pos
    # Template closed here: keep everything up to and including the "}}".
    @elements.last.last << line[0...close_pos]
    line[close_pos..]
  else
    # Template continues on the next line.
    @elements.last.last << line
    nil
  end
end
|
|
113
|
+
|
|
46
114
|
def parse(source)
|
|
47
115
|
@elements = []
|
|
48
116
|
@categories = []
|
|
49
117
|
mode = nil
|
|
118
|
+
@brace_depth = 0
|
|
50
119
|
source.each_line do |line|
|
|
120
|
+
# Collect categories without deduplicating on each line (O(n²) → O(n))
|
|
51
121
|
matched = line.scan(CATEGORY_REGEX)
|
|
52
|
-
if matched && !matched.empty?
|
|
53
|
-
@categories += matched
|
|
54
|
-
@categories.uniq!
|
|
55
|
-
end
|
|
122
|
+
@categories.concat(matched) if matched && !matched.empty?
|
|
56
123
|
|
|
57
124
|
case mode
|
|
58
125
|
when :mw_ml_template
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
126
|
+
# Track brace depth to find where template actually ends
|
|
127
|
+
remaining = process_ml_template_line(line)
|
|
128
|
+
if remaining
|
|
129
|
+
# Template closed, remaining content needs to be processed
|
|
130
|
+
mode = nil
|
|
131
|
+
# Process remaining content if any
|
|
132
|
+
unless remaining.strip.empty?
|
|
133
|
+
@elements << create_element(:mw_paragraph, "\n" + remaining)
|
|
134
|
+
end
|
|
135
|
+
end
|
|
63
136
|
next
|
|
64
137
|
when :mw_ml_link
|
|
65
138
|
scanner = StringScanner.new(line)
|
|
@@ -99,16 +172,26 @@ module Wp2txt
|
|
|
99
172
|
when REDIRECT_REGEX
|
|
100
173
|
@elements << create_element(:mw_redirect, line)
|
|
101
174
|
when IN_HEADING_REGEX
|
|
102
|
-
|
|
103
|
-
|
|
175
|
+
level = extract_heading_level(line)
|
|
176
|
+
# Keep original format for backward compatibility, but also store level
|
|
177
|
+
formatted_line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
|
|
178
|
+
@elements << create_heading_element("\n" + formatted_line + "\n", level)
|
|
104
179
|
when IN_INPUTBOX_REGEX
|
|
105
180
|
@elements << create_element(:mw_inputbox, line)
|
|
106
181
|
when ML_TEMPLATE_ONSET_REGEX
|
|
107
182
|
@elements << create_element(:mw_ml_template, line)
|
|
108
183
|
mode = :mw_ml_template
|
|
184
|
+
# Count initial braces: count {{ minus }} in this line
|
|
185
|
+
@brace_depth = line.scan(/\{\{/).size - line.scan(/\}\}/).size
|
|
109
186
|
when ML_LINK_ONSET_REGEX
|
|
110
|
-
|
|
111
|
-
|
|
187
|
+
# Only treat as multi-line link if brackets are actually unbalanced
|
|
188
|
+
if has_unbalanced_link_brackets?(line)
|
|
189
|
+
@elements << create_element(:mw_ml_link, line)
|
|
190
|
+
mode = :mw_ml_link
|
|
191
|
+
else
|
|
192
|
+
# Brackets are balanced, treat as paragraph
|
|
193
|
+
@elements << create_element(:mw_paragraph, "\n" + line)
|
|
194
|
+
end
|
|
112
195
|
when IN_INPUTBOX_REGEX1
|
|
113
196
|
mode = :mw_inputbox
|
|
114
197
|
@elements << create_element(:mw_inputbox, line)
|
|
@@ -148,6 +231,8 @@ module Wp2txt
|
|
|
148
231
|
@elements << create_element(:mw_paragraph, "\n" + line)
|
|
149
232
|
end
|
|
150
233
|
end
|
|
234
|
+
# Deduplicate categories once at the end (O(n) instead of O(n²))
|
|
235
|
+
@categories.uniq!
|
|
151
236
|
@elements
|
|
152
237
|
end
|
|
153
238
|
end
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "constants"
|
|
4
|
+
|
|
5
|
+
module Wp2txt
  # Checks bz2 files for obvious corruption before they are processed.
  #
  # Validation happens in stages: existence, minimum size, the 4-byte header
  # ("BZ", version 'h', block size '1'-'9') and, optionally, a trial
  # decompression of the first chunk using an external bzip2-compatible tool.
  module Bz2Validator
    # Bz2 magic bytes: "BZ" followed by version ('h') and block size ('1'-'9')
    BZ2_MAGIC = "BZ".freeze
    BZ2_VERSION = "h".freeze
    BZ2_BLOCK_SIZES = ("1".."9").to_a.freeze

    # Files smaller than this are rejected as too small to be valid bz2
    MIN_BZ2_SIZE = 14

    # Maximum number of decompressed bytes read during the trial decompression
    TEST_CHUNK_SIZE = 1_048_576 # 1 MB

    # Outcome of a validation step.
    # valid: Boolean, error_type: Symbol or nil, message: String, details: Hash
    ValidationResult = Struct.new(:valid, :error_type, :message, :details, keyword_init: true) do
      # @return [Boolean] whether the check passed
      def valid?
        valid
      end

      # @return [String] short human-readable summary
      def to_s
        valid ? "Valid bz2 file" : "Invalid: #{message}"
      end
    end

    module_function

    # Perform full validation of a bz2 file.
    # @param path [String] Path to bz2 file
    # @param test_decompress [Boolean] Whether to also try decompressing the
    #   first chunk (slower but more thorough)
    # @return [ValidationResult] result of the first failing check, or a
    #   passing result when every check succeeds
    def validate(path, test_decompress: true)
      unless File.exist?(path)
        return ValidationResult.new(
          valid: false,
          error_type: :not_found,
          message: "File not found",
          details: { path: path }
        )
      end

      file_size = File.size(path)
      if file_size < MIN_BZ2_SIZE
        return ValidationResult.new(
          valid: false,
          error_type: :too_small,
          message: "File too small to be valid bz2 (#{file_size} bytes)",
          details: { size: file_size, minimum: MIN_BZ2_SIZE }
        )
      end

      magic_result = validate_magic_bytes(path)
      return magic_result unless magic_result.valid?

      if test_decompress
        decompress_result = test_decompression(path)
        return decompress_result unless decompress_result.valid?
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Valid bz2 file",
        details: { size: file_size, path: path }
      )
    end

    # Quick validation (existence, size and header only; no decompression).
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] Validation result
    def validate_quick(path)
      validate(path, test_decompress: false)
    end

    # Validate the 4-byte bz2 header.
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] failure with :invalid_magic, :invalid_version,
    #   :invalid_block_size or :read_error; otherwise a passing result whose
    #   details carry the version byte and numeric block size
    def validate_magic_bytes(path)
      # binread yields nil for an empty file; normalise to a (possibly short)
      # string so the checks below report a bad header instead of crashing.
      header = File.binread(path, 4).to_s
      magic = header[0, 2]
      version = header[2]
      block_size = header[3]

      unless magic == BZ2_MAGIC
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_magic,
          message: "Invalid bz2 header (expected 'BZ', got '#{magic.inspect}')",
          details: { expected: BZ2_MAGIC, actual: magic }
        )
      end

      unless version == BZ2_VERSION
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_version,
          message: "Invalid bz2 version byte (expected 'h', got '#{version.inspect}')",
          details: { expected: BZ2_VERSION, actual: version }
        )
      end

      unless BZ2_BLOCK_SIZES.include?(block_size)
        return ValidationResult.new(
          valid: false,
          error_type: :invalid_block_size,
          message: "Invalid bz2 block size (expected '1'-'9', got '#{block_size.inspect}')",
          details: { expected: BZ2_BLOCK_SIZES, actual: block_size }
        )
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Valid bz2 header",
        details: { version: version, block_size: block_size.to_i }
      )
    rescue IOError, Errno::ENOENT, Errno::EACCES => e
      ValidationResult.new(
        valid: false,
        error_type: :read_error,
        message: "Cannot read file: #{e.message}",
        details: { error: e.class.name }
      )
    end

    # Try to decompress the first TEST_CHUNK_SIZE bytes with an external tool.
    # The test is skipped (and reported as passing) when no tool is found.
    # @param path [String] Path to bz2 file
    # @return [ValidationResult] Validation result
    def test_decompression(path)
      bzcat_cmd = find_bzip2_command
      unless bzcat_cmd
        return ValidationResult.new(
          valid: true,
          error_type: nil,
          message: "Skipped decompression test (no bzip2 command)",
          details: { skipped: true }
        )
      end

      # stderr is discarded rather than merged into stdout: otherwise the
      # tool's own error text would count as decompressed output and the
      # failure check below could never trigger.
      output = IO.popen([bzcat_cmd, "-c", "-d", path], "rb", err: File::NULL) do |io|
        io.read(TEST_CHUNK_SIZE)
      end

      # exitstatus is nil when the child was ended by a signal, which can
      # happen because the pipe is closed after a partial read.
      exit_status = $?.exitstatus

      if exit_status != 0 && (output.nil? || output.empty?)
        return ValidationResult.new(
          valid: false,
          error_type: :decompression_failed,
          message: "Decompression failed (corrupted data or truncated file)",
          details: { exit_status: exit_status }
        )
      end

      # Cheap sanity check that the payload looks like markup.
      if output && output.bytesize > 0
        sample = output[0, 1000].to_s.scrub("")
        unless sample.include?("<") && sample.include?(">")
          return ValidationResult.new(
            valid: false,
            error_type: :invalid_content,
            message: "Decompressed content does not appear to be XML",
            details: { sample_size: output.bytesize }
          )
        end
      end

      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Decompression test passed",
        details: { bytes_tested: output&.bytesize || 0 }
      )
    rescue Errno::EPIPE
      # Broken pipe is OK - we only read partial output
      ValidationResult.new(
        valid: true,
        error_type: nil,
        message: "Decompression test passed (partial read)",
        details: {}
      )
    rescue IOError, Errno::ENOENT, Errno::EACCES => e
      ValidationResult.new(
        valid: false,
        error_type: :decompression_error,
        message: "Decompression error: #{e.message}",
        details: { error: e.class.name }
      )
    end

    # Find an available bzip2-compatible decompression command via `which`.
    # Candidates are tried in order: lbzip2, pbzip2, bzip2, bzcat.
    # @return [String, nil] Path to the first command found, or nil when none
    #   is found or `which` itself cannot be run
    def find_bzip2_command
      %w[lbzip2 pbzip2 bzip2 bzcat].each do |cmd|
        path = IO.popen(["which", cmd], err: File::NULL, &:read).to_s.strip
        return path unless path.empty?
      end
      nil
    rescue SystemCallError
      # `which` could not be spawned; treat it as "no command available" so
      # callers skip the decompression test instead of crashing.
      nil
    end

    # Collect basic information about a bz2 file.
    # @param path [String] Path to bz2 file
    # @return [Hash, nil] path, size, formatted size, header validity, version
    #   byte, block size and mtime; nil when the file is missing or unreadable
    def file_info(path)
      return nil unless File.exist?(path)

      # A short or empty file gives a short or nil header; the missing bytes
      # are reported as nil rather than raising.
      header = File.binread(path, 4).to_s
      size = File.size(path)
      {
        path: path,
        size: size,
        size_formatted: Wp2txt.format_file_size(size),
        valid_header: header[0, 2] == BZ2_MAGIC,
        version: header[2],
        block_size: header[3]&.to_i,
        mtime: File.mtime(path)
      }
    rescue IOError, Errno::ENOENT, Errno::EACCES
      nil
    end
  end
end
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sqlite3"
|
|
4
|
+
require "json"
|
|
5
|
+
require "fileutils"
|
|
6
|
+
require_relative "constants"
|
|
7
|
+
|
|
8
|
+
module Wp2txt
  # SQLite-based cache for Wikipedia category membership and hierarchy.
  #
  # One database file is kept per language (categories_<lang>.sqlite3). Each
  # cached category stores its article titles, its direct subcategories and
  # the time it was cached; entries older than expiry_days count as stale.
  class CategoryCache
    CACHE_VERSION = 1
    DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

    attr_reader :lang, :cache_path, :expiry_days

    # @param lang [String, Symbol] language code used in the cache file name
    # @param cache_dir [String, nil] directory for the database file
    #   (DEFAULT_CACHE_DIR when nil)
    # @param expiry_days [Integer, nil] freshness window in days
    #   (DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS when nil)
    def initialize(lang, cache_dir: nil, expiry_days: nil)
      @lang = lang.to_s
      @cache_dir = cache_dir || DEFAULT_CACHE_DIR
      @expiry_days = expiry_days || DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
      @cache_path = File.join(@cache_dir, "categories_#{@lang}.sqlite3")
      @db = nil
      ensure_schema
    end

    # Check if a category is cached and fresh.
    # @param category_name [String] Category name (a leading "Category:" prefix is ignored)
    # @return [Boolean] false when absent, stale, or on any database error
    def cached?(category_name)
      open_db
      row = @db.get_first_row(
        "SELECT cached_at FROM categories WHERE name = ?",
        [normalize_name(category_name)]
      )
      return false unless row

      cached_at = row[0]
      return false unless cached_at

      Time.at(cached_at) > Time.now - (@expiry_days * SECONDS_PER_DAY)
    rescue SQLite3::Exception
      false
    end

    # Get category data from cache.
    # @param category_name [String] Category name
    # @return [Hash, nil] { pages: [...], subcats: [...] }, or nil when the
    #   category is not cached, is stale, or the database cannot be read
    def get(category_name)
      return nil unless cached?(category_name)

      name = normalize_name(category_name)
      open_db

      pages = @db.execute(
        "SELECT page_title FROM category_pages WHERE category_name = ?",
        [name]
      ).map { |row| row[0] }

      subcats = @db.execute(
        "SELECT child_name FROM category_hierarchy WHERE parent_name = ?",
        [name]
      ).map { |row| row[0] }

      { pages: pages, subcats: subcats }
    rescue SQLite3::Exception
      nil
    end

    # Save category data to cache, replacing anything stored for the category.
    #
    # Titles and subcategory names are deduplicated first: both tables use a
    # composite primary key, so a single repeated entry would otherwise raise
    # a constraint error and roll back the whole save.
    #
    # Database errors are reported with `warn` and not raised.
    # @param category_name [String] Category name
    # @param pages [Array<String>] Article titles in this category
    # @param subcats [Array<String>] Subcategory names
    def save(category_name, pages, subcats)
      name = normalize_name(category_name)
      unique_pages = pages.uniq
      unique_subcats = subcats.map { |subcat| normalize_name(subcat) }.uniq
      open_db

      @db.execute("BEGIN TRANSACTION")

      @db.execute(
        "INSERT OR REPLACE INTO categories (name, page_count, subcat_count, cached_at) VALUES (?, ?, ?, ?)",
        [name, unique_pages.size, unique_subcats.size, Time.now.to_i]
      )

      # Clear old pages and hierarchy before inserting the new rows
      @db.execute("DELETE FROM category_pages WHERE category_name = ?", [name])
      @db.execute("DELETE FROM category_hierarchy WHERE parent_name = ?", [name])

      insert_rows(
        "INSERT INTO category_pages (category_name, page_title) VALUES (?, ?)",
        unique_pages.map { |page| [name, page] }
      )
      insert_rows(
        "INSERT INTO category_hierarchy (parent_name, child_name) VALUES (?, ?)",
        unique_subcats.map { |subcat| [name, subcat] }
      )

      @db.execute("COMMIT")
    rescue SQLite3::Exception => e
      rollback_quietly
      warn "CategoryCache: Failed to save #{category_name}: #{e.message}"
    end

    # Get all pages in a category tree (recursive).
    # @param category_name [String] Root category name
    # @param max_depth [Integer] Maximum recursion depth (0 = no recursion)
    # @param visited [Set] Already visited categories (for cycle detection)
    # @return [Array<String>] Unique article titles; empty when the root is
    #   not cached or was already visited
    def get_all_pages(category_name, max_depth: 0, visited: nil)
      visited ||= Set.new
      name = normalize_name(category_name)
      return [] if visited.include?(name)

      visited << name
      data = get(name)
      return [] unless data

      pages = data[:pages].dup

      if max_depth > 0
        data[:subcats].each do |subcat|
          pages.concat(get_all_pages(subcat, max_depth: max_depth - 1, visited: visited))
        end
      end

      pages.uniq
    end

    # Get category tree structure.
    # @param category_name [String] Root category name
    # @param max_depth [Integer] Maximum recursion depth
    # @return [Hash] Nested hash with :name, :cached, :page_count, :children
    def get_tree(category_name, max_depth: 0)
      build_tree(category_name, max_depth, Set.new)
    end

    # Get statistics for all cached categories.
    # @return [Hash] row counts, cache file size and oldest/newest cache
    #   times; { lang:, error: } when the database cannot be read
    def stats
      open_db

      total_categories = @db.get_first_value("SELECT COUNT(*) FROM categories")
      total_pages = @db.get_first_value("SELECT COUNT(*) FROM category_pages")
      total_relations = @db.get_first_value("SELECT COUNT(*) FROM category_hierarchy")

      oldest_cache = @db.get_first_value("SELECT MIN(cached_at) FROM categories")
      newest_cache = @db.get_first_value("SELECT MAX(cached_at) FROM categories")

      {
        lang: @lang,
        cache_path: @cache_path,
        cache_size: File.exist?(@cache_path) ? File.size(@cache_path) : 0,
        total_categories: total_categories || 0,
        total_pages: total_pages || 0,
        total_relations: total_relations || 0,
        oldest_cache: oldest_cache ? Time.at(oldest_cache) : nil,
        newest_cache: newest_cache ? Time.at(newest_cache) : nil,
        expiry_days: @expiry_days
      }
    rescue SQLite3::Exception
      { lang: @lang, error: "Failed to read stats" }
    end

    # Clear all cached data by deleting the database file and recreating the schema.
    def clear!
      close_db
      FileUtils.rm_f(@cache_path)
      ensure_schema
    end

    # Remove categories whose cached_at is older than the expiry window,
    # together with their page and hierarchy rows.
    # @return [Integer] number of categories removed (0 on database error)
    def cleanup_expired!
      open_db
      cutoff = Time.now.to_i - (@expiry_days * SECONDS_PER_DAY)

      @db.execute("BEGIN TRANSACTION")

      expired = @db.execute("SELECT name FROM categories WHERE cached_at < ?", [cutoff]).map { |row| row[0] }

      expired.each do |name|
        @db.execute("DELETE FROM category_pages WHERE category_name = ?", [name])
        @db.execute("DELETE FROM category_hierarchy WHERE parent_name = ?", [name])
        @db.execute("DELETE FROM categories WHERE name = ?", [name])
      end

      @db.execute("COMMIT")

      expired.size
    rescue SQLite3::Exception
      rollback_quietly
      0
    end

    # Close database connection
    def close
      close_db
    end

    private

    # Strip a leading "Category:"/"category:" prefix and surrounding whitespace.
    # Anchored with \A so only a prefix at the very start of the name is removed.
    def normalize_name(name)
      name.to_s.sub(/\A[Cc]ategory:/, "").strip
    end

    # Run one prepared INSERT for each row, closing the statement even when
    # an insert raises.
    # @param sql [String] parameterized INSERT statement
    # @param rows [Array<Array>] bind values, one array per row
    def insert_rows(sql, rows)
      return if rows.empty?

      stmt = @db.prepare(sql)
      begin
        rows.each { |row| stmt.execute(row) }
      ensure
        stmt.close
      end
    end

    # Best-effort rollback: failures are ignored, for example when no
    # transaction is active.
    def rollback_quietly
      @db&.execute("ROLLBACK")
    rescue StandardError
      nil
    end

    # Recursive worker for get_tree; `visited` guards against cycles.
    # @return [Hash, nil] node hash, or nil when the category was already visited
    def build_tree(category_name, max_depth, visited)
      name = normalize_name(category_name)
      return nil if visited.include?(name)

      visited << name
      data = get(name)

      result = {
        name: name,
        cached: !data.nil?,
        page_count: data ? data[:pages].size : 0,
        children: []
      }

      if data && max_depth > 0
        data[:subcats].each do |subcat|
          child = build_tree(subcat, max_depth - 1, visited)
          result[:children] << child if child
        end
      end

      result
    end

    # Open the database lazily (creating the cache directory when needed) and
    # apply the connection pragmas. Does nothing when already open.
    def open_db
      return if @db

      FileUtils.mkdir_p(File.dirname(@cache_path))
      @db = SQLite3::Database.new(@cache_path)
      @db.execute("PRAGMA journal_mode = WAL")
      @db.execute("PRAGMA synchronous = NORMAL")
      @db.execute("PRAGMA cache_size = -16000") # 16MB cache
    end

    def close_db
      @db&.close
      @db = nil
    end

    # Create tables and indexes when missing and record the cache version.
    # Database errors are reported with `warn` and not raised.
    def ensure_schema
      open_db

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS categories (
          name TEXT PRIMARY KEY,
          page_count INTEGER DEFAULT 0,
          subcat_count INTEGER DEFAULT 0,
          cached_at INTEGER
        )
      SQL

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS category_pages (
          category_name TEXT NOT NULL,
          page_title TEXT NOT NULL,
          PRIMARY KEY (category_name, page_title)
        )
      SQL

      @db.execute("CREATE INDEX IF NOT EXISTS idx_category_pages_category ON category_pages(category_name)")

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS category_hierarchy (
          parent_name TEXT NOT NULL,
          child_name TEXT NOT NULL,
          PRIMARY KEY (parent_name, child_name)
        )
      SQL

      @db.execute("CREATE INDEX IF NOT EXISTS idx_hierarchy_parent ON category_hierarchy(parent_name)")
      @db.execute("CREATE INDEX IF NOT EXISTS idx_hierarchy_child ON category_hierarchy(child_name)")

      @db.execute(<<~SQL)
        CREATE TABLE IF NOT EXISTS metadata (
          key TEXT PRIMARY KEY,
          value TEXT
        )
      SQL

      # Store cache version
      @db.execute(
        "INSERT OR REPLACE INTO metadata (key, value) VALUES ('cache_version', ?)",
        [CACHE_VERSION.to_s]
      )
    rescue SQLite3::Exception => e
      warn "CategoryCache: Failed to create schema: #{e.message}"
    end
  end
end
|