wp2txt 1.1.3 → 2.1.0
- checksums.yaml +4 -4
- data/.dockerignore +12 -0
- data/.github/workflows/ci.yml +13 -13
- data/.gitignore +14 -0
- data/CHANGELOG.md +284 -0
- data/DEVELOPMENT.md +415 -0
- data/DEVELOPMENT_ja.md +415 -0
- data/Dockerfile +19 -10
- data/Gemfile +2 -8
- data/README.md +259 -123
- data/README_ja.md +375 -0
- data/Rakefile +4 -0
- data/bin/wp2txt +863 -161
- data/lib/wp2txt/article.rb +98 -13
- data/lib/wp2txt/bz2_validator.rb +239 -0
- data/lib/wp2txt/category_cache.rb +313 -0
- data/lib/wp2txt/cli.rb +319 -0
- data/lib/wp2txt/cli_ui.rb +428 -0
- data/lib/wp2txt/config.rb +158 -0
- data/lib/wp2txt/constants.rb +134 -0
- data/lib/wp2txt/data/html_entities.json +2135 -0
- data/lib/wp2txt/data/language_metadata.json +4769 -0
- data/lib/wp2txt/data/language_tiers.json +59 -0
- data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
- data/lib/wp2txt/data/template_aliases.json +193 -0
- data/lib/wp2txt/data/wikipedia_entities.json +12 -0
- data/lib/wp2txt/extractor.rb +545 -0
- data/lib/wp2txt/file_utils.rb +91 -0
- data/lib/wp2txt/formatter.rb +352 -0
- data/lib/wp2txt/global_data_cache.rb +353 -0
- data/lib/wp2txt/index_cache.rb +258 -0
- data/lib/wp2txt/magic_words.rb +353 -0
- data/lib/wp2txt/memory_monitor.rb +236 -0
- data/lib/wp2txt/multistream.rb +1383 -0
- data/lib/wp2txt/output_writer.rb +182 -0
- data/lib/wp2txt/parser_functions.rb +606 -0
- data/lib/wp2txt/ractor_worker.rb +215 -0
- data/lib/wp2txt/regex.rb +396 -12
- data/lib/wp2txt/section_extractor.rb +354 -0
- data/lib/wp2txt/stream_processor.rb +271 -0
- data/lib/wp2txt/template_expander.rb +830 -0
- data/lib/wp2txt/text_processing.rb +337 -0
- data/lib/wp2txt/utils.rb +629 -270
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +53 -26
- data/scripts/benchmark_regex.rb +161 -0
- data/scripts/fetch_html_entities.rb +94 -0
- data/scripts/fetch_language_metadata.rb +180 -0
- data/scripts/fetch_mediawiki_data.rb +334 -0
- data/scripts/fetch_template_data.rb +186 -0
- data/scripts/profile_memory.rb +139 -0
- data/spec/article_spec.rb +402 -0
- data/spec/auto_download_spec.rb +314 -0
- data/spec/bz2_validator_spec.rb +193 -0
- data/spec/category_cache_spec.rb +226 -0
- data/spec/category_fetcher_spec.rb +504 -0
- data/spec/cleanup_spec.rb +197 -0
- data/spec/cli_options_spec.rb +678 -0
- data/spec/cli_spec.rb +876 -0
- data/spec/config_spec.rb +194 -0
- data/spec/constants_spec.rb +138 -0
- data/spec/file_utils_spec.rb +170 -0
- data/spec/fixtures/samples.rb +181 -0
- data/spec/formatter_sections_spec.rb +382 -0
- data/spec/global_data_cache_spec.rb +186 -0
- data/spec/index_cache_spec.rb +210 -0
- data/spec/integration_spec.rb +543 -0
- data/spec/magic_words_spec.rb +261 -0
- data/spec/markers_spec.rb +476 -0
- data/spec/memory_monitor_spec.rb +192 -0
- data/spec/multistream_spec.rb +690 -0
- data/spec/output_writer_spec.rb +400 -0
- data/spec/parser_functions_spec.rb +455 -0
- data/spec/ractor_worker_spec.rb +197 -0
- data/spec/regex_spec.rb +281 -0
- data/spec/section_extractor_spec.rb +397 -0
- data/spec/spec_helper.rb +63 -0
- data/spec/stream_processor_spec.rb +579 -0
- data/spec/template_data_spec.rb +246 -0
- data/spec/template_expander_spec.rb +472 -0
- data/spec/template_processing_spec.rb +217 -0
- data/spec/text_processing_spec.rb +312 -0
- data/spec/utils_spec.rb +195 -16
- data/spec/wp2txt_spec.rb +510 -0
- data/wp2txt.gemspec +5 -3
- metadata +146 -18
- data/.rubocop.yml +0 -80
- data/data/output_samples/testdata_en.txt +0 -23002
- data/data/output_samples/testdata_en_category.txt +0 -132
- data/data/output_samples/testdata_en_summary.txt +0 -1376
- data/data/output_samples/testdata_ja.txt +0 -22774
- data/data/output_samples/testdata_ja_category.txt +0 -206
- data/data/output_samples/testdata_ja_summary.txt +0 -1560
- data/data/testdata_en.bz2 +0 -0
- data/data/testdata_ja.bz2 +0 -0
- data/image/screenshot.png +0 -0
data/lib/wp2txt/multistream.rb
@@ -0,0 +1,1383 @@
# frozen_string_literal: true

require "fileutils"
require "json"
require "net/http"
require "uri"
require "openssl"
require "parallel"
require "set"
require_relative "constants"
require_relative "index_cache"
require_relative "category_cache"

module Wp2txt
  # Maximum number of retries for transient network errors
  MAX_HTTP_RETRIES = 3

  # HTTPS-aware HTTP GET helper with proper SSL verification and retry
  # @param uri [URI] The URI to request
  # @param timeout [Integer] Timeout in seconds
  # @param retries [Integer] Maximum number of retries on transient errors
  # @return [Net::HTTPResponse] The HTTP response
  def self.ssl_safe_get(uri, timeout: DEFAULT_HTTP_TIMEOUT, retries: MAX_HTTP_RETRIES)
    attempts = 0
    begin
      attempts += 1
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = timeout
      http.read_timeout = timeout

      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Get.new(uri)
      http.request(request)
    rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNRESET,
           Errno::ECONNREFUSED, Errno::EHOSTUNREACH, OpenSSL::SSL::SSLError => e
      if attempts <= retries
        delay = 2**attempts # Exponential backoff: 2, 4, 8 seconds
        warn " Network error (attempt #{attempts}/#{retries + 1}): #{e.message}. Retrying in #{delay}s..."
        sleep delay
        retry
      end
      raise
    end
  end
  # Manages multistream index for random access to Wikipedia dumps
  # Supports SQLite caching for fast repeated access
  class MultistreamIndex
    attr_reader :index_path, :entries_by_title, :entries_by_id, :stream_offsets

    # Initialize index with optional SQLite caching and early termination
    # @param index_path [String] Path to the bz2 index file
    # @param use_cache [Boolean] Whether to use SQLite cache (default: true)
    # @param cache_dir [String, nil] Directory for SQLite cache (default: ~/.wp2txt/cache)
    # @param target_titles [Array<String>, nil] If provided, stop parsing when all titles found
    # @param show_progress [Boolean] Whether to show progress during parsing (default: true)
    def initialize(index_path, use_cache: true, cache_dir: nil, target_titles: nil, show_progress: true)
      @index_path = index_path
      @entries_by_title = {}
      @entries_by_id = {}
      @stream_offsets = []
      @show_progress = show_progress
      @target_titles = target_titles ? Set.new(target_titles) : nil
      @found_targets = Set.new if @target_titles

      # Try to load from cache first
      if use_cache && @target_titles.nil?
        @cache = IndexCache.new(index_path, cache_dir: cache_dir)
        if load_from_cache
          return
        end
      else
        @cache = nil
      end

      # Parse index file
      load_index

      # Save to cache for future use (only if full parse completed)
      if @cache && @target_titles.nil?
        save_to_cache
      end
    end

    # Check if this index was loaded from cache
    def loaded_from_cache?
      @loaded_from_cache == true
    end

    # Check if early termination was triggered
    def early_terminated?
      @early_terminated == true
    end

    def find_by_title(title)
      @entries_by_title[title]
    end

    def find_by_id(page_id)
      @entries_by_id[page_id]
    end

    # Get all articles in a specific stream (by byte offset)
    def articles_in_stream(byte_offset)
      @entries_by_title.values.select { |e| e[:offset] == byte_offset }
    end

    # Get stream offset for a given article
    def stream_offset_for(title)
      entry = find_by_title(title)
      entry ? entry[:offset] : nil
    end

    # Get N random articles
    def random_articles(count)
      @entries_by_title.keys.sample(count)
    end

    # Get first N articles
    def first_articles(count)
      @entries_by_title.keys.first(count)
    end

    # Total number of articles
    def size
      @entries_by_title.size
    end

    private

    def load_from_cache
      return false unless @cache&.valid?

      print " Loading index from cache..." if @show_progress
      $stdout.flush

      data = @cache.load
      return false unless data

      @entries_by_title = data[:entries_by_title]
      @entries_by_id = data[:entries_by_id]
      @stream_offsets = data[:stream_offsets]
      @loaded_from_cache = true

      puts " #{@entries_by_title.size} entries loaded" if @show_progress
      true
    end

    def save_to_cache
      return unless @cache

      print " Saving index to cache..." if @show_progress
      $stdout.flush

      @cache.save(@entries_by_title, @stream_offsets)

      puts " done" if @show_progress
    rescue StandardError => e
      puts " failed (#{e.message})" if @show_progress
      # Non-fatal: continue without cache
    end

    def load_index
      return unless File.exist?(@index_path)

      # Handle both .bz2 and plain text index files
      if @index_path.end_with?(".bz2")
        require "open3"
        IO.popen(["bzcat", @index_path], "r") do |io|
          parse_index_stream(io)
        end
      else
        File.open(@index_path, "r") do |io|
          parse_index_stream(io)
        end
      end
    end

    def parse_index_stream(io)
      count = 0
      io.each_line do |line|
        line = line.strip
        next if line.empty?

        parts = line.split(":", 3)
        next if parts.size < 3

        offset = parts[0].to_i
        page_id = parts[1].to_i
        title = parts[2]

        entry = { offset: offset, page_id: page_id, title: title }
        @entries_by_title[title] = entry
        @entries_by_id[page_id] = entry

        if @stream_offsets.empty? || @stream_offsets.last != offset
          @stream_offsets << offset
        end

        # Early termination: check if we found all target titles
        if @target_titles
          @found_targets << title if @target_titles.include?(title)
          if @found_targets.size == @target_titles.size
            @early_terminated = true
            print "\r Found all #{@target_titles.size} target articles" if @show_progress
            puts if @show_progress
            break
          end
        end

        count += 1
        if @show_progress && count % INDEX_PROGRESS_THRESHOLD == 0
          print "\r Parsed #{count / 1_000_000.0}M entries..."
          $stdout.flush
        end
      end
      print "\r" + " " * 40 + "\r" if @show_progress && count >= INDEX_PROGRESS_THRESHOLD && !@early_terminated
    end
  end

  # Reads articles from multistream bz2 files
  class MultistreamReader
    attr_reader :multistream_path, :index

    # Initialize reader with multistream file and index
    # @param multistream_path [String] Path to the multistream bz2 file
    # @param index_or_path [MultistreamIndex, String] Either an existing index instance or path to index file
    # @param use_cache [Boolean] Whether to use SQLite cache for index (default: true, only used if index_or_path is a path)
    # @param cache_dir [String, nil] Directory for SQLite cache (only used if index_or_path is a path)
    def initialize(multistream_path, index_or_path, use_cache: true, cache_dir: nil)
      @multistream_path = multistream_path

      # Accept either an existing index or a path to create one
      if index_or_path.is_a?(MultistreamIndex)
        @index = index_or_path
      else
        @index = MultistreamIndex.new(index_or_path, use_cache: use_cache, cache_dir: cache_dir)
      end
    end

    # Extract a single article by title
    def extract_article(title)
      entry = @index.find_by_title(title)
      return nil unless entry

      stream_content = read_stream_at(entry[:offset])
      extract_page_from_xml(stream_content, title)
    end

    # Extract multiple articles (sequential)
    def extract_articles(titles)
      # Group by stream offset for efficiency
      grouped = titles.group_by { |t| @index.stream_offset_for(t) }

      results = {}
      grouped.each do |offset, titles_in_stream|
        next unless offset

        stream_content = read_stream_at(offset)
        titles_in_stream.each do |title|
          page = extract_page_from_xml(stream_content, title)
          results[title] = page if page
        end
      end
      results
    end

    # Extract multiple articles in parallel (by stream)
    # @param titles [Array<String>] Article titles to extract
    # @param num_processes [Integer] Number of parallel processes (default: 4)
    # @param progress_callback [Proc, nil] Optional callback for progress updates
    # @return [Hash] Map of title => page data
    def extract_articles_parallel(titles, num_processes: 4, &progress_callback)
      # Group titles by stream offset
      grouped = titles.group_by { |t| @index.stream_offset_for(t) }
      grouped.delete(nil) # Remove titles not found in index

      # Process streams in parallel
      stream_results = Parallel.map(grouped.keys, in_processes: num_processes) do |offset|
        titles_in_stream = grouped[offset]
        stream_content = read_stream_at(offset)

        stream_pages = {}
        titles_in_stream.each do |title|
          page = extract_page_from_xml(stream_content, title)
          stream_pages[title] = page if page
        end

        stream_pages
      end

      # Merge results from all streams
      results = {}
      stream_results.each do |stream_pages|
        results.merge!(stream_pages)
      end

      results
    end

    # Iterate through articles in parallel, yielding each page
    # Groups articles by stream and processes streams in parallel
    # @param entries [Array<Hash>] Array of index entries with :title and :offset
    # @param num_processes [Integer] Number of parallel processes
    # @yield [Hash] Page data for each article
    def each_article_parallel(entries, num_processes: 4)
      return enum_for(:each_article_parallel, entries, num_processes: num_processes) unless block_given?

      # Group by stream offset
      grouped = entries.group_by { |e| e[:offset] }

      # Process streams in parallel, collecting all pages
      all_pages = Parallel.flat_map(grouped.keys, in_processes: num_processes) do |offset|
        entries_in_stream = grouped[offset]
        stream_content = read_stream_at(offset)

        pages = []
        entries_in_stream.each do |entry|
          page = extract_page_from_xml(stream_content, entry[:title])
          pages << page if page
        end

        pages
      end

      # Yield each page (sequential, as yielding must happen in main process)
      all_pages.each { |page| yield page }
    end

    # Iterate through all articles in a stream
    def each_article_in_stream(offset, &block)
      stream_content = read_stream_at(offset)
      extract_all_pages_from_xml(stream_content, &block)
    end

    # Iterate through first N streams
    def each_article_in_first_streams(stream_count, &block)
      @index.stream_offsets.first(stream_count).each do |offset|
        each_article_in_stream(offset, &block)
      end
    end

    private

    def read_stream_at(offset)
      # Read the bz2 stream starting at the given offset
      # We need to find where this stream ends (next stream start or EOF)
      next_offset = find_next_offset(offset)

      File.open(@multistream_path, "rb") do |f|
        f.seek(offset)

        if next_offset
          compressed_data = f.read(next_offset - offset)
        else
          # Last stream - read to end
          compressed_data = f.read
        end

        decompress_bz2(compressed_data)
      end
    end

    def find_next_offset(current_offset)
      idx = @index.stream_offsets.index(current_offset)
      return nil unless idx

      @index.stream_offsets[idx + 1]
    end

    def decompress_bz2(data)
      require "stringio"
      require "open3"

      stdout, status = Open3.capture2("bzcat", stdin_data: data)
      stdout
    end

    def extract_page_from_xml(xml_content, title)
      # Simple extraction - find the page with matching title
      require "nokogiri"

      doc = Nokogiri::XML("<root>#{xml_content}</root>")
      doc.xpath("//page").each do |page_node|
        page_title = page_node.at_xpath("title")&.text
        if page_title == title
          return {
            title: page_title,
            id: page_node.at_xpath("id")&.text&.to_i,
            text: page_node.at_xpath(".//text")&.text || ""
          }
        end
      end
      nil
    end

    def extract_all_pages_from_xml(xml_content, &block)
      require "nokogiri"

      doc = Nokogiri::XML("<root>#{xml_content}</root>")
      doc.xpath("//page").each do |page_node|
        page = {
          title: page_node.at_xpath("title")&.text,
          id: page_node.at_xpath("id")&.text&.to_i,
          text: page_node.at_xpath(".//text")&.text || ""
        }
        yield page if page[:title]
      end
    end
  end

  # Manages downloading and caching of dump files
  # Supports any Wikipedia language code (e.g., en, ja, de, fr, zh, ar, etc.)
  # Language metadata is stored in lib/wp2txt/data/language_metadata.json
  class DumpManager
    DUMP_BASE_URL = "https://dumps.wikimedia.org"
    DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")

    # Legacy constant for backward compatibility
    CACHE_DIR = "tmp/dump_cache"

    attr_reader :lang, :cache_dir, :dump_expiry_days

    class << self
      # Get default cache directory
      def default_cache_dir
        DEFAULT_CACHE_DIR
      end
    end

    def initialize(lang, cache_dir: nil, dump_expiry_days: nil)
      @lang = lang.to_sym
      @cache_dir = cache_dir || DEFAULT_CACHE_DIR
      @dump_expiry_days = dump_expiry_days || Wp2txt::DEFAULT_DUMP_EXPIRY_DAYS
      FileUtils.mkdir_p(@cache_dir)
    end

    # Format bytes as human-readable string
    def format_size(bytes)
      Wp2txt.format_file_size(bytes)
    end

    # Get the latest dump date for a language
    def latest_dump_date
      @latest_dump_date ||= fetch_latest_dump_date
    end

    # Download multistream index file
    def download_index(force: false)
      index_path = cached_index_path
      if File.exist?(index_path) && !force
        puts "Index already cached: #{File.basename(index_path)}"
        $stdout.flush
        return index_path
      end

      url = index_url
      puts "Downloading index: #{url}"
      $stdout.flush
      download_file(url, index_path)
      index_path
    end

    # Download multistream dump file
    # @param force [Boolean] Force re-download even if cached
    # @param max_streams [Integer, nil] If set, only download first N streams (partial download)
    def download_multistream(force: false, max_streams: nil)
      # For partial downloads, first check if full dump exists (most efficient)
      if max_streams && !force
        full_path = cached_multistream_path
        if File.exist?(full_path)
          puts "Using cached full dump: #{File.basename(full_path)}"
          $stdout.flush
          return full_path
        end

        # Check if a larger partial download exists
        existing_partial = find_suitable_partial_cache(max_streams)
        if existing_partial
          puts "Using cached partial: #{File.basename(existing_partial)}"
          $stdout.flush
          return existing_partial
        end
      end

      dump_path = max_streams ? cached_partial_multistream_path(max_streams) : cached_multistream_path
      if File.exist?(dump_path) && !force
        puts "Multistream already cached: #{File.basename(dump_path)}"
        $stdout.flush
        return dump_path
      end

      url = multistream_url

      if max_streams
        # Partial download: need index first to know byte range
        index_path = download_index
        index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)

        if index.stream_offsets.size >= max_streams
          # Get byte range for first N streams
          end_offset = index.stream_offsets[max_streams]
          puts "Downloading first #{max_streams} streams (#{format_size(end_offset)}): #{url}"
          $stdout.flush
          download_file_range(url, dump_path, 0, end_offset - 1)
        else
          puts "Only #{index.stream_offsets.size} streams available, downloading all"
          $stdout.flush
          download_file(url, dump_path)
        end
      else
        puts "Downloading multistream: #{url}"
        $stdout.flush
        download_file(url, dump_path)
      end

      dump_path
    end

    # Find a suitable cached partial download (same or larger than needed)
    # @param min_streams [Integer] Minimum number of streams needed
    # @return [String, nil] Path to suitable cached file, or nil
    def find_suitable_partial_cache(min_streams)
      pattern = File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-*streams.xml.bz2")
      Dir.glob(pattern).each do |path|
        if path =~ /multistream-(\d+)streams\.xml\.bz2$/
          stream_count = $1.to_i
          return path if stream_count >= min_streams
        end
      end
      nil
    end

    # Find any existing partial dump (any date)
    # @return [Hash, nil] Info about existing partial dump, or nil
    def find_any_partial_cache
      pattern = File.join(@cache_dir, "#{@lang}wiki-*-multistream-*streams.xml.bz2")
      partials = []

      Dir.glob(pattern).each do |path|
        if path =~ /#{@lang}wiki-(\d{8})-multistream-(\d+)streams\.xml\.bz2$/
          dump_date = $1
          stream_count = $2.to_i
          partials << {
            path: path,
            dump_date: dump_date,
            stream_count: stream_count,
            size: File.size(path),
            mtime: File.mtime(path)
          }
        end
      end

      # Return the largest partial (by stream count)
      partials.max_by { |p| p[:stream_count] }
    end

    # Check if incremental download is possible from existing partial
    # @param partial_info [Hash] Info from find_any_partial_cache
    # @return [Hash] Result with :possible, :reason, and details
    def can_resume_from_partial?(partial_info)
      return { possible: false, reason: :no_partial } unless partial_info

      current_date = latest_dump_date

      # Check if dump dates match
      if partial_info[:dump_date] != current_date
        return {
          possible: false,
          reason: :date_mismatch,
          partial_date: partial_info[:dump_date],
          latest_date: current_date
        }
      end

      # Validate the partial file with Bz2Validator
      require_relative "bz2_validator"
      validation = Bz2Validator.validate_quick(partial_info[:path])
      unless validation.valid?
        return {
          possible: false,
          reason: :invalid_partial,
          error: validation.message
        }
      end

      # Verify file size matches expected offset
      index_path = download_index
      index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)

      expected_size = if partial_info[:stream_count] < index.stream_offsets.size
                        index.stream_offsets[partial_info[:stream_count]]
                      else
                        # Partial has all streams - no need to resume
                        return { possible: false, reason: :already_complete }
                      end

      actual_size = partial_info[:size]
      if actual_size != expected_size
        return {
          possible: false,
          reason: :size_mismatch,
          expected: expected_size,
          actual: actual_size
        }
      end

      {
        possible: true,
        partial_info: partial_info,
        current_streams: partial_info[:stream_count],
        total_streams: index.stream_offsets.size,
        current_size: actual_size
      }
    end

    # Download full dump with incremental support
    # @param force [Boolean] Force re-download
    # @param interactive [Boolean] Prompt user for choices (default: true)
    # @return [String] Path to downloaded file
    def download_multistream_full(force: false, interactive: true)
      full_path = cached_multistream_path

      # If full dump exists, use it
      if File.exist?(full_path) && !force
        puts "Using cached full dump: #{File.basename(full_path)}"
        $stdout.flush
        return full_path
      end

      # Check for existing partial dump
      partial = find_any_partial_cache
      if partial && interactive
        resume_info = can_resume_from_partial?(partial)

        if resume_info[:possible]
          # Same date - can resume
          return handle_resumable_partial(partial, resume_info, force)
        elsif resume_info[:reason] == :date_mismatch
          # Different date - ask user
          return handle_outdated_partial(partial, resume_info, force)
        elsif resume_info[:reason] == :size_mismatch || resume_info[:reason] == :invalid_partial
          # Corrupted partial - inform and re-download
          puts "Warning: Existing partial dump appears corrupted."
          puts " Reason: #{resume_info[:reason]}"
          puts " Will download fresh copy."
          FileUtils.rm_f(partial[:path])
        end
      end

      # Standard full download
      download_multistream(force: force, max_streams: nil)
    end

    private

    def handle_resumable_partial(partial, resume_info, force)
      current = resume_info[:current_streams]
      total = resume_info[:total_streams]
      current_size = resume_info[:current_size]

      # Calculate remaining download size
      index_path = cached_index_path
      index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)

      # Get total file size from HTTP HEAD request
      url = multistream_url
      total_size = get_remote_file_size(url)
      remaining_size = total_size - current_size

      puts
      puts "Found existing partial dump (same date):"
      puts " Current: #{current} streams (#{format_size(current_size)})"
      puts " Total: #{total} streams (#{format_size(total_size)})"
      puts " Remaining: #{format_size(remaining_size)}"
      puts

      print "Download remaining data? [Y/n/f(ull fresh download)]: "
      $stdout.flush
      response = $stdin.gets&.strip&.downcase || "y"

      case response
      when "n", "no"
        puts "Using existing partial dump."
        partial[:path]
      when "f", "full", "fresh"
        puts "Downloading fresh full dump..."
        FileUtils.rm_f(partial[:path])
        download_multistream(force: true, max_streams: nil)
      else
        # Resume download
        puts "Resuming download..."
        download_incremental(partial[:path], current_size, total_size)
      end
    end

    def handle_outdated_partial(partial, resume_info, force)
      puts
      puts "Found existing partial dump with different date:"
      puts " Partial dump: #{partial[:dump_date]} (#{partial[:stream_count]} streams, #{format_size(partial[:size])})"
      puts " Latest dump: #{resume_info[:latest_date]}"
      puts
      puts "Options:"
      puts " [D] Delete old partial and download latest full dump (recommended)"
      puts " [K] Keep old partial, download latest full dump separately"
      puts " [U] Use old partial as-is (may have outdated content)"
      puts

      print "Choice [D/k/u]: "
      $stdout.flush
      response = $stdin.gets&.strip&.downcase || "d"

      case response
      when "k", "keep"
        puts "Keeping old partial, downloading latest full dump..."
        download_multistream(force: true, max_streams: nil)
      when "u", "use"
        puts "Using old partial dump (content may be outdated)."
        partial[:path]
      else
        puts "Deleting old partial and downloading latest..."
        FileUtils.rm_f(partial[:path])
        download_multistream(force: true, max_streams: nil)
      end
    end

    def download_incremental(partial_path, start_byte, total_size)
      url = multistream_url
      full_path = cached_multistream_path

      uri = URI(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = DEFAULT_HTTP_TIMEOUT
      http.read_timeout = DEFAULT_HTTP_TIMEOUT
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Get.new(uri)
      request["Range"] = "bytes=#{start_byte}-"

      # Copy partial to full path first, then append
      FileUtils.cp(partial_path, full_path)

      File.open(full_path, "ab") do |file|
        http.request(request) do |response|
          if response.code == "206"
            remaining = total_size - start_byte
            downloaded = 0

            response.read_body do |chunk|
              file.write(chunk)
              downloaded += chunk.size
              total_downloaded = start_byte + downloaded
              percent = (total_downloaded * 100.0 / total_size).round(1)
              print "\r Progress: #{percent}% (#{format_size(total_downloaded)} / #{format_size(total_size)})"
              $stdout.flush
            end
            puts
          elsif response.code == "200"
            # Server doesn't support Range - need full download
            puts "\nServer doesn't support resume. Downloading full file..."
            file.close
            FileUtils.rm_f(full_path)
            return download_multistream(force: true, max_streams: nil)
          else
            raise "Download failed: #{response.code} #{response.message}"
          end
        end
      end

      # Validate the combined file
      require_relative "bz2_validator"
      validation = Bz2Validator.validate_quick(full_path)
      unless validation.valid?
        puts "Warning: Combined file validation failed. Re-downloading..."
        FileUtils.rm_f(full_path)
        return download_multistream(force: true, max_streams: nil)
      end

      puts "Successfully resumed download!"

      # Optionally remove the partial file
      FileUtils.rm_f(partial_path) if partial_path != full_path

      full_path
    end

    def get_remote_file_size(url)
      uri = URI(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = DEFAULT_HTTP_TIMEOUT
      http.read_timeout = DEFAULT_HTTP_TIMEOUT
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Head.new(uri)
      response = http.request(request)

      response["Content-Length"]&.to_i || 0
    end

    public

    # Path for partial multistream cache
    def cached_partial_multistream_path(stream_count)
      File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-#{stream_count}streams.xml.bz2")
    end

    # Get paths for cached files
    def cached_index_path
      File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-index.txt.bz2")
    end

    def cached_multistream_path
      File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream.xml.bz2")
    end

    # Check if cache is fresh (within configured days)
    def cache_fresh?(days = nil)
      days ||= @dump_expiry_days
      Wp2txt.file_fresh?(cached_index_path, days)
    end

    # Check if cache is stale (beyond configured expiry days)
    def cache_stale?
      !cache_fresh?
    end

    # Get cache age in days
    # Returns nil if no cache exists
    def cache_age_days
      Wp2txt.file_age_days(cached_index_path)
    end

    # Get cache modification time
    # Returns nil if no cache exists
    def cache_mtime
      path = cached_index_path
      return nil unless File.exist?(path)

      File.mtime(path)
    end

    # Get cache status information
    def cache_status
      {
        lang: @lang,
        cache_dir: @cache_dir,
        index_exists: File.exist?(cached_index_path),
        index_path: cached_index_path,
        index_size: File.exist?(cached_index_path) ? File.size(cached_index_path) : 0,
        multistream_exists: File.exist?(cached_multistream_path),
        multistream_path: cached_multistream_path,
        multistream_size: File.exist?(cached_multistream_path) ? File.size(cached_multistream_path) : 0,
        dump_date: (latest_dump_date rescue nil),
        fresh: cache_fresh?,
        age_days: cache_age_days,
        mtime: cache_mtime,
        expiry_days: @dump_expiry_days
      }
    end

    # Clear cache for this language
    def clear_cache!
      lang_dir = File.join(@cache_dir, "#{@lang}wiki")
      FileUtils.rm_rf(lang_dir) if File.exist?(lang_dir)
    end

    # Clear all cache
    def self.clear_all_cache!(cache_dir = DEFAULT_CACHE_DIR)
      FileUtils.rm_rf(cache_dir) if File.exist?(cache_dir)
    end

    # Get status for all cached languages
    def self.all_cache_status(cache_dir = DEFAULT_CACHE_DIR)
      return {} unless File.exist?(cache_dir)

      status = {}
      Dir.glob(File.join(cache_dir, "*wiki")).each do |lang_dir|
        lang = File.basename(lang_dir).sub(/wiki$/, "").to_sym
        manager = new(lang, cache_dir: cache_dir)
        status[lang] = manager.cache_status
      rescue IOError, Errno::ENOENT, Errno::EACCES, JSON::ParserError => e
        status[lang] = { error: e.message }
      end
      status
    end

    private

    def fetch_latest_dump_date
      # Try to find the latest available dump
      wiki = "#{@lang}wiki"
      uri = URI("#{DUMP_BASE_URL}/#{wiki}/")

      response = Wp2txt.ssl_safe_get(uri)
      raise("Failed to fetch dump list for #{wiki}") unless response.is_a?(Net::HTTPSuccess)

      # Find dates in format YYYYMMDD
      dates = response.body.scan(/href="(\d{8})\/"/).flatten
      dates.sort.last || raise("No dumps found for #{wiki}")
    end

    def index_url
      wiki = "#{@lang}wiki"
      date = latest_dump_date
      "#{DUMP_BASE_URL}/#{wiki}/#{date}/#{wiki}-#{date}-pages-articles-multistream-index.txt.bz2"
    end

    def multistream_url
      wiki = "#{@lang}wiki"
      date = latest_dump_date
      "#{DUMP_BASE_URL}/#{wiki}/#{date}/#{wiki}-#{date}-pages-articles-multistream.xml.bz2"
    end

    # Download metadata file path for tracking resumable downloads
    def download_meta_path(path)
      "#{path}.wp2txt_download"
    end

    # Get remote file info via HEAD request
    # @return [Hash] { size:, etag:, last_modified: }
    def get_remote_file_info(url)
      uri = URI(url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = DEFAULT_HTTP_TIMEOUT
      http.read_timeout = DEFAULT_HTTP_TIMEOUT
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Head.new(uri)
      response = http.request(request)

      {
        size: response["Content-Length"]&.to_i || 0,
        etag: response["ETag"],
        last_modified: response["Last-Modified"],
        accept_ranges: response["Accept-Ranges"] == "bytes"
      }
    end

    # Save download metadata for resume support
    def save_download_meta(path, url, remote_info)
      meta = {
        url: url,
        size: remote_info[:size],
        etag: remote_info[:etag],
        last_modified: remote_info[:last_modified],
        started_at: Time.now.iso8601
      }
      File.write(download_meta_path(path), JSON.pretty_generate(meta))
    end

    # Load download metadata
    # @return [Hash, nil] Metadata or nil if not found/invalid
    def load_download_meta(path)
      meta_path = download_meta_path(path)
      return nil unless File.exist?(meta_path)

      JSON.parse(File.read(meta_path), symbolize_names: true)
    rescue JSON::ParserError
      nil
    end

    # Clean up download metadata
    def cleanup_download_meta(path)
      FileUtils.rm_f(download_meta_path(path))
    end

    # Check if resume is safe (server file hasn't changed)
    def can_resume_download?(path, url)
      return false unless File.exist?(path)

      meta = load_download_meta(path)
      return false unless meta

      # Check if metadata is not too old (max 7 days)
      if meta[:started_at]
        started = Time.parse(meta[:started_at]) rescue nil
        if started && (Time.now - started) > days_to_seconds(RESUME_METADATA_MAX_AGE_DAYS)
          puts " Partial download is too old (>#{RESUME_METADATA_MAX_AGE_DAYS} days). Starting fresh."
          return false
        end
      end

      # Get current remote file info
      remote_info = get_remote_file_info(url)

      # Check if ETag matches (most reliable)
      if meta[:etag] && remote_info[:etag]
        if meta[:etag] != remote_info[:etag]
          puts " Server file has changed (ETag mismatch). Starting fresh."
          return false
        end
      # Fallback: check Last-Modified
      elsif meta[:last_modified] && remote_info[:last_modified]
        if meta[:last_modified] != remote_info[:last_modified]
          puts " Server file has changed (Last-Modified mismatch). Starting fresh."
          return false
        end
      end

      # Check if server supports Range requests
      unless remote_info[:accept_ranges]
        puts " Server doesn't support resume. Starting fresh."
        return false
      end

      true
    end

    def download_file(url, path)
      uri = URI(url)
      FileUtils.mkdir_p(File.dirname(path))

      # Check for resumable download
      partial_size = File.exist?(path) ? File.size(path) : 0
      resume_mode = false

      if partial_size > 0 && can_resume_download?(path, url)
        meta = load_download_meta(path)
        total_size = meta[:size]
        if partial_size < total_size
          resume_mode = true
          puts " Resuming download from #{format_size(partial_size)} / #{format_size(total_size)} (#{(partial_size * 100.0 / total_size).round(1)}%)"
        elsif partial_size == total_size
          puts " Download already complete."
          cleanup_download_meta(path)
          return path
        else
          # Partial is larger than expected - corrupted, start fresh
          puts " Partial file corrupted (size mismatch). Starting fresh."
          FileUtils.rm_f(path)
          partial_size = 0
        end
      elsif partial_size > 0
        # Can't resume - remove partial and start fresh
        FileUtils.rm_f(path)
        cleanup_download_meta(path)
        partial_size = 0
      end

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = DEFAULT_HTTP_TIMEOUT
      http.read_timeout = DEFAULT_HTTP_TIMEOUT
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Get.new(uri)

      if resume_mode
        request["Range"] = "bytes=#{partial_size}-"
        file_mode = "ab" # Append mode
      else
        file_mode = "wb" # Write mode (overwrite)
        # Save metadata for potential future resume
        remote_info = get_remote_file_info(url)
        save_download_meta(path, url, remote_info) if remote_info[:size] > 0
      end

      File.open(path, file_mode) do |file|
        http.request(request) do |response|
          if response.code == "200" || response.code == "206"
            total = if resume_mode
                      load_download_meta(path)[:size]
                    else
                      response["Content-Length"]&.to_i
                    end
            downloaded = partial_size

            response.read_body do |chunk|
              file.write(chunk)
              downloaded += chunk.size
              if total && total > 0
                percent = (downloaded * 100.0 / total).round(1)
                print "\r Progress: #{percent}% (#{format_size(downloaded)} / #{format_size(total)})"
                $stdout.flush
              end
            end
            puts
          elsif response.code == "416"
            # Range Not Satisfiable - file might be complete or corrupted
            puts "\n Range error. Verifying file..."
            remote_info = get_remote_file_info(url)
            if File.size(path) == remote_info[:size]
              puts " File is already complete."
            else
              puts " File corrupted. Re-downloading..."
              file.close
              FileUtils.rm_f(path)
              cleanup_download_meta(path)
              return download_file(url, path)
            end
          else
            raise "Download failed: #{response.code} #{response.message}"
          end
        end
      end

      # Clean up metadata on successful completion
      cleanup_download_meta(path)

      path
    end

    # Download a range of bytes from a URL using HTTP Range header
    def download_file_range(url, path, start_byte, end_byte)
      uri = URI(url)

      FileUtils.mkdir_p(File.dirname(path))

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = (uri.scheme == "https")
      http.open_timeout = DEFAULT_HTTP_TIMEOUT
      http.read_timeout = DEFAULT_HTTP_TIMEOUT
      if http.use_ssl?
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end

      request = Net::HTTP::Get.new(uri)
      request["Range"] = "bytes=#{start_byte}-#{end_byte}"

      File.open(path, "wb") do |file|
        http.request(request) do |response|
          if response.code == "206" || response.code == "200"
            total = end_byte - start_byte + 1
            downloaded = 0

            response.read_body do |chunk|
              file.write(chunk)
              downloaded += chunk.size
              percent = (downloaded * 100.0 / total).round(1)
              print "\r Progress: #{percent}% (#{format_size(downloaded)} / #{format_size(total)})"
              $stdout.flush
            end
            puts
          else
            raise "Download failed: #{response.code} #{response.message}"
          end
        end
      end

      path
    end
  end

  # Fetches category members from Wikipedia API
  # Uses SQLite-based CategoryCache for efficient repeated access
  class CategoryFetcher
    API_ENDPOINT = "https://%s.wikipedia.org/w/api.php"
    MAX_LIMIT = 500
    RATE_LIMIT_DELAY = 0.1

    attr_reader :lang, :category, :max_depth, :cache_expiry_days

    def initialize(lang, category, max_depth: 0, cache_expiry_days: nil, cache_dir: nil)
      @lang = lang.to_s
      @category = normalize_category_name(category)
      @max_depth = max_depth
      @cache_expiry_days = cache_expiry_days || Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
      @cache_dir = cache_dir
      @cache = nil
      @visited_categories = Set.new
    end

    # Enable caching of category member lists
    # @param cache_dir [String] Directory for cache files
    def enable_cache(cache_dir)
      @cache_dir = cache_dir
      @cache = CategoryCache.new(@lang, cache_dir: cache_dir, expiry_days: @cache_expiry_days)
    end

    # Get the category cache instance
    # Creates one if caching is enabled but cache not yet initialized
    def cache
      return @cache if @cache
      return nil unless @cache_dir

      @cache = CategoryCache.new(@lang, cache_dir: @cache_dir, expiry_days: @cache_expiry_days)
    end

    # Preview mode - returns statistics without full article list
    def fetch_preview
      @visited_categories = Set.new
      subcategories = []
      total_articles = 0

      fetch_category_stats(@category, 0, subcategories)

      total_articles = subcategories.sum { |s| s[:article_count] }

      {
        category: @category,
        depth: @max_depth,
        subcategories: subcategories,
        total_subcategories: subcategories.size - 1,
        total_articles: total_articles
      }
    end

    # Fetch all article titles in the category (and subcategories if depth > 0)
    def fetch_articles
      @visited_categories = Set.new
      @articles = []
      fetch_category_members(@category, 0)
      @articles.uniq
    end

    private

    def normalize_category_name(name)
      name.to_s.sub(/^[Cc]ategory:/, "").strip
    end

    def fetch_category_stats(category_name, current_depth, results)
      return if @visited_categories.include?(category_name)
      @visited_categories << category_name

      cached = load_from_cache(category_name)
      if cached
        results << { name: category_name, article_count: (cached[:pages] || []).size }
        if current_depth < @max_depth
          (cached[:subcats] || []).each do |subcat|
            fetch_category_stats(subcat, current_depth + 1, results)
          end
        end
        return
      end

      pages = []
      subcats = []
      continue_token = nil

      loop do
        response = api_request(category_name, continue_token)
        break unless response

        categorymembers = response.dig("query", "categorymembers") || []
        categorymembers.each do |member|
          case member["ns"]
          when 0
            pages << member["title"]
          when 14
            subcats << member["title"].sub(/^Category:/, "")
          end
        end

        continue_token = response.dig("continue", "cmcontinue")
        break unless continue_token

        sleep(RATE_LIMIT_DELAY)
      end

      save_to_cache(category_name, { pages: pages, subcats: subcats })

      results << { name: category_name, article_count: pages.size }

      if current_depth < @max_depth
        subcats.each do |subcat|
          fetch_category_stats(subcat, current_depth + 1, results)
        end
      end
    end

    def fetch_category_members(category_name, current_depth)
      return if @visited_categories.include?(category_name)
      @visited_categories << category_name

      cached = load_from_cache(category_name)
      if cached
        @articles.concat(cached[:pages] || [])
        if current_depth < @max_depth
          (cached[:subcats] || []).each do |subcat|
            fetch_category_members(subcat, current_depth + 1)
          end
        end
        return
      end

      pages = []
      subcats = []
      continue_token = nil

      loop do
        response = api_request(category_name, continue_token)
        break unless response

        categorymembers = response.dig("query", "categorymembers") || []
        categorymembers.each do |member|
          case member["ns"]
          when 0
            pages << member["title"]
          when 14
            subcats << member["title"].sub(/^Category:/, "")
          end
        end

        continue_token = response.dig("continue", "cmcontinue")
        break unless continue_token

        sleep(RATE_LIMIT_DELAY)
      end

      save_to_cache(category_name, { pages: pages, subcats: subcats })

      @articles.concat(pages)

      if current_depth < @max_depth
        subcats.each do |subcat|
          fetch_category_members(subcat, current_depth + 1)
        end
      end
    end

    def api_request(category_name, continue_token = nil)
      uri = URI(format(API_ENDPOINT, @lang))
      params = {
        action: "query",
        list: "categorymembers",
        cmtitle: "Category:#{category_name}",
        cmtype: "page|subcat",
        cmlimit: MAX_LIMIT,
        format: "json"
      }
      params[:cmcontinue] = continue_token if continue_token
      uri.query = URI.encode_www_form(params)

      attempts = 0
      begin
        attempts += 1
        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = true
        http.open_timeout = DEFAULT_HTTP_TIMEOUT
        http.read_timeout = DEFAULT_HTTP_TIMEOUT
        http.verify_mode = OpenSSL::SSL::VERIFY_PEER

        request = Net::HTTP::Get.new(uri)
        response = http.request(request)
        return nil unless response.is_a?(Net::HTTPSuccess)

        JSON.parse(response.body)
      rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNRESET,
             Errno::ECONNREFUSED, Errno::EHOSTUNREACH, OpenSSL::SSL::SSLError => e
        if attempts <= MAX_HTTP_RETRIES
          delay = 2**attempts
          warn " API request failed (attempt #{attempts}/#{MAX_HTTP_RETRIES + 1}): #{e.message}. Retrying in #{delay}s..."
          sleep delay
          retry
        end
        warn " API request failed after #{attempts} attempts for category '#{category_name}': #{e.message}"
        nil
      rescue JSON::ParserError => e
        warn " Invalid JSON response for category '#{category_name}': #{e.message}"
        nil
      end
    end

    def load_from_cache(category_name)
      return nil unless cache

      cache.get(category_name)
    end

    def save_to_cache(category_name, members)
      return unless cache

      pages = members[:pages] || members["pages"] || []
      subcats = members[:subcats] || members["subcats"] || []
      cache.save(category_name, pages, subcats)
    end
  end
end
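
Usage note (not part of the released file above): a minimal sketch of how the new random-access classes fit together, assuming the file is loaded via require "wp2txt/multistream" and that local dump/index files exist — the paths below are hypothetical. MultistreamIndex parses the offset:page_id:title index (optionally cached in SQLite via IndexCache); MultistreamReader seeks to the single bz2 stream containing an article, pipes it through bzcat, and picks the matching page element out of the decompressed XML fragment.

require "wp2txt/multistream" # assumed require path for lib/wp2txt/multistream.rb

# Hypothetical local copies of a multistream dump and its index
index_path = "enwiki-latest-pages-articles-multistream-index.txt.bz2"
dump_path  = "enwiki-latest-pages-articles-multistream.xml.bz2"

# Parse the index (or load it from the SQLite cache on repeat runs)
index = Wp2txt::MultistreamIndex.new(index_path, use_cache: true)
puts "#{index.size} articles in #{index.stream_offsets.size} streams"

# Random access: only the stream holding this title (typically ~100 pages)
# is read from disk and decompressed, not the whole dump
reader = Wp2txt::MultistreamReader.new(dump_path, index)
page = reader.extract_article("Ruby (programming language)")
puts page[:text][0, 200] if page

# Batch extraction groups titles by stream so each stream is decompressed
# once, optionally across worker processes (via the parallel gem)
pages = reader.extract_articles_parallel(index.random_articles(10), num_processes: 4)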
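A second sketch, under the same assumptions, for the download-and-fetch path. DumpManager scrapes the latest dump date from dumps.wikimedia.org, caches files under ~/.wp2txt/cache with resume metadata, and can fetch just the first N streams; CategoryFetcher pages through the MediaWiki categorymembers API with depth-limited recursion and SQLite-backed caching. The bzcat binary and the parallel/nokogiri gems must be available; both examples hit the network.

manager = Wp2txt::DumpManager.new(:en)
index_path = manager.download_index                       # no-op if already cached
dump_path  = manager.download_multistream(max_streams: 5) # partial: first 5 streams
p manager.cache_status.slice(:dump_date, :fresh, :age_days)

fetcher = Wp2txt::CategoryFetcher.new("en", "Category:Lexicography", max_depth: 1)
fetcher.enable_cache(Wp2txt::DumpManager.default_cache_dir)
titles = fetcher.fetch_articles
puts "#{titles.size} article titles collected"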