wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. checksums.yaml +4 -4
  2. data/.dockerignore +12 -0
  3. data/.github/workflows/ci.yml +13 -13
  4. data/.gitignore +14 -0
  5. data/CHANGELOG.md +284 -0
  6. data/DEVELOPMENT.md +415 -0
  7. data/DEVELOPMENT_ja.md +415 -0
  8. data/Dockerfile +19 -10
  9. data/Gemfile +2 -8
  10. data/README.md +259 -123
  11. data/README_ja.md +375 -0
  12. data/Rakefile +4 -0
  13. data/bin/wp2txt +863 -161
  14. data/lib/wp2txt/article.rb +98 -13
  15. data/lib/wp2txt/bz2_validator.rb +239 -0
  16. data/lib/wp2txt/category_cache.rb +313 -0
  17. data/lib/wp2txt/cli.rb +319 -0
  18. data/lib/wp2txt/cli_ui.rb +428 -0
  19. data/lib/wp2txt/config.rb +158 -0
  20. data/lib/wp2txt/constants.rb +134 -0
  21. data/lib/wp2txt/data/html_entities.json +2135 -0
  22. data/lib/wp2txt/data/language_metadata.json +4769 -0
  23. data/lib/wp2txt/data/language_tiers.json +59 -0
  24. data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
  25. data/lib/wp2txt/data/template_aliases.json +193 -0
  26. data/lib/wp2txt/data/wikipedia_entities.json +12 -0
  27. data/lib/wp2txt/extractor.rb +545 -0
  28. data/lib/wp2txt/file_utils.rb +91 -0
  29. data/lib/wp2txt/formatter.rb +352 -0
  30. data/lib/wp2txt/global_data_cache.rb +353 -0
  31. data/lib/wp2txt/index_cache.rb +258 -0
  32. data/lib/wp2txt/magic_words.rb +353 -0
  33. data/lib/wp2txt/memory_monitor.rb +236 -0
  34. data/lib/wp2txt/multistream.rb +1383 -0
  35. data/lib/wp2txt/output_writer.rb +182 -0
  36. data/lib/wp2txt/parser_functions.rb +606 -0
  37. data/lib/wp2txt/ractor_worker.rb +215 -0
  38. data/lib/wp2txt/regex.rb +396 -12
  39. data/lib/wp2txt/section_extractor.rb +354 -0
  40. data/lib/wp2txt/stream_processor.rb +271 -0
  41. data/lib/wp2txt/template_expander.rb +830 -0
  42. data/lib/wp2txt/text_processing.rb +337 -0
  43. data/lib/wp2txt/utils.rb +629 -270
  44. data/lib/wp2txt/version.rb +1 -1
  45. data/lib/wp2txt.rb +53 -26
  46. data/scripts/benchmark_regex.rb +161 -0
  47. data/scripts/fetch_html_entities.rb +94 -0
  48. data/scripts/fetch_language_metadata.rb +180 -0
  49. data/scripts/fetch_mediawiki_data.rb +334 -0
  50. data/scripts/fetch_template_data.rb +186 -0
  51. data/scripts/profile_memory.rb +139 -0
  52. data/spec/article_spec.rb +402 -0
  53. data/spec/auto_download_spec.rb +314 -0
  54. data/spec/bz2_validator_spec.rb +193 -0
  55. data/spec/category_cache_spec.rb +226 -0
  56. data/spec/category_fetcher_spec.rb +504 -0
  57. data/spec/cleanup_spec.rb +197 -0
  58. data/spec/cli_options_spec.rb +678 -0
  59. data/spec/cli_spec.rb +876 -0
  60. data/spec/config_spec.rb +194 -0
  61. data/spec/constants_spec.rb +138 -0
  62. data/spec/file_utils_spec.rb +170 -0
  63. data/spec/fixtures/samples.rb +181 -0
  64. data/spec/formatter_sections_spec.rb +382 -0
  65. data/spec/global_data_cache_spec.rb +186 -0
  66. data/spec/index_cache_spec.rb +210 -0
  67. data/spec/integration_spec.rb +543 -0
  68. data/spec/magic_words_spec.rb +261 -0
  69. data/spec/markers_spec.rb +476 -0
  70. data/spec/memory_monitor_spec.rb +192 -0
  71. data/spec/multistream_spec.rb +690 -0
  72. data/spec/output_writer_spec.rb +400 -0
  73. data/spec/parser_functions_spec.rb +455 -0
  74. data/spec/ractor_worker_spec.rb +197 -0
  75. data/spec/regex_spec.rb +281 -0
  76. data/spec/section_extractor_spec.rb +397 -0
  77. data/spec/spec_helper.rb +63 -0
  78. data/spec/stream_processor_spec.rb +579 -0
  79. data/spec/template_data_spec.rb +246 -0
  80. data/spec/template_expander_spec.rb +472 -0
  81. data/spec/template_processing_spec.rb +217 -0
  82. data/spec/text_processing_spec.rb +312 -0
  83. data/spec/utils_spec.rb +195 -16
  84. data/spec/wp2txt_spec.rb +510 -0
  85. data/wp2txt.gemspec +5 -3
  86. metadata +146 -18
  87. data/.rubocop.yml +0 -80
  88. data/data/output_samples/testdata_en.txt +0 -23002
  89. data/data/output_samples/testdata_en_category.txt +0 -132
  90. data/data/output_samples/testdata_en_summary.txt +0 -1376
  91. data/data/output_samples/testdata_ja.txt +0 -22774
  92. data/data/output_samples/testdata_ja_category.txt +0 -206
  93. data/data/output_samples/testdata_ja_summary.txt +0 -1560
  94. data/data/testdata_en.bz2 +0 -0
  95. data/data/testdata_ja.bz2 +0 -0
  96. data/image/screenshot.png +0 -0
data/lib/wp2txt/multistream.rb +1383 -0
@@ -0,0 +1,1383 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "json"
5
+ require "net/http"
6
+ require "uri"
7
+ require "openssl"
8
+ require "parallel"
9
+ require "set"
10
+ require_relative "constants"
11
+ require_relative "index_cache"
12
+ require_relative "category_cache"
13
+
14
+ module Wp2txt
15
+ # Maximum number of retries for transient network errors
16
+ MAX_HTTP_RETRIES = 3
17
+
18
+ # HTTPS-aware HTTP GET helper with proper SSL verification and retry
19
+ # @param uri [URI] The URI to request
20
+ # @param timeout [Integer] Timeout in seconds
21
+ # @param retries [Integer] Maximum number of retries on transient errors
22
+ # @return [Net::HTTPResponse] The HTTP response
23
+ def self.ssl_safe_get(uri, timeout: DEFAULT_HTTP_TIMEOUT, retries: MAX_HTTP_RETRIES)
24
+ attempts = 0
25
+ begin
26
+ attempts += 1
27
+ http = Net::HTTP.new(uri.host, uri.port)
28
+ http.use_ssl = (uri.scheme == "https")
29
+ http.open_timeout = timeout
30
+ http.read_timeout = timeout
31
+
32
+ if http.use_ssl?
33
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
34
+ end
35
+
36
+ request = Net::HTTP::Get.new(uri)
37
+ http.request(request)
38
+ rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNRESET,
39
+ Errno::ECONNREFUSED, Errno::EHOSTUNREACH, OpenSSL::SSL::SSLError => e
40
+ if attempts <= retries
41
+ delay = 2**attempts # Exponential backoff: 2, 4, 8 seconds
42
+ warn " Network error (attempt #{attempts}/#{retries + 1}): #{e.message}. Retrying in #{delay}s..."
43
+ sleep delay
44
+ retry
45
+ end
46
+ raise
47
+ end
48
+ end
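
A minimal usage sketch for the helper above (illustrative, not part of the diff; the require path and the exact URL are assumptions, though the host matches DUMP_BASE_URL used later in this file):

```ruby
require "uri"
require "wp2txt/multistream" # assumed require path for this file

# Fetch the enwiki dump listing with peer verification and automatic retry/backoff.
response = Wp2txt.ssl_safe_get(URI("https://dumps.wikimedia.org/enwiki/"), timeout: 30, retries: 2)
puts response.code
```
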
49
+ # Manages multistream index for random access to Wikipedia dumps
50
+ # Supports SQLite caching for fast repeated access
51
+ class MultistreamIndex
52
+ attr_reader :index_path, :entries_by_title, :entries_by_id, :stream_offsets
53
+
54
+ # Initialize index with optional SQLite caching and early termination
55
+ # @param index_path [String] Path to the bz2 index file
56
+ # @param use_cache [Boolean] Whether to use SQLite cache (default: true)
57
+ # @param cache_dir [String, nil] Directory for SQLite cache (default: ~/.wp2txt/cache)
58
+ # @param target_titles [Array<String>, nil] If provided, stop parsing when all titles found
59
+ # @param show_progress [Boolean] Whether to show progress during parsing (default: true)
60
+ def initialize(index_path, use_cache: true, cache_dir: nil, target_titles: nil, show_progress: true)
61
+ @index_path = index_path
62
+ @entries_by_title = {}
63
+ @entries_by_id = {}
64
+ @stream_offsets = []
65
+ @show_progress = show_progress
66
+ @target_titles = target_titles ? Set.new(target_titles) : nil
67
+ @found_targets = Set.new if @target_titles
68
+
69
+ # Try to load from cache first
70
+ if use_cache && @target_titles.nil?
71
+ @cache = IndexCache.new(index_path, cache_dir: cache_dir)
72
+ if load_from_cache
73
+ return
74
+ end
75
+ else
76
+ @cache = nil
77
+ end
78
+
79
+ # Parse index file
80
+ load_index
81
+
82
+ # Save to cache for future use (only if full parse completed)
83
+ if @cache && @target_titles.nil?
84
+ save_to_cache
85
+ end
86
+ end
87
+
88
+ # Check if this index was loaded from cache
89
+ def loaded_from_cache?
90
+ @loaded_from_cache == true
91
+ end
92
+
93
+ # Check if early termination was triggered
94
+ def early_terminated?
95
+ @early_terminated == true
96
+ end
97
+
98
+ def find_by_title(title)
99
+ @entries_by_title[title]
100
+ end
101
+
102
+ def find_by_id(page_id)
103
+ @entries_by_id[page_id]
104
+ end
105
+
106
+ # Get all articles in a specific stream (by byte offset)
107
+ def articles_in_stream(byte_offset)
108
+ @entries_by_title.values.select { |e| e[:offset] == byte_offset }
109
+ end
110
+
111
+ # Get stream offset for a given article
112
+ def stream_offset_for(title)
113
+ entry = find_by_title(title)
114
+ entry ? entry[:offset] : nil
115
+ end
116
+
117
+ # Get N random articles
118
+ def random_articles(count)
119
+ @entries_by_title.keys.sample(count)
120
+ end
121
+
122
+ # Get first N articles
123
+ def first_articles(count)
124
+ @entries_by_title.keys.first(count)
125
+ end
126
+
127
+ # Total number of articles
128
+ def size
129
+ @entries_by_title.size
130
+ end
131
+
132
+ private
133
+
134
+ def load_from_cache
135
+ return false unless @cache&.valid?
136
+
137
+ print " Loading index from cache..." if @show_progress
138
+ $stdout.flush
139
+
140
+ data = @cache.load
141
+ return false unless data
142
+
143
+ @entries_by_title = data[:entries_by_title]
144
+ @entries_by_id = data[:entries_by_id]
145
+ @stream_offsets = data[:stream_offsets]
146
+ @loaded_from_cache = true
147
+
148
+ puts " #{@entries_by_title.size} entries loaded" if @show_progress
149
+ true
150
+ end
151
+
152
+ def save_to_cache
153
+ return unless @cache
154
+
155
+ print " Saving index to cache..." if @show_progress
156
+ $stdout.flush
157
+
158
+ @cache.save(@entries_by_title, @stream_offsets)
159
+
160
+ puts " done" if @show_progress
161
+ rescue StandardError => e
162
+ puts " failed (#{e.message})" if @show_progress
163
+ # Non-fatal: continue without cache
164
+ end
165
+
166
+ def load_index
167
+ return unless File.exist?(@index_path)
168
+
169
+ # Handle both .bz2 and plain text index files
170
+ if @index_path.end_with?(".bz2")
171
+ require "open3"
172
+ IO.popen(["bzcat", @index_path], "r") do |io|
173
+ parse_index_stream(io)
174
+ end
175
+ else
176
+ File.open(@index_path, "r") do |io|
177
+ parse_index_stream(io)
178
+ end
179
+ end
180
+ end
181
+
182
+ def parse_index_stream(io)
183
+ count = 0
184
+ io.each_line do |line|
185
+ line = line.strip
186
+ next if line.empty?
187
+
188
+ parts = line.split(":", 3)
189
+ next if parts.size < 3
190
+
191
+ offset = parts[0].to_i
192
+ page_id = parts[1].to_i
193
+ title = parts[2]
194
+
195
+ entry = { offset: offset, page_id: page_id, title: title }
196
+ @entries_by_title[title] = entry
197
+ @entries_by_id[page_id] = entry
198
+
199
+ if @stream_offsets.empty? || @stream_offsets.last != offset
200
+ @stream_offsets << offset
201
+ end
202
+
203
+ # Early termination: check if we found all target titles
204
+ if @target_titles
205
+ @found_targets << title if @target_titles.include?(title)
206
+ if @found_targets.size == @target_titles.size
207
+ @early_terminated = true
208
+ print "\r Found all #{@target_titles.size} target articles" if @show_progress
209
+ puts if @show_progress
210
+ break
211
+ end
212
+ end
213
+
214
+ count += 1
215
+ if @show_progress && count % INDEX_PROGRESS_THRESHOLD == 0
216
+ print "\r Parsed #{count / 1_000_000.0}M entries..."
217
+ $stdout.flush
218
+ end
219
+ end
220
+ print "\r" + " " * 40 + "\r" if @show_progress && count >= INDEX_PROGRESS_THRESHOLD && !@early_terminated
221
+ end
222
+ end
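
A usage sketch for the index class (illustrative only; the index filename is a placeholder and the require path is assumed):

```ruby
require "wp2txt/multistream" # assumed require path

# Parse the multistream index; results are cached in SQLite (default: ~/.wp2txt/cache).
index = Wp2txt::MultistreamIndex.new("enwiki-20240101-pages-articles-multistream-index.txt.bz2")

entry = index.find_by_title("Ruby (programming language)")
puts entry[:offset] if entry           # byte offset of the bz2 stream holding the article
puts index.size                        # number of indexed articles
puts index.random_articles(3).inspect  # three random titles
```
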
223
+
224
+ # Reads articles from multistream bz2 files
225
+ class MultistreamReader
226
+ attr_reader :multistream_path, :index
227
+
228
+ # Initialize reader with multistream file and index
229
+ # @param multistream_path [String] Path to the multistream bz2 file
230
+ # @param index_or_path [MultistreamIndex, String] Either an existing index instance or path to index file
231
+ # @param use_cache [Boolean] Whether to use SQLite cache for index (default: true, only used if index_or_path is a path)
232
+ # @param cache_dir [String, nil] Directory for SQLite cache (only used if index_or_path is a path)
233
+ def initialize(multistream_path, index_or_path, use_cache: true, cache_dir: nil)
234
+ @multistream_path = multistream_path
235
+
236
+ # Accept either an existing index or a path to create one
237
+ if index_or_path.is_a?(MultistreamIndex)
238
+ @index = index_or_path
239
+ else
240
+ @index = MultistreamIndex.new(index_or_path, use_cache: use_cache, cache_dir: cache_dir)
241
+ end
242
+ end
243
+
244
+ # Extract a single article by title
245
+ def extract_article(title)
246
+ entry = @index.find_by_title(title)
247
+ return nil unless entry
248
+
249
+ stream_content = read_stream_at(entry[:offset])
250
+ extract_page_from_xml(stream_content, title)
251
+ end
252
+
253
+ # Extract multiple articles (sequential)
254
+ def extract_articles(titles)
255
+ # Group by stream offset for efficiency
256
+ grouped = titles.group_by { |t| @index.stream_offset_for(t) }
257
+
258
+ results = {}
259
+ grouped.each do |offset, titles_in_stream|
260
+ next unless offset
261
+
262
+ stream_content = read_stream_at(offset)
263
+ titles_in_stream.each do |title|
264
+ page = extract_page_from_xml(stream_content, title)
265
+ results[title] = page if page
266
+ end
267
+ end
268
+ results
269
+ end
270
+
271
+ # Extract multiple articles in parallel (by stream)
272
+ # @param titles [Array<String>] Article titles to extract
273
+ # @param num_processes [Integer] Number of parallel processes (default: 4)
274
+ # @param progress_callback [Proc, nil] Optional callback for progress updates
275
+ # @return [Hash] Map of title => page data
276
+ def extract_articles_parallel(titles, num_processes: 4, &progress_callback)
277
+ # Group titles by stream offset
278
+ grouped = titles.group_by { |t| @index.stream_offset_for(t) }
279
+ grouped.delete(nil) # Remove titles not found in index
280
+
281
+ # Process streams in parallel
282
+ stream_results = Parallel.map(grouped.keys, in_processes: num_processes) do |offset|
283
+ titles_in_stream = grouped[offset]
284
+ stream_content = read_stream_at(offset)
285
+
286
+ stream_pages = {}
287
+ titles_in_stream.each do |title|
288
+ page = extract_page_from_xml(stream_content, title)
289
+ stream_pages[title] = page if page
290
+ end
291
+
292
+ stream_pages
293
+ end
294
+
295
+ # Merge results from all streams
296
+ results = {}
297
+ stream_results.each do |stream_pages|
298
+ results.merge!(stream_pages)
299
+ end
300
+
301
+ results
302
+ end

303
+
304
+ # Iterate through articles in parallel, yielding each page
305
+ # Groups articles by stream and processes streams in parallel
306
+ # @param entries [Array<Hash>] Array of index entries with :title and :offset
307
+ # @param num_processes [Integer] Number of parallel processes
308
+ # @yield [Hash] Page data for each article
309
+ def each_article_parallel(entries, num_processes: 4)
310
+ return enum_for(:each_article_parallel, entries, num_processes: num_processes) unless block_given?
311
+
312
+ # Group by stream offset
313
+ grouped = entries.group_by { |e| e[:offset] }
314
+
315
+ # Process streams in parallel, collecting all pages
316
+ all_pages = Parallel.flat_map(grouped.keys, in_processes: num_processes) do |offset|
317
+ entries_in_stream = grouped[offset]
318
+ stream_content = read_stream_at(offset)
319
+
320
+ pages = []
321
+ entries_in_stream.each do |entry|
322
+ page = extract_page_from_xml(stream_content, entry[:title])
323
+ pages << page if page
324
+ end
325
+
326
+ pages
327
+ end
328
+
329
+ # Yield each page (sequential, as yielding must happen in main process)
330
+ all_pages.each { |page| yield page }
331
+ end
332
+
333
+ # Iterate through all articles in a stream
334
+ def each_article_in_stream(offset, &block)
335
+ stream_content = read_stream_at(offset)
336
+ extract_all_pages_from_xml(stream_content, &block)
337
+ end
338
+
339
+ # Iterate through first N streams
340
+ def each_article_in_first_streams(stream_count, &block)
341
+ @index.stream_offsets.first(stream_count).each do |offset|
342
+ each_article_in_stream(offset, &block)
343
+ end
344
+ end
345
+
346
+ private
347
+
348
+ def read_stream_at(offset)
349
+ # Read the bz2 stream starting at the given offset
350
+ # We need to find where this stream ends (next stream start or EOF)
351
+ next_offset = find_next_offset(offset)
352
+
353
+ File.open(@multistream_path, "rb") do |f|
354
+ f.seek(offset)
355
+
356
+ if next_offset
357
+ compressed_data = f.read(next_offset - offset)
358
+ else
359
+ # Last stream - read to end
360
+ compressed_data = f.read
361
+ end
362
+
363
+ decompress_bz2(compressed_data)
364
+ end
365
+ end
366
+
367
+ def find_next_offset(current_offset)
368
+ idx = @index.stream_offsets.index(current_offset)
369
+ return nil unless idx
370
+
371
+ @index.stream_offsets[idx + 1]
372
+ end
373
+
374
+ def decompress_bz2(data)
375
+ require "open3"
376
+
377
+ stdout, status = Open3.capture2("bzcat", stdin_data: data, binmode: true)
378
+ raise "bzcat failed to decompress stream (status #{status.exitstatus})" unless status.success?
379
+ stdout
380
+ end
381
+
382
+ def extract_page_from_xml(xml_content, title)
383
+ # Simple extraction - find the page with matching title
384
+ require "nokogiri"
385
+
386
+ doc = Nokogiri::XML("<root>#{xml_content}</root>")
387
+ doc.xpath("//page").each do |page_node|
388
+ page_title = page_node.at_xpath("title")&.text
389
+ if page_title == title
390
+ return {
391
+ title: page_title,
392
+ id: page_node.at_xpath("id")&.text&.to_i,
393
+ text: page_node.at_xpath(".//text")&.text || ""
394
+ }
395
+ end
396
+ end
397
+ nil
398
+ end
399
+
400
+ def extract_all_pages_from_xml(xml_content, &block)
401
+ require "nokogiri"
402
+
403
+ doc = Nokogiri::XML("<root>#{xml_content}</root>")
404
+ doc.xpath("//page").each do |page_node|
405
+ page = {
406
+ title: page_node.at_xpath("title")&.text,
407
+ id: page_node.at_xpath("id")&.text&.to_i,
408
+ text: page_node.at_xpath(".//text")&.text || ""
409
+ }
410
+ yield page if page[:title]
411
+ end
412
+ end
413
+ end
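
A usage sketch for the reader (illustrative only; the paths are placeholders, the require path is assumed, and bzcat, nokogiri, and parallel must be available):

```ruby
require "wp2txt/multistream" # assumed require path

reader = Wp2txt::MultistreamReader.new(
  "enwiki-20240101-pages-articles-multistream.xml.bz2",
  "enwiki-20240101-pages-articles-multistream-index.txt.bz2"
)

page = reader.extract_article("Tokyo")
puts page[:text][0, 200] if page

# Titles are grouped by stream offset, so each bz2 stream is decompressed only once.
pages = reader.extract_articles_parallel(["Tokyo", "Kyoto", "Osaka"], num_processes: 4)
puts pages.keys
```
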
414
+
415
+ # Manages downloading and caching of dump files
416
+ # Supports any Wikipedia language code (e.g., en, ja, de, fr, zh, ar, etc.)
417
+ # Language metadata is stored in lib/wp2txt/data/language_metadata.json
418
+ class DumpManager
419
+ DUMP_BASE_URL = "https://dumps.wikimedia.org"
420
+ DEFAULT_CACHE_DIR = File.expand_path("~/.wp2txt/cache")
421
+
422
+ # Legacy constant for backward compatibility
423
+ CACHE_DIR = "tmp/dump_cache"
424
+
425
+ attr_reader :lang, :cache_dir, :dump_expiry_days
426
+
427
+ class << self
428
+ # Get default cache directory
429
+ def default_cache_dir
430
+ DEFAULT_CACHE_DIR
431
+ end
432
+ end
433
+
434
+ def initialize(lang, cache_dir: nil, dump_expiry_days: nil)
435
+ @lang = lang.to_sym
436
+ @cache_dir = cache_dir || DEFAULT_CACHE_DIR
437
+ @dump_expiry_days = dump_expiry_days || Wp2txt::DEFAULT_DUMP_EXPIRY_DAYS
438
+ FileUtils.mkdir_p(@cache_dir)
439
+ end
440
+
441
+ # Format bytes as human-readable string
442
+ def format_size(bytes)
443
+ Wp2txt.format_file_size(bytes)
444
+ end
445
+
446
+ # Get the latest dump date for a language
447
+ def latest_dump_date
448
+ @latest_dump_date ||= fetch_latest_dump_date
449
+ end
450
+
451
+ # Download multistream index file
452
+ def download_index(force: false)
453
+ index_path = cached_index_path
454
+ if File.exist?(index_path) && !force
455
+ puts "Index already cached: #{File.basename(index_path)}"
456
+ $stdout.flush
457
+ return index_path
458
+ end
459
+
460
+ url = index_url
461
+ puts "Downloading index: #{url}"
462
+ $stdout.flush
463
+ download_file(url, index_path)
464
+ index_path
465
+ end
466
+
467
+ # Download multistream dump file
468
+ # @param force [Boolean] Force re-download even if cached
469
+ # @param max_streams [Integer, nil] If set, only download first N streams (partial download)
470
+ def download_multistream(force: false, max_streams: nil)
471
+ # For partial downloads, first check if full dump exists (most efficient)
472
+ if max_streams && !force
473
+ full_path = cached_multistream_path
474
+ if File.exist?(full_path)
475
+ puts "Using cached full dump: #{File.basename(full_path)}"
476
+ $stdout.flush
477
+ return full_path
478
+ end
479
+
480
+ # Check if a larger partial download exists
481
+ existing_partial = find_suitable_partial_cache(max_streams)
482
+ if existing_partial
483
+ puts "Using cached partial: #{File.basename(existing_partial)}"
484
+ $stdout.flush
485
+ return existing_partial
486
+ end
487
+ end
488
+
489
+ dump_path = max_streams ? cached_partial_multistream_path(max_streams) : cached_multistream_path
490
+ if File.exist?(dump_path) && !force
491
+ puts "Multistream already cached: #{File.basename(dump_path)}"
492
+ $stdout.flush
493
+ return dump_path
494
+ end
495
+
496
+ url = multistream_url
497
+
498
+ if max_streams
499
+ # Partial download: need index first to know byte range
500
+ index_path = download_index
501
+ index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)
502
+
503
+ if index.stream_offsets.size >= max_streams
504
+ # Get byte range for first N streams
505
+ end_offset = index.stream_offsets[max_streams]
506
+ puts "Downloading first #{max_streams} streams (#{format_size(end_offset)}): #{url}"
507
+ $stdout.flush
508
+ download_file_range(url, dump_path, 0, end_offset - 1)
509
+ else
510
+ puts "Only #{index.stream_offsets.size} streams available, downloading all"
511
+ $stdout.flush
512
+ download_file(url, dump_path)
513
+ end
514
+ else
515
+ puts "Downloading multistream: #{url}"
516
+ $stdout.flush
517
+ download_file(url, dump_path)
518
+ end
519
+
520
+ dump_path
521
+ end
522
+
523
+ # Find a suitable cached partial download (same or larger than needed)
524
+ # @param min_streams [Integer] Minimum number of streams needed
525
+ # @return [String, nil] Path to suitable cached file, or nil
526
+ def find_suitable_partial_cache(min_streams)
527
+ pattern = File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-*streams.xml.bz2")
528
+ Dir.glob(pattern).each do |path|
529
+ if path =~ /multistream-(\d+)streams\.xml\.bz2$/
530
+ stream_count = $1.to_i
531
+ return path if stream_count >= min_streams
532
+ end
533
+ end
534
+ nil
535
+ end
536
+
537
+ # Find any existing partial dump (any date)
538
+ # @return [Hash, nil] Info about existing partial dump, or nil
539
+ def find_any_partial_cache
540
+ pattern = File.join(@cache_dir, "#{@lang}wiki-*-multistream-*streams.xml.bz2")
541
+ partials = []
542
+
543
+ Dir.glob(pattern).each do |path|
544
+ if path =~ /#{@lang}wiki-(\d{8})-multistream-(\d+)streams\.xml\.bz2$/
545
+ dump_date = $1
546
+ stream_count = $2.to_i
547
+ partials << {
548
+ path: path,
549
+ dump_date: dump_date,
550
+ stream_count: stream_count,
551
+ size: File.size(path),
552
+ mtime: File.mtime(path)
553
+ }
554
+ end
555
+ end
556
+
557
+ # Return the largest partial (by stream count)
558
+ partials.max_by { |p| p[:stream_count] }
559
+ end
560
+
561
+ # Check if incremental download is possible from existing partial
562
+ # @param partial_info [Hash] Info from find_any_partial_cache
563
+ # @return [Hash] Result with :possible, :reason, and details
564
+ def can_resume_from_partial?(partial_info)
565
+ return { possible: false, reason: :no_partial } unless partial_info
566
+
567
+ current_date = latest_dump_date
568
+
569
+ # Check if dump dates match
570
+ if partial_info[:dump_date] != current_date
571
+ return {
572
+ possible: false,
573
+ reason: :date_mismatch,
574
+ partial_date: partial_info[:dump_date],
575
+ latest_date: current_date
576
+ }
577
+ end
578
+
579
+ # Validate the partial file with Bz2Validator
580
+ require_relative "bz2_validator"
581
+ validation = Bz2Validator.validate_quick(partial_info[:path])
582
+ unless validation.valid?
583
+ return {
584
+ possible: false,
585
+ reason: :invalid_partial,
586
+ error: validation.message
587
+ }
588
+ end
589
+
590
+ # Verify file size matches expected offset
591
+ index_path = download_index
592
+ index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)
593
+
594
+ expected_size = if partial_info[:stream_count] < index.stream_offsets.size
595
+ index.stream_offsets[partial_info[:stream_count]]
596
+ else
597
+ # Partial has all streams - no need to resume
598
+ return { possible: false, reason: :already_complete }
599
+ end
600
+
601
+ actual_size = partial_info[:size]
602
+ if actual_size != expected_size
603
+ return {
604
+ possible: false,
605
+ reason: :size_mismatch,
606
+ expected: expected_size,
607
+ actual: actual_size
608
+ }
609
+ end
610
+
611
+ {
612
+ possible: true,
613
+ partial_info: partial_info,
614
+ current_streams: partial_info[:stream_count],
615
+ total_streams: index.stream_offsets.size,
616
+ current_size: actual_size
617
+ }
618
+ end
619
+
620
+ # Download full dump with incremental support
621
+ # @param force [Boolean] Force re-download
622
+ # @param interactive [Boolean] Prompt user for choices (default: true)
623
+ # @return [String] Path to downloaded file
624
+ def download_multistream_full(force: false, interactive: true)
625
+ full_path = cached_multistream_path
626
+
627
+ # If full dump exists, use it
628
+ if File.exist?(full_path) && !force
629
+ puts "Using cached full dump: #{File.basename(full_path)}"
630
+ $stdout.flush
631
+ return full_path
632
+ end
633
+
634
+ # Check for existing partial dump
635
+ partial = find_any_partial_cache
636
+ if partial && interactive
637
+ resume_info = can_resume_from_partial?(partial)
638
+
639
+ if resume_info[:possible]
640
+ # Same date - can resume
641
+ return handle_resumable_partial(partial, resume_info, force)
642
+ elsif resume_info[:reason] == :date_mismatch
643
+ # Different date - ask user
644
+ return handle_outdated_partial(partial, resume_info, force)
645
+ elsif resume_info[:reason] == :size_mismatch || resume_info[:reason] == :invalid_partial
646
+ # Corrupted partial - inform and re-download
647
+ puts "Warning: Existing partial dump appears corrupted."
648
+ puts " Reason: #{resume_info[:reason]}"
649
+ puts " Will download fresh copy."
650
+ FileUtils.rm_f(partial[:path])
651
+ end
652
+ end
653
+
654
+ # Standard full download
655
+ download_multistream(force: force, max_streams: nil)
656
+ end
657
+
658
+ private
659
+
660
+ def handle_resumable_partial(partial, resume_info, force)
661
+ current = resume_info[:current_streams]
662
+ total = resume_info[:total_streams]
663
+ current_size = resume_info[:current_size]
664
+
665
+ # Calculate remaining download size
666
+ index_path = cached_index_path
667
+ index = MultistreamIndex.new(index_path, cache_dir: @cache_dir)
668
+
669
+ # Get total file size from HTTP HEAD request
670
+ url = multistream_url
671
+ total_size = get_remote_file_size(url)
672
+ remaining_size = total_size - current_size
673
+
674
+ puts
675
+ puts "Found existing partial dump (same date):"
676
+ puts " Current: #{current} streams (#{format_size(current_size)})"
677
+ puts " Total: #{total} streams (#{format_size(total_size)})"
678
+ puts " Remaining: #{format_size(remaining_size)}"
679
+ puts
680
+
681
+ print "Download remaining data? [Y/n/f(ull fresh download)]: "
682
+ $stdout.flush
683
+ response = $stdin.gets&.strip&.downcase || "y"
684
+
685
+ case response
686
+ when "n", "no"
687
+ puts "Using existing partial dump."
688
+ partial[:path]
689
+ when "f", "full", "fresh"
690
+ puts "Downloading fresh full dump..."
691
+ FileUtils.rm_f(partial[:path])
692
+ download_multistream(force: true, max_streams: nil)
693
+ else
694
+ # Resume download
695
+ puts "Resuming download..."
696
+ download_incremental(partial[:path], current_size, total_size)
697
+ end
698
+ end
699
+
700
+ def handle_outdated_partial(partial, resume_info, force)
701
+ puts
702
+ puts "Found existing partial dump with different date:"
703
+ puts " Partial dump: #{partial[:dump_date]} (#{partial[:stream_count]} streams, #{format_size(partial[:size])})"
704
+ puts " Latest dump: #{resume_info[:latest_date]}"
705
+ puts
706
+ puts "Options:"
707
+ puts " [D] Delete old partial and download latest full dump (recommended)"
708
+ puts " [K] Keep old partial, download latest full dump separately"
709
+ puts " [U] Use old partial as-is (may have outdated content)"
710
+ puts
711
+
712
+ print "Choice [D/k/u]: "
713
+ $stdout.flush
714
+ response = $stdin.gets&.strip&.downcase || "d"
715
+
716
+ case response
717
+ when "k", "keep"
718
+ puts "Keeping old partial, downloading latest full dump..."
719
+ download_multistream(force: true, max_streams: nil)
720
+ when "u", "use"
721
+ puts "Using old partial dump (content may be outdated)."
722
+ partial[:path]
723
+ else
724
+ puts "Deleting old partial and downloading latest..."
725
+ FileUtils.rm_f(partial[:path])
726
+ download_multistream(force: true, max_streams: nil)
727
+ end
728
+ end
729
+
730
+ def download_incremental(partial_path, start_byte, total_size)
731
+ url = multistream_url
732
+ full_path = cached_multistream_path
733
+
734
+ uri = URI(url)
735
+ http = Net::HTTP.new(uri.host, uri.port)
736
+ http.use_ssl = (uri.scheme == "https")
737
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
738
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
739
+ if http.use_ssl?
740
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
741
+ end
742
+
743
+ request = Net::HTTP::Get.new(uri)
744
+ request["Range"] = "bytes=#{start_byte}-"
745
+
746
+ # Copy partial to full path first, then append
747
+ FileUtils.cp(partial_path, full_path)
748
+
749
+ File.open(full_path, "ab") do |file|
750
+ http.request(request) do |response|
751
+ if response.code == "206"
752
+ remaining = total_size - start_byte
753
+ downloaded = 0
754
+
755
+ response.read_body do |chunk|
756
+ file.write(chunk)
757
+ downloaded += chunk.size
758
+ total_downloaded = start_byte + downloaded
759
+ percent = (total_downloaded * 100.0 / total_size).round(1)
760
+ print "\r Progress: #{percent}% (#{format_size(total_downloaded)} / #{format_size(total_size)})"
761
+ $stdout.flush
762
+ end
763
+ puts
764
+ elsif response.code == "200"
765
+ # Server doesn't support Range - need full download
766
+ puts "\nServer doesn't support resume. Downloading full file..."
767
+ file.close
768
+ FileUtils.rm_f(full_path)
769
+ return download_multistream(force: true, max_streams: nil)
770
+ else
771
+ raise "Download failed: #{response.code} #{response.message}"
772
+ end
773
+ end
774
+ end
775
+
776
+ # Validate the combined file
777
+ require_relative "bz2_validator"
778
+ validation = Bz2Validator.validate_quick(full_path)
779
+ unless validation.valid?
780
+ puts "Warning: Combined file validation failed. Re-downloading..."
781
+ FileUtils.rm_f(full_path)
782
+ return download_multistream(force: true, max_streams: nil)
783
+ end
784
+
785
+ puts "Successfully resumed download!"
786
+
787
+ # Optionally remove the partial file
788
+ FileUtils.rm_f(partial_path) if partial_path != full_path
789
+
790
+ full_path
791
+ end
792
+
793
+ def get_remote_file_size(url)
794
+ uri = URI(url)
795
+ http = Net::HTTP.new(uri.host, uri.port)
796
+ http.use_ssl = (uri.scheme == "https")
797
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
798
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
799
+ if http.use_ssl?
800
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
801
+ end
802
+
803
+ request = Net::HTTP::Head.new(uri)
804
+ response = http.request(request)
805
+
806
+ response["Content-Length"]&.to_i || 0
807
+ end
808
+
809
+ public
810
+
811
+ # Path for partial multistream cache
812
+ def cached_partial_multistream_path(stream_count)
813
+ File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-#{stream_count}streams.xml.bz2")
814
+ end
815
+
816
+ # Get paths for cached files
817
+ def cached_index_path
818
+ File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream-index.txt.bz2")
819
+ end
820
+
821
+ def cached_multistream_path
822
+ File.join(@cache_dir, "#{@lang}wiki-#{latest_dump_date}-multistream.xml.bz2")
823
+ end
824
+
825
+ # Check if cache is fresh (within configured days)
826
+ def cache_fresh?(days = nil)
827
+ days ||= @dump_expiry_days
828
+ Wp2txt.file_fresh?(cached_index_path, days)
829
+ end
830
+
831
+ # Check if cache is stale (beyond configured expiry days)
832
+ def cache_stale?
833
+ !cache_fresh?
834
+ end
835
+
836
+ # Get cache age in days
837
+ # Returns nil if no cache exists
838
+ def cache_age_days
839
+ Wp2txt.file_age_days(cached_index_path)
840
+ end
841
+
842
+ # Get cache modification time
843
+ # Returns nil if no cache exists
844
+ def cache_mtime
845
+ path = cached_index_path
846
+ return nil unless File.exist?(path)
847
+
848
+ File.mtime(path)
849
+ end
850
+
851
+ # Get cache status information
852
+ def cache_status
853
+ {
854
+ lang: @lang,
855
+ cache_dir: @cache_dir,
856
+ index_exists: File.exist?(cached_index_path),
857
+ index_path: cached_index_path,
858
+ index_size: File.exist?(cached_index_path) ? File.size(cached_index_path) : 0,
859
+ multistream_exists: File.exist?(cached_multistream_path),
860
+ multistream_path: cached_multistream_path,
861
+ multistream_size: File.exist?(cached_multistream_path) ? File.size(cached_multistream_path) : 0,
862
+ dump_date: (latest_dump_date rescue nil),
863
+ fresh: cache_fresh?,
864
+ age_days: cache_age_days,
865
+ mtime: cache_mtime,
866
+ expiry_days: @dump_expiry_days
867
+ }
868
+ end
869
+
870
+ # Clear cache for this language
871
+ def clear_cache!
872
+ lang_dir = File.join(@cache_dir, "#{@lang}wiki")
873
+ FileUtils.rm_rf(lang_dir) if File.exist?(lang_dir)
874
+ end
875
+
876
+ # Clear all cache
877
+ def self.clear_all_cache!(cache_dir = DEFAULT_CACHE_DIR)
878
+ FileUtils.rm_rf(cache_dir) if File.exist?(cache_dir)
879
+ end
880
+
881
+ # Get status for all cached languages
882
+ def self.all_cache_status(cache_dir = DEFAULT_CACHE_DIR)
883
+ return {} unless File.exist?(cache_dir)
884
+
885
+ status = {}
886
+ Dir.glob(File.join(cache_dir, "*wiki")).each do |lang_dir|
887
+ lang = File.basename(lang_dir).sub(/wiki$/, "").to_sym
888
+ manager = new(lang, cache_dir: cache_dir)
889
+ status[lang] = manager.cache_status
890
+ rescue IOError, Errno::ENOENT, Errno::EACCES, JSON::ParserError => e
891
+ status[lang] = { error: e.message }
892
+ end
893
+ status
894
+ end
895
+
896
+ private
897
+
898
+ def fetch_latest_dump_date
899
+ # Try to find the latest available dump
900
+ wiki = "#{@lang}wiki"
901
+ uri = URI("#{DUMP_BASE_URL}/#{wiki}/")
902
+
903
+ response = Wp2txt.ssl_safe_get(uri)
904
+ raise("Failed to fetch dump list for #{wiki}") unless response.is_a?(Net::HTTPSuccess)
905
+
906
+ # Find dates in format YYYYMMDD
907
+ dates = response.body.scan(/href="(\d{8})\/"/).flatten
908
+ dates.sort.last || raise("No dumps found for #{wiki}")
909
+ end
910
+
911
+ def index_url
912
+ wiki = "#{@lang}wiki"
913
+ date = latest_dump_date
914
+ "#{DUMP_BASE_URL}/#{wiki}/#{date}/#{wiki}-#{date}-pages-articles-multistream-index.txt.bz2"
915
+ end
916
+
917
+ def multistream_url
918
+ wiki = "#{@lang}wiki"
919
+ date = latest_dump_date
920
+ "#{DUMP_BASE_URL}/#{wiki}/#{date}/#{wiki}-#{date}-pages-articles-multistream.xml.bz2"
921
+ end
922
+
923
+ # Download metadata file path for tracking resumable downloads
924
+ def download_meta_path(path)
925
+ "#{path}.wp2txt_download"
926
+ end
927
+
928
+ # Get remote file info via HEAD request
929
+ # @return [Hash] { size:, etag:, last_modified:, accept_ranges: }
930
+ def get_remote_file_info(url)
931
+ uri = URI(url)
932
+ http = Net::HTTP.new(uri.host, uri.port)
933
+ http.use_ssl = (uri.scheme == "https")
934
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
935
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
936
+ if http.use_ssl?
937
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
938
+ end
939
+
940
+ request = Net::HTTP::Head.new(uri)
941
+ response = http.request(request)
942
+
943
+ {
944
+ size: response["Content-Length"]&.to_i || 0,
945
+ etag: response["ETag"],
946
+ last_modified: response["Last-Modified"],
947
+ accept_ranges: response["Accept-Ranges"] == "bytes"
948
+ }
949
+ end
950
+
951
+ # Save download metadata for resume support
952
+ def save_download_meta(path, url, remote_info)
953
+ meta = {
954
+ url: url,
955
+ size: remote_info[:size],
956
+ etag: remote_info[:etag],
957
+ last_modified: remote_info[:last_modified],
958
+ started_at: Time.now.iso8601
959
+ }
960
+ File.write(download_meta_path(path), JSON.pretty_generate(meta))
961
+ end
962
+
963
+ # Load download metadata
964
+ # @return [Hash, nil] Metadata or nil if not found/invalid
965
+ def load_download_meta(path)
966
+ meta_path = download_meta_path(path)
967
+ return nil unless File.exist?(meta_path)
968
+
969
+ JSON.parse(File.read(meta_path), symbolize_names: true)
970
+ rescue JSON::ParserError
971
+ nil
972
+ end
973
+
974
+ # Clean up download metadata
975
+ def cleanup_download_meta(path)
976
+ FileUtils.rm_f(download_meta_path(path))
977
+ end
978
+
979
+ # Check if resume is safe (server file hasn't changed)
980
+ def can_resume_download?(path, url)
981
+ return false unless File.exist?(path)
982
+
983
+ meta = load_download_meta(path)
984
+ return false unless meta
985
+
986
+ # Check if metadata is not too old (max 7 days)
987
+ if meta[:started_at]
988
+ started = Time.parse(meta[:started_at]) rescue nil
989
+ if started && (Time.now - started) > days_to_seconds(RESUME_METADATA_MAX_AGE_DAYS)
990
+ puts " Partial download is too old (>#{RESUME_METADATA_MAX_AGE_DAYS} days). Starting fresh."
991
+ return false
992
+ end
993
+ end
994
+
995
+ # Get current remote file info
996
+ remote_info = get_remote_file_info(url)
997
+
998
+ # Check if ETag matches (most reliable)
999
+ if meta[:etag] && remote_info[:etag]
1000
+ if meta[:etag] != remote_info[:etag]
1001
+ puts " Server file has changed (ETag mismatch). Starting fresh."
1002
+ return false
1003
+ end
1004
+ # Fallback: check Last-Modified
1005
+ elsif meta[:last_modified] && remote_info[:last_modified]
1006
+ if meta[:last_modified] != remote_info[:last_modified]
1007
+ puts " Server file has changed (Last-Modified mismatch). Starting fresh."
1008
+ return false
1009
+ end
1010
+ end
1011
+
1012
+ # Check if server supports Range requests
1013
+ unless remote_info[:accept_ranges]
1014
+ puts " Server doesn't support resume. Starting fresh."
1015
+ return false
1016
+ end
1017
+
1018
+ true
1019
+ end
1020
+
1021
+ def download_file(url, path)
1022
+ uri = URI(url)
1023
+ FileUtils.mkdir_p(File.dirname(path))
1024
+
1025
+ # Check for resumable download
1026
+ partial_size = File.exist?(path) ? File.size(path) : 0
1027
+ resume_mode = false
1028
+
1029
+ if partial_size > 0 && can_resume_download?(path, url)
1030
+ meta = load_download_meta(path)
1031
+ total_size = meta[:size]
1032
+ if partial_size < total_size
1033
+ resume_mode = true
1034
+ puts " Resuming download from #{format_size(partial_size)} / #{format_size(total_size)} (#{(partial_size * 100.0 / total_size).round(1)}%)"
1035
+ elsif partial_size == total_size
1036
+ puts " Download already complete."
1037
+ cleanup_download_meta(path)
1038
+ return path
1039
+ else
1040
+ # Partial is larger than expected - corrupted, start fresh
1041
+ puts " Partial file corrupted (size mismatch). Starting fresh."
1042
+ FileUtils.rm_f(path)
1043
+ partial_size = 0
1044
+ end
1045
+ elsif partial_size > 0
1046
+ # Can't resume - remove partial and start fresh
1047
+ FileUtils.rm_f(path)
1048
+ cleanup_download_meta(path)
1049
+ partial_size = 0
1050
+ end
1051
+
1052
+ http = Net::HTTP.new(uri.host, uri.port)
1053
+ http.use_ssl = (uri.scheme == "https")
1054
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
1055
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
1056
+ if http.use_ssl?
1057
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
1058
+ end
1059
+
1060
+ request = Net::HTTP::Get.new(uri)
1061
+
1062
+ if resume_mode
1063
+ request["Range"] = "bytes=#{partial_size}-"
1064
+ file_mode = "ab" # Append mode
1065
+ else
1066
+ file_mode = "wb" # Write mode (overwrite)
1067
+ # Save metadata for potential future resume
1068
+ remote_info = get_remote_file_info(url)
1069
+ save_download_meta(path, url, remote_info) if remote_info[:size] > 0
1070
+ end
1071
+
1072
+ File.open(path, file_mode) do |file|
1073
+ http.request(request) do |response|
1074
+ if response.code == "200" || response.code == "206"
1075
+ total = if resume_mode
1076
+ load_download_meta(path)[:size]
1077
+ else
1078
+ response["Content-Length"]&.to_i
1079
+ end
1080
+ downloaded = partial_size
1081
+
1082
+ response.read_body do |chunk|
1083
+ file.write(chunk)
1084
+ downloaded += chunk.size
1085
+ if total && total > 0
1086
+ percent = (downloaded * 100.0 / total).round(1)
1087
+ print "\r Progress: #{percent}% (#{format_size(downloaded)} / #{format_size(total)})"
1088
+ $stdout.flush
1089
+ end
1090
+ end
1091
+ puts
1092
+ elsif response.code == "416"
1093
+ # Range Not Satisfiable - file might be complete or corrupted
1094
+ puts "\n Range error. Verifying file..."
1095
+ remote_info = get_remote_file_info(url)
1096
+ if File.size(path) == remote_info[:size]
1097
+ puts " File is already complete."
1098
+ else
1099
+ puts " File corrupted. Re-downloading..."
1100
+ file.close
1101
+ FileUtils.rm_f(path)
1102
+ cleanup_download_meta(path)
1103
+ return download_file(url, path)
1104
+ end
1105
+ else
1106
+ raise "Download failed: #{response.code} #{response.message}"
1107
+ end
1108
+ end
1109
+ end
1110
+
1111
+ # Clean up metadata on successful completion
1112
+ cleanup_download_meta(path)
1113
+
1114
+ path
1115
+ end
1116
+
1117
+ # Download a range of bytes from a URL using HTTP Range header
1118
+ def download_file_range(url, path, start_byte, end_byte)
1119
+ uri = URI(url)
1120
+
1121
+ FileUtils.mkdir_p(File.dirname(path))
1122
+
1123
+ http = Net::HTTP.new(uri.host, uri.port)
1124
+ http.use_ssl = (uri.scheme == "https")
1125
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
1126
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
1127
+ if http.use_ssl?
1128
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
1129
+ end
1130
+
1131
+ request = Net::HTTP::Get.new(uri)
1132
+ request["Range"] = "bytes=#{start_byte}-#{end_byte}"
1133
+
1134
+ File.open(path, "wb") do |file|
1135
+ http.request(request) do |response|
1136
+ if response.code == "206" || response.code == "200"
1137
+ total = end_byte - start_byte + 1
1138
+ downloaded = 0
1139
+
1140
+ response.read_body do |chunk|
1141
+ file.write(chunk)
1142
+ downloaded += chunk.size
1143
+ percent = (downloaded * 100.0 / total).round(1)
1144
+ print "\r Progress: #{percent}% (#{format_size(downloaded)} / #{format_size(total)})"
1145
+ $stdout.flush
1146
+ end
1147
+ puts
1148
+ else
1149
+ raise "Download failed: #{response.code} #{response.message}"
1150
+ end
1151
+ end
1152
+ end
1153
+
1154
+ path
1155
+ end
1156
+ end
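
A usage sketch for the dump manager (illustrative only; the require path is assumed and the calls download from dumps.wikimedia.org):

```ruby
require "wp2txt/multistream" # assumed require path

manager = Wp2txt::DumpManager.new(:en)

index_path = manager.download_index                        # cached under ~/.wp2txt/cache
dump_path  = manager.download_multistream(max_streams: 10) # only the first 10 streams

puts manager.cache_status[:dump_date]
```
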
1157
+
1158
+ # Fetches category members from Wikipedia API
1159
+ # Uses SQLite-based CategoryCache for efficient repeated access
1160
+ class CategoryFetcher
1161
+ API_ENDPOINT = "https://%s.wikipedia.org/w/api.php"
1162
+ MAX_LIMIT = 500
1163
+ RATE_LIMIT_DELAY = 0.1
1164
+
1165
+ attr_reader :lang, :category, :max_depth, :cache_expiry_days
1166
+
1167
+ def initialize(lang, category, max_depth: 0, cache_expiry_days: nil, cache_dir: nil)
1168
+ @lang = lang.to_s
1169
+ @category = normalize_category_name(category)
1170
+ @max_depth = max_depth
1171
+ @cache_expiry_days = cache_expiry_days || Wp2txt::DEFAULT_CATEGORY_CACHE_EXPIRY_DAYS
1172
+ @cache_dir = cache_dir
1173
+ @cache = nil
1174
+ @visited_categories = Set.new
1175
+ end
1176
+
1177
+ # Enable caching of category member lists
1178
+ # @param cache_dir [String] Directory for cache files
1179
+ def enable_cache(cache_dir)
1180
+ @cache_dir = cache_dir
1181
+ @cache = CategoryCache.new(@lang, cache_dir: cache_dir, expiry_days: @cache_expiry_days)
1182
+ end
1183
+
1184
+ # Get the category cache instance
1185
+ # Creates one if caching is enabled but cache not yet initialized
1186
+ def cache
1187
+ return @cache if @cache
1188
+ return nil unless @cache_dir
1189
+
1190
+ @cache = CategoryCache.new(@lang, cache_dir: @cache_dir, expiry_days: @cache_expiry_days)
1191
+ end
1192
+
1193
+ # Preview mode - returns statistics without full article list
1194
+ def fetch_preview
1195
+ @visited_categories = Set.new
1196
+ subcategories = []
1197
+ total_articles = 0
1198
+
1199
+ fetch_category_stats(@category, 0, subcategories)
1200
+
1201
+ total_articles = subcategories.sum { |s| s[:article_count] }
1202
+
1203
+ {
1204
+ category: @category,
1205
+ depth: @max_depth,
1206
+ subcategories: subcategories,
1207
+ total_subcategories: subcategories.size - 1,
1208
+ total_articles: total_articles
1209
+ }
1210
+ end
1211
+
1212
+ # Fetch all article titles in the category (and subcategories if depth > 0)
1213
+ def fetch_articles
1214
+ @visited_categories = Set.new
1215
+ @articles = []
1216
+ fetch_category_members(@category, 0)
1217
+ @articles.uniq
1218
+ end
1219
+
1220
+ private
1221
+
1222
+ def normalize_category_name(name)
1223
+ name.to_s.sub(/^[Cc]ategory:/, "").strip
1224
+ end
1225
+
1226
+ def fetch_category_stats(category_name, current_depth, results)
1227
+ return if @visited_categories.include?(category_name)
1228
+ @visited_categories << category_name
1229
+
1230
+ cached = load_from_cache(category_name)
1231
+ if cached
1232
+ results << { name: category_name, article_count: (cached[:pages] || []).size }
1233
+ if current_depth < @max_depth
1234
+ (cached[:subcats] || []).each do |subcat|
1235
+ fetch_category_stats(subcat, current_depth + 1, results)
1236
+ end
1237
+ end
1238
+ return
1239
+ end
1240
+
1241
+ pages = []
1242
+ subcats = []
1243
+ continue_token = nil
1244
+
1245
+ loop do
1246
+ response = api_request(category_name, continue_token)
1247
+ break unless response
1248
+
1249
+ categorymembers = response.dig("query", "categorymembers") || []
1250
+ categorymembers.each do |member|
1251
+ case member["ns"]
1252
+ when 0
1253
+ pages << member["title"]
1254
+ when 14
1255
+ subcats << member["title"].sub(/^Category:/, "")
1256
+ end
1257
+ end
1258
+
1259
+ continue_token = response.dig("continue", "cmcontinue")
1260
+ break unless continue_token
1261
+
1262
+ sleep(RATE_LIMIT_DELAY)
1263
+ end
1264
+
1265
+ save_to_cache(category_name, { pages: pages, subcats: subcats })
1266
+
1267
+ results << { name: category_name, article_count: pages.size }
1268
+
1269
+ if current_depth < @max_depth
1270
+ subcats.each do |subcat|
1271
+ fetch_category_stats(subcat, current_depth + 1, results)
1272
+ end
1273
+ end
1274
+ end
1275
+
1276
+ def fetch_category_members(category_name, current_depth)
1277
+ return if @visited_categories.include?(category_name)
1278
+ @visited_categories << category_name
1279
+
1280
+ cached = load_from_cache(category_name)
1281
+ if cached
1282
+ @articles.concat(cached[:pages] || [])
1283
+ if current_depth < @max_depth
1284
+ (cached[:subcats] || []).each do |subcat|
1285
+ fetch_category_members(subcat, current_depth + 1)
1286
+ end
1287
+ end
1288
+ return
1289
+ end
1290
+
1291
+ pages = []
1292
+ subcats = []
1293
+ continue_token = nil
1294
+
1295
+ loop do
1296
+ response = api_request(category_name, continue_token)
1297
+ break unless response
1298
+
1299
+ categorymembers = response.dig("query", "categorymembers") || []
1300
+ categorymembers.each do |member|
1301
+ case member["ns"]
1302
+ when 0
1303
+ pages << member["title"]
1304
+ when 14
1305
+ subcats << member["title"].sub(/^Category:/, "")
1306
+ end
1307
+ end
1308
+
1309
+ continue_token = response.dig("continue", "cmcontinue")
1310
+ break unless continue_token
1311
+
1312
+ sleep(RATE_LIMIT_DELAY)
1313
+ end
1314
+
1315
+ save_to_cache(category_name, { pages: pages, subcats: subcats })
1316
+
1317
+ @articles.concat(pages)
1318
+
1319
+ if current_depth < @max_depth
1320
+ subcats.each do |subcat|
1321
+ fetch_category_members(subcat, current_depth + 1)
1322
+ end
1323
+ end
1324
+ end
1325
+
1326
+ def api_request(category_name, continue_token = nil)
1327
+ uri = URI(format(API_ENDPOINT, @lang))
1328
+ params = {
1329
+ action: "query",
1330
+ list: "categorymembers",
1331
+ cmtitle: "Category:#{category_name}",
1332
+ cmtype: "page|subcat",
1333
+ cmlimit: MAX_LIMIT,
1334
+ format: "json"
1335
+ }
1336
+ params[:cmcontinue] = continue_token if continue_token
1337
+ uri.query = URI.encode_www_form(params)
1338
+
1339
+ attempts = 0
1340
+ begin
1341
+ attempts += 1
1342
+ http = Net::HTTP.new(uri.host, uri.port)
1343
+ http.use_ssl = true
1344
+ http.open_timeout = DEFAULT_HTTP_TIMEOUT
1345
+ http.read_timeout = DEFAULT_HTTP_TIMEOUT
1346
+ http.verify_mode = OpenSSL::SSL::VERIFY_PEER
1347
+
1348
+ request = Net::HTTP::Get.new(uri)
1349
+ response = http.request(request)
1350
+ return nil unless response.is_a?(Net::HTTPSuccess)
1351
+
1352
+ JSON.parse(response.body)
1353
+ rescue Net::OpenTimeout, Net::ReadTimeout, SocketError, Errno::ECONNRESET,
1354
+ Errno::ECONNREFUSED, Errno::EHOSTUNREACH, OpenSSL::SSL::SSLError => e
1355
+ if attempts <= MAX_HTTP_RETRIES
1356
+ delay = 2**attempts
1357
+ warn " API request failed (attempt #{attempts}/#{MAX_HTTP_RETRIES + 1}): #{e.message}. Retrying in #{delay}s..."
1358
+ sleep delay
1359
+ retry
1360
+ end
1361
+ warn " API request failed after #{attempts} attempts for category '#{category_name}': #{e.message}"
1362
+ nil
1363
+ rescue JSON::ParserError => e
1364
+ warn " Invalid JSON response for category '#{category_name}': #{e.message}"
1365
+ nil
1366
+ end
1367
+ end
1368
+
1369
+ def load_from_cache(category_name)
1370
+ return nil unless cache
1371
+
1372
+ cache.get(category_name)
1373
+ end
1374
+
1375
+ def save_to_cache(category_name, members)
1376
+ return unless cache
1377
+
1378
+ pages = members[:pages] || members["pages"] || []
1379
+ subcats = members[:subcats] || members["subcats"] || []
1380
+ cache.save(category_name, pages, subcats)
1381
+ end
1382
+ end
1383
+ end
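
Finally, a usage sketch for the category fetcher (illustrative only; the require path and category name are placeholders, and the calls query the live Wikipedia API with the built-in rate limit):

```ruby
require "wp2txt/multistream" # assumed require path

fetcher = Wp2txt::CategoryFetcher.new("en", "Category:Prefectures of Japan", max_depth: 1)
fetcher.enable_cache(File.expand_path("~/.wp2txt/cache"))

preview = fetcher.fetch_preview
puts "#{preview[:total_articles]} articles across #{preview[:total_subcategories]} subcategories"

titles = fetcher.fetch_articles  # flat, de-duplicated list of article titles
puts titles.first(10)
```
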