RubyGems - wp2txt - Versions diffs - 1.1.3 → 2.1.0 - Mend

wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96)

checksums.yaml +4 -4
data/.dockerignore +12 -0
data/.github/workflows/ci.yml +13 -13
data/.gitignore +14 -0
data/CHANGELOG.md +284 -0
data/DEVELOPMENT.md +415 -0
data/DEVELOPMENT_ja.md +415 -0
data/Dockerfile +19 -10
data/Gemfile +2 -8
data/README.md +259 -123
data/README_ja.md +375 -0
data/Rakefile +4 -0
data/bin/wp2txt +863 -161
data/lib/wp2txt/article.rb +98 -13
data/lib/wp2txt/bz2_validator.rb +239 -0
data/lib/wp2txt/category_cache.rb +313 -0
data/lib/wp2txt/cli.rb +319 -0
data/lib/wp2txt/cli_ui.rb +428 -0
data/lib/wp2txt/config.rb +158 -0
data/lib/wp2txt/constants.rb +134 -0
data/lib/wp2txt/data/html_entities.json +2135 -0
data/lib/wp2txt/data/language_metadata.json +4769 -0
data/lib/wp2txt/data/language_tiers.json +59 -0
data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
data/lib/wp2txt/data/template_aliases.json +193 -0
data/lib/wp2txt/data/wikipedia_entities.json +12 -0
data/lib/wp2txt/extractor.rb +545 -0
data/lib/wp2txt/file_utils.rb +91 -0
data/lib/wp2txt/formatter.rb +352 -0
data/lib/wp2txt/global_data_cache.rb +353 -0
data/lib/wp2txt/index_cache.rb +258 -0
data/lib/wp2txt/magic_words.rb +353 -0
data/lib/wp2txt/memory_monitor.rb +236 -0
data/lib/wp2txt/multistream.rb +1383 -0
data/lib/wp2txt/output_writer.rb +182 -0
data/lib/wp2txt/parser_functions.rb +606 -0
data/lib/wp2txt/ractor_worker.rb +215 -0
data/lib/wp2txt/regex.rb +396 -12
data/lib/wp2txt/section_extractor.rb +354 -0
data/lib/wp2txt/stream_processor.rb +271 -0
data/lib/wp2txt/template_expander.rb +830 -0
data/lib/wp2txt/text_processing.rb +337 -0
data/lib/wp2txt/utils.rb +629 -270
data/lib/wp2txt/version.rb +1 -1
data/lib/wp2txt.rb +53 -26
data/scripts/benchmark_regex.rb +161 -0
data/scripts/fetch_html_entities.rb +94 -0
data/scripts/fetch_language_metadata.rb +180 -0
data/scripts/fetch_mediawiki_data.rb +334 -0
data/scripts/fetch_template_data.rb +186 -0
data/scripts/profile_memory.rb +139 -0
data/spec/article_spec.rb +402 -0
data/spec/auto_download_spec.rb +314 -0
data/spec/bz2_validator_spec.rb +193 -0
data/spec/category_cache_spec.rb +226 -0
data/spec/category_fetcher_spec.rb +504 -0
data/spec/cleanup_spec.rb +197 -0
data/spec/cli_options_spec.rb +678 -0
data/spec/cli_spec.rb +876 -0
data/spec/config_spec.rb +194 -0
data/spec/constants_spec.rb +138 -0
data/spec/file_utils_spec.rb +170 -0
data/spec/fixtures/samples.rb +181 -0
data/spec/formatter_sections_spec.rb +382 -0
data/spec/global_data_cache_spec.rb +186 -0
data/spec/index_cache_spec.rb +210 -0
data/spec/integration_spec.rb +543 -0
data/spec/magic_words_spec.rb +261 -0
data/spec/markers_spec.rb +476 -0
data/spec/memory_monitor_spec.rb +192 -0
data/spec/multistream_spec.rb +690 -0
data/spec/output_writer_spec.rb +400 -0
data/spec/parser_functions_spec.rb +455 -0
data/spec/ractor_worker_spec.rb +197 -0
data/spec/regex_spec.rb +281 -0
data/spec/section_extractor_spec.rb +397 -0
data/spec/spec_helper.rb +63 -0
data/spec/stream_processor_spec.rb +579 -0
data/spec/template_data_spec.rb +246 -0
data/spec/template_expander_spec.rb +472 -0
data/spec/template_processing_spec.rb +217 -0
data/spec/text_processing_spec.rb +312 -0
data/spec/utils_spec.rb +195 -16
data/spec/wp2txt_spec.rb +510 -0
data/wp2txt.gemspec +5 -3
metadata +146 -18
data/.rubocop.yml +0 -80
data/data/output_samples/testdata_en.txt +0 -23002
data/data/output_samples/testdata_en_category.txt +0 -132
data/data/output_samples/testdata_en_summary.txt +0 -1376
data/data/output_samples/testdata_ja.txt +0 -22774
data/data/output_samples/testdata_ja_category.txt +0 -206
data/data/output_samples/testdata_ja_summary.txt +0 -1560
data/data/testdata_en.bz2 +0 -0
data/data/testdata_ja.bz2 +0 -0
data/image/screenshot.png +0 -0

data/lib/wp2txt/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Wp2txt
-  VERSION = "1.1.3"
+  VERSION = "2.1.0"
 end

data/lib/wp2txt.rb CHANGED Viewed

@@ -3,17 +3,24 @@
 require "nokogiri"
 require_relative "wp2txt/article"
 require_relative "wp2txt/utils"
+require_relative "wp2txt/stream_processor"
+require_relative "wp2txt/output_writer"
 module Wp2txt
   class Splitter
     include Wp2txt
-    def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
+    attr_reader :size_read, :file_index
+    def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false, &progress_callback)
       @fp = nil
       @input_file = input_file
       @output_dir = output_dir
       @tfile_size = tfile_size
       require "bzip2-ruby" if bz2_gem
       @bz2_gem = bz2_gem
+      @progress_callback = progress_callback
+      @last_progress_time = Time.now
       prepare
     end
@@ -26,7 +33,7 @@ module Wp2txt
       loop do
         begin
           a = file.read(unit)
-        rescue StandardError
+        rescue IOError, Errno::EIO, Errno::ENOENT
           a = nil
         end
         break unless a
@@ -46,20 +53,22 @@ module Wp2txt
     # check if a given command exists: return the path if it does, return false if not
     def command_exist?(command)
       basename = File.basename(command)
-      path = +""
       print "Checking #{basename}: "
       begin
-        if open("| which #{command} 2>/dev/null") { |f| path = f.gets.strip }
-          puts "detected [#{path}]"
-          path.strip
-        elsif open("| which #{basename} 2>/dev/null") { |f| path = f.gets.strip }
-          puts "detected [#{path}]"
-          path.strip
-        else
+        # Use IO.popen instead of open("| ...") for Ruby 4.0 compatibility
+        path = IO.popen(["which", command], err: File::NULL, &:read).strip
+        if path.empty?
+          path = IO.popen(["which", basename], err: File::NULL, &:read).strip
+        end
+        if path.empty?
           puts "#{basename} not found"
           false
+        else
+          puts "detected [#{path}]"
+          path
         end
-      rescue StandardError
+      rescue Errno::ENOENT, Errno::EPIPE, IOError
         puts "#{basename} not found"
         false
       end
@@ -75,13 +84,13 @@ module Wp2txt
         if @bz2_gem
           file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
         elsif Gem.win_platform?
-          file = IO.popen("bunzip2.exe -c #{@input_file}")
+          file = IO.popen(["bunzip2.exe", "-c", @input_file])
         elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
-          file = IO.popen("#{bzpath} -c -d #{@input_file}")
+          file = IO.popen([bzpath, "-c", "-d", @input_file])
         end
       else # meaning that it is a text file
         @infile_size = File.stat(@input_file).size
-        file = open(@input_file)
+        file = File.open(@input_file, "r:UTF-8")
       end
       # create basename of output file
@@ -101,7 +110,7 @@ module Wp2txt
       loop do
         begin
           new_lines = @file_pointer.read(10_485_760)
-        rescue StandardError
+        rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
           return nil
         end
         return nil unless new_lines
@@ -114,9 +123,10 @@ module Wp2txt
         new_first_line = temp_buf.shift
         @buffer.last << new_first_line
-        @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
-        @buffer += temp_buf unless temp_buf.empty?
-        @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
+        # Use end_with? instead of [-1, 1] for clarity and performance
+        @buffer << +"" if new_first_line.end_with?("\n")
+        @buffer.concat(temp_buf) unless temp_buf.empty?
+        @buffer << +"" if @buffer.last.end_with?("\n")
         break if @buffer.size > 1
       end
       true
@@ -144,6 +154,10 @@ module Wp2txt
         @total_size += text.bytesize
         output_text << text
         end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
+        # Report progress every 5 seconds
+        report_progress
         # never close the file until the end of the page even if end_flag is on
         next unless end_flag && %r{</page} =~ text
@@ -157,7 +171,7 @@ module Wp2txt
         @outfiles << outfilename
         @fp = File.open(outfilename, "w")
       end
-      @fp.puts(output_text) if output_text != ""
+      @fp.puts(output_text) unless output_text.empty?
       @fp.close
       if outfilename && File.size(outfilename).zero?
@@ -167,6 +181,18 @@ module Wp2txt
       rename(@outfiles, "xml")
     end
+    private
+    def report_progress # Invokes the optional progress callback, throttled to one call per 5 seconds.
+      return unless @progress_callback # no callback stored: nothing to report
+      now = Time.now
+      return if now - @last_progress_time < 5 # Throttle: skip when fewer than 5 seconds have passed since the last report
+      @last_progress_time = now
+      @progress_callback.call(@size_read, @file_index) # the callback receives the current @size_read and @file_index values
+    end
   end
   class Runner
@@ -183,7 +209,7 @@ module Wp2txt
     def prepare
       @infile_size = File.stat(@input_file).size
-      file = open(@input_file)
+      file = File.open(@input_file, "r:UTF-8")
       @file_pointer = file
       @outfile_base = File.basename(@input_file, ".*")
       @total_size = 0
@@ -194,7 +220,7 @@ module Wp2txt
       loop do
         begin
           new_lines = @file_pointer.read(10_485_760)
-        rescue StandardError
+        rescue IOError, Errno::EIO, Errno::ENOENT, Errno::EPIPE
           return nil
         end
         return nil unless new_lines
@@ -206,10 +232,11 @@ module Wp2txt
         temp_buf << ss.rest unless ss.eos?
         new_first_line = temp_buf.shift
-        @buffer.last <<  new_first_line
-        @buffer << +"" if new_first_line[-1, 1] == "\n" # new_first_line.index("\n")
-        @buffer += temp_buf unless temp_buf.empty?
-        @buffer << +"" if @buffer.last[-1, 1] == "\n" # @buffer.last.index("\n")
+        @buffer.last << new_first_line
+        # Use end_with? instead of [-1, 1] for clarity and performance
+        @buffer << +"" if new_first_line.end_with?("\n")
+        @buffer.concat(temp_buf) unless temp_buf.empty?
+        @buffer << +"" if @buffer.last.end_with?("\n")
         break if @buffer.size > 1
       end
       true
@@ -247,7 +274,7 @@ module Wp2txt
       else
         page.force_encoding("utf-8")
       end
-    rescue StandardError
+    rescue ::Encoding::InvalidByteSequenceError, ::Encoding::UndefinedConversionError
       page
     end

data/scripts/benchmark_regex.rb ADDED Viewed

@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+# Benchmark script for wp2txt regex performance
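+# Times cleanup() and format_wiki() and estimates their object allocations per call.
+# It measures a single implementation, so comparing pre-compiled vs inline regex patterns means running it on each version.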
+#
+# Usage: ruby scripts/benchmark_regex.rb
+require "benchmark"
+begin
+  require "benchmark/ips"
+rescue LoadError
+  # benchmark-ips is optional
+end
+# Add lib to load path
+$LOAD_PATH.unshift(File.expand_path("../lib", __dir__))
+require "wp2txt"
+require "wp2txt/article"
+# Sample Wikipedia-like content for benchmarking
+SAMPLE_TEXT = <<~WIKI
+  {{Infobox person
+  | name = Test Person
+  | birth_date = 1980-01-01
+  | occupation = Writer
+  }}
+  '''Test Article''' is a [[test]] article with various [[wiki markup|markup]].
+  == Section 1 ==
+  This section has some {{cite web|url=http://example.com|title=Example}} references.
+  There are also [[Category:Test]] links and [[File:Image.jpg|thumb|A caption]].
+  === Subsection ===
+  More content with '''bold''' and ''italic'' text.
+  * List item 1
+  * List item 2
+  # Numbered item
+  == Section 2 ==
+  {| class="wikitable"
+  |-
+  ! Header 1 !! Header 2
+  |-
+  | Cell 1 || Cell 2
+  |}
+  Some text with &nbsp; entities and &#x266A; characters.
+  Also has <ref name="test">Reference content</ref> and <nowiki>[[preserved]]</nowiki>.
+  {{DEFAULTSORT:Test Article}}
+  [[Category:Articles]]
+  [[Category:Tests]]
+WIKI
+# Create multiple copies for more realistic benchmarking
+LARGE_TEXT = (SAMPLE_TEXT * 100).freeze
+class BenchmarkRunner # Small harness that includes Wp2txt so the benchmark can call its text-processing methods.
+  include Wp2txt
+  def initialize
+    @nowikis = {} # NOTE(review): presumably state that the Wp2txt methods read; confirm against lib/wp2txt
+  end
+  def run_cleanup(text) # Runs cleanup on a duplicate of text, leaving the caller's string untouched.
+    cleanup(text.dup)
+  end
+  def run_full_format(text) # Runs format_wiki on a duplicate of text, leaving the caller's string untouched.
+    format_wiki(text.dup)
+  end
+end
+def run_benchmarks # Prints a report: environment info, warmup, timed runs, optional IPS comparison, allocation estimate.
+  puts "=" * 60
+  puts "wp2txt Regex Performance Benchmark"
+  puts "=" * 60
+  puts
+  puts "Ruby version: #{RUBY_VERSION}"
+  puts "Sample text size: #{SAMPLE_TEXT.bytesize} bytes"
+  puts "Large text size: #{LARGE_TEXT.bytesize} bytes"
+  puts
+  runner = BenchmarkRunner.new
+  puts "-" * 60
+  puts "Warmup (JIT compilation, method caching)"
+  puts "-" * 60
+  5.times { runner.run_cleanup(SAMPLE_TEXT) } # untimed warmup calls
+  5.times { runner.run_full_format(SAMPLE_TEXT) }
+  puts "Done."
+  puts
+  puts "-" * 60
+  puts "Benchmark: cleanup() method"
+  puts "-" * 60
+  Benchmark.bm(20) do |x| # 20 is the label column width
+    x.report("cleanup (small):") do
+      1000.times { runner.run_cleanup(SAMPLE_TEXT) }
+    end
+    x.report("cleanup (large):") do
+      10.times { runner.run_cleanup(LARGE_TEXT) } # fewer iterations because LARGE_TEXT is 100 copies of the sample
+    end
+  end
+  puts
+  puts "-" * 60
+  puts "Benchmark: format_wiki() method (full pipeline)"
+  puts "-" * 60
+  Benchmark.bm(20) do |x|
+    x.report("format_wiki (small):") do
+      1000.times { runner.run_full_format(SAMPLE_TEXT) }
+    end
+    x.report("format_wiki (large):") do
+      10.times { runner.run_full_format(LARGE_TEXT) }
+    end
+  end
+  # Benchmark::IPS is defined only when the optional benchmark/ips require at the top of the script succeeded
+  if defined?(Benchmark::IPS)
+    puts
+    puts "-" * 60
+    puts "IPS Benchmark (iterations per second)"
+    puts "-" * 60
+    Benchmark.ips do |x|
+      x.report("cleanup") { runner.run_cleanup(SAMPLE_TEXT) }
+      x.report("format_wiki") { runner.run_full_format(SAMPLE_TEXT) }
+      x.compare!
+    end
+  end
+  puts
+  puts "-" * 60
+  puts "Memory profile (approximate)"
+  puts "-" * 60
+  # Allocation estimate: difference in GC.stat[:total_allocated_objects] across 100 calls (object counts, not bytes)
+  GC.start
+  before = GC.stat[:total_allocated_objects]
+  100.times { runner.run_cleanup(SAMPLE_TEXT) }
+  after = GC.stat[:total_allocated_objects]
+  puts "cleanup() allocations per call: ~#{(after - before) / 100}"
+  GC.start
+  before = GC.stat[:total_allocated_objects]
+  100.times { runner.run_full_format(SAMPLE_TEXT) }
+  after = GC.stat[:total_allocated_objects]
+  puts "format_wiki() allocations per call: ~#{(after - before) / 100}"
+  puts
+  puts "=" * 60
+  puts "Benchmark complete"
+  puts "=" * 60
+end
+run_benchmarks

data/scripts/fetch_html_entities.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+# Fetches HTML named character references from WHATWG HTML specification
+# Usage: ruby scripts/fetch_html_entities.rb
+#
+# This script downloads the official entities.json from WHATWG and converts
+# it into a format suitable for wp2txt text processing.
+require "net/http"
+require "json"
+require "fileutils"
+WHATWG_ENTITIES_URL = "https://html.spec.whatwg.org/entities.json"
+def fetch_whatwg_entities # Downloads WHATWG_ENTITIES_URL and returns the parsed JSON, or nil on any failure.
+  puts "Fetching entities from WHATWG HTML specification..."
+  uri = URI(WHATWG_ENTITIES_URL)
+  response = Net::HTTP.get_response(uri)
+  unless response.is_a?(Net::HTTPSuccess) # any non-success response is reported on stderr and treated as a failure
+    warn "Failed to fetch entities: HTTP #{response.code}"
+    return nil
+  end
+  JSON.parse(response.body)
+rescue StandardError => e # network and JSON parse errors are reported on stderr and turned into nil
+  warn "Error fetching entities: #{e.message}"
+  nil
+end
+def convert_entities(raw_data) # Returns a hash mapping each semicolon-terminated entity name to its "characters" value.
+  entities = {}
+  raw_data.each do |name, info|
+    # Only include entries with semicolon (standard form)
+    # Skip legacy forms without semicolon like "&nbsp"
+    next unless name.end_with?(";")
+    # The key is the entity name unchanged, so the & and ; are kept
+    # e.g., "&alpha;" stays "&alpha;"
+    key = name
+    # Get the character(s)
+    characters = info["characters"]
+    next if characters.nil? || characters.empty? # skip entries with no replacement text
+    entities[key] = characters
+  end
+  entities
+end
+def main
+  raw_data = fetch_whatwg_entities
+  if raw_data.nil?
+    warn "Failed to fetch entities. Aborting."
+    exit 1
+  end
+  puts "Processing #{raw_data.size} raw entries..."
+  entities = convert_entities(raw_data)
+  puts "Converted to #{entities.size} standard entities (with semicolon)"
+  result = {
+    "meta" => {
+      "generated_at" => Time.now.utc.iso8601,
+      "source" => WHATWG_ENTITIES_URL,
+      "description" => "HTML named character references from WHATWG HTML specification",
+      "total_entities" => entities.size
+    },
+    "entities" => entities.sort.to_h
+  }
+  # Write output
+  output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "html_entities.json")
+  FileUtils.mkdir_p(File.dirname(output_path))
+  File.write(output_path, JSON.pretty_generate(result))
+  puts "\nData written to: #{output_path}"
+  # Summary - show some categories
+  greek = entities.keys.select { |k| k.match?(/&(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega);/i) }
+  math = entities.keys.select { |k| k.match?(/&(sum|prod|int|infin|nabla|part|forall|exist|empty|isin|notin|cap|cup|sub|sup|oplus|otimes);/i) }
+  arrows = entities.keys.select { |k| k.match?(/arr;$/i) }
+  puts "\n=== Summary ==="
+  puts "Total entities: #{entities.size}"
+  puts "Greek letters: #{greek.size}"
+  puts "Math symbols: #{math.size}"
+  puts "Arrows: #{arrows.size}"
+end
+main if __FILE__ == $PROGRAM_NAME

data/scripts/fetch_language_metadata.rb ADDED Viewed

@@ -0,0 +1,180 @@
+# frozen_string_literal: true
+# Fetches Wikipedia language metadata from Wikimedia APIs
+# Usage: ruby scripts/fetch_language_metadata.rb
+#
+# This script queries the Wikimedia sitematrix API to get all Wikipedia
+# language editions and their statistics (article counts, etc.)
+require "net/http"
+require "json"
+require "fileutils"
+# Fetch all Wikipedia language editions from the sitematrix API (site identity fields only, no article statistics)
+def fetch_wikipedia_languages # Returns a hash of lang_code => info; returns {} on an HTTP failure or any error.
+  uri = URI("https://meta.wikimedia.org/w/api.php")
+  params = {
+    action: "sitematrix",
+    smtype: "language",
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+  response = Net::HTTP.get_response(uri)
+  return {} unless response.is_a?(Net::HTTPSuccess)
+  data = JSON.parse(response.body)
+  languages = {}
+  data["sitematrix"].each do |key, val|
+    next unless key.match?(/^\d+$/) && val.is_a?(Hash) && val["site"] # only numeric keys holding a hash with a site list are used
+    # Find Wikipedia site info
+    wiki_site = val["site"].find { |site| site["code"] == "wiki" }
+    next unless wiki_site # this language has no site with code "wiki"
+    lang_code = val["code"]
+    languages[lang_code] = {
+      "name" => val["name"],
+      "localname" => val["localname"],
+      "url" => wiki_site["url"],
+      "dbname" => wiki_site["dbname"],
+      "closed" => wiki_site["closed"] || false,
+      "private" => wiki_site["private"] || false
+    }
+  end
+  languages
+rescue StandardError => e # any network or parse error is reported on stderr and yields {}
+  warn "Error fetching sitematrix: #{e.message}"
+  {}
+end
+# Fetch site statistics (articles, pages, edits, users, activeusers) for one Wikipedia; returns nil on any failure
+def fetch_wiki_statistics(lang_code)
+  uri = URI("https://#{lang_code}.wikipedia.org/w/api.php")
+  params = {
+    action: "query",
+    meta: "siteinfo",
+    siprop: "statistics",
+    format: "json"
+  }
+  uri.query = URI.encode_www_form(params)
+  response = Net::HTTP.get_response(uri)
+  return nil unless response.is_a?(Net::HTTPSuccess)
+  data = JSON.parse(response.body)
+  stats = data.dig("query", "statistics")
+  return nil unless stats # the response had no query.statistics section
+  {
+    "articles" => stats["articles"],
+    "pages" => stats["pages"],
+    "edits" => stats["edits"],
+    "users" => stats["users"],
+    "activeusers" => stats["activeusers"]
+  }
+rescue StandardError # best effort: any network or parse error is swallowed and reported to the caller as nil
+  nil
+end
+def main
+  puts "Fetching Wikipedia language list..."
+  languages = fetch_wikipedia_languages
+  if languages.empty?
+    warn "Failed to fetch language list. Aborting."
+    exit 1
+  end
+  # Filter out closed/private wikis
+  active_languages = languages.reject { |_, info| info["closed"] || info["private"] }
+  puts "Found #{active_languages.size} active Wikipedia editions."
+  puts "Fetching statistics for each Wikipedia (this may take a few minutes)..."
+  successful = 0
+  failed = []
+  active_languages.each_with_index do |(lang_code, info), idx|
+    print "\r  Processing: #{lang_code.ljust(10)} (#{idx + 1}/#{active_languages.size})"
+    $stdout.flush
+    stats = fetch_wiki_statistics(lang_code)
+    if stats
+      info.merge!(stats)
+      successful += 1
+    else
+      failed << lang_code
+    end
+    sleep 0.05 # Rate limiting
+  end
+  puts "\n  Successfully fetched: #{successful}/#{active_languages.size}"
+  puts "  Failed: #{failed.size} (#{failed.first(10).join(', ')}#{failed.size > 10 ? '...' : ''})" if failed.any?
+  # Categorize by size
+  size_categories = {
+    "large" => [],    # 1M+ articles
+    "medium" => [],   # 100K-1M articles
+    "small" => [],    # 10K-100K articles
+    "mini" => []      # <10K articles
+  }
+  active_languages.each do |lang_code, info|
+    articles = info["articles"] || 0
+    category = if articles >= 1_000_000
+                 "large"
+               elsif articles >= 100_000
+                 "medium"
+               elsif articles >= 10_000
+                 "small"
+               else
+                 "mini"
+               end
+    size_categories[category] << lang_code
+    info["size_category"] = category
+  end
+  # Build result
+  result = {
+    "meta" => {
+      "generated_at" => Time.now.utc.iso8601,
+      "source" => "Wikimedia sitematrix + siteinfo APIs",
+      "total_languages" => active_languages.size,
+      "statistics_fetched" => successful
+    },
+    "size_summary" => {
+      "large" => size_categories["large"].size,
+      "medium" => size_categories["medium"].size,
+      "small" => size_categories["small"].size,
+      "mini" => size_categories["mini"].size
+    },
+    "languages" => active_languages.sort_by { |_, info| -(info["articles"] || 0) }.to_h
+  }
+  # Write output
+  output_path = File.join(__dir__, "..", "lib", "wp2txt", "data", "language_metadata.json")
+  FileUtils.mkdir_p(File.dirname(output_path))
+  File.write(output_path, JSON.pretty_generate(result))
+  puts "\nData written to: #{output_path}"
+  # Summary
+  puts "\n=== Summary ==="
+  puts "Total active Wikipedias: #{active_languages.size}"
+  puts "Size categories:"
+  puts "  Large (1M+ articles): #{size_categories['large'].size} - #{size_categories['large'].first(5).join(', ')}..."
+  puts "  Medium (100K-1M): #{size_categories['medium'].size}"
+  puts "  Small (10K-100K): #{size_categories['small'].size}"
+  puts "  Mini (<10K): #{size_categories['mini'].size}"
+  # Top 20 by article count
+  puts "\nTop 20 Wikipedias by article count:"
+  active_languages.sort_by { |_, info| -(info["articles"] || 0) }.first(20).each_with_index do |(code, info), idx|
+    puts "  #{(idx + 1).to_s.rjust(2)}. #{code.ljust(5)} #{info['name'].to_s.ljust(20)} #{(info['articles'] || 0).to_s.rjust(10)} articles"
+  end
+end
+main if __FILE__ == $PROGRAM_NAME