RubyGems - wp2txt - Versions diffs - 1.1.3 → 2.1.0 - Mend

wp2txt 1.1.3 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

checksums.yaml +4 -4
data/.dockerignore +12 -0
data/.github/workflows/ci.yml +13 -13
data/.gitignore +14 -0
data/CHANGELOG.md +284 -0
data/DEVELOPMENT.md +415 -0
data/DEVELOPMENT_ja.md +415 -0
data/Dockerfile +19 -10
data/Gemfile +2 -8
data/README.md +259 -123
data/README_ja.md +375 -0
data/Rakefile +4 -0
data/bin/wp2txt +863 -161
data/lib/wp2txt/article.rb +98 -13
data/lib/wp2txt/bz2_validator.rb +239 -0
data/lib/wp2txt/category_cache.rb +313 -0
data/lib/wp2txt/cli.rb +319 -0
data/lib/wp2txt/cli_ui.rb +428 -0
data/lib/wp2txt/config.rb +158 -0
data/lib/wp2txt/constants.rb +134 -0
data/lib/wp2txt/data/html_entities.json +2135 -0
data/lib/wp2txt/data/language_metadata.json +4769 -0
data/lib/wp2txt/data/language_tiers.json +59 -0
data/lib/wp2txt/data/mediawiki_aliases.json +12366 -0
data/lib/wp2txt/data/template_aliases.json +193 -0
data/lib/wp2txt/data/wikipedia_entities.json +12 -0
data/lib/wp2txt/extractor.rb +545 -0
data/lib/wp2txt/file_utils.rb +91 -0
data/lib/wp2txt/formatter.rb +352 -0
data/lib/wp2txt/global_data_cache.rb +353 -0
data/lib/wp2txt/index_cache.rb +258 -0
data/lib/wp2txt/magic_words.rb +353 -0
data/lib/wp2txt/memory_monitor.rb +236 -0
data/lib/wp2txt/multistream.rb +1383 -0
data/lib/wp2txt/output_writer.rb +182 -0
data/lib/wp2txt/parser_functions.rb +606 -0
data/lib/wp2txt/ractor_worker.rb +215 -0
data/lib/wp2txt/regex.rb +396 -12
data/lib/wp2txt/section_extractor.rb +354 -0
data/lib/wp2txt/stream_processor.rb +271 -0
data/lib/wp2txt/template_expander.rb +830 -0
data/lib/wp2txt/text_processing.rb +337 -0
data/lib/wp2txt/utils.rb +629 -270
data/lib/wp2txt/version.rb +1 -1
data/lib/wp2txt.rb +53 -26
data/scripts/benchmark_regex.rb +161 -0
data/scripts/fetch_html_entities.rb +94 -0
data/scripts/fetch_language_metadata.rb +180 -0
data/scripts/fetch_mediawiki_data.rb +334 -0
data/scripts/fetch_template_data.rb +186 -0
data/scripts/profile_memory.rb +139 -0
data/spec/article_spec.rb +402 -0
data/spec/auto_download_spec.rb +314 -0
data/spec/bz2_validator_spec.rb +193 -0
data/spec/category_cache_spec.rb +226 -0
data/spec/category_fetcher_spec.rb +504 -0
data/spec/cleanup_spec.rb +197 -0
data/spec/cli_options_spec.rb +678 -0
data/spec/cli_spec.rb +876 -0
data/spec/config_spec.rb +194 -0
data/spec/constants_spec.rb +138 -0
data/spec/file_utils_spec.rb +170 -0
data/spec/fixtures/samples.rb +181 -0
data/spec/formatter_sections_spec.rb +382 -0
data/spec/global_data_cache_spec.rb +186 -0
data/spec/index_cache_spec.rb +210 -0
data/spec/integration_spec.rb +543 -0
data/spec/magic_words_spec.rb +261 -0
data/spec/markers_spec.rb +476 -0
data/spec/memory_monitor_spec.rb +192 -0
data/spec/multistream_spec.rb +690 -0
data/spec/output_writer_spec.rb +400 -0
data/spec/parser_functions_spec.rb +455 -0
data/spec/ractor_worker_spec.rb +197 -0
data/spec/regex_spec.rb +281 -0
data/spec/section_extractor_spec.rb +397 -0
data/spec/spec_helper.rb +63 -0
data/spec/stream_processor_spec.rb +579 -0
data/spec/template_data_spec.rb +246 -0
data/spec/template_expander_spec.rb +472 -0
data/spec/template_processing_spec.rb +217 -0
data/spec/text_processing_spec.rb +312 -0
data/spec/utils_spec.rb +195 -16
data/spec/wp2txt_spec.rb +510 -0
data/wp2txt.gemspec +5 -3
metadata +146 -18
data/.rubocop.yml +0 -80
data/data/output_samples/testdata_en.txt +0 -23002
data/data/output_samples/testdata_en_category.txt +0 -132
data/data/output_samples/testdata_en_summary.txt +0 -1376
data/data/output_samples/testdata_ja.txt +0 -22774
data/data/output_samples/testdata_ja_category.txt +0 -206
data/data/output_samples/testdata_ja_summary.txt +0 -1560
data/data/testdata_en.bz2 +0 -0
data/data/testdata_ja.bz2 +0 -0
data/image/screenshot.png +0 -0

data/lib/wp2txt/parser_functions.rb ADDED Viewed

@@ -0,0 +1,606 @@
+# frozen_string_literal: true
+require "time"
module Wp2txt
  # Evaluates MediaWiki parser functions embedded in wikitext.
  #
  # Handles #if, #ifeq, #iferror, #switch, #expr, #ifexpr, #time,
  # #titleparts and the string functions (#len, #pos, #rpos, #count, #sub,
  # #replace, #explode, #urlencode, #urldecode, #padleft, #padright).
  class ParserFunctions
    MONTH_NAMES = %w[
      January February March April May June
      July August September October November December
    ].freeze
    DAY_NAMES = %w[Sunday Monday Tuesday Wednesday Thursday Friday Saturday].freeze

    # Maximum number of whole-text passes performed by #evaluate.
    MAX_ITERATIONS = 10
    # Arguments nested deeper than this are not expanded recursively; they are
    # left for a later pass of #evaluate so recursion depth stays bounded.
    MAX_NESTING_DEPTH = 40
    # Upper bound for #padleft/#padright widths so that wiki text cannot
    # request arbitrarily large strings.
    MAX_PAD_LENGTH = 500
    # Expressions with more tokens than this are rejected (bounds the
    # recursion depth of the expression parser).
    MAX_EXPR_TOKENS = 500
    # strptime formats tried, in order, by #parse_date.
    DATE_FORMATS = ["%Y-%m-%d", "%Y/%m/%d", "%d %B %Y", "%B %d, %Y", "%Y"].freeze
    # Opening or closing double brace.
    BRACE_PAIR = /\{\{|\}\}/
    # One #expr token (number, operator or word) anchored at the scan position.
    EXPR_TOKEN = %r{\G\s*(?:(?<num>\d+(?:\.\d*)?|\.\d+)|(?<op><=|>=|!=|<>|==|\*\*|[-+*/%^()=<>])|(?<word>[a-z]+))}i
    # Alternative spellings mapped onto the canonical operator.
    EXPR_ALIASES = { "==" => "=", "<>" => "!=", "%" => "mod", "**" => "^", "div" => "/" }.freeze
    # Binary operators and their binding strength (higher binds tighter).
    BINARY_PRECEDENCE = {
      "or" => 2, "and" => 3,
      "=" => 4, "!=" => 4, "<" => 4, ">" => 4, "<=" => 4, ">=" => 4,
      "round" => 5,
      "+" => 6, "-" => 6,
      "*" => 7, "/" => 7, "mod" => 7,
      "^" => 8
    }.freeze
    # Prefix operators spelled as words.
    EXPR_UNARY_WORDS = %w[not abs floor ceil trunc].freeze
    # Every non-numeric token the expression parser understands.
    EXPR_SYMBOLS = (BINARY_PRECEDENCE.keys + EXPR_UNARY_WORDS + %w[( )]).freeze

    # Raised internally when an #expr expression cannot be evaluated.
    class ExpressionError < StandardError; end

    private_constant :MAX_ITERATIONS, :MAX_NESTING_DEPTH, :MAX_PAD_LENGTH, :MAX_EXPR_TOKENS,
                     :DATE_FORMATS, :BRACE_PAIR, :EXPR_TOKEN, :EXPR_ALIASES,
                     :BINARY_PRECEDENCE, :EXPR_UNARY_WORDS, :EXPR_SYMBOLS, :ExpressionError

    # @param reference_date [Time, nil] time used by #time when no date
    #   argument is given; defaults to the time of construction
    # @param preserve_unknown [Boolean] when true, unrecognised parser
    #   functions are kept in the output instead of being removed
    def initialize(reference_date: nil, preserve_unknown: false)
      @reference_date = reference_date || Time.now
      @preserve_unknown = preserve_unknown
    end

    # Replaces every parser function call in +text+ with its result.
    #
    # @param text [String, nil] wikitext
    # @return [String, nil] +text+ itself when it is nil, empty or contains
    #   no "{{#"; otherwise a new string with the functions evaluated
    def evaluate(text)
      return text if text.nil? || text.empty?
      # Early exit: no parser functions to evaluate
      return text unless text.include?("{{#")

      result = text.dup
      # Results may themselves contain parser functions, so re-scan until the
      # text stops changing (bounded, to guarantee termination).
      MAX_ITERATIONS.times do
        break unless result.include?("{{#")

        expanded = evaluate_single_pass(result)
        break if expanded == result

        result = expanded
      end
      result
    end

    private

    # Scans +text+ left to right and replaces each top-level "{{#...}}" call.
    # An unterminated call and everything after it is copied through as is.
    def evaluate_single_pass(text, depth = 0)
      output = +""
      pos = 0
      while pos < text.length
        start_idx = text.index("{{#", pos)
        if start_idx.nil?
          output << text[pos..]
          break
        end
        # Text before the parser function is copied unchanged
        output << text[pos...start_idx]
        end_idx = find_template_end(text, start_idx + 2)
        if end_idx.nil?
          output << text[start_idx..]
          break
        end
        output << evaluate_parser_function(text[(start_idx + 2)...end_idx], depth)
        pos = end_idx + 2
      end
      output
    end

    # Returns the index of the "}}" that closes the "{{" preceding
    # +start_pos+, or nil when the braces never balance.
    def find_template_end(text, start_pos)
      depth = 1
      pos = start_pos
      while (idx = text.index(BRACE_PAIR, pos))
        if text[idx] == "{"
          depth += 1
        else
          depth -= 1
          return idx if depth.zero?
        end
        pos = idx + 2
      end
      nil
    end

    # Evaluates one call. +content+ is the text between the outer braces and
    # starts with "#", e.g. "#if:condition|then|else".
    #
    # Arguments are split first and then expanded individually, so a nested
    # call used as a condition is resolved before the outer function looks at
    # it, and a "|" produced by a nested call cannot shift the arguments.
    def evaluate_parser_function(content, depth = 0)
      return "" unless content.start_with?("#")

      # The function name runs up to the first colon
      colon_idx = content.index(":")
      return unknown_function(content) if colon_idx.nil?

      raw_name = content[1...colon_idx]
      args = split_arguments(content[(colon_idx + 1)..])
      if depth < MAX_NESTING_DEPTH
        args = args.map { |arg| arg.include?("{{#") ? evaluate_single_pass(arg, depth + 1) : arg }
      end

      case raw_name.strip.downcase
      when "if" then evaluate_if(args)
      when "ifeq" then evaluate_ifeq(args)
      when "iferror" then evaluate_iferror(args)
      when "switch" then evaluate_switch(args)
      when "ifexpr" then evaluate_ifexpr(args)
      when "expr" then evaluate_expr(args)
      when "len" then evaluate_len(args)
      when "pos" then evaluate_pos(args)
      when "rpos" then evaluate_rpos(args)
      when "count" then evaluate_count(args)
      when "sub" then evaluate_sub(args)
      when "replace" then evaluate_replace(args)
      when "explode" then evaluate_explode(args)
      when "urldecode" then evaluate_urldecode(args)
      when "urlencode" then evaluate_urlencode(args)
      when "padleft" then evaluate_padleft(args)
      when "padright" then evaluate_padright(args)
      when "titleparts" then evaluate_titleparts(args)
      when "time" then evaluate_time(args)
      else unknown_function("#" + raw_name + ":" + args.join("|"))
      end
    end

    # Output for a call that is not recognised. +content+ already starts with
    # "#", so only the braces are added back when the call is preserved.
    def unknown_function(content)
      @preserve_unknown ? "{{" + content + "}}" : ""
    end

    # Splits on "|" characters that are not inside braces or brackets.
    def split_arguments(str)
      args = []
      current = +""
      depth = 0
      str.each_char do |char|
        case char
        when "{", "["
          depth += 1
        when "}", "]"
          # Never drop below zero: a stray closer must not stop later pipes
          # from being recognised as separators.
          depth -= 1 if depth.positive?
        when "|"
          if depth.zero?
            args << current
            current = +""
            next
          end
        end
        current << char
      end
      args << current
    end

    # #if: condition | then | else
    def evaluate_if(args)
      return "" if args.empty?

      condition = (args[0] || "").strip
      condition.empty? ? (args[2] || "") : (args[1] || "")
    end

    # #ifeq: value1 | value2 | then | else
    def evaluate_ifeq(args)
      return "" if args.length < 2

      value1 = (args[0] || "").strip
      value2 = (args[1] || "").strip
      # Values that both look numeric are compared as numbers ("1.0" == "1")
      equal = if numeric?(value1) && numeric?(value2)
                value1.to_f == value2.to_f
              else
                value1 == value2
              end
      equal ? (args[2] || "") : (args[3] || "")
    end

    # #switch: value | case1=result1 | case2 | case3=result3 | #default=default
    #
    # A bare value (no "=") followed by further cases falls through to the
    # next "key=result". Only a bare value in the last position acts as the
    # default.
    def evaluate_switch(args)
      return "" if args.empty?

      value = (args[0] || "").strip
      cases = args[1..]
      last_index = cases.length - 1
      default = ""
      matched = false

      cases.each_with_index do |case_arg, index|
        if case_arg.include?("=")
          key, result = case_arg.split("=", 2)
          key = key.strip
          return result if matched || key == value

          default = result if key == "#default"
        elsif index == last_index
          bare = case_arg.strip
          return bare unless bare.empty?
        elsif case_arg.strip == value
          matched = true
        end
      end
      default
    end

    # #ifexpr: expression | then | else
    # An expression that cannot be evaluated selects the else branch.
    def evaluate_ifexpr(args)
      return "" if args.empty?

      result = calculate_expression(args[0] || "")
      then_value = args[1] || ""
      else_value = args[2] || ""
      return else_value if result.nil?

      result.zero? ? else_value : then_value
    end

    # #expr: expression
    # Returns "" when the expression cannot be evaluated. Results are rounded
    # to two decimals and trailing zeros are dropped.
    def evaluate_expr(args)
      return "" if args.empty?

      value = calculate_expression(args[0] || "")
      return "" if value.nil?

      rounded = value.round(2)
      return rounded.to_i.to_s if rounded == rounded.to_i

      format("%.2f", rounded).sub(/0+$/, "").sub(/\.$/, "")
    end

    # Evaluates an #expr expression and returns a finite Float, or nil when
    # the expression is empty, malformed, too long, or has no finite real
    # result (e.g. division by zero).
    #
    # The text is tokenised and parsed here rather than rewritten into Ruby
    # and passed to Kernel#eval, so wiki text is never executed as code and
    # decimal literals, "not" and chained comparisons are handled properly.
    def calculate_expression(expr_str)
      tokens = tokenize_expression(expr_str.to_s.strip)
      return nil if tokens.nil? || tokens.empty? || tokens.length > MAX_EXPR_TOKENS

      value = parse_binary(tokens, 0)
      raise ExpressionError, "unexpected token" unless tokens.empty?

      value
    rescue ExpressionError
      nil
    end

    # Converts +expr+ into an array of Floats (numbers) and lowercase Strings
    # (operators, words, parentheses). Returns nil on any unknown input.
    def tokenize_expression(expr)
      tokens = []
      pos = 0
      while pos < expr.length
        match = EXPR_TOKEN.match(expr, pos)
        return nil unless match

        if match[:num]
          tokens << match[:num].to_f
        else
          lexeme = (match[:op] || match[:word]).downcase
          lexeme = EXPR_ALIASES.fetch(lexeme, lexeme)
          return nil unless EXPR_SYMBOLS.include?(lexeme)

          tokens << lexeme
        end
        pos = match.end(0)
      end
      tokens
    end

    # Precedence-climbing parser for binary operators. Consumes tokens from
    # the front of +tokens+; all binary operators are left-associative.
    def parse_binary(tokens, min_prec)
      left = parse_unary(tokens)
      while (prec = BINARY_PRECEDENCE[tokens.first]) && prec >= min_prec
        operator = tokens.shift
        right = parse_binary(tokens, prec + 1)
        left = apply_binary(operator, left, right)
      end
      left
    end

    # Parses a number, a parenthesised expression or a prefix operator
    # applied to one. Prefix operators bind tighter than any binary operator.
    def parse_unary(tokens)
      token = tokens.shift
      case token
      when Float then token
      when "("
        value = parse_binary(tokens, 0)
        raise ExpressionError, "missing closing parenthesis" unless tokens.shift == ")"

        value
      when "+" then parse_unary(tokens)
      when "-" then -parse_unary(tokens)
      when "not" then truth(parse_unary(tokens).zero?)
      when "abs" then parse_unary(tokens).abs
      when "floor" then parse_unary(tokens).floor.to_f
      when "ceil" then parse_unary(tokens).ceil.to_f
      when "trunc" then parse_unary(tokens).truncate.to_f
      else raise ExpressionError, "unexpected token"
      end
    end

    # Applies a binary operator to two finite Floats. Comparison and logical
    # operators yield 1.0 or 0.0.
    def apply_binary(operator, left, right)
      value =
        case operator
        when "+" then left + right
        when "-" then left - right
        when "*" then left * right
        when "/"
          raise ExpressionError, "division by zero" if right.zero?

          left / right
        when "mod"
          # Operands are truncated to integers; the sign follows the dividend
          divisor = right.to_i
          raise ExpressionError, "division by zero" if divisor.zero?

          left.to_i.remainder(divisor).to_f
        when "^" then left**right
        # Digits are clamped so Float#round never receives a huge integer
        when "round" then left.round(right.clamp(-400.0, 400.0).to_i).to_f
        when "=" then truth(left == right)
        when "!=" then truth(left != right)
        when "<" then truth(left < right)
        when ">" then truth(left > right)
        when "<=" then truth(left <= right)
        when ">=" then truth(left >= right)
        when "and" then truth(!left.zero? && !right.zero?)
        when "or" then truth(!left.zero? || !right.zero?)
        end
      # Rejects Infinity/NaN and the Complex produced by e.g. (-8) ^ 0.5
      raise ExpressionError, "no finite real result" unless value.real? && value.finite?

      value
    end

    # Numeric encoding of a boolean used by #expr.
    def truth(flag)
      flag ? 1.0 : 0.0
    end

    # True for an optionally negative integer or decimal literal.
    def numeric?(str)
      str.match?(/\A-?\d+\.?\d*\z/)
    end

    # #len: string
    def evaluate_len(args)
      (args[0] || "").length.to_s
    end

    # #pos: string | search
    # Returns "" when +search+ does not occur.
    def evaluate_pos(args)
      str = args[0] || ""
      search = args[1] || ""
      pos = str.index(search)
      pos.nil? ? "" : pos.to_s
    end

    # #sub: string | start | length
    # A negative start counts from the end. A missing or zero length returns
    # everything from start; a negative length drops that many characters
    # from the end.
    def evaluate_sub(args)
      str = args[0] || ""
      start = (args[1] || "0").to_i
      length = (args[2] || "0").to_i

      # A negative start reaching before the beginning means "from the start"
      start = [start, -str.length].max
      tail = str[start..]
      return "" if tail.nil?

      if length.zero?
        tail
      elsif length.positive?
        tail[0, length]
      else
        tail[0...length] || ""
      end
    end

    # #replace: string | search | replace
    # An empty search term stands for a single space.
    def evaluate_replace(args)
      str = args[0] || ""
      search = args[1] || ""
      replace = args[2] || ""
      search = " " if search.empty?
      # Block form inserts +replace+ literally; a replacement string would
      # have backslash sequences such as \0 interpreted by gsub.
      str.gsub(search) { replace }
    end

    # #rpos: string | search (find last occurrence)
    # Returns "-1" when +search+ does not occur.
    def evaluate_rpos(args)
      str = args[0] || ""
      search = args[1] || ""
      pos = str.rindex(search)
      pos.nil? ? "-1" : pos.to_s
    end

    # #count: string | search (count non-overlapping occurrences)
    def evaluate_count(args)
      str = args[0] || ""
      search = args[1] || ""
      return "0" if search.empty?

      str.scan(search).length.to_s
    end

    # #explode: string | delimiter | index
    # A negative index counts from the last piece. An empty delimiter splits
    # into single characters.
    def evaluate_explode(args)
      str = args[0] || ""
      delimiter = args[1] || ""
      index = (args[2] || "0").to_i

      parts = if delimiter.empty?
                str.chars
              else
                # Literal split that keeps empty pieces: split(" ") would
                # collapse whitespace, and without the -1 limit trailing
                # empty pieces are dropped, shifting negative indexes.
                str.split(Regexp.new(Regexp.escape(delimiter)), -1)
              end
      return "" if parts.empty?

      index += parts.length if index.negative?
      return "" if index.negative? || index >= parts.length

      parts[index] || ""
    end

    # #urldecode: string
    def evaluate_urldecode(args)
      str = args[0] || ""
      require "cgi" # loaded lazily, only when this function is used
      CGI.unescape(str)
    end

    # #urlencode: string (spaces are encoded as %20)
    def evaluate_urlencode(args)
      str = args[0] || ""
      require "uri" # loaded lazily, only when this function is used
      URI.encode_www_form_component(str).gsub("+", "%20")
    end

    # #padleft: string | length | padding
    # The target length is capped at MAX_PAD_LENGTH.
    def evaluate_padleft(args)
      str = args[0] || ""
      width = [(args[1] || "0").to_i, MAX_PAD_LENGTH].min
      padding = args[2] || " "
      padding = " " if padding.empty?
      str.rjust(width, padding)
    end

    # #padright: string | length | padding
    # The target length is capped at MAX_PAD_LENGTH.
    def evaluate_padright(args)
      str = args[0] || ""
      width = [(args[1] || "0").to_i, MAX_PAD_LENGTH].min
      padding = args[2] || " "
      padding = " " if padding.empty?
      str.ljust(width, padding)
    end

    # #iferror: input | then | else
    # Without a "then" argument a non-error input is returned unchanged.
    def evaluate_iferror(args)
      input = args[0] || ""
      then_value = args[1]
      else_value = args[2] || ""
      # Error output is recognised by its markup or message text
      has_error = input.include?('class="error"') ||
                  input.include?("class='error'") ||
                  input.match?(/Expression error/i)
      if has_error
        then_value || ""
      elsif then_value.nil?
        input
      else
        else_value
      end
    end

    # #titleparts: title | parts | offset
    # +offset+ is the 1-based number of the first segment to return; a
    # negative offset counts from the last segment. A positive +parts+ keeps
    # that many segments, a negative one drops that many from the end.
    def evaluate_titleparts(args)
      title = args[0] || ""
      parts_count = (args[1] || "0").to_i
      offset = (args[2] || "0").to_i

      # The namespace prefix stays attached to the first segment
      segments = title.split("/")
      return title if segments.empty?

      offset -= 1 if offset.positive?
      offset = [offset, -segments.length].max
      segments = segments[offset..] || []

      if parts_count.positive?
        segments = segments.first(parts_count)
      elsif parts_count.negative?
        segments = segments[0...parts_count] || []
      end
      segments.join("/")
    end

    # #time: format | date
    # Uses the reference date when no date is given; returns "" when the date
    # cannot be parsed.
    def evaluate_time(args)
      format_str = args[0] || ""
      date_str = args[1]
      time = if date_str && !date_str.strip.empty?
               parse_date(date_str.strip)
             else
               @reference_date
             end
      return "" unless time

      format_time(time, format_str)
    end

    # Parses +str+ with the first matching entry of DATE_FORMATS whose year
    # falls within 1..9999. Returns nil when nothing matches.
    def parse_date(str)
      return nil if str.nil? || str.strip.empty?

      candidate = str.strip
      DATE_FORMATS.each do |fmt|
        begin
          time = Time.strptime(candidate, fmt)
          return time if time.year.between?(1, 9999)
        rescue ArgumentError, RangeError
          next
        end
      end
      nil
    rescue StandardError
      # Any other failure while parsing is treated as "no date"
      nil
    end

    # Renders +time+ using PHP-date style format codes. A backslash makes the
    # next character literal and text in double quotes is copied verbatim; an
    # unmatched quote or a trailing backslash is output as is.
    def format_time(time, format_str)
      result = +""
      i = 0
      while i < format_str.length
        c = format_str[i]
        if c == "\\"
          result << (format_str[i + 1] || c)
          i += 2
        elsif c == '"' && (closing = format_str.index('"', i + 1))
          result << format_str[(i + 1)...closing]
          i = closing + 1
        else
          result << format_time_code(time, c)
          i += 1
        end
      end
      result
    end

    # Expands one format code; unknown characters are returned unchanged.
    def format_time_code(time, code)
      case code
      # Year
      when "Y" then time.year.to_s
      when "y" then (time.year % 100).to_s.rjust(2, "0")
      # Month
      when "m" then time.month.to_s.rjust(2, "0")
      when "n" then time.month.to_s
      when "F" then MONTH_NAMES[time.month - 1]
      when "M" then MONTH_NAMES[time.month - 1][0, 3]
      # Day
      when "d" then time.day.to_s.rjust(2, "0")
      when "j" then time.day.to_s
      when "S" then ordinal_suffix(time.day)
      # Day of week
      when "l" then DAY_NAMES[time.wday]
      when "D" then DAY_NAMES[time.wday][0, 3]
      when "N" then (time.wday.zero? ? 7 : time.wday).to_s
      when "w" then time.wday.to_s
      # Week
      when "W" then time.strftime("%V")
      # Hour
      when "H" then time.hour.to_s.rjust(2, "0")
      when "G" then time.hour.to_s
      when "g" then ((time.hour % 12).zero? ? 12 : time.hour % 12).to_s
      when "h" then ((time.hour % 12).zero? ? 12 : time.hour % 12).to_s.rjust(2, "0")
      # Minute/Second
      when "i" then time.min.to_s.rjust(2, "0")
      when "s" then time.sec.to_s.rjust(2, "0")
      # AM/PM
      when "a" then time.hour < 12 ? "am" : "pm"
      when "A" then time.hour < 12 ? "AM" : "PM"
      # Timezone
      when "T" then time.strftime("%Z")
      when "O" then time.strftime("%z")
      # Unix timestamp
      when "U" then time.to_i.to_s
      else code
      end
    end

    # English ordinal suffix for a day number ("st", "nd", "rd" or "th").
    def ordinal_suffix(day)
      return "th" if (11..13).cover?(day % 100)

      case day % 10
      when 1 then "st"
      when 2 then "nd"
      when 3 then "rd"
      else "th"
      end
    end
  end
end