RubyGems - wp2txt - Versions diffs - 1.0.2 → 1.1.0 - Mend

wp2txt 1.0.2 → 1.1.0

Files changed (21)

data/lib/wp2txt/article.rb CHANGED

@@ -1,62 +1,54 @@
-#!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
-$: << File.join(File.dirname(__FILE__))
+# frozen_string_literal: true
 require 'strscan'
-require 'utils'
+require_relative 'utils'
 module Wp2txt
   # possible element type, which could be later chosen to print or not to print
-    # :mw_heading
-    # :mw_htable
-    # :mw_quote
-    # :mw_unordered
-    # :mw_ordered
-    # :mw_definition
-    # :mw_pre
-    # :mw_paragraph
-    # :mw_comment
-    # :mw_math
-    # :mw_source
-    # :mw_inputbox
-    # :mw_template
-    # :mw_link
-    # :mw_summary
-    # :mw_blank
-    # :mw_redirect
+  # :mw_heading
+  # :mw_htable
+  # :mw_quote
+  # :mw_unordered
+  # :mw_ordered
+  # :mw_definition
+  # :mw_pre
+  # :mw_paragraph
+  # :mw_comment
+  # :mw_math
+  # :mw_source
+  # :mw_inputbox
+  # :mw_template
+  # :mw_link
+  # :mw_summary
+  # :mw_blank
+  # :mw_redirect
   # an article contains elements, each of which is [TYPE, string]
   class Article
     include Wp2txt
     attr_accessor :elements, :title, :categories
     def initialize(text, title = "", strip_tmarker = false)
       @title = title.strip
       @strip_tmarker = strip_tmarker
-      convert_characters!(text)
-      text.gsub!(/\|\n\n+/m){"|\n"}
-      remove_html!(text)
-      make_reference!(text)
-      remove_ref!(text)
+      text = convert_characters(text)
+      text = text.gsub(/\|\n\n+/m) { "|\n" }
+      text = remove_html(text)
+      text = make_reference(text)
+      text = remove_ref(text)
       parse text
     end
-    def create_element(tp, text)
-      [tp, text]
+    def create_element(tpx, text)
+      [tpx, text]
     end
     def parse(source)
       @elements = []
-      @categories  = []
+      @categories = []
       mode = nil
-      open_stack  = []
-      close_stack = []
       source.each_line do |line|
-        matched = line.scan($category_regex)
+        matched = line.scan(CATEGORY_REGEX)
         if matched && !matched.empty?
           @categories += matched
           @categories.uniq!
@@ -65,108 +57,94 @@ module Wp2txt
         case mode
         when :mw_ml_template
           scanner = StringScanner.new(line)
-          str= process_nested_structure(scanner, "{{", "}}") {""}
-          if $ml_template_end_regex =~ str
-            mode = nil
-          end
+          str = process_nested_structure(scanner, "{{", "}}") { "" }
+          mode = nil if ML_TEMPLATE_END_REGEX =~ str
           @elements.last.last << line
           next
         when :mw_ml_link
           scanner = StringScanner.new(line)
-          str= process_nested_structure(scanner, "[[", "]]") {""}
-          if $ml_link_end_regex =~ str
-            mode = nil
-          end
+          str = process_nested_structure(scanner, "[[", "]]") { "" }
+          mode = nil if ML_LINK_END_REGEX =~ str
           @elements.last.last << line
           next
         when :mw_table
-          if $in_table_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_TABLE_REGEX2 =~ line
           @elements.last.last << line
-          next
+          next
         when :mw_inputbox
-          if $in_inputbox_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_INPUTBOX_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_source
-          if $in_source_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_SOURCE_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_math
-          if $in_math_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_MATH_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_htable
-          if $in_html_table_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_HTML_TABLE_REGEX2 =~ line
           @elements.last.last << line
           next
         end
         case line
-        when $isolated_template_regex
+        when ISOLATED_TEMPLATE_REGEX
           @elements << create_element(:mw_isolated_template, line)
-        when $isolated_tag_regex
+        when ISOLATED_TAG_REGEX
           @elements << create_element(:mw_isolated_tag, line)
-        when $blank_line_regex
-          @elements << create_element(:mw_blank, "\n")
-        when $redirect_regex
+        when BLANK_LINE_REGEX
+          @elements << create_element(:mw_blank, "\n")
+        when REDIRECT_REGEX
           @elements << create_element(:mw_redirect, line)
-        when $in_heading_regex
-          line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
+        when IN_HEADING_REGEX
+          line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
           @elements << create_element(:mw_heading, "\n" + line + "\n")
-        when $in_inputbox_regex
+        when IN_INPUTBOX_REGEX
           @elements << create_element(:mw_inputbox, line)
-        when $ml_template_onset_regex
+        when ML_TEMPLATE_ONSET_REGEX
           @elements << create_element(:mw_ml_template, line)
           mode = :mw_ml_template
-        when $ml_link_onset_regex
+        when ML_LINK_ONSET_REGEX
           @elements << create_element(:mw_ml_link, line)
           mode = :mw_ml_link
-        when $in_inputbox_regex1
+        when IN_INPUTBOX_REGEX1
           mode = :mw_inputbox
           @elements << create_element(:mw_inputbox, line)
-        when $in_source_regex
-        @elements << create_element(:mw_source, line)
-        when $in_source_regex1
+        when IN_SOURCE_REGEX
+          @elements << create_element(:mw_source, line)
+        when IN_SOURCE_REGEX1
           mode = :mw_source
           @elements << create_element(:mw_source, line)
-        when $in_math_regex
+        when IN_MATH_REGEX
           @elements << create_element(:mw_math, line)
-        when $in_math_regex1
+        when IN_MATH_REGEX1
           mode = :mw_math
           @elements << create_element(:mw_math, line)
-        when $in_html_table_regex
+        when IN_HTML_TABLE_REGEX
           @elements << create_element(:mw_htable, line)
-        when $in_html_table_regex1
+        when IN_HTML_TABLE_REGEX1
           mode = :mw_htable
           @elements << create_element(:mw_htable, line)
-        when $in_table_regex1
+        when IN_TABLE_REGEX1
           mode = :mw_table
           @elements << create_element(:mw_table, line)
-        when $in_unordered_regex
-          line = line.sub($list_marks_regex, "") if @strip_tmarker
+        when IN_UNORDERED_REGEX
+          line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_unordered, line)
-        when $in_ordered_regex
-          line = line.sub($list_marks_regex, "") if @strip_tmarker
+        when IN_ORDERED_REGEX
+          line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_ordered, line)
-        when $in_pre_regex
-          line = line.sub($pre_marks_regex, "") if @strip_tmarker
+        when IN_PRE_REGEX
+          line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_pre, line)
-        when $in_definition_regex
-          line = line.sub($def_marks_regex, "") if @strip_tmarker
+        when IN_DEFINITION_REGEX
+          line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_definition, line)
-        when $in_link_regex
+        when IN_LINK_REGEX
           @elements << create_element(:mw_link, line)
-        else
+        else
           @elements << create_element(:mw_paragraph, "\n" + line)
         end
       end

data/lib/wp2txt/regex.rb ADDED

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+require "htmlentities"
+module Wp2txt
+  ###################################################
+  # variables to save resource for generating regexps
+  # those with a trailing number 1 represent opening tag/markup
+  # those with a trailing number 2 represent closing tag/markup
+  # those without a trailing number contain both opening/closing tags/markups
+  HTML_DECODER = HTMLEntities.new
+  ENTITIES = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
+  HTML_HASH = Hash[*ENTITIES.flatten]
+  HTML_REGEX = Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
+  ML_TEMPLATE_ONSET_REGEX = Regexp.new('^\{\{[^\}]*$')
+  ML_TEMPLATE_END_REGEX = Regexp.new('\}\}\s*$')
+  ML_LINK_ONSET_REGEX = Regexp.new('^\[\[[^\]]*$')
+  ML_LINK_END_REGEX = Regexp.new('\]\]\s*$')
+  ISOLATED_TEMPLATE_REGEX = Regexp.new('^\s*\{\{.+\}\}\s*$')
+  ISOLATED_TAG_REGEX = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
+  IN_LINK_REGEX = Regexp.new('^\s*\[.*\]\s*$')
+  IN_INPUTBOX_REGEX = Regexp.new('<inputbox>.*?<\/inputbox>')
+  IN_INPUTBOX_REGEX1 = Regexp.new('<inputbox>')
+  IN_INPUTBOX_REGEX2 = Regexp.new('<\/inputbox>')
+  IN_SOURCE_REGEX = Regexp.new('<source.*?>.*?<\/source>')
+  IN_SOURCE_REGEX1 = Regexp.new('<source.*?>')
+  IN_SOURCE_REGEX2 = Regexp.new('<\/source>')
+  IN_MATH_REGEX = Regexp.new('<math.*?>.*?<\/math>')
+  IN_MATH_REGEX1 = Regexp.new('<math.*?>')
+  IN_MATH_REGEX2 = Regexp.new('<\/math>')
+  IN_HEADING_REGEX = Regexp.new('^=+.*?=+$')
+  IN_HTML_TABLE_REGEX = Regexp.new("<table.*?><\/table>")
+  IN_HTML_TABLE_REGEX1 = Regexp.new('<table\b')
+  IN_HTML_TABLE_REGEX2 = Regexp.new('<\/\s*table>')
+  IN_TABLE_REGEX1 = Regexp.new('^\s*\{\|')
+  IN_TABLE_REGEX2 = Regexp.new('^\|\}.*?$')
+  IN_UNORDERED_REGEX = Regexp.new('^\*')
+  IN_ORDERED_REGEX = Regexp.new('^\#')
+  IN_PRE_REGEX = Regexp.new('^ ')
+  IN_DEFINITION_REGEX = Regexp.new('^[\;\:]')
+  BLANK_LINE_REGEX = Regexp.new('^\s*$')
+  REDIRECT_REGEX = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
+  REMOVE_TAG_REGEX = Regexp.new("\<[^\<\>]*\>")
+  REMOVE_DIRECTIVES_REGEX = Regexp.new("\_\_[^\_]*\_\_")
+  REMOVE_EMPHASIS_REGEX = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
+  CHRREF_TO_UTF_REGEX = Regexp.new('&#(x?)([0-9a-fA-F]+);')
+  MNDASH_REGEX = Regexp.new('\{(mdash|ndash|–)\}')
+  REMOVE_HR_REGEX = Regexp.new('^\s*\-+\s*$')
+  MAKE_REFERENCE_REGEX_A = Regexp.new('<br ?\/>')
+  MAKE_REFERENCE_REGEX_B = Regexp.new('<ref[^>]*\/>')
+  MAKE_REFERENCE_REGEX_C = Regexp.new('<ref[^>]*>')
+  MAKE_REFERENCE_REGEX_D = Regexp.new('<\/ref>')
+  FORMAT_REF_REGEX = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
+  HEADING_ONSET_REGEX = Regexp.new('^(\=+)\s+')
+  HEADING_CODA_REGEX = Regexp.new('\s+(\=+)$')
+  LIST_MARKS_REGEX = Regexp.new('\A[\*\#\;\:\ ]+')
+  PRE_MARKS_REGEX = Regexp.new('\A\^\ ')
+  DEF_MARKS_REGEX = Regexp.new('\A[\;\:\ ]+')
+  ONSET_BAR_REGEX = Regexp.new('\A[^\|]+\z')
+  CATEGORY_PATTERNS = ["Category", "Categoria"].join("|")
+  CATEGORY_REGEX = Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
+  ESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
+  UNESCAPE_NOWIKI_REGEX = Regexp.new('<nowiki\-(\d+?)>')
+  REMOVE_ISOLATED_REGEX = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
+  REMOVE_INLINE_REGEX = Regexp.new('\{\{(.*?)\}\}')
+  TYPE_CODE_REGEX = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
+  SINGLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
+  DOUBLE_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
+  SINGLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
+  DOUBLE_CURLY_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
+  CURLY_SQUARE_BRACKET_REGEX = Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
+  COMPLEX_REGEX_01 = Regexp.new('\<\<([^<>]++)\>\>\s?')
+  COMPLEX_REGEX_02 = Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
+  COMPLEX_REGEX_03 = Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
+  COMPLEX_REGEX_04 = Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
+  COMPLEX_REGEX_05 = Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
+  CLEANUP_REGEX_01 = Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
+  CLEANUP_REGEX_02 = Regexp.new('^File:.+$')
+  CLEANUP_REGEX_03 = Regexp.new('^\|.*$')
+  CLEANUP_REGEX_04 = Regexp.new('\{\{.*$')
+  CLEANUP_REGEX_05 = Regexp.new('^.*\}\}')
+  CLEANUP_REGEX_06 = Regexp.new('\{\|.*$')
+  CLEANUP_REGEX_07 = Regexp.new('^.*\|\}')
+  CLEANUP_REGEX_08 = Regexp.new('\n\n\n+', Regexp::MULTILINE)
+end