RubyGems - wp2txt - Versions diffs - 0.4.1 - Mend

wp2txt 0.4.1

Files changed (17) hide show

@@ -0,0 +1,430 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+require 'strscan'
+require 'find'
+require 'sanitize'
+module Wp2txt
+  def format_wiki(original_text, has_retried = false)
+    begin
+      text = original_text + ""
+      text = chrref_to_utf(text)
+      text = escape_nowiki(text)
+      text = process_interwiki_links(text)
+      text = process_external_links(text)
+      text = remove_directive(text)
+      text = remove_emphasis(text)
+      text = mndash(text)
+      text = make_reference(text)
+      text = format_ref(text)
+      text = remove_hr(text)
+      text = remove_tag(text)
+      text = special_chr(text)
+      unescape_nowiki(text)
+    rescue # detect invalid byte sequence in UTF-8
+      if has_retried
+        puts "invalid byte sequence detected"
+        puts "******************************"
+        File.open("error_log.txt", "w") do |f|
+          f.write original_text
+        end
+        exit
+      else
+        fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
+        return format_wiki(fixed_text, true)
+      end
+    end
+  end
+  #################### parser for nested structure ####################
+  def process_nested_structure(scanner, left, right, &block)
+    buffer = ""
+    while str = scanner.scan_until(/(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m)
+      # begin
+      case scanner[1]
+      when left
+        buffer << str
+        has_left = true
+      when right
+        if has_left
+          buffer = buffer[0...-(left.size)]
+          contents = block.call(str[0...-(left.size)])
+          buffer << contents
+          break
+        else
+          buffer << str
+        end
+      end
+    end
+    buffer << scanner.rest
+    if buffer == scanner.string
+      return scanner.string
+    else
+      scanner.string = buffer
+      return process_nested_structure(scanner, left, right, &block) || ""
+    end
+  end
+  def remove_templates(str, only_not_inline = true)
+    scanner = StringScanner.new(str)
+    result = process_nested_structure(scanner, "{{", "}}") do |contents|
+      if contents.index("\n")
+        "\n"
+      else
+        "[tpl]#{contents}[/tpl]"
+      end
+    end
+  end
+  #################### methods used from format_wiki ####################
+  def escape_nowiki(str)
+    if @nowikis
+      @nowikis.clear
+    else
+      @nowikis = {}
+    end
+    str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
+      nowiki = $1
+      nowiki_id = nowiki.object_id
+      @nowikis[nowiki_id] = nowiki
+      "<nowiki-#{nowiki_id}>"
+    end
+  end
+  def unescape_nowiki(str)
+    str.gsub(/<nowiki\-(\d+?)>/) do
+      obj_id = $1.to_i
+      @nowikis[obj_id]
+    end
+  end
+  def process_interwiki_links(str)
+    scanner = StringScanner.new(str)
+    result = process_nested_structure(scanner, "[[", "]]") do |contents|
+      str_new = ""
+      parts = contents.split("|")
+      case parts.size
+      when 1
+        parts.first || ""
+      else
+        parts.shift
+        parts.join("|")
+      end
+    end
+    result
+  end
+  def process_external_links(str)
+    scanner = StringScanner.new(str)
+    result = process_nested_structure(scanner, "[", "]") do |contents|
+      parts = contents.split(" ", 2)
+      case parts.size
+      when 1
+        parts.first || ""
+      else
+        parts.last || ""
+      end
+    end
+    result
+  end
+  # Replace the named HTML character entities listed below (e.g. "&amp;",
+  # "&eacute;", "&alpha;") with the corresponding characters.
+  #
+  # str - String to convert; it is modified in place (gsub!), so it must not
+  #       be frozen.
+  #
+  # Returns the same String object that was passed in.
+  def special_chr(str)
+    # The entity table and the matching regex are built on the first call only
+    # and kept in @sp_hash / @sp_regex for later calls.
+    unless @sp_hash
+      html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
+      .zip([' ', '<', '>', '&', '"'])
+      umraut_accent = ['&Agrave;', '&Aacute;', '&Acirc;', '&Atilde;', '&Auml;',
+      '&Aring;', '&AElig;', '&Ccedil;', '&Egrave;', '&Eacute;', '&Ecirc;',
+      '&Euml;', '&Igrave;', '&Iacute;', '&Icirc;', '&Iuml;', '&Ntilde;',
+      '&Ograve;', '&Oacute;', '&Ocirc;', '&Otilde;', '&Ouml;', '&Oslash;',
+      '&Ugrave;', '&Uacute;', '&Ucirc;', '&Uuml;', '&szlig;', '&agrave;',
+      '&aacute;', '&acirc;', '&atilde;', '&auml;', '&aring;', '&aelig;',
+      '&ccedil;', '&egrave;', '&eacute;', '&ecirc;', '&euml;', '&igrave;',
+      '&iacute;', '&icirc;', '&iuml;', '&ntilde;', '&ograve;', '&oacute;',
+      '&ocirc;', '&oelig;', '&otilde;', '&ouml;', '&oslash;', '&ugrave;',
+      '&uacute;', '&ucirc;', '&uuml;', '&yuml;']\
+      .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
+      'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
+      'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
+      'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
+      punctuation = ['&iquest;', '&iexcl;', '&laquo;', '&raquo;', '&sect;',
+      '&para;', '&dagger;', '&Dagger;', '&bull;', '&ndash;', '&mdash;']\
+      .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
+      commercial = ['&trade;', '&copy;', '&reg;', '&cent;', '&euro;', '&yen;',
+      '&pound;', '&curren;'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
+      greek_chr = ['&alpha;', '&beta;', '&gamma;', '&delta;', '&epsilon;',
+      '&zeta;', '&eta;', '&theta;', '&iota;', '&kappa;', '&lambda;', '&mu;',
+      '&nu;', '&xi;', '&omicron;', '&pi;', '&rho;', '&sigma;', '&sigmaf;',
+      '&tau;', '&upsilon;', '&phi;', '&chi;', '&psi;', '&omega;', '&Gamma;',
+      '&Delta;', '&Theta;', '&Lambda;', '&Xi;', '&Pi;', '&Sigma;', '&Phi;',
+      '&Psi;', '&Omega;']\
+      .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
+      'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
+      'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
+      # '&oslash;' is listed here as well as in umraut_accent; both entries map
+      # to 'ø', so the duplicate key in the hash below is harmless.
+      math_chr1 = ['&int;', '&sum;', '&prod;', '&radic;', '&minus;', '&plusmn;',
+      '&infin;', '&asymp;', '&prop;', '&equiv;', '&ne;', '&le;', '&ge;',
+      '&times;', '&middot;', '&divide;', '&part;', '&prime;', '&Prime;',
+      '&nabla;', '&permil;', '&deg;', '&there4;', '&oslash;', '&isin;', '&cap;',
+      '&cup;', '&sub;', '&sup;', '&sube;', '&supe;', '&not;', '&and;', '&or;',
+      '&exist;', '&forall;', '&rArr;', '&hArr;', '&rarr;', '&harr;', '&uarr;']\
+      .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
+      '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
+      '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
+      '⇔', '→', '↔', '↑'])
+      math_chr2 = ['&alefsym;', '&notin;'].zip(['ℵ', '∉'])
+      others = ['&uml;', '&ordf;',
+      '&macr;', '&acute;', '&micro;', '&cedil;', '&ordm;', '&lsquo;', '&rsquo;',
+      '&ldquo;', '&sbquo;', '&rdquo;', '&bdquo;', '&spades;', '&clubs;', '&loz;',
+      '&hearts;', '&larr;', '&diams;', '&lsaquo;', '&rsaquo;', '&darr;']\
+      .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
+      '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
+      # Each group above is an array of [entity, character] pairs; flatten them
+      # into one entity => character hash and one alternation regex.
+      spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
+                  math_chr1 + math_chr2 + others
+      @sp_hash  = Hash[*spc_array.flatten]
+      @sp_regex = Regexp.new("(" + @sp_hash.keys.join("|") + ")")
+    end
+    #str.gsub!("&amp;"){'&'}
+    # Single pass: text produced by a replacement is not scanned again, so
+    # "&amp;lt;" becomes "&lt;" rather than "<".
+    str.gsub!(@sp_regex) do
+      @sp_hash[$1]
+    end
+    return str
+  end
+  def remove_tag(str, tagset = ['<', '>'])
+    if tagset == ['<', '>']
+      return remove_html_tag(str)
+    end
+    tagsets = Regexp.quote(tagset.uniq.join(""))
+    regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
+    newstr = str.gsub(regex, "")
+    # newstr = newstr.gsub(/<\!\-\-.*?\-\->/, "")
+    return newstr
+  end
+  def remove_html_tag(str)
+    str = ::Sanitize.clean(str)
+  end
+  def remove_emphasis(str)
+    str.gsub(/(''+)(.+?)\1/) do
+      $2
+    end
+  end
+  def chrref_to_utf(num_str)
+    begin
+      utf_str = num_str.gsub(/&#(x?)([0-9a-fA-F]+);/) do
+        if $1 == 'x'
+          ch = $2.to_i(16)
+        else
+          ch = $2.to_i
+        end
+        hi = ch>>8
+        lo = ch&0xff
+        u = "\377\376" << lo.chr << hi.chr
+        u.encode("UTF-8", "UTF-16")
+      end
+    rescue StandardError
+      return num_str
+    end
+    return utf_str
+  end
+  def remove_directive(str)
+    remove_tag(str, ['__', '__'])
+  end
+  def mndash(str)
+    str = str.gsub(/\{(mdash|ndash|–)\}/, "–")
+  end
+  def remove_hr(page)
+    page = page.gsub(/^\s*\-+\s*$/, "")
+  end
+  def make_reference(str)
+    new_str = str.dup
+    new_str.gsub!(/<br ?\/>/, "\n")
+    new_str.gsub!(/<ref[^>]*\/>/, "")
+    new_str.gsub!(/<ref[^>]*>/, "[ref]")
+    new_str.gsub!(/<\/ref>/, "[/ref]")
+    return new_str
+  end
+  def format_ref(page)
+    page = page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
+      ref = $1.dup
+      ref.gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
+    end
+  end
+  #################### methods currently unused ####################
+  def process_template(str)
+    scanner = StringScanner.new(str)
+    result = process_nested_structure(scanner, "{{", "}}") do |contents|
+      parts = contents.split("|")
+      case parts.size
+      when 0
+        ""
+      when 1
+        parts.first || ""
+      else
+        if parts.last.split("=").size > 1
+          parts.first || ""
+        else
+          parts.last || ""
+        end
+      end
+    end
+    result
+  end
+  def remove_table(str)
+    new_str = str.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
+    if str != new_str
+      new_str = remove_table(new_str)
+    end
+    new_str = remove_table(new_str) unless str == new_str
+    return new_str
+  end
+  def remove_clade(page)
+    new_page = page.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
+    new_page = remove_clade(new_page) unless page == new_page
+    new_page
+  end
+  def remove_inline_template(str)
+    str.gsub(/\{\{(.*?)\}\}/) do
+       key = $1
+       if /\A[^\|]+\z/ =~ key
+         result = key
+       else
+         info = key.split("|")
+         type_code = info.first
+         case type_code
+         when /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
+              /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
+              /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
+           out = info[-1]
+         else
+           out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
+         end
+         result = out
+       end
+     end
+  end
+  #################### file related utilities ####################
+  # collect filenames recursively
+  def collect_files(str, regex = nil)
+    regex ||= //
+    text_array = Array.new
+    Find.find(str) do |f|
+      text_array << f if regex =~ f
+    end
+    text_array.sort
+  end
+  # modify a file using block/yield mechanism
+  def file_mod(file_path, backup = false, &block)
+    File.open(file_path, "r") do |fr|
+      str = fr.read
+      newstr = yield(str)
+      str = newstr unless newstr == nil
+      File.open("temp", "w") do |tf|
+        tf.write(str)
+      end
+    end
+    File.rename(file_path, file_path + ".bak")
+    File.rename("temp", file_path)
+    File.unlink(file_path + ".bak") unless backup
+  end
+  # modify files under a directry (recursive)
+  def batch_file_mod(dir_path, &block)
+    if FileTest.directory?(dir_path)
+      collect_files(dir_path).each do |file|
+        yield file if FileTest.file?(file)
+      end
+    else
+      yield dir_path if FileTest.file?(dir_path)
+    end
+  end
+  # take care of difference of separators among environments
+  def correct_separator(input)
+    if input.is_a?(String)
+      ret_str = String.new
+      if RUBY_PLATFORM.index("win32")
+        ret_str = input.gsub("/", "\\")
+      else
+        ret_str = input.gsub("\\", "/")
+      end
+      return ret_str
+    elsif input.is_a?(Array)
+      ret_array = Array.new
+      input.each do |item|
+        ret_array << correct_separator(item)
+      end
+      return ret_array
+    end
+  end
+  def rename(files)
+    # num of digits necessary to name the last file generated
+    maxwidth = 0
+    files.each do |f|
+      width = f.slice(/\-(\d+)\z/, 1).to_s.length.to_i
+      maxwidth = width if maxwidth < width
+    end
+    files.each do |f|
+      newname= f.sub(/\-(\d+)\z/) do
+        "-" + sprintf("%0#{maxwidth}d", $1.to_i)
+      end
+      File.rename(f, newname + ".txt")
+    end
+  end
+  # convert int of seconds to string in the format 00:00:00
+  def sec_to_str(int)
+    unless int
+      str = "--:--:--"
+      return str
+    end
+    h = int / 3600
+    m = (int - h * 3600) / 60
+    s = int % 60
+    str = sprintf("%02d:%02d:%02d", h, m, s)
+    return str
+  end
+  def decimal_format(i)
+    str = i.to_s.reverse
+    return str.scan(/.?.?./).join(',').reverse
+  end
+end

data/lib/wp2txt/version.rb ADDED

@@ -0,0 +1,3 @@
+module Wp2txt
+  VERSION = "0.4.1"
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,6 @@
+# Put the gem's lib directory (one level above this spec directory) at the
+# front of the load path so the specs can require the library files.
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'rspec'
+# No custom RSpec settings are applied; the block is an empty placeholder.
+RSpec.configure do |config|
+  # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
+end

data/spec/utils_spec.rb ADDED

@@ -0,0 +1,195 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
+require 'wp2txt'
+require 'wp2txt/article'
+require 'wp2txt/utils'
+# Examples for the text-processing helpers mixed in from the Wp2txt module.
+# They use the `should` expectation syntax of RSpec.
+describe "Wp2txt" do
+  # Empty example; it only contributes a line to the spec output.
+  it "contains mediawiki-format related functions:" do
+  end
+  # Make the Wp2txt helper methods callable directly inside the examples.
+  include Wp2txt
+  before do
+  end
+  # Nested delimiters are rewritten from the inside out; the second case is a
+  # template that spans two lines and contains a nested {{...}}.
+  describe "process_nested_structure" do
+    it "parse nested structure replacing str in the format specified" do
+      str_before = "[[ab[[cde[[alfa]]]]fg]]"
+      str_after  = "<<ab<<cde<<alfa>>>>fg>>"
+      scanner = StringScanner.new(str_before)
+      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
+        "<<" + content + ">>"
+      end
+      str_processed.should == str_after
+      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
+      |passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
+      str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
+      |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
+      scanner = StringScanner.new(str_before)
+      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
+        "<<" + content + ">>"
+      end
+      str_processed.should == str_after
+    end
+  end
+  # Named HTML entities.
+  describe "special_chr" do
+    it "replaces character references with real characters" do
+      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
+      str_after  = "  < > & \""
+      special_chr(str_before).should == str_after
+    end
+  end
+  # Numeric character references; only a hexadecimal BMP value is covered.
+  describe "chrref_to_utf" do
+    it "replaces character references with real characters" do
+      str_before = "&#x266A;"
+      str_after  = "♪"
+      chrref_to_utf(str_before).should == str_after
+    end
+  end
+  describe "mndash" do
+    it "replaces {mdash}, {ndash}, or {–} with '–'" do
+      str_before = "{mdash} {ndash} {–}"
+      str_after  = "– – –"
+      mndash(str_before).should == str_after
+    end
+  end
+  describe "format_ref" do
+    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
+      str_before = "[ref]...\r\n...<br />...[/ref]"
+      str_after  = "... ... ..."
+      format_ref(str_before).should == str_after
+    end
+  end
+  describe "make_reference" do
+    it "replaces <ref> tag with [ref]" do
+      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
+      str_after  = "[ref] ... \n ... [/ref] \n "
+      make_reference(str_before).should == str_after
+    end
+  end
+  # The input contains a table nested inside another table.
+  describe "remove_table" do
+    it "removes table formated parts" do
+      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
+      str_after  = ""
+      remove_table(str_before).should == str_after
+    end
+  end
+  describe "remove_clade" do
+    it "removes clade formated parts" do
+      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
+      str_after  = ""
+      remove_clade(str_before).should == str_after
+    end
+  end
+  describe "remove_hr" do
+    it "removes horizontal lines" do
+      str_before = "\n----\n--\n--\n"
+      str_after  = "\n\n"
+      remove_hr(str_before).should == str_after
+    end
+  end
+  # Covers both the default '<' '>' tag set and a custom '[' ']' one.
+  describe "remove_tag" do
+    it "removes tags" do
+      str_before = "<tag>abc</tag>"
+      str_after  = "abc"
+      remove_tag(str_before).should == str_after
+      str_before = "[tag]def[/tag]"
+      str_after  = "def"
+      remove_tag(str_before, ['[', ']']).should == str_after
+    end
+  end
+  describe "remove_directive" do
+    it "removes directive" do
+      str_before = "__abc__\n __def__"
+      str_after  = "\n "
+      remove_directive(str_before).should == str_after
+    end
+  end
+  # NOTE(review): the example title below says "removes directive" although
+  # the example exercises remove_emphasis - looks copied from the one above.
+  describe "remove_emphasis" do
+    it "removes directive" do
+      str_before = "''abc''\n'''def'''"
+      str_after  = "abc\ndef"
+      remove_emphasis(str_before).should == str_after
+    end
+  end
+  # Only the shape of the placeholders is checked, not the stored text.
+  describe "escape_nowiki" do
+    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
+      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
+      str_after  = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
+      escape_nowiki(str_before).should =~ str_after
+    end
+  end
+  # @nowikis is filled by hand here instead of through escape_nowiki.
+  describe "unescape_nowiki" do
+    it "replaces <nowiki-object_id> with string stored elsewhere" do
+      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
+      str_before = "<nowiki-123>def<nowiki-124>"
+      str_after  = "[[abc]]def[[ghi]]"
+      unescape_nowiki(str_before).should == str_after
+    end
+  end
+  describe "process_interwiki_links" do
+    it "formats text link and remove brackets" do
+      process_interwiki_links("[[a b]]").should   == "a b"
+      process_interwiki_links("[[a b|c]]").should == "c"
+      process_interwiki_links("[[a|b|c]]").should == "b|c"
+      process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
+    end
+  end
+  # The last expectation feeds text with unmatched closing brackets and
+  # expects it back unchanged.
+  describe "process_external_links" do
+    it "formats text link and remove brackets" do
+      process_external_links("[http://yohasebe.com yohasebe.com]").should   == "yohasebe.com"
+      process_external_links("[http://yohasebe.com]").should   == "http://yohasebe.com"
+      process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
+    end
+  end
+  describe "process_template" do
+    it "removes brackets and leaving some text" do
+      str_before = "{{}}"
+      str_after = ""
+      process_template(str_before).should == str_after
+      str_before = "{{lang|en|Japan}}"
+      str_after  = "Japan"
+      process_template(str_before).should == str_after
+      str_before = "{{a|b=c|d=f}}"
+      str_after  = "a"
+      process_template(str_before).should == str_after
+      str_before = "{{a|b|{{c|d|e}}}}"
+      str_after  = "e"
+      process_template(str_before).should == str_after
+    end
+  end
+  # NOTE(review): expand_template is not defined in the utils code shown in
+  # this diff (presumably it comes from another required file), and the URI
+  # suggests this example talks to the live Wiktionary API - confirm before
+  # relying on it in an offline test run.
+  describe "expand_template" do
+    it "gets data corresponding to a given template using mediawiki api" do
+      uri = "http://en.wiktionary.org/w/api.php"
+      template = "{{en-verb}}"
+      word = "kick"
+      expanded = expand_template(uri, template, word)
+      html =<<EOD
+<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
+EOD
+      html.strip!
+      expanded.should == html
+    end
+  end
+end