RubyGems - wp2txt - Versions diffs - 1.0.1 → 1.1.0 - Mend

wp2txt 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/bin/wp2txt CHANGED Viewed

@@ -1,197 +1,192 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
-$: << File.join(File.dirname(__FILE__))
-$: << File.join(File.dirname(__FILE__), '..', 'lib')
-$DEBUG_MODE = false
-SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
-DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
-require 'wp2txt'
-require 'wp2txt/utils'
-require 'wp2txt/version'
-require 'etc'
-require 'optimist'
-require 'parallel'
-require 'pastel'
-require 'tty-spinner'
-include Wp2txt
-opts = Optimist::options do
-  version Wp2txt::VERSION
-  banner <<-EOS
-WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
-Usage: wp2txt [options]
-where [options] are:
-EOS
-  opt :input,  "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", :required => true, :short => "-i"
-  opt :output_dir,  "Path to output directory", :default => Dir::pwd, :type => String, :short => "-o"
-  opt :convert,     "Output in plain text (converting from XML)", :default => true, :short => "-c"
-  opt :category, "Show article category information", :default => true, :short => "-a"
-  opt :category_only, "Extract only article title and categories", :default => false, :short => "-g"
-  opt :summary_only, "Extract only article title, categories, and summary text before first heading", :default => false, :short => "-s"
-  opt :file_size,   "Approximate size (in MB) of each output file", :default => 10, :short => "-f"
-  opt :num_procs,   "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", :short => "-n"
-  opt :del_interfile,   "Delete intermediate XML files from output dir", :short => "-x", :default => false
-  opt :title,   "Keep page titles in output", :default => true, :short => "-t"
-  opt :heading, "Keep section titles in output", :default => true, :short => "-d"
-  opt :list,    "Keep unprocessed list items in output", :default => false, :short => "-l"
-  opt :ref, "Keep reference notations in the format [ref]...[/ref]", :default => false, :short => "-r"
-  opt :redirect, "Show redirect destination", :default => false, :short => "-e"
-  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true, :short => "-m"
-  opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", :default => false, :short => "-b"
-end
-Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
-Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
-pastel = Pastel.new
-input_file = ARGV[0]
-output_dir = opts[:output_dir]
-tfile_size = opts[:file_size]
-num_processors = Etc.nprocessors
-if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
-  num_processes = opts[:num_procs]
-else
-  num_processes = num_processors - 2
-end
-num_processes = 1 if num_processes < 1
-convert = opts[:convert]
-strip_tmarker = opts[:marker] ? false : true
-opt_array = [:title,
-             :list,
-             :heading,
-             :table,
-             :redirect,
-             :multiline,
-             :category,
-             :category_only,
-             :summary_only,
-             :del_interfile,
-             :bz2_gem ]
-$leave_inline_template = true if opts[:inline]
-$leave_ref = true if opts[:ref]
-config = {}
-opt_array.each do |opt|
-  config[opt] = opts[opt]
-end
+# frozen_string_literal: true
+DEBUG_MODE = false
+SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
+DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
+require_relative "../lib/wp2txt"
+require_relative "../lib/wp2txt/utils"
+require_relative "../lib/wp2txt/version"
+require "etc"
+require "optimist"
+require "parallel"
+require "pastel"
+require "tty-spinner"
+class WpApp
+  include Wp2txt
+  def run
+    opts = Optimist.options do
+      version VERSION
+      banner <<~BANNER
+        WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
+        Usage: wp2txt [options]
+        where [options] are:
+      BANNER
+      opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
+      opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
+      opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
+      opt :category, "Show article category information", default: true, short: "-a"
+      opt :category_only, "Extract only article title and categories", default: false, short: "-g"
+      opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
+      opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
+      opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
+      opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
+      opt :title, "Keep page titles in output", default: true, short: "-t"
+      opt :heading, "Keep section titles in output", default: true, short: "-d"
+      opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
+      opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
+      opt :redirect, "Show redirect destination", default: false, short: "-e"
+      opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
+      opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
+    end
-if File::ftype(input_file) == "directory"
-  input_files = Dir.glob("#{input_file}/*.xml")
-else
-  puts ""
-  puts pastel.green.bold("Preprocessing")
-  puts "Decompressing and splitting the original dump file."
-  puts pastel.underline("This may take a while. Please be patient!")
-  time_start = Time.now.to_i
-  wpsplitter = Wp2txt::Splitter.new(input_file, output_dir, tfile_size)
-  spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
-  spinner.auto_spin
-  wpsplitter.split_file
-  time_finish = Time.now.to_i
-  spinner.stop("Time: #{sec_to_str(time_finish - time_start)}")# Stop animation
-  puts pastel.blue.bold("Complete!")
-  exit if !convert
-  input_files = Dir.glob("#{output_dir}/*.xml")
-end
+    Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
+    Optimist.die :input, "must exist" unless File.exist?(opts[:input])
+    Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
+    pastel = Pastel.new
+    input_file = opts[:input]
+    output_dir = opts[:output_dir]
+    tfile_size = opts[:file_size]
+    num_processors = Etc.nprocessors
+    num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
+                      opts[:num_procs]
+                    else
+                      num_processors - 2
+                    end
+    num_processes = 1 if num_processes < 1
+    convert = opts[:convert]
+    strip_tmarker = opts[:marker] ? false : true
+    opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
+    config = {}
+    opt_array.each do |opt|
+      config[opt] = opts[opt]
+    end
-puts ""
-puts pastel.red.bold("Converting")
-puts "Number of files being processed: " + pastel.bold("#{input_files.size}")
-puts "Number of CPU cores being used:  " + pastel.bold("#{num_processes}")
-Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |input_file|
-  wpconv = Wp2txt::Runner.new(input_file, output_dir, strip_tmarker, config[:del_interfile])
-  wpconv.extract_text do |article|
-    format_wiki!(article.title)
-    if config[:category_only]
-      title = "#{article.title}\t"
-      contents = article.categories.join(", ")
-      contents << "\n"
-    elsif config[:category] && !article.categories.empty?
-      title = "\n[[#{article.title}]]\n\n"
-      contents = "\nCATEGORIES: "
-      contents << article.categories.join(", ")
-      contents << "\n\n"
+    if File.ftype(input_file) == "directory"
+      input_files = Dir.glob("#{input_file}/*.xml")
     else
-      title = "\n[[#{article.title}]]\n\n"
-      contents = ""
+      puts ""
+      puts pastel.green.bold("Preprocessing")
+      puts "Decompressing and splitting the original dump file."
+      puts pastel.underline("This may take a while. Please be patient!")
+      time_start = Time.now.to_i
+      wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
+      spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
+      spinner.auto_spin
+      wpsplitter.split_file
+      time_finish = Time.now.to_i
+      spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
+      puts pastel.blue.bold("Complete!")
+      exit unless convert
+      input_files = Dir.glob("#{output_dir}/*.xml")
     end
-    unless config[:category_only]
-      article.elements.each do |e|
-        case e.first
-        when :mw_heading
-          break if config[:summary_only]
-          next if !config[:heading]
-          format_wiki!(e.last)
-          line = e.last
-          line << "+HEADING+" if $DEBUG_MODE
-        when :mw_paragraph
-          format_wiki!(e.last)
-          line = e.last + "\n"
-          line << "+PARAGRAPH+" if $DEBUG_MODE
-        when :mw_table, :mw_htable
-          next if !config[:table]
-          line = e.last
-          line << "+TABLE+" if $DEBUG_MODE
-        when :mw_pre
-          next if !config[:pre]
-          line = e.last
-          line << "+PRE+" if $DEBUG_MODE
-        when :mw_quote
-          line = e.last
-          line << "+QUOTE+" if $DEBUG_MODE
-        when :mw_unordered, :mw_ordered, :mw_definition
-          next if !config[:list]
-          line = e.last
-          line << "+LIST+" if $DEBUG_MODE
-        when :mw_ml_template
-          next if !config[:multiline]
-          line = e.last
-          line << "+MLTEMPLATE+" if $DEBUG_MODE
-        when :mw_redirect
-          next if !config[:redirect]
-          line = e.last
-          line << "+REDIRECT+" if $DEBUG_MODE
-          line << "\n\n"
-        when :mw_isolated_template
-          next if !config[:multiline]
-          line = e.last
-          line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
-        when :mw_isolated_tag
-          next
+    puts ""
+    puts pastel.red.bold("Converting")
+    puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
+    puts "Number of CPU cores being used:  " + pastel.bold(num_processes.to_s)
+    Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
+      wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
+      wpconv.extract_text do |article|
+        article.title = format_wiki(article.title, config)
+        if config[:category_only]
+          title = "#{article.title}\t"
+          contents = article.categories.join(", ")
+          contents << "\n"
+        elsif config[:category] && !article.categories.empty?
+          title = "\n[[#{article.title}]]\n\n"
+          contents = +"\nCATEGORIES: "
+          contents << article.categories.join(", ")
+          contents << "\n\n"
         else
-          if $DEBUG_MODE
-            # format_wiki!(e.last)
-            line = e.last
-            line << "+OTHER+"
-          else
-            next
+          title = "\n[[#{article.title}]]\n\n"
+          contents = +""
+        end
+        unless config[:category_only]
+          article.elements.each do |e|
+            case e.first
+            when :mw_heading
+              break if config[:summary_only]
+              next unless config[:heading]
+              e[-1] = format_wiki(e.last, config)
+              line = e.last
+              line << "+HEADING+" if DEBUG_MODE
+            when :mw_paragraph
+              e[-1] = format_wiki(e.last, config)
+              line = e.last + "\n"
+              line << "+PARAGRAPH+" if DEBUG_MODE
+            when :mw_table, :mw_htable
+              next unless config[:table]
+              line = e.last
+              line << "+TABLE+" if DEBUG_MODE
+            when :mw_pre
+              next unless config[:pre]
+              line = e.last
+              line << "+PRE+" if DEBUG_MODE
+            when :mw_quote
+              line = e.last
+              line << "+QUOTE+" if DEBUG_MODE
+            when :mw_unordered, :mw_ordered, :mw_definition
+              next unless config[:list]
+              line = e.last
+              line << "+LIST+" if DEBUG_MODE
+            when :mw_ml_template
+              next unless config[:multiline]
+              line = e.last
+              line << "+MLTEMPLATE+" if DEBUG_MODE
+            when :mw_redirect
+              next unless config[:redirect]
+              line = e.last
+              line << "+REDIRECT+" if DEBUG_MODE
+              line << "\n\n"
+            when :mw_isolated_template
+              next unless config[:multiline]
+              line = e.last
+              line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
+            when :mw_isolated_tag
+              next
+            else
+              next unless DEBUG_MODE
+              line = e.last
+              line << "+OTHER+"
+            end
+            contents << line << "\n"
           end
         end
-        contents << line << "\n"
-      end
-    end
-    if /\A[\s　]*\z/m =~ contents
-      result = ""
-    else
-      result = config[:title] ? title << contents : contents
+        if /\A[\s　]*\z/m =~ contents
+          ""
+        else
+          config[:title] ? title << contents : contents
+        end
+      end
     end
+    puts pastel.blue.bold("Complete!")
   end
 end
-puts pastel.blue.bold("Complete!")
+WpApp.new.run

data/lib/wp2txt/article.rb CHANGED Viewed

@@ -1,62 +1,54 @@
-#!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
-$: << File.join(File.dirname(__FILE__))
+# frozen_string_literal: true
 require 'strscan'
-require 'utils'
+require_relative 'utils'
 module Wp2txt
   # possible element type, which could be later chosen to print or not to print
-    # :mw_heading
-    # :mw_htable
-    # :mw_quote
-    # :mw_unordered
-    # :mw_ordered
-    # :mw_definition
-    # :mw_pre
-    # :mw_paragraph
-    # :mw_comment
-    # :mw_math
-    # :mw_source
-    # :mw_inputbox
-    # :mw_template
-    # :mw_link
-    # :mw_summary
-    # :mw_blank
-    # :mw_redirect
+  # :mw_heading
+  # :mw_htable
+  # :mw_quote
+  # :mw_unordered
+  # :mw_ordered
+  # :mw_definition
+  # :mw_pre
+  # :mw_paragraph
+  # :mw_comment
+  # :mw_math
+  # :mw_source
+  # :mw_inputbox
+  # :mw_template
+  # :mw_link
+  # :mw_summary
+  # :mw_blank
+  # :mw_redirect
   # an article contains elements, each of which is [TYPE, string]
   class Article
     include Wp2txt
     attr_accessor :elements, :title, :categories
     def initialize(text, title = "", strip_tmarker = false)
       @title = title.strip
       @strip_tmarker = strip_tmarker
-      convert_characters!(text)
-      text.gsub!(/\|\n\n+/m){"|\n"}
-      remove_html!(text)
-      make_reference!(text)
-      remove_ref!(text)
+      text = convert_characters(text)
+      text = text.gsub(/\|\n\n+/m) { "|\n" }
+      text = remove_html(text)
+      text = make_reference(text)
+      text = remove_ref(text)
       parse text
     end
-    def create_element(tp, text)
-      [tp, text]
+    def create_element(tpx, text)
+      [tpx, text]
     end
     def parse(source)
       @elements = []
-      @categories  = []
+      @categories = []
       mode = nil
-      open_stack  = []
-      close_stack = []
       source.each_line do |line|
-        matched = line.scan($category_regex)
+        matched = line.scan(CATEGORY_REGEX)
         if matched && !matched.empty?
           @categories += matched
           @categories.uniq!
@@ -65,108 +57,94 @@ module Wp2txt
         case mode
         when :mw_ml_template
           scanner = StringScanner.new(line)
-          str= process_nested_structure(scanner, "{{", "}}") {""}
-          if $ml_template_end_regex =~ str
-            mode = nil
-          end
+          str = process_nested_structure(scanner, "{{", "}}") { "" }
+          mode = nil if ML_TEMPLATE_END_REGEX =~ str
           @elements.last.last << line
           next
         when :mw_ml_link
           scanner = StringScanner.new(line)
-          str= process_nested_structure(scanner, "[[", "]]") {""}
-          if $ml_link_end_regex =~ str
-            mode = nil
-          end
+          str = process_nested_structure(scanner, "[[", "]]") { "" }
+          mode = nil if ML_LINK_END_REGEX =~ str
           @elements.last.last << line
           next
         when :mw_table
-          if $in_table_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_TABLE_REGEX2 =~ line
           @elements.last.last << line
-          next
+          next
         when :mw_inputbox
-          if $in_inputbox_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_INPUTBOX_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_source
-          if $in_source_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_SOURCE_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_math
-          if $in_math_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_MATH_REGEX2 =~ line
           @elements.last.last << line
           next
         when :mw_htable
-          if $in_html_table_regex2 =~ line
-            mode = nil
-          end
+          mode = nil if IN_HTML_TABLE_REGEX2 =~ line
           @elements.last.last << line
           next
         end
         case line
-        when $isolated_template_regex
+        when ISOLATED_TEMPLATE_REGEX
           @elements << create_element(:mw_isolated_template, line)
-        when $isolated_tag_regex
+        when ISOLATED_TAG_REGEX
           @elements << create_element(:mw_isolated_tag, line)
-        when $blank_line_regex
-          @elements << create_element(:mw_blank, "\n")
-        when $redirect_regex
+        when BLANK_LINE_REGEX
+          @elements << create_element(:mw_blank, "\n")
+        when REDIRECT_REGEX
           @elements << create_element(:mw_redirect, line)
-        when $in_heading_regex
-          line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
+        when IN_HEADING_REGEX
+          line = line.sub(HEADING_ONSET_REGEX) { $1 }.sub(HEADING_CODA_REGEX) { $1 }
           @elements << create_element(:mw_heading, "\n" + line + "\n")
-        when $in_inputbox_regex
+        when IN_INPUTBOX_REGEX
           @elements << create_element(:mw_inputbox, line)
-        when $ml_template_onset_regex
+        when ML_TEMPLATE_ONSET_REGEX
           @elements << create_element(:mw_ml_template, line)
           mode = :mw_ml_template
-        when $ml_link_onset_regex
+        when ML_LINK_ONSET_REGEX
           @elements << create_element(:mw_ml_link, line)
           mode = :mw_ml_link
-        when $in_inputbox_regex1
+        when IN_INPUTBOX_REGEX1
           mode = :mw_inputbox
           @elements << create_element(:mw_inputbox, line)
-        when $in_source_regex
-        @elements << create_element(:mw_source, line)
-        when $in_source_regex1
+        when IN_SOURCE_REGEX
+          @elements << create_element(:mw_source, line)
+        when IN_SOURCE_REGEX1
           mode = :mw_source
           @elements << create_element(:mw_source, line)
-        when $in_math_regex
+        when IN_MATH_REGEX
           @elements << create_element(:mw_math, line)
-        when $in_math_regex1
+        when IN_MATH_REGEX1
           mode = :mw_math
           @elements << create_element(:mw_math, line)
-        when $in_html_table_regex
+        when IN_HTML_TABLE_REGEX
           @elements << create_element(:mw_htable, line)
-        when $in_html_table_regex1
+        when IN_HTML_TABLE_REGEX1
           mode = :mw_htable
           @elements << create_element(:mw_htable, line)
-        when $in_table_regex1
+        when IN_TABLE_REGEX1
           mode = :mw_table
           @elements << create_element(:mw_table, line)
-        when $in_unordered_regex
-          line = line.sub($list_marks_regex, "") if @strip_tmarker
+        when IN_UNORDERED_REGEX
+          line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_unordered, line)
-        when $in_ordered_regex
-          line = line.sub($list_marks_regex, "") if @strip_tmarker
+        when IN_ORDERED_REGEX
+          line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_ordered, line)
-        when $in_pre_regex
-          line = line.sub($pre_marks_regex, "") if @strip_tmarker
+        when IN_PRE_REGEX
+          line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_pre, line)
-        when $in_definition_regex
-          line = line.sub($def_marks_regex, "") if @strip_tmarker
+        when IN_DEFINITION_REGEX
+          line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
           @elements << create_element(:mw_definition, line)
-        when $in_link_regex
+        when IN_LINK_REGEX
           @elements << create_element(:mw_link, line)
-        else
+        else
           @elements << create_element(:mw_paragraph, "\n" + line)
         end
       end