RubyGems - wp2txt - Versions diffs - 0.7.7 → 0.7.8 - Mend

wp2txt 0.7.7 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d239913dc9fcda87677ec2eeed1ae51542b9ae7b
-  data.tar.gz: 6ee520b960dc9bc23a6cf20345cc36a0740d5b96
+  metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
+  data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
 SHA512:
-  metadata.gz: 4520570cf6f4c8c9c955a574523a4222ea3a9e308a86a510c407ac095040f73dc9cf393c49711787709129383080da077cc322a8e0aae9a401d24e9015e5baa8
-  data.tar.gz: 93f51183f722e6371394350777971f4dc66adf777d4613e547384fe96232071dff1078ff5189841cebebb027a63f072fd09013298a3d6ef042c3f22e25653dc1
+  metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
+  data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e

data/README.md CHANGED

@@ -2,6 +2,8 @@
 Wikipedia dump file to text converter
+**Important** This project is a *work in progress*, and it could be slow, unstable, and even destructive! Please use it with caution.
 ### About ###
 WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML/compressed with Bzip2), stripping all the MediaWiki markup and other metadata. It was originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
@@ -26,14 +28,13 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and  `yyy
 Command line options are as follows:
-*CAUTION:* Command line options in the current version have been drastically changed from previous versions.
+**Important** Command line options in the current version have been drastically changed from previous versions.
     Usage: wp2txt [options]
     where [options] are:
                --input-file, -i:   Wikipedia dump file with .bz2 (compressed) or
                                    .txt (uncompressed) format
-           --output-dir, -o <s>:   Output directory (default:
-                                   /Users/yohasebe/Dropbox/code/wp2txt)
+           --output-dir, -o <s>:   Output directory (default: current directory)
     --convert, --no-convert, -c:   Output in plain text (converting from XML)
                                    (default: true)
           --list, --no-list, -l:   Show list items in output (default: true)
@@ -41,14 +42,14 @@ Command line options are as follows:
         --title, --no-title, -t:   Show page titles in output (default: true)
                     --table, -a:   Show table source code in output
                  --template, -e:   leave inline template notations unmodified
-                 --redirect, -r:   Show redirect destination
+                      --ref, -r:   leave reference notations in the format
+                                   [ref]...[/ref]
+                     --redirect:   Show redirect destination
       --marker, --no-marker, -m:   Show symbols prefixed to list items,
                                    definitions, etc. (Default: true)
                  --category, -g:   Show article category information
             --file-size, -f <i>:   Approximate size (in MB) of each output file
                                    (default: 10)
-          --limit-recur, -u <i>:   Max number of recursive call (0 to 10)
-                                   (default: 10)
                   --version, -v:   Print version and exit
                      --help, -h:   Show this message

data/bin/wp2txt CHANGED

@@ -32,32 +32,31 @@ EOS
   opt :title,   "Show page titles in output", :default => true
   opt :table,   "Show table source code in output", :default => false
   opt :template, "leave inline template notations unmodified", :default => false
+  opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
   opt :redirect, "Show redirect destination", :default => false
   opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
   opt :category, "Show article category information", :default => false
   opt :file_size,   "Approximate size (in MB) of each output file", :default => 10
-  opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
 end
 Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
 Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
-Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
 input_file = ARGV[0]
 output_dir = opts[:output_dir]
 tfile_size = opts[:file_size]
-limit_recur = opts[:limit_recur]
 convert = opts[:convert]
 strip_tmarker = opts[:marker] ? false : true
 opt_array = [:title, :list, :heading, :table, :redirect]
 $leave_template = true if opts[:template]
 $leave_table = true if opts[:table]
+$leave_ref = true if opts[:ref]
 config = {}
 opt_array.each do |opt|
   config[opt] = opts[opt]
 end
 parent = Wp2txt::CmdProgbar.new
-wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker, limit_recur)
+wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
 wpconv.extract_text do |article|
   format_wiki!(article.title)

data/lib/wp2txt.rb CHANGED

@@ -29,7 +29,7 @@ module Wp2txt
     include Wp2txt
-    def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false, limit_recur = 10)
+    def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
       @parent = parent
       @fp = nil
@@ -38,9 +38,6 @@ module Wp2txt
       @tfile_size = tfile_size
       @convert = convert
       @strip_tmarker = strip_tmarker
-      #max number of recursive calls (global variable)
-      $limit_recur = limit_recur
     end
     def file_size(file)

data/lib/wp2txt/utils.rb CHANGED

@@ -134,7 +134,7 @@ module Wp2txt
   #################### parser for nested structure ####################
-  def process_nested_structure(scanner, left, right, recur_count, &block)
+  def process_nested_structure(scanner, left, right, &block)
     buffer = ""
     begin
     if left == "[" && right == "]"
@@ -168,12 +168,11 @@ module Wp2txt
     end
     buffer << scanner.rest
-    recur_count = recur_count - 1
-    if recur_count < 0 || buffer == scanner.string
+    if buffer == scanner.string
       return buffer
     else
       scanner.string = buffer
-      return process_nested_structure(scanner, left, right, recur_count, &block) || ""
+      return process_nested_structure(scanner, left, right, &block) || ""
     end
     rescue => e
       return scanner.string
@@ -204,7 +203,7 @@ module Wp2txt
   def process_interwiki_links!(str)
     scanner = StringScanner.new(str)
-    result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
+    result = process_nested_structure(scanner, "[[", "]]") do |contents|
       parts = contents.split("|")
       case parts.size
       when 1
@@ -219,7 +218,7 @@ module Wp2txt
   def process_external_links!(str)
     scanner = StringScanner.new(str)
-    result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
+    result = process_nested_structure(scanner, "[", "]") do |contents|
       parts = contents.split(" ", 2)
       case parts.size
       when 1
@@ -235,7 +234,7 @@ module Wp2txt
   def remove_templates!(str)
     scanner = StringScanner.new(str)
-    result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
+    result = process_nested_structure(scanner, "{{", "}}") do |contents|
       ""
     end
     str.replace(result)
@@ -243,7 +242,7 @@ module Wp2txt
   def remove_table!(str)
     scanner = StringScanner.new(str)
-    result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
+    result = process_nested_structure(scanner, "{|", "|}") do |contents|
       ""
     end
     str.replace(result)
@@ -301,10 +300,11 @@ module Wp2txt
   end
   def make_reference!(str)
-    str.gsub!($make_reference_regex_a, "\n")
-    str.gsub!($make_reference_regex_b, "")
-    str.gsub!($make_reference_regex_c, "[ref]")
-    str.gsub!($make_reference_regex_d, "[/ref]")
+    str.gsub!($make_reference_regex_a){"\n"}
+    str.gsub!($make_reference_regex_b){""}
+    str.gsub!($make_reference_regex_c){"[ref]"}
+    str.gsub!($make_reference_regex_d){"[/ref]"}
+    str.gsub!($format_ref_regex){""} unless $leave_ref
   end
   def format_ref!(page)

data/lib/wp2txt/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Wp2txt
-  VERSION = "0.7.7"
+  VERSION = "0.7.8"
 end

data/spec/utils_spec.rb CHANGED

@@ -6,8 +6,6 @@ require 'wp2txt'
 require 'wp2txt/article'
 require 'wp2txt/utils'
-$limit_recur = 3
 describe "Wp2txt" do
   it "contains mediawiki-format related functions:" do
   end
@@ -22,7 +20,7 @@ describe "Wp2txt" do
       str_before = "[[ab[[cde[[alfa]]]]fg]]"
       str_after  = "<<ab<<cde<<alfa>>>>fg>>"
       scanner = StringScanner.new(str_before)
-      str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
+      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
         "<<" + content + ">>"
       end
       expect(str_processed).to eq str_after
@@ -32,7 +30,7 @@ describe "Wp2txt" do
       str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
       |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
       scanner = StringScanner.new(str_before)
-      str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
+      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
         "<<" + content + ">>"
       end
       #str_processed.should == str_after

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wp2txt
 version: !ruby/object:Gem::Version
-  version: 0.7.7
+  version: 0.7.8
 platform: ruby
 authors:
 - Yoichiro Hasebe