RubyGems - wp2txt - Versions diffs - 0.4.2 → 0.5.0 - Mend

wp2txt 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/README.md CHANGED

@@ -16,8 +16,9 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
 ### Installation
-<!-- `gem install` method will become available soon.  In the meantime, use the source code on Github. -->
+    $ gem install bundler
+    $ bundle install
     $ gem install wp2txt
 ### Usage

data/bin/wp2txt CHANGED

@@ -34,6 +34,7 @@ EOS
   opt :template_off, "Remove template notations from output", :default => true
   opt :redirect_off, "Not show redirect destination", :default => false
   opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
+  opt :category_off, "Not output article category information", :default => false
   opt :file_size,   "Approximate size (in MB) of each output file", :default => 10
 end
 Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
@@ -58,7 +59,14 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_
 wpconv.extract_text do |article|
   title = format_wiki article.title
   title = "[[#{title}]]\n"
-  contents = ""
+  if !opts[:category_off] && !article.categories.empty?
+    contents = "\nCATEGORIES: "
+    contents += article.categories.join(", ")
+    contents += "\n\n"
+  else
+    contents = ""
+  end
   article.elements.each do |e|
     case e.first
@@ -102,11 +110,13 @@ wpconv.extract_text do |article|
     contents += line
     contents = remove_templates(contents) if config[:template_off]
   end
+  ##### cleanup #####
   if /\A\s*\z/m =~ contents
     result = ""
   else
     result = config[:title_off] ? contents : title + "\n" + contents
   end
   result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
-  result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
+  result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
 end

data/lib/wp2txt.rb CHANGED

@@ -206,7 +206,7 @@ module Wp2txt
       if page.empty?
         return false
       else
-        return page.force_encoding("utf-8")
+        return page.force_encoding("utf-8") rescue page
       end
     end

data/lib/wp2txt/article.rb CHANGED

@@ -31,7 +31,7 @@ module Wp2txt
   class Article
     include Wp2txt
-    attr_accessor :elements, :title
+    attr_accessor :elements, :title, :categories
     # class varialbes to save resource for generating regexps
     # those with a trailing number 1 represent opening tag/markup
@@ -70,6 +70,8 @@ module Wp2txt
     @@blank_line_regex = Regexp.new('^\s*$')
     @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
+    @@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
     def initialize(text, title = "", strip_tmarker = false)
       @title = title.strip
@@ -83,10 +85,16 @@ module Wp2txt
     def parse(source)
       @elements = []
+      @categories  = []
       mode = nil
       open_stack  = []
       close_stack = []
       source.each_line do |line|
+        matched = line.scan(@@category_regex)
+        if matched && !matched.empty?
+          @categories += matched
+          @categories = @categories.uniq
+        end
         case mode
         when :mw_table
@@ -129,6 +137,7 @@ module Wp2txt
         when @@in_template_regex
           @elements << create_element(:mw_template, line)
         when @@in_heading_regex
+          line = line.sub(/^(\=+)\s+/){$1}.sub(/\s+(\=+)$/){$1}
           @elements << create_element(:mw_heading, "\n" + line + "\n")
         when @@in_inputbox_regex
           @elements << create_element(:mw_inputbox, line)

data/lib/wp2txt/utils.rb CHANGED

@@ -37,7 +37,7 @@ module Wp2txt
         end
         exit
       else
-        fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
+        fixed_text = original_text.encode("UTF-16").encode("UTF-8")
         return format_wiki(fixed_text, true)
       end
     end
@@ -240,7 +240,7 @@ module Wp2txt
         hi = ch>>8
         lo = ch&0xff
         u = "\377\376" << lo.chr << hi.chr
-        u.encode("UTF-8", "UTF-16")
+        u.encode("UTF-8", "UTF-16")
       end
     rescue StandardError
       return num_str

data/lib/wp2txt/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Wp2txt
-  VERSION = "0.4.2"
+  VERSION = "0.5.0"
 end

data/wp2txt.gemspec CHANGED

@@ -23,4 +23,5 @@ Gem::Specification.new do |s|
   s.add_runtime_dependency "bzip2-ruby"
   s.add_runtime_dependency "trollop"
   s.add_runtime_dependency "nokogiri"
+  s.add_runtime_dependency "json"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wp2txt
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.5.0
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-10-22 00:00:00.000000000 Z
+date: 2013-01-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
   XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
 email: