wp2txt 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -16,8 +16,9 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
16
16
 
17
17
  ### Installation
18
18
 
19
- <!-- `gem install` method will become available soon. In the meantime, use the source code on Github. -->
20
-
19
+ $ gem install bundler
20
+ $ bundle install
21
+
21
22
  $ gem install wp2txt
22
23
 
23
24
  ### Usage
data/bin/wp2txt CHANGED
@@ -34,6 +34,7 @@ EOS
34
34
  opt :template_off, "Remove template notations from output", :default => true
35
35
  opt :redirect_off, "Not show redirect destination", :default => false
36
36
  opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
37
+ opt :category_off, "Not output article category information", :default => false
37
38
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
38
39
  end
39
40
  Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
@@ -58,7 +59,14 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_
58
59
  wpconv.extract_text do |article|
59
60
  title = format_wiki article.title
60
61
  title = "[[#{title}]]\n"
61
- contents = ""
62
+
63
+ if !opts[:category_off] && !article.categories.empty?
64
+ contents = "\nCATEGORIES: "
65
+ contents += article.categories.join(", ")
66
+ contents += "\n\n"
67
+ else
68
+ contents = ""
69
+ end
62
70
 
63
71
  article.elements.each do |e|
64
72
  case e.first
@@ -102,11 +110,13 @@ wpconv.extract_text do |article|
102
110
  contents += line
103
111
  contents = remove_templates(contents) if config[:template_off]
104
112
  end
113
+
114
+ ##### cleanup #####
105
115
  if /\A\s*\z/m =~ contents
106
116
  result = ""
107
117
  else
108
118
  result = config[:title_off] ? contents : title + "\n" + contents
109
119
  end
110
120
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
111
- result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
121
+ result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
112
122
  end
@@ -206,7 +206,7 @@ module Wp2txt
206
206
  if page.empty?
207
207
  return false
208
208
  else
209
- return page.force_encoding("utf-8")
209
+ return page.force_encoding("utf-8") rescue page
210
210
  end
211
211
  end
212
212
 
@@ -31,7 +31,7 @@ module Wp2txt
31
31
  class Article
32
32
 
33
33
  include Wp2txt
34
- attr_accessor :elements, :title
34
+ attr_accessor :elements, :title, :categories
35
35
 
36
36
  # class variables to save resources when generating regexps
37
37
  # those with a trailing number 1 represent opening tag/markup
@@ -70,6 +70,8 @@ module Wp2txt
70
70
  @@blank_line_regex = Regexp.new('^\s*$')
71
71
 
72
72
  @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
73
+
74
+ @@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
73
75
 
74
76
  def initialize(text, title = "", strip_tmarker = false)
75
77
  @title = title.strip
@@ -83,10 +85,16 @@ module Wp2txt
83
85
 
84
86
  def parse(source)
85
87
  @elements = []
88
+ @categories = []
86
89
  mode = nil
87
90
  open_stack = []
88
91
  close_stack = []
89
92
  source.each_line do |line|
93
+ matched = line.scan(@@category_regex)
94
+ if matched && !matched.empty?
95
+ @categories += matched
96
+ @categories = @categories.uniq
97
+ end
90
98
 
91
99
  case mode
92
100
  when :mw_table
@@ -129,6 +137,7 @@ module Wp2txt
129
137
  when @@in_template_regex
130
138
  @elements << create_element(:mw_template, line)
131
139
  when @@in_heading_regex
140
+ line = line.sub(/^(\=+)\s+/){$1}.sub(/\s+(\=+)$/){$1}
132
141
  @elements << create_element(:mw_heading, "\n" + line + "\n")
133
142
  when @@in_inputbox_regex
134
143
  @elements << create_element(:mw_inputbox, line)
@@ -37,7 +37,7 @@ module Wp2txt
37
37
  end
38
38
  exit
39
39
  else
40
- fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
40
+ fixed_text = original_text.encode("UTF-16").encode("UTF-8")
41
41
  return format_wiki(fixed_text, true)
42
42
  end
43
43
  end
@@ -240,7 +240,7 @@ module Wp2txt
240
240
  hi = ch>>8
241
241
  lo = ch&0xff
242
242
  u = "\377\376" << lo.chr << hi.chr
243
- u.encode("UTF-8", "UTF-16")
243
+ u.encode("UTF-8", "UTF-16")
244
244
  end
245
245
  rescue StandardError
246
246
  return num_str
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -23,4 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_runtime_dependency "bzip2-ruby"
24
24
  s.add_runtime_dependency "trollop"
25
25
  s.add_runtime_dependency "nokogiri"
26
+ s.add_runtime_dependency "json"
26
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-22 00:00:00.000000000 Z
12
+ date: 2013-01-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: json
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
95
111
  XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
96
112
  email: