wp2txt 0.4.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -16,8 +16,9 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
16
16
 
17
17
  ### Installation
18
18
 
19
- <!-- `gem install` method will become available soon. In the meantime, use the source code on Github. -->
20
-
19
+ $ gem install bundler
20
+ $ bundle install
21
+
21
22
  $ gem install wp2txt
22
23
 
23
24
  ### Usage
data/bin/wp2txt CHANGED
@@ -34,6 +34,7 @@ EOS
34
34
  opt :template_off, "Remove template notations from output", :default => true
35
35
  opt :redirect_off, "Not show redirect destination", :default => false
36
36
  opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
37
+ opt :category_off, "Not output article category information", :default => false
37
38
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
38
39
  end
39
40
  Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
@@ -58,7 +59,14 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_
58
59
  wpconv.extract_text do |article|
59
60
  title = format_wiki article.title
60
61
  title = "[[#{title}]]\n"
61
- contents = ""
62
+
63
+ if !opts[:category_off] && !article.categories.empty?
64
+ contents = "\nCATEGORIES: "
65
+ contents += article.categories.join(", ")
66
+ contents += "\n\n"
67
+ else
68
+ contents = ""
69
+ end
62
70
 
63
71
  article.elements.each do |e|
64
72
  case e.first
@@ -102,11 +110,13 @@ wpconv.extract_text do |article|
102
110
  contents += line
103
111
  contents = remove_templates(contents) if config[:template_off]
104
112
  end
113
+
114
+ ##### cleanup #####
105
115
  if /\A\s*\z/m =~ contents
106
116
  result = ""
107
117
  else
108
118
  result = config[:title_off] ? contents : title + "\n" + contents
109
119
  end
110
120
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
111
- result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
121
+ result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
112
122
  end
@@ -206,7 +206,7 @@ module Wp2txt
206
206
  if page.empty?
207
207
  return false
208
208
  else
209
- return page.force_encoding("utf-8")
209
+ return page.force_encoding("utf-8") rescue page
210
210
  end
211
211
  end
212
212
 
@@ -31,7 +31,7 @@ module Wp2txt
31
31
  class Article
32
32
 
33
33
  include Wp2txt
34
- attr_accessor :elements, :title
34
+ attr_accessor :elements, :title, :categories
35
35
 
36
36
  # class variables to save resources for generating regexps
37
37
  # those with a trailing number 1 represent opening tag/markup
@@ -70,6 +70,8 @@ module Wp2txt
70
70
  @@blank_line_regex = Regexp.new('^\s*$')
71
71
 
72
72
  @@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
73
+
74
+ @@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
73
75
 
74
76
  def initialize(text, title = "", strip_tmarker = false)
75
77
  @title = title.strip
@@ -83,10 +85,16 @@ module Wp2txt
83
85
 
84
86
  def parse(source)
85
87
  @elements = []
88
+ @categories = []
86
89
  mode = nil
87
90
  open_stack = []
88
91
  close_stack = []
89
92
  source.each_line do |line|
93
+ matched = line.scan(@@category_regex)
94
+ if matched && !matched.empty?
95
+ @categories += matched
96
+ @categories = @categories.uniq
97
+ end
90
98
 
91
99
  case mode
92
100
  when :mw_table
@@ -129,6 +137,7 @@ module Wp2txt
129
137
  when @@in_template_regex
130
138
  @elements << create_element(:mw_template, line)
131
139
  when @@in_heading_regex
140
+ line = line.sub(/^(\=+)\s+/){$1}.sub(/\s+(\=+)$/){$1}
132
141
  @elements << create_element(:mw_heading, "\n" + line + "\n")
133
142
  when @@in_inputbox_regex
134
143
  @elements << create_element(:mw_inputbox, line)
@@ -37,7 +37,7 @@ module Wp2txt
37
37
  end
38
38
  exit
39
39
  else
40
- fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
40
+ fixed_text = original_text.encode("UTF-16").encode("UTF-8")
41
41
  return format_wiki(fixed_text, true)
42
42
  end
43
43
  end
@@ -240,7 +240,7 @@ module Wp2txt
240
240
  hi = ch>>8
241
241
  lo = ch&0xff
242
242
  u = "\377\376" << lo.chr << hi.chr
243
- u.encode("UTF-8", "UTF-16")
243
+ u.encode("UTF-8", "UTF-16")
244
244
  end
245
245
  rescue StandardError
246
246
  return num_str
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.4.2"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -23,4 +23,5 @@ Gem::Specification.new do |s|
23
23
  s.add_runtime_dependency "bzip2-ruby"
24
24
  s.add_runtime_dependency "trollop"
25
25
  s.add_runtime_dependency "nokogiri"
26
+ s.add_runtime_dependency "json"
26
27
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.2
4
+ version: 0.5.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-10-22 00:00:00.000000000 Z
12
+ date: 2013-01-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -91,6 +91,22 @@ dependencies:
91
91
  - - ! '>='
92
92
  - !ruby/object:Gem::Version
93
93
  version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: json
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
94
110
  description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
95
111
  XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
96
112
  email: