wp2txt 0.4.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -2
- data/bin/wp2txt +12 -2
- data/lib/wp2txt.rb +1 -1
- data/lib/wp2txt/article.rb +10 -1
- data/lib/wp2txt/utils.rb +2 -2
- data/lib/wp2txt/version.rb +1 -1
- data/wp2txt.gemspec +1 -0
- metadata +18 -2
data/README.md
CHANGED
@@ -16,8 +16,9 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
|
|
16
16
|
|
17
17
|
### Installation
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
$ gem install bundler
|
20
|
+
$ bundle install
|
21
|
+
|
21
22
|
$ gem install wp2txt
|
22
23
|
|
23
24
|
### Usage
|
data/bin/wp2txt
CHANGED
@@ -34,6 +34,7 @@ EOS
|
|
34
34
|
opt :template_off, "Remove template notations from output", :default => true
|
35
35
|
opt :redirect_off, "Not show redirect destination", :default => false
|
36
36
|
opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
|
37
|
+
opt :category_off, "Not output article category information", :default => false
|
37
38
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
38
39
|
end
|
39
40
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
@@ -58,7 +59,14 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_
|
|
58
59
|
wpconv.extract_text do |article|
|
59
60
|
title = format_wiki article.title
|
60
61
|
title = "[[#{title}]]\n"
|
61
|
-
|
62
|
+
|
63
|
+
if !opts[:category_off] && !article.categories.empty?
|
64
|
+
contents = "\nCATEGORIES: "
|
65
|
+
contents += article.categories.join(", ")
|
66
|
+
contents += "\n\n"
|
67
|
+
else
|
68
|
+
contents = ""
|
69
|
+
end
|
62
70
|
|
63
71
|
article.elements.each do |e|
|
64
72
|
case e.first
|
@@ -102,11 +110,13 @@ wpconv.extract_text do |article|
|
|
102
110
|
contents += line
|
103
111
|
contents = remove_templates(contents) if config[:template_off]
|
104
112
|
end
|
113
|
+
|
114
|
+
##### cleanup #####
|
105
115
|
if /\A\s*\z/m =~ contents
|
106
116
|
result = ""
|
107
117
|
else
|
108
118
|
result = config[:title_off] ? contents : title + "\n" + contents
|
109
119
|
end
|
110
120
|
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
111
|
-
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
121
|
+
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
112
122
|
end
|
data/lib/wp2txt.rb
CHANGED
data/lib/wp2txt/article.rb
CHANGED
@@ -31,7 +31,7 @@ module Wp2txt
|
|
31
31
|
class Article
|
32
32
|
|
33
33
|
include Wp2txt
|
34
|
-
attr_accessor :elements, :title
|
34
|
+
attr_accessor :elements, :title, :categories
|
35
35
|
|
36
36
|
# class varialbes to save resource for generating regexps
|
37
37
|
# those with a trailing number 1 represent opening tag/markup
|
@@ -70,6 +70,8 @@ module Wp2txt
|
|
70
70
|
@@blank_line_regex = Regexp.new('^\s*$')
|
71
71
|
|
72
72
|
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
+
|
74
|
+
@@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
|
73
75
|
|
74
76
|
def initialize(text, title = "", strip_tmarker = false)
|
75
77
|
@title = title.strip
|
@@ -83,10 +85,16 @@ module Wp2txt
|
|
83
85
|
|
84
86
|
def parse(source)
|
85
87
|
@elements = []
|
88
|
+
@categories = []
|
86
89
|
mode = nil
|
87
90
|
open_stack = []
|
88
91
|
close_stack = []
|
89
92
|
source.each_line do |line|
|
93
|
+
matched = line.scan(@@category_regex)
|
94
|
+
if matched && !matched.empty?
|
95
|
+
@categories += matched
|
96
|
+
@categories = @categories.uniq
|
97
|
+
end
|
90
98
|
|
91
99
|
case mode
|
92
100
|
when :mw_table
|
@@ -129,6 +137,7 @@ module Wp2txt
|
|
129
137
|
when @@in_template_regex
|
130
138
|
@elements << create_element(:mw_template, line)
|
131
139
|
when @@in_heading_regex
|
140
|
+
line = line.sub(/^(\=+)\s+/){$1}.sub(/\s+(\=+)$/){$1}
|
132
141
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
133
142
|
when @@in_inputbox_regex
|
134
143
|
@elements << create_element(:mw_inputbox, line)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -37,7 +37,7 @@ module Wp2txt
|
|
37
37
|
end
|
38
38
|
exit
|
39
39
|
else
|
40
|
-
fixed_text = original_text.encode("UTF-16"
|
40
|
+
fixed_text = original_text.encode("UTF-16").encode("UTF-8")
|
41
41
|
return format_wiki(fixed_text, true)
|
42
42
|
end
|
43
43
|
end
|
@@ -240,7 +240,7 @@ module Wp2txt
|
|
240
240
|
hi = ch>>8
|
241
241
|
lo = ch&0xff
|
242
242
|
u = "\377\376" << lo.chr << hi.chr
|
243
|
-
u.encode("UTF-8", "UTF-16")
|
243
|
+
u.encode("UTF-8", "UTF-16")
|
244
244
|
end
|
245
245
|
rescue StandardError
|
246
246
|
return num_str
|
data/lib/wp2txt/version.rb
CHANGED
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: json
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
95
111
|
XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
96
112
|
email:
|