wp2txt 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -2
- data/bin/wp2txt +12 -2
- data/lib/wp2txt.rb +1 -1
- data/lib/wp2txt/article.rb +10 -1
- data/lib/wp2txt/utils.rb +2 -2
- data/lib/wp2txt/version.rb +1 -1
- data/wp2txt.gemspec +1 -0
- metadata +18 -2
data/README.md
CHANGED
@@ -16,8 +16,9 @@ WP2TXT before version 0.4.0 came with Mac/Windows GUI. Now it's become a pure co
|
|
16
16
|
|
17
17
|
### Installation
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
$ gem install bundler
|
20
|
+
$ bundle install
|
21
|
+
|
21
22
|
$ gem install wp2txt
|
22
23
|
|
23
24
|
### Usage
|
data/bin/wp2txt
CHANGED
@@ -34,6 +34,7 @@ EOS
|
|
34
34
|
opt :template_off, "Remove template notations from output", :default => true
|
35
35
|
opt :redirect_off, "Not show redirect destination", :default => false
|
36
36
|
opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
|
37
|
+
opt :category_off, "Not output article category information", :default => false
|
37
38
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
38
39
|
end
|
39
40
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
@@ -58,7 +59,14 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_
|
|
58
59
|
wpconv.extract_text do |article|
|
59
60
|
title = format_wiki article.title
|
60
61
|
title = "[[#{title}]]\n"
|
61
|
-
|
62
|
+
|
63
|
+
if !opts[:category_off] && !article.categories.empty?
|
64
|
+
contents = "\nCATEGORIES: "
|
65
|
+
contents += article.categories.join(", ")
|
66
|
+
contents += "\n\n"
|
67
|
+
else
|
68
|
+
contents = ""
|
69
|
+
end
|
62
70
|
|
63
71
|
article.elements.each do |e|
|
64
72
|
case e.first
|
@@ -102,11 +110,13 @@ wpconv.extract_text do |article|
|
|
102
110
|
contents += line
|
103
111
|
contents = remove_templates(contents) if config[:template_off]
|
104
112
|
end
|
113
|
+
|
114
|
+
##### cleanup #####
|
105
115
|
if /\A\s*\z/m =~ contents
|
106
116
|
result = ""
|
107
117
|
else
|
108
118
|
result = config[:title_off] ? contents : title + "\n" + contents
|
109
119
|
end
|
110
120
|
result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
|
111
|
-
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
121
|
+
result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
|
112
122
|
end
|
data/lib/wp2txt.rb
CHANGED
data/lib/wp2txt/article.rb
CHANGED
@@ -31,7 +31,7 @@ module Wp2txt
|
|
31
31
|
class Article
|
32
32
|
|
33
33
|
include Wp2txt
|
34
|
-
attr_accessor :elements, :title
|
34
|
+
attr_accessor :elements, :title, :categories
|
35
35
|
|
36
36
|
# class variables to save resource for generating regexps
|
37
37
|
# those with a trailing number 1 represent opening tag/markup
|
@@ -70,6 +70,8 @@ module Wp2txt
|
|
70
70
|
@@blank_line_regex = Regexp.new('^\s*$')
|
71
71
|
|
72
72
|
@@redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
73
|
+
|
74
|
+
@@category_regex = Regexp.new('[\{\[\|\b](?:C|c)ategory\:(.*?)[\}\]\|\b]')
|
73
75
|
|
74
76
|
def initialize(text, title = "", strip_tmarker = false)
|
75
77
|
@title = title.strip
|
@@ -83,10 +85,16 @@ module Wp2txt
|
|
83
85
|
|
84
86
|
def parse(source)
|
85
87
|
@elements = []
|
88
|
+
@categories = []
|
86
89
|
mode = nil
|
87
90
|
open_stack = []
|
88
91
|
close_stack = []
|
89
92
|
source.each_line do |line|
|
93
|
+
matched = line.scan(@@category_regex)
|
94
|
+
if matched && !matched.empty?
|
95
|
+
@categories += matched
|
96
|
+
@categories = @categories.uniq
|
97
|
+
end
|
90
98
|
|
91
99
|
case mode
|
92
100
|
when :mw_table
|
@@ -129,6 +137,7 @@ module Wp2txt
|
|
129
137
|
when @@in_template_regex
|
130
138
|
@elements << create_element(:mw_template, line)
|
131
139
|
when @@in_heading_regex
|
140
|
+
line = line.sub(/^(\=+)\s+/){$1}.sub(/\s+(\=+)$/){$1}
|
132
141
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
133
142
|
when @@in_inputbox_regex
|
134
143
|
@elements << create_element(:mw_inputbox, line)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -37,7 +37,7 @@ module Wp2txt
|
|
37
37
|
end
|
38
38
|
exit
|
39
39
|
else
|
40
|
-
fixed_text = original_text.encode("UTF-16"
|
40
|
+
fixed_text = original_text.encode("UTF-16").encode("UTF-8")
|
41
41
|
return format_wiki(fixed_text, true)
|
42
42
|
end
|
43
43
|
end
|
@@ -240,7 +240,7 @@ module Wp2txt
|
|
240
240
|
hi = ch>>8
|
241
241
|
lo = ch&0xff
|
242
242
|
u = "\377\376" << lo.chr << hi.chr
|
243
|
-
u.encode("UTF-8", "UTF-16")
|
243
|
+
u.encode("UTF-8", "UTF-16")
|
244
244
|
end
|
245
245
|
rescue StandardError
|
246
246
|
return num_str
|
data/lib/wp2txt/version.rb
CHANGED
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.5.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-01-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -91,6 +91,22 @@ dependencies:
|
|
91
91
|
- - ! '>='
|
92
92
|
- !ruby/object:Gem::Version
|
93
93
|
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: json
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
94
110
|
description: WP2TXT extracts plain text data from Wikipedia dump file (encoded in
|
95
111
|
XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
96
112
|
email:
|