wp2txt 0.7.7 → 0.7.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -6
- data/bin/wp2txt +3 -4
- data/lib/wp2txt.rb +1 -4
- data/lib/wp2txt/utils.rb +12 -12
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +2 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
|
4
|
+
data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
|
7
|
+
data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
|
data/README.md
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
+
**Important:** This project is a *work in progress*; it may be slow, unstable, and even destructive! Please use it with caution.
|
6
|
+
|
5
7
|
### About ###
|
6
8
|
|
7
9
|
WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML and compressed with Bzip2), stripping all MediaWiki markup and other metadata. It was originally intended to be useful for researchers who are looking for an easy way to obtain open-source multilingual corpora, but it may be handy for other purposes as well.
|
@@ -26,14 +28,13 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
26
28
|
|
27
29
|
Command line options are as follows:
|
28
30
|
|
29
|
-
|
31
|
+
**Important:** The command-line options in the current version have changed drastically from those in previous versions.
|
30
32
|
|
31
33
|
Usage: wp2txt [options]
|
32
34
|
where [options] are:
|
33
35
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
34
36
|
.txt (uncompressed) format
|
35
|
-
--output-dir, -o <s>: Output directory (default:
|
36
|
-
/Users/yohasebe/Dropbox/code/wp2txt)
|
37
|
+
--output-dir, -o <s>: Output directory (default: current directory)
|
37
38
|
--convert, --no-convert, -c: Output in plain text (converting from XML)
|
38
39
|
(default: true)
|
39
40
|
--list, --no-list, -l: Show list items in output (default: true)
|
@@ -41,14 +42,14 @@ Command line options are as follows:
|
|
41
42
|
--title, --no-title, -t: Show page titles in output (default: true)
|
42
43
|
--table, -a: Show table source code in output
|
43
44
|
--template, -e: leave inline template notations unmodified
|
44
|
-
|
45
|
+
--ref, -r: leave reference notations in the format
|
46
|
+
[ref]...[/ref]
|
47
|
+
--redirect: Show redirect destination
|
45
48
|
--marker, --no-marker, -m: Show symbols prefixed to list items,
|
46
49
|
definitions, etc. (Default: true)
|
47
50
|
--category, -g: Show article category information
|
48
51
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
49
52
|
(default: 10)
|
50
|
-
--limit-recur, -u <i>: Max number of recursive call (0 to 10)
|
51
|
-
(default: 10)
|
52
53
|
--version, -v: Print version and exit
|
53
54
|
--help, -h: Show this message
|
54
55
|
|
data/bin/wp2txt
CHANGED
@@ -32,32 +32,31 @@ EOS
|
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
34
|
opt :template, "leave inline template notations unmodified", :default => false
|
35
|
+
opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
|
35
36
|
opt :redirect, "Show redirect destination", :default => false
|
36
37
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
37
38
|
opt :category, "Show article category information", :default => false
|
38
39
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
|
-
opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
|
40
40
|
end
|
41
41
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
42
42
|
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
|
-
Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
|
44
43
|
|
45
44
|
input_file = ARGV[0]
|
46
45
|
output_dir = opts[:output_dir]
|
47
46
|
tfile_size = opts[:file_size]
|
48
|
-
limit_recur = opts[:limit_recur]
|
49
47
|
convert = opts[:convert]
|
50
48
|
strip_tmarker = opts[:marker] ? false : true
|
51
49
|
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
50
|
$leave_template = true if opts[:template]
|
53
51
|
$leave_table = true if opts[:table]
|
52
|
+
$leave_ref = true if opts[:ref]
|
54
53
|
config = {}
|
55
54
|
opt_array.each do |opt|
|
56
55
|
config[opt] = opts[opt]
|
57
56
|
end
|
58
57
|
|
59
58
|
parent = Wp2txt::CmdProgbar.new
|
60
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker
|
59
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
61
60
|
|
62
61
|
wpconv.extract_text do |article|
|
63
62
|
format_wiki!(article.title)
|
data/lib/wp2txt.rb
CHANGED
@@ -29,7 +29,7 @@ module Wp2txt
|
|
29
29
|
|
30
30
|
include Wp2txt
|
31
31
|
|
32
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false
|
32
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
33
33
|
@parent = parent
|
34
34
|
@fp = nil
|
35
35
|
|
@@ -38,9 +38,6 @@ module Wp2txt
|
|
38
38
|
@tfile_size = tfile_size
|
39
39
|
@convert = convert
|
40
40
|
@strip_tmarker = strip_tmarker
|
41
|
-
|
42
|
-
#max number of recursive calls (global variable)
|
43
|
-
$limit_recur = limit_recur
|
44
41
|
end
|
45
42
|
|
46
43
|
def file_size(file)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -134,7 +134,7 @@ module Wp2txt
|
|
134
134
|
|
135
135
|
#################### parser for nested structure ####################
|
136
136
|
|
137
|
-
def process_nested_structure(scanner, left, right,
|
137
|
+
def process_nested_structure(scanner, left, right, &block)
|
138
138
|
buffer = ""
|
139
139
|
begin
|
140
140
|
if left == "[" && right == "]"
|
@@ -168,12 +168,11 @@ module Wp2txt
|
|
168
168
|
end
|
169
169
|
buffer << scanner.rest
|
170
170
|
|
171
|
-
|
172
|
-
if recur_count < 0 || buffer == scanner.string
|
171
|
+
if buffer == scanner.string
|
173
172
|
return buffer
|
174
173
|
else
|
175
174
|
scanner.string = buffer
|
176
|
-
return process_nested_structure(scanner, left, right,
|
175
|
+
return process_nested_structure(scanner, left, right, &block) || ""
|
177
176
|
end
|
178
177
|
rescue => e
|
179
178
|
return scanner.string
|
@@ -204,7 +203,7 @@ module Wp2txt
|
|
204
203
|
|
205
204
|
def process_interwiki_links!(str)
|
206
205
|
scanner = StringScanner.new(str)
|
207
|
-
result = process_nested_structure(scanner, "[[", "]]"
|
206
|
+
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
208
207
|
parts = contents.split("|")
|
209
208
|
case parts.size
|
210
209
|
when 1
|
@@ -219,7 +218,7 @@ module Wp2txt
|
|
219
218
|
|
220
219
|
def process_external_links!(str)
|
221
220
|
scanner = StringScanner.new(str)
|
222
|
-
result = process_nested_structure(scanner, "[", "]"
|
221
|
+
result = process_nested_structure(scanner, "[", "]") do |contents|
|
223
222
|
parts = contents.split(" ", 2)
|
224
223
|
case parts.size
|
225
224
|
when 1
|
@@ -235,7 +234,7 @@ module Wp2txt
|
|
235
234
|
|
236
235
|
def remove_templates!(str)
|
237
236
|
scanner = StringScanner.new(str)
|
238
|
-
result = process_nested_structure(scanner, "{{", "}}"
|
237
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
239
238
|
""
|
240
239
|
end
|
241
240
|
str.replace(result)
|
@@ -243,7 +242,7 @@ module Wp2txt
|
|
243
242
|
|
244
243
|
def remove_table!(str)
|
245
244
|
scanner = StringScanner.new(str)
|
246
|
-
result = process_nested_structure(scanner, "{|", "|}"
|
245
|
+
result = process_nested_structure(scanner, "{|", "|}") do |contents|
|
247
246
|
""
|
248
247
|
end
|
249
248
|
str.replace(result)
|
@@ -301,10 +300,11 @@ module Wp2txt
|
|
301
300
|
end
|
302
301
|
|
303
302
|
def make_reference!(str)
|
304
|
-
str.gsub!($make_reference_regex_a
|
305
|
-
str.gsub!($make_reference_regex_b
|
306
|
-
str.gsub!($make_reference_regex_c
|
307
|
-
str.gsub!($make_reference_regex_d
|
303
|
+
str.gsub!($make_reference_regex_a){"\n"}
|
304
|
+
str.gsub!($make_reference_regex_b){""}
|
305
|
+
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
|
+
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
+
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
308
|
end
|
309
309
|
|
310
310
|
def format_ref!(page)
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -6,8 +6,6 @@ require 'wp2txt'
|
|
6
6
|
require 'wp2txt/article'
|
7
7
|
require 'wp2txt/utils'
|
8
8
|
|
9
|
-
$limit_recur = 3
|
10
|
-
|
11
9
|
describe "Wp2txt" do
|
12
10
|
it "contains mediawiki-format related functions:" do
|
13
11
|
end
|
@@ -22,7 +20,7 @@ describe "Wp2txt" do
|
|
22
20
|
str_before = "[[ab[[cde[[alfa]]]]fg]]"
|
23
21
|
str_after = "<<ab<<cde<<alfa>>>>fg>>"
|
24
22
|
scanner = StringScanner.new(str_before)
|
25
|
-
str_processed = process_nested_structure(scanner, "[[", "]]"
|
23
|
+
str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
|
26
24
|
"<<" + content + ">>"
|
27
25
|
end
|
28
26
|
expect(str_processed).to eq str_after
|
@@ -32,7 +30,7 @@ describe "Wp2txt" do
|
|
32
30
|
str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
33
31
|
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
|
34
32
|
scanner = StringScanner.new(str_before)
|
35
|
-
str_processed = process_nested_structure(scanner, "{{", "}}"
|
33
|
+
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
36
34
|
"<<" + content + ">>"
|
37
35
|
end
|
38
36
|
#str_processed.should == str_after
|