wp2txt 0.7.7 → 0.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -6
- data/bin/wp2txt +3 -4
- data/lib/wp2txt.rb +1 -4
- data/lib/wp2txt/utils.rb +12 -12
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +2 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
|
4
|
+
data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
|
7
|
+
data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
|
data/README.md
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
+
**Important:** This project is a *work in progress*; it could be slow, unstable, and even destructive! Please use it with caution.
|
6
|
+
|
5
7
|
### About ###
|
6
8
|
|
7
9
|
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata. It is originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
|
@@ -26,14 +28,13 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
26
28
|
|
27
29
|
Command line options are as follows:
|
28
30
|
|
29
|
-
|
31
|
+
**Important:** Command line options in the current version have changed drastically from previous versions.
|
30
32
|
|
31
33
|
Usage: wp2txt [options]
|
32
34
|
where [options] are:
|
33
35
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
34
36
|
.txt (uncompressed) format
|
35
|
-
--output-dir, -o <s>: Output directory (default:
|
36
|
-
/Users/yohasebe/Dropbox/code/wp2txt)
|
37
|
+
--output-dir, -o <s>: Output directory (default: current directory)
|
37
38
|
--convert, --no-convert, -c: Output in plain text (converting from XML)
|
38
39
|
(default: true)
|
39
40
|
--list, --no-list, -l: Show list items in output (default: true)
|
@@ -41,14 +42,14 @@ Command line options are as follows:
|
|
41
42
|
--title, --no-title, -t: Show page titles in output (default: true)
|
42
43
|
--table, -a: Show table source code in output
|
43
44
|
--template, -e: leave inline template notations unmodified
|
44
|
-
|
45
|
+
--ref, -r: leave reference notations in the format
|
46
|
+
[ref]...[/ref]
|
47
|
+
--redirect: Show redirect destination
|
45
48
|
--marker, --no-marker, -m: Show symbols prefixed to list items,
|
46
49
|
definitions, etc. (Default: true)
|
47
50
|
--category, -g: Show article category information
|
48
51
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
49
52
|
(default: 10)
|
50
|
-
--limit-recur, -u <i>: Max number of recursive call (0 to 10)
|
51
|
-
(default: 10)
|
52
53
|
--version, -v: Print version and exit
|
53
54
|
--help, -h: Show this message
|
54
55
|
|
data/bin/wp2txt
CHANGED
@@ -32,32 +32,31 @@ EOS
|
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
34
|
opt :template, "leave inline template notations unmodified", :default => false
|
35
|
+
opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
|
35
36
|
opt :redirect, "Show redirect destination", :default => false
|
36
37
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
37
38
|
opt :category, "Show article category information", :default => false
|
38
39
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
39
|
-
opt :limit_recur, "Max number of recursive call (0 to 10)", :default => 10
|
40
40
|
end
|
41
41
|
Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
42
42
|
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
|
-
Trollop::die :limit_recur, "must be 10 or smaller" if opts[:limit_recur] > 10
|
44
43
|
|
45
44
|
input_file = ARGV[0]
|
46
45
|
output_dir = opts[:output_dir]
|
47
46
|
tfile_size = opts[:file_size]
|
48
|
-
limit_recur = opts[:limit_recur]
|
49
47
|
convert = opts[:convert]
|
50
48
|
strip_tmarker = opts[:marker] ? false : true
|
51
49
|
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
50
|
$leave_template = true if opts[:template]
|
53
51
|
$leave_table = true if opts[:table]
|
52
|
+
$leave_ref = true if opts[:ref]
|
54
53
|
config = {}
|
55
54
|
opt_array.each do |opt|
|
56
55
|
config[opt] = opts[opt]
|
57
56
|
end
|
58
57
|
|
59
58
|
parent = Wp2txt::CmdProgbar.new
|
60
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker
|
59
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
61
60
|
|
62
61
|
wpconv.extract_text do |article|
|
63
62
|
format_wiki!(article.title)
|
data/lib/wp2txt.rb
CHANGED
@@ -29,7 +29,7 @@ module Wp2txt
|
|
29
29
|
|
30
30
|
include Wp2txt
|
31
31
|
|
32
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false
|
32
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
33
33
|
@parent = parent
|
34
34
|
@fp = nil
|
35
35
|
|
@@ -38,9 +38,6 @@ module Wp2txt
|
|
38
38
|
@tfile_size = tfile_size
|
39
39
|
@convert = convert
|
40
40
|
@strip_tmarker = strip_tmarker
|
41
|
-
|
42
|
-
#max number of recursive calls (global variable)
|
43
|
-
$limit_recur = limit_recur
|
44
41
|
end
|
45
42
|
|
46
43
|
def file_size(file)
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -134,7 +134,7 @@ module Wp2txt
|
|
134
134
|
|
135
135
|
#################### parser for nested structure ####################
|
136
136
|
|
137
|
-
def process_nested_structure(scanner, left, right,
|
137
|
+
def process_nested_structure(scanner, left, right, &block)
|
138
138
|
buffer = ""
|
139
139
|
begin
|
140
140
|
if left == "[" && right == "]"
|
@@ -168,12 +168,11 @@ module Wp2txt
|
|
168
168
|
end
|
169
169
|
buffer << scanner.rest
|
170
170
|
|
171
|
-
|
172
|
-
if recur_count < 0 || buffer == scanner.string
|
171
|
+
if buffer == scanner.string
|
173
172
|
return buffer
|
174
173
|
else
|
175
174
|
scanner.string = buffer
|
176
|
-
return process_nested_structure(scanner, left, right,
|
175
|
+
return process_nested_structure(scanner, left, right, &block) || ""
|
177
176
|
end
|
178
177
|
rescue => e
|
179
178
|
return scanner.string
|
@@ -204,7 +203,7 @@ module Wp2txt
|
|
204
203
|
|
205
204
|
def process_interwiki_links!(str)
|
206
205
|
scanner = StringScanner.new(str)
|
207
|
-
result = process_nested_structure(scanner, "[[", "]]"
|
206
|
+
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
208
207
|
parts = contents.split("|")
|
209
208
|
case parts.size
|
210
209
|
when 1
|
@@ -219,7 +218,7 @@ module Wp2txt
|
|
219
218
|
|
220
219
|
def process_external_links!(str)
|
221
220
|
scanner = StringScanner.new(str)
|
222
|
-
result = process_nested_structure(scanner, "[", "]"
|
221
|
+
result = process_nested_structure(scanner, "[", "]") do |contents|
|
223
222
|
parts = contents.split(" ", 2)
|
224
223
|
case parts.size
|
225
224
|
when 1
|
@@ -235,7 +234,7 @@ module Wp2txt
|
|
235
234
|
|
236
235
|
def remove_templates!(str)
|
237
236
|
scanner = StringScanner.new(str)
|
238
|
-
result = process_nested_structure(scanner, "{{", "}}"
|
237
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
239
238
|
""
|
240
239
|
end
|
241
240
|
str.replace(result)
|
@@ -243,7 +242,7 @@ module Wp2txt
|
|
243
242
|
|
244
243
|
def remove_table!(str)
|
245
244
|
scanner = StringScanner.new(str)
|
246
|
-
result = process_nested_structure(scanner, "{|", "|}"
|
245
|
+
result = process_nested_structure(scanner, "{|", "|}") do |contents|
|
247
246
|
""
|
248
247
|
end
|
249
248
|
str.replace(result)
|
@@ -301,10 +300,11 @@ module Wp2txt
|
|
301
300
|
end
|
302
301
|
|
303
302
|
def make_reference!(str)
|
304
|
-
str.gsub!($make_reference_regex_a
|
305
|
-
str.gsub!($make_reference_regex_b
|
306
|
-
str.gsub!($make_reference_regex_c
|
307
|
-
str.gsub!($make_reference_regex_d
|
303
|
+
str.gsub!($make_reference_regex_a){"\n"}
|
304
|
+
str.gsub!($make_reference_regex_b){""}
|
305
|
+
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
|
+
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
+
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
308
|
end
|
309
309
|
|
310
310
|
def format_ref!(page)
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -6,8 +6,6 @@ require 'wp2txt'
|
|
6
6
|
require 'wp2txt/article'
|
7
7
|
require 'wp2txt/utils'
|
8
8
|
|
9
|
-
$limit_recur = 3
|
10
|
-
|
11
9
|
describe "Wp2txt" do
|
12
10
|
it "contains mediawiki-format related functions:" do
|
13
11
|
end
|
@@ -22,7 +20,7 @@ describe "Wp2txt" do
|
|
22
20
|
str_before = "[[ab[[cde[[alfa]]]]fg]]"
|
23
21
|
str_after = "<<ab<<cde<<alfa>>>>fg>>"
|
24
22
|
scanner = StringScanner.new(str_before)
|
25
|
-
str_processed = process_nested_structure(scanner, "[[", "]]"
|
23
|
+
str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
|
26
24
|
"<<" + content + ">>"
|
27
25
|
end
|
28
26
|
expect(str_processed).to eq str_after
|
@@ -32,7 +30,7 @@ describe "Wp2txt" do
|
|
32
30
|
str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
33
31
|
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
|
34
32
|
scanner = StringScanner.new(str_before)
|
35
|
-
str_processed = process_nested_structure(scanner, "{{", "}}"
|
33
|
+
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
36
34
|
"<<" + content + ">>"
|
37
35
|
end
|
38
36
|
#str_processed.should == str_after
|