wp2txt 0.7.8 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +29 -30
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt/article.rb +34 -4
- data/lib/wp2txt/utils.rb +50 -53
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +69 -75
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +2 -1
- metadata +25 -10
- data/error_log.txt +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3ed3d7e29a8f1c6b5f97ca0da646ddfb53ae88add38f647eae0bdc03e626269e
|
4
|
+
data.tar.gz: '009188addebcd908f449f2ce4cf39036406f3816cafeeb61beba097fe036e890'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d91531685df204222ab7bae9b3153653d61ccd36270e36f14575cabc3c2b1d6009bfa15f9033cb8eeb837f7c1a97fdb6303611166ec62ca96b9e4c8fc1e1ec15
|
7
|
+
data.tar.gz: 19183feee7eb8f7c03d3f7bf60eebb7e75ffeb6c6eec6967a8c3e480f82f2b48b6e171d2aa22c7aa44a9336b981ad51dfd37ab423c3db2fe1a0d854860c37231
|
data/README.md
CHANGED
@@ -2,12 +2,14 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
-
**
|
5
|
+
**IMPORTANT:** This project is still a work in progress and could be slow, unstable, and even destructive! It should be used with caution.
|
6
6
|
|
7
7
|
### About ###
|
8
8
|
|
9
9
|
WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML/compressed with Bzip2), stripping all the MediaWiki markup and other metadata. It was originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
|
10
10
|
|
11
|
+
**UPDATE:** Version 0.9.1 has added a new option `num-threads`, which improves the performance significantly. Note also that the `--category` option is enabled by default, resulting in an output format somewhat different from previous versions. Check out the new format using the test data in the `data/output_samples` folder before going on to convert a huge Wikipedia dump.
|
12
|
+
|
11
13
|
### Features ###
|
12
14
|
|
13
15
|
* Convert dump files of Wikipedia of various languages (I hope).
|
@@ -28,8 +30,6 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
28
30
|
|
29
31
|
Command line options are as follows:
|
30
32
|
|
31
|
-
**Important** Command line options in the current version have been drastically changed from previous versions.
|
32
|
-
|
33
33
|
Usage: wp2txt [options]
|
34
34
|
where [options] are:
|
35
35
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
@@ -41,15 +41,18 @@ Command line options are as follows:
|
|
41
41
|
--heading, --no-heading, -d: Show section titles in output (default: true)
|
42
42
|
--title, --no-title, -t: Show page titles in output (default: true)
|
43
43
|
--table, -a: Show table source code in output
|
44
|
-
|
44
|
+
--inline, -n: leave inline template notations unmodified
|
45
|
+
--multiline, -m: leave multiline template notations unmodified
|
45
46
|
--ref, -r: leave reference notations in the format
|
46
47
|
[ref]...[/ref]
|
47
|
-
|
48
|
-
--marker, --no-marker, -
|
48
|
+
--redirect, -e: Show redirect destination
|
49
|
+
--marker, --no-marker, -k: Show symbols prefixed to list items,
|
49
50
|
definitions, etc. (Default: true)
|
50
51
|
--category, -g: Show article category information
|
51
52
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
52
53
|
(default: 10)
|
54
|
+
-u, --num-threads=<i>: Number of threads to be spawned (capped to the number of CPU cores;
|
55
|
+
set 99 to spawn max num of threads) (default: 4)
|
53
56
|
--version, -v: Print version and exit
|
54
57
|
--help, -h: Show this message
|
55
58
|
|
@@ -70,6 +73,8 @@ Command line options are as follows:
|
|
70
73
|
|
71
74
|
### References ###
|
72
75
|
|
76
|
+
The author will appreciate your mentioning one of these in your research.
|
77
|
+
|
73
78
|
* Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
|
74
79
|
* 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
|
75
80
|
|
data/bin/benchmark.rb
CHANGED
@@ -12,15 +12,16 @@ require 'benchmark'
|
|
12
12
|
data_dir = File.join(File.dirname(__FILE__), '..', "data")
|
13
13
|
|
14
14
|
parent = Wp2txt::CmdProgbar.new
|
15
|
-
input_file = File.join(data_dir, "
|
15
|
+
input_file = File.join(data_dir, "testdata_ja.bz2")
|
16
16
|
output_dir = data_dir
|
17
17
|
tfile_size = 10
|
18
|
+
num_threads = 1
|
18
19
|
convert = true
|
19
20
|
strip_tmarker = true
|
20
21
|
|
21
22
|
Benchmark.bm do |x|
|
22
23
|
x.report do
|
23
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
24
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
|
24
25
|
wpconv.extract_text do |article|
|
25
26
|
format_wiki!(article.title)
|
26
27
|
title = "[[#{article.title}]]\n"
|
@@ -58,11 +59,11 @@ Benchmark.bm do |x|
|
|
58
59
|
end
|
59
60
|
contents << line
|
60
61
|
end
|
61
|
-
|
62
|
+
format_wiki!(contents)
|
62
63
|
convert_characters!(contents)
|
63
64
|
|
64
65
|
##### cleanup #####
|
65
|
-
if /\A\s*\z/m =~ contents
|
66
|
+
if /\A[\s ]*\z/m =~ contents
|
66
67
|
result = ""
|
67
68
|
else
|
68
69
|
result = title + "\n" + contents
|
data/bin/wp2txt
CHANGED
@@ -11,11 +11,11 @@ DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
|
11
11
|
require 'wp2txt'
|
12
12
|
require 'wp2txt/utils'
|
13
13
|
require 'wp2txt/version'
|
14
|
-
require '
|
14
|
+
require 'optimist'
|
15
15
|
|
16
16
|
include Wp2txt
|
17
17
|
|
18
|
-
opts =
|
18
|
+
opts = Optimist::options do
|
19
19
|
version Wp2txt::VERSION
|
20
20
|
banner <<-EOS
|
21
21
|
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
@@ -31,37 +31,39 @@ EOS
|
|
31
31
|
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
|
-
opt :
|
34
|
+
opt :inline, "leave inline template notations as they are", :default => false
|
35
|
+
opt :multiline, "leave multiline template notations as they are", :default => false
|
35
36
|
opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
|
36
37
|
opt :redirect, "Show redirect destination", :default => false
|
37
38
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
38
|
-
opt :category, "Show article category information", :default =>
|
39
|
+
opt :category, "Show article category information", :default => true
|
39
40
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
41
|
+
opt :num_threads, "Number of threads to be spawned (capped to the number of CPU cores; set 99 to spawn max num of threads)", :default => 4
|
40
42
|
end
|
41
|
-
|
42
|
-
|
43
|
+
Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
44
|
+
Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
45
|
|
44
46
|
input_file = ARGV[0]
|
45
47
|
output_dir = opts[:output_dir]
|
46
48
|
tfile_size = opts[:file_size]
|
49
|
+
num_threads = opts[:num_threads]
|
47
50
|
convert = opts[:convert]
|
48
51
|
strip_tmarker = opts[:marker] ? false : true
|
49
|
-
opt_array = [:title, :list, :heading, :table, :redirect]
|
50
|
-
$
|
51
|
-
$leave_table = true if opts[:table]
|
52
|
+
opt_array = [:title, :list, :heading, :table, :redirect, :multiline]
|
53
|
+
$leave_inline_template = true if opts[:inline]
|
52
54
|
$leave_ref = true if opts[:ref]
|
55
|
+
# $leave_table = true if opts[:table]
|
53
56
|
config = {}
|
54
57
|
opt_array.each do |opt|
|
55
58
|
config[opt] = opts[opt]
|
56
59
|
end
|
57
60
|
|
58
61
|
parent = Wp2txt::CmdProgbar.new
|
59
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
62
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
|
60
63
|
|
61
64
|
wpconv.extract_text do |article|
|
62
65
|
format_wiki!(article.title)
|
63
66
|
title = "[[#{article.title}]]\n"
|
64
|
-
convert_characters!(title)
|
65
67
|
|
66
68
|
if opts[:category] && !article.categories.empty?
|
67
69
|
contents = "\nCATEGORIES: "
|
@@ -79,13 +81,11 @@ wpconv.extract_text do |article|
|
|
79
81
|
line = e.last
|
80
82
|
line << "+HEADING+" if $DEBUG_MODE
|
81
83
|
when :mw_paragraph
|
82
|
-
# next if !config[:paragraph]
|
83
84
|
format_wiki!(e.last)
|
84
|
-
line = e.last
|
85
|
+
line = e.last + "\n"
|
85
86
|
line << "+PARAGRAPH+" if $DEBUG_MODE
|
86
87
|
when :mw_table, :mw_htable
|
87
88
|
next if !config[:table]
|
88
|
-
format_wiki!(e.last)
|
89
89
|
line = e.last
|
90
90
|
line << "+TABLE+" if $DEBUG_MODE
|
91
91
|
when :mw_pre
|
@@ -93,43 +93,42 @@ wpconv.extract_text do |article|
|
|
93
93
|
line = e.last
|
94
94
|
line << "+PRE+" if $DEBUG_MODE
|
95
95
|
when :mw_quote
|
96
|
-
# next if !config[:quote]
|
97
|
-
format_wiki!(e.last)
|
98
96
|
line = e.last
|
99
97
|
line << "+QUOTE+" if $DEBUG_MODE
|
100
98
|
when :mw_unordered, :mw_ordered, :mw_definition
|
101
99
|
next if !config[:list]
|
102
|
-
format_wiki!(e.last)
|
103
100
|
line = e.last
|
104
101
|
line << "+LIST+" if $DEBUG_MODE
|
102
|
+
when :mw_ml_template
|
103
|
+
next if !config[:multiline]
|
104
|
+
line = e.last
|
105
|
+
line << "+MLTEMPLATE+" if $DEBUG_MODE
|
105
106
|
when :mw_redirect
|
106
107
|
next if !config[:redirect]
|
107
|
-
format_wiki!(e.last)
|
108
108
|
line = e.last
|
109
109
|
line << "+REDIRECT+" if $DEBUG_MODE
|
110
110
|
line << "\n\n"
|
111
|
+
when :mw_isolated_template
|
112
|
+
next if !config[:multiline]
|
113
|
+
line = e.last
|
114
|
+
line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
|
115
|
+
when :mw_isolated_tag
|
116
|
+
next
|
111
117
|
else
|
112
118
|
if $DEBUG_MODE
|
113
|
-
format_wiki!(e.last)
|
119
|
+
# format_wiki!(e.last)
|
114
120
|
line = e.last
|
115
121
|
line << "+OTHER+"
|
116
122
|
else
|
117
123
|
next
|
118
124
|
end
|
119
125
|
end
|
120
|
-
contents << line
|
126
|
+
contents << line << "\n"
|
121
127
|
end
|
122
|
-
format_article!(contents)
|
123
|
-
convert_characters!(contents)
|
124
128
|
|
125
|
-
|
126
|
-
if /\A\s*\z/m =~ contents
|
129
|
+
if /\A[\s ]*\z/m =~ contents
|
127
130
|
result = ""
|
128
131
|
else
|
129
|
-
result = config[:title] ?
|
132
|
+
result = config[:title] ? "\n#{title}\n" << contents : contents
|
130
133
|
end
|
131
|
-
|
132
|
-
result.gsub!(/^[\s\W]+$/)
|
133
|
-
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
|
-
result << "\n"
|
135
|
-
end
|
134
|
+
end
|