wp2txt 0.7.8 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +29 -30
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt/article.rb +34 -4
- data/lib/wp2txt/utils.rb +50 -53
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +69 -75
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +2 -1
- metadata +25 -10
- data/error_log.txt +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3ed3d7e29a8f1c6b5f97ca0da646ddfb53ae88add38f647eae0bdc03e626269e
|
4
|
+
data.tar.gz: '009188addebcd908f449f2ce4cf39036406f3816cafeeb61beba097fe036e890'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d91531685df204222ab7bae9b3153653d61ccd36270e36f14575cabc3c2b1d6009bfa15f9033cb8eeb837f7c1a97fdb6303611166ec62ca96b9e4c8fc1e1ec15
|
7
|
+
data.tar.gz: 19183feee7eb8f7c03d3f7bf60eebb7e75ffeb6c6eec6967a8c3e480f82f2b48b6e171d2aa22c7aa44a9336b981ad51dfd37ab423c3db2fe1a0d854860c37231
|
data/README.md
CHANGED
@@ -2,12 +2,14 @@
|
|
2
2
|
|
3
3
|
Wikipedia dump file to text converter
|
4
4
|
|
5
|
-
**
|
5
|
+
**IMPORTANT:** This project is still a work in progress, and it could be slow, unstable, and even destructive! It should be used with caution.
|
6
6
|
|
7
7
|
### About ###
|
8
8
|
|
9
9
|
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata. It is originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
|
10
10
|
|
11
|
+
**UPDATE:** Version 0.9.1 has added a new option `num-threads`, which improves the performance significantly. Note also that the `--category` option is enabled by default, resulting in an output format somewhat different from previous versions. Check out the new format using the test data in the `data/output_samples` folder before going on to convert a huge Wikipedia dump.
|
12
|
+
|
11
13
|
### Features ###
|
12
14
|
|
13
15
|
* Convert dump files of Wikipedia of various languages (I hope).
|
@@ -28,8 +30,6 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
|
|
28
30
|
|
29
31
|
Command line options are as follows:
|
30
32
|
|
31
|
-
**Important** Command line options in the current version have been drastically changed from previous versions.
|
32
|
-
|
33
33
|
Usage: wp2txt [options]
|
34
34
|
where [options] are:
|
35
35
|
--input-file, -i: Wikipedia dump file with .bz2 (compressed) or
|
@@ -41,15 +41,18 @@ Command line options are as follows:
|
|
41
41
|
--heading, --no-heading, -d: Show section titles in output (default: true)
|
42
42
|
--title, --no-title, -t: Show page titles in output (default: true)
|
43
43
|
--table, -a: Show table source code in output
|
44
|
-
|
44
|
+
--inline, -n: leave inline template notations unmodified
|
45
|
+
--multiline, -m: leave multiline template notations unmodified
|
45
46
|
--ref, -r: leave reference notations in the format
|
46
47
|
[ref]...[/ref]
|
47
|
-
|
48
|
-
--marker, --no-marker, -
|
48
|
+
--redirect, -e: Show redirect destination
|
49
|
+
--marker, --no-marker, -k: Show symbols prefixed to list items,
|
49
50
|
definitions, etc. (Default: true)
|
50
51
|
--category, -g: Show article category information
|
51
52
|
--file-size, -f <i>: Approximate size (in MB) of each output file
|
52
53
|
(default: 10)
|
54
|
+
-u, --num-threads=<i>: Number of threads to be spawned (capped to the number of CPU cores;
|
55
|
+
set 99 to spawn max num of threads) (default: 4)
|
53
56
|
--version, -v: Print version and exit
|
54
57
|
--help, -h: Show this message
|
55
58
|
|
@@ -70,6 +73,8 @@ Command line options are as follows:
|
|
70
73
|
|
71
74
|
### References ###
|
72
75
|
|
76
|
+
The author will appreciate your mentioning one of these in your research.
|
77
|
+
|
73
78
|
* Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
|
74
79
|
* 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
|
75
80
|
|
data/bin/benchmark.rb
CHANGED
@@ -12,15 +12,16 @@ require 'benchmark'
|
|
12
12
|
data_dir = File.join(File.dirname(__FILE__), '..', "data")
|
13
13
|
|
14
14
|
parent = Wp2txt::CmdProgbar.new
|
15
|
-
input_file = File.join(data_dir, "
|
15
|
+
input_file = File.join(data_dir, "testdata_ja.bz2")
|
16
16
|
output_dir = data_dir
|
17
17
|
tfile_size = 10
|
18
|
+
num_threads = 1
|
18
19
|
convert = true
|
19
20
|
strip_tmarker = true
|
20
21
|
|
21
22
|
Benchmark.bm do |x|
|
22
23
|
x.report do
|
23
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
24
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
|
24
25
|
wpconv.extract_text do |article|
|
25
26
|
format_wiki!(article.title)
|
26
27
|
title = "[[#{article.title}]]\n"
|
@@ -58,11 +59,11 @@ Benchmark.bm do |x|
|
|
58
59
|
end
|
59
60
|
contents << line
|
60
61
|
end
|
61
|
-
|
62
|
+
format_wiki!(contents)
|
62
63
|
convert_characters!(contents)
|
63
64
|
|
64
65
|
##### cleanup #####
|
65
|
-
if /\A\s*\z/m =~ contents
|
66
|
+
if /\A[\s ]*\z/m =~ contents
|
66
67
|
result = ""
|
67
68
|
else
|
68
69
|
result = title + "\n" + contents
|
data/bin/wp2txt
CHANGED
@@ -11,11 +11,11 @@ DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
|
11
11
|
require 'wp2txt'
|
12
12
|
require 'wp2txt/utils'
|
13
13
|
require 'wp2txt/version'
|
14
|
-
require '
|
14
|
+
require 'optimist'
|
15
15
|
|
16
16
|
include Wp2txt
|
17
17
|
|
18
|
-
opts =
|
18
|
+
opts = Optimist::options do
|
19
19
|
version Wp2txt::VERSION
|
20
20
|
banner <<-EOS
|
21
21
|
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
@@ -31,37 +31,39 @@ EOS
|
|
31
31
|
opt :heading, "Show section titles in output", :default => true, :short => "-d"
|
32
32
|
opt :title, "Show page titles in output", :default => true
|
33
33
|
opt :table, "Show table source code in output", :default => false
|
34
|
-
opt :
|
34
|
+
opt :inline, "leave inline template notations as they are", :default => false
|
35
|
+
opt :multiline, "leave multiline template notations as they are", :default => false
|
35
36
|
opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
|
36
37
|
opt :redirect, "Show redirect destination", :default => false
|
37
38
|
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
|
38
|
-
opt :category, "Show article category information", :default =>
|
39
|
+
opt :category, "Show article category information", :default => true
|
39
40
|
opt :file_size, "Approximate size (in MB) of each output file", :default => 10
|
41
|
+
opt :num_threads, "Number of threads to be spawned (capped to the number of CPU cores; set 99 to spawn max num of threads)", :default => 4
|
40
42
|
end
|
41
|
-
|
42
|
-
|
43
|
+
Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
44
|
+
Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
43
45
|
|
44
46
|
input_file = ARGV[0]
|
45
47
|
output_dir = opts[:output_dir]
|
46
48
|
tfile_size = opts[:file_size]
|
49
|
+
num_threads = opts[:num_threads]
|
47
50
|
convert = opts[:convert]
|
48
51
|
strip_tmarker = opts[:marker] ? false : true
|
49
|
-
opt_array = [:title, :list, :heading, :table, :redirect]
|
50
|
-
$
|
51
|
-
$leave_table = true if opts[:table]
|
52
|
+
opt_array = [:title, :list, :heading, :table, :redirect, :multiline]
|
53
|
+
$leave_inline_template = true if opts[:inline]
|
52
54
|
$leave_ref = true if opts[:ref]
|
55
|
+
# $leave_table = true if opts[:table]
|
53
56
|
config = {}
|
54
57
|
opt_array.each do |opt|
|
55
58
|
config[opt] = opts[opt]
|
56
59
|
end
|
57
60
|
|
58
61
|
parent = Wp2txt::CmdProgbar.new
|
59
|
-
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
62
|
+
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
|
60
63
|
|
61
64
|
wpconv.extract_text do |article|
|
62
65
|
format_wiki!(article.title)
|
63
66
|
title = "[[#{article.title}]]\n"
|
64
|
-
convert_characters!(title)
|
65
67
|
|
66
68
|
if opts[:category] && !article.categories.empty?
|
67
69
|
contents = "\nCATEGORIES: "
|
@@ -79,13 +81,11 @@ wpconv.extract_text do |article|
|
|
79
81
|
line = e.last
|
80
82
|
line << "+HEADING+" if $DEBUG_MODE
|
81
83
|
when :mw_paragraph
|
82
|
-
# next if !config[:paragraph]
|
83
84
|
format_wiki!(e.last)
|
84
|
-
line = e.last
|
85
|
+
line = e.last + "\n"
|
85
86
|
line << "+PARAGRAPH+" if $DEBUG_MODE
|
86
87
|
when :mw_table, :mw_htable
|
87
88
|
next if !config[:table]
|
88
|
-
format_wiki!(e.last)
|
89
89
|
line = e.last
|
90
90
|
line << "+TABLE+" if $DEBUG_MODE
|
91
91
|
when :mw_pre
|
@@ -93,43 +93,42 @@ wpconv.extract_text do |article|
|
|
93
93
|
line = e.last
|
94
94
|
line << "+PRE+" if $DEBUG_MODE
|
95
95
|
when :mw_quote
|
96
|
-
# next if !config[:quote]
|
97
|
-
format_wiki!(e.last)
|
98
96
|
line = e.last
|
99
97
|
line << "+QUOTE+" if $DEBUG_MODE
|
100
98
|
when :mw_unordered, :mw_ordered, :mw_definition
|
101
99
|
next if !config[:list]
|
102
|
-
format_wiki!(e.last)
|
103
100
|
line = e.last
|
104
101
|
line << "+LIST+" if $DEBUG_MODE
|
102
|
+
when :mw_ml_template
|
103
|
+
next if !config[:multiline]
|
104
|
+
line = e.last
|
105
|
+
line << "+MLTEMPLATE+" if $DEBUG_MODE
|
105
106
|
when :mw_redirect
|
106
107
|
next if !config[:redirect]
|
107
|
-
format_wiki!(e.last)
|
108
108
|
line = e.last
|
109
109
|
line << "+REDIRECT+" if $DEBUG_MODE
|
110
110
|
line << "\n\n"
|
111
|
+
when :mw_isolated_template
|
112
|
+
next if !config[:multiline]
|
113
|
+
line = e.last
|
114
|
+
line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
|
115
|
+
when :mw_isolated_tag
|
116
|
+
next
|
111
117
|
else
|
112
118
|
if $DEBUG_MODE
|
113
|
-
format_wiki!(e.last)
|
119
|
+
# format_wiki!(e.last)
|
114
120
|
line = e.last
|
115
121
|
line << "+OTHER+"
|
116
122
|
else
|
117
123
|
next
|
118
124
|
end
|
119
125
|
end
|
120
|
-
contents << line
|
126
|
+
contents << line << "\n"
|
121
127
|
end
|
122
|
-
format_article!(contents)
|
123
|
-
convert_characters!(contents)
|
124
128
|
|
125
|
-
|
126
|
-
if /\A\s*\z/m =~ contents
|
129
|
+
if /\A[\s ]*\z/m =~ contents
|
127
130
|
result = ""
|
128
131
|
else
|
129
|
-
result = config[:title] ?
|
132
|
+
result = config[:title] ? "\n#{title}\n" << contents : contents
|
130
133
|
end
|
131
|
-
|
132
|
-
result.gsub!(/^[\s\W]+$/)
|
133
|
-
result.gsub!(/\n\n\n+/m){"\n\n"}
|
134
|
-
result << "\n"
|
135
|
-
end
|
134
|
+
end
|