wp2txt 0.7.8 → 0.9.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: ee8448d2dc341c9f26a613522c0b9a225b62a7df
4
- data.tar.gz: 036aa5184a19b4351c65af605f2ebc23b9e73398
2
+ SHA256:
3
+ metadata.gz: 3ed3d7e29a8f1c6b5f97ca0da646ddfb53ae88add38f647eae0bdc03e626269e
4
+ data.tar.gz: '009188addebcd908f449f2ce4cf39036406f3816cafeeb61beba097fe036e890'
5
5
  SHA512:
6
- metadata.gz: 05dd0bd2462bc72f030c0bd03233e359d1febdb4b30ad1309f4baf35ab6241684d164269ae1bae527163da787188d915ccb7ab460d83cd83732fbf9627d7ada1
7
- data.tar.gz: 2bc83d1854656a4b3a83e6a2e1b9cfe86c86163d27a64582f994fc997b8104e4ab28d8d28881c054e323fd69934c53b63909cd7458a8d2ed0243c95702f8a14e
6
+ metadata.gz: d91531685df204222ab7bae9b3153653d61ccd36270e36f14575cabc3c2b1d6009bfa15f9033cb8eeb837f7c1a97fdb6303611166ec62ca96b9e4c8fc1e1ec15
7
+ data.tar.gz: 19183feee7eb8f7c03d3f7bf60eebb7e75ffeb6c6eec6967a8c3e480f82f2b48b6e171d2aa22c7aa44a9336b981ad51dfd37ab423c3db2fe1a0d854860c37231
data/README.md CHANGED
@@ -2,12 +2,14 @@
2
2
 
3
3
  Wikipedia dump file to text converter
4
4
 
5
- **Important** This is a project *work in progress* and it could be slow, unstable, and even destructive! Please use it with caution
5
+ **IMPORTANT:** This project is still a work in progress and could be slow, unstable, and even destructive! It should be used with caution.
6
6
 
7
7
  ### About ###
8
8
 
9
9
  WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata. It is originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but may be handy for other purposes.
10
10
 
11
+ **UPDATE:** Version 0.9.1 has added a new option, `num-threads`, which improves performance significantly. Note also that the `--category` option is now enabled by default, resulting in an output format somewhat different from previous versions. Check out the new format using the test data in the `data/output_samples` folder before going on to convert a huge Wikipedia dump.
12
+
11
13
  ### Features ###
12
14
 
13
15
  * Convert dump files of Wikipedia of various languages (I hope).
@@ -28,8 +30,6 @@ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyy
28
30
 
29
31
  Command line options are as follows:
30
32
 
31
- **Important** Command line options in the current version have been drastically changed from previous versions.
32
-
33
33
  Usage: wp2txt [options]
34
34
  where [options] are:
35
35
  --input-file, -i: Wikipedia dump file with .bz2 (compressed) or
@@ -41,15 +41,18 @@ Command line options are as follows:
41
41
  --heading, --no-heading, -d: Show section titles in output (default: true)
42
42
  --title, --no-title, -t: Show page titles in output (default: true)
43
43
  --table, -a: Show table source code in output
44
- --template, -e: leave inline template notations unmodified
44
+ --inline, -n: leave inline template notations unmodified
45
+ --multiline, -m: leave multiline template notations unmodified
45
46
  --ref, -r: leave reference notations in the format
46
47
  [ref]...[/ref]
47
- --redirect: Show redirect destination
48
- --marker, --no-marker, -m: Show symbols prefixed to list items,
48
+ --redirect, -e: Show redirect destination
49
+ --marker, --no-marker, -k: Show symbols prefixed to list items,
49
50
  definitions, etc. (Default: true)
50
51
  --category, -g: Show article category information
51
52
  --file-size, -f <i>: Approximate size (in MB) of each output file
52
53
  (default: 10)
54
+ -u, --num-threads=<i>: Number of threads to be spawned (capped to the number of CPU cores;
55
+ set 99 to spawn max num of threads) (default: 4)
53
56
  --version, -v: Print version and exit
54
57
  --help, -h: Show this message
55
58
 
@@ -70,6 +73,8 @@ Command line options are as follows:
70
73
 
71
74
  ### References ###
72
75
 
76
+ The author will appreciate your mentioning one of these in your research.
77
+
73
78
  * Yoichiro HASEBE. 2006. [Method for using Wikipedia as Japanese corpus.](http://ci.nii.ac.jp/naid/110006226727) _Doshisha Studies in Language and Culture_ 9(2), 373-403.
74
79
  * 長谷部陽一郎. 2006. [Wikipedia日本語版をコーパスとして用いた言語研究の手法](http://ci.nii.ac.jp/naid/110006226727). 『言語文化』9(2), 373-403.
75
80
 
data/bin/benchmark.rb CHANGED
@@ -12,15 +12,16 @@ require 'benchmark'
12
12
  data_dir = File.join(File.dirname(__FILE__), '..', "data")
13
13
 
14
14
  parent = Wp2txt::CmdProgbar.new
15
- input_file = File.join(data_dir, "testdata.bz2")
15
+ input_file = File.join(data_dir, "testdata_ja.bz2")
16
16
  output_dir = data_dir
17
17
  tfile_size = 10
18
+ num_threads = 1
18
19
  convert = true
19
20
  strip_tmarker = true
20
21
 
21
22
  Benchmark.bm do |x|
22
23
  x.report do
23
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
24
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
24
25
  wpconv.extract_text do |article|
25
26
  format_wiki!(article.title)
26
27
  title = "[[#{article.title}]]\n"
@@ -58,11 +59,11 @@ Benchmark.bm do |x|
58
59
  end
59
60
  contents << line
60
61
  end
61
- format_article!(contents)
62
+ format_wiki!(contents)
62
63
  convert_characters!(contents)
63
64
 
64
65
  ##### cleanup #####
65
- if /\A\s*\z/m =~ contents
66
+ if /\A[\s ]*\z/m =~ contents
66
67
  result = ""
67
68
  else
68
69
  result = title + "\n" + contents
data/bin/wp2txt CHANGED
@@ -11,11 +11,11 @@ DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
11
11
  require 'wp2txt'
12
12
  require 'wp2txt/utils'
13
13
  require 'wp2txt/version'
14
- require 'trollop'
14
+ require 'optimist'
15
15
 
16
16
  include Wp2txt
17
17
 
18
- opts = Trollop::options do
18
+ opts = Optimist::options do
19
19
  version Wp2txt::VERSION
20
20
  banner <<-EOS
21
21
  WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
@@ -31,37 +31,39 @@ EOS
31
31
  opt :heading, "Show section titles in output", :default => true, :short => "-d"
32
32
  opt :title, "Show page titles in output", :default => true
33
33
  opt :table, "Show table source code in output", :default => false
34
- opt :template, "leave inline template notations unmodified", :default => false
34
+ opt :inline, "leave inline template notations as they are", :default => false
35
+ opt :multiline, "leave multiline template notations as they are", :default => false
35
36
  opt :ref, "leave reference notations in the format [ref]...[/ref]", :default => false
36
37
  opt :redirect, "Show redirect destination", :default => false
37
38
  opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true
38
- opt :category, "Show article category information", :default => false
39
+ opt :category, "Show article category information", :default => true
39
40
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
41
+ opt :num_threads, "Number of threads to be spawned (capped to the number of CPU cores; set 99 to spawn max num of threads)", :default => 4
40
42
  end
41
- Trollop::die :size, "must be larger than 0" unless opts[:file_size] >= 0
42
- Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
43
+ Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
44
+ Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
43
45
 
44
46
  input_file = ARGV[0]
45
47
  output_dir = opts[:output_dir]
46
48
  tfile_size = opts[:file_size]
49
+ num_threads = opts[:num_threads]
47
50
  convert = opts[:convert]
48
51
  strip_tmarker = opts[:marker] ? false : true
49
- opt_array = [:title, :list, :heading, :table, :redirect]
50
- $leave_template = true if opts[:template]
51
- $leave_table = true if opts[:table]
52
+ opt_array = [:title, :list, :heading, :table, :redirect, :multiline]
53
+ $leave_inline_template = true if opts[:inline]
52
54
  $leave_ref = true if opts[:ref]
55
+ # $leave_table = true if opts[:table]
53
56
  config = {}
54
57
  opt_array.each do |opt|
55
58
  config[opt] = opts[opt]
56
59
  end
57
60
 
58
61
  parent = Wp2txt::CmdProgbar.new
59
- wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
62
+ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, num_threads, convert, strip_tmarker)
60
63
 
61
64
  wpconv.extract_text do |article|
62
65
  format_wiki!(article.title)
63
66
  title = "[[#{article.title}]]\n"
64
- convert_characters!(title)
65
67
 
66
68
  if opts[:category] && !article.categories.empty?
67
69
  contents = "\nCATEGORIES: "
@@ -79,13 +81,11 @@ wpconv.extract_text do |article|
79
81
  line = e.last
80
82
  line << "+HEADING+" if $DEBUG_MODE
81
83
  when :mw_paragraph
82
- # next if !config[:paragraph]
83
84
  format_wiki!(e.last)
84
- line = e.last
85
+ line = e.last + "\n"
85
86
  line << "+PARAGRAPH+" if $DEBUG_MODE
86
87
  when :mw_table, :mw_htable
87
88
  next if !config[:table]
88
- format_wiki!(e.last)
89
89
  line = e.last
90
90
  line << "+TABLE+" if $DEBUG_MODE
91
91
  when :mw_pre
@@ -93,43 +93,42 @@ wpconv.extract_text do |article|
93
93
  line = e.last
94
94
  line << "+PRE+" if $DEBUG_MODE
95
95
  when :mw_quote
96
- # next if !config[:quote]
97
- format_wiki!(e.last)
98
96
  line = e.last
99
97
  line << "+QUOTE+" if $DEBUG_MODE
100
98
  when :mw_unordered, :mw_ordered, :mw_definition
101
99
  next if !config[:list]
102
- format_wiki!(e.last)
103
100
  line = e.last
104
101
  line << "+LIST+" if $DEBUG_MODE
102
+ when :mw_ml_template
103
+ next if !config[:multiline]
104
+ line = e.last
105
+ line << "+MLTEMPLATE+" if $DEBUG_MODE
105
106
  when :mw_redirect
106
107
  next if !config[:redirect]
107
- format_wiki!(e.last)
108
108
  line = e.last
109
109
  line << "+REDIRECT+" if $DEBUG_MODE
110
110
  line << "\n\n"
111
+ when :mw_isolated_template
112
+ next if !config[:multiline]
113
+ line = e.last
114
+ line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
115
+ when :mw_isolated_tag
116
+ next
111
117
  else
112
118
  if $DEBUG_MODE
113
- format_wiki!(e.last)
119
+ # format_wiki!(e.last)
114
120
  line = e.last
115
121
  line << "+OTHER+"
116
122
  else
117
123
  next
118
124
  end
119
125
  end
120
- contents << line
126
+ contents << line << "\n"
121
127
  end
122
- format_article!(contents)
123
- convert_characters!(contents)
124
128
 
125
- ##### cleanup #####
126
- if /\A\s*\z/m =~ contents
129
+ if /\A[\s ]*\z/m =~ contents
127
130
  result = ""
128
131
  else
129
- result = config[:title] ? title + "\n" << contents : contents
132
+ result = config[:title] ? "\n#{title}\n" << contents : contents
130
133
  end
131
- result.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
132
- result.gsub!(/^[\s\W]+$/)
133
- result.gsub!(/\n\n\n+/m){"\n\n"}
134
- result << "\n"
135
- end
134
+ end