wp2txt 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
19
+ *.bak
20
+ *.~
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS rather than plain HTTP so that downloads are
# encrypted and the server is authenticated.
source "https://rubygems.org"

# Specify your gem's dependencies in wp2txt.gemspec
gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Yoichiro Hasebe
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # WP2TXT
2
+
3
+ Wikipedia dump file to text converter
4
+
5
+ ### About ###
6
+
7
+ WP2TXT extracts plain text data from a Wikipedia dump file (encoded in XML/compressed with Bzip2), stripping all the MediaWiki markup and other metadata. It was originally intended to be useful for researchers who look for an easy way to obtain open-source multi-lingual corpora, but it may be handy for other purposes.
8
+
9
+ ### Features ###
10
+
11
+ * Convert dump files of Wikipedia of multiple languages (I hope).
12
+ * Create output files of specified size.
13
+ * Allow users to specify text elements to be extracted/converted (page titles, section titles, lists, and tables).
14
+
15
+ WP2TXT before version 0.4.0 came with a Mac/Windows GUI. Now it has become a pure command-line application--sorry, GUI folks, but there seems to be more demand for an easy-to-hack CUI package than for a not-very-flexible GUI app.
16
+
17
+ ### Installation
18
+
19
+ `gem install` method will become available soon. In the meantime, use the source code on Github.
20
+
21
+ $ gem install wp2txt
22
+
23
+ ### Usage
24
+
25
+ Obtain a Wikipedia dump file (see the link below) with a file name such as:
26
+
27
+ xxwiki-yyyymmdd-pages-articles.xml.bz2
28
+
29
+ where `xx` is language code such as "en (English)" or "ja (Japanese)", and `yyyymmdd` is the date of creation (e.g. 20120601).
30
+
31
+ Command line options are as follows:
32
+
33
+ Usage: wp2txt [options]
34
+ where [options] are:
35
+ --input-file, -i: Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format
36
+ --output-dir, -o <s>: Output directory (default: current directory)
37
+ --convert-off, -c: Output XML (without converting to plain text)
38
+ --list-off, -l: Exclude list items from output
39
+ --heading-off, -d: Exclude section titles from output
40
+ --title-off, -t: Exclude page titles from output
41
+ --table-off, -a: Exclude tables from output (default: true)
42
+ --template-off, -e: Remove multi-line template notations from output
43
+ --strip-marker, -s: Remove symbols prefixed to list items, definitions, etc.
44
+ --file-size, -f <i>: Approximate size (in MB) of each output file (default: 10)
45
+ --version, -v: Print version and exit
46
+ --help, -h: Show this message
47
+
48
+ ### Limitations ###
49
+
50
+ * Certain types of data such as mathematical equations and computer source code are not properly converted. Please remember this software is originally intended for collecting “sentences” for linguistic studies.
51
+ * Extraction of normal text data could sometimes fail for various reasons (e.g. illegal matching of begin/end tags, language-specific conventions of formatting, etc).
52
+ * The conversion process can take far more time than you would expect. It could take several hours or more when dealing with a huge data set such as the English Wikipedia in a low-spec environment.
53
+ * Because of the nature of the task, WP2TXT needs much machine power and consumes a lot of memory/storage resources. The process thus could halt unexpectedly. It may even get stuck, in the worst case, without getting gracefully terminated. Please understand this and use the software __at your own risk__.
54
+
55
+ ### Useful Link ###
56
+
57
+ * [Wikipedia Database backup dumps](http://dumps.wikimedia.org/backup-index.html)
58
+
59
+ ### Author ###
60
+
61
+ * Yoichiro Hasebe (<yohasebe@gmail.com>)
62
+
63
+ ### License ###
64
+
65
+ This software is distributed under the MIT License. Please see the LICENSE file.
@@ -0,0 +1,9 @@
1
# Rake tasks for the gem: the tasks contributed by bundler/gem_tasks plus
# a task that runs the RSpec suite.
require "bundler/gem_tasks"
require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs every file matching spec/**/*_spec.rb.
RSpec::Core::RakeTask.new(:spec) { |t| t.pattern = FileList['spec/**/*_spec.rb'] }

# Plain `rake` runs the spec task.
task :default => [:spec]
@@ -0,0 +1,112 @@
1
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Command-line front end of WP2TXT.
#
# Parses the options with Trollop, builds a Wp2txt::Runner for the given
# dump file and supplies the block that turns every parsed article into the
# text written to the output files.

$: << File.join(File.dirname(__FILE__))
$: << File.join(File.dirname(__FILE__), '..', 'lib')

# When true, every emitted element is suffixed with a marker naming its type.
$DEBUG_MODE = false
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')

require 'wp2txt'
require 'wp2txt/utils'
require 'wp2txt/version'
require 'trollop'

include Wp2txt

opts = Trollop::options do
  version Wp2txt::VERSION
  banner <<-EOS
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.

Usage: wp2txt [options]
where [options] are:
EOS

  opt :input_file, "Wikipedia dump file with .bz2 (compressed) or .txt (uncompressed) format", :required => true
  opt :output_dir, "Output directory", :default => Dir::pwd, :type => String
  opt :convert_off, "Output XML (without converting to plain text)", :default => false
  opt :list_off, "Exclude list items from output", :default => false
  opt :heading_off, "Exclude section titles from output", :default => false, :short => "-d"
  opt :title_off, "Exclude page titles from output", :default => false
  # The help text used to repeat the --title-off description.
  opt :table_off, "Exclude tables from output", :default => true
  opt :template_off, "Remove template notations from output", :default => true
  opt :redirect_off, "Not show redirect destination", :default => false
  opt :strip_marker, "Remove symbols prefixed to list items, definitions, etc.", :default => false
  opt :file_size, "Approximate size (in MB) of each output file", :default => 10
end

# The option is declared as :file_size (there is no :size option), and the
# message promises a strictly positive value, so 0 is rejected as well.
Trollop::die :file_size, "must be larger than 0" unless opts[:file_size] > 0
Trollop::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])

# The dump file path is taken from the first argument left over after
# option parsing (--input-file is declared without a :type).
input_file = ARGV[0]
unless input_file && File.exist?(input_file)
  Trollop::die :input_file, "must be followed by the path of an existing dump file"
end

output_dir = opts[:output_dir]
tfile_size = opts[:file_size]
convert_off = opts[:convert_off]
strip_tmarker = opts[:strip_marker]

# Only the switches listed here are copied into config. The :paragraph_off,
# :pre_off and :quote_off lookups further down are therefore always nil and
# never suppress anything.
opt_array = [:title_off, :list_off, :heading_off, :table_off, :template_off, :redirect_off]
config = {}
opt_array.each do |opt|
  config[opt] = opts[opt]
end

# a "parent" is either commandline progress bar or
# a gui window (not available for now)
parent = Wp2txt::CmdProgbar.new
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert_off, strip_tmarker)

# The block receives one article at a time and returns the text to be
# appended to the current output file.
wpconv.extract_text do |article|
  title = format_wiki article.title
  title = "[[#{title}]]\n"
  contents = ""

  # Each element is a pair whose first item is the element type and whose
  # last item is the element's text.
  article.elements.each do |e|
    case e.first
    when :mw_heading
      next if config[:heading_off]
      line = format_wiki(e.last)
      line += "+HEADING+" if $DEBUG_MODE
    when :mw_paragraph
      next if config[:paragraph_off]
      line = format_wiki(e.last)
      line += "+PARAGRAPH+" if $DEBUG_MODE
    when :mw_table, :mw_htable
      next if config[:table_off]
      line = format_wiki(e.last)
      line += "+TABLE+" if $DEBUG_MODE
    when :mw_pre
      next if config[:pre_off]
      # preformatted text is passed through without format_wiki
      line = e.last
      line += "+PRE+" if $DEBUG_MODE
    when :mw_quote
      next if config[:quote_off]
      line = format_wiki(e.last)
      line += "+QUOTE+" if $DEBUG_MODE
    when :mw_unordered, :mw_ordered, :mw_definition
      next if config[:list_off]
      line = format_wiki(e.last)
      line += "+LIST+" if $DEBUG_MODE
    when :mw_redirect
      next if config[:redirect_off]
      line = format_wiki(e.last)
      line += "+REDIRECT+" if $DEBUG_MODE
      line += "\n\n"
    else
      # unknown element types are only emitted in debug mode
      next unless $DEBUG_MODE
      line = format_wiki(e.last)
      line += "+OTHER+"
    end
    contents += line
    contents = remove_templates(contents) if config[:template_off]
  end

  # An article that produced nothing but whitespace yields no output at all,
  # not even its title.
  if /\A\s*\z/m =~ contents
    result = ""
  else
    result = config[:title_off] ? contents : title + "\n" + contents
  end
  result = result.gsub(/\[ref\]\s*\[\/ref\]/m){""}
  # collapse runs of three or more newlines into one blank line
  result = result.gsub(/\n\n\n+/m){"\n\n"} + "\n"
end
Binary file
@@ -0,0 +1,323 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ $: << File.join(File.dirname(__FILE__))
5
+
6
+ require "rubygems"
7
+ require "bundler/setup"
8
+ require "nokogiri"
9
+
10
+ require "wp2txt/article"
11
+ require "wp2txt/utils"
12
+ require "wp2txt/mw_api"
13
+ require "wp2txt/progressbar"
14
+
15
# NO_BZ2 records whether the bzip2-ruby binding could NOT be loaded:
# false when the require succeeds, true when it raises LoadError.
NO_BZ2 = begin
  require "bzip2-ruby"
  false
rescue LoadError
  # in case bzip2-ruby gem is not available
  true
end
23
+ module Wp2txt
24
+ class Runner
25
+
26
+ include Wp2txt
27
+
28
+ # attr_accessor :pause_flag, :stop_flag, :outfiles, :convert_off
29
+
30
# Stores the conversion settings; nothing is opened or read here.
#
# parent        - object that receives progress/message callbacks
# input_file    - path of the dump file to process
# output_dir    - directory for the output files
# tfile_size    - size threshold of each output file, in megabytes
# convert_off   - when true the XML is only split, not converted to text
# strip_tmarker - handed to every Article that gets created
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert_off = false, strip_tmarker = false)
  @parent, @fp = parent, nil
  @input_file, @output_dir = input_file, output_dir
  @tfile_size, @convert_off, @strip_tmarker = tfile_size, convert_off, strip_tmarker
end
40
+
41
# Measures how many bytes can be read from +file+ by draining it to the end.
#
# file - an object responding to read(length) that returns a String, or
#        nil once the input is exhausted
#
# Returns the number of bytes read as an Integer. A read error is treated
# like end of input (best effort), so the count gathered so far is returned.
def file_size(file)
  chunk_bytes = 10_485_760 # read 10 MiB at a time
  total = 0
  loop do
    chunk = begin
      file.read(chunk_bytes)
    rescue StandardError
      nil
    end
    break unless chunk

    # bytesize, not size: the progress accounting elsewhere in this class
    # counts bytes, and size would count characters for a non-binary String.
    total += chunk.bytesize
  end
  total
end
64
+
65
# Reports progress to the parent (command line progress bar; a GUI is not
# available for now).
#
# The parent is updated when more than 0.3 seconds have passed since the
# previous update, or unconditionally when +last+ is true.
#
# last - true for the final update; the remaining time is then reported
#        as "00:00:00"
#
# Calls @parent.prg_update(value, elapsed, remaining), where value is the
# fraction of the input read so far scaled to 0..10000.
def notify_parent(last = false)
  now = Time.now.to_f
  @last_time ||= now
  @elapsed_sum ||= 0
  # Kept as a float: truncating to whole seconds would turn the 0.3 second
  # threshold into a 1 second one and lose the fraction on every update.
  since_last = now - @last_time
  return unless since_last > 0.3 || last

  @last_time = now
  @elapsed_sum += since_last

  total = @infile_size.to_f
  done = @size_read.to_f
  # An empty input would otherwise produce NaN and make to_i raise.
  gvalue = total > 0 ? (done / total * 100 * 100).to_i : 0
  elt_str = sec_to_str(@elapsed_sum.to_i)

  eta_str =
    if last
      "00:00:00"
    elsif done > 0 && @elapsed_sum > 0
      # Float arithmetic: integer division could yield a rate of zero and
      # then raise ZeroDivisionError.
      rate = done / @elapsed_sum
      remaining = [total - done, 0].max
      sec_to_str((remaining / rate).to_i)
    else
      # nothing has been read yet, so no estimate is possible
      "--:--:--"
    end

  @parent.prg_update(gvalue, elt_str, eta_str)
end
88
+
89
# Opens the input stream and the first output file.
#
# Sets @infile_size to the size of the input when uncompressed (bz2 input
# is decompressed once just to measure it), stores the readable stream in
# @file_pointer, opens "<output_dir>/<input basename>-1" for writing as @fp
# and tells the parent that processing is about to start.
#
# Returns true.
def prepare
  # if output_dir is not specified, output in the same directory
  # as the input file
  @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

  # Only a real ".bz2" extension counts: the dot is escaped and the match is
  # anchored at the end of the string.
  if /\.bz2\z/ =~ @input_file
    if NO_BZ2
      # bzip2-ruby is not available: use the command line program instead
      file = open_bz2_pipe
      @infile_size = file_size(file)
      file.close # reopened below since a pipe cannot be rewound
      file = open_bz2_pipe
    else
      file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
      @parent.msg("Preparing ... This may take several minutes or more ", 0)
      @infile_size = file_size(file)
      @parent.msg("... Done.", 1)
      file.close
      file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
    end
  else # meaning that it is a text file
    @infile_size = File.stat(@input_file).size
    # File.open rather than Kernel#open, which would run a path starting
    # with "|" as a command.
    file = File.open(@input_file)
  end

  # create basename of output file
  @outfile_base = File.basename(@input_file, ".*") + "-"
  @total_size = 0
  @file_index = 1
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
  @outfiles = [outfilename]
  @fp = File.open(outfilename, "w")
  @parent.before
  @parent.data_set(@input_file, 100 * 100)
  @file_pointer = file
  true
end

# Starts the external bzip2 program decompressing @input_file to a pipe.
# The command is passed as an argument list, so no shell is involved and
# file names with spaces or shell metacharacters are handled correctly.
def open_bz2_pipe
  command =
    if RUBY_PLATFORM.index("win32")
      ["bunzip2.exe", "-c", @input_file]
    else
      ["bzip2", "-c", "-d", @input_file]
    end
  IO.popen(command)
end
private :open_bz2_pipe
140
+
141
# Reads the next chunk(s) of up to 10 MiB from @file_pointer into @buffer
# until the buffer holds at least one complete line.
#
# @buffer is an Array of Strings. Every element but the last is a complete
# line including its "\n"; the last element is the still unterminated
# remainder (possibly ""), to which the beginning of the next chunk is
# appended.
#
# Returns true when data was added, nil at end of input or on a read error.
def fill_buffer
  loop do
    chunk = begin
      @file_pointer.read(10_485_760)
    rescue StandardError
      nil
    end
    # an empty chunk is treated like end of input, not as a crash or a spin
    return nil if chunk.nil? || chunk.empty?

    # each_line keeps the line terminators, so the pieces join back
    # into the chunk exactly
    pieces = chunk.each_line.to_a
    @buffer.last << pieces.shift
    @buffer.concat(pieces)
    # keep the invariant that the last element is an unterminated remainder
    @buffer << "" if @buffer.last[-1, 1] == "\n"
    break if @buffer.size > 1
  end
  true
end
174
+
175
# Returns the next line of the input including its trailing "\n", or nil
# when the input is exhausted.
#
# Lines are served from @buffer, whose last element is always an
# unterminated remainder. When only that remainder is left, fill_buffer is
# asked for more data. If there is none, the remainder itself is returned
# once, so a final line without a trailing newline is not lost.
def get_newline
  @buffer ||= [""]
  if @buffer.size == 1 && !fill_buffer
    leftover = @buffer.shift
    return leftover.empty? ? nil : leftover
  end
  # shift returns nil once the buffer has been emptied at end of input
  @buffer.shift
end
187
+
188
# Collects the lines of the next <page> ... </page> element.
#
# Every line consumed is reported through notify_parent and its byte size
# is added to @size_read. Lines seen before a <page> line are discarded.
#
# Returns the page as a String tagged as UTF-8, or false when nothing was
# collected before the input ended.
def get_page
  collected = ""
  capturing = false
  while (row = get_newline)
    notify_parent
    @size_read = (@size_read || 0) + row.bytesize

    case row
    when /<page>/
      collected << row
      capturing = true
    when /<\/page>/
      collected << row
      break
    else
      collected << row if capturing
    end
  end
  collected.empty? ? false : collected.force_encoding("utf-8")
end
212
+
213
# Entry point: call this method to do the job.
#
# Runs prepare, then either splits the original XML into files of the
# requested size (when @convert_off is set) or converts it to plain text,
# handing the given block on to extract_and_convert.
#
# Returns whatever the selected extraction method returns.
def extract_text(&block)
  prepare
  return extract if @convert_off

  extract_and_convert(&block)
end
228
+
229
+ def extract_and_convert(&block)
230
+ in_text = false
231
+ in_message = false
232
+ result_text = ""
233
+ title = nil
234
+ end_flag = false
235
+ terminal_round = false
236
+ output_text = ""
237
+
238
+ while page = get_page
239
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
240
+ xml = xmlns + page + "</mediawiki>"
241
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
242
+ page = input.xpath("//xmlns:text").first
243
+ pp_title = page.parent.parent.at_css "title"
244
+ title = pp_title.content
245
+
246
+ next if /\:/ =~ title
247
+ text = page.content
248
+ # remove all comment texts
249
+ # and insert as many number of new line chars included in
250
+ # each comment instead
251
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
252
+ num_of_newlines = content.count("\n")
253
+ if num_of_newlines == 0
254
+ ""
255
+ else
256
+ "\n" * num_of_newlines
257
+ end
258
+ end
259
+
260
+ @count ||= 0;@count += 1;
261
+
262
+ article = Article.new(text, title, @strip_tmarker)
263
+ output_text += block.call(article)
264
+ @total_size = output_text.bytesize
265
+
266
+ # flagged when data exceeds the size of output file
267
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
268
+
269
+ #close the present file, then open a new one
270
+ if end_flag
271
+ @fp.puts(output_text)
272
+ output_text = ""
273
+ @total_size = 0
274
+ end_flag = false
275
+ @fp.close
276
+ @file_index += 1
277
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
278
+ @outfiles << outfilename
279
+ @fp = File.open(outfilename, "w")
280
+ next
281
+ end
282
+ end
283
+ @fp.puts(output_text) if output_text != ""
284
+ notify_parent(true)
285
+ @parent.after
286
+ @fp.close
287
+ rename(@outfiles)
288
+ @parent.msg("Processing finished", 1)
289
+ end
290
+
291
+ def extract
292
+ output_text = ""
293
+ end_flag = false
294
+ while text = get_newline
295
+ @count ||= 0;@count += 1;
296
+ @size_read ||=0;@size_read += text.bytesize
297
+ @total_size += text.bytesize
298
+ output_text << text
299
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
300
+ notify_parent
301
+ # never close the file until the end of the page even if end_flag is on
302
+ if end_flag && /<\/page/ =~ text
303
+ @fp.puts(output_text)
304
+ output_text = ""
305
+ @total_size = 0
306
+ end_flag = false
307
+ @fp.close
308
+ @file_index += 1
309
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
310
+ @outfiles << outfilename
311
+ @fp = File.open(outfilename, "w")
312
+ next
313
+ end
314
+ end
315
+ @fp.puts(output_text) if output_text != ""
316
+ notify_parent(true)
317
+ @parent.after
318
+ @fp.close
319
+ rename(@outfiles)
320
+ @parent.msg("Processing finished", 1)
321
+ end
322
+ end
323
+ end