wp2txt 0.8.0 → 0.9.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +24 -27
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt.rb +66 -42
- data/lib/wp2txt/article.rb +33 -3
- data/lib/wp2txt/utils.rb +44 -49
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +1 -0
- metadata +21 -4
Binary file
|
File without changes
|
data/lib/wp2txt.rb
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
6
|
require "nokogiri"
|
7
|
+
require "parallel"
|
7
8
|
|
8
9
|
require 'pp'
|
9
10
|
require "wp2txt/article"
|
@@ -24,7 +25,7 @@ module Wp2txt
|
|
24
25
|
|
25
26
|
include Wp2txt
|
26
27
|
|
27
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
28
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
28
29
|
@parent = parent
|
29
30
|
@fp = nil
|
30
31
|
|
@@ -33,6 +34,8 @@ module Wp2txt
|
|
33
34
|
@tfile_size = tfile_size
|
34
35
|
@convert = convert
|
35
36
|
@strip_tmarker = strip_tmarker
|
37
|
+
num_cores_available = Etc.nprocessors
|
38
|
+
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
36
39
|
end
|
37
40
|
|
38
41
|
def file_size(file)
|
@@ -97,6 +100,7 @@ module Wp2txt
|
|
97
100
|
if /.bz2$/ =~ @input_file
|
98
101
|
unless NO_BZ2
|
99
102
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
103
|
+
# Report the number of worker threads that will be used (typo "spawming" fixed).
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
100
104
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
101
105
|
@infile_size = file_size(file)
|
102
106
|
@parent.msg("... Done.", 1)
|
@@ -108,6 +112,7 @@ module Wp2txt
|
|
108
112
|
else
|
109
113
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
110
114
|
end
|
115
|
+
# Report the number of worker threads that will be used (typo "spawming" fixed).
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
111
116
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
112
117
|
@infile_size = file_size(file)
|
113
118
|
@parent.msg("... Done.", 1)
|
@@ -232,53 +237,71 @@ module Wp2txt
|
|
232
237
|
end_flag = false
|
233
238
|
terminal_round = false
|
234
239
|
output_text = ""
|
240
|
+
pages = []
|
241
|
+
data_empty = false
|
235
242
|
|
236
|
-
|
237
|
-
|
238
|
-
|
243
|
+
begin
|
244
|
+
page = get_page
|
245
|
+
if page
|
246
|
+
pages << page
|
247
|
+
else
|
248
|
+
data_empty = true
|
249
|
+
end
|
250
|
+
if data_empty || pages.size == @num_threads
|
251
|
+
# pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
252
|
+
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
253
|
+
page_text = {:order => n, :data => nil}
|
254
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
255
|
+
xml = xmlns + page + "</mediawiki>"
|
239
256
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
257
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
258
|
+
page = input.xpath("//xmlns:text").first
|
259
|
+
pp_title = page.parent.parent.at_css "title"
|
260
|
+
title = pp_title.content
|
261
|
+
unless /\:/ =~ title
|
262
|
+
text = page.content
|
263
|
+
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
264
|
+
num_of_newlines = content.count("\n")
|
265
|
+
if num_of_newlines == 0
|
266
|
+
""
|
267
|
+
else
|
268
|
+
"\n" * num_of_newlines
|
269
|
+
end
|
270
|
+
end
|
271
|
+
article = Article.new(text, title, @strip_tmarker)
|
272
|
+
page_text[:data] = block.call(article)
|
273
|
+
end
|
274
|
+
page_text
|
275
|
+
end
|
276
|
+
pages.clear
|
277
|
+
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
278
|
+
pages_text.each do |page_text|
|
279
|
+
output_text << page_text
|
280
|
+
@count ||= 0; @count += 1;
|
281
|
+
@total_size = output_text.bytesize
|
282
|
+
# flagged when data exceeds the size of output file
|
283
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
284
|
+
end
|
246
285
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
286
|
+
#close the present file, then open a new one
|
287
|
+
if end_flag
|
288
|
+
cleanup!(output_text)
|
289
|
+
@fp.puts(output_text)
|
290
|
+
output_text = ""
|
291
|
+
@total_size = 0
|
292
|
+
end_flag = false
|
293
|
+
@fp.close
|
294
|
+
@file_index += 1
|
295
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
296
|
+
@outfiles << outfilename
|
297
|
+
@fp = File.open(outfilename, "w")
|
298
|
+
next
|
253
299
|
end
|
254
300
|
end
|
255
|
-
|
256
|
-
@count ||= 0;@count += 1;
|
257
|
-
|
258
|
-
article = Article.new(text, title, @strip_tmarker)
|
259
|
-
output_text += block.call(article)
|
260
|
-
@total_size = output_text.bytesize
|
261
|
-
|
262
|
-
# flagged when data exceeds the size of output file
|
263
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
301
|
+
end while !data_empty
|
264
302
|
|
265
|
-
#close the present file, then open a new one
|
266
|
-
if end_flag
|
267
|
-
output_text.strip!
|
268
|
-
@fp.puts(output_text)
|
269
|
-
output_text = ""
|
270
|
-
@total_size = 0
|
271
|
-
end_flag = false
|
272
|
-
@fp.close
|
273
|
-
@file_index += 1
|
274
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
275
|
-
@outfiles << outfilename
|
276
|
-
@fp = File.open(outfilename, "w")
|
277
|
-
next
|
278
|
-
end
|
279
|
-
end
|
280
303
|
if output_text != ""
|
281
|
-
output_text
|
304
|
+
cleanup!(output_text)
|
282
305
|
@fp.puts(output_text)
|
283
306
|
end
|
284
307
|
notify_parent(true)
|
@@ -320,4 +343,5 @@ module Wp2txt
|
|
320
343
|
@parent.msg("Processing finished", 1)
|
321
344
|
end
|
322
345
|
end
|
323
|
-
end
|
346
|
+
end
|
347
|
+
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,6 +37,10 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
+
convert_characters!(text)
|
41
|
+
make_reference!(text)
|
42
|
+
remove_ref!(text)
|
43
|
+
|
40
44
|
parse text
|
41
45
|
end
|
42
46
|
|
@@ -58,6 +62,22 @@ module Wp2txt
|
|
58
62
|
end
|
59
63
|
|
60
64
|
case mode
|
65
|
+
when :mw_ml_template
|
66
|
+
scanner = StringScanner.new(line)
|
67
|
+
str= process_nested_structure(scanner, "{{", "}}") {""}
|
68
|
+
if $ml_template_end_regex =~ str
|
69
|
+
mode = nil
|
70
|
+
end
|
71
|
+
@elements.last.last << line
|
72
|
+
next
|
73
|
+
when :mw_ml_link
|
74
|
+
scanner = StringScanner.new(line)
|
75
|
+
str= process_nested_structure(scanner, "[[", "]]") {""}
|
76
|
+
if $ml_link_end_regex =~ str
|
77
|
+
mode = nil
|
78
|
+
end
|
79
|
+
@elements.last.last << line
|
80
|
+
next
|
61
81
|
when :mw_table
|
62
82
|
if $in_table_regex2 =~ line
|
63
83
|
mode = nil
|
@@ -91,17 +111,27 @@ module Wp2txt
|
|
91
111
|
end
|
92
112
|
|
93
113
|
case line
|
114
|
+
when $isolated_template_regex
|
115
|
+
@elements << create_element(:mw_isolated_template, line)
|
116
|
+
when $isolated_tag_regex
|
117
|
+
@elements << create_element(:mw_isolated_tag, line)
|
94
118
|
when $blank_line_regex
|
95
119
|
@elements << create_element(:mw_blank, "\n")
|
96
120
|
when $redirect_regex
|
97
121
|
@elements << create_element(:mw_redirect, line)
|
98
|
-
when $in_template_regex
|
99
|
-
|
122
|
+
# when $in_template_regex
|
123
|
+
# @elements << create_element(:mw_template, line)
|
100
124
|
when $in_heading_regex
|
101
125
|
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
102
126
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
103
127
|
when $in_inputbox_regex
|
104
128
|
@elements << create_element(:mw_inputbox, line)
|
129
|
+
when $ml_template_onset_regex
|
130
|
+
@elements << create_element(:mw_ml_template, line)
|
131
|
+
mode = :mw_ml_template
|
132
|
+
when $ml_link_onset_regex
|
133
|
+
@elements << create_element(:mw_ml_link, line)
|
134
|
+
mode = :mw_ml_link
|
105
135
|
when $in_inputbox_regex1
|
106
136
|
mode = :mw_inputbox
|
107
137
|
@elements << create_element(:mw_inputbox, line)
|
@@ -138,7 +168,7 @@ module Wp2txt
|
|
138
168
|
when $in_link_regex
|
139
169
|
@elements << create_element(:mw_link, line)
|
140
170
|
else
|
141
|
-
@elements << create_element(:mw_paragraph, line)
|
171
|
+
@elements << create_element(:mw_paragraph, "\n" + line)
|
142
172
|
end
|
143
173
|
end
|
144
174
|
@elements
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
|
|
16
16
|
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
17
|
$html_hash = Hash[*$entities.flatten]
|
18
18
|
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
|
20
|
-
$
|
19
|
+
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
+
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
+
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
+
# Matches a line that closes a multi-line [[...]] link. The parser reads this
# pattern as $ml_link_end_regex; the previous spelling $ml_linkend_regex is
# kept as an alias so any existing reference keeps working.
$ml_link_end_regex = $ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
+
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
+
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
21
25
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
22
|
-
|
23
26
|
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
24
27
|
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
25
28
|
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
26
|
-
|
27
29
|
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
28
30
|
$in_source_regex1 = Regexp.new('<source.*?>')
|
29
31
|
$in_source_regex2 = Regexp.new('<\/source>')
|
30
|
-
|
31
32
|
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
32
33
|
$in_math_regex1 = Regexp.new('<math.*?>')
|
33
34
|
$in_math_regex2 = Regexp.new('<\/math>')
|
34
|
-
|
35
35
|
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
|
37
36
|
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
38
37
|
$in_html_table_regex1 = Regexp.new('<table\b')
|
39
38
|
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
40
|
-
|
41
39
|
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
42
40
|
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
43
|
-
|
44
41
|
$in_unordered_regex = Regexp.new('^\*')
|
45
42
|
$in_ordered_regex = Regexp.new('^\#')
|
46
43
|
$in_pre_regex = Regexp.new('^ ')
|
47
44
|
$in_definition_regex = Regexp.new('^[\;\:]')
|
48
|
-
|
49
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
50
|
-
|
51
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
52
|
-
|
53
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
48
|
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
-
|
56
49
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
57
50
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
58
51
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
|
|
75
68
|
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
76
69
|
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
77
70
|
|
71
|
+
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
78
72
|
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
79
73
|
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
80
74
|
|
@@ -111,26 +105,26 @@ module Wp2txt
|
|
111
105
|
|
112
106
|
def format_wiki!(text, has_retried = false)
|
113
107
|
escape_nowiki!(text)
|
114
|
-
|
115
108
|
process_interwiki_links!(text)
|
116
109
|
process_external_links!(text)
|
117
|
-
|
118
110
|
unescape_nowiki!(text)
|
119
|
-
|
120
|
-
|
121
|
-
def format_article!(text)
|
111
|
+
#####
|
122
112
|
remove_directive!(text)
|
123
113
|
remove_emphasis!(text)
|
124
114
|
mndash!(text)
|
125
|
-
make_reference!(text)
|
126
|
-
format_ref!(text)
|
127
115
|
remove_hr!(text)
|
128
116
|
remove_tag!(text)
|
129
|
-
|
130
|
-
|
131
|
-
|
117
|
+
correct_inline_template!(text) unless $leave_inline_template
|
118
|
+
remove_templates!(text) unless $leave_inline_template
|
119
|
+
# remove_table!(text) unless $leave_table
|
132
120
|
end
|
133
121
|
|
122
|
+
def cleanup!(text)
|
123
|
+
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
|
+
text.strip!
|
126
|
+
text << "\n\n"
|
127
|
+
end
|
134
128
|
#################### parser for nested structure ####################
|
135
129
|
|
136
130
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -236,6 +230,10 @@ module Wp2txt
|
|
236
230
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
237
231
|
""
|
238
232
|
end
|
233
|
+
scanner = StringScanner.new(result)
|
234
|
+
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
|
+
""
|
236
|
+
end
|
239
237
|
str.replace(result)
|
240
238
|
end
|
241
239
|
|
@@ -309,36 +307,33 @@ module Wp2txt
|
|
309
307
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
310
308
|
end
|
311
309
|
|
312
|
-
def format_ref!(page)
|
313
|
-
###### do nothing for now
|
314
|
-
# page.gsub!($format_ref_regex) do
|
315
|
-
# end
|
316
|
-
end
|
317
|
-
|
318
310
|
def correct_inline_template!(str)
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
311
|
+
scanner = StringScanner.new(str)
|
312
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
|
+
parts = contents.split("|")
|
314
|
+
# type_code = parts.first
|
315
|
+
# case type_code
|
316
|
+
# when $type_code_regex
|
317
|
+
# out = parts[-1]
|
318
|
+
# else
|
319
|
+
# case parts.size
|
320
|
+
# when 0
|
321
|
+
# out = ""
|
322
|
+
# when 1
|
323
|
+
# out = parts.first || ""
|
324
|
+
# else
|
325
|
+
# while parts.size > 2 && parts.last.split("=").size > 1
|
326
|
+
while parts.size > 1 && parts.last.split("=").size > 1
|
327
|
+
parts.pop
|
334
328
|
end
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
end
|
329
|
+
out = parts.last || ""
|
330
|
+
# end
|
331
|
+
# end
|
332
|
+
out.strip
|
340
333
|
end
|
334
|
+
str.replace result
|
341
335
|
end
|
336
|
+
|
342
337
|
|
343
338
|
#################### file related utilities ####################
|
344
339
|
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
185
|
+
describe "correct_inline_template!" do
|
186
|
+
it "removes brackets and leaving some text" do
|
187
|
+
# str_before = "{{}}"
|
188
|
+
# str_after = ""
|
189
|
+
# correct_inline_template!(str_before)
|
190
|
+
# expect(str_before).to eq str_after
|
191
|
+
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
|
+
str_after = "JPN"
|
193
|
+
correct_inline_template!(str_before)
|
194
|
+
expect(str_before).to eq str_after
|
195
|
+
str_before = "{{lang|en|Japan}}"
|
196
|
+
str_after = "Japan"
|
197
|
+
correct_inline_template!(str_before)
|
198
|
+
expect(str_before).to eq str_after
|
199
|
+
str_before = "{{a|b=c|d=f}}"
|
200
|
+
str_after = "a"
|
201
|
+
correct_inline_template!(str_before)
|
202
|
+
expect(str_before).to eq str_after
|
203
|
+
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
+
str_after = "e"
|
205
|
+
correct_inline_template!(str_before)
|
206
|
+
expect(str_before).to eq str_after
|
207
|
+
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
208
|
+
str_after = "日本人に多く見受けられる"
|
209
|
+
correct_inline_template!(str_before)
|
210
|
+
expect(str_before).to eq str_after
|
211
|
+
end
|
212
|
+
end
|
201
213
|
|
202
214
|
# describe "expand_template" do
|
203
215
|
# it "gets data corresponding to a given template using mediawiki api" do
|