wp2txt 0.8.0 → 0.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +24 -27
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt.rb +66 -42
- data/lib/wp2txt/article.rb +33 -3
- data/lib/wp2txt/utils.rb +44 -49
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +1 -0
- metadata +21 -4
Binary file
|
File without changes
|
data/lib/wp2txt.rb
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
6
|
require "nokogiri"
|
7
|
+
require "parallel"
|
7
8
|
|
8
9
|
require 'pp'
|
9
10
|
require "wp2txt/article"
|
@@ -24,7 +25,7 @@ module Wp2txt
|
|
24
25
|
|
25
26
|
include Wp2txt
|
26
27
|
|
27
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
28
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
28
29
|
@parent = parent
|
29
30
|
@fp = nil
|
30
31
|
|
@@ -33,6 +34,8 @@ module Wp2txt
|
|
33
34
|
@tfile_size = tfile_size
|
34
35
|
@convert = convert
|
35
36
|
@strip_tmarker = strip_tmarker
|
37
|
+
num_cores_available = Etc.nprocessors
|
38
|
+
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
36
39
|
end
|
37
40
|
|
38
41
|
def file_size(file)
|
@@ -97,6 +100,7 @@ module Wp2txt
|
|
97
100
|
if /.bz2$/ =~ @input_file
|
98
101
|
unless NO_BZ2
|
99
102
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
103
|
+
# Tell the user how many worker threads will be used (typo "spawming" fixed).
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
100
104
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
101
105
|
@infile_size = file_size(file)
|
102
106
|
@parent.msg("... Done.", 1)
|
@@ -108,6 +112,7 @@ module Wp2txt
|
|
108
112
|
else
|
109
113
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
110
114
|
end
|
115
|
+
# Tell the user how many worker threads will be used (typo "spawming" fixed).
@parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
|
111
116
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
112
117
|
@infile_size = file_size(file)
|
113
118
|
@parent.msg("... Done.", 1)
|
@@ -232,53 +237,71 @@ module Wp2txt
|
|
232
237
|
end_flag = false
|
233
238
|
terminal_round = false
|
234
239
|
output_text = ""
|
240
|
+
pages = []
|
241
|
+
data_empty = false
|
235
242
|
|
236
|
-
|
237
|
-
|
238
|
-
|
243
|
+
begin
|
244
|
+
page = get_page
|
245
|
+
if page
|
246
|
+
pages << page
|
247
|
+
else
|
248
|
+
data_empty = true
|
249
|
+
end
|
250
|
+
if data_empty || pages.size == @num_threads
|
251
|
+
# pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
252
|
+
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
253
|
+
page_text = {:order => n, :data => nil}
|
254
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
255
|
+
xml = xmlns + page + "</mediawiki>"
|
239
256
|
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
257
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
258
|
+
page = input.xpath("//xmlns:text").first
|
259
|
+
pp_title = page.parent.parent.at_css "title"
|
260
|
+
title = pp_title.content
|
261
|
+
unless /\:/ =~ title
|
262
|
+
text = page.content
|
263
|
+
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
264
|
+
num_of_newlines = content.count("\n")
|
265
|
+
if num_of_newlines == 0
|
266
|
+
""
|
267
|
+
else
|
268
|
+
"\n" * num_of_newlines
|
269
|
+
end
|
270
|
+
end
|
271
|
+
article = Article.new(text, title, @strip_tmarker)
|
272
|
+
page_text[:data] = block.call(article)
|
273
|
+
end
|
274
|
+
page_text
|
275
|
+
end
|
276
|
+
pages.clear
|
277
|
+
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
278
|
+
pages_text.each do |page_text|
|
279
|
+
output_text << page_text
|
280
|
+
@count ||= 0; @count += 1;
|
281
|
+
@total_size = output_text.bytesize
|
282
|
+
# flagged when data exceeds the size of output file
|
283
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
284
|
+
end
|
246
285
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
286
|
+
#close the present file, then open a new one
|
287
|
+
if end_flag
|
288
|
+
cleanup!(output_text)
|
289
|
+
@fp.puts(output_text)
|
290
|
+
output_text = ""
|
291
|
+
@total_size = 0
|
292
|
+
end_flag = false
|
293
|
+
@fp.close
|
294
|
+
@file_index += 1
|
295
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
296
|
+
@outfiles << outfilename
|
297
|
+
@fp = File.open(outfilename, "w")
|
298
|
+
next
|
253
299
|
end
|
254
300
|
end
|
255
|
-
|
256
|
-
@count ||= 0;@count += 1;
|
257
|
-
|
258
|
-
article = Article.new(text, title, @strip_tmarker)
|
259
|
-
output_text += block.call(article)
|
260
|
-
@total_size = output_text.bytesize
|
261
|
-
|
262
|
-
# flagged when data exceeds the size of output file
|
263
|
-
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
301
|
+
end while !data_empty
|
264
302
|
|
265
|
-
#close the present file, then open a new one
|
266
|
-
if end_flag
|
267
|
-
output_text.strip!
|
268
|
-
@fp.puts(output_text)
|
269
|
-
output_text = ""
|
270
|
-
@total_size = 0
|
271
|
-
end_flag = false
|
272
|
-
@fp.close
|
273
|
-
@file_index += 1
|
274
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
275
|
-
@outfiles << outfilename
|
276
|
-
@fp = File.open(outfilename, "w")
|
277
|
-
next
|
278
|
-
end
|
279
|
-
end
|
280
303
|
if output_text != ""
|
281
|
-
output_text
|
304
|
+
cleanup!(output_text)
|
282
305
|
@fp.puts(output_text)
|
283
306
|
end
|
284
307
|
notify_parent(true)
|
@@ -320,4 +343,5 @@ module Wp2txt
|
|
320
343
|
@parent.msg("Processing finished", 1)
|
321
344
|
end
|
322
345
|
end
|
323
|
-
end
|
346
|
+
end
|
347
|
+
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,6 +37,10 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
+
convert_characters!(text)
|
41
|
+
make_reference!(text)
|
42
|
+
remove_ref!(text)
|
43
|
+
|
40
44
|
parse text
|
41
45
|
end
|
42
46
|
|
@@ -58,6 +62,22 @@ module Wp2txt
|
|
58
62
|
end
|
59
63
|
|
60
64
|
case mode
|
65
|
+
when :mw_ml_template
|
66
|
+
scanner = StringScanner.new(line)
|
67
|
+
str= process_nested_structure(scanner, "{{", "}}") {""}
|
68
|
+
if $ml_template_end_regex =~ str
|
69
|
+
mode = nil
|
70
|
+
end
|
71
|
+
@elements.last.last << line
|
72
|
+
next
|
73
|
+
when :mw_ml_link
|
74
|
+
scanner = StringScanner.new(line)
|
75
|
+
str= process_nested_structure(scanner, "[[", "]]") {""}
|
76
|
+
if $ml_link_end_regex =~ str
|
77
|
+
mode = nil
|
78
|
+
end
|
79
|
+
@elements.last.last << line
|
80
|
+
next
|
61
81
|
when :mw_table
|
62
82
|
if $in_table_regex2 =~ line
|
63
83
|
mode = nil
|
@@ -91,17 +111,27 @@ module Wp2txt
|
|
91
111
|
end
|
92
112
|
|
93
113
|
case line
|
114
|
+
when $isolated_template_regex
|
115
|
+
@elements << create_element(:mw_isolated_template, line)
|
116
|
+
when $isolated_tag_regex
|
117
|
+
@elements << create_element(:mw_isolated_tag, line)
|
94
118
|
when $blank_line_regex
|
95
119
|
@elements << create_element(:mw_blank, "\n")
|
96
120
|
when $redirect_regex
|
97
121
|
@elements << create_element(:mw_redirect, line)
|
98
|
-
when $in_template_regex
|
99
|
-
|
122
|
+
# when $in_template_regex
|
123
|
+
# @elements << create_element(:mw_template, line)
|
100
124
|
when $in_heading_regex
|
101
125
|
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
102
126
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
103
127
|
when $in_inputbox_regex
|
104
128
|
@elements << create_element(:mw_inputbox, line)
|
129
|
+
when $ml_template_onset_regex
|
130
|
+
@elements << create_element(:mw_ml_template, line)
|
131
|
+
mode = :mw_ml_template
|
132
|
+
when $ml_link_onset_regex
|
133
|
+
@elements << create_element(:mw_ml_link, line)
|
134
|
+
mode = :mw_ml_link
|
105
135
|
when $in_inputbox_regex1
|
106
136
|
mode = :mw_inputbox
|
107
137
|
@elements << create_element(:mw_inputbox, line)
|
@@ -138,7 +168,7 @@ module Wp2txt
|
|
138
168
|
when $in_link_regex
|
139
169
|
@elements << create_element(:mw_link, line)
|
140
170
|
else
|
141
|
-
@elements << create_element(:mw_paragraph, line)
|
171
|
+
@elements << create_element(:mw_paragraph, "\n" + line)
|
142
172
|
end
|
143
173
|
end
|
144
174
|
@elements
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
|
|
16
16
|
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
17
|
$html_hash = Hash[*$entities.flatten]
|
18
18
|
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
|
20
|
-
$
|
19
|
+
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
+
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
+
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
+
# Matches a line that ends with "]]" (optionally followed by whitespace).
# The name mirrors $ml_template_end_regex and is the global read as
# $ml_link_end_regex while in :mw_ml_link mode; under the old spelling that
# lookup hit an unassigned global (nil), so the mode could never be left.
$ml_link_end_regex = Regexp.new('\]\]\s*$')
# Backward-compatible alias for the previous (misspelled) global name.
$ml_linkend_regex = $ml_link_end_regex
|
23
|
+
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
+
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
21
25
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
22
|
-
|
23
26
|
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
24
27
|
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
25
28
|
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
26
|
-
|
27
29
|
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
28
30
|
$in_source_regex1 = Regexp.new('<source.*?>')
|
29
31
|
$in_source_regex2 = Regexp.new('<\/source>')
|
30
|
-
|
31
32
|
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
32
33
|
$in_math_regex1 = Regexp.new('<math.*?>')
|
33
34
|
$in_math_regex2 = Regexp.new('<\/math>')
|
34
|
-
|
35
35
|
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
|
37
36
|
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
38
37
|
$in_html_table_regex1 = Regexp.new('<table\b')
|
39
38
|
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
40
|
-
|
41
39
|
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
42
40
|
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
43
|
-
|
44
41
|
$in_unordered_regex = Regexp.new('^\*')
|
45
42
|
$in_ordered_regex = Regexp.new('^\#')
|
46
43
|
$in_pre_regex = Regexp.new('^ ')
|
47
44
|
$in_definition_regex = Regexp.new('^[\;\:]')
|
48
|
-
|
49
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
50
|
-
|
51
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
52
|
-
|
53
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
48
|
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
-
|
56
49
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
57
50
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
58
51
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
|
|
75
68
|
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
76
69
|
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
77
70
|
|
71
|
+
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
78
72
|
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
79
73
|
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
80
74
|
|
@@ -111,26 +105,26 @@ module Wp2txt
|
|
111
105
|
|
112
106
|
def format_wiki!(text, has_retried = false)
|
113
107
|
escape_nowiki!(text)
|
114
|
-
|
115
108
|
process_interwiki_links!(text)
|
116
109
|
process_external_links!(text)
|
117
|
-
|
118
110
|
unescape_nowiki!(text)
|
119
|
-
|
120
|
-
|
121
|
-
def format_article!(text)
|
111
|
+
#####
|
122
112
|
remove_directive!(text)
|
123
113
|
remove_emphasis!(text)
|
124
114
|
mndash!(text)
|
125
|
-
make_reference!(text)
|
126
|
-
format_ref!(text)
|
127
115
|
remove_hr!(text)
|
128
116
|
remove_tag!(text)
|
129
|
-
|
130
|
-
|
131
|
-
|
117
|
+
correct_inline_template!(text) unless $leave_inline_template
|
118
|
+
remove_templates!(text) unless $leave_inline_template
|
119
|
+
# remove_table!(text) unless $leave_table
|
132
120
|
end
|
133
121
|
|
122
|
+
def cleanup!(text)
|
123
|
+
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
|
+
text.strip!
|
126
|
+
text << "\n\n"
|
127
|
+
end
|
134
128
|
#################### parser for nested structure ####################
|
135
129
|
|
136
130
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -236,6 +230,10 @@ module Wp2txt
|
|
236
230
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
237
231
|
""
|
238
232
|
end
|
233
|
+
scanner = StringScanner.new(result)
|
234
|
+
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
|
+
""
|
236
|
+
end
|
239
237
|
str.replace(result)
|
240
238
|
end
|
241
239
|
|
@@ -309,36 +307,33 @@ module Wp2txt
|
|
309
307
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
310
308
|
end
|
311
309
|
|
312
|
-
def format_ref!(page)
|
313
|
-
###### do nothing for now
|
314
|
-
# page.gsub!($format_ref_regex) do
|
315
|
-
# end
|
316
|
-
end
|
317
|
-
|
318
310
|
def correct_inline_template!(str)
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
311
|
+
scanner = StringScanner.new(str)
|
312
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
|
+
parts = contents.split("|")
|
314
|
+
# type_code = parts.first
|
315
|
+
# case type_code
|
316
|
+
# when $type_code_regex
|
317
|
+
# out = parts[-1]
|
318
|
+
# else
|
319
|
+
# case parts.size
|
320
|
+
# when 0
|
321
|
+
# out = ""
|
322
|
+
# when 1
|
323
|
+
# out = parts.first || ""
|
324
|
+
# else
|
325
|
+
# while parts.size > 2 && parts.last.split("=").size > 1
|
326
|
+
while parts.size > 1 && parts.last.split("=").size > 1
|
327
|
+
parts.pop
|
334
328
|
end
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
end
|
329
|
+
out = parts.last || ""
|
330
|
+
# end
|
331
|
+
# end
|
332
|
+
out.strip
|
340
333
|
end
|
334
|
+
str.replace result
|
341
335
|
end
|
336
|
+
|
342
337
|
|
343
338
|
#################### file related utilities ####################
|
344
339
|
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
185
|
+
describe "correct_inline_template!" do
|
186
|
+
it "removes brackets and leaving some text" do
|
187
|
+
# str_before = "{{}}"
|
188
|
+
# str_after = ""
|
189
|
+
# correct_inline_template!(str_before)
|
190
|
+
# expect(str_before).to eq str_after
|
191
|
+
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
|
+
str_after = "JPN"
|
193
|
+
correct_inline_template!(str_before)
|
194
|
+
expect(str_before).to eq str_after
|
195
|
+
str_before = "{{lang|en|Japan}}"
|
196
|
+
str_after = "Japan"
|
197
|
+
correct_inline_template!(str_before)
|
198
|
+
expect(str_before).to eq str_after
|
199
|
+
str_before = "{{a|b=c|d=f}}"
|
200
|
+
str_after = "a"
|
201
|
+
correct_inline_template!(str_before)
|
202
|
+
expect(str_before).to eq str_after
|
203
|
+
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
+
str_after = "e"
|
205
|
+
correct_inline_template!(str_before)
|
206
|
+
expect(str_before).to eq str_after
|
207
|
+
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
208
|
+
str_after = "日本人に多く見受けられる"
|
209
|
+
correct_inline_template!(str_before)
|
210
|
+
expect(str_before).to eq str_after
|
211
|
+
end
|
212
|
+
end
|
201
213
|
|
202
214
|
# describe "expand_template" do
|
203
215
|
# it "gets data corresponding to a given template using mediawiki api" do
|