wp2txt 0.8.0 → 0.9.1

Sign up to get free protection for your applications and to get access to all the features.
Binary file
File without changes
@@ -4,6 +4,7 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
+ require "parallel"
7
8
 
8
9
  require 'pp'
9
10
  require "wp2txt/article"
@@ -24,7 +25,7 @@ module Wp2txt
24
25
 
25
26
  include Wp2txt
26
27
 
27
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
28
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
28
29
  @parent = parent
29
30
  @fp = nil
30
31
 
@@ -33,6 +34,8 @@ module Wp2txt
33
34
  @tfile_size = tfile_size
34
35
  @convert = convert
35
36
  @strip_tmarker = strip_tmarker
37
+ num_cores_available = Etc.nprocessors
38
+ @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
36
39
  end
37
40
 
38
41
  def file_size(file)
@@ -97,6 +100,7 @@ module Wp2txt
97
100
  if /.bz2$/ =~ @input_file
98
101
  unless NO_BZ2
99
102
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
103
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
100
104
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
101
105
  @infile_size = file_size(file)
102
106
  @parent.msg("... Done.", 1)
@@ -108,6 +112,7 @@ module Wp2txt
108
112
  else
109
113
  file = IO.popen("bzip2 -c -d #{@input_file}")
110
114
  end
115
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
111
116
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
112
117
  @infile_size = file_size(file)
113
118
  @parent.msg("... Done.", 1)
@@ -232,53 +237,71 @@ module Wp2txt
232
237
  end_flag = false
233
238
  terminal_round = false
234
239
  output_text = ""
240
+ pages = []
241
+ data_empty = false
235
242
 
236
- while page = get_page
237
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
238
- xml = xmlns + page + "</mediawiki>"
243
+ begin
244
+ page = get_page
245
+ if page
246
+ pages << page
247
+ else
248
+ data_empty = true
249
+ end
250
+ if data_empty || pages.size == @num_threads
251
+ # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
252
+ pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
+ page_text = {:order => n, :data => nil}
254
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
255
+ xml = xmlns + page + "</mediawiki>"
239
256
 
240
- input = Nokogiri::XML(xml, nil, 'UTF-8')
241
- page = input.xpath("//xmlns:text").first
242
- pp_title = page.parent.parent.at_css "title"
243
- title = pp_title.content
244
- next if /\:/ =~ title
245
- text = page.content
257
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
258
+ page = input.xpath("//xmlns:text").first
259
+ pp_title = page.parent.parent.at_css "title"
260
+ title = pp_title.content
261
+ unless /\:/ =~ title
262
+ text = page.content
263
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
264
+ num_of_newlines = content.count("\n")
265
+ if num_of_newlines == 0
266
+ ""
267
+ else
268
+ "\n" * num_of_newlines
269
+ end
270
+ end
271
+ article = Article.new(text, title, @strip_tmarker)
272
+ page_text[:data] = block.call(article)
273
+ end
274
+ page_text
275
+ end
276
+ pages.clear
277
+ pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
278
+ pages_text.each do |page_text|
279
+ output_text << page_text
280
+ @count ||= 0; @count += 1;
281
+ @total_size = output_text.bytesize
282
+ # flagged when data exceeds the size of output file
283
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
284
+ end
246
285
 
247
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
248
- num_of_newlines = content.count("\n")
249
- if num_of_newlines == 0
250
- ""
251
- else
252
- "\n" * num_of_newlines
286
+ #close the present file, then open a new one
287
+ if end_flag
288
+ cleanup!(output_text)
289
+ @fp.puts(output_text)
290
+ output_text = ""
291
+ @total_size = 0
292
+ end_flag = false
293
+ @fp.close
294
+ @file_index += 1
295
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
296
+ @outfiles << outfilename
297
+ @fp = File.open(outfilename, "w")
298
+ next
253
299
  end
254
300
  end
255
-
256
- @count ||= 0;@count += 1;
257
-
258
- article = Article.new(text, title, @strip_tmarker)
259
- output_text += block.call(article)
260
- @total_size = output_text.bytesize
261
-
262
- # flagged when data exceeds the size of output file
263
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
301
+ end while !data_empty
264
302
 
265
- #close the present file, then open a new one
266
- if end_flag
267
- output_text.strip!
268
- @fp.puts(output_text)
269
- output_text = ""
270
- @total_size = 0
271
- end_flag = false
272
- @fp.close
273
- @file_index += 1
274
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
275
- @outfiles << outfilename
276
- @fp = File.open(outfilename, "w")
277
- next
278
- end
279
- end
280
303
  if output_text != ""
281
- output_text.strip!
304
+ cleanup!(output_text)
282
305
  @fp.puts(output_text)
283
306
  end
284
307
  notify_parent(true)
@@ -320,4 +343,5 @@ module Wp2txt
320
343
  @parent.msg("Processing finished", 1)
321
344
  end
322
345
  end
323
- end
346
+ end
347
+
@@ -37,6 +37,10 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
+ convert_characters!(text)
41
+ make_reference!(text)
42
+ remove_ref!(text)
43
+
40
44
  parse text
41
45
  end
42
46
 
@@ -58,6 +62,22 @@ module Wp2txt
58
62
  end
59
63
 
60
64
  case mode
65
+ when :mw_ml_template
66
+ scanner = StringScanner.new(line)
67
+ str= process_nested_structure(scanner, "{{", "}}") {""}
68
+ if $ml_template_end_regex =~ str
69
+ mode = nil
70
+ end
71
+ @elements.last.last << line
72
+ next
73
+ when :mw_ml_link
74
+ scanner = StringScanner.new(line)
75
+ str= process_nested_structure(scanner, "[[", "]]") {""}
76
+ if $ml_link_end_regex =~ str
77
+ mode = nil
78
+ end
79
+ @elements.last.last << line
80
+ next
61
81
  when :mw_table
62
82
  if $in_table_regex2 =~ line
63
83
  mode = nil
@@ -91,17 +111,27 @@ module Wp2txt
91
111
  end
92
112
 
93
113
  case line
114
+ when $isolated_template_regex
115
+ @elements << create_element(:mw_isolated_template, line)
116
+ when $isolated_tag_regex
117
+ @elements << create_element(:mw_isolated_tag, line)
94
118
  when $blank_line_regex
95
119
  @elements << create_element(:mw_blank, "\n")
96
120
  when $redirect_regex
97
121
  @elements << create_element(:mw_redirect, line)
98
- when $in_template_regex
99
- @elements << create_element(:mw_template, line)
122
+ # when $in_template_regex
123
+ # @elements << create_element(:mw_template, line)
100
124
  when $in_heading_regex
101
125
  line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
102
126
  @elements << create_element(:mw_heading, "\n" + line + "\n")
103
127
  when $in_inputbox_regex
104
128
  @elements << create_element(:mw_inputbox, line)
129
+ when $ml_template_onset_regex
130
+ @elements << create_element(:mw_ml_template, line)
131
+ mode = :mw_ml_template
132
+ when $ml_link_onset_regex
133
+ @elements << create_element(:mw_ml_link, line)
134
+ mode = :mw_ml_link
105
135
  when $in_inputbox_regex1
106
136
  mode = :mw_inputbox
107
137
  @elements << create_element(:mw_inputbox, line)
@@ -138,7 +168,7 @@ module Wp2txt
138
168
  when $in_link_regex
139
169
  @elements << create_element(:mw_link, line)
140
170
  else
141
- @elements << create_element(:mw_paragraph, line)
171
+ @elements << create_element(:mw_paragraph, "\n" + line)
142
172
  end
143
173
  end
144
174
  @elements
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
16
16
  $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
17
  $html_hash = Hash[*$entities.flatten]
18
18
  $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
-
20
- $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
19
+ $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
+ $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
+ $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
+ $ml_link_end_regex = Regexp.new('\]\]\s*$')
23
+ $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
+ $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
21
25
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
22
-
23
26
  $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
24
27
  $in_inputbox_regex1 = Regexp.new('<inputbox>')
25
28
  $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
26
-
27
29
  $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
28
30
  $in_source_regex1 = Regexp.new('<source.*?>')
29
31
  $in_source_regex2 = Regexp.new('<\/source>')
30
-
31
32
  $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
32
33
  $in_math_regex1 = Regexp.new('<math.*?>')
33
34
  $in_math_regex2 = Regexp.new('<\/math>')
34
-
35
35
  $in_heading_regex = Regexp.new('^=+.*?=+$')
36
-
37
36
  $in_html_table_regex = Regexp.new('<table.*?><\/table>')
38
37
  $in_html_table_regex1 = Regexp.new('<table\b')
39
38
  $in_html_table_regex2 = Regexp.new('<\/\s*table>')
40
-
41
39
  $in_table_regex1 = Regexp.new('^\s*\{\|')
42
40
  $in_table_regex2 = Regexp.new('^\|\}.*?$')
43
-
44
41
  $in_unordered_regex = Regexp.new('^\*')
45
42
  $in_ordered_regex = Regexp.new('^\#')
46
43
  $in_pre_regex = Regexp.new('^ ')
47
44
  $in_definition_regex = Regexp.new('^[\;\:]')
48
-
49
45
  $blank_line_regex = Regexp.new('^\s*$')
50
-
51
46
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
52
-
53
47
  $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
48
  $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
-
56
49
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
57
50
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
58
51
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
75
68
  $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
76
69
  $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
77
70
 
71
+ $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
78
72
  $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
79
73
  $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
80
74
 
@@ -111,26 +105,26 @@ module Wp2txt
111
105
 
112
106
  def format_wiki!(text, has_retried = false)
113
107
  escape_nowiki!(text)
114
-
115
108
  process_interwiki_links!(text)
116
109
  process_external_links!(text)
117
-
118
110
  unescape_nowiki!(text)
119
- end
120
-
121
- def format_article!(text)
111
+ #####
122
112
  remove_directive!(text)
123
113
  remove_emphasis!(text)
124
114
  mndash!(text)
125
- make_reference!(text)
126
- format_ref!(text)
127
115
  remove_hr!(text)
128
116
  remove_tag!(text)
129
- convert_characters!(text)
130
- correct_inline_template!(text) unless $leave_template
131
- remove_templates!(text) unless $leave_template
117
+ correct_inline_template!(text) unless $leave_inline_template
118
+ remove_templates!(text) unless $leave_inline_template
119
+ # remove_table!(text) unless $leave_table
132
120
  end
133
121
 
122
+ def cleanup!(text)
123
+ text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/\n\n\n+/m){"\n\n"}
125
+ text.strip!
126
+ text << "\n\n"
127
+ end
134
128
  #################### parser for nested structure ####################
135
129
 
136
130
  def process_nested_structure(scanner, left, right, &block)
@@ -236,6 +230,10 @@ module Wp2txt
236
230
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
237
231
  ""
238
232
  end
233
+ scanner = StringScanner.new(result)
234
+ result = process_nested_structure(scanner, "{", "}") do |contents|
235
+ ""
236
+ end
239
237
  str.replace(result)
240
238
  end
241
239
 
@@ -309,36 +307,33 @@ module Wp2txt
309
307
  str.gsub!($make_reference_regex_d){"[/ref]"}
310
308
  end
311
309
 
312
- def format_ref!(page)
313
- ###### do nothing for now
314
- # page.gsub!($format_ref_regex) do
315
- # end
316
- end
317
-
318
310
  def correct_inline_template!(str)
319
- str.gsub!($remove_inline_regex) do
320
- key = $1
321
- if $onset_bar_regex =~ key
322
- result = key
323
- elsif
324
- info = key.split("|")
325
- type_code = info.first
326
- case type_code
327
- when $type_code_regex
328
- out = info[-1]
329
- else
330
- if $leave_template
331
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
332
- else
333
- out = ""
311
+ scanner = StringScanner.new(str)
312
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
+ parts = contents.split("|")
314
+ # type_code = parts.first
315
+ # case type_code
316
+ # when $type_code_regex
317
+ # out = parts[-1]
318
+ # else
319
+ # case parts.size
320
+ # when 0
321
+ # out = ""
322
+ # when 1
323
+ # out = parts.first || ""
324
+ # else
325
+ # while parts.size > 2 && parts.last.split("=").size > 1
326
+ while parts.size > 1 && parts.last.split("=").size > 1
327
+ parts.pop
334
328
  end
335
- end
336
- out
337
- else
338
- ""
339
- end
329
+ out = parts.last || ""
330
+ # end
331
+ # end
332
+ out.strip
340
333
  end
334
+ str.replace result
341
335
  end
336
+
342
337
 
343
338
  #################### file related utilities ####################
344
339
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.8.0"
2
+ VERSION = "0.9.1"
3
3
  end
@@ -182,22 +182,34 @@ describe "Wp2txt" do
182
182
  end
183
183
  end
184
184
 
185
- # describe "process_template" do
186
- # it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # expect(process_template(str_before)).to eq str_after
190
- # str_before = "{{lang|en|Japan}}"
191
- # str_after = "Japan"
192
- # expect(process_template(str_before)).to eq str_after
193
- # str_before = "{{a|b=c|d=f}}"
194
- # str_after = "a"
195
- # expect(process_template(str_before)).to eq str_after
196
- # str_before = "{{a|b|{{c|d|e}}}}"
197
- # str_after = "e"
198
- # expect(process_template(str_before)).to eq str_after
199
- # end
200
- # end
185
+ describe "correct_inline_template!" do
186
+ it "removes brackets and leaves some text" do
187
+ # str_before = "{{}}"
188
+ # str_after = ""
189
+ # correct_inline_template!(str_before)
190
+ # expect(str_before).to eq str_after
191
+ str_before = "{{MedalCountry | {{JPN}} }}"
192
+ str_after = "JPN"
193
+ correct_inline_template!(str_before)
194
+ expect(str_before).to eq str_after
195
+ str_before = "{{lang|en|Japan}}"
196
+ str_after = "Japan"
197
+ correct_inline_template!(str_before)
198
+ expect(str_before).to eq str_after
199
+ str_before = "{{a|b=c|d=f}}"
200
+ str_after = "a"
201
+ correct_inline_template!(str_before)
202
+ expect(str_before).to eq str_after
203
+ str_before = "{{a|b|{{c|d|e}}}}"
204
+ str_after = "e"
205
+ correct_inline_template!(str_before)
206
+ expect(str_before).to eq str_after
207
+ str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
208
+ str_after = "日本人に多く見受けられる"
209
+ correct_inline_template!(str_before)
210
+ expect(str_before).to eq str_after
211
+ end
212
+ end
201
213
 
202
214
  # describe "expand_template" do
203
215
  # it "gets data corresponding to a given template using mediawiki api" do