wp2txt 0.8.0 → 0.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
File without changes
@@ -4,6 +4,7 @@
4
4
  $: << File.join(File.dirname(__FILE__))
5
5
 
6
6
  require "nokogiri"
7
+ require "parallel"
7
8
 
8
9
  require 'pp'
9
10
  require "wp2txt/article"
@@ -24,7 +25,7 @@ module Wp2txt
24
25
 
25
26
  include Wp2txt
26
27
 
27
- def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
28
+ def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
28
29
  @parent = parent
29
30
  @fp = nil
30
31
 
@@ -33,6 +34,8 @@ module Wp2txt
33
34
  @tfile_size = tfile_size
34
35
  @convert = convert
35
36
  @strip_tmarker = strip_tmarker
37
+ num_cores_available = Etc.nprocessors
38
+ @num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
36
39
  end
37
40
 
38
41
  def file_size(file)
@@ -97,6 +100,7 @@ module Wp2txt
97
100
  if /.bz2$/ =~ @input_file
98
101
  unless NO_BZ2
99
102
  file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
103
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
100
104
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
101
105
  @infile_size = file_size(file)
102
106
  @parent.msg("... Done.", 1)
@@ -108,6 +112,7 @@ module Wp2txt
108
112
  else
109
113
  file = IO.popen("bzip2 -c -d #{@input_file}")
110
114
  end
115
+ @parent.msg("WP2TXT is spawning #{@num_threads} threads to process data \n", 0)
111
116
  @parent.msg("Preparing ... This may take several minutes or more ", 0)
112
117
  @infile_size = file_size(file)
113
118
  @parent.msg("... Done.", 1)
@@ -232,53 +237,71 @@ module Wp2txt
232
237
  end_flag = false
233
238
  terminal_round = false
234
239
  output_text = ""
240
+ pages = []
241
+ data_empty = false
235
242
 
236
- while page = get_page
237
- xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
238
- xml = xmlns + page + "</mediawiki>"
243
+ begin
244
+ page = get_page
245
+ if page
246
+ pages << page
247
+ else
248
+ data_empty = true
249
+ end
250
+ if data_empty || pages.size == @num_threads
251
+ # pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
252
+ pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
253
+ page_text = {:order => n, :data => nil}
254
+ xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
255
+ xml = xmlns + page + "</mediawiki>"
239
256
 
240
- input = Nokogiri::XML(xml, nil, 'UTF-8')
241
- page = input.xpath("//xmlns:text").first
242
- pp_title = page.parent.parent.at_css "title"
243
- title = pp_title.content
244
- next if /\:/ =~ title
245
- text = page.content
257
+ input = Nokogiri::XML(xml, nil, 'UTF-8')
258
+ page = input.xpath("//xmlns:text").first
259
+ pp_title = page.parent.parent.at_css "title"
260
+ title = pp_title.content
261
+ unless /\:/ =~ title
262
+ text = page.content
263
+ text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
264
+ num_of_newlines = content.count("\n")
265
+ if num_of_newlines == 0
266
+ ""
267
+ else
268
+ "\n" * num_of_newlines
269
+ end
270
+ end
271
+ article = Article.new(text, title, @strip_tmarker)
272
+ page_text[:data] = block.call(article)
273
+ end
274
+ page_text
275
+ end
276
+ pages.clear
277
+ pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
278
+ pages_text.each do |page_text|
279
+ output_text << page_text
280
+ @count ||= 0; @count += 1;
281
+ @total_size = output_text.bytesize
282
+ # flagged when data exceeds the size of output file
283
+ end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
284
+ end
246
285
 
247
- text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
248
- num_of_newlines = content.count("\n")
249
- if num_of_newlines == 0
250
- ""
251
- else
252
- "\n" * num_of_newlines
286
+ #close the present file, then open a new one
287
+ if end_flag
288
+ cleanup!(output_text)
289
+ @fp.puts(output_text)
290
+ output_text = ""
291
+ @total_size = 0
292
+ end_flag = false
293
+ @fp.close
294
+ @file_index += 1
295
+ outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
296
+ @outfiles << outfilename
297
+ @fp = File.open(outfilename, "w")
298
+ next
253
299
  end
254
300
  end
255
-
256
- @count ||= 0;@count += 1;
257
-
258
- article = Article.new(text, title, @strip_tmarker)
259
- output_text += block.call(article)
260
- @total_size = output_text.bytesize
261
-
262
- # flagged when data exceeds the size of output file
263
- end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
301
+ end while !data_empty
264
302
 
265
- #close the present file, then open a new one
266
- if end_flag
267
- output_text.strip!
268
- @fp.puts(output_text)
269
- output_text = ""
270
- @total_size = 0
271
- end_flag = false
272
- @fp.close
273
- @file_index += 1
274
- outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
275
- @outfiles << outfilename
276
- @fp = File.open(outfilename, "w")
277
- next
278
- end
279
- end
280
303
  if output_text != ""
281
- output_text.strip!
304
+ cleanup!(output_text)
282
305
  @fp.puts(output_text)
283
306
  end
284
307
  notify_parent(true)
@@ -320,4 +343,5 @@ module Wp2txt
320
343
  @parent.msg("Processing finished", 1)
321
344
  end
322
345
  end
323
- end
346
+ end
347
+
@@ -37,6 +37,10 @@ module Wp2txt
37
37
  def initialize(text, title = "", strip_tmarker = false)
38
38
  @title = title.strip
39
39
  @strip_tmarker = strip_tmarker
40
+ convert_characters!(text)
41
+ make_reference!(text)
42
+ remove_ref!(text)
43
+
40
44
  parse text
41
45
  end
42
46
 
@@ -58,6 +62,22 @@ module Wp2txt
58
62
  end
59
63
 
60
64
  case mode
65
+ when :mw_ml_template
66
+ scanner = StringScanner.new(line)
67
+ str= process_nested_structure(scanner, "{{", "}}") {""}
68
+ if $ml_template_end_regex =~ str
69
+ mode = nil
70
+ end
71
+ @elements.last.last << line
72
+ next
73
+ when :mw_ml_link
74
+ scanner = StringScanner.new(line)
75
+ str= process_nested_structure(scanner, "[[", "]]") {""}
76
+ if $ml_link_end_regex =~ str
77
+ mode = nil
78
+ end
79
+ @elements.last.last << line
80
+ next
61
81
  when :mw_table
62
82
  if $in_table_regex2 =~ line
63
83
  mode = nil
@@ -91,17 +111,27 @@ module Wp2txt
91
111
  end
92
112
 
93
113
  case line
114
+ when $isolated_template_regex
115
+ @elements << create_element(:mw_isolated_template, line)
116
+ when $isolated_tag_regex
117
+ @elements << create_element(:mw_isolated_tag, line)
94
118
  when $blank_line_regex
95
119
  @elements << create_element(:mw_blank, "\n")
96
120
  when $redirect_regex
97
121
  @elements << create_element(:mw_redirect, line)
98
- when $in_template_regex
99
- @elements << create_element(:mw_template, line)
122
+ # when $in_template_regex
123
+ # @elements << create_element(:mw_template, line)
100
124
  when $in_heading_regex
101
125
  line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
102
126
  @elements << create_element(:mw_heading, "\n" + line + "\n")
103
127
  when $in_inputbox_regex
104
128
  @elements << create_element(:mw_inputbox, line)
129
+ when $ml_template_onset_regex
130
+ @elements << create_element(:mw_ml_template, line)
131
+ mode = :mw_ml_template
132
+ when $ml_link_onset_regex
133
+ @elements << create_element(:mw_ml_link, line)
134
+ mode = :mw_ml_link
105
135
  when $in_inputbox_regex1
106
136
  mode = :mw_inputbox
107
137
  @elements << create_element(:mw_inputbox, line)
@@ -138,7 +168,7 @@ module Wp2txt
138
168
  when $in_link_regex
139
169
  @elements << create_element(:mw_link, line)
140
170
  else
141
- @elements << create_element(:mw_paragraph, line)
171
+ @elements << create_element(:mw_paragraph, "\n" + line)
142
172
  end
143
173
  end
144
174
  @elements
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
16
16
  $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
17
  $html_hash = Hash[*$entities.flatten]
18
18
  $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
-
20
- $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
19
+ $ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
20
+ $ml_template_end_regex = Regexp.new('\}\}\s*$')
21
+ $ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
22
+ $ml_link_end_regex = Regexp.new('\]\]\s*$')
23
+ $isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
24
+ $isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
21
25
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
22
-
23
26
  $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
24
27
  $in_inputbox_regex1 = Regexp.new('<inputbox>')
25
28
  $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
26
-
27
29
  $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
28
30
  $in_source_regex1 = Regexp.new('<source.*?>')
29
31
  $in_source_regex2 = Regexp.new('<\/source>')
30
-
31
32
  $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
32
33
  $in_math_regex1 = Regexp.new('<math.*?>')
33
34
  $in_math_regex2 = Regexp.new('<\/math>')
34
-
35
35
  $in_heading_regex = Regexp.new('^=+.*?=+$')
36
-
37
36
  $in_html_table_regex = Regexp.new('<table.*?><\/table>')
38
37
  $in_html_table_regex1 = Regexp.new('<table\b')
39
38
  $in_html_table_regex2 = Regexp.new('<\/\s*table>')
40
-
41
39
  $in_table_regex1 = Regexp.new('^\s*\{\|')
42
40
  $in_table_regex2 = Regexp.new('^\|\}.*?$')
43
-
44
41
  $in_unordered_regex = Regexp.new('^\*')
45
42
  $in_ordered_regex = Regexp.new('^\#')
46
43
  $in_pre_regex = Regexp.new('^ ')
47
44
  $in_definition_regex = Regexp.new('^[\;\:]')
48
-
49
45
  $blank_line_regex = Regexp.new('^\s*$')
50
-
51
46
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
52
-
53
47
  $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
48
  $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
-
56
49
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
57
50
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
58
51
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
75
68
  $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
76
69
  $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
77
70
 
71
+ $remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
78
72
  $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
79
73
  $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
80
74
 
@@ -111,26 +105,26 @@ module Wp2txt
111
105
 
112
106
  def format_wiki!(text, has_retried = false)
113
107
  escape_nowiki!(text)
114
-
115
108
  process_interwiki_links!(text)
116
109
  process_external_links!(text)
117
-
118
110
  unescape_nowiki!(text)
119
- end
120
-
121
- def format_article!(text)
111
+ #####
122
112
  remove_directive!(text)
123
113
  remove_emphasis!(text)
124
114
  mndash!(text)
125
- make_reference!(text)
126
- format_ref!(text)
127
115
  remove_hr!(text)
128
116
  remove_tag!(text)
129
- convert_characters!(text)
130
- correct_inline_template!(text) unless $leave_template
131
- remove_templates!(text) unless $leave_template
117
+ correct_inline_template!(text) unless $leave_inline_template
118
+ remove_templates!(text) unless $leave_inline_template
119
+ # remove_table!(text) unless $leave_table
132
120
  end
133
121
 
122
+ def cleanup!(text)
123
+ text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
124
+ text.gsub!(/\n\n\n+/m){"\n\n"}
125
+ text.strip!
126
+ text << "\n\n"
127
+ end
134
128
  #################### parser for nested structure ####################
135
129
 
136
130
  def process_nested_structure(scanner, left, right, &block)
@@ -236,6 +230,10 @@ module Wp2txt
236
230
  result = process_nested_structure(scanner, "{{", "}}") do |contents|
237
231
  ""
238
232
  end
233
+ scanner = StringScanner.new(result)
234
+ result = process_nested_structure(scanner, "{", "}") do |contents|
235
+ ""
236
+ end
239
237
  str.replace(result)
240
238
  end
241
239
 
@@ -309,36 +307,33 @@ module Wp2txt
309
307
  str.gsub!($make_reference_regex_d){"[/ref]"}
310
308
  end
311
309
 
312
- def format_ref!(page)
313
- ###### do nothing for now
314
- # page.gsub!($format_ref_regex) do
315
- # end
316
- end
317
-
318
310
  def correct_inline_template!(str)
319
- str.gsub!($remove_inline_regex) do
320
- key = $1
321
- if $onset_bar_regex =~ key
322
- result = key
323
- elsif
324
- info = key.split("|")
325
- type_code = info.first
326
- case type_code
327
- when $type_code_regex
328
- out = info[-1]
329
- else
330
- if $leave_template
331
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
332
- else
333
- out = ""
311
+ scanner = StringScanner.new(str)
312
+ result = process_nested_structure(scanner, "{{", "}}") do |contents|
313
+ parts = contents.split("|")
314
+ # type_code = parts.first
315
+ # case type_code
316
+ # when $type_code_regex
317
+ # out = parts[-1]
318
+ # else
319
+ # case parts.size
320
+ # when 0
321
+ # out = ""
322
+ # when 1
323
+ # out = parts.first || ""
324
+ # else
325
+ # while parts.size > 2 && parts.last.split("=").size > 1
326
+ while parts.size > 1 && parts.last.split("=").size > 1
327
+ parts.pop
334
328
  end
335
- end
336
- out
337
- else
338
- ""
339
- end
329
+ out = parts.last || ""
330
+ # end
331
+ # end
332
+ out.strip
340
333
  end
334
+ str.replace result
341
335
  end
336
+
342
337
 
343
338
  #################### file related utilities ####################
344
339
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.8.0"
2
+ VERSION = "0.9.1"
3
3
  end
@@ -182,22 +182,34 @@ describe "Wp2txt" do
182
182
  end
183
183
  end
184
184
 
185
- # describe "process_template" do
186
- # it "removes brackets and leaving some text" do
187
- # str_before = "{{}}"
188
- # str_after = ""
189
- # expect(process_template(str_before)).to eq str_after
190
- # str_before = "{{lang|en|Japan}}"
191
- # str_after = "Japan"
192
- # expect(process_template(str_before)).to eq str_after
193
- # str_before = "{{a|b=c|d=f}}"
194
- # str_after = "a"
195
- # expect(process_template(str_before)).to eq str_after
196
- # str_before = "{{a|b|{{c|d|e}}}}"
197
- # str_after = "e"
198
- # expect(process_template(str_before)).to eq str_after
199
- # end
200
- # end
185
+ describe "correct_inline_template!" do
186
+ it "removes brackets and leaves some text" do
187
+ # str_before = "{{}}"
188
+ # str_after = ""
189
+ # correct_inline_template!(str_before)
190
+ # expect(str_before).to eq str_after
191
+ str_before = "{{MedalCountry | {{JPN}} }}"
192
+ str_after = "JPN"
193
+ correct_inline_template!(str_before)
194
+ expect(str_before).to eq str_after
195
+ str_before = "{{lang|en|Japan}}"
196
+ str_after = "Japan"
197
+ correct_inline_template!(str_before)
198
+ expect(str_before).to eq str_after
199
+ str_before = "{{a|b=c|d=f}}"
200
+ str_after = "a"
201
+ correct_inline_template!(str_before)
202
+ expect(str_before).to eq str_after
203
+ str_before = "{{a|b|{{c|d|e}}}}"
204
+ str_after = "e"
205
+ correct_inline_template!(str_before)
206
+ expect(str_before).to eq str_after
207
+ str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
208
+ str_after = "日本人に多く見受けられる"
209
+ correct_inline_template!(str_before)
210
+ expect(str_before).to eq str_after
211
+ end
212
+ end
201
213
 
202
214
  # describe "expand_template" do
203
215
  # it "gets data corresponding to a given template using mediawiki api" do