wp2txt 0.7.8 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +29 -30
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt/article.rb +34 -4
- data/lib/wp2txt/utils.rb +50 -53
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +69 -75
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +2 -1
- metadata +25 -10
- data/error_log.txt +0 -1
Binary file
|
File without changes
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,6 +37,10 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
+
convert_characters!(text)
|
41
|
+
make_reference!(text)
|
42
|
+
remove_ref!(text)
|
43
|
+
|
40
44
|
parse text
|
41
45
|
end
|
42
46
|
|
@@ -58,6 +62,22 @@ module Wp2txt
|
|
58
62
|
end
|
59
63
|
|
60
64
|
case mode
|
65
|
+
when :mw_ml_template
|
66
|
+
scanner = StringScanner.new(line)
|
67
|
+
str= process_nested_structure(scanner, "{{", "}}") {""}
|
68
|
+
if $ml_template_end_regex =~ str
|
69
|
+
mode = nil
|
70
|
+
end
|
71
|
+
@elements.last.last << line
|
72
|
+
next
|
73
|
+
when :mw_ml_link
|
74
|
+
scanner = StringScanner.new(line)
|
75
|
+
str= process_nested_structure(scanner, "[[", "]]") {""}
|
76
|
+
if $ml_link_end_regex =~ str
|
77
|
+
mode = nil
|
78
|
+
end
|
79
|
+
@elements.last.last << line
|
80
|
+
next
|
61
81
|
when :mw_table
|
62
82
|
if $in_table_regex2 =~ line
|
63
83
|
mode = nil
|
@@ -91,19 +111,29 @@ module Wp2txt
|
|
91
111
|
end
|
92
112
|
|
93
113
|
case line
|
114
|
+
when $isolated_template_regex
|
115
|
+
@elements << create_element(:mw_isolated_template, line)
|
116
|
+
when $isolated_tag_regex
|
117
|
+
@elements << create_element(:mw_isolated_tag, line)
|
94
118
|
when $blank_line_regex
|
95
119
|
@elements << create_element(:mw_blank, "\n")
|
96
120
|
when $redirect_regex
|
97
121
|
@elements << create_element(:mw_redirect, line)
|
98
|
-
when $in_template_regex
|
99
|
-
|
122
|
+
# when $in_template_regex
|
123
|
+
# @elements << create_element(:mw_template, line)
|
100
124
|
when $in_heading_regex
|
101
125
|
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
102
126
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
103
127
|
when $in_inputbox_regex
|
104
128
|
@elements << create_element(:mw_inputbox, line)
|
129
|
+
when $ml_template_onset_regex
|
130
|
+
@elements << create_element(:mw_ml_template, line)
|
131
|
+
mode = :mw_ml_template
|
132
|
+
when $ml_link_onset_regex
|
133
|
+
@elements << create_element(:mw_ml_link, line)
|
134
|
+
mode = :mw_ml_link
|
105
135
|
when $in_inputbox_regex1
|
106
|
-
mode = :mw_inputbox
|
136
|
+
mode = :mw_inputbox
|
107
137
|
@elements << create_element(:mw_inputbox, line)
|
108
138
|
when $in_source_regex
|
109
139
|
@elements << create_element(:mw_source, line)
|
@@ -138,7 +168,7 @@ module Wp2txt
|
|
138
168
|
when $in_link_regex
|
139
169
|
@elements << create_element(:mw_link, line)
|
140
170
|
else
|
141
|
-
@elements << create_element(:mw_paragraph, line)
|
171
|
+
@elements << create_element(:mw_paragraph, "\n" + line)
|
142
172
|
end
|
143
173
|
end
|
144
174
|
@elements
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
|
|
16
16
|
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
17
|
$html_hash = Hash[*$entities.flatten]
|
18
18
|
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
|
20
|
-
$
|
19
|
+
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
+
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
+
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
+
$ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
+
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
+
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
21
25
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
22
|
-
|
23
26
|
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
24
27
|
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
25
28
|
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
26
|
-
|
27
29
|
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
28
30
|
$in_source_regex1 = Regexp.new('<source.*?>')
|
29
31
|
$in_source_regex2 = Regexp.new('<\/source>')
|
30
|
-
|
31
32
|
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
32
33
|
$in_math_regex1 = Regexp.new('<math.*?>')
|
33
34
|
$in_math_regex2 = Regexp.new('<\/math>')
|
34
|
-
|
35
35
|
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
|
37
36
|
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
38
37
|
$in_html_table_regex1 = Regexp.new('<table\b')
|
39
38
|
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
40
|
-
|
41
39
|
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
42
40
|
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
43
|
-
|
44
41
|
$in_unordered_regex = Regexp.new('^\*')
|
45
42
|
$in_ordered_regex = Regexp.new('^\#')
|
46
43
|
$in_pre_regex = Regexp.new('^ ')
|
47
44
|
$in_definition_regex = Regexp.new('^[\;\:]')
|
48
|
-
|
49
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
50
|
-
|
51
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
52
|
-
|
53
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
48
|
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
-
|
56
49
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
57
50
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
58
51
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
|
|
75
68
|
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
76
69
|
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
77
70
|
|
71
|
+
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
78
72
|
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
79
73
|
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
80
74
|
|
@@ -111,27 +105,26 @@ module Wp2txt
|
|
111
105
|
|
112
106
|
def format_wiki!(text, has_retried = false)
|
113
107
|
escape_nowiki!(text)
|
114
|
-
|
115
108
|
process_interwiki_links!(text)
|
116
109
|
process_external_links!(text)
|
117
|
-
|
118
110
|
unescape_nowiki!(text)
|
119
|
-
|
120
|
-
|
121
|
-
def format_article!(text)
|
111
|
+
#####
|
122
112
|
remove_directive!(text)
|
123
113
|
remove_emphasis!(text)
|
124
114
|
mndash!(text)
|
125
|
-
make_reference!(text)
|
126
|
-
format_ref!(text)
|
127
115
|
remove_hr!(text)
|
128
116
|
remove_tag!(text)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
remove_table!(text) unless $leave_table
|
117
|
+
correct_inline_template!(text) unless $leave_inline_template
|
118
|
+
remove_templates!(text) unless $leave_inline_template
|
119
|
+
# remove_table!(text) unless $leave_table
|
133
120
|
end
|
134
121
|
|
122
|
+
def cleanup!(text)
|
123
|
+
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
|
+
text.strip!
|
126
|
+
text << "\n\n"
|
127
|
+
end
|
135
128
|
#################### parser for nested structure ####################
|
136
129
|
|
137
130
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -237,6 +230,10 @@ module Wp2txt
|
|
237
230
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
238
231
|
""
|
239
232
|
end
|
233
|
+
scanner = StringScanner.new(result)
|
234
|
+
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
|
+
""
|
236
|
+
end
|
240
237
|
str.replace(result)
|
241
238
|
end
|
242
239
|
|
@@ -295,48 +292,48 @@ module Wp2txt
|
|
295
292
|
str.gsub!($mndash_regex, "–")
|
296
293
|
end
|
297
294
|
|
298
|
-
def remove_hr!(
|
299
|
-
|
295
|
+
def remove_hr!(str)
|
296
|
+
str.gsub!($remove_hr_regex, "")
|
300
297
|
end
|
301
298
|
|
299
|
+
def remove_ref!(str)
|
300
|
+
str.gsub!($format_ref_regex){""}
|
301
|
+
end
|
302
|
+
|
302
303
|
def make_reference!(str)
|
303
304
|
str.gsub!($make_reference_regex_a){"\n"}
|
304
305
|
str.gsub!($make_reference_regex_b){""}
|
305
306
|
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
307
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
-
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
|
-
end
|
309
|
-
|
310
|
-
def format_ref!(page)
|
311
|
-
###### do nothing for now
|
312
|
-
# page.gsub!($format_ref_regex) do
|
313
|
-
# end
|
314
308
|
end
|
315
309
|
|
316
310
|
def correct_inline_template!(str)
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
311
|
+
scanner = StringScanner.new(str)
|
312
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
|
+
parts = contents.split("|")
|
314
|
+
# type_code = parts.first
|
315
|
+
# case type_code
|
316
|
+
# when $type_code_regex
|
317
|
+
# out = parts[-1]
|
318
|
+
# else
|
319
|
+
# case parts.size
|
320
|
+
# when 0
|
321
|
+
# out = ""
|
322
|
+
# when 1
|
323
|
+
# out = parts.first || ""
|
324
|
+
# else
|
325
|
+
# while parts.size > 2 && parts.last.split("=").size > 1
|
326
|
+
while parts.size > 1 && parts.last.split("=").size > 1
|
327
|
+
parts.pop
|
332
328
|
end
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
end
|
329
|
+
out = parts.last || ""
|
330
|
+
# end
|
331
|
+
# end
|
332
|
+
out.strip
|
338
333
|
end
|
334
|
+
str.replace result
|
339
335
|
end
|
336
|
+
|
340
337
|
|
341
338
|
#################### file related utilities ####################
|
342
339
|
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -3,13 +3,10 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
# require "rubygems"
|
7
|
-
# require "bundler/setup"
|
8
|
-
|
9
6
|
require "nokogiri"
|
10
|
-
|
11
|
-
# require "ox"
|
7
|
+
require "parallel"
|
12
8
|
|
9
|
+
require 'etc'
|
13
10
|
require 'pp'
|
14
11
|
require "wp2txt/article"
|
15
12
|
require "wp2txt/utils"
|
@@ -29,7 +26,7 @@ module Wp2txt
|
|
29
26
|
|
30
27
|
include Wp2txt
|
31
28
|
|
32
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
29
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
33
30
|
@parent = parent
|
34
31
|
@fp = nil
|
35
32
|
|
@@ -38,6 +35,8 @@ module Wp2txt
|
|
38
35
|
@tfile_size = tfile_size
|
39
36
|
@convert = convert
|
40
37
|
@strip_tmarker = strip_tmarker
|
38
|
+
num_cores_available = Etc.nprocessors
|
39
|
+
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
41
40
|
end
|
42
41
|
|
43
42
|
def file_size(file)
|
@@ -102,6 +101,7 @@ module Wp2txt
|
|
102
101
|
if /.bz2$/ =~ @input_file
|
103
102
|
unless NO_BZ2
|
104
103
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
+
@parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
|
105
105
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
106
|
@infile_size = file_size(file)
|
107
107
|
@parent.msg("... Done.", 1)
|
@@ -113,6 +113,7 @@ module Wp2txt
|
|
113
113
|
else
|
114
114
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
115
|
end
|
116
|
+
@parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
|
116
117
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
117
118
|
@infile_size = file_size(file)
|
118
119
|
@parent.msg("... Done.", 1)
|
@@ -237,81 +238,73 @@ module Wp2txt
|
|
237
238
|
end_flag = false
|
238
239
|
terminal_round = false
|
239
240
|
output_text = ""
|
241
|
+
pages = []
|
242
|
+
data_empty = false
|
240
243
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
255
|
-
# next if /\:/ =~ title
|
256
|
-
# text = page.text
|
244
|
+
begin
|
245
|
+
page = get_page
|
246
|
+
if page
|
247
|
+
pages << page
|
248
|
+
else
|
249
|
+
data_empty = true
|
250
|
+
end
|
251
|
+
if data_empty || pages.size == @num_threads
|
252
|
+
# pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
253
|
+
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
254
|
+
page_text = {:order => n, :data => nil}
|
255
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
256
|
+
xml = xmlns + page + "</mediawiki>"
|
257
257
|
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
258
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
259
|
+
page = input.xpath("//xmlns:text").first
|
260
|
+
pp_title = page.parent.parent.at_css "title"
|
261
|
+
title = pp_title.content
|
262
|
+
unless /\:/ =~ title
|
263
|
+
text = page.content
|
264
|
+
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
265
|
+
num_of_newlines = content.count("\n")
|
266
|
+
if num_of_newlines == 0
|
267
|
+
""
|
268
|
+
else
|
269
|
+
"\n" * num_of_newlines
|
270
|
+
end
|
271
|
+
end
|
272
|
+
article = Article.new(text, title, @strip_tmarker)
|
273
|
+
page_text[:data] = block.call(article)
|
274
|
+
end
|
275
|
+
page_text
|
276
|
+
end
|
277
|
+
pages.clear
|
278
|
+
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
279
|
+
pages_text.each do |page_text|
|
280
|
+
output_text << page_text
|
281
|
+
@count ||= 0; @count += 1;
|
282
|
+
@total_size = output_text.bytesize
|
283
|
+
# flagged when data exceeds the size of output file
|
284
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
285
|
+
end
|
278
286
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
287
|
+
#close the present file, then open a new one
|
288
|
+
if end_flag
|
289
|
+
cleanup!(output_text)
|
290
|
+
@fp.puts(output_text)
|
291
|
+
output_text = ""
|
292
|
+
@total_size = 0
|
293
|
+
end_flag = false
|
294
|
+
@fp.close
|
295
|
+
@file_index += 1
|
296
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
297
|
+
@outfiles << outfilename
|
298
|
+
@fp = File.open(outfilename, "w")
|
299
|
+
next
|
288
300
|
end
|
289
301
|
end
|
290
|
-
|
291
|
-
@count ||= 0;@count += 1;
|
292
|
-
|
293
|
-
article = Article.new(text, title, @strip_tmarker)
|
294
|
-
output_text += block.call(article)
|
295
|
-
@total_size = output_text.bytesize
|
302
|
+
end while !data_empty
|
296
303
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
#close the present file, then open a new one
|
301
|
-
if end_flag
|
302
|
-
@fp.puts(output_text)
|
303
|
-
output_text = ""
|
304
|
-
@total_size = 0
|
305
|
-
end_flag = false
|
306
|
-
@fp.close
|
307
|
-
@file_index += 1
|
308
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
309
|
-
@outfiles << outfilename
|
310
|
-
@fp = File.open(outfilename, "w")
|
311
|
-
next
|
312
|
-
end
|
304
|
+
if output_text != ""
|
305
|
+
cleanup!(output_text)
|
306
|
+
@fp.puts(output_text)
|
313
307
|
end
|
314
|
-
@fp.puts(output_text) if output_text != ""
|
315
308
|
notify_parent(true)
|
316
309
|
@parent.after
|
317
310
|
@fp.close
|
@@ -351,4 +344,5 @@ module Wp2txt
|
|
351
344
|
@parent.msg("Processing finished", 1)
|
352
345
|
end
|
353
346
|
end
|
354
|
-
end
|
347
|
+
end
|
348
|
+
|
data/spec/utils_spec.rb
CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
185
|
+
describe "correct_inline_template!" do
|
186
|
+
it "removes brackets and leaving some text" do
|
187
|
+
# str_before = "{{}}"
|
188
|
+
# str_after = ""
|
189
|
+
# correct_inline_template!(str_before)
|
190
|
+
# expect(str_before).to eq str_after
|
191
|
+
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
|
+
str_after = "JPN"
|
193
|
+
correct_inline_template!(str_before)
|
194
|
+
expect(str_before).to eq str_after
|
195
|
+
str_before = "{{lang|en|Japan}}"
|
196
|
+
str_after = "Japan"
|
197
|
+
correct_inline_template!(str_before)
|
198
|
+
expect(str_before).to eq str_after
|
199
|
+
str_before = "{{a|b=c|d=f}}"
|
200
|
+
str_after = "a"
|
201
|
+
correct_inline_template!(str_before)
|
202
|
+
expect(str_before).to eq str_after
|
203
|
+
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
+
str_after = "e"
|
205
|
+
correct_inline_template!(str_before)
|
206
|
+
expect(str_before).to eq str_after
|
207
|
+
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
208
|
+
str_after = "日本人に多く見受けられる"
|
209
|
+
correct_inline_template!(str_before)
|
210
|
+
expect(str_before).to eq str_after
|
211
|
+
end
|
212
|
+
end
|
201
213
|
|
202
214
|
# describe "expand_template" do
|
203
215
|
# it "gets data corresponding to a given template using mediawiki api" do
|
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: htmlentities
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,7 +53,7 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: optimist
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
@@ -69,8 +83,10 @@ files:
|
|
69
83
|
- Rakefile
|
70
84
|
- bin/benchmark.rb
|
71
85
|
- bin/wp2txt
|
72
|
-
- data/
|
73
|
-
-
|
86
|
+
- data/output_samples/testdata_en.txt
|
87
|
+
- data/output_samples/testdata_ja.txt
|
88
|
+
- data/testdata_en.bz2
|
89
|
+
- data/testdata_ja.bz2
|
74
90
|
- lib/wp2txt.rb
|
75
91
|
- lib/wp2txt/article.rb
|
76
92
|
- lib/wp2txt/mw_api.rb
|
@@ -83,7 +99,7 @@ files:
|
|
83
99
|
homepage: http://github.com/yohasebe/wp2txt
|
84
100
|
licenses: []
|
85
101
|
metadata: {}
|
86
|
-
post_install_message:
|
102
|
+
post_install_message:
|
87
103
|
rdoc_options: []
|
88
104
|
require_paths:
|
89
105
|
- lib
|
@@ -98,9 +114,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
114
|
- !ruby/object:Gem::Version
|
99
115
|
version: '0'
|
100
116
|
requirements: []
|
101
|
-
|
102
|
-
|
103
|
-
signing_key:
|
117
|
+
rubygems_version: 3.3.3
|
118
|
+
signing_key:
|
104
119
|
specification_version: 4
|
105
120
|
summary: Wikipedia dump to text converter
|
106
121
|
test_files:
|
data/error_log.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
[[アンパサンド]]
|