wp2txt 0.7.8 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +11 -6
- data/bin/benchmark.rb +5 -4
- data/bin/wp2txt +29 -30
- data/data/output_samples/testdata_en.txt +49076 -0
- data/data/output_samples/testdata_ja.txt +9382 -0
- data/data/testdata_en.bz2 +0 -0
- data/data/{testdata.bz2 → testdata_ja.bz2} +0 -0
- data/lib/wp2txt/article.rb +34 -4
- data/lib/wp2txt/utils.rb +50 -53
- data/lib/wp2txt/version.rb +1 -1
- data/lib/wp2txt.rb +69 -75
- data/spec/utils_spec.rb +28 -16
- data/wp2txt.gemspec +2 -1
- metadata +25 -10
- data/error_log.txt +0 -1
Binary file
|
File without changes
|
data/lib/wp2txt/article.rb
CHANGED
@@ -37,6 +37,10 @@ module Wp2txt
|
|
37
37
|
def initialize(text, title = "", strip_tmarker = false)
|
38
38
|
@title = title.strip
|
39
39
|
@strip_tmarker = strip_tmarker
|
40
|
+
convert_characters!(text)
|
41
|
+
make_reference!(text)
|
42
|
+
remove_ref!(text)
|
43
|
+
|
40
44
|
parse text
|
41
45
|
end
|
42
46
|
|
@@ -58,6 +62,22 @@ module Wp2txt
|
|
58
62
|
end
|
59
63
|
|
60
64
|
case mode
|
65
|
+
when :mw_ml_template
|
66
|
+
scanner = StringScanner.new(line)
|
67
|
+
str= process_nested_structure(scanner, "{{", "}}") {""}
|
68
|
+
if $ml_template_end_regex =~ str
|
69
|
+
mode = nil
|
70
|
+
end
|
71
|
+
@elements.last.last << line
|
72
|
+
next
|
73
|
+
when :mw_ml_link
|
74
|
+
scanner = StringScanner.new(line)
|
75
|
+
str= process_nested_structure(scanner, "[[", "]]") {""}
|
76
|
+
if $ml_link_end_regex =~ str
|
77
|
+
mode = nil
|
78
|
+
end
|
79
|
+
@elements.last.last << line
|
80
|
+
next
|
61
81
|
when :mw_table
|
62
82
|
if $in_table_regex2 =~ line
|
63
83
|
mode = nil
|
@@ -91,19 +111,29 @@ module Wp2txt
|
|
91
111
|
end
|
92
112
|
|
93
113
|
case line
|
114
|
+
when $isolated_template_regex
|
115
|
+
@elements << create_element(:mw_isolated_template, line)
|
116
|
+
when $isolated_tag_regex
|
117
|
+
@elements << create_element(:mw_isolated_tag, line)
|
94
118
|
when $blank_line_regex
|
95
119
|
@elements << create_element(:mw_blank, "\n")
|
96
120
|
when $redirect_regex
|
97
121
|
@elements << create_element(:mw_redirect, line)
|
98
|
-
when $in_template_regex
|
99
|
-
|
122
|
+
# when $in_template_regex
|
123
|
+
# @elements << create_element(:mw_template, line)
|
100
124
|
when $in_heading_regex
|
101
125
|
line = line.sub($heading_onset_regex){$1}.sub($heading_coda_regex){$1}
|
102
126
|
@elements << create_element(:mw_heading, "\n" + line + "\n")
|
103
127
|
when $in_inputbox_regex
|
104
128
|
@elements << create_element(:mw_inputbox, line)
|
129
|
+
when $ml_template_onset_regex
|
130
|
+
@elements << create_element(:mw_ml_template, line)
|
131
|
+
mode = :mw_ml_template
|
132
|
+
when $ml_link_onset_regex
|
133
|
+
@elements << create_element(:mw_ml_link, line)
|
134
|
+
mode = :mw_ml_link
|
105
135
|
when $in_inputbox_regex1
|
106
|
-
mode = :mw_inputbox
|
136
|
+
mode = :mw_inputbox
|
107
137
|
@elements << create_element(:mw_inputbox, line)
|
108
138
|
when $in_source_regex
|
109
139
|
@elements << create_element(:mw_source, line)
|
@@ -138,7 +168,7 @@ module Wp2txt
|
|
138
168
|
when $in_link_regex
|
139
169
|
@elements << create_element(:mw_link, line)
|
140
170
|
else
|
141
|
-
@elements << create_element(:mw_paragraph, line)
|
171
|
+
@elements << create_element(:mw_paragraph, "\n" + line)
|
142
172
|
end
|
143
173
|
end
|
144
174
|
@elements
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -16,43 +16,36 @@ $html_decoder = HTMLEntities.new
|
|
16
16
|
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
17
|
$html_hash = Hash[*$entities.flatten]
|
18
18
|
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
-
|
20
|
-
$
|
19
|
+
$ml_template_onset_regex = Regexp.new('^\{\{[^\}]*$')
|
20
|
+
$ml_template_end_regex = Regexp.new('\}\}\s*$')
|
21
|
+
$ml_link_onset_regex = Regexp.new('^\[\[[^\]]*$')
|
22
|
+
$ml_linkend_regex = Regexp.new('\]\]\s*$')
|
23
|
+
$isolated_template_regex = Regexp.new('^\s*\{\{.+\}\}\s*$')
|
24
|
+
$isolated_tag_regex = Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
|
21
25
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
22
|
-
|
23
26
|
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
24
27
|
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
25
28
|
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
26
|
-
|
27
29
|
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
28
30
|
$in_source_regex1 = Regexp.new('<source.*?>')
|
29
31
|
$in_source_regex2 = Regexp.new('<\/source>')
|
30
|
-
|
31
32
|
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
32
33
|
$in_math_regex1 = Regexp.new('<math.*?>')
|
33
34
|
$in_math_regex2 = Regexp.new('<\/math>')
|
34
|
-
|
35
35
|
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
36
|
-
|
37
36
|
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
38
37
|
$in_html_table_regex1 = Regexp.new('<table\b')
|
39
38
|
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
40
|
-
|
41
39
|
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
42
40
|
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
43
|
-
|
44
41
|
$in_unordered_regex = Regexp.new('^\*')
|
45
42
|
$in_ordered_regex = Regexp.new('^\#')
|
46
43
|
$in_pre_regex = Regexp.new('^ ')
|
47
44
|
$in_definition_regex = Regexp.new('^[\;\:]')
|
48
|
-
|
49
45
|
$blank_line_regex = Regexp.new('^\s*$')
|
50
|
-
|
51
46
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
52
|
-
|
53
47
|
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
48
|
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
-
|
56
49
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
57
50
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
58
51
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -75,6 +68,7 @@ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}
|
|
75
68
|
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
76
69
|
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
77
70
|
|
71
|
+
$remove_isolated_regex = Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
|
78
72
|
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
79
73
|
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
80
74
|
|
@@ -111,27 +105,26 @@ module Wp2txt
|
|
111
105
|
|
112
106
|
def format_wiki!(text, has_retried = false)
|
113
107
|
escape_nowiki!(text)
|
114
|
-
|
115
108
|
process_interwiki_links!(text)
|
116
109
|
process_external_links!(text)
|
117
|
-
|
118
110
|
unescape_nowiki!(text)
|
119
|
-
|
120
|
-
|
121
|
-
def format_article!(text)
|
111
|
+
#####
|
122
112
|
remove_directive!(text)
|
123
113
|
remove_emphasis!(text)
|
124
114
|
mndash!(text)
|
125
|
-
make_reference!(text)
|
126
|
-
format_ref!(text)
|
127
115
|
remove_hr!(text)
|
128
116
|
remove_tag!(text)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
remove_table!(text) unless $leave_table
|
117
|
+
correct_inline_template!(text) unless $leave_inline_template
|
118
|
+
remove_templates!(text) unless $leave_inline_template
|
119
|
+
# remove_table!(text) unless $leave_table
|
133
120
|
end
|
134
121
|
|
122
|
+
def cleanup!(text)
|
123
|
+
text.gsub!(/\[ref\]\s*\[\/ref\]/m){""}
|
124
|
+
text.gsub!(/\n\n\n+/m){"\n\n"}
|
125
|
+
text.strip!
|
126
|
+
text << "\n\n"
|
127
|
+
end
|
135
128
|
#################### parser for nested structure ####################
|
136
129
|
|
137
130
|
def process_nested_structure(scanner, left, right, &block)
|
@@ -237,6 +230,10 @@ module Wp2txt
|
|
237
230
|
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
238
231
|
""
|
239
232
|
end
|
233
|
+
scanner = StringScanner.new(result)
|
234
|
+
result = process_nested_structure(scanner, "{", "}") do |contents|
|
235
|
+
""
|
236
|
+
end
|
240
237
|
str.replace(result)
|
241
238
|
end
|
242
239
|
|
@@ -295,48 +292,48 @@ module Wp2txt
|
|
295
292
|
str.gsub!($mndash_regex, "–")
|
296
293
|
end
|
297
294
|
|
298
|
-
def remove_hr!(
|
299
|
-
|
295
|
+
def remove_hr!(str)
|
296
|
+
str.gsub!($remove_hr_regex, "")
|
300
297
|
end
|
301
298
|
|
299
|
+
def remove_ref!(str)
|
300
|
+
str.gsub!($format_ref_regex){""}
|
301
|
+
end
|
302
|
+
|
302
303
|
def make_reference!(str)
|
303
304
|
str.gsub!($make_reference_regex_a){"\n"}
|
304
305
|
str.gsub!($make_reference_regex_b){""}
|
305
306
|
str.gsub!($make_reference_regex_c){"[ref]"}
|
306
307
|
str.gsub!($make_reference_regex_d){"[/ref]"}
|
307
|
-
str.gsub!($format_ref_regex){""} unless $leave_ref
|
308
|
-
end
|
309
|
-
|
310
|
-
def format_ref!(page)
|
311
|
-
###### do nothing for now
|
312
|
-
# page.gsub!($format_ref_regex) do
|
313
|
-
# end
|
314
308
|
end
|
315
309
|
|
316
310
|
def correct_inline_template!(str)
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
311
|
+
scanner = StringScanner.new(str)
|
312
|
+
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
313
|
+
parts = contents.split("|")
|
314
|
+
# type_code = parts.first
|
315
|
+
# case type_code
|
316
|
+
# when $type_code_regex
|
317
|
+
# out = parts[-1]
|
318
|
+
# else
|
319
|
+
# case parts.size
|
320
|
+
# when 0
|
321
|
+
# out = ""
|
322
|
+
# when 1
|
323
|
+
# out = parts.first || ""
|
324
|
+
# else
|
325
|
+
# while parts.size > 2 && parts.last.split("=").size > 1
|
326
|
+
while parts.size > 1 && parts.last.split("=").size > 1
|
327
|
+
parts.pop
|
332
328
|
end
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
end
|
329
|
+
out = parts.last || ""
|
330
|
+
# end
|
331
|
+
# end
|
332
|
+
out.strip
|
338
333
|
end
|
334
|
+
str.replace result
|
339
335
|
end
|
336
|
+
|
340
337
|
|
341
338
|
#################### file related utilities ####################
|
342
339
|
|
data/lib/wp2txt/version.rb
CHANGED
data/lib/wp2txt.rb
CHANGED
@@ -3,13 +3,10 @@
|
|
3
3
|
|
4
4
|
$: << File.join(File.dirname(__FILE__))
|
5
5
|
|
6
|
-
# require "rubygems"
|
7
|
-
# require "bundler/setup"
|
8
|
-
|
9
6
|
require "nokogiri"
|
10
|
-
|
11
|
-
# require "ox"
|
7
|
+
require "parallel"
|
12
8
|
|
9
|
+
require 'etc'
|
13
10
|
require 'pp'
|
14
11
|
require "wp2txt/article"
|
15
12
|
require "wp2txt/utils"
|
@@ -29,7 +26,7 @@ module Wp2txt
|
|
29
26
|
|
30
27
|
include Wp2txt
|
31
28
|
|
32
|
-
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, convert = true, strip_tmarker = false)
|
29
|
+
def initialize(parent, input_file, output_dir = ".", tfile_size = 10, num_threads = 1, convert = true, strip_tmarker = false)
|
33
30
|
@parent = parent
|
34
31
|
@fp = nil
|
35
32
|
|
@@ -38,6 +35,8 @@ module Wp2txt
|
|
38
35
|
@tfile_size = tfile_size
|
39
36
|
@convert = convert
|
40
37
|
@strip_tmarker = strip_tmarker
|
38
|
+
num_cores_available = Etc.nprocessors
|
39
|
+
@num_threads = num_threads <= num_cores_available ? num_threads : num_cores_available
|
41
40
|
end
|
42
41
|
|
43
42
|
def file_size(file)
|
@@ -102,6 +101,7 @@ module Wp2txt
|
|
102
101
|
if /.bz2$/ =~ @input_file
|
103
102
|
unless NO_BZ2
|
104
103
|
file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
|
104
|
+
@parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
|
105
105
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
106
106
|
@infile_size = file_size(file)
|
107
107
|
@parent.msg("... Done.", 1)
|
@@ -113,6 +113,7 @@ module Wp2txt
|
|
113
113
|
else
|
114
114
|
file = IO.popen("bzip2 -c -d #{@input_file}")
|
115
115
|
end
|
116
|
+
@parent.msg("WP2TXT is spawming #{@num_threads} threads to process data \n", 0)
|
116
117
|
@parent.msg("Preparing ... This may take several minutes or more ", 0)
|
117
118
|
@infile_size = file_size(file)
|
118
119
|
@parent.msg("... Done.", 1)
|
@@ -237,81 +238,73 @@ module Wp2txt
|
|
237
238
|
end_flag = false
|
238
239
|
terminal_round = false
|
239
240
|
output_text = ""
|
241
|
+
pages = []
|
242
|
+
data_empty = false
|
240
243
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
# title = page.parent.parent.xpath("//xmlns:title").first.text
|
255
|
-
# next if /\:/ =~ title
|
256
|
-
# text = page.text
|
244
|
+
begin
|
245
|
+
page = get_page
|
246
|
+
if page
|
247
|
+
pages << page
|
248
|
+
else
|
249
|
+
data_empty = true
|
250
|
+
end
|
251
|
+
if data_empty || pages.size == @num_threads
|
252
|
+
# pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
253
|
+
pages_text = Parallel.map_with_index(pages, in_threads: @num_threads) do |page, n|
|
254
|
+
page_text = {:order => n, :data => nil}
|
255
|
+
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
|
256
|
+
xml = xmlns + page + "</mediawiki>"
|
257
257
|
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
258
|
+
input = Nokogiri::XML(xml, nil, 'UTF-8')
|
259
|
+
page = input.xpath("//xmlns:text").first
|
260
|
+
pp_title = page.parent.parent.at_css "title"
|
261
|
+
title = pp_title.content
|
262
|
+
unless /\:/ =~ title
|
263
|
+
text = page.content
|
264
|
+
text.gsub!(/\<\!\-\-(.*?)\-\-\>/m) do |content|
|
265
|
+
num_of_newlines = content.count("\n")
|
266
|
+
if num_of_newlines == 0
|
267
|
+
""
|
268
|
+
else
|
269
|
+
"\n" * num_of_newlines
|
270
|
+
end
|
271
|
+
end
|
272
|
+
article = Article.new(text, title, @strip_tmarker)
|
273
|
+
page_text[:data] = block.call(article)
|
274
|
+
end
|
275
|
+
page_text
|
276
|
+
end
|
277
|
+
pages.clear
|
278
|
+
pages_text = pages_text.sort_by{|v| v[:order]}.map{|v| v[:data]}.compact
|
279
|
+
pages_text.each do |page_text|
|
280
|
+
output_text << page_text
|
281
|
+
@count ||= 0; @count += 1;
|
282
|
+
@total_size = output_text.bytesize
|
283
|
+
# flagged when data exceeds the size of output file
|
284
|
+
end_flag = true if @total_size > (@tfile_size * 1024 * 1024)
|
285
|
+
end
|
278
286
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
287
|
+
#close the present file, then open a new one
|
288
|
+
if end_flag
|
289
|
+
cleanup!(output_text)
|
290
|
+
@fp.puts(output_text)
|
291
|
+
output_text = ""
|
292
|
+
@total_size = 0
|
293
|
+
end_flag = false
|
294
|
+
@fp.close
|
295
|
+
@file_index += 1
|
296
|
+
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
297
|
+
@outfiles << outfilename
|
298
|
+
@fp = File.open(outfilename, "w")
|
299
|
+
next
|
288
300
|
end
|
289
301
|
end
|
290
|
-
|
291
|
-
@count ||= 0;@count += 1;
|
292
|
-
|
293
|
-
article = Article.new(text, title, @strip_tmarker)
|
294
|
-
output_text += block.call(article)
|
295
|
-
@total_size = output_text.bytesize
|
302
|
+
end while !data_empty
|
296
303
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
#close the present file, then open a new one
|
301
|
-
if end_flag
|
302
|
-
@fp.puts(output_text)
|
303
|
-
output_text = ""
|
304
|
-
@total_size = 0
|
305
|
-
end_flag = false
|
306
|
-
@fp.close
|
307
|
-
@file_index += 1
|
308
|
-
outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
|
309
|
-
@outfiles << outfilename
|
310
|
-
@fp = File.open(outfilename, "w")
|
311
|
-
next
|
312
|
-
end
|
304
|
+
if output_text != ""
|
305
|
+
cleanup!(output_text)
|
306
|
+
@fp.puts(output_text)
|
313
307
|
end
|
314
|
-
@fp.puts(output_text) if output_text != ""
|
315
308
|
notify_parent(true)
|
316
309
|
@parent.after
|
317
310
|
@fp.close
|
@@ -351,4 +344,5 @@ module Wp2txt
|
|
351
344
|
@parent.msg("Processing finished", 1)
|
352
345
|
end
|
353
346
|
end
|
354
|
-
end
|
347
|
+
end
|
348
|
+
|
data/spec/utils_spec.rb
CHANGED
@@ -182,22 +182,34 @@ describe "Wp2txt" do
|
|
182
182
|
end
|
183
183
|
end
|
184
184
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
185
|
+
describe "correct_inline_template!" do
|
186
|
+
it "removes brackets and leaving some text" do
|
187
|
+
# str_before = "{{}}"
|
188
|
+
# str_after = ""
|
189
|
+
# correct_inline_template!(str_before)
|
190
|
+
# expect(str_before).to eq str_after
|
191
|
+
str_before = "{{MedalCountry | {{JPN}} }}"
|
192
|
+
str_after = "JPN"
|
193
|
+
correct_inline_template!(str_before)
|
194
|
+
expect(str_before).to eq str_after
|
195
|
+
str_before = "{{lang|en|Japan}}"
|
196
|
+
str_after = "Japan"
|
197
|
+
correct_inline_template!(str_before)
|
198
|
+
expect(str_before).to eq str_after
|
199
|
+
str_before = "{{a|b=c|d=f}}"
|
200
|
+
str_after = "a"
|
201
|
+
correct_inline_template!(str_before)
|
202
|
+
expect(str_before).to eq str_after
|
203
|
+
str_before = "{{a|b|{{c|d|e}}}}"
|
204
|
+
str_after = "e"
|
205
|
+
correct_inline_template!(str_before)
|
206
|
+
expect(str_before).to eq str_after
|
207
|
+
str_before = "{{要出典範囲|日本人に多く見受けられる|date=2013年8月|title=日本人特有なのか、本当に多いのかを示す必要がある}}"
|
208
|
+
str_after = "日本人に多く見受けられる"
|
209
|
+
correct_inline_template!(str_before)
|
210
|
+
expect(str_before).to eq str_after
|
211
|
+
end
|
212
|
+
end
|
201
213
|
|
202
214
|
# describe "expand_template" do
|
203
215
|
# it "gets data corresponding to a given template using mediawiki api" do
|
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.8
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: parallel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: htmlentities
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,7 +53,7 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: optimist
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
@@ -69,8 +83,10 @@ files:
|
|
69
83
|
- Rakefile
|
70
84
|
- bin/benchmark.rb
|
71
85
|
- bin/wp2txt
|
72
|
-
- data/testdata.bz2
|
73
|
-
- error_log.txt
|
86
|
+
- data/output_samples/testdata_en.txt
|
87
|
+
- data/output_samples/testdata_ja.txt
|
88
|
+
- data/testdata_en.bz2
|
89
|
+
- data/testdata_ja.bz2
|
74
90
|
- lib/wp2txt.rb
|
75
91
|
- lib/wp2txt/article.rb
|
76
92
|
- lib/wp2txt/mw_api.rb
|
@@ -83,7 +99,7 @@ files:
|
|
83
99
|
homepage: http://github.com/yohasebe/wp2txt
|
84
100
|
licenses: []
|
85
101
|
metadata: {}
|
86
|
-
post_install_message:
|
102
|
+
post_install_message:
|
87
103
|
rdoc_options: []
|
88
104
|
require_paths:
|
89
105
|
- lib
|
@@ -98,9 +114,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
114
|
- !ruby/object:Gem::Version
|
99
115
|
version: '0'
|
100
116
|
requirements: []
|
101
|
-
|
102
|
-
|
103
|
-
signing_key:
|
117
|
+
rubygems_version: 3.3.3
|
118
|
+
signing_key:
|
104
119
|
specification_version: 4
|
105
120
|
summary: Wikipedia dump to text converter
|
106
121
|
test_files:
|
data/error_log.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
[[アンパサンド]]
|