wp2txt 0.6.1 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +12 -11
- data/bin/benchmark.rb +14 -10
- data/bin/wp2txt +48 -28
- data/lib/wp2txt.rb +46 -11
- data/lib/wp2txt/article.rb +49 -89
- data/lib/wp2txt/mw_api.rb +0 -0
- data/lib/wp2txt/utils.rb +174 -112
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +60 -41
- data/wp2txt.gemspec +3 -9
- metadata +3 -59
data/lib/wp2txt/mw_api.rb
CHANGED
File without changes
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -3,52 +3,127 @@
|
|
3
3
|
|
4
4
|
require 'strscan'
|
5
5
|
require 'find'
|
6
|
-
|
6
|
+
|
7
|
+
###################################################
|
8
|
+
# global variables to save resource for generating regexps
|
9
|
+
# those with a trailing number 1 represent opening tag/markup
|
10
|
+
# those with a trailing number 2 represent closing tag/markup
|
11
|
+
# those without a trailing number contain both opening/closing tags/markups
|
12
|
+
|
13
|
+
$in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
14
|
+
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
15
|
+
|
16
|
+
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
17
|
+
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
18
|
+
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
19
|
+
|
20
|
+
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
21
|
+
$in_source_regex1 = Regexp.new('<source.*?>')
|
22
|
+
$in_source_regex2 = Regexp.new('<\/source>')
|
23
|
+
|
24
|
+
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
25
|
+
$in_math_regex1 = Regexp.new('<math.*?>')
|
26
|
+
$in_math_regex2 = Regexp.new('<\/math>')
|
27
|
+
|
28
|
+
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
29
|
+
|
30
|
+
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
31
|
+
$in_html_table_regex1 = Regexp.new('<table\b')
|
32
|
+
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
33
|
+
|
34
|
+
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
35
|
+
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
36
|
+
|
37
|
+
$in_unordered_regex = Regexp.new('^\*')
|
38
|
+
$in_ordered_regex = Regexp.new('^\#')
|
39
|
+
$in_pre_regex = Regexp.new('^ ')
|
40
|
+
$in_definition_regex = Regexp.new('^[\;\:]')
|
41
|
+
|
42
|
+
$blank_line_regex = Regexp.new('^\s*$')
|
43
|
+
|
44
|
+
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
|
+
|
46
|
+
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
47
|
+
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
48
|
+
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
49
|
+
$remove_hr_regex = Regexp.new('^\s*\-+\s*$')
|
50
|
+
$make_reference_regex_a = Regexp.new('<br ?\/>')
|
51
|
+
$make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
|
52
|
+
$make_reference_regex_c = Regexp.new('<ref[^>]*>')
|
53
|
+
$make_reference_regex_d = Regexp.new('<\/ref>')
|
54
|
+
$format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
55
|
+
$heading_onset_regex = Regexp.new('^(\=+)\s+')
|
56
|
+
$heading_coda_regex = Regexp.new('\s+(\=+)$')
|
57
|
+
$list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
58
|
+
$pre_marks_regex = Regexp.new('\A\^\ ')
|
59
|
+
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
60
|
+
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
61
|
+
$remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
62
|
+
$remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
63
|
+
|
64
|
+
$category_patterns = ["Category", "Categoria"].join("|")
|
65
|
+
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
66
|
+
|
67
|
+
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
68
|
+
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
69
|
+
|
70
|
+
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
71
|
+
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
72
|
+
|
73
|
+
$single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
|
74
|
+
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
75
|
+
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
76
|
+
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
77
|
+
|
78
|
+
###################################################
|
7
79
|
|
8
80
|
module Wp2txt
|
9
81
|
|
10
|
-
def format_wiki(
|
82
|
+
def format_wiki!(text, has_retried = false)
|
11
83
|
begin
|
12
|
-
text
|
84
|
+
text << ""
|
13
85
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
text = process_interwiki_links(text)
|
18
|
-
text = process_external_links(text)
|
86
|
+
chrref_to_utf!(text)
|
87
|
+
escape_nowiki!(text)
|
19
88
|
|
20
|
-
|
21
|
-
|
89
|
+
process_interwiki_links!(text)
|
90
|
+
process_external_links!(text)
|
22
91
|
|
23
|
-
|
24
|
-
|
25
|
-
text = format_ref(text)
|
26
|
-
text = remove_hr(text)
|
27
|
-
text = remove_tag(text)
|
28
|
-
text = special_chr(text)
|
29
|
-
|
30
|
-
unescape_nowiki(text)
|
92
|
+
unescape_nowiki!(text)
|
93
|
+
|
31
94
|
rescue # detect invalid byte sequence in UTF-8
|
32
95
|
if has_retried
|
33
96
|
puts "invalid byte sequence detected"
|
34
97
|
puts "******************************"
|
35
98
|
File.open("error_log.txt", "w") do |f|
|
36
|
-
f.write
|
99
|
+
f.write text
|
37
100
|
end
|
38
101
|
exit
|
39
102
|
else
|
40
|
-
|
41
|
-
|
103
|
+
text.encode!("UTF-16")
|
104
|
+
text.encode!("UTF-8")
|
105
|
+
format_wiki!(text, true)
|
42
106
|
end
|
43
107
|
end
|
44
108
|
end
|
45
109
|
|
46
110
|
#################### parser for nested structure ####################
|
47
111
|
|
48
|
-
def process_nested_structure(scanner, left, right, &block)
|
112
|
+
def process_nested_structure(scanner, left, right, recur_count, &block)
|
49
113
|
buffer = ""
|
50
114
|
begin
|
51
|
-
|
115
|
+
if left == "[" && right == "]"
|
116
|
+
regex = $single_square_bracket_regex
|
117
|
+
elsif left == "[[" && right == "]]"
|
118
|
+
regex = $double_square_bracket_regex
|
119
|
+
elsif left == "{" && right == "}"
|
120
|
+
regex = $single_curly_bracket_regex
|
121
|
+
elsif left == "{{" && right == "}}"
|
122
|
+
regex = $double_curly_bracket_regex
|
123
|
+
else
|
124
|
+
regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
|
125
|
+
end
|
126
|
+
while str = scanner.scan_until(regex)
|
52
127
|
case scanner[1]
|
53
128
|
when left
|
54
129
|
buffer << str
|
@@ -66,38 +141,35 @@ module Wp2txt
|
|
66
141
|
end
|
67
142
|
buffer << scanner.rest
|
68
143
|
|
69
|
-
|
70
|
-
|
144
|
+
recur_count = recur_count - 1
|
145
|
+
if recur_count < 0 || buffer == scanner.string
|
146
|
+
return buffer
|
71
147
|
else
|
72
148
|
scanner.string = buffer
|
73
|
-
return process_nested_structure(scanner, left, right, &block) || ""
|
149
|
+
return process_nested_structure(scanner, left, right, recur_count, &block) || ""
|
74
150
|
end
|
75
151
|
rescue => e
|
76
152
|
return scanner.string
|
77
153
|
end
|
78
154
|
end
|
79
155
|
|
80
|
-
|
156
|
+
#################### methods used from format_wiki ####################
|
157
|
+
|
158
|
+
def remove_templates!(str)
|
81
159
|
scanner = StringScanner.new(str)
|
82
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
83
|
-
|
84
|
-
"\n"
|
85
|
-
else
|
86
|
-
"[tpl]#{contents}[/tpl]"
|
87
|
-
end
|
160
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
161
|
+
""
|
88
162
|
end
|
163
|
+
str.replace(result)
|
89
164
|
end
|
90
|
-
|
91
|
-
|
92
|
-
#################### methods used from format_wiki ####################
|
93
165
|
|
94
|
-
def escape_nowiki(str)
|
166
|
+
def escape_nowiki!(str)
|
95
167
|
if @nowikis
|
96
168
|
@nowikis.clear
|
97
169
|
else
|
98
170
|
@nowikis = {}
|
99
171
|
end
|
100
|
-
str.gsub(
|
172
|
+
str.gsub!($escape_nowiki_regex) do
|
101
173
|
nowiki = $1
|
102
174
|
nowiki_id = nowiki.object_id
|
103
175
|
@nowikis[nowiki_id] = nowiki
|
@@ -105,17 +177,16 @@ module Wp2txt
|
|
105
177
|
end
|
106
178
|
end
|
107
179
|
|
108
|
-
def unescape_nowiki(str)
|
109
|
-
str.gsub(
|
180
|
+
def unescape_nowiki!(str)
|
181
|
+
str.gsub!($unescape_nowiki_regex) do
|
110
182
|
obj_id = $1.to_i
|
111
183
|
@nowikis[obj_id]
|
112
184
|
end
|
113
185
|
end
|
114
186
|
|
115
|
-
def process_interwiki_links(str)
|
187
|
+
def process_interwiki_links!(str)
|
116
188
|
scanner = StringScanner.new(str)
|
117
|
-
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
118
|
-
str_new = ""
|
189
|
+
result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
|
119
190
|
parts = contents.split("|")
|
120
191
|
case parts.size
|
121
192
|
when 1
|
@@ -125,12 +196,12 @@ module Wp2txt
|
|
125
196
|
parts.join("|")
|
126
197
|
end
|
127
198
|
end
|
128
|
-
result
|
199
|
+
str.replace(result)
|
129
200
|
end
|
130
201
|
|
131
|
-
def process_external_links(str)
|
202
|
+
def process_external_links!(str)
|
132
203
|
scanner = StringScanner.new(str)
|
133
|
-
result = process_nested_structure(scanner, "[", "]") do |contents|
|
204
|
+
result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
|
134
205
|
parts = contents.split(" ", 2)
|
135
206
|
case parts.size
|
136
207
|
when 1
|
@@ -139,11 +210,11 @@ module Wp2txt
|
|
139
210
|
parts.last || ""
|
140
211
|
end
|
141
212
|
end
|
142
|
-
result
|
213
|
+
str.replace(result)
|
143
214
|
end
|
144
215
|
|
145
|
-
def special_chr(str)
|
146
|
-
unless
|
216
|
+
def special_chr!(str)
|
217
|
+
unless $sp_hash
|
147
218
|
html = [' ', '<', '>', '&', '"']\
|
148
219
|
.zip([' ', '<', '>', '&', '"'])
|
149
220
|
|
@@ -201,40 +272,30 @@ module Wp2txt
|
|
201
272
|
|
202
273
|
spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
|
203
274
|
math_chr1 + math_chr2 + others
|
204
|
-
|
205
|
-
|
275
|
+
$sp_hash = Hash[*spc_array.flatten]
|
276
|
+
$sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
|
206
277
|
end
|
207
278
|
#str.gsub!("&"){'&'}
|
208
|
-
str.gsub!(
|
209
|
-
|
279
|
+
str.gsub!($sp_regex) do
|
280
|
+
$sp_hash[$1]
|
210
281
|
end
|
211
|
-
return str
|
212
282
|
end
|
213
283
|
|
214
|
-
def remove_tag(str, tagset = ['<', '>'])
|
215
|
-
if tagset == ['<', '>']
|
216
|
-
return remove_html_tag(str)
|
217
|
-
end
|
284
|
+
def remove_tag!(str, tagset = ['<', '>'])
|
218
285
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
219
286
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
220
|
-
|
221
|
-
# newstr = newstr.gsub(/<\!\-\-.*?\-\->/, "")
|
222
|
-
return newstr
|
287
|
+
str.gsub!(regex, "")
|
223
288
|
end
|
224
289
|
|
225
|
-
def
|
226
|
-
str
|
227
|
-
end
|
228
|
-
|
229
|
-
def remove_emphasis(str)
|
230
|
-
str.gsub(/(''+)(.+?)\1/) do
|
290
|
+
def remove_emphasis!(str)
|
291
|
+
str.gsub!($remove_emphasis_regex) do
|
231
292
|
$2
|
232
293
|
end
|
233
294
|
end
|
234
295
|
|
235
|
-
def chrref_to_utf(num_str)
|
296
|
+
def chrref_to_utf!(num_str)
|
236
297
|
begin
|
237
|
-
|
298
|
+
num_str.gsub!($chrref_to_utf_regex) do
|
238
299
|
if $1 == 'x'
|
239
300
|
ch = $2.to_i(16)
|
240
301
|
else
|
@@ -246,36 +307,58 @@ module Wp2txt
|
|
246
307
|
u.encode("UTF-8", "UTF-16")
|
247
308
|
end
|
248
309
|
rescue StandardError
|
249
|
-
return
|
310
|
+
return nil
|
250
311
|
end
|
251
|
-
return
|
312
|
+
return true
|
252
313
|
end
|
253
314
|
|
254
|
-
def remove_directive(str)
|
255
|
-
remove_tag(str, ['__', '__'])
|
315
|
+
def remove_directive!(str)
|
316
|
+
remove_tag!(str, ['__', '__'])
|
256
317
|
end
|
257
318
|
|
258
|
-
def mndash(str)
|
259
|
-
str
|
319
|
+
def mndash!(str)
|
320
|
+
str.gsub!($mndash_regex, "–")
|
260
321
|
end
|
261
322
|
|
262
|
-
def remove_hr(page)
|
263
|
-
page
|
323
|
+
def remove_hr!(page)
|
324
|
+
page.gsub!($remove_hr_regex, "")
|
264
325
|
end
|
265
326
|
|
266
|
-
def make_reference(str)
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
327
|
+
def make_reference!(str)
|
328
|
+
str.gsub!($make_reference_regex_a, "\n")
|
329
|
+
str.gsub!($make_reference_regex_b, "")
|
330
|
+
str.gsub!($make_reference_regex_c, "[ref]")
|
331
|
+
str.gsub!($make_reference_regex_d, "[/ref]")
|
332
|
+
end
|
333
|
+
|
334
|
+
def format_ref!(page)
|
335
|
+
###### do nothing for now
|
336
|
+
# page.gsub!($format_ref_regex) do
|
337
|
+
# end
|
273
338
|
end
|
274
339
|
|
275
|
-
def
|
276
|
-
|
277
|
-
|
278
|
-
|
340
|
+
def correct_inline_template!(str)
|
341
|
+
str.gsub!($remove_inline_regex) do
|
342
|
+
key = $1
|
343
|
+
if $onset_bar_regex =~ key
|
344
|
+
result = key
|
345
|
+
elsif
|
346
|
+
info = key.split("|")
|
347
|
+
type_code = info.first
|
348
|
+
case type_code
|
349
|
+
when $type_code_regex
|
350
|
+
out = info[-1]
|
351
|
+
else
|
352
|
+
if $leave_template
|
353
|
+
out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
|
354
|
+
else
|
355
|
+
out = ""
|
356
|
+
end
|
357
|
+
end
|
358
|
+
out
|
359
|
+
else
|
360
|
+
""
|
361
|
+
end
|
279
362
|
end
|
280
363
|
end
|
281
364
|
|
@@ -283,7 +366,7 @@ module Wp2txt
|
|
283
366
|
|
284
367
|
def process_template(str)
|
285
368
|
scanner = StringScanner.new(str)
|
286
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
369
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
287
370
|
parts = contents.split("|")
|
288
371
|
case parts.size
|
289
372
|
when 0
|
@@ -302,7 +385,7 @@ module Wp2txt
|
|
302
385
|
end
|
303
386
|
|
304
387
|
def remove_table(str)
|
305
|
-
new_str = str.gsub(
|
388
|
+
new_str = str.gsub($remove_table_regex, "")
|
306
389
|
if str != new_str
|
307
390
|
new_str = remove_table(new_str)
|
308
391
|
end
|
@@ -311,32 +394,11 @@ module Wp2txt
|
|
311
394
|
end
|
312
395
|
|
313
396
|
def remove_clade(page)
|
314
|
-
new_page = page.gsub(
|
397
|
+
new_page = page.gsub($remove_clade_regex, "")
|
315
398
|
new_page = remove_clade(new_page) unless page == new_page
|
316
399
|
new_page
|
317
400
|
end
|
318
401
|
|
319
|
-
def remove_inline_template(str)
|
320
|
-
str.gsub(/\{\{(.*?)\}\}/) do
|
321
|
-
key = $1
|
322
|
-
if /\A[^\|]+\z/ =~ key
|
323
|
-
result = key
|
324
|
-
else
|
325
|
-
info = key.split("|")
|
326
|
-
type_code = info.first
|
327
|
-
case type_code
|
328
|
-
when /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
|
329
|
-
/\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
|
330
|
-
/\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
|
331
|
-
out = info[-1]
|
332
|
-
else
|
333
|
-
out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
|
334
|
-
end
|
335
|
-
result = out
|
336
|
-
end
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
340
402
|
#################### file related utilities ####################
|
341
403
|
|
342
404
|
# collect filenames recursively
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -6,6 +6,8 @@ require 'wp2txt'
|
|
6
6
|
require 'wp2txt/article'
|
7
7
|
require 'wp2txt/utils'
|
8
8
|
|
9
|
+
$limit_recur = 3
|
10
|
+
|
9
11
|
describe "Wp2txt" do
|
10
12
|
it "contains mediawiki-format related functions:" do
|
11
13
|
end
|
@@ -20,7 +22,7 @@ describe "Wp2txt" do
|
|
20
22
|
str_before = "[[ab[[cde[[alfa]]]]fg]]"
|
21
23
|
str_after = "<<ab<<cde<<alfa>>>>fg>>"
|
22
24
|
scanner = StringScanner.new(str_before)
|
23
|
-
str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
|
25
|
+
str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
|
24
26
|
"<<" + content + ">>"
|
25
27
|
end
|
26
28
|
expect(str_processed).to eq str_after
|
@@ -30,7 +32,7 @@ describe "Wp2txt" do
|
|
30
32
|
str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
31
33
|
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
|
32
34
|
scanner = StringScanner.new(str_before)
|
33
|
-
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
35
|
+
str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
|
34
36
|
"<<" + content + ">>"
|
35
37
|
end
|
36
38
|
#str_processed.should == str_after
|
@@ -39,43 +41,39 @@ describe "Wp2txt" do
|
|
39
41
|
end
|
40
42
|
end
|
41
43
|
|
42
|
-
describe "special_chr" do
|
44
|
+
describe "special_chr!" do
|
43
45
|
it "replaces character references with real characters" do
|
44
46
|
str_before = " < > & ""
|
45
47
|
str_after = " < > & \""
|
46
|
-
|
48
|
+
special_chr!(str_before)
|
49
|
+
expect(str_before).to eq str_after
|
47
50
|
end
|
48
51
|
end
|
49
52
|
|
50
|
-
describe "chrref_to_utf" do
|
53
|
+
describe "chrref_to_utf!" do
|
51
54
|
it "replaces character references with real characters" do
|
52
55
|
str_before = "♪"
|
53
56
|
str_after = "♪"
|
54
|
-
|
57
|
+
chrref_to_utf!(str_before)
|
58
|
+
expect(str_before).to eq str_after
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
58
|
-
describe "mndash" do
|
62
|
+
describe "mndash!" do
|
59
63
|
it "replaces {mdash}, {ndash}, or {–} with '–'" do
|
60
64
|
str_before = "{mdash} {ndash} {–}"
|
61
65
|
str_after = "– – –"
|
62
|
-
|
66
|
+
mndash!(str_before)
|
67
|
+
expect(str_before).to eq str_after
|
63
68
|
end
|
64
69
|
end
|
65
|
-
|
66
|
-
describe "format_ref" do
|
67
|
-
it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
|
68
|
-
str_before = "[ref]...\r\n...<br />...[/ref]"
|
69
|
-
str_after = "... ... ..."
|
70
|
-
expect(format_ref(str_before)).to eq str_after
|
71
|
-
end
|
72
|
-
end
|
73
70
|
|
74
71
|
describe "make_reference" do
|
75
72
|
it "replaces <ref> tag with [ref]" do
|
76
|
-
str_before = "<ref> ...
|
77
|
-
str_after = "[ref] ...
|
78
|
-
|
73
|
+
str_before = "<ref> ... </ref>"
|
74
|
+
str_after = "[ref] ... [/ref]"
|
75
|
+
make_reference!(str_before)
|
76
|
+
expect(str_before).to eq str_after
|
79
77
|
end
|
80
78
|
end
|
81
79
|
|
@@ -95,72 +93,93 @@ describe "Wp2txt" do
|
|
95
93
|
end
|
96
94
|
end
|
97
95
|
|
98
|
-
describe "remove_hr" do
|
96
|
+
describe "remove_hr!" do
|
99
97
|
it "removes horizontal lines" do
|
100
98
|
str_before = "\n----\n--\n--\n"
|
101
99
|
str_after = "\n\n"
|
102
|
-
|
100
|
+
remove_hr!(str_before)
|
101
|
+
expect(str_before).to eq str_after
|
103
102
|
end
|
104
103
|
end
|
105
104
|
|
106
|
-
describe "remove_tag" do
|
105
|
+
describe "remove_tag!" do
|
107
106
|
it "removes tags" do
|
108
107
|
str_before = "<tag>abc</tag>"
|
109
108
|
str_after = "abc"
|
110
|
-
|
109
|
+
remove_tag!(str_before)
|
110
|
+
expect(str_before).to eq str_after
|
111
111
|
str_before = "[tag]def[/tag]"
|
112
112
|
str_after = "def"
|
113
|
-
|
113
|
+
remove_tag!(str_before, ['[', ']'])
|
114
|
+
expect(str_before).to eq str_after
|
114
115
|
end
|
115
116
|
end
|
116
117
|
|
117
|
-
describe "remove_directive" do
|
118
|
+
describe "remove_directive!" do
|
118
119
|
it "removes directive" do
|
119
120
|
str_before = "__abc__\n __def__"
|
120
121
|
str_after = "\n "
|
121
|
-
|
122
|
+
remove_directive!(str_before)
|
123
|
+
expect(str_before).to eq str_after
|
122
124
|
end
|
123
125
|
end
|
124
126
|
|
125
|
-
describe "remove_emphasis" do
|
127
|
+
describe "remove_emphasis!" do
|
126
128
|
it "removes directive" do
|
127
129
|
str_before = "''abc''\n'''def'''"
|
128
130
|
str_after = "abc\ndef"
|
129
|
-
|
131
|
+
remove_emphasis!(str_before)
|
132
|
+
expect(str_before).to eq str_after
|
130
133
|
end
|
131
134
|
end
|
132
135
|
|
133
|
-
describe "escape_nowiki" do
|
136
|
+
describe "escape_nowiki!" do
|
134
137
|
it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
|
135
138
|
str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
|
136
139
|
str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
|
137
|
-
|
140
|
+
escape_nowiki!(str_before)
|
141
|
+
expect(str_before).to match str_after
|
138
142
|
end
|
139
143
|
end
|
140
144
|
|
141
|
-
describe "unescape_nowiki" do
|
145
|
+
describe "unescape_nowiki!" do
|
142
146
|
it "replaces <nowiki-object_id> with string stored elsewhere" do
|
143
147
|
@nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
|
144
148
|
str_before = "<nowiki-123>def<nowiki-124>"
|
145
149
|
str_after = "[[abc]]def[[ghi]]"
|
146
|
-
|
150
|
+
unescape_nowiki!(str_before)
|
151
|
+
expect(str_before).to eq str_after
|
147
152
|
end
|
148
153
|
end
|
149
154
|
|
150
|
-
describe "process_interwiki_links" do
|
155
|
+
describe "process_interwiki_links!" do
|
151
156
|
it "formats text link and remove brackets" do
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
157
|
+
a = "[[a b]]"
|
158
|
+
b = "[[a b|c]]"
|
159
|
+
c = "[[a|b|c]]"
|
160
|
+
d = "[[硬口蓋鼻音|[ɲ], /J/]]"
|
161
|
+
process_interwiki_links!(a)
|
162
|
+
process_interwiki_links!(b)
|
163
|
+
process_interwiki_links!(c)
|
164
|
+
process_interwiki_links!(d)
|
165
|
+
expect(a).to eq "a b"
|
166
|
+
expect(b).to eq "c"
|
167
|
+
expect(c).to eq "b|c"
|
168
|
+
expect(d).to eq "[ɲ], /J/"
|
156
169
|
end
|
157
170
|
end
|
158
171
|
|
159
|
-
describe "process_external_links" do
|
172
|
+
describe "process_external_links!" do
|
160
173
|
it "formats text link and remove brackets" do
|
161
|
-
|
162
|
-
|
163
|
-
|
174
|
+
a = "[http://yohasebe.com yohasebe.com]"
|
175
|
+
b = "[http://yohasebe.com]"
|
176
|
+
c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
177
|
+
process_external_links!(a)
|
178
|
+
process_external_links!(b)
|
179
|
+
process_external_links!(c)
|
180
|
+
expect(a).to eq "yohasebe.com"
|
181
|
+
expect(b).to eq "http://yohasebe.com"
|
182
|
+
expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
164
183
|
end
|
165
184
|
end
|
166
185
|
|