wp2txt 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -11
- data/bin/benchmark.rb +14 -10
- data/bin/wp2txt +48 -28
- data/lib/wp2txt.rb +46 -11
- data/lib/wp2txt/article.rb +49 -89
- data/lib/wp2txt/mw_api.rb +0 -0
- data/lib/wp2txt/utils.rb +174 -112
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +60 -41
- data/wp2txt.gemspec +3 -9
- metadata +3 -59
data/lib/wp2txt/mw_api.rb
CHANGED
File without changes
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -3,52 +3,127 @@
|
|
3
3
|
|
4
4
|
require 'strscan'
|
5
5
|
require 'find'
|
6
|
-
|
6
|
+
|
7
|
+
###################################################
|
8
|
+
# global variables to save resource for generating regexps
|
9
|
+
# those with a trailing number 1 represent opening tag/markup
|
10
|
+
# those with a trailing number 2 represent closing tag/markup
|
11
|
+
# those without a trailing number contain both opening/closing tags/markups
|
12
|
+
|
13
|
+
$in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
14
|
+
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
15
|
+
|
16
|
+
$in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
|
17
|
+
$in_inputbox_regex1 = Regexp.new('<inputbox>')
|
18
|
+
$in_inputbox_regex2 = Regexp.new('<\/inputbox>')
|
19
|
+
|
20
|
+
$in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
|
21
|
+
$in_source_regex1 = Regexp.new('<source.*?>')
|
22
|
+
$in_source_regex2 = Regexp.new('<\/source>')
|
23
|
+
|
24
|
+
$in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
|
25
|
+
$in_math_regex1 = Regexp.new('<math.*?>')
|
26
|
+
$in_math_regex2 = Regexp.new('<\/math>')
|
27
|
+
|
28
|
+
$in_heading_regex = Regexp.new('^=+.*?=+$')
|
29
|
+
|
30
|
+
$in_html_table_regex = Regexp.new('<table.*?><\/table>')
|
31
|
+
$in_html_table_regex1 = Regexp.new('<table\b')
|
32
|
+
$in_html_table_regex2 = Regexp.new('<\/\s*table>')
|
33
|
+
|
34
|
+
$in_table_regex1 = Regexp.new('^\s*\{\|')
|
35
|
+
$in_table_regex2 = Regexp.new('^\|\}.*?$')
|
36
|
+
|
37
|
+
$in_unordered_regex = Regexp.new('^\*')
|
38
|
+
$in_ordered_regex = Regexp.new('^\#')
|
39
|
+
$in_pre_regex = Regexp.new('^ ')
|
40
|
+
$in_definition_regex = Regexp.new('^[\;\:]')
|
41
|
+
|
42
|
+
$blank_line_regex = Regexp.new('^\s*$')
|
43
|
+
|
44
|
+
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
|
+
|
46
|
+
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
47
|
+
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
48
|
+
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
49
|
+
$remove_hr_regex = Regexp.new('^\s*\-+\s*$')
|
50
|
+
$make_reference_regex_a = Regexp.new('<br ?\/>')
|
51
|
+
$make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
|
52
|
+
$make_reference_regex_c = Regexp.new('<ref[^>]*>')
|
53
|
+
$make_reference_regex_d = Regexp.new('<\/ref>')
|
54
|
+
$format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
|
55
|
+
$heading_onset_regex = Regexp.new('^(\=+)\s+')
|
56
|
+
$heading_coda_regex = Regexp.new('\s+(\=+)$')
|
57
|
+
$list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
58
|
+
$pre_marks_regex = Regexp.new('\A\^\ ')
|
59
|
+
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
60
|
+
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
61
|
+
$remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
62
|
+
$remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
63
|
+
|
64
|
+
$category_patterns = ["Category", "Categoria"].join("|")
|
65
|
+
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
66
|
+
|
67
|
+
$escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
|
68
|
+
$unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
|
69
|
+
|
70
|
+
$remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
|
71
|
+
$type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
|
72
|
+
|
73
|
+
$single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
|
74
|
+
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
75
|
+
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
76
|
+
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
77
|
+
|
78
|
+
###################################################
|
7
79
|
|
8
80
|
module Wp2txt
|
9
81
|
|
10
|
-
def format_wiki(
|
82
|
+
def format_wiki!(text, has_retried = false)
|
11
83
|
begin
|
12
|
-
text
|
84
|
+
text << ""
|
13
85
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
text = process_interwiki_links(text)
|
18
|
-
text = process_external_links(text)
|
86
|
+
chrref_to_utf!(text)
|
87
|
+
escape_nowiki!(text)
|
19
88
|
|
20
|
-
|
21
|
-
|
89
|
+
process_interwiki_links!(text)
|
90
|
+
process_external_links!(text)
|
22
91
|
|
23
|
-
|
24
|
-
|
25
|
-
text = format_ref(text)
|
26
|
-
text = remove_hr(text)
|
27
|
-
text = remove_tag(text)
|
28
|
-
text = special_chr(text)
|
29
|
-
|
30
|
-
unescape_nowiki(text)
|
92
|
+
unescape_nowiki!(text)
|
93
|
+
|
31
94
|
rescue # detect invalid byte sequence in UTF-8
|
32
95
|
if has_retried
|
33
96
|
puts "invalid byte sequence detected"
|
34
97
|
puts "******************************"
|
35
98
|
File.open("error_log.txt", "w") do |f|
|
36
|
-
f.write
|
99
|
+
f.write text
|
37
100
|
end
|
38
101
|
exit
|
39
102
|
else
|
40
|
-
|
41
|
-
|
103
|
+
text.encode!("UTF-16")
|
104
|
+
text.encode!("UTF-8")
|
105
|
+
format_wiki!(text, true)
|
42
106
|
end
|
43
107
|
end
|
44
108
|
end
|
45
109
|
|
46
110
|
#################### parser for nested structure ####################
|
47
111
|
|
48
|
-
def process_nested_structure(scanner, left, right, &block)
|
112
|
+
def process_nested_structure(scanner, left, right, recur_count, &block)
|
49
113
|
buffer = ""
|
50
114
|
begin
|
51
|
-
|
115
|
+
if left == "[" && right == "]"
|
116
|
+
regex = $single_square_bracket_regex
|
117
|
+
elsif left == "[[" && right == "]]"
|
118
|
+
regex = $double_square_bracket_regex
|
119
|
+
elsif left == "{" && right == "}"
|
120
|
+
regex = $single_curly_bracket_regex
|
121
|
+
elsif left == "{{" && right == "}}"
|
122
|
+
regex = $double_curly_bracket_regex
|
123
|
+
else
|
124
|
+
regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
|
125
|
+
end
|
126
|
+
while str = scanner.scan_until(regex)
|
52
127
|
case scanner[1]
|
53
128
|
when left
|
54
129
|
buffer << str
|
@@ -66,38 +141,35 @@ module Wp2txt
|
|
66
141
|
end
|
67
142
|
buffer << scanner.rest
|
68
143
|
|
69
|
-
|
70
|
-
|
144
|
+
recur_count = recur_count - 1
|
145
|
+
if recur_count < 0 || buffer == scanner.string
|
146
|
+
return buffer
|
71
147
|
else
|
72
148
|
scanner.string = buffer
|
73
|
-
return process_nested_structure(scanner, left, right, &block) || ""
|
149
|
+
return process_nested_structure(scanner, left, right, recur_count, &block) || ""
|
74
150
|
end
|
75
151
|
rescue => e
|
76
152
|
return scanner.string
|
77
153
|
end
|
78
154
|
end
|
79
155
|
|
80
|
-
|
156
|
+
#################### methods used from format_wiki ####################
|
157
|
+
|
158
|
+
def remove_templates!(str)
|
81
159
|
scanner = StringScanner.new(str)
|
82
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
83
|
-
|
84
|
-
"\n"
|
85
|
-
else
|
86
|
-
"[tpl]#{contents}[/tpl]"
|
87
|
-
end
|
160
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
161
|
+
""
|
88
162
|
end
|
163
|
+
str.replace(result)
|
89
164
|
end
|
90
|
-
|
91
|
-
|
92
|
-
#################### methods used from format_wiki ####################
|
93
165
|
|
94
|
-
def escape_nowiki(str)
|
166
|
+
def escape_nowiki!(str)
|
95
167
|
if @nowikis
|
96
168
|
@nowikis.clear
|
97
169
|
else
|
98
170
|
@nowikis = {}
|
99
171
|
end
|
100
|
-
str.gsub(
|
172
|
+
str.gsub!($escape_nowiki_regex) do
|
101
173
|
nowiki = $1
|
102
174
|
nowiki_id = nowiki.object_id
|
103
175
|
@nowikis[nowiki_id] = nowiki
|
@@ -105,17 +177,16 @@ module Wp2txt
|
|
105
177
|
end
|
106
178
|
end
|
107
179
|
|
108
|
-
def unescape_nowiki(str)
|
109
|
-
str.gsub(
|
180
|
+
def unescape_nowiki!(str)
|
181
|
+
str.gsub!($unescape_nowiki_regex) do
|
110
182
|
obj_id = $1.to_i
|
111
183
|
@nowikis[obj_id]
|
112
184
|
end
|
113
185
|
end
|
114
186
|
|
115
|
-
def process_interwiki_links(str)
|
187
|
+
def process_interwiki_links!(str)
|
116
188
|
scanner = StringScanner.new(str)
|
117
|
-
result = process_nested_structure(scanner, "[[", "]]") do |contents|
|
118
|
-
str_new = ""
|
189
|
+
result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
|
119
190
|
parts = contents.split("|")
|
120
191
|
case parts.size
|
121
192
|
when 1
|
@@ -125,12 +196,12 @@ module Wp2txt
|
|
125
196
|
parts.join("|")
|
126
197
|
end
|
127
198
|
end
|
128
|
-
result
|
199
|
+
str.replace(result)
|
129
200
|
end
|
130
201
|
|
131
|
-
def process_external_links(str)
|
202
|
+
def process_external_links!(str)
|
132
203
|
scanner = StringScanner.new(str)
|
133
|
-
result = process_nested_structure(scanner, "[", "]") do |contents|
|
204
|
+
result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
|
134
205
|
parts = contents.split(" ", 2)
|
135
206
|
case parts.size
|
136
207
|
when 1
|
@@ -139,11 +210,11 @@ module Wp2txt
|
|
139
210
|
parts.last || ""
|
140
211
|
end
|
141
212
|
end
|
142
|
-
result
|
213
|
+
str.replace(result)
|
143
214
|
end
|
144
215
|
|
145
|
-
def special_chr(str)
|
146
|
-
unless
|
216
|
+
def special_chr!(str)
|
217
|
+
unless $sp_hash
|
147
218
|
html = [' ', '<', '>', '&', '"']\
|
148
219
|
.zip([' ', '<', '>', '&', '"'])
|
149
220
|
|
@@ -201,40 +272,30 @@ module Wp2txt
|
|
201
272
|
|
202
273
|
spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
|
203
274
|
math_chr1 + math_chr2 + others
|
204
|
-
|
205
|
-
|
275
|
+
$sp_hash = Hash[*spc_array.flatten]
|
276
|
+
$sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
|
206
277
|
end
|
207
278
|
#str.gsub!("&"){'&'}
|
208
|
-
str.gsub!(
|
209
|
-
|
279
|
+
str.gsub!($sp_regex) do
|
280
|
+
$sp_hash[$1]
|
210
281
|
end
|
211
|
-
return str
|
212
282
|
end
|
213
283
|
|
214
|
-
def remove_tag(str, tagset = ['<', '>'])
|
215
|
-
if tagset == ['<', '>']
|
216
|
-
return remove_html_tag(str)
|
217
|
-
end
|
284
|
+
def remove_tag!(str, tagset = ['<', '>'])
|
218
285
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
219
286
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
220
|
-
|
221
|
-
# newstr = newstr.gsub(/<\!\-\-.*?\-\->/, "")
|
222
|
-
return newstr
|
287
|
+
str.gsub!(regex, "")
|
223
288
|
end
|
224
289
|
|
225
|
-
def
|
226
|
-
str
|
227
|
-
end
|
228
|
-
|
229
|
-
def remove_emphasis(str)
|
230
|
-
str.gsub(/(''+)(.+?)\1/) do
|
290
|
+
def remove_emphasis!(str)
|
291
|
+
str.gsub!($remove_emphasis_regex) do
|
231
292
|
$2
|
232
293
|
end
|
233
294
|
end
|
234
295
|
|
235
|
-
def chrref_to_utf(num_str)
|
296
|
+
def chrref_to_utf!(num_str)
|
236
297
|
begin
|
237
|
-
|
298
|
+
num_str.gsub!($chrref_to_utf_regex) do
|
238
299
|
if $1 == 'x'
|
239
300
|
ch = $2.to_i(16)
|
240
301
|
else
|
@@ -246,36 +307,58 @@ module Wp2txt
|
|
246
307
|
u.encode("UTF-8", "UTF-16")
|
247
308
|
end
|
248
309
|
rescue StandardError
|
249
|
-
return
|
310
|
+
return nil
|
250
311
|
end
|
251
|
-
return
|
312
|
+
return true
|
252
313
|
end
|
253
314
|
|
254
|
-
def remove_directive(str)
|
255
|
-
remove_tag(str, ['__', '__'])
|
315
|
+
def remove_directive!(str)
|
316
|
+
remove_tag!(str, ['__', '__'])
|
256
317
|
end
|
257
318
|
|
258
|
-
def mndash(str)
|
259
|
-
str
|
319
|
+
def mndash!(str)
|
320
|
+
str.gsub!($mndash_regex, "–")
|
260
321
|
end
|
261
322
|
|
262
|
-
def remove_hr(page)
|
263
|
-
page
|
323
|
+
def remove_hr!(page)
|
324
|
+
page.gsub!($remove_hr_regex, "")
|
264
325
|
end
|
265
326
|
|
266
|
-
def make_reference(str)
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
327
|
+
def make_reference!(str)
|
328
|
+
str.gsub!($make_reference_regex_a, "\n")
|
329
|
+
str.gsub!($make_reference_regex_b, "")
|
330
|
+
str.gsub!($make_reference_regex_c, "[ref]")
|
331
|
+
str.gsub!($make_reference_regex_d, "[/ref]")
|
332
|
+
end
|
333
|
+
|
334
|
+
def format_ref!(page)
|
335
|
+
###### do nothing for now
|
336
|
+
# page.gsub!($format_ref_regex) do
|
337
|
+
# end
|
273
338
|
end
|
274
339
|
|
275
|
-
def
|
276
|
-
|
277
|
-
|
278
|
-
|
340
|
+
def correct_inline_template!(str)
|
341
|
+
str.gsub!($remove_inline_regex) do
|
342
|
+
key = $1
|
343
|
+
if $onset_bar_regex =~ key
|
344
|
+
result = key
|
345
|
+
elsif
|
346
|
+
info = key.split("|")
|
347
|
+
type_code = info.first
|
348
|
+
case type_code
|
349
|
+
when $type_code_regex
|
350
|
+
out = info[-1]
|
351
|
+
else
|
352
|
+
if $leave_template
|
353
|
+
out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
|
354
|
+
else
|
355
|
+
out = ""
|
356
|
+
end
|
357
|
+
end
|
358
|
+
out
|
359
|
+
else
|
360
|
+
""
|
361
|
+
end
|
279
362
|
end
|
280
363
|
end
|
281
364
|
|
@@ -283,7 +366,7 @@ module Wp2txt
|
|
283
366
|
|
284
367
|
def process_template(str)
|
285
368
|
scanner = StringScanner.new(str)
|
286
|
-
result = process_nested_structure(scanner, "{{", "}}") do |contents|
|
369
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
287
370
|
parts = contents.split("|")
|
288
371
|
case parts.size
|
289
372
|
when 0
|
@@ -302,7 +385,7 @@ module Wp2txt
|
|
302
385
|
end
|
303
386
|
|
304
387
|
def remove_table(str)
|
305
|
-
new_str = str.gsub(
|
388
|
+
new_str = str.gsub($remove_table_regex, "")
|
306
389
|
if str != new_str
|
307
390
|
new_str = remove_table(new_str)
|
308
391
|
end
|
@@ -311,32 +394,11 @@ module Wp2txt
|
|
311
394
|
end
|
312
395
|
|
313
396
|
def remove_clade(page)
|
314
|
-
new_page = page.gsub(
|
397
|
+
new_page = page.gsub($remove_clade_regex, "")
|
315
398
|
new_page = remove_clade(new_page) unless page == new_page
|
316
399
|
new_page
|
317
400
|
end
|
318
401
|
|
319
|
-
def remove_inline_template(str)
|
320
|
-
str.gsub(/\{\{(.*?)\}\}/) do
|
321
|
-
key = $1
|
322
|
-
if /\A[^\|]+\z/ =~ key
|
323
|
-
result = key
|
324
|
-
else
|
325
|
-
info = key.split("|")
|
326
|
-
type_code = info.first
|
327
|
-
case type_code
|
328
|
-
when /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
|
329
|
-
/\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
|
330
|
-
/\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
|
331
|
-
out = info[-1]
|
332
|
-
else
|
333
|
-
out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
|
334
|
-
end
|
335
|
-
result = out
|
336
|
-
end
|
337
|
-
end
|
338
|
-
end
|
339
|
-
|
340
402
|
#################### file related utilities ####################
|
341
403
|
|
342
404
|
# collect filenames recursively
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -6,6 +6,8 @@ require 'wp2txt'
|
|
6
6
|
require 'wp2txt/article'
|
7
7
|
require 'wp2txt/utils'
|
8
8
|
|
9
|
+
$limit_recur = 3
|
10
|
+
|
9
11
|
describe "Wp2txt" do
|
10
12
|
it "contains mediawiki-format related functions:" do
|
11
13
|
end
|
@@ -20,7 +22,7 @@ describe "Wp2txt" do
|
|
20
22
|
str_before = "[[ab[[cde[[alfa]]]]fg]]"
|
21
23
|
str_after = "<<ab<<cde<<alfa>>>>fg>>"
|
22
24
|
scanner = StringScanner.new(str_before)
|
23
|
-
str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
|
25
|
+
str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
|
24
26
|
"<<" + content + ">>"
|
25
27
|
end
|
26
28
|
expect(str_processed).to eq str_after
|
@@ -30,7 +32,7 @@ describe "Wp2txt" do
|
|
30
32
|
str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|
31
33
|
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
|
32
34
|
scanner = StringScanner.new(str_before)
|
33
|
-
str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
|
35
|
+
str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
|
34
36
|
"<<" + content + ">>"
|
35
37
|
end
|
36
38
|
#str_processed.should == str_after
|
@@ -39,43 +41,39 @@ describe "Wp2txt" do
|
|
39
41
|
end
|
40
42
|
end
|
41
43
|
|
42
|
-
describe "special_chr" do
|
44
|
+
describe "special_chr!" do
|
43
45
|
it "replaces character references with real characters" do
|
44
46
|
str_before = " < > & ""
|
45
47
|
str_after = " < > & \""
|
46
|
-
|
48
|
+
special_chr!(str_before)
|
49
|
+
expect(str_before).to eq str_after
|
47
50
|
end
|
48
51
|
end
|
49
52
|
|
50
|
-
describe "chrref_to_utf" do
|
53
|
+
describe "chrref_to_utf!" do
|
51
54
|
it "replaces character references with real characters" do
|
52
55
|
str_before = "♪"
|
53
56
|
str_after = "♪"
|
54
|
-
|
57
|
+
chrref_to_utf!(str_before)
|
58
|
+
expect(str_before).to eq str_after
|
55
59
|
end
|
56
60
|
end
|
57
61
|
|
58
|
-
describe "mndash" do
|
62
|
+
describe "mndash!" do
|
59
63
|
it "replaces {mdash}, {ndash}, or {–} with '–'" do
|
60
64
|
str_before = "{mdash} {ndash} {–}"
|
61
65
|
str_after = "– – –"
|
62
|
-
|
66
|
+
mndash!(str_before)
|
67
|
+
expect(str_before).to eq str_after
|
63
68
|
end
|
64
69
|
end
|
65
|
-
|
66
|
-
describe "format_ref" do
|
67
|
-
it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
|
68
|
-
str_before = "[ref]...\r\n...<br />...[/ref]"
|
69
|
-
str_after = "... ... ..."
|
70
|
-
expect(format_ref(str_before)).to eq str_after
|
71
|
-
end
|
72
|
-
end
|
73
70
|
|
74
71
|
describe "make_reference" do
|
75
72
|
it "replaces <ref> tag with [ref]" do
|
76
|
-
str_before = "<ref> ...
|
77
|
-
str_after = "[ref] ...
|
78
|
-
|
73
|
+
str_before = "<ref> ... </ref>"
|
74
|
+
str_after = "[ref] ... [/ref]"
|
75
|
+
make_reference!(str_before)
|
76
|
+
expect(str_before).to eq str_after
|
79
77
|
end
|
80
78
|
end
|
81
79
|
|
@@ -95,72 +93,93 @@ describe "Wp2txt" do
|
|
95
93
|
end
|
96
94
|
end
|
97
95
|
|
98
|
-
describe "remove_hr" do
|
96
|
+
describe "remove_hr!" do
|
99
97
|
it "removes horizontal lines" do
|
100
98
|
str_before = "\n----\n--\n--\n"
|
101
99
|
str_after = "\n\n"
|
102
|
-
|
100
|
+
remove_hr!(str_before)
|
101
|
+
expect(str_before).to eq str_after
|
103
102
|
end
|
104
103
|
end
|
105
104
|
|
106
|
-
describe "remove_tag" do
|
105
|
+
describe "remove_tag!" do
|
107
106
|
it "removes tags" do
|
108
107
|
str_before = "<tag>abc</tag>"
|
109
108
|
str_after = "abc"
|
110
|
-
|
109
|
+
remove_tag!(str_before)
|
110
|
+
expect(str_before).to eq str_after
|
111
111
|
str_before = "[tag]def[/tag]"
|
112
112
|
str_after = "def"
|
113
|
-
|
113
|
+
remove_tag!(str_before, ['[', ']'])
|
114
|
+
expect(str_before).to eq str_after
|
114
115
|
end
|
115
116
|
end
|
116
117
|
|
117
|
-
describe "remove_directive" do
|
118
|
+
describe "remove_directive!" do
|
118
119
|
it "removes directive" do
|
119
120
|
str_before = "__abc__\n __def__"
|
120
121
|
str_after = "\n "
|
121
|
-
|
122
|
+
remove_directive!(str_before)
|
123
|
+
expect(str_before).to eq str_after
|
122
124
|
end
|
123
125
|
end
|
124
126
|
|
125
|
-
describe "remove_emphasis" do
|
127
|
+
describe "remove_emphasis!" do
|
126
128
|
it "removes directive" do
|
127
129
|
str_before = "''abc''\n'''def'''"
|
128
130
|
str_after = "abc\ndef"
|
129
|
-
|
131
|
+
remove_emphasis!(str_before)
|
132
|
+
expect(str_before).to eq str_after
|
130
133
|
end
|
131
134
|
end
|
132
135
|
|
133
|
-
describe "escape_nowiki" do
|
136
|
+
describe "escape_nowiki!" do
|
134
137
|
it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
|
135
138
|
str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
|
136
139
|
str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
|
137
|
-
|
140
|
+
escape_nowiki!(str_before)
|
141
|
+
expect(str_before).to match str_after
|
138
142
|
end
|
139
143
|
end
|
140
144
|
|
141
|
-
describe "unescape_nowiki" do
|
145
|
+
describe "unescape_nowiki!" do
|
142
146
|
it "replaces <nowiki-object_id> with string stored elsewhere" do
|
143
147
|
@nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
|
144
148
|
str_before = "<nowiki-123>def<nowiki-124>"
|
145
149
|
str_after = "[[abc]]def[[ghi]]"
|
146
|
-
|
150
|
+
unescape_nowiki!(str_before)
|
151
|
+
expect(str_before).to eq str_after
|
147
152
|
end
|
148
153
|
end
|
149
154
|
|
150
|
-
describe "process_interwiki_links" do
|
155
|
+
describe "process_interwiki_links!" do
|
151
156
|
it "formats text link and remove brackets" do
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
157
|
+
a = "[[a b]]"
|
158
|
+
b = "[[a b|c]]"
|
159
|
+
c = "[[a|b|c]]"
|
160
|
+
d = "[[硬口蓋鼻音|[ɲ], /J/]]"
|
161
|
+
process_interwiki_links!(a)
|
162
|
+
process_interwiki_links!(b)
|
163
|
+
process_interwiki_links!(c)
|
164
|
+
process_interwiki_links!(d)
|
165
|
+
expect(a).to eq "a b"
|
166
|
+
expect(b).to eq "c"
|
167
|
+
expect(c).to eq "b|c"
|
168
|
+
expect(d).to eq "[ɲ], /J/"
|
156
169
|
end
|
157
170
|
end
|
158
171
|
|
159
|
-
describe "process_external_links" do
|
172
|
+
describe "process_external_links!" do
|
160
173
|
it "formats text link and remove brackets" do
|
161
|
-
|
162
|
-
|
163
|
-
|
174
|
+
a = "[http://yohasebe.com yohasebe.com]"
|
175
|
+
b = "[http://yohasebe.com]"
|
176
|
+
c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
177
|
+
process_external_links!(a)
|
178
|
+
process_external_links!(b)
|
179
|
+
process_external_links!(c)
|
180
|
+
expect(a).to eq "yohasebe.com"
|
181
|
+
expect(b).to eq "http://yohasebe.com"
|
182
|
+
expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
|
164
183
|
end
|
165
184
|
end
|
166
185
|
|