wp2txt 0.6.1 → 0.7.0

@@ -3,52 +3,127 @@

  require 'strscan'
  require 'find'
- require 'sanitize'
+
+ ###################################################
+ # global variables to save resource for generating regexps
+ # those with a trailing number 1 represent opening tag/markup
+ # those with a trailing number 2 represent closing tag/markup
+ # those without a trailing number contain both opening/closing tags/markups
+
+ $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
+ $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
+
+ $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
+ $in_inputbox_regex1 = Regexp.new('<inputbox>')
+ $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
+
+ $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
+ $in_source_regex1 = Regexp.new('<source.*?>')
+ $in_source_regex2 = Regexp.new('<\/source>')
+
+ $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
+ $in_math_regex1 = Regexp.new('<math.*?>')
+ $in_math_regex2 = Regexp.new('<\/math>')
+
+ $in_heading_regex = Regexp.new('^=+.*?=+$')
+
+ $in_html_table_regex = Regexp.new('<table.*?><\/table>')
+ $in_html_table_regex1 = Regexp.new('<table\b')
+ $in_html_table_regex2 = Regexp.new('<\/\s*table>')
+
+ $in_table_regex1 = Regexp.new('^\s*\{\|')
+ $in_table_regex2 = Regexp.new('^\|\}.*?$')
+
+ $in_unordered_regex = Regexp.new('^\*')
+ $in_ordered_regex = Regexp.new('^\#')
+ $in_pre_regex = Regexp.new('^ ')
+ $in_definition_regex = Regexp.new('^[\;\:]')
+
+ $blank_line_regex = Regexp.new('^\s*$')
+
+ $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
+
+ $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
+ $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
+ $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
+ $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
+ $make_reference_regex_a = Regexp.new('<br ?\/>')
+ $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
+ $make_reference_regex_c = Regexp.new('<ref[^>]*>')
+ $make_reference_regex_d = Regexp.new('<\/ref>')
+ $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
+ $heading_onset_regex = Regexp.new('^(\=+)\s+')
+ $heading_coda_regex = Regexp.new('\s+(\=+)$')
+ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
+ $pre_marks_regex = Regexp.new('\A\^\ ')
+ $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
+ $onset_bar_regex = Regexp.new('\A[^\|]+\z')
+ $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
+ $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
+
+ $category_patterns = ["Category", "Categoria"].join("|")
+ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
+
+ $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
+ $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
+
+ $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
+ $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
+
+ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
+ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
+ $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
+ $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
+
+ ###################################################

  module Wp2txt

- def format_wiki(original_text, has_retried = false)
+ def format_wiki!(text, has_retried = false)
  begin
- text = original_text + ""
+ text << ""

- text = chrref_to_utf(text)
- text = escape_nowiki(text)
-
- text = process_interwiki_links(text)
- text = process_external_links(text)
+ chrref_to_utf!(text)
+ escape_nowiki!(text)

- text = remove_directive(text)
- text = remove_emphasis(text)
+ process_interwiki_links!(text)
+ process_external_links!(text)

- text = mndash(text)
- text = make_reference(text)
- text = format_ref(text)
- text = remove_hr(text)
- text = remove_tag(text)
- text = special_chr(text)
-
- unescape_nowiki(text)
+ unescape_nowiki!(text)
+
  rescue # detect invalid byte sequence in UTF-8
  if has_retried
  puts "invalid byte sequence detected"
  puts "******************************"
  File.open("error_log.txt", "w") do |f|
- f.write original_text
+ f.write text
  end
  exit
  else
- fixed_text = original_text.encode("UTF-16").encode("UTF-8")
- return format_wiki(fixed_text, true)
+ text.encode!("UTF-16")
+ text.encode!("UTF-8")
+ format_wiki!(text, true)
  end
  end
  end

  #################### parser for nested structure ####################

- def process_nested_structure(scanner, left, right, &block)
+ def process_nested_structure(scanner, left, right, recur_count, &block)
  buffer = ""
  begin
- while str = scanner.scan_until(/(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m)
+ if left == "[" && right == "]"
+ regex = $single_square_bracket_regex
+ elsif left == "[[" && right == "]]"
+ regex = $double_square_bracket_regex
+ elsif left == "{" && right == "}"
+ regex = $single_curly_bracket_regex
+ elsif left == "{{" && right == "}}"
+ regex = $double_curly_bracket_regex
+ else
+ regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
+ end
+ while str = scanner.scan_until(regex)
  case scanner[1]
  when left
  buffer << str
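The hunk above replaces per-call regexp construction with precompiled globals (a trailing 1 marks the opening form of a markup, a trailing 2 the closing form, no suffix both) and turns `format_wiki` into the destructive `format_wiki!`, which edits its argument in place instead of returning a new string. A minimal usage sketch under those assumptions; the sample text, the `$limit_recur` value, and the top-level `include Wp2txt` mirror the spec file further down rather than anything prescribed by this diff:

```ruby
# Hypothetical driver for the in-place API shown above (not part of the gem).
require 'wp2txt'
require 'wp2txt/utils'

include Wp2txt

$limit_recur = 3  # recursion cap passed through to process_nested_structure

text = "&#x266A; a [[link|label]] and [http://example.com ext label]".dup
format_wiki!(text)  # mutates text in place; the return value is not used
puts text
```

As with `String#gsub!`, several of the bang helpers return `nil` when nothing was replaced, so callers read the mutated argument rather than the return value.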
@@ -66,38 +141,35 @@ module Wp2txt
  end
  buffer << scanner.rest

- if buffer == scanner.string
- return scanner.string
+ recur_count = recur_count - 1
+ if recur_count < 0 || buffer == scanner.string
+ return buffer
  else
  scanner.string = buffer
- return process_nested_structure(scanner, left, right, &block) || ""
+ return process_nested_structure(scanner, left, right, recur_count, &block) || ""
  end
  rescue => e
  return scanner.string
  end
  end

- def remove_templates(str, only_not_inline = true)
+ #################### methods used from format_wiki ####################
+
+ def remove_templates!(str)
  scanner = StringScanner.new(str)
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
- if contents.index("\n")
- "\n"
- else
- "[tpl]#{contents}[/tpl]"
- end
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
+ ""
  end
+ str.replace(result)
  end
-
-
- #################### methods used from format_wiki ####################

- def escape_nowiki(str)
+ def escape_nowiki!(str)
  if @nowikis
  @nowikis.clear
  else
  @nowikis = {}
  end
- str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
+ str.gsub!($escape_nowiki_regex) do
  nowiki = $1
  nowiki_id = nowiki.object_id
  @nowikis[nowiki_id] = nowiki
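Two behavioral changes sit in this hunk: `process_nested_structure` now takes a `recur_count` and stops after that many passes (or at a fixpoint), and `remove_templates!` drops template markup outright instead of wrapping it in `[tpl]...[/tpl]`. A short sketch, with `$limit_recur` and the sample string chosen for illustration only:

```ruby
# Hedged sketch of the reworked remove_templates! (not from the diff itself).
require 'wp2txt/utils'

include Wp2txt

$limit_recur = 3  # number of nesting levels to unwrap before giving up

str = "keep {{Infobox|name={{PAGENAME}}}} this".dup
remove_templates!(str)    # each pass strips one level of {{...}}
puts str.squeeze(" ")     # expected: "keep this"
```

The cap guards against pathological nesting; anything nested deeper than the limit is returned as the partially processed buffer rather than looping forever.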
@@ -105,17 +177,16 @@ module Wp2txt
  end
  end

- def unescape_nowiki(str)
- str.gsub(/<nowiki\-(\d+?)>/) do
+ def unescape_nowiki!(str)
+ str.gsub!($unescape_nowiki_regex) do
  obj_id = $1.to_i
  @nowikis[obj_id]
  end
  end

- def process_interwiki_links(str)
+ def process_interwiki_links!(str)
  scanner = StringScanner.new(str)
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
- str_new = ""
+ result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
  parts = contents.split("|")
  case parts.size
  when 1
@@ -125,12 +196,12 @@ module Wp2txt
  parts.join("|")
  end
  end
- result
+ str.replace(result)
  end

- def process_external_links(str)
+ def process_external_links!(str)
  scanner = StringScanner.new(str)
- result = process_nested_structure(scanner, "[", "]") do |contents|
+ result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
  parts = contents.split(" ", 2)
  case parts.size
  when 1
@@ -139,11 +210,11 @@ module Wp2txt
  parts.last || ""
  end
  end
- result
+ str.replace(result)
  end

- def special_chr(str)
- unless @sp_hash
+ def special_chr!(str)
+ unless $sp_hash
  html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
  .zip([' ', '<', '>', '&', '"'])

@@ -201,40 +272,30 @@ module Wp2txt

  spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
  math_chr1 + math_chr2 + others
- @sp_hash = Hash[*spc_array.flatten]
- @sp_regex = Regexp.new("(" + @sp_hash.keys.join("|") + ")")
+ $sp_hash = Hash[*spc_array.flatten]
+ $sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
  end
  #str.gsub!("&amp;"){'&'}
- str.gsub!(@sp_regex) do
- @sp_hash[$1]
+ str.gsub!($sp_regex) do
+ $sp_hash[$1]
  end
- return str
  end

- def remove_tag(str, tagset = ['<', '>'])
- if tagset == ['<', '>']
- return remove_html_tag(str)
- end
+ def remove_tag!(str, tagset = ['<', '>'])
  tagsets = Regexp.quote(tagset.uniq.join(""))
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
- newstr = str.gsub(regex, "")
- # newstr = newstr.gsub(/<\!\-\-.*?\-\->/, "")
- return newstr
+ str.gsub!(regex, "")
  end

- def remove_html_tag(str)
- str = ::Sanitize.clean(str)
- end
-
- def remove_emphasis(str)
- str.gsub(/(''+)(.+?)\1/) do
+ def remove_emphasis!(str)
+ str.gsub!($remove_emphasis_regex) do
  $2
  end
  end

- def chrref_to_utf(num_str)
+ def chrref_to_utf!(num_str)
  begin
- utf_str = num_str.gsub(/&#(x?)([0-9a-fA-F]+);/) do
+ num_str.gsub!($chrref_to_utf_regex) do
  if $1 == 'x'
  ch = $2.to_i(16)
  else
@@ -246,36 +307,58 @@ module Wp2txt
  u.encode("UTF-8", "UTF-16")
  end
  rescue StandardError
- return num_str
+ return nil
  end
- return utf_str
+ return true
  end

- def remove_directive(str)
- remove_tag(str, ['__', '__'])
+ def remove_directive!(str)
+ remove_tag!(str, ['__', '__'])
  end

- def mndash(str)
- str = str.gsub(/\{(mdash|ndash|–)\}/, "–")
+ def mndash!(str)
+ str.gsub!($mndash_regex, "–")
  end

- def remove_hr(page)
- page = page.gsub(/^\s*\-+\s*$/, "")
+ def remove_hr!(page)
+ page.gsub!($remove_hr_regex, "")
  end

- def make_reference(str)
- new_str = str.dup
- new_str.gsub!(/<br ?\/>/, "\n")
- new_str.gsub!(/<ref[^>]*\/>/, "")
- new_str.gsub!(/<ref[^>]*>/, "[ref]")
- new_str.gsub!(/<\/ref>/, "[/ref]")
- return new_str
+ def make_reference!(str)
+ str.gsub!($make_reference_regex_a, "\n")
+ str.gsub!($make_reference_regex_b, "")
+ str.gsub!($make_reference_regex_c, "[ref]")
+ str.gsub!($make_reference_regex_d, "[/ref]")
+ end
+
+ def format_ref!(page)
+ ###### do nothing for now
+ # page.gsub!($format_ref_regex) do
+ # end
  end

- def format_ref(page)
- page = page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
- ref = $1.dup
- ref.gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
+ def correct_inline_template!(str)
+ str.gsub!($remove_inline_regex) do
+ key = $1
+ if $onset_bar_regex =~ key
+ result = key
+ elsif
+ info = key.split("|")
+ type_code = info.first
+ case type_code
+ when $type_code_regex
+ out = info[-1]
+ else
+ if $leave_template
+ out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
+ else
+ out = ""
+ end
+ end
+ out
+ else
+ ""
+ end
  end
  end
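`correct_inline_template!` takes over from the `remove_inline_template` that is deleted further down: a template without a `|` keeps its key, one whose first field matches `$type_code_regex` (lang, IPA, audio, Nihongo, ...) is reduced to its last field, and anything else is kept as `{...}` or dropped depending on `$leave_template`. A sketch of the expected behaviour; the sample strings and the `$leave_template` setting are illustrative, not taken from the diff:

```ruby
# Hedged example of correct_inline_template! (assumed behaviour).
require 'wp2txt/utils'

include Wp2txt

$leave_template = false  # drop unrecognized inline templates entirely

a = "a {{lang|fr|bonjour}} b".dup   # type code matches $type_code_regex
b = "a {{citation needed}} b".dup   # no "|", so the key itself is kept
c = "a {{cite web|url=x}} b".dup    # unrecognized type code

[a, b, c].each { |s| correct_inline_template!(s) }

puts a  # expected: "a bonjour b"
puts b  # expected: "a citation needed b"
puts c  # expected: "a  b" (template dropped)
```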
@@ -283,7 +366,7 @@ module Wp2txt

  def process_template(str)
  scanner = StringScanner.new(str)
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
  parts = contents.split("|")
  case parts.size
  when 0
@@ -302,7 +385,7 @@ module Wp2txt
  end

  def remove_table(str)
- new_str = str.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
+ new_str = str.gsub($remove_table_regex, "")
  if str != new_str
  new_str = remove_table(new_str)
  end
@@ -311,32 +394,11 @@ module Wp2txt
  end

  def remove_clade(page)
- new_page = page.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
+ new_page = page.gsub($remove_clade_regex, "")
  new_page = remove_clade(new_page) unless page == new_page
  new_page
  end

- def remove_inline_template(str)
- str.gsub(/\{\{(.*?)\}\}/) do
- key = $1
- if /\A[^\|]+\z/ =~ key
- result = key
- else
- info = key.split("|")
- type_code = info.first
- case type_code
- when /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
- /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
- /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
- out = info[-1]
- else
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
- end
- result = out
- end
- end
- end
-
  #################### file related utilities ####################

  # collect filenames recursively
@@ -1,3 +1,3 @@
  module Wp2txt
- VERSION = "0.6.1"
+ VERSION = "0.7.0"
  end
@@ -6,6 +6,8 @@ require 'wp2txt'
  require 'wp2txt/article'
  require 'wp2txt/utils'

+ $limit_recur = 3
+
  describe "Wp2txt" do
  it "contains mediawiki-format related functions:" do
  end
@@ -20,7 +22,7 @@ describe "Wp2txt" do
  str_before = "[[ab[[cde[[alfa]]]]fg]]"
  str_after = "<<ab<<cde<<alfa>>>>fg>>"
  scanner = StringScanner.new(str_before)
- str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
+ str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
  "<<" + content + ">>"
  end
  expect(str_processed).to eq str_after
@@ -30,7 +32,7 @@ describe "Wp2txt" do
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
 |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
  scanner = StringScanner.new(str_before)
- str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
+ str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
  "<<" + content + ">>"
  end
  #str_processed.should == str_after
@@ -39,43 +41,39 @@ describe "Wp2txt" do
  end
  end

- describe "special_chr" do
+ describe "special_chr!" do
  it "replaces character references with real characters" do
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
  str_after = " < > & \""
- expect(special_chr(str_before)).to eq str_after
+ special_chr!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "chrref_to_utf" do
+ describe "chrref_to_utf!" do
  it "replaces character references with real characters" do
  str_before = "&#x266A;"
  str_after = "♪"
- expect(chrref_to_utf(str_before)).to eq str_after
+ chrref_to_utf!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "mndash" do
+ describe "mndash!" do
  it "replaces {mdash}, {ndash}, or {–} with '–'" do
  str_before = "{mdash} {ndash} {–}"
  str_after = "– – –"
- expect(mndash(str_before)).to eq str_after
+ mndash!(str_before)
+ expect(str_before).to eq str_after
  end
  end
-
- describe "format_ref" do
- it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
- str_before = "[ref]...\r\n...<br />...[/ref]"
- str_after = "... ... ..."
- expect(format_ref(str_before)).to eq str_after
- end
- end

  describe "make_reference" do
  it "replaces <ref> tag with [ref]" do
- str_before = "<ref> ... <br /> ... </ref> \n <ref />"
- str_after = "[ref] ... \n ... [/ref] \n "
- expect(make_reference(str_before)).to eq str_after
+ str_before = "<ref> ... </ref>"
+ str_after = "[ref] ... [/ref]"
+ make_reference!(str_before)
+ expect(str_before).to eq str_after
  end
  end

@@ -95,72 +93,93 @@ describe "Wp2txt" do
  end
  end

- describe "remove_hr" do
+ describe "remove_hr!" do
  it "removes horizontal lines" do
  str_before = "\n----\n--\n--\n"
  str_after = "\n\n"
- expect(remove_hr(str_before)).to eq str_after
+ remove_hr!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "remove_tag" do
+ describe "remove_tag!" do
  it "removes tags" do
  str_before = "<tag>abc</tag>"
  str_after = "abc"
- expect(remove_tag(str_before)).to eq str_after
+ remove_tag!(str_before)
+ expect(str_before).to eq str_after
  str_before = "[tag]def[/tag]"
  str_after = "def"
- expect(remove_tag(str_before, ['[', ']'])).to eq str_after
+ remove_tag!(str_before, ['[', ']'])
+ expect(str_before).to eq str_after
  end
  end

- describe "remove_directive" do
+ describe "remove_directive!" do
  it "removes directive" do
  str_before = "__abc__\n __def__"
  str_after = "\n "
- expect(remove_directive(str_before)).to eq str_after
+ remove_directive!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "remove_emphasis" do
+ describe "remove_emphasis!" do
  it "removes directive" do
  str_before = "''abc''\n'''def'''"
  str_after = "abc\ndef"
- expect(remove_emphasis(str_before)).to eq str_after
+ remove_emphasis!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "escape_nowiki" do
+ describe "escape_nowiki!" do
  it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
  str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
  str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
- expect(escape_nowiki(str_before)).to match str_after
+ escape_nowiki!(str_before)
+ expect(str_before).to match str_after
  end
  end

- describe "unescape_nowiki" do
+ describe "unescape_nowiki!" do
  it "replaces <nowiki-object_id> with string stored elsewhere" do
  @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
  str_before = "<nowiki-123>def<nowiki-124>"
  str_after = "[[abc]]def[[ghi]]"
- expect(unescape_nowiki(str_before)).to eq str_after
+ unescape_nowiki!(str_before)
+ expect(str_before).to eq str_after
  end
  end

- describe "process_interwiki_links" do
+ describe "process_interwiki_links!" do
  it "formats text link and remove brackets" do
- expect(process_interwiki_links("[[a b]]")).to eq "a b"
- expect(process_interwiki_links("[[a b|c]]")).to eq "c"
- expect(process_interwiki_links("[[a|b|c]]")).to eq "b|c"
- expect(process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]")).to eq "[ɲ], /J/"
+ a = "[[a b]]"
+ b = "[[a b|c]]"
+ c = "[[a|b|c]]"
+ d = "[[硬口蓋鼻音|[ɲ], /J/]]"
+ process_interwiki_links!(a)
+ process_interwiki_links!(b)
+ process_interwiki_links!(c)
+ process_interwiki_links!(d)
+ expect(a).to eq "a b"
+ expect(b).to eq "c"
+ expect(c).to eq "b|c"
+ expect(d).to eq "[ɲ], /J/"
  end
  end

- describe "process_external_links" do
+ describe "process_external_links!" do
  it "formats text link and remove brackets" do
- expect(process_external_links("[http://yohasebe.com yohasebe.com]")).to eq "yohasebe.com"
- expect(process_external_links("[http://yohasebe.com]")).to eq "http://yohasebe.com"
- expect(process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}")).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
+ a = "[http://yohasebe.com yohasebe.com]"
+ b = "[http://yohasebe.com]"
+ c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
+ process_external_links!(a)
+ process_external_links!(b)
+ process_external_links!(c)
+ expect(a).to eq "yohasebe.com"
+ expect(b).to eq "http://yohasebe.com"
+ expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
  end
  end