wp2txt 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -3,52 +3,127 @@
3
3
 
4
4
  require 'strscan'
5
5
  require 'find'
6
- require 'sanitize'
6
+
7
+ ###################################################
8
+ # global variables to save resource for generating regexps
9
+ # those with a trailing number 1 represent opening tag/markup
10
+ # those with a trailing number 2 represent closing tag/markup
11
+ # those without a trailing number contain both opening/closing tags/markups
12
+
13
+ $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
14
+ $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
15
+
16
+ $in_inputbox_regex = Regexp.new('<inputbox>.*?<\/inputbox>')
17
+ $in_inputbox_regex1 = Regexp.new('<inputbox>')
18
+ $in_inputbox_regex2 = Regexp.new('<\/inputbox>')
19
+
20
+ $in_source_regex = Regexp.new('<source.*?>.*?<\/source>')
21
+ $in_source_regex1 = Regexp.new('<source.*?>')
22
+ $in_source_regex2 = Regexp.new('<\/source>')
23
+
24
+ $in_math_regex = Regexp.new('<math.*?>.*?<\/math>')
25
+ $in_math_regex1 = Regexp.new('<math.*?>')
26
+ $in_math_regex2 = Regexp.new('<\/math>')
27
+
28
+ $in_heading_regex = Regexp.new('^=+.*?=+$')
29
+
30
+ $in_html_table_regex = Regexp.new('<table.*?><\/table>')
31
+ $in_html_table_regex1 = Regexp.new('<table\b')
32
+ $in_html_table_regex2 = Regexp.new('<\/\s*table>')
33
+
34
+ $in_table_regex1 = Regexp.new('^\s*\{\|')
35
+ $in_table_regex2 = Regexp.new('^\|\}.*?$')
36
+
37
+ $in_unordered_regex = Regexp.new('^\*')
38
+ $in_ordered_regex = Regexp.new('^\#')
39
+ $in_pre_regex = Regexp.new('^ ')
40
+ $in_definition_regex = Regexp.new('^[\;\:]')
41
+
42
+ $blank_line_regex = Regexp.new('^\s*$')
43
+
44
+ $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
45
+
46
+ $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
47
+ $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
48
+ $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
49
+ $remove_hr_regex = Regexp.new('^\s*\-+\s*$')
50
+ $make_reference_regex_a = Regexp.new('<br ?\/>')
51
+ $make_reference_regex_b = Regexp.new('<ref[^>]*\/>')
52
+ $make_reference_regex_c = Regexp.new('<ref[^>]*>')
53
+ $make_reference_regex_d = Regexp.new('<\/ref>')
54
+ $format_ref_regex = Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
55
+ $heading_onset_regex = Regexp.new('^(\=+)\s+')
56
+ $heading_coda_regex = Regexp.new('\s+(\=+)$')
57
+ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
58
+ $pre_marks_regex = Regexp.new('\A\^\ ')
59
+ $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
60
+ $onset_bar_regex = Regexp.new('\A[^\|]+\z')
61
+ $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
62
+ $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
63
+
64
+ $category_patterns = ["Category", "Categoria"].join("|")
65
+ $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
66
+
67
+ $escape_nowiki_regex = Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
68
+ $unescape_nowiki_regex = Regexp.new('<nowiki\-(\d+?)>')
69
+
70
+ $remove_inline_regex = Regexp.new('\{\{(.*?)\}\}')
71
+ $type_code_regex = Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
72
+
73
+ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escape(']')})", Regexp::MULTILINE)
74
+ $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
75
+ $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
76
+ $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
77
+
78
+ ###################################################
7
79
 
8
80
  module Wp2txt
9
81
 
10
- def format_wiki(original_text, has_retried = false)
82
+ def format_wiki!(text, has_retried = false)
11
83
  begin
12
- text = original_text + ""
84
+ text << ""
13
85
 
14
- text = chrref_to_utf(text)
15
- text = escape_nowiki(text)
16
-
17
- text = process_interwiki_links(text)
18
- text = process_external_links(text)
86
+ chrref_to_utf!(text)
87
+ escape_nowiki!(text)
19
88
 
20
- text = remove_directive(text)
21
- text = remove_emphasis(text)
89
+ process_interwiki_links!(text)
90
+ process_external_links!(text)
22
91
 
23
- text = mndash(text)
24
- text = make_reference(text)
25
- text = format_ref(text)
26
- text = remove_hr(text)
27
- text = remove_tag(text)
28
- text = special_chr(text)
29
-
30
- unescape_nowiki(text)
92
+ unescape_nowiki!(text)
93
+
31
94
  rescue # detect invalid byte sequence in UTF-8
32
95
  if has_retried
33
96
  puts "invalid byte sequence detected"
34
97
  puts "******************************"
35
98
  File.open("error_log.txt", "w") do |f|
36
- f.write original_text
99
+ f.write text
37
100
  end
38
101
  exit
39
102
  else
40
- fixed_text = original_text.encode("UTF-16").encode("UTF-8")
41
- return format_wiki(fixed_text, true)
103
+ text.encode!("UTF-16")
104
+ text.encode!("UTF-8")
105
+ format_wiki!(text, true)
42
106
  end
43
107
  end
44
108
  end
45
109
 
46
110
  #################### parser for nested structure ####################
47
111
 
48
- def process_nested_structure(scanner, left, right, &block)
112
+ def process_nested_structure(scanner, left, right, recur_count, &block)
49
113
  buffer = ""
50
114
  begin
51
- while str = scanner.scan_until(/(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m)
115
+ if left == "[" && right == "]"
116
+ regex = $single_square_bracket_regex
117
+ elsif left == "[[" && right == "]]"
118
+ regex = $double_square_bracket_regex
119
+ elsif left == "{" && right == "}"
120
+ regex = $single_curly_bracket_regex
121
+ elsif left == "{{" && right == "}}"
122
+ regex = $double_curly_bracket_regex
123
+ else
124
+ regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
125
+ end
126
+ while str = scanner.scan_until(regex)
52
127
  case scanner[1]
53
128
  when left
54
129
  buffer << str
@@ -66,38 +141,35 @@ module Wp2txt
66
141
  end
67
142
  buffer << scanner.rest
68
143
 
69
- if buffer == scanner.string
70
- return scanner.string
144
+ recur_count = recur_count - 1
145
+ if recur_count < 0 || buffer == scanner.string
146
+ return buffer
71
147
  else
72
148
  scanner.string = buffer
73
- return process_nested_structure(scanner, left, right, &block) || ""
149
+ return process_nested_structure(scanner, left, right, recur_count, &block) || ""
74
150
  end
75
151
  rescue => e
76
152
  return scanner.string
77
153
  end
78
154
  end
79
155
 
80
- def remove_templates(str, only_not_inline = true)
156
+ #################### methods used from format_wiki ####################
157
+
158
+ def remove_templates!(str)
81
159
  scanner = StringScanner.new(str)
82
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
83
- if contents.index("\n")
84
- "\n"
85
- else
86
- "[tpl]#{contents}[/tpl]"
87
- end
160
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
161
+ ""
88
162
  end
163
+ str.replace(result)
89
164
  end
90
-
91
-
92
- #################### methods used from format_wiki ####################
93
165
 
94
- def escape_nowiki(str)
166
+ def escape_nowiki!(str)
95
167
  if @nowikis
96
168
  @nowikis.clear
97
169
  else
98
170
  @nowikis = {}
99
171
  end
100
- str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
172
+ str.gsub!($escape_nowiki_regex) do
101
173
  nowiki = $1
102
174
  nowiki_id = nowiki.object_id
103
175
  @nowikis[nowiki_id] = nowiki
@@ -105,17 +177,16 @@ module Wp2txt
105
177
  end
106
178
  end
107
179
 
108
- def unescape_nowiki(str)
109
- str.gsub(/<nowiki\-(\d+?)>/) do
180
+ def unescape_nowiki!(str)
181
+ str.gsub!($unescape_nowiki_regex) do
110
182
  obj_id = $1.to_i
111
183
  @nowikis[obj_id]
112
184
  end
113
185
  end
114
186
 
115
- def process_interwiki_links(str)
187
+ def process_interwiki_links!(str)
116
188
  scanner = StringScanner.new(str)
117
- result = process_nested_structure(scanner, "[[", "]]") do |contents|
118
- str_new = ""
189
+ result = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |contents|
119
190
  parts = contents.split("|")
120
191
  case parts.size
121
192
  when 1
@@ -125,12 +196,12 @@ module Wp2txt
125
196
  parts.join("|")
126
197
  end
127
198
  end
128
- result
199
+ str.replace(result)
129
200
  end
130
201
 
131
- def process_external_links(str)
202
+ def process_external_links!(str)
132
203
  scanner = StringScanner.new(str)
133
- result = process_nested_structure(scanner, "[", "]") do |contents|
204
+ result = process_nested_structure(scanner, "[", "]", $limit_recur) do |contents|
134
205
  parts = contents.split(" ", 2)
135
206
  case parts.size
136
207
  when 1
@@ -139,11 +210,11 @@ module Wp2txt
139
210
  parts.last || ""
140
211
  end
141
212
  end
142
- result
213
+ str.replace(result)
143
214
  end
144
215
 
145
- def special_chr(str)
146
- unless @sp_hash
216
+ def special_chr!(str)
217
+ unless $sp_hash
147
218
  html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
148
219
  .zip([' ', '<', '>', '&', '"'])
149
220
 
@@ -201,40 +272,30 @@ module Wp2txt
201
272
 
202
273
  spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
203
274
  math_chr1 + math_chr2 + others
204
- @sp_hash = Hash[*spc_array.flatten]
205
- @sp_regex = Regexp.new("(" + @sp_hash.keys.join("|") + ")")
275
+ $sp_hash = Hash[*spc_array.flatten]
276
+ $sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
206
277
  end
207
278
  #str.gsub!("&amp;"){'&'}
208
- str.gsub!(@sp_regex) do
209
- @sp_hash[$1]
279
+ str.gsub!($sp_regex) do
280
+ $sp_hash[$1]
210
281
  end
211
- return str
212
282
  end
213
283
 
214
- def remove_tag(str, tagset = ['<', '>'])
215
- if tagset == ['<', '>']
216
- return remove_html_tag(str)
217
- end
284
+ def remove_tag!(str, tagset = ['<', '>'])
218
285
  tagsets = Regexp.quote(tagset.uniq.join(""))
219
286
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
220
- newstr = str.gsub(regex, "")
221
- # newstr = newstr.gsub(/<\!\-\-.*?\-\->/, "")
222
- return newstr
287
+ str.gsub!(regex, "")
223
288
  end
224
289
 
225
- def remove_html_tag(str)
226
- str = ::Sanitize.clean(str)
227
- end
228
-
229
- def remove_emphasis(str)
230
- str.gsub(/(''+)(.+?)\1/) do
290
+ def remove_emphasis!(str)
291
+ str.gsub!($remove_emphasis_regex) do
231
292
  $2
232
293
  end
233
294
  end
234
295
 
235
- def chrref_to_utf(num_str)
296
+ def chrref_to_utf!(num_str)
236
297
  begin
237
- utf_str = num_str.gsub(/&#(x?)([0-9a-fA-F]+);/) do
298
+ num_str.gsub!($chrref_to_utf_regex) do
238
299
  if $1 == 'x'
239
300
  ch = $2.to_i(16)
240
301
  else
@@ -246,36 +307,58 @@ module Wp2txt
246
307
  u.encode("UTF-8", "UTF-16")
247
308
  end
248
309
  rescue StandardError
249
- return num_str
310
+ return nil
250
311
  end
251
- return utf_str
312
+ return true
252
313
  end
253
314
 
254
- def remove_directive(str)
255
- remove_tag(str, ['__', '__'])
315
+ def remove_directive!(str)
316
+ remove_tag!(str, ['__', '__'])
256
317
  end
257
318
 
258
- def mndash(str)
259
- str = str.gsub(/\{(mdash|ndash|–)\}/, "–")
319
+ def mndash!(str)
320
+ str.gsub!($mndash_regex, "–")
260
321
  end
261
322
 
262
- def remove_hr(page)
263
- page = page.gsub(/^\s*\-+\s*$/, "")
323
+ def remove_hr!(page)
324
+ page.gsub!($remove_hr_regex, "")
264
325
  end
265
326
 
266
- def make_reference(str)
267
- new_str = str.dup
268
- new_str.gsub!(/<br ?\/>/, "\n")
269
- new_str.gsub!(/<ref[^>]*\/>/, "")
270
- new_str.gsub!(/<ref[^>]*>/, "[ref]")
271
- new_str.gsub!(/<\/ref>/, "[/ref]")
272
- return new_str
327
+ def make_reference!(str)
328
+ str.gsub!($make_reference_regex_a, "\n")
329
+ str.gsub!($make_reference_regex_b, "")
330
+ str.gsub!($make_reference_regex_c, "[ref]")
331
+ str.gsub!($make_reference_regex_d, "[/ref]")
332
+ end
333
+
334
+ def format_ref!(page)
335
+ ###### do nothing for now
336
+ # page.gsub!($format_ref_regex) do
337
+ # end
273
338
  end
274
339
 
275
- def format_ref(page)
276
- page = page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
277
- ref = $1.dup
278
- ref.gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
340
+ def correct_inline_template!(str)
341
+ str.gsub!($remove_inline_regex) do
342
+ key = $1
343
+ if $onset_bar_regex =~ key
344
+ result = key
345
+ elsif
346
+ info = key.split("|")
347
+ type_code = info.first
348
+ case type_code
349
+ when $type_code_regex
350
+ out = info[-1]
351
+ else
352
+ if $leave_template
353
+ out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
354
+ else
355
+ out = ""
356
+ end
357
+ end
358
+ out
359
+ else
360
+ ""
361
+ end
279
362
  end
280
363
  end
281
364
 
@@ -283,7 +366,7 @@ module Wp2txt
283
366
 
284
367
  def process_template(str)
285
368
  scanner = StringScanner.new(str)
286
- result = process_nested_structure(scanner, "{{", "}}") do |contents|
369
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
287
370
  parts = contents.split("|")
288
371
  case parts.size
289
372
  when 0
@@ -302,7 +385,7 @@ module Wp2txt
302
385
  end
303
386
 
304
387
  def remove_table(str)
305
- new_str = str.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
388
+ new_str = str.gsub($remove_table_regex, "")
306
389
  if str != new_str
307
390
  new_str = remove_table(new_str)
308
391
  end
@@ -311,32 +394,11 @@ module Wp2txt
311
394
  end
312
395
 
313
396
  def remove_clade(page)
314
- new_page = page.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
397
+ new_page = page.gsub($remove_clade_regex, "")
315
398
  new_page = remove_clade(new_page) unless page == new_page
316
399
  new_page
317
400
  end
318
401
 
319
- def remove_inline_template(str)
320
- str.gsub(/\{\{(.*?)\}\}/) do
321
- key = $1
322
- if /\A[^\|]+\z/ =~ key
323
- result = key
324
- else
325
- info = key.split("|")
326
- type_code = info.first
327
- case type_code
328
- when /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
329
- /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
330
- /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
331
- out = info[-1]
332
- else
333
- out = "{" + info.collect{|i|i.chomp}.join("|") + "}"
334
- end
335
- result = out
336
- end
337
- end
338
- end
339
-
340
402
  #################### file related utilities ####################
341
403
 
342
404
  # collect filenames recursively
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.6.1"
2
+ VERSION = "0.7.0"
3
3
  end
@@ -6,6 +6,8 @@ require 'wp2txt'
6
6
  require 'wp2txt/article'
7
7
  require 'wp2txt/utils'
8
8
 
9
+ $limit_recur = 3
10
+
9
11
  describe "Wp2txt" do
10
12
  it "contains mediawiki-format related functions:" do
11
13
  end
@@ -20,7 +22,7 @@ describe "Wp2txt" do
20
22
  str_before = "[[ab[[cde[[alfa]]]]fg]]"
21
23
  str_after = "<<ab<<cde<<alfa>>>>fg>>"
22
24
  scanner = StringScanner.new(str_before)
23
- str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
25
+ str_processed = process_nested_structure(scanner, "[[", "]]", $limit_recur) do |content|
24
26
  "<<" + content + ">>"
25
27
  end
26
28
  expect(str_processed).to eq str_after
@@ -30,7 +32,7 @@ describe "Wp2txt" do
30
32
  str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
31
33
  |passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
32
34
  scanner = StringScanner.new(str_before)
33
- str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
35
+ str_processed = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |content|
34
36
  "<<" + content + ">>"
35
37
  end
36
38
  #str_processed.should == str_after
@@ -39,43 +41,39 @@ describe "Wp2txt" do
39
41
  end
40
42
  end
41
43
 
42
- describe "special_chr" do
44
+ describe "special_chr!" do
43
45
  it "replaces character references with real characters" do
44
46
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
45
47
  str_after = " < > & \""
46
- expect(special_chr(str_before)).to eq str_after
48
+ special_chr!(str_before)
49
+ expect(str_before).to eq str_after
47
50
  end
48
51
  end
49
52
 
50
- describe "chrref_to_utf" do
53
+ describe "chrref_to_utf!" do
51
54
  it "replaces character references with real characters" do
52
55
  str_before = "&#x266A;"
53
56
  str_after = "♪"
54
- expect(chrref_to_utf(str_before)).to eq str_after
57
+ chrref_to_utf!(str_before)
58
+ expect(str_before).to eq str_after
55
59
  end
56
60
  end
57
61
 
58
- describe "mndash" do
62
+ describe "mndash!" do
59
63
  it "replaces {mdash}, {ndash}, or {–} with '–'" do
60
64
  str_before = "{mdash} {ndash} {–}"
61
65
  str_after = "– – –"
62
- expect(mndash(str_before)).to eq str_after
66
+ mndash!(str_before)
67
+ expect(str_before).to eq str_after
63
68
  end
64
69
  end
65
-
66
- describe "format_ref" do
67
- it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
68
- str_before = "[ref]...\r\n...<br />...[/ref]"
69
- str_after = "... ... ..."
70
- expect(format_ref(str_before)).to eq str_after
71
- end
72
- end
73
70
 
74
71
  describe "make_reference" do
75
72
  it "replaces <ref> tag with [ref]" do
76
- str_before = "<ref> ... <br /> ... </ref> \n <ref />"
77
- str_after = "[ref] ... \n ... [/ref] \n "
78
- expect(make_reference(str_before)).to eq str_after
73
+ str_before = "<ref> ... </ref>"
74
+ str_after = "[ref] ... [/ref]"
75
+ make_reference!(str_before)
76
+ expect(str_before).to eq str_after
79
77
  end
80
78
  end
81
79
 
@@ -95,72 +93,93 @@ describe "Wp2txt" do
95
93
  end
96
94
  end
97
95
 
98
- describe "remove_hr" do
96
+ describe "remove_hr!" do
99
97
  it "removes horizontal lines" do
100
98
  str_before = "\n----\n--\n--\n"
101
99
  str_after = "\n\n"
102
- expect(remove_hr(str_before)).to eq str_after
100
+ remove_hr!(str_before)
101
+ expect(str_before).to eq str_after
103
102
  end
104
103
  end
105
104
 
106
- describe "remove_tag" do
105
+ describe "remove_tag!" do
107
106
  it "removes tags" do
108
107
  str_before = "<tag>abc</tag>"
109
108
  str_after = "abc"
110
- expect(remove_tag(str_before)).to eq str_after
109
+ remove_tag!(str_before)
110
+ expect(str_before).to eq str_after
111
111
  str_before = "[tag]def[/tag]"
112
112
  str_after = "def"
113
- expect(remove_tag(str_before, ['[', ']'])).to eq str_after
113
+ remove_tag!(str_before, ['[', ']'])
114
+ expect(str_before).to eq str_after
114
115
  end
115
116
  end
116
117
 
117
- describe "remove_directive" do
118
+ describe "remove_directive!" do
118
119
  it "removes directive" do
119
120
  str_before = "__abc__\n __def__"
120
121
  str_after = "\n "
121
- expect(remove_directive(str_before)).to eq str_after
122
+ remove_directive!(str_before)
123
+ expect(str_before).to eq str_after
122
124
  end
123
125
  end
124
126
 
125
- describe "remove_emphasis" do
127
+ describe "remove_emphasis!" do
126
128
  it "removes directive" do
127
129
  str_before = "''abc''\n'''def'''"
128
130
  str_after = "abc\ndef"
129
- expect(remove_emphasis(str_before)).to eq str_after
131
+ remove_emphasis!(str_before)
132
+ expect(str_before).to eq str_after
130
133
  end
131
134
  end
132
135
 
133
- describe "escape_nowiki" do
136
+ describe "escape_nowiki!" do
134
137
  it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
135
138
  str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
136
139
  str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
137
- expect(escape_nowiki(str_before)).to match str_after
140
+ escape_nowiki!(str_before)
141
+ expect(str_before).to match str_after
138
142
  end
139
143
  end
140
144
 
141
- describe "unescape_nowiki" do
145
+ describe "unescape_nowiki!" do
142
146
  it "replaces <nowiki-object_id> with string stored elsewhere" do
143
147
  @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
144
148
  str_before = "<nowiki-123>def<nowiki-124>"
145
149
  str_after = "[[abc]]def[[ghi]]"
146
- expect(unescape_nowiki(str_before)).to eq str_after
150
+ unescape_nowiki!(str_before)
151
+ expect(str_before).to eq str_after
147
152
  end
148
153
  end
149
154
 
150
- describe "process_interwiki_links" do
155
+ describe "process_interwiki_links!" do
151
156
  it "formats text link and remove brackets" do
152
- expect(process_interwiki_links("[[a b]]")).to eq "a b"
153
- expect(process_interwiki_links("[[a b|c]]")).to eq "c"
154
- expect(process_interwiki_links("[[a|b|c]]")).to eq "b|c"
155
- expect(process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]")).to eq "[ɲ], /J/"
157
+ a = "[[a b]]"
158
+ b = "[[a b|c]]"
159
+ c = "[[a|b|c]]"
160
+ d = "[[硬口蓋鼻音|[ɲ], /J/]]"
161
+ process_interwiki_links!(a)
162
+ process_interwiki_links!(b)
163
+ process_interwiki_links!(c)
164
+ process_interwiki_links!(d)
165
+ expect(a).to eq "a b"
166
+ expect(b).to eq "c"
167
+ expect(c).to eq "b|c"
168
+ expect(d).to eq "[ɲ], /J/"
156
169
  end
157
170
  end
158
171
 
159
- describe "process_external_links" do
172
+ describe "process_external_links!" do
160
173
  it "formats text link and remove brackets" do
161
- expect(process_external_links("[http://yohasebe.com yohasebe.com]")).to eq "yohasebe.com"
162
- expect(process_external_links("[http://yohasebe.com]")).to eq "http://yohasebe.com"
163
- expect(process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}")).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
174
+ a = "[http://yohasebe.com yohasebe.com]"
175
+ b = "[http://yohasebe.com]"
176
+ c = "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
177
+ process_external_links!(a)
178
+ process_external_links!(b)
179
+ process_external_links!(c)
180
+ expect(a).to eq "yohasebe.com"
181
+ expect(b).to eq "http://yohasebe.com"
182
+ expect(c).to eq "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
164
183
  end
165
184
  end
166
185