wp2txt 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,430 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'strscan'
5
+ require 'find'
6
+ require 'sanitize'
7
+
8
+ module Wp2txt
9
+
10
# Converts MediaWiki markup in original_text to plain text by passing a
# copy of it through every formatting step below, in order.
#
# If any step raises a StandardError (typically an invalid byte sequence
# in UTF-8), the input is round-tripped through UTF-16 with invalid bytes
# dropped and formatting is attempted exactly once more. If that second
# attempt fails too, the text is written to "error_log.txt" and the
# process exits.
#
# original_text - String containing wiki markup.
# has_retried   - true when this call is the retry after the encoding fix.
#
# Returns the formatted String.
def format_wiki(original_text, has_retried = false)
  steps = [
    :chrref_to_utf, :escape_nowiki,
    :process_interwiki_links, :process_external_links,
    :remove_directive, :remove_emphasis,
    :mndash, :make_reference, :format_ref, :remove_hr, :remove_tag,
    :special_chr,
    :unescape_nowiki
  ]
  # `+ ""` yields a fresh string, so the steps work on a copy
  steps.inject(original_text + "") { |text, step| send(step, text) }
rescue # detect invalid byte sequence in UTF-8
  unless has_retried
    fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
    return format_wiki(fixed_text, true)
  end
  puts "invalid byte sequence detected"
  puts "******************************"
  File.open("error_log.txt", "w") { |f| f.write original_text }
  exit
end
45
+
46
+ #################### parser for nested structure ####################
47
+
48
# Rewrites every left...right delimited structure in the scanner's string,
# innermost structures first.
#
# Each pass scans for delimiters, remembers that an opening delimiter was
# seen, and on the first closing delimiter that follows one hands the text
# between the pair to the block. The pair and its contents are replaced by
# the block's return value. Passes repeat until one leaves the string
# unchanged.
#
# scanner - StringScanner holding the text (its string is reassigned).
# left    - opening delimiter String, e.g. "[[".
# right   - closing delimiter String, e.g. "]]".
# block   - receives the inner text, must return the replacement String.
#
# Returns the fully processed String.
#
# NOTE: a block whose output keeps introducing new delimiter pairs never
# reaches a fixed point, so this method would not terminate for it.
def process_nested_structure(scanner, left, right, &block)
  delimiter = /(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m

  # Iterate instead of recursing: one pass is needed per structure, so
  # recursion depth would grow with the number of links in the text.
  loop do
    buffer = "".dup
    has_left = false

    while (str = scanner.scan_until(delimiter))
      if scanner[1] == left
        buffer << str
        has_left = true
      elsif has_left
        # buffer currently ends with the opening delimiter: drop it
        buffer = buffer[0...-left.size]
        # str ends with the closing delimiter: drop it (its own length,
        # which may differ from the opening delimiter's)
        buffer << block.call(str[0...-right.size])
        break
      else
        # closing delimiter with no opener before it: keep as literal text
        buffer << str
      end
    end
    buffer << scanner.rest

    return scanner.string if buffer == scanner.string

    scanner.string = buffer
  end
end
76
+
77
# Processes every {{...}} template in str via process_nested_structure.
# A template whose contents contain a newline is replaced by a single
# "\n"; any other template is kept, wrapped in [tpl]...[/tpl] markers.
#
# only_not_inline is accepted but not consulted by this implementation.
#
# Returns the processed String.
def remove_templates(str, only_not_inline = true)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    contents.index("\n") ? "\n" : "[tpl]#{contents}[/tpl]"
  end
end
87
+
88
+
89
+ #################### methods used from format_wiki ####################
90
+
91
# Replaces each <nowiki>...</nowiki> section of str with a placeholder of
# the form <nowiki-ID> and stores the section's text in @nowikis under
# that ID, so that unescape_nowiki can restore it later. The ID is the
# object_id of the captured string. Any entries left in @nowikis from an
# earlier call are discarded first.
#
# Returns a new String containing the placeholders.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
    protected_text = Regexp.last_match(1)
    key = protected_text.object_id
    @nowikis[key] = protected_text
    "<nowiki-#{key}>"
  end
end
104
+
105
# Replaces each <nowiki-ID> placeholder in str with the text stored under
# that numeric ID in @nowikis (as filled in by escape_nowiki). A
# placeholder whose ID is not in the table is replaced by nothing.
#
# Returns a new String.
def unescape_nowiki(str)
  str.gsub(/<nowiki\-(\d+?)>/) { @nowikis[Regexp.last_match(1).to_i] }
end
111
+
112
# Replaces [[...]] links in str with their display text.
#
# The link body is split on "|". With a single part that part is kept;
# otherwise the first part (the link target) is dropped and the remaining
# parts are re-joined with "|".
#
# Returns the processed String.
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    labels.empty? ? (target || "") : labels.join("|")
  end
end
127
+
128
# Replaces [...] external links in str with their label.
#
# The link body is split at the first run of whitespace into at most two
# parts (URL and label). The last part is kept: the label when there is
# one, the URL otherwise. An empty body yields "".
#
# Returns the processed String.
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    contents.split(" ", 2).last || ""
  end
end
141
+
142
# Replaces the named HTML character references listed below (&lt;,
# &eacute;, &alpha;, ...) with the characters they stand for.
#
# The lookup table and the matching regexp are built on first use and
# cached in @sp_hash / @sp_regex. All references are replaced in a single
# pass, so "&amp;lt;" becomes "&lt;" rather than "<".
#
# str - String to convert. As before, an unfrozen string is modified in
#       place; a frozen string (which used to raise) is copied first.
#
# Returns the converted String.
def special_chr(str)
  unless @sp_hash
    html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
           .zip([' ', '<', '>', '&', '"'])

    umraut_accent = ['&Agrave;', '&Aacute;', '&Acirc;', '&Atilde;', '&Auml;',
    '&Aring;', '&AElig;', '&Ccedil;', '&Egrave;', '&Eacute;', '&Ecirc;',
    '&Euml;', '&Igrave;', '&Iacute;', '&Icirc;', '&Iuml;', '&Ntilde;',
    '&Ograve;', '&Oacute;', '&Ocirc;', '&Otilde;', '&Ouml;', '&Oslash;',
    '&Ugrave;', '&Uacute;', '&Ucirc;', '&Uuml;', '&szlig;', '&agrave;',
    '&aacute;', '&acirc;', '&atilde;', '&auml;', '&aring;', '&aelig;',
    '&ccedil;', '&egrave;', '&eacute;', '&ecirc;', '&euml;', '&igrave;',
    '&iacute;', '&icirc;', '&iuml;', '&ntilde;', '&ograve;', '&oacute;',
    '&ocirc;', '&oelig;', '&otilde;', '&ouml;', '&oslash;', '&ugrave;',
    '&uacute;', '&ucirc;', '&uuml;', '&yuml;']\
    .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
    'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
    'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
    'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])

    punctuation = ['&iquest;', '&iexcl;', '&laquo;', '&raquo;', '&sect;',
    '&para;', '&dagger;', '&Dagger;', '&bull;', '&ndash;', '&mdash;']\
    .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])

    commercial = ['&trade;', '&copy;', '&reg;', '&cent;', '&euro;', '&yen;',
    '&pound;', '&curren;'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])

    greek_chr = ['&alpha;', '&beta;', '&gamma;', '&delta;', '&epsilon;',
    '&zeta;', '&eta;', '&theta;', '&iota;', '&kappa;', '&lambda;', '&mu;',
    '&nu;', '&xi;', '&omicron;', '&pi;', '&rho;', '&sigma;', '&sigmaf;',
    '&tau;', '&upsilon;', '&phi;', '&chi;', '&psi;', '&omega;', '&Gamma;',
    '&Delta;', '&Theta;', '&Lambda;', '&Xi;', '&Pi;', '&Sigma;', '&Phi;',
    '&Psi;', '&Omega;']\
    .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
    'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
    'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])

    math_chr1 = ['&int;', '&sum;', '&prod;', '&radic;', '&minus;', '&plusmn;',
    '&infin;', '&asymp;', '&prop;', '&equiv;', '&ne;', '&le;', '&ge;',
    '&times;', '&middot;', '&divide;', '&part;', '&prime;', '&Prime;',
    '&nabla;', '&permil;', '&deg;', '&there4;', '&oslash;', '&isin;', '&cap;',
    '&cup;', '&sub;', '&sup;', '&sube;', '&supe;', '&not;', '&and;', '&or;',
    '&exist;', '&forall;', '&rArr;', '&hArr;', '&rarr;', '&harr;', '&uarr;']\
    .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
    '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
    '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
    '⇔', '→', '↔', '↑'])

    math_chr2 = ['&alefsym;', '&notin;'].zip(['ℵ', '∉'])

    others = ['&uml;', '&ordf;',
    '&macr;', '&acute;', '&micro;', '&cedil;', '&ordm;', '&lsquo;', '&rsquo;',
    '&ldquo;', '&sbquo;', '&rdquo;', '&bdquo;', '&spades;', '&clubs;', '&loz;',
    '&hearts;', '&larr;', '&diams;', '&lsaquo;', '&rsaquo;', '&darr;']\
    .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
    '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )

    spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
                math_chr1 + math_chr2 + others
    # '&oslash;' is listed twice (accents and math) with the same value;
    # building the hash collapses the duplicate.
    @sp_hash = Hash[spc_array]
    @sp_regex = Regexp.union(@sp_hash.keys)
  end

  # gsub! cannot run on a frozen string; work on a copy in that case only,
  # so callers that rely on in-place modification keep seeing it.
  str = str.dup if str.frozen?
  # gsub! returns nil when nothing matched, hence the explicit return value
  str.gsub!(@sp_regex, @sp_hash)
  str
end
210
+
211
# Removes tag-like spans from str.
#
# With the default tagset (['<', '>']) the work is delegated to
# remove_html_tag. For any other pair, every span that starts with the
# opening delimiter, ends with the closing delimiter and contains none of
# the delimiter characters in between is deleted.
#
# str    - String to clean.
# tagset - two-element Array: [opening delimiter, closing delimiter].
#
# Returns a new String.
def remove_tag(str, tagset = ['<', '>'])
  return remove_html_tag(str) if tagset == ['<', '>']

  excluded = Regexp.quote(tagset.uniq.join(""))
  opener = Regexp.escape(tagset[0])
  closer = Regexp.escape(tagset[1])
  str.gsub(/#{opener}[^#{excluded}]*#{closer}/, "")
end
221
+
222
# Passes str through Sanitize.clean (Sanitize gem) and returns the result.
def remove_html_tag(str)
  ::Sanitize.clean(str)
end
225
+
226
# Strips wiki emphasis quotes: a run of two or more apostrophes, some text
# on the same line, and the identical run again are replaced by just the
# text in between.
#
# Returns a new String.
def remove_emphasis(str)
  str.gsub(/(''+)(.+?)\1/, '\2')
end
231
+
232
# Replaces numeric character references (&#9834; / &#x266A;) in num_str
# with the UTF-8 characters they denote.
#
# Decimal references may contain decimal digits only; hexadecimal ones are
# introduced by "x" or "X". References that do not name a valid Unicode
# scalar value (surrogates, anything above U+10FFFF) are left untouched.
#
# Returns the converted String, or num_str itself if the conversion raises
# (for instance because num_str contains an invalid byte sequence).
def chrref_to_utf(num_str)
  num_str.gsub(/&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));/) do
    reference = Regexp.last_match(0)
    hex_digits = Regexp.last_match(1)
    code = hex_digits ? hex_digits.to_i(16) : Regexp.last_match(2).to_i

    # pack("U") encodes the code point directly as UTF-8; this also covers
    # code points above U+FFFF and those whose low byte is >= 0x80.
    if code <= 0x10FFFF && !(0xD800..0xDFFF).include?(code)
      [code].pack("U")
    else
      reference
    end
  end
rescue StandardError
  num_str
end
250
+
251
# Removes __DIRECTIVE__ style markers from str: two underscores, any run
# of non-underscore characters, and two underscores again.
#
# The pattern is the one remove_tag builds for the ['__', '__'] tagset.
#
# Returns a new String.
def remove_directive(str)
  str.gsub(/__[^__]*__/, "")
end
254
+
255
# Replaces the placeholders {mdash}, {ndash} and {–} with a "–" character.
#
# Returns a new String.
def mndash(str)
  str.gsub(/\{(mdash|ndash|–)\}/, "–")
end
258
+
259
# Deletes horizontal-rule lines from page: lines consisting only of one or
# more hyphens, optionally surrounded by whitespace.
#
# Returns a new String.
def remove_hr(page)
  page.gsub(/^\s*\-+\s*$/, "")
end
262
+
263
# Normalises reference markup in a copy of str:
#   * <br/> and <br /> become a newline,
#   * self-closing <ref ... /> tags are removed,
#   * opening <ref ...> tags become "[ref]",
#   * </ref> becomes "[/ref]".
#
# Returns the new String; str itself is not modified.
def make_reference(str)
  str.gsub(/<br ?\/>/, "\n")
     .gsub(/<ref[^>]*\/>/, "")
     .gsub(/<ref[^>]*>/, "[ref]")
     .gsub(/<\/ref>/, "[/ref]")
end
271
+
272
# Flattens each [ref]...[/ref] section of page onto one line: the markers
# are dropped and every run of CR/LF characters or <br/> / <br /> tag
# inside the section becomes a single space. Text outside the markers is
# left alone.
#
# Returns a new String.
def format_ref(page)
  page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
    Regexp.last_match(1).gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
  end
end
278
+
279
+ #################### methods currently unused ####################
280
+
281
# Replaces each {{...}} template in str with one piece of its text.
#
# The template body is split on "|":
#   * no parts           -> ""
#   * one part           -> that part
#   * two or more parts  -> the first part when the last part looks like a
#                           key=value pair (it splits on "=" into more than
#                           one piece), otherwise the last part.
#
# Returns the processed String.
def process_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    parts = contents.split("|")
    if parts.size < 2
      parts.first || ""
    elsif parts.last.split("=").size > 1
      parts.first || ""
    else
      parts.last || ""
    end
  end
end
300
+
301
# Removes {| ... |} table sections from str, innermost first, repeating
# until a pass changes nothing.
#
# Only sections whose body contains no "{", "|" or "}" character are
# matched, so an outer table becomes removable once the tables nested in
# it are gone.
# NOTE(review): a body containing "|" cell separators never matches this
# pattern -- confirm that this is intended before relying on the method.
#
# Returns a new String.
def remove_table(str)
  # The previous version recursed twice per level; the second call always
  # ran on an already fully reduced string. A single loop is enough.
  loop do
    reduced = str.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
    return reduced if reduced == str

    str = reduced
  end
end
309
+
310
# Removes {{clade ...}} / {{Clade ...}} templates from page. Only
# templates without braces in their body match a pass, so passes are
# repeated until the text stops changing, which peels nested clade
# templates from the inside out.
#
# Returns a new String.
def remove_clade(page)
  loop do
    stripped = page.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
    return stripped if stripped == page

    page = stripped
  end
end
315
+
316
# Simplifies single-line {{...}} templates in str.
#
#   * A template without any "|" is replaced by its bare contents.
#   * A template whose first field matches one of the patterns below is
#     replaced by its last field.
#   * Any other template is rewritten as {field|field|...}, i.e. with
#     single braces and each field chomped.
#
# Returns a new String.
def remove_inline_template(str)
  # NOTE(review): /\Alang*/ matches "lan" followed by any number of "g"s;
  # kept exactly as it was -- confirm whether /\Alang/ was intended.
  last_field_types = [
    /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
    /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
    /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
  ]

  str.gsub(/\{\{(.*?)\}\}/) do
    key = Regexp.last_match(1)
    next key if /\A[^\|]+\z/ =~ key

    info = key.split("|")
    type_code = info.first
    if last_field_types.any? { |pattern| pattern === type_code }
      info[-1]
    else
      "{" + info.collect { |field| field.chomp }.join("|") + "}"
    end
  end
end
336
+
337
+ #################### file related utilities ####################
338
+
339
+ # collect filenames recursively
340
# Walks the tree rooted at str with Find.find and gathers every visited
# path (the root and directories included) that matches regex.
#
# str   - path to start from.
# regex - Regexp applied to each path; when omitted every path matches.
#
# Returns a sorted Array of path Strings.
def collect_files(str, regex = nil)
  pattern = regex || //
  matches = []
  Find.find(str) { |path| matches.push(path) if pattern =~ path }
  matches.sort
end
348
+
349
+ # modify a file using block/yield mechanism
350
# Rewrites a file with the result of a block.
#
# The file's contents are read and yielded to the block. The block's
# return value becomes the new contents; when the block returns nil the
# contents are written back unchanged. The original file is first renamed
# to "<file_path>.bak"; that backup is deleted afterwards unless backup is
# true.
#
# file_path - path of the file to modify.
# backup    - keep "<file_path>.bak" when true.
#
# Raises whatever File.open / File.rename raise (e.g. Errno::ENOENT).
def file_mod(file_path, backup = false, &block)
  str = File.open(file_path, "r") { |fr| fr.read }
  newstr = yield(str)
  str = newstr unless newstr.nil?

  # The scratch file sits next to the target under a per-process name. A
  # fixed "temp" in the working directory would overwrite an unrelated
  # file of that name and might live on a different filesystem than the
  # target, which makes File.rename fail.
  temp_path = "#{file_path}.tmp.#{Process.pid}"
  backup_path = file_path + ".bak"
  begin
    File.open(temp_path, "w") { |tf| tf.write(str) }
    File.rename(file_path, backup_path)
    File.rename(temp_path, file_path)
  ensure
    # only still present when one of the steps above failed
    File.unlink(temp_path) if File.exist?(temp_path)
  end

  File.unlink(backup_path) unless backup
end
364
+
365
+ # modify files under a directory (recursive)
366
# Yields the path of every regular file found under dir_path.
#
# When dir_path is a directory, the paths come from collect_files and only
# those that are regular files are yielded. When dir_path is itself a
# regular file it is yielded once. Anything else yields nothing.
def batch_file_mod(dir_path, &block)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |path|
    yield path if FileTest.file?(path)
  end
end
375
+
376
+ # take care of difference of separators among environments
377
# Converts path separators to the ones used by the current platform.
#
# On Windows every "/" becomes "\"; everywhere else every "\" becomes "/".
#
# input - a String, or an Array of Strings (converted element by element,
#         nested Arrays included).
#
# Returns the converted String or Array, or nil for any other input.
def correct_separator(input)
  case input
  when String
    # File::ALT_SEPARATOR is "\\" on Windows builds of Ruby and nil
    # elsewhere. Matching RUBY_PLATFORM against "win32" missed builds such
    # as "x64-mingw32" and "x64-mswin64".
    if File::ALT_SEPARATOR == "\\"
      input.gsub("/") { "\\" }
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
394
+
395
# Renames each file in files so that a trailing "-<digits>" counter is
# zero-padded to the width of the longest counter in the list, and appends
# ".txt" to every name. Names without such a counter only get ".txt".
#
# files - Array of path Strings; each must exist on disk.
#
# Returns files.
def rename(files)
  counter = /\-(\d+)\z/

  # number of digits needed to name the last file generated
  maxwidth = files.inject(0) do |widest, f|
    [widest, f.slice(counter, 1).to_s.length].max
  end

  files.each do |f|
    padded = f.sub(counter) { "-" + sprintf("%0#{maxwidth}d", $1.to_i) }
    File.rename(f, padded + ".txt")
  end
end
411
+
412
+ # convert int of seconds to string in the format 00:00:00
413
# Formats a number of seconds as "HH:MM:SS".
#
# int - seconds as an Integer. A Float (for example the difference of two
#       Time values) is accepted as well and truncated to whole seconds
#       first; previously a Float produced a wrong minutes field.
#       nil or false yields the placeholder "--:--:--".
#
# Returns the formatted String. Hours are not wrapped at 24.
def sec_to_str(int)
  return "--:--:--" unless int

  total_minutes, s = int.to_i.divmod(60)
  h, m = total_minutes.divmod(60)
  format("%02d:%02d:%02d", h, m, s)
end
424
+
425
# Formats a number with "," as thousands separator, e.g. 1234567 ->
# "1,234,567".
#
# Only the leading run of digits is grouped: a leading minus sign and
# anything after the integer digits (such as a fractional part) are
# carried over unchanged. Previously -123 came out as "-,123".
#
# i - an Integer (or anything whose to_s is a number).
#
# Returns the formatted String.
def decimal_format(i)
  sign, digits, rest = i.to_s.match(/\A(-?)(\d*)(.*)\z/m).captures
  # group from the right by reversing, slicing in threes, reversing back
  grouped = digits.reverse.scan(/\d{1,3}/).join(",").reverse
  sign + grouped + rest
end
429
+
430
+ end
@@ -0,0 +1,3 @@
1
module Wp2txt
  # Version string of this wp2txt release.
  VERSION = '0.4.1'
end
@@ -0,0 +1,6 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require 'rspec'
3
+
4
# Global RSpec configuration hook; no options are set at present.
# See https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information.
RSpec.configure { |_config| }
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
+ require 'wp2txt'
6
+ require 'wp2txt/article'
7
+ require 'wp2txt/utils'
8
+
9
# Examples for the text-formatting helpers mixed in from the Wp2txt module.
# Each nested group exercises one helper with a before/after string pair.
describe "Wp2txt" do
  it "contains mediawiki-format related functions:" do
  end

  include Wp2txt

  describe "process_nested_structure" do
    it "parse nested structure replacing str in the format specified" do
      str_before = "[[ab[[cde[[alfa]]]]fg]]"
      str_after  = "<<ab<<cde<<alfa>>>>fg>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after

      # the template body spans two lines; the second string line must
      # start in column 0 so that no indentation ends up in the fixture
      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
      str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after
    end
  end

  describe "special_chr" do
    it "replaces character references with real characters" do
      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
      str_after  = " < > & \""
      special_chr(str_before).should == str_after
    end
  end

  describe "chrref_to_utf" do
    it "replaces character references with real characters" do
      str_before = "&#x266A;"
      str_after  = "♪"
      chrref_to_utf(str_before).should == str_after
    end
  end

  describe "mndash" do
    it "replaces {mdash}, {ndash}, or {–} with '–'" do
      str_before = "{mdash} {ndash} {–}"
      str_after  = "– – –"
      mndash(str_before).should == str_after
    end
  end

  describe "format_ref" do
    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
      str_before = "[ref]...\r\n...<br />...[/ref]"
      str_after  = "... ... ..."
      format_ref(str_before).should == str_after
    end
  end

  describe "make_reference" do
    it "replaces <ref> tag with [ref]" do
      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
      str_after  = "[ref] ... \n ... [/ref] \n "
      make_reference(str_before).should == str_after
    end
  end

  describe "remove_table" do
    it "removes table formatted parts" do
      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
      str_after  = ""
      remove_table(str_before).should == str_after
    end
  end

  describe "remove_clade" do
    it "removes clade formatted parts" do
      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
      str_after  = ""
      remove_clade(str_before).should == str_after
    end
  end

  describe "remove_hr" do
    it "removes horizontal lines" do
      str_before = "\n----\n--\n--\n"
      str_after  = "\n\n"
      remove_hr(str_before).should == str_after
    end
  end

  describe "remove_tag" do
    it "removes tags" do
      str_before = "<tag>abc</tag>"
      str_after  = "abc"
      remove_tag(str_before).should == str_after
      str_before = "[tag]def[/tag]"
      str_after  = "def"
      remove_tag(str_before, ['[', ']']).should == str_after
    end
  end

  describe "remove_directive" do
    it "removes directive" do
      str_before = "__abc__\n __def__"
      str_after  = "\n "
      remove_directive(str_before).should == str_after
    end
  end

  describe "remove_emphasis" do
    # description previously read "removes directive" (copied from the
    # group above)
    it "removes emphasis" do
      str_before = "''abc''\n'''def'''"
      str_after  = "abc\ndef"
      remove_emphasis(str_before).should == str_after
    end
  end

  describe "escape_nowiki" do
    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
      str_after  = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
      escape_nowiki(str_before).should =~ str_after
    end
  end

  describe "unescape_nowiki" do
    it "replaces <nowiki-object_id> with string stored elsewhere" do
      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
      str_before = "<nowiki-123>def<nowiki-124>"
      str_after  = "[[abc]]def[[ghi]]"
      unescape_nowiki(str_before).should == str_after
    end
  end

  describe "process_interwiki_links" do
    it "formats text link and remove brackets" do
      process_interwiki_links("[[a b]]").should == "a b"
      process_interwiki_links("[[a b|c]]").should == "c"
      process_interwiki_links("[[a|b|c]]").should == "b|c"
      process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
    end
  end

  describe "process_external_links" do
    it "formats text link and remove brackets" do
      process_external_links("[http://yohasebe.com yohasebe.com]").should == "yohasebe.com"
      process_external_links("[http://yohasebe.com]").should == "http://yohasebe.com"
      process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
    end
  end

  describe "process_template" do
    it "removes brackets and leaving some text" do
      str_before = "{{}}"
      str_after  = ""
      process_template(str_before).should == str_after
      str_before = "{{lang|en|Japan}}"
      str_after  = "Japan"
      process_template(str_before).should == str_after
      str_before = "{{a|b=c|d=f}}"
      str_after  = "a"
      process_template(str_before).should == str_after
      str_before = "{{a|b|{{c|d|e}}}}"
      str_after  = "e"
      process_template(str_before).should == str_after
    end
  end

  describe "expand_template" do
    # NOTE(review): expand_template is presumably provided by one of the
    # files required above; the uri suggests this example needs network
    # access to en.wiktionary.org and depends on that site's current
    # output -- confirm before relying on it in automated runs.
    it "gets data corresponding to a given template using mediawiki api" do
      uri = "http://en.wiktionary.org/w/api.php"
      template = "{{en-verb}}"
      word = "kick"
      expanded = expand_template(uri, template, word)
      html =<<EOD
<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
EOD
      html.strip!
      expanded.should == html
    end
  end
end