wp2txt 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,430 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require 'strscan'
5
+ require 'find'
6
+ require 'sanitize'
7
+
8
+ module Wp2txt
9
+
10
# Converts a chunk of MediaWiki markup into plain text.
#
# The text is passed through the module's clean-up steps in a fixed order:
# character references, <nowiki> protection, links, directives, emphasis,
# dashes, references, horizontal rules, tags and named entities. The
# protected <nowiki> content is restored last.
#
# original_text - String holding the wiki markup (it is not modified).
# has_retried   - true when this call is the single retry made after
#                 stripping invalid byte sequences; callers normally omit it.
#
# Returns the formatted String. When any step raises a StandardError the
# text is re-encoded (dropping invalid bytes) and processed once more. If
# that second attempt fails too, the original text is written to
# "error_log.txt" and the process exits with a non-zero status.
def format_wiki(original_text, has_retried = false)
  text = original_text + "" # work on a copy

  text = chrref_to_utf(text)
  text = escape_nowiki(text)

  text = process_interwiki_links(text)
  text = process_external_links(text)

  text = remove_directive(text)
  text = remove_emphasis(text)

  text = mndash(text)
  text = make_reference(text)
  text = format_ref(text)
  text = remove_hr(text)
  text = remove_tag(text)
  text = special_chr(text)

  unescape_nowiki(text)
rescue StandardError # typically "invalid byte sequence in UTF-8"
  if has_retried
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write original_text
    end
    # Signal the failure to the calling shell instead of exiting with 0.
    exit(1)
  else
    # Round-trip through UTF-16 to drop bytes that are not valid UTF-8.
    fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
    format_wiki(fixed_text, true)
  end
end
45
+
46
+ #################### parser for nested structure ####################
47
+
48
# Rewrites every balanced left...right pair in the scanner's string,
# innermost pair first, until a pass no longer changes the text.
#
# scanner - StringScanner positioned at the start of the text. Its string is
#           replaced with the intermediate result after every pass.
# left    - String opening delimiter, e.g. "[[".
# right   - String closing delimiter, e.g. "]]".
# block   - called with the text between one matched pair (delimiters
#           removed); must return the replacement String.
#
# Returns the resulting String. Unbalanced delimiters are left as they are.
def process_nested_structure(scanner, left, right, &block)
  delimiter = /(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m

  # Each pass replaces exactly one pair. Looping (rather than recursing once
  # per replacement) keeps the call stack flat on markup-heavy pages.
  loop do
    buffer = "".dup
    has_left = false

    while (str = scanner.scan_until(delimiter))
      if scanner[1] == left
        buffer << str
        has_left = true
      elsif has_left
        # Drop the most recent opening delimiter from the buffer and the
        # closing delimiter from the scanned text, then substitute.
        buffer = buffer[0...-left.size]
        buffer << block.call(str[0...-right.size])
        break
      else
        # A closing delimiter with no opener before it: keep it verbatim.
        buffer << str
      end
    end
    buffer << scanner.rest

    return scanner.string if buffer == scanner.string

    scanner.string = buffer
  end
end
76
+
77
# Replaces {{...}} templates in str.
#
# A template whose contents contain a line break is replaced by a single
# "\n"; any other template is kept as "[tpl]contents[/tpl]".
#
# str             - String of wiki markup.
# only_not_inline - accepted for backward compatibility; the body does not
#                   use it.
#
# Returns the processed String.
def remove_templates(str, only_not_inline = true)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "{{", "}}") do |contents|
    contents.index("\n") ? "\n" : "[tpl]#{contents}[/tpl]"
  end
end
87
+
88
+
89
+ #################### methods used from format_wiki ####################
90
+
91
# Swaps every <nowiki>...</nowiki> section for a "<nowiki-ID>" placeholder
# so that later steps do not touch its contents.
#
# The protected contents are kept in @nowikis (ID => String), which is
# emptied at the start of every call. The ID is the object_id of the
# captured String.
#
# Returns a new String containing the placeholders.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
    protected_text = $1
    @nowikis[protected_text.object_id] = protected_text
    "<nowiki-#{protected_text.object_id}>"
  end
end
104
+
105
# Puts back the text that escape_nowiki replaced with "<nowiki-ID>"
# placeholders, looking each ID up in @nowikis.
#
# A placeholder whose ID is unknown (or any placeholder when nothing has
# been escaped yet) is left in the text unchanged rather than being deleted
# or raising.
#
# Returns a new String.
def unescape_nowiki(str)
  str.gsub(/<nowiki\-(\d+?)>/) do |placeholder|
    stored = @nowikis && @nowikis[$1.to_i]
    stored || placeholder
  end
end
111
+
112
# Replaces [[...]] links with their display text.
#
# "[[target]]" becomes "target"; "[[target|label]]" becomes "label". When
# there are several "|" separators everything after the first one is kept
# ("[[a|b|c]]" becomes "b|c").
#
# Returns the processed String.
def process_interwiki_links(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "[[", "]]") do |contents|
    parts = contents.split("|")
    if parts.size == 1
      parts.first || ""
    else
      # Drop the link target and keep the label part(s).
      parts.drop(1).join("|")
    end
  end
end
127
+
128
# Replaces [url label] external links with their label.
#
# The bracket contents are split at the first space: "[url label text]"
# becomes "label text", and "[url]" (no label) becomes "url".
#
# Returns the processed String.
def process_external_links(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "[", "]") do |contents|
    # With a single part, first and last are the same element; with empty
    # contents there is no element at all, hence the fallback.
    contents.split(" ", 2).last || ""
  end
end
141
+
142
# Replaces named HTML character entities (e.g. "&amp;", "&alpha;") with the
# characters they stand for.
#
# The entity table and the matching regexp are built on first use and
# cached in @sp_hash / @sp_regex. All entities are replaced in one pass, so
# "&amp;lt;" becomes "&lt;" and is not decoded a second time.
#
# str - String to convert. It is not modified, so frozen strings are fine.
#
# Returns a new String.
def special_chr(str)
  unless @sp_hash
    html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
    .zip([' ', '<', '>', '&', '"'])

    umlaut_accent = ['&Agrave;', '&Aacute;', '&Acirc;', '&Atilde;', '&Auml;',
    '&Aring;', '&AElig;', '&Ccedil;', '&Egrave;', '&Eacute;', '&Ecirc;',
    '&Euml;', '&Igrave;', '&Iacute;', '&Icirc;', '&Iuml;', '&Ntilde;',
    '&Ograve;', '&Oacute;', '&Ocirc;', '&Otilde;', '&Ouml;', '&Oslash;',
    '&Ugrave;', '&Uacute;', '&Ucirc;', '&Uuml;', '&szlig;', '&agrave;',
    '&aacute;', '&acirc;', '&atilde;', '&auml;', '&aring;', '&aelig;',
    '&ccedil;', '&egrave;', '&eacute;', '&ecirc;', '&euml;', '&igrave;',
    '&iacute;', '&icirc;', '&iuml;', '&ntilde;', '&ograve;', '&oacute;',
    '&ocirc;', '&oelig;', '&otilde;', '&ouml;', '&oslash;', '&ugrave;',
    '&uacute;', '&ucirc;', '&uuml;', '&yuml;']\
    .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
    'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
    'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
    'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])

    punctuation = ['&iquest;', '&iexcl;', '&laquo;', '&raquo;', '&sect;',
    '&para;', '&dagger;', '&Dagger;', '&bull;', '&ndash;', '&mdash;']\
    .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])

    commercial = ['&trade;', '&copy;', '&reg;', '&cent;', '&euro;', '&yen;',
    '&pound;', '&curren;'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])

    greek_chr = ['&alpha;', '&beta;', '&gamma;', '&delta;', '&epsilon;',
    '&zeta;', '&eta;', '&theta;', '&iota;', '&kappa;', '&lambda;', '&mu;',
    '&nu;', '&xi;', '&omicron;', '&pi;', '&rho;', '&sigma;', '&sigmaf;',
    '&tau;', '&upsilon;', '&phi;', '&chi;', '&psi;', '&omega;', '&Gamma;',
    '&Delta;', '&Theta;', '&Lambda;', '&Xi;', '&Pi;', '&Sigma;', '&Phi;',
    '&Psi;', '&Omega;']\
    .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
    'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
    'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])

    math_chr1 = ['&int;', '&sum;', '&prod;', '&radic;', '&minus;', '&plusmn;',
    '&infin;', '&asymp;', '&prop;', '&equiv;', '&ne;', '&le;', '&ge;',
    '&times;', '&middot;', '&divide;', '&part;', '&prime;', '&Prime;',
    '&nabla;', '&permil;', '&deg;', '&there4;', '&oslash;', '&isin;', '&cap;',
    '&cup;', '&sub;', '&sup;', '&sube;', '&supe;', '&not;', '&and;', '&or;',
    '&exist;', '&forall;', '&rArr;', '&hArr;', '&rarr;', '&harr;', '&uarr;']\
    .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
    '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
    '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
    '⇔', '→', '↔', '↑'])

    math_chr2 = ['&alefsym;', '&notin;'].zip(['ℵ', '∉'])

    others = ['&uml;', '&ordf;',
    '&macr;', '&acute;', '&micro;', '&cedil;', '&ordm;', '&lsquo;', '&rsquo;',
    '&ldquo;', '&sbquo;', '&rdquo;', '&bdquo;', '&spades;', '&clubs;', '&loz;',
    '&hearts;', '&larr;', '&diams;', '&lsaquo;', '&rsaquo;', '&darr;']\
    .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
    '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )

    spc_array = html + umlaut_accent + punctuation + commercial + greek_chr +
    math_chr1 + math_chr2 + others
    # spc_array is a list of [entity, character] pairs.
    @sp_hash = Hash[spc_array]
    @sp_regex = Regexp.union(@sp_hash.keys)
  end

  # Non-destructive on purpose: like the other helpers here, the argument is
  # left alone and the converted copy is returned.
  str.gsub(@sp_regex) { |entity| @sp_hash[entity] }
end
210
+
211
# Removes tag-like spans from str.
#
# str    - String to clean.
# tagset - two-element Array [opening, closing] of delimiter strings. With
#          the default ['<', '>'] the work is handed to remove_html_tag.
#
# For any other tagset, every span that starts with the opening delimiter,
# ends with the closing delimiter and contains none of the delimiter
# characters in between is deleted.
#
# Returns a new String.
def remove_tag(str, tagset = ['<', '>'])
  return remove_html_tag(str) if tagset == ['<', '>']

  opening, closing = tagset
  # Characters that may not occur between the two delimiters.
  excluded = Regexp.quote(tagset.uniq.join(""))
  regex = /#{Regexp.escape(opening)}[^#{excluded}]*#{Regexp.escape(closing)}/
  str.gsub(regex, "")
end
221
+
222
# Strips HTML markup from str by passing it to Sanitize.clean (from the
# "sanitize" gem) with no extra configuration and returning its result.
# NOTE(review): Sanitize.clean is presumably the old name of
# Sanitize.fragment in newer releases of the gem; confirm before upgrading.
def remove_html_tag(str)
  ::Sanitize.clean(str)
end
225
+
226
# Removes wiki emphasis quotes: text wrapped in two or more apostrophes on
# both sides (''italic'', '''bold''') is replaced by the bare text. The
# closing run must be the same as the opening run.
#
# Returns a new String.
def remove_emphasis(str)
  str.gsub(/(''+)(.+?)\1/, '\2')
end
231
+
232
# Replaces numeric character references ("&#9834;", "&#x266A;") with the
# UTF-8 characters they denote.
#
# Code points of any size are handled, including those above U+FFFF. A
# reference that does not denote a valid character (out of range, or a
# surrogate) is left in the text as it is, so one bad reference no longer
# prevents the others from being converted.
#
# num_str - String that may contain numeric character references.
#
# Returns the converted String, or num_str itself when the conversion
# raises a StandardError (for example because num_str holds invalid bytes).
def chrref_to_utf(num_str)
  num_str.gsub(/&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));/) do |reference|
    code_point = $1 ? $1.to_i(16) : $2.to_i
    begin
      character = [code_point].pack("U")
      character.valid_encoding? ? character : reference
    rescue RangeError
      # pack("U") rejects values beyond the Unicode range.
      reference
    end
  end
rescue StandardError
  num_str
end
250
+
251
# Removes "__...__" spans (such as __TOC__) from str by calling remove_tag
# with "__" as both the opening and the closing delimiter.
#
# Returns a new String.
def remove_directive(str)
  remove_tag(str, ['__', '__'])
end
254
+
255
# Replaces the brace forms "{mdash}", "{ndash}" and "{–}" with an en dash.
#
# Returns a new String.
def mndash(str)
  str.gsub(/\{(mdash|ndash|–)\}/, "–")
end
258
+
259
# Removes horizontal-rule lines, i.e. lines that consist of one or more
# hyphens with optional surrounding whitespace. Because the whitespace part
# of the pattern also matches line breaks, a neighbouring newline can be
# removed together with the rule.
#
# Returns a new String.
def remove_hr(page)
  page.gsub(/^\s*\-+\s*$/, "")
end
262
+
263
# Rewrites reference markup into bracketed markers.
#
# In this order: "<br/>" / "<br />" become newlines, self-closing
# "<ref ... />" tags are dropped, opening "<ref ...>" tags become "[ref]"
# and "</ref>" becomes "[/ref]".
#
# Returns a new String; str is not modified.
def make_reference(str)
  str.gsub(/<br ?\/>/, "\n")
     .gsub(/<ref[^>]*\/>/, "")
     .gsub(/<ref[^>]*>/, "[ref]")
     .gsub(/<\/ref>/, "[/ref]")
end
271
+
272
# Flattens every "[ref]...[/ref]" section: the markers are removed and each
# run of line breaks, or "<br/>" / "<br />" tag, inside the section becomes
# a single space.
#
# Returns a new String.
def format_ref(page)
  page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
    $1.gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
  end
end
278
+
279
+ #################### methods currently unused ####################
280
+
281
# Replaces {{...}} templates with one of their "|"-separated parts.
#
# Empty contents give "". A template without "|" is replaced by its
# contents. Otherwise the last part is used, unless that part looks like a
# "key=value" argument, in which case the first part (the template name) is
# used instead.
#
# Returns the processed String.
def process_template(str)
  scanner = StringScanner.new(str)
  process_nested_structure(scanner, "{{", "}}") do |contents|
    parts = contents.split("|")
    if parts.empty?
      ""
    elsif parts.size == 1
      parts.first || ""
    elsif parts.last.split("=").size > 1
      parts.first || ""
    else
      parts.last || ""
    end
  end
end
300
+
301
# Removes wiki tables ("{| ... |}"), including nested ones.
#
# One pass deletes every table that contains no "{", "|" or "}" inside it
# (i.e. the innermost tables); passes are repeated until the text stops
# changing.
#
# Returns a new String.
def remove_table(str)
  current = str
  loop do
    stripped = current.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
309
+
310
# Removes "{{clade ...}}" / "{{Clade ...}}" templates, including nested
# ones: templates without braces inside them are deleted first, and the
# pass is repeated until the text stops changing.
#
# Returns a new String.
def remove_clade(page)
  current = page
  loop do
    stripped = current.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
315
+
316
# Simplifies single-line {{...}} templates.
#
# * A template without "|" is replaced by its contents.
# * A template whose name starts with one of the known inline prefixes
#   (lang, IPA, nowrap, ...; matched case-insensitively) is replaced by its
#   last "|"-separated part.
# * Any other template is kept, but with single braces: "{a|b}".
#
# Returns a new String.
def remove_inline_template(str)
  # The first alternative is "lang"; the earlier pattern "lang*" also
  # matched unrelated names that merely start with "lan".
  inline_prefix = /\A(?:lang|IPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)/i

  str.gsub(/\{\{(.*?)\}\}/) do
    key = $1
    next key if /\A[^\|]+\z/ =~ key

    info = key.split("|")
    if inline_prefix =~ info.first
      info.last
    else
      "{" + info.map { |part| part.chomp }.join("|") + "}"
    end
  end
end
336
+
337
+ #################### file related utilities ####################
338
+
339
# Collects paths recursively.
#
# str   - path at which Find.find starts.
# regex - optional Regexp; only paths matching it are kept. When omitted,
#         every path that Find.find yields is kept (the starting path and
#         directories included).
#
# Returns a sorted Array of path Strings.
def collect_files(str, regex = nil)
  regex ||= //
  paths = []
  Find.find(str) do |path|
    paths << path if regex =~ path
  end
  paths.sort
end
348
+
349
# Rewrites a file with the result of a block.
#
# file_path - path of the file to modify.
# backup    - when true, the previous contents are kept in
#             "<file_path>.bak"; otherwise that file is deleted at the end.
# block     - receives the file contents as a String and returns the new
#             contents; returning nil leaves the contents unchanged.
#
# The new contents are first written to "<file_path>.tmp", next to the
# target, and then renamed into place. Staging beside the target (rather
# than in a fixed "temp" file in the current directory) avoids clashes
# between concurrent runs and renames across file systems.
def file_mod(file_path, backup = false, &block)
  temp_path = file_path + ".tmp"
  backup_path = file_path + ".bak"

  str = File.open(file_path, "r") { |fr| fr.read }
  newstr = yield(str)
  str = newstr unless newstr.nil?
  File.open(temp_path, "w") { |tf| tf.write(str) }

  File.rename(file_path, backup_path)
  File.rename(temp_path, file_path)
  File.unlink(backup_path) unless backup
end
364
+
365
# Yields file paths for batch processing.
#
# dir_path - a directory or a single file. For a directory, every regular
#            file found under it by collect_files is yielded (recursively);
#            for a regular file, that path itself is yielded. Anything else
#            yields nothing.
def batch_file_mod(dir_path, &block)
  if FileTest.directory?(dir_path)
    collect_files(dir_path).each do |file|
      yield file if FileTest.file?(file)
    end
  else
    yield dir_path if FileTest.file?(dir_path)
  end
end
375
+
376
# Normalises path separators for the current platform.
#
# input - a String, or an Array of Strings / nested Arrays.
#
# On platforms whose alternate separator is a backslash (Windows), "/" is
# replaced by "\"; everywhere else "\" is replaced by "/". The platform is
# detected through File::ALT_SEPARATOR rather than by looking for "win32"
# in RUBY_PLATFORM, which misses Windows builds such as mingw.
#
# Returns a new String or Array, or nil for any other kind of input.
def correct_separator(input)
  case input
  when String
    if File::ALT_SEPARATOR == "\\"
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
394
+
395
# Renames numbered output files so that their numbers have equal width, and
# appends ".txt".
#
# files - Array of paths. A path ending in "-<digits>" gets its number
#         zero-padded to the width of the longest such number in the list;
#         every path (numbered or not) is renamed to "<name>.txt".
def rename(files)
  # Number of digits needed for the largest numeric suffix in the list.
  maxwidth = files.map { |f| f.slice(/\-(\d+)\z/, 1).to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(/\-(\d+)\z/) do
      "-" + sprintf("%0#{maxwidth}d", $1.to_i)
    end
    File.rename(f, newname + ".txt")
  end
end
411
+
412
# Formats a number of seconds as "HH:MM:SS".
#
# int - number of seconds (Integer or Float), or nil/false when the value
#       is not known.
#
# Returns "--:--:--" for nil/false, otherwise the formatted String. The
# fractional part of a Float is dropped.
def sec_to_str(int)
  return "--:--:--" unless int

  # divmod keeps the hour and minute counts whole even for Float input.
  hours, remainder = int.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  sprintf("%02d:%02d:%02d", hours, minutes, seconds)
end
424
+
425
# Formats a number with "," as thousands separator.
#
# i - the value to format; its to_s form is used.
#
# Only the integer digits are grouped: a leading minus sign and anything
# after the digits (such as a decimal fraction) are kept as they are.
#
# Returns a String, e.g. "1,234,567", "-1,234" or "1,234.5".
def decimal_format(i)
  sign, digits, rest = i.to_s.match(/\A(-?)(\d*)(.*)\z/m).captures
  grouped = digits.reverse.scan(/\d{1,3}/).join(',').reverse
  sign + grouped + rest
end
429
+
430
+ end
@@ -0,0 +1,3 @@
1
module Wp2txt
  # Version of the wp2txt gem.
  VERSION = "0.4.1".freeze
end
@@ -0,0 +1,6 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ require 'rspec'
3
+
4
# No custom RSpec settings are needed at the moment.
RSpec.configure do |config|
  # see https://github.com/rspec/rspec-core/blob/master/lib/rspec/core/configuration.rb for more information
end
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
3
+
4
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
5
+ require 'wp2txt'
6
+ require 'wp2txt/article'
7
+ require 'wp2txt/utils'
8
+
9
# Examples for the text-formatting helpers that the Wp2txt module mixes in.
describe "Wp2txt" do
  it "contains mediawiki-format related functions:" do
  end

  include Wp2txt

  before do
  end

  describe "process_nested_structure" do
    it "parse nested structure replacing str in the format specified" do
      str_before = "[[ab[[cde[[alfa]]]]fg]]"
      str_after = "<<ab<<cde<<alfa>>>>fg>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after

      # A template that spans two lines and contains a nested template.
      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ\n|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
      str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ\n|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after
    end
  end

  describe "special_chr" do
    it "replaces character references with real characters" do
      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
      # "&nbsp;" turns into a plain space, hence the two leading spaces.
      str_after = "  < > & \""
      special_chr(str_before).should == str_after
    end
  end

  describe "chrref_to_utf" do
    it "replaces character references with real characters" do
      str_before = "&#x266A;"
      str_after = "♪"
      chrref_to_utf(str_before).should == str_after
    end
  end

  describe "mndash" do
    it "replaces {mdash}, {ndash}, or {–} with '–'" do
      str_before = "{mdash} {ndash} {–}"
      str_after = "– – –"
      mndash(str_before).should == str_after
    end
  end

  describe "format_ref" do
    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
      str_before = "[ref]...\r\n...<br />...[/ref]"
      str_after = "... ... ..."
      format_ref(str_before).should == str_after
    end
  end

  describe "make_reference" do
    it "replaces <ref> tag with [ref]" do
      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
      str_after = "[ref] ... \n ... [/ref] \n "
      make_reference(str_before).should == str_after
    end
  end

  describe "remove_table" do
    it "removes table formatted parts" do
      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
      str_after = ""
      remove_table(str_before).should == str_after
    end
  end

  describe "remove_clade" do
    it "removes clade formatted parts" do
      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
      str_after = ""
      remove_clade(str_before).should == str_after
    end
  end

  describe "remove_hr" do
    it "removes horizontal lines" do
      str_before = "\n----\n--\n--\n"
      str_after = "\n\n"
      remove_hr(str_before).should == str_after
    end
  end

  describe "remove_tag" do
    it "removes tags" do
      str_before = "<tag>abc</tag>"
      str_after = "abc"
      remove_tag(str_before).should == str_after
      str_before = "[tag]def[/tag]"
      str_after = "def"
      remove_tag(str_before, ['[', ']']).should == str_after
    end
  end

  describe "remove_directive" do
    it "removes directive" do
      str_before = "__abc__\n __def__"
      str_after = "\n "
      remove_directive(str_before).should == str_after
    end
  end

  describe "remove_emphasis" do
    it "removes emphasis quotes" do
      str_before = "''abc''\n'''def'''"
      str_after = "abc\ndef"
      remove_emphasis(str_before).should == str_after
    end
  end

  describe "escape_nowiki" do
    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
      str_after = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
      escape_nowiki(str_before).should =~ str_after
    end
  end

  describe "unescape_nowiki" do
    it "replaces <nowiki-object_id> with string stored elsewhere" do
      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
      str_before = "<nowiki-123>def<nowiki-124>"
      str_after = "[[abc]]def[[ghi]]"
      unescape_nowiki(str_before).should == str_after
    end
  end

  describe "process_interwiki_links" do
    it "formats text link and remove brackets" do
      process_interwiki_links("[[a b]]").should == "a b"
      process_interwiki_links("[[a b|c]]").should == "c"
      process_interwiki_links("[[a|b|c]]").should == "b|c"
      process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
    end
  end

  describe "process_external_links" do
    it "formats text link and remove brackets" do
      process_external_links("[http://yohasebe.com yohasebe.com]").should == "yohasebe.com"
      process_external_links("[http://yohasebe.com]").should == "http://yohasebe.com"
      process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
    end
  end

  describe "process_template" do
    it "removes brackets and leaving some text" do
      str_before = "{{}}"
      str_after = ""
      process_template(str_before).should == str_after
      str_before = "{{lang|en|Japan}}"
      str_after = "Japan"
      process_template(str_before).should == str_after
      str_before = "{{a|b=c|d=f}}"
      str_after = "a"
      process_template(str_before).should == str_after
      str_before = "{{a|b|{{c|d|e}}}}"
      str_after = "e"
      process_template(str_before).should == str_after
    end
  end

  describe "expand_template" do
    # NOTE(review): this example queries a live MediaWiki API, so it
    # presumably needs network access and depends on the current remote
    # template output - confirm before relying on it in CI.
    it "gets data corresponding to a given template using mediawiki api" do
      uri = "http://en.wiktionary.org/w/api.php"
      template = "{{en-verb}}"
      word = "kick"
      expanded = expand_template(uri, template, word)
      html =<<EOD
<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
EOD
      html.strip!
      expanded.should == html
    end
  end
end