wp2txt 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +65 -0
- data/Rakefile +9 -0
- data/bin/wp2txt +112 -0
- data/data/testdata.bz2 +0 -0
- data/lib/wp2txt.rb +323 -0
- data/lib/wp2txt/article.rb +177 -0
- data/lib/wp2txt/mw_api.rb +65 -0
- data/lib/wp2txt/progressbar.rb +305 -0
- data/lib/wp2txt/utils.rb +430 -0
- data/lib/wp2txt/version.rb +3 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/utils_spec.rb +195 -0
- data/wp2txt.gemspec +26 -0
- metadata +145 -0
data/lib/wp2txt/utils.rb
ADDED
@@ -0,0 +1,430 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'strscan'
|
5
|
+
require 'find'
|
6
|
+
require 'sanitize'
|
7
|
+
|
8
|
+
module Wp2txt
|
9
|
+
|
10
|
+
# Converts MediaWiki markup in +original_text+ into plain text.
#
# A copy of the text is threaded through every formatting helper in a fixed
# order: <nowiki> sections are masked right after character references are
# decoded and are restored as the very last step.
#
# If any step raises, the input is re-encoded (UTF-8 -> UTF-16 -> UTF-8,
# dropping invalid bytes) and the conversion is attempted exactly once more.
# When the retry fails as well, a notice is printed, the offending text is
# written to error_log.txt and the process exits.
def format_wiki(original_text, has_retried = false)
  pipeline = [
    :chrref_to_utf, :escape_nowiki,
    :process_interwiki_links, :process_external_links,
    :remove_directive, :remove_emphasis,
    :mndash, :make_reference, :format_ref, :remove_hr, :remove_tag, :special_chr,
    :unescape_nowiki
  ]
  # `+ ""` yields a fresh String so the helpers never touch the caller's object.
  pipeline.inject(original_text + "") { |text, step| __send__(step, text) }
rescue # detect invalid byte sequence in UTF-8
  unless has_retried
    fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
    return format_wiki(fixed_text, true)
  end

  puts "invalid byte sequence detected"
  puts "******************************"
  File.open("error_log.txt", "w") { |f| f.write original_text }
  exit
end
|
45
|
+
|
46
|
+
#################### parser for nested structure ####################
|
47
|
+
|
48
|
+
# Rewrites every +left+ ... +right+ delimited region of the scanner's string,
# innermost regions first.
#
# scanner - StringScanner positioned at the start of the text; its string is
#           replaced with the intermediate result after every pass.
# left    - opening delimiter (String).
# right   - closing delimiter (String); may differ in length from +left+.
# block   - called with the text between one innermost delimiter pair; its
#           return value replaces the pair together with its content.
#
# Each pass replaces a single innermost pair and then rescans from the start,
# so text returned by the block is scanned again. The block must therefore
# not keep producing a complete left/right pair, or the loop never settles.
#
# Returns the fully processed String. Unbalanced delimiters are left as is.
def process_nested_structure(scanner, left, right, &block)
  delimiter = /(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m

  # Iterative (not recursive) so that texts with thousands of pairs cannot
  # exhaust the call stack.
  loop do
    buffer = "".dup
    has_left = false

    while (str = scanner.scan_until(delimiter))
      if scanner[1] == left
        buffer << str
        has_left = true
      elsif has_left
        # The buffer currently ends with the opening delimiter: drop it, and
        # strip the closing delimiter (by *its* length) from the chunk read.
        buffer = buffer[0...-left.size]
        buffer << block.call(str[0...-right.size])
        break
      else
        # Closing delimiter without a preceding opener: keep it verbatim.
        buffer << str
      end
    end
    buffer << scanner.rest

    # Nothing changed during this pass: the text is fully processed.
    return scanner.string if buffer == scanner.string

    scanner.string = buffer
  end
end
|
76
|
+
|
77
|
+
# Handles {{...}} templates in +str+: a template whose content spans several
# lines is replaced by a single newline, while a one-line template is kept
# but wrapped as [tpl]...[/tpl].
#
# only_not_inline is accepted for compatibility; this method does not read it.
def remove_templates(str, only_not_inline = true)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |body|
    body.index("\n") ? "\n" : "[tpl]#{body}[/tpl]"
  end
end
|
87
|
+
|
88
|
+
|
89
|
+
#################### methods used from format_wiki ####################
|
90
|
+
|
91
|
+
# Masks every <nowiki>...</nowiki> section of +str+ with a placeholder of the
# form <nowiki-ID>, where ID is the object id of the captured content.
# The captured contents are remembered in @nowikis (emptied on every call)
# so that unescape_nowiki can put them back later.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
    protected_text = Regexp.last_match(1)
    key = protected_text.object_id
    @nowikis[key] = protected_text
    "<nowiki-#{key}>"
  end
end
|
104
|
+
|
105
|
+
# Replaces each <nowiki-ID> placeholder in +str+ with the text stored under
# that numeric ID in @nowikis (filled by escape_nowiki). A placeholder whose
# ID is not in the table is replaced by an empty string.
def unescape_nowiki(str)
  str.gsub(/<nowiki\-(\d+?)>/) { @nowikis[Regexp.last_match(1).to_i] }
end
|
111
|
+
|
112
|
+
# Replaces [[...]] links in +str+ by their visible text: a link without a
# pipe keeps its whole content, otherwise everything after the first pipe is
# kept (further pipes are preserved).
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    labels.empty? ? (target || "") : labels.join("|")
  end
end
|
127
|
+
|
128
|
+
# Replaces [...] external links in +str+ by their label: the content is split
# once on whitespace and the second part (the label) is kept; when there is
# no label the single part itself is kept.
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    # With one part `last` is that part; with two it is the label.
    contents.split(" ", 2).last || ""
  end
end
|
141
|
+
|
142
|
+
# Replaces named HTML character references (an ampersand, the entity name and
# a semicolon) in +str+ with the characters they stand for.
#
# The lookup table is keyed by the bare entity name; the full reference text
# is derived from it, so the table can never degenerate into a
# character-to-itself mapping. Names are case-sensitive ("prime" and "Prime"
# are different characters). References that are not in the table are left
# untouched.
#
# The table and the matching pattern are built once and cached in @sp_hash /
# @sp_regex. +str+ is modified in place and also returned.
def special_chr(str)
  unless @sp_hash
    entities = {
      # markup-significant characters
      'nbsp' => ' ', 'lt' => '<', 'gt' => '>', 'amp' => '&', 'quot' => '"',
      # accented letters and ligatures
      'Agrave' => 'À', 'Aacute' => 'Á', 'Acirc' => 'Â', 'Atilde' => 'Ã', 'Auml' => 'Ä',
      'Aring' => 'Å', 'AElig' => 'Æ', 'Ccedil' => 'Ç', 'Egrave' => 'È', 'Eacute' => 'É',
      'Ecirc' => 'Ê', 'Euml' => 'Ë', 'Igrave' => 'Ì', 'Iacute' => 'Í', 'Icirc' => 'Î',
      'Iuml' => 'Ï', 'Ntilde' => 'Ñ', 'Ograve' => 'Ò', 'Oacute' => 'Ó', 'Ocirc' => 'Ô',
      'Otilde' => 'Õ', 'Ouml' => 'Ö', 'Oslash' => 'Ø', 'Ugrave' => 'Ù', 'Uacute' => 'Ú',
      'Ucirc' => 'Û', 'Uuml' => 'Ü', 'szlig' => 'ß', 'agrave' => 'à', 'aacute' => 'á',
      'acirc' => 'â', 'atilde' => 'ã', 'auml' => 'ä', 'aring' => 'å', 'aelig' => 'æ',
      'ccedil' => 'ç', 'egrave' => 'è', 'eacute' => 'é', 'ecirc' => 'ê', 'euml' => 'ë',
      'igrave' => 'ì', 'iacute' => 'í', 'icirc' => 'î', 'iuml' => 'ï', 'ntilde' => 'ñ',
      'ograve' => 'ò', 'oacute' => 'ó', 'ocirc' => 'ô', 'oelig' => 'œ', 'otilde' => 'õ',
      'ouml' => 'ö', 'oslash' => 'ø', 'ugrave' => 'ù', 'uacute' => 'ú', 'ucirc' => 'û',
      'uuml' => 'ü', 'yuml' => 'ÿ',
      # punctuation
      'iquest' => '¿', 'iexcl' => '¡', 'laquo' => '«', 'raquo' => '»', 'sect' => '§',
      'para' => '¶', 'dagger' => '†', 'Dagger' => '‡', 'bull' => '•', 'ndash' => '–',
      'mdash' => '—',
      # commercial symbols
      'trade' => '™', 'copy' => '©', 'reg' => '®', 'cent' => '¢', 'euro' => '€',
      'yen' => '¥', 'pound' => '£', 'curren' => '¤',
      # Greek letters
      'alpha' => 'α', 'beta' => 'β', 'gamma' => 'γ', 'delta' => 'δ', 'epsilon' => 'ε',
      'zeta' => 'ζ', 'eta' => 'η', 'theta' => 'θ', 'iota' => 'ι', 'kappa' => 'κ',
      'lambda' => 'λ', 'mu' => 'μ', 'nu' => 'ν', 'xi' => 'ξ', 'omicron' => 'ο',
      'pi' => 'π', 'rho' => 'ρ', 'sigma' => 'σ', 'sigmaf' => 'ς', 'tau' => 'τ',
      'upsilon' => 'υ', 'phi' => 'φ', 'chi' => 'χ', 'psi' => 'ψ', 'omega' => 'ω',
      'Gamma' => 'Γ', 'Delta' => 'Δ', 'Theta' => 'Θ', 'Lambda' => 'Λ', 'Xi' => 'Ξ',
      'Pi' => 'Π', 'Sigma' => 'Σ', 'Phi' => 'Φ', 'Psi' => 'Ψ', 'Omega' => 'Ω',
      # mathematical symbols
      'int' => '∫', 'sum' => '∑', 'prod' => '∏', 'radic' => '√', 'minus' => '−',
      'plusmn' => '±', 'infin' => '∞', 'asymp' => '≈', 'prop' => '∝', 'equiv' => '≡',
      'ne' => '≠', 'le' => '≤', 'ge' => '≥', 'times' => '×', 'middot' => '·',
      'divide' => '÷', 'part' => '∂', 'prime' => '′', 'Prime' => '″', 'nabla' => '∇',
      'permil' => '‰', 'deg' => '°', 'there4' => '∴', 'isin' => '∈', 'cap' => '∩',
      'cup' => '∪', 'sub' => '⊂', 'sup' => '⊃', 'sube' => '⊆', 'supe' => '⊇',
      'not' => '¬', 'and' => '∧', 'or' => '∨', 'exist' => '∃', 'forall' => '∀',
      'rArr' => '⇒', 'hArr' => '⇔', 'rarr' => '→', 'harr' => '↔', 'uarr' => '↑',
      'alefsym' => 'ℵ', 'notin' => '∉',
      # miscellaneous
      'uml' => '¨', 'ordf' => 'ª', 'macr' => '¯', 'acute' => '´', 'micro' => 'µ',
      'cedil' => '¸', 'ordm' => 'º', 'lsquo' => '‘', 'rsquo' => '’', 'ldquo' => '“',
      'sbquo' => '‚', 'rdquo' => '”', 'bdquo' => '„', 'spades' => '♠', 'clubs' => '♣',
      'loz' => '◊', 'hearts' => '♥', 'larr' => '←', 'diams' => '♦', 'lsaquo' => '‹',
      'rsaquo' => '›', 'darr' => '↓'
    }

    @sp_hash = {}
    entities.each { |name, chr| @sp_hash["&#{name};"] = chr }
    # Regexp.union escapes every key, so no key can act as a pattern.
    @sp_regex = Regexp.union(@sp_hash.keys)
  end

  # Single pass: text produced by one replacement is never decoded again.
  str.gsub!(@sp_regex) { |reference| @sp_hash[reference] }
  str
end
|
210
|
+
|
211
|
+
# Removes delimited tags from +str+.
#
# With the default tagset (angle brackets) the work is handed to
# remove_html_tag. For any other pair, every run that starts with tagset[0],
# ends with tagset[1] and contains none of the delimiter characters in
# between is deleted. Returns a new String.
def remove_tag(str, tagset = ['<', '>'])
  return remove_html_tag(str) if tagset == ['<', '>']

  excluded = Regexp.quote(tagset.uniq.join(""))
  opener = Regexp.escape(tagset[0])
  closer = Regexp.escape(tagset[1])
  str.gsub(/#{opener}[^#{excluded}]*#{closer}/, "")
end
|
221
|
+
|
222
|
+
# Passes +str+ through Sanitize.clean and returns the result.
def remove_html_tag(str)
  ::Sanitize.clean(str)
end
|
225
|
+
|
226
|
+
# Strips wiki emphasis quotes: text enclosed by the same run of two or more
# apostrophes on both sides is replaced by the enclosed text alone.
def remove_emphasis(str)
  str.gsub(/(''+)(.+?)\1/, '\2')
end
|
231
|
+
|
232
|
+
# Replaces numeric character references in +num_str+ (decimal, or hexadecimal
# introduced by x/X) with the UTF-8 character they denote.
#
# Every reference is converted on its own: a reference that does not name a
# valid Unicode scalar value (above U+10FFFF or inside the surrogate range)
# is left in the text unchanged instead of aborting the whole conversion.
# Code points beyond U+FFFF and Latin-1 code points are supported.
#
# Returns the converted String. If the input cannot be processed at all
# (for example it is not a String or holds invalid bytes), +num_str+ is
# returned unchanged.
def chrref_to_utf(num_str)
  num_str.gsub(/&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));/) do |reference|
    hex = Regexp.last_match(1)
    code = hex ? hex.to_i(16) : Regexp.last_match(2).to_i
    if code <= 0x10FFFF && !(0xD800..0xDFFF).include?(code)
      # pack("U") encodes one code point directly as a UTF-8 string.
      [code].pack("U")
    else
      reference
    end
  end
rescue StandardError
  num_str
end
|
250
|
+
|
251
|
+
# Removes double-underscore directives (text enclosed in "__" on both sides)
# from +str+ by delegating to remove_tag.
def remove_directive(str)
  remove_tag(str, %w[__ __])
end
|
254
|
+
|
255
|
+
# Replaces the single-brace forms {mdash}, {ndash} and {–} in +str+ with an
# en dash. Returns a new String.
def mndash(str)
  str.gsub(/\{(mdash|ndash|–)\}/, "–")
end
|
258
|
+
|
259
|
+
# Deletes horizontal-rule lines from +page+: any line made up solely of
# hyphens, together with the whitespace the pattern matches around them.
# Returns a new String.
def remove_hr(page)
  page.gsub(/^\s*\-+\s*$/, "")
end
|
262
|
+
|
263
|
+
# Rewrites reference markup in +str+ and returns the result as a new String:
# <br/> and <br /> become newlines, self-closing <ref .../> tags are dropped,
# and <ref ...> / </ref> become the markers [ref] / [/ref].
def make_reference(str)
  str.gsub(/<br ?\/>/, "\n")
     .gsub(/<ref[^>]*\/>/, "")
     .gsub(/<ref[^>]*>/, "[ref]")
     .gsub(/<\/ref>/, "[/ref]")
end
|
271
|
+
|
272
|
+
# Unwraps every [ref]...[/ref] section of +page+: the markers are removed and,
# inside the section, each run of line breaks and each <br/> / <br /> is
# replaced by a single space.
def format_ref(page)
  page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
    Regexp.last_match(1).gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
  end
end
|
278
|
+
|
279
|
+
#################### methods currently unused ####################
|
280
|
+
|
281
|
+
# Reduces each {{...}} template in +str+ to one of its pipe-separated fields:
# an empty template becomes "", a single field is kept as is, and with
# several fields the first one is kept when the last field looks like a
# named argument (contains "=" with text on both sides), otherwise the last.
def process_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    fields = contents.split("|")
    if fields.size < 2
      fields.first.to_s
    elsif fields.last.split("=").size > 1
      fields.first
    else
      fields.last
    end
  end
end
|
300
|
+
|
301
|
+
# Removes wiki tables ("{|" ... "|}") from +str+, including nested ones.
#
# One substitution only deletes tables that contain no brace or pipe
# characters, so the substitution is repeated until the text stops changing;
# this peels nested tables from the inside out. Implemented as a loop, so
# the nesting depth is not limited by the call stack and no pass is run
# twice. Returns a new String.
def remove_table(str)
  current = str
  loop do
    stripped = current.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
|
309
|
+
|
310
|
+
# Removes {{clade ...}} / {{Clade ...}} templates from +page+. The
# substitution only matches templates without inner braces, so it is applied
# repeatedly until the text no longer changes. Returns a new String.
def remove_clade(page)
  loop do
    stripped = page.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
    return stripped if page == stripped

    page = stripped
  end
end
|
315
|
+
|
316
|
+
# Simplifies single-line {{...}} templates in +str+:
# * content without any pipe is kept as is (braces dropped);
# * when the first field matches one of the listed template names, only the
#   last field is kept;
# * any other template is kept with single braces and its fields re-joined
#   by pipes after trailing line breaks are removed from each field.
def remove_inline_template(str)
  keep_last_field = [
    /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
    /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
    /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
  ]

  str.gsub(/\{\{(.*?)\}\}/) do
    key = Regexp.last_match(1)
    next key if /\A[^\|]+\z/ =~ key

    info = key.split("|")
    type_code = info.first
    if keep_last_field.any? { |pattern| pattern === type_code }
      info[-1]
    else
      "{" + info.map { |field| field.chomp }.join("|") + "}"
    end
  end
end
|
336
|
+
|
337
|
+
#################### file related utilities ####################
|
338
|
+
|
339
|
+
# collect filenames recursively
|
340
|
+
# Walks the tree rooted at +str+ with Find.find and returns, sorted, every
# visited path (the root and directories included) that matches +regex+.
# Without a regex every path is returned.
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern =~ path }.sort
end
|
348
|
+
|
349
|
+
# modify a file using block/yield mechanism
|
350
|
+
# Rewrites the file at +file_path+ with the value the block returns.
#
# The block receives the current contents of the file; when it returns nil
# the contents are written back unchanged. The new contents are first written
# to "<file_path>.tmp" next to the target (not to a fixed name in the current
# directory, so an unrelated file there is never clobbered and the final
# rename stays on one filesystem). The original is moved to "<file_path>.bak"
# before the new file takes its place.
#
# backup - when true the ".bak" copy of the original is kept, otherwise it is
#          deleted once the new file is in place.
def file_mod(file_path, backup = false, &block)
  temp_path = file_path + ".tmp"
  backup_path = file_path + ".bak"

  contents = File.open(file_path, "r") { |fr| fr.read }
  replacement = yield(contents)
  replacement = contents if replacement.nil?
  File.open(temp_path, "w") { |tf| tf.write(replacement) }

  File.rename(file_path, backup_path)
  File.rename(temp_path, file_path)
  File.unlink(backup_path) unless backup
end
|
364
|
+
|
365
|
+
# modify files under a directry (recursive)
|
366
|
+
# Yields file paths to the block. When +dir_path+ is a directory, every
# regular file found beneath it (via collect_files) is yielded; when it is a
# regular file itself, it is yielded directly; anything else yields nothing.
def batch_file_mod(dir_path, &block)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |entry|
    yield entry if FileTest.file?(entry)
  end
end
|
375
|
+
|
376
|
+
# take care of difference of separators among environments
|
377
|
+
# Normalises path separators for the running platform. When RUBY_PLATFORM
# contains "win32", forward slashes become backslashes; otherwise backslashes
# become forward slashes. A String yields a new String, an Array is converted
# element by element (recursively), and any other input yields nil.
def correct_separator(input)
  case input
  when String
    if RUBY_PLATFORM.index("win32")
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
|
394
|
+
|
395
|
+
# Renames the given files on disk so that their trailing "-<number>" suffixes
# are zero-padded to the width of the longest suffix, and appends ".txt" to
# every name. Files without such a suffix only get the ".txt" extension.
def rename(files)
  numbered = /\-(\d+)\z/

  # num of digits necessary to name the last file generated
  maxwidth = files.map { |f| f.slice(numbered, 1).to_s.length }.max || 0

  files.each do |f|
    padded = f.sub(numbered) do
      "-" + sprintf("%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, padded + ".txt")
  end
end
|
411
|
+
|
412
|
+
# convert int of seconds to string in the format 00:00:00
|
413
|
+
# Formats a number of seconds as "HH:MM:SS".
#
# int - number of seconds; nil or false yields the placeholder "--:--:--".
#       Fractional seconds are truncated first, so Float input (for example
#       an elapsed-time difference) yields the same hours, minutes and
#       seconds as the corresponding whole number.
#
# The hours field grows beyond two digits when needed.
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.to_i.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  sprintf("%02d:%02d:%02d", hours, minutes, seconds)
end
|
424
|
+
|
425
|
+
# Returns the string form of +i+ with commas inserted between groups of three
# digits of its integer part. A leading minus sign and anything after the
# integer digits (such as a fractional part) are carried over unchanged, so
# they can never end up inside or next to a separator.
def decimal_format(i)
  sign, digits, rest = i.to_s.match(/\A(-?)(\d*)(.*)\z/m).captures
  # Group from the right by reversing, cutting into threes and reversing back.
  grouped = digits.reverse.scan(/\d{1,3}/).join(",").reverse
  sign + grouped + rest
end
|
429
|
+
|
430
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/utils_spec.rb
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
5
|
+
require 'wp2txt'
|
6
|
+
require 'wp2txt/article'
|
7
|
+
require 'wp2txt/utils'
|
8
|
+
|
9
|
+
# Examples for the text-formatting helpers mixed in from Wp2txt.
describe "Wp2txt" do
  it "contains mediawiki-format related functions:" do
  end

  include Wp2txt

  describe "process_nested_structure" do
    it "parse nested structure replacing str in the format specified" do
      str_before = "[[ab[[cde[[alfa]]]]fg]]"
      str_after  = "<<ab<<cde<<alfa>>>>fg>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after

      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
      str_after  = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after
    end
  end

  describe "special_chr" do
    it "replaces character references with real characters" do
      # The input must contain the references themselves, not the characters
      # they stand for, otherwise nothing is exercised.
      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
      str_after  = "  < > & \""
      special_chr(str_before).should == str_after
    end
  end

  describe "chrref_to_utf" do
    it "replaces character references with real characters" do
      # Numeric reference for U+266A (eighth note).
      str_before = "&#9834;"
      str_after  = "♪"
      chrref_to_utf(str_before).should == str_after
    end
  end

  describe "mndash" do
    it "replaces {mdash}, {ndash}, or {–} with '–'" do
      str_before = "{mdash} {ndash} {–}"
      str_after  = "– – –"
      mndash(str_before).should == str_after
    end
  end

  describe "format_ref" do
    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
      str_before = "[ref]...\r\n...<br />...[/ref]"
      str_after  = "... ... ..."
      format_ref(str_before).should == str_after
    end
  end

  describe "make_reference" do
    it "replaces <ref> tag with [ref]" do
      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
      str_after  = "[ref] ... \n ... [/ref] \n "
      make_reference(str_before).should == str_after
    end
  end

  describe "remove_table" do
    it "removes table formated parts" do
      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
      str_after  = ""
      remove_table(str_before).should == str_after
    end
  end

  describe "remove_clade" do
    it "removes clade formated parts" do
      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
      str_after  = ""
      remove_clade(str_before).should == str_after
    end
  end

  describe "remove_hr" do
    it "removes horizontal lines" do
      str_before = "\n----\n--\n--\n"
      str_after  = "\n\n"
      remove_hr(str_before).should == str_after
    end
  end

  describe "remove_tag" do
    it "removes tags" do
      str_before = "<tag>abc</tag>"
      str_after  = "abc"
      remove_tag(str_before).should == str_after
      str_before = "[tag]def[/tag]"
      str_after  = "def"
      remove_tag(str_before, ['[', ']']).should == str_after
    end
  end

  describe "remove_directive" do
    it "removes directive" do
      str_before = "__abc__\n __def__"
      str_after  = "\n "
      remove_directive(str_before).should == str_after
    end
  end

  describe "remove_emphasis" do
    it "removes emphasis markup" do
      str_before = "''abc''\n'''def'''"
      str_after  = "abc\ndef"
      remove_emphasis(str_before).should == str_after
    end
  end

  describe "escape_nowiki" do
    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
      str_after  = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
      escape_nowiki(str_before).should =~ str_after
    end
  end

  describe "unescape_nowiki" do
    it "replaces <nowiki-object_id> with string stored elsewhere" do
      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
      str_before = "<nowiki-123>def<nowiki-124>"
      str_after  = "[[abc]]def[[ghi]]"
      unescape_nowiki(str_before).should == str_after
    end
  end

  describe "process_interwiki_links" do
    it "formats text link and remove brackets" do
      process_interwiki_links("[[a b]]").should == "a b"
      process_interwiki_links("[[a b|c]]").should == "c"
      process_interwiki_links("[[a|b|c]]").should == "b|c"
      process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
    end
  end

  describe "process_external_links" do
    it "formats text link and remove brackets" do
      process_external_links("[http://yohasebe.com yohasebe.com]").should == "yohasebe.com"
      process_external_links("[http://yohasebe.com]").should == "http://yohasebe.com"
      process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
    end
  end

  describe "process_template" do
    it "removes brackets and leaving some text" do
      str_before = "{{}}"
      str_after  = ""
      process_template(str_before).should == str_after
      str_before = "{{lang|en|Japan}}"
      str_after  = "Japan"
      process_template(str_before).should == str_after
      str_before = "{{a|b=c|d=f}}"
      str_after  = "a"
      process_template(str_before).should == str_after
      str_before = "{{a|b|{{c|d|e}}}}"
      str_after  = "e"
      process_template(str_before).should == str_after
    end
  end

  describe "expand_template" do
    # NOTE(review): expand_template is defined outside this file; given the
    # API URL it is presumably a live HTTP request, so this example would
    # depend on network access and on the remote template staying unchanged.
    it "gets data corresponding to a given template using mediawiki api" do
      uri = "http://en.wiktionary.org/w/api.php"
      template = "{{en-verb}}"
      word = "kick"
      expanded = expand_template(uri, template, word)
      html =<<EOD
<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
EOD
      html.strip!
      expanded.should == html
    end
  end
end
|