wp2txt 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +65 -0
- data/Rakefile +9 -0
- data/bin/wp2txt +112 -0
- data/data/testdata.bz2 +0 -0
- data/lib/wp2txt.rb +323 -0
- data/lib/wp2txt/article.rb +177 -0
- data/lib/wp2txt/mw_api.rb +65 -0
- data/lib/wp2txt/progressbar.rb +305 -0
- data/lib/wp2txt/utils.rb +430 -0
- data/lib/wp2txt/version.rb +3 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/utils_spec.rb +195 -0
- data/wp2txt.gemspec +26 -0
- metadata +145 -0
data/lib/wp2txt/utils.rb
ADDED
@@ -0,0 +1,430 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require 'strscan'
|
5
|
+
require 'find'
|
6
|
+
require 'sanitize'
|
7
|
+
|
8
|
+
module Wp2txt
|
9
|
+
|
10
|
+
# Converts MediaWiki markup in +original_text+ to plain text.
#
# A copy of the text is passed through a fixed sequence of helpers
# (character references, nowiki escaping, links, directives, emphasis,
# dashes, references, horizontal rules, tags, special characters) and the
# nowiki placeholders are restored at the very end.
#
# If any step raises a StandardError, the input is re-encoded to UTF-16 and
# back to UTF-8 (dropping invalid sequences) and the whole conversion is
# attempted once more. If the retry raises as well, the text is written to
# "error_log.txt" in the current directory and the process exits.
#
# has_retried:: true when this call is the retry after re-encoding.
def format_wiki(original_text, has_retried = false)
  steps = [
    :chrref_to_utf, :escape_nowiki,
    :process_interwiki_links, :process_external_links,
    :remove_directive, :remove_emphasis,
    :mndash, :make_reference, :format_ref, :remove_hr, :remove_tag, :special_chr
  ]
  # "+ ''" yields a copy, so the helpers never touch the caller's string.
  converted = steps.inject(original_text + "") { |text, step| __send__(step, text) }
  unescape_nowiki(converted)
rescue # detect invalid byte sequence in UTF-8
  unless has_retried
    fixed_text = original_text.encode("UTF-16", :invalid => :replace, :replace => '').encode("UTF-8")
    return format_wiki(fixed_text, true)
  end
  puts "invalid byte sequence detected"
  puts "******************************"
  File.open("error_log.txt", "w") { |f| f.write original_text }
  exit
end
|
45
|
+
|
46
|
+
#################### parser for nested structure ####################
|
47
|
+
|
48
|
+
# Rewrites every +left+ ... +right+ delimited span in the scanner's string,
# innermost spans first, until a pass leaves the text unchanged.
#
# On each pass the text is scanned for the first +right+ delimiter that
# follows a +left+ delimiter. The text between that +right+ and the nearest
# preceding +left+ is passed to the block (without delimiters) and the whole
# span, delimiters included, is replaced by the block's result. A +right+
# with no preceding +left+ is copied through untouched.
#
# scanner:: a StringScanner; its string is replaced with the intermediate
#           result after every pass.
# left::    opening delimiter (plain string).
# right::   closing delimiter (plain string); may differ in length from +left+.
#
# Returns the fully processed string.
def process_nested_structure(scanner, left, right, &block)
  delimiter = /(#{Regexp.escape(left)}|#{Regexp.escape(right)})/m

  # One iteration per replaced span. A loop is used instead of recursion so
  # that texts with very many spans cannot exhaust the call stack.
  loop do
    buffer = "".dup
    has_left = false

    while (str = scanner.scan_until(delimiter))
      if scanner[1] == left
        buffer << str
        has_left = true
      elsif has_left
        # buffer currently ends with the opening delimiter: drop it, then
        # append the block's replacement for the enclosed contents.
        buffer = buffer[0...-left.size]
        # str ends with the *closing* delimiter, so strip right.size chars.
        buffer << block.call(str[0...-right.size]).to_s
        break
      else
        buffer << str
      end
    end
    buffer << scanner.rest

    # A pass that changes nothing means there is nothing left to replace.
    return scanner.string if buffer == scanner.string

    scanner.string = buffer
  end
end
|
76
|
+
|
77
|
+
# Rewrites every {{...}} template found in +str+.
#
# A template whose contents span more than one line is replaced by a single
# newline; a single-line template is kept but re-tagged as
# "[tpl]contents[/tpl]".
#
# only_not_inline:: accepted but not consulted by this implementation.
def remove_templates(str, only_not_inline = true)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    contents.index("\n") ? "\n" : "[tpl]#{contents}[/tpl]"
  end
end
|
87
|
+
|
88
|
+
|
89
|
+
#################### methods used from format_wiki ####################
|
90
|
+
|
91
|
+
# Replaces each <nowiki>...</nowiki> section of +str+ with a placeholder of
# the form "<nowiki-ID>" and remembers the enclosed text in @nowikis under
# that ID, so unescape_nowiki can restore it later.
#
# The ID is the object_id of the captured string. @nowikis is emptied at the
# start of every call, so only the most recent call can be unescaped.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(/<nowiki>(.*?)<\/nowiki>/m) do
    protected_text = $1
    key = protected_text.object_id
    @nowikis[key] = protected_text
    "<nowiki-#{key}>"
  end
end
|
104
|
+
|
105
|
+
# Restores the text hidden by escape_nowiki: every "<nowiki-ID>" placeholder
# in +str+ is replaced by the string stored under ID in @nowikis.
# A placeholder whose ID is not in @nowikis is replaced by an empty string.
def unescape_nowiki(str)
  str.gsub(/<nowiki\-(\d+?)>/) { @nowikis[$1.to_i] }
end
|
111
|
+
|
112
|
+
# Replaces [[...]] wiki links in +str+ with their display text.
#
# "[[target]]" becomes "target"; "[[target|label]]" becomes "label". When
# there are several pipe-separated parts, everything after the first part is
# kept, still joined with "|".
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    labels.empty? ? (target || "") : labels.join("|")
  end
end
|
127
|
+
|
128
|
+
# Replaces [...] external links in +str+ with their display text.
#
# The bracket contents are split once on whitespace: "[url label text]"
# becomes "label text", "[url]" becomes "url", and empty brackets become "".
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    # With a single part, the last element is that part itself.
    contents.split(" ", 2).last || ""
  end
end
|
141
|
+
|
142
|
+
# Named HTML character entities recognised by special_chr, mapped to the
# character each one stands for.
SPECIAL_CHR_TABLE = {
  # characters with syntactic meaning in HTML, plus the non-breaking space
  '&nbsp;' => ' ', '&lt;' => '<', '&gt;' => '>', '&amp;' => '&', '&quot;' => '"',

  # accented Latin letters and ligatures
  '&Agrave;' => 'À', '&Aacute;' => 'Á', '&Acirc;' => 'Â', '&Atilde;' => 'Ã',
  '&Auml;' => 'Ä', '&Aring;' => 'Å', '&AElig;' => 'Æ', '&Ccedil;' => 'Ç',
  '&Egrave;' => 'È', '&Eacute;' => 'É', '&Ecirc;' => 'Ê', '&Euml;' => 'Ë',
  '&Igrave;' => 'Ì', '&Iacute;' => 'Í', '&Icirc;' => 'Î', '&Iuml;' => 'Ï',
  '&Ntilde;' => 'Ñ', '&Ograve;' => 'Ò', '&Oacute;' => 'Ó', '&Ocirc;' => 'Ô',
  '&Otilde;' => 'Õ', '&Ouml;' => 'Ö', '&Oslash;' => 'Ø', '&Ugrave;' => 'Ù',
  '&Uacute;' => 'Ú', '&Ucirc;' => 'Û', '&Uuml;' => 'Ü', '&szlig;' => 'ß',
  '&agrave;' => 'à', '&aacute;' => 'á', '&acirc;' => 'â', '&atilde;' => 'ã',
  '&auml;' => 'ä', '&aring;' => 'å', '&aelig;' => 'æ', '&ccedil;' => 'ç',
  '&egrave;' => 'è', '&eacute;' => 'é', '&ecirc;' => 'ê', '&euml;' => 'ë',
  '&igrave;' => 'ì', '&iacute;' => 'í', '&icirc;' => 'î', '&iuml;' => 'ï',
  '&ntilde;' => 'ñ', '&ograve;' => 'ò', '&oacute;' => 'ó', '&ocirc;' => 'ô',
  '&oelig;' => 'œ', '&otilde;' => 'õ', '&ouml;' => 'ö', '&oslash;' => 'ø',
  '&ugrave;' => 'ù', '&uacute;' => 'ú', '&ucirc;' => 'û', '&uuml;' => 'ü',
  '&yuml;' => 'ÿ',

  # punctuation
  '&iquest;' => '¿', '&iexcl;' => '¡', '&laquo;' => '«', '&raquo;' => '»',
  '&sect;' => '§', '&para;' => '¶', '&dagger;' => '†', '&Dagger;' => '‡',
  '&bull;' => '•', '&ndash;' => '–', '&mdash;' => '—',

  # commercial and currency symbols
  '&trade;' => '™', '&copy;' => '©', '&reg;' => '®', '&cent;' => '¢',
  '&euro;' => '€', '&yen;' => '¥', '&pound;' => '£', '&curren;' => '¤',

  # Greek letters
  '&alpha;' => 'α', '&beta;' => 'β', '&gamma;' => 'γ', '&delta;' => 'δ',
  '&epsilon;' => 'ε', '&zeta;' => 'ζ', '&eta;' => 'η', '&theta;' => 'θ',
  '&iota;' => 'ι', '&kappa;' => 'κ', '&lambda;' => 'λ', '&mu;' => 'μ',
  '&nu;' => 'ν', '&xi;' => 'ξ', '&omicron;' => 'ο', '&pi;' => 'π',
  '&rho;' => 'ρ', '&sigma;' => 'σ', '&sigmaf;' => 'ς', '&tau;' => 'τ',
  '&upsilon;' => 'υ', '&phi;' => 'φ', '&chi;' => 'χ', '&psi;' => 'ψ',
  '&omega;' => 'ω', '&Gamma;' => 'Γ', '&Delta;' => 'Δ', '&Theta;' => 'Θ',
  '&Lambda;' => 'Λ', '&Xi;' => 'Ξ', '&Pi;' => 'Π', '&Sigma;' => 'Σ',
  '&Phi;' => 'Φ', '&Psi;' => 'Ψ', '&Omega;' => 'Ω',

  # mathematical symbols and arrows
  '&int;' => '∫', '&sum;' => '∑', '&prod;' => '∏', '&radic;' => '√',
  '&minus;' => '−', '&plusmn;' => '±', '&infin;' => '∞', '&asymp;' => '≈',
  '&prop;' => '∝', '&equiv;' => '≡', '&ne;' => '≠', '&le;' => '≤',
  '&ge;' => '≥', '&times;' => '×', '&middot;' => '·', '&divide;' => '÷',
  '&part;' => '∂', '&prime;' => '′', '&Prime;' => '″', '&nabla;' => '∇',
  '&permil;' => '‰', '&deg;' => '°', '&there4;' => '∴', '&isin;' => '∈',
  '&cap;' => '∩', '&cup;' => '∪', '&sub;' => '⊂', '&sup;' => '⊃',
  '&sube;' => '⊆', '&supe;' => '⊇', '&not;' => '¬', '&and;' => '∧',
  '&or;' => '∨', '&exist;' => '∃', '&forall;' => '∀', '&rArr;' => '⇒',
  '&hArr;' => '⇔', '&rarr;' => '→', '&harr;' => '↔', '&uarr;' => '↑',
  '&alefsym;' => 'ℵ', '&notin;' => '∉',

  # diacritic marks, quotation marks, card suits and remaining arrows
  '&uml;' => '¨', '&ordf;' => 'ª', '&macr;' => '¯', '&acute;' => '´',
  '&micro;' => 'µ', '&cedil;' => '¸', '&ordm;' => 'º', '&lsquo;' => '‘',
  '&rsquo;' => '’', '&ldquo;' => '“', '&sbquo;' => '‚', '&rdquo;' => '”',
  '&bdquo;' => '„', '&spades;' => '♠', '&clubs;' => '♣', '&loz;' => '◊',
  '&hearts;' => '♥', '&larr;' => '←', '&diams;' => '♦', '&lsaquo;' => '‹',
  '&rsaquo;' => '›', '&darr;' => '↓'
}.freeze

# Matches any entity listed in SPECIAL_CHR_TABLE. Regexp.union escapes the
# keys, so no key can be misread as regex syntax.
SPECIAL_CHR_REGEX = Regexp.union(SPECIAL_CHR_TABLE.keys)

# Replaces the named HTML entities listed in SPECIAL_CHR_TABLE with the
# characters they denote.
#
# The replacement is done in a single pass, so "&amp;lt;" becomes "&lt;"
# and is not decoded a second time. Entities that are not in the table are
# left as they are.
#
# str:: the text to convert; it is modified in place.
#
# Returns +str+ itself.
def special_chr(str)
  str.gsub!(SPECIAL_CHR_REGEX) { |entity| SPECIAL_CHR_TABLE[entity] }
  str
end
|
210
|
+
|
211
|
+
# Removes delimiter-enclosed tags from +str+.
#
# tagset:: two-element array [opening, closing]. With the default
#          ['<', '>'] the work is delegated to remove_html_tag. For any
#          other pair, every span that starts with the opening delimiter,
#          ends with the closing delimiter and contains none of the
#          delimiters' characters in between is deleted.
#
# Returns a new string.
def remove_tag(str, tagset = ['<', '>'])
  return remove_html_tag(str) if tagset == ['<', '>']

  opening = Regexp.escape(tagset[0])
  closing = Regexp.escape(tagset[1])
  # Characters that may not appear between the two delimiters.
  forbidden = Regexp.quote(tagset.uniq.join(""))
  str.gsub(/#{opening}[^#{forbidden}]*#{closing}/, "")
end
|
221
|
+
|
222
|
+
# Returns +str+ as processed by ::Sanitize.clean (from the 'sanitize' gem),
# called with its default configuration -- presumably stripping HTML markup
# and keeping only the text.
def remove_html_tag(str)
  ::Sanitize.clean(str)
end
|
225
|
+
|
226
|
+
# Strips wiki emphasis markers: text wrapped in a run of two or more
# apostrophes (''italic'', '''bold''', ...) and closed by an identical run
# on the same line is replaced by the enclosed text.
def remove_emphasis(str)
  str.gsub(/(''+)(.+?)\1/, '\2')
end
|
231
|
+
|
232
|
+
# Replaces numeric character references in +num_str+ with the characters
# they denote: "&#9834;" (decimal) and "&#x266A;" (hexadecimal) both become
# the corresponding UTF-8 character.
#
# Each reference is decoded on its own. A reference that does not name a
# valid Unicode scalar value (a surrogate, a value above U+10FFFF, or a
# decimal reference containing hex letters) is left in the text unchanged
# instead of preventing the rest of the string from being decoded.
#
# Returns a new string. If the substitution raises a StandardError (for
# example because +num_str+ is not a valid string), +num_str+ is returned
# as given.
def chrref_to_utf(num_str)
  num_str.gsub(/&#(x?)([0-9a-fA-F]+);/) do |reference|
    hex = ($1 == 'x')
    digits = $2
    # A decimal reference must consist of decimal digits only.
    next reference unless hex || digits =~ /\A\d+\z/

    codepoint = digits.to_i(hex ? 16 : 10)
    if codepoint > 0x10FFFF || (0xD800..0xDFFF).include?(codepoint)
      reference
    else
      # The "U" directive encodes the code point directly as UTF-8, which
      # also covers characters outside the Basic Multilingual Plane.
      [codepoint].pack("U")
    end
  end
rescue StandardError
  num_str
end
|
250
|
+
|
251
|
+
# Removes double-underscore directives such as "__abc__" from +str+ by
# calling remove_tag with "__" as both the opening and closing delimiter.
def remove_directive(str)
  directive_delimiters = ['__', '__']
  remove_tag(str, directive_delimiters)
end
|
254
|
+
|
255
|
+
# Replaces the brace-wrapped dash markers "{mdash}", "{ndash}" and "{–}"
# in +str+ with an en dash. Returns a new string.
def mndash(str)
  str.gsub(/\{(mdash|ndash|–)\}/) { "–" }
end
|
258
|
+
|
259
|
+
# Deletes horizontal-rule lines from +page+: any line consisting solely of
# hyphens, optionally surrounded by whitespace. Returns a new string.
def remove_hr(page)
  page.gsub(/^\s*\-+\s*$/) { "" }
end
|
262
|
+
|
263
|
+
# Normalises reference markup in +str+ and returns the result as a new
# string; the substitutions are applied in this order:
#
# 1. "<br/>" and "<br />" become a newline,
# 2. self-closing "<ref ... />" tags are removed,
# 3. opening "<ref ...>" tags become "[ref]",
# 4. closing "</ref>" tags become "[/ref]".
def make_reference(str)
  substitutions = [
    [/<br ?\/>/, "\n"],
    [/<ref[^>]*\/>/, ""],
    [/<ref[^>]*>/, "[ref]"],
    [/<\/ref>/, "[/ref]"]
  ]
  substitutions.inject(str) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
|
271
|
+
|
272
|
+
# Unwraps every "[ref]...[/ref]" section of +page+: the markers are dropped
# and, inside the section, each run of line breaks and each "<br/>" /
# "<br />" is replaced by a single space. Returns a new string.
def format_ref(page)
  page.gsub(/\[ref\](.*?)\[\/ref\]/m) do
    $1.gsub(/(?:[\r\n]+|<br ?\/>)/, " ")
  end
end
|
278
|
+
|
279
|
+
#################### methods currently unused ####################
|
280
|
+
|
281
|
+
# Replaces every {{...}} template in +str+ with one of its pipe-separated
# parts:
#
# * no parts at all      -> ""
# * exactly one part     -> that part
# * last part has a "="  -> the first part (the last one looks like a
#                           named argument)
# * otherwise            -> the last part
def process_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    fields = contents.split("|")
    if fields.empty?
      ""
    elsif fields.size == 1 || fields.last.split("=").size > 1
      fields.first || ""
    else
      fields.last || ""
    end
  end
end
|
300
|
+
|
301
|
+
# Removes wiki tables ("{| ... |}") from +str+, including nested ones.
#
# Each pass deletes the tables that contain no "{", "|" or "}" between
# their delimiters; passes are repeated until the text stops changing, so
# enclosing tables are removed once their inner tables are gone.
#
# Returns a new string.
def remove_table(str)
  current = str
  loop do
    stripped = current.gsub(/\{\|[^\{\|\}]*?\|\}/m, "")
    return stripped if stripped == current

    current = stripped
  end
end
|
309
|
+
|
310
|
+
# Removes "{{clade ...}}" / "{{Clade ...}}" templates from +page+.
#
# Only templates without braces in their body are matched on a given pass;
# passes repeat until the text no longer changes. Returns a new string.
def remove_clade(page)
  current = page
  loop do
    reduced = current.gsub(/\{\{(?:C|c)lade[^\{\}]*\}\}/m, "")
    return reduced if reduced == current

    current = reduced
  end
end
|
315
|
+
|
316
|
+
# Simplifies single-line {{...}} templates in +str+.
#
# * A template without any "|" is replaced by its contents.
# * A template whose first part matches one of the known formatting
#   templates (lang, IPA, audio, small, nowrap, ...) is replaced by its
#   last part.
# * Any other template is rewritten with single braces, its parts chomped
#   and re-joined with "|".
#
# NOTE(review): the first pattern is /\Alang*/i, which matches any name that
# starts with "lan"; possibly /\Alang/ was intended -- confirm before changing.
def remove_inline_template(str)
  unwrapped_types = [
    /\Alang*/i, /\AIPA/i, /\AIEP/i, /\ASEP/i, /\Aindent/i, /\Aaudio/i, /\Asmall/i,
    /\Admoz/i, /\Apron/i, /\Aunicode/i, /\Anote label/i, /\Anowrap/i,
    /\AArabDIN/i, /\Atrans/i, /\ANihongo/i, /\APolytonic/i
  ]

  str.gsub(/\{\{(.*?)\}\}/) do
    key = $1
    next key if /\A[^\|]+\z/ =~ key

    info = key.split("|")
    if unwrapped_types.any? { |pattern| pattern === info.first }
      info[-1]
    else
      "{" + info.map { |part| part.chomp }.join("|") + "}"
    end
  end
end
|
336
|
+
|
337
|
+
#################### file related utilities ####################
|
338
|
+
|
339
|
+
# collect filenames recursively
|
340
|
+
# Walks +str+ recursively with Find.find and returns, sorted, every visited
# path that matches +regex+.
#
# str::   starting path; it is visited itself, and directories are
#         candidates just like files.
# regex:: pattern a path must match; nil (the default) accepts every path.
def collect_files(str, regex = nil)
  pattern = regex || //
  matched = []
  Find.find(str) { |path| matched.push(path) if pattern =~ path }
  matched.sort
end
|
348
|
+
|
349
|
+
# modify a file using block/yield mechanism
|
350
|
+
# Rewrites the file at +file_path+ with the value returned by the block.
#
# The block receives the file's current contents; if it returns nil the
# contents are written back unchanged. The new contents are first written
# to a temporary file next to the target and then renamed over it. Keeping
# the temporary file in the same directory means the rename never has to
# cross filesystems and no file in the current working directory is touched.
#
# backup:: when true, the original file is kept as "<file_path>.bak";
#          when false, an existing ".bak" file is left untouched.
def file_mod(file_path, backup = false, &block)
  original = File.open(file_path, "r") { |fr| fr.read }
  modified = yield(original)
  modified = original if modified.nil?

  # The process id keeps concurrent runs from sharing a temporary file.
  temp_path = "#{file_path}.#{Process.pid}.tmp"
  begin
    File.open(temp_path, "w") { |tf| tf.write(modified) }
    File.rename(file_path, file_path + ".bak") if backup
    File.rename(temp_path, file_path)
  ensure
    # Only still present when one of the steps above failed.
    File.unlink(temp_path) if File.exist?(temp_path)
  end
end
|
364
|
+
|
365
|
+
# modify files under a directory (recursive)
|
366
|
+
# Yields the path of every regular file found at or below +dir_path+.
#
# If +dir_path+ is a directory, the paths returned by collect_files are
# walked and each regular file among them is yielded. Otherwise +dir_path+
# itself is yielded, provided it is a regular file.
def batch_file_mod(dir_path, &block)
  if !FileTest.directory?(dir_path)
    yield dir_path if FileTest.file?(dir_path)
  else
    collect_files(dir_path).each do |path|
      yield path if FileTest.file?(path)
    end
  end
end
|
375
|
+
|
376
|
+
# take care of difference of separators among environments
|
377
|
+
# Converts path separators in +input+ to the ones used by the current
# platform.
#
# On platforms where Ruby defines File::ALT_SEPARATOR (Windows builds,
# including mingw ones) every "/" becomes "\"; everywhere else every "\"
# becomes "/".
#
# input:: a String, or an Array (possibly nested) of such inputs.
#
# Returns a new String or Array of the same shape; returns nil for any
# other kind of input.
def correct_separator(input)
  case input
  when String
    if File::ALT_SEPARATOR
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
|
394
|
+
|
395
|
+
# Renames each path in +files+ to "<path>.txt", zero-padding a trailing
# "-<number>" suffix so that all numbered files end up with numbers of the
# same width (the width of the longest number found in +files+).
#
# Paths without such a suffix only receive the ".txt" extension.
def rename(files)
  numeric_suffix = /\-(\d+)\z/

  # num of digits necessary to name the last file generated
  maxwidth = 0
  files.each do |f|
    digits = f.slice(numeric_suffix, 1).to_s.length
    maxwidth = digits if digits > maxwidth
  end

  files.each do |f|
    padded = f.sub(numeric_suffix) { format("-%0#{maxwidth}d", $1.to_i) }
    File.rename(f, padded + ".txt")
  end
end
|
411
|
+
|
412
|
+
# convert int of seconds to string in the format 00:00:00
|
413
|
+
# Formats a number of seconds as "HH:MM:SS".
#
# int:: number of seconds. A fractional value is truncated to whole seconds
#       first, so hours and minutes are always computed with integer
#       arithmetic. nil or false yields the placeholder "--:--:--".
def sec_to_str(int)
  return "--:--:--" unless int

  total = int.to_i
  hours, remainder = total.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
|
424
|
+
|
425
|
+
# Returns +i+ as a string with a comma between every group of three digits
# of its integer part, e.g. 1234567 -> "1,234,567".
#
# A leading minus sign and anything following the integer digits (such as a
# fractional part) are kept as they are. Input whose string form does not
# start with an optionally signed run of digits is returned unformatted.
def decimal_format(i)
  text = i.to_s
  match = /\A(-?)(\d+)(.*)\z/m.match(text)
  return text unless match

  sign, digits, rest = match[1], match[2], match[3]
  # Grouping from the right is done by grouping the reversed digits from
  # the left and reversing back.
  grouped = digits.reverse.scan(/\d{1,3}/).join(',').reverse
  sign + grouped + rest
end
|
429
|
+
|
430
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/spec/utils_spec.rb
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
5
|
+
require 'wp2txt'
|
6
|
+
require 'wp2txt/article'
|
7
|
+
require 'wp2txt/utils'
|
8
|
+
|
9
|
+
# Examples for the text-formatting helpers of the Wp2txt module. The module
# is mixed into the example group so the helpers can be called directly.
describe "Wp2txt" do
  it "contains mediawiki-format related functions:" do
  end

  include Wp2txt

  describe "process_nested_structure" do
    it "parse nested structure replacing str in the format specified" do
      str_before = "[[ab[[cde[[alfa]]]]fg]]"
      str_after  = "<<ab<<cde<<alfa>>>>fg>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "[[", "]]") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after

      # The continuation lines of the next two literals start at column 0 on
      # purpose: leading whitespace would become part of the strings.
      str_before = "#* {{quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage={{...}} every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.}}"
      str_after = "#* <<quote-book|1503|year_published=1836|chapter=19 Henry VII. c. 5: Coin||A Collection of Statutes Connected with the General Administration of the Law|page=158|url=http://books.google.com/books?id=QtYuAAAAIAAJ
|passage=<<...>> every of them, being gold, whole and weight, shall '''go''' and be current in payment throughout this his realm for the sum that they were coined for.>>"
      scanner = StringScanner.new(str_before)
      str_processed = process_nested_structure(scanner, "{{", "}}") do |content|
        "<<" + content + ">>"
      end
      str_processed.should == str_after
    end
  end

  describe "special_chr" do
    it "replaces character references with real characters" do
      # The input must hold the entity names themselves, not the characters.
      str_before = "&nbsp; &lt; &gt; &amp; &quot;"
      str_after  = "  < > & \""
      special_chr(str_before).should == str_after
    end
  end

  describe "chrref_to_utf" do
    it "replaces character references with real characters" do
      # &#9834; is the numeric reference for U+266A (eighth note).
      str_before = "&#9834;"
      str_after  = "♪"
      chrref_to_utf(str_before).should == str_after
    end
  end

  describe "mndash" do
    it "replaces {mdash}, {ndash}, or {–} with '–'" do
      str_before = "{mdash} {ndash} {–}"
      str_after  = "– – –"
      mndash(str_before).should == str_after
    end
  end

  describe "format_ref" do
    it "replaces \\r\\n and <br /> inside [ref] ... [/ref] to ' '" do
      str_before = "[ref]...\r\n...<br />...[/ref]"
      str_after  = "... ... ..."
      format_ref(str_before).should == str_after
    end
  end

  describe "make_reference" do
    it "replaces <ref> tag with [ref]" do
      str_before = "<ref> ... <br /> ... </ref> \n <ref />"
      str_after  = "[ref] ... \n ... [/ref] \n "
      make_reference(str_before).should == str_after
    end
  end

  describe "remove_table" do
    it "removes table formated parts" do
      str_before = "{| ... \n{| ... \n ...|}\n ...|}"
      str_after  = ""
      remove_table(str_before).should == str_after
    end
  end

  describe "remove_clade" do
    it "removes clade formated parts" do
      str_before = "\{\{clade ... \n ... \n ... \n\}\}"
      str_after  = ""
      remove_clade(str_before).should == str_after
    end
  end

  describe "remove_hr" do
    it "removes horizontal lines" do
      str_before = "\n----\n--\n--\n"
      str_after  = "\n\n"
      remove_hr(str_before).should == str_after
    end
  end

  describe "remove_tag" do
    it "removes tags" do
      str_before = "<tag>abc</tag>"
      str_after  = "abc"
      remove_tag(str_before).should == str_after
      str_before = "[tag]def[/tag]"
      str_after  = "def"
      remove_tag(str_before, ['[', ']']).should == str_after
    end
  end

  describe "remove_directive" do
    it "removes directive" do
      str_before = "__abc__\n __def__"
      str_after  = "\n "
      remove_directive(str_before).should == str_after
    end
  end

  describe "remove_emphasis" do
    it "removes emphasis" do
      str_before = "''abc''\n'''def'''"
      str_after  = "abc\ndef"
      remove_emphasis(str_before).should == str_after
    end
  end

  describe "escape_nowiki" do
    it "replaces <nowiki>...</nowiki> with <nowiki-object_id>" do
      str_before = "<nowiki>[[abc]]</nowiki>def<nowiki>[[ghi]]</nowiki>"
      str_after  = Regexp.new("<nowiki-\\d+>def<nowiki-\\d+>")
      escape_nowiki(str_before).should =~ str_after
    end
  end

  describe "unescape_nowiki" do
    it "replaces <nowiki-object_id> with string stored elsewhere" do
      @nowikis = {123 => "[[abc]]", 124 => "[[ghi]]"}
      str_before = "<nowiki-123>def<nowiki-124>"
      str_after  = "[[abc]]def[[ghi]]"
      unescape_nowiki(str_before).should == str_after
    end
  end

  describe "process_interwiki_links" do
    it "formats text link and remove brackets" do
      process_interwiki_links("[[a b]]").should == "a b"
      process_interwiki_links("[[a b|c]]").should == "c"
      process_interwiki_links("[[a|b|c]]").should == "b|c"
      process_interwiki_links("[[硬口蓋鼻音|[ɲ], /J/]]").should == "[ɲ], /J/"
    end
  end

  describe "process_external_links" do
    it "formats text link and remove brackets" do
      process_external_links("[http://yohasebe.com yohasebe.com]").should == "yohasebe.com"
      process_external_links("[http://yohasebe.com]").should == "http://yohasebe.com"
      process_external_links("* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}").should == "* Turkish: {{t+|tr|köken bilimi}}]], {{t+|tr|etimoloji}}"
    end
  end

  describe "process_template" do
    it "removes brackets and leaving some text" do
      str_before = "{{}}"
      str_after  = ""
      process_template(str_before).should == str_after
      str_before = "{{lang|en|Japan}}"
      str_after  = "Japan"
      process_template(str_before).should == str_after
      str_before = "{{a|b=c|d=f}}"
      str_after  = "a"
      process_template(str_before).should == str_after
      str_before = "{{a|b|{{c|d|e}}}}"
      str_after  = "e"
      process_template(str_before).should == str_after
    end
  end

  describe "expand_template" do
    # NOTE(review): per its description this example queries the MediaWiki
    # API at the given URI, so it presumably needs network access and depends
    # on the live page content -- confirm before relying on it in CI.
    it "gets data corresponding to a given template using mediawiki api" do
      uri = "http://en.wiktionary.org/w/api.php"
      template = "{{en-verb}}"
      word = "kick"
      expanded = expand_template(uri, template, word)
      html =<<EOD
<span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
EOD
      html.strip!
      expanded.should == html
    end
  end
end
|