wp2txt 0.7.0 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/benchmark.rb +10 -8
- data/bin/wp2txt +4 -12
- data/error_log.txt +1 -0
- data/lib/wp2txt/utils.rb +101 -123
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +43 -42
- data/wp2txt.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 911e08e181a6bedb664b797d49183d0988daeba5
|
4
|
+
data.tar.gz: 076d1349a8aa8cf454dac42bdce7b89a82f3fca0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ebc035e4f1635f150294d8b79eb474457a280707a416688f3e7712bb7788d15888b6718bfd6f4e3a790e6fb8a7623e1415255fde913bfe658dd237fa7f599cd
|
7
|
+
data.tar.gz: ccee00a9e1b85186d52d0b3c07b52c04fff1ecd133ff245010943312cf37e279874b5f3a757880c005ad877e957df6a4176af2269f40b3c3210951530eb4c511
|
data/bin/benchmark.rb
CHANGED
@@ -22,12 +22,13 @@ Benchmark.bm do |x|
|
|
22
22
|
x.report do
|
23
23
|
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
24
24
|
wpconv.extract_text do |article|
|
25
|
-
|
26
|
-
title = "[[#{title}]]\n"
|
25
|
+
format_wiki!(article.title)
|
26
|
+
title = "[[#{article.title}]]\n"
|
27
|
+
convert_characters!(title)
|
27
28
|
|
28
|
-
|
29
|
-
|
30
|
-
|
29
|
+
contents = "\nCATEGORIES: "
|
30
|
+
contents += article.categories.join(", ")
|
31
|
+
contents += "\n\n"
|
31
32
|
|
32
33
|
article.elements.each do |e|
|
33
34
|
case e.first
|
@@ -55,10 +56,11 @@ Benchmark.bm do |x|
|
|
55
56
|
else
|
56
57
|
next
|
57
58
|
end
|
58
|
-
contents
|
59
|
-
remove_templates!(contents)
|
59
|
+
contents << line
|
60
60
|
end
|
61
|
-
|
61
|
+
format_article!(contents)
|
62
|
+
convert_characters!(contents)
|
63
|
+
|
62
64
|
##### cleanup #####
|
63
65
|
if /\A\s*\z/m =~ contents
|
64
66
|
result = ""
|
data/bin/wp2txt
CHANGED
@@ -50,6 +50,7 @@ convert = opts[:convert]
|
|
50
50
|
strip_tmarker = opts[:marker] ? false : true
|
51
51
|
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
52
|
$leave_template = true if opts[:template]
|
53
|
+
$leave_table = true if opts[:table]
|
53
54
|
config = {}
|
54
55
|
opt_array.each do |opt|
|
55
56
|
config[opt] = opts[opt]
|
@@ -61,6 +62,7 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert,
|
|
61
62
|
wpconv.extract_text do |article|
|
62
63
|
format_wiki!(article.title)
|
63
64
|
title = "[[#{article.title}]]\n"
|
65
|
+
convert_characters!(title)
|
64
66
|
|
65
67
|
if opts[:category] && !article.categories.empty?
|
66
68
|
contents = "\nCATEGORIES: "
|
@@ -118,18 +120,8 @@ wpconv.extract_text do |article|
|
|
118
120
|
end
|
119
121
|
contents << line
|
120
122
|
end
|
121
|
-
|
122
|
-
|
123
|
-
remove_emphasis!(contents)
|
124
|
-
mndash!(contents)
|
125
|
-
make_reference!(contents)
|
126
|
-
format_ref!(contents)
|
127
|
-
remove_hr!(contents)
|
128
|
-
remove_tag!(contents)
|
129
|
-
special_chr!(contents)
|
130
|
-
|
131
|
-
correct_inline_template!(contents) unless $leave_template
|
132
|
-
remove_templates!(contents) unless $leave_template
|
123
|
+
format_article!(contents)
|
124
|
+
convert_characters!(contents)
|
133
125
|
|
134
126
|
##### cleanup #####
|
135
127
|
if /\A\s*\z/m =~ contents
|
data/error_log.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
[[アンパサンド]]
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
require 'strscan'
|
5
5
|
require 'find'
|
6
|
+
require 'htmlentities'
|
6
7
|
|
7
8
|
###################################################
|
8
9
|
# global variables to save resource for generating regexps
|
@@ -10,6 +11,12 @@ require 'find'
|
|
10
11
|
# those with a trailing number 2 represent closing tag/markup
|
11
12
|
# those without a trailing number contain both opening/closing tags/markups
|
12
13
|
|
14
|
+
$html_decoder = HTMLEntities.new
|
15
|
+
|
16
|
+
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
|
+
$html_hash = Hash[*$entities.flatten]
|
18
|
+
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
+
|
13
20
|
$in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
14
21
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
15
22
|
|
@@ -43,6 +50,9 @@ $blank_line_regex = Regexp.new('^\s*$')
|
|
43
50
|
|
44
51
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
52
|
|
53
|
+
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
|
+
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
+
|
46
56
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
47
57
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
48
58
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -58,8 +68,8 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
|
58
68
|
$pre_marks_regex = Regexp.new('\A\^\ ')
|
59
69
|
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
60
70
|
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
61
|
-
$remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
62
|
-
$remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
71
|
+
# $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
72
|
+
# $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
63
73
|
|
64
74
|
$category_patterns = ["Category", "Categoria"].join("|")
|
65
75
|
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
@@ -74,22 +84,16 @@ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escap
|
|
74
84
|
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
75
85
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
76
86
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
77
|
-
|
87
|
+
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
78
88
|
###################################################
|
79
89
|
|
80
90
|
module Wp2txt
|
81
91
|
|
82
|
-
def
|
92
|
+
def convert_characters!(text, has_retried = false)
|
83
93
|
begin
|
84
94
|
text << ""
|
85
|
-
|
86
95
|
chrref_to_utf!(text)
|
87
|
-
|
88
|
-
|
89
|
-
process_interwiki_links!(text)
|
90
|
-
process_external_links!(text)
|
91
|
-
|
92
|
-
unescape_nowiki!(text)
|
96
|
+
special_chr!(text)
|
93
97
|
|
94
98
|
rescue # detect invalid byte sequence in UTF-8
|
95
99
|
if has_retried
|
@@ -102,11 +106,34 @@ module Wp2txt
|
|
102
106
|
else
|
103
107
|
text.encode!("UTF-16")
|
104
108
|
text.encode!("UTF-8")
|
105
|
-
|
109
|
+
convert_characters!(text, true)
|
106
110
|
end
|
107
111
|
end
|
108
112
|
end
|
113
|
+
|
114
|
+
def format_wiki!(text, has_retried = false)
|
115
|
+
escape_nowiki!(text)
|
116
|
+
|
117
|
+
process_interwiki_links!(text)
|
118
|
+
process_external_links!(text)
|
109
119
|
|
120
|
+
unescape_nowiki!(text)
|
121
|
+
end
|
122
|
+
|
123
|
+
def format_article!(text)
|
124
|
+
remove_directive!(text)
|
125
|
+
remove_emphasis!(text)
|
126
|
+
mndash!(text)
|
127
|
+
make_reference!(text)
|
128
|
+
format_ref!(text)
|
129
|
+
remove_hr!(text)
|
130
|
+
remove_tag!(text)
|
131
|
+
convert_characters!(text)
|
132
|
+
correct_inline_template!(text) unless $leave_template
|
133
|
+
remove_templates!(text) unless $leave_template
|
134
|
+
remove_table!(text) unless $leave_table
|
135
|
+
end
|
136
|
+
|
110
137
|
#################### parser for nested structure ####################
|
111
138
|
|
112
139
|
def process_nested_structure(scanner, left, right, recur_count, &block)
|
@@ -120,6 +147,8 @@ module Wp2txt
|
|
120
147
|
regex = $single_curly_bracket_regex
|
121
148
|
elsif left == "{{" && right == "}}"
|
122
149
|
regex = $double_curly_bracket_regex
|
150
|
+
elsif left == "{|" && right == "|}"
|
151
|
+
regex = $curly_square_bracket_regex
|
123
152
|
else
|
124
153
|
regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
|
125
154
|
end
|
@@ -154,15 +183,6 @@ module Wp2txt
|
|
154
183
|
end
|
155
184
|
|
156
185
|
#################### methods used from format_wiki ####################
|
157
|
-
|
158
|
-
def remove_templates!(str)
|
159
|
-
scanner = StringScanner.new(str)
|
160
|
-
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
161
|
-
""
|
162
|
-
end
|
163
|
-
str.replace(result)
|
164
|
-
end
|
165
|
-
|
166
186
|
def escape_nowiki!(str)
|
167
187
|
if @nowikis
|
168
188
|
@nowikis.clear
|
@@ -213,80 +233,42 @@ module Wp2txt
|
|
213
233
|
str.replace(result)
|
214
234
|
end
|
215
235
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
'Å', 'Æ', 'Ç', 'È', 'É', 'Ê',
|
223
|
-
'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ñ',
|
224
|
-
'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
|
225
|
-
'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
|
226
|
-
'á', 'â', 'ã', 'ä', 'å', 'æ',
|
227
|
-
'ç', 'è', 'é', 'ê', 'ë', 'ì',
|
228
|
-
'í', 'î', 'ï', 'ñ', 'ò', 'ó',
|
229
|
-
'ô', 'œ', 'õ', 'ö', 'ø', 'ù',
|
230
|
-
'ú', 'û', 'ü', 'ÿ']\
|
231
|
-
.zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
|
232
|
-
'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
|
233
|
-
'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
|
234
|
-
'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
|
235
|
-
|
236
|
-
punctuation = ['¿', '¡', '«', '»', '§',
|
237
|
-
'¶', '†', '‡', '•', '–', '—']\
|
238
|
-
.zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
|
239
|
-
|
240
|
-
commercial = ['™', '©', '®', '¢', '€', '¥',
|
241
|
-
'£', '¤'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
|
242
|
-
|
243
|
-
greek_chr = ['α', 'β', 'γ', 'δ', 'ε',
|
244
|
-
'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
|
245
|
-
'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς',
|
246
|
-
'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'Γ',
|
247
|
-
'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ',
|
248
|
-
'Ψ', 'Ω']\
|
249
|
-
.zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
|
250
|
-
'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
|
251
|
-
'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
|
252
|
-
|
253
|
-
math_chr1 = ['∫', '∑', '∏', '√', '−', '±',
|
254
|
-
'∞', '≈', '∝', '≡', '≠', '≤', '≥',
|
255
|
-
'×', '·', '÷', '∂', '′', '″',
|
256
|
-
'∇', '‰', '°', '∴', 'ø', '∈', '∩',
|
257
|
-
'∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨',
|
258
|
-
'∃', '∀', '⇒', '⇔', '→', '↔', '↑']\
|
259
|
-
.zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
|
260
|
-
'≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
|
261
|
-
'∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
|
262
|
-
'⇔', '→', '↔', '↑'])
|
263
|
-
|
264
|
-
math_chr2 = ['ℵ', '∉'].zip(['ℵ', '∉'])
|
265
|
-
|
266
|
-
others = ['¨', 'ª',
|
267
|
-
'¯', '´', 'µ', '¸', 'º', '‘', '’',
|
268
|
-
'“', '‚', '”', '„', '♠', '♣', '◊',
|
269
|
-
'♥', '←', '♦', '‹', '›', '↓']\
|
270
|
-
.zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
|
271
|
-
'„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
|
272
|
-
|
273
|
-
spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
|
274
|
-
math_chr1 + math_chr2 + others
|
275
|
-
$sp_hash = Hash[*spc_array.flatten]
|
276
|
-
$sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
|
236
|
+
#################### methods used from format_article ####################
|
237
|
+
|
238
|
+
def remove_templates!(str)
|
239
|
+
scanner = StringScanner.new(str)
|
240
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
241
|
+
""
|
277
242
|
end
|
278
|
-
|
279
|
-
|
280
|
-
|
243
|
+
str.replace(result)
|
244
|
+
end
|
245
|
+
|
246
|
+
def remove_table!(str)
|
247
|
+
scanner = StringScanner.new(str)
|
248
|
+
result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
|
249
|
+
""
|
281
250
|
end
|
251
|
+
str.replace(result)
|
252
|
+
end
|
253
|
+
|
254
|
+
def special_chr!(str)
|
255
|
+
str.replace $html_decoder.decode(str)
|
282
256
|
end
|
283
257
|
|
284
|
-
def
|
258
|
+
def remove_inbetween!(str, tagset = ['<', '>'])
|
285
259
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
286
260
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
287
261
|
str.gsub!(regex, "")
|
288
262
|
end
|
289
263
|
|
264
|
+
def remove_tag!(str)
|
265
|
+
str.gsub!($remove_tag_regex, "")
|
266
|
+
end
|
267
|
+
|
268
|
+
def remove_directive!(str)
|
269
|
+
str.gsub!($remove_directives_regex, "")
|
270
|
+
end
|
271
|
+
|
290
272
|
def remove_emphasis!(str)
|
291
273
|
str.gsub!($remove_emphasis_regex) do
|
292
274
|
$2
|
@@ -311,10 +293,6 @@ module Wp2txt
|
|
311
293
|
end
|
312
294
|
return true
|
313
295
|
end
|
314
|
-
|
315
|
-
def remove_directive!(str)
|
316
|
-
remove_tag!(str, ['__', '__'])
|
317
|
-
end
|
318
296
|
|
319
297
|
def mndash!(str)
|
320
298
|
str.gsub!($mndash_regex, "–")
|
@@ -364,40 +342,40 @@ module Wp2txt
|
|
364
342
|
|
365
343
|
#################### methods currently unused ####################
|
366
344
|
|
367
|
-
def process_template(str)
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
end
|
386
|
-
|
387
|
-
def remove_table(str)
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
end
|
345
|
+
# def process_template(str)
|
346
|
+
# scanner = StringScanner.new(str)
|
347
|
+
# result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
348
|
+
# parts = contents.split("|")
|
349
|
+
# case parts.size
|
350
|
+
# when 0
|
351
|
+
# ""
|
352
|
+
# when 1
|
353
|
+
# parts.first || ""
|
354
|
+
# else
|
355
|
+
# if parts.last.split("=").size > 1
|
356
|
+
# parts.first || ""
|
357
|
+
# else
|
358
|
+
# parts.last || ""
|
359
|
+
# end
|
360
|
+
# end
|
361
|
+
# end
|
362
|
+
# result
|
363
|
+
# end
|
364
|
+
|
365
|
+
# def remove_table(str)
|
366
|
+
# new_str = str.gsub($remove_table_regex, "")
|
367
|
+
# if str != new_str
|
368
|
+
# new_str = remove_table(new_str)
|
369
|
+
# end
|
370
|
+
# new_str = remove_table(new_str) unless str == new_str
|
371
|
+
# return new_str
|
372
|
+
# end
|
395
373
|
|
396
|
-
def remove_clade(page)
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
end
|
374
|
+
# def remove_clade(page)
|
375
|
+
# new_page = page.gsub($remove_clade_regex, "")
|
376
|
+
# new_page = remove_clade(new_page) unless page == new_page
|
377
|
+
# new_page
|
378
|
+
# end
|
401
379
|
|
402
380
|
#################### file related utilities ####################
|
403
381
|
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -44,7 +44,7 @@ describe "Wp2txt" do
|
|
44
44
|
describe "special_chr!" do
|
45
45
|
it "replaces character references with real characters" do
|
46
46
|
str_before = " < > & ""
|
47
|
-
str_after = "
|
47
|
+
str_after = " < > & \""
|
48
48
|
special_chr!(str_before)
|
49
49
|
expect(str_before).to eq str_after
|
50
50
|
end
|
@@ -77,21 +77,22 @@ describe "Wp2txt" do
|
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
80
|
-
describe "remove_table" do
|
80
|
+
describe "remove_table!" do
|
81
81
|
it "removes table formated parts" do
|
82
82
|
str_before = "{| ... \n{| ... \n ...|}\n ...|}"
|
83
83
|
str_after = ""
|
84
|
-
|
84
|
+
remove_table!(str_before)
|
85
|
+
expect(str_before).to eq str_after
|
85
86
|
end
|
86
87
|
end
|
87
88
|
|
88
|
-
describe "remove_clade" do
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
end
|
89
|
+
# describe "remove_clade" do
|
90
|
+
# it "removes clade formated parts" do
|
91
|
+
# str_before = "\{\{clade ... \n ... \n ... \n\}\}"
|
92
|
+
# str_after = ""
|
93
|
+
# expect(remove_clade(str_before)).to eq str_after
|
94
|
+
# end
|
95
|
+
# end
|
95
96
|
|
96
97
|
describe "remove_hr!" do
|
97
98
|
it "removes horizontal lines" do
|
@@ -102,15 +103,15 @@ describe "Wp2txt" do
|
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
describe "
|
106
|
-
it "removes tags" do
|
106
|
+
describe "remove_inbetween!" do
|
107
|
+
it "removes tags and its contents" do
|
107
108
|
str_before = "<tag>abc</tag>"
|
108
109
|
str_after = "abc"
|
109
110
|
remove_tag!(str_before)
|
110
111
|
expect(str_before).to eq str_after
|
111
112
|
str_before = "[tag]def[/tag]"
|
112
113
|
str_after = "def"
|
113
|
-
|
114
|
+
remove_inbetween!(str_before, ['[', ']'])
|
114
115
|
expect(str_before).to eq str_after
|
115
116
|
end
|
116
117
|
end
|
@@ -183,34 +184,34 @@ describe "Wp2txt" do
|
|
183
184
|
end
|
184
185
|
end
|
185
186
|
|
186
|
-
describe "process_template" do
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
187
|
+
# describe "process_template" do
|
188
|
+
# it "removes brackets and leaving some text" do
|
189
|
+
# str_before = "{{}}"
|
190
|
+
# str_after = ""
|
191
|
+
# expect(process_template(str_before)).to eq str_after
|
192
|
+
# str_before = "{{lang|en|Japan}}"
|
193
|
+
# str_after = "Japan"
|
194
|
+
# expect(process_template(str_before)).to eq str_after
|
195
|
+
# str_before = "{{a|b=c|d=f}}"
|
196
|
+
# str_after = "a"
|
197
|
+
# expect(process_template(str_before)).to eq str_after
|
198
|
+
# str_before = "{{a|b|{{c|d|e}}}}"
|
199
|
+
# str_after = "e"
|
200
|
+
# expect(process_template(str_before)).to eq str_after
|
201
|
+
# end
|
202
|
+
# end
|
202
203
|
|
203
|
-
# describe "expand_template" do
|
204
|
-
# it "gets data corresponding to a given template using mediawiki api" do
|
205
|
-
# uri = "http://en.wiktionary.org/w/api.php"
|
206
|
-
# template = "{{en-verb}}"
|
207
|
-
# word = "kick"
|
208
|
-
# expanded = expand_template(uri, template, word)
|
209
|
-
# html =<<EOD
|
210
|
-
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
211
|
-
# EOD
|
212
|
-
# html.strip!
|
213
|
-
# expanded.should == html
|
214
|
-
# end
|
215
|
-
# end
|
204
|
+
# describe "expand_template" do
|
205
|
+
# it "gets data corresponding to a given template using mediawiki api" do
|
206
|
+
# uri = "http://en.wiktionary.org/w/api.php"
|
207
|
+
# template = "{{en-verb}}"
|
208
|
+
# word = "kick"
|
209
|
+
# expanded = expand_template(uri, template, word)
|
210
|
+
# html =<<EOD
|
211
|
+
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
212
|
+
# EOD
|
213
|
+
# html.strip!
|
214
|
+
# expanded.should == html
|
215
|
+
# end
|
216
|
+
# end
|
216
217
|
end
|
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: htmlentities
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: trollop
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -56,6 +70,7 @@ files:
|
|
56
70
|
- bin/benchmark.rb
|
57
71
|
- bin/wp2txt
|
58
72
|
- data/testdata.bz2
|
73
|
+
- error_log.txt
|
59
74
|
- lib/wp2txt.rb
|
60
75
|
- lib/wp2txt/article.rb
|
61
76
|
- lib/wp2txt/mw_api.rb
|