wp2txt 0.7.0 → 0.7.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/benchmark.rb +10 -8
- data/bin/wp2txt +4 -12
- data/error_log.txt +1 -0
- data/lib/wp2txt/utils.rb +101 -123
- data/lib/wp2txt/version.rb +1 -1
- data/spec/utils_spec.rb +43 -42
- data/wp2txt.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 911e08e181a6bedb664b797d49183d0988daeba5
|
4
|
+
data.tar.gz: 076d1349a8aa8cf454dac42bdce7b89a82f3fca0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ebc035e4f1635f150294d8b79eb474457a280707a416688f3e7712bb7788d15888b6718bfd6f4e3a790e6fb8a7623e1415255fde913bfe658dd237fa7f599cd
|
7
|
+
data.tar.gz: ccee00a9e1b85186d52d0b3c07b52c04fff1ecd133ff245010943312cf37e279874b5f3a757880c005ad877e957df6a4176af2269f40b3c3210951530eb4c511
|
data/bin/benchmark.rb
CHANGED
@@ -22,12 +22,13 @@ Benchmark.bm do |x|
|
|
22
22
|
x.report do
|
23
23
|
wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
|
24
24
|
wpconv.extract_text do |article|
|
25
|
-
|
26
|
-
title = "[[#{title}]]\n"
|
25
|
+
format_wiki!(article.title)
|
26
|
+
title = "[[#{article.title}]]\n"
|
27
|
+
convert_characters!(title)
|
27
28
|
|
28
|
-
|
29
|
-
|
30
|
-
|
29
|
+
contents = "\nCATEGORIES: "
|
30
|
+
contents += article.categories.join(", ")
|
31
|
+
contents += "\n\n"
|
31
32
|
|
32
33
|
article.elements.each do |e|
|
33
34
|
case e.first
|
@@ -55,10 +56,11 @@ Benchmark.bm do |x|
|
|
55
56
|
else
|
56
57
|
next
|
57
58
|
end
|
58
|
-
contents
|
59
|
-
remove_templates!(contents)
|
59
|
+
contents << line
|
60
60
|
end
|
61
|
-
|
61
|
+
format_article!(contents)
|
62
|
+
convert_characters!(contents)
|
63
|
+
|
62
64
|
##### cleanup #####
|
63
65
|
if /\A\s*\z/m =~ contents
|
64
66
|
result = ""
|
data/bin/wp2txt
CHANGED
@@ -50,6 +50,7 @@ convert = opts[:convert]
|
|
50
50
|
strip_tmarker = opts[:marker] ? false : true
|
51
51
|
opt_array = [:title, :list, :heading, :table, :redirect]
|
52
52
|
$leave_template = true if opts[:template]
|
53
|
+
$leave_table = true if opts[:table]
|
53
54
|
config = {}
|
54
55
|
opt_array.each do |opt|
|
55
56
|
config[opt] = opts[opt]
|
@@ -61,6 +62,7 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert,
|
|
61
62
|
wpconv.extract_text do |article|
|
62
63
|
format_wiki!(article.title)
|
63
64
|
title = "[[#{article.title}]]\n"
|
65
|
+
convert_characters!(title)
|
64
66
|
|
65
67
|
if opts[:category] && !article.categories.empty?
|
66
68
|
contents = "\nCATEGORIES: "
|
@@ -118,18 +120,8 @@ wpconv.extract_text do |article|
|
|
118
120
|
end
|
119
121
|
contents << line
|
120
122
|
end
|
121
|
-
|
122
|
-
|
123
|
-
remove_emphasis!(contents)
|
124
|
-
mndash!(contents)
|
125
|
-
make_reference!(contents)
|
126
|
-
format_ref!(contents)
|
127
|
-
remove_hr!(contents)
|
128
|
-
remove_tag!(contents)
|
129
|
-
special_chr!(contents)
|
130
|
-
|
131
|
-
correct_inline_template!(contents) unless $leave_template
|
132
|
-
remove_templates!(contents) unless $leave_template
|
123
|
+
format_article!(contents)
|
124
|
+
convert_characters!(contents)
|
133
125
|
|
134
126
|
##### cleanup #####
|
135
127
|
if /\A\s*\z/m =~ contents
|
data/error_log.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
[[アンパサンド]]
|
data/lib/wp2txt/utils.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
|
4
4
|
require 'strscan'
|
5
5
|
require 'find'
|
6
|
+
require 'htmlentities'
|
6
7
|
|
7
8
|
###################################################
|
8
9
|
# global variables to save resource for generating regexps
|
@@ -10,6 +11,12 @@ require 'find'
|
|
10
11
|
# those with a trailing number 2 represent closing tag/markup
|
11
12
|
# those without a trailing number contain both opening/closing tags/markups
|
12
13
|
|
14
|
+
$html_decoder = HTMLEntities.new
|
15
|
+
|
16
|
+
$entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
|
17
|
+
$html_hash = Hash[*$entities.flatten]
|
18
|
+
$html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
|
19
|
+
|
13
20
|
$in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
|
14
21
|
$in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
|
15
22
|
|
@@ -43,6 +50,9 @@ $blank_line_regex = Regexp.new('^\s*$')
|
|
43
50
|
|
44
51
|
$redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
|
45
52
|
|
53
|
+
$remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
|
54
|
+
$remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
|
55
|
+
|
46
56
|
$remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
|
47
57
|
$chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
|
48
58
|
$mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
|
@@ -58,8 +68,8 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
|
|
58
68
|
$pre_marks_regex = Regexp.new('\A\^\ ')
|
59
69
|
$def_marks_regex = Regexp.new('\A[\;\:\ ]+')
|
60
70
|
$onset_bar_regex = Regexp.new('\A[^\|]+\z')
|
61
|
-
$remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
62
|
-
$remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
71
|
+
# $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
|
72
|
+
# $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
|
63
73
|
|
64
74
|
$category_patterns = ["Category", "Categoria"].join("|")
|
65
75
|
$category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
|
@@ -74,22 +84,16 @@ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escap
|
|
74
84
|
$double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
|
75
85
|
$single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
|
76
86
|
$double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
|
77
|
-
|
87
|
+
$curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
|
78
88
|
###################################################
|
79
89
|
|
80
90
|
module Wp2txt
|
81
91
|
|
82
|
-
def
|
92
|
+
def convert_characters!(text, has_retried = false)
|
83
93
|
begin
|
84
94
|
text << ""
|
85
|
-
|
86
95
|
chrref_to_utf!(text)
|
87
|
-
|
88
|
-
|
89
|
-
process_interwiki_links!(text)
|
90
|
-
process_external_links!(text)
|
91
|
-
|
92
|
-
unescape_nowiki!(text)
|
96
|
+
special_chr!(text)
|
93
97
|
|
94
98
|
rescue # detect invalid byte sequence in UTF-8
|
95
99
|
if has_retried
|
@@ -102,11 +106,34 @@ module Wp2txt
|
|
102
106
|
else
|
103
107
|
text.encode!("UTF-16")
|
104
108
|
text.encode!("UTF-8")
|
105
|
-
|
109
|
+
convert_characters!(text, true)
|
106
110
|
end
|
107
111
|
end
|
108
112
|
end
|
113
|
+
|
114
|
+
def format_wiki!(text, has_retried = false)
|
115
|
+
escape_nowiki!(text)
|
116
|
+
|
117
|
+
process_interwiki_links!(text)
|
118
|
+
process_external_links!(text)
|
109
119
|
|
120
|
+
unescape_nowiki!(text)
|
121
|
+
end
|
122
|
+
|
123
|
+
def format_article!(text)
|
124
|
+
remove_directive!(text)
|
125
|
+
remove_emphasis!(text)
|
126
|
+
mndash!(text)
|
127
|
+
make_reference!(text)
|
128
|
+
format_ref!(text)
|
129
|
+
remove_hr!(text)
|
130
|
+
remove_tag!(text)
|
131
|
+
convert_characters!(text)
|
132
|
+
correct_inline_template!(text) unless $leave_template
|
133
|
+
remove_templates!(text) unless $leave_template
|
134
|
+
remove_table!(text) unless $leave_table
|
135
|
+
end
|
136
|
+
|
110
137
|
#################### parser for nested structure ####################
|
111
138
|
|
112
139
|
def process_nested_structure(scanner, left, right, recur_count, &block)
|
@@ -120,6 +147,8 @@ module Wp2txt
|
|
120
147
|
regex = $single_curly_bracket_regex
|
121
148
|
elsif left == "{{" && right == "}}"
|
122
149
|
regex = $double_curly_bracket_regex
|
150
|
+
elsif left == "{|" && right == "|}"
|
151
|
+
regex = $curly_square_bracket_regex
|
123
152
|
else
|
124
153
|
regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
|
125
154
|
end
|
@@ -154,15 +183,6 @@ module Wp2txt
|
|
154
183
|
end
|
155
184
|
|
156
185
|
#################### methods used from format_wiki ####################
|
157
|
-
|
158
|
-
def remove_templates!(str)
|
159
|
-
scanner = StringScanner.new(str)
|
160
|
-
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
161
|
-
""
|
162
|
-
end
|
163
|
-
str.replace(result)
|
164
|
-
end
|
165
|
-
|
166
186
|
def escape_nowiki!(str)
|
167
187
|
if @nowikis
|
168
188
|
@nowikis.clear
|
@@ -213,80 +233,42 @@ module Wp2txt
|
|
213
233
|
str.replace(result)
|
214
234
|
end
|
215
235
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
'Å', 'Æ', 'Ç', 'È', 'É', 'Ê',
|
223
|
-
'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ñ',
|
224
|
-
'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
|
225
|
-
'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
|
226
|
-
'á', 'â', 'ã', 'ä', 'å', 'æ',
|
227
|
-
'ç', 'è', 'é', 'ê', 'ë', 'ì',
|
228
|
-
'í', 'î', 'ï', 'ñ', 'ò', 'ó',
|
229
|
-
'ô', 'œ', 'õ', 'ö', 'ø', 'ù',
|
230
|
-
'ú', 'û', 'ü', 'ÿ']\
|
231
|
-
.zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
|
232
|
-
'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
|
233
|
-
'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
|
234
|
-
'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
|
235
|
-
|
236
|
-
punctuation = ['¿', '¡', '«', '»', '§',
|
237
|
-
'¶', '†', '‡', '•', '–', '—']\
|
238
|
-
.zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
|
239
|
-
|
240
|
-
commercial = ['™', '©', '®', '¢', '€', '¥',
|
241
|
-
'£', '¤'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
|
242
|
-
|
243
|
-
greek_chr = ['α', 'β', 'γ', 'δ', 'ε',
|
244
|
-
'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
|
245
|
-
'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς',
|
246
|
-
'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'Γ',
|
247
|
-
'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ',
|
248
|
-
'Ψ', 'Ω']\
|
249
|
-
.zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
|
250
|
-
'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
|
251
|
-
'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
|
252
|
-
|
253
|
-
math_chr1 = ['∫', '∑', '∏', '√', '−', '±',
|
254
|
-
'∞', '≈', '∝', '≡', '≠', '≤', '≥',
|
255
|
-
'×', '·', '÷', '∂', '′', '″',
|
256
|
-
'∇', '‰', '°', '∴', 'ø', '∈', '∩',
|
257
|
-
'∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨',
|
258
|
-
'∃', '∀', '⇒', '⇔', '→', '↔', '↑']\
|
259
|
-
.zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
|
260
|
-
'≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
|
261
|
-
'∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
|
262
|
-
'⇔', '→', '↔', '↑'])
|
263
|
-
|
264
|
-
math_chr2 = ['ℵ', '∉'].zip(['ℵ', '∉'])
|
265
|
-
|
266
|
-
others = ['¨', 'ª',
|
267
|
-
'¯', '´', 'µ', '¸', 'º', '‘', '’',
|
268
|
-
'“', '‚', '”', '„', '♠', '♣', '◊',
|
269
|
-
'♥', '←', '♦', '‹', '›', '↓']\
|
270
|
-
.zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
|
271
|
-
'„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
|
272
|
-
|
273
|
-
spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
|
274
|
-
math_chr1 + math_chr2 + others
|
275
|
-
$sp_hash = Hash[*spc_array.flatten]
|
276
|
-
$sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
|
236
|
+
#################### methods used from format_article ####################
|
237
|
+
|
238
|
+
def remove_templates!(str)
|
239
|
+
scanner = StringScanner.new(str)
|
240
|
+
result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
241
|
+
""
|
277
242
|
end
|
278
|
-
|
279
|
-
|
280
|
-
|
243
|
+
str.replace(result)
|
244
|
+
end
|
245
|
+
|
246
|
+
def remove_table!(str)
|
247
|
+
scanner = StringScanner.new(str)
|
248
|
+
result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
|
249
|
+
""
|
281
250
|
end
|
251
|
+
str.replace(result)
|
252
|
+
end
|
253
|
+
|
254
|
+
def special_chr!(str)
|
255
|
+
str.replace $html_decoder.decode(str)
|
282
256
|
end
|
283
257
|
|
284
|
-
def
|
258
|
+
def remove_inbetween!(str, tagset = ['<', '>'])
|
285
259
|
tagsets = Regexp.quote(tagset.uniq.join(""))
|
286
260
|
regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
|
287
261
|
str.gsub!(regex, "")
|
288
262
|
end
|
289
263
|
|
264
|
+
def remove_tag!(str)
|
265
|
+
str.gsub!($remove_tag_regex, "")
|
266
|
+
end
|
267
|
+
|
268
|
+
def remove_directive!(str)
|
269
|
+
str.gsub!($remove_directives_regex, "")
|
270
|
+
end
|
271
|
+
|
290
272
|
def remove_emphasis!(str)
|
291
273
|
str.gsub!($remove_emphasis_regex) do
|
292
274
|
$2
|
@@ -311,10 +293,6 @@ module Wp2txt
|
|
311
293
|
end
|
312
294
|
return true
|
313
295
|
end
|
314
|
-
|
315
|
-
def remove_directive!(str)
|
316
|
-
remove_tag!(str, ['__', '__'])
|
317
|
-
end
|
318
296
|
|
319
297
|
def mndash!(str)
|
320
298
|
str.gsub!($mndash_regex, "–")
|
@@ -364,40 +342,40 @@ module Wp2txt
|
|
364
342
|
|
365
343
|
#################### methods currently unused ####################
|
366
344
|
|
367
|
-
def process_template(str)
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
end
|
386
|
-
|
387
|
-
def remove_table(str)
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
end
|
345
|
+
# def process_template(str)
|
346
|
+
# scanner = StringScanner.new(str)
|
347
|
+
# result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
|
348
|
+
# parts = contents.split("|")
|
349
|
+
# case parts.size
|
350
|
+
# when 0
|
351
|
+
# ""
|
352
|
+
# when 1
|
353
|
+
# parts.first || ""
|
354
|
+
# else
|
355
|
+
# if parts.last.split("=").size > 1
|
356
|
+
# parts.first || ""
|
357
|
+
# else
|
358
|
+
# parts.last || ""
|
359
|
+
# end
|
360
|
+
# end
|
361
|
+
# end
|
362
|
+
# result
|
363
|
+
# end
|
364
|
+
|
365
|
+
# def remove_table(str)
|
366
|
+
# new_str = str.gsub($remove_table_regex, "")
|
367
|
+
# if str != new_str
|
368
|
+
# new_str = remove_table(new_str)
|
369
|
+
# end
|
370
|
+
# new_str = remove_table(new_str) unless str == new_str
|
371
|
+
# return new_str
|
372
|
+
# end
|
395
373
|
|
396
|
-
def remove_clade(page)
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
end
|
374
|
+
# def remove_clade(page)
|
375
|
+
# new_page = page.gsub($remove_clade_regex, "")
|
376
|
+
# new_page = remove_clade(new_page) unless page == new_page
|
377
|
+
# new_page
|
378
|
+
# end
|
401
379
|
|
402
380
|
#################### file related utilities ####################
|
403
381
|
|
data/lib/wp2txt/version.rb
CHANGED
data/spec/utils_spec.rb
CHANGED
@@ -44,7 +44,7 @@ describe "Wp2txt" do
|
|
44
44
|
describe "special_chr!" do
|
45
45
|
it "replaces character references with real characters" do
|
46
46
|
str_before = " < > & ""
|
47
|
-
str_after = "
|
47
|
+
str_after = " < > & \""
|
48
48
|
special_chr!(str_before)
|
49
49
|
expect(str_before).to eq str_after
|
50
50
|
end
|
@@ -77,21 +77,22 @@ describe "Wp2txt" do
|
|
77
77
|
end
|
78
78
|
end
|
79
79
|
|
80
|
-
describe "remove_table" do
|
80
|
+
describe "remove_table!" do
|
81
81
|
it "removes table formated parts" do
|
82
82
|
str_before = "{| ... \n{| ... \n ...|}\n ...|}"
|
83
83
|
str_after = ""
|
84
|
-
|
84
|
+
remove_table!(str_before)
|
85
|
+
expect(str_before).to eq str_after
|
85
86
|
end
|
86
87
|
end
|
87
88
|
|
88
|
-
describe "remove_clade" do
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
end
|
89
|
+
# describe "remove_clade" do
|
90
|
+
# it "removes clade formated parts" do
|
91
|
+
# str_before = "\{\{clade ... \n ... \n ... \n\}\}"
|
92
|
+
# str_after = ""
|
93
|
+
# expect(remove_clade(str_before)).to eq str_after
|
94
|
+
# end
|
95
|
+
# end
|
95
96
|
|
96
97
|
describe "remove_hr!" do
|
97
98
|
it "removes horizontal lines" do
|
@@ -102,15 +103,15 @@ describe "Wp2txt" do
|
|
102
103
|
end
|
103
104
|
end
|
104
105
|
|
105
|
-
describe "
|
106
|
-
it "removes tags" do
|
106
|
+
describe "remove_inbetween!" do
|
107
|
+
it "removes tags and its contents" do
|
107
108
|
str_before = "<tag>abc</tag>"
|
108
109
|
str_after = "abc"
|
109
110
|
remove_tag!(str_before)
|
110
111
|
expect(str_before).to eq str_after
|
111
112
|
str_before = "[tag]def[/tag]"
|
112
113
|
str_after = "def"
|
113
|
-
|
114
|
+
remove_inbetween!(str_before, ['[', ']'])
|
114
115
|
expect(str_before).to eq str_after
|
115
116
|
end
|
116
117
|
end
|
@@ -183,34 +184,34 @@ describe "Wp2txt" do
|
|
183
184
|
end
|
184
185
|
end
|
185
186
|
|
186
|
-
describe "process_template" do
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
187
|
+
# describe "process_template" do
|
188
|
+
# it "removes brackets and leaving some text" do
|
189
|
+
# str_before = "{{}}"
|
190
|
+
# str_after = ""
|
191
|
+
# expect(process_template(str_before)).to eq str_after
|
192
|
+
# str_before = "{{lang|en|Japan}}"
|
193
|
+
# str_after = "Japan"
|
194
|
+
# expect(process_template(str_before)).to eq str_after
|
195
|
+
# str_before = "{{a|b=c|d=f}}"
|
196
|
+
# str_after = "a"
|
197
|
+
# expect(process_template(str_before)).to eq str_after
|
198
|
+
# str_before = "{{a|b|{{c|d|e}}}}"
|
199
|
+
# str_after = "e"
|
200
|
+
# expect(process_template(str_before)).to eq str_after
|
201
|
+
# end
|
202
|
+
# end
|
202
203
|
|
203
|
-
# describe "expand_template" do
|
204
|
-
# it "gets data corresponding to a given template using mediawiki api" do
|
205
|
-
# uri = "http://en.wiktionary.org/w/api.php"
|
206
|
-
# template = "{{en-verb}}"
|
207
|
-
# word = "kick"
|
208
|
-
# expanded = expand_template(uri, template, word)
|
209
|
-
# html =<<EOD
|
210
|
-
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
211
|
-
# EOD
|
212
|
-
# html.strip!
|
213
|
-
# expanded.should == html
|
214
|
-
# end
|
215
|
-
# end
|
204
|
+
# describe "expand_template" do
|
205
|
+
# it "gets data corresponding to a given template using mediawiki api" do
|
206
|
+
# uri = "http://en.wiktionary.org/w/api.php"
|
207
|
+
# template = "{{en-verb}}"
|
208
|
+
# word = "kick"
|
209
|
+
# expanded = expand_template(uri, template, word)
|
210
|
+
# html =<<EOD
|
211
|
+
# <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
|
212
|
+
# EOD
|
213
|
+
# html.strip!
|
214
|
+
# expanded.should == html
|
215
|
+
# end
|
216
|
+
# end
|
216
217
|
end
|
data/wp2txt.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wp2txt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: htmlentities
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: trollop
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -56,6 +70,7 @@ files:
|
|
56
70
|
- bin/benchmark.rb
|
57
71
|
- bin/wp2txt
|
58
72
|
- data/testdata.bz2
|
73
|
+
- error_log.txt
|
59
74
|
- lib/wp2txt.rb
|
60
75
|
- lib/wp2txt/article.rb
|
61
76
|
- lib/wp2txt/mw_api.rb
|