wp2txt 0.7.0 → 0.7.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80f68e6c1ac855160575f85f4d78ca378f0a1c2b
4
- data.tar.gz: 16bbac80e7139ea63dd46baf54fb5deaf0840e59
3
+ metadata.gz: 911e08e181a6bedb664b797d49183d0988daeba5
4
+ data.tar.gz: 076d1349a8aa8cf454dac42bdce7b89a82f3fca0
5
5
  SHA512:
6
- metadata.gz: 004d26fa39aae4eb194858cf85ae8aad33f65dc556a08bbfc499ead05d49e70af4f5ba5e708354aa816cd6b38d8e9860866cefa7d6c0730058e9a186ff9eec31
7
- data.tar.gz: c2523b8afeab165c37de028eedff36e719a2472f9440469e4041c342b08463d439351a89523d959ff28d53364c76a2af44502113bb2084eacbbc8ac14306f8a4
6
+ metadata.gz: 4ebc035e4f1635f150294d8b79eb474457a280707a416688f3e7712bb7788d15888b6718bfd6f4e3a790e6fb8a7623e1415255fde913bfe658dd237fa7f599cd
7
+ data.tar.gz: ccee00a9e1b85186d52d0b3c07b52c04fff1ecd133ff245010943312cf37e279874b5f3a757880c005ad877e957df6a4176af2269f40b3c3210951530eb4c511
@@ -22,12 +22,13 @@ Benchmark.bm do |x|
22
22
  x.report do
23
23
  wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
24
24
  wpconv.extract_text do |article|
25
- title = format_wiki! article.title
26
- title = "[[#{title}]]\n"
25
+ format_wiki!(article.title)
26
+ title = "[[#{article.title}]]\n"
27
+ convert_characters!(title)
27
28
 
28
- contents = "\nCATEGORIES: "
29
- contents += article.categories.join(", ")
30
- contents += "\n\n"
29
+ contents = "\nCATEGORIES: "
30
+ contents += article.categories.join(", ")
31
+ contents += "\n\n"
31
32
 
32
33
  article.elements.each do |e|
33
34
  case e.first
@@ -55,10 +56,11 @@ Benchmark.bm do |x|
55
56
  else
56
57
  next
57
58
  end
58
- contents += line
59
- remove_templates!(contents)
59
+ contents << line
60
60
  end
61
-
61
+ format_article!(contents)
62
+ convert_characters!(contents)
63
+
62
64
  ##### cleanup #####
63
65
  if /\A\s*\z/m =~ contents
64
66
  result = ""
data/bin/wp2txt CHANGED
@@ -50,6 +50,7 @@ convert = opts[:convert]
50
50
  strip_tmarker = opts[:marker] ? false : true
51
51
  opt_array = [:title, :list, :heading, :table, :redirect]
52
52
  $leave_template = true if opts[:template]
53
+ $leave_table = true if opts[:table]
53
54
  config = {}
54
55
  opt_array.each do |opt|
55
56
  config[opt] = opts[opt]
@@ -61,6 +62,7 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert,
61
62
  wpconv.extract_text do |article|
62
63
  format_wiki!(article.title)
63
64
  title = "[[#{article.title}]]\n"
65
+ convert_characters!(title)
64
66
 
65
67
  if opts[:category] && !article.categories.empty?
66
68
  contents = "\nCATEGORIES: "
@@ -118,18 +120,8 @@ wpconv.extract_text do |article|
118
120
  end
119
121
  contents << line
120
122
  end
121
-
122
- remove_directive!(contents)
123
- remove_emphasis!(contents)
124
- mndash!(contents)
125
- make_reference!(contents)
126
- format_ref!(contents)
127
- remove_hr!(contents)
128
- remove_tag!(contents)
129
- special_chr!(contents)
130
-
131
- correct_inline_template!(contents) unless $leave_template
132
- remove_templates!(contents) unless $leave_template
123
+ format_article!(contents)
124
+ convert_characters!(contents)
133
125
 
134
126
  ##### cleanup #####
135
127
  if /\A\s*\z/m =~ contents
@@ -0,0 +1 @@
1
+ [[アンパサンド]]
@@ -3,6 +3,7 @@
3
3
 
4
4
  require 'strscan'
5
5
  require 'find'
6
+ require 'htmlentities'
6
7
 
7
8
  ###################################################
8
9
  # global variables to save resource for generating regexps
@@ -10,6 +11,12 @@ require 'find'
10
11
  # those with a trailing number 2 represent closing tag/markup
11
12
  # those without a trailing number contain both opening/closing tags/markups
12
13
 
14
+ $html_decoder = HTMLEntities.new
15
+
16
+ $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
+ $html_hash = Hash[*$entities.flatten]
18
+ $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
+
13
20
  $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
14
21
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
15
22
 
@@ -43,6 +50,9 @@ $blank_line_regex = Regexp.new('^\s*$')
43
50
 
44
51
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
45
52
 
53
+ $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
+ $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
+
46
56
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
47
57
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
48
58
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -58,8 +68,8 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
58
68
  $pre_marks_regex = Regexp.new('\A\^\ ')
59
69
  $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
60
70
  $onset_bar_regex = Regexp.new('\A[^\|]+\z')
61
- $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
62
- $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
71
+ # $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
72
+ # $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
63
73
 
64
74
  $category_patterns = ["Category", "Categoria"].join("|")
65
75
  $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
@@ -74,22 +84,16 @@ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escap
74
84
  $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
75
85
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
76
86
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
77
-
87
+ $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
78
88
  ###################################################
79
89
 
80
90
  module Wp2txt
81
91
 
82
- def format_wiki!(text, has_retried = false)
92
+ def convert_characters!(text, has_retried = false)
83
93
  begin
84
94
  text << ""
85
-
86
95
  chrref_to_utf!(text)
87
- escape_nowiki!(text)
88
-
89
- process_interwiki_links!(text)
90
- process_external_links!(text)
91
-
92
- unescape_nowiki!(text)
96
+ special_chr!(text)
93
97
 
94
98
  rescue # detect invalid byte sequence in UTF-8
95
99
  if has_retried
@@ -102,11 +106,34 @@ module Wp2txt
102
106
  else
103
107
  text.encode!("UTF-16")
104
108
  text.encode!("UTF-8")
105
- format_wiki!(text, true)
109
+ convert_characters!(text, true)
106
110
  end
107
111
  end
108
112
  end
113
+
114
+ def format_wiki!(text, has_retried = false)
115
+ escape_nowiki!(text)
116
+
117
+ process_interwiki_links!(text)
118
+ process_external_links!(text)
109
119
 
120
+ unescape_nowiki!(text)
121
+ end
122
+
123
+ def format_article!(text)
124
+ remove_directive!(text)
125
+ remove_emphasis!(text)
126
+ mndash!(text)
127
+ make_reference!(text)
128
+ format_ref!(text)
129
+ remove_hr!(text)
130
+ remove_tag!(text)
131
+ convert_characters!(text)
132
+ correct_inline_template!(text) unless $leave_template
133
+ remove_templates!(text) unless $leave_template
134
+ remove_table!(text) unless $leave_table
135
+ end
136
+
110
137
  #################### parser for nested structure ####################
111
138
 
112
139
  def process_nested_structure(scanner, left, right, recur_count, &block)
@@ -120,6 +147,8 @@ module Wp2txt
120
147
  regex = $single_curly_bracket_regex
121
148
  elsif left == "{{" && right == "}}"
122
149
  regex = $double_curly_bracket_regex
150
+ elsif left == "{|" && right == "|}"
151
+ regex = $curly_square_bracket_regex
123
152
  else
124
153
  regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
125
154
  end
@@ -154,15 +183,6 @@ module Wp2txt
154
183
  end
155
184
 
156
185
  #################### methods used from format_wiki ####################
157
-
158
- def remove_templates!(str)
159
- scanner = StringScanner.new(str)
160
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
161
- ""
162
- end
163
- str.replace(result)
164
- end
165
-
166
186
  def escape_nowiki!(str)
167
187
  if @nowikis
168
188
  @nowikis.clear
@@ -213,80 +233,42 @@ module Wp2txt
213
233
  str.replace(result)
214
234
  end
215
235
 
216
- def special_chr!(str)
217
- unless $sp_hash
218
- html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
219
- .zip([' ', '<', '>', '&', '"'])
220
-
221
- umraut_accent = ['&Agrave;', '&Aacute;', '&Acirc;', '&Atilde;', '&Auml;',
222
- '&Aring;', '&AElig;', '&Ccedil;', '&Egrave;', '&Eacute;', '&Ecirc;',
223
- '&Euml;', '&Igrave;', '&Iacute;', '&Icirc;', '&Iuml;', '&Ntilde;',
224
- '&Ograve;', '&Oacute;', '&Ocirc;', '&Otilde;', '&Ouml;', '&Oslash;',
225
- '&Ugrave;', '&Uacute;', '&Ucirc;', '&Uuml;', '&szlig;', '&agrave;',
226
- '&aacute;', '&acirc;', '&atilde;', '&auml;', '&aring;', '&aelig;',
227
- '&ccedil;', '&egrave;', '&eacute;', '&ecirc;', '&euml;', '&igrave;',
228
- '&iacute;', '&icirc;', '&iuml;', '&ntilde;', '&ograve;', '&oacute;',
229
- '&ocirc;', '&oelig;', '&otilde;', '&ouml;', '&oslash;', '&ugrave;',
230
- '&uacute;', '&ucirc;', '&uuml;', '&yuml;']\
231
- .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
232
- 'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
233
- 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
234
- 'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
235
-
236
- punctuation = ['&iquest;', '&iexcl;', '&laquo;', '&raquo;', '&sect;',
237
- '&para;', '&dagger;', '&Dagger;', '&bull;', '&ndash;', '&mdash;']\
238
- .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
239
-
240
- commercial = ['&trade;', '&copy;', '&reg;', '&cent;', '&euro;', '&yen;',
241
- '&pound;', '&curren;'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
242
-
243
- greek_chr = ['&alpha;', '&beta;', '&gamma;', '&delta;', '&epsilon;',
244
- '&zeta;', '&eta;', '&theta;', '&iota;', '&kappa;', '&lambda;', '&mu;',
245
- '&nu;', '&xi;', '&omicron;', '&pi;', '&rho;', '&sigma;', '&sigmaf;',
246
- '&tau;', '&upsilon;', '&phi;', '&chi;', '&psi;', '&omega;', '&Gamma;',
247
- '&Delta;', '&Theta;', '&Lambda;', '&Xi;', '&Pi;', '&Sigma;', '&Phi;',
248
- '&Psi;', '&Omega;']\
249
- .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
250
- 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
251
- 'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
252
-
253
- math_chr1 = ['&int;', '&sum;', '&prod;', '&radic;', '&minus;', '&plusmn;',
254
- '&infin;', '&asymp;', '&prop;', '&equiv;', '&ne;', '&le;', '&ge;',
255
- '&times;', '&middot;', '&divide;', '&part;', '&prime;', '&Prime;',
256
- '&nabla;', '&permil;', '&deg;', '&there4;', '&oslash;', '&isin;', '&cap;',
257
- '&cup;', '&sub;', '&sup;', '&sube;', '&supe;', '&not;', '&and;', '&or;',
258
- '&exist;', '&forall;', '&rArr;', '&hArr;', '&rarr;', '&harr;', '&uarr;']\
259
- .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
260
- '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
261
- '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
262
- '⇔', '→', '↔', '↑'])
263
-
264
- math_chr2 = ['&alefsym;', '&notin;'].zip(['ℵ', '∉'])
265
-
266
- others = ['&uml;', '&ordf;',
267
- '&macr;', '&acute;', '&micro;', '&cedil;', '&ordm;', '&lsquo;', '&rsquo;',
268
- '&ldquo;', '&sbquo;', '&rdquo;', '&bdquo;', '&spades;', '&clubs;', '&loz;',
269
- '&hearts;', '&larr;', '&diams;', '&lsaquo;', '&rsaquo;', '&darr;']\
270
- .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
271
- '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
272
-
273
- spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
274
- math_chr1 + math_chr2 + others
275
- $sp_hash = Hash[*spc_array.flatten]
276
- $sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
236
+ #################### methods used from format_article ####################
237
+
238
+ def remove_templates!(str)
239
+ scanner = StringScanner.new(str)
240
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
241
+ ""
277
242
  end
278
- #str.gsub!("&amp;"){'&'}
279
- str.gsub!($sp_regex) do
280
- $sp_hash[$1]
243
+ str.replace(result)
244
+ end
245
+
246
+ def remove_table!(str)
247
+ scanner = StringScanner.new(str)
248
+ result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
249
+ ""
281
250
  end
251
+ str.replace(result)
252
+ end
253
+
254
+ def special_chr!(str)
255
+ str.replace $html_decoder.decode(str)
282
256
  end
283
257
 
284
- def remove_tag!(str, tagset = ['<', '>'])
258
+ def remove_inbetween!(str, tagset = ['<', '>'])
285
259
  tagsets = Regexp.quote(tagset.uniq.join(""))
286
260
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
287
261
  str.gsub!(regex, "")
288
262
  end
289
263
 
264
+ def remove_tag!(str)
265
+ str.gsub!($remove_tag_regex, "")
266
+ end
267
+
268
+ def remove_directive!(str)
269
+ str.gsub!($remove_directives_regex, "")
270
+ end
271
+
290
272
  def remove_emphasis!(str)
291
273
  str.gsub!($remove_emphasis_regex) do
292
274
  $2
@@ -311,10 +293,6 @@ module Wp2txt
311
293
  end
312
294
  return true
313
295
  end
314
-
315
- def remove_directive!(str)
316
- remove_tag!(str, ['__', '__'])
317
- end
318
296
 
319
297
  def mndash!(str)
320
298
  str.gsub!($mndash_regex, "–")
@@ -364,40 +342,40 @@ module Wp2txt
364
342
 
365
343
  #################### methods currently unused ####################
366
344
 
367
- def process_template(str)
368
- scanner = StringScanner.new(str)
369
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
370
- parts = contents.split("|")
371
- case parts.size
372
- when 0
373
- ""
374
- when 1
375
- parts.first || ""
376
- else
377
- if parts.last.split("=").size > 1
378
- parts.first || ""
379
- else
380
- parts.last || ""
381
- end
382
- end
383
- end
384
- result
385
- end
386
-
387
- def remove_table(str)
388
- new_str = str.gsub($remove_table_regex, "")
389
- if str != new_str
390
- new_str = remove_table(new_str)
391
- end
392
- new_str = remove_table(new_str) unless str == new_str
393
- return new_str
394
- end
345
+ # def process_template(str)
346
+ # scanner = StringScanner.new(str)
347
+ # result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
348
+ # parts = contents.split("|")
349
+ # case parts.size
350
+ # when 0
351
+ # ""
352
+ # when 1
353
+ # parts.first || ""
354
+ # else
355
+ # if parts.last.split("=").size > 1
356
+ # parts.first || ""
357
+ # else
358
+ # parts.last || ""
359
+ # end
360
+ # end
361
+ # end
362
+ # result
363
+ # end
364
+
365
+ # def remove_table(str)
366
+ # new_str = str.gsub($remove_table_regex, "")
367
+ # if str != new_str
368
+ # new_str = remove_table(new_str)
369
+ # end
370
+ # new_str = remove_table(new_str) unless str == new_str
371
+ # return new_str
372
+ # end
395
373
 
396
- def remove_clade(page)
397
- new_page = page.gsub($remove_clade_regex, "")
398
- new_page = remove_clade(new_page) unless page == new_page
399
- new_page
400
- end
374
+ # def remove_clade(page)
375
+ # new_page = page.gsub($remove_clade_regex, "")
376
+ # new_page = remove_clade(new_page) unless page == new_page
377
+ # new_page
378
+ # end
401
379
 
402
380
  #################### file related utilities ####################
403
381
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.0"
2
+ VERSION = "0.7.5"
3
3
  end
@@ -44,7 +44,7 @@ describe "Wp2txt" do
44
44
  describe "special_chr!" do
45
45
  it "replaces character references with real characters" do
46
46
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
47
- str_after = " < > & \""
47
+ str_after = "  < > & \""
48
48
  special_chr!(str_before)
49
49
  expect(str_before).to eq str_after
50
50
  end
@@ -77,21 +77,22 @@ describe "Wp2txt" do
77
77
  end
78
78
  end
79
79
 
80
- describe "remove_table" do
80
+ describe "remove_table!" do
81
81
  it "removes table formated parts" do
82
82
  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
83
83
  str_after = ""
84
- expect(remove_table(str_before)).to eq str_after
84
+ remove_table!(str_before)
85
+ expect(str_before).to eq str_after
85
86
  end
86
87
  end
87
88
 
88
- describe "remove_clade" do
89
- it "removes clade formated parts" do
90
- str_before = "\{\{clade ... \n ... \n ... \n\}\}"
91
- str_after = ""
92
- expect(remove_clade(str_before)).to eq str_after
93
- end
94
- end
89
+ # describe "remove_clade" do
90
+ # it "removes clade formated parts" do
91
+ # str_before = "\{\{clade ... \n ... \n ... \n\}\}"
92
+ # str_after = ""
93
+ # expect(remove_clade(str_before)).to eq str_after
94
+ # end
95
+ # end
95
96
 
96
97
  describe "remove_hr!" do
97
98
  it "removes horizontal lines" do
@@ -102,15 +103,15 @@ describe "Wp2txt" do
102
103
  end
103
104
  end
104
105
 
105
- describe "remove_tag!" do
106
- it "removes tags" do
106
+ describe "remove_inbetween!" do
107
+ it "removes tags and its contents" do
107
108
  str_before = "<tag>abc</tag>"
108
109
  str_after = "abc"
109
110
  remove_tag!(str_before)
110
111
  expect(str_before).to eq str_after
111
112
  str_before = "[tag]def[/tag]"
112
113
  str_after = "def"
113
- remove_tag!(str_before, ['[', ']'])
114
+ remove_inbetween!(str_before, ['[', ']'])
114
115
  expect(str_before).to eq str_after
115
116
  end
116
117
  end
@@ -183,34 +184,34 @@ describe "Wp2txt" do
183
184
  end
184
185
  end
185
186
 
186
- describe "process_template" do
187
- it "removes brackets and leaving some text" do
188
- str_before = "{{}}"
189
- str_after = ""
190
- expect(process_template(str_before)).to eq str_after
191
- str_before = "{{lang|en|Japan}}"
192
- str_after = "Japan"
193
- expect(process_template(str_before)).to eq str_after
194
- str_before = "{{a|b=c|d=f}}"
195
- str_after = "a"
196
- expect(process_template(str_before)).to eq str_after
197
- str_before = "{{a|b|{{c|d|e}}}}"
198
- str_after = "e"
199
- expect(process_template(str_before)).to eq str_after
200
- end
201
- end
187
+ # describe "process_template" do
188
+ # it "removes brackets and leaving some text" do
189
+ # str_before = "{{}}"
190
+ # str_after = ""
191
+ # expect(process_template(str_before)).to eq str_after
192
+ # str_before = "{{lang|en|Japan}}"
193
+ # str_after = "Japan"
194
+ # expect(process_template(str_before)).to eq str_after
195
+ # str_before = "{{a|b=c|d=f}}"
196
+ # str_after = "a"
197
+ # expect(process_template(str_before)).to eq str_after
198
+ # str_before = "{{a|b|{{c|d|e}}}}"
199
+ # str_after = "e"
200
+ # expect(process_template(str_before)).to eq str_after
201
+ # end
202
+ # end
202
203
 
203
- # describe "expand_template" do
204
- # it "gets data corresponding to a given template using mediawiki api" do
205
- # uri = "http://en.wiktionary.org/w/api.php"
206
- # template = "{{en-verb}}"
207
- # word = "kick"
208
- # expanded = expand_template(uri, template, word)
209
- # html =<<EOD
210
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
211
- # EOD
212
- # html.strip!
213
- # expanded.should == html
214
- # end
215
- # end
204
+ # describe "expand_template" do
205
+ # it "gets data corresponding to a given template using mediawiki api" do
206
+ # uri = "http://en.wiktionary.org/w/api.php"
207
+ # template = "{{en-verb}}"
208
+ # word = "kick"
209
+ # expanded = expand_template(uri, template, word)
210
+ # html =<<EOD
211
+ # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
212
+ # EOD
213
+ # html.strip!
214
+ # expanded.should == html
215
+ # end
216
+ # end
216
217
  end
@@ -23,5 +23,6 @@ Gem::Specification.new do |s|
23
23
  # s.add_development_dependency "rake"
24
24
 
25
25
  s.add_dependency "nokogiri"
26
+ s.add_dependency "htmlentities"
26
27
  s.add_dependency "trollop"
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-29 00:00:00.000000000 Z
11
+ date: 2014-11-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: htmlentities
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: trollop
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -56,6 +70,7 @@ files:
56
70
  - bin/benchmark.rb
57
71
  - bin/wp2txt
58
72
  - data/testdata.bz2
73
+ - error_log.txt
59
74
  - lib/wp2txt.rb
60
75
  - lib/wp2txt/article.rb
61
76
  - lib/wp2txt/mw_api.rb