wp2txt 0.7.0 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80f68e6c1ac855160575f85f4d78ca378f0a1c2b
4
- data.tar.gz: 16bbac80e7139ea63dd46baf54fb5deaf0840e59
3
+ metadata.gz: 911e08e181a6bedb664b797d49183d0988daeba5
4
+ data.tar.gz: 076d1349a8aa8cf454dac42bdce7b89a82f3fca0
5
5
  SHA512:
6
- metadata.gz: 004d26fa39aae4eb194858cf85ae8aad33f65dc556a08bbfc499ead05d49e70af4f5ba5e708354aa816cd6b38d8e9860866cefa7d6c0730058e9a186ff9eec31
7
- data.tar.gz: c2523b8afeab165c37de028eedff36e719a2472f9440469e4041c342b08463d439351a89523d959ff28d53364c76a2af44502113bb2084eacbbc8ac14306f8a4
6
+ metadata.gz: 4ebc035e4f1635f150294d8b79eb474457a280707a416688f3e7712bb7788d15888b6718bfd6f4e3a790e6fb8a7623e1415255fde913bfe658dd237fa7f599cd
7
+ data.tar.gz: ccee00a9e1b85186d52d0b3c07b52c04fff1ecd133ff245010943312cf37e279874b5f3a757880c005ad877e957df6a4176af2269f40b3c3210951530eb4c511
@@ -22,12 +22,13 @@ Benchmark.bm do |x|
22
22
  x.report do
23
23
  wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
24
24
  wpconv.extract_text do |article|
25
- title = format_wiki! article.title
26
- title = "[[#{title}]]\n"
25
+ format_wiki!(article.title)
26
+ title = "[[#{article.title}]]\n"
27
+ convert_characters!(title)
27
28
 
28
- contents = "\nCATEGORIES: "
29
- contents += article.categories.join(", ")
30
- contents += "\n\n"
29
+ contents = "\nCATEGORIES: "
30
+ contents += article.categories.join(", ")
31
+ contents += "\n\n"
31
32
 
32
33
  article.elements.each do |e|
33
34
  case e.first
@@ -55,10 +56,11 @@ Benchmark.bm do |x|
55
56
  else
56
57
  next
57
58
  end
58
- contents += line
59
- remove_templates!(contents)
59
+ contents << line
60
60
  end
61
-
61
+ format_article!(contents)
62
+ convert_characters!(contents)
63
+
62
64
  ##### cleanup #####
63
65
  if /\A\s*\z/m =~ contents
64
66
  result = ""
data/bin/wp2txt CHANGED
@@ -50,6 +50,7 @@ convert = opts[:convert]
50
50
  strip_tmarker = opts[:marker] ? false : true
51
51
  opt_array = [:title, :list, :heading, :table, :redirect]
52
52
  $leave_template = true if opts[:template]
53
+ $leave_table = true if opts[:table]
53
54
  config = {}
54
55
  opt_array.each do |opt|
55
56
  config[opt] = opts[opt]
@@ -61,6 +62,7 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert,
61
62
  wpconv.extract_text do |article|
62
63
  format_wiki!(article.title)
63
64
  title = "[[#{article.title}]]\n"
65
+ convert_characters!(title)
64
66
 
65
67
  if opts[:category] && !article.categories.empty?
66
68
  contents = "\nCATEGORIES: "
@@ -118,18 +120,8 @@ wpconv.extract_text do |article|
118
120
  end
119
121
  contents << line
120
122
  end
121
-
122
- remove_directive!(contents)
123
- remove_emphasis!(contents)
124
- mndash!(contents)
125
- make_reference!(contents)
126
- format_ref!(contents)
127
- remove_hr!(contents)
128
- remove_tag!(contents)
129
- special_chr!(contents)
130
-
131
- correct_inline_template!(contents) unless $leave_template
132
- remove_templates!(contents) unless $leave_template
123
+ format_article!(contents)
124
+ convert_characters!(contents)
133
125
 
134
126
  ##### cleanup #####
135
127
  if /\A\s*\z/m =~ contents
@@ -0,0 +1 @@
1
+ [[アンパサンド]]
@@ -3,6 +3,7 @@
3
3
 
4
4
  require 'strscan'
5
5
  require 'find'
6
+ require 'htmlentities'
6
7
 
7
8
  ###################################################
8
9
  # global variables to save resource for generating regexps
@@ -10,6 +11,12 @@ require 'find'
10
11
  # those with a trailing number 2 represent closing tag/markup
11
12
  # those without a trailing number contain both opening/closing tags/markups
12
13
 
14
+ $html_decoder = HTMLEntities.new
15
+
16
+ $entities = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
17
+ $html_hash = Hash[*$entities.flatten]
18
+ $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
19
+
13
20
  $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
14
21
  $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
15
22
 
@@ -43,6 +50,9 @@ $blank_line_regex = Regexp.new('^\s*$')
43
50
 
44
51
  $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
45
52
 
53
+ $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
54
+ $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
55
+
46
56
  $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
47
57
  $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
48
58
  $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
@@ -58,8 +68,8 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+')
58
68
  $pre_marks_regex = Regexp.new('\A\^\ ')
59
69
  $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
60
70
  $onset_bar_regex = Regexp.new('\A[^\|]+\z')
61
- $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
62
- $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
71
+ # $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
72
+ # $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
63
73
 
64
74
  $category_patterns = ["Category", "Categoria"].join("|")
65
75
  $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
@@ -74,22 +84,16 @@ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escap
74
84
  $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
75
85
  $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
76
86
  $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
77
-
87
+ $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
78
88
  ###################################################
79
89
 
80
90
  module Wp2txt
81
91
 
82
- def format_wiki!(text, has_retried = false)
92
+ def convert_characters!(text, has_retried = false)
83
93
  begin
84
94
  text << ""
85
-
86
95
  chrref_to_utf!(text)
87
- escape_nowiki!(text)
88
-
89
- process_interwiki_links!(text)
90
- process_external_links!(text)
91
-
92
- unescape_nowiki!(text)
96
+ special_chr!(text)
93
97
 
94
98
  rescue # detect invalid byte sequence in UTF-8
95
99
  if has_retried
@@ -102,11 +106,34 @@ module Wp2txt
102
106
  else
103
107
  text.encode!("UTF-16")
104
108
  text.encode!("UTF-8")
105
- format_wiki!(text, true)
109
+ convert_characters!(text, true)
106
110
  end
107
111
  end
108
112
  end
113
+
114
+ def format_wiki!(text, has_retried = false)
115
+ escape_nowiki!(text)
116
+
117
+ process_interwiki_links!(text)
118
+ process_external_links!(text)
109
119
 
120
+ unescape_nowiki!(text)
121
+ end
122
+
123
+ def format_article!(text)
124
+ remove_directive!(text)
125
+ remove_emphasis!(text)
126
+ mndash!(text)
127
+ make_reference!(text)
128
+ format_ref!(text)
129
+ remove_hr!(text)
130
+ remove_tag!(text)
131
+ convert_characters!(text)
132
+ correct_inline_template!(text) unless $leave_template
133
+ remove_templates!(text) unless $leave_template
134
+ remove_table!(text) unless $leave_table
135
+ end
136
+
110
137
  #################### parser for nested structure ####################
111
138
 
112
139
  def process_nested_structure(scanner, left, right, recur_count, &block)
@@ -120,6 +147,8 @@ module Wp2txt
120
147
  regex = $single_curly_bracket_regex
121
148
  elsif left == "{{" && right == "}}"
122
149
  regex = $double_curly_bracket_regex
150
+ elsif left == "{|" && right == "|}"
151
+ regex = $curly_square_bracket_regex
123
152
  else
124
153
  regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
125
154
  end
@@ -154,15 +183,6 @@ module Wp2txt
154
183
  end
155
184
 
156
185
  #################### methods used from format_wiki ####################
157
-
158
- def remove_templates!(str)
159
- scanner = StringScanner.new(str)
160
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
161
- ""
162
- end
163
- str.replace(result)
164
- end
165
-
166
186
  def escape_nowiki!(str)
167
187
  if @nowikis
168
188
  @nowikis.clear
@@ -213,80 +233,42 @@ module Wp2txt
213
233
  str.replace(result)
214
234
  end
215
235
 
216
- def special_chr!(str)
217
- unless $sp_hash
218
- html = ['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;']\
219
- .zip([' ', '<', '>', '&', '"'])
220
-
221
- umraut_accent = ['&Agrave;', '&Aacute;', '&Acirc;', '&Atilde;', '&Auml;',
222
- '&Aring;', '&AElig;', '&Ccedil;', '&Egrave;', '&Eacute;', '&Ecirc;',
223
- '&Euml;', '&Igrave;', '&Iacute;', '&Icirc;', '&Iuml;', '&Ntilde;',
224
- '&Ograve;', '&Oacute;', '&Ocirc;', '&Otilde;', '&Ouml;', '&Oslash;',
225
- '&Ugrave;', '&Uacute;', '&Ucirc;', '&Uuml;', '&szlig;', '&agrave;',
226
- '&aacute;', '&acirc;', '&atilde;', '&auml;', '&aring;', '&aelig;',
227
- '&ccedil;', '&egrave;', '&eacute;', '&ecirc;', '&euml;', '&igrave;',
228
- '&iacute;', '&icirc;', '&iuml;', '&ntilde;', '&ograve;', '&oacute;',
229
- '&ocirc;', '&oelig;', '&otilde;', '&ouml;', '&oslash;', '&ugrave;',
230
- '&uacute;', '&ucirc;', '&uuml;', '&yuml;']\
231
- .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í',
232
- 'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à',
233
- 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
234
- 'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
235
-
236
- punctuation = ['&iquest;', '&iexcl;', '&laquo;', '&raquo;', '&sect;',
237
- '&para;', '&dagger;', '&Dagger;', '&bull;', '&ndash;', '&mdash;']\
238
- .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
239
-
240
- commercial = ['&trade;', '&copy;', '&reg;', '&cent;', '&euro;', '&yen;',
241
- '&pound;', '&curren;'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
242
-
243
- greek_chr = ['&alpha;', '&beta;', '&gamma;', '&delta;', '&epsilon;',
244
- '&zeta;', '&eta;', '&theta;', '&iota;', '&kappa;', '&lambda;', '&mu;',
245
- '&nu;', '&xi;', '&omicron;', '&pi;', '&rho;', '&sigma;', '&sigmaf;',
246
- '&tau;', '&upsilon;', '&phi;', '&chi;', '&psi;', '&omega;', '&Gamma;',
247
- '&Delta;', '&Theta;', '&Lambda;', '&Xi;', '&Pi;', '&Sigma;', '&Phi;',
248
- '&Psi;', '&Omega;']\
249
- .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ',
250
- 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ',
251
- 'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
252
-
253
- math_chr1 = ['&int;', '&sum;', '&prod;', '&radic;', '&minus;', '&plusmn;',
254
- '&infin;', '&asymp;', '&prop;', '&equiv;', '&ne;', '&le;', '&ge;',
255
- '&times;', '&middot;', '&divide;', '&part;', '&prime;', '&Prime;',
256
- '&nabla;', '&permil;', '&deg;', '&there4;', '&oslash;', '&isin;', '&cap;',
257
- '&cup;', '&sub;', '&sup;', '&sube;', '&supe;', '&not;', '&and;', '&or;',
258
- '&exist;', '&forall;', '&rArr;', '&hArr;', '&rarr;', '&harr;', '&uarr;']\
259
- .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤',
260
- '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈',
261
- '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒',
262
- '⇔', '→', '↔', '↑'])
263
-
264
- math_chr2 = ['&alefsym;', '&notin;'].zip(['ℵ', '∉'])
265
-
266
- others = ['&uml;', '&ordf;',
267
- '&macr;', '&acute;', '&micro;', '&cedil;', '&ordm;', '&lsquo;', '&rsquo;',
268
- '&ldquo;', '&sbquo;', '&rdquo;', '&bdquo;', '&spades;', '&clubs;', '&loz;',
269
- '&hearts;', '&larr;', '&diams;', '&lsaquo;', '&rsaquo;', '&darr;']\
270
- .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”',
271
- '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
272
-
273
- spc_array = html + umraut_accent + punctuation + commercial + greek_chr +
274
- math_chr1 + math_chr2 + others
275
- $sp_hash = Hash[*spc_array.flatten]
276
- $sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
236
+ #################### methods used from format_article ####################
237
+
238
+ def remove_templates!(str)
239
+ scanner = StringScanner.new(str)
240
+ result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
241
+ ""
277
242
  end
278
- #str.gsub!("&amp;"){'&'}
279
- str.gsub!($sp_regex) do
280
- $sp_hash[$1]
243
+ str.replace(result)
244
+ end
245
+
246
+ def remove_table!(str)
247
+ scanner = StringScanner.new(str)
248
+ result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
249
+ ""
281
250
  end
251
+ str.replace(result)
252
+ end
253
+
254
+ def special_chr!(str)
255
+ str.replace $html_decoder.decode(str)
282
256
  end
283
257
 
284
- def remove_tag!(str, tagset = ['<', '>'])
258
+ def remove_inbetween!(str, tagset = ['<', '>'])
285
259
  tagsets = Regexp.quote(tagset.uniq.join(""))
286
260
  regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
287
261
  str.gsub!(regex, "")
288
262
  end
289
263
 
264
+ def remove_tag!(str)
265
+ str.gsub!($remove_tag_regex, "")
266
+ end
267
+
268
+ def remove_directive!(str)
269
+ str.gsub!($remove_directives_regex, "")
270
+ end
271
+
290
272
  def remove_emphasis!(str)
291
273
  str.gsub!($remove_emphasis_regex) do
292
274
  $2
@@ -311,10 +293,6 @@ module Wp2txt
311
293
  end
312
294
  return true
313
295
  end
314
-
315
- def remove_directive!(str)
316
- remove_tag!(str, ['__', '__'])
317
- end
318
296
 
319
297
  def mndash!(str)
320
298
  str.gsub!($mndash_regex, "–")
@@ -364,40 +342,40 @@ module Wp2txt
364
342
 
365
343
  #################### methods currently unused ####################
366
344
 
367
- def process_template(str)
368
- scanner = StringScanner.new(str)
369
- result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
370
- parts = contents.split("|")
371
- case parts.size
372
- when 0
373
- ""
374
- when 1
375
- parts.first || ""
376
- else
377
- if parts.last.split("=").size > 1
378
- parts.first || ""
379
- else
380
- parts.last || ""
381
- end
382
- end
383
- end
384
- result
385
- end
386
-
387
- def remove_table(str)
388
- new_str = str.gsub($remove_table_regex, "")
389
- if str != new_str
390
- new_str = remove_table(new_str)
391
- end
392
- new_str = remove_table(new_str) unless str == new_str
393
- return new_str
394
- end
345
+ # def process_template(str)
346
+ # scanner = StringScanner.new(str)
347
+ # result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
348
+ # parts = contents.split("|")
349
+ # case parts.size
350
+ # when 0
351
+ # ""
352
+ # when 1
353
+ # parts.first || ""
354
+ # else
355
+ # if parts.last.split("=").size > 1
356
+ # parts.first || ""
357
+ # else
358
+ # parts.last || ""
359
+ # end
360
+ # end
361
+ # end
362
+ # result
363
+ # end
364
+
365
+ # def remove_table(str)
366
+ # new_str = str.gsub($remove_table_regex, "")
367
+ # if str != new_str
368
+ # new_str = remove_table(new_str)
369
+ # end
370
+ # new_str = remove_table(new_str) unless str == new_str
371
+ # return new_str
372
+ # end
395
373
 
396
- def remove_clade(page)
397
- new_page = page.gsub($remove_clade_regex, "")
398
- new_page = remove_clade(new_page) unless page == new_page
399
- new_page
400
- end
374
+ # def remove_clade(page)
375
+ # new_page = page.gsub($remove_clade_regex, "")
376
+ # new_page = remove_clade(new_page) unless page == new_page
377
+ # new_page
378
+ # end
401
379
 
402
380
  #################### file related utilities ####################
403
381
 
@@ -1,3 +1,3 @@
1
1
  module Wp2txt
2
- VERSION = "0.7.0"
2
+ VERSION = "0.7.5"
3
3
  end
@@ -44,7 +44,7 @@ describe "Wp2txt" do
44
44
  describe "special_chr!" do
45
45
  it "replaces character references with real characters" do
46
46
  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
47
- str_after = " < > & \""
47
+ str_after = "  < > & \""
48
48
  special_chr!(str_before)
49
49
  expect(str_before).to eq str_after
50
50
  end
@@ -77,21 +77,22 @@ describe "Wp2txt" do
77
77
  end
78
78
  end
79
79
 
80
- describe "remove_table" do
80
+ describe "remove_table!" do
81
81
  it "removes table formated parts" do
82
82
  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
83
83
  str_after = ""
84
- expect(remove_table(str_before)).to eq str_after
84
+ remove_table!(str_before)
85
+ expect(str_before).to eq str_after
85
86
  end
86
87
  end
87
88
 
88
- describe "remove_clade" do
89
- it "removes clade formated parts" do
90
- str_before = "\{\{clade ... \n ... \n ... \n\}\}"
91
- str_after = ""
92
- expect(remove_clade(str_before)).to eq str_after
93
- end
94
- end
89
+ # describe "remove_clade" do
90
+ # it "removes clade formated parts" do
91
+ # str_before = "\{\{clade ... \n ... \n ... \n\}\}"
92
+ # str_after = ""
93
+ # expect(remove_clade(str_before)).to eq str_after
94
+ # end
95
+ # end
95
96
 
96
97
  describe "remove_hr!" do
97
98
  it "removes horizontal lines" do
@@ -102,15 +103,15 @@ describe "Wp2txt" do
102
103
  end
103
104
  end
104
105
 
105
- describe "remove_tag!" do
106
- it "removes tags" do
106
+ describe "remove_inbetween!" do
107
+ it "removes tags and its contents" do
107
108
  str_before = "<tag>abc</tag>"
108
109
  str_after = "abc"
109
110
  remove_tag!(str_before)
110
111
  expect(str_before).to eq str_after
111
112
  str_before = "[tag]def[/tag]"
112
113
  str_after = "def"
113
- remove_tag!(str_before, ['[', ']'])
114
+ remove_inbetween!(str_before, ['[', ']'])
114
115
  expect(str_before).to eq str_after
115
116
  end
116
117
  end
@@ -183,34 +184,34 @@ describe "Wp2txt" do
183
184
  end
184
185
  end
185
186
 
186
- describe "process_template" do
187
- it "removes brackets and leaving some text" do
188
- str_before = "{{}}"
189
- str_after = ""
190
- expect(process_template(str_before)).to eq str_after
191
- str_before = "{{lang|en|Japan}}"
192
- str_after = "Japan"
193
- expect(process_template(str_before)).to eq str_after
194
- str_before = "{{a|b=c|d=f}}"
195
- str_after = "a"
196
- expect(process_template(str_before)).to eq str_after
197
- str_before = "{{a|b|{{c|d|e}}}}"
198
- str_after = "e"
199
- expect(process_template(str_before)).to eq str_after
200
- end
201
- end
187
+ # describe "process_template" do
188
+ # it "removes brackets and leaving some text" do
189
+ # str_before = "{{}}"
190
+ # str_after = ""
191
+ # expect(process_template(str_before)).to eq str_after
192
+ # str_before = "{{lang|en|Japan}}"
193
+ # str_after = "Japan"
194
+ # expect(process_template(str_before)).to eq str_after
195
+ # str_before = "{{a|b=c|d=f}}"
196
+ # str_after = "a"
197
+ # expect(process_template(str_before)).to eq str_after
198
+ # str_before = "{{a|b|{{c|d|e}}}}"
199
+ # str_after = "e"
200
+ # expect(process_template(str_before)).to eq str_after
201
+ # end
202
+ # end
202
203
 
203
- # describe "expand_template" do
204
- # it "gets data corresponding to a given template using mediawiki api" do
205
- # uri = "http://en.wiktionary.org/w/api.php"
206
- # template = "{{en-verb}}"
207
- # word = "kick"
208
- # expanded = expand_template(uri, template, word)
209
- # html =<<EOD
210
- # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
211
- # EOD
212
- # html.strip!
213
- # expanded.should == html
214
- # end
215
- # end
204
+ # describe "expand_template" do
205
+ # it "gets data corresponding to a given template using mediawiki api" do
206
+ # uri = "http://en.wiktionary.org/w/api.php"
207
+ # template = "{{en-verb}}"
208
+ # word = "kick"
209
+ # expanded = expand_template(uri, template, word)
210
+ # html =<<EOD
211
+ # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
212
+ # EOD
213
+ # html.strip!
214
+ # expanded.should == html
215
+ # end
216
+ # end
216
217
  end
@@ -23,5 +23,6 @@ Gem::Specification.new do |s|
23
23
  # s.add_development_dependency "rake"
24
24
 
25
25
  s.add_dependency "nokogiri"
26
+ s.add_dependency "htmlentities"
26
27
  s.add_dependency "trollop"
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wp2txt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoichiro Hasebe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-29 00:00:00.000000000 Z
11
+ date: 2014-11-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: htmlentities
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: trollop
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -56,6 +70,7 @@ files:
56
70
  - bin/benchmark.rb
57
71
  - bin/wp2txt
58
72
  - data/testdata.bz2
73
+ - error_log.txt
59
74
  - lib/wp2txt.rb
60
75
  - lib/wp2txt/article.rb
61
76
  - lib/wp2txt/mw_api.rb