wp2txt 0.7.0 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/bin/benchmark.rb +10 -8
 - data/bin/wp2txt +4 -12
 - data/error_log.txt +1 -0
 - data/lib/wp2txt/utils.rb +101 -123
 - data/lib/wp2txt/version.rb +1 -1
 - data/spec/utils_spec.rb +43 -42
 - data/wp2txt.gemspec +1 -0
 - metadata +17 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 911e08e181a6bedb664b797d49183d0988daeba5
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 076d1349a8aa8cf454dac42bdce7b89a82f3fca0
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 4ebc035e4f1635f150294d8b79eb474457a280707a416688f3e7712bb7788d15888b6718bfd6f4e3a790e6fb8a7623e1415255fde913bfe658dd237fa7f599cd
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: ccee00a9e1b85186d52d0b3c07b52c04fff1ecd133ff245010943312cf37e279874b5f3a757880c005ad877e957df6a4176af2269f40b3c3210951530eb4c511
         
     | 
    
        data/bin/benchmark.rb
    CHANGED
    
    | 
         @@ -22,12 +22,13 @@ Benchmark.bm do |x| 
     | 
|
| 
       22 
22 
     | 
    
         
             
              x.report do
         
     | 
| 
       23 
23 
     | 
    
         
             
                wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, strip_tmarker)
         
     | 
| 
       24 
24 
     | 
    
         
             
                wpconv.extract_text do |article|
         
     | 
| 
       25 
     | 
    
         
            -
                   
     | 
| 
       26 
     | 
    
         
            -
                  title = "[[#{title}]]\n"
         
     | 
| 
      
 25 
     | 
    
         
            +
                  format_wiki!(article.title)
         
     | 
| 
      
 26 
     | 
    
         
            +
                  title = "[[#{article.title}]]\n"
         
     | 
| 
      
 27 
     | 
    
         
            +
                  convert_characters!(title)
         
     | 
| 
       27 
28 
     | 
    
         | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
             
     | 
| 
      
 29 
     | 
    
         
            +
                  contents = "\nCATEGORIES: "
         
     | 
| 
      
 30 
     | 
    
         
            +
                  contents += article.categories.join(", ")
         
     | 
| 
      
 31 
     | 
    
         
            +
                  contents += "\n\n"
         
     | 
| 
       31 
32 
     | 
    
         | 
| 
       32 
33 
     | 
    
         
             
                  article.elements.each do |e|
         
     | 
| 
       33 
34 
     | 
    
         
             
                    case e.first
         
     | 
| 
         @@ -55,10 +56,11 @@ Benchmark.bm do |x| 
     | 
|
| 
       55 
56 
     | 
    
         
             
                    else
         
     | 
| 
       56 
57 
     | 
    
         
             
                      next
         
     | 
| 
       57 
58 
     | 
    
         
             
                    end
         
     | 
| 
       58 
     | 
    
         
            -
                    contents  
     | 
| 
       59 
     | 
    
         
            -
                    remove_templates!(contents)
         
     | 
| 
      
 59 
     | 
    
         
            +
                    contents << line
         
     | 
| 
       60 
60 
     | 
    
         
             
                  end
         
     | 
| 
       61 
     | 
    
         
            -
             
     | 
| 
      
 61 
     | 
    
         
            +
                  format_article!(contents)
         
     | 
| 
      
 62 
     | 
    
         
            +
                  convert_characters!(contents)
         
     | 
| 
      
 63 
     | 
    
         
            +
             
     | 
| 
       62 
64 
     | 
    
         
             
                  ##### cleanup #####
         
     | 
| 
       63 
65 
     | 
    
         
             
                  if /\A\s*\z/m =~ contents
         
     | 
| 
       64 
66 
     | 
    
         
             
                    result = ""
         
     | 
    
        data/bin/wp2txt
    CHANGED
    
    | 
         @@ -50,6 +50,7 @@ convert = opts[:convert] 
     | 
|
| 
       50 
50 
     | 
    
         
             
            strip_tmarker = opts[:marker] ? false : true
         
     | 
| 
       51 
51 
     | 
    
         
             
            opt_array = [:title, :list, :heading, :table, :redirect]
         
     | 
| 
       52 
52 
     | 
    
         
             
            $leave_template = true if opts[:template]
         
     | 
| 
      
 53 
     | 
    
         
            +
            $leave_table = true if opts[:table]
         
     | 
| 
       53 
54 
     | 
    
         
             
            config = {}
         
     | 
| 
       54 
55 
     | 
    
         
             
            opt_array.each do |opt|
         
     | 
| 
       55 
56 
     | 
    
         
             
              config[opt] = opts[opt]
         
     | 
| 
         @@ -61,6 +62,7 @@ wpconv = Wp2txt::Runner.new(parent, input_file, output_dir, tfile_size, convert, 
     | 
|
| 
       61 
62 
     | 
    
         
             
            wpconv.extract_text do |article|
         
     | 
| 
       62 
63 
     | 
    
         
             
              format_wiki!(article.title)
         
     | 
| 
       63 
64 
     | 
    
         
             
              title = "[[#{article.title}]]\n"
         
     | 
| 
      
 65 
     | 
    
         
            +
              convert_characters!(title)
         
     | 
| 
       64 
66 
     | 
    
         | 
| 
       65 
67 
     | 
    
         
             
              if opts[:category] && !article.categories.empty?
         
     | 
| 
       66 
68 
     | 
    
         
             
                contents = "\nCATEGORIES: "
         
     | 
| 
         @@ -118,18 +120,8 @@ wpconv.extract_text do |article| 
     | 
|
| 
       118 
120 
     | 
    
         
             
                end
         
     | 
| 
       119 
121 
     | 
    
         
             
                contents << line    
         
     | 
| 
       120 
122 
     | 
    
         
             
              end
         
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
               
     | 
| 
       123 
     | 
    
         
            -
              remove_emphasis!(contents)
         
     | 
| 
       124 
     | 
    
         
            -
              mndash!(contents)
         
     | 
| 
       125 
     | 
    
         
            -
              make_reference!(contents)
         
     | 
| 
       126 
     | 
    
         
            -
              format_ref!(contents)
         
     | 
| 
       127 
     | 
    
         
            -
              remove_hr!(contents)
         
     | 
| 
       128 
     | 
    
         
            -
              remove_tag!(contents)
         
     | 
| 
       129 
     | 
    
         
            -
              special_chr!(contents)
         
     | 
| 
       130 
     | 
    
         
            -
             
     | 
| 
       131 
     | 
    
         
            -
              correct_inline_template!(contents) unless $leave_template
         
     | 
| 
       132 
     | 
    
         
            -
              remove_templates!(contents) unless $leave_template
         
     | 
| 
      
 123 
     | 
    
         
            +
              format_article!(contents)
         
     | 
| 
      
 124 
     | 
    
         
            +
              convert_characters!(contents)
         
     | 
| 
       133 
125 
     | 
    
         | 
| 
       134 
126 
     | 
    
         
             
              ##### cleanup #####
         
     | 
| 
       135 
127 
     | 
    
         
             
              if /\A\s*\z/m =~ contents
         
     | 
    
        data/error_log.txt
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            [[アンパサンド]]
         
     | 
    
        data/lib/wp2txt/utils.rb
    CHANGED
    
    | 
         @@ -3,6 +3,7 @@ 
     | 
|
| 
       3 
3 
     | 
    
         | 
| 
       4 
4 
     | 
    
         
             
            require 'strscan'
         
     | 
| 
       5 
5 
     | 
    
         
             
            require 'find'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'htmlentities'
         
     | 
| 
       6 
7 
     | 
    
         | 
| 
       7 
8 
     | 
    
         
             
            ###################################################
         
     | 
| 
       8 
9 
     | 
    
         
             
            # global variables to save resource for generating regexps
         
     | 
| 
         @@ -10,6 +11,12 @@ require 'find' 
     | 
|
| 
       10 
11 
     | 
    
         
             
            # those with a trailing number 2 represent closing tag/markup
         
     | 
| 
       11 
12 
     | 
    
         
             
            # those without a trailing number contain both opening/closing tags/markups
         
     | 
| 
       12 
13 
     | 
    
         | 
| 
      
 14 
     | 
    
         
            +
            $html_decoder = HTMLEntities.new
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            $entities = [' ', '<', '>', '&', '"'].zip([' ', '<', '>', '&', '"'])
         
     | 
| 
      
 17 
     | 
    
         
            +
            $html_hash  = Hash[*$entities.flatten]
         
     | 
| 
      
 18 
     | 
    
         
            +
            $html_regex = Regexp.new("(" + $html_hash.keys.join("|") + ")")
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
       13 
20 
     | 
    
         
             
            $in_template_regex = Regexp.new('^\s*\{\{[^\}]+\}\}\s*$')
         
     | 
| 
       14 
21 
     | 
    
         
             
            $in_link_regex = Regexp.new('^\s*\[.*\]\s*$')
         
     | 
| 
       15 
22 
     | 
    
         | 
| 
         @@ -43,6 +50,9 @@ $blank_line_regex = Regexp.new('^\s*$') 
     | 
|
| 
       43 
50 
     | 
    
         | 
| 
       44 
51 
     | 
    
         
             
            $redirect_regex = Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
         
     | 
| 
       45 
52 
     | 
    
         | 
| 
      
 53 
     | 
    
         
            +
            $remove_tag_regex = Regexp.new("\<[^\<\>]*\>")
         
     | 
| 
      
 54 
     | 
    
         
            +
            $remove_directives_regex = Regexp.new("\_\_[^\_]*\_\_")
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
       46 
56 
     | 
    
         
             
            $remove_emphasis_regex = Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
         
     | 
| 
       47 
57 
     | 
    
         
             
            $chrref_to_utf_regex = Regexp.new('&#(x?)([0-9a-fA-F]+);')
         
     | 
| 
       48 
58 
     | 
    
         
             
            $mndash_regex = Regexp.new('\{(mdash|ndash|–)\}')
         
     | 
| 
         @@ -58,8 +68,8 @@ $list_marks_regex = Regexp.new('\A[\*\#\;\:\ ]+') 
     | 
|
| 
       58 
68 
     | 
    
         
             
            $pre_marks_regex = Regexp.new('\A\^\ ')
         
     | 
| 
       59 
69 
     | 
    
         
             
            $def_marks_regex = Regexp.new('\A[\;\:\ ]+')
         
     | 
| 
       60 
70 
     | 
    
         
             
            $onset_bar_regex = Regexp.new('\A[^\|]+\z')
         
     | 
| 
       61 
     | 
    
         
            -
            $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
         
     | 
| 
       62 
     | 
    
         
            -
            $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
         
     | 
| 
      
 71 
     | 
    
         
            +
            # $remove_table_regex = Regexp.new('\{\|[^\{\|\}]*?\|\}', Regexp::MULTILINE)
         
     | 
| 
      
 72 
     | 
    
         
            +
            # $remove_clade_regex = Regexp.new('\{\{(?:C|c)lade[^\{\}]*\}\}', Regexp::MULTILINE)
         
     | 
| 
       63 
73 
     | 
    
         | 
| 
       64 
74 
     | 
    
         
             
            $category_patterns = ["Category", "Categoria"].join("|")
         
     | 
| 
       65 
75 
     | 
    
         
             
            $category_regex = Regexp.new('[\{\[\|\b](?:' + $category_patterns + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
         
     | 
| 
         @@ -74,22 +84,16 @@ $single_square_bracket_regex = Regexp.new("(#{Regexp.escape('[')}|#{Regexp.escap 
     | 
|
| 
       74 
84 
     | 
    
         
             
            $double_square_bracket_regex = Regexp.new("(#{Regexp.escape('[[')}|#{Regexp.escape(']]')})", Regexp::MULTILINE)
         
     | 
| 
       75 
85 
     | 
    
         
             
            $single_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{')}|#{Regexp.escape('}')})", Regexp::MULTILINE)
         
     | 
| 
       76 
86 
     | 
    
         
             
            $double_curly_bracket_regex = Regexp.new("(#{Regexp.escape('{{')}|#{Regexp.escape('}}')})", Regexp::MULTILINE)
         
     | 
| 
       77 
     | 
    
         
            -
             
     | 
| 
      
 87 
     | 
    
         
            +
            $curly_square_bracket_regex = Regexp.new("(#{Regexp.escape('{|')}|#{Regexp.escape('|}')})", Regexp::MULTILINE)
         
     | 
| 
       78 
88 
     | 
    
         
             
            ###################################################
         
     | 
| 
       79 
89 
     | 
    
         | 
| 
       80 
90 
     | 
    
         
             
            module Wp2txt
         
     | 
| 
       81 
91 
     | 
    
         | 
| 
       82 
     | 
    
         
            -
              def  
     | 
| 
      
 92 
     | 
    
         
            +
              def convert_characters!(text, has_retried = false)
         
     | 
| 
       83 
93 
     | 
    
         
             
                begin 
         
     | 
| 
       84 
94 
     | 
    
         
             
                  text << "" 
         
     | 
| 
       85 
     | 
    
         
            -
                  
         
     | 
| 
       86 
95 
     | 
    
         
             
                  chrref_to_utf!(text)
         
     | 
| 
       87 
     | 
    
         
            -
                   
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
                  process_interwiki_links!(text)
         
     | 
| 
       90 
     | 
    
         
            -
                  process_external_links!(text)
         
     | 
| 
       91 
     | 
    
         
            -
             
     | 
| 
       92 
     | 
    
         
            -
                  unescape_nowiki!(text)
         
     | 
| 
      
 96 
     | 
    
         
            +
                  special_chr!(text)
         
     | 
| 
       93 
97 
     | 
    
         | 
| 
       94 
98 
     | 
    
         
             
                rescue # detect invalid byte sequence in UTF-8
         
     | 
| 
       95 
99 
     | 
    
         
             
                  if has_retried
         
     | 
| 
         @@ -102,11 +106,34 @@ module Wp2txt 
     | 
|
| 
       102 
106 
     | 
    
         
             
                  else
         
     | 
| 
       103 
107 
     | 
    
         
             
                    text.encode!("UTF-16")
         
     | 
| 
       104 
108 
     | 
    
         
             
                    text.encode!("UTF-8")
         
     | 
| 
       105 
     | 
    
         
            -
                     
     | 
| 
      
 109 
     | 
    
         
            +
                    convert_characters!(text, true)
         
     | 
| 
       106 
110 
     | 
    
         
             
                  end
         
     | 
| 
       107 
111 
     | 
    
         
             
                end
         
     | 
| 
       108 
112 
     | 
    
         
             
              end
         
     | 
| 
      
 113 
     | 
    
         
            +
              
         
     | 
| 
      
 114 
     | 
    
         
            +
              def format_wiki!(text, has_retried = false)
         
     | 
| 
      
 115 
     | 
    
         
            +
                escape_nowiki!(text)
         
     | 
| 
      
 116 
     | 
    
         
            +
             
     | 
| 
      
 117 
     | 
    
         
            +
                process_interwiki_links!(text)
         
     | 
| 
      
 118 
     | 
    
         
            +
                process_external_links!(text)
         
     | 
| 
       109 
119 
     | 
    
         | 
| 
      
 120 
     | 
    
         
            +
                unescape_nowiki!(text)      
         
     | 
| 
      
 121 
     | 
    
         
            +
              end
         
     | 
| 
      
 122 
     | 
    
         
            +
              
         
     | 
| 
      
 123 
     | 
    
         
            +
              def format_article!(text)
         
     | 
| 
      
 124 
     | 
    
         
            +
                remove_directive!(text)
         
     | 
| 
      
 125 
     | 
    
         
            +
                remove_emphasis!(text)
         
     | 
| 
      
 126 
     | 
    
         
            +
                mndash!(text)
         
     | 
| 
      
 127 
     | 
    
         
            +
                make_reference!(text)
         
     | 
| 
      
 128 
     | 
    
         
            +
                format_ref!(text)
         
     | 
| 
      
 129 
     | 
    
         
            +
                remove_hr!(text)
         
     | 
| 
      
 130 
     | 
    
         
            +
                remove_tag!(text)
         
     | 
| 
      
 131 
     | 
    
         
            +
                convert_characters!(text)    
         
     | 
| 
      
 132 
     | 
    
         
            +
                correct_inline_template!(text) unless $leave_template
         
     | 
| 
      
 133 
     | 
    
         
            +
                remove_templates!(text) unless $leave_template
         
     | 
| 
      
 134 
     | 
    
         
            +
                remove_table!(text) unless $leave_table
         
     | 
| 
      
 135 
     | 
    
         
            +
              end
         
     | 
| 
      
 136 
     | 
    
         
            +
              
         
     | 
| 
       110 
137 
     | 
    
         
             
              #################### parser for nested structure ####################
         
     | 
| 
       111 
138 
     | 
    
         | 
| 
       112 
139 
     | 
    
         
             
              def process_nested_structure(scanner, left, right, recur_count, &block)
         
     | 
| 
         @@ -120,6 +147,8 @@ module Wp2txt 
     | 
|
| 
       120 
147 
     | 
    
         
             
                  regex = $single_curly_bracket_regex
         
     | 
| 
       121 
148 
     | 
    
         
             
                elsif left == "{{" && right == "}}"
         
     | 
| 
       122 
149 
     | 
    
         
             
                  regex = $double_curly_bracket_regex
         
     | 
| 
      
 150 
     | 
    
         
            +
                elsif left == "{|" && right == "|}"
         
     | 
| 
      
 151 
     | 
    
         
            +
                  regex = $curly_square_bracket_regex
         
     | 
| 
       123 
152 
     | 
    
         
             
                else
         
     | 
| 
       124 
153 
     | 
    
         
             
                  regex = Regexp.new('(#{Regexp.escape(left)}|#{Regexp.escape(right)})', Regexp::MULTILINE)
         
     | 
| 
       125 
154 
     | 
    
         
             
                end
         
     | 
| 
         @@ -154,15 +183,6 @@ module Wp2txt 
     | 
|
| 
       154 
183 
     | 
    
         
             
              end  
         
     | 
| 
       155 
184 
     | 
    
         | 
| 
       156 
185 
     | 
    
         
             
              #################### methods used from format_wiki ####################
         
     | 
| 
       157 
     | 
    
         
            -
             
     | 
| 
       158 
     | 
    
         
            -
              def remove_templates!(str)
         
     | 
| 
       159 
     | 
    
         
            -
                scanner = StringScanner.new(str)
         
     | 
| 
       160 
     | 
    
         
            -
                result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
         
     | 
| 
       161 
     | 
    
         
            -
                  ""
         
     | 
| 
       162 
     | 
    
         
            -
                end
         
     | 
| 
       163 
     | 
    
         
            -
                str.replace(result)
         
     | 
| 
       164 
     | 
    
         
            -
              end
         
     | 
| 
       165 
     | 
    
         
            -
              
         
     | 
| 
       166 
186 
     | 
    
         
             
              def escape_nowiki!(str)
         
     | 
| 
       167 
187 
     | 
    
         
             
                if @nowikis
         
     | 
| 
       168 
188 
     | 
    
         
             
                  @nowikis.clear
         
     | 
| 
         @@ -213,80 +233,42 @@ module Wp2txt 
     | 
|
| 
       213 
233 
     | 
    
         
             
                str.replace(result)
         
     | 
| 
       214 
234 
     | 
    
         
             
              end
         
     | 
| 
       215 
235 
     | 
    
         | 
| 
       216 
     | 
    
         
            -
               
     | 
| 
       217 
     | 
    
         
            -
             
     | 
| 
       218 
     | 
    
         
            -
             
     | 
| 
       219 
     | 
    
         
            -
             
     | 
| 
       220 
     | 
    
         
            -
             
     | 
| 
       221 
     | 
    
         
            -
                   
     | 
| 
       222 
     | 
    
         
            -
                  'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 
         
     | 
| 
       223 
     | 
    
         
            -
                  'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ñ', 
         
     | 
| 
       224 
     | 
    
         
            -
                  'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 
         
     | 
| 
       225 
     | 
    
         
            -
                  'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à', 
         
     | 
| 
       226 
     | 
    
         
            -
                  'á', 'â', 'ã', 'ä', 'å', 'æ', 
         
     | 
| 
       227 
     | 
    
         
            -
                  'ç', 'è', 'é', 'ê', 'ë', 'ì', 
         
     | 
| 
       228 
     | 
    
         
            -
                  'í', 'î', 'ï', 'ñ', 'ò', 'ó',
         
     | 
| 
       229 
     | 
    
         
            -
                  'ô', 'œ', 'õ', 'ö', 'ø', 'ù', 
         
     | 
| 
       230 
     | 
    
         
            -
                  'ú', 'û', 'ü', 'ÿ']\
         
     | 
| 
       231 
     | 
    
         
            -
                  .zip(['À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 
         
     | 
| 
       232 
     | 
    
         
            -
                  'Î', 'Ï', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'ß', 'à', 
         
     | 
| 
       233 
     | 
    
         
            -
                  'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 
         
     | 
| 
       234 
     | 
    
         
            -
                  'ñ', 'ò', 'ó', 'ô','œ', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ÿ'])
         
     | 
| 
       235 
     | 
    
         
            -
              
         
     | 
| 
       236 
     | 
    
         
            -
                  punctuation = ['¿', '¡', '«', '»', '§', 
         
     | 
| 
       237 
     | 
    
         
            -
                  '¶', '†', '‡', '•', '–', '—']\
         
     | 
| 
       238 
     | 
    
         
            -
                  .zip(['¿', '¡', '«', '»', '§', '¶', '†', '‡', '•', '–', '—'])
         
     | 
| 
       239 
     | 
    
         
            -
              
         
     | 
| 
       240 
     | 
    
         
            -
                  commercial = ['™', '©', '®', '¢', '€', '¥',
         
     | 
| 
       241 
     | 
    
         
            -
                  '£', '¤'].zip(['™', '©', '®', '¢', '€', '¥', '£', '¤'])
         
     | 
| 
       242 
     | 
    
         
            -
              
         
     | 
| 
       243 
     | 
    
         
            -
                  greek_chr = ['α', 'β', 'γ', 'δ', 'ε', 
         
     | 
| 
       244 
     | 
    
         
            -
                  'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 
         
     | 
| 
       245 
     | 
    
         
            -
                  'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 
         
     | 
| 
       246 
     | 
    
         
            -
                  'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'Γ', 
         
     | 
| 
       247 
     | 
    
         
            -
                  'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 
         
     | 
| 
       248 
     | 
    
         
            -
                  'Ψ', 'Ω']\
         
     | 
| 
       249 
     | 
    
         
            -
                  .zip(['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 
         
     | 
| 
       250 
     | 
    
         
            -
                  'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 
         
     | 
| 
       251 
     | 
    
         
            -
                  'ψ', 'ω', 'Γ', 'Δ', 'Θ', 'Λ', 'Ξ', 'Π', 'Σ', 'Φ', 'Ψ', 'Ω'])
         
     | 
| 
       252 
     | 
    
         
            -
              
         
     | 
| 
       253 
     | 
    
         
            -
                  math_chr1 = ['∫', '∑', '∏', '√', '−', '±',
         
     | 
| 
       254 
     | 
    
         
            -
                  '∞', '≈', '∝', '≡', '≠', '≤', '≥', 
         
     | 
| 
       255 
     | 
    
         
            -
                  '×', '·', '÷', '∂', '′', '″', 
         
     | 
| 
       256 
     | 
    
         
            -
                  '∇', '‰', '°', '∴', 'ø', '∈', '∩', 
         
     | 
| 
       257 
     | 
    
         
            -
                  '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', 
         
     | 
| 
       258 
     | 
    
         
            -
                  '∃', '∀', '⇒', '⇔', '→', '↔', '↑']\
         
     | 
| 
       259 
     | 
    
         
            -
                  .zip(['∫', '∑', '∏', '√', '−', '±', '∞', '≈', '∝', '≡', '≠', '≤', 
         
     | 
| 
       260 
     | 
    
         
            -
                  '≥', '×', '·', '÷', '∂', '′', '″', '∇', '‰', '°', '∴', 'ø', '∈', 
         
     | 
| 
       261 
     | 
    
         
            -
                  '∩', '∪', '⊂', '⊃', '⊆', '⊇', '¬', '∧', '∨', '∃', '∀', '⇒', 
         
     | 
| 
       262 
     | 
    
         
            -
                  '⇔', '→', '↔', '↑'])
         
     | 
| 
       263 
     | 
    
         
            -
              
         
     | 
| 
       264 
     | 
    
         
            -
                  math_chr2 = ['ℵ', '∉'].zip(['ℵ', '∉'])
         
     | 
| 
       265 
     | 
    
         
            -
              
         
     | 
| 
       266 
     | 
    
         
            -
                  others = ['¨', 'ª', 
         
     | 
| 
       267 
     | 
    
         
            -
                  '¯', '´', 'µ', '¸', 'º', '‘', '’', 
         
     | 
| 
       268 
     | 
    
         
            -
                  '“', '‚', '”', '„', '♠', '♣', '◊', 
         
     | 
| 
       269 
     | 
    
         
            -
                  '♥', '←', '♦', '‹', '›', '↓']\
         
     | 
| 
       270 
     | 
    
         
            -
                  .zip(['¨', 'ª', '¯', '´', 'µ', '¸', 'º', '‘', '’', '“', '‚', '”', 
         
     | 
| 
       271 
     | 
    
         
            -
                  '„', '♠', '♣', '◊', '♥', '←', '♦', '‹', '›', '↓'] )
         
     | 
| 
       272 
     | 
    
         
            -
              
         
     | 
| 
       273 
     | 
    
         
            -
                  spc_array = html + umraut_accent + punctuation + commercial + greek_chr + 
         
     | 
| 
       274 
     | 
    
         
            -
                              math_chr1 + math_chr2 + others
         
     | 
| 
       275 
     | 
    
         
            -
                  $sp_hash  = Hash[*spc_array.flatten]
         
     | 
| 
       276 
     | 
    
         
            -
                  $sp_regex = Regexp.new("(" + $sp_hash.keys.join("|") + ")")
         
     | 
| 
      
 236 
     | 
    
         
            +
              #################### methods used from format_article ####################
         
     | 
| 
      
 237 
     | 
    
         
            +
             
     | 
| 
      
 238 
     | 
    
         
            +
              def remove_templates!(str)
         
     | 
| 
      
 239 
     | 
    
         
            +
                scanner = StringScanner.new(str)
         
     | 
| 
      
 240 
     | 
    
         
            +
                result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
         
     | 
| 
      
 241 
     | 
    
         
            +
                  ""
         
     | 
| 
       277 
242 
     | 
    
         
             
                end
         
     | 
| 
       278 
     | 
    
         
            -
                 
     | 
| 
       279 
     | 
    
         
            -
             
     | 
| 
       280 
     | 
    
         
            -
             
     | 
| 
      
 243 
     | 
    
         
            +
                str.replace(result)
         
     | 
| 
      
 244 
     | 
    
         
            +
              end
         
     | 
| 
      
 245 
     | 
    
         
            +
              
         
     | 
| 
      
 246 
     | 
    
         
            +
              def remove_table!(str)
         
     | 
| 
      
 247 
     | 
    
         
            +
                scanner = StringScanner.new(str)
         
     | 
| 
      
 248 
     | 
    
         
            +
                result = process_nested_structure(scanner, "{|", "|}", $limit_recur) do |contents|
         
     | 
| 
      
 249 
     | 
    
         
            +
                  ""
         
     | 
| 
       281 
250 
     | 
    
         
             
                end
         
     | 
| 
      
 251 
     | 
    
         
            +
                str.replace(result)
         
     | 
| 
      
 252 
     | 
    
         
            +
              end
         
     | 
| 
      
 253 
     | 
    
         
            +
              
         
     | 
| 
      
 254 
     | 
    
         
            +
              def special_chr!(str)
         
     | 
| 
      
 255 
     | 
    
         
            +
                str.replace $html_decoder.decode(str)
         
     | 
| 
       282 
256 
     | 
    
         
             
              end
         
     | 
| 
       283 
257 
     | 
    
         | 
| 
       284 
     | 
    
         
            -
              def  
     | 
| 
      
 258 
     | 
    
         
            +
              def remove_inbetween!(str, tagset = ['<', '>'])
         
     | 
| 
       285 
259 
     | 
    
         
             
                tagsets = Regexp.quote(tagset.uniq.join(""))
         
     | 
| 
       286 
260 
     | 
    
         
             
                regex = /#{Regexp.escape(tagset[0])}[^#{tagsets}]*#{Regexp.escape(tagset[1])}/
         
     | 
| 
       287 
261 
     | 
    
         
             
                str.gsub!(regex, "")
         
     | 
| 
       288 
262 
     | 
    
         
             
              end
         
     | 
| 
       289 
263 
     | 
    
         | 
| 
      
 264 
     | 
    
         
            +
              def remove_tag!(str)
         
     | 
| 
      
 265 
     | 
    
         
            +
                str.gsub!($remove_tag_regex, "")
         
     | 
| 
      
 266 
     | 
    
         
            +
              end
         
     | 
| 
      
 267 
     | 
    
         
            +
             
     | 
| 
      
 268 
     | 
    
         
            +
              def remove_directive!(str)
         
     | 
| 
      
 269 
     | 
    
         
            +
                str.gsub!($remove_directives_regex, "")
         
     | 
| 
      
 270 
     | 
    
         
            +
              end
         
     | 
| 
      
 271 
     | 
    
         
            +
             
     | 
| 
       290 
272 
     | 
    
         
             
              def remove_emphasis!(str)
         
     | 
| 
       291 
273 
     | 
    
         
             
                str.gsub!($remove_emphasis_regex) do
         
     | 
| 
       292 
274 
     | 
    
         
             
                  $2
         
     | 
| 
         @@ -311,10 +293,6 @@ module Wp2txt 
     | 
|
| 
       311 
293 
     | 
    
         
             
                end
         
     | 
| 
       312 
294 
     | 
    
         
             
                return true
         
     | 
| 
       313 
295 
     | 
    
         
             
              end
         
     | 
| 
       314 
     | 
    
         
            -
             
     | 
| 
       315 
     | 
    
         
            -
              def remove_directive!(str)
         
     | 
| 
       316 
     | 
    
         
            -
                remove_tag!(str, ['__', '__'])
         
     | 
| 
       317 
     | 
    
         
            -
              end
         
     | 
| 
       318 
296 
     | 
    
         | 
| 
       319 
297 
     | 
    
         
             
              def mndash!(str)
         
     | 
| 
       320 
298 
     | 
    
         
             
                str.gsub!($mndash_regex, "–")
         
     | 
| 
         @@ -364,40 +342,40 @@ module Wp2txt 
     | 
|
| 
       364 
342 
     | 
    
         | 
| 
       365 
343 
     | 
    
         
             
              #################### methods currently unused ####################
         
     | 
| 
       366 
344 
     | 
    
         | 
| 
       367 
     | 
    
         
            -
              def process_template(str)
         
     | 
| 
       368 
     | 
    
         
            -
             
     | 
| 
       369 
     | 
    
         
            -
             
     | 
| 
       370 
     | 
    
         
            -
             
     | 
| 
       371 
     | 
    
         
            -
             
     | 
| 
       372 
     | 
    
         
            -
             
     | 
| 
       373 
     | 
    
         
            -
             
     | 
| 
       374 
     | 
    
         
            -
             
     | 
| 
       375 
     | 
    
         
            -
             
     | 
| 
       376 
     | 
    
         
            -
             
     | 
| 
       377 
     | 
    
         
            -
             
     | 
| 
       378 
     | 
    
         
            -
             
     | 
| 
       379 
     | 
    
         
            -
             
     | 
| 
       380 
     | 
    
         
            -
             
     | 
| 
       381 
     | 
    
         
            -
             
     | 
| 
       382 
     | 
    
         
            -
             
     | 
| 
       383 
     | 
    
         
            -
             
     | 
| 
       384 
     | 
    
         
            -
             
     | 
| 
       385 
     | 
    
         
            -
              end
         
     | 
| 
       386 
     | 
    
         
            -
             
     | 
| 
       387 
     | 
    
         
            -
              def remove_table(str)
         
     | 
| 
       388 
     | 
    
         
            -
             
     | 
| 
       389 
     | 
    
         
            -
             
     | 
| 
       390 
     | 
    
         
            -
             
     | 
| 
       391 
     | 
    
         
            -
             
     | 
| 
       392 
     | 
    
         
            -
             
     | 
| 
       393 
     | 
    
         
            -
             
     | 
| 
       394 
     | 
    
         
            -
              end
         
     | 
| 
      
 345 
     | 
    
         
            +
              # def process_template(str)
         
     | 
| 
      
 346 
     | 
    
         
            +
              #   scanner = StringScanner.new(str)
         
     | 
| 
      
 347 
     | 
    
         
            +
              #   result = process_nested_structure(scanner, "{{", "}}", $limit_recur) do |contents|
         
     | 
| 
      
 348 
     | 
    
         
            +
              #     parts = contents.split("|")
         
     | 
| 
      
 349 
     | 
    
         
            +
              #     case parts.size
         
     | 
| 
      
 350 
     | 
    
         
            +
              #     when 0
         
     | 
| 
      
 351 
     | 
    
         
            +
              #       ""
         
     | 
| 
      
 352 
     | 
    
         
            +
              #     when 1
         
     | 
| 
      
 353 
     | 
    
         
            +
              #       parts.first || ""
         
     | 
| 
      
 354 
     | 
    
         
            +
              #     else
         
     | 
| 
      
 355 
     | 
    
         
            +
              #       if parts.last.split("=").size > 1
         
     | 
| 
      
 356 
     | 
    
         
            +
              #         parts.first || ""
         
     | 
| 
      
 357 
     | 
    
         
            +
              #       else
         
     | 
| 
      
 358 
     | 
    
         
            +
              #         parts.last || ""
         
     | 
| 
      
 359 
     | 
    
         
            +
              #       end
         
     | 
| 
      
 360 
     | 
    
         
            +
              #     end
         
     | 
| 
      
 361 
     | 
    
         
            +
              #   end
         
     | 
| 
      
 362 
     | 
    
         
            +
              #   result
         
     | 
| 
      
 363 
     | 
    
         
            +
              # end
         
     | 
| 
      
 364 
     | 
    
         
            +
             
     | 
| 
      
 365 
     | 
    
         
            +
              # def remove_table(str)
         
     | 
| 
      
 366 
     | 
    
         
            +
              #   new_str = str.gsub($remove_table_regex, "")
         
     | 
| 
      
 367 
     | 
    
         
            +
              #   if str != new_str
         
     | 
| 
      
 368 
     | 
    
         
            +
              #     new_str = remove_table(new_str)
         
     | 
| 
      
 369 
     | 
    
         
            +
              #   end
         
     | 
| 
      
 370 
     | 
    
         
            +
              #   new_str = remove_table(new_str) unless str == new_str
         
     | 
| 
      
 371 
     | 
    
         
            +
              #   return new_str
         
     | 
| 
      
 372 
     | 
    
         
            +
              # end
         
     | 
| 
       395 
373 
     | 
    
         | 
| 
       396 
     | 
    
         
            -
              def remove_clade(page)
         
     | 
| 
       397 
     | 
    
         
            -
             
     | 
| 
       398 
     | 
    
         
            -
             
     | 
| 
       399 
     | 
    
         
            -
             
     | 
| 
       400 
     | 
    
         
            -
              end
         
     | 
| 
      
 374 
     | 
    
         
            +
              # def remove_clade(page)
         
     | 
| 
      
 375 
     | 
    
         
            +
              #   new_page = page.gsub($remove_clade_regex, "")
         
     | 
| 
      
 376 
     | 
    
         
            +
              #   new_page = remove_clade(new_page) unless page == new_page
         
     | 
| 
      
 377 
     | 
    
         
            +
              #   new_page
         
     | 
| 
      
 378 
     | 
    
         
            +
              # end
         
     | 
| 
       401 
379 
     | 
    
         | 
| 
       402 
380 
     | 
    
         
             
              #################### file related utilities ####################
         
     | 
| 
       403 
381 
     | 
    
         | 
    
        data/lib/wp2txt/version.rb
    CHANGED
    
    
    
        data/spec/utils_spec.rb
    CHANGED
    
    | 
         @@ -44,7 +44,7 @@ describe "Wp2txt" do 
     | 
|
| 
       44 
44 
     | 
    
         
             
              describe "special_chr!" do
         
     | 
| 
       45 
45 
     | 
    
         
             
                it "replaces character references with real characters" do
         
     | 
| 
       46 
46 
     | 
    
         
             
                  str_before = "&nbsp; &lt; &gt; &amp; &quot;"
         
     | 
| 
       47 
     | 
    
         
            -
                  str_after  = " 
     | 
| 
      
 47 
     | 
    
         
            +
                  str_after  = "  < > & \""
         
     | 
| 
       48 
48 
     | 
    
         
             
                  special_chr!(str_before)
         
     | 
| 
       49 
49 
     | 
    
         
             
                  expect(str_before).to eq str_after
         
     | 
| 
       50 
50 
     | 
    
         
             
                end    
         
     | 
| 
         @@ -77,21 +77,22 @@ describe "Wp2txt" do 
     | 
|
| 
       77 
77 
     | 
    
         
             
                end    
         
     | 
| 
       78 
78 
     | 
    
         
             
              end
         
     | 
| 
       79 
79 
     | 
    
         | 
| 
       80 
     | 
    
         
            -
              describe "remove_table" do
         
     | 
| 
      
 80 
     | 
    
         
            +
              describe "remove_table!" do
         
     | 
| 
       81 
81 
     | 
    
         
             
                it "removes table formated parts" do
         
     | 
| 
       82 
82 
     | 
    
         
             
                  str_before = "{| ... \n{| ... \n ...|}\n ...|}"
         
     | 
| 
       83 
83 
     | 
    
         
             
                  str_after  = ""
         
     | 
| 
       84 
     | 
    
         
            -
                   
     | 
| 
      
 84 
     | 
    
         
            +
                  remove_table!(str_before)
         
     | 
| 
      
 85 
     | 
    
         
            +
                  expect(str_before).to eq str_after
         
     | 
| 
       85 
86 
     | 
    
         
             
                end    
         
     | 
| 
       86 
87 
     | 
    
         
             
              end
         
     | 
| 
       87 
88 
     | 
    
         | 
| 
       88 
     | 
    
         
            -
              describe "remove_clade" do
         
     | 
| 
       89 
     | 
    
         
            -
             
     | 
| 
       90 
     | 
    
         
            -
             
     | 
| 
       91 
     | 
    
         
            -
             
     | 
| 
       92 
     | 
    
         
            -
             
     | 
| 
       93 
     | 
    
         
            -
             
     | 
| 
       94 
     | 
    
         
            -
              end
         
     | 
| 
      
 89 
     | 
    
         
            +
              # describe "remove_clade" do
         
     | 
| 
      
 90 
     | 
    
         
            +
              #   it "removes clade formated parts" do
         
     | 
| 
      
 91 
     | 
    
         
            +
              #     str_before = "\{\{clade ... \n ... \n ... \n\}\}"
         
     | 
| 
      
 92 
     | 
    
         
            +
              #     str_after  = ""
         
     | 
| 
      
 93 
     | 
    
         
            +
              #     expect(remove_clade(str_before)).to eq str_after
         
     | 
| 
      
 94 
     | 
    
         
            +
              #   end
         
     | 
| 
      
 95 
     | 
    
         
            +
              # end
         
     | 
| 
       95 
96 
     | 
    
         | 
| 
       96 
97 
     | 
    
         
             
              describe "remove_hr!" do
         
     | 
| 
       97 
98 
     | 
    
         
             
                it "removes horizontal lines" do
         
     | 
| 
         @@ -102,15 +103,15 @@ describe "Wp2txt" do 
     | 
|
| 
       102 
103 
     | 
    
         
             
                end    
         
     | 
| 
       103 
104 
     | 
    
         
             
              end
         
     | 
| 
       104 
105 
     | 
    
         | 
| 
       105 
     | 
    
         
            -
              describe " 
     | 
| 
       106 
     | 
    
         
            -
                it "removes tags" do
         
     | 
| 
      
 106 
     | 
    
         
            +
              describe "remove_inbetween!" do
         
     | 
| 
      
 107 
     | 
    
         
            +
                it "removes tags and its contents" do
         
     | 
| 
       107 
108 
     | 
    
         
             
                  str_before = "<tag>abc</tag>"
         
     | 
| 
       108 
109 
     | 
    
         
             
                  str_after  = "abc"
         
     | 
| 
       109 
110 
     | 
    
         
             
                  remove_tag!(str_before)
         
     | 
| 
       110 
111 
     | 
    
         
             
                  expect(str_before).to eq str_after
         
     | 
| 
       111 
112 
     | 
    
         
             
                  str_before = "[tag]def[/tag]"
         
     | 
| 
       112 
113 
     | 
    
         
             
                  str_after  = "def"
         
     | 
| 
       113 
     | 
    
         
            -
                   
     | 
| 
      
 114 
     | 
    
         
            +
                  remove_inbetween!(str_before, ['[', ']'])
         
     | 
| 
       114 
115 
     | 
    
         
             
                  expect(str_before).to eq str_after
         
     | 
| 
       115 
116 
     | 
    
         
             
                end    
         
     | 
| 
       116 
117 
     | 
    
         
             
              end
         
     | 
| 
         @@ -183,34 +184,34 @@ describe "Wp2txt" do 
     | 
|
| 
       183 
184 
     | 
    
         
             
                end
         
     | 
| 
       184 
185 
     | 
    
         
             
              end
         
     | 
| 
       185 
186 
     | 
    
         | 
| 
       186 
     | 
    
         
            -
              describe "process_template" do
         
     | 
| 
       187 
     | 
    
         
            -
             
     | 
| 
       188 
     | 
    
         
            -
             
     | 
| 
       189 
     | 
    
         
            -
             
     | 
| 
       190 
     | 
    
         
            -
             
     | 
| 
       191 
     | 
    
         
            -
             
     | 
| 
       192 
     | 
    
         
            -
             
     | 
| 
       193 
     | 
    
         
            -
             
     | 
| 
       194 
     | 
    
         
            -
             
     | 
| 
       195 
     | 
    
         
            -
             
     | 
| 
       196 
     | 
    
         
            -
             
     | 
| 
       197 
     | 
    
         
            -
             
     | 
| 
       198 
     | 
    
         
            -
             
     | 
| 
       199 
     | 
    
         
            -
             
     | 
| 
       200 
     | 
    
         
            -
             
     | 
| 
       201 
     | 
    
         
            -
              end
         
     | 
| 
      
 187 
     | 
    
         
            +
              # describe "process_template" do
         
     | 
| 
      
 188 
     | 
    
         
            +
              #   it "removes brackets and leaving some text" do
         
     | 
| 
      
 189 
     | 
    
         
            +
              #     str_before = "{{}}"
         
     | 
| 
      
 190 
     | 
    
         
            +
              #     str_after = ""
         
     | 
| 
      
 191 
     | 
    
         
            +
              #     expect(process_template(str_before)).to eq str_after
         
     | 
| 
      
 192 
     | 
    
         
            +
              #     str_before = "{{lang|en|Japan}}"
         
     | 
| 
      
 193 
     | 
    
         
            +
              #     str_after  = "Japan"
         
     | 
| 
      
 194 
     | 
    
         
            +
              #     expect(process_template(str_before)).to eq str_after
         
     | 
| 
      
 195 
     | 
    
         
            +
              #     str_before = "{{a|b=c|d=f}}"
         
     | 
| 
      
 196 
     | 
    
         
            +
              #     str_after  = "a"
         
     | 
| 
      
 197 
     | 
    
         
            +
              #     expect(process_template(str_before)).to eq str_after
         
     | 
| 
      
 198 
     | 
    
         
            +
              #     str_before = "{{a|b|{{c|d|e}}}}"
         
     | 
| 
      
 199 
     | 
    
         
            +
              #     str_after  = "e"
         
     | 
| 
      
 200 
     | 
    
         
            +
              #     expect(process_template(str_before)).to eq str_after
         
     | 
| 
      
 201 
     | 
    
         
            +
              #   end
         
     | 
| 
      
 202 
     | 
    
         
            +
              # end
         
     | 
| 
       202 
203 
     | 
    
         | 
| 
       203 
     | 
    
         
            -
            #   describe "expand_template" do
         
     | 
| 
       204 
     | 
    
         
            -
            #     it "gets data corresponding to a given template using mediawiki api" do
         
     | 
| 
       205 
     | 
    
         
            -
            #       uri = "http://en.wiktionary.org/w/api.php"
         
     | 
| 
       206 
     | 
    
         
            -
            #       template = "{{en-verb}}"
         
     | 
| 
       207 
     | 
    
         
            -
            #       word = "kick"
         
     | 
| 
       208 
     | 
    
         
            -
            #       expanded = expand_template(uri, template, word)
         
     | 
| 
       209 
     | 
    
         
            -
            #       html =<<EOD
         
     | 
| 
       210 
     | 
    
         
            -
            # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
         
     | 
| 
       211 
     | 
    
         
            -
            # EOD
         
     | 
| 
       212 
     | 
    
         
            -
            #       html.strip!
         
     | 
| 
       213 
     | 
    
         
            -
            #       expanded.should == html
         
     | 
| 
       214 
     | 
    
         
            -
            #     end
         
     | 
| 
       215 
     | 
    
         
            -
            #   end
         
     | 
| 
      
 204 
     | 
    
         
            +
              #   describe "expand_template" do
         
     | 
| 
      
 205 
     | 
    
         
            +
              #     it "gets data corresponding to a given template using mediawiki api" do
         
     | 
| 
      
 206 
     | 
    
         
            +
              #       uri = "http://en.wiktionary.org/w/api.php"
         
     | 
| 
      
 207 
     | 
    
         
            +
              #       template = "{{en-verb}}"
         
     | 
| 
      
 208 
     | 
    
         
            +
              #       word = "kick"
         
     | 
| 
      
 209 
     | 
    
         
            +
              #       expanded = expand_template(uri, template, word)
         
     | 
| 
      
 210 
     | 
    
         
            +
              #       html =<<EOD
         
     | 
| 
      
 211 
     | 
    
         
            +
              # <span class=\"infl-inline\"><b class=\"Latn \" lang=\"en\">kick</b> (''third-person singular simple present'' <span class=\"form-of third-person-singular-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicks#English|kicks]]</span>'''</span>, ''present participle'' <span class=\"form-of present-participle-form-of\">'''<span class=\"Latn \" lang=\"en\">[[kicking#English|kicking]]</span>'''</span>, ''simple past and past participle'' <span class=\"form-of simple-past-and-participle-form-of\"> '''<span class=\"Latn \" lang=\"en\">[[kicked#English|kicked]]</span>'''</span>)</span>[[Category:English verbs|kick]]
         
     | 
| 
      
 212 
     | 
    
         
            +
              # EOD
         
     | 
| 
      
 213 
     | 
    
         
            +
              #       html.strip!
         
     | 
| 
      
 214 
     | 
    
         
            +
              #       expanded.should == html
         
     | 
| 
      
 215 
     | 
    
         
            +
              #     end
         
     | 
| 
      
 216 
     | 
    
         
            +
              #   end
         
     | 
| 
       216 
217 
     | 
    
         
             
            end
         
     | 
    
        data/wp2txt.gemspec
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: wp2txt
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.7. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.7.5
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - Yoichiro Hasebe
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2014-11- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2014-11-30 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: nokogiri
         
     | 
| 
         @@ -24,6 +24,20 @@ dependencies: 
     | 
|
| 
       24 
24 
     | 
    
         
             
                - - ">="
         
     | 
| 
       25 
25 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       26 
26 
     | 
    
         
             
                    version: '0'
         
     | 
| 
      
 27 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency
         
     | 
| 
      
 28 
     | 
    
         
            +
              name: htmlentities
         
     | 
| 
      
 29 
     | 
    
         
            +
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
      
 30 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 31 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 32 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 33 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
      
 34 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
      
 35 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 36 
     | 
    
         
            +
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
      
 37 
     | 
    
         
            +
                requirements:
         
     | 
| 
      
 38 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 39 
     | 
    
         
            +
                  - !ruby/object:Gem::Version
         
     | 
| 
      
 40 
     | 
    
         
            +
                    version: '0'
         
     | 
| 
       27 
41 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       28 
42 
     | 
    
         
             
              name: trollop
         
     | 
| 
       29 
43 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
         @@ -56,6 +70,7 @@ files: 
     | 
|
| 
       56 
70 
     | 
    
         
             
            - bin/benchmark.rb
         
     | 
| 
       57 
71 
     | 
    
         
             
            - bin/wp2txt
         
     | 
| 
       58 
72 
     | 
    
         
             
            - data/testdata.bz2
         
     | 
| 
      
 73 
     | 
    
         
            +
            - error_log.txt
         
     | 
| 
       59 
74 
     | 
    
         
             
            - lib/wp2txt.rb
         
     | 
| 
       60 
75 
     | 
    
         
             
            - lib/wp2txt/article.rb
         
     | 
| 
       61 
76 
     | 
    
         
             
            - lib/wp2txt/mw_api.rb
         
     |