charles 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +10 -0
- data/Rakefile +13 -0
- data/bin/charles +23 -0
- data/charles.gemspec +25 -0
- data/lib/charles/document.rb +177 -0
- data/lib/charles/images.rb +77 -0
- data/lib/charles/internal_attributes.rb +40 -0
- data/lib/charles/misc.rb +84 -0
- data/lib/charles/version.rb +3 -0
- data/lib/charles.rb +66 -0
- data/optimise.rb +72 -0
- data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
- data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
- data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
- data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
- data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
- data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
- data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
- data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
- data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
- data/test/articles/20120525_1736_nytimes.com.html +856 -0
- data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
- data/test/articles/20120525_1743_nytimes.com.html +98 -0
- data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
- data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
- data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
- data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
- data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
- data/test/articles/20120528_0931_latimes.com.html +6371 -0
- data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
- data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
- data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
- data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
- data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
- data/test/articles/20120528_0947_reuters.com.html +1563 -0
- data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
- data/test/articles/20120528_1106_reuters.com.html +551 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
- data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
- data/test/articles/20120528_1119_forbes.com.html +1406 -0
- data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
- data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
- data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
- data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
- data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
- data/test/articles/20120528_1142_thestar.com.my.html +943 -0
- data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
- data/test/articles/20120528_1146_suntimes.com.html +5166 -0
- data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
- data/test/articles/20120528_1148_asiaone.com.html +1070 -0
- data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
- data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
- data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
- data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
- data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
- data/test/articles/20120529_1127_smh.com.au.html +2034 -0
- data/test/articles.yml +221 -0
- data/test/test_charles.rb +70 -0
- metadata +279 -0
    
        data/.gitignore
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/LICENSE
    ADDED
    
    | @@ -0,0 +1,22 @@ | |
| 1 | 
            +
            Copyright (c) 2012 Jason Ling
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            MIT License
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 6 | 
            +
            a copy of this software and associated documentation files (the
         | 
| 7 | 
            +
            "Software"), to deal in the Software without restriction, including
         | 
| 8 | 
            +
            without limitation the rights to use, copy, modify, merge, publish,
         | 
| 9 | 
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         | 
| 10 | 
            +
            permit persons to whom the Software is furnished to do so, subject to
         | 
| 11 | 
            +
            the following conditions:
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            The above copyright notice and this permission notice shall be
         | 
| 14 | 
            +
            included in all copies or substantial portions of the Software.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         | 
| 17 | 
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         | 
| 18 | 
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         | 
| 19 | 
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         | 
| 20 | 
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         | 
| 21 | 
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         | 
| 22 | 
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    
        data/README.md
    ADDED
    
    
    
        data/Rakefile
    ADDED
    
    
    
        data/bin/charles
    ADDED
    
    | @@ -0,0 +1,23 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
            # Command-line front end: extracts the content, title and filtered image URLs
            # for one page and prints them as YAML.
            #
            # Usage: charles URL_OR_FILE

            # Resolve the library relative to this script so it loads regardless of the
            # current working directory (the working directory is not on the load path
            # in Ruby >= 1.9.2).
            $LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
            require 'charles'
            require 'yaml'

            Charles.options[:tmp_path] = File.dirname(__FILE__) + "/../test/tmp"

            url = ARGV.shift
            abort "Usage: #{File.basename($0)} URL_OR_FILE" if url.nil?

            # NOTE(review): a non-URL argument is read from disk and its *contents* are
            # handed to Charles.get -- presumably a file holding a URL; confirm against
            # Charles.get, which is not visible here.
            unless url =~ /\Ahttp/
              url = File.read(url)
            end

            document = Charles.get(url)
            puts({
              :content => document.content,
              :title => document.title,
              :filtered_images => document.filtered_images.collect{|image| image[:url]}
            }.to_yaml)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
             | 
| 23 | 
            +
             | 
    
        data/charles.gemspec
    ADDED
    
    | @@ -0,0 +1,25 @@ | |
| 1 | 
            +
            # -*- encoding: utf-8 -*-
            require File.expand_path('../lib/charles/version', __FILE__)

            # Gem specification for charles. The file list comes from git, so the gem
            # must be built from inside a git checkout.
            Gem::Specification.new do |gem|
              # Identity
              gem.name          = "charles"
              gem.version       = Charles::VERSION
              gem.authors       = ["Jason Ling Xiaowei"]
              gem.email         = ["jason@jeyel.com"]
              gem.homepage      = "https://github.com/jlxw/charles"
              gem.summary       = 'Charles the Content Extractor'
              gem.description   = 'Charles the Content Extractor'

              # Packaged files; executables and test files are derived from the file list.
              gem.files         = `git ls-files`.split($\)
              gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
              gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
              gem.require_paths = ["lib"]

              # Runtime dependencies, declared in the original order.
              %w[ferret nokogiri htmlentities mechanize activesupport rack imagesize].each do |dependency|
                gem.add_dependency dependency
              end
            end
         | 
| @@ -0,0 +1,177 @@ | |
| 1 | 
            +
            require 'charles/images'
         | 
| 2 | 
            +
            require 'charles/internal_attributes'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module Charles
              # Wraps a Nokogiri-parsed HTML page and scores its nodes in order to pick
              # the one most likely to hold the main article text.
              class Document
                include Charles::InternalAttributes
                include Charles::Images

                # input   - HTML passed to Nokogiri::HTML.parse.
                # options - Hash kept in @options. This class reads :mechanize_agent; the
                #           included modules read :url and :sample_titles.
                def initialize(input, options={})
                  @document = Nokogiri::HTML.parse(input)
                  @document.search("script, style").remove
                  @nodes = @document.search('body *').select{|_n|
                    _n.clean_inner_tokens_text.size > 30 #arbitrary, minimum inner text limit of 30 chars
                  }
                  @options = options
                end

                def logger; Charles.logger; end

                # Returns the cleaned inner text of the best-scoring node after trimming
                # short leading/trailing children, or nil when no node qualifies.
                def content(seeds={})
                  content_node = content_node(seeds)
                  return unless content_node
                  refine_content_node(content_node).clean_inner_text
                end

                # Returns the best-scoring node, or nil when there are no candidates.
                def content_node(seeds={})
                  content_nodes = calculate_content_nodes(seeds)
                  return unless content_nodes.first
                  content_nodes.first[:node]
                end

                # Scores every candidate node and returns an Array of
                # {:node, :score, :scores} hashes sorted by descending :score.
                #
                # seeds - Hash overriding any of the default weighting constants below.
                def calculate_content_nodes(seeds={})
                  default_seeds = {:title_match=>0.145422959269808,
                                   :title_match_buffer=>0.0174920023610796,
                                   :length=>1100.27450832379,
                                   :distance_from_top=>0.308408501217311,
                                   :internal_nodes=>25.680381972181,
                                   :internal_nodes_buffer=>20.2006169153009}
                  seeds = default_seeds.merge(seeds)
                  title_buffer = seeds[:title_match_buffer]

                  o = []
                  @nodes.each_with_index{|_n, _i|
                    _rank = _i + 1
                    # Nodes the title search did not hit have no entry in the index.
                    title_score = content_node_ferret_index[_i] || 0.0

                    scores={
                      :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
                      :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
                      :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
                      # ferret index score, search score with page title. Explicit
                      # parentheses: the buffer is added to the score and the sum divided
                      # by (1 + buffer). The previous expression
                      # `(x||0.0 + b) / 1+ + b` parsed as `(x || (0.0 + b)) / 1 + b`.
                      # NOTE(review): the default seeds may have been tuned against the
                      # old expression -- consider re-tuning them.
                      :title_match => ((title_score + title_buffer) / (1 + title_buffer))**seeds[:title_match].to_f
                      #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
                    }
                    o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
                  }

                  o.sort!{|a,b| b[:score] <=> a[:score]}
                  o
                end

                # Returns a copy of node with short ("clutter") children removed from the
                # start and from the end, and a space inserted after every element so
                # adjacent words do not run together in the extracted text.
                def refine_content_node(node)
                  node = node.dup
                  _min_size = 30

                  # leading clutter: stop at the first child with enough text
                  node.children.each{|_n|
                    break unless _n.clean_inner_tokens_text.size < _min_size
                    _n.remove
                  }
                  # trailing clutter: same rule, walking backwards
                  node.children.reverse.each{|_n|
                    break unless _n.clean_inner_tokens_text.size < _min_size
                    _n.remove
                  }
                  node.search('*').each{|_n| _n.after(' ')}

                  node
                end

                # Memoised Array, indexed like @nodes, of normalised title-search scores
                # (entries are nil for nodes the search did not return).
                def content_node_ferret_index
                  @content_node_ferret_index ||= calculate_content_node_ferret_index
                end

                # Indexes every candidate node's text in an in-memory Ferret index,
                # searches it with the page title and returns each hit's score divided by
                # the best score, stored at the node's position in @nodes.
                def calculate_content_node_ferret_index
                  #remove special characters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
                  q = title.to_s.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'')
                  # No <title> (title is nil) or nothing to search: no node gets a score.
                  return [] if @nodes.empty? || q.strip.empty?

                  index = Ferret::Index::Index.new()
                  index.field_infos.add_field(:id, :store => :yes)
                  index.field_infos.add_field(:content, :store => :no, :boost => 1)

                  @nodes.each_with_index{|_n, _i|
                    index << {
                      :id => _i,
                      :content => _n.clean_inner_text,
                    }
                  }

                  s = index.search(q, :limit => @nodes.size)

                  o = []
                  s.hits.each {|hit|
                    _i = index[hit.doc][:id].to_i
                    o[_i] = hit.score/s.max_score
                  }
                  o
                end
                # Original (misspelled) method name, kept for existing callers.
                alias_method :caluclate_content_node_ferret_index, :calculate_content_node_ferret_index

                # Mechanize agent taken from the options hash, created (and stored back
                # into the options) on first use.
                def mechanize_agent
                  @options[:mechanize_agent] ||= Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
                end

              end


            end
         | 
| 129 | 
            +
             | 
| 130 | 
            +
             | 
| 131 | 
            +
             | 
| 132 | 
            +
             | 
| 133 | 
            +
             | 
| 134 | 
            +
            # Per-node cached helpers used by the content scoring code.
            Nokogiri::XML::Node.class_eval do
              # Inner text passed through Charles::Misc.normalize_string, computed once.
              def clean_inner_text
                @clean_inner_text = Charles::Misc.normalize_string(inner_text) unless @clean_inner_text
                @clean_inner_text
              end

              # clean_inner_text passed through Charles::Misc.string_to_clean_tokens_string,
              # computed once.
              def clean_inner_tokens_text
                unless @clean_inner_tokens_text
                  @clean_inner_tokens_text = Charles::Misc.string_to_clean_tokens_string(clean_inner_text)
                end
                @clean_inner_tokens_text
              end

              # Number of descendant elements matched by search('*'), computed once.
              def internal_nodes_size
                @internal_nodes_size = search('*').size unless @internal_nodes_size
                @internal_nodes_size
              end
            end
         | 
| 147 | 
            +
             | 
| 148 | 
            +
             | 
| 149 | 
            +
            #https://github.com/cheald/pismo/blob/master/lib/pismo.rb
         | 
| 150 | 
            +
            class Nokogiri::HTML::Document
              # First node matching the selector, or nil when nothing matches or the
              # search raises.
              def get_the(search)
                begin
                  self.search(search).first
                rescue StandardError
                  nil
                end
              end

              # Runs each query against the document and returns the normalised results.
              #
              # A String query is a selector: the first match contributes its 'content'
              # attribute when it is a <meta> element, otherwise its inner text.
              # An Array query is [selector, callable]: the callable receives the first
              # match and its return value is used. A query that raises contributes
              # nothing.
              def match(queries = [])
                collected = []
                [*queries].each do |query|
                  value =
                    begin
                      case query
                      when String
                        element = self.search(query).first
                        if element
                          element.name.downcase == "meta" ? element['content'] : element.inner_text
                        end
                      when Array
                        query.last.call(self.search(query.first).first)
                      end
                    rescue
                      nil
                    end
                  collected << Charles::Misc.normalize_string(value) if value
                end
                collected.compact
              end
            end
         | 
| @@ -0,0 +1,77 @@ | |
| 1 | 
            +
            module Charles
              # Image discovery for a document. The including class must provide
              # content_node, mechanize_agent, logger and an @options hash with :url.
              module Images
                # First candidate image URL, or nil when there is none.
                def image
                  images && images.first
                end

                # Memoised list of candidate image URLs (nil when the document has no
                # content node).
                def images
                  @images ||= calculate_images
                end

                # Looks for <img> tags in the content node, then in successive parents,
                # climbing at most half of the node's ancestor chain. Returns the first
                # list found, [] when none was found, or nil without a content node.
                def calculate_images
                  _node = self.content_node
                  return unless _node

                  (_node.ancestors.size/2).times do
                    o = calculate_image_from_node(_node)
                    return o if o
                    _node = _node.parent
                  end

                  []
                end

                # Absolute URLs of the <img src> attributes under _node, resolved against
                # @options[:url]. Returns nil when the node holds no images or 50 or more.
                def calculate_image_from_node(_node)
                  _imgs = _node.search('img')
                  base_uri = URI.parse(@options[:url])
                  return nil if _imgs.empty? || _imgs.size >= 50 #sanity check if more than 50 images...

                  urls = []
                  _imgs.each do |_img|
                    src = _img.attr('src')
                    next unless src
                    begin
                      urls << (base_uri + src).to_s
                    rescue StandardError => e
                      # unresolvable src: log it and move on to the next image
                      logger.info "Error #{e}: #{base_uri} + #{src}"
                    end
                  end
                  urls
                end

                # Downloads every candidate image and keeps those larger than 88x88 in
                # area whose aspect ratio is below 2.5:1 in either direction. Returns an
                # Array of {:url, :data, :width, :height} hashes ([] without candidates).
                def filtered_images
                  _max_proportion = 2.5
                  _min_area = 88*88
                  _filtered_images = []
                  # images is nil when the document has no content node
                  (images || []).each{|url|
                    data = get_image(url)
                    next unless data
                    width, height = ImageSize.new(data).get_size
                    # skip data whose dimensions could not be determined
                    next unless width && height && width > 0 && height > 0
                    if(width * height > _min_area &&
                      width.to_f/height < _max_proportion &&
                      height.to_f/width < _max_proportion)
                      _filtered_images << {:url => url, :data => data, :width => width, :height => height}
                    end
                  }
                  _filtered_images
                end

                # Fetches url through Charles.file_cache, using the document URL as the
                # referer. Returns the body, or nil when it is 900000 bytes or more or the
                # download fails; a failure is cached as nil for an hour.
                def get_image(url)
                  _cache_key = "get_image(#{url})"
                  begin
                    Charles.file_cache.fetch(_cache_key) {
                      body = mechanize_agent.get(url, [], URI.parse(@options[:url])).body
                      body.size < 900000 ? body : nil
                    }
                  rescue StandardError, Timeout::Error
                    Charles.file_cache.write(_cache_key, nil, :expires_in => 1.hour)
                    # Return nil explicitly: the value of #write must not be mistaken for
                    # image data by the caller.
                    nil
                  end
                end
              end
            end
         | 
| 76 | 
            +
             | 
| 77 | 
            +
                
         | 
| @@ -0,0 +1,40 @@ | |
| 1 | 
            +
            module Charles
              # Title helpers for a document. The including class must provide @document
              # (the parsed page) and an @options hash; :sample_titles is read from it.
              module InternalAttributes
                # Cleaned text of the first <title> element, or nil when there is none.
                def title
                  @title ||= begin
                    title_node = @document.search('title').first
                    title_node ? title_node.clean_inner_text : nil
                  end
                end

                # Title with words shared by (at least 90% of) the sample titles trimmed
                # from both ends. Falls back to the plain title when there is no title,
                # fewer than five sample titles, or everything would be trimmed.
                def clean_title
                  return title if title.nil? || !@options[:sample_titles] || @options[:sample_titles].size < 5

                  _tokens = Charles::Misc.string_to_tokens_raw(title, :no_stop_words)
                  _filter = words_to_filter_from_sample_titles
                  _tokens.shift while _tokens.first && _filter.include?(_tokens.first.text) #remove words from the beginning of the tokens
                  _tokens.pop while _tokens.last && _filter.include?(_tokens.last.text) #remove words from the end of the tokens
                  return title if _tokens.empty? #everything stripped? fall back to the full title

                  _start = _tokens.first.start
                  _end = _tokens.last.end
                  _title = title.slice(_start, _end - _start)
                  title.match(/[^\s\302\240]*#{Regexp.escape(_title)}[^\s\302\240]*/)[0].strip #include symbols or punctuation surrounding the title
                end

                protected

                # Memoised list of words to trim (computed once per document instead of
                # on every loop iteration in clean_title).
                def words_to_filter_from_sample_titles
                  @words_to_filter_from_sample_titles ||= calculate_words_to_filter_from_sample_titles
                end

                # Counts, per token, how many sample titles contain it and returns the
                # tokens present in at least ceil(90%) of the sample titles.
                def calculate_words_to_filter_from_sample_titles
                  _title_words = {}
                  @options[:sample_titles].each{|sample_title|
                    Charles::Misc.string_to_tokens(sample_title, :no_stop_words).uniq.each{|token|
                      _title_words[token] = (_title_words[token] || 0) + 1
                    }
                  }
                  _threshold = (0.9 * @options[:sample_titles].size).ceil
                  _title_words.select{|_word, count| count >= _threshold}.collect{|word, _count| word}
                end
              end
            end
         | 
    
        data/lib/charles/misc.rb
    ADDED
    
    | @@ -0,0 +1,84 @@ | |
| 1 | 
            +
            module Charles
              # Stateless text helpers: Ferret-based string similarity, tokenization,
              # and whitespace / punctuation normalization.
              module Misc
                # Symmetric similarity of two strings: the average of searching for +b+
                # in an index of +a+ and searching for +a+ in an index of +b+.
                #
                # @param a [String]
                # @param b [String]
                # @return [Float] mean of the two one-sided Ferret scores
                def self.compare_strings(a, b)
                  (compare_strings_single_side(a, b) + compare_strings_single_side(b, a)) / 2.0
                end

                # Indexes +a+ in a throwaway Ferret index and returns the best score
                # obtained when +b+ is used as the query.
                #
                # @param a [String] text to index
                # @param b [String] text to use as the query
                # @return [Float] the search result's max_score
                def self.compare_strings_single_side(a, b)
                  index = Ferret::Index::Index.new
                  begin
                    index.field_infos.add_field(:content, :store => :no, :boost => 1)
                    index << {:content => a}
                    # Remove special characters used by the ferret query parser:
                    # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html,
                    # http://www.regular-expressions.info/charclass.html
                    query = b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
                    index.search(query).max_score
                  ensure
                    # One index is created per call; release it promptly rather than
                    # waiting for garbage collection.
                    index.close
                  end
                end

                # Returns the (memoized) analyzer for the given type by dispatching to
                # the matching analyzer_<type> method.
                #
                # @param type [Symbol] :all_stop_words or :no_stop_words
                def self.analyzer(type = :all_stop_words)
                  @analyzer ||= {}
                  @analyzer[type] ||= self.send("analyzer_#{type}")
                end

                # Lower-casing analyzer that drops the stop words of every language
                # list combined below.
                # http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
                def self.analyzer_all_stop_words
                  stop_words = Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS |
                                Ferret::Analysis::FULL_FRENCH_STOP_WORDS |
                                Ferret::Analysis::FULL_SPANISH_STOP_WORDS |
                                Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS |
                                Ferret::Analysis::FULL_ITALIAN_STOP_WORDS |
                                Ferret::Analysis::FULL_GERMAN_STOP_WORDS |
                                Ferret::Analysis::FULL_DUTCH_STOP_WORDS |
                                Ferret::Analysis::FULL_SWEDISH_STOP_WORDS |
                                Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS |
                                Ferret::Analysis::FULL_DANISH_STOP_WORDS |
                                Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS |
                                Ferret::Analysis::FULL_FINNISH_STOP_WORDS
                  Ferret::Analysis::StandardAnalyzer.new(stop_words, true)
                end

                # Lower-casing analyzer with an empty stop-word list.
                def self.analyzer_no_stop_words
                  Ferret::Analysis::StandardAnalyzer.new([], true)
                end

                # Tokenizes +string+ and returns the analyzer's token objects.
                #
                # @param string [String]
                # @param type [Symbol] analyzer type, see .analyzer
                # @return [Array] tokens in stream order
                def self.string_to_tokens_raw(string, type = :all_stop_words)
                  token_stream = self.analyzer(type).token_stream('', string)
                  tokens = []
                  # The stream signals exhaustion by returning a falsy value from #next.
                  while (token = token_stream.next)
                    tokens << token
                  end
                  tokens
                end

                # Tokenizes +string+ and returns only the token texts.
                #
                # @return [Array<String>]
                def self.string_to_tokens(string, type = :all_stop_words)
                  self.string_to_tokens_raw(string, type).collect { |token| token.text }
                end

                # Like .string_to_tokens, but drops every token that contains a digit.
                #
                # @return [Array<String>]
                def self.string_to_clean_tokens(string, type = :all_stop_words)
                  tokens = string_to_tokens(string, type)
                  tokens.delete_if { |token| token.match(/\d/) }
                  tokens
                end

                # The clean tokens of +string+ joined with single spaces.
                #
                # @return [String]
                def self.string_to_clean_tokens_string(string, type = :all_stop_words)
                  string_to_clean_tokens(string, type).join(' ')
                end

                # Collapses runs of whitespace (including the \302\240 byte pair) into
                # single spaces, strips the ends, replaces typographic punctuation with
                # ASCII equivalents and finally decodes HTML entities.
                #
                # @param string [String]
                # @return [String]
                def self.normalize_string(string)
                  @htmlentities ||= HTMLEntities.new
                  @htmlentities.decode(normalize_unicode_characters(string.gsub(/[\s\302\240]+/, ' ').strip))
                end

                # Unicode code points (as decimal strings) and their ASCII replacements.
                UNICODE_CONVERSIONS = {
                  "8230" => '...',
                  "8194" => ' ',
                  "8195" => ' ',
                  "8201" => ' ',
                  "8211" => '-',
                  "8216" => '\'',
                  "8217" => '\'',
                  "8220" => '"',
                  "8221" => '"'
                }.freeze

                # [character, replacement] pairs derived from UNICODE_CONVERSIONS.
                TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map { |k, v| [[k.to_i].pack('U*'), v] }.freeze

                # Replaces the characters listed in UNICODE_CONVERSIONS with their ASCII
                # equivalents.
                #
                # A mutable argument is modified in place and returned, as before. A
                # frozen argument is left untouched and a normalized copy is returned
                # instead of raising.
                #
                # @param string [String]
                # @return [String] the normalized string
                def self.normalize_unicode_characters(string)
                  string = string.dup if string.frozen?
                  TRANSLATED_CONVERSIONS.each { |k, v| string.gsub!(k, v) }
                  string
                end
              end
            end
         | 
    
        data/lib/charles.rb
    ADDED
    
    | @@ -0,0 +1,66 @@ | |
| 1 | 
            +
            #require "charles/version"
         | 
| 2 | 
            +
            require 'pp'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'rubygems'
         | 
| 5 | 
            +
            require 'bundler/setup'
         | 
| 6 | 
            +
            require 'nokogiri'
         | 
| 7 | 
            +
            require 'htmlentities'
         | 
| 8 | 
            +
            require 'mechanize'
         | 
| 9 | 
            +
            require 'active_support/cache'
         | 
| 10 | 
            +
            require 'active_support/cache/file_store'
         | 
| 11 | 
            +
            require 'image_size'
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            require 'ferret'
         | 
| 14 | 
            +
            Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            require "charles/document"
         | 
| 17 | 
            +
            require "charles/misc"
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            # Top-level namespace: library-wide logger, options, page cache and the
            # Charles.get entry point.
            module Charles
              # Replaces the logger used by the library.
              #
              # @param logger [Object] any Logger-compatible object
              def self.logger=(logger)
                @logger = logger
              end

              # The library logger; defaults to a Logger writing to STDERR.
              #
              # @return [Object] the configured logger
              def self.logger
                return @logger if @logger
                # Loaded on first use: this file relies on Logger but never requires it.
                require 'logger'
                @logger = Logger.new(STDERR)
              end

              # Fetches +url+ and wraps the response body in a Document. The body is
              # read through the file cache, so the network is only hit on a cache
              # miss; the Mechanize agent is always handed to the Document.
              #
              # @param url [String]
              # @return [Charles::Document]
              def self.get(url)
                agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Mozilla' }
                body = file_cache.fetch("Charles.get(#{url})") do
                  agent.get(url).body
                end
                Document.new(body, :url => url, :mechanize_agent => agent)
              end

              # Mutable, library-wide options hash (for example :tmp_path, which is
              # read by .file_cache).
              #
              # @return [Hash]
              def self.options
                @options ||= {}
              end

              # Memoized file-backed cache rooted at options[:tmp_path].
              #
              # @return [ActiveSupport::Cache::FileStore]
              def self.file_cache
                @file_cache ||= ActiveSupport::Cache::FileStore.new(Charles.options[:tmp_path], :namespace => 'charles')
              end
            end
         | 
| 44 | 
            +
             | 
| 45 | 
            +
            # Small statistics helpers mixed into every Enumerable.
            module Enumerable

              # Total of all elements. Defined only when the running Ruby does not
              # already provide Enumerable#sum, so the built-in version (which also
              # accepts an initial value and a block) is never overwritten.
              unless method_defined?(:sum)
                def sum
                  inject(0) { |accum, i| accum + i }
                end
              end

              # Arithmetic mean as a Float. Uses #count so that it works for any
              # Enumerable, not only for classes that define #length.
              def mean
                sum / count.to_f
              end

              # Unbiased sample variance (divides by n - 1).
              def sample_variance
                m = mean
                squared_deviations = inject(0) { |accum, i| accum + (i - m) ** 2 }
                squared_deviations / (count - 1).to_f
              end

              # Sample standard deviation: the square root of #sample_variance.
              def standard_deviation
                Math.sqrt(sample_variance)
              end

            end
         | 
| 66 | 
            +
             | 
    
        data/optimise.rb
    ADDED
    
    | @@ -0,0 +1,72 @@ | |
| 1 | 
            +
            #!/usr/bin/env ruby
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'lib/charles'
         | 
| 4 | 
            +
            require 'yaml'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            TEST_ARTICLES = YAML.load_file("test/articles.yml")
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            # Random-search tuner for the content-extraction weights. Each round draws
            # random seed weights, scores the extraction of every test article against
            # its expected content, and prints the seeds whenever the mean score
            # matches or beats the best seen so far.
            class CharlesOptimiser
              # Best mean score seen by any optimiser instance in this process.
              @high_score = 0

              class << self
                attr_accessor :high_score
              end

              # Loads test/articles.yml (relative to the working directory) and, for
              # every entry with a non-empty :file, reads its HTML, builds a
              # Charles::Document and reads the expected content.
              def initialize
                @articles = YAML.load_file("test/articles.yml")
                usable_articles.each do |article|
                  article[:html] = File.read("test/articles/#{article[:file]}.html")
                  article[:document] = Charles::Document.new(article[:html])
                  article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
                end
              end

              # Runs 50 random trials and pretty-prints
              # [mean score, standard deviation, seeds, number of scores below 0.1]
              # for every trial whose mean score is at least the current high score.
              def optimise
                50.times do
                  seeds = {
                    :length => random(800,3000),
                    :distance_from_top => random(0.1,2),
                    :internal_nodes => random(5,50),
                    :internal_nodes_buffer => random(5,150),
                    :title_match => random(0,1),
                    :title_match_buffer => random(0,0.6)
                  }
                  scores = articles_scores(seeds)
                  # Scores above 1 are discarded before averaging.
                  scores.delete_if { |score| score > 1 }
                  score = scores.mean
                  std_dev = scores.standard_deviation
                  if score >= CharlesOptimiser.high_score
                    CharlesOptimiser.high_score = score
                    pp [score, std_dev, seeds, scores.select { |i| i < 0.1 }.size]
                  end
                end
              end

              # Extracts every usable article with the given seeds and scores the
              # result against the expected content.
              #
              # @param seeds [Hash] weights passed to Document#content
              # @return [Array<Float>] one score per usable article
              def articles_scores(seeds={})
                usable_articles.map do |article|
                  result = article[:document].content(seeds)
                  compare_articles(result, article[:expected][:content])
                end
              end

              # Symmetric similarity of two texts: the mean of both one-sided scores.
              def compare_articles(a,b)
                (compare_articles_single_side(a,b) + compare_articles_single_side(b,a)) / 2.0
              end

              # One-sided similarity. Delegates to the library's own comparison so the
              # optimiser scores texts the same way the library does; in particular,
              # Ferret query-parser special characters are stripped from the query.
              def compare_articles_single_side(a,b)
                Charles::Misc.compare_strings_single_side(a,b)
              end

              # Uniform random number in [min, max).
              def random(min,max)
                rand * (max - min) + min
              end

              private

              # Articles that reference a fixture file; entries with an empty :file
              # are skipped everywhere.
              def usable_articles
                @articles.reject { |article| article[:file].empty? }
              end
            end
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            # Run optimisation rounds forever. Each round builds a fresh optimiser
            # inside its own thread, waits for it to finish, then prints a separator.
            loop do
              Thread.new { CharlesOptimiser.new.optimise }.join
              puts "***"
            end
         | 
| @@ -0,0 +1,5 @@ | |
| 1 | 
            +
            The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.
         |