RubyGems - charles - Versions diffs - 0.0.1 - Mend

charles 0.0.1

Files changed (67) hide show

data/.gitignore +17 -0
data/Gemfile +4 -0
data/LICENSE +22 -0
data/README.md +10 -0
data/Rakefile +13 -0
data/bin/charles +23 -0
data/charles.gemspec +25 -0
data/lib/charles/document.rb +177 -0
data/lib/charles/images.rb +77 -0
data/lib/charles/internal_attributes.rb +40 -0
data/lib/charles/misc.rb +84 -0
data/lib/charles/version.rb +3 -0
data/lib/charles.rb +66 -0
data/optimise.rb +72 -0
data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
data/test/articles/20120525_1736_nytimes.com.html +856 -0
data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
data/test/articles/20120525_1743_nytimes.com.html +98 -0
data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
data/test/articles/20120528_0931_latimes.com.html +6371 -0
data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
data/test/articles/20120528_0947_reuters.com.html +1563 -0
data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
data/test/articles/20120528_1106_reuters.com.html +551 -0
data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
data/test/articles/20120528_1119_forbes.com.html +1406 -0
data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
data/test/articles/20120528_1142_thestar.com.my.html +943 -0
data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
data/test/articles/20120528_1146_suntimes.com.html +5166 -0
data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
data/test/articles/20120528_1148_asiaone.com.html +1070 -0
data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
data/test/articles/20120529_1127_smh.com.au.html +2034 -0
data/test/articles.yml +221 -0
data/test/test_charles.rb +70 -0
metadata +279 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Dependencies are declared in charles.gemspec; the `gemspec` directive below pulls them in from there.
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Jason Ling
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,10 @@
+Charles
+=======
+Charles the Content Extractor in Ruby
+# Similar Projects
+- https://github.com/iterationlabs/ruby-readability (Ruby)
+- https://github.com/peterc/pismo (Ruby)
+- https://github.com/jiminoc/goose (Scala)

data/Rakefile ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
+#http://guides.rubygems.org/make-your-own-gem/
+require 'rake/testtask'
+Rake::TestTask.new do |t|
+  t.libs << 'test'
+end
+desc "Run tests"
+task :default => :test

data/bin/charles ADDED Viewed

@@ -0,0 +1,23 @@
+#!/usr/bin/env ruby
+require 'lib/charles'
+require 'yaml'
+Charles.options[:tmp_path] = File.dirname(__FILE__) + "/../test/tmp"
+url = ARGV.shift
+unless url =~ /^http/
+  url = File.read(url)
+end
+document = Charles.get(url)
+puts({
+  :content => document.content,
+  :title => document.title,
+  :filtered_images => document.filtered_images.collect{|image| image[:url]}
+}.to_yaml)

data/charles.gemspec ADDED Viewed

@@ -0,0 +1,25 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/charles/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.authors       = ["Jason Ling Xiaowei"]
+  gem.email         = ["jason@jeyel.com"]
+  gem.description   = 'Charles the Content Extractor'
+  gem.summary       = 'Charles the Content Extractor'
+  gem.homepage      = "https://github.com/jlxw/charles"
+  gem.files         = `git ls-files`.split($\)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.name          = "charles"
+  gem.require_paths = ["lib"]
+  gem.version       = Charles::VERSION
+  gem.add_dependency "ferret"
+  gem.add_dependency "nokogiri"
+  gem.add_dependency "htmlentities"
+  gem.add_dependency "mechanize"
+  gem.add_dependency "activesupport"
+  gem.add_dependency "rack"
+  gem.add_dependency "imagesize"
+end

data/lib/charles/document.rb ADDED Viewed

@@ -0,0 +1,177 @@
+require 'charles/images'
+require 'charles/internal_attributes'
+module Charles
+  class Document
+    include Charles::InternalAttributes
+    include Charles::Images
+    def initialize(input, options={})
+      @document = Nokogiri::HTML.parse(input)
+      @document.search("script, style").remove
+      @nodes = @document.search('body *').select{|_n|
+        _n.clean_inner_tokens_text.size > 30 #arbitrary, minimum inner text limit of 30 chars
+      }
+      @options = options
+    end
+    def logger; Charles.logger; end
+    def content(seeds={})
+      content_node = content_node(seeds)
+      return unless content_node
+      refine_content_node(content_node).clean_inner_text
+    end
+    def content_node(seeds={})
+      content_nodes = calculate_content_nodes(seeds)
+      return unless content_nodes.first
+      content_nodes.first[:node]
+    end
+    def calculate_content_nodes(seeds={})
+      default_seeds = {:title_match=>0.145422959269808,
+  :title_match_buffer=>0.0174920023610796,
+  :length=>1100.27450832379,
+  :distance_from_top=>0.308408501217311,
+  :internal_nodes=>25.680381972181,
+  :internal_nodes_buffer=>20.2006169153009}
+      seeds = default_seeds.merge(seeds)
+      o = []
+      _rank = 0
+      @nodes.each_index{|_i|
+        _n = @nodes[_i]
+        _rank += 1
+        scores={
+          :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
+          :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
+          :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
+          :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1+ + seeds[:title_match_buffer])**seeds[:title_match].to_f #ferret index score, search score with page title
+          #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
+        }
+        o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
+      }
+      o.sort!{|a,b| b[:score] <=> a[:score]}
+      #o[0,1].each{|o2| pp [o2[:score], o2[:scores]]}
+      #o[0,1].each{|o2| pp [refine_content_node(o2[:node]).clean_inner_text, o2[:score], o2[:scores]]}
+      return o
+    end
+    def refine_content_node(node)
+      node = node.dup
+      #strip 'clutter'
+      #i.children.each{|_n| pp _n.inner_text; pp _n.clean_inner_text.size}
+      _min_size = 30
+      node.children.each{|_n|
+        if(_n.clean_inner_tokens_text.size < _min_size)
+          _n.remove
+        else; break; end
+      }
+      node.children.reverse.each{|_n|
+        if(_n.clean_inner_tokens_text.size < _min_size)
+          _n.remove
+        else; break; end
+      }
+      node.search('*').each{|_n| _n.after(' ')}
+      return node
+    end
+    def content_node_ferret_index
+      @content_node_ferret_index ||= caluclate_content_node_ferret_index
+    end
+    def caluclate_content_node_ferret_index
+      index = Ferret::Index::Index.new()
+      index.field_infos.add_field(:id, :store => :yes)
+      index.field_infos.add_field(:content, :store => :no, :boost => 1)
+      @nodes.each_index{|_i|
+        i=@nodes[_i]
+        index << {
+          :id => _i,
+          :content => i.clean_inner_text,
+        }
+      }
+      q=self.title.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'') #remove special charcaters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
+      s=index.search(q, :limit => @nodes.size)
+      o=[]
+      s.hits.each {|hit|
+        _i = index[hit.doc][:id].to_i
+        _n = @nodes[_i]
+        _search_score = hit.score
+        _search_normalised_score = hit.score/s.max_score
+        #logger.info [_n.clean_inner_text, _search_score, _search_normalised_score].pretty_inspect
+        o[_i] = _search_normalised_score
+      }
+      o
+    end
+    def mechanize_agent
+      @options[:mechanize_agent] ||= Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
+    end
+  end
+end
+Nokogiri::XML::Node.class_eval {
+  def clean_inner_text
+    @clean_inner_text ||= Charles::Misc.normalize_string(inner_text)
+  end
+  def clean_inner_tokens_text
+    @clean_inner_tokens_text ||= (
+        Charles::Misc.string_to_clean_tokens_string(clean_inner_text)
+      )
+  end
+  def internal_nodes_size
+    @internal_nodes_size ||= search('*').size
+  end
+}
+#https://github.com/cheald/pismo/blob/master/lib/pismo.rb
+class Nokogiri::HTML::Document
+  def get_the(search)
+    self.search(search).first rescue nil
+  end
+  def match(queries = [])
+    [].tap do |results|
+      [*queries].each do |query|
+        result = begin
+          if query.is_a?(String)
+            if el = self.search(query).first
+              if el.name.downcase == "meta"
+                el['content']
+              else
+                el.inner_text
+              end
+            end
+          elsif query.is_a?(Array)
+            query.last.call( self.search(query.first).first )
+          end
+        rescue
+          nil
+        end
+        results << Charles::Misc.normalize_string(result) if result
+      end
+    end.compact
+  end
+end

data/lib/charles/images.rb ADDED Viewed

@@ -0,0 +1,77 @@
+module Charles
+  # Image discovery for a document. Relies on the including class for
+  # content_node, mechanize_agent, logger and an @options hash holding :url.
+  module Images
+    # First discovered image URL, or nil.
+    def image
+      images && images.first
+    end
+    # Memoised result of calculate_images (nil when there is no content node).
+    def images
+      @images ||= calculate_images
+    end
+    # Looks for <img> tags in the content node, then in successive parents,
+    # climbing at most half of the node's ancestor depth. Returns the first
+    # list of URLs found, [] when none is found, nil without a content node.
+    def calculate_images
+      _node = self.content_node
+      return unless _node
+      (_node.ancestors.size/2).times do
+        o=self.calculate_image_from_node(_node)
+        return o if o
+        _node = _node.parent
+      end
+      return []
+    end
+    # Absolute URLs (resolved against @options[:url]) of the <img> tags under
+    # _node. Returns nil when there are no images or 50 or more (sanity
+    # limit). Images without a src, or whose src cannot be resolved, are
+    # skipped; resolution failures are logged.
+    def calculate_image_from_node(_node)
+      _imgs = _node.search('img')
+      i=URI.parse(@options[:url])
+      return nil if _imgs.empty? || _imgs.size >= 50
+      o=[]
+      _imgs.each do |_img|
+        next unless _img.attr('src')
+        begin
+          o << (i + _img.attr('src')).to_s
+        rescue StandardError => e
+          logger.info "Error #{e}: #{i} + #{_img.attr('src')}"
+        end
+      end
+      o
+    end
+    # Downloads every discovered image and keeps those with an area above
+    # 88x88 pixels whose sides differ by a factor below 2.5. Returns an Array
+    # of {:url, :data, :width, :height} hashes; [] when there are no images.
+    def filtered_images
+      _max_proportion = 2.5
+      _min_area = 88*88
+      _filtered_images = []
+      # images is nil when the document has no content node.
+      (self.images || []).each{|url|
+        data = get_image(url)
+        next unless data
+        width, height = ImageSize.new(data).get_size
+        # Skip data whose dimensions could not be determined
+        # (NOTE(review): assumes get_size yields nils in that case - confirm
+        # against the imagesize gem).
+        next unless width && height
+        if(width * height > _min_area &&
+          width.to_f/height < _max_proportion &&
+          height.to_f/width < _max_proportion)
+          _filtered_images << {:url => url, :data => data, :width => width, :height => height}
+        end
+      }
+      return _filtered_images
+    end
+    # Body of the image at url, fetched through the file cache with the
+    # document URL as referer. Returns nil for bodies of 900000 bytes or more
+    # and on any fetch error; an error is cached as nil for one hour.
+    def get_image(url)
+      _cache_key = "get_image(#{url})"
+      begin
+        Charles.file_cache.fetch(_cache_key) {
+          body = mechanize_agent.get(url, [], URI.parse(@options[:url])).body
+          body.size < 900000 ? body : nil
+        }
+      rescue StandardError, Timeout::Error
+        Charles.file_cache.write(_cache_key, nil, :expires_in => 1.hour)
+        # Return nil explicitly: the result of write is not image data, and
+        # callers treat any truthy return value as the image body.
+        nil
+      end
+    end
+  end
+end

data/lib/charles/internal_attributes.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Charles
+  module InternalAttributes
+    def title
+      @title||=(
+        title = @document.search('title').first
+        title ? title.clean_inner_text : nil
+      )
+    end
+    def clean_title
+      return title if !@options[:sample_titles] || @options[:sample_titles].size < 5
+      _title_words = {}
+      _tokens = Charles::Misc.string_to_tokens_raw(self.title, type = :no_stop_words)
+      while(_tokens.first && words_to_filter_from_sample_titles.include?(_tokens.first.text)); _tokens.shift; end; #remove words from the beginning of the tokens
+      while(_tokens.last && words_to_filter_from_sample_titles.include?(_tokens.last.text)); _tokens.pop; end; #remove words from the end of the tokens
+      return title if _tokens.empty? #everything stripped? return nil, use other titles
+      _start = _tokens.first.start;
+      _end = _tokens.last.end;
+      _title = self.title.slice(_start, _end - _start)
+      _title = self.title.match(/[^\s\302\240]*#{Regexp.escape(_title)}[^\s\302\240]*/)[0].strip #include symbols or punctuation surrounding the title
+    end
+    protected
+    def words_to_filter_from_sample_titles
+      @words_to_filter_from_sample_titles = calculate_words_to_filter_from_sample_titles
+    end
+    def calculate_words_to_filter_from_sample_titles
+      _title_words = {}
+      @options[:sample_titles].each{|sample_title|
+        Charles::Misc.string_to_tokens(sample_title, type = :no_stop_words).uniq.each{|token|
+          _title_words[token]||=0; _title_words[token]+=1
+        }
+      }
+      _threshold = (0.9 * @options[:sample_titles].size).ceil
+      _words_to_filter = _title_words.select{|k,v| v >= _threshold}.collect{|k,v| k} #select words used in more than 90% of the titles
+    end
+  end
+end

data/lib/charles/misc.rb ADDED Viewed

@@ -0,0 +1,84 @@
+module Charles
+  module Misc
+    def self.compare_strings(a,b)
+      [compare_strings_single_side(a,b),compare_strings_single_side(b,a)].mean
+    end
+    def self.compare_strings_single_side(a,b)
+      index = Ferret::Index::Index.new()
+      index.field_infos.add_field(:content, :store => :no, :boost => 1)
+      index << {:content => a}
+      search = index.search(b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'')) #remove special charcaters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
+      search.max_score
+    end
+    def self.analyzer(type = :all_stop_words)
+      @analyzer||={}
+      @analyzer[type]||=self.send("analyzer_#{type}")
+    end
+    def self.analyzer_all_stop_words
+      #http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
+      stop_words = Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS |
+                    Ferret::Analysis::FULL_FRENCH_STOP_WORDS |
+                    Ferret::Analysis::FULL_SPANISH_STOP_WORDS |
+                    Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS |
+                    Ferret::Analysis::FULL_ITALIAN_STOP_WORDS |
+                    Ferret::Analysis::FULL_GERMAN_STOP_WORDS |
+                    Ferret::Analysis::FULL_DUTCH_STOP_WORDS |
+                    Ferret::Analysis::FULL_SWEDISH_STOP_WORDS |
+                    Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS |
+                    Ferret::Analysis::FULL_DANISH_STOP_WORDS |
+                    Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS |
+                    Ferret::Analysis::FULL_FINNISH_STOP_WORDS
+      Ferret::Analysis::StandardAnalyzer.new(stop_words,true)#(Ferret::Analysis::FULL_ENGLISH_STOP_WORDS) #no stop words
+    end
+    def self.analyzer_no_stop_words
+      Ferret::Analysis::StandardAnalyzer.new([],true)#no stop words
+    end
+    def self.string_to_tokens_raw(string, type = :all_stop_words)
+      token_stream = self.analyzer(type).token_stream('',string)
+      o=[]; while(j=token_stream.next); o << j; end;
+      return o
+    end
+    def self.string_to_tokens(string, type = :all_stop_words)
+      self.string_to_tokens_raw(string, type).collect{|token| token.text}
+    end
+    def self.string_to_clean_tokens(string, type = :all_stop_words)
+      tokens = string_to_tokens(string, type)
+      tokens.delete_if{|token| token.match(/\d/)}
+      tokens
+    end
+    def self.string_to_clean_tokens_string(string, type = :all_stop_words)
+      string_to_clean_tokens(string, type).join(' ')
+    end
+    def self.normalize_string(string)
+      @htmlentities||=HTMLEntities.new
+      @htmlentities.decode(normalize_unicode_characters(string.gsub(/[\s\302\240]+/,' ').strip))
+    end
+    UNICODE_CONVERSIONS = {
+      "8230" => '...',
+      "8194" => ' ',
+      "8195" => ' ',
+      "8201" => ' ',
+      "8211" => '-',
+      "8216" => '\'',
+      "8217" => '\'',
+      "8220" => '"',
+      "8221" => '"'
+    }
+    TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map {|k, v| [[k.to_i].pack('U*'), v] }
+    def self.normalize_unicode_characters(string)
+      TRANSLATED_CONVERSIONS.each {|k,v| string.gsub! k, v }
+      string
+    end
+  end
+end

data/lib/charles/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Charles
+  # Gem version string; charles.gemspec reads it as the gem's version.
+  VERSION = "0.0.1"
+end

data/lib/charles.rb ADDED Viewed

@@ -0,0 +1,66 @@
+#require "charles/version"
+require 'pp'
+require 'rubygems'
+require 'bundler/setup'
+require 'nokogiri'
+require 'htmlentities'
+require 'mechanize'
+require 'active_support/cache'
+require 'active_support/cache/file_store'
+require 'image_size'
+require 'ferret'
+Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
+require "charles/document"
+require "charles/misc"
+module Charles
+  # Your code goes here...
+  def self.logger=(logger)
+    @logger = logger
+  end
+  def self.logger
+    @logger ||= Logger.new(STDERR)
+  end
+  def self.get(url)
+    agent = Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
+    body = file_cache.fetch("Charles.get(#{url})"){
+      agent.get(url).body
+    }
+    return Document.new(body, :url => url, :mechanize_agent => agent)
+  end
+  def self.options
+    @options ||= {}
+  end
+  def self.file_cache
+    @file_cache ||= ActiveSupport::Cache::FileStore.new(Charles.options[:tmp_path], :namespace => 'charles')
+  end
+end
+module Enumerable
+  def sum
+    return self.inject(0){|accum, i| accum + i }
+  end
+  def mean
+    return self.sum / self.length.to_f
+  end
+  def sample_variance
+    m = self.mean
+    sum = self.inject(0){|accum, i| accum + (i - m) ** 2 }
+    return sum / (self.length - 1).to_f
+  end
+  def standard_deviation
+    return Math.sqrt(self.sample_variance)
+  end
+end

data/optimise.rb ADDED Viewed

@@ -0,0 +1,72 @@
+#!/usr/bin/env ruby
+require 'lib/charles'
+require 'yaml'
+TEST_ARTICLES = YAML.load_file("test/articles.yml")
+class CharlesOptimiser
+  @@high_score = 0
+  def initialize
+    @articles = YAML.load_file("test/articles.yml")
+    @articles.each{|article|
+      next if article[:file].empty?
+      article[:html] = File.read("test/articles/#{article[:file]}.html")
+      article[:document] = Charles::Document.new(article[:html])
+      article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
+    }
+  end
+  def optimise
+    50.times do
+      seeds = {
+        :length => random(800,3000),
+        :distance_from_top => random(0.1,2),
+        :internal_nodes => random(5,50),
+        :internal_nodes_buffer => random(5,150),
+        :title_match => random(0,1),
+        :title_match_buffer => random(0,0.6)
+      }
+      _scores = articles_scores(seeds)
+      _scores.delete_if{|score| score > 1}
+      _score = _scores.mean
+      _std_dev = _scores.standard_deviation
+      if _score >= @@high_score
+        @@high_score = _score
+        pp [_score, _std_dev, seeds, _scores.select{|i| i<0.1}.size]
+      end
+    end
+  end
+  def articles_scores(seeds={})
+    _scores = []
+    @articles.each{|article|
+      next if article[:file].empty?
+      result = article[:document].content(seeds)
+      _score = compare_articles(result, article[:expected][:content])
+      _scores << _score
+    }
+    _scores
+  end
+  def compare_articles(a,b)
+    [compare_articles_single_side(a,b),compare_articles_single_side(b,a)].mean
+  end
+  def compare_articles_single_side(a,b)
+    index = Ferret::Index::Index.new()
+    index.field_infos.add_field(:content, :store => :no, :boost => 1)
+    index << {:content => a}
+    search = index.search(b)
+    search.max_score
+  end
+  def random(min,max)
+    rand * (max - min) + min
+  end
+end
+while true
+thread = Thread.new {
+  CharlesOptimiser.new.optimise
+}
+thread.join
+puts "***"
+end

data/test/articles/20120525_1525_straitstimes.com.content.txt ADDED Viewed

@@ -0,0 +1,5 @@
+The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
+'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
+With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.