charles 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +10 -0
  5. data/Rakefile +13 -0
  6. data/bin/charles +23 -0
  7. data/charles.gemspec +25 -0
  8. data/lib/charles/document.rb +177 -0
  9. data/lib/charles/images.rb +77 -0
  10. data/lib/charles/internal_attributes.rb +40 -0
  11. data/lib/charles/misc.rb +84 -0
  12. data/lib/charles/version.rb +3 -0
  13. data/lib/charles.rb +66 -0
  14. data/optimise.rb +72 -0
  15. data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
  16. data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
  17. data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
  18. data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
  19. data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
  20. data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
  21. data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
  22. data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
  23. data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
  24. data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
  25. data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
  26. data/test/articles/20120525_1736_nytimes.com.html +856 -0
  27. data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
  28. data/test/articles/20120525_1743_nytimes.com.html +98 -0
  29. data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
  30. data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
  31. data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
  32. data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
  33. data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
  34. data/test/articles/20120528_0931_latimes.com.html +6371 -0
  35. data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
  36. data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
  37. data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
  38. data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
  39. data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
  40. data/test/articles/20120528_0947_reuters.com.html +1563 -0
  41. data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
  42. data/test/articles/20120528_1106_reuters.com.html +551 -0
  43. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
  44. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
  45. data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
  46. data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
  47. data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
  48. data/test/articles/20120528_1119_forbes.com.html +1406 -0
  49. data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
  50. data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
  51. data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
  52. data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
  53. data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
  54. data/test/articles/20120528_1142_thestar.com.my.html +943 -0
  55. data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
  56. data/test/articles/20120528_1146_suntimes.com.html +5166 -0
  57. data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
  58. data/test/articles/20120528_1148_asiaone.com.html +1070 -0
  59. data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
  60. data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
  61. data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
  62. data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
  63. data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
  64. data/test/articles/20120529_1127_smh.com.au.html +2034 -0
  65. data/test/articles.yml +221 -0
  66. data/test/test_charles.rb +70 -0
  67. metadata +279 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in charles.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jason Ling
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,10 @@
1
+ Charles
2
+ =======
3
+
4
+ Charles the Content Extractor in Ruby
5
+
6
+ # Similar Projects
7
+
8
+ - https://github.com/iterationlabs/ruby-readability (Ruby)
9
+ - https://github.com/peterc/pismo (Ruby)
10
+ - https://github.com/jiminoc/goose (Scala)
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ #http://guides.rubygems.org/make-your-own-gem/
5
+ require 'rake/testtask'
6
+
7
# Define the test task, adding the test/ directory to its load path.
Rake::TestTask.new { |test_task| test_task.libs << 'test' }

# Running plain `rake` runs the test task.
desc "Run tests"
task :default => :test
13
+
data/bin/charles ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Put the gem's lib/ directory on the load path explicitly; the current
# directory is not on $LOAD_PATH in Ruby >= 1.9.2, so 'lib/charles' is not found.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'charles'
require 'yaml'

# Cache fetched pages and images under the project's test/tmp directory.
Charles.options[:tmp_path] = File.dirname(__FILE__) + "/../test/tmp"

url = ARGV.shift
abort "Usage: charles URL_OR_FILE" unless url

# Anything that does not start with "http" is treated as a path to a file
# whose contents are handed to Charles.get.
# NOTE(review): presumably that file contains the URL to fetch -- confirm.
unless url =~ /\Ahttp/
  url = File.read(url).strip
end

# Print the extracted content, title and usable image URLs as YAML.
document = Charles.get(url)
puts({
  :content => document.content,
  :title => document.title,
  :filtered_images => document.filtered_images.map { |image| image[:url] }
}.to_yaml)
20
+
21
+
22
+
23
+
data/charles.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/charles/version', __FILE__)
3
+
4
# Gem metadata for charles. The file lists are taken from what git tracks.
Gem::Specification.new do |spec|
  spec.name          = "charles"
  spec.version       = Charles::VERSION
  spec.authors       = ["Jason Ling Xiaowei"]
  spec.email         = ["jason@jeyel.com"]
  spec.description   = 'Charles the Content Extractor'
  spec.summary       = 'Charles the Content Extractor'
  spec.homepage      = "https://github.com/jlxw/charles"
  spec.require_paths = ["lib"]

  spec.files       = `git ls-files`.split($\)
  spec.executables = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files  = spec.files.grep(%r{^(test|spec|features)/})

  # Runtime dependencies, declared in the same order as before.
  [
    "ferret",
    "nokogiri",
    "htmlentities",
    "mechanize",
    "activesupport",
    "rack",
    "imagesize"
  ].each { |dependency| spec.add_dependency dependency }
end
@@ -0,0 +1,177 @@
1
+ require 'charles/images'
2
+ require 'charles/internal_attributes'
3
+
4
module Charles
  # Wraps a parsed HTML page and picks the element most likely to contain the
  # main article text by scoring every sufficiently long element under <body>.
  class Document
    include Charles::InternalAttributes
    include Charles::Images

    # Arbitrary size (of the cleaned token text) used both to select candidate
    # nodes and to strip short leading/trailing children from the result.
    MIN_TEXT_SIZE = 30

    # Default scoring weights; callers may override any of them via `seeds`.
    # NOTE(review): presumably found with the random search in optimise.rb and
    # may deserve re-tuning after the :title_match formula fix below -- confirm.
    DEFAULT_SEEDS = {
      :title_match => 0.145422959269808,
      :title_match_buffer => 0.0174920023610796,
      :length => 1100.27450832379,
      :distance_from_top => 0.308408501217311,
      :internal_nodes => 25.680381972181,
      :internal_nodes_buffer => 20.2006169153009
    }.freeze

    # Special characters used by the ferret query parser:
    # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html,
    # http://www.regular-expressions.info/charclass.html
    FERRET_QUERY_SPECIAL_CHARACTERS = /[:()\[\]{}!+"~^\-|<>=*?\\]/

    # input   - HTML source handed to Nokogiri::HTML.parse.
    # options - Hash stored as @options; :url and :mechanize_agent are read by
    #           this class and the included modules.
    def initialize(input, options={})
      @document = Nokogiri::HTML.parse(input)
      @document.search("script, style").remove
      # Candidates: every element under <body> with enough cleaned text.
      @nodes = @document.search('body *').select { |node|
        node.clean_inner_tokens_text.size > MIN_TEXT_SIZE
      }
      @options = options
    end

    def logger; Charles.logger; end

    # Returns the cleaned text of the best content node, or nil if there is none.
    def content(seeds={})
      node = content_node(seeds)
      return unless node
      refine_content_node(node).clean_inner_text
    end

    # Returns the highest scoring candidate node, or nil if there are no candidates.
    def content_node(seeds={})
      best = calculate_content_nodes(seeds).first
      best && best[:node]
    end

    # Scores every candidate node and returns an Array of
    # {:node, :score, :scores} hashes sorted by descending :score.
    # The total score is the product of the individual scores.
    def calculate_content_nodes(seeds={})
      seeds = DEFAULT_SEEDS.merge(seeds)
      total = @nodes.size

      scored = @nodes.each_with_index.map { |node, index|
        rank = index + 1
        scores = {
          # length of inner text in this node, too little = less
          :length => 1 - seeds[:length].to_f / (node.clean_inner_tokens_text.size + seeds[:length]),
          # number of nodes in this node, too many = less
          :internal_nodes => seeds[:internal_nodes].to_f / (node.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]),
          # how far this element is from the top of the page
          :distance_from_top => (1 - (rank.to_f / total)) ** seeds[:distance_from_top].to_f,
          # ferret index score, search score with page title
          :title_match => title_match_score(content_node_ferret_index[index], seeds)
        }
        {:node => node, :score => scores.values.inject(:*), :scores => scores}
      }

      scored.sort { |a, b| b[:score] <=> a[:score] }
    end

    # Returns a copy of `node` with short leading and trailing children removed
    # and a space inserted after every descendant element, so that text from
    # adjacent elements does not run together.
    def refine_content_node(node)
      node = node.dup
      strip_short_children(node.children)
      strip_short_children(node.children.reverse)
      node.search('*').each { |descendant| descendant.after(' ') }
      node
    end

    # Memoized Array, indexed like @nodes, of normalised title-search scores
    # (entries are nil for nodes that did not match).
    def content_node_ferret_index
      @content_node_ferret_index ||= calculate_content_node_ferret_index
    end

    # Indexes the cleaned text of every candidate node in an in-memory Ferret
    # index, searches it with the page title, and returns each hit's score
    # divided by the best score, stored at the node's position in @nodes.
    def calculate_content_node_ferret_index
      # A page without a <title> has nothing to search for.
      query = title.to_s.gsub(FERRET_QUERY_SPECIAL_CHARACTERS, '')
      return [] if query.strip.empty?

      index = Ferret::Index::Index.new()
      index.field_infos.add_field(:id, :store => :yes)
      index.field_infos.add_field(:content, :store => :no, :boost => 1)
      @nodes.each_with_index { |node, position|
        index << {:id => position, :content => node.clean_inner_text}
      }

      results = index.search(query, :limit => @nodes.size)
      scores = []
      results.hits.each { |hit|
        scores[index[hit.doc][:id].to_i] = hit.score / results.max_score
      }
      scores
    end
    # Keep the original (misspelled) name working for existing callers.
    alias_method :caluclate_content_node_ferret_index, :calculate_content_node_ferret_index

    # Mechanize agent shared through @options; created on first use.
    def mechanize_agent
      @options[:mechanize_agent] ||= Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
    end

    private

    # Title-match component of a node's score: (raw + buffer) / (1 + buffer),
    # raised to the :title_match weight. The previous expression relied on
    # operator precedence and effectively computed `(raw || buffer) + buffer`.
    def title_match_score(raw_score, seeds)
      buffer = seeds[:title_match_buffer]
      (((raw_score || 0.0) + buffer) / (1 + buffer)) ** seeds[:title_match].to_f
    end

    # Removes children, in the given order, until the first one whose cleaned
    # token text reaches MIN_TEXT_SIZE.
    def strip_short_children(children)
      children.each { |child|
        break unless child.clean_inner_tokens_text.size < MIN_TEXT_SIZE
        child.remove
      }
    end
  end
end
129
+
130
+
131
+
132
+
133
+
134
# Text helpers added to every Nokogiri node. Each result is cached in an
# instance variable on the node object after the first call.
Nokogiri::XML::Node.class_eval do
  # inner_text passed through Charles::Misc.normalize_string.
  def clean_inner_text
    return @clean_inner_text if @clean_inner_text
    @clean_inner_text = Charles::Misc.normalize_string(inner_text)
  end

  # clean_inner_text passed through Charles::Misc.string_to_clean_tokens_string.
  def clean_inner_tokens_text
    return @clean_inner_tokens_text if @clean_inner_tokens_text
    @clean_inner_tokens_text = Charles::Misc.string_to_clean_tokens_string(clean_inner_text)
  end

  # Number of descendant elements matched by search('*').
  def internal_nodes_size
    return @internal_nodes_size if @internal_nodes_size
    @internal_nodes_size = search('*').size
  end
end
147
+
148
+
149
+ #https://github.com/cheald/pismo/blob/master/lib/pismo.rb
150
class Nokogiri::HTML::Document
  # First node matching `search`, or nil when nothing matches or the lookup raises.
  def get_the(search)
    begin
      self.search(search).first
    rescue
      nil
    end
  end

  # Evaluates each query and returns the normalised, non-empty results.
  #
  # A String query is a selector: the first match contributes its `content`
  # attribute when it is a <meta> element, otherwise its inner text.
  # An Array query is [selector, callable]: the callable receives the first match.
  # A query that raises contributes nothing.
  def match(queries = [])
    found = []
    [*queries].each do |query|
      value = begin
        case query
        when String
          element = self.search(query).first
          if element
            element.name.downcase == "meta" ? element['content'] : element.inner_text
          end
        when Array
          query.last.call(self.search(query.first).first)
        end
      rescue
        nil
      end
      found << Charles::Misc.normalize_string(value) if value
    end
    found.compact
  end
end
@@ -0,0 +1,77 @@
1
module Charles
  # Image discovery for a document. The including object must provide
  # content_node, mechanize_agent, logger and an @options hash with :url.
  module Images
    # First image URL found near the content node, or nil.
    def image
      images && images.first
    end

    # Memoized result of calculate_images.
    def images
      @images ||= calculate_images
    end

    # Looks for <img> elements in the content node, then in successive parents
    # (for half as many steps as the node has ancestors), and returns the URLs
    # from the first node that yields any. Returns nil when there is no content
    # node and [] when nothing was found.
    def calculate_images
      node = content_node
      return unless node

      (node.ancestors.size / 2).times do
        found = calculate_image_from_node(node)
        return found if found
        node = node.parent
      end

      []
    end

    # Returns the absolute URLs of the <img src> attributes under _node,
    # resolved against @options[:url], or nil when the node holds no images
    # or 50 or more of them (sanity check).
    def calculate_image_from_node(_node)
      imgs = _node.search('img')
      base = URI.parse(@options[:url])
      return nil if imgs.empty? || imgs.size >= 50

      urls = []
      imgs.each do |img|
        src = img.attr('src')
        next unless src
        begin
          urls << (base + src).to_s
        rescue StandardError => e
          # A src that cannot be joined with the base URL is logged and skipped.
          logger.info "Error #{e}: #{base} + #{src}"
        end
      end
      urls
    end

    # Downloads each image and keeps those larger than 88x88 in area whose
    # width/height ratio is below 2.5 in either direction.
    # Returns an Array of {:url, :data, :width, :height} hashes.
    def filtered_images
      max_proportion = 2.5
      min_area = 88 * 88
      kept = []
      # images is nil when the document has no content node.
      (images || []).each do |url|
        data = get_image(url)
        next unless data
        width, height = ImageSize.new(data).get_size
        # Skip data for which no usable dimensions were returned.
        next unless width && height && width > 0 && height > 0
        if width * height > min_area &&
           width.to_f / height < max_proportion &&
           height.to_f / width < max_proportion
          kept << {:url => url, :data => data, :width => width, :height => height}
        end
      end
      kept
    end

    # Fetches the image body through the file cache, using the document URL as
    # referer. Bodies of 900000 bytes or more are discarded. Returns nil when
    # the download fails; the failure is cached for one hour.
    def get_image(url)
      cache_key = "get_image(#{url})"
      begin
        Charles.file_cache.fetch(cache_key) {
          body = mechanize_agent.get(url, [], URI.parse(@options[:url])).body
          body.size < 900000 ? body : nil
        }
      rescue StandardError, Timeout::Error
        # 3600 seconds; avoids relying on ActiveSupport's Integer#hour extension.
        Charles.file_cache.write(cache_key, nil, :expires_in => 3600)
        # Return nil explicitly rather than the result of the cache write.
        nil
      end
    end
  end
end
76
+
77
+
@@ -0,0 +1,40 @@
1
module Charles
  # Title extraction. The including object must provide @document (the parsed
  # page) and an @options hash; :sample_titles is read from it.
  module InternalAttributes
    # Cleaned text of the first <title> element, or nil when the page has none.
    def title
      @title ||= begin
        node = @document.search('title').first
        node ? node.clean_inner_text : nil
      end
    end

    # The title with boilerplate words stripped from both ends. Boilerplate
    # words are those shared by at least 90% of @options[:sample_titles].
    # Returns the plain title when there is no title, fewer than 5 sample
    # titles, or when stripping would remove every word.
    def clean_title
      return title if title.nil? || !@options[:sample_titles] || @options[:sample_titles].size < 5

      tokens = Charles::Misc.string_to_tokens_raw(title, :no_stop_words)
      boilerplate = words_to_filter_from_sample_titles
      tokens.shift while tokens.first && boilerplate.include?(tokens.first.text) # remove words from the beginning of the tokens
      tokens.pop while tokens.last && boilerplate.include?(tokens.last.text) # remove words from the end of the tokens
      return title if tokens.empty? # everything stripped? fall back to the unmodified title

      # NOTE(review): start/end are used here as character offsets into the
      # title; confirm the tokenizer does not report byte offsets, which would
      # differ for multibyte titles.
      start_offset = tokens.first.start
      end_offset = tokens.last.end
      core = title.slice(start_offset, end_offset - start_offset)
      # include symbols or punctuation surrounding the title
      title.match(/[^\s\302\240]*#{Regexp.escape(core)}[^\s\302\240]*/)[0].strip
    end

    protected

    # Memoized list of boilerplate words (previously recomputed on every call).
    def words_to_filter_from_sample_titles
      @words_to_filter_from_sample_titles ||= calculate_words_to_filter_from_sample_titles
    end

    # Counts in how many sample titles each token occurs and returns the tokens
    # present in at least 90% of them (threshold rounded up).
    def calculate_words_to_filter_from_sample_titles
      counts = Hash.new(0)
      @options[:sample_titles].each { |sample_title|
        Charles::Misc.string_to_tokens(sample_title, :no_stop_words).uniq.each { |token|
          counts[token] += 1
        }
      }
      threshold = (0.9 * @options[:sample_titles].size).ceil
      counts.select { |_word, count| count >= threshold }.collect { |word, _count| word }
    end
  end
end
@@ -0,0 +1,84 @@
1
module Charles
  # String comparison, tokenisation and normalisation helpers.
  module Misc
    # Special characters used by the ferret query parser:
    # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html,
    # http://www.regular-expressions.info/charclass.html
    FERRET_QUERY_SPECIAL_CHARACTERS = /[:()\[\]{}!+"~^\-|<>=*?\\]/

    # Mean of the two one-directional comparison scores of a and b.
    def self.compare_strings(a,b)
      [compare_strings_single_side(a,b),compare_strings_single_side(b,a)].mean
    end

    # Indexes `a` in a throwaway Ferret index, searches it with `b` (query
    # special characters removed) and returns the search's max_score.
    def self.compare_strings_single_side(a,b)
      index = Ferret::Index::Index.new()
      index.field_infos.add_field(:content, :store => :no, :boost => 1)
      index << {:content => a}
      index.search(b.gsub(FERRET_QUERY_SPECIAL_CHARACTERS, '')).max_score
    end

    # Returns the cached analyzer for `type`, built on first use by the
    # matching analyzer_<type> method.
    def self.analyzer(type = :all_stop_words)
      @analyzer ||= {}
      @analyzer[type] ||= self.send("analyzer_#{type}")
    end

    # Analyzer whose stop-word list is the union of Ferret's stop-word lists
    # for the languages below.
    def self.analyzer_all_stop_words
      #http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
      stop_words = Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS |
        Ferret::Analysis::FULL_FRENCH_STOP_WORDS |
        Ferret::Analysis::FULL_SPANISH_STOP_WORDS |
        Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS |
        Ferret::Analysis::FULL_ITALIAN_STOP_WORDS |
        Ferret::Analysis::FULL_GERMAN_STOP_WORDS |
        Ferret::Analysis::FULL_DUTCH_STOP_WORDS |
        Ferret::Analysis::FULL_SWEDISH_STOP_WORDS |
        Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS |
        Ferret::Analysis::FULL_DANISH_STOP_WORDS |
        Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS |
        Ferret::Analysis::FULL_FINNISH_STOP_WORDS
      Ferret::Analysis::StandardAnalyzer.new(stop_words,true)
    end

    # Analyzer with an empty stop-word list.
    def self.analyzer_no_stop_words
      Ferret::Analysis::StandardAnalyzer.new([],true)#no stop words
    end

    # Array of the raw token objects the analyzer produces for `string`.
    def self.string_to_tokens_raw(string, type = :all_stop_words)
      token_stream = self.analyzer(type).token_stream('',string)
      tokens = []
      while (token = token_stream.next)
        tokens << token
      end
      tokens
    end

    # Array of token texts for `string`.
    def self.string_to_tokens(string, type = :all_stop_words)
      string_to_tokens_raw(string, type).collect { |token| token.text }
    end

    # Token texts for `string`, without any token that contains a digit.
    def self.string_to_clean_tokens(string, type = :all_stop_words)
      string_to_tokens(string, type).reject { |token| token =~ /\d/ }
    end

    # The clean tokens of `string` joined with single spaces.
    def self.string_to_clean_tokens_string(string, type = :all_stop_words)
      string_to_clean_tokens(string, type).join(' ')
    end

    # Collapses runs of whitespace (including non-breaking spaces) into single
    # spaces, trims the ends, replaces typographic characters with ASCII
    # equivalents and decodes HTML entities.
    def self.normalize_string(string)
      @htmlentities ||= HTMLEntities.new
      @htmlentities.decode(normalize_unicode_characters(string.gsub(/[\s\302\240]+/,' ').strip))
    end

    # Unicode code point (decimal, as a String) => ASCII replacement.
    UNICODE_CONVERSIONS = {
      "8230" => '...',
      "8194" => ' ',
      "8195" => ' ',
      "8201" => ' ',
      "8211" => '-',
      "8216" => '\'',
      "8217" => '\'',
      "8220" => '"',
      "8221" => '"'
    }.freeze

    # [character, replacement] pairs derived from UNICODE_CONVERSIONS.
    TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map { |k, v| [[k.to_i].pack('U*'), v] }.freeze

    # Returns a copy of `string` with every character listed in
    # UNICODE_CONVERSIONS replaced. The argument itself is left untouched, so
    # frozen strings are accepted.
    def self.normalize_unicode_characters(string)
      TRANSLATED_CONVERSIONS.inject(string) { |text, (from, to)| text.gsub(from, to) }
    end
  end
end
@@ -0,0 +1,3 @@
1
module Charles
  # Gem version string; frozen so it cannot be modified in place.
  VERSION = "0.0.1".freeze
end
data/lib/charles.rb ADDED
@@ -0,0 +1,66 @@
1
+ #require "charles/version"
2
+ require 'pp'
3
+
4
+ require 'rubygems'
5
+ require 'bundler/setup'
6
+ require 'nokogiri'
7
+ require 'htmlentities'
8
+ require 'mechanize'
9
+ require 'active_support/cache'
10
+ require 'active_support/cache/file_store'
11
+ require 'image_size'
12
+
13
+ require 'ferret'
14
+ Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
15
+
16
+ require "charles/document"
17
+ require "charles/misc"
18
+
19
module Charles
  # Replaces the logger used by the library.
  def self.logger=(logger)
    @logger = logger
  end

  # The library logger; defaults to a Logger writing to STDERR.
  def self.logger
    @logger ||= begin
      # Loaded here because nothing else in this file requires it explicitly.
      require 'logger'
      Logger.new(STDERR)
    end
  end

  # Fetches `url` (the body is served from the file cache when present) and
  # returns a Charles::Document for it. The Mechanize agent is passed on to
  # the document.
  def self.get(url)
    agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Mozilla' }
    body = file_cache.fetch("Charles.get(#{url})") { agent.get(url).body }
    Document.new(body, :url => url, :mechanize_agent => agent)
  end

  # Mutable, library-wide options hash; :tmp_path is read by file_cache.
  def self.options
    @options ||= {}
  end

  # File-backed cache rooted at options[:tmp_path]. It is created on first use,
  # so :tmp_path has to be set before the first call.
  # NOTE(review): no fallback exists when :tmp_path is unset -- confirm how
  # FileStore treats a nil path.
  def self.file_cache
    @file_cache ||= ActiveSupport::Cache::FileStore.new(Charles.options[:tmp_path], :namespace => 'charles')
  end
end
44
+
45
# Small statistics helpers added to every Enumerable.
module Enumerable
  # Only define sum where the Ruby in use does not already provide one;
  # overriding the built-in would drop its block and initial-value support.
  unless method_defined?(:sum)
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN for an empty collection).
  # Uses count rather than length, which not every Enumerable implements.
  def mean
    sum / count.to_f
  end

  # Unbiased sample variance: sum of squared deviations divided by (n - 1).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m) ** 2 }
    squared_deviations / (count - 1).to_f
  end

  # Square root of the sample variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end
end
66
+
data/optimise.rb ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
3
# Put lib/ on the load path explicitly; the current directory is not on
# $LOAD_PATH in Ruby >= 1.9.2, so 'lib/charles' is not found there.
$LOAD_PATH.unshift(File.expand_path('../lib', __FILE__))
require 'charles'
require 'yaml'

# Article fixtures; the path is relative, so run this script from the project root.
TEST_ARTICLES = YAML.load_file("test/articles.yml")
7
+
8
# Random search over the scoring seeds of Charles::Document: each round tries
# random seeds against the fixture articles and prints any seed set that
# matches or beats the best mean score seen so far.
class CharlesOptimiser
  # Best mean score so far, shared by all instances (kept on the class itself
  # instead of in a @@ class variable).
  @high_score = 0
  class << self
    attr_accessor :high_score
  end

  # Loads the fixture list and, for every entry with a file name, its HTML,
  # a parsed document and the expected content text.
  def initialize
    @articles = YAML.load_file("test/articles.yml")
    @articles.each { |article|
      next if article[:file].empty?
      article[:html] = File.read("test/articles/#{article[:file]}.html")
      article[:document] = Charles::Document.new(article[:html])
      article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
    }
  end

  # Runs 50 rounds of random seeds, printing
  # [mean score, standard deviation, seeds, number of scores below 0.1]
  # whenever the mean is at least the current high score.
  def optimise
    50.times do
      seeds = {
        :length => random(800,3000),
        :distance_from_top => random(0.1,2),
        :internal_nodes => random(5,50),
        :internal_nodes_buffer => random(5,150),
        :title_match => random(0,1),
        :title_match_buffer => random(0,0.6)
      }
      # Scores above 1 are discarded before averaging.
      scores = articles_scores(seeds).reject { |score| score > 1 }
      score = scores.mean
      std_dev = scores.standard_deviation
      if score >= self.class.high_score
        self.class.high_score = score
        pp [score, std_dev, seeds, scores.select{|i| i<0.1}.size]
      end
    end
  end

  # One similarity score per fixture that has a file name. A fixture whose
  # extraction returns nil scores 0.0 instead of being passed to the index.
  def articles_scores(seeds={})
    usable = @articles.reject { |article| article[:file].empty? }
    usable.map { |article|
      extracted = article[:document].content(seeds)
      extracted ? compare_articles(extracted, article[:expected][:content]) : 0.0
    }
  end

  # Mean of the two one-directional similarity scores. Delegates to
  # Charles::Misc so the query text is stripped of Ferret query-parser
  # special characters, like every other comparison in the library.
  def compare_articles(a,b)
    Charles::Misc.compare_strings(a, b)
  end

  # Score of searching an index holding `a` with the text of `b`.
  def compare_articles_single_side(a,b)
    Charles::Misc.compare_strings_single_side(a, b)
  end

  # Random Float in [min, max).
  def random(min,max)
    rand * (max - min) + min
  end
end
65
+
66
# Run optimisation rounds forever, each in its own thread, printing a
# separator line after every round.
loop do
  Thread.new { CharlesOptimiser.new.optimise }.join
  puts "***"
end
@@ -0,0 +1,5 @@
1
+ The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
2
+
3
+ 'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
4
+
5
+ With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.