charles 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. data/.gitignore +17 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE +22 -0
  4. data/README.md +10 -0
  5. data/Rakefile +13 -0
  6. data/bin/charles +23 -0
  7. data/charles.gemspec +25 -0
  8. data/lib/charles/document.rb +177 -0
  9. data/lib/charles/images.rb +77 -0
  10. data/lib/charles/internal_attributes.rb +40 -0
  11. data/lib/charles/misc.rb +84 -0
  12. data/lib/charles/version.rb +3 -0
  13. data/lib/charles.rb +66 -0
  14. data/optimise.rb +72 -0
  15. data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
  16. data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
  17. data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
  18. data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
  19. data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
  20. data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
  21. data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
  22. data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
  23. data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
  24. data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
  25. data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
  26. data/test/articles/20120525_1736_nytimes.com.html +856 -0
  27. data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
  28. data/test/articles/20120525_1743_nytimes.com.html +98 -0
  29. data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
  30. data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
  31. data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
  32. data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
  33. data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
  34. data/test/articles/20120528_0931_latimes.com.html +6371 -0
  35. data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
  36. data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
  37. data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
  38. data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
  39. data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
  40. data/test/articles/20120528_0947_reuters.com.html +1563 -0
  41. data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
  42. data/test/articles/20120528_1106_reuters.com.html +551 -0
  43. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
  44. data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
  45. data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
  46. data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
  47. data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
  48. data/test/articles/20120528_1119_forbes.com.html +1406 -0
  49. data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
  50. data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
  51. data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
  52. data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
  53. data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
  54. data/test/articles/20120528_1142_thestar.com.my.html +943 -0
  55. data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
  56. data/test/articles/20120528_1146_suntimes.com.html +5166 -0
  57. data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
  58. data/test/articles/20120528_1148_asiaone.com.html +1070 -0
  59. data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
  60. data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
  61. data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
  62. data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
  63. data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
  64. data/test/articles/20120529_1127_smh.com.au.html +2034 -0
  65. data/test/articles.yml +221 -0
  66. data/test/test_charles.rb +70 -0
  67. metadata +279 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in charles.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Jason Ling
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,10 @@
1
+ Charles
2
+ =======
3
+
4
+ Charles the Content Extractor in Ruby
5
+
6
+ # Similar Projects
7
+
8
+ - https://github.com/iterationlabs/ruby-readability (Ruby)
9
+ - https://github.com/peterc/pismo (Ruby)
10
+ - https://github.com/jiminoc/goose (Scala)
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ #http://guides.rubygems.org/make-your-own-gem/
5
+ require 'rake/testtask'
6
+
7
+ Rake::TestTask.new do |t|
8
+ t.libs << 'test'
9
+ end
10
+
11
+ desc "Run tests"
12
+ task :default => :test
13
+
data/bin/charles ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'lib/charles'
4
+ require 'yaml'
5
+
6
+ Charles.options[:tmp_path] = File.dirname(__FILE__) + "/../test/tmp"
7
+
8
+ url = ARGV.shift
9
+
10
+ unless url =~ /^http/
11
+ url = File.read(url)
12
+ end
13
+
14
+ document = Charles.get(url)
15
+ puts({
16
+ :content => document.content,
17
+ :title => document.title,
18
+ :filtered_images => document.filtered_images.collect{|image| image[:url]}
19
+ }.to_yaml)
20
+
21
+
22
+
23
+
data/charles.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/charles/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.authors = ["Jason Ling Xiaowei"]
6
+ gem.email = ["jason@jeyel.com"]
7
+ gem.description = 'Charles the Content Extractor'
8
+ gem.summary = 'Charles the Content Extractor'
9
+ gem.homepage = "https://github.com/jlxw/charles"
10
+
11
+ gem.files = `git ls-files`.split($\)
12
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
14
+ gem.name = "charles"
15
+ gem.require_paths = ["lib"]
16
+ gem.version = Charles::VERSION
17
+
18
+ gem.add_dependency "ferret"
19
+ gem.add_dependency "nokogiri"
20
+ gem.add_dependency "htmlentities"
21
+ gem.add_dependency "mechanize"
22
+ gem.add_dependency "activesupport"
23
+ gem.add_dependency "rack"
24
+ gem.add_dependency "imagesize"
25
+ end
@@ -0,0 +1,177 @@
1
+ require 'charles/images'
2
+ require 'charles/internal_attributes'
3
+
4
+ module Charles
5
+ class Document
6
+ include Charles::InternalAttributes
7
+ include Charles::Images
8
+
9
+ def initialize(input, options={})
10
+ @document = Nokogiri::HTML.parse(input)
11
+ @document.search("script, style").remove
12
+ @nodes = @document.search('body *').select{|_n|
13
+ _n.clean_inner_tokens_text.size > 30 #arbitrary, minimum inner text limit of 30 chars
14
+ }
15
+ @options = options
16
+ end
17
+
18
+ def logger; Charles.logger; end
19
+
20
+ def content(seeds={})
21
+ content_node = content_node(seeds)
22
+ return unless content_node
23
+ refine_content_node(content_node).clean_inner_text
24
+ end
25
+
26
+ def content_node(seeds={})
27
+ content_nodes = calculate_content_nodes(seeds)
28
+ return unless content_nodes.first
29
+ content_nodes.first[:node]
30
+ end
31
+
32
+ def calculate_content_nodes(seeds={}) #scores every candidate node in @nodes; returns an Array of {:node, :score, :scores} hashes sorted by descending :score
33
+ default_seeds = {:title_match=>0.145422959269808,
34
+ :title_match_buffer=>0.0174920023610796,
35
+ :length=>1100.27450832379,
36
+ :distance_from_top=>0.308408501217311,
37
+ :internal_nodes=>25.680381972181,
38
+ :internal_nodes_buffer=>20.2006169153009}
39
+ seeds = default_seeds.merge(seeds) #caller-supplied seeds override the defaults key by key
40
+
41
+ o = []
42
+ _rank = 0
43
+
44
+ @nodes.each_index{|_i|
45
+ _n = @nodes[_i]
46
+ _rank += 1 #1-based position of the node within @nodes
47
+
48
+ scores={
49
+ :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
50
+ :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
51
+ :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
52
+ :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1+ + seeds[:title_match_buffer])**seeds[:title_match].to_f #ferret index score, search score with page title; NOTE(review): operator precedence makes this evaluate as `(idx || (0.0 + buffer)) / 1 + buffer`, which is probably not the intended `((idx || 0.0) + buffer) / (1 + buffer)` - the default seeds were presumably tuned against the current form, so confirm before changing
53
+ #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
54
+ }
55
+ o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores} #overall score is the product of the individual scores
56
+ }
57
+
58
+ o.sort!{|a,b| b[:score] <=> a[:score]} #best score first
59
+
60
+ #o[0,1].each{|o2| pp [o2[:score], o2[:scores]]}
61
+ #o[0,1].each{|o2| pp [refine_content_node(o2[:node]).clean_inner_text, o2[:score], o2[:scores]]}
62
+
63
+ return o
64
+ end
65
+
66
+ def refine_content_node(node)
67
+ node = node.dup
68
+
69
+ #strip 'clutter'
70
+ #i.children.each{|_n| pp _n.inner_text; pp _n.clean_inner_text.size}
71
+ _min_size = 30
72
+ node.children.each{|_n|
73
+ if(_n.clean_inner_tokens_text.size < _min_size)
74
+ _n.remove
75
+ else; break; end
76
+ }
77
+ node.children.reverse.each{|_n|
78
+ if(_n.clean_inner_tokens_text.size < _min_size)
79
+ _n.remove
80
+ else; break; end
81
+ }
82
+ node.search('*').each{|_n| _n.after(' ')}
83
+
84
+ return node
85
+ end
86
+
87
+
88
+ def content_node_ferret_index
89
+ @content_node_ferret_index ||= caluclate_content_node_ferret_index
90
+ end
91
+ def caluclate_content_node_ferret_index
92
+ index = Ferret::Index::Index.new()
93
+ index.field_infos.add_field(:id, :store => :yes)
94
+ index.field_infos.add_field(:content, :store => :no, :boost => 1)
95
+
96
+
97
+ @nodes.each_index{|_i|
98
+ i=@nodes[_i]
99
+ index << {
100
+ :id => _i,
101
+ :content => i.clean_inner_text,
102
+ }
103
+ }
104
+
105
+
106
+ q=self.title.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'') #remove special charcaters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
107
+ s=index.search(q, :limit => @nodes.size)
108
+
109
+ o=[]
110
+ s.hits.each {|hit|
111
+ _i = index[hit.doc][:id].to_i
112
+ _n = @nodes[_i]
113
+ _search_score = hit.score
114
+ _search_normalised_score = hit.score/s.max_score
115
+ #logger.info [_n.clean_inner_text, _search_score, _search_normalised_score].pretty_inspect
116
+ o[_i] = _search_normalised_score
117
+ }
118
+ o
119
+ end
120
+
121
+ def mechanize_agent
122
+ @options[:mechanize_agent] ||= Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
123
+ end
124
+
125
+ end
126
+
127
+
128
+ end
129
+
130
+
131
+
132
+
133
+
134
+ Nokogiri::XML::Node.class_eval {
135
+ def clean_inner_text
136
+ @clean_inner_text ||= Charles::Misc.normalize_string(inner_text)
137
+ end
138
+ def clean_inner_tokens_text
139
+ @clean_inner_tokens_text ||= (
140
+ Charles::Misc.string_to_clean_tokens_string(clean_inner_text)
141
+ )
142
+ end
143
+ def internal_nodes_size
144
+ @internal_nodes_size ||= search('*').size
145
+ end
146
+ }
147
+
148
+
149
+ #https://github.com/cheald/pismo/blob/master/lib/pismo.rb
150
+ class Nokogiri::HTML::Document
151
+ def get_the(search)
152
+ self.search(search).first rescue nil
153
+ end
154
+
155
+ def match(queries = [])
156
+ [].tap do |results|
157
+ [*queries].each do |query|
158
+ result = begin
159
+ if query.is_a?(String)
160
+ if el = self.search(query).first
161
+ if el.name.downcase == "meta"
162
+ el['content']
163
+ else
164
+ el.inner_text
165
+ end
166
+ end
167
+ elsif query.is_a?(Array)
168
+ query.last.call( self.search(query.first).first )
169
+ end
170
+ rescue
171
+ nil
172
+ end
173
+ results << Charles::Misc.normalize_string(result) if result
174
+ end
175
+ end.compact
176
+ end
177
+ end
@@ -0,0 +1,77 @@
1
+ module Charles
2
+ module Images
3
+ def image
4
+ images && images.first
5
+ end
6
+ def images
7
+ @images ||= calculate_images
8
+ end
9
+ def calculate_images
10
+ _node = self.content_node
11
+ return unless _node
12
+ #logger.info _node.pretty_inspect
13
+
14
+ (_node.ancestors.size/2).times do
15
+ o=self.calculate_image_from_node(_node)
16
+ #logger.info o.pretty_inspect
17
+ return o if o
18
+ _node = _node.parent
19
+ end
20
+
21
+ return []
22
+ end
23
+ def calculate_image_from_node(_node)
24
+ _imgs = _node.search('img')
25
+
26
+ i=URI.parse(@options[:url])
27
+ if !_imgs.empty? && _imgs.size < 50 #sanity check if more than 50 images...
28
+ o=[]
29
+ _imgs.each do |_img|
30
+ next unless _img.attr('src')
31
+ begin
32
+ _u = (i + _img.attr('src')).to_s
33
+ rescue StandardError => e
34
+ logger.info "Error #{e}: #{i} + #{_img.attr('src')}"
35
+ next
36
+ end
37
+ o << _u
38
+ end
39
+ return o
40
+ end
41
+
42
+ return nil
43
+ end
44
+
45
+ def filtered_images
46
+ _max_proportion = 2.5
47
+ _min_area = 88*88
48
+ _filtered_images = []
49
+ _images = self.images.dup
50
+ _images.each{|url|
51
+ data = get_image(url)
52
+ next unless data
53
+ size = ImageSize.new(data).get_size
54
+ if(size[0] * size[1] > _min_area &&
55
+ size[0].to_f/size[1] < _max_proportion &&
56
+ size[1].to_f/size[0] < _max_proportion)
57
+ _filtered_images << {:url => url, :data => data, :width => size[0], :height => size[1]}
58
+ end
59
+ }
60
+ return _filtered_images
61
+ end
62
+
63
+ def get_image(url) #fetches the image body for url through the file cache; returns the body, or nil when it is too large or the download fails
64
+ _cache_key = "get_image(#{url})"
65
+ begin
66
+ Charles.file_cache.fetch(_cache_key) {
67
+ body = mechanize_agent.get(url, [], URI.parse(@options[:url])).body #the page url is passed as Mechanize's referer argument
68
+ body.size < 900000 ? body : nil #reject bodies whose size is 900000 or more
69
+ }
70
+ rescue StandardError, Timeout::Error
71
+ Charles.file_cache.write(_cache_key, nil, :expires_in => 1.hour); nil #negative-cache the failure for an hour, then return nil explicitly rather than write's result so the caller's `next unless data` check works
72
+ end
73
+ end
74
+ end
75
+ end
76
+
77
+
@@ -0,0 +1,40 @@
1
+ module Charles
2
+ module InternalAttributes
3
+ def title
4
+ @title||=(
5
+ title = @document.search('title').first
6
+ title ? title.clean_inner_text : nil
7
+ )
8
+ end
9
+ def clean_title #returns the page title with words common to the sample titles stripped from both ends; falls back to the raw title when that is not possible
10
+ return title if !@options[:sample_titles] || @options[:sample_titles].size < 5 #fewer than 5 sample titles (or none): return the title unchanged
11
+ _title_words = {} #NOTE(review): never read in this method
12
+
13
+ _tokens = Charles::Misc.string_to_tokens_raw(self.title, type = :no_stop_words) #`type = ...` is a local assignment whose value is passed positionally, not a keyword argument
14
+ while(_tokens.first && words_to_filter_from_sample_titles.include?(_tokens.first.text)); _tokens.shift; end; #remove words from the beginning of the tokens
15
+ while(_tokens.last && words_to_filter_from_sample_titles.include?(_tokens.last.text)); _tokens.pop; end; #remove words from the end of the tokens
16
+ return title if _tokens.empty? #everything stripped? fall back to the full, unfiltered title
17
+
18
+ _start = _tokens.first.start; #presumably the token's offset into the title string - TODO confirm against ferret's Token docs
19
+ _end = _tokens.last.end;
20
+ _title = self.title.slice(_start, _end - _start)
21
+ _title = self.title.match(/[^\s\302\240]*#{Regexp.escape(_title)}[^\s\302\240]*/)[0].strip #include symbols or punctuation surrounding the title
22
+ end
23
+
24
+ protected
25
+
26
+ def words_to_filter_from_sample_titles
27
+ @words_to_filter_from_sample_titles = calculate_words_to_filter_from_sample_titles
28
+ end
29
+ def calculate_words_to_filter_from_sample_titles
30
+ _title_words = {}
31
+ @options[:sample_titles].each{|sample_title|
32
+ Charles::Misc.string_to_tokens(sample_title, type = :no_stop_words).uniq.each{|token|
33
+ _title_words[token]||=0; _title_words[token]+=1
34
+ }
35
+ }
36
+ _threshold = (0.9 * @options[:sample_titles].size).ceil
37
+ _words_to_filter = _title_words.select{|k,v| v >= _threshold}.collect{|k,v| k} #select words used in more than 90% of the titles
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,84 @@
1
+ module Charles
2
+ module Misc
3
+ def self.compare_strings(a,b)
4
+ [compare_strings_single_side(a,b),compare_strings_single_side(b,a)].mean
5
+ end
6
+ def self.compare_strings_single_side(a,b) #indexes a as a single ferret document, searches it with b as the query and returns the search's max_score
7
+ index = Ferret::Index::Index.new()
8
+ index.field_infos.add_field(:content, :store => :no, :boost => 1)
9
+ index << {:content => a}
10
+ search = index.search(b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'')) #remove special characters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
11
+ search.max_score
12
+ end
13
+
14
+
15
+
16
+
17
+ def self.analyzer(type = :all_stop_words)
18
+ @analyzer||={}
19
+ @analyzer[type]||=self.send("analyzer_#{type}")
20
+ end
21
+ def self.analyzer_all_stop_words
22
+ #http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
23
+ stop_words = Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS |
24
+ Ferret::Analysis::FULL_FRENCH_STOP_WORDS |
25
+ Ferret::Analysis::FULL_SPANISH_STOP_WORDS |
26
+ Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS |
27
+ Ferret::Analysis::FULL_ITALIAN_STOP_WORDS |
28
+ Ferret::Analysis::FULL_GERMAN_STOP_WORDS |
29
+ Ferret::Analysis::FULL_DUTCH_STOP_WORDS |
30
+ Ferret::Analysis::FULL_SWEDISH_STOP_WORDS |
31
+ Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS |
32
+ Ferret::Analysis::FULL_DANISH_STOP_WORDS |
33
+ Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS |
34
+ Ferret::Analysis::FULL_FINNISH_STOP_WORDS
35
+ Ferret::Analysis::StandardAnalyzer.new(stop_words,true)#(Ferret::Analysis::FULL_ENGLISH_STOP_WORDS) #no stop words
36
+ end
37
+ def self.analyzer_no_stop_words
38
+ Ferret::Analysis::StandardAnalyzer.new([],true)#no stop words
39
+ end
40
+
41
+ def self.string_to_tokens_raw(string, type = :all_stop_words)
42
+ token_stream = self.analyzer(type).token_stream('',string)
43
+ o=[]; while(j=token_stream.next); o << j; end;
44
+ return o
45
+ end
46
+ def self.string_to_tokens(string, type = :all_stop_words)
47
+ self.string_to_tokens_raw(string, type).collect{|token| token.text}
48
+ end
49
+ def self.string_to_clean_tokens(string, type = :all_stop_words)
50
+ tokens = string_to_tokens(string, type)
51
+ tokens.delete_if{|token| token.match(/\d/)}
52
+ tokens
53
+ end
54
+ def self.string_to_clean_tokens_string(string, type = :all_stop_words)
55
+ string_to_clean_tokens(string, type).join(' ')
56
+ end
57
+
58
+
59
+
60
+
61
+
62
+
63
+ def self.normalize_string(string)
64
+ @htmlentities||=HTMLEntities.new
65
+ @htmlentities.decode(normalize_unicode_characters(string.gsub(/[\s\302\240]+/,' ').strip))
66
+ end
67
+ UNICODE_CONVERSIONS = {
68
+ "8230" => '...',
69
+ "8194" => ' ',
70
+ "8195" => ' ',
71
+ "8201" => ' ',
72
+ "8211" => '-',
73
+ "8216" => '\'',
74
+ "8217" => '\'',
75
+ "8220" => '"',
76
+ "8221" => '"'
77
+ }
78
+ TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map {|k, v| [[k.to_i].pack('U*'), v] }
79
+ def self.normalize_unicode_characters(string)
80
+ TRANSLATED_CONVERSIONS.each {|k,v| string.gsub! k, v }
81
+ string
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,3 @@
1
+ module Charles
2
+ VERSION = "0.0.1"
3
+ end
data/lib/charles.rb ADDED
@@ -0,0 +1,66 @@
1
+ #require "charles/version"
2
+ require 'pp'
3
+
4
+ require 'rubygems'
5
+ require 'bundler/setup'
6
+ require 'nokogiri'
7
+ require 'htmlentities'
8
+ require 'mechanize'
9
+ require 'active_support/cache'
10
+ require 'active_support/cache/file_store'
11
+ require 'image_size'
12
+
13
+ require 'ferret'
14
+ Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
15
+
16
+ require "charles/document"
17
+ require "charles/misc"
18
+
19
+ module Charles
20
+ # Your code goes here...
21
+ def self.logger=(logger)
22
+ @logger = logger
23
+ end
24
+ def self.logger
25
+ @logger ||= Logger.new(STDERR)
26
+ end
27
+
28
+ def self.get(url)
29
+ agent = Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
30
+ body = file_cache.fetch("Charles.get(#{url})"){
31
+ agent.get(url).body
32
+ }
33
+ return Document.new(body, :url => url, :mechanize_agent => agent)
34
+ end
35
+
36
+ def self.options
37
+ @options ||= {}
38
+ end
39
+
40
+ def self.file_cache
41
+ @file_cache ||= ActiveSupport::Cache::FileStore.new(Charles.options[:tmp_path], :namespace => 'charles')
42
+ end
43
+ end
44
+
45
+ module Enumerable #reopens the core Enumerable module to add simple statistics helpers used by Charles::Misc and optimise.rb
46
+
47
+ def sum #NOTE(review): replaces any Enumerable#sum already defined (newer Rubies and ActiveSupport provide one) and ignores a block or initial value - confirm nothing loaded relies on those
48
+ return self.inject(0){|accum, i| accum + i }
49
+ end
50
+
51
+ def mean #arithmetic mean as a Float; relies on #length, which is not part of Enumerable itself (Array provides it); NaN for an empty Array
52
+ return self.sum / self.length.to_f
53
+ end
54
+
55
+ def sample_variance #sum of squared deviations from the mean divided by (length - 1); NaN for a single-element Array
56
+ m = self.mean
57
+ sum = self.inject(0){|accum, i| accum + (i - m) ** 2 }
58
+ return sum / (self.length - 1).to_f
59
+ end
60
+
61
+ def standard_deviation #square root of sample_variance
62
+ return Math.sqrt(self.sample_variance)
63
+ end
64
+
65
+ end
66
+
data/optimise.rb ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'lib/charles'
4
+ require 'yaml'
5
+
6
+ TEST_ARTICLES = YAML.load_file("test/articles.yml")
7
+
8
+ class CharlesOptimiser
9
+ @@high_score = 0
10
+
11
+ def initialize
12
+ @articles = YAML.load_file("test/articles.yml")
13
+ @articles.each{|article|
14
+ next if article[:file].empty?
15
+ article[:html] = File.read("test/articles/#{article[:file]}.html")
16
+ article[:document] = Charles::Document.new(article[:html])
17
+ article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
18
+ }
19
+ end
20
+ def optimise
21
+ 50.times do
22
+ seeds = {
23
+ :length => random(800,3000),
24
+ :distance_from_top => random(0.1,2),
25
+ :internal_nodes => random(5,50),
26
+ :internal_nodes_buffer => random(5,150),
27
+ :title_match => random(0,1),
28
+ :title_match_buffer => random(0,0.6)
29
+ }
30
+ _scores = articles_scores(seeds)
31
+ _scores.delete_if{|score| score > 1}
32
+ _score = _scores.mean
33
+ _std_dev = _scores.standard_deviation
34
+ if _score >= @@high_score
35
+ @@high_score = _score
36
+ pp [_score, _std_dev, seeds, _scores.select{|i| i<0.1}.size]
37
+ end
38
+ end
39
+ end
40
+ def articles_scores(seeds={})
41
+ _scores = []
42
+ @articles.each{|article|
43
+ next if article[:file].empty?
44
+ result = article[:document].content(seeds)
45
+ _score = compare_articles(result, article[:expected][:content])
46
+ _scores << _score
47
+ }
48
+ _scores
49
+ end
50
+ def compare_articles(a,b)
51
+ [compare_articles_single_side(a,b),compare_articles_single_side(b,a)].mean
52
+ end
53
+ def compare_articles_single_side(a,b)
54
+ index = Ferret::Index::Index.new()
55
+ index.field_infos.add_field(:content, :store => :no, :boost => 1)
56
+ index << {:content => a}
57
+ search = index.search(b)
58
+ search.max_score
59
+ end
60
+
61
+ def random(min,max)
62
+ rand * (max - min) + min
63
+ end
64
+ end
65
+
66
+ while true
67
+ thread = Thread.new {
68
+ CharlesOptimiser.new.optimise
69
+ }
70
+ thread.join
71
+ puts "***"
72
+ end
@@ -0,0 +1,5 @@
1
+ The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
2
+
3
+ 'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
4
+
5
+ With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.