charles 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +10 -0
- data/Rakefile +13 -0
- data/bin/charles +23 -0
- data/charles.gemspec +25 -0
- data/lib/charles/document.rb +177 -0
- data/lib/charles/images.rb +77 -0
- data/lib/charles/internal_attributes.rb +40 -0
- data/lib/charles/misc.rb +84 -0
- data/lib/charles/version.rb +3 -0
- data/lib/charles.rb +66 -0
- data/optimise.rb +72 -0
- data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
- data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
- data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
- data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
- data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
- data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
- data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
- data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
- data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
- data/test/articles/20120525_1736_nytimes.com.html +856 -0
- data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
- data/test/articles/20120525_1743_nytimes.com.html +98 -0
- data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
- data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
- data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
- data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
- data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
- data/test/articles/20120528_0931_latimes.com.html +6371 -0
- data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
- data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
- data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
- data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
- data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
- data/test/articles/20120528_0947_reuters.com.html +1563 -0
- data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
- data/test/articles/20120528_1106_reuters.com.html +551 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
- data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
- data/test/articles/20120528_1119_forbes.com.html +1406 -0
- data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
- data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
- data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
- data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
- data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
- data/test/articles/20120528_1142_thestar.com.my.html +943 -0
- data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
- data/test/articles/20120528_1146_suntimes.com.html +5166 -0
- data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
- data/test/articles/20120528_1148_asiaone.com.html +1070 -0
- data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
- data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
- data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
- data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
- data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
- data/test/articles/20120529_1127_smh.com.au.html +2034 -0
- data/test/articles.yml +221 -0
- data/test/test_charles.rb +70 -0
- metadata +279 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jason Ling
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/charles
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'lib/charles'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
Charles.options[:tmp_path] = File.dirname(__FILE__) + "/../test/tmp"
|
7
|
+
|
8
|
+
url = ARGV.shift
|
9
|
+
|
10
|
+
unless url =~ /^http/
|
11
|
+
url = File.read(url)
|
12
|
+
end
|
13
|
+
|
14
|
+
document = Charles.get(url)
|
15
|
+
puts({
|
16
|
+
:content => document.content,
|
17
|
+
:title => document.title,
|
18
|
+
:filtered_images => document.filtered_images.collect{|image| image[:url]}
|
19
|
+
}.to_yaml)
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
data/charles.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/charles/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Jason Ling Xiaowei"]
|
6
|
+
gem.email = ["jason@jeyel.com"]
|
7
|
+
gem.description = 'Charles the Content Extractor'
|
8
|
+
gem.summary = 'Charles the Content Extractor'
|
9
|
+
gem.homepage = "https://github.com/jlxw/charles"
|
10
|
+
|
11
|
+
gem.files = `git ls-files`.split($\)
|
12
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
|
+
gem.name = "charles"
|
15
|
+
gem.require_paths = ["lib"]
|
16
|
+
gem.version = Charles::VERSION
|
17
|
+
|
18
|
+
gem.add_dependency "ferret"
|
19
|
+
gem.add_dependency "nokogiri"
|
20
|
+
gem.add_dependency "htmlentities"
|
21
|
+
gem.add_dependency "mechanize"
|
22
|
+
gem.add_dependency "activesupport"
|
23
|
+
gem.add_dependency "rack"
|
24
|
+
gem.add_dependency "imagesize"
|
25
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'charles/images'
|
2
|
+
require 'charles/internal_attributes'
|
3
|
+
|
4
|
+
module Charles
|
5
|
+
class Document
|
6
|
+
include Charles::InternalAttributes
|
7
|
+
include Charles::Images
|
8
|
+
|
9
|
+
def initialize(input, options={})
|
10
|
+
@document = Nokogiri::HTML.parse(input)
|
11
|
+
@document.search("script, style").remove
|
12
|
+
@nodes = @document.search('body *').select{|_n|
|
13
|
+
_n.clean_inner_tokens_text.size > 30 #arbitrary, minimum inner text limit of 30 chars
|
14
|
+
}
|
15
|
+
@options = options
|
16
|
+
end
|
17
|
+
|
18
|
+
def logger; Charles.logger; end
|
19
|
+
|
20
|
+
def content(seeds={})
|
21
|
+
content_node = content_node(seeds)
|
22
|
+
return unless content_node
|
23
|
+
refine_content_node(content_node).clean_inner_text
|
24
|
+
end
|
25
|
+
|
26
|
+
def content_node(seeds={})
|
27
|
+
content_nodes = calculate_content_nodes(seeds)
|
28
|
+
return unless content_nodes.first
|
29
|
+
content_nodes.first[:node]
|
30
|
+
end
|
31
|
+
|
32
|
+
# Scores every candidate node in @nodes and returns the results ordered
# best-first.
#
# seeds - optional Hash overriding any of the tuning constants below
#         (:title_match, :title_match_buffer, :length, :distance_from_top,
#         :internal_nodes, :internal_nodes_buffer).
#
# Returns an Array of Hashes {:node, :score, :scores}, where :score is the
# product of the individual :scores, sorted by :score descending.
def calculate_content_nodes(seeds={})
  default_seeds = {:title_match=>0.145422959269808,
    :title_match_buffer=>0.0174920023610796,
    :length=>1100.27450832379,
    :distance_from_top=>0.308408501217311,
    :internal_nodes=>25.680381972181,
    :internal_nodes_buffer=>20.2006169153009}
  seeds = default_seeds.merge(seeds)
  title_buffer = seeds[:title_match_buffer].to_f

  o = []

  @nodes.each_with_index{|_n, _i|
    _rank = _i + 1 #1-based position of the node in document order
    title_score = content_node_ferret_index[_i] || 0.0 #nodes the title search did not hit score 0

    scores={
      :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
      :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
      :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
      #ferret index score, search score with page title.
      #The buffer is added to numerator and denominator so the base stays
      #within (0, 1]; the previous expression lost this to operator
      #precedence ("x||0.0 + b" and "/ 1+ + b").
      #NOTE(review): the default seeds were presumably tuned against the old
      #expression - consider re-running the optimiser.
      :title_match => ((title_score + title_buffer) / (1 + title_buffer))**seeds[:title_match].to_f
      #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
    }
    o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
  }

  o.sort!{|a,b| b[:score] <=> a[:score]}

  #o[0,1].each{|o2| pp [o2[:score], o2[:scores]]}
  #o[0,1].each{|o2| pp [refine_content_node(o2[:node]).clean_inner_text, o2[:score], o2[:scores]]}

  return o
end
|
65
|
+
|
66
|
+
def refine_content_node(node)
|
67
|
+
node = node.dup
|
68
|
+
|
69
|
+
#strip 'clutter'
|
70
|
+
#i.children.each{|_n| pp _n.inner_text; pp _n.clean_inner_text.size}
|
71
|
+
_min_size = 30
|
72
|
+
node.children.each{|_n|
|
73
|
+
if(_n.clean_inner_tokens_text.size < _min_size)
|
74
|
+
_n.remove
|
75
|
+
else; break; end
|
76
|
+
}
|
77
|
+
node.children.reverse.each{|_n|
|
78
|
+
if(_n.clean_inner_tokens_text.size < _min_size)
|
79
|
+
_n.remove
|
80
|
+
else; break; end
|
81
|
+
}
|
82
|
+
node.search('*').each{|_n| _n.after(' ')}
|
83
|
+
|
84
|
+
return node
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
def content_node_ferret_index
|
89
|
+
@content_node_ferret_index ||= caluclate_content_node_ferret_index
|
90
|
+
end
|
91
|
+
# Builds an in-memory Ferret index over the clean text of every candidate
# node and searches it with the page title.
#
# Returns an Array indexed like @nodes: the entry for a node hit by the
# search is its score divided by the best score of the search; nodes that
# were not hit have no entry (nil). Returns [] when the page has no title
# or there are no candidate nodes.
#
# (The misspelt method name is kept because content_node_ferret_index
# calls it by this name.)
def caluclate_content_node_ferret_index
  page_title = title
  return [] if page_title.nil? #no <title> element: nothing to search with
  return [] if @nodes.empty?   #nothing to index

  index = Ferret::Index::Index.new()
  index.field_infos.add_field(:id, :store => :yes)
  index.field_infos.add_field(:content, :store => :no, :boost => 1)

  @nodes.each_with_index{|node, _i|
    index << {
      :id => _i,
      :content => node.clean_inner_text
    }
  }

  q=page_title.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'') #remove special characters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
  s=index.search(q, :limit => @nodes.size)

  o=[]
  s.hits.each {|hit|
    _i = index[hit.doc][:id].to_i #stored :id is the node's position in @nodes
    o[_i] = hit.score/s.max_score #normalise against the best hit
  }
  o
end
|
120
|
+
|
121
|
+
def mechanize_agent
|
122
|
+
@options[:mechanize_agent] ||= Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
|
123
|
+
end
|
124
|
+
|
125
|
+
end
|
126
|
+
|
127
|
+
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
Nokogiri::XML::Node.class_eval {
|
135
|
+
def clean_inner_text
|
136
|
+
@clean_inner_text ||= Charles::Misc.normalize_string(inner_text)
|
137
|
+
end
|
138
|
+
def clean_inner_tokens_text
|
139
|
+
@clean_inner_tokens_text ||= (
|
140
|
+
Charles::Misc.string_to_clean_tokens_string(clean_inner_text)
|
141
|
+
)
|
142
|
+
end
|
143
|
+
def internal_nodes_size
|
144
|
+
@internal_nodes_size ||= search('*').size
|
145
|
+
end
|
146
|
+
}
|
147
|
+
|
148
|
+
|
149
|
+
#https://github.com/cheald/pismo/blob/master/lib/pismo.rb
|
150
|
+
class Nokogiri::HTML::Document
|
151
|
+
def get_the(search)
|
152
|
+
self.search(search).first rescue nil
|
153
|
+
end
|
154
|
+
|
155
|
+
def match(queries = [])
|
156
|
+
[].tap do |results|
|
157
|
+
[*queries].each do |query|
|
158
|
+
result = begin
|
159
|
+
if query.is_a?(String)
|
160
|
+
if el = self.search(query).first
|
161
|
+
if el.name.downcase == "meta"
|
162
|
+
el['content']
|
163
|
+
else
|
164
|
+
el.inner_text
|
165
|
+
end
|
166
|
+
end
|
167
|
+
elsif query.is_a?(Array)
|
168
|
+
query.last.call( self.search(query.first).first )
|
169
|
+
end
|
170
|
+
rescue
|
171
|
+
nil
|
172
|
+
end
|
173
|
+
results << Charles::Misc.normalize_string(result) if result
|
174
|
+
end
|
175
|
+
end.compact
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Charles
|
2
|
+
module Images
|
3
|
+
def image
|
4
|
+
images && images.first
|
5
|
+
end
|
6
|
+
def images
|
7
|
+
@images ||= calculate_images
|
8
|
+
end
|
9
|
+
def calculate_images
|
10
|
+
_node = self.content_node
|
11
|
+
return unless _node
|
12
|
+
#logger.info _node.pretty_inspect
|
13
|
+
|
14
|
+
(_node.ancestors.size/2).times do
|
15
|
+
o=self.calculate_image_from_node(_node)
|
16
|
+
#logger.info o.pretty_inspect
|
17
|
+
return o if o
|
18
|
+
_node = _node.parent
|
19
|
+
end
|
20
|
+
|
21
|
+
return []
|
22
|
+
end
|
23
|
+
def calculate_image_from_node(_node)
|
24
|
+
_imgs = _node.search('img')
|
25
|
+
|
26
|
+
i=URI.parse(@options[:url])
|
27
|
+
if !_imgs.empty? && _imgs.size < 50 #sanity check if more than 50 images...
|
28
|
+
o=[]
|
29
|
+
_imgs.each do |_img|
|
30
|
+
next unless _img.attr('src')
|
31
|
+
begin
|
32
|
+
_u = (i + _img.attr('src')).to_s
|
33
|
+
rescue StandardError => e
|
34
|
+
logger.info "Error #{e}: #{i} + #{_img.attr('src')}"
|
35
|
+
next
|
36
|
+
end
|
37
|
+
o << _u
|
38
|
+
end
|
39
|
+
return o
|
40
|
+
end
|
41
|
+
|
42
|
+
return nil
|
43
|
+
end
|
44
|
+
|
45
|
+
# Downloads every candidate image and keeps those that look like real
# article pictures: area above 88x88 pixels and neither side more than
# 2.5 times the other.
#
# Returns an Array of Hashes {:url, :data, :width, :height}; empty when
# there are no candidate images (including when #images is nil because no
# content node was found).
def filtered_images
  _max_proportion = 2.5
  _min_area = 88*88
  _filtered_images = []
  (images || []).each{|url|
    data = get_image(url)
    next unless data
    width, height = ImageSize.new(data).get_size
    #NOTE(review): ImageSize presumably reports nil dimensions for data it
    #cannot recognise - skip those instead of failing on nil arithmetic.
    next unless width && height
    if(width * height > _min_area &&
       width.to_f/height < _max_proportion &&
       height.to_f/width < _max_proportion)
      _filtered_images << {:url => url, :data => data, :width => width, :height => height}
    end
  }
  return _filtered_images
end
|
62
|
+
|
63
|
+
# Fetches the raw bytes of an image through the file cache.
#
# url - absolute URL of the image; the document URL is sent as referer.
#
# Returns the response body, or nil when the body is 900000 or more in
# size or the download failed. A failure is remembered in the cache for
# one hour so the same broken URL is not retried on every call.
def get_image(url)
  _cache_key = "get_image(#{url})"
  begin
    Charles.file_cache.fetch(_cache_key) {
      body = mechanize_agent.get(url, [], URI.parse(@options[:url])).body
      body.size < 900000 ? body : nil
    }
  rescue StandardError, Timeout::Error
    Charles.file_cache.write(_cache_key, nil, :expires_in => 60 * 60) #seconds, i.e. one hour
    nil #the value of cache.write must not leak out as if it were image data
  end
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Charles
|
2
|
+
module InternalAttributes
|
3
|
+
def title
|
4
|
+
@title||=(
|
5
|
+
title = @document.search('title').first
|
6
|
+
title ? title.clean_inner_text : nil
|
7
|
+
)
|
8
|
+
end
|
9
|
+
def clean_title
|
10
|
+
return title if !@options[:sample_titles] || @options[:sample_titles].size < 5
|
11
|
+
_title_words = {}
|
12
|
+
|
13
|
+
_tokens = Charles::Misc.string_to_tokens_raw(self.title, type = :no_stop_words)
|
14
|
+
while(_tokens.first && words_to_filter_from_sample_titles.include?(_tokens.first.text)); _tokens.shift; end; #remove words from the beginning of the tokens
|
15
|
+
while(_tokens.last && words_to_filter_from_sample_titles.include?(_tokens.last.text)); _tokens.pop; end; #remove words from the end of the tokens
|
16
|
+
return title if _tokens.empty? #everything stripped? return nil, use other titles
|
17
|
+
|
18
|
+
_start = _tokens.first.start;
|
19
|
+
_end = _tokens.last.end;
|
20
|
+
_title = self.title.slice(_start, _end - _start)
|
21
|
+
_title = self.title.match(/[^\s\302\240]*#{Regexp.escape(_title)}[^\s\302\240]*/)[0].strip #include symbols or punctuation surrounding the title
|
22
|
+
end
|
23
|
+
|
24
|
+
protected
|
25
|
+
|
26
|
+
# Words that appear in most of the sample titles (typically the site name)
# and should be stripped from the ends of this document's title.
#
# Memoised: clean_title calls this once per loop iteration, so the list
# must not be recomputed from the sample titles on every call.
def words_to_filter_from_sample_titles
  @words_to_filter_from_sample_titles ||= calculate_words_to_filter_from_sample_titles
end
|
29
|
+
def calculate_words_to_filter_from_sample_titles
|
30
|
+
_title_words = {}
|
31
|
+
@options[:sample_titles].each{|sample_title|
|
32
|
+
Charles::Misc.string_to_tokens(sample_title, type = :no_stop_words).uniq.each{|token|
|
33
|
+
_title_words[token]||=0; _title_words[token]+=1
|
34
|
+
}
|
35
|
+
}
|
36
|
+
_threshold = (0.9 * @options[:sample_titles].size).ceil
|
37
|
+
_words_to_filter = _title_words.select{|k,v| v >= _threshold}.collect{|k,v| k} #select words used in more than 90% of the titles
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/charles/misc.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
module Charles
|
2
|
+
module Misc
|
3
|
+
def self.compare_strings(a,b)
|
4
|
+
[compare_strings_single_side(a,b),compare_strings_single_side(b,a)].mean
|
5
|
+
end
|
6
|
+
def self.compare_strings_single_side(a,b)
|
7
|
+
index = Ferret::Index::Index.new()
|
8
|
+
index.field_infos.add_field(:content, :store => :no, :boost => 1)
|
9
|
+
index << {:content => a}
|
10
|
+
search = index.search(b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/,'')) #remove special charcaters used by ferret query parser: http://www.davebalmain.com/api/classes/Ferret/QueryParser.html, http://www.regular-expressions.info/charclass.html
|
11
|
+
search.max_score
|
12
|
+
end
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
def self.analyzer(type = :all_stop_words)
|
18
|
+
@analyzer||={}
|
19
|
+
@analyzer[type]||=self.send("analyzer_#{type}")
|
20
|
+
end
|
21
|
+
def self.analyzer_all_stop_words
|
22
|
+
#http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
|
23
|
+
stop_words = Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS |
|
24
|
+
Ferret::Analysis::FULL_FRENCH_STOP_WORDS |
|
25
|
+
Ferret::Analysis::FULL_SPANISH_STOP_WORDS |
|
26
|
+
Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS |
|
27
|
+
Ferret::Analysis::FULL_ITALIAN_STOP_WORDS |
|
28
|
+
Ferret::Analysis::FULL_GERMAN_STOP_WORDS |
|
29
|
+
Ferret::Analysis::FULL_DUTCH_STOP_WORDS |
|
30
|
+
Ferret::Analysis::FULL_SWEDISH_STOP_WORDS |
|
31
|
+
Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS |
|
32
|
+
Ferret::Analysis::FULL_DANISH_STOP_WORDS |
|
33
|
+
Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS |
|
34
|
+
Ferret::Analysis::FULL_FINNISH_STOP_WORDS
|
35
|
+
Ferret::Analysis::StandardAnalyzer.new(stop_words,true)#(Ferret::Analysis::FULL_ENGLISH_STOP_WORDS) #no stop words
|
36
|
+
end
|
37
|
+
def self.analyzer_no_stop_words
|
38
|
+
Ferret::Analysis::StandardAnalyzer.new([],true)#no stop words
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.string_to_tokens_raw(string, type = :all_stop_words)
|
42
|
+
token_stream = self.analyzer(type).token_stream('',string)
|
43
|
+
o=[]; while(j=token_stream.next); o << j; end;
|
44
|
+
return o
|
45
|
+
end
|
46
|
+
def self.string_to_tokens(string, type = :all_stop_words)
|
47
|
+
self.string_to_tokens_raw(string, type).collect{|token| token.text}
|
48
|
+
end
|
49
|
+
def self.string_to_clean_tokens(string, type = :all_stop_words)
|
50
|
+
tokens = string_to_tokens(string, type)
|
51
|
+
tokens.delete_if{|token| token.match(/\d/)}
|
52
|
+
tokens
|
53
|
+
end
|
54
|
+
def self.string_to_clean_tokens_string(string, type = :all_stop_words)
|
55
|
+
string_to_clean_tokens(string, type).join(' ')
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
def self.normalize_string(string)
|
64
|
+
@htmlentities||=HTMLEntities.new
|
65
|
+
@htmlentities.decode(normalize_unicode_characters(string.gsub(/[\s\302\240]+/,' ').strip))
|
66
|
+
end
|
67
|
+
# Decimal Unicode code points (as Strings) mapped to the plain ASCII text
# that replaces them.
UNICODE_CONVERSIONS = {
  "8230" => '...', # horizontal ellipsis
  "8194" => ' ',   # en space
  "8195" => ' ',   # em space
  "8201" => ' ',   # thin space
  "8211" => '-',   # en dash
  "8216" => '\'',  # left single quotation mark
  "8217" => '\'',  # right single quotation mark
  "8220" => '"',   # left double quotation mark
  "8221" => '"'    # right double quotation mark
}
# The same table as [character, replacement] pairs, with each code point
# packed into its UTF-8 character.
TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map {|k, v| [[k.to_i].pack('U*'), v] }

# Replaces the typographic characters listed in UNICODE_CONVERSIONS with
# their ASCII stand-ins.
#
# string - the text to normalise; it is left untouched (so frozen strings
#          are accepted).
#
# Returns a new String.
def self.normalize_unicode_characters(string)
  TRANSLATED_CONVERSIONS.inject(string) {|text, (from, to)| text.gsub(from, to) }
end
|
83
|
+
end
|
84
|
+
end
|
data/lib/charles.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#require "charles/version"
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'nokogiri'
|
7
|
+
require 'htmlentities'
|
8
|
+
require 'mechanize'
|
9
|
+
require 'active_support/cache'
|
10
|
+
require 'active_support/cache/file_store'
|
11
|
+
require 'image_size'
|
12
|
+
|
13
|
+
require 'ferret'
|
14
|
+
Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
|
15
|
+
|
16
|
+
require "charles/document"
|
17
|
+
require "charles/misc"
|
18
|
+
|
19
|
+
module Charles
|
20
|
+
# Your code goes here...
|
21
|
+
def self.logger=(logger)
|
22
|
+
@logger = logger
|
23
|
+
end
|
24
|
+
def self.logger
|
25
|
+
@logger ||= Logger.new(STDERR)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.get(url)
|
29
|
+
agent = Mechanize.new{|a|a.user_agent_alias = 'Mac Mozilla'}
|
30
|
+
body = file_cache.fetch("Charles.get(#{url})"){
|
31
|
+
agent.get(url).body
|
32
|
+
}
|
33
|
+
return Document.new(body, :url => url, :mechanize_agent => agent)
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.options
|
37
|
+
@options ||= {}
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.file_cache
|
41
|
+
@file_cache ||= ActiveSupport::Cache::FileStore.new(Charles.options[:tmp_path], :namespace => 'charles')
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
# Small statistics helpers mixed into every Enumerable.
# NOTE(review): this reopens a core module; the methods only rely on #each
# (via inject/count) so they work for Range, Set, etc. as well as Array.
module Enumerable

  # Sum of the elements, starting from +identity+. With a block, sums the
  # block's result for each element. Accepting the identity and the block
  # keeps this override compatible with the standard Enumerable#sum.
  def sum(identity = 0, &block)
    if block
      inject(identity){|accum, i| accum + block.call(i) }
    else
      inject(identity){|accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN for an empty collection).
  def mean
    sum / count.to_f
  end

  # Unbiased sample variance (divides by n - 1).
  def sample_variance
    m = mean
    squares = inject(0){|accum, i| accum + (i - m) ** 2 }
    squares / (count - 1).to_f
  end

  # Sample standard deviation: square root of #sample_variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
|
66
|
+
|
data/optimise.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'lib/charles'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
TEST_ARTICLES = YAML.load_file("test/articles.yml")
|
7
|
+
|
8
|
+
class CharlesOptimiser
|
9
|
+
@@high_score = 0
|
10
|
+
|
11
|
+
def initialize
|
12
|
+
@articles = YAML.load_file("test/articles.yml")
|
13
|
+
@articles.each{|article|
|
14
|
+
next if article[:file].empty?
|
15
|
+
article[:html] = File.read("test/articles/#{article[:file]}.html")
|
16
|
+
article[:document] = Charles::Document.new(article[:html])
|
17
|
+
article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
|
18
|
+
}
|
19
|
+
end
|
20
|
+
def optimise
|
21
|
+
50.times do
|
22
|
+
seeds = {
|
23
|
+
:length => random(800,3000),
|
24
|
+
:distance_from_top => random(0.1,2),
|
25
|
+
:internal_nodes => random(5,50),
|
26
|
+
:internal_nodes_buffer => random(5,150),
|
27
|
+
:title_match => random(0,1),
|
28
|
+
:title_match_buffer => random(0,0.6)
|
29
|
+
}
|
30
|
+
_scores = articles_scores(seeds)
|
31
|
+
_scores.delete_if{|score| score > 1}
|
32
|
+
_score = _scores.mean
|
33
|
+
_std_dev = _scores.standard_deviation
|
34
|
+
if _score >= @@high_score
|
35
|
+
@@high_score = _score
|
36
|
+
pp [_score, _std_dev, seeds, _scores.select{|i| i<0.1}.size]
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
def articles_scores(seeds={})
|
41
|
+
_scores = []
|
42
|
+
@articles.each{|article|
|
43
|
+
next if article[:file].empty?
|
44
|
+
result = article[:document].content(seeds)
|
45
|
+
_score = compare_articles(result, article[:expected][:content])
|
46
|
+
_scores << _score
|
47
|
+
}
|
48
|
+
_scores
|
49
|
+
end
|
50
|
+
def compare_articles(a,b)
|
51
|
+
[compare_articles_single_side(a,b),compare_articles_single_side(b,a)].mean
|
52
|
+
end
|
53
|
+
def compare_articles_single_side(a,b)
|
54
|
+
index = Ferret::Index::Index.new()
|
55
|
+
index.field_infos.add_field(:content, :store => :no, :boost => 1)
|
56
|
+
index << {:content => a}
|
57
|
+
search = index.search(b)
|
58
|
+
search.max_score
|
59
|
+
end
|
60
|
+
|
61
|
+
def random(min,max)
|
62
|
+
rand * (max - min) + min
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
while true
|
67
|
+
thread = Thread.new {
|
68
|
+
CharlesOptimiser.new.optimise
|
69
|
+
}
|
70
|
+
thread.join
|
71
|
+
puts "***"
|
72
|
+
end
|
@@ -0,0 +1,5 @@
|
|
1
|
+
The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
|
2
|
+
|
3
|
+
'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
|
4
|
+
|
5
|
+
With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.
|