charles 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +10 -0
- data/Rakefile +13 -0
- data/bin/charles +23 -0
- data/charles.gemspec +25 -0
- data/lib/charles/document.rb +177 -0
- data/lib/charles/images.rb +77 -0
- data/lib/charles/internal_attributes.rb +40 -0
- data/lib/charles/misc.rb +84 -0
- data/lib/charles/version.rb +3 -0
- data/lib/charles.rb +66 -0
- data/optimise.rb +72 -0
- data/test/articles/20120525_1525_straitstimes.com.content.txt +5 -0
- data/test/articles/20120525_1525_straitstimes.com.html +1929 -0
- data/test/articles/20120525_1534_bbc.co.uk.content.txt +19 -0
- data/test/articles/20120525_1534_bbc.co.uk.html +1777 -0
- data/test/articles/20120525_1727_bbc.co.uk.content.txt +39 -0
- data/test/articles/20120525_1727_bbc.co.uk.html +1889 -0
- data/test/articles/20120525_1730_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1730_channelnewsasia.com.html +963 -0
- data/test/articles/20120525_1733_channelnewsasia.com.content.txt +19 -0
- data/test/articles/20120525_1733_channelnewsasia.com.html +923 -0
- data/test/articles/20120525_1736_nytimes.com.content.txt +21 -0
- data/test/articles/20120525_1736_nytimes.com.html +856 -0
- data/test/articles/20120525_1743_nytimes.com.content.txt +11 -0
- data/test/articles/20120525_1743_nytimes.com.html +98 -0
- data/test/articles/20120525_1747_techcrunch.com.content.txt +11 -0
- data/test/articles/20120525_1747_techcrunch.com.html +1098 -0
- data/test/articles/20120528_0929_washingtonpost.com.content.txt +23 -0
- data/test/articles/20120528_0929_washingtonpost.com.html +3335 -0
- data/test/articles/20120528_0931_latimes.com.content.txt +45 -0
- data/test/articles/20120528_0931_latimes.com.html +6371 -0
- data/test/articles/20120528_0938_entertainment.time.com.content.txt +31 -0
- data/test/articles/20120528_0938_entertainment.time.com.html +1261 -0
- data/test/articles/20120528_0943_bloomberg.com.content.txt +13 -0
- data/test/articles/20120528_0943_bloomberg.com.html +2874 -0
- data/test/articles/20120528_0947_reuters.com.content.txt +35 -0
- data/test/articles/20120528_0947_reuters.com.html +1563 -0
- data/test/articles/20120528_1106_reuters.com.content.txt +5 -0
- data/test/articles/20120528_1106_reuters.com.html +551 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.content.txt +19 -0
- data/test/articles/20120528_1109_musicthing.blogspot.co.uk.html +865 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.content.txt +15 -0
- data/test/articles/20120528_1114_mobileinc.co.uk.html +550 -0
- data/test/articles/20120528_1119_forbes.com.content.txt +15 -0
- data/test/articles/20120528_1119_forbes.com.html +1406 -0
- data/test/articles/20120528_1122_techcrunch.com.content.txt +58 -0
- data/test/articles/20120528_1122_techcrunch.com.html +1131 -0
- data/test/articles/20120528_1126_blogs.adobe.com.content.txt +13 -0
- data/test/articles/20120528_1126_blogs.adobe.com.html +303 -0
- data/test/articles/20120528_1142_thestar.com.my.content.txt +27 -0
- data/test/articles/20120528_1142_thestar.com.my.html +943 -0
- data/test/articles/20120528_1146_suntimes.com.content.txt +33 -0
- data/test/articles/20120528_1146_suntimes.com.html +5166 -0
- data/test/articles/20120528_1148_asiaone.com.content.txt +27 -0
- data/test/articles/20120528_1148_asiaone.com.html +1070 -0
- data/test/articles/20120529_1120_online.wsj.com.content.txt +56 -0
- data/test/articles/20120529_1120_online.wsj.com.html +3035 -0
- data/test/articles/20120529_1122_online.wsj.com.content.txt +35 -0
- data/test/articles/20120529_1122_online.wsj.com.html +2725 -0
- data/test/articles/20120529_1127_smh.com.au.content.txt +13 -0
- data/test/articles/20120529_1127_smh.com.au.html +2034 -0
- data/test/articles.yml +221 -0
- data/test/test_charles.rb +70 -0
- metadata +279 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Jason Ling
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/charles
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for Charles.
#
# Usage: charles URL_OR_FILE
#
# The argument is either an http(s) URL or the path of a file that contains
# one. The extracted content, title and filtered image URLs are printed as YAML.

# Put the gem's own lib directory on the load path so the script works from
# any working directory; a bare require 'lib/charles' only resolves when "."
# is on the load path and the script is started from the project root.
$LOAD_PATH.unshift(File.expand_path('../../lib', __FILE__))
require 'charles'
require 'yaml'

Charles.options[:tmp_path] = File.expand_path('../../test/tmp', __FILE__)

url = ARGV.shift
abort "Usage: #{File.basename($0)} URL_OR_FILE" if url.nil? || url.empty?

# Anything that does not look like a URL is treated as a file holding the URL.
# Strip it so a trailing newline in the file does not end up in the request.
unless url =~ /^http/
  url = File.read(url).strip
end

document = Charles.get(url)
puts({
  :content => document.content,
  :title => document.title,
  :filtered_images => document.filtered_images.collect { |image| image[:url] }
}.to_yaml)
|
20
|
+
|
21
|
+
|
22
|
+
|
23
|
+
|
data/charles.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/charles/version', __FILE__)
|
3
|
+
|
4
|
+
# Packaging manifest for the charles gem: identity, file lists taken from git,
# and runtime dependencies.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = "charles"
  spec.version     = Charles::VERSION
  spec.authors     = ["Jason Ling Xiaowei"]
  spec.email       = ["jason@jeyel.com"]
  spec.summary     = 'Charles the Content Extractor'
  spec.description = 'Charles the Content Extractor'
  spec.homepage    = "https://github.com/jlxw/charles"

  # Everything tracked by git is packaged; executables and tests are derived
  # from that list by path prefix.
  spec.files         = `git ls-files`.split($\)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies (unversioned, as in the original manifest)
  %w[ferret nokogiri htmlentities mechanize activesupport rack imagesize].each do |dependency|
    spec.add_dependency dependency
  end
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
require 'charles/images'
|
2
|
+
require 'charles/internal_attributes'
|
3
|
+
|
4
|
+
module Charles
  # Wraps a parsed HTML page and picks the node most likely to hold the main
  # article text by scoring every sufficiently long node under <body>.
  class Document
    include Charles::InternalAttributes
    include Charles::Images

    # Parses +input+ as HTML, removes script/style elements and keeps the
    # nodes under <body> whose cleaned token text is longer than 30 characters.
    #
    # input   - HTML source string.
    # options - Hash of settings read elsewhere in this class and its mixins
    #           (:url, :mechanize_agent, :sample_titles).
    def initialize(input, options={})
      @document = Nokogiri::HTML.parse(input)
      @document.search("script, style").remove
      @nodes = @document.search('body *').select { |node|
        node.clean_inner_tokens_text.size > 30 # arbitrary minimum inner text length
      }
      @options = options
    end

    # Shared logger for the library.
    def logger; Charles.logger; end

    # Returns the cleaned text of the best-scoring node, or nil when no node
    # qualified. +seeds+ overrides the default scoring weights.
    def content(seeds={})
      node = content_node(seeds)
      return unless node
      refine_content_node(node).clean_inner_text
    end

    # Returns the best-scoring node, or nil when there are no candidates.
    def content_node(seeds={})
      best = calculate_content_nodes(seeds).first
      best && best[:node]
    end

    # Scores every candidate node and returns an Array of
    # {:node, :score, :scores} hashes sorted by descending :score.
    #
    # seeds - Hash overriding any of the default weights below.
    def calculate_content_nodes(seeds={})
      default_seeds = {:title_match=>0.145422959269808,
        :title_match_buffer=>0.0174920023610796,
        :length=>1100.27450832379,
        :distance_from_top=>0.308408501217311,
        :internal_nodes=>25.680381972181,
        :internal_nodes_buffer=>20.2006169153009}
      seeds = default_seeds.merge(seeds)

      # The index is memoised, so fetch it once instead of once per node.
      title_scores = content_node_ferret_index
      buffer = seeds[:title_match_buffer]

      scored = []
      @nodes.each_with_index do |node, index|
        rank = index + 1

        # NOTE(review): the original expression was
        #   (score||0.0 + buffer) / 1+ + buffer
        # which Ruby evaluates as (score || buffer) + buffer. It looks like
        # ((score || 0.0) + buffer) / (1 + buffer) was intended, but the
        # default seeds above were presumably tuned against the current
        # behaviour, so the numeric result is kept and only made explicit.
        match_score = title_scores[index] || (0.0 + buffer)

        scores = {
          # length of inner text in this node, too little = less
          :length => 1 - seeds[:length].to_f / (node.clean_inner_tokens_text.size + seeds[:length]),
          # number of nodes in this node, too many = less
          :internal_nodes => seeds[:internal_nodes].to_f / (node.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]),
          # how far this element is from the top of the page
          :distance_from_top => (1 - (rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f,
          # ferret search score of the node against the page title
          :title_match => (match_score + buffer)**seeds[:title_match].to_f
        }
        scored << {:node => node, :score => scores.values.inject(:*), :scores => scores}
      end

      scored.sort! { |a, b| b[:score] <=> a[:score] }
      scored
    end

    # Returns a copy of +node+ with short leading and trailing children
    # removed and a space inserted after every element, so adjacent blocks do
    # not run together when the text is extracted.
    def refine_content_node(node)
      copy = node.dup
      min_size = 30

      # Removes children from the front of the given list until the first one
      # with at least min_size characters of token text.
      strip_clutter = lambda do |children|
        children.each do |child|
          break unless child.clean_inner_tokens_text.size < min_size
          child.remove
        end
      end

      strip_clutter.call(copy.children)
      # Re-read the children so the trailing pass sees the pruned list.
      strip_clutter.call(copy.children.reverse)

      copy.search('*').each { |element| element.after(' ') }
      copy
    end

    # Memoised per-node title match scores (see the builder below).
    def content_node_ferret_index
      @content_node_ferret_index ||= caluclate_content_node_ferret_index
    end

    # Indexes the text of every candidate node and searches it for the page
    # title. Returns an Array indexed like @nodes holding each hit's score
    # divided by the best score; nodes without a hit are left nil.
    #
    # Returns an empty Array when there is no title or no candidate node,
    # instead of calling gsub on nil.
    def caluclate_content_node_ferret_index
      # remove special characters used by the ferret query parser:
      # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html
      query = title.to_s.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
      return [] if @nodes.empty? || query.strip.empty?

      index = Ferret::Index::Index.new()
      index.field_infos.add_field(:id, :store => :yes)
      index.field_infos.add_field(:content, :store => :no, :boost => 1)
      @nodes.each_with_index do |node, position|
        index << {:id => position, :content => node.clean_inner_text}
      end

      results = index.search(query, :limit => @nodes.size)
      scores = []
      results.hits.each do |hit|
        scores[index[hit.doc][:id].to_i] = hit.score / results.max_score
      end
      scores
    end
    # Correctly spelled alias; the original misspelled name is kept for callers.
    alias_method :calculate_content_node_ferret_index, :caluclate_content_node_ferret_index

    # Mechanize agent taken from the options, created on first use otherwise.
    def mechanize_agent
      @options[:mechanize_agent] ||= Mechanize.new { |a| a.user_agent_alias = 'Mac Mozilla' }
    end
  end
end
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
# Memoised text helpers added to every Nokogiri node.
Nokogiri::XML::Node.class_eval do
  # Inner text passed through Charles::Misc.normalize_string, cached on the node.
  def clean_inner_text
    return @clean_inner_text if @clean_inner_text
    @clean_inner_text = Charles::Misc.normalize_string(inner_text)
  end

  # The cleaned inner text reduced to its clean tokens joined into one
  # string, cached on the node.
  def clean_inner_tokens_text
    return @clean_inner_tokens_text if @clean_inner_tokens_text
    cleaned = clean_inner_text
    @clean_inner_tokens_text = Charles::Misc.string_to_clean_tokens_string(cleaned)
  end

  # Number of descendant elements matched by '*', cached on the node.
  def internal_nodes_size
    return @internal_nodes_size if @internal_nodes_size
    @internal_nodes_size = search('*').size
  end
end
|
147
|
+
|
148
|
+
|
149
|
+
#https://github.com/cheald/pismo/blob/master/lib/pismo.rb
|
150
|
+
class Nokogiri::HTML::Document
  # First node matching +search+, or nil when nothing matches or the lookup
  # raises.
  def get_the(search)
    begin
      self.search(search).first
    rescue
      nil
    end
  end

  # Runs each query and returns the normalised text of every one that
  # produced a value.
  #
  # A String query is a selector: the first match contributes its 'content'
  # attribute when it is a <meta> element and its inner text otherwise.
  # An Array query is [selector, callable]: the callable receives the first
  # match and its return value is used. Errors raised while evaluating a
  # query are swallowed and that query is skipped.
  def match(queries = [])
    found = []

    [*queries].each do |query|
      raw = begin
        case query
        when String
          element = self.search(query).first
          if element
            element.name.downcase == "meta" ? element['content'] : element.inner_text
          end
        when Array
          selector, extractor = query.first, query.last
          extractor.call(self.search(selector).first)
        end
      rescue
        nil
      end

      next unless raw
      found << Charles::Misc.normalize_string(raw)
    end

    found.compact
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Charles
  # Image discovery for a document. Expects the including class to provide
  # content_node, logger, mechanize_agent and an @options hash.
  module Images
    # First discovered image URL, or nil when there is none.
    def image
      images && images.first
    end

    # Memoised list of image URLs (see calculate_images).
    def images
      @images ||= calculate_images
    end

    # Looks for images in the content node and, failing that, in its
    # ancestors, walking up at most half of the ancestor chain.
    #
    # Returns nil when there is no content node, the URLs found at the first
    # level that has a usable set of images, or [] when none does.
    def calculate_images
      node = content_node
      return unless node

      (node.ancestors.size / 2).times do
        found = calculate_image_from_node(node)
        return found if found
        node = node.parent
      end

      []
    end

    # Collects the src of every <img> under +_node+, resolved against the
    # document URL when one was given.
    #
    # Returns nil when the node has no images or 50 or more (sanity limit).
    # A src that cannot be joined with the document URL is logged and skipped.
    def calculate_image_from_node(_node)
      imgs = _node.search('img')
      base = page_base_uri
      return nil if imgs.empty? || imgs.size >= 50

      sources = []
      imgs.each do |img|
        src = img.attr('src')
        next unless src
        begin
          # Without a document URL there is nothing to resolve against, so the
          # src is returned unchanged instead of failing in URI.parse(nil).
          sources << (base ? (base + src).to_s : src)
        rescue StandardError => e
          logger.info "Error #{e}: #{base} + #{src}"
        end
      end
      sources
    end

    # Downloads every discovered image and keeps those larger than 88x88
    # pixels in area whose sides differ by a factor below 2.5.
    #
    # Returns an Array of {:url, :data, :width, :height} hashes; empty when
    # there are no images.
    def filtered_images
      max_proportion = 2.5
      min_area = 88 * 88
      kept = []

      # images is nil when the document has no content node.
      Array(images).each do |url|
        data = get_image(url)
        next unless data

        width, height = ImageSize.new(data).get_size
        # Skip images whose dimensions could not be determined.
        next unless width && height && width > 0 && height > 0
        next unless width * height > min_area
        next unless width.to_f / height < max_proportion &&
                    height.to_f / width < max_proportion

        kept << {:url => url, :data => data, :width => width, :height => height}
      end

      kept
    end

    # Fetches the image body through the file cache, using the document URL
    # as referer. Bodies of 900000 bytes or more are discarded.
    #
    # Returns the body String, or nil when the download failed or the body
    # was too large. A failure is cached as nil for one hour.
    def get_image(url)
      cache_key = "get_image(#{url})"
      begin
        Charles.file_cache.fetch(cache_key) {
          body = mechanize_agent.get(url, [], page_base_uri).body
          body.size < 900000 ? body : nil
        }
      rescue StandardError, Timeout::Error
        Charles.file_cache.write(cache_key, nil, :expires_in => 60 * 60)
        # Return nil explicitly: the result of write is truthy and would
        # otherwise be mistaken for image data by callers.
        nil
      end
    end

    private

    # Parsed document URL, or nil when the document was built without one.
    def page_base_uri
      url = @options[:url]
      url ? URI.parse(url) : nil
    end
  end
end
|
76
|
+
|
77
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Charles
  # Title extraction for a document. Expects the including class to provide
  # @document (the parsed page) and an @options hash.
  module InternalAttributes
    # Cleaned text of the first <title> element, or nil when the page has none.
    def title
      @title ||= begin
        element = @document.search('title').first
        element ? element.clean_inner_text : nil
      end
    end

    # The title with site-wide boilerplate words stripped from both ends.
    #
    # Boilerplate words are those that occur in at least 90% of
    # @options[:sample_titles]. Returns the unmodified title when fewer than
    # five sample titles are available, when there is no title, or when
    # stripping would leave nothing.
    def clean_title
      samples = @options[:sample_titles]
      return title if title.nil? || !samples || samples.size < 5

      tokens = Charles::Misc.string_to_tokens_raw(title, :no_stop_words)
      boilerplate = words_to_filter_from_sample_titles
      # remove boilerplate words from the beginning, then from the end
      tokens.shift while tokens.first && boilerplate.include?(tokens.first.text)
      tokens.pop while tokens.last && boilerplate.include?(tokens.last.text)
      return title if tokens.empty? # everything stripped: fall back to the full title

      # NOTE(review): assumes the token start/end offsets are usable with
      # String#slice - confirm for titles containing multibyte characters.
      first = tokens.first.start
      last = tokens.last.end
      core = title.slice(first, last - first)
      return title unless core

      # include symbols or punctuation directly attached to the remaining words
      found = title.match(/[^\s\302\240]*#{Regexp.escape(core)}[^\s\302\240]*/)
      found ? found[0].strip : title
    end

    protected

    # Memoised boilerplate word list; it was previously recomputed on every
    # call, i.e. on every iteration of the stripping loops in clean_title.
    def words_to_filter_from_sample_titles
      @words_to_filter_from_sample_titles ||= calculate_words_to_filter_from_sample_titles
    end

    # Counts in how many sample titles each token occurs and returns the
    # tokens present in at least 90% of them.
    def calculate_words_to_filter_from_sample_titles
      counts = Hash.new(0)
      @options[:sample_titles].each do |sample_title|
        Charles::Misc.string_to_tokens(sample_title, :no_stop_words).uniq.each do |token|
          counts[token] += 1
        end
      end
      threshold = (0.9 * @options[:sample_titles].size).ceil
      counts.select { |_word, seen| seen >= threshold }.collect { |word, _seen| word }
    end
  end
end
|
data/lib/charles/misc.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
module Charles
  # Stateless text helpers: Ferret-based string comparison and tokenising,
  # plus whitespace/entity/punctuation normalisation.
  module Misc
    # Symmetric similarity of two strings: the average of searching each one
    # for the other.
    def self.compare_strings(a,b)
      (compare_strings_single_side(a, b) + compare_strings_single_side(b, a)) / 2.0
    end

    # Indexes +a+ in a throwaway Ferret index, searches it with +b+ as the
    # query and returns the search's max_score.
    def self.compare_strings_single_side(a,b)
      index = Ferret::Index::Index.new()
      index.field_infos.add_field(:content, :store => :no, :boost => 1)
      index << {:content => a}
      # remove special characters used by the ferret query parser:
      # http://www.davebalmain.com/api/classes/Ferret/QueryParser.html
      query = b.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
      index.search(query).max_score
    end

    # Memoised analyzer for +type+ (:all_stop_words or :no_stop_words),
    # built by the matching analyzer_<type> method.
    def self.analyzer(type = :all_stop_words)
      @analyzer ||= {}
      @analyzer[type] ||= self.send("analyzer_#{type}")
    end

    # Analyzer that drops the stop words of all the languages listed below.
    def self.analyzer_all_stop_words
      # http://blackwinter.github.com/ferret/classes/Ferret/Analysis.html
      stop_words = [
        Ferret::Analysis::EXTENDED_ENGLISH_STOP_WORDS,
        Ferret::Analysis::FULL_FRENCH_STOP_WORDS,
        Ferret::Analysis::FULL_SPANISH_STOP_WORDS,
        Ferret::Analysis::FULL_PORTUGUESE_STOP_WORDS,
        Ferret::Analysis::FULL_ITALIAN_STOP_WORDS,
        Ferret::Analysis::FULL_GERMAN_STOP_WORDS,
        Ferret::Analysis::FULL_DUTCH_STOP_WORDS,
        Ferret::Analysis::FULL_SWEDISH_STOP_WORDS,
        Ferret::Analysis::FULL_NORWEGIAN_STOP_WORDS,
        Ferret::Analysis::FULL_DANISH_STOP_WORDS,
        Ferret::Analysis::FULL_RUSSIAN_STOP_WORDS,
        Ferret::Analysis::FULL_FINNISH_STOP_WORDS
      ].inject { |all, words| all | words }
      Ferret::Analysis::StandardAnalyzer.new(stop_words, true)
    end

    # Analyzer with an empty stop word list.
    def self.analyzer_no_stop_words
      Ferret::Analysis::StandardAnalyzer.new([], true)
    end

    # All token objects the analyzer for +type+ yields for +string+.
    def self.string_to_tokens_raw(string, type = :all_stop_words)
      token_stream = self.analyzer(type).token_stream('', string)
      tokens = []
      while (token = token_stream.next)
        tokens << token
      end
      tokens
    end

    # Token texts for +string+.
    def self.string_to_tokens(string, type = :all_stop_words)
      string_to_tokens_raw(string, type).collect { |token| token.text }
    end

    # Token texts for +string+ without any token that contains a digit.
    def self.string_to_clean_tokens(string, type = :all_stop_words)
      string_to_tokens(string, type).reject { |token| token =~ /\d/ }
    end

    # Clean tokens joined with single spaces.
    def self.string_to_clean_tokens_string(string, type = :all_stop_words)
      string_to_clean_tokens(string, type).join(' ')
    end

    # Collapses runs of whitespace (including non-breaking spaces) to one
    # space, strips the ends, replaces typographic punctuation and decodes
    # HTML entities.
    def self.normalize_string(string)
      @htmlentities ||= HTMLEntities.new
      collapsed = string.gsub(/[\s\302\240]+/, ' ').strip
      @htmlentities.decode(normalize_unicode_characters(collapsed))
    end

    # Unicode code point (decimal, as a String) => ASCII replacement.
    UNICODE_CONVERSIONS = {
      "8230" => '...',
      "8194" => ' ',
      "8195" => ' ',
      "8201" => ' ',
      "8211" => '-',
      "8216" => '\'',
      "8217" => '\'',
      "8220" => '"',
      "8221" => '"'
    }.freeze

    # [character, replacement] pairs derived from UNICODE_CONVERSIONS.
    TRANSLATED_CONVERSIONS = UNICODE_CONVERSIONS.map { |k, v| [[k.to_i].pack('U*'), v] }.freeze

    # Returns a copy of +string+ with the characters listed in
    # UNICODE_CONVERSIONS replaced by their ASCII equivalents. The argument is
    # left untouched, so frozen strings are accepted and callers' strings are
    # never modified behind their back.
    def self.normalize_unicode_characters(string)
      TRANSLATED_CONVERSIONS.inject(string) { |text, (from, to)| text.gsub(from, to) }
    end
  end
end
|
data/lib/charles.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
#require "charles/version"
|
2
|
+
require 'pp'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'nokogiri'
|
7
|
+
require 'htmlentities'
|
8
|
+
require 'mechanize'
|
9
|
+
require 'active_support/cache'
|
10
|
+
require 'active_support/cache/file_store'
|
11
|
+
require 'image_size'
|
12
|
+
|
13
|
+
require 'ferret'
|
14
|
+
Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
|
15
|
+
|
16
|
+
require "charles/document"
|
17
|
+
require "charles/misc"
|
18
|
+
|
19
|
+
# Library entry points: configuration, logging, caching and page retrieval.
module Charles
  # Replaces the logger used by the library.
  def self.logger=(logger)
    @logger = logger
  end

  # Logger used by the library; defaults to one writing to STDERR.
  def self.logger
    @logger ||= begin
      # Loaded here so the default works even when no dependency has already
      # pulled in the standard Logger class.
      require 'logger'
      Logger.new(STDERR)
    end
  end

  # Downloads +url+ (through the file cache) and wraps the body in a Document
  # that remembers the URL and the agent used to fetch it.
  def self.get(url)
    agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Mozilla' }
    body = file_cache.fetch("Charles.get(#{url})") { agent.get(url).body }
    Document.new(body, :url => url, :mechanize_agent => agent)
  end

  # Mutable library-wide settings. :tmp_path selects the cache directory.
  def self.options
    @options ||= {}
  end

  # File-backed cache shared by page and image downloads. Uses
  # options[:tmp_path] when set and a "charles" directory under the system
  # temp directory otherwise, rather than handing FileStore a nil path.
  def self.file_cache
    @file_cache ||= begin
      require 'tmpdir'
      cache_path = options[:tmp_path] || File.join(Dir.tmpdir, 'charles')
      ActiveSupport::Cache::FileStore.new(cache_path, :namespace => 'charles')
    end
  end
end
|
44
|
+
|
45
|
+
# Small statistics helpers mixed into every Enumerable.
module Enumerable

  # Only supply sum where Ruby does not already provide it; redefining the
  # built-in would drop its support for an initial value and a block.
  unless method_defined?(:sum)
    # Total of all elements, starting from 0.
    def sum
      inject(0) { |accum, i| accum + i }
    end
  end

  # Arithmetic mean as a Float (NaN for an empty collection).
  # Uses count rather than length so ranges and other enumerables work too.
  def mean
    sum / count.to_f
  end

  # Sample variance, i.e. the squared deviations from the mean divided by
  # (number of elements - 1).
  def sample_variance
    m = mean
    squared_deviations = inject(0) { |accum, i| accum + (i - m) ** 2 }
    squared_deviations / (count - 1).to_f
  end

  # Square root of the sample variance.
  def standard_deviation
    Math.sqrt(sample_variance)
  end

end
|
66
|
+
|
data/optimise.rb
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'lib/charles'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
TEST_ARTICLES = YAML.load_file("test/articles.yml")
|
7
|
+
|
8
|
+
# Random search over the scoring seeds of Charles::Document: tries random
# seed sets against the recorded test articles and prints every set that
# matches or beats the best mean score seen so far in this process.
class CharlesOptimiser
  # Best mean score so far, shared by all instances. Kept as a class-level
  # instance variable instead of a @@ class variable.
  @high_score = 0
  class << self
    attr_accessor :high_score
  end

  # Loads the article list and, for every entry with a file name, the stored
  # HTML, a Document built from it and the expected content text.
  def initialize
    @articles = YAML.load_file("test/articles.yml")
    @articles.each do |article|
      next if article[:file].empty?
      article[:html] = File.read("test/articles/#{article[:file]}.html")
      article[:document] = Charles::Document.new(article[:html])
      article[:expected][:content] = File.read("test/articles/#{article[:file]}.content.txt")
    end
  end

  # Tries 50 random seed sets and prints
  # [mean score, std deviation, seeds, number of scores below 0.1]
  # for each one that is at least as good as the current high score.
  def optimise
    50.times do
      seeds = {
        :length => random(800,3000),
        :distance_from_top => random(0.1,2),
        :internal_nodes => random(5,50),
        :internal_nodes_buffer => random(5,150),
        :title_match => random(0,1),
        :title_match_buffer => random(0,0.6)
      }
      scores = articles_scores(seeds).reject { |score| score > 1 }
      # Nothing left to average; a NaN mean could never beat the high score.
      next if scores.empty?

      score = scores.mean
      std_dev = scores.standard_deviation
      if score >= self.class.high_score
        self.class.high_score = score
        pp [score, std_dev, seeds, scores.select { |i| i < 0.1 }.size]
      end
    end
  end

  # Similarity between extracted and expected content for every article that
  # has a file, in article order.
  def articles_scores(seeds={})
    scores = []
    @articles.each do |article|
      next if article[:file].empty?
      result = article[:document].content(seeds)
      scores << compare_articles(result, article[:expected][:content])
    end
    scores
  end

  # Symmetric similarity: the average of both search directions.
  def compare_articles(a,b)
    (compare_articles_single_side(a, b) + compare_articles_single_side(b, a)) / 2.0
  end

  # Indexes +a+ in a throwaway Ferret index, searches it with +b+ and returns
  # the search's max_score. nil is treated as an empty string (extraction can
  # return nil), and the query is stripped of Ferret query-parser characters
  # in the same way Charles::Misc.compare_strings_single_side does.
  def compare_articles_single_side(a,b)
    index = Ferret::Index::Index.new()
    index.field_infos.add_field(:content, :store => :no, :boost => 1)
    index << {:content => a.to_s}
    query = b.to_s.gsub(/[:()\[\]{}!+"~^\-|<>=*?\\]/, '')
    index.search(query).max_score
  end

  # Random Float in [min, max).
  def random(min,max)
    rand * (max - min) + min
  end
end
|
65
|
+
|
66
|
+
# Run optimisation rounds forever. Each round builds a fresh optimiser in its
# own thread; a separator line is printed after the round has finished.
while true
  Thread.new { CharlesOptimiser.new.optimise }.join
  puts "***"
end
|
@@ -0,0 +1,5 @@
|
|
1
|
+
The People's Action Party (PAP) ended its campaign in Hougang on Thursday with a call for change in the ward, urging voters to start afresh with its young candidate.
|
2
|
+
|
3
|
+
'Hougang, let's turn over a new page and start again,' declared party chairman Khaw Boon Wan at the PAP's final rally of the by-election on Thursday night.
|
4
|
+
|
5
|
+
With the ward in opposition hands since 1991, Hougang residents had been adversely affected, said Mr Khaw.
|