charles 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/bin/charles +1 -0
- data/charles.gemspec +1 -0
- data/lib/charles.rb +1 -0
- data/lib/charles/document.rb +18 -7
- data/lib/charles/images.rb +1 -1
- data/lib/charles/version.rb +1 -1
- metadata +17 -3
data/Gemfile
CHANGED
data/bin/charles
CHANGED
data/charles.gemspec
CHANGED
data/lib/charles.rb
CHANGED
data/lib/charles/document.rb
CHANGED
@@ -17,6 +17,10 @@ module Charles
|
|
17
17
|
|
18
18
|
def logger; Charles.logger; end # convenience accessor: returns whatever Charles.logger returns
|
19
19
|
|
20
|
+
def interesting_content(options = {:max_length => 388}) # returns Shiner.shine applied to the extracted content; options is forwarded as-is, so a caller-supplied hash replaces the default entirely (no merge). NOTE(review): :max_length is presumably a length cap applied by Shiner - confirm unit against Shiner
|
21
|
+
Shiner.shine(content, options) # content is called with its default seeds; NOTE(review): content can return nil (it does a bare `return unless content_node`) - confirm Shiner.shine tolerates nil
|
22
|
+
end
|
23
|
+
|
20
24
|
def content(seeds={})
|
21
25
|
content_node = content_node(seeds)
|
22
26
|
return unless content_node
|
@@ -30,12 +34,12 @@ module Charles
|
|
30
34
|
end
|
31
35
|
|
32
36
|
def calculate_content_nodes(seeds={})
|
33
|
-
default_seeds = {:title_match=>0.
|
34
|
-
:title_match_buffer=>0.
|
35
|
-
:length=>
|
36
|
-
:distance_from_top=>0.
|
37
|
-
:internal_nodes=>
|
38
|
-
:internal_nodes_buffer=>
|
37
|
+
default_seeds = {:title_match=>0.0586074856962615, #0.238237272128463,0.173173520342878
|
38
|
+
:title_match_buffer=>0.508671373602233,
|
39
|
+
:length=>1246.27917099503,
|
40
|
+
:distance_from_top=>0.436005480844439,
|
41
|
+
:internal_nodes=>18.0265463704097,
|
42
|
+
:internal_nodes_buffer=>32.7588984705223}
|
39
43
|
seeds = default_seeds.merge(seeds)
|
40
44
|
|
41
45
|
o = []
|
@@ -49,7 +53,8 @@ module Charles
|
|
49
53
|
:length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
|
50
54
|
:internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
|
51
55
|
:distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
|
52
|
-
:title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1
|
56
|
+
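:title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1 + seeds[:title_match_buffer])**seeds[:title_match].to_f, #ferret index score, search score with page title. NOTE(review): precedence - `||` binds looser than `+`, and `/` binds tighter than `+`, so this evaluates as ((index || (0.0 + buffer)) / 1 + buffer) ** title_match; confirm whether ((index || 0.0) + buffer) / (1 + buffer) was intended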
|
57
|
+
#:interesting => (0.5 + _n.interesting_score) ** seeds[:interesting].to_f
|
53
58
|
#:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
|
54
59
|
}
|
55
60
|
o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
|
@@ -143,6 +148,12 @@ Nokogiri::XML::Node.class_eval {
|
|
143
148
|
def internal_nodes_size # size of the node set returned by search('*') on this node, memoized in @internal_nodes_size
|
144
149
|
@internal_nodes_size ||= search('*').size # computed on first call only; the cached value is reused afterwards and is not refreshed if the subtree is modified later
|
145
150
|
end
|
151
|
+
#def interesting_score
|
152
|
+
# @interesting_score ||= (
|
153
|
+
# classifications = Shiner.classifier.classifications(clean_inner_text)
|
154
|
+
# 1 - classifications['Interesting'] / classifications['Uninteresting']
|
155
|
+
# )
|
156
|
+
#end
|
146
157
|
}
|
147
158
|
|
148
159
|
|
data/lib/charles/images.rb
CHANGED
data/lib/charles/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: charles
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 1
|
10
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jason Ling Xiaowei
|
@@ -115,6 +115,20 @@ dependencies:
|
|
115
115
|
version: "0"
|
116
116
|
type: :runtime
|
117
117
|
version_requirements: *id007
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: shiner
|
120
|
+
prerelease: false
|
121
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
hash: 3
|
127
|
+
segments:
|
128
|
+
- 0
|
129
|
+
version: "0"
|
130
|
+
type: :runtime
|
131
|
+
version_requirements: *id008
|
118
132
|
description: Charles the Content Extractor
|
119
133
|
email:
|
120
134
|
- jason@jeyel.com
|