charles 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/bin/charles +1 -0
- data/charles.gemspec +1 -0
- data/lib/charles.rb +1 -0
- data/lib/charles/document.rb +18 -7
- data/lib/charles/images.rb +1 -1
- data/lib/charles/version.rb +1 -1
- metadata +17 -3
data/Gemfile
CHANGED
data/bin/charles
CHANGED
data/charles.gemspec
CHANGED
data/lib/charles.rb
CHANGED
data/lib/charles/document.rb
CHANGED
@@ -17,6 +17,10 @@ module Charles
|
|
17
17
|
|
18
18
|
def logger; Charles.logger; end
|
19
19
|
|
20
|
+
def interesting_content(options = {:max_length => 388})
|
21
|
+
Shiner.shine(content, options)
|
22
|
+
end
|
23
|
+
|
20
24
|
def content(seeds={})
|
21
25
|
content_node = content_node(seeds)
|
22
26
|
return unless content_node
|
@@ -30,12 +34,12 @@ module Charles
|
|
30
34
|
end
|
31
35
|
|
32
36
|
def calculate_content_nodes(seeds={})
|
33
|
-
default_seeds = {:title_match=>0.
|
34
|
-
:title_match_buffer=>0.
|
35
|
-
:length=>
|
36
|
-
:distance_from_top=>0.
|
37
|
-
:internal_nodes=>
|
38
|
-
:internal_nodes_buffer=>
|
37
|
+
default_seeds = {:title_match=>0.0586074856962615, #0.238237272128463,0.173173520342878
|
38
|
+
:title_match_buffer=>0.508671373602233,
|
39
|
+
:length=>1246.27917099503,
|
40
|
+
:distance_from_top=>0.436005480844439,
|
41
|
+
:internal_nodes=>18.0265463704097,
|
42
|
+
:internal_nodes_buffer=>32.7588984705223}
|
39
43
|
seeds = default_seeds.merge(seeds)
|
40
44
|
|
41
45
|
o = []
|
@@ -49,7 +53,8 @@ module Charles
|
|
49
53
|
:length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
|
50
54
|
:internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
|
51
55
|
:distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
|
52
|
-
:title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1
|
56
|
+
:title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1 + seeds[:title_match_buffer])**seeds[:title_match].to_f, #ferret index score, search score with page title
|
57
|
+
#:interesting => (0.5 + _n.interesting_score) ** seeds[:interesting].to_f
|
53
58
|
#:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
|
54
59
|
}
|
55
60
|
o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
|
@@ -143,6 +148,12 @@ Nokogiri::XML::Node.class_eval {
|
|
143
148
|
def internal_nodes_size
|
144
149
|
@internal_nodes_size ||= search('*').size
|
145
150
|
end
|
151
|
+
#def interesting_score
|
152
|
+
# @interesting_score ||= (
|
153
|
+
# classifications = Shiner.classifier.classifications(clean_inner_text)
|
154
|
+
# 1 - classifications['Interesting'] / classifications['Uninteresting']
|
155
|
+
# )
|
156
|
+
#end
|
146
157
|
}
|
147
158
|
|
148
159
|
|
data/lib/charles/images.rb
CHANGED
data/lib/charles/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: charles
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 1
|
10
|
-
version: 0.0.1
|
9
|
+
- 2
|
10
|
+
version: 0.0.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jason Ling Xiaowei
|
@@ -115,6 +115,20 @@ dependencies:
|
|
115
115
|
version: "0"
|
116
116
|
type: :runtime
|
117
117
|
version_requirements: *id007
|
118
|
+
- !ruby/object:Gem::Dependency
|
119
|
+
name: shiner
|
120
|
+
prerelease: false
|
121
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
122
|
+
none: false
|
123
|
+
requirements:
|
124
|
+
- - ">="
|
125
|
+
- !ruby/object:Gem::Version
|
126
|
+
hash: 3
|
127
|
+
segments:
|
128
|
+
- 0
|
129
|
+
version: "0"
|
130
|
+
type: :runtime
|
131
|
+
version_requirements: *id008
|
118
132
|
description: Charles the Content Extractor
|
119
133
|
email:
|
120
134
|
- jason@jeyel.com
|