charles 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in charles.gemspec
4
4
  gemspec
5
+
6
+ gem 'shiner', :path => '/u/apps/shiner/dev'
@@ -14,6 +14,7 @@ end
14
14
  document = Charles.get(url)
15
15
  puts({
16
16
  :content => document.content,
17
+ :interesting_content => document.interesting_content,
17
18
  :title => document.title,
18
19
  :filtered_images => document.filtered_images.collect{|image| image[:url]}
19
20
  }.to_yaml)
@@ -22,4 +22,5 @@ Gem::Specification.new do |gem|
22
22
  gem.add_dependency "activesupport"
23
23
  gem.add_dependency "rack"
24
24
  gem.add_dependency "imagesize"
25
+ gem.add_dependency "shiner"
25
26
  end
@@ -9,6 +9,7 @@ require 'mechanize'
9
9
  require 'active_support/cache'
10
10
  require 'active_support/cache/file_store'
11
11
  require 'image_size'
12
+ require 'shiner'
12
13
 
13
14
  require 'ferret'
14
15
  Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
@@ -17,6 +17,10 @@ module Charles
17
17
 
18
18
  def logger; Charles.logger; end
19
19
 
20
+ def interesting_content(options = {:max_length => 388})
21
+ Shiner.shine(content, options)
22
+ end
23
+
20
24
  def content(seeds={})
21
25
  content_node = content_node(seeds)
22
26
  return unless content_node
@@ -30,12 +34,12 @@ module Charles
30
34
  end
31
35
 
32
36
  def calculate_content_nodes(seeds={})
33
- default_seeds = {:title_match=>0.145422959269808,
34
- :title_match_buffer=>0.0174920023610796,
35
- :length=>1100.27450832379,
36
- :distance_from_top=>0.308408501217311,
37
- :internal_nodes=>25.680381972181,
38
- :internal_nodes_buffer=>20.2006169153009}
37
+ default_seeds = {:title_match=>0.0586074856962615, #0.238237272128463,0.173173520342878
38
+ :title_match_buffer=>0.508671373602233,
39
+ :length=>1246.27917099503,
40
+ :distance_from_top=>0.436005480844439,
41
+ :internal_nodes=>18.0265463704097,
42
+ :internal_nodes_buffer=>32.7588984705223}
39
43
  seeds = default_seeds.merge(seeds)
40
44
 
41
45
  o = []
@@ -49,7 +53,8 @@ module Charles
49
53
  :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
50
54
  :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
51
55
  :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
52
- :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1+ + seeds[:title_match_buffer])**seeds[:title_match].to_f #ferret index score, search score with page title
56
+ :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1 + seeds[:title_match_buffer])**seeds[:title_match].to_f, #ferret index score, search score with page title
57
+ #:interesting => (0.5 + _n.interesting_score) ** seeds[:interesting].to_f
53
58
  #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
54
59
  }
55
60
  o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
@@ -143,6 +148,12 @@ Nokogiri::XML::Node.class_eval {
143
148
  def internal_nodes_size
144
149
  @internal_nodes_size ||= search('*').size
145
150
  end
151
+ #def interesting_score
152
+ # @interesting_score ||= (
153
+ # classifications = Shiner.classifier.classifications(clean_inner_text)
154
+ # 1 - classifications['Interesting'] / classifications['Uninteresting']
155
+ # )
156
+ #end
146
157
  }
147
158
 
148
159
 
@@ -11,7 +11,7 @@ module Charles
11
11
  return unless _node
12
12
  #logger.info _node.pretty_inspect
13
13
 
14
- (_node.ancestors.size/2).times do
14
+ (_node.ancestors.size.to_f/2).round.times do
15
15
  o=self.calculate_image_from_node(_node)
16
16
  #logger.info o.pretty_inspect
17
17
  return o if o
@@ -1,3 +1,3 @@
1
1
  module Charles
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: charles
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jason Ling Xiaowei
@@ -115,6 +115,20 @@ dependencies:
115
115
  version: "0"
116
116
  type: :runtime
117
117
  version_requirements: *id007
118
+ - !ruby/object:Gem::Dependency
119
+ name: shiner
120
+ prerelease: false
121
+ requirement: &id008 !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ hash: 3
127
+ segments:
128
+ - 0
129
+ version: "0"
130
+ type: :runtime
131
+ version_requirements: *id008
118
132
  description: Charles the Content Extractor
119
133
  email:
120
134
  - jason@jeyel.com