charles 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -2,3 +2,5 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in charles.gemspec
4
4
  gemspec
5
+
6
+ gem 'shiner', :path => '/u/apps/shiner/dev'
@@ -14,6 +14,7 @@ end
14
14
  document = Charles.get(url)
15
15
  puts({
16
16
  :content => document.content,
17
+ :interesting_content => document.interesting_content,
17
18
  :title => document.title,
18
19
  :filtered_images => document.filtered_images.collect{|image| image[:url]}
19
20
  }.to_yaml)
@@ -22,4 +22,5 @@ Gem::Specification.new do |gem|
22
22
  gem.add_dependency "activesupport"
23
23
  gem.add_dependency "rack"
24
24
  gem.add_dependency "imagesize"
25
+ gem.add_dependency "shiner"
25
26
  end
@@ -9,6 +9,7 @@ require 'mechanize'
9
9
  require 'active_support/cache'
10
10
  require 'active_support/cache/file_store'
11
11
  require 'image_size'
12
+ require 'shiner'
12
13
 
13
14
  require 'ferret'
14
15
  Ferret.locale = "en_US.UTF-8" #if not set ferret segfaults on chinese/jap stuff randomly
@@ -17,6 +17,10 @@ module Charles
17
17
 
18
18
  def logger; Charles.logger; end
19
19
 
20
+ def interesting_content(options = {:max_length => 388})
21
+ Shiner.shine(content, options)
22
+ end
23
+
20
24
  def content(seeds={})
21
25
  content_node = content_node(seeds)
22
26
  return unless content_node
@@ -30,12 +34,12 @@ module Charles
30
34
  end
31
35
 
32
36
  def calculate_content_nodes(seeds={})
33
- default_seeds = {:title_match=>0.145422959269808,
34
- :title_match_buffer=>0.0174920023610796,
35
- :length=>1100.27450832379,
36
- :distance_from_top=>0.308408501217311,
37
- :internal_nodes=>25.680381972181,
38
- :internal_nodes_buffer=>20.2006169153009}
37
+ default_seeds = {:title_match=>0.0586074856962615, #0.238237272128463,0.173173520342878
38
+ :title_match_buffer=>0.508671373602233,
39
+ :length=>1246.27917099503,
40
+ :distance_from_top=>0.436005480844439,
41
+ :internal_nodes=>18.0265463704097,
42
+ :internal_nodes_buffer=>32.7588984705223}
39
43
  seeds = default_seeds.merge(seeds)
40
44
 
41
45
  o = []
@@ -49,7 +53,8 @@ module Charles
49
53
  :length => 1 - seeds[:length].to_f / (_n.clean_inner_tokens_text.size + seeds[:length]), #length of inner text in this node, too little = less
50
54
  :internal_nodes => seeds[:internal_nodes].to_f / (_n.internal_nodes_size + seeds[:internal_nodes] + seeds[:internal_nodes_buffer]), #number of nodes in this node, too many = less
51
55
  :distance_from_top => (1-(_rank.to_f / @nodes.size))**seeds[:distance_from_top].to_f, #how far this element is from the top of the page
52
- :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1+ + seeds[:title_match_buffer])**seeds[:title_match].to_f #ferret index score, search score with page title
56
+ :title_match => ((content_node_ferret_index[_i]||0.0 + seeds[:title_match_buffer]) / 1 + seeds[:title_match_buffer])**seeds[:title_match].to_f, #ferret index score, search score with page title
57
+ #:interesting => (0.5 + _n.interesting_score) ** seeds[:interesting].to_f
53
58
  #:special_characters => (1 - (_n.inner_text.scan(/[^\s\302\240a-zA-Z]/).size.to_f / (_n.clean_inner_text.size+1)))**2 #number of special characters and numbers.. this is pretty cpu intensive!
54
59
  }
55
60
  o << {:node =>_n, :score => scores.values.inject(:*), :scores => scores}
@@ -143,6 +148,12 @@ Nokogiri::XML::Node.class_eval {
143
148
  def internal_nodes_size
144
149
  @internal_nodes_size ||= search('*').size
145
150
  end
151
+ #def interesting_score
152
+ # @interesting_score ||= (
153
+ # classifications = Shiner.classifier.classifications(clean_inner_text)
154
+ # 1 - classifications['Interesting'] / classifications['Uninteresting']
155
+ # )
156
+ #end
146
157
  }
147
158
 
148
159
 
@@ -11,7 +11,7 @@ module Charles
11
11
  return unless _node
12
12
  #logger.info _node.pretty_inspect
13
13
 
14
- (_node.ancestors.size/2).times do
14
+ (_node.ancestors.size.to_f/2).round.times do
15
15
  o=self.calculate_image_from_node(_node)
16
16
  #logger.info o.pretty_inspect
17
17
  return o if o
@@ -1,3 +1,3 @@
1
1
  module Charles
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: charles
3
3
  version: !ruby/object:Gem::Version
4
- hash: 29
4
+ hash: 27
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 1
10
- version: 0.0.1
9
+ - 2
10
+ version: 0.0.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jason Ling Xiaowei
@@ -115,6 +115,20 @@ dependencies:
115
115
  version: "0"
116
116
  type: :runtime
117
117
  version_requirements: *id007
118
+ - !ruby/object:Gem::Dependency
119
+ name: shiner
120
+ prerelease: false
121
+ requirement: &id008 !ruby/object:Gem::Requirement
122
+ none: false
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ hash: 3
127
+ segments:
128
+ - 0
129
+ version: "0"
130
+ type: :runtime
131
+ version_requirements: *id008
118
132
  description: Charles the Content Extractor
119
133
  email:
120
134
  - jason@jeyel.com