link_thumbnailer 2.6.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: aa4bcf1a567d87a3fd739645cc72dea9c04badb1
4
- data.tar.gz: b284daca927931ce058f024ec5425c0d3c2aadc2
3
+ metadata.gz: a6f72611485eeb14494ca2ecc6dc7c4a2c1d7723
4
+ data.tar.gz: 8968fef4fb3d981a568a6cd94b0de53cedd8833b
5
5
  SHA512:
6
- metadata.gz: d5c617733e8aca11eef6604eaab3d077077bc7dfaa8f533dcb6297ebb466a9d7fa6dfa29b73d04711d818cb6cde4c1d7f813f4350699402283c5836f9ec5a05d
7
- data.tar.gz: 841dd6b8ca5dc185a0e77b8931c6a5cfc9fe74340f35d528f80d7dbd1af3ede76933c6873d024c3b213361c5d94136469b5a6001f27b13e9c79917100f1f6f39
6
+ metadata.gz: 1aadd1793f33cec74721d013954627a552693d2a50fd1d1a7e630194b6d19fbe945c549797a9edb23bb2d1254c0b3dc77d6f737f41c32a4aa3c7bda3fea3f796
7
+ data.tar.gz: 65631f74a0d44eee946ce64cd3e4f55a1c0fbb55756f17855990c98efc5d161cf543dc34dc7f384414df1d78f469f1f39b8b1f30f15ad8c6b2a6ea07c2413003
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ # 3.0.0
2
+
3
+ - Improved description sorting.
4
+ - Refactored how graders work. More information [here](https://github.com/gottfrois/link_thumbnailer/wiki/How-to-build-your-own-Grader%3F)
5
+
1
6
  # 2.6.1
2
7
 
3
8
  - Fix remove useless dependency
@@ -68,4 +68,9 @@ LinkThumbnailer.configure do |config|
68
68
  # does not have to fetch its size and type.
69
69
  #
70
70
  # config.image_stats = true
71
+ #
72
+ # Whether you want LinkThumbnailer to raise an exception if the Content-Type of the HTTP response
73
+ # is not HTML or XML.
74
+ #
75
+ # config.raise_on_invalid_format = false
71
76
  end
@@ -51,9 +51,9 @@ module LinkThumbnailer
51
51
  ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :class) },
52
52
  ->(description) { ::LinkThumbnailer::Graders::HtmlAttribute.new(description, :id) },
53
53
  ->(description) { ::LinkThumbnailer::Graders::Position.new(description) },
54
- ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) }
54
+ ->(description) { ::LinkThumbnailer::Graders::LinkDensity.new(description) },
55
55
  ]
56
- @description_min_length = 25
56
+ @description_min_length = 50
57
57
  @positive_regex = /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i
58
58
  @negative_regex = /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|modal/i
59
59
  @image_limit = 5
@@ -3,6 +3,7 @@ require 'link_thumbnailer/graders/base'
3
3
  require 'link_thumbnailer/graders/length'
4
4
  require 'link_thumbnailer/graders/html_attribute'
5
5
  require 'link_thumbnailer/graders/link_density'
6
+ require 'link_thumbnailer/graders/punctuation_density'
6
7
  require 'link_thumbnailer/graders/position'
7
8
 
8
9
  module LinkThumbnailer
@@ -17,14 +18,18 @@ module LinkThumbnailer
17
18
  super(config)
18
19
  end
19
20
 
21
+ # For the given description, combines the probabilities returned by each grader by multiplying them together.
22
+ #
23
+ # @return [Float] the probability for the given description to be considered good
20
24
  def call
21
- score = 0
25
+ probability = 1.0
26
+
22
27
  graders.each do |lambda|
23
28
  instance = lambda.call(description)
24
- score += instance.call(score)
29
+ probability *= instance.call.to_f
25
30
  end
26
31
 
27
- score
32
+ probability
28
33
  end
29
34
 
30
35
  private
@@ -13,7 +13,7 @@ module LinkThumbnailer
13
13
  super(config)
14
14
  end
15
15
 
16
- def call(current_score)
16
+ def call
17
17
  fail NotImplementedError
18
18
  end
19
19
 
@@ -24,7 +24,7 @@ module LinkThumbnailer
24
24
  end
25
25
 
26
26
  def text
27
- node.text
27
+ description.text
28
28
  end
29
29
 
30
30
  end
@@ -9,13 +9,10 @@ module LinkThumbnailer
9
9
  @attribute_name = attribute_name.to_sym
10
10
  end
11
11
 
12
- def call(current_score)
13
- return 0 unless attribute?
14
-
15
- score = 0
16
- score -= 25 if negative?
17
- score += 25 if positive?
18
- score
12
+ def call
13
+ return 1.0 if positive?
14
+ return 0.0 if negative?
15
+ 1.0
19
16
  end
20
17
 
21
18
  private
@@ -29,11 +26,11 @@ module LinkThumbnailer
29
26
  end
30
27
 
31
28
  def negative?
32
- attribute =~ negative_regex
29
+ attribute? && attribute =~ negative_regex
33
30
  end
34
31
 
35
32
  def positive?
36
- attribute =~ positive_regex
33
+ attribute? && attribute =~ positive_regex
37
34
  end
38
35
 
39
36
  def negative_regex
@@ -2,14 +2,34 @@ module LinkThumbnailer
2
2
  module Graders
3
3
  class Length < ::LinkThumbnailer::Graders::Base
4
4
 
5
- def call(current_score)
6
- return -Float::INFINITY if too_short?
5
+ def call
6
+ return 0.0 if too_short?
7
7
 
8
- [(text.length / 100).to_i, 3].min
8
+ y / ideal_description_gaussian_value
9
9
  end
10
10
 
11
11
  private
12
12
 
13
+ def get_gaussian_value_for(x)
14
+ Math.sqrt(2.0 * Math::PI ** 2) * Math.exp(-(x - ideal_description_length) ** 2 / 2.0 * 0.005 ** 2)
15
+ end
16
+
17
+ def x
18
+ text.length
19
+ end
20
+
21
+ def y
22
+ get_gaussian_value_for(x)
23
+ end
24
+
25
+ def ideal_description_length
26
+ 120.0
27
+ end
28
+
29
+ def ideal_description_gaussian_value
30
+ 4.442882938158366
31
+ end
32
+
13
33
  def too_short?
14
34
  text.length < config.description_min_length
15
35
  end
@@ -2,22 +2,13 @@ module LinkThumbnailer
2
2
  module Graders
3
3
  class LinkDensity < ::LinkThumbnailer::Graders::Base
4
4
 
5
- def call(current_score)
6
- return 0 if density_ratio == 0
7
- current_score *= density_ratio
5
+ def call
6
+ return 0.0 if text.length == 0
7
+ 1.0 - (links.count.to_f / text.length.to_f)
8
8
  end
9
9
 
10
10
  private
11
11
 
12
- def density
13
- return 0 if text.length == 0
14
- links.length / text.length.to_f
15
- end
16
-
17
- def density_ratio
18
- 1 - density
19
- end
20
-
21
12
  def links
22
13
  node.css('a').map(&:text).compact.reject(&:empty?)
23
14
  end
@@ -2,8 +2,8 @@ module LinkThumbnailer
2
2
  module Graders
3
3
  class Position < ::LinkThumbnailer::Graders::Base
4
4
 
5
- def call(current_score)
6
- 2.0 / description.position
5
+ def call
6
+ 1.0 - (description.position.to_f / description.candidates_number.to_f)
7
7
  end
8
8
 
9
9
  end
@@ -5,14 +5,15 @@ module LinkThumbnailer
5
5
  module Models
6
6
  class Description < ::LinkThumbnailer::Model
7
7
 
8
- attr_reader :node, :text, :position
9
- attr_accessor :score
10
-
11
- def initialize(node, text, position = 1)
12
- @node = node
13
- @text = sanitize(text)
14
- @position = position
15
- @score = compute_score
8
+ attr_reader :node, :text, :position, :candidates_number
9
+ attr_accessor :probability
10
+
11
+ def initialize(node, text, position = 1, candidates_number = 1)
12
+ @node = node
13
+ @text = sanitize(text)
14
+ @position = position
15
+ @candidates_number = candidates_number
16
+ @probability = compute_probability
16
17
  end
17
18
 
18
19
  def to_s
@@ -20,12 +21,12 @@ module LinkThumbnailer
20
21
  end
21
22
 
22
23
  def <=>(other)
23
- score <=> other.score
24
+ probability <=> other.probability
24
25
  end
25
26
 
26
27
  private
27
28
 
28
- def compute_score
29
+ def compute_probability
29
30
  ::LinkThumbnailer::Grader.new(self).call
30
31
  end
31
32
 
@@ -18,7 +18,7 @@ module LinkThumbnailer
18
18
  end
19
19
 
20
20
  def model_from_body
21
- nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i + 1) }.sort.last
21
+ nodes_from_body.each_with_index.map { |node, i| modelize(node, node.text, i) }.sort.last
22
22
  end
23
23
 
24
24
  def node_from_meta
@@ -37,8 +37,8 @@ module LinkThumbnailer
37
37
  document.css('p,td')
38
38
  end
39
39
 
40
- def modelize(node, text, i = 1)
41
- model_class.new(node, text, i)
40
+ def modelize(node, text, i = 0)
41
+ model_class.new(node, text, i, nodes_from_body.count)
42
42
  end
43
43
 
44
44
  end
@@ -1,3 +1,3 @@
1
1
  module LinkThumbnailer
2
- VERSION = '2.6.1'
2
+ VERSION = '3.0.0'
3
3
  end
@@ -12,7 +12,7 @@ describe LinkThumbnailer::Configuration do
12
12
  it { expect(instance.blacklist_urls).to_not be_empty }
13
13
  it { expect(instance.attributes).to eq([:title, :images, :description, :videos, :favicon]) }
14
14
  it { expect(instance.graders).to_not be_empty }
15
- it { expect(instance.description_min_length).to eq(25) }
15
+ it { expect(instance.description_min_length).to eq(50) }
16
16
  it { expect(instance.positive_regex).to_not be_nil }
17
17
  it { expect(instance.negative_regex).to_not be_nil }
18
18
  it { expect(instance.image_limit).to eq(5) }
data/spec/grader_spec.rb CHANGED
@@ -7,8 +7,8 @@ describe LinkThumbnailer::Grader do
7
7
 
8
8
  describe '#call' do
9
9
 
10
- let(:score) { 10 }
11
- let(:grader) { double('grader', call: score) }
10
+ let(:probability) { 0.5 }
11
+ let(:grader) { double('grader', call: probability) }
12
12
  let(:lambda) { ->(_) { grader } }
13
13
  let(:graders) { [lambda, lambda] }
14
14
  let(:action) { instance.call }
@@ -17,7 +17,7 @@ describe LinkThumbnailer::Grader do
17
17
  instance.stub(:graders).and_return(graders)
18
18
  end
19
19
 
20
- it { expect(action).to eq(score * graders.size) }
20
+ it { expect(action).to eq(0.5 * 0.5) }
21
21
 
22
22
  end
23
23
 
@@ -2,11 +2,11 @@ require 'spec_helper'
2
2
 
3
3
  describe LinkThumbnailer::Graders::Base do
4
4
 
5
- let(:description) { double('description', node: node) }
5
+ let(:description) { double('description', text: 'foo', node: node) }
6
6
  let(:node) { double('node', text: 'foo') }
7
7
  let(:instance) { described_class.new(description) }
8
8
 
9
9
  it { expect(instance.send(:node)).to eq(description.node) }
10
- it { expect(instance.send(:text)).to eq(description.node.text) }
10
+ it { expect(instance.send(:text)).to eq(description.text) }
11
11
 
12
12
  end
@@ -7,7 +7,7 @@ describe LinkThumbnailer::Graders::HtmlAttribute do
7
7
 
8
8
  describe '#call' do
9
9
 
10
- let(:action) { instance.call(0) }
10
+ let(:action) { instance.call }
11
11
 
12
12
  context 'when current node does not match attribute' do
13
13
 
@@ -15,7 +15,7 @@ describe LinkThumbnailer::Graders::HtmlAttribute do
15
15
  instance.stub(:attribute?).and_return(false)
16
16
  end
17
17
 
18
- it { expect(action).to eq(0) }
18
+ it { expect(action).to eq(1.0) }
19
19
 
20
20
  end
21
21
 
@@ -27,7 +27,7 @@ describe LinkThumbnailer::Graders::HtmlAttribute do
27
27
  instance.stub(:positive?).and_return(false)
28
28
  end
29
29
 
30
- it { expect(action).to eq(-25) }
30
+ it { expect(action).to eq(0.0) }
31
31
 
32
32
  end
33
33
 
@@ -39,7 +39,7 @@ describe LinkThumbnailer::Graders::HtmlAttribute do
39
39
  instance.stub(:positive?).and_return(true)
40
40
  end
41
41
 
42
- it { expect(action).to eq(25) }
42
+ it { expect(action).to eq(1.0) }
43
43
 
44
44
  end
45
45
 
@@ -12,7 +12,7 @@ describe LinkThumbnailer::Graders::Length do
12
12
 
13
13
  describe '#call' do
14
14
 
15
- let(:action) { instance.call(0) }
15
+ let(:action) { instance.call }
16
16
 
17
17
  context 'when text is too short' do
18
18
 
@@ -20,7 +20,7 @@ describe LinkThumbnailer::Graders::Length do
20
20
  instance.stub(:too_short?).and_return(true)
21
21
  end
22
22
 
23
- it { expect(action).to eq(-Float::INFINITY) }
23
+ it { expect(action).to eq(0.0) }
24
24
 
25
25
  end
26
26
 
@@ -31,19 +31,27 @@ describe LinkThumbnailer::Graders::Length do
31
31
  instance.stub(:text).and_return(text)
32
32
  end
33
33
 
34
- context 'when text length is greater than 400' do
34
+ context 'when text length is 80' do
35
35
 
36
- let(:text) { 'f' * 400 }
36
+ let(:text) { 'f' * 80 }
37
37
 
38
- it { expect(action).to eq(3) }
38
+ it { expect(action).to eq(1.0) }
39
39
 
40
40
  end
41
41
 
42
- context 'when text length is less than 300' do
42
+ context 'when text length is 100' do
43
43
 
44
- let(:text) { 'f' * 299 }
44
+ let(:text) { 'f' * 100 }
45
45
 
46
- it { expect(action).to eq(2) }
46
+ it { expect(action).to be < 1.0 }
47
+
48
+ end
49
+
50
+ context 'when text length is 60' do
51
+
52
+ let(:text) { 'f' * 60 }
53
+
54
+ it { expect(action).to be < 1.0 }
47
55
 
48
56
  end
49
57
 
@@ -7,54 +7,41 @@ describe LinkThumbnailer::Graders::LinkDensity do
7
7
 
8
8
  describe '#call' do
9
9
 
10
- let(:previous_score) { 1 }
11
- let(:action) { instance.call(previous_score) }
12
-
13
- context 'when density_ratio is 0' do
14
-
15
- before do
16
- instance.stub(:density_ratio).and_return(0)
17
- end
18
-
19
- it { expect(action).to eq(0) }
10
+ let(:action) { instance.call }
20
11
 
12
+ before do
13
+ instance.stub(:text).and_return(text)
14
+ instance.stub(:links).and_return(links)
21
15
  end
22
16
 
23
- context 'when density is not 0' do
17
+ context 'when text length is 0' do
24
18
 
25
- before do
26
- instance.stub(:density_ratio).and_return(10)
27
- end
19
+ let(:text) { '' }
20
+ let(:links) { [] }
28
21
 
29
- it { expect(action).to eq(10) }
22
+ it { expect(action).to eq(0.0) }
30
23
 
31
24
  end
32
25
 
33
- end
26
+ context 'when text length is > 0' do
34
27
 
35
- describe '#density' do
28
+ let(:text) { 'foo' }
36
29
 
37
- let(:links) { ['foo'] }
38
- let(:action) { instance.send(:density) }
30
+ context 'and links is 0' do
39
31
 
40
- before do
41
- instance.stub(:links).and_return(links)
42
- instance.stub(:text).and_return(text)
43
- end
32
+ let(:links) { [] }
44
33
 
45
- context 'with text' do
34
+ it { expect(action).to eq(1.0) }
46
35
 
47
- let(:text) { 'abcd' }
48
-
49
- it { expect(action).to eq(0.25) }
36
+ end
50
37
 
51
- end
38
+ context 'and links is > 0' do
52
39
 
53
- context 'without text' do
40
+ let(:links) { [1] }
54
41
 
55
- let(:text) { '' }
42
+ it { expect(action).to be_within(0.001).of(0.666) }
56
43
 
57
- it { expect(action).to eq(0) }
44
+ end
58
45
 
59
46
  end
60
47
 
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
+ describe LinkThumbnailer::Graders::Position do
4
+
5
+ let(:description) { double('description') }
6
+ let(:instance) { described_class.new(description) }
7
+
8
+ describe '#call' do
9
+
10
+ let(:action) { instance.call }
11
+
12
+ context 'when position is 0' do
13
+
14
+ before do
15
+ description.stub(:position).and_return(0)
16
+ description.stub(:candidates_number).and_return(1)
17
+ end
18
+
19
+ it { expect(action).to eq(1.0) }
20
+
21
+ end
22
+
23
+ context 'when position is 1 over 1 candidates' do
24
+
25
+ before do
26
+ description.stub(:position).and_return(1)
27
+ description.stub(:candidates_number).and_return(1)
28
+ end
29
+
30
+ it { expect(action).to eq(0.0) }
31
+
32
+ end
33
+
34
+ context 'when position is 1 over more than 1 candidates' do
35
+
36
+ before do
37
+ description.stub(:position).and_return(1)
38
+ description.stub(:candidates_number).and_return(2)
39
+ end
40
+
41
+ it { expect(action).to eq(0.5) }
42
+
43
+ end
44
+
45
+ end
46
+
47
+ end
@@ -0,0 +1,50 @@
1
+ require 'spec_helper'
2
+
3
+ describe LinkThumbnailer::Graders::PunctuationDensity do
4
+
5
+ let(:description) { double('description') }
6
+ let(:instance) { described_class.new(description) }
7
+
8
+ describe '#call' do
9
+
10
+ let(:action) { instance.call }
11
+
12
+ before do
13
+ instance.stub(:text).and_return(text)
14
+ instance.stub(:punctuations).and_return(punctuations)
15
+ end
16
+
17
+ context 'when text length is 0' do
18
+
19
+ let(:text) { '' }
20
+ let(:punctuations) { [] }
21
+
22
+ it { expect(action).to eq(0.0) }
23
+
24
+ end
25
+
26
+ context 'when text length is > 0' do
27
+
28
+ let(:text) { 'foo' }
29
+
30
+ context 'and punctuations is 0' do
31
+
32
+ let(:punctuations) { [] }
33
+
34
+ it { expect(action).to eq(1.0) }
35
+
36
+ end
37
+
38
+ context 'and punctuations is > 0' do
39
+
40
+ let(:punctuations) { [1] }
41
+
42
+ it { expect(action).to be_within(0.001).of(0.666) }
43
+
44
+ end
45
+
46
+ end
47
+
48
+ end
49
+
50
+ end
@@ -22,37 +22,37 @@ describe LinkThumbnailer::Models::Description do
22
22
  describe '#<=>' do
23
23
 
24
24
  let(:another_instance) { described_class.new(node, text) }
25
- let(:score) { 5 }
25
+ let(:probability) { 0.5 }
26
26
  let(:action) { instance <=> another_instance }
27
27
 
28
28
  before do
29
- another_instance.score = score
29
+ another_instance.probability = probability
30
30
  end
31
31
 
32
- context 'when instance score is lower' do
32
+ context 'when instance probability is lower' do
33
33
 
34
34
  before do
35
- instance.score = score - 1
35
+ instance.probability = probability - 0.5
36
36
  end
37
37
 
38
38
  it { expect(action).to eq(-1) }
39
39
 
40
40
  end
41
41
 
42
- context 'when instance score is equal' do
42
+ context 'when instance probability is equal' do
43
43
 
44
44
  before do
45
- instance.score = score
45
+ instance.probability = probability
46
46
  end
47
47
 
48
48
  it { expect(action).to eq(0) }
49
49
 
50
50
  end
51
51
 
52
- context 'when instance score is greater' do
52
+ context 'when instance probability is greater' do
53
53
 
54
54
  before do
55
- instance.score = score + 1
55
+ instance.probability = probability + 0.5
56
56
  end
57
57
 
58
58
  it { expect(action).to eq(1) }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: link_thumbnailer
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Pierre-Louis Gottfrois
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-22 00:00:00.000000000 Z
11
+ date: 2015-06-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -193,6 +193,8 @@ files:
193
193
  - spec/graders/html_attribute_spec.rb
194
194
  - spec/graders/length_spec.rb
195
195
  - spec/graders/link_density_spec.rb
196
+ - spec/graders/position_spec.rb
197
+ - spec/graders/punctuation_density_spec.rb
196
198
  - spec/image_comparators/size_spec.rb
197
199
  - spec/image_parsers/size_spec.rb
198
200
  - spec/image_parsers/type_spec.rb
@@ -251,6 +253,8 @@ test_files:
251
253
  - spec/graders/html_attribute_spec.rb
252
254
  - spec/graders/length_spec.rb
253
255
  - spec/graders/link_density_spec.rb
256
+ - spec/graders/position_spec.rb
257
+ - spec/graders/punctuation_density_spec.rb
254
258
  - spec/image_comparators/size_spec.rb
255
259
  - spec/image_parsers/size_spec.rb
256
260
  - spec/image_parsers/type_spec.rb