distillery 0.2.10 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -60,7 +60,7 @@ module Distillery
60
60
  end
61
61
 
62
62
  describe 'remove_unlikely_elements!' do
63
- %w[combx comment disqus foot header meta nav rss shoutbox sidebar sponsor].each do |klass|
63
+ %w[combx comment community disqus extra foot header remark rss shoutbox sidebar sponsor ad-break agegate pagination pager popup].each do |klass|
64
64
  it "removes any elements classed .#{klass}, as it is unlikely to be page content" do
65
65
  doc = document_of("<div class='#{klass}'>foo</div>", :remove_unlikely_elements!)
66
66
  doc.inner_html.should == html_of("")
@@ -117,33 +117,33 @@ module Distillery
117
117
 
118
118
  it 'gives one point per comma in the text of an element' do
119
119
  doc = document_of("<p>foo,bar,baz</p>", :score!)
120
- doc.scores['/html/body/p'].should == 4
120
+ doc.scores['/html/body/p'].should == 68.0
121
121
  end
122
122
 
123
123
  it 'gives one point per chunk of 100 characters, max of 3' do
124
124
  doc = document_of("<p>#{'f'*201}</p>", :score!)
125
- doc.scores['/html/body/p'].should == 4
125
+ doc.scores['/html/body/p'].should == 68.0
126
126
 
127
127
  doc = document_of("<p>#{'f'*1000}</p>", :score!)
128
- doc.scores['/html/body/p'].should == 5
128
+ doc.scores['/html/body/p'].should == 85.0
129
129
  end
130
130
 
131
131
  it 'adds its own points to its parent' do
132
132
  doc = document_of("<div><div>foo</div></div>", :score!)
133
- doc.scores['/html/body/div/div'].should == 2
134
- doc.scores['/html/body/div'].should == 2
133
+ doc.scores['/html/body/div/div'].should == 30.0
134
+ doc.scores['/html/body/div'].should == 34.0
135
135
  end
136
136
 
137
137
  it 'adds 1/2 its points to its grandparent' do
138
138
  doc = document_of("<div><div><div>foo</div></div></div>", :score!)
139
- doc.scores['/html/body/div/div/div'].should == 2
140
- doc.scores['/html/body/div/div'].should == 2
141
- doc.scores['/html/body/div'].should == 1
139
+ doc.scores['/html/body/div/div/div'].should == 26
140
+ doc.scores['/html/body/div/div'].should == 30.0
141
+ doc.scores['/html/body/div'].should == 17.0
142
142
  end
143
143
 
144
144
  it 'scales the final score by the inverse link density' do
145
145
  doc = document_of("<p>foobar<a>baz</a></p>", :score!)
146
- doc.scores['/html/body/p'].should == 1.3333333333333335
146
+ doc.scores['/html/body/p'].should == 22.666666666666668
147
147
  end
148
148
 
149
149
  end
@@ -182,11 +182,6 @@ module Distillery
182
182
  doc.search('.remove').should be_empty
183
183
  end
184
184
 
185
- it 'removes elements that have way more li elements and it is not a list' do
186
- doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
187
- doc.search('.remove').should be_empty
188
- end
189
-
190
185
  it 'removes elements that have more inputs than 1/3 the amount of p tags' do
191
186
  doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
192
187
  doc.search('.remove').should be_empty
@@ -277,10 +272,11 @@ module Distillery
277
272
  end
278
273
 
279
274
  it 'picks the outtermost element in the event of a tie' do
280
- doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
281
- doc.distill!.should =~ /included/
282
- doc.scores['/html/body/div/div'].should == 11
283
- doc.scores['/html/body/div'].should == 11
275
+ doc = document_of("<div><div class='is-included'>#{'f,'*10}</div><div class='also-included'>#{'f,'*10}</div></div>")
276
+ doc.distill!.should =~ /is-included/
277
+ doc.distill!.should =~ /also-included/
278
+ doc.scores['/html/body/div/div[1]'].should == 165.0
279
+ doc.scores['/html/body/div/div[2]'].should == 165.0
284
280
  end
285
281
 
286
282
  it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: distillery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.10
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-12-07 00:00:00.000000000Z
12
+ date: 2011-12-17 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
- requirement: &2161473880 !ruby/object:Gem::Requirement
16
+ requirement: &2160714940 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>'
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '1.0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *2161473880
24
+ version_requirements: *2160714940
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: slop
27
- requirement: &2161473380 !ruby/object:Gem::Requirement
27
+ requirement: &2160714440 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>'
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '1.0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *2161473380
35
+ version_requirements: *2160714440
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: rspec
38
- requirement: &2161472920 !ruby/object:Gem::Requirement
38
+ requirement: &2160713980 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>'
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '2.0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2161472920
46
+ version_requirements: *2160713980
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: guard
49
- requirement: &2161472540 !ruby/object:Gem::Requirement
49
+ requirement: &2160713600 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2161472540
57
+ version_requirements: *2160713600
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: guard-rspec
60
- requirement: &2161472080 !ruby/object:Gem::Requirement
60
+ requirement: &2160713140 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *2161472080
68
+ version_requirements: *2160713140
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: ruby-debug19
71
- requirement: &2161471660 !ruby/object:Gem::Requirement
71
+ requirement: &2160712720 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *2161471660
79
+ version_requirements: *2160712720
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: rb-fsevent
82
- requirement: &2161471240 !ruby/object:Gem::Requirement
82
+ requirement: &2160712300 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *2161471240
90
+ version_requirements: *2160712300
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: growl
93
- requirement: &2161470820 !ruby/object:Gem::Requirement
93
+ requirement: &2160711880 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,7 +98,7 @@ dependencies:
98
98
  version: '0'
99
99
  type: :development
100
100
  prerelease: false
101
- version_requirements: *2161470820
101
+ version_requirements: *2160711880
102
102
  description: Distillery extracts the "content" portion out of an HTML document. It
103
103
  applies heuristics based on element type, location, class/id name and other attributes
104
104
  to try and find the content part of the HTML document and return it.
@@ -136,10 +136,13 @@ files:
136
136
  - spec/fixtures/ginger_cookies.html
137
137
  - spec/fixtures/js_this_keyword.html
138
138
  - spec/fixtures/maple_cookies.html
139
+ - spec/fixtures/mothers_brisket.html
139
140
  - spec/fixtures/nyt_social_media.html
141
+ - spec/fixtures/oyako.html
140
142
  - spec/fixtures/pina_collada_cupcakes.html
141
143
  - spec/fixtures/pumpkin_scones.html
142
144
  - spec/fixtures/rhubarb.html
145
+ - spec/fixtures/swiss_chard_pie.html
143
146
  - spec/fixtures/tofu_bowl.html
144
147
  - spec/fixtures/vanilla_pound_cake.html
145
148
  - spec/lib/distillery/document_spec.rb
@@ -185,10 +188,13 @@ test_files:
185
188
  - spec/fixtures/ginger_cookies.html
186
189
  - spec/fixtures/js_this_keyword.html
187
190
  - spec/fixtures/maple_cookies.html
191
+ - spec/fixtures/mothers_brisket.html
188
192
  - spec/fixtures/nyt_social_media.html
193
+ - spec/fixtures/oyako.html
189
194
  - spec/fixtures/pina_collada_cupcakes.html
190
195
  - spec/fixtures/pumpkin_scones.html
191
196
  - spec/fixtures/rhubarb.html
197
+ - spec/fixtures/swiss_chard_pie.html
192
198
  - spec/fixtures/tofu_bowl.html
193
199
  - spec/fixtures/vanilla_pound_cake.html
194
200
  - spec/lib/distillery/document_spec.rb