distillery 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -74,21 +74,26 @@ module Distillery
74
74
 
75
75
  end
76
76
 
77
- describe 'coerce_elements_to_paragraphs!' do
77
+ describe 'mark_scorable_elements!' do
78
78
 
79
- it 'converts divs who have no children to paragraphs' do
80
- doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
81
- doc.inner_html.should == html_of("<p>foo</p>")
79
+ it 'marks divs who have no children to paragraphs' do
80
+ doc = document_of("<div>foo</div>", :mark_scorable_elements!)
81
+ doc.inner_html.should == html_of('<div data-distillery="scorable">foo</div>')
82
82
  end
83
83
 
84
- it 'converts divs who have children that are not block-level elements to paragraphs' do
85
- doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
86
- doc.inner_html.should == html_of("<p><span>foo</span></p>")
84
+ it 'marks divs who have children that are not block-level elements to paragraphs' do
85
+ doc = document_of("<div><span>foo</span></div>", :mark_scorable_elements!)
86
+ doc.inner_html.should == html_of('<div data-distillery="scorable"><span>foo</span></div>')
87
87
  end
88
88
 
89
- it 'converts divs whose have empty child divs to paragrahs' do
90
- doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
91
- doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
89
+ it 'marks divs whose have empty child divs to paragrahs' do
90
+ doc = document_of("<div><pre>foo</pre><div></div></div>", :mark_scorable_elements!)
91
+ doc.inner_html.gsub("\n", "").should == html_of('<div data-distillery="scorable"><pre>foo</pre><div data-distillery="scorable"></div></div>')
92
+ end
93
+
94
+ it 'marks all paragraphs' do
95
+ doc = document_of("<p>foo</p><p></p></p>", :mark_scorable_elements!)
96
+ doc.inner_html.gsub("\n", "").should == html_of('<p data-distillery="scorable">foo</p><p data-distillery="scorable"></p>')
92
97
  end
93
98
 
94
99
  end
@@ -102,10 +107,12 @@ module Distillery
102
107
  subject.scores.should_not be_empty
103
108
  end
104
109
 
105
- it 'only calculates scores for paragraphs' do
106
- doc = document_of("<p>foo</p><div>bar</div>", :score!)
110
+ it 'calculates scores for divs that are empty or have no block level children' do
111
+ doc = document_of("<div><div><div><div>bar</div></div></div></div>", :score!)
112
+ doc.scores.should have_key('/html/body/div/div/div/div')
113
+ doc.scores.should have_key('/html/body/div/div/div')
114
+ doc.scores.should have_key('/html/body/div/div')
107
115
  doc.scores.should_not have_key('/html/body/div')
108
- doc.scores.should have_key('/html/body/p')
109
116
  end
110
117
 
111
118
  it 'gives one point per comma in the text of an element' do
@@ -122,14 +129,14 @@ module Distillery
122
129
  end
123
130
 
124
131
  it 'adds its own points to its parent' do
125
- doc = document_of("<p><div><p>foo</p></div></p>", :score!)
126
- doc.scores['/html/body/div/p'].should == 2
132
+ doc = document_of("<div><div>foo</div></div>", :score!)
133
+ doc.scores['/html/body/div/div'].should == 2
127
134
  doc.scores['/html/body/div'].should == 2
128
135
  end
129
136
 
130
137
  it 'adds 1/2 its points to its grandparent' do
131
- doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
132
- doc.scores['/html/body/div/div/p'].should == 2
138
+ doc = document_of("<div><div><div>foo</div></div></div>", :score!)
139
+ doc.scores['/html/body/div/div/div'].should == 2
133
140
  doc.scores['/html/body/div/div'].should == 2
134
141
  doc.scores['/html/body/div'].should == 1
135
142
  end
@@ -148,65 +155,65 @@ module Distillery
148
155
  end
149
156
 
150
157
  it 'removes all empty elements' do
151
- doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_element!)
158
+ doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_elements!)
152
159
  doc.search('span').should be_empty
153
160
  end
154
161
 
155
162
  it 'does not remove <br> elements' do
156
- doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
163
+ doc = doc_with_top_scored_html_of("<div>foo,foo,foo<br class='noremove' /></div>", :clean_top_scoring_elements!)
157
164
  doc.search('.noremove').should_not be_empty
158
165
  end
159
166
 
160
167
  %w[iframe form object].each do |tag|
161
168
  it "removes any #{tag} elements" do
162
- doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
169
+ doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_elements!)
163
170
  doc.search(tag).should be_empty
164
171
  end
165
172
  end
166
173
 
167
174
  it 'removes elements that have negative scores' do
168
- doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
175
+ doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_elements!)
169
176
  doc.search('.widget').should be_empty
170
177
  end
171
178
 
172
179
  it 'removes elements that have more images than p tags' do
173
- doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
180
+ doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_elements!)
174
181
  doc.search('.remove').should be_empty
175
182
  end
176
183
 
177
184
  it 'removes elements that have way more li elements and it is not a list' do
178
- doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
185
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
179
186
  doc.search('.remove').should be_empty
180
187
  end
181
188
 
182
189
  it 'removes elements that have more inputs than 1/3 the amount of p tags' do
183
- doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
190
+ doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
184
191
  doc.search('.remove').should be_empty
185
192
 
186
- doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
193
+ doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_elements!)
187
194
  doc.search('.remove').should_not be_empty
188
195
  end
189
196
 
190
197
  it 'removes elements that have < 25 characters and (no images or > 2 images' do
191
- doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
198
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_elements!)
192
199
  doc.search('.remove').should be_empty
193
200
 
194
- doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
201
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_elements!)
195
202
  doc.search('.remove').should be_empty
196
203
  end
197
204
 
198
205
  it 'removes elements that have a weight of < 25 and link density > 0.2' do
199
- doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
206
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_elements!)
200
207
  doc.search('.remove').should be_empty
201
208
  end
202
209
 
203
210
  it 'removes elements that have a weight of >= 25 and link density > 0.5' do
204
- doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
211
+ doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_elements!)
205
212
  doc.search('.remove').should be_empty
206
213
  end
207
214
 
208
215
  it 'should not clean the conntent elements not of table ul or div' do
209
- doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
216
+ doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_elements!)
210
217
  doc.search('.remove').should_not be_empty
211
218
  end
212
219
 
@@ -253,6 +260,23 @@ module Distillery
253
260
  document_of('foo').distill!.should == 'foo'
254
261
  end
255
262
 
263
+ it 'does not return any elements with a data-distillery attribute' do
264
+ html = document_of('<div><p>Hello</p></div>')
265
+ document_of(html).distill!.should_not =~ /data-distillery/
266
+ end
267
+
268
+ it 'picks the outtermost element in the event of a tie' do
269
+ doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
270
+ doc.distill!.should =~ /included/
271
+ doc.scores['/html/body/div/div'].should == 11
272
+ doc.scores['/html/body/div'].should == 11
273
+ end
274
+
275
+ it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
276
+ doc = document_of("<div><div>#{'f,'*10}</div></div><div><div class='me_too'>#{'f,'*5}</div></div>")
277
+ doc.distill!.should =~ /me_too/
278
+ end
279
+
256
280
  end
257
281
 
258
282
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: distillery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,12 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-04-30 00:00:00.000000000 -07:00
12
+ date: 2011-05-08 00:00:00.000000000 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
17
- requirement: &2161079840 !ruby/object:Gem::Requirement
17
+ requirement: &2168823320 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ! '>'
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: '1.0'
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *2161079840
25
+ version_requirements: *2168823320
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: slop
28
- requirement: &2161079300 !ruby/object:Gem::Requirement
28
+ requirement: &2168822640 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ! '>'
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *2161079300
36
+ version_requirements: *2168822640
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: rspec
39
- requirement: &2161078840 !ruby/object:Gem::Requirement
39
+ requirement: &2168821740 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ! '>'
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: '2.0'
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *2161078840
47
+ version_requirements: *2168821740
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: guard
50
- requirement: &2161078340 !ruby/object:Gem::Requirement
50
+ requirement: &2168821200 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ! '>='
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: '0'
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *2161078340
58
+ version_requirements: *2168821200
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: guard-rspec
61
- requirement: &2161077760 !ruby/object:Gem::Requirement
61
+ requirement: &2168820180 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ! '>='
@@ -66,10 +66,10 @@ dependencies:
66
66
  version: '0'
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *2161077760
69
+ version_requirements: *2168820180
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: ruby-debug19
72
- requirement: &2161077220 !ruby/object:Gem::Requirement
72
+ requirement: &2168811040 !ruby/object:Gem::Requirement
73
73
  none: false
74
74
  requirements:
75
75
  - - ! '>='
@@ -77,10 +77,10 @@ dependencies:
77
77
  version: '0'
78
78
  type: :development
79
79
  prerelease: false
80
- version_requirements: *2161077220
80
+ version_requirements: *2168811040
81
81
  - !ruby/object:Gem::Dependency
82
82
  name: rb-fsevent
83
- requirement: &2161076740 !ruby/object:Gem::Requirement
83
+ requirement: &2168810540 !ruby/object:Gem::Requirement
84
84
  none: false
85
85
  requirements:
86
86
  - - ! '>='
@@ -88,10 +88,10 @@ dependencies:
88
88
  version: '0'
89
89
  type: :development
90
90
  prerelease: false
91
- version_requirements: *2161076740
91
+ version_requirements: *2168810540
92
92
  - !ruby/object:Gem::Dependency
93
93
  name: growl
94
- requirement: &2161057060 !ruby/object:Gem::Requirement
94
+ requirement: &2168809680 !ruby/object:Gem::Requirement
95
95
  none: false
96
96
  requirements:
97
97
  - - ! '>='
@@ -99,7 +99,7 @@ dependencies:
99
99
  version: '0'
100
100
  type: :development
101
101
  prerelease: false
102
- version_requirements: *2161057060
102
+ version_requirements: *2168809680
103
103
  description: Distillery extracts the "content" portion out of an HTML document. It
104
104
  applies heuristics based on element type, location, class/id name and other attributes
105
105
  to try and find the content part of the HTML document and return it.
@@ -123,9 +123,11 @@ files:
123
123
  - lib/distillery/document.rb
124
124
  - lib/distillery/version.rb
125
125
  - spec/acceptance_spec.rb
126
+ - spec/fixtures/.DS_Store
126
127
  - spec/fixtures/agave_cookies.html
127
128
  - spec/fixtures/baked_ziti.html
128
129
  - spec/fixtures/beef_jerkey.html
130
+ - spec/fixtures/bourbon_balls.html
129
131
  - spec/fixtures/clams_and_linguini.html
130
132
  - spec/fixtures/clouds_shining_moment.html
131
133
  - spec/fixtures/game_blog.html
@@ -164,9 +166,11 @@ specification_version: 3
164
166
  summary: Extract the content portion of an HTML document.
165
167
  test_files:
166
168
  - spec/acceptance_spec.rb
169
+ - spec/fixtures/.DS_Store
167
170
  - spec/fixtures/agave_cookies.html
168
171
  - spec/fixtures/baked_ziti.html
169
172
  - spec/fixtures/beef_jerkey.html
173
+ - spec/fixtures/bourbon_balls.html
170
174
  - spec/fixtures/clams_and_linguini.html
171
175
  - spec/fixtures/clouds_shining_moment.html
172
176
  - spec/fixtures/game_blog.html