distillery 0.1.0 → 0.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -74,21 +74,26 @@ module Distillery
74
74
 
75
75
  end
76
76
 
77
- describe 'coerce_elements_to_paragraphs!' do
77
+ describe 'mark_scorable_elements!' do
78
78
 
79
- it 'converts divs who have no children to paragraphs' do
80
- doc = document_of("<div>foo</div>", :coerce_elements_to_paragraphs!)
81
- doc.inner_html.should == html_of("<p>foo</p>")
79
+ it 'marks divs who have no children to paragraphs' do
80
+ doc = document_of("<div>foo</div>", :mark_scorable_elements!)
81
+ doc.inner_html.should == html_of('<div data-distillery="scorable">foo</div>')
82
82
  end
83
83
 
84
- it 'converts divs who have children that are not block-level elements to paragraphs' do
85
- doc = document_of("<div><span>foo</span></div>", :coerce_elements_to_paragraphs!)
86
- doc.inner_html.should == html_of("<p><span>foo</span></p>")
84
+ it 'marks divs who have children that are not block-level elements to paragraphs' do
85
+ doc = document_of("<div><span>foo</span></div>", :mark_scorable_elements!)
86
+ doc.inner_html.should == html_of('<div data-distillery="scorable"><span>foo</span></div>')
87
87
  end
88
88
 
89
- it 'converts divs whose have empty child divs to paragrahs' do
90
- doc = document_of("<div><pre>foo</pre><div></div></div>", :coerce_elements_to_paragraphs!)
91
- doc.inner_html.gsub("\n", "").should == html_of("<p><pre>foo</pre><p></p></p>")
89
+ it 'marks divs whose have empty child divs to paragrahs' do
90
+ doc = document_of("<div><pre>foo</pre><div></div></div>", :mark_scorable_elements!)
91
+ doc.inner_html.gsub("\n", "").should == html_of('<div data-distillery="scorable"><pre>foo</pre><div data-distillery="scorable"></div></div>')
92
+ end
93
+
94
+ it 'marks all paragraphs' do
95
+ doc = document_of("<p>foo</p><p></p></p>", :mark_scorable_elements!)
96
+ doc.inner_html.gsub("\n", "").should == html_of('<p data-distillery="scorable">foo</p><p data-distillery="scorable"></p>')
92
97
  end
93
98
 
94
99
  end
@@ -102,10 +107,12 @@ module Distillery
102
107
  subject.scores.should_not be_empty
103
108
  end
104
109
 
105
- it 'only calculates scores for paragraphs' do
106
- doc = document_of("<p>foo</p><div>bar</div>", :score!)
110
+ it 'calculates scores for divs that are empty or have no block level children' do
111
+ doc = document_of("<div><div><div><div>bar</div></div></div></div>", :score!)
112
+ doc.scores.should have_key('/html/body/div/div/div/div')
113
+ doc.scores.should have_key('/html/body/div/div/div')
114
+ doc.scores.should have_key('/html/body/div/div')
107
115
  doc.scores.should_not have_key('/html/body/div')
108
- doc.scores.should have_key('/html/body/p')
109
116
  end
110
117
 
111
118
  it 'gives one point per comma in the text of an element' do
@@ -122,14 +129,14 @@ module Distillery
122
129
  end
123
130
 
124
131
  it 'adds its own points to its parent' do
125
- doc = document_of("<p><div><p>foo</p></div></p>", :score!)
126
- doc.scores['/html/body/div/p'].should == 2
132
+ doc = document_of("<div><div>foo</div></div>", :score!)
133
+ doc.scores['/html/body/div/div'].should == 2
127
134
  doc.scores['/html/body/div'].should == 2
128
135
  end
129
136
 
130
137
  it 'adds 1/2 its points to its grandparent' do
131
- doc = document_of("<p><div><div><p>foo</p></div></div></p>", :score!)
132
- doc.scores['/html/body/div/div/p'].should == 2
138
+ doc = document_of("<div><div><div>foo</div></div></div>", :score!)
139
+ doc.scores['/html/body/div/div/div'].should == 2
133
140
  doc.scores['/html/body/div/div'].should == 2
134
141
  doc.scores['/html/body/div'].should == 1
135
142
  end
@@ -148,65 +155,65 @@ module Distillery
148
155
  end
149
156
 
150
157
  it 'removes all empty elements' do
151
- doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_element!)
158
+ doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_elements!)
152
159
  doc.search('span').should be_empty
153
160
  end
154
161
 
155
162
  it 'does not remove <br> elements' do
156
- doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :clean_top_scoring_element!)
163
+ doc = doc_with_top_scored_html_of("<div>foo,foo,foo<br class='noremove' /></div>", :clean_top_scoring_elements!)
157
164
  doc.search('.noremove').should_not be_empty
158
165
  end
159
166
 
160
167
  %w[iframe form object].each do |tag|
161
168
  it "removes any #{tag} elements" do
162
- doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_element!)
169
+ doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_elements!)
163
170
  doc.search(tag).should be_empty
164
171
  end
165
172
  end
166
173
 
167
174
  it 'removes elements that have negative scores' do
168
- doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_element!)
175
+ doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_elements!)
169
176
  doc.search('.widget').should be_empty
170
177
  end
171
178
 
172
179
  it 'removes elements that have more images than p tags' do
173
- doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_element!)
180
+ doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_elements!)
174
181
  doc.search('.remove').should be_empty
175
182
  end
176
183
 
177
184
  it 'removes elements that have way more li elements and it is not a list' do
178
- doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_element!)
185
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
179
186
  doc.search('.remove').should be_empty
180
187
  end
181
188
 
182
189
  it 'removes elements that have more inputs than 1/3 the amount of p tags' do
183
- doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_element!)
190
+ doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
184
191
  doc.search('.remove').should be_empty
185
192
 
186
- doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_element!)
193
+ doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_elements!)
187
194
  doc.search('.remove').should_not be_empty
188
195
  end
189
196
 
190
197
  it 'removes elements that have < 25 characters and (no images or > 2 images' do
191
- doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_element!)
198
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_elements!)
192
199
  doc.search('.remove').should be_empty
193
200
 
194
- doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_element!)
201
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_elements!)
195
202
  doc.search('.remove').should be_empty
196
203
  end
197
204
 
198
205
  it 'removes elements that have a weight of < 25 and link density > 0.2' do
199
- doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_element!)
206
+ doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_elements!)
200
207
  doc.search('.remove').should be_empty
201
208
  end
202
209
 
203
210
  it 'removes elements that have a weight of >= 25 and link density > 0.5' do
204
- doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_element!)
211
+ doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_elements!)
205
212
  doc.search('.remove').should be_empty
206
213
  end
207
214
 
208
215
  it 'should not clean the conntent elements not of table ul or div' do
209
- doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_element!)
216
+ doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_elements!)
210
217
  doc.search('.remove').should_not be_empty
211
218
  end
212
219
 
@@ -253,6 +260,23 @@ module Distillery
253
260
  document_of('foo').distill!.should == 'foo'
254
261
  end
255
262
 
263
+ it 'does not return any elements with a data-distillery attribute' do
264
+ html = document_of('<div><p>Hello</p></div>')
265
+ document_of(html).distill!.should_not =~ /data-distillery/
266
+ end
267
+
268
+ it 'picks the outtermost element in the event of a tie' do
269
+ doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
270
+ doc.distill!.should =~ /included/
271
+ doc.scores['/html/body/div/div'].should == 11
272
+ doc.scores['/html/body/div'].should == 11
273
+ end
274
+
275
+ it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
276
+ doc = document_of("<div><div>#{'f,'*10}</div></div><div><div class='me_too'>#{'f,'*5}</div></div>")
277
+ doc.distill!.should =~ /me_too/
278
+ end
279
+
256
280
  end
257
281
 
258
282
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: distillery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,12 +9,12 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-04-30 00:00:00.000000000 -07:00
12
+ date: 2011-05-08 00:00:00.000000000 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: nokogiri
17
- requirement: &2161079840 !ruby/object:Gem::Requirement
17
+ requirement: &2168823320 !ruby/object:Gem::Requirement
18
18
  none: false
19
19
  requirements:
20
20
  - - ! '>'
@@ -22,10 +22,10 @@ dependencies:
22
22
  version: '1.0'
23
23
  type: :runtime
24
24
  prerelease: false
25
- version_requirements: *2161079840
25
+ version_requirements: *2168823320
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: slop
28
- requirement: &2161079300 !ruby/object:Gem::Requirement
28
+ requirement: &2168822640 !ruby/object:Gem::Requirement
29
29
  none: false
30
30
  requirements:
31
31
  - - ! '>'
@@ -33,10 +33,10 @@ dependencies:
33
33
  version: '1.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
- version_requirements: *2161079300
36
+ version_requirements: *2168822640
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: rspec
39
- requirement: &2161078840 !ruby/object:Gem::Requirement
39
+ requirement: &2168821740 !ruby/object:Gem::Requirement
40
40
  none: false
41
41
  requirements:
42
42
  - - ! '>'
@@ -44,10 +44,10 @@ dependencies:
44
44
  version: '2.0'
45
45
  type: :development
46
46
  prerelease: false
47
- version_requirements: *2161078840
47
+ version_requirements: *2168821740
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: guard
50
- requirement: &2161078340 !ruby/object:Gem::Requirement
50
+ requirement: &2168821200 !ruby/object:Gem::Requirement
51
51
  none: false
52
52
  requirements:
53
53
  - - ! '>='
@@ -55,10 +55,10 @@ dependencies:
55
55
  version: '0'
56
56
  type: :development
57
57
  prerelease: false
58
- version_requirements: *2161078340
58
+ version_requirements: *2168821200
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: guard-rspec
61
- requirement: &2161077760 !ruby/object:Gem::Requirement
61
+ requirement: &2168820180 !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
64
  - - ! '>='
@@ -66,10 +66,10 @@ dependencies:
66
66
  version: '0'
67
67
  type: :development
68
68
  prerelease: false
69
- version_requirements: *2161077760
69
+ version_requirements: *2168820180
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: ruby-debug19
72
- requirement: &2161077220 !ruby/object:Gem::Requirement
72
+ requirement: &2168811040 !ruby/object:Gem::Requirement
73
73
  none: false
74
74
  requirements:
75
75
  - - ! '>='
@@ -77,10 +77,10 @@ dependencies:
77
77
  version: '0'
78
78
  type: :development
79
79
  prerelease: false
80
- version_requirements: *2161077220
80
+ version_requirements: *2168811040
81
81
  - !ruby/object:Gem::Dependency
82
82
  name: rb-fsevent
83
- requirement: &2161076740 !ruby/object:Gem::Requirement
83
+ requirement: &2168810540 !ruby/object:Gem::Requirement
84
84
  none: false
85
85
  requirements:
86
86
  - - ! '>='
@@ -88,10 +88,10 @@ dependencies:
88
88
  version: '0'
89
89
  type: :development
90
90
  prerelease: false
91
- version_requirements: *2161076740
91
+ version_requirements: *2168810540
92
92
  - !ruby/object:Gem::Dependency
93
93
  name: growl
94
- requirement: &2161057060 !ruby/object:Gem::Requirement
94
+ requirement: &2168809680 !ruby/object:Gem::Requirement
95
95
  none: false
96
96
  requirements:
97
97
  - - ! '>='
@@ -99,7 +99,7 @@ dependencies:
99
99
  version: '0'
100
100
  type: :development
101
101
  prerelease: false
102
- version_requirements: *2161057060
102
+ version_requirements: *2168809680
103
103
  description: Distillery extracts the "content" portion out of an HTML document. It
104
104
  applies heuristics based on element type, location, class/id name and other attributes
105
105
  to try and find the content part of the HTML document and return it.
@@ -123,9 +123,11 @@ files:
123
123
  - lib/distillery/document.rb
124
124
  - lib/distillery/version.rb
125
125
  - spec/acceptance_spec.rb
126
+ - spec/fixtures/.DS_Store
126
127
  - spec/fixtures/agave_cookies.html
127
128
  - spec/fixtures/baked_ziti.html
128
129
  - spec/fixtures/beef_jerkey.html
130
+ - spec/fixtures/bourbon_balls.html
129
131
  - spec/fixtures/clams_and_linguini.html
130
132
  - spec/fixtures/clouds_shining_moment.html
131
133
  - spec/fixtures/game_blog.html
@@ -164,9 +166,11 @@ specification_version: 3
164
166
  summary: Extract the content portion of an HTML document.
165
167
  test_files:
166
168
  - spec/acceptance_spec.rb
169
+ - spec/fixtures/.DS_Store
167
170
  - spec/fixtures/agave_cookies.html
168
171
  - spec/fixtures/baked_ziti.html
169
172
  - spec/fixtures/beef_jerkey.html
173
+ - spec/fixtures/bourbon_balls.html
170
174
  - spec/fixtures/clams_and_linguini.html
171
175
  - spec/fixtures/clouds_shining_moment.html
172
176
  - spec/fixtures/game_blog.html