distillery 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +6 -4
- data/Rakefile +1 -1
- data/lib/distillery/document.rb +54 -24
- data/lib/distillery/version.rb +1 -1
- data/spec/acceptance_spec.rb +10 -0
- data/spec/fixtures/.DS_Store +0 -0
- data/spec/fixtures/bourbon_balls.html +950 -0
- data/spec/lib/distillery/document_spec.rb +54 -30
- metadata +22 -18
@@ -74,21 +74,26 @@ module Distillery
|
|
74
74
|
|
75
75
|
end
|
76
76
|
|
77
|
-
describe '
|
77
|
+
describe 'mark_scorable_elements!' do
|
78
78
|
|
79
|
-
it '
|
80
|
-
doc = document_of("<div>foo</div>", :
|
81
|
-
doc.inner_html.should == html_of("
|
79
|
+
it 'marks divs who have no children to paragraphs' do
|
80
|
+
doc = document_of("<div>foo</div>", :mark_scorable_elements!)
|
81
|
+
doc.inner_html.should == html_of('<div data-distillery="scorable">foo</div>')
|
82
82
|
end
|
83
83
|
|
84
|
-
it '
|
85
|
-
doc = document_of("<div><span>foo</span></div>", :
|
86
|
-
doc.inner_html.should == html_of("
|
84
|
+
it 'marks divs who have children that are not block-level elements to paragraphs' do
|
85
|
+
doc = document_of("<div><span>foo</span></div>", :mark_scorable_elements!)
|
86
|
+
doc.inner_html.should == html_of('<div data-distillery="scorable"><span>foo</span></div>')
|
87
87
|
end
|
88
88
|
|
89
|
-
it '
|
90
|
-
doc = document_of("<div><pre>foo</pre><div></div></div>", :
|
91
|
-
doc.inner_html.gsub("\n", "").should == html_of("
|
89
|
+
it 'marks divs whose have empty child divs to paragrahs' do
|
90
|
+
doc = document_of("<div><pre>foo</pre><div></div></div>", :mark_scorable_elements!)
|
91
|
+
doc.inner_html.gsub("\n", "").should == html_of('<div data-distillery="scorable"><pre>foo</pre><div data-distillery="scorable"></div></div>')
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'marks all paragraphs' do
|
95
|
+
doc = document_of("<p>foo</p><p></p></p>", :mark_scorable_elements!)
|
96
|
+
doc.inner_html.gsub("\n", "").should == html_of('<p data-distillery="scorable">foo</p><p data-distillery="scorable"></p>')
|
92
97
|
end
|
93
98
|
|
94
99
|
end
|
@@ -102,10 +107,12 @@ module Distillery
|
|
102
107
|
subject.scores.should_not be_empty
|
103
108
|
end
|
104
109
|
|
105
|
-
it '
|
106
|
-
doc = document_of("<
|
110
|
+
it 'calculates scores for divs that are empty or have no block level children' do
|
111
|
+
doc = document_of("<div><div><div><div>bar</div></div></div></div>", :score!)
|
112
|
+
doc.scores.should have_key('/html/body/div/div/div/div')
|
113
|
+
doc.scores.should have_key('/html/body/div/div/div')
|
114
|
+
doc.scores.should have_key('/html/body/div/div')
|
107
115
|
doc.scores.should_not have_key('/html/body/div')
|
108
|
-
doc.scores.should have_key('/html/body/p')
|
109
116
|
end
|
110
117
|
|
111
118
|
it 'gives one point per comma in the text of an element' do
|
@@ -122,14 +129,14 @@ module Distillery
|
|
122
129
|
end
|
123
130
|
|
124
131
|
it 'adds its own points to its parent' do
|
125
|
-
doc = document_of("<
|
126
|
-
doc.scores['/html/body/div/
|
132
|
+
doc = document_of("<div><div>foo</div></div>", :score!)
|
133
|
+
doc.scores['/html/body/div/div'].should == 2
|
127
134
|
doc.scores['/html/body/div'].should == 2
|
128
135
|
end
|
129
136
|
|
130
137
|
it 'adds 1/2 its points to its grandparent' do
|
131
|
-
doc = document_of("<
|
132
|
-
doc.scores['/html/body/div/div/
|
138
|
+
doc = document_of("<div><div><div>foo</div></div></div>", :score!)
|
139
|
+
doc.scores['/html/body/div/div/div'].should == 2
|
133
140
|
doc.scores['/html/body/div/div'].should == 2
|
134
141
|
doc.scores['/html/body/div'].should == 1
|
135
142
|
end
|
@@ -148,65 +155,65 @@ module Distillery
|
|
148
155
|
end
|
149
156
|
|
150
157
|
it 'removes all empty elements' do
|
151
|
-
doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :
|
158
|
+
doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_elements!)
|
152
159
|
doc.search('span').should be_empty
|
153
160
|
end
|
154
161
|
|
155
162
|
it 'does not remove <br> elements' do
|
156
|
-
doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :
|
163
|
+
doc = doc_with_top_scored_html_of("<div>foo,foo,foo<br class='noremove' /></div>", :clean_top_scoring_elements!)
|
157
164
|
doc.search('.noremove').should_not be_empty
|
158
165
|
end
|
159
166
|
|
160
167
|
%w[iframe form object].each do |tag|
|
161
168
|
it "removes any #{tag} elements" do
|
162
|
-
doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :
|
169
|
+
doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_elements!)
|
163
170
|
doc.search(tag).should be_empty
|
164
171
|
end
|
165
172
|
end
|
166
173
|
|
167
174
|
it 'removes elements that have negative scores' do
|
168
|
-
doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :
|
175
|
+
doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_elements!)
|
169
176
|
doc.search('.widget').should be_empty
|
170
177
|
end
|
171
178
|
|
172
179
|
it 'removes elements that have more images than p tags' do
|
173
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :
|
180
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_elements!)
|
174
181
|
doc.search('.remove').should be_empty
|
175
182
|
end
|
176
183
|
|
177
184
|
it 'removes elements that have way more li elements and it is not a list' do
|
178
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :
|
185
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
|
179
186
|
doc.search('.remove').should be_empty
|
180
187
|
end
|
181
188
|
|
182
189
|
it 'removes elements that have more inputs than 1/3 the amount of p tags' do
|
183
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :
|
190
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
|
184
191
|
doc.search('.remove').should be_empty
|
185
192
|
|
186
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :
|
193
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_elements!)
|
187
194
|
doc.search('.remove').should_not be_empty
|
188
195
|
end
|
189
196
|
|
190
197
|
it 'removes elements that have < 25 characters and (no images or > 2 images' do
|
191
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :
|
198
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_elements!)
|
192
199
|
doc.search('.remove').should be_empty
|
193
200
|
|
194
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :
|
201
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_elements!)
|
195
202
|
doc.search('.remove').should be_empty
|
196
203
|
end
|
197
204
|
|
198
205
|
it 'removes elements that have a weight of < 25 and link density > 0.2' do
|
199
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :
|
206
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_elements!)
|
200
207
|
doc.search('.remove').should be_empty
|
201
208
|
end
|
202
209
|
|
203
210
|
it 'removes elements that have a weight of >= 25 and link density > 0.5' do
|
204
|
-
doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :
|
211
|
+
doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_elements!)
|
205
212
|
doc.search('.remove').should be_empty
|
206
213
|
end
|
207
214
|
|
208
215
|
it 'should not clean the conntent elements not of table ul or div' do
|
209
|
-
doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :
|
216
|
+
doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_elements!)
|
210
217
|
doc.search('.remove').should_not be_empty
|
211
218
|
end
|
212
219
|
|
@@ -253,6 +260,23 @@ module Distillery
|
|
253
260
|
document_of('foo').distill!.should == 'foo'
|
254
261
|
end
|
255
262
|
|
263
|
+
it 'does not return any elements with a data-distillery attribute' do
|
264
|
+
html = document_of('<div><p>Hello</p></div>')
|
265
|
+
document_of(html).distill!.should_not =~ /data-distillery/
|
266
|
+
end
|
267
|
+
|
268
|
+
it 'picks the outtermost element in the event of a tie' do
|
269
|
+
doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
|
270
|
+
doc.distill!.should =~ /included/
|
271
|
+
doc.scores['/html/body/div/div'].should == 11
|
272
|
+
doc.scores['/html/body/div'].should == 11
|
273
|
+
end
|
274
|
+
|
275
|
+
it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
|
276
|
+
doc = document_of("<div><div>#{'f,'*10}</div></div><div><div class='me_too'>#{'f,'*5}</div></div>")
|
277
|
+
doc.distill!.should =~ /me_too/
|
278
|
+
end
|
279
|
+
|
256
280
|
end
|
257
281
|
|
258
282
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: distillery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-05-08 00:00:00.000000000 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
17
|
-
requirement: &
|
17
|
+
requirement: &2168823320 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>'
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: '1.0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *2168823320
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: slop
|
28
|
-
requirement: &
|
28
|
+
requirement: &2168822640 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ! '>'
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: '1.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *2168822640
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: rspec
|
39
|
-
requirement: &
|
39
|
+
requirement: &2168821740 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ! '>'
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: '2.0'
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *2168821740
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: guard
|
50
|
-
requirement: &
|
50
|
+
requirement: &2168821200 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ! '>='
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: '0'
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *2168821200
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: guard-rspec
|
61
|
-
requirement: &
|
61
|
+
requirement: &2168820180 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ! '>='
|
@@ -66,10 +66,10 @@ dependencies:
|
|
66
66
|
version: '0'
|
67
67
|
type: :development
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *2168820180
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: ruby-debug19
|
72
|
-
requirement: &
|
72
|
+
requirement: &2168811040 !ruby/object:Gem::Requirement
|
73
73
|
none: false
|
74
74
|
requirements:
|
75
75
|
- - ! '>='
|
@@ -77,10 +77,10 @@ dependencies:
|
|
77
77
|
version: '0'
|
78
78
|
type: :development
|
79
79
|
prerelease: false
|
80
|
-
version_requirements: *
|
80
|
+
version_requirements: *2168811040
|
81
81
|
- !ruby/object:Gem::Dependency
|
82
82
|
name: rb-fsevent
|
83
|
-
requirement: &
|
83
|
+
requirement: &2168810540 !ruby/object:Gem::Requirement
|
84
84
|
none: false
|
85
85
|
requirements:
|
86
86
|
- - ! '>='
|
@@ -88,10 +88,10 @@ dependencies:
|
|
88
88
|
version: '0'
|
89
89
|
type: :development
|
90
90
|
prerelease: false
|
91
|
-
version_requirements: *
|
91
|
+
version_requirements: *2168810540
|
92
92
|
- !ruby/object:Gem::Dependency
|
93
93
|
name: growl
|
94
|
-
requirement: &
|
94
|
+
requirement: &2168809680 !ruby/object:Gem::Requirement
|
95
95
|
none: false
|
96
96
|
requirements:
|
97
97
|
- - ! '>='
|
@@ -99,7 +99,7 @@ dependencies:
|
|
99
99
|
version: '0'
|
100
100
|
type: :development
|
101
101
|
prerelease: false
|
102
|
-
version_requirements: *
|
102
|
+
version_requirements: *2168809680
|
103
103
|
description: Distillery extracts the "content" portion out of an HTML document. It
|
104
104
|
applies heuristics based on element type, location, class/id name and other attributes
|
105
105
|
to try and find the content part of the HTML document and return it.
|
@@ -123,9 +123,11 @@ files:
|
|
123
123
|
- lib/distillery/document.rb
|
124
124
|
- lib/distillery/version.rb
|
125
125
|
- spec/acceptance_spec.rb
|
126
|
+
- spec/fixtures/.DS_Store
|
126
127
|
- spec/fixtures/agave_cookies.html
|
127
128
|
- spec/fixtures/baked_ziti.html
|
128
129
|
- spec/fixtures/beef_jerkey.html
|
130
|
+
- spec/fixtures/bourbon_balls.html
|
129
131
|
- spec/fixtures/clams_and_linguini.html
|
130
132
|
- spec/fixtures/clouds_shining_moment.html
|
131
133
|
- spec/fixtures/game_blog.html
|
@@ -164,9 +166,11 @@ specification_version: 3
|
|
164
166
|
summary: Extract the content portion of an HTML document.
|
165
167
|
test_files:
|
166
168
|
- spec/acceptance_spec.rb
|
169
|
+
- spec/fixtures/.DS_Store
|
167
170
|
- spec/fixtures/agave_cookies.html
|
168
171
|
- spec/fixtures/baked_ziti.html
|
169
172
|
- spec/fixtures/beef_jerkey.html
|
173
|
+
- spec/fixtures/bourbon_balls.html
|
170
174
|
- spec/fixtures/clams_and_linguini.html
|
171
175
|
- spec/fixtures/clouds_shining_moment.html
|
172
176
|
- spec/fixtures/game_blog.html
|