distillery 0.1.0 → 0.1.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- data/README.md +6 -4
- data/Rakefile +1 -1
- data/lib/distillery/document.rb +54 -24
- data/lib/distillery/version.rb +1 -1
- data/spec/acceptance_spec.rb +10 -0
- data/spec/fixtures/.DS_Store +0 -0
- data/spec/fixtures/bourbon_balls.html +950 -0
- data/spec/lib/distillery/document_spec.rb +54 -30
- metadata +22 -18
@@ -74,21 +74,26 @@ module Distillery
|
|
74
74
|
|
75
75
|
end
|
76
76
|
|
77
|
-
describe '
|
77
|
+
describe 'mark_scorable_elements!' do
|
78
78
|
|
79
|
-
it '
|
80
|
-
doc = document_of("<div>foo</div>", :
|
81
|
-
doc.inner_html.should == html_of("
|
79
|
+
it 'marks divs who have no children to paragraphs' do
|
80
|
+
doc = document_of("<div>foo</div>", :mark_scorable_elements!)
|
81
|
+
doc.inner_html.should == html_of('<div data-distillery="scorable">foo</div>')
|
82
82
|
end
|
83
83
|
|
84
|
-
it '
|
85
|
-
doc = document_of("<div><span>foo</span></div>", :
|
86
|
-
doc.inner_html.should == html_of("
|
84
|
+
it 'marks divs who have children that are not block-level elements to paragraphs' do
|
85
|
+
doc = document_of("<div><span>foo</span></div>", :mark_scorable_elements!)
|
86
|
+
doc.inner_html.should == html_of('<div data-distillery="scorable"><span>foo</span></div>')
|
87
87
|
end
|
88
88
|
|
89
|
-
it '
|
90
|
-
doc = document_of("<div><pre>foo</pre><div></div></div>", :
|
91
|
-
doc.inner_html.gsub("\n", "").should == html_of("
|
89
|
+
it 'marks divs whose have empty child divs to paragrahs' do
|
90
|
+
doc = document_of("<div><pre>foo</pre><div></div></div>", :mark_scorable_elements!)
|
91
|
+
doc.inner_html.gsub("\n", "").should == html_of('<div data-distillery="scorable"><pre>foo</pre><div data-distillery="scorable"></div></div>')
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'marks all paragraphs' do
|
95
|
+
doc = document_of("<p>foo</p><p></p></p>", :mark_scorable_elements!)
|
96
|
+
doc.inner_html.gsub("\n", "").should == html_of('<p data-distillery="scorable">foo</p><p data-distillery="scorable"></p>')
|
92
97
|
end
|
93
98
|
|
94
99
|
end
|
@@ -102,10 +107,12 @@ module Distillery
|
|
102
107
|
subject.scores.should_not be_empty
|
103
108
|
end
|
104
109
|
|
105
|
-
it '
|
106
|
-
doc = document_of("<
|
110
|
+
it 'calculates scores for divs that are empty or have no block level children' do
|
111
|
+
doc = document_of("<div><div><div><div>bar</div></div></div></div>", :score!)
|
112
|
+
doc.scores.should have_key('/html/body/div/div/div/div')
|
113
|
+
doc.scores.should have_key('/html/body/div/div/div')
|
114
|
+
doc.scores.should have_key('/html/body/div/div')
|
107
115
|
doc.scores.should_not have_key('/html/body/div')
|
108
|
-
doc.scores.should have_key('/html/body/p')
|
109
116
|
end
|
110
117
|
|
111
118
|
it 'gives one point per comma in the text of an element' do
|
@@ -122,14 +129,14 @@ module Distillery
|
|
122
129
|
end
|
123
130
|
|
124
131
|
it 'adds its own points to its parent' do
|
125
|
-
doc = document_of("<
|
126
|
-
doc.scores['/html/body/div/
|
132
|
+
doc = document_of("<div><div>foo</div></div>", :score!)
|
133
|
+
doc.scores['/html/body/div/div'].should == 2
|
127
134
|
doc.scores['/html/body/div'].should == 2
|
128
135
|
end
|
129
136
|
|
130
137
|
it 'adds 1/2 its points to its grandparent' do
|
131
|
-
doc = document_of("<
|
132
|
-
doc.scores['/html/body/div/div/
|
138
|
+
doc = document_of("<div><div><div>foo</div></div></div>", :score!)
|
139
|
+
doc.scores['/html/body/div/div/div'].should == 2
|
133
140
|
doc.scores['/html/body/div/div'].should == 2
|
134
141
|
doc.scores['/html/body/div'].should == 1
|
135
142
|
end
|
@@ -148,65 +155,65 @@ module Distillery
|
|
148
155
|
end
|
149
156
|
|
150
157
|
it 'removes all empty elements' do
|
151
|
-
doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :
|
158
|
+
doc = doc_with_top_scored_html_of("<div>foo <span></span</div>", :clean_top_scoring_elements!)
|
152
159
|
doc.search('span').should be_empty
|
153
160
|
end
|
154
161
|
|
155
162
|
it 'does not remove <br> elements' do
|
156
|
-
doc = doc_with_top_scored_html_of("<div>foo<br class='noremove' /></div>", :
|
163
|
+
doc = doc_with_top_scored_html_of("<div>foo,foo,foo<br class='noremove' /></div>", :clean_top_scoring_elements!)
|
157
164
|
doc.search('.noremove').should_not be_empty
|
158
165
|
end
|
159
166
|
|
160
167
|
%w[iframe form object].each do |tag|
|
161
168
|
it "removes any #{tag} elements" do
|
162
|
-
doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :
|
169
|
+
doc = doc_with_top_scored_html_of("foo <#{tag}></#{tag}>", :clean_top_scoring_elements!)
|
163
170
|
doc.search(tag).should be_empty
|
164
171
|
end
|
165
172
|
end
|
166
173
|
|
167
174
|
it 'removes elements that have negative scores' do
|
168
|
-
doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :
|
175
|
+
doc = doc_with_top_scored_html_of("<div class='widget'><div>bar</div></div>", :clean_top_scoring_elements!)
|
169
176
|
doc.search('.widget').should be_empty
|
170
177
|
end
|
171
178
|
|
172
179
|
it 'removes elements that have more images than p tags' do
|
173
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :
|
180
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><img><img><img><p>bar</p><div>foo</div></div>", :clean_top_scoring_elements!)
|
174
181
|
doc.search('.remove').should be_empty
|
175
182
|
end
|
176
183
|
|
177
184
|
it 'removes elements that have way more li elements and it is not a list' do
|
178
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :
|
185
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
|
179
186
|
doc.search('.remove').should be_empty
|
180
187
|
end
|
181
188
|
|
182
189
|
it 'removes elements that have more inputs than 1/3 the amount of p tags' do
|
183
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :
|
190
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
|
184
191
|
doc.search('.remove').should be_empty
|
185
192
|
|
186
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :
|
193
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><input><p>#{'f'*25}</p><p>f</p><p>f</p></div>", :clean_top_scoring_elements!)
|
187
194
|
doc.search('.remove').should_not be_empty
|
188
195
|
end
|
189
196
|
|
190
197
|
it 'removes elements that have < 25 characters and (no images or > 2 images' do
|
191
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :
|
198
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo</div></div>", :clean_top_scoring_elements!)
|
192
199
|
doc.search('.remove').should be_empty
|
193
200
|
|
194
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :
|
201
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>foo <img><img><img></div></div>", :clean_top_scoring_elements!)
|
195
202
|
doc.search('.remove').should be_empty
|
196
203
|
end
|
197
204
|
|
198
205
|
it 'removes elements that have a weight of < 25 and link density > 0.2' do
|
199
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :
|
206
|
+
doc = doc_with_top_scored_html_of("<div class='remove'><div>fffff<a>#{'b'*2}</a></div></div>", :clean_top_scoring_elements!)
|
200
207
|
doc.search('.remove').should be_empty
|
201
208
|
end
|
202
209
|
|
203
210
|
it 'removes elements that have a weight of >= 25 and link density > 0.5' do
|
204
|
-
doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :
|
211
|
+
doc = doc_with_top_scored_html_of("<div class='remove article'><div>#{'f'*100}<a>#{'b'*150}</a></div></div>", :clean_top_scoring_elements!)
|
205
212
|
doc.search('.remove').should be_empty
|
206
213
|
end
|
207
214
|
|
208
215
|
it 'should not clean the conntent elements not of table ul or div' do
|
209
|
-
doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :
|
216
|
+
doc = doc_with_top_scored_html_of("<span class='remove'><strong>Source:</strong> Wikipedia</span>", :clean_top_scoring_elements!)
|
210
217
|
doc.search('.remove').should_not be_empty
|
211
218
|
end
|
212
219
|
|
@@ -253,6 +260,23 @@ module Distillery
|
|
253
260
|
document_of('foo').distill!.should == 'foo'
|
254
261
|
end
|
255
262
|
|
263
|
+
it 'does not return any elements with a data-distillery attribute' do
|
264
|
+
html = document_of('<div><p>Hello</p></div>')
|
265
|
+
document_of(html).distill!.should_not =~ /data-distillery/
|
266
|
+
end
|
267
|
+
|
268
|
+
it 'picks the outtermost element in the event of a tie' do
|
269
|
+
doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
|
270
|
+
doc.distill!.should =~ /included/
|
271
|
+
doc.scores['/html/body/div/div'].should == 11
|
272
|
+
doc.scores['/html/body/div'].should == 11
|
273
|
+
end
|
274
|
+
|
275
|
+
it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
|
276
|
+
doc = document_of("<div><div>#{'f,'*10}</div></div><div><div class='me_too'>#{'f,'*5}</div></div>")
|
277
|
+
doc.distill!.should =~ /me_too/
|
278
|
+
end
|
279
|
+
|
256
280
|
end
|
257
281
|
|
258
282
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: distillery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,12 +9,12 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-05-08 00:00:00.000000000 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: nokogiri
|
17
|
-
requirement: &
|
17
|
+
requirement: &2168823320 !ruby/object:Gem::Requirement
|
18
18
|
none: false
|
19
19
|
requirements:
|
20
20
|
- - ! '>'
|
@@ -22,10 +22,10 @@ dependencies:
|
|
22
22
|
version: '1.0'
|
23
23
|
type: :runtime
|
24
24
|
prerelease: false
|
25
|
-
version_requirements: *
|
25
|
+
version_requirements: *2168823320
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: slop
|
28
|
-
requirement: &
|
28
|
+
requirement: &2168822640 !ruby/object:Gem::Requirement
|
29
29
|
none: false
|
30
30
|
requirements:
|
31
31
|
- - ! '>'
|
@@ -33,10 +33,10 @@ dependencies:
|
|
33
33
|
version: '1.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
|
-
version_requirements: *
|
36
|
+
version_requirements: *2168822640
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: rspec
|
39
|
-
requirement: &
|
39
|
+
requirement: &2168821740 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
42
|
- - ! '>'
|
@@ -44,10 +44,10 @@ dependencies:
|
|
44
44
|
version: '2.0'
|
45
45
|
type: :development
|
46
46
|
prerelease: false
|
47
|
-
version_requirements: *
|
47
|
+
version_requirements: *2168821740
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: guard
|
50
|
-
requirement: &
|
50
|
+
requirement: &2168821200 !ruby/object:Gem::Requirement
|
51
51
|
none: false
|
52
52
|
requirements:
|
53
53
|
- - ! '>='
|
@@ -55,10 +55,10 @@ dependencies:
|
|
55
55
|
version: '0'
|
56
56
|
type: :development
|
57
57
|
prerelease: false
|
58
|
-
version_requirements: *
|
58
|
+
version_requirements: *2168821200
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: guard-rspec
|
61
|
-
requirement: &
|
61
|
+
requirement: &2168820180 !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
64
|
- - ! '>='
|
@@ -66,10 +66,10 @@ dependencies:
|
|
66
66
|
version: '0'
|
67
67
|
type: :development
|
68
68
|
prerelease: false
|
69
|
-
version_requirements: *
|
69
|
+
version_requirements: *2168820180
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: ruby-debug19
|
72
|
-
requirement: &
|
72
|
+
requirement: &2168811040 !ruby/object:Gem::Requirement
|
73
73
|
none: false
|
74
74
|
requirements:
|
75
75
|
- - ! '>='
|
@@ -77,10 +77,10 @@ dependencies:
|
|
77
77
|
version: '0'
|
78
78
|
type: :development
|
79
79
|
prerelease: false
|
80
|
-
version_requirements: *
|
80
|
+
version_requirements: *2168811040
|
81
81
|
- !ruby/object:Gem::Dependency
|
82
82
|
name: rb-fsevent
|
83
|
-
requirement: &
|
83
|
+
requirement: &2168810540 !ruby/object:Gem::Requirement
|
84
84
|
none: false
|
85
85
|
requirements:
|
86
86
|
- - ! '>='
|
@@ -88,10 +88,10 @@ dependencies:
|
|
88
88
|
version: '0'
|
89
89
|
type: :development
|
90
90
|
prerelease: false
|
91
|
-
version_requirements: *
|
91
|
+
version_requirements: *2168810540
|
92
92
|
- !ruby/object:Gem::Dependency
|
93
93
|
name: growl
|
94
|
-
requirement: &
|
94
|
+
requirement: &2168809680 !ruby/object:Gem::Requirement
|
95
95
|
none: false
|
96
96
|
requirements:
|
97
97
|
- - ! '>='
|
@@ -99,7 +99,7 @@ dependencies:
|
|
99
99
|
version: '0'
|
100
100
|
type: :development
|
101
101
|
prerelease: false
|
102
|
-
version_requirements: *
|
102
|
+
version_requirements: *2168809680
|
103
103
|
description: Distillery extracts the "content" portion out of an HTML document. It
|
104
104
|
applies heuristics based on element type, location, class/id name and other attributes
|
105
105
|
to try and find the content part of the HTML document and return it.
|
@@ -123,9 +123,11 @@ files:
|
|
123
123
|
- lib/distillery/document.rb
|
124
124
|
- lib/distillery/version.rb
|
125
125
|
- spec/acceptance_spec.rb
|
126
|
+
- spec/fixtures/.DS_Store
|
126
127
|
- spec/fixtures/agave_cookies.html
|
127
128
|
- spec/fixtures/baked_ziti.html
|
128
129
|
- spec/fixtures/beef_jerkey.html
|
130
|
+
- spec/fixtures/bourbon_balls.html
|
129
131
|
- spec/fixtures/clams_and_linguini.html
|
130
132
|
- spec/fixtures/clouds_shining_moment.html
|
131
133
|
- spec/fixtures/game_blog.html
|
@@ -164,9 +166,11 @@ specification_version: 3
|
|
164
166
|
summary: Extract the content portion of an HTML document.
|
165
167
|
test_files:
|
166
168
|
- spec/acceptance_spec.rb
|
169
|
+
- spec/fixtures/.DS_Store
|
167
170
|
- spec/fixtures/agave_cookies.html
|
168
171
|
- spec/fixtures/baked_ziti.html
|
169
172
|
- spec/fixtures/beef_jerkey.html
|
173
|
+
- spec/fixtures/bourbon_balls.html
|
170
174
|
- spec/fixtures/clams_and_linguini.html
|
171
175
|
- spec/fixtures/clouds_shining_moment.html
|
172
176
|
- spec/fixtures/game_blog.html
|