distillery 0.2.10 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Guardfile +1 -1
- data/lib/distillery/document.rb +15 -3
- data/lib/distillery/version.rb +1 -1
- data/spec/acceptance_spec.rb +27 -2
- data/spec/fixtures/mothers_brisket.html +926 -0
- data/spec/fixtures/oyako.html +385 -0
- data/spec/fixtures/swiss_chard_pie.html +1025 -0
- data/spec/lib/distillery/document_spec.rb +15 -19
- metadata +24 -18
@@ -60,7 +60,7 @@ module Distillery
|
|
60
60
|
end
|
61
61
|
|
62
62
|
describe 'remove_unlikely_elements!' do
|
63
|
-
%w[combx comment disqus foot header
|
63
|
+
%w[combx comment community disqus extra foot header remark rss shoutbox sidebar sponsor ad-break agegate pagination pager popup].each do |klass|
|
64
64
|
it "removes any elements classed .#{klass}, as it is unlikely to be page content" do
|
65
65
|
doc = document_of("<div class='#{klass}'>foo</div>", :remove_unlikely_elements!)
|
66
66
|
doc.inner_html.should == html_of("")
|
@@ -117,33 +117,33 @@ module Distillery
|
|
117
117
|
|
118
118
|
it 'gives one point per comma in the text of an element' do
|
119
119
|
doc = document_of("<p>foo,bar,baz</p>", :score!)
|
120
|
-
doc.scores['/html/body/p'].should ==
|
120
|
+
doc.scores['/html/body/p'].should == 68.0
|
121
121
|
end
|
122
122
|
|
123
123
|
it 'gives one point per chunk of 100 characters, max of 3' do
|
124
124
|
doc = document_of("<p>#{'f'*201}</p>", :score!)
|
125
|
-
doc.scores['/html/body/p'].should ==
|
125
|
+
doc.scores['/html/body/p'].should == 68.0
|
126
126
|
|
127
127
|
doc = document_of("<p>#{'f'*1000}</p>", :score!)
|
128
|
-
doc.scores['/html/body/p'].should ==
|
128
|
+
doc.scores['/html/body/p'].should == 85.0
|
129
129
|
end
|
130
130
|
|
131
131
|
it 'adds its own points to its parent' do
|
132
132
|
doc = document_of("<div><div>foo</div></div>", :score!)
|
133
|
-
doc.scores['/html/body/div/div'].should ==
|
134
|
-
doc.scores['/html/body/div'].should ==
|
133
|
+
doc.scores['/html/body/div/div'].should == 30.0
|
134
|
+
doc.scores['/html/body/div'].should == 34.0
|
135
135
|
end
|
136
136
|
|
137
137
|
it 'adds 1/2 its points to its grandparent' do
|
138
138
|
doc = document_of("<div><div><div>foo</div></div></div>", :score!)
|
139
|
-
doc.scores['/html/body/div/div/div'].should ==
|
140
|
-
doc.scores['/html/body/div/div'].should ==
|
141
|
-
doc.scores['/html/body/div'].should ==
|
139
|
+
doc.scores['/html/body/div/div/div'].should == 26
|
140
|
+
doc.scores['/html/body/div/div'].should == 30.0
|
141
|
+
doc.scores['/html/body/div'].should == 17.0
|
142
142
|
end
|
143
143
|
|
144
144
|
it 'scales the final score by the inverse link density' do
|
145
145
|
doc = document_of("<p>foobar<a>baz</a></p>", :score!)
|
146
|
-
doc.scores['/html/body/p'].should ==
|
146
|
+
doc.scores['/html/body/p'].should == 22.666666666666668
|
147
147
|
end
|
148
148
|
|
149
149
|
end
|
@@ -182,11 +182,6 @@ module Distillery
|
|
182
182
|
doc.search('.remove').should be_empty
|
183
183
|
end
|
184
184
|
|
185
|
-
it 'removes elements that have way more li elements and it is not a list' do
|
186
|
-
doc = doc_with_top_scored_html_of("<div class='remove'><div>me<ul>#{'<li>a</li>'*200}</ul></div></div>", :clean_top_scoring_elements!)
|
187
|
-
doc.search('.remove').should be_empty
|
188
|
-
end
|
189
|
-
|
190
185
|
it 'removes elements that have more inputs than 1/3 the amount of p tags' do
|
191
186
|
doc = doc_with_top_scored_html_of("<div class='remove'><div><input><input><p>f</p><p>f</p><p>f</p></div></div>", :clean_top_scoring_elements!)
|
192
187
|
doc.search('.remove').should be_empty
|
@@ -277,10 +272,11 @@ module Distillery
|
|
277
272
|
end
|
278
273
|
|
279
274
|
it 'picks the outtermost element in the event of a tie' do
|
280
|
-
doc = document_of("<div><div class='included'>#{'f,'*10}</div></div>")
|
281
|
-
doc.distill!.should =~ /included/
|
282
|
-
doc.
|
283
|
-
doc.scores['/html/body/div'].should ==
|
275
|
+
doc = document_of("<div><div class='is-included'>#{'f,'*10}</div><div class='also-included'>#{'f,'*10}</div></div>")
|
276
|
+
doc.distill!.should =~ /is-included/
|
277
|
+
doc.distill!.should =~ /also-included/
|
278
|
+
doc.scores['/html/body/div/div[1]'].should == 165.0
|
279
|
+
doc.scores['/html/body/div/div[2]'].should == 165.0
|
284
280
|
end
|
285
281
|
|
286
282
|
it 'returns sibling elements to the top scoring one that have > 25% of the top scoring element\'s score' do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: distillery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-17 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
16
|
-
requirement: &
|
16
|
+
requirement: &2160714940 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>'
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '1.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2160714940
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: slop
|
27
|
-
requirement: &
|
27
|
+
requirement: &2160714440 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>'
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '1.0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2160714440
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &2160713980 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>'
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '2.0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2160713980
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: guard
|
49
|
-
requirement: &
|
49
|
+
requirement: &2160713600 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2160713600
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: guard-rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &2160713140 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2160713140
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: ruby-debug19
|
71
|
-
requirement: &
|
71
|
+
requirement: &2160712720 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *2160712720
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rb-fsevent
|
82
|
-
requirement: &
|
82
|
+
requirement: &2160712300 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *2160712300
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: growl
|
93
|
-
requirement: &
|
93
|
+
requirement: &2160711880 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,7 +98,7 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *2160711880
|
102
102
|
description: Distillery extracts the "content" portion out of an HTML document. It
|
103
103
|
applies heuristics based on element type, location, class/id name and other attributes
|
104
104
|
to try and find the content part of the HTML document and return it.
|
@@ -136,10 +136,13 @@ files:
|
|
136
136
|
- spec/fixtures/ginger_cookies.html
|
137
137
|
- spec/fixtures/js_this_keyword.html
|
138
138
|
- spec/fixtures/maple_cookies.html
|
139
|
+
- spec/fixtures/mothers_brisket.html
|
139
140
|
- spec/fixtures/nyt_social_media.html
|
141
|
+
- spec/fixtures/oyako.html
|
140
142
|
- spec/fixtures/pina_collada_cupcakes.html
|
141
143
|
- spec/fixtures/pumpkin_scones.html
|
142
144
|
- spec/fixtures/rhubarb.html
|
145
|
+
- spec/fixtures/swiss_chard_pie.html
|
143
146
|
- spec/fixtures/tofu_bowl.html
|
144
147
|
- spec/fixtures/vanilla_pound_cake.html
|
145
148
|
- spec/lib/distillery/document_spec.rb
|
@@ -185,10 +188,13 @@ test_files:
|
|
185
188
|
- spec/fixtures/ginger_cookies.html
|
186
189
|
- spec/fixtures/js_this_keyword.html
|
187
190
|
- spec/fixtures/maple_cookies.html
|
191
|
+
- spec/fixtures/mothers_brisket.html
|
188
192
|
- spec/fixtures/nyt_social_media.html
|
193
|
+
- spec/fixtures/oyako.html
|
189
194
|
- spec/fixtures/pina_collada_cupcakes.html
|
190
195
|
- spec/fixtures/pumpkin_scones.html
|
191
196
|
- spec/fixtures/rhubarb.html
|
197
|
+
- spec/fixtures/swiss_chard_pie.html
|
192
198
|
- spec/fixtures/tofu_bowl.html
|
193
199
|
- spec/fixtures/vanilla_pound_cake.html
|
194
200
|
- spec/lib/distillery/document_spec.rb
|