distillery 0.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +4 -0
- data/lib/distillery/document.rb +20 -5
- data/lib/distillery/version.rb +1 -1
- data/spec/acceptance_spec.rb +5 -0
- data/spec/fixtures/bulgogi.html +992 -0
- data/spec/fixtures/forest_ham.html +1115 -0
- data/spec/lib/distillery/document_spec.rb +6 -0
- metadata +23 -21
data/README.md
CHANGED
@@ -45,6 +45,10 @@ In its cleaning, Distillery will also remove all `<img>` tags from the content e
|
|
45
45
|
Distillery also ships with an executable that allows you to distill documents at the command line:
|
46
46
|
|
47
47
|
Usage: distill [options] http://www.example.com/
|
48
|
+
|
49
|
+
options:
|
50
|
+
|
48
51
|
-d, --dirty Do not clean content HTML
|
52
|
+
-i, --images Keep images in the content HTML
|
49
53
|
-v, --version Print the version
|
50
54
|
-h, --help Print this help message
|
data/lib/distillery/document.rb
CHANGED
@@ -107,12 +107,12 @@ module Distillery
|
|
107
107
|
top_scoring_elements.each do |element|
|
108
108
|
|
109
109
|
element.search("*").each do |node|
|
110
|
-
next if (node
|
110
|
+
next if contains_content_image?(node) && keep_images
|
111
111
|
node.remove if has_empty_text?(node)
|
112
112
|
end
|
113
113
|
|
114
114
|
element.search("*").each do |node|
|
115
|
-
next if node
|
115
|
+
next if contains_content_image?(node) && keep_images
|
116
116
|
if UNRELATED_ELEMENTS.include?(node.name) ||
|
117
117
|
(node.text.count(',') < 2 && unlikely_to_be_content?(node))
|
118
118
|
node.remove
|
@@ -123,6 +123,10 @@ module Distillery
|
|
123
123
|
|
124
124
|
private
|
125
125
|
|
126
|
+
def contains_content_image?(node)
|
127
|
+
node.name == 'img' || node.children.css('img').length == 1
|
128
|
+
end
|
129
|
+
|
126
130
|
def scorable_elements
|
127
131
|
search('[data-distillery=scorable]')
|
128
132
|
end
|
@@ -148,9 +152,7 @@ module Distillery
|
|
148
152
|
top_elements = [top_element]
|
149
153
|
|
150
154
|
top_element.parent.children.each do |sibling|
|
151
|
-
|
152
|
-
top_elements << sibling
|
153
|
-
end
|
155
|
+
top_elements << sibling if related_sibling?(top_element, sibling)
|
154
156
|
end
|
155
157
|
|
156
158
|
top_elements.each do |element|
|
@@ -162,6 +164,19 @@ module Distillery
|
|
162
164
|
top_elements
|
163
165
|
end
|
164
166
|
|
167
|
+
def related_sibling?(top_element, sibling)
|
168
|
+
score = scores[sibling.path]
|
169
|
+
top_score = scores[top_element.path]
|
170
|
+
identical = identical_attrubutes?(top_element, sibling)
|
171
|
+
|
172
|
+
related = (score > top_score*0.25 && sibling.path != top_element.path) ||
|
173
|
+
(identical && score > top_score*0.05)
|
174
|
+
end
|
175
|
+
|
176
|
+
def identical_attrubutes?(a, b)
|
177
|
+
a['id'] == b['id'] && a['class'] == b['class']
|
178
|
+
end
|
179
|
+
|
165
180
|
def scorable_div?(elem)
|
166
181
|
elem.name == 'div' &&
|
167
182
|
(has_no_block_children?(elem) || has_only_empty_div_children?(elem))
|
data/lib/distillery/version.rb
CHANGED
data/spec/acceptance_spec.rb
CHANGED
@@ -115,4 +115,9 @@ distillation_of 'bourbon_balls.html' do
|
|
115
115
|
should_not =~ /I just tried the recipe forCellar Doo/ # Comments
|
116
116
|
should_not =~ /FIND A STATION/ # Header
|
117
117
|
should_not =~ /Car Talk/ # Footer
|
118
|
+
end
|
119
|
+
|
120
|
+
distillation_of 'bulgogi.html' do
|
121
|
+
should =~ /early-season barbecue/
|
122
|
+
should =~ /Still, it is American to not fuss/
|
118
123
|
end
|