distillery 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +4 -0
- data/lib/distillery/document.rb +20 -5
- data/lib/distillery/version.rb +1 -1
- data/spec/acceptance_spec.rb +5 -0
- data/spec/fixtures/bulgogi.html +992 -0
- data/spec/fixtures/forest_ham.html +1115 -0
- data/spec/lib/distillery/document_spec.rb +6 -0
- metadata +23 -21
data/README.md
CHANGED
@@ -45,6 +45,10 @@ In its cleaning, Distillery will also remove all `<img>` tags from the content e
|
|
45
45
|
Distillery also ships with an executable that allows you to distill documents at the command line:
|
46
46
|
|
47
47
|
Usage: distill [options] http://www.example.com/
|
48
|
+
|
49
|
+
options:
|
50
|
+
|
48
51
|
-d, --dirty Do not clean content HTML
|
52
|
+
-i, --images Keep images in the content HTML
|
49
53
|
-v, --version Print the version
|
50
54
|
-h, --help Print this help message
|
data/lib/distillery/document.rb
CHANGED
@@ -107,12 +107,12 @@ module Distillery
|
|
107
107
|
top_scoring_elements.each do |element|
|
108
108
|
|
109
109
|
element.search("*").each do |node|
|
110
|
-
next if (node
|
110
|
+
next if contains_content_image?(node) && keep_images
|
111
111
|
node.remove if has_empty_text?(node)
|
112
112
|
end
|
113
113
|
|
114
114
|
element.search("*").each do |node|
|
115
|
-
next if node
|
115
|
+
next if contains_content_image?(node) && keep_images
|
116
116
|
if UNRELATED_ELEMENTS.include?(node.name) ||
|
117
117
|
(node.text.count(',') < 2 && unlikely_to_be_content?(node))
|
118
118
|
node.remove
|
@@ -123,6 +123,10 @@ module Distillery
|
|
123
123
|
|
124
124
|
private
|
125
125
|
|
126
|
+
def contains_content_image?(node)
|
127
|
+
node.name == 'img' || node.children.css('img').length == 1
|
128
|
+
end
|
129
|
+
|
126
130
|
def scorable_elements
|
127
131
|
search('[data-distillery=scorable]')
|
128
132
|
end
|
@@ -148,9 +152,7 @@ module Distillery
|
|
148
152
|
top_elements = [top_element]
|
149
153
|
|
150
154
|
top_element.parent.children.each do |sibling|
|
151
|
-
|
152
|
-
top_elements << sibling
|
153
|
-
end
|
155
|
+
top_elements << sibling if related_sibling?(top_element, sibling)
|
154
156
|
end
|
155
157
|
|
156
158
|
top_elements.each do |element|
|
@@ -162,6 +164,19 @@ module Distillery
|
|
162
164
|
top_elements
|
163
165
|
end
|
164
166
|
|
167
|
+
def related_sibling?(top_element, sibling)
|
168
|
+
score = scores[sibling.path]
|
169
|
+
top_score = scores[top_element.path]
|
170
|
+
identical = identical_attrubutes?(top_element, sibling)
|
171
|
+
|
172
|
+
related = (score > top_score*0.25 && sibling.path != top_element.path) ||
|
173
|
+
(identical && score > top_score*0.05)
|
174
|
+
end
|
175
|
+
|
176
|
+
def identical_attrubutes?(a, b)
|
177
|
+
a['id'] == b['id'] && a['class'] == b['class']
|
178
|
+
end
|
179
|
+
|
165
180
|
def scorable_div?(elem)
|
166
181
|
elem.name == 'div' &&
|
167
182
|
(has_no_block_children?(elem) || has_only_empty_div_children?(elem))
|
data/lib/distillery/version.rb
CHANGED
data/spec/acceptance_spec.rb
CHANGED
@@ -115,4 +115,9 @@ distillation_of 'bourbon_balls.html' do
|
|
115
115
|
should_not =~ /I just tried the recipe forCellar Doo/ # Comments
|
116
116
|
should_not =~ /FIND A STATION/ # Header
|
117
117
|
should_not =~ /Car Talk/ # Footer
|
118
|
+
end
|
119
|
+
|
120
|
+
distillation_of 'bulgogi.html' do
|
121
|
+
should =~ /early-season barbecue/
|
122
|
+
should =~ /Still, it is American to not fuss/
|
118
123
|
end
|