distillery 0.2.1 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -45,6 +45,10 @@ In its cleaning, Distillery will also remove all `<img>` tags from the content e
45
45
  Distillery also ships with an executable that allows you to distill documents at the command line:
46
46
 
47
47
  Usage: distill [options] http://www.example.com/
48
+
49
+ options:
50
+
48
51
  -d, --dirty Do not clean content HTML
52
+ -i, --images Keep images in the content HTML
49
53
  -v, --version Print the version
50
54
  -h, --help Print this help message
@@ -107,12 +107,12 @@ module Distillery
107
107
  top_scoring_elements.each do |element|
108
108
 
109
109
  element.search("*").each do |node|
110
- next if (node.name == 'img' || node.children.css('img').any?) && keep_images
110
+ next if contains_content_image?(node) && keep_images
111
111
  node.remove if has_empty_text?(node)
112
112
  end
113
113
 
114
114
  element.search("*").each do |node|
115
- next if node.name == 'img' && keep_images
115
+ next if contains_content_image?(node) && keep_images
116
116
  if UNRELATED_ELEMENTS.include?(node.name) ||
117
117
  (node.text.count(',') < 2 && unlikely_to_be_content?(node))
118
118
  node.remove
@@ -123,6 +123,10 @@ module Distillery
123
123
 
124
124
  private
125
125
 
126
+ def contains_content_image?(node)
127
+ node.name == 'img' || node.children.css('img').length == 1
128
+ end
129
+
126
130
  def scorable_elements
127
131
  search('[data-distillery=scorable]')
128
132
  end
@@ -148,9 +152,7 @@ module Distillery
148
152
  top_elements = [top_element]
149
153
 
150
154
  top_element.parent.children.each do |sibling|
151
- if scores[sibling.path] > top_score*0.25 && sibling.path != top_element.path
152
- top_elements << sibling
153
- end
155
+ top_elements << sibling if related_sibling?(top_element, sibling)
154
156
  end
155
157
 
156
158
  top_elements.each do |element|
@@ -162,6 +164,19 @@ module Distillery
162
164
  top_elements
163
165
  end
164
166
 
167
+ def related_sibling?(top_element, sibling)
168
+ score = scores[sibling.path]
169
+ top_score = scores[top_element.path]
170
+ identical = identical_attrubutes?(top_element, sibling)
171
+
172
+ related = (score > top_score*0.25 && sibling.path != top_element.path) ||
173
+ (identical && score > top_score*0.05)
174
+ end
175
+
176
+ def identical_attrubutes?(a, b)
177
+ a['id'] == b['id'] && a['class'] == b['class']
178
+ end
179
+
165
180
  def scorable_div?(elem)
166
181
  elem.name == 'div' &&
167
182
  (has_no_block_children?(elem) || has_only_empty_div_children?(elem))
@@ -1,3 +1,3 @@
1
1
  module Distillery
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.3"
3
3
  end
@@ -115,4 +115,9 @@ distillation_of 'bourbon_balls.html' do
115
115
  should_not =~ /I just tried the recipe forCellar Doo/ # Comments
116
116
  should_not =~ /FIND A STATION/ # Header
117
117
  should_not =~ /Car Talk/ # Footer
118
+ end
119
+
120
+ distillation_of 'bulgogi.html' do
121
+ should =~ /early-season barbecue/
122
+ should =~ /Still, it is American to not fuss/
118
123
  end