distillery 0.2.1 → 0.2.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -45,6 +45,10 @@ In its cleaning, Distillery will also remove all `<img>` tags from the content e
45
45
  Distillery also ships with an executable that allows you to distill documents at the command line:
46
46
 
47
47
  Usage: distill [options] http://www.example.com/
48
+
49
+ options:
50
+
48
51
  -d, --dirty Do not clean content HTML
52
+ -i, --images Keep images in the content HTML
49
53
  -v, --version Print the version
50
54
  -h, --help Print this help message
@@ -107,12 +107,12 @@ module Distillery
107
107
  top_scoring_elements.each do |element|
108
108
 
109
109
  element.search("*").each do |node|
110
- next if (node.name == 'img' || node.children.css('img').any?) && keep_images
110
+ next if contains_content_image?(node) && keep_images
111
111
  node.remove if has_empty_text?(node)
112
112
  end
113
113
 
114
114
  element.search("*").each do |node|
115
- next if node.name == 'img' && keep_images
115
+ next if contains_content_image?(node) && keep_images
116
116
  if UNRELATED_ELEMENTS.include?(node.name) ||
117
117
  (node.text.count(',') < 2 && unlikely_to_be_content?(node))
118
118
  node.remove
@@ -123,6 +123,10 @@ module Distillery
123
123
 
124
124
  private
125
125
 
126
+ def contains_content_image?(node)
127
+ node.name == 'img' || node.children.css('img').length == 1
128
+ end
129
+
126
130
  def scorable_elements
127
131
  search('[data-distillery=scorable]')
128
132
  end
@@ -148,9 +152,7 @@ module Distillery
148
152
  top_elements = [top_element]
149
153
 
150
154
  top_element.parent.children.each do |sibling|
151
- if scores[sibling.path] > top_score*0.25 && sibling.path != top_element.path
152
- top_elements << sibling
153
- end
155
+ top_elements << sibling if related_sibling?(top_element, sibling)
154
156
  end
155
157
 
156
158
  top_elements.each do |element|
@@ -162,6 +164,19 @@ module Distillery
162
164
  top_elements
163
165
  end
164
166
 
167
+ def related_sibling?(top_element, sibling)
168
+ score = scores[sibling.path]
169
+ top_score = scores[top_element.path]
170
+ identical = identical_attrubutes?(top_element, sibling)
171
+
172
+ related = (score > top_score*0.25 && sibling.path != top_element.path) ||
173
+ (identical && score > top_score*0.05)
174
+ end
175
+
176
+ def identical_attrubutes?(a, b)
177
+ a['id'] == b['id'] && a['class'] == b['class']
178
+ end
179
+
165
180
  def scorable_div?(elem)
166
181
  elem.name == 'div' &&
167
182
  (has_no_block_children?(elem) || has_only_empty_div_children?(elem))
@@ -1,3 +1,3 @@
1
1
  module Distillery
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.3"
3
3
  end
@@ -115,4 +115,9 @@ distillation_of 'bourbon_balls.html' do
115
115
  should_not =~ /I just tried the recipe forCellar Doo/ # Comments
116
116
  should_not =~ /FIND A STATION/ # Header
117
117
  should_not =~ /Car Talk/ # Footer
118
+ end
119
+
120
+ distillation_of 'bulgogi.html' do
121
+ should =~ /early-season barbecue/
122
+ should =~ /Still, it is American to not fuss/
118
123
  end