busk-ruby-readability 1.1.1 → 1.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- busk-ruby-readability (1.1.0)
4
+ busk-ruby-readability (1.1.1)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/lib/readability.rb CHANGED
@@ -42,6 +42,11 @@ module Readability
42
42
  @best_candidate = nil
43
43
  end
44
44
 
45
+ def has_special_rule?
46
+ !!rules[@base_uri]
47
+ end
48
+
49
+
45
50
  def content(remove_unlikely_candidates = true)
46
51
  debug "Starting the content heuristic"
47
52
  @document.css("script, style").each {|el| el.remove }
@@ -50,6 +55,7 @@ module Readability
50
55
  article = youtube if is_youtube? && remove_unlikely_candidates
51
56
  article = vimeo if is_vimeo? && remove_unlikely_candidates
52
57
  article = ted if is_ted? && remove_unlikely_candidates
58
+ article = apply_custom_rule if has_special_rule?
53
59
 
54
60
  if article && remove_unlikely_candidates
55
61
  return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
@@ -87,6 +93,10 @@ module Readability
87
93
  (@base_uri.to_s =~ REGEXES[:videoRe])
88
94
  end
89
95
 
96
+
97
+
98
+
99
+
90
100
  def youtube
91
101
  debug("I have a Youtube video page")
92
102
  if @request =~ /\?v=([_\-a-z0-9]+)&?/i
@@ -141,26 +151,28 @@ module Readability
141
151
 
142
152
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
143
153
  output = Nokogiri::XML::Node.new('div', @document)
144
- best_candidate[:elem].parent.try(:children).each do |sibling|
145
- append = false
146
- append = true if sibling == best_candidate[:elem]
147
- append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
148
-
149
- if sibling.name.downcase == "p"
150
- link_density = get_link_density(sibling)
151
- node_content = sibling.text
152
- node_length = node_content.length
153
-
154
- if node_length > 80 && link_density < 0.25
155
- append = true
156
- elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
157
- append = true
154
+ begin
155
+ best_candidate[:elem].parent.try(:children).each do |sibling|
156
+ append = false
157
+ append = true if sibling == best_candidate[:elem]
158
+ append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
159
+
160
+ if sibling.name.downcase == "p"
161
+ link_density = get_link_density(sibling)
162
+ node_content = sibling.text
163
+ node_length = node_content.length
164
+
165
+ if node_length > 80 && link_density < 0.25
166
+ append = true
167
+ elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
168
+ append = true
169
+ end
158
170
  end
159
- end
160
171
 
161
- if append
162
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
163
- output << sibling
172
+ if append
173
+ sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
174
+ output << sibling
175
+ end
164
176
  end
165
177
  end
166
178
 
@@ -284,14 +296,7 @@ module Readability
284
296
  debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
285
297
  elem.name = "p"
286
298
  end
287
- else
288
- # wrap text nodes in p tags
289
- # elem.children.each do |child|
290
- # if child.text?
291
- ## debug("wrapping text node with a p")
292
- # child.swap("<p>#{child.text}</p>")
293
- # end
294
- # end
299
+
295
300
  end
296
301
  end
297
302
  end
@@ -387,5 +392,33 @@ module Readability
387
392
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
388
393
  end
389
394
 
395
+ private
396
+
397
+ def rules
398
+ @rules ||= YAML.load_file("special_rules.yml")["sites"]
399
+ end
400
+
401
+ def apply_custom_rule
402
+ extracted = @document.css(rules[@base_uri]["css"])
403
+ extracted.each do |elem|
404
+ if (elem.try(:inner_html) =~ /^\W*$/)
405
+ extracted.delete elem
406
+ end
407
+ end
408
+ extracted
409
+ end
410
+
390
411
  end
412
+
413
+
414
+ private
415
+
416
+ def remove_empty_tags(chunk)
417
+ chunk.css("p").each do |elem|
418
+ elem.remove if elem.content.strip.empty?
419
+ end
420
+ end
421
+
422
+
423
+
391
424
  end
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
2
2
  s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
3
3
  s.email = "it-team@busk.com"
4
4
  s.homepage = "http://github.com/busk/ruby-readability"
5
- s.version = "1.1.1"
5
+ s.version = "1.2.0"
6
6
  s.name = "busk-ruby-readability"
7
7
  s.summary = "A rewrite of original ruby-readability"
8
8
 
@@ -1,3 +1,4 @@
1
+
1
2
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
3
  <html>
3
4
  <head>