busk-ruby-readability 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- busk-ruby-readability (1.1.0)
4
+ busk-ruby-readability (1.1.1)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/lib/readability.rb CHANGED
@@ -42,6 +42,11 @@ module Readability
42
42
  @best_candidate = nil
43
43
  end
44
44
 
45
+ def has_special_rule?
46
+ !!rules[@base_uri]
47
+ end
48
+
49
+
45
50
  def content(remove_unlikely_candidates = true)
46
51
  debug "Starting the content heuristic"
47
52
  @document.css("script, style").each {|el| el.remove }
@@ -50,6 +55,7 @@ module Readability
50
55
  article = youtube if is_youtube? && remove_unlikely_candidates
51
56
  article = vimeo if is_vimeo? && remove_unlikely_candidates
52
57
  article = ted if is_ted? && remove_unlikely_candidates
58
+ article = apply_custom_rule if has_special_rule?
53
59
 
54
60
  if article && remove_unlikely_candidates
55
61
  return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
@@ -87,6 +93,10 @@ module Readability
87
93
  (@base_uri.to_s =~ REGEXES[:videoRe])
88
94
  end
89
95
 
96
+
97
+
98
+
99
+
90
100
  def youtube
91
101
  debug("I have a Youtube video page")
92
102
  if @request =~ /\?v=([_\-a-z0-9]+)&?/i
@@ -141,26 +151,28 @@ module Readability
141
151
 
142
152
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
143
153
  output = Nokogiri::XML::Node.new('div', @document)
144
- best_candidate[:elem].parent.try(:children).each do |sibling|
145
- append = false
146
- append = true if sibling == best_candidate[:elem]
147
- append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
148
-
149
- if sibling.name.downcase == "p"
150
- link_density = get_link_density(sibling)
151
- node_content = sibling.text
152
- node_length = node_content.length
153
-
154
- if node_length > 80 && link_density < 0.25
155
- append = true
156
- elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
157
- append = true
154
+ begin
155
+ best_candidate[:elem].parent.try(:children).each do |sibling|
156
+ append = false
157
+ append = true if sibling == best_candidate[:elem]
158
+ append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
159
+
160
+ if sibling.name.downcase == "p"
161
+ link_density = get_link_density(sibling)
162
+ node_content = sibling.text
163
+ node_length = node_content.length
164
+
165
+ if node_length > 80 && link_density < 0.25
166
+ append = true
167
+ elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
168
+ append = true
169
+ end
158
170
  end
159
- end
160
171
 
161
- if append
162
- sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
163
- output << sibling
172
+ if append
173
+ sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
174
+ output << sibling
175
+ end
164
176
  end
165
177
  end
166
178
 
@@ -284,14 +296,7 @@ module Readability
284
296
  debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
285
297
  elem.name = "p"
286
298
  end
287
- else
288
- # wrap text nodes in p tags
289
- # elem.children.each do |child|
290
- # if child.text?
291
- ## debug("wrapping text node with a p")
292
- # child.swap("<p>#{child.text}</p>")
293
- # end
294
- # end
299
+
295
300
  end
296
301
  end
297
302
  end
@@ -387,5 +392,33 @@ module Readability
387
392
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
388
393
  end
389
394
 
395
+ private
396
+
397
+ def rules
398
+ @rules ||= YAML.load_file("special_rules.yml")["sites"]
399
+ end
400
+
401
+ def apply_custom_rule
402
+ extracted = @document.css(rules[@base_uri]["css"])
403
+ extracted.each do |elem|
404
+ if (elem.try(:inner_html) =~ /^\W*$/)
405
+ extracted.delete elem
406
+ end
407
+ end
408
+ extracted
409
+ end
410
+
390
411
  end
412
+
413
+
414
+ private
415
+
416
+ def remove_empty_tags(chunk)
417
+ chunk.css("p").each do |elem|
418
+ elem.remove if elem.content.strip.empty?
419
+ end
420
+ end
421
+
422
+
423
+
391
424
  end
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
2
2
  s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
3
3
  s.email = "it-team@busk.com"
4
4
  s.homepage = "http://github.com/busk/ruby-readability"
5
- s.version = "1.1.1"
5
+ s.version = "1.2.0"
6
6
  s.name = "busk-ruby-readability"
7
7
  s.summary = "A rewrite of original ruby-readability"
8
8
 
@@ -1,3 +1,4 @@
1
+
1
2
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
3
  <html>
3
4
  <head>