busk-ruby-readability 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/lib/readability.rb +59 -26
- data/ruby-readability.gemspec +1 -1
- data/spec/fixtures/folha.html +1 -0
- data/spec/fixtures/portalodia.com.html +933 -0
- data/spec/fixtures/portalodia_photo.html +970 -0
- data/spec/readability_spec.rb +45 -0
- data/special_rules.yml +5 -0
- metadata +10 -5
data/Gemfile.lock
CHANGED
data/lib/readability.rb
CHANGED
@@ -42,6 +42,11 @@ module Readability
|
|
42
42
|
@best_candidate = nil
|
43
43
|
end
|
44
44
|
|
45
|
+
def has_special_rule?
|
46
|
+
!!rules[@base_uri]
|
47
|
+
end
|
48
|
+
|
49
|
+
|
45
50
|
def content(remove_unlikely_candidates = true)
|
46
51
|
debug "Starting the content heuristic"
|
47
52
|
@document.css("script, style").each {|el| el.remove }
|
@@ -50,6 +55,7 @@ module Readability
|
|
50
55
|
article = youtube if is_youtube? && remove_unlikely_candidates
|
51
56
|
article = vimeo if is_vimeo? && remove_unlikely_candidates
|
52
57
|
article = ted if is_ted? && remove_unlikely_candidates
|
58
|
+
article = apply_custom_rule if has_special_rule?
|
53
59
|
|
54
60
|
if article && remove_unlikely_candidates
|
55
61
|
return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
|
@@ -87,6 +93,10 @@ module Readability
|
|
87
93
|
(@base_uri.to_s =~ REGEXES[:videoRe])
|
88
94
|
end
|
89
95
|
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
90
100
|
def youtube
|
91
101
|
debug("I have a Youtube video page")
|
92
102
|
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
@@ -141,26 +151,28 @@ module Readability
|
|
141
151
|
|
142
152
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
143
153
|
output = Nokogiri::XML::Node.new('div', @document)
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
154
|
+
begin
|
155
|
+
best_candidate[:elem].parent.try(:children).each do |sibling|
|
156
|
+
append = false
|
157
|
+
append = true if sibling == best_candidate[:elem]
|
158
|
+
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
159
|
+
|
160
|
+
if sibling.name.downcase == "p"
|
161
|
+
link_density = get_link_density(sibling)
|
162
|
+
node_content = sibling.text
|
163
|
+
node_length = node_content.length
|
164
|
+
|
165
|
+
if node_length > 80 && link_density < 0.25
|
166
|
+
append = true
|
167
|
+
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
|
168
|
+
append = true
|
169
|
+
end
|
158
170
|
end
|
159
|
-
end
|
160
171
|
|
161
|
-
|
162
|
-
|
163
|
-
|
172
|
+
if append
|
173
|
+
sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
174
|
+
output << sibling
|
175
|
+
end
|
164
176
|
end
|
165
177
|
end
|
166
178
|
|
@@ -284,14 +296,7 @@ module Readability
|
|
284
296
|
debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
285
297
|
elem.name = "p"
|
286
298
|
end
|
287
|
-
|
288
|
-
# wrap text nodes in p tags
|
289
|
-
# elem.children.each do |child|
|
290
|
-
# if child.text?
|
291
|
-
## debug("wrapping text node with a p")
|
292
|
-
# child.swap("<p>#{child.text}</p>")
|
293
|
-
# end
|
294
|
-
# end
|
299
|
+
|
295
300
|
end
|
296
301
|
end
|
297
302
|
end
|
@@ -387,5 +392,33 @@ module Readability
|
|
387
392
|
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
|
388
393
|
end
|
389
394
|
|
395
|
+
private
|
396
|
+
|
397
|
+
def rules
|
398
|
+
@rules ||= YAML.load_file("special_rules.yml")["sites"]
|
399
|
+
end
|
400
|
+
|
401
|
+
def apply_custom_rule
|
402
|
+
extracted = @document.css(rules[@base_uri]["css"])
|
403
|
+
extracted.each do |elem|
|
404
|
+
if (elem.try(:inner_html) =~ /^\W*$/)
|
405
|
+
extracted.delete elem
|
406
|
+
end
|
407
|
+
end
|
408
|
+
extracted
|
409
|
+
end
|
410
|
+
|
390
411
|
end
|
412
|
+
|
413
|
+
|
414
|
+
private
|
415
|
+
|
416
|
+
def remove_empty_tags(chunk)
|
417
|
+
chunk.css("p").each do |elem|
|
418
|
+
elem.remove if elem.content.strip.empty?
|
419
|
+
end
|
420
|
+
end
|
421
|
+
|
422
|
+
|
423
|
+
|
391
424
|
end
|
data/ruby-readability.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
|
3
3
|
s.email = "it-team@busk.com"
|
4
4
|
s.homepage = "http://github.com/busk/ruby-readability"
|
5
|
-
s.version = "1.1.1"
|
5
|
+
s.version = "1.2.0"
|
6
6
|
s.name = "busk-ruby-readability"
|
7
7
|
s.summary = "A rewrite of original ruby-readability"
|
8
8
|
|
data/spec/fixtures/folha.html
CHANGED