busk-ruby-readability 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/lib/readability.rb +59 -26
- data/ruby-readability.gemspec +1 -1
- data/spec/fixtures/folha.html +1 -0
- data/spec/fixtures/portalodia.com.html +933 -0
- data/spec/fixtures/portalodia_photo.html +970 -0
- data/spec/readability_spec.rb +45 -0
- data/special_rules.yml +5 -0
- metadata +10 -5
data/Gemfile.lock
CHANGED
data/lib/readability.rb
CHANGED
@@ -42,6 +42,11 @@ module Readability
|
|
42
42
|
@best_candidate = nil
|
43
43
|
end
|
44
44
|
|
45
|
+
def has_special_rule?
|
46
|
+
!!rules[@base_uri]
|
47
|
+
end
|
48
|
+
|
49
|
+
|
45
50
|
def content(remove_unlikely_candidates = true)
|
46
51
|
debug "Starting the content heuristic"
|
47
52
|
@document.css("script, style").each {|el| el.remove }
|
@@ -50,6 +55,7 @@ module Readability
|
|
50
55
|
article = youtube if is_youtube? && remove_unlikely_candidates
|
51
56
|
article = vimeo if is_vimeo? && remove_unlikely_candidates
|
52
57
|
article = ted if is_ted? && remove_unlikely_candidates
|
58
|
+
article = apply_custom_rule if has_special_rule?
|
53
59
|
|
54
60
|
if article && remove_unlikely_candidates
|
55
61
|
return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
|
@@ -87,6 +93,10 @@ module Readability
|
|
87
93
|
(@base_uri.to_s =~ REGEXES[:videoRe])
|
88
94
|
end
|
89
95
|
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
90
100
|
def youtube
|
91
101
|
debug("I have a Youtube video page")
|
92
102
|
if @request =~ /\?v=([_\-a-z0-9]+)&?/i
|
@@ -141,26 +151,28 @@ module Readability
|
|
141
151
|
|
142
152
|
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
|
143
153
|
output = Nokogiri::XML::Node.new('div', @document)
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
154
|
+
begin
|
155
|
+
best_candidate[:elem].parent.try(:children).each do |sibling|
|
156
|
+
append = false
|
157
|
+
append = true if sibling == best_candidate[:elem]
|
158
|
+
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
|
159
|
+
|
160
|
+
if sibling.name.downcase == "p"
|
161
|
+
link_density = get_link_density(sibling)
|
162
|
+
node_content = sibling.text
|
163
|
+
node_length = node_content.length
|
164
|
+
|
165
|
+
if node_length > 80 && link_density < 0.25
|
166
|
+
append = true
|
167
|
+
elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
|
168
|
+
append = true
|
169
|
+
end
|
158
170
|
end
|
159
|
-
end
|
160
171
|
|
161
|
-
|
162
|
-
|
163
|
-
|
172
|
+
if append
|
173
|
+
sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
|
174
|
+
output << sibling
|
175
|
+
end
|
164
176
|
end
|
165
177
|
end
|
166
178
|
|
@@ -284,14 +296,7 @@ module Readability
|
|
284
296
|
debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
285
297
|
elem.name = "p"
|
286
298
|
end
|
287
|
-
|
288
|
-
# wrap text nodes in p tags
|
289
|
-
# elem.children.each do |child|
|
290
|
-
# if child.text?
|
291
|
-
## debug("wrapping text node with a p")
|
292
|
-
# child.swap("<p>#{child.text}</p>")
|
293
|
-
# end
|
294
|
-
# end
|
299
|
+
|
295
300
|
end
|
296
301
|
end
|
297
302
|
end
|
@@ -387,5 +392,33 @@ module Readability
|
|
387
392
|
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
|
388
393
|
end
|
389
394
|
|
395
|
+
private
|
396
|
+
|
397
|
+
def rules
|
398
|
+
@rules ||= YAML.load_file("special_rules.yml")["sites"]
|
399
|
+
end
|
400
|
+
|
401
|
+
def apply_custom_rule
|
402
|
+
extracted = @document.css(rules[@base_uri]["css"])
|
403
|
+
extracted.each do |elem|
|
404
|
+
if (elem.try(:inner_html) =~ /^\W*$/)
|
405
|
+
extracted.delete elem
|
406
|
+
end
|
407
|
+
end
|
408
|
+
extracted
|
409
|
+
end
|
410
|
+
|
390
411
|
end
|
412
|
+
|
413
|
+
|
414
|
+
private
|
415
|
+
|
416
|
+
def remove_empty_tags(chunk)
|
417
|
+
chunk.css("p").each do |elem|
|
418
|
+
elem.remove if elem.content.strip.empty?
|
419
|
+
end
|
420
|
+
end
|
421
|
+
|
422
|
+
|
423
|
+
|
391
424
|
end
|
data/ruby-readability.gemspec
CHANGED
@@ -2,7 +2,7 @@ Gem::Specification.new do |s|
|
|
2
2
|
s.authors = ["Fabio Mont Alegre", "Rodrigo Flores"]
|
3
3
|
s.email = "it-team@busk.com"
|
4
4
|
s.homepage = "http://github.com/busk/ruby-readability"
|
5
|
-
s.version = "1.1.1"
|
5
|
+
s.version = "1.2.0"
|
6
6
|
s.name = "busk-ruby-readability"
|
7
7
|
s.summary = "A rewrite of original ruby-readability"
|
8
8
|
|
data/spec/fixtures/folha.html
CHANGED