busk-ruby-readability 1.0.3 → 1.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/readability.rb +73 -24
- metadata +4 -4
data/lib/readability.rb
CHANGED
@@ -14,6 +14,20 @@ module Readability
|
|
14
14
|
make_html
|
15
15
|
end
|
16
16
|
|
17
|
+
# def charset
|
18
|
+
# @charset ||= begin
|
19
|
+
# if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
|
20
|
+
# if content_type = content_type[0].match(/charset=([\w-]*)/i)
|
21
|
+
# content_type[1]
|
22
|
+
# else
|
23
|
+
# "utf-8"
|
24
|
+
# end
|
25
|
+
# else
|
26
|
+
# "utf-8"
|
27
|
+
# end
|
28
|
+
# end
|
29
|
+
# end
|
30
|
+
|
17
31
|
def make_html
|
18
32
|
@html = Nokogiri::HTML(@input, nil, 'UTF-8')
|
19
33
|
end
|
@@ -35,39 +49,74 @@ module Readability
|
|
35
49
|
def content(remove_unlikely_candidates = true)
|
36
50
|
@html.css("script, style").each { |i| i.remove }
|
37
51
|
|
38
|
-
|
39
|
-
|
40
|
-
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
41
|
-
best_candidate = select_best_candidate(candidates)
|
42
|
-
article = get_article(candidates, best_candidate)
|
52
|
+
article = youtube if is_youtube? && remove_unlikely_candidates
|
53
|
+
article = vimeo if is_vimeo? && remove_unlikely_candidates
|
43
54
|
|
44
|
-
|
45
|
-
|
55
|
+
if article && remove_unlikely_candidates
|
56
|
+
return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
|
57
|
+
else
|
58
|
+
remove_unlikely_candidates! if remove_unlikely_candidates
|
59
|
+
transform_misused_divs_into_paragraphs!
|
60
|
+
candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
|
61
|
+
best_candidate = select_best_candidate(candidates)
|
62
|
+
article = get_article(candidates, best_candidate)
|
63
|
+
|
64
|
+
cleaned_article = sanitize(article, candidates, options)
|
46
65
|
|
47
|
-
|
48
|
-
|
49
|
-
|
66
|
+
if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
|
67
|
+
make_html
|
68
|
+
content(false)
|
69
|
+
else
|
70
|
+
cleaned_article
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def is_youtube?
|
76
|
+
(@input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
|
77
|
+
end
|
78
|
+
|
79
|
+
def is_vimeo?
|
80
|
+
(@input.base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
|
81
|
+
end
|
82
|
+
|
83
|
+
def is_special_case?
|
84
|
+
(@input.base_uri.to_s =~ REGEXES[:videoRe])
|
85
|
+
end
|
86
|
+
|
87
|
+
def youtube
|
88
|
+
if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
|
89
|
+
Nokogiri::HTML.fragment <<-HTML
|
90
|
+
<object width="480" height="385">
|
91
|
+
<param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&hl=en_US"></param>
|
92
|
+
<param name="allowFullScreen" value="true"></param>
|
93
|
+
<param name="allowscriptaccess" value="always"></param>
|
94
|
+
<embed src="http://www.youtube.com/v/#{$1}?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
|
95
|
+
</object>
|
96
|
+
HTML
|
50
97
|
else
|
51
|
-
|
98
|
+
nil
|
52
99
|
end
|
53
100
|
end
|
54
101
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
<embed src="http://www.youtube.com/v/#{$1}?fs=1&hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
|
64
|
-
</object>
|
65
|
-
HTML
|
102
|
+
def vimeo
|
103
|
+
# matches channel pages
|
104
|
+
if (player = @html.css(".player")).present?
|
105
|
+
html = ""
|
106
|
+
player.each do |video|
|
107
|
+
if video.to_html =~ /clip_id=([0-9]+)/
|
108
|
+
html << "<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>"
|
109
|
+
end
|
66
110
|
end
|
111
|
+
Nokogiri::HTML.fragment(html)
|
112
|
+
# matches non-channel or pages that used swfobject to print player
|
113
|
+
elsif @html.to_html =~ /clip_id=([0-9]+)/
|
114
|
+
Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
|
115
|
+
else
|
116
|
+
nil
|
67
117
|
end
|
68
|
-
content
|
69
118
|
end
|
70
|
-
|
119
|
+
|
71
120
|
def get_article(candidates, best_candidate)
|
72
121
|
# Now that we have the top candidate, look through its siblings for content that might also be related.
|
73
122
|
# Things like preambles, content split by ads that we removed, etc.
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: busk-ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 31
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
- 3
|
10
|
-
version: 1.0.3
|
9
|
+
- 4
|
10
|
+
version: 1.0.4
|
11
11
|
platform: ruby
|
12
12
|
authors: []
|
13
13
|
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-09-
|
18
|
+
date: 2010-09-21 00:00:00 -03:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|