busk-ruby-readability 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2)
  1. data/lib/readability.rb +73 -24
  2. metadata +4 -4
data/lib/readability.rb CHANGED
@@ -14,6 +14,20 @@ module Readability
14
14
  make_html
15
15
  end
16
16
 
17
+ # def charset
18
+ # @charset ||= begin
19
+ # if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
20
+ # if content_type = content_type[0].match(/charset=([\w-]*)/i)
21
+ # content_type[1]
22
+ # else
23
+ # "utf-8"
24
+ # end
25
+ # else
26
+ # "utf-8"
27
+ # end
28
+ # end
29
+ # end
30
+
17
31
  def make_html
18
32
  @html = Nokogiri::HTML(@input, nil, 'UTF-8')
19
33
  end
@@ -35,39 +49,74 @@ module Readability
35
49
  def content(remove_unlikely_candidates = true)
36
50
  @html.css("script, style").each { |i| i.remove }
37
51
 
38
- remove_unlikely_candidates! if remove_unlikely_candidates
39
- transform_misused_divs_into_paragraphs!
40
- candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
41
- best_candidate = select_best_candidate(candidates)
42
- article = get_article(candidates, best_candidate)
52
+ article = youtube if is_youtube? && remove_unlikely_candidates
53
+ article = vimeo if is_vimeo? && remove_unlikely_candidates
43
54
 
44
- cleaned_article = sanitize(article, candidates, options)
45
- cleaned_article = consider_special_cases(cleaned_article)
55
+ if article && remove_unlikely_candidates
56
+ return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
57
+ else
58
+ remove_unlikely_candidates! if remove_unlikely_candidates
59
+ transform_misused_divs_into_paragraphs!
60
+ candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
61
+ best_candidate = select_best_candidate(candidates)
62
+ article = get_article(candidates, best_candidate)
63
+
64
+ cleaned_article = sanitize(article, candidates, options)
46
65
 
47
- if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
48
- make_html
49
- content(false)
66
+ if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
67
+ make_html
68
+ content(false)
69
+ else
70
+ cleaned_article
71
+ end
72
+ end
73
+ end
74
+
75
+ def is_youtube?
76
+ (@input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
77
+ end
78
+
79
+ def is_vimeo?
80
+ (@input.base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
81
+ end
82
+
83
+ def is_special_case?
84
+ (@input.base_uri.to_s =~ REGEXES[:videoRe])
85
+ end
86
+
87
+ def youtube
88
+ if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
89
+ Nokogiri::HTML.fragment <<-HTML
90
+ <object width="480" height="385">
91
+ <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
92
+ <param name="allowFullScreen" value="true"></param>
93
+ <param name="allowscriptaccess" value="always"></param>
94
+ <embed src="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
95
+ </object>
96
+ HTML
50
97
  else
51
- cleaned_article
98
+ nil
52
99
  end
53
100
  end
54
101
 
55
- def consider_special_cases(content)
56
- if @input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/
57
- if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
58
- content = <<-HTML
59
- <object width="480" height="385">
60
- <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
61
- <param name="allowFullScreen" value="true"></param>
62
- <param name="allowscriptaccess" value="always"></param>
63
- <embed src="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
64
- </object>
65
- HTML
102
+ def vimeo
103
+ # matches channel pages
104
+ if (player = @html.css(".player")).present?
105
+ html = ""
106
+ player.each do |video|
107
+ if video.to_html =~ /clip_id=([0-9]+)/
108
+ html << "<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>"
109
+ end
66
110
  end
111
+ Nokogiri::HTML.fragment(html)
112
+ # matches non-channel or pages that used swfobject to print player
113
+ elsif @html.to_html =~ /clip_id=([0-9]+)/
114
+ Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
115
+ else
116
+ nil
67
117
  end
68
- content
69
118
  end
70
-
119
+
71
120
  def get_article(candidates, best_candidate)
72
121
  # Now that we have the top candidate, look through its siblings for content that might also be related.
73
122
  # Things like preambles, content split by ads that we removed, etc.
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 3
10
- version: 1.0.3
9
+ - 4
10
+ version: 1.0.4
11
11
  platform: ruby
12
12
  authors: []
13
13
 
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-20 00:00:00 -03:00
18
+ date: 2010-09-21 00:00:00 -03:00
19
19
  default_executable:
20
20
  dependencies: []
21
21