busk-ruby-readability 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. data/lib/readability.rb +73 -24
  2. metadata +4 -4
data/lib/readability.rb CHANGED
@@ -14,6 +14,20 @@ module Readability
14
14
  make_html
15
15
  end
16
16
 
17
+ # def charset
18
+ # @charset ||= begin
19
+ # if content_type = @input.read.to_s.match(/(<meta\s*([^>]*)http-equiv=['"]?content-type['"]?([^>]*))/i)
20
+ # if content_type = content_type[0].match(/charset=([\w-]*)/i)
21
+ # content_type[1]
22
+ # else
23
+ # "utf-8"
24
+ # end
25
+ # else
26
+ # "utf-8"
27
+ # end
28
+ # end
29
+ # end
30
+
17
31
  def make_html
18
32
  @html = Nokogiri::HTML(@input, nil, 'UTF-8')
19
33
  end
@@ -35,39 +49,74 @@ module Readability
35
49
  def content(remove_unlikely_candidates = true)
36
50
  @html.css("script, style").each { |i| i.remove }
37
51
 
38
- remove_unlikely_candidates! if remove_unlikely_candidates
39
- transform_misused_divs_into_paragraphs!
40
- candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
41
- best_candidate = select_best_candidate(candidates)
42
- article = get_article(candidates, best_candidate)
52
+ article = youtube if is_youtube? && remove_unlikely_candidates
53
+ article = vimeo if is_vimeo? && remove_unlikely_candidates
43
54
 
44
- cleaned_article = sanitize(article, candidates, options)
45
- cleaned_article = consider_special_cases(cleaned_article)
55
+ if article && remove_unlikely_candidates
56
+ return article.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
57
+ else
58
+ remove_unlikely_candidates! if remove_unlikely_candidates
59
+ transform_misused_divs_into_paragraphs!
60
+ candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
61
+ best_candidate = select_best_candidate(candidates)
62
+ article = get_article(candidates, best_candidate)
63
+
64
+ cleaned_article = sanitize(article, candidates, options)
46
65
 
47
- if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
48
- make_html
49
- content(false)
66
+ if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
67
+ make_html
68
+ content(false)
69
+ else
70
+ cleaned_article
71
+ end
72
+ end
73
+ end
74
+
75
+ def is_youtube?
76
+ (@input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/)
77
+ end
78
+
79
+ def is_vimeo?
80
+ (@input.base_uri.to_s =~ /^http:\/\/(www.)?vimeo.com/)
81
+ end
82
+
83
+ def is_special_case?
84
+ (@input.base_uri.to_s =~ REGEXES[:videoRe])
85
+ end
86
+
87
+ def youtube
88
+ if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
89
+ Nokogiri::HTML.fragment <<-HTML
90
+ <object width="480" height="385">
91
+ <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
92
+ <param name="allowFullScreen" value="true"></param>
93
+ <param name="allowscriptaccess" value="always"></param>
94
+ <embed src="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
95
+ </object>
96
+ HTML
50
97
  else
51
- cleaned_article
98
+ nil
52
99
  end
53
100
  end
54
101
 
55
- def consider_special_cases(content)
56
- if @input.base_uri.to_s =~ /^http:\/\/(www\.)?youtube.com/
57
- if @input.base_uri.request_uri =~ /\?v=([_\-a-z0-9]+)&?/i
58
- content = <<-HTML
59
- <object width="480" height="385">
60
- <param name="movie" value="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US"></param>
61
- <param name="allowFullScreen" value="true"></param>
62
- <param name="allowscriptaccess" value="always"></param>
63
- <embed src="http://www.youtube.com/v/#{$1}?fs=1&amp;hl=en_US" type="application/x-shockwave-flash" allowscriptaccess="always" allowfullscreen="true" width="480" height="385"></embed>
64
- </object>
65
- HTML
102
+ def vimeo
103
+ # matches channel pages
104
+ if (player = @html.css(".player")).present?
105
+ html = ""
106
+ player.each do |video|
107
+ if video.to_html =~ /clip_id=([0-9]+)/
108
+ html << "<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>"
109
+ end
66
110
  end
111
+ Nokogiri::HTML.fragment(html)
112
+ # matches non-channel or pages that used swfobject to print player
113
+ elsif @html.to_html =~ /clip_id=([0-9]+)/
114
+ Nokogiri::HTML.fragment("<iframe src=\"http://player.vimeo.com/video/#{$1}\" width=\"572\" height=\"322\" frameborder=\"0\"></iframe>")
115
+ else
116
+ nil
67
117
  end
68
- content
69
118
  end
70
-
119
+
71
120
  def get_article(candidates, best_candidate)
72
121
  # Now that we have the top candidate, look through its siblings for content that might also be related.
73
122
  # Things like preambles, content split by ads that we removed, etc.
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: busk-ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 0
9
- - 3
10
- version: 1.0.3
9
+ - 4
10
+ version: 1.0.4
11
11
  platform: ruby
12
12
  authors: []
13
13
 
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-20 00:00:00 -03:00
18
+ date: 2010-09-21 00:00:00 -03:00
19
19
  default_executable:
20
20
  dependencies: []
21
21