ruby-readability 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -101,7 +101,7 @@ describe Readability do
101
101
  <p id="some_text2">some more text</p>
102
102
  </div>
103
103
  </body>
104
- </html>
104
+ </html><!-- " -->
105
105
  HTML
106
106
  @candidates = @doc.score_paragraphs(0)
107
107
  end
@@ -117,6 +117,37 @@ describe Readability do
117
117
  end
118
118
  end
119
119
 
120
+ describe "score_paragraphs" do
121
+ context "when two consequent br tags are used instead of p" do
122
+ before :each do
123
+ @doc = Readability::Document.new(<<-HTML)
124
+ <html>
125
+ <head>
126
+ <title>title!</title>
127
+ </head>
128
+ <body id="body">
129
+ <div id="post1">
130
+ This is the main content!<br/><br/>
131
+ Zebra found killed butcher with the chainsaw.<br/><br/>
132
+ If only I could think of an example, oh, wait.
133
+ </div>
134
+ <div id="post2">
135
+ This is not the content and although it's longer if you meaure it in characters,
136
+ it's supposed to have lower score than the previous paragraph. And it's only because
137
+ of the previous paragraph is not one paragraph, it's three subparagraphs
138
+ </div>
139
+ </body>
140
+ </html>
141
+ HTML
142
+ @candidates = @doc.score_paragraphs(0)
143
+ end
144
+
145
+ it "should assign the higher score to the first paragraph in this particular example" do
146
+ @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
147
+ end
148
+ end
149
+ end
150
+
120
151
  describe "the cant_read.html fixture" do
121
152
  it "should work on the cant_read.html fixture with some allowed tags" do
122
153
  allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
@@ -147,7 +178,25 @@ describe Readability do
147
178
  @doc.content.should_not match("sidebar")
148
179
  end
149
180
  end
150
-
181
+
182
+ describe "inserting space for block elements" do
183
+ before do
184
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
185
+ <html><head><title>title!</title></head>
186
+ <body>
187
+ <div>
188
+ <p>a<br>b<hr>c<address>d</address>f/p>
189
+ </div>
190
+ </body>
191
+ </html>
192
+ HTML
193
+ end
194
+
195
+ it "should not return the sidebar" do
196
+ @doc.content.should_not match("a b c d f")
197
+ end
198
+ end
199
+
151
200
  describe "outputs good stuff for known documents" do
152
201
  before do
153
202
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
metadata CHANGED
@@ -1,29 +1,35 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 21
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
8
+ - 2
7
9
  - 1
8
- - 0
9
- version: 0.1.0
10
+ version: 0.2.1
10
11
  platform: ruby
11
12
  authors:
13
+ - Andrew Cantino
14
+ - starrhorne
15
+ - libc
12
16
  - Kyle Maxwell
13
17
  autorequire:
14
18
  bindir: bin
15
19
  cert_chain: []
16
20
 
17
- date: 2010-08-30 00:00:00 -07:00
21
+ date: 2010-11-07 00:00:00 -07:00
18
22
  default_executable: readability
19
23
  dependencies:
20
24
  - !ruby/object:Gem::Dependency
21
25
  name: rspec
22
26
  prerelease: false
23
27
  requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
24
29
  requirements:
25
30
  - - ">="
26
31
  - !ruby/object:Gem::Version
32
+ hash: 13
27
33
  segments:
28
34
  - 1
29
35
  - 2
@@ -31,8 +37,8 @@ dependencies:
31
37
  version: 1.2.9
32
38
  type: :development
33
39
  version_requirements: *id001
34
- description: ruby-readability
35
- email: kmaxwell@twitter.com
40
+ description: Port of arc90's readability project to ruby
41
+ email: andrew@iterationlabs.com
36
42
  executables:
37
43
  - readability
38
44
  extensions: []
@@ -47,18 +53,23 @@ files:
47
53
  - VERSION
48
54
  - bin/readability
49
55
  - lib/readability.rb
50
- - lib/readability_old.rb
56
+ - ruby-readability.gemspec
51
57
  - spec/fixtures/cant_read.html
52
58
  - spec/fixtures/sample.html
59
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
60
+ - spec/fixtures/samples/blogpost_with_links.html
53
61
  - spec/fixtures/samples/channel4-1-fragments.rb
54
62
  - spec/fixtures/samples/channel4-1.html
63
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
64
+ - spec/fixtures/samples/foxnews-india1.html
55
65
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
56
66
  - spec/fixtures/samples/globemail-ottawa-cuts.html
57
67
  - spec/fixtures/should_not_truncate.txt
58
68
  - spec/readability_spec.rb
69
+ - spec/spec.opts
59
70
  - spec/spec_helper.rb
60
71
  has_rdoc: true
61
- homepage: http://github.com/fizx/ruby-readability
72
+ homepage: http://github.com/iterationlabs/ruby-readability
62
73
  licenses: []
63
74
 
64
75
  post_install_message:
@@ -67,28 +78,34 @@ rdoc_options:
67
78
  require_paths:
68
79
  - lib
69
80
  required_ruby_version: !ruby/object:Gem::Requirement
81
+ none: false
70
82
  requirements:
71
83
  - - ">="
72
84
  - !ruby/object:Gem::Version
85
+ hash: 3
73
86
  segments:
74
87
  - 0
75
88
  version: "0"
76
89
  required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
77
91
  requirements:
78
92
  - - ">="
79
93
  - !ruby/object:Gem::Version
94
+ hash: 3
80
95
  segments:
81
96
  - 0
82
97
  version: "0"
83
98
  requirements: []
84
99
 
85
100
  rubyforge_project:
86
- rubygems_version: 1.3.6
101
+ rubygems_version: 1.3.7
87
102
  signing_key:
88
103
  specification_version: 3
89
- summary: ruby-readability
104
+ summary: Port of arc90's readability project to ruby
90
105
  test_files:
106
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
91
107
  - spec/fixtures/samples/channel4-1-fragments.rb
108
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
92
109
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
93
110
  - spec/readability_spec.rb
94
111
  - spec/spec_helper.rb
@@ -1,74 +0,0 @@
1
- require 'rubygems'
2
- require 'nokogiri'
3
-
4
- module Readability
5
- class Document
6
-
7
- def initialize(input, options = {})
8
- @options = options
9
- @html = Nokogiri::HTML(input, nil, 'UTF-8')
10
- end
11
-
12
-
13
- def content
14
-
15
- # Get all parent elements containing a <p> tag
16
- @parents = @html.css("p").map { |p| p.parent }.compact.uniq
17
-
18
- sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0])
19
-
20
- end
21
-
22
- def score(parent)
23
- s = 0
24
-
25
- # Adjust score based on parent's "class" attribute
26
- s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i
27
- s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
28
-
29
- # Adjust score based on parent id
30
- s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i
31
- s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i
32
-
33
- # Adjust score based on # of <p> elements inside parent
34
- s += parent.css("p").size
35
-
36
- # Adjust score based on # of commas inside parent
37
- s += parent.text.count ","
38
-
39
- s
40
- end
41
-
42
- def sanitize(node)
43
-
44
- # Get rid of divs full of non-text items
45
- node.css("div").each do |el|
46
- counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
47
- el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"])
48
- end
49
-
50
- # We'll sanitize all elements using a whitelist
51
- whitelist = @options[:tags] || %w[div p]
52
-
53
- # Use a hash for speed (don't want to make a million calls to include?)
54
- whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
55
-
56
- ([node] + node.css("*")).each do |el|
57
-
58
- # If element is in whitelist, delete all its attributes
59
- if whitelist[el.node_name]
60
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
61
-
62
- # Otherwise, replace the element with its contents
63
- else
64
- el.swap(el.text)
65
- end
66
-
67
- end
68
-
69
- # Get rid of duplicate whitespace
70
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
71
- end
72
-
73
- end
74
- end