ruby-readability 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -101,7 +101,7 @@ describe Readability do
101
101
  <p id="some_text2">some more text</p>
102
102
  </div>
103
103
  </body>
104
- </html>
104
+ </html><!-- " -->
105
105
  HTML
106
106
  @candidates = @doc.score_paragraphs(0)
107
107
  end
@@ -117,6 +117,37 @@ describe Readability do
117
117
  end
118
118
  end
119
119
 
120
+ describe "score_paragraphs" do
121
+ context "when two consecutive br tags are used instead of p" do
122
+ before :each do
123
+ @doc = Readability::Document.new(<<-HTML)
124
+ <html>
125
+ <head>
126
+ <title>title!</title>
127
+ </head>
128
+ <body id="body">
129
+ <div id="post1">
130
+ This is the main content!<br/><br/>
131
+ Zebra found killed butcher with the chainsaw.<br/><br/>
132
+ If only I could think of an example, oh, wait.
133
+ </div>
134
+ <div id="post2">
135
+ This is not the content and although it's longer if you measure it in characters,
136
+ it's supposed to have lower score than the previous paragraph. And it's only because
137
+ of the previous paragraph is not one paragraph, it's three subparagraphs
138
+ </div>
139
+ </body>
140
+ </html>
141
+ HTML
142
+ @candidates = @doc.score_paragraphs(0)
143
+ end
144
+
145
+ it "should assign the higher score to the first paragraph in this particular example" do
146
+ @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
147
+ end
148
+ end
149
+ end
150
+
120
151
  describe "the cant_read.html fixture" do
121
152
  it "should work on the cant_read.html fixture with some allowed tags" do
122
153
  allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
@@ -147,7 +178,25 @@ describe Readability do
147
178
  @doc.content.should_not match("sidebar")
148
179
  end
149
180
  end
150
-
181
+
182
+ describe "inserting space for block elements" do
183
+ before do
184
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
185
+ <html><head><title>title!</title></head>
186
+ <body>
187
+ <div>
188
+ <p>a<br>b<hr>c<address>d</address>f</p>
189
+ </div>
190
+ </body>
191
+ </html>
192
+ HTML
193
+ end
194
+
195
+ it "should not return the sidebar" do
196
+ @doc.content.should_not match("a b c d f")
197
+ end
198
+ end
199
+
151
200
  describe "outputs good stuff for known documents" do
152
201
  before do
153
202
  @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
data/spec/spec.opts ADDED
@@ -0,0 +1,4 @@
1
+ --colour
2
+ --format s -c
3
+ --loadby mtime
4
+ --reverse
metadata CHANGED
@@ -1,29 +1,35 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
+ hash: 21
4
5
  prerelease: false
5
6
  segments:
6
7
  - 0
8
+ - 2
7
9
  - 1
8
- - 0
9
- version: 0.1.0
10
+ version: 0.2.1
10
11
  platform: ruby
11
12
  authors:
13
+ - Andrew Cantino
14
+ - starrhorne
15
+ - libc
12
16
  - Kyle Maxwell
13
17
  autorequire:
14
18
  bindir: bin
15
19
  cert_chain: []
16
20
 
17
- date: 2010-08-30 00:00:00 -07:00
21
+ date: 2010-11-07 00:00:00 -07:00
18
22
  default_executable: readability
19
23
  dependencies:
20
24
  - !ruby/object:Gem::Dependency
21
25
  name: rspec
22
26
  prerelease: false
23
27
  requirement: &id001 !ruby/object:Gem::Requirement
28
+ none: false
24
29
  requirements:
25
30
  - - ">="
26
31
  - !ruby/object:Gem::Version
32
+ hash: 13
27
33
  segments:
28
34
  - 1
29
35
  - 2
@@ -31,8 +37,8 @@ dependencies:
31
37
  version: 1.2.9
32
38
  type: :development
33
39
  version_requirements: *id001
34
- description: ruby-readability
35
- email: kmaxwell@twitter.com
40
+ description: Port of arc90's readability project to ruby
41
+ email: andrew@iterationlabs.com
36
42
  executables:
37
43
  - readability
38
44
  extensions: []
@@ -47,18 +53,23 @@ files:
47
53
  - VERSION
48
54
  - bin/readability
49
55
  - lib/readability.rb
50
- - lib/readability_old.rb
56
+ - ruby-readability.gemspec
51
57
  - spec/fixtures/cant_read.html
52
58
  - spec/fixtures/sample.html
59
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
60
+ - spec/fixtures/samples/blogpost_with_links.html
53
61
  - spec/fixtures/samples/channel4-1-fragments.rb
54
62
  - spec/fixtures/samples/channel4-1.html
63
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
64
+ - spec/fixtures/samples/foxnews-india1.html
55
65
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
56
66
  - spec/fixtures/samples/globemail-ottawa-cuts.html
57
67
  - spec/fixtures/should_not_truncate.txt
58
68
  - spec/readability_spec.rb
69
+ - spec/spec.opts
59
70
  - spec/spec_helper.rb
60
71
  has_rdoc: true
61
- homepage: http://github.com/fizx/ruby-readability
72
+ homepage: http://github.com/iterationlabs/ruby-readability
62
73
  licenses: []
63
74
 
64
75
  post_install_message:
@@ -67,28 +78,34 @@ rdoc_options:
67
78
  require_paths:
68
79
  - lib
69
80
  required_ruby_version: !ruby/object:Gem::Requirement
81
+ none: false
70
82
  requirements:
71
83
  - - ">="
72
84
  - !ruby/object:Gem::Version
85
+ hash: 3
73
86
  segments:
74
87
  - 0
75
88
  version: "0"
76
89
  required_rubygems_version: !ruby/object:Gem::Requirement
90
+ none: false
77
91
  requirements:
78
92
  - - ">="
79
93
  - !ruby/object:Gem::Version
94
+ hash: 3
80
95
  segments:
81
96
  - 0
82
97
  version: "0"
83
98
  requirements: []
84
99
 
85
100
  rubyforge_project:
86
- rubygems_version: 1.3.6
101
+ rubygems_version: 1.3.7
87
102
  signing_key:
88
103
  specification_version: 3
89
- summary: ruby-readability
104
+ summary: Port of arc90's readability project to ruby
90
105
  test_files:
106
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
91
107
  - spec/fixtures/samples/channel4-1-fragments.rb
108
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
92
109
  - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
93
110
  - spec/readability_spec.rb
94
111
  - spec/spec_helper.rb
@@ -1,74 +0,0 @@
1
- require 'rubygems'
2
- require 'nokogiri'
3
-
4
- module Readability
5
- class Document
6
-
7
- def initialize(input, options = {})
8
- @options = options
9
- @html = Nokogiri::HTML(input, nil, 'UTF-8')
10
- end
11
-
12
-
13
- def content
14
-
15
- # Get all parent elements containing a <p> tag
16
- @parents = @html.css("p").map { |p| p.parent }.compact.uniq
17
-
18
- sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0])
19
-
20
- end
21
-
22
- def score(parent)
23
- s = 0
24
-
25
- # Adjust score based on parent's "class" attribute
26
- s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i
27
- s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
28
-
29
- # Adjust score based on parent id
30
- s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i
31
- s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i
32
-
33
- # Adjust score based on # of <p> elements inside parent
34
- s += parent.css("p").size
35
-
36
- # Adjust score based on # of commas inside parent
37
- s += parent.text.count ","
38
-
39
- s
40
- end
41
-
42
- def sanitize(node)
43
-
44
- # Get rid of divs full of non-text items
45
- node.css("div").each do |el|
46
- counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
47
- el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"])
48
- end
49
-
50
- # We'll sanitize all elements using a whitelist
51
- whitelist = @options[:tags] || %w[div p]
52
-
53
- # Use a hash for speed (don't want to make a million calls to include?)
54
- whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
55
-
56
- ([node] + node.css("*")).each do |el|
57
-
58
- # If element is in whitelist, delete all its attributes
59
- if whitelist[el.node_name]
60
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
61
-
62
- # Otherwise, replace the element with its contents
63
- else
64
- el.swap(el.text)
65
- end
66
-
67
- end
68
-
69
- # Get rid of duplicate whitespace
70
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
71
- end
72
-
73
- end
74
- end