ruby-readability 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README +1 -1
- data/Rakefile +5 -5
- data/VERSION +1 -1
- data/lib/readability.rb +75 -36
- data/ruby-readability.gemspec +71 -0
- data/spec/fixtures/samples/blogpost_with_links-fragments.rb +9 -0
- data/spec/fixtures/samples/blogpost_with_links.html +137 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +1 -2
- data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
- data/spec/fixtures/samples/foxnews-india1.html +2058 -0
- data/spec/readability_spec.rb +51 -2
- data/spec/spec.opts +4 -0
- metadata +26 -9
- data/lib/readability_old.rb +0 -74
data/spec/readability_spec.rb
CHANGED
@@ -101,7 +101,7 @@ describe Readability do
|
|
101
101
|
<p id="some_text2">some more text</p>
|
102
102
|
</div>
|
103
103
|
</body>
|
104
|
-
</html
|
104
|
+
</html><!-- " -->
|
105
105
|
HTML
|
106
106
|
@candidates = @doc.score_paragraphs(0)
|
107
107
|
end
|
@@ -117,6 +117,37 @@ describe Readability do
|
|
117
117
|
end
|
118
118
|
end
|
119
119
|
|
120
|
+
describe "score_paragraphs" do
|
121
|
+
context "when two consequent br tags are used instead of p" do
|
122
|
+
before :each do
|
123
|
+
@doc = Readability::Document.new(<<-HTML)
|
124
|
+
<html>
|
125
|
+
<head>
|
126
|
+
<title>title!</title>
|
127
|
+
</head>
|
128
|
+
<body id="body">
|
129
|
+
<div id="post1">
|
130
|
+
This is the main content!<br/><br/>
|
131
|
+
Zebra found killed butcher with the chainsaw.<br/><br/>
|
132
|
+
If only I could think of an example, oh, wait.
|
133
|
+
</div>
|
134
|
+
<div id="post2">
|
135
|
+
This is not the content and although it's longer if you meaure it in characters,
|
136
|
+
it's supposed to have lower score than the previous paragraph. And it's only because
|
137
|
+
of the previous paragraph is not one paragraph, it's three subparagraphs
|
138
|
+
</div>
|
139
|
+
</body>
|
140
|
+
</html>
|
141
|
+
HTML
|
142
|
+
@candidates = @doc.score_paragraphs(0)
|
143
|
+
end
|
144
|
+
|
145
|
+
it "should assign the higher score to the first paragraph in this particular example" do
|
146
|
+
@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
120
151
|
describe "the cant_read.html fixture" do
|
121
152
|
it "should work on the cant_read.html fixture with some allowed tags" do
|
122
153
|
allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
|
@@ -147,7 +178,25 @@ describe Readability do
|
|
147
178
|
@doc.content.should_not match("sidebar")
|
148
179
|
end
|
149
180
|
end
|
150
|
-
|
181
|
+
|
182
|
+
describe "inserting space for block elements" do
|
183
|
+
before do
|
184
|
+
@doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
|
185
|
+
<html><head><title>title!</title></head>
|
186
|
+
<body>
|
187
|
+
<div>
|
188
|
+
<p>a<br>b<hr>c<address>d</address>f/p>
|
189
|
+
</div>
|
190
|
+
</body>
|
191
|
+
</html>
|
192
|
+
HTML
|
193
|
+
end
|
194
|
+
|
195
|
+
it "should not return the sidebar" do
|
196
|
+
@doc.content.should_not match("a b c d f")
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
151
200
|
describe "outputs good stuff for known documents" do
|
152
201
|
before do
|
153
202
|
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
|
data/spec/spec.opts
ADDED
metadata
CHANGED
@@ -1,29 +1,35 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
8
|
+
- 2
|
7
9
|
- 1
|
8
|
-
|
9
|
-
version: 0.1.0
|
10
|
+
version: 0.2.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
13
|
+
- Andrew Cantino
|
14
|
+
- starrhorne
|
15
|
+
- libc
|
12
16
|
- Kyle Maxwell
|
13
17
|
autorequire:
|
14
18
|
bindir: bin
|
15
19
|
cert_chain: []
|
16
20
|
|
17
|
-
date: 2010-
|
21
|
+
date: 2010-11-07 00:00:00 -07:00
|
18
22
|
default_executable: readability
|
19
23
|
dependencies:
|
20
24
|
- !ruby/object:Gem::Dependency
|
21
25
|
name: rspec
|
22
26
|
prerelease: false
|
23
27
|
requirement: &id001 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
24
29
|
requirements:
|
25
30
|
- - ">="
|
26
31
|
- !ruby/object:Gem::Version
|
32
|
+
hash: 13
|
27
33
|
segments:
|
28
34
|
- 1
|
29
35
|
- 2
|
@@ -31,8 +37,8 @@ dependencies:
|
|
31
37
|
version: 1.2.9
|
32
38
|
type: :development
|
33
39
|
version_requirements: *id001
|
34
|
-
description: ruby
|
35
|
-
email:
|
40
|
+
description: Port of arc90's readability project to ruby
|
41
|
+
email: andrew@iterationlabs.com
|
36
42
|
executables:
|
37
43
|
- readability
|
38
44
|
extensions: []
|
@@ -47,18 +53,23 @@ files:
|
|
47
53
|
- VERSION
|
48
54
|
- bin/readability
|
49
55
|
- lib/readability.rb
|
50
|
-
-
|
56
|
+
- ruby-readability.gemspec
|
51
57
|
- spec/fixtures/cant_read.html
|
52
58
|
- spec/fixtures/sample.html
|
59
|
+
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
60
|
+
- spec/fixtures/samples/blogpost_with_links.html
|
53
61
|
- spec/fixtures/samples/channel4-1-fragments.rb
|
54
62
|
- spec/fixtures/samples/channel4-1.html
|
63
|
+
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
64
|
+
- spec/fixtures/samples/foxnews-india1.html
|
55
65
|
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
56
66
|
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
57
67
|
- spec/fixtures/should_not_truncate.txt
|
58
68
|
- spec/readability_spec.rb
|
69
|
+
- spec/spec.opts
|
59
70
|
- spec/spec_helper.rb
|
60
71
|
has_rdoc: true
|
61
|
-
homepage: http://github.com/
|
72
|
+
homepage: http://github.com/iterationlabs/ruby-readability
|
62
73
|
licenses: []
|
63
74
|
|
64
75
|
post_install_message:
|
@@ -67,28 +78,34 @@ rdoc_options:
|
|
67
78
|
require_paths:
|
68
79
|
- lib
|
69
80
|
required_ruby_version: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
70
82
|
requirements:
|
71
83
|
- - ">="
|
72
84
|
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
73
86
|
segments:
|
74
87
|
- 0
|
75
88
|
version: "0"
|
76
89
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
77
91
|
requirements:
|
78
92
|
- - ">="
|
79
93
|
- !ruby/object:Gem::Version
|
94
|
+
hash: 3
|
80
95
|
segments:
|
81
96
|
- 0
|
82
97
|
version: "0"
|
83
98
|
requirements: []
|
84
99
|
|
85
100
|
rubyforge_project:
|
86
|
-
rubygems_version: 1.3.
|
101
|
+
rubygems_version: 1.3.7
|
87
102
|
signing_key:
|
88
103
|
specification_version: 3
|
89
|
-
summary: ruby
|
104
|
+
summary: Port of arc90's readability project to ruby
|
90
105
|
test_files:
|
106
|
+
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
91
107
|
- spec/fixtures/samples/channel4-1-fragments.rb
|
108
|
+
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
92
109
|
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
93
110
|
- spec/readability_spec.rb
|
94
111
|
- spec/spec_helper.rb
|
data/lib/readability_old.rb
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'nokogiri'
|
3
|
-
|
4
|
-
module Readability
|
5
|
-
class Document
|
6
|
-
|
7
|
-
def initialize(input, options = {})
|
8
|
-
@options = options
|
9
|
-
@html = Nokogiri::HTML(input, nil, 'UTF-8')
|
10
|
-
end
|
11
|
-
|
12
|
-
|
13
|
-
def content
|
14
|
-
|
15
|
-
# Get all parent elements containing a <p> tag
|
16
|
-
@parents = @html.css("p").map { |p| p.parent }.compact.uniq
|
17
|
-
|
18
|
-
sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0])
|
19
|
-
|
20
|
-
end
|
21
|
-
|
22
|
-
def score(parent)
|
23
|
-
s = 0
|
24
|
-
|
25
|
-
# Adjust score based on parent's "class" attribute
|
26
|
-
s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i
|
27
|
-
s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
|
28
|
-
|
29
|
-
# Adjust score based on parent id
|
30
|
-
s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i
|
31
|
-
s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i
|
32
|
-
|
33
|
-
# Adjust score based on # of <p> elements inside parent
|
34
|
-
s += parent.css("p").size
|
35
|
-
|
36
|
-
# Adjust score based on # of commas inside parent
|
37
|
-
s += parent.text.count ","
|
38
|
-
|
39
|
-
s
|
40
|
-
end
|
41
|
-
|
42
|
-
def sanitize(node)
|
43
|
-
|
44
|
-
# Get rid of divs full of non-text items
|
45
|
-
node.css("div").each do |el|
|
46
|
-
counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
|
47
|
-
el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"])
|
48
|
-
end
|
49
|
-
|
50
|
-
# We'll sanitize all elements using a whitelist
|
51
|
-
whitelist = @options[:tags] || %w[div p]
|
52
|
-
|
53
|
-
# Use a hash for speed (don't want to make a million calls to include?)
|
54
|
-
whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
|
55
|
-
|
56
|
-
([node] + node.css("*")).each do |el|
|
57
|
-
|
58
|
-
# If element is in whitelist, delete all its attributes
|
59
|
-
if whitelist[el.node_name]
|
60
|
-
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
|
61
|
-
|
62
|
-
# Otherwise, replace the element with its contents
|
63
|
-
else
|
64
|
-
el.swap(el.text)
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
|
69
|
-
# Get rid of duplicate whitespace
|
70
|
-
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
|
71
|
-
end
|
72
|
-
|
73
|
-
end
|
74
|
-
end
|