pismo 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -28
- data/NOTICE +4 -0
- data/README.markdown +37 -40
- data/Rakefile +3 -2
- data/VERSION +1 -1
- data/bin/pismo +15 -7
- data/lib/pismo/document.rb +2 -2
- data/lib/pismo/internal_attributes.rb +23 -16
- data/lib/pismo/reader.rb +390 -0
- data/lib/pismo.rb +3 -2
- data/pismo.gemspec +23 -15
- data/test/corpus/bbcnews2.html +1575 -0
- data/test/corpus/gmane.html +138 -0
- data/test/corpus/metadata_expected.yaml +20 -5
- data/test/corpus/queness.html +919 -0
- data/test/corpus/reader_expected.yaml +45 -0
- data/test/corpus/tweet.html +360 -0
- data/test/corpus/zefrank.html +535 -0
- data/test/test_corpus.rb +9 -1
- metadata +89 -34
- data/lib/pismo/readability.rb +0 -342
- data/test/test_readability.rb +0 -152
data/test/test_readability.rb
DELETED
@@ -1,152 +0,0 @@
|
|
1
|
-
require 'helper'
|
2
|
-
|
3
|
-
class TestReadability < Test::Unit::TestCase
|
4
|
-
context "Readability" do
|
5
|
-
setup do
|
6
|
-
@simple_html_fixture = <<-HTML
|
7
|
-
<html>
|
8
|
-
<head>
|
9
|
-
<title>title!</title>
|
10
|
-
</head>
|
11
|
-
<body class='comment'>
|
12
|
-
<div>
|
13
|
-
<p class='comment'>a comment</p>
|
14
|
-
<div class='comment' id='body'>real content</div>
|
15
|
-
<div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
|
16
|
-
</div>
|
17
|
-
</body>
|
18
|
-
</html>
|
19
|
-
HTML
|
20
|
-
end
|
21
|
-
|
22
|
-
context "transformMisusedDivsIntoParagraphs" do
|
23
|
-
setup do
|
24
|
-
@doc = Readability::Document.new(@simple_html_fixture)
|
25
|
-
@doc.transform_misused_divs_into_paragraphs!
|
26
|
-
end
|
27
|
-
|
28
|
-
should "transform divs containing no block elements into <p>s" do
|
29
|
-
assert_equal "p", @doc.html.css("#body").first.name
|
30
|
-
end
|
31
|
-
|
32
|
-
should "not transform divs that contain block elements" do
|
33
|
-
assert_equal "div", @doc.html.css("#contains_blockquote").first.name
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
context "score_node" do
|
38
|
-
setup do
|
39
|
-
@doc = Readability::Document.new(<<-HTML)
|
40
|
-
<html>
|
41
|
-
<body>
|
42
|
-
<div id='elem1'>
|
43
|
-
<p>some content</p>
|
44
|
-
</div>
|
45
|
-
<th id='elem2'>
|
46
|
-
<p>some other content</p>
|
47
|
-
</th>
|
48
|
-
</body>
|
49
|
-
</html>
|
50
|
-
HTML
|
51
|
-
@elem1 = @doc.html.css("#elem1").first
|
52
|
-
@elem2 = @doc.html.css("#elem2").first
|
53
|
-
end
|
54
|
-
|
55
|
-
should "like <div>s more than <th>s" do
|
56
|
-
assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
|
57
|
-
end
|
58
|
-
|
59
|
-
should "like classes like text more than classes like comment" do
|
60
|
-
@elem2.name = "div"
|
61
|
-
assert_equal @doc.score_node(@elem2)[:content_score], @doc.score_node(@elem1)[:content_score]
|
62
|
-
@elem1['class'] = "text"
|
63
|
-
@elem2['class'] = "comment"
|
64
|
-
assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
context "remove_unlikely_candidates!" do
|
69
|
-
setup do
|
70
|
-
@doc = Readability::Document.new(@simple_html_fixture)
|
71
|
-
@doc.remove_unlikely_candidates!
|
72
|
-
end
|
73
|
-
|
74
|
-
should "remove things that have class comment" do
|
75
|
-
assert @doc.html.inner_html !~ /a comment/
|
76
|
-
end
|
77
|
-
|
78
|
-
should "not remove body tags" do
|
79
|
-
assert @doc.html.inner_html =~ /<\/body>/
|
80
|
-
end
|
81
|
-
|
82
|
-
should "not remove things with class comment and id body" do
|
83
|
-
assert @doc.html.inner_html =~ /real content/
|
84
|
-
end
|
85
|
-
end
|
86
|
-
|
87
|
-
context "score_paragraphs" do
|
88
|
-
setup do
|
89
|
-
@doc = Readability::Document.new(%{
|
90
|
-
<html>
|
91
|
-
<head>
|
92
|
-
<title>title!</title>
|
93
|
-
</head>
|
94
|
-
<body id="body">
|
95
|
-
<div id="div1">
|
96
|
-
<div id="div2>
|
97
|
-
<p id="some_comment">a comment</p>
|
98
|
-
</div>
|
99
|
-
<p id="some_text">some text</p>
|
100
|
-
</div>
|
101
|
-
<div id="div3">
|
102
|
-
<p id="some_text2">some more text</p>
|
103
|
-
</div>
|
104
|
-
</body>
|
105
|
-
</html>
|
106
|
-
})
|
107
|
-
@candidates = @doc.score_paragraphs(0)
|
108
|
-
end
|
109
|
-
|
110
|
-
should "score elements in the document" do
|
111
|
-
assert_equal 3, @candidates.values.length
|
112
|
-
end
|
113
|
-
|
114
|
-
should "prefer the body in this particular example" do
|
115
|
-
assert_equal "body", @candidates.values.sort { |a, b|
|
116
|
-
b[:content_score] <=> a[:content_score]
|
117
|
-
}.first[:elem][:id]
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
context "the cant_read.html fixture" do
|
122
|
-
should "work on the cant_read.html fixture with some allowed tags" do
|
123
|
-
allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
|
124
|
-
allowed_attributes = %w[href]
|
125
|
-
html = File.read(HTML_DIRECTORY + "/cant_read.html")
|
126
|
-
assert Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.match(/Can you talk a little about how you developed the looks for the/)
|
127
|
-
end
|
128
|
-
end
|
129
|
-
|
130
|
-
context "general functionality" do
|
131
|
-
setup do
|
132
|
-
@doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
|
133
|
-
:min_text_length => 0, :retry_length => 1)
|
134
|
-
end
|
135
|
-
|
136
|
-
should "return the main page content" do
|
137
|
-
assert @doc.content.match("Some content")
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
context "ignoring sidebars" do
|
142
|
-
setup do
|
143
|
-
@doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
|
144
|
-
:min_text_length => 0, :retry_length => 1)
|
145
|
-
end
|
146
|
-
|
147
|
-
should "not return the sidebar" do
|
148
|
-
assert !@doc.content.match("sidebar")
|
149
|
-
end
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|