pismo 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'open-uri'
5
+ require 'yaml'
6
+ begin; require 'turn'; rescue LoadError; end
7
+
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'pismo'
11
+
12
+ class Test::Unit::TestCase
13
+ include Pismo
14
+ HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus"
15
+ end
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
+ class TestCorpus < Test::Unit::TestCase
4
+
5
+ context "A corpus of HTML documents" do
6
+ setup do
7
+ # Load the corpus files' HTML content into a hash
8
+ @corpus = {}
9
+ Dir[HTML_DIRECTORY + "/*.html"].each { |filename| @corpus[File.basename(filename).sub(/\.html$/, '').to_sym] = File.read(filename) }
10
+
11
+ # Load the "expected metadata" ready for tests
12
+ @metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
13
+ end
14
+
15
+ should "pass basic sanitization and result in Nokogiri documents" do
16
+ @corpus.values.each do |html|
17
+ doc = Document.new(html)
18
+ assert doc.html.length > 1000
19
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
20
+ end
21
+ end
22
+
23
+ should "pass metadata extraction tests" do
24
+
25
+ @metadata.each do |file, expected|
26
+ @doc = Document.new(@corpus[file])
27
+ expected.each do |k, v|
28
+ assert_equal v, @doc.send(k)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,34 @@
1
+ require 'helper'
2
+
3
+ class TestPismoDocument < Test::Unit::TestCase
4
+ context "Pismo::Document" do
5
+ should "process an IO/File object" do
6
+ doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html"))
7
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
8
+ end
9
+ end
10
+
11
+ context "A very basic Pismo document" do
12
+ setup do
13
+ @doc = Document.new(%{<html><body><h1>Hello</h1></body></html>})
14
+ end
15
+
16
+ should "pass sanitization" do
17
+ assert_equal %{<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">\n<html><body><h1>Hello</h1></body></html>\n}, @doc.html
18
+ end
19
+
20
+ should "result in a Nokogiri document" do
21
+ assert @doc.doc.kind_of?(Nokogiri::HTML::Document)
22
+ end
23
+ end
24
+
25
+ context "A basic real world blog post" do
26
+ setup do
27
+ @doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html"))
28
+ end
29
+
30
+ should "provide a title" do
31
+ assert_equal "CoffeeScript: A New Language With A Pure Ruby Compiler", @doc.title
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,152 @@
1
+ require 'helper'
2
+
3
+ class TestReadability < Test::Unit::TestCase # exercises Readability::Document against small inline HTML fixtures and one corpus file
4
+ context "Readability" do
5
+ setup do
6
+ @simple_html_fixture = <<-HTML
7
+ <html>
8
+ <head>
9
+ <title>title!</title>
10
+ </head>
11
+ <body class='comment'>
12
+ <div>
13
+ <p class='comment'>a comment</p>
14
+ <div class='comment' id='body'>real content</div>
15
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
16
+ </div>
17
+ </body>
18
+ </html>
19
+ HTML
20
+ end
21
+ # #body holds only text and should become a <p>; #contains_blockquote wraps a block element and should stay a <div>
22
+ context "transformMisusedDivsIntoParagraphs" do
23
+ setup do
24
+ @doc = Readability::Document.new(@simple_html_fixture)
25
+ @doc.transform_misused_divs_into_paragraphs!
26
+ end
27
+
28
+ should "transform divs containing no block elements into <p>s" do
29
+ assert_equal "p", @doc.html.css("#body").first.name
30
+ end
31
+
32
+ should "not transform divs that contain block elements" do
33
+ assert_equal "div", @doc.html.css("#contains_blockquote").first.name
34
+ end
35
+ end
36
+ # score_node is called with an element and its result is indexed with :content_score for comparison
37
+ context "score_node" do
38
+ setup do
39
+ @doc = Readability::Document.new(<<-HTML)
40
+ <html>
41
+ <body>
42
+ <div id='elem1'>
43
+ <p>some content</p>
44
+ </div>
45
+ <th id='elem2'>
46
+ <p>some other content</p>
47
+ </th>
48
+ </body>
49
+ </html>
50
+ HTML
51
+ @elem1 = @doc.html.css("#elem1").first
52
+ @elem2 = @doc.html.css("#elem2").first
53
+ end
54
+
55
+ should "like <div>s more than <th>s" do
56
+ assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
57
+ end
58
+ # Both elements are first made <div>s and checked to score equally, so the later difference can only come from the class names
59
+ should "like classes like text more than classes like comment" do
60
+ @elem2.name = "div"
61
+ assert_equal @doc.score_node(@elem2)[:content_score], @doc.score_node(@elem1)[:content_score]
62
+ @elem1['class'] = "text"
63
+ @elem2['class'] = "comment"
64
+ assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
65
+ end
66
+ end
67
+ # The class='comment' paragraph should be dropped, while <body> and the element with id='body' must survive
68
+ context "remove_unlikely_candidates!" do
69
+ setup do
70
+ @doc = Readability::Document.new(@simple_html_fixture)
71
+ @doc.remove_unlikely_candidates!
72
+ end
73
+
74
+ should "remove things that have class comment" do
75
+ assert @doc.html.inner_html !~ /a comment/
76
+ end
77
+
78
+ should "not remove body tags" do
79
+ assert @doc.html.inner_html =~ /<\/body>/
80
+ end
81
+
82
+ should "not remove things with class comment and id body" do
83
+ assert @doc.html.inner_html =~ /real content/
84
+ end
85
+ end
86
+ # NOTE(review): the div2 id attribute in the fixture below has no closing quote; the expected count of 3 may depend on how the parser recovers from that - confirm before "fixing" the markup
87
+ context "score_paragraphs" do
88
+ setup do
89
+ @doc = Readability::Document.new(%{
90
+ <html>
91
+ <head>
92
+ <title>title!</title>
93
+ </head>
94
+ <body id="body">
95
+ <div id="div1">
96
+ <div id="div2>
97
+ <p id="some_comment">a comment</p>
98
+ </div>
99
+ <p id="some_text">some text</p>
100
+ </div>
101
+ <div id="div3">
102
+ <p id="some_text2">some more text</p>
103
+ </div>
104
+ </body>
105
+ </html>
106
+ })
107
+ @candidates = @doc.score_paragraphs(0)
108
+ end
109
+
110
+ should "score elements in the document" do
111
+ assert_equal 3, @candidates.values.length
112
+ end
113
+
114
+ should "prefer the body in this particular example" do
115
+ assert_equal "body", @candidates.values.sort { |a, b|
116
+ b[:content_score] <=> a[:content_score]
117
+ }.first[:elem][:id]
118
+ end
119
+ end
120
+ # Reads cant_read.html from HTML_DIRECTORY, passes :tags / :attributes lists, and expects a known sentence in the extracted content
121
+ context "the cant_read.html fixture" do
122
+ should "work on the cant_read.html fixture with some allowed tags" do
123
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
124
+ allowed_attributes = %w[href]
125
+ html = File.read(HTML_DIRECTORY + "/cant_read.html")
126
+ assert Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.match(/Can you talk a little about how you developed the looks for the/)
127
+ end
128
+ end
129
+ # NOTE(review): presumably the low :min_text_length / :retry_length values let this very short document yield content - confirm against Readability::Document
130
+ context "general functionality" do
131
+ setup do
132
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
133
+ :min_text_length => 0, :retry_length => 1)
134
+ end
135
+
136
+ should "return the main page content" do
137
+ assert @doc.content.match("Some content")
138
+ end
139
+ end
140
+ # Same document plus a <div class='sidebar'>; its text must not appear in the extracted content
141
+ context "ignoring sidebars" do
142
+ setup do
143
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
144
+ :min_text_length => 0, :retry_length => 1)
145
+ end
146
+
147
+ should "not return the sidebar" do
148
+ assert !@doc.content.match("sidebar")
149
+ end
150
+ end
151
+ end
152
+ end
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pismo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Peter Cooper
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-26 00:00:00 +00:00
13
+ default_executable: pismo
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: nokogiri
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: loofah
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: httparty
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: fast-stemmer
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ - !ruby/object:Gem::Dependency
66
+ name: chronic
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, del.icio.us tags, first image used in the content block, etc.
76
+ email: git@peterc.org
77
+ executables:
78
+ - pismo
79
+ extensions: []
80
+
81
+ extra_rdoc_files:
82
+ - LICENSE
83
+ - README.rdoc
84
+ files:
85
+ - .document
86
+ - .gitignore
87
+ - LICENSE
88
+ - README.rdoc
89
+ - Rakefile
90
+ - VERSION
91
+ - bin/pismo
92
+ - lib/pismo.rb
93
+ - lib/pismo/document.rb
94
+ - lib/pismo/external_attributes.rb
95
+ - lib/pismo/internal_attributes.rb
96
+ - lib/pismo/readability.rb
97
+ - lib/pismo/stopwords.txt
98
+ - pismo.gemspec
99
+ - test/corpus/bbcnews.html
100
+ - test/corpus/briancray.html
101
+ - test/corpus/cant_read.html
102
+ - test/corpus/factor.html
103
+ - test/corpus/huffington.html
104
+ - test/corpus/metadata_expected.yaml
105
+ - test/corpus/rubyinside.html
106
+ - test/corpus/rww.html
107
+ - test/corpus/spolsky.html
108
+ - test/corpus/techcrunch.html
109
+ - test/corpus/youtube.html
110
+ - test/helper.rb
111
+ - test/test_corpus.rb
112
+ - test/test_pismo_document.rb
113
+ - test/test_readability.rb
114
+ has_rdoc: true
115
+ homepage: http://github.com/peterc/pismo
116
+ licenses: []
117
+
118
+ post_install_message:
119
+ rdoc_options:
120
+ - --charset=UTF-8
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: "0"
128
+ version:
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: "0"
134
+ version:
135
+ requirements: []
136
+
137
+ rubyforge_project:
138
+ rubygems_version: 1.3.5
139
+ signing_key:
140
+ specification_version: 3
141
+ summary: Extracts or retrieves content-related metadata from HTML pages
142
+ test_files:
143
+ - test/helper.rb
144
+ - test/test_corpus.rb
145
+ - test/test_pismo_document.rb
146
+ - test/test_readability.rb