pismo 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/helper.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'open-uri'
5
+ require 'yaml'
6
+ begin; require 'turn'; rescue LoadError; end # 'turn' is optional: a LoadError is ignored
7
+ # Put ../lib and this test directory on the load path before requiring pismo.
8
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
9
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
10
+ require 'pismo'
11
+ # Reopen the base test case so every test class gets the Pismo mixin and the corpus path.
12
+ class Test::Unit::TestCase
13
+ include Pismo
14
+ HTML_DIRECTORY = File.dirname(__FILE__) + "/corpus" # the "corpus" directory next to this file
15
+ end
@@ -0,0 +1,33 @@
1
+ require 'helper'
2
+
3
+ class TestCorpus < Test::Unit::TestCase
4
+
5
+ context "A corpus of HTML documents" do
6
+ setup do
7
+ # Load the corpus files' HTML content into a hash
8
+ @corpus = {}
9
+ Dir[HTML_DIRECTORY + "/*.html"].each { |filename| @corpus[File.basename(filename).sub(/\.html$/, '').to_sym] = File.read(filename) }
10
+
11
+ # Load the "expected metadata" ready for tests
12
+ @metadata = YAML.load(open(HTML_DIRECTORY + "/metadata_expected.yaml"))
13
+ end
14
+
15
+ should "pass basic sanitization and result in Nokogiri documents" do
16
+ @corpus.values.each do |html|
17
+ doc = Document.new(html)
18
+ assert doc.html.length > 1000
19
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
20
+ end
21
+ end
22
+
23
+ should "pass metadata extraction tests" do
24
+
25
+ @metadata.each do |file, expected|
26
+ @doc = Document.new(@corpus[file])
27
+ expected.each do |k, v|
28
+ assert_equal v, @doc.send(k)
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,34 @@
1
+ require 'helper'
2
+ # Tests for building a Document from an IO object and from an HTML string.
3
+ class TestPismoDocument < Test::Unit::TestCase
4
+ context "Pismo::Document" do
5
+ should "process an IO/File object" do
6
+ doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html")) # NOTE(review): the handle returned by open is never closed here
7
+ assert doc.doc.kind_of?(Nokogiri::HTML::Document)
8
+ end
9
+ end
10
+
11
+ context "A very basic Pismo document" do
12
+ setup do
13
+ @doc = Document.new(%{<html><body><h1>Hello</h1></body></html>})
14
+ end
15
+ # The expected html below adds an HTML 4.0 Transitional DOCTYPE and newlines around the input markup.
16
+ should "pass sanitization" do
17
+ assert_equal %{<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">\n<html><body><h1>Hello</h1></body></html>\n}, @doc.html
18
+ end
19
+
20
+ should "result in a Nokogiri document" do
21
+ assert @doc.doc.kind_of?(Nokogiri::HTML::Document)
22
+ end
23
+ end
24
+
25
+ context "A basic real world blog post" do
26
+ setup do
27
+ @doc = Document.new(open(HTML_DIRECTORY + "/rubyinside.html")) # NOTE(review): handle is not closed; a new one is opened for each test in this context
28
+ end
29
+
30
+ should "provide a title" do
31
+ assert_equal "CoffeeScript: A New Language With A Pure Ruby Compiler", @doc.title
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,152 @@
1
+ require 'helper'
2
+ # Tests for Readability::Document: div-to-paragraph transformation, node scoring, candidate removal and content extraction.
3
+ class TestReadability < Test::Unit::TestCase
4
+ context "Readability" do
5
+ setup do
6
+ @simple_html_fixture = <<-HTML
7
+ <html>
8
+ <head>
9
+ <title>title!</title>
10
+ </head>
11
+ <body class='comment'>
12
+ <div>
13
+ <p class='comment'>a comment</p>
14
+ <div class='comment' id='body'>real content</div>
15
+ <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
16
+ </div>
17
+ </body>
18
+ </html>
19
+ HTML
20
+ end
21
+ # In the shared fixture, #body holds only text while #contains_blockquote wraps a <blockquote>.
22
+ context "transformMisusedDivsIntoParagraphs" do
23
+ setup do
24
+ @doc = Readability::Document.new(@simple_html_fixture)
25
+ @doc.transform_misused_divs_into_paragraphs!
26
+ end
27
+
28
+ should "transform divs containing no block elements into <p>s" do
29
+ assert_equal "p", @doc.html.css("#body").first.name
30
+ end
31
+
32
+ should "not transform divs that contain block elements" do
33
+ assert_equal "div", @doc.html.css("#contains_blockquote").first.name
34
+ end
35
+ end
36
+ # The tests below index the result of score_node with :content_score and compare the values.
37
+ context "score_node" do
38
+ setup do
39
+ @doc = Readability::Document.new(<<-HTML)
40
+ <html>
41
+ <body>
42
+ <div id='elem1'>
43
+ <p>some content</p>
44
+ </div>
45
+ <th id='elem2'>
46
+ <p>some other content</p>
47
+ </th>
48
+ </body>
49
+ </html>
50
+ HTML
51
+ @elem1 = @doc.html.css("#elem1").first
52
+ @elem2 = @doc.html.css("#elem2").first
53
+ end
54
+
55
+ should "like <div>s more than <th>s" do
56
+ assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
57
+ end
58
+ # elem2 is renamed to a div first so both elements score equally before the classes are assigned.
59
+ should "like classes like text more than classes like comment" do
60
+ @elem2.name = "div"
61
+ assert_equal @doc.score_node(@elem2)[:content_score], @doc.score_node(@elem1)[:content_score]
62
+ @elem1['class'] = "text"
63
+ @elem2['class'] = "comment"
64
+ assert @doc.score_node(@elem1)[:content_score] > @doc.score_node(@elem2)[:content_score]
65
+ end
66
+ end
67
+ # Uses the shared fixture, where <body>, a <p> and the #body div all carry class 'comment'.
68
+ context "remove_unlikely_candidates!" do
69
+ setup do
70
+ @doc = Readability::Document.new(@simple_html_fixture)
71
+ @doc.remove_unlikely_candidates!
72
+ end
73
+
74
+ should "remove things that have class comment" do
75
+ assert @doc.html.inner_html !~ /a comment/
76
+ end
77
+
78
+ should "not remove body tags" do
79
+ assert @doc.html.inner_html =~ /<\/body>/
80
+ end
81
+
82
+ should "not remove things with class comment and id body" do
83
+ assert @doc.html.inner_html =~ /real content/
84
+ end
85
+ end
86
+ # NOTE(review): the fixture's div2 id attribute has no closing quote (id="div2>); confirm whether that is intentional.
87
+ context "score_paragraphs" do
88
+ setup do
89
+ @doc = Readability::Document.new(%{
90
+ <html>
91
+ <head>
92
+ <title>title!</title>
93
+ </head>
94
+ <body id="body">
95
+ <div id="div1">
96
+ <div id="div2>
97
+ <p id="some_comment">a comment</p>
98
+ </div>
99
+ <p id="some_text">some text</p>
100
+ </div>
101
+ <div id="div3">
102
+ <p id="some_text2">some more text</p>
103
+ </div>
104
+ </body>
105
+ </html>
106
+ })
107
+ @candidates = @doc.score_paragraphs(0)
108
+ end
109
+
110
+ should "score elements in the document" do
111
+ assert_equal 3, @candidates.values.length
112
+ end
113
+ # Sorts the candidates by descending :content_score and checks the id of the top element.
114
+ should "prefer the body in this particular example" do
115
+ assert_equal "body", @candidates.values.sort { |a, b|
116
+ b[:content_score] <=> a[:content_score]
117
+ }.first[:elem][:id]
118
+ end
119
+ end
120
+ # Reads cant_read.html from the corpus directory and passes :tags and :attributes options to the constructor.
121
+ context "the cant_read.html fixture" do
122
+ should "work on the cant_read.html fixture with some allowed tags" do
123
+ allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
124
+ allowed_attributes = %w[href]
125
+ html = File.read(HTML_DIRECTORY + "/cant_read.html")
126
+ assert Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.match(/Can you talk a little about how you developed the looks for the/)
127
+ end
128
+ end
129
+
130
+ context "general functionality" do
131
+ setup do
132
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
133
+ :min_text_length => 0, :retry_length => 1)
134
+ end
135
+
136
+ should "return the main page content" do
137
+ assert @doc.content.match("Some content")
138
+ end
139
+ end
140
+ # NOTE(review): the sidebar fixture ends its paragraph with <p> rather than </p>; confirm whether that is intentional.
141
+ context "ignoring sidebars" do
142
+ setup do
143
+ @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
144
+ :min_text_length => 0, :retry_length => 1)
145
+ end
146
+
147
+ should "not return the sidebar" do
148
+ assert !@doc.content.match("sidebar")
149
+ end
150
+ end
151
+ end
152
+ end
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pismo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Peter Cooper
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-26 00:00:00 +00:00
13
+ default_executable: pismo
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: shoulda
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: nokogiri
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: loofah
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: httparty
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: fast-stemmer
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ - !ruby/object:Gem::Dependency
66
+ name: chronic
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, del.icio.us tags, first image used in the content block, etc.
76
+ email: git@peterc.org
77
+ executables:
78
+ - pismo
79
+ extensions: []
80
+
81
+ extra_rdoc_files:
82
+ - LICENSE
83
+ - README.rdoc
84
+ files:
85
+ - .document
86
+ - .gitignore
87
+ - LICENSE
88
+ - README.rdoc
89
+ - Rakefile
90
+ - VERSION
91
+ - bin/pismo
92
+ - lib/pismo.rb
93
+ - lib/pismo/document.rb
94
+ - lib/pismo/external_attributes.rb
95
+ - lib/pismo/internal_attributes.rb
96
+ - lib/pismo/readability.rb
97
+ - lib/pismo/stopwords.txt
98
+ - pismo.gemspec
99
+ - test/corpus/bbcnews.html
100
+ - test/corpus/briancray.html
101
+ - test/corpus/cant_read.html
102
+ - test/corpus/factor.html
103
+ - test/corpus/huffington.html
104
+ - test/corpus/metadata_expected.yaml
105
+ - test/corpus/rubyinside.html
106
+ - test/corpus/rww.html
107
+ - test/corpus/spolsky.html
108
+ - test/corpus/techcrunch.html
109
+ - test/corpus/youtube.html
110
+ - test/helper.rb
111
+ - test/test_corpus.rb
112
+ - test/test_pismo_document.rb
113
+ - test/test_readability.rb
114
+ has_rdoc: true
115
+ homepage: http://github.com/peterc/pismo
116
+ licenses: []
117
+
118
+ post_install_message:
119
+ rdoc_options:
120
+ - --charset=UTF-8
121
+ require_paths:
122
+ - lib
123
+ required_ruby_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: "0"
128
+ version:
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: "0"
134
+ version:
135
+ requirements: []
136
+
137
+ rubyforge_project:
138
+ rubygems_version: 1.3.5
139
+ signing_key:
140
+ specification_version: 3
141
+ summary: Extracts or retrieves content-related metadata from HTML pages
142
+ test_files:
143
+ - test/helper.rb
144
+ - test/test_corpus.rb
145
+ - test/test_pismo_document.rb
146
+ - test/test_readability.rb