html_page_title 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Christoph Olszowka
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,40 @@
1
+ = HtmlPageTitle
2
+
3
+ gem install html_page_title
4
+
5
+ A simple class for finding the title of a given http url by fetching the
6
+ given url, following any redirects and finally parsing it through
7
+ hpricot.
8
+
9
+ You can either use the shorthand form or initialize the instance properly:
10
+ * HtmlPageTitle('http://github.com')
11
+ * HtmlPageTitle.new('http://github.com')
12
+
13
+ Those calls are equivalent, except for one subtle difference:
14
+ The shorthand form will swallow SocketErrors and return nil (i.e. this will
15
+ happen for invalid urls), while the regular instantiation via new will
16
+ raise that error.
17
+
18
+ You can either get the title, the heading (which will be the content of the
19
+ first h1 tag in the body) or the label, which will be (in the following order
20
+ by availability) either the heading, or the title, or the target url after
21
+ redirecting.
22
+ Note that if the title or the heading can not be found (e.g. a non-HTML
23
+ document), both methods will return nil, so the label method is the only one
24
+ that will always return some kind of string.
25
+
26
+ You can also have a look at the unit test to find out about the behaviour!
27
+
28
+ == Note on Patches/Pull Requests
29
+
30
+ * Fork the project.
31
+ * Make your feature addition or bug fix.
32
+ * Add tests for it. This is important so I don't break it in a
33
+ future version unintentionally.
34
+ * Commit, do not mess with rakefile, version, or history.
35
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
36
+ * Send me a pull request. Bonus points for topic branches.
37
+
38
+ == Copyright
39
+
40
+ Copyright (c) 2010 Christoph Olszowka. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,55 @@
1
require 'rubygems'
require 'rake'

# Gem packaging/release tasks. Jeweler is optional: when it cannot be loaded
# a hint is printed and the remaining tasks below are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "html_page_title"
    gem.summary = %Q{Retrieve the page title for a given url (including redirects)}
    gem.description = %Q{Retrieve the page title for a given url using redirect_follower and hpricot ruby gems}
    gem.email = "'christoph at olszowka.de'"
    gem.homepage = "http://github.com/colszowka/html_page_title"
    gem.authors = ["Christoph Olszowka"]
    gem.add_dependency "redirect_follower", ">= 0.1.1"
    gem.add_dependency "hpricot", ">= 0.8.2"
    gem.add_development_dependency "shoulda", ">= 2.11.1"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: runs every test/**/test_*.rb with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. Without rcov installed, a placeholder task explains how to
# get it instead of failing with an unknown-task error.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# check_dependencies is not defined in this Rakefile (presumably it comes from
# the Jeweler tasks above), so only depend on it when it actually exists.
# Otherwise `rake test` would abort on an unknown prerequisite whenever
# Jeweler is missing.
if Rake::Task.task_defined?(:check_dependencies)
  task :test => :check_dependencies
end

task :default => :test

# API documentation. Prefer the rdoc/task loader and fall back to the older
# rake/rdoctask one, which newer Rake releases no longer ship.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "html_page_title #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{html_page_title}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Christoph Olszowka"]
12
+ s.date = %q{2010-07-15}
13
+ s.description = %q{Retrieve the page title for a given url using redirect_follower and hpricot ruby gems}
14
+ s.email = %q{'christoph at olszowka.de'}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "html_page_title.gemspec",
27
+ "lib/html_page_title.rb",
28
+ "test/helper.rb",
29
+ "test/test_html_page_title.rb"
30
+ ]
31
+ s.homepage = %q{http://github.com/colszowka/html_page_title}
32
+ s.rdoc_options = ["--charset=UTF-8"]
33
+ s.require_paths = ["lib"]
34
+ s.rubygems_version = %q{1.3.7}
35
+ s.summary = %q{Retrieve the page title for a given url (including redirects)}
36
+ s.test_files = [
37
+ "test/helper.rb",
38
+ "test/test_html_page_title.rb"
39
+ ]
40
+
41
+ if s.respond_to? :specification_version then
42
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
43
+ s.specification_version = 3
44
+
45
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
46
+ s.add_runtime_dependency(%q<redirect_follower>, [">= 0.1.1"])
47
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.2"])
48
+ s.add_development_dependency(%q<shoulda>, [">= 2.11.1"])
49
+ else
50
+ s.add_dependency(%q<redirect_follower>, [">= 0.1.1"])
51
+ s.add_dependency(%q<hpricot>, [">= 0.8.2"])
52
+ s.add_dependency(%q<shoulda>, [">= 2.11.1"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<redirect_follower>, [">= 0.1.1"])
56
+ s.add_dependency(%q<hpricot>, [">= 0.8.2"])
57
+ s.add_dependency(%q<shoulda>, [">= 2.11.1"])
58
+ end
59
+ end
60
+
@@ -0,0 +1,78 @@
1
+ require 'redirect_follower'
2
+ require 'hpricot'
3
# Shorthand for HtmlPageTitle.new(url).
#
# Returns the new HtmlPageTitle instance, or nil when building it raises a
# SocketError. Any other exception is passed on to the caller.
def HtmlPageTitle(url)
  HtmlPageTitle.new(url)
rescue SocketError
  # deliberately swallowed: the shorthand form signals this failure with nil
  nil
end
8
+
9
+ #
10
+ # A simple class for finding the title of a given http url by fetching the
11
+ # given url, following any redirects and finally parsing it through
12
+ # hpricot.
13
+ #
14
+ # You can either use the shorthand form or initialize the instance properly:
15
+ # * HtmlPageTitle('http://github.com')
16
+ # * HtmlPageTitle.new('http://github.com')
17
+ #
18
+ # Those calls are equivalent, except for one subtle difference:
19
+ # The shorthand form will swallow SocketErrors and return nil (i.e. this will
20
+ # happen for invalid urls), while the regular instantiation via new will
21
+ # raise that error.
22
+ #
23
+ # You can either get the title, the heading (which will be the content of the
24
+ # first h1 tag in the body) or the label, which will be (in the following order
25
+ # by availability) either the heading, or the title, or the target url after
26
+ # redirecting.
27
+ # Note that if the title or the heading can not be found (e.g. a non-HTML
28
+ # document), both methods will return nil, so the label method is the only one
29
+ # that will always return some kind of string.
30
+ #
31
class HtmlPageTitle
  # The url this instance was created with (before any redirects)
  attr_reader :original_url

  # Stores the url and immediately looks up the title, so that errors raised
  # while retrieving or parsing the page surface here rather than on a later
  # accessor call.
  def initialize(original_url)
    @original_url = original_url
    title # retrieve data so exceptions can be thrown
  end

  # Returns the parsed document, built once from the redirect body via Hpricot
  def document
    @document ||= Hpricot(redirect.body)
  end

  # Returns the stripped content of the title tag inside head, or nil when
  # the document has none
  def title
    @title ||= text_at('head title')
  end

  # Retrieves the first h1 tag in the body and returns its stripped content,
  # or nil when the document has none
  def heading
    @heading ||= text_at('body h1')
  end

  # Returns either the heading, or the title, or the url in this order
  # by availability
  def label
    heading || title || url
  end

  # Returns the redirect follower instance used for resolving
  # this instance's url. It is created once and reused, so document, url and
  # body all work on the same follower instead of building a new one per call.
  def redirect
    @redirect ||= RedirectFollower.new(original_url)
  end

  # Returns the target url after all redirects
  def url
    redirect.url
  end

  # Returns the body of the document at the (redirected?) target
  def body
    redirect.body
  end

  private

  # Returns the stripped inner html of the first element matching selector,
  # or nil when nothing matches
  def text_at(selector)
    element = document.at(selector)
    element.inner_html.strip.chomp if element
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'html_page_title'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,63 @@
1
+ require 'helper'
2
+
3
+ class TestHtmlPageTitle < Test::Unit::TestCase
4
+ def test_quick_access
5
+ instance = HtmlPageTitle('http://www.spiegel.de')
6
+ assert_equal "SPIEGEL ONLINE - Nachrichten", instance.title
7
+ assert_equal 'http://www.spiegel.de', instance.url
8
+ assert_equal 'SPIEGEL ONLINE', instance.heading
9
+ assert_equal instance.heading, instance.label
10
+ end
11
+
12
+ def test_access_with_instantiation
13
+ instance = HtmlPageTitle.new('http://www.spiegel.de')
14
+ assert_equal "SPIEGEL ONLINE - Nachrichten", instance.title
15
+ assert_equal 'http://www.spiegel.de', instance.url
16
+ assert_equal 'SPIEGEL ONLINE', instance.heading
17
+ assert_equal instance.heading, instance.label
18
+
19
+ assert instance.body.kind_of?(String)
20
+ assert_equal instance.redirect.url, instance.url
21
+ assert_equal RedirectFollower, instance.redirect.class
22
+ assert_equal Hpricot::Doc, instance.document.class
23
+ end
24
+
25
+ def test_with_redirect
26
+ instance = HtmlPageTitle.new('http://is.gd/bNZYZ')
27
+ assert_equal "TASCHEN Books: Byrne, Six Books of Euclid", instance.title
28
+ assert_equal 'http://www.taschen.com/pages/en/catalogue/classics/all/06724/facts.byrne_six_books_of_euclid.htm', instance.url
29
+ assert_equal 'Byrne, Six Books of Euclid', instance.heading
30
+ assert_equal instance.heading, instance.label
31
+
32
+ assert instance.body.kind_of?(String)
33
+ assert_equal instance.redirect.url, instance.url
34
+ assert_equal RedirectFollower, instance.redirect.class
35
+ assert_equal Hpricot::Doc, instance.document.class
36
+ end
37
+
38
+ def test_quick_access_with_invalid_urls
39
+ assert_nil HtmlPageTitle('http://www.thisdoesnotexistforrealsure.de')
40
+ assert_nil HtmlPageTitle('http://www.notldisntniceeh')
41
+ end
42
+
43
+ def test_regular_access_with_invalid_urls
44
+ assert_raise SocketError do
45
+ HtmlPageTitle.new('http://www.thisdoesnotexistforsure.de')
46
+ end
47
+ assert_raise SocketError do
48
+ HtmlPageTitle.new('http://www.thisdoesnotexistforsur')
49
+ end
50
+ end
51
+
52
+ def test_non_html_url
53
+ instance = HtmlPageTitle.new('http://gist.github.com/raw/93965/64e0b8445d0c3481f755fe65fd79297fcf6da909/x')
54
+ assert_nil instance.title
55
+ assert_nil instance.heading
56
+ assert_equal 'http://gist.github.com/raw/93965/64e0b8445d0c3481f755fe65fd79297fcf6da909/x', instance.label
57
+ end
58
+
59
+ def test_url_without_h1
60
+ instance = HtmlPageTitle('http://gembundler.com/v1.0/index.html')
61
+ assert_equal instance.title, instance.label
62
+ end
63
+ end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_page_title
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Christoph Olszowka
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-07-15 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: redirect_follower
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 25
30
+ segments:
31
+ - 0
32
+ - 1
33
+ - 1
34
+ version: 0.1.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: hpricot
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 59
46
+ segments:
47
+ - 0
48
+ - 8
49
+ - 2
50
+ version: 0.8.2
51
+ type: :runtime
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: shoulda
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 33
62
+ segments:
63
+ - 2
64
+ - 11
65
+ - 1
66
+ version: 2.11.1
67
+ type: :development
68
+ version_requirements: *id003
69
+ description: Retrieve the page title for a given url using redirect_follower and hpricot ruby gems
70
+ email: "'christoph at olszowka.de'"
71
+ executables: []
72
+
73
+ extensions: []
74
+
75
+ extra_rdoc_files:
76
+ - LICENSE
77
+ - README.rdoc
78
+ files:
79
+ - .document
80
+ - .gitignore
81
+ - LICENSE
82
+ - README.rdoc
83
+ - Rakefile
84
+ - VERSION
85
+ - html_page_title.gemspec
86
+ - lib/html_page_title.rb
87
+ - test/helper.rb
88
+ - test/test_html_page_title.rb
89
+ has_rdoc: true
90
+ homepage: http://github.com/colszowka/html_page_title
91
+ licenses: []
92
+
93
+ post_install_message:
94
+ rdoc_options:
95
+ - --charset=UTF-8
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ hash: 3
104
+ segments:
105
+ - 0
106
+ version: "0"
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project:
119
+ rubygems_version: 1.3.7
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Retrieve the page title for a given url (including redirects)
123
+ test_files:
124
+ - test/helper.rb
125
+ - test/test_html_page_title.rb