html_page_title 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Christoph Olszowka
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,40 @@
1
+ = HtmlPageTitle
2
+
3
+ gem install html_page_title
4
+
5
+ A simple class for finding the title of a given http url by fetching the
6
+ given url, following all eventual redirects and finally parsing it through
7
+ hpricot.
8
+
9
+ You can either use the shorthand form or initialize the instance properly:
10
+ * HtmlPageTitle('http://github.com')
11
+ * HtmlPageTitle.new('http://github.com')
12
+
13
+ Those calls are equivalent, except for one subtle difference:
14
+ The shorthand form will swallow SocketErrors and return nil (i.e. this will
15
+ happen for invalid urls), while the regular instantiation via new will
16
+ throw that error.
17
+
18
+ You can either get the title, the heading (which will be the content of the
19
+ first h1 tag in the body) or the label, which will be (in the following order
20
+ by availability) either the heading, or the title, or the target url after
21
+ redirecting.
22
+ Note that if the title or the heading can not be found (e.g. a non-HTML
23
+ document), both methods will return nil, so the label method is the only one
24
+ that will always return some kind of string.
25
+
26
+ You can also have a look at the unit test to find out about the behaviour!
27
+
28
+ == Note on Patches/Pull Requests
29
+
30
+ * Fork the project.
31
+ * Make your feature addition or bug fix.
32
+ * Add tests for it. This is important so I don't break it in a
33
+ future version unintentionally.
34
+ * Commit, do not mess with rakefile, version, or history.
35
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
36
+ * Send me a pull request. Bonus points for topic branches.
37
+
38
+ == Copyright
39
+
40
+ Copyright (c) 2010 Christoph Olszowka. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,55 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks are defined through Jeweler when it can be loaded.
# If requiring it raises LoadError, only an installation hint is printed and
# the rest of the Rakefile is still evaluated.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "html_page_title"
    gem.summary = %Q{Retrieve the page title for a given url (including redirects)}
    gem.description = %Q{Retrieve the page title for a given url using redirect_follower and hpricot ruby gems}
    gem.email = "'christoph at olszowka.de'"
    gem.homepage = "http://github.com/colszowka/html_page_title"
    gem.authors = ["Christoph Olszowka"]
    gem.add_dependency "redirect_follower", ">= 0.1.1"
    gem.add_dependency "hpricot", ">= 0.8.2"
    gem.add_development_dependency "shoulda", ">= 2.11.1"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage run via rcov; when rcov cannot be loaded, a placeholder :rcov task
# aborts with installation instructions instead.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Nothing in this Rakefile defines :check_dependencies itself (presumably it
# comes from the Jeweler tasks above -- confirm). Only add it as a prerequisite
# when it exists, so `rake test` still works after the Jeweler fallback.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation generated into ./rdoc from the README and the library.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: keep a trailing newline in VERSION out of the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "html_page_title #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{html_page_title}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Christoph Olszowka"]
12
+ s.date = %q{2010-07-15}
13
+ s.description = %q{Retrieve the page title for a given url using redirect_follower and hpricot ruby gems}
14
+ s.email = %q{'christoph at olszowka.de'}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "html_page_title.gemspec",
27
+ "lib/html_page_title.rb",
28
+ "test/helper.rb",
29
+ "test/test_html_page_title.rb"
30
+ ]
31
+ s.homepage = %q{http://github.com/colszowka/html_page_title}
32
+ s.rdoc_options = ["--charset=UTF-8"]
33
+ s.require_paths = ["lib"]
34
+ s.rubygems_version = %q{1.3.7}
35
+ s.summary = %q{Retrieve the page title for a given url (including redirects)}
36
+ s.test_files = [
37
+ "test/helper.rb",
38
+ "test/test_html_page_title.rb"
39
+ ]
40
+
41
+ if s.respond_to? :specification_version then
42
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
43
+ s.specification_version = 3
44
+
45
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
46
+ s.add_runtime_dependency(%q<redirect_follower>, [">= 0.1.1"])
47
+ s.add_runtime_dependency(%q<hpricot>, [">= 0.8.2"])
48
+ s.add_development_dependency(%q<shoulda>, [">= 2.11.1"])
49
+ else
50
+ s.add_dependency(%q<redirect_follower>, [">= 0.1.1"])
51
+ s.add_dependency(%q<hpricot>, [">= 0.8.2"])
52
+ s.add_dependency(%q<shoulda>, [">= 2.11.1"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<redirect_follower>, [">= 0.1.1"])
56
+ s.add_dependency(%q<hpricot>, [">= 0.8.2"])
57
+ s.add_dependency(%q<shoulda>, [">= 2.11.1"])
58
+ end
59
+ end
60
+
@@ -0,0 +1,78 @@
1
+ require 'redirect_follower'
2
+ require 'hpricot'
3
# Shorthand for HtmlPageTitle.new(url).
#
# Returns the new HtmlPageTitle instance. If building it raises a
# SocketError, the error is swallowed and nil is returned instead.
# Every other exception propagates to the caller.
#
# url - the url handed to HtmlPageTitle.new
def HtmlPageTitle(url)
  HtmlPageTitle.new(url)
rescue SocketError
  nil
end
8
+
9
+ #
10
+ # A simple class for finding the title of a given http url by fetching the
11
+ # given url, following all eventual redirects and finally parsing it through
12
+ # hpricot.
13
+ #
14
+ # You can either use the shorthand form or initialize the instance properly:
15
+ # * HtmlPageTitle('http://github.com')
16
+ # * HtmlPageTitle.new('http://github.com')
17
+ #
18
+ # Those calls are equivalent, except for one subtle difference:
19
+ # The shorthand form will swallow SocketErrors and return nil (i.e. this will
20
+ # happen for invalid urls), while the regular instantiation via new will
21
+ # throw that error.
22
+ #
23
+ # You can either get the title, the heading (which will be the content of the
24
+ # first h1 tag in the body) or the label, which will be (in the following order
25
+ # by availability) either the heading, or the title, or the target url after
26
+ # redirecting.
27
+ # Note that if the title or the heading can not be found (e.g. a non-HTML
28
+ # document), both methods will return nil, so the label method is the only one
29
+ # that will always return some kind of string.
30
+ #
31
# Looks up the title and first heading of the page behind a url.
#
# The url is resolved through a RedirectFollower and the resulting body is
# parsed with Hpricot. The follower and the parsed document are created once
# per instance and reused by every accessor.
class HtmlPageTitle
  # The url this instance was created with (before any redirects).
  attr_reader :original_url

  # original_url - the url to inspect
  #
  # The title is looked up right away, so any error raised while fetching or
  # parsing surfaces from the constructor rather than from a later accessor.
  def initialize(original_url)
    @original_url = original_url
    title # retrieve data so exceptions can be thrown
  end

  # Returns the Hpricot document parsed from the target body (memoized).
  def document
    @document ||= Hpricot(redirect.body)
  end

  # Returns the stripped content of the 'head title' element, or nil when
  # the document has no such element.
  def title
    @title ||= stripped_content_at('head title')
  end

  # Returns the stripped content of the first 'body h1' element, or nil when
  # the document has no such element.
  def heading
    @heading ||= stripped_content_at('body h1')
  end

  # Returns either the heading, or the title, or the url in this order
  # by availability.
  def label
    heading || title || url
  end

  # Returns the redirect follower instance used for resolving this
  # instance's url. Memoized, so url, body and document all read from the
  # same follower instead of constructing a new one on every call.
  def redirect
    @redirect ||= RedirectFollower.new(original_url)
  end

  # Returns the target url as reported by the redirect follower.
  def url
    redirect.url
  end

  # Returns the body as reported by the redirect follower.
  def body
    redirect.body
  end

  private

  # Returns the whitespace-stripped inner html of the first element matching
  # selector, or nil when nothing matches.
  def stripped_content_at(selector)
    element = document.at(selector)
    element.inner_html.strip if element
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'html_page_title'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,63 @@
1
+ require 'helper'
2
+
3
# Tests for HtmlPageTitle and the HtmlPageTitle() shorthand.
# The urls used here are real external addresses, so the expected values
# depend on those sites.
class TestHtmlPageTitle < Test::Unit::TestCase
  def test_quick_access
    assert_spiegel_front_page HtmlPageTitle('http://www.spiegel.de')
  end

  def test_access_with_instantiation
    page = HtmlPageTitle.new('http://www.spiegel.de')
    assert_spiegel_front_page page
    assert_backing_objects page
  end

  def test_with_redirect
    page = HtmlPageTitle.new('http://is.gd/bNZYZ')
    assert_equal "TASCHEN Books: Byrne, Six Books of Euclid", page.title
    assert_equal 'http://www.taschen.com/pages/en/catalogue/classics/all/06724/facts.byrne_six_books_of_euclid.htm', page.url
    assert_equal 'Byrne, Six Books of Euclid', page.heading
    assert_equal page.heading, page.label
    assert_backing_objects page
  end

  def test_quick_access_with_invalid_urls
    ['http://www.thisdoesnotexistforrealsure.de', 'http://www.notldisntniceeh'].each do |bad_url|
      assert_nil HtmlPageTitle(bad_url)
    end
  end

  def test_regular_access_with_invalid_urls
    ['http://www.thisdoesnotexistforsure.de', 'http://www.thisdoesnotexistforsur'].each do |bad_url|
      assert_raise(SocketError) { HtmlPageTitle.new(bad_url) }
    end
  end

  def test_non_html_url
    raw_url = 'http://gist.github.com/raw/93965/64e0b8445d0c3481f755fe65fd79297fcf6da909/x'
    page = HtmlPageTitle.new(raw_url)
    assert_nil page.title
    assert_nil page.heading
    assert_equal 'http://gist.github.com/raw/93965/64e0b8445d0c3481f755fe65fd79297fcf6da909/x', page.label
  end

  def test_url_without_h1
    page = HtmlPageTitle('http://gembundler.com/v1.0/index.html')
    assert_equal page.title, page.label
  end

  private

  # Title, url, heading and label expected for http://www.spiegel.de.
  def assert_spiegel_front_page(page)
    assert_equal "SPIEGEL ONLINE - Nachrichten", page.title
    assert_equal 'http://www.spiegel.de', page.url
    assert_equal 'SPIEGEL ONLINE', page.heading
    assert_equal page.heading, page.label
  end

  # Checks the objects exposed by body, redirect and document.
  def assert_backing_objects(page)
    assert page.body.kind_of?(String)
    assert_equal page.redirect.url, page.url
    assert_equal RedirectFollower, page.redirect.class
    assert_equal Hpricot::Doc, page.document.class
  end
end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_page_title
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Christoph Olszowka
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-07-15 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: redirect_follower
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 25
30
+ segments:
31
+ - 0
32
+ - 1
33
+ - 1
34
+ version: 0.1.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: hpricot
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 59
46
+ segments:
47
+ - 0
48
+ - 8
49
+ - 2
50
+ version: 0.8.2
51
+ type: :runtime
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: shoulda
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 33
62
+ segments:
63
+ - 2
64
+ - 11
65
+ - 1
66
+ version: 2.11.1
67
+ type: :development
68
+ version_requirements: *id003
69
+ description: Retrieve the page title for a given url using redirect_follower and hpricot ruby gems
70
+ email: "'christoph at olszowka.de'"
71
+ executables: []
72
+
73
+ extensions: []
74
+
75
+ extra_rdoc_files:
76
+ - LICENSE
77
+ - README.rdoc
78
+ files:
79
+ - .document
80
+ - .gitignore
81
+ - LICENSE
82
+ - README.rdoc
83
+ - Rakefile
84
+ - VERSION
85
+ - html_page_title.gemspec
86
+ - lib/html_page_title.rb
87
+ - test/helper.rb
88
+ - test/test_html_page_title.rb
89
+ has_rdoc: true
90
+ homepage: http://github.com/colszowka/html_page_title
91
+ licenses: []
92
+
93
+ post_install_message:
94
+ rdoc_options:
95
+ - --charset=UTF-8
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ hash: 3
104
+ segments:
105
+ - 0
106
+ version: "0"
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ requirements: []
117
+
118
+ rubyforge_project:
119
+ rubygems_version: 1.3.7
120
+ signing_key:
121
+ specification_version: 3
122
+ summary: Retrieve the page title for a given url (including redirects)
123
+ test_files:
124
+ - test/helper.rb
125
+ - test/test_html_page_title.rb