url_info_extractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ spec/cassettes/*
6
+
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS; a plain-HTTP source lets anyone on the network path
# tamper with the packages being installed.
source 'https://rubygems.org'

# All dependencies are declared in url_info_extractor.gemspec; this pulls
# them in from there.
gemspec
data/Guardfile ADDED
@@ -0,0 +1,6 @@
1
# Guard setup for the 'rspec' guard (configured with :version => 2).
guard('rspec', :version => 2) do
  # Spec files are watched as-is.
  watch(%r{^spec/.+_spec\.rb$})

  # A file under lib/ maps to the spec file that mirrors its path.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/#{match[1]}_spec.rb"
  end

  # The shared helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
6
+
data/LICENSE ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2011, 2012 Florian Dütsch
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ UrlInfoExtractor
2
+ ================
3
+
4
+ The library helps you extract some metadata from HTML pages, given their URLs.
5
+
6
+ [github.com/der-flo/url_info_extractor](https://github.com/der-flo/url_info_extractor)
7
+
8
+ Usage
9
+ -----
10
+ % gem install url_info_extractor
11
+
12
+ require 'url_info_extractor'
13
+ infos = UrlInfoExtractor.new('http://rubygems.org')
14
+ puts infos.title
15
+ # RubyGems.org | your community gem host
16
+ puts infos.favicon_url
17
+ # http://rubygems.org/favicon.ico
18
+ puts infos.favicon_exists?
19
+ # true
20
+ puts infos.description
21
+ #
22
+
23
+ Note that the favicon handling determines the URL by link-rel-icon tag and
24
+ base tag. `favicon_exists?` does an HTTP HEAD request to check whether the
25
+ icon is available at the determined path.
26
+
27
+ Fork me!
28
+ --------
29
+ If you find this library useful but miss some feature, please drop me a note
30
+ or - even better - fork the project on GitHub. I'll gladly accept pull
31
+ requests that come with specs.
32
+
33
+ Author and license
34
+ ------------------
35
+ * Florian Dütsch (mail AT florian-duetsch.de)
36
+ * MIT, see LICENSE file.
37
+
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ * Error handling necessary?
2
+ * Nobody uses meta-keywords anymore. Should we support them?
3
+
@@ -0,0 +1,16 @@
1
class UrlInfoExtractor
  # Favicon lookup.
  #
  # The including class must provide +@uri+ (the parsed page URI) and
  # +fetched_data+ (a Hash that may carry +:base_link_value+ and
  # +:favicon_link_value+).
  module Favicon
    # Absolute URL of the page's favicon.
    #
    # The icon href (or '/favicon.ico' when the page declares none) is
    # resolved against the page's base href when there is one, otherwise
    # against the page URL itself.
    #
    # @return [String]
    def favicon_url
      base = fetched_data[:base_link_value]
      # Resolve the base against the page URL first: a relative base such as
      # "/assets/" would otherwise leave two relative URIs to merge, which
      # raises URI::BadURIError. An absolute base is returned unchanged.
      base_uri = base ? @uri.merge(base) : @uri
      link = fetched_data[:favicon_link_value]
      link ||= '/favicon.ico'
      base_uri.merge(link).to_s
    end

    # Checks with an HTTP HEAD request whether the favicon is retrievable.
    #
    # The request goes to the host named in the favicon URL, which may differ
    # from the page's host (other base href, absolute icon href).
    #
    # @return [Boolean] true when the response is a Net::HTTPSuccess; false
    #   otherwise, including when the icon URL is not http/https
    def favicon_exists?
      icon = URI.parse(favicon_url)
      # Only http/https URLs can be probed (URI::HTTPS is a URI::HTTP);
      # anything else, e.g. a data: URI, has nothing to request.
      return false unless icon.is_a?(URI::HTTP)

      response = Net::HTTP.start(icon.host, icon.port,
                                 use_ssl: icon.scheme == 'https') do |http|
        http.head(icon.request_uri)
      end
      response.is_a?(Net::HTTPSuccess)
    end
  end
end
16
+
@@ -0,0 +1,3 @@
1
class UrlInfoExtractor
  # Gem version; the gemspec reads it when the package is built.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,65 @@
1
+ require "url_info_extractor/version"
2
+ require 'url_info_extractor/favicon'
3
+ require 'nokogiri'
4
+ require 'active_support'
5
+ require 'active_support/core_ext/object/try'
6
+ require 'active_support/core_ext/string/inflections'
7
+ require 'net/https'
8
+
9
# Extracts the title, the meta description and favicon information from the
# HTML document behind an HTTP or HTTPS URL.
#
# The document is requested on the first call that needs it; the extracted
# values are then memoized on the instance.
class UrlInfoExtractor
  include Favicon

  # Raised when page data is requested for a URL that is neither http nor
  # https.
  class SchemeNotSupported < StandardError; end

  # Content types that get parsed; any other response yields no data.
  PROCESSABLE_MIME_TYPES = %w(application/xhtml+xml text/html).freeze

  # @param url [String] address of the page to inspect
  def initialize(url)
    @uri = URI.parse(url)
    # request_uri keeps the query string (plain #path drops it) and already
    # yields "/" for an empty path. Only HTTP(S) URIs have it; for other
    # schemes fall back to the path so that construction still succeeds and
    # the scheme is rejected later, when data is actually fetched.
    @path = @uri.respond_to?(:request_uri) ? @uri.request_uri : @uri.path
    @path = '/' if @path.nil? || @path.empty?
  end

  # @return [String, nil] content of the first <title> in the document head
  # @raise [SchemeNotSupported] if the URL is not http/https
  def title
    fetched_data[:title]
  end

  # @return [String, nil] content attribute of the description meta tag
  # @raise [SchemeNotSupported] if the URL is not http/https
  def description
    fetched_data[:description]
  end

  # @return [Boolean] whether the URL's scheme is http or https
  def scheme_supported?
    %w(http https).include? @uri.scheme
  end

  private

  # Extracted page data, fetched once and then reused.
  def fetched_data
    @fetched_data ||= fetch_data
  end

  # Requests the page and pulls the interesting values out of its head.
  #
  # @return [Hash] empty when the response's content type is not processable
  # @raise [SchemeNotSupported] if the URL is not http/https
  def fetch_data
    raise SchemeNotSupported unless scheme_supported?
    response = do_http_request(@path, :get)

    return {} unless PROCESSABLE_MIME_TYPES.include?(response.content_type)

    head = Nokogiri::HTML(response.body).css('html head')

    # #css always returns a node set, so only the result of #first (nil when
    # nothing matched) needs the nil-tolerant #try.
    {
      title: head.css('title').first.try(:content),
      description:
        head.css('meta[name=description]').first.try(:[], 'content'),
      favicon_link_value:
        head.css('link[rel~=icon]').first.try(:[], 'href'),
      base_link_value: head.css('base').first.try(:[], 'href')
    }
  end

  # Performs a single HTTP request against the host of the page URL.
  #
  # @param path [String] request target
  # @param method [Symbol] lower-case verb such as :get or :head
  # @return [Net::HTTPResponse]
  def do_http_request(path, method)
    # :get -> Net::HTTP::Get, :head -> Net::HTTP::Head; looked up directly in
    # Net::HTTP without falling back to top-level constants.
    request_class = Net::HTTP.const_get(method.to_s.capitalize, false)
    request = request_class.new(path)
    Net::HTTP.start(@uri.host, @uri.port,
                    use_ssl: @uri.scheme == 'https') do |http|
      http.request request
    end
  end
end
65
+
@@ -0,0 +1,13 @@
1
+ require 'url_info_extractor'
2
+ require 'vcr'
3
+
4
# VCR: hook into WebMock, keep cassettes under spec/cassettes and use the
# :new_episodes record mode unless a cassette overrides it.
VCR.configure do |vcr|
  vcr.hook_into :webmock
  vcr.cassette_library_dir = 'spec/cassettes'
  vcr.default_cassette_options = { :record => :new_episodes }
end

# Extend example groups with VCR's RSpec macros (the specs call
# use_vcr_cassette).
RSpec.configure do |rspec|
  rspec.extend VCR::RSpec::Macros
end
13
+
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
describe UrlInfoExtractor::Favicon do
  use_vcr_cassette

  # Builds the extractor under test for the given page URL.
  def extractor_for(url)
    UrlInfoExtractor.new(url)
  end

  it "returns favicon of rubygems.org" do
    expected = 'http://rubygems.org/favicon.ico'
    extractor_for('http://rubygems.org').favicon_url.should == expected
  end

  it 'returns the favicon of der-flo.org' do
    expected = 'http://der-flo.org/favicon.ico'
    extractor_for('http://der-flo.org').favicon_url.should == expected
  end

  it 'handles pages without an specified favicon' do
    expected = 'https://github.com/favicon.ico'
    extractor_for('https://github.com').favicon_url.should == expected
  end

  context 'with non-existing favicon' do
    # Uses its own cassette with recording disabled.
    use_vcr_cassette 'favicon does not exist', record: :none

    it 'checks whether the icon really exists' do
      extractor = extractor_for('http://www.favicon-missing.com/')
      extractor.favicon_exists?.should be_false
    end
  end

  context 'with <base> sample' do
    # Uses its own cassette with recording disabled.
    use_vcr_cassette 'with base sample', record: :none

    it 'handles pages with a base specification correct' do
      expected = 'http://www2.with-base.com/icon.ico'
      extractor_for('http://www.with-base.com/').favicon_url.should == expected
    end
  end
end
36
+
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
describe UrlInfoExtractor do
  use_vcr_cassette

  # Title extracted for the page at the given URL.
  def title_of(url)
    UrlInfoExtractor.new(url).title
  end

  it 'extracts the title from rubygems.org' do
    title_of('http://rubygems.org').should =~ /rubygems/i
  end

  it 'extracts the title from rubyonrails.org' do
    title_of('http://rubyonrails.org').should == 'Ruby on Rails'
  end

  it 'recognizes HTTPS url' do
    title_of('https://github.com/').should == 'GitHub - Social Coding'
  end

  it 'verifies MIME type' do
    # An image URL must not produce a title.
    title_of('http://rubyonrails.org/images/rails.png').should be_nil
  end

  it 'support only the HTTP and HTTPS protocol' do
    extractor = UrlInfoExtractor.new('ftp://test.com')
    extractor.scheme_supported?.should_not be
    expect { extractor.title }.to raise_error(UrlInfoExtractor::SchemeNotSupported)
  end

  it 'extracts the description out of the page' do
    extractor = UrlInfoExtractor.new('https://github.com/der-flo/dotfiles')
    extractor.description.should == 'dotfiles - My config files'
  end
end
33
+
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "url_info_extractor/version"
4
+
5
# Package definition for the url_info_extractor gem.
Gem::Specification.new do |s|
  s.name        = "url_info_extractor"
  s.version     = UrlInfoExtractor::VERSION
  s.authors     = ["Florian Dütsch"]
  s.email       = ["mail@florian-duetsch.de"]
  # Project page as given in the README (was an empty string).
  s.homepage    = "https://github.com/der-flo/url_info_extractor"
  # The README and the LICENSE file state MIT; declare it so that the
  # published metadata no longer has an empty licence list.
  s.license     = "MIT"
  s.summary     = %q{Extract metadata out of a URL}
  s.description = %q{Extract title, meta-description and favicon out of a URL}

  s.rubyforge_project = "url_info_extractor"

  # The packaged file lists come from git, so building requires a git
  # checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map do |f|
    File.basename(f)
  end
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
  s.add_development_dependency 'guard-rspec'
  s.add_development_dependency 'vcr', '2.0.0.rc1'
  s.add_development_dependency 'webmock'
  # NOTE(review): this condition is evaluated on the machine that builds the
  # gem, not on the one installing it - the published 0.0.1 metadata lists
  # rb-fsevent and growl unconditionally. Consider moving them to the Gemfile.
  if RUBY_PLATFORM =~ /darwin/i
    s.add_development_dependency 'rb-fsevent'
    s.add_development_dependency 'growl'
  end

  s.add_runtime_dependency 'nokogiri', '~> 1.5'
  s.add_runtime_dependency 'activesupport', '>= 3'
end
35
+
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_info_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Florian Dütsch
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70293470681620 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70293470681620
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard-rspec
27
+ requirement: &70293470681200 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70293470681200
36
+ - !ruby/object:Gem::Dependency
37
+ name: vcr
38
+ requirement: &70293470680700 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - =
42
+ - !ruby/object:Gem::Version
43
+ version: 2.0.0.rc1
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70293470680700
47
+ - !ruby/object:Gem::Dependency
48
+ name: webmock
49
+ requirement: &70293469848420 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70293469848420
58
+ - !ruby/object:Gem::Dependency
59
+ name: rb-fsevent
60
+ requirement: &70293469846880 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70293469846880
69
+ - !ruby/object:Gem::Dependency
70
+ name: growl
71
+ requirement: &70293469844900 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70293469844900
80
+ - !ruby/object:Gem::Dependency
81
+ name: nokogiri
82
+ requirement: &70293469843840 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: '1.5'
88
+ type: :runtime
89
+ prerelease: false
90
+ version_requirements: *70293469843840
91
+ - !ruby/object:Gem::Dependency
92
+ name: activesupport
93
+ requirement: &70293469842780 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '3'
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: *70293469842780
102
+ description: Extract title, meta-description and favicon out of a URL
103
+ email:
104
+ - mail@florian-duetsch.de
105
+ executables: []
106
+ extensions: []
107
+ extra_rdoc_files: []
108
+ files:
109
+ - .gitignore
110
+ - Gemfile
111
+ - Guardfile
112
+ - LICENSE
113
+ - README.md
114
+ - Rakefile
115
+ - TODO
116
+ - lib/url_info_extractor.rb
117
+ - lib/url_info_extractor/favicon.rb
118
+ - lib/url_info_extractor/version.rb
119
+ - spec/spec_helper.rb
120
+ - spec/url_info_extractor/favicon_spec.rb
121
+ - spec/url_info_extractor_spec.rb
122
+ - url_info_extractor.gemspec
123
+ homepage: ''
124
+ licenses: []
125
+ post_install_message:
126
+ rdoc_options: []
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ! '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ! '>='
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project: url_info_extractor
143
+ rubygems_version: 1.8.10
144
+ signing_key:
145
+ specification_version: 3
146
+ summary: Extract metadata out of a URL
147
+ test_files:
148
+ - spec/spec_helper.rb
149
+ - spec/url_info_extractor/favicon_spec.rb
150
+ - spec/url_info_extractor_spec.rb