url_info_extractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ spec/cassettes/*
6
+
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in url_info_extractor.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,6 @@
1
# Guard setup for the 'rspec' plugin (RSpec major version 2).
guard('rspec', version: 2) do
  # Spec files are watched directly.
  watch(%r{^spec/.+_spec\.rb$})

  # A file under lib/ maps to the spec file with the same relative name.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/#{match[1]}_spec.rb"
  end

  # The shared helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
6
+
data/LICENSE ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2011, 2012 Florian Dütsch
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7
+ the Software, and to permit persons to whom the Software is furnished to do so,
8
+ subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ UrlInfoExtractor
2
+ ================
3
+
4
+ This library helps you extract metadata from HTML pages, given their URLs.
5
+
6
+ [github.com/der-flo/url_info_extractor](https://github.com/der-flo/url_info_extractor)
7
+
8
+ Usage
9
+ -----
10
+ % gem install url_info_extractor
11
+
12
+ require 'url_info_extractor'
13
+ infos = UrlInfoExtractor.new('http://rubygems.org')
14
+ puts infos.title
15
+ # RubyGems.org | your community gem host
16
+ puts infos.favicon_url
17
+ # http://rubygems.org/favicon.ico
18
+ puts infos.favicon_exists?
19
+ # true
20
+ puts infos.description
21
+ #
22
+
23
+ Note that the favicon URL is determined from the page's `<link rel="icon">` and
24
+ `<base>` tags, falling back to `/favicon.ico`. `favicon_exists?` sends an HTTP
25
+ HEAD request to check whether the icon is actually available at that URL.
26
+
27
+ Fork me!
28
+ --------
29
+ If you find this library useful but are missing a feature, please drop me a note
30
+ or - even better - fork the project on GitHub. I'll gladly accept pull requests
31
+ that come with specs.
32
+
33
+ Author and license
34
+ ------------------
35
+ * Florian Dütsch (mail AT florian-duetsch.de)
36
+ * MIT, see LICENSE file.
37
+
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ * Error handling necessary?
2
+ * Nobody uses meta-keywords anymore. Should we support them?
3
+
@@ -0,0 +1,16 @@
1
class UrlInfoExtractor
  # Favicon lookup helpers, mixed into UrlInfoExtractor.
  #
  # The including object must provide:
  # * +fetched_data+ - a Hash that may contain :base_link_value and
  #   :favicon_link_value (both Strings or nil),
  # * +@uri+ - the URI of the inspected page,
  # * +do_http_request(path, method)+ - performs the HTTP request.
  module Favicon
    # Builds the absolute URL of the page's favicon.
    #
    # The icon link (or '/favicon.ico' when the page declares none) is
    # resolved against the <base> href if present, otherwise against the
    # page URI.
    #
    # Returns the favicon URL as a String.
    def favicon_url
      base = fetched_data[:base_link_value]
      # Resolve the base href against the page URI first: a relative base such
      # as "/static/" would otherwise make the second merge raise
      # URI::BadURIError ("both URI are relative"). An absolute base simply
      # replaces the page URI.
      root = base ? @uri.merge(base) : @uri
      link = fetched_data[:favicon_link_value] || '/favicon.ico'
      root.merge(link).to_s
    end

    # Issues a HEAD request for the favicon URL.
    #
    # Returns true when the response is a Net::HTTPSuccess, false otherwise.
    def favicon_exists?
      do_http_request(favicon_url, :head).is_a?(Net::HTTPSuccess)
    end
  end
end
16
+
@@ -0,0 +1,3 @@
1
class UrlInfoExtractor
  # Version string of the gem; the gemspec reads it to set the gem version.
  VERSION = "0.0.1"
end
@@ -0,0 +1,65 @@
1
+ require "url_info_extractor/version"
2
+ require 'url_info_extractor/favicon'
3
+ require 'nokogiri'
4
+ require 'active_support'
5
+ require 'active_support/core_ext/object/try'
6
+ require 'active_support/core_ext/string/inflections'
7
+ require 'net/https'
8
+
9
# Extracts metadata (title, meta description, favicon location) from the HTML
# document behind an HTTP or HTTPS URL.
#
# Nothing is fetched on construction; the page is requested on first access
# to any piece of metadata and the parsed result is memoized.
class UrlInfoExtractor
  include Favicon

  # Raised when metadata is requested for a URL that is neither http nor https.
  class SchemeNotSupported < StandardError; end

  # Content types that are parsed; any other response yields no metadata.
  PROCESSABLE_MIME_TYPES = %w(application/xhtml+xml text/html).freeze

  # url - String URL of the page to inspect.
  #
  # Raises URI::InvalidURIError when the URL cannot be parsed.
  def initialize(url)
    @uri = URI.parse(url)
    # to_s guards against a nil path; an empty path means the site root.
    @path = @uri.path.to_s
    @path = '/' if @path.empty?
    # Keep the query string, otherwise a different resource would be fetched.
    @path = "#{@path}?#{@uri.query}" if @uri.query
  end

  # Returns the content of the page's <title> element, or nil.
  def title
    fetched_data[:title]
  end

  # Returns the content attribute of the description meta tag, or nil.
  def description
    fetched_data[:description]
  end

  # Returns true when the URL scheme is http or https.
  def scheme_supported?
    %w(http https).include? @uri.scheme
  end

  private

  # Memoized result of fetch_data.
  def fetched_data
    @fetched_data ||= fetch_data
  end

  # Requests the page and picks the interesting values out of its <head>.
  #
  # Returns a Hash with :title, :description, :favicon_link_value and
  # :base_link_value (each a String or nil), or an empty Hash when the
  # response content type is not in PROCESSABLE_MIME_TYPES.
  # Raises SchemeNotSupported for non-HTTP(S) URLs.
  def fetch_data
    unless scheme_supported?
      raise SchemeNotSupported, "unsupported URL scheme: #{@uri.scheme.inspect}"
    end

    response = do_http_request(@path, :get)
    return {} unless PROCESSABLE_MIME_TYPES.include?(response.content_type)

    head = Nokogiri::HTML(response.body).css('html head')

    { title: head.css('title').first.try(:content),
      description: first_attribute(head, 'meta[name=description]', 'content'),
      favicon_link_value: first_attribute(head, 'link[rel~=icon]', 'href'),
      base_link_value: first_attribute(head, 'base', 'href') }
  end

  # Returns the given attribute of the first node matching selector, or nil
  # when nothing matches.
  def first_attribute(nodes, selector, attribute)
    nodes.css(selector).first.try(:[], attribute)
  end

  # Performs an HTTP request and returns the Net::HTTPResponse.
  #
  # path   - request path relative to the page URL, or an absolute URL
  #          (favicon_exists? passes the full favicon URL).
  # method - Symbol naming a Net::HTTP request class, e.g. :get or :head.
  def do_http_request(path, method)
    # An absolute http(s) URL may point at another host (e.g. via <base>), so
    # connect to the resolved target instead of always using the page host.
    target = @uri.merge(path)
    if target.is_a?(URI::HTTP)
      endpoint = target
      request_path = target.request_uri
    else
      # Not an http(s) target: keep the previous behavior unchanged.
      endpoint = @uri
      request_path = path
    end

    request = "Net::HTTP::#{method.to_s.camelize}".constantize.new(request_path)
    Net::HTTP.start(endpoint.host, endpoint.port,
                    use_ssl: endpoint.scheme == 'https') do |http|
      http.request request
    end
  end
end
65
+
@@ -0,0 +1,13 @@
1
+ require 'url_info_extractor'
2
+ require 'vcr'
3
+
4
# VCR setup: cassettes are stored under spec/cassettes, HTTP is hooked via
# WebMock, and cassettes use the :new_episodes record mode unless a spec
# overrides it.
VCR.configure do |vcr|
  vcr.hook_into :webmock
  vcr.cassette_library_dir = 'spec/cassettes'
  vcr.default_cassette_options = { record: :new_episodes }
end

# Extends example groups with VCR's RSpec macros (the specs call
# use_vcr_cassette).
RSpec.configure do |rspec|
  rspec.extend VCR::RSpec::Macros
end
13
+
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
# Specs for the favicon helpers; HTTP traffic goes through VCR cassettes.
describe UrlInfoExtractor::Favicon do
  use_vcr_cassette

  # Builds an extractor for the given URL to keep the examples short.
  def extractor_for(url)
    UrlInfoExtractor.new(url)
  end

  it "returns favicon of rubygems.org" do
    extractor = extractor_for('http://rubygems.org')
    extractor.favicon_url.should == 'http://rubygems.org/favicon.ico'
  end

  it 'returns the favicon of der-flo.org' do
    extractor = extractor_for('http://der-flo.org')
    extractor.favicon_url.should == 'http://der-flo.org/favicon.ico'
  end

  it 'handles pages without an specified favicon' do
    extractor = extractor_for('https://github.com')
    extractor.favicon_url.should == 'https://github.com/favicon.ico'
  end

  context 'with non-existing favicon' do
    # Uses a dedicated cassette with recording disabled.
    use_vcr_cassette 'favicon does not exist', record: :none

    it 'checks whether the icon really exists' do
      extractor = extractor_for('http://www.favicon-missing.com/')
      extractor.favicon_exists?.should be_false
    end
  end

  context 'with <base> sample' do
    # Uses a dedicated cassette with recording disabled.
    use_vcr_cassette 'with base sample', record: :none

    it 'handles pages with a base specification correct' do
      extractor = extractor_for('http://www.with-base.com/')
      extractor.favicon_url.should == 'http://www2.with-base.com/icon.ico'
    end
  end
end
36
+
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
# Specs for title/description extraction; HTTP traffic goes through VCR.
describe UrlInfoExtractor do
  use_vcr_cassette

  # Builds an extractor for the given URL to keep the examples short.
  def extractor_for(url)
    UrlInfoExtractor.new(url)
  end

  it 'extracts the title from rubygems.org' do
    extractor_for('http://rubygems.org').title.should =~ /rubygems/i
  end

  it 'extracts the title from rubyonrails.org' do
    extractor_for('http://rubyonrails.org').title.should == 'Ruby on Rails'
  end

  it 'recognizes HTTPS url' do
    extractor_for('https://github.com/').title.should == 'GitHub - Social Coding'
  end

  it 'verifies MIME type' do
    # An image response is not processed, so no title is available.
    extractor = extractor_for('http://rubyonrails.org/images/rails.png')
    extractor.title.should be_nil
  end

  it 'support only the HTTP and HTTPS protocol' do
    extractor = extractor_for('ftp://test.com')
    extractor.scheme_supported?.should_not be
    expect { extractor.title }.to raise_error(UrlInfoExtractor::SchemeNotSupported)
  end

  it 'extracts the description out of the page' do
    extractor = extractor_for('https://github.com/der-flo/dotfiles')
    extractor.description.should == 'dotfiles - My config files'
  end
end
33
+
@@ -0,0 +1,35 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "url_info_extractor/version"
4
+
5
# Gem specification for url_info_extractor.
Gem::Specification.new do |s|
  s.name        = "url_info_extractor"
  s.version     = UrlInfoExtractor::VERSION
  s.authors     = ["Florian Dütsch"]
  s.email       = ["mail@florian-duetsch.de"]
  # Project page and license as stated in the README.
  s.homepage    = "https://github.com/der-flo/url_info_extractor"
  s.licenses    = ["MIT"]
  s.summary     = %q{Extract metadata out of a URL}
  s.description = %q{Extract title, meta-description and favicon out of a URL}

  s.rubyforge_project = "url_info_extractor"

  # The packaged file lists are taken from what git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map do |f|
    File.basename(f)
  end
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
  s.add_development_dependency 'guard-rspec'
  s.add_development_dependency 'vcr', '2.0.0.rc1'
  s.add_development_dependency 'webmock'
  # NOTE(review): this condition depends on the platform evaluating the
  # gemspec, so the resulting dependency list differs between machines -
  # consider moving these into a Gemfile group.
  if RUBY_PLATFORM =~ /darwin/i
    s.add_development_dependency 'rb-fsevent'
    s.add_development_dependency 'growl'
  end

  s.add_runtime_dependency 'nokogiri', '~> 1.5'
  s.add_runtime_dependency 'activesupport', '>= 3'
end
35
+
metadata ADDED
@@ -0,0 +1,150 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_info_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Florian Dütsch
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &70293470681620 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70293470681620
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard-rspec
27
+ requirement: &70293470681200 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70293470681200
36
+ - !ruby/object:Gem::Dependency
37
+ name: vcr
38
+ requirement: &70293470680700 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - =
42
+ - !ruby/object:Gem::Version
43
+ version: 2.0.0.rc1
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70293470680700
47
+ - !ruby/object:Gem::Dependency
48
+ name: webmock
49
+ requirement: &70293469848420 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :development
56
+ prerelease: false
57
+ version_requirements: *70293469848420
58
+ - !ruby/object:Gem::Dependency
59
+ name: rb-fsevent
60
+ requirement: &70293469846880 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70293469846880
69
+ - !ruby/object:Gem::Dependency
70
+ name: growl
71
+ requirement: &70293469844900 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70293469844900
80
+ - !ruby/object:Gem::Dependency
81
+ name: nokogiri
82
+ requirement: &70293469843840 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ~>
86
+ - !ruby/object:Gem::Version
87
+ version: '1.5'
88
+ type: :runtime
89
+ prerelease: false
90
+ version_requirements: *70293469843840
91
+ - !ruby/object:Gem::Dependency
92
+ name: activesupport
93
+ requirement: &70293469842780 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ! '>='
97
+ - !ruby/object:Gem::Version
98
+ version: '3'
99
+ type: :runtime
100
+ prerelease: false
101
+ version_requirements: *70293469842780
102
+ description: Extract title, meta-description and favicon out of a URL
103
+ email:
104
+ - mail@florian-duetsch.de
105
+ executables: []
106
+ extensions: []
107
+ extra_rdoc_files: []
108
+ files:
109
+ - .gitignore
110
+ - Gemfile
111
+ - Guardfile
112
+ - LICENSE
113
+ - README.md
114
+ - Rakefile
115
+ - TODO
116
+ - lib/url_info_extractor.rb
117
+ - lib/url_info_extractor/favicon.rb
118
+ - lib/url_info_extractor/version.rb
119
+ - spec/spec_helper.rb
120
+ - spec/url_info_extractor/favicon_spec.rb
121
+ - spec/url_info_extractor_spec.rb
122
+ - url_info_extractor.gemspec
123
+ homepage: ''
124
+ licenses: []
125
+ post_install_message:
126
+ rdoc_options: []
127
+ require_paths:
128
+ - lib
129
+ required_ruby_version: !ruby/object:Gem::Requirement
130
+ none: false
131
+ requirements:
132
+ - - ! '>='
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ required_rubygems_version: !ruby/object:Gem::Requirement
136
+ none: false
137
+ requirements:
138
+ - - ! '>='
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubyforge_project: url_info_extractor
143
+ rubygems_version: 1.8.10
144
+ signing_key:
145
+ specification_version: 3
146
+ summary: Extract metadata out of a URL
147
+ test_files:
148
+ - spec/spec_helper.rb
149
+ - spec/url_info_extractor/favicon_spec.rb
150
+ - spec/url_info_extractor_spec.rb