url_info_extractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/Gemfile +4 -0
- data/Guardfile +6 -0
- data/LICENSE +18 -0
- data/README.md +37 -0
- data/Rakefile +1 -0
- data/TODO +3 -0
- data/lib/url_info_extractor/favicon.rb +16 -0
- data/lib/url_info_extractor/version.rb +3 -0
- data/lib/url_info_extractor.rb +65 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/url_info_extractor/favicon_spec.rb +36 -0
- data/spec/url_info_extractor_spec.rb +33 -0
- data/url_info_extractor.gemspec +35 -0
- metadata +150 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2011, 2012 Florian Dütsch
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
5
|
+
the Software without restriction, including without limitation the rights to
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
7
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
8
|
+
subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
15
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
16
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
UrlInfoExtractor
|
2
|
+
================
|
3
|
+
|
4
|
+
This library helps you extract metadata (title, description, favicon) from the HTML page behind a URL.
|
5
|
+
|
6
|
+
[github.com/der-flo/url_info_extractor](https://github.com/der-flo/url_info_extractor)
|
7
|
+
|
8
|
+
Usage
|
9
|
+
-----
|
10
|
+
% gem install url_info_extractor
|
11
|
+
|
12
|
+
require 'url_info_extractor'
|
13
|
+
infos = UrlInfoExtractor.new('http://rubygems.org')
|
14
|
+
puts infos.title
|
15
|
+
# RubyGems.org | your community gem host
|
16
|
+
puts infos.favicon_url
|
17
|
+
# http://rubygems.org/favicon.ico
|
18
|
+
puts infos.favicon_exists?
|
19
|
+
# true
|
20
|
+
puts infos.description
|
21
|
+
#
|
22
|
+
|
23
|
+
Note that the favicon handling determines the URL by link-rel-icon tag and
|
24
|
+
base tag. `favicon_exists?` does an HTTP HEAD request to check whether the
|
25
|
+
icon is available at the determined path.
|
26
|
+
|
27
|
+
Fork me!
|
28
|
+
--------
|
29
|
+
If you find this library useful but miss some feature, please drop me a note
|
30
|
+
or - even better - fork the project at GitHub. Along with some specs I'll
|
31
|
+
accept your pull requests.
|
32
|
+
|
33
|
+
Author and license
|
34
|
+
------------------
|
35
|
+
* Florian Dütsch (mail AT florian-duetsch.de)
|
36
|
+
* MIT, see LICENSE file.
|
37
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/TODO
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
class UrlInfoExtractor
  # Favicon lookup helpers.
  #
  # The including class must provide +fetched_data+ (a Hash that may carry
  # +:base_link_value+ and +:favicon_link_value+) and an +@uri+ instance
  # variable holding the parsed page URI.
  module Favicon
    # Determines the absolute URL of the page's favicon.
    #
    # The icon href (or "/favicon.ico" when the page declares none) is
    # resolved against the page's <base href> when present, otherwise against
    # the page URL itself.
    #
    # @return [String] absolute favicon URL
    def favicon_url
      base = fetched_data[:base_link_value]
      # A <base href> may itself be relative; resolve it against the page URL
      # first so that merging a relative icon href cannot fail with
      # URI::BadURIError ("both URI are relative").
      uri = base ? @uri.merge(URI.parse(base)) : @uri
      link = fetched_data[:favicon_link_value] || '/favicon.ico'
      uri.merge(URI.parse(link)).to_s
    end

    # Checks via an HTTP HEAD request whether the favicon is retrievable.
    #
    # The request is sent to the host, port and scheme of the favicon URL
    # itself, which may differ from the page's host when a <base> tag or an
    # absolute icon href points elsewhere.
    #
    # @return [Boolean] true if the server answers with a 2xx response;
    #   false otherwise, including when the icon URL is not http/https
    def favicon_exists?
      icon = URI.parse(favicon_url)
      # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
      return false unless icon.is_a?(URI::HTTP)

      request = Net::HTTP::Head.new(icon.request_uri)
      response = Net::HTTP.start(icon.host, icon.port,
                                 use_ssl: icon.scheme == 'https') do |http|
        http.request(request)
      end
      response.is_a?(Net::HTTPSuccess)
    end
  end
end
|
16
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require "url_info_extractor/version"
|
2
|
+
require 'url_info_extractor/favicon'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/core_ext/object/try'
|
6
|
+
require 'active_support/core_ext/string/inflections'
|
7
|
+
require 'net/https'
|
8
|
+
|
9
|
+
# Extracts title, meta description and favicon information from the <head>
# of the HTML document behind an HTTP or HTTPS URL.
#
# The page is requested on the first call to an accessor and the parsed
# result is memoized for subsequent calls.
class UrlInfoExtractor
  include Favicon

  # Raised by the data accessors when the URL scheme is neither http nor https.
  class SchemeNotSupported < StandardError; end

  # Content types whose body is parsed; any other type yields no data.
  PROCESSABLE_MIME_TYPES = %w(application/xhtml+xml text/html).freeze

  # @param url [String] URL of the page to inspect
  def initialize(url)
    @uri = URI.parse(url)
    # URI#path can be nil for opaque URIs (e.g. "mailto:"), hence to_s.
    @path = @uri.path.to_s
    @path = '/' if @path.empty?
    # Keep the query string; otherwise "page?id=1" would be fetched as "page".
    @path = "#{@path}?#{@uri.query}" if @uri.query
  end

  # @return [String, nil] content of the <title> element, if any
  # @raise [SchemeNotSupported] if the URL is not http/https
  def title
    fetched_data[:title]
  end

  # @return [String, nil] content attribute of <meta name="description">
  # @raise [SchemeNotSupported] if the URL is not http/https
  def description
    fetched_data[:description]
  end

  # @return [Boolean] whether the URL scheme is http or https
  def scheme_supported?
    %w(http https).include? @uri.scheme
  end

  private

  # Memoized result of #fetch_data, so the page is requested only once.
  def fetched_data
    @fetched_data ||= fetch_data
  end

  # Requests the page and pulls the interesting values out of its <head>.
  #
  # @return [Hash] with keys :title, :description, :favicon_link_value and
  #   :base_link_value (each possibly nil), or an empty Hash when the
  #   response content type is not listed in PROCESSABLE_MIME_TYPES
  # @raise [SchemeNotSupported] if the URL is not http/https
  def fetch_data
    raise SchemeNotSupported unless scheme_supported?
    response = do_http_request(@path, :get)

    return {} unless PROCESSABLE_MIME_TYPES.include?(response.content_type)

    doc = Nokogiri::HTML(response.body)
    head = doc.css('html head')

    { title: head.css('title').try(:first).try(:content),
      description:
        head.css('meta[name=description]').try(:first).try(:[], 'content'),
      favicon_link_value:
        head.css('link[rel~=icon]').try(:first).try(:[], 'href'),
      base_link_value: head.css('base').try(:first).try(:[], 'href')
    }
  end

  # Performs a single HTTP request against the host of the page URL.
  #
  # @param path [String] request target handed to the Net::HTTP request class
  # @param method [Symbol] HTTP verb, e.g. :get or :head; mapped to the
  #   Net::HTTP request class of the same (camelized) name
  # @return [Net::HTTPResponse]
  def do_http_request(path, method)
    request = "Net::HTTP::#{method.to_s.camelize}".constantize.new(path)
    Net::HTTP.start(@uri.host, @uri.port,
                    use_ssl: @uri.scheme == 'https') do |http|
      http.request request
    end
  end
end
|
65
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'url_info_extractor'
|
2
|
+
require 'vcr'
|
3
|
+
|
4
|
+
# Record HTTP interactions as cassettes under spec/cassettes; requests not
# yet present in a cassette are recorded on the fly (:new_episodes).
VCR.configure do |vcr|
  vcr.hook_into :webmock
  vcr.cassette_library_dir = 'spec/cassettes'
  vcr.default_cassette_options = { record: :new_episodes }
end

# Make the use_vcr_cassette macro available in example groups.
RSpec.configure { |rspec| rspec.extend VCR::RSpec::Macros }
|
13
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for favicon URL resolution and the existence check.
describe UrlInfoExtractor::Favicon do
  use_vcr_cassette

  it "returns favicon of rubygems.org" do
    extractor = UrlInfoExtractor.new('http://rubygems.org')
    extractor.favicon_url.should == 'http://rubygems.org/favicon.ico'
  end

  it 'returns the favicon of der-flo.org' do
    extractor = UrlInfoExtractor.new('http://der-flo.org')
    extractor.favicon_url.should == 'http://der-flo.org/favicon.ico'
  end

  it 'handles pages without an specified favicon' do
    extractor = UrlInfoExtractor.new('https://github.com')
    extractor.favicon_url.should == 'https://github.com/favicon.ico'
  end

  # Both contexts below replay pre-recorded cassettes only (record: :none).
  context 'with non-existing favicon' do
    use_vcr_cassette 'favicon does not exist', record: :none

    it 'checks whether the icon really exists' do
      extractor = UrlInfoExtractor.new('http://www.favicon-missing.com/')
      extractor.favicon_exists?.should be_false
    end
  end

  context 'with <base> sample' do
    use_vcr_cassette 'with base sample', record: :none

    it 'handles pages with a base specification correct' do
      extractor = UrlInfoExtractor.new('http://www.with-base.com/')
      extractor.favicon_url.should == 'http://www2.with-base.com/icon.ico'
    end
  end
end
|
36
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for title/description extraction, MIME type filtering and scheme checks.
describe UrlInfoExtractor do
  use_vcr_cassette

  it 'extracts the title from rubygems.org' do
    extractor = UrlInfoExtractor.new('http://rubygems.org')
    extractor.title.should =~ /rubygems/i
  end

  it 'extracts the title from rubyonrails.org' do
    extractor = UrlInfoExtractor.new('http://rubyonrails.org')
    extractor.title.should == 'Ruby on Rails'
  end

  it 'recognizes HTTPS url' do
    extractor = UrlInfoExtractor.new('https://github.com/')
    extractor.title.should == 'GitHub - Social Coding'
  end

  it 'verifies MIME type' do
    # A PNG is not a processable content type, so no title is extracted.
    extractor = UrlInfoExtractor.new('http://rubyonrails.org/images/rails.png')
    extractor.title.should be_nil
  end

  it 'support only the HTTP and HTTPS protocol' do
    extractor = UrlInfoExtractor.new('ftp://test.com')
    extractor.scheme_supported?.should_not be
    expect { extractor.title }.to raise_error(UrlInfoExtractor::SchemeNotSupported)
  end

  it 'extracts the description out of the page' do
    extractor = UrlInfoExtractor.new('https://github.com/der-flo/dotfiles')
    extractor.description.should == 'dotfiles - My config files'
  end
end
|
33
|
+
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "url_info_extractor/version"
|
4
|
+
|
5
|
+
# Gem specification for url_info_extractor.
Gem::Specification.new do |s|
  s.name        = "url_info_extractor"
  s.version     = UrlInfoExtractor::VERSION
  s.authors     = ["Florian Dütsch"]
  s.email       = ["mail@florian-duetsch.de"]
  # Project page and license as stated in the README; both were previously
  # left empty in the published gem metadata.
  s.homepage    = "https://github.com/der-flo/url_info_extractor"
  s.licenses    = ["MIT"]
  s.summary     = %q{Extract metadata out of a URL}
  s.description = %q{Extract title, meta-description and favicon out of a URL}

  s.rubyforge_project = "url_info_extractor"

  # The file lists are taken from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
  s.add_development_dependency 'guard-rspec'
  s.add_development_dependency 'vcr', '2.0.0.rc1'
  s.add_development_dependency 'webmock'
  # NOTE(review): this condition is evaluated on the machine that builds the
  # gem, so the macOS-only tools end up in the metadata of a gem built there.
  if RUBY_PLATFORM =~ /darwin/i
    s.add_development_dependency 'rb-fsevent'
    s.add_development_dependency 'growl'
  end

  s.add_runtime_dependency 'nokogiri', '~> 1.5'
  s.add_runtime_dependency 'activesupport', '>= 3'
end
|
35
|
+
|
metadata
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_info_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Florian Dütsch
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-27 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &70293470681620 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70293470681620
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard-rspec
|
27
|
+
requirement: &70293470681200 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70293470681200
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: vcr
|
38
|
+
requirement: &70293470680700 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - =
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 2.0.0.rc1
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70293470680700
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: webmock
|
49
|
+
requirement: &70293469848420 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70293469848420
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: rb-fsevent
|
60
|
+
requirement: &70293469846880 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70293469846880
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: growl
|
71
|
+
requirement: &70293469844900 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *70293469844900
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: nokogiri
|
82
|
+
requirement: &70293469843840 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ~>
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '1.5'
|
88
|
+
type: :runtime
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *70293469843840
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: activesupport
|
93
|
+
requirement: &70293469842780 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: '3'
|
99
|
+
type: :runtime
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *70293469842780
|
102
|
+
description: Extract title, meta-description and favicon out of a URL
|
103
|
+
email:
|
104
|
+
- mail@florian-duetsch.de
|
105
|
+
executables: []
|
106
|
+
extensions: []
|
107
|
+
extra_rdoc_files: []
|
108
|
+
files:
|
109
|
+
- .gitignore
|
110
|
+
- Gemfile
|
111
|
+
- Guardfile
|
112
|
+
- LICENSE
|
113
|
+
- README.md
|
114
|
+
- Rakefile
|
115
|
+
- TODO
|
116
|
+
- lib/url_info_extractor.rb
|
117
|
+
- lib/url_info_extractor/favicon.rb
|
118
|
+
- lib/url_info_extractor/version.rb
|
119
|
+
- spec/spec_helper.rb
|
120
|
+
- spec/url_info_extractor/favicon_spec.rb
|
121
|
+
- spec/url_info_extractor_spec.rb
|
122
|
+
- url_info_extractor.gemspec
|
123
|
+
homepage: ''
|
124
|
+
licenses: []
|
125
|
+
post_install_message:
|
126
|
+
rdoc_options: []
|
127
|
+
require_paths:
|
128
|
+
- lib
|
129
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
none: false
|
131
|
+
requirements:
|
132
|
+
- - ! '>='
|
133
|
+
- !ruby/object:Gem::Version
|
134
|
+
version: '0'
|
135
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
136
|
+
none: false
|
137
|
+
requirements:
|
138
|
+
- - ! '>='
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubyforge_project: url_info_extractor
|
143
|
+
rubygems_version: 1.8.10
|
144
|
+
signing_key:
|
145
|
+
specification_version: 3
|
146
|
+
summary: Extract metadata out of a URL
|
147
|
+
test_files:
|
148
|
+
- spec/spec_helper.rb
|
149
|
+
- spec/url_info_extractor/favicon_spec.rb
|
150
|
+
- spec/url_info_extractor_spec.rb
|