webinspector 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e032ae52ae21e3e3b5e6797e51e5f90c6682035e
4
+ data.tar.gz: 98407e3d9ef2b8f469a4c7886c86166e74e0fbf8
5
+ SHA512:
6
+ metadata.gz: 4499172570068e8a4ea5b747093d5ea8a2cb49f87745af6bc2884a39d4658cbfc7d27e110a5a794e97514caf7474265ef6b7ab6e8d513619c9180c0947ee997c
7
+ data.tar.gz: 56de25e0dacd21ec19a59c513076980d56f1bbe02a4290b1959e0cb3dddd3ea09f0f286b15d9f11c0130c2a378fff859f512a37c35f26a290eba3667e5ca8e3d
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in webinspector.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Davide Santangelo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # Webinspector
2
+
3
+ Ruby gem to completely inspect a web page. It scrapes a given URL and returns its title, description, meta tags, links, images and more.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'webinspector'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install webinspector
20
+
21
+ ## Usage
22
+
23
+ Initialize a WebInspector instance for a URL, like this:
24
+
25
+ ```ruby
26
+ page = WebInspector.new('http://davidesantangelo.com')
27
+ ```
28
+
29
+ ## Accessing response status and headers
30
+
31
+ You can check the status and headers from the response like this:
32
+
33
+ ```ruby
34
+ page.response.status # 200
35
+ page.response.headers # { "server"=>"apache", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
36
+ ```
37
+
38
+ ## Accessing inspected data
39
+
40
+ You can see the data like this:
41
+
42
+ ```ruby
43
+ page.url # URL of the page
44
+ page.scheme # Scheme of the page (http, https)
45
+ page.host # Hostname of the page (like, davidesantangelo.com, without the scheme)
46
+ page.port # Port of the page
47
+ page.title # title of the page from the head section, as string
48
+ page.description # description of the page
49
+ page.links # every link found
50
+ page.images # every image found
51
+ page.meta # metatags of the page
52
+ ```
53
+
54
+ ## License
55
+ The webinspector gem is released under the MIT License.
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it ( https://github.com/davidesantangelo/webinspector/fork )
60
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
61
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
62
+ 4. Push to the branch (`git push origin my-new-feature`)
63
+ 5. Create a new Pull Request
64
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "webinspector"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,10 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/page'))
2
+ require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/version'))
3
+
4
# Entry point of the gem. The module extends itself, so +new+ can be called
# directly on it: <tt>WebInspector.new(url)</tt>.
module WebInspector
  extend self

  # Builds a page for +url+.
  #
  # @param url [String] address of the page to inspect
  # @param options [Hash] forwarded untouched to Page.new
  # @return [WebInspector::Page]
  def new(url, options = {})
    WebInspector::Page.new(url, options)
  end
end
@@ -0,0 +1,46 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'meta'))
2
+
3
module WebInspector
  # Extracts title, description, meta tags, links and images from a parsed
  # HTML document.
  #
  # The document must respond to +css+ and +search+ the way the calls below
  # use them (the rest of this gem passes a Nokogiri document).
  class Inspector
    # @param page [#css, #search] parsed HTML document
    # @param url [String, nil] base URL used to resolve relative links. When
    #   omitted, the document's own +url+ is used if it exposes one. Without
    #   any base URL, links and images are returned exactly as written in the
    #   markup.
    def initialize(page, url = nil)
      @page = page
      @url = url || (page.respond_to?(:url) ? page.url : nil)
      @meta = WebInspector::Meta.new(page).meta
    end

    # @return [String, nil] stripped text of the <title> element(s), or nil
    #   when the document cannot provide it
    def title
      @page.css('title').inner_text.strip
    rescue NoMethodError
      # Best effort: a document without the expected node-set API has no title.
      nil
    end

    # @return [String] the "description" meta tag, falling back to the first
    #   long paragraph of the page ('' when there is none)
    def description
      @meta['description'] || snippet
    end

    # @return [Hash] flattened meta tags as computed by WebInspector::Meta
    def meta
      @meta
    end

    # @return [Array<String>] the href of every <a> element that has one
    def links
      collect_urls('a', :href)
    end

    # @return [Array<String>] the src of every <img> element that has one
    def images
      collect_urls('img', :src)
    end

    private

    # Collects +attribute+ from every node matching +selector+, skipping nodes
    # that do not carry the attribute.
    def collect_urls(selector, attribute)
      @page.css(selector).each_with_object([]) do |node, urls|
        raw = node && node[attribute]
        urls << absolutize(raw) if raw
      end
    end

    # Resolves +raw+ against the base URL. Values that already start with the
    # base URL, or that cannot be parsed as a URI, are returned unchanged so a
    # single malformed attribute does not abort the whole collection.
    def absolutize(raw)
      value = raw.to_s
      base = @url.to_s
      return value if base.empty? || value.start_with?(base)

      URI.join(base, value).to_s
    rescue URI::Error
      value
    end

    # Text of the first <p> whose content is at least 120 characters long.
    def snippet
      first_long_paragraph = @page.search('//p[string-length() >= 120]').first
      first_long_paragraph ? first_long_paragraph.text : ''
    end
  end
end
@@ -0,0 +1,67 @@
1
module WebInspector
  # Reads <meta> tag information out of a parsed HTML document.
  #
  # The document must respond to +css(selector)+ and return an enumerable of
  # nodes whose +attributes+ hash maps attribute names to objects responding
  # to +value+.
  class Meta
    # Attributes of a <meta> tag that can name the metadata it carries.
    NAMING_ATTRIBUTES = %w[name http-equiv property].freeze

    # @param page [#css] parsed HTML document
    def initialize(page)
      @page = page
    end

    # Every meta tag, grouped by the attribute that names it.
    #
    # @return [Hash{String => Hash{String => Array<String>}, Array}] e.g.
    #   <tt>{ 'name' => { 'description' => ['...'] }, ..., 'charset' => ['utf-8'] }</tt>.
    #   Names are downcased; contents keep document order.
    def meta_tags
      tags = {}
      NAMING_ATTRIBUTES.each { |attribute| tags[attribute] = meta_tags_by(attribute) }
      tags['charset'] = [charset_from_meta_charset]
      tags
    end

    # Same structure as #meta_tags, keeping only the first content per name.
    #
    # @return [Hash]
    def meta_tag
      convert_each_array_to_first_element_on meta_tags
    end

    # Flat name => content view of the document's meta tags. On a name clash,
    # 'property' wins over 'http-equiv', which wins over 'name'.
    #
    # @return [Hash{String => String, nil}]
    def meta
      # Computed once: every call to #meta_tag queries the document again.
      tags = meta_tag
      NAMING_ATTRIBUTES
        .inject({}) { |merged, attribute| merged.merge(tags[attribute]) }
        .merge('charset' => tags['charset'])
    end

    # @return [String, nil] charset from <meta charset>, falling back to the
    #   Content-Type http-equiv tag; memoized once a value is found
    def charset
      @charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
    end

    private

    # Value of the first <meta charset="..."> tag, or nil.
    def charset_from_meta_charset
      tag = @page.css('meta[charset]').first
      tag && attribute_value(tag, 'charset')
    end

    # Charset taken from the second ';'-separated part of the Content-Type
    # tag's content (e.g. "text/html; charset=utf-8"), or nil.
    def charset_from_meta_content_type
      tag = @page.css("meta[http-equiv='Content-Type']").first
      content = tag && attribute_value(tag, 'content')
      return nil unless content

      parameter = content.split(';')[1]
      value = parameter && parameter.split('=')[1]
      value && value.strip
    end

    # Maps each downcased value of +attribute+ to the contents of the tags
    # carrying it. Tags lacking either the attribute or a content are skipped.
    def meta_tags_by(attribute)
      @page.css("meta[@#{attribute}]").each_with_object({}) do |tag, found|
        name = attribute_value(tag, attribute)
        content = attribute_value(tag, 'content')
        next unless name && content

        (found[name.downcase] ||= []) << content
      end
    end

    # Value of the named attribute on +tag+, or nil when it is absent.
    def attribute_value(tag, name)
      attribute = tag.attributes[name]
      attribute && attribute.value
    end

    # Returns a copy of +hash+ in which every Array is replaced by its first
    # element, recursing into nested hashes. The argument is left untouched.
    def convert_each_array_to_first_element_on(hash)
      hash.each_with_object({}) do |(key, value), converted|
        converted[key] = case value
                         when Hash then convert_each_array_to_first_element_on(value)
                         when Array then value.first
                         else value
                         end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+ require 'addressable/uri'
4
+ require 'open-uri'
5
+ require 'open_uri_redirections'
6
+ require 'faraday'
7
+
8
+ require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
9
+
10
module WebInspector
  # A web page identified by its URL.
  #
  # The HTML is downloaded and parsed when the object is created; the HTTP
  # response (status and headers) is fetched separately, on first access to
  # #response.
  class Page
    # Kept for backward compatibility; nothing in this class assigns it, so it
    # returns nil.
    attr_reader :size

    # @param url [String] address of the page
    # @param options [Hash] optional settings for the request made by #response:
    #   :retries            - maximum number of retries (Faraday's default when nil)
    #   :allow_redirections - follow redirects and keep cookies. The
    #                         FaradayMiddleware::FollowRedirects and :cookie_jar
    #                         middlewares are not required by this file and must
    #                         already be loaded by the caller.
    #   :headers            - Hash of extra request headers
    #   :connection_timeout - seconds allowed to open the connection
    #   :read_timeout       - seconds allowed for the whole request
    def initialize(url, options = {})
      @url = url
      @options = options || {}
      @retries = @options[:retries]
      @allow_redirections = @options[:allow_redirections]
      @headers = @options[:headers]
      @connection_timeout = @options[:connection_timeout]
      @read_timeout = @options[:read_timeout]
      @exception_log = []
      @inspector = WebInspector::Inspector.new(page)
    end

    # @return [String, nil] see Inspector#title
    def title
      @inspector.title
    end

    # @return [String] see Inspector#description
    def description
      @inspector.description
    end

    # @return [Array<String>] see Inspector#links
    def links
      @inspector.links
    end

    # @return [Array<String>] see Inspector#images
    def images
      @inspector.images
    end

    # @return [Hash] see Inspector#meta
    def meta
      @inspector.meta
    end

    # @return [String] normalized URL of the page. After a successful
    #   #response this is the URL the response was finally served from.
    def url
      normalized_uri
    end

    # @return [String, nil] host part of the URL
    def host
      uri.host
    end

    # @return [String, nil] scheme part of the URL
    def scheme
      uri.scheme
    end

    # @return [Integer] port of the normalized URL as reported by Ruby's URI
    def port
      URI(normalized_uri).port
    end

    # @return [Array<StandardError>] errors swallowed by #response so far
    def exception_log
      @exception_log ||= []
    end

    # @return [Hash] everything known about the page. The 'response' entry
    #   holds nil values when the response could not be fetched.
    def to_hash
      current = response
      {
        'url' => url,
        'scheme' => scheme,
        'host' => host,
        'port' => port,
        'title' => title,
        'description' => description,
        'meta' => meta,
        'links' => links,
        'images' => images,
        'response' => { 'status' => current && current.status,
                        'headers' => current && current.headers }
      }
    end

    # HTTP response for the page, fetched on first access and then memoized.
    #
    # @return [Object, nil] the Faraday response, or nil when the request
    #   failed with one of the rescued errors (recorded in #exception_log)
    def response
      @response ||= fetch
    rescue Faraday::TimeoutError, Faraday::ConnectionFailed, RuntimeError, URI::InvalidURIError => e
      exception_log << e
      nil
    end

    private

    # Performs the GET request and records the URL it was answered from.
    def fetch
      session = Faraday.new(:url => url) do |faraday|
        if @retries
          faraday.request :retry, max: @retries
        else
          faraday.request :retry
        end

        if @allow_redirections
          faraday.use FaradayMiddleware::FollowRedirects, limit: 10
          faraday.use :cookie_jar
        end

        faraday.headers.merge!(@headers || {})
        faraday.adapter :net_http
      end

      result = session.get do |req|
        # timeout bounds the whole request, open_timeout only the connection.
        req.options.timeout = @read_timeout
        req.options.open_timeout = @connection_timeout
      end

      # Remember where the request ended up (relevant after redirects).
      @url = result.env.url.to_s

      result
    end

    def uri
      Addressable::URI.parse(@url)
    end

    def normalized_uri
      uri.normalize.to_s
    end

    # User-Agent string identifying this gem. Not sent automatically: pass it
    # through options[:headers] if wanted.
    def default_user_agent
      "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)"
    end

    # Downloads and parses the page. OpenURI.open_uri is called directly
    # instead of Kernel#open, which treats strings starting with "|" as shell
    # commands and no longer opens URLs on Ruby 3+.
    def page
      io = OpenURI.open_uri(normalized_uri, :allow_redirections => :safe)
      begin
        # Hand the final URL to the parser so the document knows its own address.
        base = io.respond_to?(:base_uri) && io.base_uri ? io.base_uri.to_s : normalized_uri
        Nokogiri::HTML(io, base)
      ensure
        io.close if io.respond_to?(:close)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module WebInspector
  # Version of the gem, read by the gemspec and by the default User-Agent string.
  # Frozen so the constant cannot be modified in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require File.expand_path('../lib/web_inspector/version', __FILE__)
5
+
6
# Gem specification for webinspector.
Gem::Specification.new do |spec|
  spec.name          = "webinspector"
  spec.version       = WebInspector::VERSION
  spec.authors       = ["Davide Santangelo"]
  spec.email         = ["davide.santangelo@gmail.com"]

  spec.summary       = %q{Ruby gem to inspect completely a web page.}
  spec.description   = %q{Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.}
  # Same address the library advertises in its default User-Agent string.
  spec.homepage      = "https://github.com/davidesantangelo/webinspector"
  spec.license       = "MIT"

  # Ship every tracked file except tests.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.8"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_development_dependency "rspec"
  spec.add_development_dependency "vcr"
  spec.add_development_dependency "typhoeus"

  spec.required_ruby_version = ">= 1.9.3"

  # Runtime dependencies. The library requires 'open-uri', which ships with
  # Ruby; the previously declared "openurl" gem is not required by any file
  # under lib/ and has been dropped.
  spec.add_dependency "faraday"
  # NOTE(review): no file under lib/ requires 'json' either - confirm before removing.
  spec.add_dependency "json"
  spec.add_dependency "addressable"
  spec.add_dependency "nokogiri"
  spec.add_dependency "open_uri_redirections"
end
metadata ADDED
@@ -0,0 +1,215 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webinspector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Davide Santangelo
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: vcr
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: typhoeus
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: faraday
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: json
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: addressable
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: open_uri_redirections
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: openurl
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
168
+ returns you its meta, links, images and more.
169
+ email:
170
+ - davide.santangelo@gmail.com
171
+ executables: []
172
+ extensions: []
173
+ extra_rdoc_files: []
174
+ files:
175
+ - ".gitignore"
176
+ - ".rspec"
177
+ - ".travis.yml"
178
+ - Gemfile
179
+ - LICENSE
180
+ - README.md
181
+ - Rakefile
182
+ - bin/console
183
+ - bin/setup
184
+ - lib/web_inspector.rb
185
+ - lib/web_inspector/inspector.rb
186
+ - lib/web_inspector/meta.rb
187
+ - lib/web_inspector/page.rb
188
+ - lib/web_inspector/version.rb
189
+ - lib/webinspector.rb
190
+ - webinspector.gemspec
191
+ homepage: ''
192
+ licenses:
193
+ - MIT
194
+ metadata: {}
195
+ post_install_message:
196
+ rdoc_options: []
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - ">="
202
+ - !ruby/object:Gem::Version
203
+ version: 1.9.3
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ requirements: []
210
+ rubyforge_project:
211
+ rubygems_version: 2.4.6
212
+ signing_key:
213
+ specification_version: 4
214
+ summary: Ruby gem to inspect completely a web page.
215
+ test_files: []