webinspector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e032ae52ae21e3e3b5e6797e51e5f90c6682035e
4
+ data.tar.gz: 98407e3d9ef2b8f469a4c7886c86166e74e0fbf8
5
+ SHA512:
6
+ metadata.gz: 4499172570068e8a4ea5b747093d5ea8a2cb49f87745af6bc2884a39d4658cbfc7d27e110a5a794e97514caf7474265ef6b7ab6e8d513619c9180c0947ee997c
7
+ data.tar.gz: 56de25e0dacd21ec19a59c513076980d56f1bbe02a4290b1959e0cb3dddd3ea09f0f286b15d9f11c0130c2a378fff859f512a37c35f26a290eba3667e5ca8e3d
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in webinspector.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 Davide Santangelo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # Webinspector
2
+
3
+ Ruby gem to completely inspect a web page. It scrapes a given URL and returns its title, description, meta, links, images and more.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'webinspector'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install webinspector
20
+
21
+ ## Usage
22
+
23
+ Initialize a WebInspector instance for a URL, like this:
24
+
25
+ ```ruby
26
+ page = WebInspector.new('http://davidesantangelo.com')
27
+ ```
28
+
29
+ ## Accessing response status and headers
30
+
31
+ You can check the status and headers from the response like this:
32
+
33
+ ```ruby
34
+ page.response.status # 200
35
+ page.response.headers # { "server"=>"apache", "content-type"=>"text/html; charset=utf-8", "cache-control"=>"must-revalidate, private, max-age=0", ... }
36
+ ```
37
+
38
+ ## Accessing inspected data
39
+
40
+ You can see the data like this:
41
+
42
+ ```ruby
43
+ page.url # URL of the page
44
+ page.scheme # Scheme of the page (http, https)
45
+ page.host # Hostname of the page (like, davidesantangelo.com, without the scheme)
46
+ page.port # Port of the page
47
+ page.title # title of the page from the head section, as string
48
+ page.description # description of the page
49
+ page.links # every link found
50
+ page.images # every image found
51
+ page.meta # metatags of the page
52
+ ```
53
+
54
+ ## License
55
+ The webinspector gem is released under the MIT License.
56
+
57
+ ## Contributing
58
+
59
+ 1. Fork it ( https://github.com/[my-github-username]/webinspector/fork )
60
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
61
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
62
+ 4. Push to the branch (`git push origin my-new-feature`)
63
+ 5. Create a new Pull Request
64
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "webinspector"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,10 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/page'))
2
+ require File.expand_path(File.join(File.dirname(__FILE__), 'web_inspector/version'))
3
+
4
module WebInspector
  extend self

  # Shortcut that lets callers write +WebInspector.new(url)+ instead of
  # instantiating the page class themselves.
  #
  # @param url [String] address of the page to inspect
  # @param options [Hash] handed to the page constructor unchanged
  # @return [WebInspector::Page]
  def new(url, options = {})
    WebInspector::Page.new(url, options)
  end
end
@@ -0,0 +1,46 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'meta'))
2
+
3
module WebInspector
  # Pulls the title, description, meta tags, links and images out of an
  # already-parsed document.
  class Inspector
    # A base URL is only usable for URI.join when it carries a scheme.
    ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.\-]*://}i

    # Merged meta tags, as returned by WebInspector::Meta#meta.
    attr_reader :meta

    # @param page [#css, #search] parsed document (a Nokogiri document in practice)
    # @param url [String, nil] base URL used to make relative links absolute.
    #   When omitted, the document's own +url+ is used if it exposes an
    #   absolute one; otherwise links and images are returned as written.
    def initialize(page, url = nil)
      @page = page
      @url  = url || document_url(page)
      @meta = WebInspector::Meta.new(page).meta
    end

    # @return [String, nil] stripped text of the <title> element, or nil when
    #   it cannot be read
    def title
      @page.css('title').inner_text.strip
    rescue StandardError
      nil
    end

    # @return [String] the "description" meta tag, falling back to the first
    #   long paragraph of the page (or '' when there is none)
    def description
      @meta['description'] || snippet
    end

    # @return [Array<String>] href of every <a> element that has one
    def links
      references('a', :href)
    end

    # @return [Array<String>] src of every <img> element that has one
    def images
      references('img', :src)
    end

    private

    # Collects +attribute+ from every node matching +selector+, skipping nodes
    # without it.
    def references(selector, attribute)
      @page.css(selector).each_with_object([]) do |node, found|
        target = node && node[attribute]
        found << absolutize(target) if target
      end
    end

    # Resolves +reference+ against the base URL. The reference is returned
    # untouched when no base is known, when it already starts with the base,
    # or when it is not a parseable URI (a broken href must not abort the
    # whole extraction).
    def absolutize(reference)
      base = @url.to_s
      return reference if base.empty? || reference.to_s.start_with?(base)

      URI.join(base, reference.to_s).to_s
    rescue URI::Error
      reference
    end

    # URL recorded on the parsed document, but only when it is absolute:
    # a document parsed from a local file may report a filesystem path here,
    # which is useless as a base for joining links.
    def document_url(page)
      candidate = page.respond_to?(:url) ? page.url.to_s : ''
      candidate =~ ABSOLUTE_URL ? candidate : nil
    end

    # Text of the first paragraph that is at least 120 characters long.
    def snippet
      first_long_paragraph = @page.search('//p[string-length() >= 120]').first
      first_long_paragraph ? first_long_paragraph.text : ''
    end
  end
end
@@ -0,0 +1,67 @@
1
module WebInspector
  # Reads the <meta> tags of a parsed document.
  class Meta
    # @param page [#css] parsed document (a Nokogiri document in practice)
    def initialize(page)
      @page = page
    end

    # Every meta tag grouped by the attribute that names it.
    #
    # @return [Hash] 'name', 'http-equiv' and 'property' each map a lowercased
    #   tag name to the array of its content values; 'charset' is a
    #   one-element array holding the <meta charset> value (or nil)
    def meta_tags
      {
        'name'       => meta_tags_by('name'),
        'http-equiv' => meta_tags_by('http-equiv'),
        'property'   => meta_tags_by('property'),
        'charset'    => [charset_from_meta_charset]
      }
    end

    # Same shape as #meta_tags, with every array reduced to its first element.
    def meta_tag
      convert_each_array_to_first_element_on meta_tags
    end

    # Flat hash of all meta tags. On a name clash 'property' wins over
    # 'http-equiv', which wins over 'name'; 'charset' is always present.
    def meta
      # Build the table once: each call to #meta_tag re-queries the document.
      tags = meta_tag

      tags['name']
        .merge(tags['http-equiv'])
        .merge(tags['property'])
        .merge('charset' => tags['charset'])
    end

    # @return [String, nil] charset from <meta charset>, else from the
    #   Content-Type http-equiv tag
    def charset
      @charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
    end

    private

    def charset_from_meta_charset
      attribute_value(@page.css('meta[charset]').first, 'charset')
    end

    # Takes the part after '=' in the second ';'-separated segment of the
    # content value, e.g. "text/html; charset=utf-8" gives "utf-8".
    def charset_from_meta_content_type
      content = attribute_value(@page.css("meta[http-equiv='Content-Type']").first, 'content')
      parameter = content && content.split(';')[1]
      parameter && parameter.split('=')[1]
    end

    # Groups content values under the lowercased value of +attribute+; tags
    # missing either the attribute or a content are skipped.
    def meta_tags_by(attribute)
      @page.css("meta[@#{attribute}]").each_with_object({}) do |tag, grouped|
        name    = attribute_value(tag, attribute)
        content = attribute_value(tag, 'content')
        next unless name && content

        (grouped[name.downcase] ||= []) << content
      end
    end

    # Value of the named attribute, or nil when the tag or attribute is absent.
    def attribute_value(tag, name)
      attribute = tag && tag.attributes[name]
      attribute && attribute.value
    end

    # Returns a copy of +hash+ in which arrays are replaced by their first
    # element, recursing into nested hashes.
    def convert_each_array_to_first_element_on(hash)
      hash.each_with_object({}) do |(key, value), converted|
        converted[key] = case value
                         when Hash  then convert_each_array_to_first_element_on(value)
                         when Array then value.first
                         else value
                         end
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+ require 'addressable/uri'
4
+ require 'open-uri'
5
+ require 'open_uri_redirections'
6
+ require 'faraday'
7
+
8
+ require File.expand_path(File.join(File.dirname(__FILE__), 'inspector'))
9
+
10
module WebInspector
  # A web page identified by its URL. The document is downloaded and parsed
  # when the object is built; the HTTP response (status and headers) is
  # fetched separately, on first call to #response.
  class Page
    # Kept for backward compatibility; nothing in this class assigns it.
    attr_reader :size

    # @param url [String] address of the page
    # @param options [Hash] optional request settings, all nil by default:
    #   :retries, :allow_redirections, :headers, :connection_timeout,
    #   :read_timeout.
    #   NOTE(review): the key names mirror the instance variables the request
    #   code already read but never set; confirm they match the intended API.
    def initialize(url, options = {})
      @url                = url
      @options            = options
      @exception_log      = []
      @retries            = options[:retries]
      @allow_redirections = options[:allow_redirections]
      @headers            = options[:headers]
      @connection_timeout = options[:connection_timeout]
      @read_timeout       = options[:read_timeout]
      @inspector          = WebInspector::Inspector.new(page)
    end

    def title
      @inspector.title
    end

    def description
      @inspector.description
    end

    def links
      @inspector.links
    end

    def images
      @inspector.images
    end

    def meta
      @inspector.meta
    end

    # @return [String] the normalized URL
    def url
      normalized_uri
    end

    def host
      uri.host
    end

    def scheme
      uri.scheme
    end

    # Parsed with the standard-library URI so that the scheme's default port
    # is reported when the URL does not spell one out.
    def port
      URI(normalized_uri).port
    end

    # @return [Hash] everything known about the page. The 'response' entry
    #   holds nil values when the response could not be fetched.
    def to_hash
      reply = response

      {
        'url'         => url,
        'scheme'      => scheme,
        'host'        => host,
        'port'        => port,
        'title'       => title,
        'description' => description,
        'meta'        => meta,
        'links'       => links,
        'images'      => images,
        'response'    => { 'status'  => reply && reply.status,
                           'headers' => reply && reply.headers }
      }
    end

    # Performs the HTTP request on first call and memoizes the result.
    #
    # @return [Object, nil] the response, or nil when the request failed with
    #   one of the rescued errors (which is then appended to #exception_log)
    def response
      @response ||= fetch
    rescue Faraday::TimeoutError, Faraday::ConnectionFailed, RuntimeError, URI::InvalidURIError => e
      exception_log << e
      nil
    end

    # @return [Array<Exception>] errors swallowed by #response, oldest first
    def exception_log
      @exception_log ||= []
    end

    private

    def fetch
      session = Faraday.new(:url => url) do |faraday|
        faraday.request :retry, max: @retries

        # These middlewares are not loaded by this file; a caller enabling
        # :allow_redirections must have required the gems providing them.
        if @allow_redirections
          faraday.use FaradayMiddleware::FollowRedirects, limit: 10
          faraday.use :cookie_jar
        end

        faraday.headers.merge!(@headers || {})
        faraday.adapter :net_http
      end

      response = session.get do |req|
        # Faraday's +timeout+ bounds the read, +open_timeout+ the connection.
        req.options.timeout      = @read_timeout
        req.options.open_timeout = @connection_timeout
      end

      # Remember where the request actually ended up.
      @url = response.env.url.to_s

      response
    end

    def uri
      Addressable::URI.parse(@url)
    end

    def normalized_uri
      uri.normalize.to_s
    end

    def default_user_agent
      "WebInspector/#{WebInspector::VERSION} (+https://github.com/davidesantangelo/webinspector)"
    end

    # Downloads and parses the document. OpenURI.open_uri is called directly
    # because Kernel#open stopped opening URLs in Ruby 3.0; the block form
    # closes the handle once parsing is done. The URL is recorded on the
    # parsed document.
    def page
      address = normalized_uri

      OpenURI.open_uri(address, :allow_redirections => :safe) do |io|
        Nokogiri::HTML(io, address)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module WebInspector
  # Gem version. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), './web_inspector'))
@@ -0,0 +1,37 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require File.expand_path('../lib/web_inspector/version', __FILE__)
5
+
6
# Package definition for the webinspector gem.
Gem::Specification.new do |spec|
  spec.name          = "webinspector"
  spec.version       = WebInspector::VERSION
  spec.authors       = ["Davide Santangelo"]
  spec.email         = ["davide.santangelo@gmail.com"]

  spec.summary       = %q{Ruby gem to inspect completely a web page.}
  spec.description   = %q{Ruby gem to inspect completely a web page. It scrapes a given URL, and returns you its meta, links, images and more.}
  # Same project URL the library advertises in its user agent string.
  spec.homepage      = "https://github.com/davidesantangelo/webinspector"
  spec.license       = "MIT"

  # Ship every tracked file except the test suites.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.8"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_development_dependency 'rspec'
  spec.add_development_dependency "vcr"
  spec.add_development_dependency "typhoeus"

  spec.required_ruby_version = ">= 1.9.3"

  spec.add_dependency "faraday"
  spec.add_dependency "addressable"
  spec.add_dependency "nokogiri"
  spec.add_dependency "open_uri_redirections"
  # NOTE(review): no file under lib/ requires "json" or "openurl" (the library
  # loads the standard-library "open-uri"); confirm whether these two runtime
  # dependencies are needed before the next release.
  spec.add_dependency "json"
  spec.add_dependency "openurl"
end
metadata ADDED
@@ -0,0 +1,215 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: webinspector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Davide Santangelo
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-04-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: vcr
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: typhoeus
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: faraday
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: json
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: addressable
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: nokogiri
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: open_uri_redirections
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: openurl
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: Ruby gem to inspect completely a web page. It scrapes a given URL, and
168
+ returns you its meta, links, images and more.
169
+ email:
170
+ - davide.santangelo@gmail.com
171
+ executables: []
172
+ extensions: []
173
+ extra_rdoc_files: []
174
+ files:
175
+ - ".gitignore"
176
+ - ".rspec"
177
+ - ".travis.yml"
178
+ - Gemfile
179
+ - LICENSE
180
+ - README.md
181
+ - Rakefile
182
+ - bin/console
183
+ - bin/setup
184
+ - lib/web_inspector.rb
185
+ - lib/web_inspector/inspector.rb
186
+ - lib/web_inspector/meta.rb
187
+ - lib/web_inspector/page.rb
188
+ - lib/web_inspector/version.rb
189
+ - lib/webinspector.rb
190
+ - webinspector.gemspec
191
+ homepage: ''
192
+ licenses:
193
+ - MIT
194
+ metadata: {}
195
+ post_install_message:
196
+ rdoc_options: []
197
+ require_paths:
198
+ - lib
199
+ required_ruby_version: !ruby/object:Gem::Requirement
200
+ requirements:
201
+ - - ">="
202
+ - !ruby/object:Gem::Version
203
+ version: 1.9.3
204
+ required_rubygems_version: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ requirements: []
210
+ rubyforge_project:
211
+ rubygems_version: 2.4.6
212
+ signing_key:
213
+ specification_version: 4
214
+ summary: Ruby gem to inspect completely a web page.
215
+ test_files: []