webinspector 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Gemfile +2 -0
- data/README.md +52 -28
- data/Rakefile +2 -1
- data/bin/console +4 -3
- data/lib/web_inspector/inspector.rb +187 -95
- data/lib/web_inspector/meta.rb +36 -15
- data/lib/web_inspector/page.rb +142 -62
- data/lib/web_inspector/request.rb +10 -8
- data/lib/web_inspector/version.rb +3 -1
- data/lib/web_inspector.rb +4 -2
- data/lib/webinspector.rb +3 -1
- data/webinspector.gemspec +33 -26
- metadata +103 -60
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 0413d3ff948ab6efff6a1cbe8a7844287149ad06f09353655e6cb208968f9481
|
4
|
+
data.tar.gz: 152b950595afb57adc522da24c6959f71d160ba903b3d01ce6ee5f6a8b4d81d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6230493b59a0d23585be729ec98706cfdbd6852e2de2d65db83d1638f85110369d41f2275b8e0aa09b58008d53924d036840ce63523d9004f19275999be90f8
|
7
|
+
data.tar.gz: dad518b0b04c1e341c14c29438ebcf84f4602bf394254a4c61382ad79ce53dae2ac92f138f331d5bf089d2220011f14b0cde0a79608a91177b2bda2ab1773a96
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
#
|
1
|
+
# WebInspector
|
2
2
|
|
3
|
-
Ruby gem to inspect
|
3
|
+
Ruby gem to inspect web pages. It scrapes a given URL and returns its title, description, meta tags, links, images, and more.
|
4
|
+
|
5
|
+
<a href="https://codeclimate.com/github/davidesantangelo/webinspector"><img src="https://codeclimate.com/github/davidesantangelo/webinspector/badges/gpa.svg" /></a>
|
4
6
|
|
5
|
-
## See it in action!
|
6
7
|
|
7
|
-
You can try WebInspector live at this little demo: [https://scrappet.herokuapp.com](https://scrappet.herokuapp.com)
|
8
8
|
## Installation
|
9
9
|
|
10
10
|
Add this line to your application's Gemfile:
|
@@ -23,50 +23,74 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
## Usage
|
25
25
|
|
26
|
-
Initialize a WebInspector instance
|
26
|
+
### Initialize a WebInspector instance
|
27
27
|
|
28
28
|
```ruby
|
29
|
-
page = WebInspector.new('http://
|
29
|
+
page = WebInspector.new('http://example.com')
|
30
30
|
```
|
31
31
|
|
32
|
-
|
32
|
+
### With options
|
33
|
+
|
34
|
+
```ruby
|
35
|
+
page = WebInspector.new('http://example.com', {
|
36
|
+
timeout: 30, # Request timeout in seconds (default: 30)
|
37
|
+
retries: 3, # Number of retries (default: 3)
|
38
|
+
headers: {'User-Agent': 'Custom UA'} # Custom HTTP headers
|
39
|
+
})
|
40
|
+
```
|
33
41
|
|
34
|
-
|
42
|
+
### Accessing response status and headers
|
35
43
|
|
36
44
|
```ruby
|
37
45
|
page.response.status # 200
|
38
|
-
page.response.headers # { "server"=>"apache", "content-type"=>"text/html; charset=utf-8",
|
46
|
+
page.response.headers # { "server"=>"apache", "content-type"=>"text/html; charset=utf-8", ... }
|
47
|
+
page.status_code # 200
|
48
|
+
page.success? # true if the page was loaded successfully
|
49
|
+
page.error_message # returns the error message if any
|
39
50
|
```
|
40
51
|
|
41
|
-
|
42
|
-
|
43
|
-
You can see the data like this:
|
52
|
+
### Accessing page data
|
44
53
|
|
45
54
|
```ruby
|
46
|
-
page.url
|
47
|
-
page.scheme
|
48
|
-
page.host
|
49
|
-
page.port
|
50
|
-
page.title
|
51
|
-
page.description
|
52
|
-
page.links
|
53
|
-
page.images
|
54
|
-
page.meta
|
55
|
+
page.url # URL of the page
|
56
|
+
page.scheme # Scheme of the page (http, https)
|
57
|
+
page.host                                   # Hostname of the page (e.g. example.com, without the scheme)
|
58
|
+
page.port # Port of the page
|
59
|
+
page.title # title of the page from the head section
|
60
|
+
page.description # description of the page
|
61
|
+
page.links # array of all links found on the page (absolute URLs)
|
62
|
+
page.images # array of all images found on the page (absolute URLs)
|
63
|
+
page.meta # meta tags of the page
|
64
|
+
page.favicon # favicon URL if available
|
55
65
|
```
|
56
66
|
|
57
|
-
|
67
|
+
### Working with meta tags
|
58
68
|
|
59
69
|
```ruby
|
60
|
-
page.meta #
|
70
|
+
page.meta # all meta tags
|
61
71
|
page.meta['description'] # meta description
|
62
72
|
page.meta['keywords'] # meta keywords
|
73
|
+
page.meta['og:title'] # OpenGraph title
|
74
|
+
```
|
75
|
+
|
76
|
+
### Filtering links and images by domain
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
page.domain_links('example.com') # returns only links pointing to example.com
|
80
|
+
page.domain_images('example.com') # returns only images hosted on example.com
|
63
81
|
```
|
64
82
|
|
65
|
-
|
83
|
+
### Searching for words
|
84
|
+
|
66
85
|
```ruby
|
67
|
-
page.find(["
|
86
|
+
page.find(["ruby", "rails"]) # returns [{"ruby"=>3}, {"rails"=>1}]
|
68
87
|
```
|
69
88
|
|
89
|
+
### Export all data as a hash
|
90
|
+
|
91
|
+
```ruby
|
92
|
+
page.to_hash # returns a hash with all page data
|
93
|
+
```
|
70
94
|
|
71
95
|
## Contributors
|
72
96
|
|
@@ -74,13 +98,13 @@ page.find(["word1, word2"]) # return {"word1"=>3, "word2"=>1}
|
|
74
98
|
* Sam Nissen ([@samnissen](https://github.com/samnissen))
|
75
99
|
|
76
100
|
## License
|
77
|
-
|
101
|
+
|
102
|
+
The WebInspector gem is released under the MIT License.
|
78
103
|
|
79
104
|
## Contributing
|
80
105
|
|
81
|
-
1. Fork it ( https://github.com/
|
106
|
+
1. Fork it ( https://github.com/davidesantangelo/webinspector/fork )
|
82
107
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
83
108
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
84
109
|
4. Push to the branch (`git push origin my-new-feature`)
|
85
110
|
5. Create a new Pull Request
|
86
|
-
>>>>>>> develop
|
data/Rakefile
CHANGED
data/bin/console
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
|
-
require
|
4
|
-
require
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'webinspector'
|
5
6
|
|
6
7
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
8
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +11,5 @@ require "webinspector"
|
|
10
11
|
# require "pry"
|
11
12
|
# Pry.start
|
12
13
|
|
13
|
-
require
|
14
|
+
require 'irb'
|
14
15
|
IRB.start
|
@@ -1,144 +1,236 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require File.expand_path(File.join(File.dirname(__FILE__), 'meta'))
|
2
4
|
|
3
5
|
module WebInspector
|
4
6
|
class Inspector
|
7
|
+
attr_reader :page, :url, :host, :meta
|
5
8
|
|
6
9
|
# Build an inspector around an already-parsed document.
# @param page [Object] parsed document that answers +css+, +at+, +search+ and +at_css+
def initialize(page)
  # Resolved lazily from the document's <base href> the first time a URL is made absolute
  @base_url = nil
  @page = page
  @meta = WebInspector::Meta.new(page).meta
end
|
14
|
+
|
15
|
+
# Record the address the document was fetched from.
# @param url [String] full URL of the page
# @param host [String] host part of that URL
# @return [String] the stored host
def set_url(url, host)
  @url, @host = url, host
  @host
end
|
10
19
|
|
11
20
|
# Text of the document's <title> element, with surrounding whitespace removed.
# @return [String, nil] the title, or nil when it cannot be read
def title
  begin
    title_nodes = @page.css('title')
    title_nodes.inner_text.strip
  rescue StandardError
    # Best effort: any failure while reading the title is reported as "no title"
    nil
  end
end
|
14
25
|
|
15
26
|
# Page description: the meta description, else the OpenGraph description,
# else a snippet taken from the page text.
# @return [Object] first truthy candidate, falling back to +snippet+
def description
  %w[description og:description].each do |key|
    candidate = @meta[key]
    return candidate if candidate
  end

  snippet
end
|
18
29
|
|
19
30
|
# HTML serialization of the document's <body> selection.
# @return [String] markup returned by +to_html+ on the matched nodes
def body
  body_nodes = @page.css('body')
  body_nodes.to_html
end
|
22
33
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
34
|
+
# Search for specific words in the page content (case-insensitive).
# @param words [Array<String>, String] word or list of words to search for
# @return [Array<Hash>] one +{ word => count }+ hash per requested word
def find(words)
  # An empty or malformed document has no <html> root; treat it as empty text
  # instead of raising NoMethodError on nil.
  root = @page.at('html')
  text = root ? root.inner_text.to_s : ''

  # Array() lets callers pass a single word as well as a list
  counter(text.downcase, Array(words))
end
|
31
41
|
|
42
|
+
# Get all links from the page as absolute URLs (memoized).
# Anchors without an href and javascript:, mailto: and tel: links are skipped,
# as are hrefs that cannot be resolved to a valid URI.
# @return [Array<String>] unique URLs in document order
def links
  @links ||= begin
    resolved = @page.css('a').map do |anchor|
      # Strip first so that leading whitespace cannot hide a skipped scheme
      href = anchor[:href].to_s.strip
      next if href.empty?

      # URI schemes are case-insensitive ("JavaScript:" is common in the wild)
      next if href.downcase.start_with?('javascript:', 'mailto:', 'tel:')

      begin
        make_absolute_url(href)
      rescue URI::InvalidURIError, URI::BadURIError
        nil # Skip invalid URLs
      end
    end

    resolved.compact.uniq
  end
end
|
57
|
-
|
58
|
-
|
67
|
+
|
68
|
+
# Get links that point at a specific domain or one of its subdomains.
# A leading "www." is ignored on both sides of the comparison.
# @param user_domain [String, nil] domain to filter links by; falls back to the current host when blank
# @param host [String, nil] current host, used only if no host has been set yet
# @return [Array<String>] links whose host equals the domain or ends with ".<domain>";
#   empty when there is no domain to compare against
def domain_links(user_domain, host = nil)
  @host ||= host

  return [] if links.empty?

  domain = user_domain.to_s
  domain = @host.to_s if domain.empty?
  domain = domain.downcase.gsub(/\s+/, '').sub(/\Awww\./, '')
  return [] if domain.empty?

  links.select do |link|
    link_host = URI.parse(link.to_s).host.to_s.downcase.sub(/\Awww\./, '')
    # Compare on label boundaries: a plain substring test would also accept
    # "notexample.com" or "example.com.evil.org" for "example.com".
    link_host == domain || link_host.end_with?(".#{domain}")
  rescue URI::InvalidURIError
    false
  end
end
|
77
|
-
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
# The PublicSuffix object splits the domain and subdomain
|
99
|
-
# (unlike URI), which allows more liberal URL matching.
|
100
|
-
return PublicSuffix.parse(uri.host)
|
101
|
-
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid => e
|
102
|
-
return false
|
95
|
+
|
96
|
+
# Collect the src of every <img> on the page as an absolute URL (memoized).
# @return [Array<String>] unique image URLs in document order
def images
  return @images if @images

  collected = @page.css('img').each_with_object([]) do |node, acc|
    raw_src = node[:src]
    next unless raw_src

    resolved =
      begin
        # Trim surrounding whitespace before resolving
        make_absolute_url(raw_src.strip)
      rescue URI::InvalidURIError, URI::BadURIError
        nil # Skip invalid URLs
      end

    acc << resolved if resolved
  end

  @images = collected.uniq
end
|
105
118
|
|
106
|
-
|
107
|
-
|
108
|
-
|
119
|
+
# Get images hosted on a specific domain or one of its subdomains.
# A leading "www." is ignored on both sides of the comparison.
# @param user_domain [String, nil] domain to filter images by; falls back to the current host when blank
# @param host [String, nil] current host, used only if no host has been set yet
# @return [Array<String>] image URLs whose host equals the domain or ends with ".<domain>";
#   empty when there is no domain to compare against
def domain_images(user_domain, host = nil)
  @host ||= host

  return [] if images.empty?

  domain = user_domain.to_s
  domain = @host.to_s if domain.empty?
  domain = domain.downcase.gsub(/\s+/, '').sub(/\Awww\./, '')
  return [] if domain.empty?

  images.select do |img|
    img_host = URI.parse(img.to_s).host.to_s.downcase.sub(/\Awww\./, '')
    # Compare on label boundaries: a plain substring test would also accept
    # "notexample.com" or "example.com.evil.org" for "example.com".
    img_host == domain || img_host.end_with?(".#{domain}")
  rescue URI::InvalidURIError
    false
  end
end
|
110
146
|
|
111
147
|
private
|
112
|
-
|
113
|
-
def counter(text, words)
|
114
|
-
results = []
|
115
|
-
hash = Hash.new
|
116
148
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
149
|
+
# Count occurrences of words in text.
# @param text [String] text to search in (expected to be lower-cased already)
# @param words [Array<String>] words to find
# @return [Array<Hash>] one +{ word => count }+ hash per word, in input order
def counter(text, words)
  words.map do |word|
    # Scan with a String pattern so the word is matched literally; interpolating
    # it into a Regexp would treat ".", "+", "(" etc. as regex syntax.
    needle = word.to_s.downcase
    { word => text.scan(needle).size }
  end
end
|
124
158
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
159
|
+
# Validate a URL domain.
# Host-less paths are prefixed with the current host, a scheme is added when
# missing, and the resulting host is handed to PublicSuffix.
# @param u [String] URL to validate
# @return [PublicSuffix::Domain, false] Domain object or false if invalid
def validate_url_domain(u)
  u = u.to_s
  u = '/' if u.empty?

  # A first path segment without ":" or "." cannot be a host, so treat the
  # value as a path on the current host. Interpolation tolerates a nil @host.
  first_segment = u.split('/').first.to_s
  domained_url = first_segment.match?(/[:.]/) ? u : "#{@host}#{u}"

  # Only a real scheme counts; a bare "http" prefix also matches hosts such as
  # "httpbin.org", which would then be parsed without any host at all.
  httpped_url = domained_url.match?(%r{\Ahttps?://}i) ? domained_url : "http://#{domained_url}"

  PublicSuffix.parse(URI.parse(httpped_url).host)
rescue URI::InvalidURIError, PublicSuffix::DomainInvalid
  false
end
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
181
|
+
|
182
|
+
# Make a URL absolute.
# Resolution order: the document's <base href> (itself resolved against the
# page URL when it is relative), then the page URL, then "http://<host>".
# @param url [String] URL to make absolute
# @return [String, nil] absolute URL; nil for blank input; the original value
#   when nothing is known to resolve it against
def make_absolute_url(url)
  return nil if url.nil? || url.empty?

  # If it's already absolute, return it
  return url if url.start_with?('http://', 'https://')

  # Get base URL from the page if not already set
  if @base_url.nil?
    base_tag = @page.at_css('base[href]')
    @base_url = base_tag ? base_tag['href'] : nil
  end

  # A relative <base href="/docs/"> only becomes usable once it has been
  # resolved against the page URL; URI.join rejects two relative references.
  document_base =
    begin
      @base_url && !@base_url.empty? && @url ? URI.join(@url, @base_url).to_s : @base_url
    rescue URI::InvalidURIError, URI::BadURIError
      @base_url
    end

  [document_base, @url].each do |base|
    next if base.nil? || base.to_s.empty?

    begin
      return URI.join(base, url).to_s
    rescue URI::InvalidURIError, URI::BadURIError
      next # Fall through to the next candidate
    end
  end

  # Last resort: without a host there is nothing to anchor the URL to
  return url unless @host

  url.start_with?('/') ? "http://#{@host}#{url}" : "http://#{@host}/#{url}"
end
|
138
228
|
|
229
|
+
# Build a short excerpt from the first paragraph of at least 120 characters.
# @return [String] up to 256 characters of stripped paragraph text, or '' when no paragraph qualifies
def snippet
  paragraph = @page.search('//p[string-length() >= 120]').first
  return '' unless paragraph

  paragraph.text.strip[0..255]
end
|
143
235
|
end
|
144
|
-
end
|
236
|
+
end
|
data/lib/web_inspector/meta.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module WebInspector
|
2
4
|
class Meta
|
3
|
-
|
5
|
+
# Wrap a parsed document so its <meta> tags can be read.
# @param page [Object] parsed document that answers +css+
def initialize(page)
  @page = page
end
|
6
8
|
|
7
9
|
# Meta tags grouped by the attribute that names them.
# @return [Hash] keys 'name', 'http-equiv', 'property', 'charset' and 'itemprop' (in that order);
#   'charset' holds a one-element array, the others the result of +meta_tags_by+
def meta_tags
  grouped = %w[name http-equiv property].each_with_object({}) do |attribute, acc|
    acc[attribute] = meta_tags_by(attribute)
  end
  grouped['charset'] = [charset_from_meta_charset]
  grouped['itemprop'] = meta_tags_by('itemprop') # schema.org microdata
  grouped
end
|
15
18
|
|
@@ -19,30 +22,48 @@ module WebInspector
|
|
19
22
|
|
20
23
|
# Flatten the grouped meta tags into a single hash.
# Later groups win on key clashes: name < http-equiv < property < itemprop,
# and the 'charset' entry is written last.
# @return [Hash] merged meta information
def meta
  merged = meta_tag['name']
  overlays = [
    meta_tag['http-equiv'],
    meta_tag['property'],
    meta_tag['itemprop'] || {},
    { 'charset' => meta_tag['charset'] }
  ]
  overlays.each { |overlay| merged = merged.merge(overlay) }
  merged
end
|
26
30
|
|
27
31
|
# Character set of the page (memoized).
# Tries <meta charset>, then the Content-Type meta tag, then the header
# lookup, and defaults to 'utf-8'.
# @return [String] detected charset or 'utf-8'
def charset
  @charset ||= begin
    detected = nil
    %i[charset_from_meta_charset charset_from_meta_content_type charset_from_header].each do |probe|
      detected = send(probe)
      break if detected
    end
    detected || 'utf-8'
  end
end
30
34
|
|
31
35
|
private
|
32
36
|
|
33
37
|
# Value of the charset attribute on the first <meta charset="..."> tag.
# @return [String, nil] the declared charset, or nil when it cannot be read
def charset_from_meta_charset
  begin
    first_tag = @page.css('meta[charset]')[0]
    first_tag.attributes['charset'].value
  rescue StandardError
    # No such tag (or an unexpected node shape): report "not declared"
    nil
  end
end
|
36
42
|
|
37
43
|
# Charset declared in <meta http-equiv="Content-Type" content="...; charset=...">.
# @return [String, nil] the charset parameter, or nil when the tag or the parameter is missing
def charset_from_meta_content_type
  content = @page.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value

  # Look the parameter up by name: it is not guaranteed to be the second
  # ";"-separated field, and its name is case-insensitive.
  content[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
rescue StandardError
  # No such tag (or an unexpected node shape): report "not declared"
  nil
end
|
48
|
+
|
49
|
+
# Placeholder for reading the charset from the HTTP Content-Type header.
# Nothing is inspected here yet.
# @return [nil] always nil
def charset_from_header
  nil
end
|
40
53
|
|
41
|
-
|
54
|
+
def meta_tags_by(attribute)
|
42
55
|
hash = {}
|
43
56
|
@page.css("meta[@#{attribute}]").map do |tag|
|
44
|
-
name
|
45
|
-
|
57
|
+
name = begin
|
58
|
+
tag.attributes[attribute].value.downcase
|
59
|
+
rescue StandardError
|
60
|
+
nil
|
61
|
+
end
|
62
|
+
content = begin
|
63
|
+
tag.attributes['content'].value
|
64
|
+
rescue StandardError
|
65
|
+
nil
|
66
|
+
end
|
46
67
|
|
47
68
|
if name && content
|
48
69
|
hash[name] ||= []
|
@@ -64,4 +85,4 @@ module WebInspector
|
|
64
85
|
end
|
65
86
|
end
|
66
87
|
end
|
67
|
-
end
|
88
|
+
end
|