pageinfo 0.1.0 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/bin/pageinfo +4 -0
- data/exe/pageinfo +4 -0
- data/lib/pageinfo.rb +127 -3
- data/lib/pageinfo/version.rb +1 -1
- data/pageinfo.gemspec +3 -1
- metadata +35 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 551cee8d620c763efb8351f36e05c467294b1fc2
|
4
|
+
data.tar.gz: d3101badc2494708e3867afd031a605f3ac63489
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f53e5eb1f18d7098f56f0b282a03b3339da1c3a5d8ab2eec2f786c4c86ea6fc75e779178deebd14264b3073ea7f7376873e0f82b4e9464280545f819772a201
|
7
|
+
data.tar.gz: a8e641eadfc3174532177e5324e2127913d625129e2c679b9a685abf26ac23efdb90011a26e49d85ffd9f9d75c9d3d89ef40c52d5f57a930ade30c190f06051f
|
data/README.md
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# Pageinfo
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
TODO: Delete this and the text above, and describe your gem
|
3
|
+
Pageinfo crawls your site, lists every page it finds, and reports detailed meta information (title, description, keywords) for each page.
|
6
4
|
|
7
5
|
## Installation
|
8
6
|
|
@@ -22,18 +20,20 @@ Or install it yourself as:
|
|
22
20
|
|
23
21
|
## Usage
|
24
22
|
|
25
|
-
|
23
|
+
Browse your site using:
|
24
|
+
|
25
|
+
$ pageinfo "http://www.yoursite.com"
|
26
26
|
|
27
27
|
## Development
|
28
28
|
|
29
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
29
|
+
<!-- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
30
|
|
31
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). -->
|
32
32
|
|
33
33
|
## Contributing
|
34
34
|
|
35
|
-
1. Fork it ( https://github.com/
|
36
|
-
2. Create your feature branch (`git checkout -b
|
35
|
+
1. Fork it ( https://github.com/aditiamahdar/pageinfo/fork )
|
36
|
+
2. Create your feature branch (`git checkout -b new-feature`)
|
37
37
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
38
|
-
4. Push to the branch (`git push origin
|
38
|
+
4. Push to the branch (`git push origin new-feature`)
|
39
39
|
5. Create a new Pull Request
|
data/bin/pageinfo
ADDED
data/exe/pageinfo
ADDED
data/lib/pageinfo.rb
CHANGED
@@ -1,8 +1,132 @@
|
|
1
1
|
require "pageinfo/version"
|
2
|
+
require "uri"
|
3
|
+
require "typhoeus"
|
4
|
+
require "nokogiri"
|
2
5
|
|
3
6
|
# Pageinfo crawls a site starting from a single URL, follows the links that
# stay on the same host, and writes one CSV row of meta information per page.
module Pageinfo
  # Column names written as the first line of the CSV report.
  CSV_HEADER = %w[url status time title description keyword].freeze

  # Smoke-test helper: prints and returns a greeting.
  #
  # @return [String]
  def self.hi
    greeting = "Hello World!"
    puts greeting
    greeting
  end

  # Crawls the site rooted at +url+ and writes the report to "pageinfo.csv"
  # in the current working directory.
  #
  # @param url [String] absolute URL of the page to start from
  # @return [Integer] number of bytes written to the report
  # @raise [ArgumentError] if +url+ has no host component
  def self.detect(url)
    @@no = 0
    @@main_host = get_host(URI.parse(url))

    rows = [CSV_HEADER.join(",")]
    scrapped_links = [url]
    scrapped_urls = [url]

    conn = Typhoeus.get(url)
    page = Nokogiri::HTML(conn.body)
    rows << get_info(conn, page)

    @links = get_page_links(page)
    puts "Homepage links: #{@links.count}"

    # @links is the work queue; it grows as newly fetched pages contribute
    # links that are neither queued nor already visited.
    while (link = @links.shift)
      full_url = get_full_url(link)
      next if full_url.nil?
      # Skip pages already fetched under an equivalent spelling of the URL.
      next unless (scrapped_urls & [full_url, "#{full_url}/", "#{full_url}/#"]).empty?

      conn = Typhoeus.get(full_url)
      page = Nokogiri::HTML(conn.body)
      rows << get_info(conn, page)

      scrapped_links << link
      scrapped_urls << full_url

      new_links = get_page_links(page) - @links - scrapped_links
      @links.concat(new_links)
      puts "Links: #{@links.count} left, #{new_links.count} new links"
    end

    File.open("pageinfo.csv", "w") { |file| file.write(rows.join(new_line) + new_line) }
  end

  # Internal helpers. They remain callable as Pageinfo.<name> because a bare
  # `private` never applied to `def self.` methods in the first place.

  # @return [String] the line separator used in the report
  def self.new_line
    "\n"
  end

  # Builds the "scheme://host[:port]" prefix for +uri+. The scheme is taken
  # from the URI itself and the port is omitted when it is the scheme default,
  # so the result is a literal prefix of same-site absolute URLs.
  #
  # @param uri [URI::Generic]
  # @return [String]
  # @raise [ArgumentError] if +uri+ has no host
  def self.get_host(uri)
    host = uri.host.to_s
    raise ArgumentError, "URL has no host: #{uri}" if host.empty?

    scheme = uri.scheme || "http"
    default_port = uri.port.nil? || uri.port == uri.default_port
    port_suffix = default_port ? "" : ":#{uri.port}"
    # Host names are case-insensitive; normalise so comparisons are stable.
    "#{scheme}://#{host.downcase}#{port_suffix}"
  end

  # Formats one CSV row for a fetched page and bumps the page counter.
  #
  # @param conn [#effective_url, #response_code, #total_time] HTTP response
  # @param page [#at] parsed document
  # @return [String] comma-separated, quoted fields (no trailing newline)
  def self.get_info(conn, page)
    @@no = @@no.next
    puts "#{@@no}. #{conn.effective_url}"
    [
      conn.effective_url,
      conn.response_code,
      conn.total_time,
      get_head(page, "title"),
      get_head(page, "description"),
      get_head(page, "keywords")
    ].map { |field| csv_field(field) }.join(",")
  end

  # Quotes +value+ for CSV, doubling embedded double quotes so that a title
  # such as `He said "hi"` cannot break the row apart.
  #
  # @param value [#to_s]
  # @return [String]
  def self.csv_field(value)
    %("#{value.to_s.gsub('"', '""')}")
  end

  # Extracts a piece of <head> information from +page+.
  #
  # @param page [#at] parsed document
  # @param type [String] "title", "description" or "keywords"
  # @return [String, nil] stripped text, "" when the element is missing,
  #   nil for an unknown +type+
  def self.get_head(page, type)
    case type
    when "title"
      node = page.at("title")
      node ? node.text.strip : ""
    when "description", "keywords"
      node = page.at("meta[name=#{type}]")
      content = node && node["content"]
      content ? content.strip : ""
    end
  end

  # Resolves an href found on a page to an absolute same-site URL.
  #
  # @param link [String, nil] raw href value
  # @return [String, nil] absolute URL, or nil when the link must be skipped
  def self.get_full_url(link)
    return nil if bad_link?(link)

    if link.start_with?("/")
      "#{@@main_host}#{valid_link(link)}"
    elsif absolute_link?(link)
      # bad_link? has already rejected absolute links to other hosts.
      valid_link(link)
    else
      "#{@@main_host}/#{link}"
    end
  end

  # @param link [String, nil]
  # @return [Boolean] true when the link should not be crawled: missing or
  #   blank, a page fragment, a non-HTTP scheme, or pointing at another host
  def self.bad_link?(link)
    return true if link.nil? || link.strip.empty?
    return true if link.start_with?("#")
    return true unless (link =~ /\A(javascript|mailto|tel):/i).nil?

    absolute_link?(link) && external_link?(link)
  end

  # @param link [String]
  # @return [Boolean] true when +link+ is an absolute http(s) URL
  def self.absolute_link?(link)
    !(link =~ %r{\Ahttps?://}i).nil?
  end

  # @param link [String] absolute URL
  # @return [Boolean] true when +link+ is unparsable, has no host, or its
  #   host differs from the crawl's starting host
  def self.external_link?(link)
    uri = begin
      URI.parse(link)
    rescue URI::InvalidURIError
      nil
    end
    return true if uri.nil? || uri.host.to_s.empty?

    !@@main_host.eql?(get_host(uri))
  end

  # Strips a trailing "/" or "/#" so equivalent URLs compare equal.
  #
  # @param link [String]
  # @return [String]
  def self.valid_link(link)
    link.sub(%r{/#?\z}, "")
  end

  # Collects the distinct href values of followable anchors on +page+.
  #
  # @param page [#css] parsed document
  # @return [Array<String>]
  def self.get_page_links(page)
    page.css("a:not([rel]),a[rel!=nofollow]").map { |anchor| anchor["href"] }.compact.uniq
  end
end
|
data/lib/pageinfo/version.rb
CHANGED
data/pageinfo.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["adit@41studio.com"]
|
11
11
|
|
12
12
|
spec.summary = %q{Browse your page meta info and optimize your SEO strategy.}
|
13
|
-
spec.description = %q{
|
13
|
+
spec.description = %q{Pageinfo will browse through your site then list every page on your site and give detail meta info about the page.}
|
14
14
|
spec.homepage = "https://github.com/aditiamahdar/pageinfo"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
@@ -26,4 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency "bundler", "~> 1.9"
|
27
27
|
spec.add_development_dependency "rake", "~> 10.0"
|
28
28
|
spec.add_development_dependency "rspec", "~> 3.2"
|
29
|
+
spec.add_development_dependency "typhoeus", "~> 0.7"
|
30
|
+
spec.add_development_dependency "nokogiri", "~> 1.6"
|
29
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageinfo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aditiamahdar
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,11 +52,40 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.2'
|
55
|
-
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: typhoeus
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.7'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.6'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.6'
|
83
|
+
description: Pageinfo will browse through your site then list every page on your site
|
56
84
|
and give detail meta info about the page.
|
57
85
|
email:
|
58
86
|
- adit@41studio.com
|
59
|
-
executables:
|
87
|
+
executables:
|
88
|
+
- pageinfo
|
60
89
|
extensions: []
|
61
90
|
extra_rdoc_files: []
|
62
91
|
files:
|
@@ -68,7 +97,9 @@ files:
|
|
68
97
|
- README.md
|
69
98
|
- Rakefile
|
70
99
|
- bin/console
|
100
|
+
- bin/pageinfo
|
71
101
|
- bin/setup
|
102
|
+
- exe/pageinfo
|
72
103
|
- lib/pageinfo.rb
|
73
104
|
- lib/pageinfo/version.rb
|
74
105
|
- pageinfo.gemspec
|