pageinfo 0.1.0 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/bin/pageinfo +4 -0
- data/exe/pageinfo +4 -0
- data/lib/pageinfo.rb +127 -3
- data/lib/pageinfo/version.rb +1 -1
- data/pageinfo.gemspec +3 -1
- metadata +35 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 551cee8d620c763efb8351f36e05c467294b1fc2
|
4
|
+
data.tar.gz: d3101badc2494708e3867afd031a605f3ac63489
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f53e5eb1f18d7098f56f0b282a03b3339da1c3a5d8ab2eec2f786c4c86ea6fc75e779178deebd14264b3073ea7f7376873e0f82b4e9464280545f819772a201
|
7
|
+
data.tar.gz: a8e641eadfc3174532177e5324e2127913d625129e2c679b9a685abf26ac23efdb90011a26e49d85ffd9f9d75c9d3d89ef40c52d5f57a930ade30c190f06051f
|
data/README.md
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# Pageinfo
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
TODO: Delete this and the text above, and describe your gem
|
3
|
+
Pageinfo will browse through your site then list every page on your site and give detail meta info about the page.
|
6
4
|
|
7
5
|
## Installation
|
8
6
|
|
@@ -22,18 +20,20 @@ Or install it yourself as:
|
|
22
20
|
|
23
21
|
## Usage
|
24
22
|
|
25
|
-
|
23
|
+
Browse your site using:
|
24
|
+
|
25
|
+
$ pageinfo "http://www.yoursite.com"
|
26
26
|
|
27
27
|
## Development
|
28
28
|
|
29
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
29
|
+
<!-- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
30
30
|
|
31
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
31
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). -->
|
32
32
|
|
33
33
|
## Contributing
|
34
34
|
|
35
|
-
1. Fork it ( https://github.com/[my-github-username]/pageinfo/fork )
|
36
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
35
|
+
1. Fork it ( https://github.com/aditiamahdar/pageinfo/fork )
|
36
|
+
2. Create your feature branch (`git checkout -b new-feature`)
|
37
37
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
38
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
38
|
+
4. Push to the branch (`git push origin new-feature`)
|
39
39
|
5. Create a new Pull Request
|
data/bin/pageinfo
ADDED
data/exe/pageinfo
ADDED
data/lib/pageinfo.rb
CHANGED
@@ -1,8 +1,132 @@
|
|
1
1
|
require "pageinfo/version"
|
2
|
+
require "uri"
|
3
|
+
require "typhoeus"
|
4
|
+
require "nokogiri"
|
2
5
|
|
3
6
|
module Pageinfo
|
4
|
-
def self.
|
5
|
-
|
6
|
-
|
7
|
+
def self.hi
|
8
|
+
greeting = "Hello World!"
|
9
|
+
puts greeting
|
10
|
+
greeting
|
7
11
|
end
|
12
|
+
|
13
|
+
def self.detect(url)
|
14
|
+
content = ["url", "status", "time", "title", "description", "keyword"].join(",")
|
15
|
+
content << new_line
|
16
|
+
|
17
|
+
@@no = 0
|
18
|
+
@@main_host = get_host(URI.parse(url))
|
19
|
+
scrapped_links, scrapped_urls = [url], [url]
|
20
|
+
|
21
|
+
conn = Typhoeus.get(url)
|
22
|
+
page = Nokogiri::HTML(conn.body)
|
23
|
+
|
24
|
+
content << get_info(conn, page)
|
25
|
+
content << new_line
|
26
|
+
|
27
|
+
@links = get_page_links(page)
|
28
|
+
puts "Homepage links: #{@links.count}"
|
29
|
+
while true do
|
30
|
+
if link = @links.shift
|
31
|
+
full_url = get_full_url(link)
|
32
|
+
unless full_url.nil?
|
33
|
+
if (scrapped_urls & [full_url, "#{full_url}/", "#{full_url}/#"]).empty?
|
34
|
+
# No duplicate link
|
35
|
+
conn = Typhoeus.get(full_url)
|
36
|
+
page = Nokogiri::HTML(conn.body)
|
37
|
+
content << get_info(conn, page)
|
38
|
+
content << new_line
|
39
|
+
|
40
|
+
scrapped_links << link
|
41
|
+
scrapped_urls << full_url
|
42
|
+
|
43
|
+
new_links = get_page_links(page)
|
44
|
+
new_links = new_links - @links
|
45
|
+
new_links = new_links - scrapped_links
|
46
|
+
@links = @links + new_links unless new_links.empty?
|
47
|
+
puts "Links: #{@links.count} left, #{new_links.count} new links"
|
48
|
+
end
|
49
|
+
end
|
50
|
+
else
|
51
|
+
break;
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
File.open("pageinfo.csv", "w") { |file| file.write content }
|
56
|
+
end
|
57
|
+
|
58
|
+
private
|
59
|
+
def self.new_line
|
60
|
+
"\n"
|
61
|
+
end
|
62
|
+
|
63
|
+
def self.get_host(uri)
|
64
|
+
(uri.port.eql?(443) ? "https://" : "http://") +
|
65
|
+
uri.host +
|
66
|
+
(uri.port.eql?(80) ? "" : ":#{uri.port}")
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.get_info(conn, page)
|
70
|
+
@@no = @@no.next
|
71
|
+
puts "#{@@no}. #{conn.effective_url}"
|
72
|
+
[
|
73
|
+
"\"#{conn.effective_url}\"",
|
74
|
+
"\"#{conn.response_code}\"",
|
75
|
+
"\"#{conn.total_time}\"",
|
76
|
+
"\"#{get_head(page, "title")}\"",
|
77
|
+
"\"#{get_head(page, "description")}\"",
|
78
|
+
"\"#{get_head(page, "keywords")}\"",
|
79
|
+
].join(",")
|
80
|
+
end
|
81
|
+
|
82
|
+
def self.get_head(page, type)
|
83
|
+
case type
|
84
|
+
when "title"
|
85
|
+
page.at("title").text.strip rescue ""
|
86
|
+
when "description"
|
87
|
+
page.at("meta[name=description]").attribute("content").value.strip rescue ""
|
88
|
+
when "keywords"
|
89
|
+
page.at("meta[name=keywords]").attribute("content").value.strip rescue ""
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def self.get_full_url(link)
|
94
|
+
if bad_link?(link)
|
95
|
+
nil
|
96
|
+
elsif link.match(/^\//)
|
97
|
+
"#{@@main_host}#{valid_link(link)}"
|
98
|
+
elsif link.match(@@main_host)
|
99
|
+
valid_link(link)
|
100
|
+
else
|
101
|
+
"#{@@main_host}/#{link}"
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
def self.bad_link?(link)
|
106
|
+
[nil, "#"].include?(link) ||
|
107
|
+
link.match(/^javascript/) ||
|
108
|
+
(link.match(/^http/) && external_link?(link))
|
109
|
+
end
|
110
|
+
|
111
|
+
def self.external_link?(link)
|
112
|
+
uri = URI.parse(link) rescue nil
|
113
|
+
uri.nil? ? true : !@@main_host.eql?(get_host(uri))
|
114
|
+
end
|
115
|
+
|
116
|
+
def self.valid_link(link)
|
117
|
+
if link.match(/\/$/)
|
118
|
+
link[0..-2]
|
119
|
+
elsif link.match(/\/\#$/)
|
120
|
+
link[0..-3]
|
121
|
+
else
|
122
|
+
link
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def self.get_page_links(page)
|
127
|
+
links = page.css("a:not([rel]),a[rel!=nofollow]").map do |link|
|
128
|
+
link.attribute("href").value rescue nil
|
129
|
+
end
|
130
|
+
links.uniq.compact
|
131
|
+
end
|
8
132
|
end
|
data/lib/pageinfo/version.rb
CHANGED
data/pageinfo.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["adit@41studio.com"]
|
11
11
|
|
12
12
|
spec.summary = %q{Browse your page meta info and optimize your SEO strategy.}
|
13
|
-
spec.description = %q{
|
13
|
+
spec.description = %q{Pageinfo will browse through your site then list every page on your site and give detail meta info about the page.}
|
14
14
|
spec.homepage = "https://github.com/aditiamahdar/pageinfo"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
@@ -26,4 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency "bundler", "~> 1.9"
|
27
27
|
spec.add_development_dependency "rake", "~> 10.0"
|
28
28
|
spec.add_development_dependency "rspec", "~> 3.2"
|
29
|
+
spec.add_development_dependency "typhoeus", "~> 0.7"
|
30
|
+
spec.add_development_dependency "nokogiri", "~> 1.6"
|
29
31
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pageinfo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- aditiamahdar
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,11 +52,40 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.2'
|
55
|
-
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: typhoeus
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.7'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.7'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.6'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.6'
|
83
|
+
description: Pageinfo will browse through your site then list every page on your site
|
56
84
|
and give detail meta info about the page.
|
57
85
|
email:
|
58
86
|
- adit@41studio.com
|
59
|
-
executables: []
|
87
|
+
executables:
|
88
|
+
- pageinfo
|
60
89
|
extensions: []
|
61
90
|
extra_rdoc_files: []
|
62
91
|
files:
|
@@ -68,7 +97,9 @@ files:
|
|
68
97
|
- README.md
|
69
98
|
- Rakefile
|
70
99
|
- bin/console
|
100
|
+
- bin/pageinfo
|
71
101
|
- bin/setup
|
102
|
+
- exe/pageinfo
|
72
103
|
- lib/pageinfo.rb
|
73
104
|
- lib/pageinfo/version.rb
|
74
105
|
- pageinfo.gemspec
|