pageinfo 0.1.0 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 13de38291ef72f6b4e26b68d8dab4d883957ba02
4
- data.tar.gz: 27c1569ddf15e528c4efe8428e14d66a78393bda
3
+ metadata.gz: 551cee8d620c763efb8351f36e05c467294b1fc2
4
+ data.tar.gz: d3101badc2494708e3867afd031a605f3ac63489
5
5
  SHA512:
6
- metadata.gz: b5e192e0e2751f1d0a3d9d924b20a3fc06029e147a3624e555640013ee2fe95290a9c76e2a7bb5d4a1a0d01865f9d2cf5047ee330a0cb0c1f9fc028cdb8c58d0
7
- data.tar.gz: 96eb29d6ff9bc10c339a23e93892edd331270d7572496ba2a0e4fb3050ccf7e656a219a0948a21b9b4cfcb76c21e0d1ec265695429f856f56e88d2edd931fd7d
6
+ metadata.gz: 2f53e5eb1f18d7098f56f0b282a03b3339da1c3a5d8ab2eec2f786c4c86ea6fc75e779178deebd14264b3073ea7f7376873e0f82b4e9464280545f819772a201
7
+ data.tar.gz: a8e641eadfc3174532177e5324e2127913d625129e2c679b9a685abf26ac23efdb90011a26e49d85ffd9f9d75c9d3d89ef40c52d5f57a930ade30c190f06051f
data/README.md CHANGED
@@ -1,8 +1,6 @@
1
1
  # Pageinfo
2
2
 
3
- Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/pageinfo`. To experiment with that code, run `bin/console` for an interactive prompt.
4
-
5
- TODO: Delete this and the text above, and describe your gem
3
+ Pageinfo will browse through your site, list every page it finds, and give detailed meta info about each page.
6
4
 
7
5
  ## Installation
8
6
 
@@ -22,18 +20,20 @@ Or install it yourself as:
22
20
 
23
21
  ## Usage
24
22
 
25
- TODO: Write usage instructions here
23
+ Browse your site using:
24
+
25
+ $ pageinfo "http://www.yoursite.com"
26
26
 
27
27
  ## Development
28
28
 
29
- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
29
+ <!-- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
30
30
 
31
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). -->
32
32
 
33
33
  ## Contributing
34
34
 
35
- 1. Fork it ( https://github.com/[my-github-username]/pageinfo/fork )
36
- 2. Create your feature branch (`git checkout -b my-new-feature`)
35
+ 1. Fork it ( https://github.com/aditiamahdar/pageinfo/fork )
36
+ 2. Create your feature branch (`git checkout -b new-feature`)
37
37
  3. Commit your changes (`git commit -am 'Add some feature'`)
38
- 4. Push to the branch (`git push origin my-new-feature`)
38
+ 4. Push to the branch (`git push origin new-feature`)
39
39
  5. Create a new Pull Request
data/bin/pageinfo ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'pageinfo'
4
+ Pageinfo.detect ARGV[0]
data/exe/pageinfo ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'pageinfo'
4
+ Pageinfo.detect ARGV[0]
data/lib/pageinfo.rb CHANGED
@@ -1,8 +1,132 @@
1
1
  require "pageinfo/version"
2
+ require "uri"
3
+ require "typhoeus"
4
+ require "nokogiri"
2
5
 
3
6
  module Pageinfo
4
- def self.process(str)
5
- # TODO: process `str`
6
- str
7
+ def self.hi
8
+ greeting = "Hello World!"
9
+ puts greeting
10
+ greeting
7
11
  end
12
+
13
+ def self.detect(url)
14
+ content = ["url", "status", "time", "title", "description", "keyword"].join(",")
15
+ content << new_line
16
+
17
+ @@no = 0
18
+ @@main_host = get_host(URI.parse(url))
19
+ scrapped_links, scrapped_urls = [url], [url]
20
+
21
+ conn = Typhoeus.get(url)
22
+ page = Nokogiri::HTML(conn.body)
23
+
24
+ content << get_info(conn, page)
25
+ content << new_line
26
+
27
+ @links = get_page_links(page)
28
+ puts "Homepage links: #{@links.count}"
29
+ while true do
30
+ if link = @links.shift
31
+ full_url = get_full_url(link)
32
+ unless full_url.nil?
33
+ if (scrapped_urls & [full_url, "#{full_url}/", "#{full_url}/#"]).empty?
34
+ # No duplicate link
35
+ conn = Typhoeus.get(full_url)
36
+ page = Nokogiri::HTML(conn.body)
37
+ content << get_info(conn, page)
38
+ content << new_line
39
+
40
+ scrapped_links << link
41
+ scrapped_urls << full_url
42
+
43
+ new_links = get_page_links(page)
44
+ new_links = new_links - @links
45
+ new_links = new_links - scrapped_links
46
+ @links = @links + new_links unless new_links.empty?
47
+ puts "Links: #{@links.count} left, #{new_links.count} new links"
48
+ end
49
+ end
50
+ else
51
+ break;
52
+ end
53
+ end
54
+
55
+ File.open("pageinfo.csv", "w") { |file| file.write content }
56
+ end
57
+
58
+ private
59
+ def self.new_line
60
+ "\n"
61
+ end
62
+
63
+ def self.get_host(uri)
64
+ (uri.port.eql?(443) ? "https://" : "http://") +
65
+ uri.host +
66
+ (uri.port.eql?(80) ? "" : ":#{uri.port}")
67
+ end
68
+
69
+ def self.get_info(conn, page)
70
+ @@no = @@no.next
71
+ puts "#{@@no}. #{conn.effective_url}"
72
+ [
73
+ "\"#{conn.effective_url}\"",
74
+ "\"#{conn.response_code}\"",
75
+ "\"#{conn.total_time}\"",
76
+ "\"#{get_head(page, "title")}\"",
77
+ "\"#{get_head(page, "description")}\"",
78
+ "\"#{get_head(page, "keywords")}\"",
79
+ ].join(",")
80
+ end
81
+
82
+ def self.get_head(page, type)
83
+ case type
84
+ when "title"
85
+ page.at("title").text.strip rescue ""
86
+ when "description"
87
+ page.at("meta[name=description]").attribute("content").value.strip rescue ""
88
+ when "keywords"
89
+ page.at("meta[name=keywords]").attribute("content").value.strip rescue ""
90
+ end
91
+ end
92
+
93
+ def self.get_full_url(link)
94
+ if bad_link?(link)
95
+ nil
96
+ elsif link.match(/^\//)
97
+ "#{@@main_host}#{valid_link(link)}"
98
+ elsif link.match(@@main_host)
99
+ valid_link(link)
100
+ else
101
+ "#{@@main_host}/#{link}"
102
+ end
103
+ end
104
+
105
+ def self.bad_link?(link)
106
+ [nil, "#"].include?(link) ||
107
+ link.match(/^javascript/) ||
108
+ (link.match(/^http/) && external_link?(link))
109
+ end
110
+
111
+ def self.external_link?(link)
112
+ uri = URI.parse(link) rescue nil
113
+ uri.nil? ? true : !@@main_host.eql?(get_host(uri))
114
+ end
115
+
116
+ def self.valid_link(link)
117
+ if link.match(/\/$/)
118
+ link[0..-2]
119
+ elsif link.match(/\/\#$/)
120
+ link[0..-3]
121
+ else
122
+ link
123
+ end
124
+ end
125
+
126
+ def self.get_page_links(page)
127
+ links = page.css("a:not([rel]),a[rel!=nofollow]").map do |link|
128
+ link.attribute("href").value rescue nil
129
+ end
130
+ links.uniq.compact
131
+ end
8
132
  end
@@ -1,3 +1,3 @@
1
1
  module Pageinfo
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.9"
3
3
  end
data/pageinfo.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ["adit@41studio.com"]
11
11
 
12
12
  spec.summary = %q{Browse your page meta info and optimize your SEO strategy.}
13
- spec.description = %q{pageinfo will browse through your site then list every page on your site and give detail meta info about the page.}
13
+ spec.description = %q{Pageinfo will browse through your site then list every page on your site and give detail meta info about the page.}
14
14
  spec.homepage = "https://github.com/aditiamahdar/pageinfo"
15
15
  spec.license = "MIT"
16
16
 
@@ -26,4 +26,6 @@ Gem::Specification.new do |spec|
26
26
  spec.add_development_dependency "bundler", "~> 1.9"
27
27
  spec.add_development_dependency "rake", "~> 10.0"
28
28
  spec.add_development_dependency "rspec", "~> 3.2"
29
+ spec.add_development_dependency "typhoeus", "~> 0.7"
30
+ spec.add_development_dependency "nokogiri", "~> 1.6"
29
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pageinfo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - aditiamahdar
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-11 00:00:00.000000000 Z
11
+ date: 2015-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,11 +52,40 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.2'
55
- description: pageinfo will browse through your site then list every page on your site
55
+ - !ruby/object:Gem::Dependency
56
+ name: typhoeus
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '0.7'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '0.7'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nokogiri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.6'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.6'
83
+ description: Pageinfo will browse through your site then list every page on your site
56
84
  and give detail meta info about the page.
57
85
  email:
58
86
  - adit@41studio.com
59
- executables: []
87
+ executables:
88
+ - pageinfo
60
89
  extensions: []
61
90
  extra_rdoc_files: []
62
91
  files:
@@ -68,7 +97,9 @@ files:
68
97
  - README.md
69
98
  - Rakefile
70
99
  - bin/console
100
+ - bin/pageinfo
71
101
  - bin/setup
102
+ - exe/pageinfo
72
103
  - lib/pageinfo.rb
73
104
  - lib/pageinfo/version.rb
74
105
  - pageinfo.gemspec