ruby-web-crawler 0.1.0
- checksums.yaml +7 -0
- data/lib/ruby-web-crawler.rb +71 -0
- metadata +73 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 16e50a5b1d4eb91ce797f9ce98467c445a35d1ae
+  data.tar.gz: 0f8e52dafc5fce1e23294bf5cb6cbd64bef8c143
+SHA512:
+  metadata.gz: 345c699c94597ff0da7cbc1225f3513d19fc0c2ca675ce3afc957d67b7f39e4872a9940816ec85152323d81c2773dde499e281dc42a0d40e0a210b443774621b
+  data.tar.gz: dd4bbd7d6f18e4fbbb83c2c2d8691067609c9d7e9bf7389f63c085f2b408e18e77dd157fa2a1decd5293cd8f6a580669f3cdc5d2506b8a208c5c9f41b469ea88
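For reference, checksums.yaml records SHA1 and SHA512 hex digests of the gem package's metadata.gz and data.tar.gz entries. A minimal sketch of how such digests could be recomputed with Ruby's standard Digest library follows; the file names are assumptions based on the usual layout of an unpacked .gem archive and are not part of this diff.

```ruby
# Illustrative sketch only (not part of the gem): recompute the digests that
# checksums.yaml records, assuming metadata.gz and data.tar.gz have already
# been extracted from the .gem archive into the current directory.
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  bytes = File.binread(member)
  puts "#{member} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{member} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```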
data/lib/ruby-web-crawler.rb
ADDED
@@ -0,0 +1,71 @@
+require 'net/http'
+require 'uri'
+require 'timeout'   # Timeout::timeout is called directly in #start_crawl
+
+class RubyWebCrawler
+
+  attr_accessor :urls, :root_url, :url_limit, :time_limit
+
+  def initialize url, url_limit = 50, time_limit = 60
+    self.urls       = []
+    self.root_url   = url         # Root URL from where the crawling starts
+    self.url_limit  = url_limit   # Default URL limit for the crawler
+    self.time_limit = time_limit  # Timeout limit in seconds
+  end
+
+  def start_crawl
+    begin
+      is_running = Timeout::timeout(self.time_limit) {
+        self.get_urls_for_page self.root_url
+      }
+    rescue Exception => e
+      # Do nothing; just don't let the timeout error out
+    end
+    return self.urls
+  end
+
+  # Get all URLs on a page
+  def get_urls_for_page url
+    page_content = self.get_page_content url
+
+    # Regex to get all "links" in the page
+    urls = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)
+    urls.each { |u|
+      sanitized_url = u.first.gsub(/\"/, '').strip
+      unless self.urls.include? sanitized_url
+        self.urls.push(sanitized_url)
+
+        # If an unexpected error happens while fetching a URL, move on to the next one
+        begin
+          break if self.urls.count >= self.url_limit
+          self.get_urls_for_page(sanitized_url)
+        rescue Exception => e
+          next
+        end
+      end
+    }
+    return self.urls
+  end
+
+  # Get HTML/content of the page to be parsed
+  def get_page_content url
+    uri = URI(url)
+    request = Net::HTTP::Get.new(uri)
+
+    http = Net::HTTP.new(uri.host, uri.port)
+
+    # Need to enable use of SSL if the URL protocol is HTTPS
+    http.use_ssl = (uri.scheme == "https")
+
+    response = http.request(request)
+
+    # Check if the URL needs to be followed because of a redirect
+    case response
+    when Net::HTTPSuccess
+      return response.body
+    when Net::HTTPMovedPermanently, Net::HTTPRedirection
+      self.get_page_content response['location']
+    end
+  end
+
+end # END RubyWebCrawler
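Based on the class above, a minimal usage sketch might look like the following; the root URL and both limits are illustrative placeholders, not values taken from the gem.

```ruby
# Usage sketch for the API shown above; the URL and limits are placeholders.
require 'ruby-web-crawler'

# Crawl at most 20 URLs starting from the root, giving up after 30 seconds.
crawler = RubyWebCrawler.new('https://example.com', 20, 30)
urls    = crawler.start_crawl

puts "Collected #{urls.count} URLs:"
puts urls
```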
metadata
ADDED
@@ -0,0 +1,73 @@
+--- !ruby/object:Gem::Specification
+name: ruby-web-crawler
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Dwijen Purohit
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-06-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: net/http
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple ruby gem to recursively traverse all URLs on a Root URL. It
+  returns all the URLs it encountered
+email: dwijen4190@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/ruby-web-crawler.rb
+homepage: http://rubygems.org/gems/ruby-web-crawler
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5.1
+signing_key:
+specification_version: 4
+summary: Gem to crawl from a root URL to all child URLs
+test_files: []
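The metadata above is what RubyGems generates from the gem's .gemspec at build time. A minimal sketch of a gemspec that could produce roughly this metadata is shown below; the gemspec itself is not part of this diff, the values are copied from the metadata, and the two development-dependency declarations simply mirror the net/http and uri entries the metadata records (both are Ruby standard-library modules, so declaring them as gem dependencies is redundant in practice).

```ruby
# ruby-web-crawler.gemspec -- illustrative reconstruction, not shipped in this gem.
# All values are copied from the metadata shown above.
Gem::Specification.new do |s|
  s.name        = 'ruby-web-crawler'
  s.version     = '0.1.0'
  s.authors     = ['Dwijen Purohit']
  s.email       = 'dwijen4190@gmail.com'
  s.summary     = 'Gem to crawl from a root URL to all child URLs'
  s.description = 'A simple ruby gem to recursively traverse all URLs on a Root URL. ' \
                  'It returns all the URLs it encountered'
  s.homepage    = 'http://rubygems.org/gems/ruby-web-crawler'
  s.license     = 'MIT'
  s.files       = ['lib/ruby-web-crawler.rb']

  # Mirrors the :development dependencies recorded in the metadata; net/http and
  # uri ship with Ruby, so these declarations are effectively no-ops.
  s.add_development_dependency 'net/http'
  s.add_development_dependency 'uri'
end
```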