ruby-web-crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/ruby-web-crawler.rb +70 -0
- metadata +73 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 16e50a5b1d4eb91ce797f9ce98467c445a35d1ae
+  data.tar.gz: 0f8e52dafc5fce1e23294bf5cb6cbd64bef8c143
+SHA512:
+  metadata.gz: 345c699c94597ff0da7cbc1225f3513d19fc0c2ca675ce3afc957d67b7f39e4872a9940816ec85152323d81c2773dde499e281dc42a0d40e0a210b443774621b
+  data.tar.gz: dd4bbd7d6f18e4fbbb83c2c2d8691067609c9d7e9bf7389f63c085f2b408e18e77dd157fa2a1decd5293cd8f6a580669f3cdc5d2506b8a208c5c9f41b469ea88
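The entries above are the standard RubyGems checksums for the packaged metadata.gz and data.tar.gz. As a hedged illustration (not part of the gem itself), the SHA512 value for data.tar.gz could be re-checked locally along these lines:

require 'digest'

# Sketch only: assumes the .gem archive has been unpacked (e.g. `tar xf ruby-web-crawler-0.1.0.gem`)
# so that data.tar.gz sits in the current directory.
expected = 'dd4bbd7d6f18e4fbbb83c2c2d8691067609c9d7e9bf7389f63c085f2b408e18e77dd157fa2a1decd5293cd8f6a580669f3cdc5d2506b8a208c5c9f41b469ea88'
actual = Digest::SHA512.file('data.tar.gz').hexdigest
puts(actual == expected ? 'data.tar.gz: checksum OK' : 'data.tar.gz: checksum mismatch')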
data/lib/ruby-web-crawler.rb
ADDED
@@ -0,0 +1,70 @@
+require 'net/http'
+require 'uri'
+
+class RubyWebCrawler
+
+  attr_accessor :urls, :root_url, :url_limit, :time_limit
+
+  def initialize url, url_limit = 50, time_limit = 60
+    self.urls = []
+    self.root_url = url # Root URL from where the Crawling Starts
+    self.url_limit = url_limit # Default URL Limit for the Crawler
+    self.time_limit = time_limit # Timeout limit in seconds
+  end
+
+  def start_crawl
+    begin
+      is_running = Timeout::timeout(self.time_limit) {
+        self.get_urls_for_page self.root_url
+      }
+    rescue Exception => e
+      # Do Nothing just don't let it error out
+    end
+    return self.urls
+  end
+
+  # Get all URLs on a Page
+  def get_urls_for_page url
+    page_content = self.get_page_content url
+
+    # Regex to get all "links" in the page
+    urls = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)
+    urls.each { |u|
+      sanitized_url = u.first.gsub(/\"/, '').strip
+      unless self.urls.include? sanitized_url
+        self.urls.push(sanitized_url)
+
+        # If Unexpected Error happens when trying to fetch URLs move on to the next URL
+        begin
+          break if self.urls.count >= self.url_limit
+          self.get_urls_for_page(sanitized_url)
+        rescue Exception => e
+          next
+        end
+      end
+    }
+    return self.urls
+  end
+
+  # Get HTML/Content of the Page to be parsed
+  def get_page_content url
+    uri = URI(url)
+    request = Net::HTTP::Get.new(uri)
+
+    http = Net::HTTP.new(uri.host, uri.port)
+
+    # Neet to enable use of SSL if the URL protocol is HTTPS
+    http.use_ssl = (uri.scheme == "https")
+
+    response = http.request(request)
+
+    # Check if URL needs to be forwarded because of redirect
+    case response
+    when Net::HTTPSuccess
+      return response.body
+    when Net::HTTPMovedPermanently || Net::HTTPRedirection
+      self.get_page_content response['location']
+    end
+  end
+
+end # END RubyWebCrawler
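For orientation, here is a minimal usage sketch based on the class above. It is not shipped with the gem; https://example.com is a placeholder, and the 50-URL and 60-second arguments simply restate the constructor defaults:

require 'ruby-web-crawler'

# start_crawl returns the array of unique URLs collected before either the
# URL limit (50) or the timeout (60 seconds) is reached.
crawler = RubyWebCrawler.new('https://example.com', 50, 60)
crawler.start_crawl.each { |u| puts u }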
metadata
ADDED
@@ -0,0 +1,73 @@
+--- !ruby/object:Gem::Specification
+name: ruby-web-crawler
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Dwijen Purohit
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-06-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: net/http
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A simple ruby gem to recursively traverse all URLs on a Root URL. It
+  returns all the URLs it encountered
+email: dwijen4190@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/ruby-web-crawler.rb
+homepage: http://rubygems.org/gems/ruby-web-crawler
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5.1
+signing_key:
+specification_version: 4
+summary: Gem to crawl from a root URL to all child URLs
+test_files: []