ruby-web-crawler 0.1.0

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/ruby-web-crawler.rb +70 -0
  3. metadata +73 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 16e50a5b1d4eb91ce797f9ce98467c445a35d1ae
+   data.tar.gz: 0f8e52dafc5fce1e23294bf5cb6cbd64bef8c143
+ SHA512:
+   metadata.gz: 345c699c94597ff0da7cbc1225f3513d19fc0c2ca675ce3afc957d67b7f39e4872a9940816ec85152323d81c2773dde499e281dc42a0d40e0a210b443774621b
+   data.tar.gz: dd4bbd7d6f18e4fbbb83c2c2d8691067609c9d7e9bf7389f63c085f2b408e18e77dd157fa2a1decd5293cd8f6a580669f3cdc5d2506b8a208c5c9f41b469ea88
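
The checksums file records SHA1 and SHA512 digests of the two archives packed inside the .gem file (metadata.gz and data.tar.gz). A minimal verification sketch in Ruby, assuming those files and checksums.yaml have already been extracted from the .gem into the current directory (file names here are illustrative):

require 'digest'
require 'yaml'

# Compare the published SHA512 digests against the extracted archives.
checksums = YAML.load_file('checksums.yaml')

%w[metadata.gz data.tar.gz].each do |name|
  expected = checksums['SHA512'][name]
  actual   = Digest::SHA512.file(name).hexdigest
  puts "#{name}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end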
data/lib/ruby-web-crawler.rb ADDED
@@ -0,0 +1,71 @@
+ require 'net/http'
+ require 'uri'
+ require 'timeout'
+
+ class RubyWebCrawler
+
+   attr_accessor :urls, :root_url, :url_limit, :time_limit
+
+   def initialize url, url_limit = 50, time_limit = 60
+     self.urls = []
+     self.root_url = url          # Root URL where the crawl starts
+     self.url_limit = url_limit   # Maximum number of URLs to collect
+     self.time_limit = time_limit # Timeout limit in seconds
+   end
+
+   def start_crawl
+     begin
+       Timeout::timeout(self.time_limit) {
+         self.get_urls_for_page self.root_url
+       }
+     rescue StandardError => e
+       # Swallow the timeout (or any network error) and return what was collected
+     end
+     return self.urls
+   end
+
+   # Get all URLs on a page
+   def get_urls_for_page url
+     page_content = self.get_page_content url
+
+     # Regex to extract all absolute http/https links on the page
+     urls = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)
+     urls.each { |u|
+       sanitized_url = u.first.gsub(/\"/, '').strip
+       unless self.urls.include? sanitized_url
+         self.urls.push(sanitized_url)
+
+         # If an unexpected error happens while fetching a URL, move on to the next one
+         begin
+           break if self.urls.count >= self.url_limit
+           self.get_urls_for_page(sanitized_url)
+         rescue StandardError => e
+           next
+         end
+       end
+     }
+     return self.urls
+   end
+
+   # Get the HTML/content of the page to be parsed
+   def get_page_content url
+     uri = URI(url)
+     request = Net::HTTP::Get.new(uri)
+
+     http = Net::HTTP.new(uri.host, uri.port)
+
+     # Need to enable SSL if the URL scheme is HTTPS
+     http.use_ssl = (uri.scheme == "https")
+
+     response = http.request(request)
+
+     # Follow redirects; Net::HTTPRedirection matches 301/302/303/307/308 responses
+     case response
+     when Net::HTTPSuccess
+       return response.body
+     when Net::HTTPRedirection
+       self.get_page_content response['location']
+     end
+   end
+
+ end # END RubyWebCrawler
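
For context, a minimal usage sketch (not part of the diff), assuming the gem is installed and the root URL is reachable; the URL here is illustrative, and the second and third arguments default to 50 URLs and 60 seconds:

require 'ruby-web-crawler'

# Collect at most 20 URLs starting from the root, giving up after 30 seconds
crawler = RubyWebCrawler.new 'http://example.com', 20, 30
urls = crawler.start_crawl

puts "Found #{urls.count} URLs"
puts urls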
metadata ADDED
@@ -0,0 +1,73 @@
+ --- !ruby/object:Gem::Specification
+ name: ruby-web-crawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Dwijen Purohit
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-06-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: net/http
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: uri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A simple ruby gem to recursively traverse all URLs on a Root URL. It
+   returns all the URLs it encountered
+ email: dwijen4190@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/ruby-web-crawler.rb
+ homepage: http://rubygems.org/gems/ruby-web-crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5.1
+ signing_key:
+ specification_version: 4
+ summary: Gem to crawl from a root URL to all child URLs
+ test_files: []
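
For reference, a .gemspec roughly along these lines would produce the metadata above. This is a reconstruction from the fields shown, not the author's actual file; the declared net/http and uri development dependencies are omitted here since both ship with Ruby's standard library:

Gem::Specification.new do |s|
  s.name        = 'ruby-web-crawler'
  s.version     = '0.1.0'
  s.date        = '2016-06-16'
  s.summary     = 'Gem to crawl from a root URL to all child URLs'
  s.description = 'A simple ruby gem to recursively traverse all URLs on a Root URL. It returns all the URLs it encountered'
  s.authors     = ['Dwijen Purohit']
  s.email       = 'dwijen4190@gmail.com'
  s.files       = ['lib/ruby-web-crawler.rb']
  s.homepage    = 'http://rubygems.org/gems/ruby-web-crawler'
  s.license     = 'MIT'
end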