ruby-web-crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/ruby-web-crawler.rb +71 -0
  3. metadata +73 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 16e50a5b1d4eb91ce797f9ce98467c445a35d1ae
+   data.tar.gz: 0f8e52dafc5fce1e23294bf5cb6cbd64bef8c143
+ SHA512:
+   metadata.gz: 345c699c94597ff0da7cbc1225f3513d19fc0c2ca675ce3afc957d67b7f39e4872a9940816ec85152323d81c2773dde499e281dc42a0d40e0a210b443774621b
+   data.tar.gz: dd4bbd7d6f18e4fbbb83c2c2d8691067609c9d7e9bf7389f63c085f2b408e18e77dd157fa2a1decd5293cd8f6a580669f3cdc5d2506b8a208c5c9f41b469ea88
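
As a hedged aside (not part of the package diff): a .gem file is a plain tar archive containing metadata.gz, data.tar.gz, and a gzipped copy of this checksums.yaml, so the digests above can be recomputed locally. The file names below are assumptions for illustration.

# Verification sketch: after extracting the archive members, e.g.
#   tar xf ruby-web-crawler-0.1.0.gem
# recompute the SHA512 digests recorded in checksums.yaml.
require 'digest'
require 'yaml'
require 'zlib'

expected = YAML.safe_load(Zlib.gunzip(File.binread('checksums.yaml.gz')))

%w[metadata.gz data.tar.gz].each do |name|
  actual = Digest::SHA512.file(name).hexdigest
  puts "#{name}: #{actual == expected['SHA512'][name] ? 'OK' : 'MISMATCH'}"
end
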
data/lib/ruby-web-crawler.rb ADDED
@@ -0,0 +1,71 @@
+ require 'net/http'
+ require 'timeout' # Timeout::timeout is used in #start_crawl
+ require 'uri'
+
+ class RubyWebCrawler
+
+   attr_accessor :urls, :root_url, :url_limit, :time_limit
+
+   def initialize url, url_limit = 50, time_limit = 60
+     self.urls = []
+     self.root_url = url # Root URL where the crawl starts
+     self.url_limit = url_limit # Maximum number of URLs to collect
+     self.time_limit = time_limit # Timeout limit in seconds
+   end
+
+   def start_crawl
+     begin
+       Timeout::timeout(self.time_limit) {
+         self.get_urls_for_page self.root_url
+       }
+     rescue Exception => e
+       # Swallow the timeout (or any other error) so the crawl simply ends early
+     end
+     return self.urls
+   end
+
+   # Get all URLs on a page
+   def get_urls_for_page url
+     page_content = self.get_page_content url
+
+     # Regex to get all "links" in the page
+     urls = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)
+     urls.each { |u|
+       sanitized_url = u.first.gsub(/\"/, '').strip
+       unless self.urls.include? sanitized_url
+         self.urls.push(sanitized_url)
+
+         # If an unexpected error happens while fetching a URL, move on to the next one
+         begin
+           break if self.urls.count >= self.url_limit
+           self.get_urls_for_page(sanitized_url)
+         rescue Exception => e
+           next
+         end
+       end
+     }
+     return self.urls
+   end
+
+   # Get HTML/content of the page to be parsed
+   def get_page_content url
+     uri = URI(url)
+     request = Net::HTTP::Get.new(uri)
+
+     http = Net::HTTP.new(uri.host, uri.port)
+
+     # Need to enable SSL if the URL scheme is HTTPS
+     http.use_ssl = (uri.scheme == "https")
+
+     response = http.request(request)
+
+     # Follow the Location header if the response is a redirect
+     case response
+     when Net::HTTPSuccess
+       return response.body
+     when Net::HTTPRedirection
+       self.get_page_content response['location']
+     end
+   end
+
+ end # END RubyWebCrawler
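
As a usage sketch (not part of the published file), the class above could be driven like this; the root URL and the limit arguments are illustrative, and they default to 50 URLs and 60 seconds when omitted.

require 'ruby-web-crawler'

crawler = RubyWebCrawler.new('https://example.com', 100, 120) # root URL, URL limit, time limit in seconds
urls = crawler.start_crawl # array of unique http(s) URLs collected so far
puts urls

start_crawl never raises: a timeout or network error simply stops the recursion and whatever has been collected is returned.
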
metadata ADDED
@@ -0,0 +1,73 @@
+ --- !ruby/object:Gem::Specification
+ name: ruby-web-crawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Dwijen Purohit
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-06-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: net/http
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: uri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: A simple ruby gem to recursively traverse all URLs on a Root URL. It
+   returns all the URLs it encountered
+ email: dwijen4190@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/ruby-web-crawler.rb
+ homepage: http://rubygems.org/gems/ruby-web-crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5.1
+ signing_key:
+ specification_version: 4
+ summary: Gem to crawl from a root URL to all child URLs
+ test_files: []
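
For reference, a gemspec roughly like the following would serialize to the metadata above. This is a reconstruction, not the author's actual .gemspec; the net/http and uri development dependencies simply mirror the entries in the metadata (both are Ruby standard libraries rather than separate gems).

Gem::Specification.new do |s|
  s.name        = 'ruby-web-crawler'
  s.version     = '0.1.0'
  s.authors     = ['Dwijen Purohit']
  s.email       = 'dwijen4190@gmail.com'
  s.summary     = 'Gem to crawl from a root URL to all child URLs'
  s.description = 'A simple ruby gem to recursively traverse all URLs on a Root URL. ' \
                  'It returns all the URLs it encountered'
  s.homepage    = 'http://rubygems.org/gems/ruby-web-crawler'
  s.license     = 'MIT'
  s.files       = ['lib/ruby-web-crawler.rb']

  # Mirrors the metadata above; these names refer to stdlib, not published gems.
  s.add_development_dependency 'net/http'
  s.add_development_dependency 'uri'
end
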