my_crawler 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2bf22d64affd3173f46bf53774604346dba33d2c5a164ef960702f82eac13308
+   data.tar.gz: 7e0659ba5a552eca90ac961ff10a162a1f4c81fdfc5c2a11da5888d6988b28a9
+ SHA512:
+   metadata.gz: e745ad676a3abede4ed11ae75a5ba0edea849a75d8168ad204f9b59ae8f536275f74ee817b77ac88f1927f3cdda1b2779faa1ee267424bcc6209e8e0c0908bee
+   data.tar.gz: 54173b6140c61cd880cde80c649e4aad59e39e3ed578cc941bf1e328cc19b2436a6ef470b9130cee4cb00cccad7beef546cc83ea37e43ec31a61de82475acb2a
data/bin/crawler ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+
+ require 'my_crawler'
+
+ crawler = MyCrawler.new
+
+ crawler.crawl
data/lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,54 @@
+ require 'getoptlong'
+
+ class Command_line_argument_parser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p "Sample usage:"
+     p "ruby search-engine-main.rb -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; any other value defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+
+ end
data/lib/my_crawler.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class MyCrawler
+   def initialize
+     @argument_parser = Command_line_argument_parser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == Command_line_argument_parser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+ end
data/lib/spider.rb ADDED
@@ -0,0 +1,81 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'url_utils'
+
+ class Spider
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth=2, page_limit=100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true if @already_visited[url].nil?
+         return if @already_visited.size == page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url, page_limit)
+       end
+     end
+   end
+
+   def open_url(url)
+     URI.open(url)
+   rescue
+     puts "unable to open url: #{url}"
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     puts "Crawling url " + url_object.base_uri.to_s
+     Hpricot(url_object)
+   rescue
+     puts "Could not parse url: " + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+ end
data/lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls[0]
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+ end
data/lib/url_utils.rb ADDED
@@ -0,0 +1,56 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//)
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash != nil
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/)
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
metadata ADDED
@@ -0,0 +1,63 @@
+ --- !ruby/object:Gem::Specification
+ name: my_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - 57581600@gmail.com
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-10-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple web crawler gem
+ email:
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/my_crawler.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ homepage: http://rubygems.org/gems/my_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.2.3
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []
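
A minimal usage sketch, not part of the packaged files: it drives MyCrawler programmatically instead of through bin/crawler. The seed URL, flag values, and the pre-written urls.txt file are illustrative assumptions; only the classes and options defined above are used.

    # Sketch only: seed URL and flag values below are assumptions for illustration.
    require 'my_crawler'

    # UrlStore reads one URL per line from the --url-file (default 'urls.txt').
    File.write('urls.txt', "http://example.com\n")

    # GetoptLong inside Command_line_argument_parser reads ARGV, so set it first.
    # Equivalent to: crawler --crawl web --crawl-depth 2 --page-limit 50 --url-file urls.txt
    ARGV.replace(['--crawl', 'web', '--crawl-depth', '2',
                  '--page-limit', '50', '--url-file', 'urls.txt'])

    crawler = MyCrawler.new   # parses ARGV, builds Spider and UrlStore
    crawler.crawl             # crawl_web over the seed URLs up to the page limit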