my_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2bf22d64affd3173f46bf53774604346dba33d2c5a164ef960702f82eac13308
+   data.tar.gz: 7e0659ba5a552eca90ac961ff10a162a1f4c81fdfc5c2a11da5888d6988b28a9
+ SHA512:
+   metadata.gz: e745ad676a3abede4ed11ae75a5ba0edea849a75d8168ad204f9b59ae8f536275f74ee817b77ac88f1927f3cdda1b2779faa1ee267424bcc6209e8e0c0908bee
+   data.tar.gz: 54173b6140c61cd880cde80c649e4aad59e39e3ed578cc941bf1e328cc19b2436a6ef470b9130cee4cb00cccad7beef546cc83ea37e43ec31a61de82475acb2a
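For anyone verifying a downloaded copy against these digests, a minimal sketch using Ruby's standard library (the local file path is an assumption):

    require 'digest'

    # Should match the SHA256 entry for data.tar.gz above.
    puts Digest::SHA256.hexdigest(File.binread('data.tar.gz'))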
data/bin/crawler ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+
+ require 'my_crawler'
+
+ crawler = MyCrawler.new
+ crawler.crawl
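The executable is a thin wrapper: it requires the library, builds a MyCrawler, and starts the crawl. A minimal sketch of the equivalent invocation from plain Ruby, assuming a urls.txt seed file exists (the option values are illustrative):

    # Equivalent to: crawler -c web -d 3 -p 100 -f urls.txt
    ARGV.replace(['--crawl', 'web', '--crawl-depth', '3', '--page-limit', '100', '--url-file', 'urls.txt'])

    require 'my_crawler'

    crawler = MyCrawler.new  # parses ARGV and reads the seed URLs
    crawler.crawl            # dispatches to crawl_web or crawl_domain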
data/lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,53 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     if ARGV.empty?
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     puts "Sample usage:"
+     puts "crawler -c web -d 3 -p 100 -f urls.txt"
+     puts "-c must be either 'web' or 'domain'; any other value falls back to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+ end
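The parser can be exercised on its own; a small sketch with illustrative option values:

    ARGV.replace(['--crawl', 'domain', '--page-limit', '50'])

    parser = CommandLineArgumentParser.new
    parser.parse_arguments

    parser.crawl_type   # => "domain"
    parser.page_limit   # => 50
    parser.crawl_depth  # => 3 (default)
    parser.url_file     # => "urls.txt" (default)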
data/lib/my_crawler.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class MyCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+ end
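MyCrawler only dispatches on the parsed crawl type; the same calls can be made against Spider directly when the command-line parser is not wanted. A sketch, assuming a urls.txt seed file:

    require 'spider'
    require 'url_store'

    store = UrlStore.new('urls.txt')

    # Breadth-first across sites, as with crawl_type == 'web':
    Spider.new.crawl_web(store.get_urls, 2, 50)

    # Confined to one site, as with crawl_type == 'domain'
    # (a fresh Spider keeps its visited set separate):
    Spider.new.crawl_domain(store.get_url, 50)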
data/lib/spider.rb ADDED
@@ -0,0 +1,81 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+ require 'open-uri'
4
+ require 'rubygems'
5
+ require 'hpricot'
6
+ require 'url_utils'
7
+
8
+ class Spider
9
+ include UrlUtils
10
+
11
+ def initialize
12
+ @already_visited = {}
13
+ end
14
+
15
+ def crawl_web(urls, depth=2, page_limit=100)
16
+ depth.times do
17
+ next_urls = []
18
+ urls.each do |url|
19
+ url_object = open_url(url)
20
+ next if url_object.nil?
21
+
22
+ url = update_url_if_redirected(url_object)
23
+ parsed_doc = parse_url(url_object)
24
+ next if parsed_doc.nil?
25
+
26
+ @already_visited[url] = true if @already_visited[url].nil?
27
+ return if already_visited.size == page_limit
28
+
29
+ next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
30
+ next_urls.uniq!
31
+ end
32
+ urls = next_urls
33
+ end
34
+
35
+ end
36
+
37
+ def crawl_domain(url, page_limit = 100)
38
+ return if @already_visited.size == page_limit
39
+
40
+ url_object = open_url(url)
41
+ return if url_object.nil?
42
+
43
+ parsed_doc = parse_url(url_object)
44
+ return if parsed_doc.nil?
45
+
46
+ @already_visited[url] = true if @already_visited[url].nil?
47
+ page_urls = find_urls_on_page(parsed_doc, url)
48
+ page_urls.each do |page_url|
49
+ if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
50
+ crawl_domain(page_url)
51
+ end
52
+ end
53
+ end
54
+
55
+ def open_url(url)
56
+ open(url)
57
+ rescue
58
+ puts "unable to open url: #{url}"
59
+ end
60
+
61
+ def update_url_if_redirected(url_object)
62
+ url_object.base_url.to_s
63
+ end
64
+
65
+ def parse_url(url_object)
66
+ doc = hpricot(url_object)
67
+ puts "Crawling url " + url_object.base_uri.to_s
68
+ rescue
69
+ puts "Could not parse url: " + url_object.base_uri.to_s
70
+ end
71
+
72
+ def find_urls_on_page(parsed_doc, current_url)
73
+ parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
74
+ new_url = x['href'].split('#')[0]
75
+ if new_url
76
+ new_url = make_absolute(current_url, new_url) if relative?(new_url)
77
+ urls_list.push(new_url)
78
+ end
79
+ end
80
+ end
81
+ end
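Note that @already_visited doubles as both the deduplication set and the page counter. A sketch of that bookkeeping in isolation (the URLs are illustrative):

    already_visited = {}
    page_limit = 2

    ['http://example.com/a', 'http://example.com/a', 'http://example.com/b'].each do |url|
      next if already_visited[url]                  # skip duplicates
      already_visited[url] = true
      break if already_visited.size >= page_limit   # stop once the budget is spent
    end

    already_visited.keys  # => ["http://example.com/a", "http://example.com/b"]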
data/lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
1
+ class UrlStore
2
+ attr_reader :urls
3
+ alias :get_urls :urls
4
+
5
+ def initialize(url_file)
6
+ @urls = read_urls_from_file(url_file)
7
+ end
8
+
9
+ def get_url
10
+ @urls[0]
11
+ end
12
+
13
+ def read_urls_from_file(url_file)
14
+ urls = []
15
+ File.open(url_file, 'r') do |file|
16
+ file.readlines.each do |line|
17
+ urls.push(line.chomp)
18
+ end
19
+ end
20
+ urls
21
+ end
22
+
23
+ private :read_urls_from_file
24
+ end
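UrlStore expects one URL per line in the seed file; a quick sketch using a temporary file:

    require 'tempfile'

    seed = Tempfile.new('urls')
    seed.puts 'http://example.com'
    seed.puts 'http://example.org'
    seed.close

    store = UrlStore.new(seed.path)
    store.get_urls  # => ["http://example.com", "http://example.org"]
    store.get_url   # => "http://example.com"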
data/lib/url_utils.rb ADDED
@@ -0,0 +1,70 @@
1
+ module UrlUtils
2
+ def relative?(url)
3
+ !url.matach(/^http/)
4
+ end
5
+
6
+ def make_absolute(potential_base, relative_url)
7
+ if relative_url.match(/^\//)
8
+ create_absolute_url_from_base(potential_base, relative_url)
9
+ else
10
+ create_absolute_url_from_context(potential_base, relative_url)
11
+ end
12
+ end
13
+
14
+ def urls_on_same_domain?(url1, url2)
15
+ get_domain(url1) == get_domain(url2)
16
+ end
17
+
18
+ def get_domain(url)
19
+ remove_extra_paths(url)
20
+ end
21
+
22
+ private
23
+ def create_absolute_url_from_base(potential_base, relative_url)
24
+ remove_extra_paths(potential_base) + relative_url
25
+ end
26
+
27
+ def remove_extra_paths(potential_base)
28
+ index_to_start_slash_search = potential_base.index('://') + 3
29
+ index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
30
+ if index_of_first_relevant_slash != nil?
31
+ potential_base(0, index_of_first_relevant_slash)
32
+ else
33
+ potential_base
34
+ end
35
+ end
36
+
37
+ def create_absolute_url_from_context(potential_base, relative_url)
38
+ remove_extra_paths(potential_base) + relative_url
39
+ end
40
+
41
+ def remove_extra_paths(potential_base)
42
+ index_to_start_slash_search = potential_base.index('://') +3
43
+ index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
44
+ if index_of_first_relevant_slash != nil
45
+ potential_base[0, index_of_first_relevant_slash]
46
+ else
47
+ potential_base
48
+ end
49
+ end
50
+
51
+ def create_absolute_url_from_context(potential_base, relative_url)
52
+ absolute_url = nil
53
+ if potential_base.match(/\/$/)
54
+ absolute_url = potential_base + relative_url
55
+ else
56
+ last_index_of_slash = potential_base.rindex('/')
57
+ if potential_base[last_index_of_slash-2, 2] == ':/'
58
+ absolute_url = potential_base + '/' + relative_url
59
+ else
60
+ last_index_of_dot = potential_base.rindex(',')
61
+ if last_index_of_dot < last_index_of_slash
62
+ absolute_url = potential_base + '/' + relative_url
63
+ else
64
+ absolute_url = potential_base[0, last_index_of_slash+1] + relative_url
65
+ end
66
+ end
67
+ end
68
+ absolute_url
69
+ end
70
+ end
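These helpers resolve links by string manipulation rather than through URI; a sketch of the cases they cover (example URLs are illustrative):

    class Resolver; include UrlUtils; end
    r = Resolver.new

    r.relative?('http://example.com/a')                  # => false
    r.make_absolute('http://example.com/a/b', '/c')      # => "http://example.com/c"
    r.make_absolute('http://example.com/a/', 'c')        # => "http://example.com/a/c"
    r.make_absolute('http://example.com/a/b.html', 'c')  # => "http://example.com/a/c"
    r.urls_on_same_domain?('http://example.com/a',
                           'http://example.org/b')       # => false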
metadata ADDED
@@ -0,0 +1,63 @@
+ --- !ruby/object:Gem::Specification
+ name: my_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - 57581600@gmail.com
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-10-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple web crawler gem
+ email:
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/my_crawler.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ homepage: http://rubygems.org/gems/my_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.2.3
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []