yz_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 542b185a0e5259a82d4cec6d035f7c98803bb33f
+   data.tar.gz: e239ada03adc864e69f68df622414a80783c0fdd
+ SHA512:
+   metadata.gz: d4ff22cf76e3231bedf8ad7dea38124e1436510630e79c00e287e0110eead766d2e621e672b1d99ca166142fe9506079f6076e70d6129526ab3c6673cccc5fbd
+   data.tar.gz: 5603c47b7754586165410e91754abf8f63a366bc0236390f087cd64dff9e0a1fea123e73412eb9b04d4e89eff5b005b522ad9da209144251df9ccc5d75c3d85a
bin/crawler ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+ require 'yz_crawler'
+
+ crawler = YzCrawler.new
+ crawler.crawl
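Once installed, the gem exposes this script as the `crawler` executable (see the metadata below). Sample invocations, mirroring the display_usage text in CommandLineArgumentParser:

  crawler -c web -d 3 -p 100 -f 'urls.txt'   # breadth-first web crawl
  crawler -c domain -f 'urls.txt'            # same-domain crawl of the first URL in urls.txt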
lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,53 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     if ARGV.empty?
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     puts "Sample usage:"
+     puts "crawler -c web -d 3 -p 100 -f 'urls.txt'"
+     puts "-c must be either 'web' or 'domain'; any other value defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+ end
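A minimal sketch of the parser in isolation. The flag values are hypothetical, and it assumes lib/ is on the load path:

  # Invoked as: ruby demo.rb --crawl domain --page-limit 20
  require 'command_line_argument_parser'

  parser = CommandLineArgumentParser.new
  parser.parse_arguments
  parser.crawl_type  # => "domain"
  parser.page_limit  # => 20
  parser.url_file    # => "urls.txt" (default; -f not given)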
lib/spider.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'url_utils'
+
+ class Spider
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true
+         return if @already_visited.size >= page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size >= page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url, page_limit)
+       end
+     end
+   end
+
+   private
+
+   def open_url(url)
+     open(url) # open-uri's Kernel#open (URI.open on Ruby 3+)
+   rescue StandardError
+     puts 'Unable to open url: ' + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     doc = Hpricot(url_object) # Nokogiri would be the modern replacement
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue StandardError
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # resolve relative links against the current page's URL
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+ end
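Spider can also be driven directly, without the CLI wrapper. A minimal sketch, assuming lib/ is on the load path; the example.com seed URLs are placeholders:

  require 'spider'

  spider = Spider.new
  # Breadth-first: follow links 2 levels deep, stopping at 10 visited pages
  spider.crawl_web(['http://example.com'], 2, 10)

  # Recursive same-domain crawl from a single seed, capped at 10 pages
  Spider.new.crawl_domain('http://example.com', 10)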
lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls.first
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+ end
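UrlStore reads the file named by --url-file, one seed URL per line. A urls.txt with placeholder entries would look like:

  http://example.com
  http://example.org/start.html

get_urls returns the whole list (used by crawl_web), while get_url returns only the first entry (used by crawl_domain).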
lib/url_utils.rb ADDED
@@ -0,0 +1,57 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//)
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/)
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot.nil? || last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
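The branching in create_absolute_url_from_context is easiest to follow by example. These calls use placeholder URLs; the expected results are traced from the code above:

  include UrlUtils

  make_absolute('http://example.com/a/page.html', '/top.html')
  # => "http://example.com/top.html"      (root-relative: appended to the bare domain)
  make_absolute('http://example.com/a/', 'next.html')
  # => "http://example.com/a/next.html"   (base ends in '/': simple concatenation)
  make_absolute('http://example.com', 'next.html')
  # => "http://example.com/next.html"     (base is a bare domain: '/' inserted)
  make_absolute('http://example.com/a/page.html', 'next.html')
  # => "http://example.com/a/next.html"   (sibling link: last path segment replaced)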
lib/yz_crawler.rb ADDED
@@ -0,0 +1,28 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class YzCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,64 @@
+ --- !ruby/object:Gem::Specification
+ name: yz_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Yang Zhao
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-03-22 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple web crawler gem
+ email: yang.notold@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ - lib/yz_crawler.rb
+ homepage: http://rubygems.org/gems/yz_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.3
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []
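For local experimentation, the gem can be rebuilt and installed from these sources. This assumes a hypothetical yz_crawler.gemspec mirroring the metadata above, since the gemspec itself is not among the packaged files:

  gem build yz_crawler.gemspec
  gem install yz_crawler-0.0.1.gem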