yz_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 542b185a0e5259a82d4cec6d035f7c98803bb33f
+   data.tar.gz: e239ada03adc864e69f68df622414a80783c0fdd
+ SHA512:
+   metadata.gz: d4ff22cf76e3231bedf8ad7dea38124e1436510630e79c00e287e0110eead766d2e621e672b1d99ca166142fe9506079f6076e70d6129526ab3c6673cccc5fbd
+   data.tar.gz: 5603c47b7754586165410e91754abf8f63a366bc0236390f087cd64dff9e0a1fea123e73412eb9b04d4e89eff5b005b522ad9da209144251df9ccc5d75c3d85a
bin/crawler ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+ require 'yz_crawler'
+
+ crawler = YzCrawler.new
+ crawler.crawl
lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,53 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     puts "Sample usage:"
+     puts "crawler -c web -d 3 -p 100 -f 'urls.txt'"
+     puts "-c must be either 'web' or 'domain'; anything else defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+ end
lib/spider.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'url_utils'
+
+ class Spider
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true if @already_visited[url].nil?
+         return if @already_visited.size == page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url, page_limit)
+       end
+     end
+   end
+
+   private
+
+   def open_url(url)
+     open(url)
+   rescue
+     puts "Unable to open url: " + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     doc = Hpricot(url_object) # Nokogiri is the modern replacement for Hpricot
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # resolve relative links against the current page's URL
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+ end
lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls[0]
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+ end
lib/url_utils.rb ADDED
@@ -0,0 +1,57 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//)
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash != nil
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/)
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
lib/yz_crawler.rb ADDED
@@ -0,0 +1,28 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class YzCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,64 @@
+ --- !ruby/object:Gem::Specification
+ name: yz_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Yang Zhao
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-03-22 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple web crawler gem
+ email: yang.notold@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ - lib/yz_crawler.rb
+ homepage: http://rubygems.org/gems/yz_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.3
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []