yz_crawler 0.0.1
- checksums.yaml +7 -0
- data/bin/crawler +5 -0
- data/lib/command_line_argument_parser.rb +53 -0
- data/lib/spider.rb +84 -0
- data/lib/url_store.rb +24 -0
- data/lib/url_utils.rb +57 -0
- data/lib/yz_crawler.rb +28 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 542b185a0e5259a82d4cec6d035f7c98803bb33f
  data.tar.gz: e239ada03adc864e69f68df622414a80783c0fdd
SHA512:
  metadata.gz: d4ff22cf76e3231bedf8ad7dea38124e1436510630e79c00e287e0110eead766d2e621e672b1d99ca166142fe9506079f6076e70d6129526ab3c6673cccc5fbd
  data.tar.gz: 5603c47b7754586165410e91754abf8f63a366bc0236390f087cd64dff9e0a1fea123e73412eb9b04d4e89eff5b005b522ad9da209144251df9ccc5d75c3d85a
data/lib/command_line_argument_parser.rb
ADDED
@@ -0,0 +1,53 @@
require 'getoptlong'

class CommandLineArgumentParser
  WEB_CRAWLER = 'web'
  DOMAIN_CRAWLER = 'domain'
  attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file

  def initialize
    unless ARGV.length >= 1
      display_usage
      exit
    end
    @opts = GetoptLong.new(
      ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
      ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
      ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
      ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
    )
    @crawl_type = WEB_CRAWLER
    @crawl_depth = 3
    @page_limit = 100
    @url_file = 'urls.txt'
  end

  def display_usage
    p "Sample usage:"
    p "ruby search-engine-main.rb -c web -d 3 -p 100 -f 'urls.txt'"
    p "-c must be either 'web' or 'domain'; it defaults to 'web' if you pass anything else"
  end

  def parse_arguments
    @opts.each do |opt, arg|
      case opt
      when '--crawl'
        ensure_crawl_type_correct(arg)
      when '--crawl-depth'
        @crawl_depth = arg.to_i
      when '--page-limit'
        @page_limit = arg.to_i
      when '--url-file'
        @url_file = arg
      end
    end
  end

  def ensure_crawl_type_correct(value)
    if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
      @crawl_type = WEB_CRAWLER
    else
      @crawl_type = value
    end
  end
end
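A minimal sketch of exercising the parser on its own, assuming lib/ is on the load path; the script name and flag values are just examples, and the actual bin/crawler contents are not shown in this diff:

# hypothetical driver, not the gem's bin/crawler
# run as: ruby driver.rb -c web -d 3 -p 100 -f urls.txt
require 'command_line_argument_parser'

parser = CommandLineArgumentParser.new
parser.parse_arguments
puts "type=#{parser.crawl_type} depth=#{parser.crawl_depth} limit=#{parser.page_limit} file=#{parser.url_file}"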
data/lib/spider.rb
ADDED
@@ -0,0 +1,84 @@
require 'net/http'
require 'uri'
require 'open-uri'
require 'rubygems'
require 'hpricot'
require 'url_utils'

class Spider
  include UrlUtils

  def initialize
    @already_visited = {}
  end

  def crawl_web(urls, depth = 2, page_limit = 100)
    depth.times do
      next_urls = []
      urls.each do |url|
        url_object = open_url(url)
        next if url_object.nil?

        url = update_url_if_redirected(url_object)
        parsed_doc = parse_url(url_object)
        next if parsed_doc.nil?

        @already_visited[url] = true if @already_visited[url].nil?
        return if @already_visited.size == page_limit

        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
        next_urls.uniq!
      end
      urls = next_urls
    end
  end

  def crawl_domain(url, page_limit = 100)
    return if @already_visited.size == page_limit

    url_object = open_url(url)
    return if url_object.nil?

    parsed_doc = parse_url(url_object)
    return if parsed_doc.nil?

    @already_visited[url] = true if @already_visited[url].nil?
    page_urls = find_urls_on_page(parsed_doc, url)
    page_urls.each do |page_url|
      if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
        crawl_domain(page_url, page_limit)
      end
    end
  end

  private

  def open_url(url)
    open(url)
  rescue
    puts "Unable to open url: " + url
  end

  def update_url_if_redirected(url_object)
    url_object.base_uri.to_s
  end

  def parse_url(url_object)
    doc = Hpricot(url_object) # Nokogiri would be the usual alternative parser
    puts 'Crawling url ' + url_object.base_uri.to_s
    doc
  rescue
    puts 'Could not parse url: ' + url_object.base_uri.to_s
  end

  def find_urls_on_page(parsed_doc, current_url)
    parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
      new_url = x['href'].split('#')[0]
      if new_url
        # make_absolute resolves relative links against the current page's URL
        new_url = make_absolute(current_url, new_url) if relative?(new_url)
        urls_list.push(new_url)
      end
    end
  end
end
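A minimal sketch of driving Spider directly, assuming the hpricot gem is installed and lib/ is on the load path; the seed URL, depth, and page limit are placeholders:

require 'spider'

spider = Spider.new
# breadth-first crawl from the seed URLs, two levels deep, at most 10 pages
spider.crawl_web(['http://example.com/'], 2, 10)

# or stay within a single domain
Spider.new.crawl_domain('http://example.com/', 10)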
data/lib/url_store.rb
ADDED
@@ -0,0 +1,24 @@
class UrlStore
  attr_reader :urls
  alias :get_urls :urls

  def initialize(url_file)
    @urls = read_urls_from_file(url_file)
  end

  def get_url
    @urls[0]
  end

  def read_urls_from_file(url_file)
    urls = []
    File.open(url_file, 'r') do |file|
      file.readlines.each do |line|
        urls.push(line.chomp)
      end
    end
    urls
  end

  private :read_urls_from_file
end
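A quick sketch of UrlStore in isolation, assuming a local urls.txt with one URL per line (the same default file name the argument parser uses):

require 'url_store'

store = UrlStore.new('urls.txt')
store.get_urls # => every URL in the file (used for web crawls)
store.get_url  # => the first URL only (used for domain crawls)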
data/lib/url_utils.rb
ADDED
@@ -0,0 +1,57 @@
module UrlUtils
  def relative?(url)
    !url.match(/^http/)
  end

  def make_absolute(potential_base, relative_url)
    if relative_url.match(/^\//)
      create_absolute_url_from_base(potential_base, relative_url)
    else
      create_absolute_url_from_context(potential_base, relative_url)
    end
  end

  def urls_on_same_domain?(url1, url2)
    get_domain(url1) == get_domain(url2)
  end

  def get_domain(url)
    remove_extra_paths(url)
  end

  private

  def create_absolute_url_from_base(potential_base, relative_url)
    remove_extra_paths(potential_base) + relative_url
  end

  def remove_extra_paths(potential_base)
    index_to_start_slash_search = potential_base.index('://') + 3
    index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
    if index_of_first_relevant_slash != nil
      potential_base[0, index_of_first_relevant_slash]
    else
      potential_base
    end
  end

  def create_absolute_url_from_context(potential_base, relative_url)
    absolute_url = nil
    if potential_base.match(/\/$/)
      absolute_url = potential_base + relative_url
    else
      last_index_of_slash = potential_base.rindex('/')
      if potential_base[last_index_of_slash - 2, 2] == ':/'
        absolute_url = potential_base + '/' + relative_url
      else
        last_index_of_dot = potential_base.rindex('.')
        if last_index_of_dot < last_index_of_slash
          absolute_url = potential_base + '/' + relative_url
        else
          absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
        end
      end
    end
    absolute_url
  end
end
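A few illustrative calls, assuming the module is mixed into a small demo object; the URLs are placeholders:

require 'url_utils'

class UrlUtilsDemo
  include UrlUtils
end

demo = UrlUtilsDemo.new
demo.relative?('/about')                                      # => truthy
demo.make_absolute('http://example.com/a/b.html', 'c.html')   # => "http://example.com/a/c.html"
demo.make_absolute('http://example.com/a/b.html', '/c.html')  # => "http://example.com/c.html"
demo.urls_on_same_domain?('http://example.com/a', 'http://example.com/b') # => true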
data/lib/yz_crawler.rb
ADDED
@@ -0,0 +1,28 @@
require 'spider'
require 'command_line_argument_parser'
require 'url_store'

class YzCrawler
  def initialize
    @argument_parser = CommandLineArgumentParser.new
    @argument_parser.parse_arguments
    @spider = Spider.new
    @url_store = UrlStore.new(@argument_parser.url_file)
  end

  def crawl
    if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
      @spider.crawl_web(
        @url_store.get_urls,
        @argument_parser.crawl_depth,
        @argument_parser.page_limit
      )
    else
      @spider.crawl_domain(
        @url_store.get_url,
        @argument_parser.page_limit
      )
    end
  end
end
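Putting it together, a hedged sketch of what a thin executable could look like; the gem metadata lists bin/crawler, but its five lines are not shown in this diff, so this is an assumption rather than the published script:

#!/usr/bin/env ruby
# hypothetical entry point
# usage: crawler -c web -d 3 -p 100 -f urls.txt
require 'yz_crawler'

YzCrawler.new.crawl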
metadata
ADDED
@@ -0,0 +1,64 @@
--- !ruby/object:Gem::Specification
name: yz_crawler
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Yang Zhao
autorequire:
bindir: bin
cert_chain: []
date: 2015-03-22 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.8'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.8'
description: A simple web crawler gem
email: yang.notold@gmail.com
executables:
- crawler
extensions: []
extra_rdoc_files: []
files:
- bin/crawler
- lib/command_line_argument_parser.rb
- lib/spider.rb
- lib/url_store.rb
- lib/url_utils.rb
- lib/yz_crawler.rb
homepage: http://rubygems.org/gems/yz_crawler
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.3
signing_key:
specification_version: 4
summary: My first gem
test_files: []