zy_crawler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/zycrawler +5 -0
- data/lib/command_line_argument_parser.rb +55 -0
- data/lib/spider.rb +84 -0
- data/lib/url_store.rb +26 -0
- data/lib/url_utils.rb +56 -0
- data/lib/zy_crawler.rb +23 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2024898f0c89209ea0a427d2fa0c03a1c628f486baac028522c9c2bb400feab1
|
4
|
+
data.tar.gz: f1e0d8bb0b62406e197cec47fd2e5c200a35669cca578b3f64bb788bd270b492
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a4ee958478df918e2ddf8b7fe6bb12949e973338aceb6ea0aa9024ef8c2404c0ea0c64bc69b7b6c2cc01d7d1e7d1476539430dde715025899f7a70218b2b01db
|
7
|
+
data.tar.gz: 10fcbdf784221de565506976d6aa9359203ef32b79efe074b745abaf420df96cff24e55e114795fcf05337e557127cb59cf6f08db0e9351782dcc31a93b6e2ca
|
data/bin/zycrawler
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'getoptlong'
|
2
|
+
# command line argument parser
|
3
|
+
# Parses command-line options for the crawler: crawl type (web/domain),
# crawl depth, page limit and the seed-URL file, via GetoptLong.
class CommandLineArgumentParser
  WEB_CRAWLER = 'web'.freeze
  DOMAIN_CRAWLER = 'domain'.freeze
  attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file

  def initialize
    # Show usage and bail out when no arguments were supplied.
    if ARGV.empty?
      display_usage
      exit
    end
    @opts = GetoptLong.new(
      ['--crawl', '-c', GetoptLong::REQUIRED_ARGUMENT],
      ['--crawl-depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
      ['--page-limit', '-p', GetoptLong::OPTIONAL_ARGUMENT],
      ['--url-file', '-f', GetoptLong::OPTIONAL_ARGUMENT]
    )
    # Defaults used when the corresponding option is absent.
    # BUG FIX: the default crawl type was 'data.txt' (a file name),
    # which is neither 'web' nor 'domain'; default to WEB_CRAWLER.
    @crawl_type = WEB_CRAWLER
    @crawl_depth = 3
    @page_limit = 100
    @url_file = 'urls.txt'
  end

  # Prints a short usage banner for the executable.
  def display_usage
    p 'Sample usage:'
    p "ruby zy_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
    p "-c must be either 'web' or 'domain', will default to 'web' if you type garbage"
  end

  # Walks the GetoptLong options and stores each recognised value.
  def parse_arguments
    @opts.each do |opt, arg|
      case opt
      when '--crawl'
        ensure_crawl_type_correct(arg)
      when '--crawl-depth'
        @crawl_depth = arg.to_i
      when '--page-limit'
        @page_limit = arg.to_i
      when '--url-file'
        @url_file = arg
      else
        puts 'what happened?' # unreachable for the declared option set
      end
    end
  end

  # Coerces the crawl type: any value other than the two known types
  # falls back to WEB_CRAWLER (as promised by the usage banner).
  def ensure_crawl_type_correct(value)
    @crawl_type = [WEB_CRAWLER, DOMAIN_CRAWLER].include?(value) ? value : WEB_CRAWLER
  end
end
|
data/lib/spider.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'hpricot'
|
6
|
+
require 'url_utils'
|
7
|
+
# Young Spider
|
8
|
+
# Simple crawler that walks pages either breadth-first across the web
# (crawl_web) or recursively within a single domain (crawl_domain).
class Spider
  include UrlUtils

  def initialize
    # Tracks every URL already processed (url => true).
    @already_visited = {}
  end

  # Breadth-first crawl: visits `urls`, then the links found on those
  # pages, up to `depth` levels or until `page_limit` pages are seen.
  def crawl_web(urls, depth = 2, page_limit = 100)
    depth.times do
      next_urls = []
      urls.each do |url|
        url_object = open_url(url)
        next if url_object.nil?

        url = update_url_if_redirected(url_object)
        parsed_doc = parse_url(url_object)
        next if parsed_doc.nil?

        # BUG FIX: was `@already_visited[url] == true` — a comparison
        # with no effect — so visited pages were never recorded.
        @already_visited[url] = true if @already_visited[url].nil?
        return if @already_visited.size == page_limit

        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
        next_urls.uniq!
      end
      urls = next_urls
    end
  end

  # Depth-first crawl restricted to the domain of `url`.
  def crawl_domain(url, page_limit = 100)
    return if @already_visited.size == page_limit

    url_object = open_url(url)
    return if url_object.nil?

    parsed_doc = parse_url(url_object)
    return if parsed_doc.nil?

    # BUG FIX: was `==`, which never marked the page as visited and
    # allowed unbounded recursion between mutually-linked pages.
    @already_visited[url] = true if @already_visited[url].nil?
    page_urls = find_urls_on_page(parsed_doc, url)
    page_urls.each do |page_url|
      if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
        # BUG FIX: propagate page_limit; it previously reset to the
        # default (100) on every recursive call.
        crawl_domain(page_url, page_limit)
      end
    end
  end

  private

  # Opens a URL, returning nil (after logging) when it cannot be fetched.
  def open_url(url)
    URI.open(url)
  rescue StandardError
    puts 'Unable to open url: ' + url
  end

  # Returns the final URI after any HTTP redirects.
  # (Typo fixed: was `upate_url_if_redirected`; private, no outside callers.)
  def update_url_if_redirected(url_object)
    url_object.base_uri.to_s
  end

  # Parses the fetched page with Hpricot; nil (after logging) on failure.
  def parse_url(url_object)
    doc = Hpricot(url_object) # nokogiri would be the modern replacement
    puts 'Crawling url ' + url_object.base_uri.to_s
    doc
  rescue StandardError
    puts 'Could not parse url: ' + url_object.base_uri.to_s
  end

  # Collects every href on the page, dropping fragment identifiers and
  # resolving relative links against `current_url`.
  def find_urls_on_page(parsed_doc, current_url)
    parsed_doc.search('a[@href]').each_with_object([]) do |link, urls_list|
      new_url = link['href'].split('#')[0]
      if new_url
        new_url = make_absolute(current_url, new_url) if relative?(new_url)
        urls_list.push(new_url)
      end
    end
  end
end
|
data/lib/url_store.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Memory URL Store
|
2
|
+
# In-memory store of crawl seed URLs loaded from a plain-text file
# (one URL per line).
class URLStore
  attr_reader :urls
  alias get_urls urls

  # url_file: path to a text file containing one URL per line.
  def initialize(url_file)
    @urls = read_urls_from_file(url_file)
  end

  # Returns the first URL in the store (nil when the file was empty).
  def first_url
    @urls.first
  end
  # Backward-compatible alias for the original (misspelled) method name.
  alias firt_url first_url

  private

  # Reads the file and returns its lines with trailing newlines removed.
  def read_urls_from_file(url_file)
    File.readlines(url_file, chomp: true)
  end
end
|
data/lib/url_utils.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# URL Tools
|
2
|
+
# Helper predicates and transformations for classifying and
# normalising URLs during a crawl.
module UrlUtils
  # True when the URL does not begin with "http" (i.e. it is relative).
  def relative?(url)
    url.match(/^http/).nil?
  end

  # Resolves a relative URL against a base URL: root-relative links
  # ("/x") attach to the host, document-relative links to the base path.
  def make_absolute(potential_base, relative_url)
    if relative_url.start_with?('/')
      create_absolute_url_from_base(potential_base, relative_url)
    else
      create_abs_url_from_ctx(potential_base, relative_url)
    end
  end

  # True when both URLs share the same scheme://host prefix.
  def urls_on_same_domain?(url1, url2)
    get_domain(url1) == get_domain(url2)
  end

  # Returns the scheme://host portion of the URL.
  def get_domain(url)
    remove_extra_paths(url)
  end

  # Root-relative link: append the path to the scheme://host of the base.
  def create_absolute_url_from_base(potential_base, relative_url)
    remove_extra_paths(potential_base) + relative_url
  end

  # Strips everything after the host, e.g. "http://a.com/x/y" => "http://a.com".
  def remove_extra_paths(potential_base)
    search_from = potential_base.index('://') + 3
    first_path_slash = potential_base.index('/', search_from)
    first_path_slash ? potential_base[0, first_path_slash] : potential_base
  end

  # Document-relative link: resolve against the directory of the base URL.
  def create_abs_url_from_ctx(potential_base, relative_url)
    return potential_base + relative_url if potential_base.end_with?('/')

    last_slash = potential_base.rindex('/')
    # Base has no path at all ("http://a.com"): separate with a slash.
    return potential_base + '/' + relative_url if potential_base[last_slash - 2, 2] == ':/'

    last_dot = potential_base.rindex('.')
    if last_dot < last_slash
      # Final path segment looks like a directory: append beneath it.
      potential_base + '/' + relative_url
    else
      # Final path segment looks like a file: replace it.
      potential_base[0, last_slash + 1] + relative_url
    end
  end
end
|
data/lib/zy_crawler.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'command_line_argument_parser'
|
2
|
+
require 'spider'
|
3
|
+
require 'url_store'
|
4
|
+
|
5
|
+
# ZyCrawler is derived from Jason Zhao's YzCrawler
|
6
|
+
# Entry point: wires the argument parser, spider and URL store
# together and dispatches to the requested crawl mode.
class ZyCrawler
  def initialize
    @argument_parser = CommandLineArgumentParser.new
    @argument_parser.parse_arguments
    @spider = Spider.new
    @url_store = URLStore.new(@argument_parser.url_file)
  end

  # Runs a web-wide crawl when the parsed type is WEB_CRAWLER;
  # otherwise a single-domain crawl seeded with the first stored URL.
  def crawl
    case @argument_parser.crawl_type
    when CommandLineArgumentParser::WEB_CRAWLER
      @spider.crawl_web(@url_store.get_urls,
                        @argument_parser.crawl_depth,
                        @argument_parser.page_limit)
    else
      @spider.crawl_domain(@url_store.firt_url, @argument_parser.page_limit)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: zy_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- uuen sky
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-03-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: hpricot
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.8'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.8'
|
27
|
+
description: A simple web crawler demo
|
28
|
+
email: uuensky@163.com
|
29
|
+
executables:
|
30
|
+
- zycrawler
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- bin/zycrawler
|
35
|
+
- lib/command_line_argument_parser.rb
|
36
|
+
- lib/spider.rb
|
37
|
+
- lib/url_store.rb
|
38
|
+
- lib/url_utils.rb
|
39
|
+
- lib/zy_crawler.rb
|
40
|
+
homepage: https://rubygems.org/gems/zycrawler
|
41
|
+
licenses:
|
42
|
+
- MIT
|
43
|
+
metadata:
|
44
|
+
changelog_uri: https://github.com/uuensky/zycrawler/blob/master/CHANGELOG.md
|
45
|
+
homepage_uri: https://rubygems.org/gems/zycrawler
|
46
|
+
source_code_uri: https://github.com/uuensky/zycrawler.git
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubygems_version: 3.2.32
|
63
|
+
signing_key:
|
64
|
+
specification_version: 4
|
65
|
+
summary: A young spider
|
66
|
+
test_files: []
|