zy_crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/zycrawler +5 -0
- data/lib/command_line_argument_parser.rb +55 -0
- data/lib/spider.rb +84 -0
- data/lib/url_store.rb +26 -0
- data/lib/url_utils.rb +56 -0
- data/lib/zy_crawler.rb +23 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 2024898f0c89209ea0a427d2fa0c03a1c628f486baac028522c9c2bb400feab1
+  data.tar.gz: f1e0d8bb0b62406e197cec47fd2e5c200a35669cca578b3f64bb788bd270b492
+SHA512:
+  metadata.gz: a4ee958478df918e2ddf8b7fe6bb12949e973338aceb6ea0aa9024ef8c2404c0ea0c64bc69b7b6c2cc01d7d1e7d1476539430dde715025899f7a70218b2b01db
+  data.tar.gz: 10fcbdf784221de565506976d6aa9359203ef32b79efe074b745abaf420df96cff24e55e114795fcf05337e557127cb59cf6f08db0e9351782dcc31a93b6e2ca
data/bin/zycrawler
ADDED
data/lib/command_line_argument_parser.rb
ADDED
@@ -0,0 +1,55 @@
+require 'getoptlong'
+# command line argument parser
+class CommandLineArgumentParser
+  WEB_CRAWLER = 'web'.freeze
+  DOMAIN_CRAWLER = 'domain'.freeze
+  attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+  def initialize
+    unless ARGV.length >= 1
+      display_usage
+      exit
+    end
+    @opts = GetoptLong.new(
+      ['--crawl', '-c', GetoptLong::REQUIRED_ARGUMENT],
+      ['--crawl-depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
+      ['--page-limit', '-p', GetoptLong::OPTIONAL_ARGUMENT],
+      ['--url-file', '-f', GetoptLong::OPTIONAL_ARGUMENT]
+    )
+    @crawl_type = WEB_CRAWLER
+    @crawl_depth = 3
+    @page_limit = 100
+    @url_file = 'urls.txt'
+  end
+
+  def display_usage
+    p 'Sample usage:'
+    p "ruby zy_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+    p "-c must be either 'web' or 'domain'; it will default to 'web' if you type garbage"
+  end
+
+  def parse_arguments
+    @opts.each do |opt, arg|
+      case opt
+      when '--crawl'
+        ensure_crawl_type_correct(arg)
+      when '--crawl-depth'
+        @crawl_depth = arg.to_i
+      when '--page-limit'
+        @page_limit = arg.to_i
+      when '--url-file'
+        @url_file = arg
+      else
+        puts 'what happened?'
+      end
+    end
+  end
+
+  def ensure_crawl_type_correct(value)
+    if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+      @crawl_type = WEB_CRAWLER
+    else
+      @crawl_type = value
+    end
+  end
+end
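For reference, a minimal sketch of driving this parser, assuming ARGV carries the flags registered above (the script name and flag values below are hypothetical, taken from the usage text):

    # e.g. ruby zy_crawler.rb --crawl domain --crawl-depth 2 --page-limit 50 --url-file urls.txt
    parser = CommandLineArgumentParser.new
    parser.parse_arguments
    parser.crawl_type   # => "domain"
    parser.crawl_depth  # => 2
    parser.page_limit   # => 50
    parser.url_file     # => "urls.txt"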
data/lib/spider.rb
ADDED
@@ -0,0 +1,84 @@
+require 'net/http'
+require 'uri'
+require 'open-uri'
+require 'rubygems'
+require 'hpricot'
+require 'url_utils'
+# Young Spider
+class Spider
+  include UrlUtils
+
+  def initialize
+    @already_visited = {}
+  end
+
+  def crawl_web(urls, depth = 2, page_limit = 100)
+    depth.times do
+      next_urls = []
+      urls.each do |url|
+        url_object = open_url(url)
+        next if url_object.nil?
+
+        url = update_url_if_redirected(url_object)
+        parsed_doc = parse_url(url_object)
+        next if parsed_doc.nil?
+
+        @already_visited[url] = true if @already_visited[url].nil?
+        return if @already_visited.size == page_limit
+
+        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+        next_urls.uniq!
+      end
+      urls = next_urls
+    end
+  end
+
+  def crawl_domain(url, page_limit = 100)
+    return if @already_visited.size == page_limit
+
+    url_object = open_url(url)
+    return if url_object.nil?
+
+    parsed_doc = parse_url(url_object)
+    return if parsed_doc.nil?
+
+    @already_visited[url] = true if @already_visited[url].nil?
+    page_urls = find_urls_on_page(parsed_doc, url)
+    page_urls.each do |page_url|
+      if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+        crawl_domain(page_url, page_limit)
+      end
+    end
+  end
+
+  private
+
+  def open_url(url)
+    URI.open(url)
+  rescue
+    puts 'Unable to open url: ' + url
+  end
+
+  def update_url_if_redirected(url_object)
+    url_object.base_uri.to_s
+  end
+
+  def parse_url(url_object)
+    doc = Hpricot(url_object) # nokogiri could be used instead
+    puts 'Crawling url ' + url_object.base_uri.to_s
+    doc
+  rescue
+    puts 'Could not parse url: ' + url_object.base_uri.to_s
+  end
+
+  def find_urls_on_page(parsed_doc, current_url)
+    parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+      new_url = x['href'].split('#')[0]
+      if new_url
+        # complicated feature: make_absolute
+        new_url = make_absolute(current_url, new_url) if relative?(new_url)
+        urls_list.push(new_url)
+      end
+    end
+  end
+end
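A minimal sketch of driving Spider directly, assuming the hpricot dependency is installed and the seed URL below (an example, not from the gem) is reachable:

    require 'spider'

    spider = Spider.new
    # Follow links from the seed list for up to 2 levels, stopping at 10 visited pages.
    spider.crawl_web(['https://example.com/'], 2, 10)
    # Or stay on a single domain, bounded by the same page limit.
    spider.crawl_domain('https://example.com/', 10)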
data/lib/url_store.rb
ADDED
@@ -0,0 +1,26 @@
+# Memory URL Store
+class URLStore
+  attr_reader :urls
+  alias get_urls urls
+
+  def initialize(url_file)
+    @urls = read_urls_from_file(url_file)
+  end
+
+  def first_url
+    @urls[0]
+  end
+
+  private
+
+  def read_urls_from_file(url_file)
+    urls = []
+    File.open(url_file, 'r') do |file|
+      file.readlines.each do |line|
+        urls.push(line.chomp)
+      end
+    end
+    urls
+  end
+
+end
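URLStore expects a plain text file with one URL per line. A small sketch (the file name and URLs are examples only):

    File.write('urls.txt', "https://example.com/\nhttps://example.org/\n")

    store = URLStore.new('urls.txt')
    store.get_urls   # => ["https://example.com/", "https://example.org/"]
    store.first_url  # => "https://example.com/"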
data/lib/url_utils.rb
ADDED
@@ -0,0 +1,56 @@
+# URL Tools
+module UrlUtils
+  def relative?(url)
+    !url.match(/^http/)
+  end
+
+  def make_absolute(potential_base, relative_url)
+    if relative_url =~ /^\//
+      create_absolute_url_from_base(potential_base, relative_url)
+    else
+      create_abs_url_from_ctx(potential_base, relative_url)
+    end
+  end
+
+  def urls_on_same_domain?(url1, url2)
+    get_domain(url1) == get_domain(url2)
+  end
+
+  def get_domain(url)
+    remove_extra_paths(url)
+  end
+
+  def create_absolute_url_from_base(potential_base, relative_url)
+    remove_extra_paths(potential_base) + relative_url
+  end
+
+  def remove_extra_paths(potential_base)
+    index_to_start_slash_search = potential_base.index('://') + 3
+    index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+    if !index_of_first_relevant_slash.nil?
+      potential_base[0, index_of_first_relevant_slash]
+    else
+      potential_base
+    end
+  end
+
+  def create_abs_url_from_ctx(potential_base, relative_url)
+    absolute_url = nil
+    if potential_base =~ /\/$/
+      absolute_url = potential_base + relative_url
+    else
+      last_index_of_slash = potential_base.rindex('/')
+      if potential_base[last_index_of_slash - 2, 2] == ':/'
+        absolute_url = potential_base + '/' + relative_url
+      else
+        last_index_of_dot = potential_base.rindex('.')
+        if last_index_of_dot < last_index_of_slash
+          absolute_url = potential_base + '/' + relative_url
+        else
+          absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+        end
+      end
+    end
+    absolute_url
+  end
+end
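To illustrate the URL arithmetic above, a few examples traced by hand from the string logic (the URLs are hypothetical):

    require 'url_utils'
    include UrlUtils

    relative?('about.html')                                            # => true (no leading "http")
    make_absolute('http://example.com/blog/post.html', '/img/a.png')   # => "http://example.com/img/a.png"
    make_absolute('http://example.com/blog/post.html', 'next.html')    # => "http://example.com/blog/next.html"
    urls_on_same_domain?('http://example.com/a', 'http://example.com/b') # => true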
data/lib/zy_crawler.rb
ADDED
@@ -0,0 +1,23 @@
+require 'command_line_argument_parser'
+require 'spider'
+require 'url_store'
+
+# ZyCrawler is based on Jason zhao's YzCrawler
+class ZyCrawler
+  def initialize
+    @argument_parser = CommandLineArgumentParser.new
+    @argument_parser.parse_arguments
+    @spider = Spider.new
+    @url_store = URLStore.new(@argument_parser.url_file)
+  end
+
+  def crawl
+    if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+      @spider.crawl_web(@url_store.get_urls,
+                        @argument_parser.crawl_depth,
+                        @argument_parser.page_limit)
+    else
+      @spider.crawl_domain(@url_store.first_url, @argument_parser.page_limit)
+    end
+  end
+end
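The bin/zycrawler executable itself is not shown in this diff; a minimal sketch of driving the top-level class from Ruby (an assumption, not the actual executable) is simply:

    require 'zy_crawler'

    # CommandLineArgumentParser reads ARGV directly, so run the enclosing
    # script with the flags shown in display_usage, e.g. -c web -f urls.txt
    ZyCrawler.new.crawl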
metadata
ADDED
@@ -0,0 +1,66 @@
+--- !ruby/object:Gem::Specification
+name: zy_crawler
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- uuen sky
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2022-03-08 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.8'
+description: A simple crawler demo crawler
+email: uuensky@163.com
+executables:
+- zycrawler
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/zycrawler
+- lib/command_line_argument_parser.rb
+- lib/spider.rb
+- lib/url_store.rb
+- lib/url_utils.rb
+- lib/zy_crawler.rb
+homepage: https://rubygems.org/gems/zycrawler
+licenses:
+- MIT
+metadata:
+  changelog_uri: https://github.com/uuensky/zycrawler/blob/master/CHANGELOG.md
+  homepage_uri: https://rubygems.org/gems/zycrawler
+  source_code_uri: https://github.com/uuensky/zycrawler.git
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.2.32
+signing_key:
+specification_version: 4
+summary: A yong spider
+test_files: []