resay_crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/crawler +6 -0
- data/lib/command_line_argument_parser.rb +58 -0
- data/lib/resay_crawler.rb +38 -0
- data/lib/spider.rb +108 -0
- data/lib/url_store.rb +24 -0
- data/lib/url_utils.rb +58 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 419d0b9ff169be3f5addb5e5198ac0c4833d64e4
+  data.tar.gz: 3d4139ed449b9228bb7189f127857dc80eb10b30
+SHA512:
+  metadata.gz: 779d114a34c9617d12fe364ad1f179ff9577586f668d1df091b50114082bcf252d2592f7dc7a7f5d5b6ae8d43fbe8c75a943a65df2c20ae59f095b1837b7b3d8
+  data.tar.gz: 0fabca670ba4707d801146af55c473312d8137a9914ad13387802412fe2ef98d35854d5d54cdedf4f38798f5556edaeae3bde9d6274dd1376dcbd1ae390bd6f2
data/bin/crawler
ADDED
data/lib/command_line_argument_parser.rb
ADDED
@@ -0,0 +1,58 @@
+require 'getoptlong'
+
+class CommandLineArgumentParser
+  WEB_CRAWLER = 'web'
+  DOMAIN_CRAWLER = 'domain'
+  attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+  def initialize
+    unless ARGV.length >= 1
+      display_usage
+      exit
+    end
+
+    # command-line option definitions
+    @opts = GetoptLong.new(
+      ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+      ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+      ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+      ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+    )
+    @crawl_type = WEB_CRAWLER # default crawl type, as promised by display_usage
+    @crawl_depth = 3
+    @page_limit = 100
+    @url_file = 'urls.txt'
+  end
+
+  def display_usage
+    p "Sample usage:"
+    p "ruby resay_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+    p "-c must be either 'web' or 'domain'; any other value defaults to 'web'"
+  end
+
+  def parse_arguments
+    @opts.each do |opt, arg|
+      case opt
+      when '--crawl'
+        ensure_crawl_type_correct(arg)
+      when '--crawl-depth'
+        @crawl_depth = arg.to_i
+      when '--page-limit'
+        @page_limit = arg.to_i
+      when '--url-file'
+        @url_file = arg
+      end
+    end
+  end
+
+  def ensure_crawl_type_correct(value)
+    if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+      @crawl_type = WEB_CRAWLER
+    else
+      @crawl_type = value
+    end
+  end
+
+
+
+end
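A minimal usage sketch, not part of the package: CommandLineArgumentParser reads its options straight from ARGV via GetoptLong, so a driver script only has to instantiate it, call parse_arguments, and read the accessors. The script name below is hypothetical.

# e.g.  ruby parse_demo.rb --crawl domain --page-limit 50
require 'command_line_argument_parser'

parser = CommandLineArgumentParser.new   # prints usage and exits if no arguments were given
parser.parse_arguments                   # consumes --crawl / --crawl-depth / --page-limit / --url-file

puts "type:  #{parser.crawl_type}"       # 'web' or 'domain'
puts "depth: #{parser.crawl_depth}"      # defaults to 3
puts "limit: #{parser.page_limit}"       # defaults to 100
puts "file:  #{parser.url_file}"         # defaults to 'urls.txt'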
data/lib/resay_crawler.rb
ADDED
@@ -0,0 +1,38 @@
+require 'spider'
+require 'command_line_argument_parser'
+require 'url_store'
+
+class ResayCrawler
+
+  def initialize
+    ## parse the console arguments (-c, -d, -p, -f)
+    @argument_parser = CommandLineArgumentParser.new
+    @argument_parser.parse_arguments
+
+    ## spider
+    @spider = Spider.new
+
+    ## read the seed URLs
+    @url_store = UrlStore.new(@argument_parser.url_file)
+  end
+
+
+  def crawl
+    ## dispatch on the requested crawl type
+    if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+      @spider.crawl_web(
+        @url_store.get_urls,
+        @argument_parser.crawl_depth,
+        @argument_parser.page_limit
+      )
+    else
+      @spider.crawl_domain(
+        @url_store.get_url,
+        @argument_parser.page_limit
+      )
+    end
+  end
+
+end
+
+
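data/bin/crawler (6 lines, contents not shown above) presumably just wires ResayCrawler up. A hedged sketch of such an entry point, an assumption rather than the actual file:

#!/usr/bin/env ruby
# hypothetical entry point -- the real data/bin/crawler is not reproduced in this diff
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

require 'resay_crawler'

# e.g.  crawler --crawl web --crawl-depth 2 --page-limit 20 --url-file urls.txt
ResayCrawler.new.crawl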
data/lib/spider.rb
ADDED
@@ -0,0 +1,108 @@
+require 'net/http'
+require 'uri'
+require 'open-uri'
+require 'rubygems'
+# require 'hpricot'
+require 'nokogiri'
+require 'url_utils'
+
+class Spider
+
+  # mix in the URL helper methods
+  include UrlUtils
+
+  def initialize
+    @already_visited = {}
+  end
+
+  # crawl outward across the web, level by level
+  def crawl_web(urls, depth = 2, page_limit = 100)
+    depth.times do # one pass per depth level
+      next_urls = []
+
+      # visit the current batch of URLs
+      urls.each do |url|
+        url_object = open_url(url)
+        next if url_object.nil?
+
+        # use the URL after any redirect
+        url = update_url_if_redirected(url_object)
+
+        # parse the page
+        parsed_doc = parse_url(url_object)
+        next if parsed_doc.nil?
+
+        # mark this URL as visited
+        @already_visited[url] = true if @already_visited[url].nil?
+        # [loop termination] stop once page_limit pages have been visited
+        return if @already_visited.size == page_limit
+
+        # collect new URLs found on the page, excluding ones already visited
+        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+        # drop duplicates
+        next_urls.uniq!
+      end
+      # the next level crawls the URLs collected in this pass
+      urls = next_urls
+    end
+
+  end
+
+  # crawl a single domain recursively
+  def crawl_domain(url, page_limit = 100)
+    # [recursion termination]
+    return if @already_visited.size == page_limit
+
+    url_object = open_url(url)
+    return if url_object.nil?
+
+    parsed_doc = parse_url(url_object)
+    return if parsed_doc.nil?
+
+    @already_visited[url] = true if @already_visited[url].nil?
+    page_urls = find_urls_on_page(parsed_doc, url)
+    page_urls.each do |page_url|
+      # same domain and not yet visited
+      if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+        # recurse, carrying the page limit along
+        crawl_domain(page_url, page_limit)
+      end
+    end
+
+  end
+
+  private
+
+  # wrap open(url) so failures are reported instead of raised
+  def open_url(url)
+    open(url)
+  rescue
+    puts "Unable to open url: " + url
+  end
+
+  def update_url_if_redirected(url_object)
+    url_object.base_uri.to_s
+  end
+
+  # parse the fetched document into a DOM tree
+  def parse_url(url_object)
+    # doc = Hpricot(url_object) # replaced by Nokogiri
+    doc = Nokogiri(url_object)
+    puts 'Crawling url ' + url_object.base_uri.to_s
+    doc
+  rescue
+    puts 'Could not parse url: ' + url_object.base_uri.to_s
+  end
+
+  def find_urls_on_page(parsed_doc, current_url)
+    parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+      new_url = x['href'].split('#')[0]
+      if new_url
+        # resolve relative links against the current URL
+        new_url = make_absolute(current_url, new_url) if relative?(new_url)
+        urls_list.push(new_url)
+      end
+    end
+  end
+
+end
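A minimal sketch of driving Spider directly, not part of the gem; the seed URL is illustrative:

require 'spider'

spider = Spider.new

# follow links outward from the seed list, 2 levels deep, visiting at most 10 pages
spider.crawl_web(['http://example.com/'], 2, 10)

# or: recursively follow only links that stay on the seed's domain
spider.crawl_domain('http://example.com/', 10)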
data/lib/url_store.rb
ADDED
@@ -0,0 +1,24 @@
+class UrlStore
+  attr_reader :urls
+  alias :get_urls :urls
+  def initialize(url_file)
+    @urls = read_urls_from_file(url_file)
+  end
+
+  def get_url
+    @urls[0]
+  end
+
+  def read_urls_from_file(url_file)
+    urls = []
+    File.open(url_file, 'r') do |file|
+      file.readlines.each do |line|
+        urls.push(line.chomp)
+      end
+    end
+    urls
+  end
+
+  private :read_urls_from_file
+
+end
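UrlStore just reads one URL per line from the given file. A small sketch, with a hypothetical urls.txt:

# urls.txt contains one URL per line, e.g.
#   http://example.com/
#   http://example.org/
require 'url_store'

store = UrlStore.new('urls.txt')
p store.get_urls   # => all lines of the file, chomped, as an array (used by crawl_web)
p store.get_url    # => only the first entry (used by crawl_domain)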
data/lib/url_utils.rb
ADDED
@@ -0,0 +1,58 @@
+module UrlUtils
+  def relative?(url)
+    !url.match(/^http/)
+  end
+
+  def make_absolute(potential_base, relative_url)
+    if relative_url.match(/^\//)
+      create_absolute_url_from_base(potential_base, relative_url)
+    else
+      create_absolute_url_from_context(potential_base, relative_url)
+    end
+  end
+
+  def urls_on_same_domain?(url1, url2)
+    get_domain(url1) == get_domain(url2)
+  end
+
+  def get_domain(url)
+    remove_extra_paths(url)
+  end
+
+  private
+
+  def create_absolute_url_from_base(potential_base, relative_url)
+    remove_extra_paths(potential_base) + relative_url
+  end
+
+  def remove_extra_paths(potential_base)
+    index_to_start_slash_search = potential_base.index('://') + 3
+    index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+    if index_of_first_relevant_slash != nil
+      potential_base[0, index_of_first_relevant_slash]
+    else
+      potential_base
+    end
+  end
+
+  def create_absolute_url_from_context(potential_base, relative_url)
+    absolute_url = nil
+    if potential_base.match(/\/$/)
+      absolute_url = potential_base + relative_url
+    else
+      last_index_of_slash = potential_base.rindex('/')
+      if potential_base[last_index_of_slash - 2, 2] == ':/'
+        absolute_url = potential_base + '/' + relative_url
+      else
+        last_index_of_dot = potential_base.rindex('.')
+        if last_index_of_dot < last_index_of_slash
+          absolute_url = potential_base + '/' + relative_url
+        else
+          absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+        end
+      end
+    end
+    absolute_url
+  end
+
+end
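UrlUtils resolves links with plain string handling rather than URI.join. A short sketch of how the public helpers behave, using illustrative URLs:

require 'url_utils'

include UrlUtils

p relative?('/about')                                      # => true  (no leading http)
p relative?('http://example.com/a')                        # => false

# a leading slash resolves against the bare domain...
p make_absolute('http://example.com/blog/post', '/about')
# => "http://example.com/about"

# ...anything else resolves against the current "directory"
p make_absolute('http://example.com/blog/post.html', 'next.html')
# => "http://example.com/blog/next.html"

p urls_on_same_domain?('http://example.com/a', 'http://example.com/b')
# => true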
metadata
ADDED
@@ -0,0 +1,64 @@
+--- !ruby/object:Gem::Specification
+name: resay_crawler
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Resay tao
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-05-23 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+description: A simple web crawler gem
+email: sinotao1@gmail.com
+executables:
+- crawler
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/crawler
+- lib/command_line_argument_parser.rb
+- lib/resay_crawler.rb
+- lib/spider.rb
+- lib/url_store.rb
+- lib/url_utils.rb
+homepage: http://rubygems.org/gems/resay_crawler
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.6
+signing_key:
+specification_version: 4
+summary: My first gem
+test_files: []
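The gemspec pins a single runtime dependency (nokogiri ~> 1.6) and installs the crawler executable from bin/. A Gemfile sketch for pulling in this exact release, assuming it is fetched from rubygems.org:

# Gemfile
source 'https://rubygems.org'

gem 'resay_crawler', '0.0.1'   # pulls in nokogiri ~> 1.6 as its only runtime dependency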