resay_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 419d0b9ff169be3f5addb5e5198ac0c4833d64e4
+   data.tar.gz: 3d4139ed449b9228bb7189f127857dc80eb10b30
+ SHA512:
+   metadata.gz: 779d114a34c9617d12fe364ad1f179ff9577586f668d1df091b50114082bcf252d2592f7dc7a7f5d5b6ae8d43fbe8c75a943a65df2c20ae59f095b1837b7b3d8
+   data.tar.gz: 0fabca670ba4707d801146af55c473312d8137a9914ad13387802412fe2ef98d35854d5d54cdedf4f38798f5556edaeae3bde9d6274dd1376dcbd1ae390bd6f2
bin/crawler ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+ require 'resay_crawler'
+
+
+ crawler = ResayCrawler.new
+ crawler.crawl
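
For reference, a sketch of a typical run driven from Ruby rather than a shell. The option values and seed file name are illustrative; the crawler binstub name comes from the gem's executables list, and the seed file defaults to urls.txt:

    # Equivalent to running: crawler --crawl web --crawl-depth 2 --page-limit 50 --url-file urls.txt
    ARGV.replace(%w[--crawl web --crawl-depth 2 --page-limit 50 --url-file urls.txt])

    require 'resay_crawler'

    ResayCrawler.new.crawl   # parses ARGV, loads the seed URLs, then crawls
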
lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,58 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+
+     # accepted command-line options
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     # defaults, overridden by parse_arguments
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p "Sample usage:"
+     p "crawler -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; anything else defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+
+
+ end
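
A quick way to sanity-check the parser's defaults and fallback behaviour from irb. This is a sketch; the option values are made up:

    require 'command_line_argument_parser'

    # An unrecognised crawl type falls back to 'web'; numeric options are coerced with to_i.
    ARGV.replace(%w[--crawl garbage --crawl-depth 5 --page-limit 20])

    parser = CommandLineArgumentParser.new
    parser.parse_arguments

    parser.crawl_type    # => "web"       (fallback applied by ensure_crawl_type_correct)
    parser.crawl_depth   # => 5
    parser.page_limit    # => 20
    parser.url_file      # => "urls.txt"  (default, since -f was not given)
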
lib/resay_crawler.rb ADDED
@@ -0,0 +1,38 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class ResayCrawler
+
+   def initialize
+     ## parse the console arguments (-c, -d, -p, -f)
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+
+     ## the spider does the actual crawling
+     @spider = Spider.new
+
+     ## read the seed URLs from the url file
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+
+   def crawl
+     ## dispatch to the requested crawl type
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+
+ end
+
+
lib/spider.rb ADDED
@@ -0,0 +1,108 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ # require 'hpricot'
+ require 'nokogiri'
+ require 'url_utils'
+
+ class Spider
+
+   # mixin for URL helpers (relative?, make_absolute, urls_on_same_domain?)
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   # crawl the open web, level by level, starting from the seed URLs
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do # one pass per level of depth
+       next_urls = []
+
+       # visit every URL collected for the current level
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         # if the request was redirected, use the final URL
+         url = update_url_if_redirected(url_object)
+
+         # parse the fetched page
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         # mark this URL as visited
+         @already_visited[url] = true if @already_visited[url].nil?
+         # termination condition: stop once page_limit pages have been visited
+         return if @already_visited.size == page_limit
+
+         # collect the URLs found on this page, skipping ones already visited
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         # drop duplicates
+         next_urls.uniq!
+       end
+       # the next level starts from the URLs collected above
+       urls = next_urls
+     end
+
+   end
+
+   # crawl a single domain, following only links that stay on it
+   def crawl_domain(url, page_limit = 100)
+     # recursion termination: stop once page_limit pages have been visited
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       # only follow links on the same domain that have not been visited yet
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         # recurse, passing the page limit down so it is respected at every level
+         crawl_domain(page_url, page_limit)
+       end
+     end
+
+   end
+
+   private
+
+   # open-uri's open(url), wrapped so failures print a message and return nil
+   def open_url(url)
+     open(url)
+   rescue
+     puts "Unable to open url: " + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   # parse the response body into a DOM document
+   def parse_url(url_object)
+     # doc = Hpricot(url_object)  # replaced by Nokogiri
+     doc = Nokogiri(url_object)
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |link, urls_list|
+       new_url = link['href'].split('#')[0]
+       if new_url
+         # turn relative links into absolute URLs
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+
+ end
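
Spider can also be driven directly, without the command-line front end. A minimal sketch, assuming the seed URL is reachable (example.com is a placeholder; fetching uses open-uri's open, as in the code above):

    require 'spider'

    spider = Spider.new

    # Follow links two levels deep, visiting at most 20 distinct pages.
    spider.crawl_web(['https://example.com/'], 2, 20)

    # Or stay on a single domain, again capped at 20 pages.
    Spider.new.crawl_domain('https://example.com/', 20)
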
lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls[0]
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+
+ end
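
UrlStore expects a plain text file with one seed URL per line. A minimal sketch of its use; the file name and URLs are illustrative:

    require 'url_store'

    # urls.txt might contain, one per line:
    #   https://example.com/
    #   https://example.org/news
    store = UrlStore.new('urls.txt')

    store.get_urls   # => all seed URLs (get_urls is an alias for the :urls reader)
    store.get_url    # => the first seed URL, used by the domain crawler
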
lib/url_utils.rb ADDED
@@ -0,0 +1,58 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//)
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash != nil
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/)
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+
+ end
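
The helpers can be checked in irb; the expected results below follow from the code above (URLs are illustrative):

    require 'url_utils'
    include UrlUtils

    relative?('about.html')
    # => true (no leading "http")

    make_absolute('https://example.com/blog/post', '/about')
    # => "https://example.com/about"            (root-relative: appended to the bare domain)

    make_absolute('https://example.com/blog/', 'pics/a.png')
    # => "https://example.com/blog/pics/a.png"  (context-relative: appended to the base path)

    urls_on_same_domain?('https://example.com/a', 'https://example.com/b')
    # => true
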
metadata ADDED
@@ -0,0 +1,64 @@
+ --- !ruby/object:Gem::Specification
+ name: resay_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Resay tao
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-05-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ description: A simple web crawler gem
+ email: sinotao1@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/resay_crawler.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ homepage: http://rubygems.org/gems/resay_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.6
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []
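
For orientation, a gemspec along the following lines would produce metadata like the above. This is a sketch reconstructed from the metadata, not the gem's actual resay_crawler.gemspec:

    Gem::Specification.new do |s|
      s.name        = 'resay_crawler'
      s.version     = '0.0.1'
      s.date        = '2015-05-23'
      s.summary     = 'My first gem'
      s.description = 'A simple web crawler gem'
      s.authors     = ['Resay tao']
      s.email       = 'sinotao1@gmail.com'
      s.homepage    = 'http://rubygems.org/gems/resay_crawler'
      s.license     = 'MIT'
      s.files       = ['bin/crawler'] + Dir['lib/*.rb']
      s.executables = ['crawler']
      s.add_runtime_dependency 'nokogiri', '~> 1.6'
    end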