resay_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 419d0b9ff169be3f5addb5e5198ac0c4833d64e4
+   data.tar.gz: 3d4139ed449b9228bb7189f127857dc80eb10b30
+ SHA512:
+   metadata.gz: 779d114a34c9617d12fe364ad1f179ff9577586f668d1df091b50114082bcf252d2592f7dc7a7f5d5b6ae8d43fbe8c75a943a65df2c20ae59f095b1837b7b3d8
+   data.tar.gz: 0fabca670ba4707d801146af55c473312d8137a9914ad13387802412fe2ef98d35854d5d54cdedf4f38798f5556edaeae3bde9d6274dd1376dcbd1ae390bd6f2
bin/crawler ADDED
@@ -0,0 +1,6 @@
+ #!/usr/bin/env ruby
+ require 'resay_crawler'
+
+
+ crawler = ResayCrawler.new
+ crawler.crawl
lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,58 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+
+     # declare the accepted options and set the defaults
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p "Sample usage:"
+     p "ruby resay_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; anything else defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+
+
+
+ end
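
For reference, a minimal sketch of how the parser above is driven; the flag values are hypothetical and mirror the sample shown in display_usage, and only the reader methods defined above are used:

    # e.g. ARGV = ["--crawl", "domain", "--page-limit", "50", "--url-file", "urls.txt"]
    require 'command_line_argument_parser'

    parser = CommandLineArgumentParser.new   # prints the usage text and exits if ARGV is empty
    parser.parse_arguments                   # walks the GetoptLong options and overwrites the defaults
    parser.crawl_type    # => "domain"
    parser.crawl_depth   # => 3   (default, not overridden here)
    parser.page_limit    # => 50
    parser.url_file      # => "urls.txt"
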
lib/resay_crawler.rb ADDED
@@ -0,0 +1,38 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class ResayCrawler
+
+   def initialize
+     ## parse the console arguments (-c, -d, -p, -f)
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+
+     ## the spider that does the actual crawling
+     @spider = Spider.new
+
+     ## read the seed URLs from the given file
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+
+   def crawl
+     ## dispatch to a web-wide or single-domain crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+
+ end
+
+
lib/spider.rb ADDED
@@ -0,0 +1,108 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ # require 'hpricot'
+ require 'nokogiri'
+ require 'url_utils'
+
+ class Spider
+
+   # mix in the URL helper methods
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   # crawl across sites, following links level by level
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do # one iteration per crawl level
+       next_urls = []
+
+       # visit every URL gathered for this level
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         # use the final URL if the request was redirected
+         url = update_url_if_redirected(url_object)
+
+         # parse the fetched page
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         # mark this URL as visited
+         @already_visited[url] = true if @already_visited[url].nil?
+         # [loop termination] stop once page_limit pages have been visited
+         return if @already_visited.size == page_limit
+
+         # collect new URLs found on the page, skipping already-visited ones
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         # drop duplicates
+         next_urls.uniq!
+       end
+       # the next level crawls the URLs gathered in this one
+       urls = next_urls
+     end
+
+   end
+
+   # crawl recursively, staying on the starting URL's domain
+   def crawl_domain(url, page_limit = 100)
+     # [recursion termination] stop once page_limit pages have been visited
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       # follow only unvisited links on the same domain
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         # recurse into the linked page, keeping the same page limit
+         crawl_domain(page_url, page_limit)
+       end
+     end
+
+   end
+
+   private
+
+   # open the URL via open-uri; on error, log and return nil
+   def open_url(url)
+     open(url)
+   rescue
+     puts "Unable to open url: " + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   # parse the response into a Nokogiri document
+   def parse_url(url_object)
+     # doc = Hpricot(url_object)  # previous Hpricot-based implementation
+     doc = Nokogiri(url_object)
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # resolve relative links against the current URL
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+
+ end
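
The Spider can also be used on its own, without the command-line wrapper. A minimal sketch, with a placeholder seed URL:

    require 'spider'

    spider = Spider.new
    # follow links from the seed for 2 levels, visiting at most 20 pages
    spider.crawl_web(['http://example.com/'], 2, 20)

    # or stay on the seed's domain, visiting at most 20 pages
    Spider.new.crawl_domain('http://example.com/', 20)
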
lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls[0]
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+
+ end
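
UrlStore expects a plain text file with one seed URL per line. A sketch using a hypothetical urls.txt with two placeholder entries:

    # urls.txt:
    #   http://example.com/
    #   http://example.org/news
    store = UrlStore.new('urls.txt')
    store.get_urls   # => ["http://example.com/", "http://example.org/news"]  (used by crawl_web)
    store.get_url    # => "http://example.com/"                               (used by crawl_domain)
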
lib/url_utils.rb ADDED
@@ -0,0 +1,58 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//) # root-relative link
+       create_absolute_url_from_base(potential_base, relative_url)
+     else # relative to the current page
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base) # keep only scheme and host
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash != nil
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/) # base already ends with a slash
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/' # base is just scheme://host
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash # last path segment has no dot, treat as a directory
+           absolute_url = potential_base + '/' + relative_url
+         else # last path segment looks like a file; replace it
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+
+ end
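
A few illustrative calls into UrlUtils, using made-up URLs, to show how links are resolved:

    include UrlUtils

    relative?('http://example.com/a')   # => false (starts with "http")
    relative?('about.html')             # => true

    make_absolute('http://example.com/blog/post.html', '/css/site.css')
    # => "http://example.com/css/site.css"     (root-relative: appended to scheme + host)
    make_absolute('http://example.com/blog/post.html', 'next.html')
    # => "http://example.com/blog/next.html"   (resolved against the current page's directory)

    urls_on_same_domain?('http://example.com/a', 'http://example.org/b')  # => false
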
metadata ADDED
@@ -0,0 +1,64 @@
+ --- !ruby/object:Gem::Specification
+ name: resay_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Resay tao
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-05-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ description: A simple web crawler gem
+ email: sinotao1@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/resay_crawler.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ homepage: http://rubygems.org/gems/resay_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.6
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []