zy_crawler 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2024898f0c89209ea0a427d2fa0c03a1c628f486baac028522c9c2bb400feab1
+   data.tar.gz: f1e0d8bb0b62406e197cec47fd2e5c200a35669cca578b3f64bb788bd270b492
+ SHA512:
+   metadata.gz: a4ee958478df918e2ddf8b7fe6bb12949e973338aceb6ea0aa9024ef8c2404c0ea0c64bc69b7b6c2cc01d7d1e7d1476539430dde715025899f7a70218b2b01db
+   data.tar.gz: 10fcbdf784221de565506976d6aa9359203ef32b79efe074b745abaf420df96cff24e55e114795fcf05337e557127cb59cf6f08db0e9351782dcc31a93b6e2ca
data/bin/zycrawler ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+ require 'zy_crawler'
+
+ crawler = ZyCrawler.new
+ crawler.crawl
data/lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,55 @@
+ require 'getoptlong'
+ # Command line argument parser
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'.freeze
+   DOMAIN_CRAWLER = 'domain'.freeze
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ['--crawl', '-c', GetoptLong::REQUIRED_ARGUMENT],
+       ['--crawl-depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
+       ['--page-limit', '-p', GetoptLong::OPTIONAL_ARGUMENT],
+       ['--url-file', '-f', GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     # Defaults used when an option is not supplied on the command line
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p 'Sample usage:'
+     p "ruby zy_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; any other value defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       else
+         puts "Unknown option: #{opt}"
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     @crawl_type = if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+                     WEB_CRAWLER
+                   else
+                     value
+                   end
+   end
+ end
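
A minimal sketch of driving the parser on its own, assuming the gem's lib directory is on the load path; the option values and the seeds.txt file name are only examples (GetoptLong reads options from ARGV, and optional arguments must use the --option=value form):

require 'command_line_argument_parser'

# Hypothetical arguments, set before constructing the parser
ARGV.replace(['--crawl', 'domain', '--page-limit=20', '--url-file=seeds.txt'])
parser = CommandLineArgumentParser.new
parser.parse_arguments
parser.crawl_type   # => "domain"
parser.page_limit   # => 20
parser.url_file     # => "seeds.txt"
parser.crawl_depth  # => 3 (default)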
data/lib/spider.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'url_utils'
+ # Young Spider
+ class Spider
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true if @already_visited[url].nil?
+         return if @already_visited.size == page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url, page_limit)
+       end
+     end
+   end
+
+   private
+
+   def open_url(url)
+     URI.open(url)
+   rescue StandardError
+     puts 'Unable to open url: ' + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     doc = Hpricot(url_object) # Nokogiri would be a modern alternative
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue StandardError
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # Resolve relative links against the current URL
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+ end
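
A minimal sketch of using Spider directly, bypassing the CLI; the example.com URLs are placeholders, and network access plus the hpricot gem are required:

require 'spider'

spider = Spider.new
# Breadth-first crawl from a list of seed URLs, two levels deep, at most 50 pages.
spider.crawl_web(['https://example.com/'], 2, 50)
# Or stay on a single site, following only links within the same domain.
Spider.new.crawl_domain('https://example.com/', 50)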
data/lib/url_store.rb ADDED
@@ -0,0 +1,26 @@
+ # Memory URL Store
+ class URLStore
+   attr_reader :urls
+   alias get_urls urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def first_url
+     @urls[0]
+   end
+
+   private
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+ end
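
A minimal sketch, assuming a seeds.txt file with one URL per line:

File.write('seeds.txt', "https://example.com/\nhttps://example.org/\n")
store = URLStore.new('seeds.txt')
store.first_url  # => "https://example.com/"
store.get_urls   # => ["https://example.com/", "https://example.org/"]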
data/lib/url_utils.rb ADDED
@@ -0,0 +1,56 @@
+ # URL Tools
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url =~ /^\//
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_abs_url_from_ctx(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if !index_of_first_relevant_slash.nil?
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_abs_url_from_ctx(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base =~ /\/$/
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
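
A minimal sketch of the helpers in isolation; the URLs are placeholders and UrlDemo is just a throwaway class to mix the module into:

require 'url_utils'

class UrlDemo
  include UrlUtils
end

demo = UrlDemo.new
demo.relative?('/about')  # => true
demo.make_absolute('https://example.com/docs/index.html', 'faq.html')
# => "https://example.com/docs/faq.html"
demo.make_absolute('https://example.com/docs/index.html', '/contact')
# => "https://example.com/contact"
demo.urls_on_same_domain?('https://example.com/a', 'https://example.org/b')
# => false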
data/lib/zy_crawler.rb ADDED
@@ -0,0 +1,23 @@
+ require 'command_line_argument_parser'
+ require 'spider'
+ require 'url_store'
+
+ # ZyCrawler is based on Jason Zhao's YzCrawler
+ class ZyCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = URLStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(@url_store.get_urls,
+                         @argument_parser.crawl_depth,
+                         @argument_parser.page_limit)
+     else
+       @spider.crawl_domain(@url_store.first_url, @argument_parser.page_limit)
+     end
+   end
+ end
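
A minimal sketch of the end-to-end flow; the urls.txt contents and option values are only examples, and the zycrawler executable is the one installed by this gem:

# Shell usage (optional arguments use the --option=value form):
#   $ echo 'https://example.com/' > urls.txt
#   $ zycrawler --crawl web --crawl-depth=2 --page-limit=50 --url-file=urls.txt

# Programmatic usage reads the same options from ARGV:
require 'zy_crawler'
ZyCrawler.new.crawl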
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: zy_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - uuen sky
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-03-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple demo web crawler
+ email: uuensky@163.com
+ executables:
+ - zycrawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/zycrawler
+ - lib/command_line_argument_parser.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ - lib/zy_crawler.rb
+ homepage: https://rubygems.org/gems/zycrawler
+ licenses:
+ - MIT
+ metadata:
+   changelog_uri: https://github.com/uuensky/zycrawler/blob/master/CHANGELOG.md
+   homepage_uri: https://rubygems.org/gems/zycrawler
+   source_code_uri: https://github.com/uuensky/zycrawler.git
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.2.32
+ signing_key:
+ specification_version: 4
+ summary: A young spider
+ test_files: []