zy_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2024898f0c89209ea0a427d2fa0c03a1c628f486baac028522c9c2bb400feab1
+   data.tar.gz: f1e0d8bb0b62406e197cec47fd2e5c200a35669cca578b3f64bb788bd270b492
+ SHA512:
+   metadata.gz: a4ee958478df918e2ddf8b7fe6bb12949e973338aceb6ea0aa9024ef8c2404c0ea0c64bc69b7b6c2cc01d7d1e7d1476539430dde715025899f7a70218b2b01db
+   data.tar.gz: 10fcbdf784221de565506976d6aa9359203ef32b79efe074b745abaf420df96cff24e55e114795fcf05337e557127cb59cf6f08db0e9351782dcc31a93b6e2ca
data/bin/zycrawler ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+ require 'zy_crawler'
+
+ crawler = ZyCrawler.new
+ crawler.crawl
data/lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,55 @@
+ require 'getoptlong'
+ # Command-line argument parser
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'.freeze
+   DOMAIN_CRAWLER = 'domain'.freeze
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ['--crawl', '-c', GetoptLong::REQUIRED_ARGUMENT],
+       ['--crawl-depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
+       ['--page-limit', '-p', GetoptLong::OPTIONAL_ARGUMENT],
+       ['--url-file', '-f', GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p 'Sample usage:'
+     p "ruby zy_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; anything else defaults to 'web'"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       else
+         puts "Unrecognized option: #{opt}"
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+ end
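For orientation, a minimal sketch of how this parser is meant to be driven; the option values and the seeds.txt filename are purely illustrative, and since GetoptLong reads options from ARGV they are staged there before the parser is constructed:

require 'command_line_argument_parser' # assumes lib/ is on the load path

# Hypothetical invocation, equivalent to: zycrawler -c domain -d 2 -f seeds.txt
ARGV.replace(['--crawl', 'domain', '--crawl-depth', '2', '--url-file', 'seeds.txt'])

parser = CommandLineArgumentParser.new
parser.parse_arguments

parser.crawl_type   # => "domain"
parser.crawl_depth  # => 2
parser.page_limit   # => 100 (default)
parser.url_file     # => "seeds.txt"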
data/lib/spider.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'rubygems'
+ require 'hpricot'
+ require 'url_utils'
+ # Young Spider
+ class Spider
+   include UrlUtils
+
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true if @already_visited[url].nil?
+         return if @already_visited.size == page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url)
+       end
+     end
+   end
+
+   private
+
+   def open_url(url)
+     URI.open(url)
+   rescue StandardError
+     puts 'Unable to open url: ' + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     doc = Hpricot(url_object) # Nokogiri would also work here
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue StandardError
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # make relative links absolute before queueing them
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+       end
+     end
+   end
+ end
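A short sketch of driving Spider directly, outside the gem's executable; the seed URL is illustrative and the calls perform real HTTP requests, so treat this as a sketch only:

require 'spider' # assumes lib/ is on the load path and hpricot is installed

spider = Spider.new
# Breadth-first crawl from one seed URL, two levels deep, at most 10 pages.
spider.crawl_web(['https://example.com/'], 2, 10)

# Or stay within a single domain:
Spider.new.crawl_domain('https://example.com/', 10)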
data/lib/url_store.rb ADDED
@@ -0,0 +1,26 @@
+ # In-memory URL store
+ class URLStore
+   attr_reader :urls
+   alias get_urls urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def first_url
+     @urls[0]
+   end
+
+   private
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+ end
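A small usage sketch for URLStore, assuming a hypothetical urls.txt with one URL per line:

# urls.txt:
#   https://example.com/
#   https://example.org/
store = URLStore.new('urls.txt')
store.get_urls   # => ["https://example.com/", "https://example.org/"]
store.first_url  # => "https://example.com/"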
data/lib/url_utils.rb ADDED
@@ -0,0 +1,56 @@
+ # URL Tools
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url =~ /^\//
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_abs_url_from_ctx(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if !index_of_first_relevant_slash.nil?
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_abs_url_from_ctx(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base =~ /\/$/
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
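The branching in make_absolute and create_abs_url_from_ctx is easiest to see with a few worked cases; the URLs below are illustrative:

include UrlUtils

relative?('/about') # => truthy (no leading "http")

# Root-relative link: appended to the bare domain.
make_absolute('http://example.com/blog/post', '/about')
# => "http://example.com/about"

# Base ends in "/": simple concatenation.
make_absolute('http://example.com/blog/', 'post2')
# => "http://example.com/blog/post2"

# Base ends in a file name: the last path segment is replaced.
make_absolute('http://example.com/blog/post.html', 'next.html')
# => "http://example.com/blog/next.html"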
data/lib/zy_crawler.rb ADDED
@@ -0,0 +1,23 @@
+ require 'command_line_argument_parser'
+ require 'spider'
+ require 'url_store'
+
+ # ZyCrawler is adapted from Jason Zhao's YzCrawler
+ class ZyCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = URLStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(@url_store.get_urls,
+                         @argument_parser.crawl_depth,
+                         @argument_parser.page_limit)
+     else
+       @spider.crawl_domain(@url_store.first_url, @argument_parser.page_limit)
+     end
+   end
+ end
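Putting it together, the zycrawler executable above is equivalent to this sketch; the arguments mirror the sample usage string and are only an example:

require 'zy_crawler'

# Equivalent to: zycrawler -c web -d 3 -p 100 -f urls.txt
ARGV.replace(['--crawl', 'web', '--crawl-depth', '3',
              '--page-limit', '100', '--url-file', 'urls.txt'])
ZyCrawler.new.crawl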
metadata ADDED
@@ -0,0 +1,66 @@
+ --- !ruby/object:Gem::Specification
+ name: zy_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - uuen sky
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-03-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ description: A simple demo web crawler
+ email: uuensky@163.com
+ executables:
+ - zycrawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/zycrawler
+ - lib/command_line_argument_parser.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ - lib/zy_crawler.rb
+ homepage: https://rubygems.org/gems/zycrawler
+ licenses:
+ - MIT
+ metadata:
+   changelog_uri: https://github.com/uuensky/zycrawler/blob/master/CHANGELOG.md
+   homepage_uri: https://rubygems.org/gems/zycrawler
+   source_code_uri: https://github.com/uuensky/zycrawler.git
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.2.32
+ signing_key:
+ specification_version: 4
+ summary: A young spider
+ test_files: []