dk_crawler 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 3f922703479939e3e4aeb66cc09e3eff2ab3a99c0ab53ad270f535c27df367e0
+   data.tar.gz: 57eda2be60eba639be8bd7927be6cf5ca0a21c68a40a45f3bd01f87df986f726
+ SHA512:
+   metadata.gz: 875908fe8cd7574fccae8ea1ea566cb36b20f756de7e3c2e2ab31572c1c7b8a58972a0cacf827c3f1cebac1d98a8c399e1f354391457479545294aeaa7d34289
+   data.tar.gz: a605666db11802894ea52b4a4503df9383eff1cc2eec07488ab31e953febdf656df081511a1a46a13e78d0d3359e9aa3b6503e30de4c69735d5b0f7a7ea6b422
data/bin/crawler ADDED
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+ require 'dk_crawler'
+
+ crawler = DkCrawler.new
+ crawler.crawl
data/lib/command_line_argument_parser.rb ADDED
@@ -0,0 +1,53 @@
+ require 'getoptlong'
+
+ class CommandLineArgumentParser
+   WEB_CRAWLER = 'web'
+   DOMAIN_CRAWLER = 'domain'
+   attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+   def initialize
+     unless ARGV.length >= 1
+       display_usage
+       exit
+     end
+     @opts = GetoptLong.new(
+       ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+       ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+       ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+     )
+     @crawl_type = WEB_CRAWLER
+     @crawl_depth = 3
+     @page_limit = 100
+     @url_file = 'urls.txt'
+   end
+
+   def display_usage
+     p "Sample usage:"
+     p "ruby dk_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+     p "-c must be either 'web' or 'domain'; it defaults to 'web' for any other value"
+   end
+
+   def parse_arguments
+     @opts.each do |opt, arg|
+       case opt
+       when '--crawl'
+         ensure_crawl_type_correct(arg)
+       when '--crawl-depth'
+         @crawl_depth = arg.to_i
+       when '--page-limit'
+         @page_limit = arg.to_i
+       when '--url-file'
+         @url_file = arg
+       end
+     end
+   end
+
+   def ensure_crawl_type_correct(value)
+     if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+       @crawl_type = WEB_CRAWLER
+     else
+       @crawl_type = value
+     end
+   end
+ end
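
For reference, a minimal sketch of exercising CommandLineArgumentParser on its own; the simulated ARGV contents and the values shown in the comments are illustrative assumptions, not part of the gem (assumes lib/ is on the load path, e.g. ruby -Ilib):

  require 'command_line_argument_parser'

  ARGV.replace(['--crawl', 'domain', '--page-limit', '50'])  # simulated command line
  parser = CommandLineArgumentParser.new
  parser.parse_arguments

  parser.crawl_type  # => "domain"
  parser.page_limit  # => 50
  parser.url_file    # => "urls.txt" (the default)
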
data/lib/dk_crawler.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spider'
+ require 'command_line_argument_parser'
+ require 'url_store'
+
+ class DkCrawler
+   def initialize
+     @argument_parser = CommandLineArgumentParser.new
+     @argument_parser.parse_arguments
+     @spider = Spider.new
+     @url_store = UrlStore.new(@argument_parser.url_file)
+   end
+
+   def crawl
+     if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+       @spider.crawl_web(
+         @url_store.get_urls,
+         @argument_parser.crawl_depth,
+         @argument_parser.page_limit
+       )
+     else
+       @spider.crawl_domain(
+         @url_store.get_url,
+         @argument_parser.page_limit
+       )
+     end
+   end
+ end
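
An end-to-end sketch of driving DkCrawler programmatically rather than through bin/crawler; the Tempfile seed list and the example.com URL are assumptions for illustration, and it needs network access plus lib/ on the load path:

  require 'tempfile'
  require 'dk_crawler'

  seed_file = Tempfile.new('urls')      # hypothetical one-URL seed list
  seed_file.puts 'http://example.com'
  seed_file.close

  ARGV.replace(['--crawl', 'web', '--crawl-depth', '1', '--url-file', seed_file.path])
  DkCrawler.new.crawl                   # prints "Crawling url ..." as pages are visited
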
data/lib/spider.rb ADDED
@@ -0,0 +1,84 @@
+ require 'net/http'
+ require 'uri'
+ require 'open-uri'
+ require 'nokogiri'
+ require 'url_utils'
+
+ class Spider
+   include UrlUtils
+   def initialize
+     @already_visited = {}
+   end
+
+   def crawl_web(urls, depth = 2, page_limit = 100)
+     depth.times do
+       next_urls = []
+       urls.each do |url|
+         url_object = open_url(url)
+
+         next if url_object.nil?
+
+         url = update_url_if_redirected(url_object)
+         parsed_doc = parse_url(url_object)
+         next if parsed_doc.nil?
+
+         @already_visited[url] = true if @already_visited[url].nil?
+         return if @already_visited.size == page_limit
+
+         next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+         next_urls.uniq!
+       end
+       urls = next_urls
+     end
+   end
+
+   def crawl_domain(url, page_limit = 100)
+     return if @already_visited.size == page_limit
+
+     url_object = open_url(url)
+     return if url_object.nil?
+
+     parsed_doc = parse_url(url_object)
+     return if parsed_doc.nil?
+
+     @already_visited[url] = true if @already_visited[url].nil?
+     page_urls = find_urls_on_page(parsed_doc, url)
+     page_urls.each do |page_url|
+       if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+         crawl_domain(page_url, page_limit) # pass the limit down the recursion
+       end
+     end
+   end
+
+   private
+
+   def open_url(url)
+     URI.open(url) # Kernel#open on URLs is deprecated; use open-uri's URI.open
+   rescue
+     puts "Unable to open url: " + url
+   end
+
+   def update_url_if_redirected(url_object)
+     url_object.base_uri.to_s
+   end
+
+   def parse_url(url_object)
+     doc = Nokogiri::HTML(url_object) # parse the fetched page with Nokogiri
+     puts 'Crawling url ' + url_object.base_uri.to_s
+     doc
+   rescue
+     puts 'Could not parse url: ' + url_object.base_uri.to_s
+   end
+
+   def find_urls_on_page(parsed_doc, current_url)
+     parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+       new_url = x['href'].split('#')[0]
+       if new_url
+         # resolve relative links against the current page's URL
+         new_url = make_absolute(current_url, new_url) if relative?(new_url)
+         urls_list.push(new_url)
+
+       end
+     end
+   end
+ end
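
Spider can also be used directly, bypassing the argument parser; a hedged sketch, with the seed URL, depth, and page limit chosen arbitrarily:

  require 'spider'

  spider = Spider.new
  # Follow links two levels deep, visiting at most 10 pages.
  spider.crawl_web(['http://example.com'], 2, 10)
  # Or stay on a single site:
  spider.crawl_domain('http://example.com', 10)
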
data/lib/url_store.rb ADDED
@@ -0,0 +1,24 @@
+ class UrlStore
+   attr_reader :urls
+   alias :get_urls :urls
+
+   def initialize(url_file)
+     @urls = read_urls_from_file(url_file)
+   end
+
+   def get_url
+     @urls.first # single seed URL, used for domain crawls
+   end
+
+   def read_urls_from_file(url_file)
+     urls = []
+     File.open(url_file, 'r') do |file|
+       file.readlines.each do |line|
+         urls.push(line.chomp)
+       end
+     end
+     urls
+   end
+
+   private :read_urls_from_file
+ end
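
A small sketch of UrlStore against an assumed urls.txt containing one URL per line; the file contents and the return values in the comments are illustrative:

  require 'url_store'

  # urls.txt:
  #   http://example.com
  #   http://example.org
  store = UrlStore.new('urls.txt')
  store.get_urls  # => ["http://example.com", "http://example.org"]
  store.get_url   # => "http://example.com" (single seed used for domain crawls)
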
data/lib/url_utils.rb ADDED
@@ -0,0 +1,57 @@
+ module UrlUtils
+   def relative?(url)
+     !url.match(/^http/)
+   end
+
+   def make_absolute(potential_base, relative_url)
+     if relative_url.match(/^\//)
+       create_absolute_url_from_base(potential_base, relative_url)
+     else
+       create_absolute_url_from_context(potential_base, relative_url)
+     end
+   end
+
+   def urls_on_same_domain?(url1, url2)
+     get_domain(url1) == get_domain(url2)
+   end
+
+   def get_domain(url)
+     remove_extra_paths(url)
+   end
+
+   private
+
+   def create_absolute_url_from_base(potential_base, relative_url)
+     remove_extra_paths(potential_base) + relative_url
+   end
+
+   def remove_extra_paths(potential_base)
+     index_to_start_slash_search = potential_base.index('://') + 3
+     index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+     if index_of_first_relevant_slash != nil
+       potential_base[0, index_of_first_relevant_slash]
+     else
+       potential_base
+     end
+   end
+
+   def create_absolute_url_from_context(potential_base, relative_url)
+     absolute_url = nil
+     if potential_base.match(/\/$/)
+       absolute_url = potential_base + relative_url
+     else
+       last_index_of_slash = potential_base.rindex('/')
+       if potential_base[last_index_of_slash - 2, 2] == ':/'
+         absolute_url = potential_base + '/' + relative_url
+       else
+         last_index_of_dot = potential_base.rindex('.')
+         if last_index_of_dot < last_index_of_slash
+           absolute_url = potential_base + '/' + relative_url
+         else
+           absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+         end
+       end
+     end
+     absolute_url
+   end
+ end
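
The URL helpers are pure string manipulation, so they can be checked in isolation; the inputs below are examples:

  require 'url_utils'

  include UrlUtils

  make_absolute('http://example.com/articles/index.html', 'page2.html')
  # => "http://example.com/articles/page2.html"
  make_absolute('http://example.com/articles/index.html', '/about')
  # => "http://example.com/about"
  urls_on_same_domain?('http://example.com/a', 'http://example.org/b')
  # => false
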
metadata ADDED
@@ -0,0 +1,64 @@
+ --- !ruby/object:Gem::Specification
+ name: dk_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Derrick
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2018-04-01 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.8'
+ description: A simple web crawler gem
+ email: derrickwang57@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/crawler
+ - lib/command_line_argument_parser.rb
+ - lib/dk_crawler.rb
+ - lib/spider.rb
+ - lib/url_store.rb
+ - lib/url_utils.rb
+ homepage: http://rubygems.org/gems/yz_crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.7.3
+ signing_key:
+ specification_version: 4
+ summary: My first gem
+ test_files: []