kudzu 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/lib/kudzu/adapter/memory/frontier.rb +1 -1
  3. data/lib/kudzu/adapter/memory/model/link.rb +2 -6
  4. data/lib/kudzu/adapter/memory/model/page.rb +3 -8
  5. data/lib/kudzu/adapter/memory/repository.rb +0 -2
  6. data/lib/kudzu/adapter/memory.rb +3 -4
  7. data/lib/kudzu/agent/all.rb +1 -1
  8. data/lib/kudzu/agent/fetcher.rb +46 -49
  9. data/lib/kudzu/agent/http/connection.rb +9 -0
  10. data/lib/kudzu/agent/http/connection_pool.rb +50 -0
  11. data/lib/kudzu/agent/page_filterer.rb +58 -0
  12. data/lib/kudzu/agent/reference.rb +9 -0
  13. data/lib/kudzu/agent/response.rb +14 -0
  14. data/lib/kudzu/agent/robots/parser.rb +91 -0
  15. data/lib/kudzu/agent/robots/txt.rb +34 -0
  16. data/lib/kudzu/agent/robots.rb +12 -123
  17. data/lib/kudzu/agent/sleeper.rb +2 -2
  18. data/lib/kudzu/agent/url_extractor.rb +60 -46
  19. data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
  20. data/lib/kudzu/agent/util/charset_detector.rb +84 -0
  21. data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
  22. data/lib/kudzu/agent/util/matcher.rb +25 -0
  23. data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
  24. data/lib/kudzu/agent/util/title_parser.rb +30 -0
  25. data/lib/kudzu/agent.rb +42 -0
  26. data/lib/kudzu/callback.rb +4 -2
  27. data/lib/kudzu/config/filter.rb +11 -11
  28. data/lib/kudzu/config.rb +20 -25
  29. data/lib/kudzu/crawler.rb +65 -146
  30. data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
  31. data/lib/kudzu/model/base.rb +9 -0
  32. data/lib/kudzu/model/link.rb +9 -0
  33. data/lib/kudzu/model/page.rb +112 -0
  34. data/lib/kudzu/thread_pool.rb +36 -0
  35. data/lib/kudzu/version.rb +1 -1
  36. data/lib/kudzu.rb +21 -3
  37. metadata +21 -19
  38. data/lib/kudzu/adapter/base/link.rb +0 -8
  39. data/lib/kudzu/adapter/base/page.rb +0 -106
  40. data/lib/kudzu/adapter/memory/all.rb +0 -3
  41. data/lib/kudzu/agent/charset_detector.rb +0 -84
  42. data/lib/kudzu/agent/filter.rb +0 -40
  43. data/lib/kudzu/agent/mime_type_detector.rb +0 -34
  44. data/lib/kudzu/agent/title_parser.rb +0 -16
  45. data/lib/kudzu/logger.rb +0 -20
  46. data/lib/kudzu/revisit/all.rb +0 -3
  47. data/lib/kudzu/revisit/scheduler.rb +0 -28
  48. data/lib/kudzu/util/all.rb +0 -3
  49. data/lib/kudzu/util/connection_pool.rb +0 -56
  50. data/lib/kudzu/util/content_type_parser.rb +0 -24
  51. data/lib/kudzu/util/matcher.rb +0 -21
  52. data/lib/kudzu/util/thread_pool.rb +0 -38
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
4
- data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
3
+ metadata.gz: 0ba76e95628d94560421358aa7982bdc429971e4
4
+ data.tar.gz: e1875f5760573a021fcf129018aaba5f6213ad23
5
5
  SHA512:
6
- metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
7
- data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
6
+ metadata.gz: 4f17f799c2ad67722860bbad00e9e220db8265221d598aebbbd181fea41546454e954fc67db18abba2b8b68797fe44da3887ec4ea3a5d2486fd1afd61584c152
7
+ data.tar.gz: 65d3bf42fafbcf835740ebe5534a52163f09ef313795152fafadc87aeaa335d540b037180a4f03208471361a9284635ff15a85471f3def22ba43e341ee1eb724
@@ -9,7 +9,7 @@ module Kudzu
9
9
  @queued = {}
10
10
  end
11
11
 
12
- def enqueue(links, depth: 1)
12
+ def enqueue(links)
13
13
  @monitor.synchronize do
14
14
  Array(links).each do |link|
15
15
  next if @queued.key?(link.url)
@@ -1,14 +1,10 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module Memory
4
- class Link
5
- include Kudzu::Adapter::Base::Link
4
+ class Link < Kudzu::Model::Base
5
+ include Kudzu::Model::Link
6
6
 
7
7
  attr_accessor :uuid, :url, :title, :state, :depth
8
-
9
- def initialize(attr = {})
10
- attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
11
- end
12
8
  end
13
9
  end
14
10
  end
@@ -1,16 +1,11 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module Memory
4
- class Page
5
- include Kudzu::Adapter::Base::Page
4
+ class Page < Kudzu::Model::Base
5
+ include Kudzu::Model::Page
6
6
 
7
7
  attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest,
8
- :response_header, :response_time, :redirect_from, :fetched_at, :revised_at,
9
- :revisit_interval, :revisit_at
10
-
11
- def initialize(attr = {})
12
- attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
13
- end
8
+ :response_header, :response_time, :redirect_from, :fetched_at, :revised_at
14
9
  end
15
10
  end
16
11
  end
@@ -6,7 +6,6 @@ module Kudzu
6
6
 
7
7
  def initialize
8
8
  @page = {}
9
- @digest = {}
10
9
  end
11
10
 
12
11
  def find_by_url(url)
@@ -15,7 +14,6 @@ module Kudzu
15
14
 
16
15
  def register(page)
17
16
  @page[page.url] = page
18
- @digest[page.digest] = true
19
17
  end
20
18
 
21
19
  def delete(page)
@@ -1,4 +1,3 @@
1
- require_relative 'base/all'
2
- require_relative 'memory/all'
3
-
4
- Kudzu.adapter = Kudzu::Adapter::Memory
1
+ Dir[File.join(__dir__, 'memory/**/*.rb')].each do |file|
2
+ require_relative file
3
+ end
@@ -1,3 +1,3 @@
1
- Dir[File.join(__dir__, '*.rb')].each do |file|
1
+ Dir[File.join(__dir__, '**/*.rb')].each do |file|
2
2
  require_relative file
3
3
  end
@@ -1,67 +1,65 @@
1
- require 'net/http'
2
- require 'http-cookie'
3
-
4
1
  module Kudzu
5
2
  class Agent
6
3
  class Fetcher
7
- class Response
8
- attr_accessor :url, :status, :header, :body, :time, :redirected
9
-
10
- def initialize(attr = {})
11
- attr.each { |k, v| public_send("#{k}=", v) }
12
- end
13
-
14
- def redirected?
15
- redirected
16
- end
17
- end
18
-
19
4
  attr_reader :pool
20
5
 
21
6
  def initialize(config, robots = nil)
22
7
  @config = config
23
- @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
24
- @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
8
+ @pool = Http::ConnectionPool.new(@config.max_connection || 100)
9
+ @sleeper = Sleeper.new(@config, robots)
10
+ @filterer = PageFilterer.new(@config)
25
11
  @jar = HTTP::CookieJar.new
26
12
  end
27
13
 
28
- def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
14
+ def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
29
15
  uri = Addressable::URI.parse(url)
30
- http = @pool.checkout(pool_name(uri)) { build_http(uri) }
31
16
  request = build_request(uri, request_header: request_header, method: method)
32
-
33
- append_cookie(url, request) if @config.handle_cookie
34
-
35
- @sleeper.politeness_delay(url)
36
-
37
- response = nil
38
- response_time = Benchmark.realtime { response = http.request(request) }
39
-
40
- parse_cookie(url, response) if @config.handle_cookie
17
+ response, response_time = send_request(uri, request)
41
18
 
42
19
  if redirection?(response.code) && response['location'] && redirect > 0
43
- fetch(uri.join(response['location']).to_s, request_header: request_header, redirect: redirect - 1)
20
+ fetch(uri.join(response['location']).to_s, request_header: request_header,
21
+ redirect: redirect - 1,
22
+ redirect_from: redirect_from || url)
44
23
  else
45
- res = build_response(url, response, response_time)
46
- res.redirected = (redirect != max_redirect)
47
- res
24
+ build_response(url, response, response_time, redirect_from)
48
25
  end
49
26
  end
50
27
 
51
28
  private
52
29
 
53
- def max_redirect
54
- @config.max_redirect || 5
55
- end
56
-
57
30
  def pool_name(uri)
58
31
  "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
59
32
  end
60
33
 
34
+ def send_request(uri, request)
35
+ start_http(uri, request) do |http|
36
+ http.request(request) do |response|
37
+ unless @filterer.allowed_response_header?(uri.to_s, response)
38
+ http.finish
39
+ break response
40
+ end
41
+ end
42
+ end
43
+ end
44
+
45
+ def start_http(uri, request)
46
+ http = @pool.checkout(pool_name(uri)) { build_http(uri) }
47
+ append_cookie(uri, request) if @config.handle_cookie
48
+ @sleeper.politeness_delay(uri)
49
+
50
+ start = Time.now.to_f
51
+ response = yield http
52
+ response_time = Time.now.to_f - start
53
+
54
+ parse_cookie(uri, response) if @config.handle_cookie
55
+ return response, response_time
56
+ end
57
+
61
58
  def build_http(uri)
62
59
  http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
63
60
  http.open_timeout = @config.open_timeout if @config.open_timeout
64
61
  http.read_timeout = @config.read_timeout if @config.read_timeout
62
+ http.keep_alive_timeout = @config.keep_alive if @config.keep_alive
65
63
  if uri.scheme == 'https'
66
64
  http.use_ssl = true
67
65
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -70,7 +68,7 @@ module Kudzu
70
68
  end
71
69
 
72
70
  def build_request(uri, request_header:, method:)
73
- request = request_klass_for(method).new(uri.request_uri)
71
+ request = Object.const_get("Net::HTTP::#{method.capitalize}").new(uri.request_uri)
74
72
  request.basic_auth uri.user, uri.password if uri.user && uri.password
75
73
 
76
74
  request['User-Agent'] = @config.user_agent
@@ -80,16 +78,15 @@ module Kudzu
80
78
  request
81
79
  end
82
80
 
83
- def request_klass_for(method)
84
- Object.const_get("Net::HTTP::#{method.capitalize}")
85
- end
86
-
87
- def build_response(url, response, response_time)
81
+ def build_response(url, response, response_time, redirect_from)
82
+ fetched = response.instance_variable_get("@read")
88
83
  Response.new(url: url,
89
84
  status: response.code.to_i,
90
- header: Hash[response.each.to_a],
91
- body: response.body.to_s,
92
- time: response_time)
85
+ body: fetched ? response.body.to_s : nil,
86
+ response_header: Hash[response.each.to_a],
87
+ response_time: response_time,
88
+ redirect_from: redirect_from,
89
+ fetched: fetched)
93
90
  end
94
91
 
95
92
  def redirection?(code)
@@ -97,12 +94,12 @@ module Kudzu
97
94
  300 <= code && code <= 399
98
95
  end
99
96
 
100
- def parse_cookie(url, response)
101
- @jar.parse(response['set-cookie'], url) if response['set-cookie']
97
+ def parse_cookie(uri, response)
98
+ @jar.parse(response['set-cookie'], uri.to_s) if response['set-cookie']
102
99
  end
103
100
 
104
- def append_cookie(url, request)
105
- cookies = @jar.cookies(url)
101
+ def append_cookie(uri, request)
102
+ cookies = @jar.cookies(uri.to_s)
106
103
  unless cookies.empty?
107
104
  if request['Cookie']
108
105
  request['Cookie'] += '; ' + cookies.join('; ')
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Http
4
+ class Connection < Kudzu::Model::Base
5
+ attr_accessor :name, :http, :last_used
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,50 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Http
4
+ class ConnectionPool
5
+ def initialize(max_size = 10)
6
+ @max_size = max_size
7
+ end
8
+
9
+ def checkout(name)
10
+ pool[name] ||= Connection.new(name: name, http: yield)
11
+
12
+ conn = pool[name]
13
+ conn.last_used = Time.now
14
+
15
+ if pool.size > @max_size
16
+ reduce
17
+ end
18
+
19
+ conn.http
20
+ end
21
+
22
+ def close
23
+ pool.values.each do |conn|
24
+ finish_http(conn.http)
25
+ end
26
+ Thread.current[:kudzu_connection] = nil
27
+ end
28
+
29
+ private
30
+
31
+ def pool
32
+ Thread.current[:kudzu_connection] ||= {}
33
+ Thread.current[:kudzu_connection]
34
+ end
35
+
36
+ def reduce
37
+ conns = pool.values.sort_by { |conn| conn.last_used }
38
+ conns.first(pool.size - @max_size).each do |conn|
39
+ finish_http(conn.http)
40
+ pool.delete(conn.name)
41
+ end
42
+ end
43
+
44
+ def finish_http(http)
45
+ http.finish if http && http.started?
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,58 @@
1
+ module Kudzu
2
+ class Agent
3
+ class PageFilterer
4
+ def initialize(config)
5
+ @config = config
6
+ end
7
+
8
+ def allowed?(response)
9
+ filter = @config.find_filter(response.url)
10
+
11
+ if filter.nil? || (allowed_mime_type?(response.mime_type, filter) &&
12
+ allowed_size?(response.size, filter) &&
13
+ allowed_index?(response))
14
+ Kudzu.log :info, "passed page: #{response.url}"
15
+ true
16
+ else
17
+ Kudzu.log :info, "dropped page: #{response.url}"
18
+ false
19
+ end
20
+ end
21
+
22
+ def allowed_response_header?(url, response_header)
23
+ filter = @config.find_filter(url)
24
+
25
+ if response_header['content-type']
26
+ mime_type = Util::ContentTypeParser.parse(response_header['content-type']).first
27
+ end
28
+ if response_header['content-length']
29
+ size = response_header['content-length'].to_i
30
+ end
31
+
32
+ filter.nil? || (allowed_mime_type?(mime_type, filter) &&
33
+ allowed_size?(size, filter))
34
+ end
35
+
36
+ private
37
+
38
+ def allowed_mime_type?(mime_type, filter)
39
+ return true if mime_type.nil?
40
+ Util::Matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
41
+ end
42
+
43
+ def allowed_size?(size, filter)
44
+ return true if filter.max_size.nil? || size.nil?
45
+ size.to_i < filter.max_size.to_i
46
+ end
47
+
48
+ def allowed_index?(response)
49
+ return true if response.body.nil? || !response.html?
50
+ return true unless @config.respect_noindex
51
+
52
+ doc = response.parsed_doc
53
+ doc.xpath('html/head/meta[@name]')
54
+ .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Reference < Kudzu::Model::Base
4
+ include Kudzu::Model::Link
5
+
6
+ attr_accessor :url, :title
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,14 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Response < Kudzu::Model::Base
4
+ include Kudzu::Model::Page
5
+
6
+ attr_accessor :url, :status, :body, :response_header, :response_time, :redirect_from, :fetched,
7
+ :size, :digest, :mime_type, :charset, :title
8
+
9
+ def fetched?
10
+ fetched
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,91 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Robots
4
+ class Parser
5
+ UNMATCH_REGEXP = /^$/
6
+
7
+ class << self
8
+ def parse(body)
9
+ txt = Txt.new
10
+ sets = []
11
+ prev_key = nil
12
+
13
+ parse_body(body).each do |key, value|
14
+ case key
15
+ when 'user-agent'
16
+ new_set = RuleSet.new(user_agent: ua_regexp(value))
17
+ txt.sets << new_set
18
+ if prev_key == 'user-agent'
19
+ sets << new_set
20
+ else
21
+ sets = [new_set]
22
+ end
23
+ when 'allow'
24
+ re = path_regexp(value)
25
+ sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
26
+ when 'disallow'
27
+ re = path_regexp(value)
28
+ sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
29
+ when 'crawl-delay'
30
+ sets.each { |set| set.crawl_delay = value.to_i }
31
+ when 'sitemap'
32
+ txt.sitemaps << value
33
+ end
34
+
35
+ prev_key = key
36
+ end
37
+
38
+ sort(txt)
39
+ end
40
+
41
+ private
42
+
43
+ def parse_body(body)
44
+ lines = body.to_s.split(/\r|\n|\r\n/)
45
+ lines.map { |line| parse_line(line) }.compact
46
+ end
47
+
48
+ def parse_line(line)
49
+ line.strip!
50
+ if line.empty? || line.start_with?('#')
51
+ nil
52
+ else
53
+ split_line(line)
54
+ end
55
+ end
56
+
57
+ def split_line(line)
58
+ key, value = line.split(':', 2)
59
+ key = key.to_s.strip.downcase
60
+ value = value.to_s.sub(/#.*$/, '').strip
61
+ if key.empty? || value.empty?
62
+ nil
63
+ else
64
+ [key, value]
65
+ end
66
+ end
67
+
68
+ def ua_regexp(value)
69
+ Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
70
+ rescue RegexpError
71
+ UNMATCH_REGEXP
72
+ end
73
+
74
+ def path_regexp(value)
75
+ Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
76
+ rescue RegexpError
77
+ UNMATCH_REGEXP
78
+ end
79
+
80
+ def sort(txt)
81
+ txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
82
+ txt.sets.each do |set|
83
+ set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
84
+ end
85
+ txt
86
+ end
87
+ end
88
+ end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,34 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Robots
4
+ class Txt < Kudzu::Model::Base
5
+ attr_accessor :sets, :sitemaps
6
+
7
+ def initialize
8
+ self.sets = []
9
+ self.sitemaps = []
10
+ end
11
+ end
12
+
13
+ class RuleSet < Kudzu::Model::Base
14
+ attr_accessor :user_agent, :rules, :crawl_delay
15
+
16
+ def initialize(attr = {})
17
+ self.rules = []
18
+ super
19
+ end
20
+
21
+ def allowed_path?(uri)
22
+ rules.each do |rule|
23
+ return rule.allow if uri.path =~ rule.path
24
+ end
25
+ return true
26
+ end
27
+ end
28
+
29
+ class Rule < Kudzu::Model::Base
30
+ attr_accessor :path, :allow
31
+ end
32
+ end
33
+ end
34
+ end
@@ -3,7 +3,6 @@ module Kudzu
3
3
  class Robots
4
4
  def initialize(config)
5
5
  @user_agent = config.user_agent
6
- @page_fetcher = Kudzu::Agent::Fetcher.new(config)
7
6
  @monitor = Monitor.new
8
7
  @txt = {}
9
8
  end
@@ -49,11 +48,11 @@ module Kudzu
49
48
 
50
49
  def fetch_and_parse(uri)
51
50
  response = fetch(uri)
52
- if response && response.status == 200
51
+ if response && response.code.to_i == 200
53
52
  body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
54
- Parser.new.parse(body)
53
+ Parser.parse(body)
55
54
  else
56
- Parser.new.parse('')
55
+ Parser.parse('')
57
56
  end
58
57
  end
59
58
 
@@ -62,127 +61,17 @@ module Kudzu
62
61
  uri.path = 'robots.txt'
63
62
  uri.fragment = uri.query = nil
64
63
 
65
- begin
66
- @page_fetcher.fetch(uri.to_s)
67
- rescue
68
- nil
69
- end
70
- end
71
-
72
- class Txt
73
- attr_accessor :sets, :sitemaps
74
-
75
- def initialize
76
- self.sets = []
77
- self.sitemaps = []
78
- end
79
- end
80
-
81
- class RuleSet
82
- attr_accessor :user_agent, :rules, :crawl_delay
83
-
84
- def initialize(attr = {})
85
- self.rules = []
86
- attr.each { |k, v| public_send("#{k}=", v) }
87
- end
88
-
89
- def allowed_path?(uri)
90
- rules.each do |rule|
91
- return rule.allow if uri.path =~ rule.path
92
- end
93
- return true
94
- end
95
- end
96
-
97
- class Rule
98
- attr_accessor :path, :allow
99
-
100
- def initialize(attr = {})
101
- attr.each { |k, v| public_send("#{k}=", v) }
102
- end
103
- end
104
-
105
- class Parser
106
- UNMATCH_REGEXP = /^$/
107
-
108
- def parse(body)
109
- txt = Txt.new
110
- sets = []
111
- prev_key = nil
112
-
113
- parse_body(body).each do |key, value|
114
- case key
115
- when 'user-agent'
116
- new_set = RuleSet.new(user_agent: ua_regexp(value))
117
- txt.sets << new_set
118
- if prev_key == 'user-agent'
119
- sets << new_set
120
- else
121
- sets = [new_set]
122
- end
123
- when 'allow'
124
- re = path_regexp(value)
125
- sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
126
- when 'disallow'
127
- re = path_regexp(value)
128
- sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
129
- when 'crawl-delay'
130
- sets.each { |set| set.crawl_delay = value.to_i }
131
- when 'sitemap'
132
- txt.sitemaps << value
133
- end
134
-
135
- prev_key = key
136
- end
137
-
138
- sort(txt)
139
- end
140
-
141
- private
142
-
143
- def parse_body(body)
144
- lines = body.to_s.split(/\r|\n|\r\n/)
145
- lines.map { |line| parse_line(line) }.compact
146
- end
147
-
148
- def parse_line(line)
149
- line.strip!
150
- if line.empty? || line.start_with?('#')
151
- nil
152
- else
153
- split_line(line)
154
- end
155
- end
156
-
157
- def split_line(line)
158
- key, value = line.split(':', 2)
159
- key = key.to_s.strip.downcase
160
- value = value.to_s.sub(/#.*$/, '').strip
161
- if key.empty? || value.empty?
162
- nil
163
- else
164
- [key, value]
165
- end
166
- end
167
-
168
- def ua_regexp(value)
169
- Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
170
- rescue RegexpError
171
- UNMATCH_REGEXP
64
+ http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
65
+ if uri.scheme == 'https'
66
+ http.use_ssl = true
67
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
172
68
  end
173
69
 
174
- def path_regexp(value)
175
- Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
176
- rescue RegexpError
177
- UNMATCH_REGEXP
178
- end
179
-
180
- def sort(txt)
181
- txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
182
- txt.sets.each do |set|
183
- set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
184
- end
185
- txt
70
+ begin
71
+ http.get(uri.to_s)
72
+ rescue => e
73
+ Kudzu.log :error, "failed to fetch robots.txt: #{uri}", error: e
74
+ nil
186
75
  end
187
76
  end
188
77
  end
@@ -8,8 +8,8 @@ module Kudzu
8
8
  @last_accessed = {}
9
9
  end
10
10
 
11
- def politeness_delay(url)
12
- uri = Addressable::URI.parse(url)
11
+ def politeness_delay(uri)
12
+ uri = Addressable::URI.parse(uri) if uri.is_a?(String)
13
13
  delay_sec = delay_second(uri)
14
14
  return unless delay_sec
15
15