kudzu 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/lib/kudzu/adapter/memory/frontier.rb +1 -1
  3. data/lib/kudzu/adapter/memory/model/link.rb +2 -6
  4. data/lib/kudzu/adapter/memory/model/page.rb +3 -8
  5. data/lib/kudzu/adapter/memory/repository.rb +0 -2
  6. data/lib/kudzu/adapter/memory.rb +3 -4
  7. data/lib/kudzu/agent/all.rb +1 -1
  8. data/lib/kudzu/agent/fetcher.rb +46 -49
  9. data/lib/kudzu/agent/http/connection.rb +9 -0
  10. data/lib/kudzu/agent/http/connection_pool.rb +50 -0
  11. data/lib/kudzu/agent/page_filterer.rb +58 -0
  12. data/lib/kudzu/agent/reference.rb +9 -0
  13. data/lib/kudzu/agent/response.rb +14 -0
  14. data/lib/kudzu/agent/robots/parser.rb +91 -0
  15. data/lib/kudzu/agent/robots/txt.rb +34 -0
  16. data/lib/kudzu/agent/robots.rb +12 -123
  17. data/lib/kudzu/agent/sleeper.rb +2 -2
  18. data/lib/kudzu/agent/url_extractor.rb +60 -46
  19. data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
  20. data/lib/kudzu/agent/util/charset_detector.rb +84 -0
  21. data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
  22. data/lib/kudzu/agent/util/matcher.rb +25 -0
  23. data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
  24. data/lib/kudzu/agent/util/title_parser.rb +30 -0
  25. data/lib/kudzu/agent.rb +42 -0
  26. data/lib/kudzu/callback.rb +4 -2
  27. data/lib/kudzu/config/filter.rb +11 -11
  28. data/lib/kudzu/config.rb +20 -25
  29. data/lib/kudzu/crawler.rb +65 -146
  30. data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
  31. data/lib/kudzu/model/base.rb +9 -0
  32. data/lib/kudzu/model/link.rb +9 -0
  33. data/lib/kudzu/model/page.rb +112 -0
  34. data/lib/kudzu/thread_pool.rb +36 -0
  35. data/lib/kudzu/version.rb +1 -1
  36. data/lib/kudzu.rb +21 -3
  37. metadata +21 -19
  38. data/lib/kudzu/adapter/base/link.rb +0 -8
  39. data/lib/kudzu/adapter/base/page.rb +0 -106
  40. data/lib/kudzu/adapter/memory/all.rb +0 -3
  41. data/lib/kudzu/agent/charset_detector.rb +0 -84
  42. data/lib/kudzu/agent/filter.rb +0 -40
  43. data/lib/kudzu/agent/mime_type_detector.rb +0 -34
  44. data/lib/kudzu/agent/title_parser.rb +0 -16
  45. data/lib/kudzu/logger.rb +0 -20
  46. data/lib/kudzu/revisit/all.rb +0 -3
  47. data/lib/kudzu/revisit/scheduler.rb +0 -28
  48. data/lib/kudzu/util/all.rb +0 -3
  49. data/lib/kudzu/util/connection_pool.rb +0 -56
  50. data/lib/kudzu/util/content_type_parser.rb +0 -24
  51. data/lib/kudzu/util/matcher.rb +0 -21
  52. data/lib/kudzu/util/thread_pool.rb +0 -38
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ab6c69046e9faa3788ead18864ee6d13ddbe2980
4
- data.tar.gz: c9868fabe9542d877d6519e0f5297419c882a8e5
3
+ metadata.gz: 0ba76e95628d94560421358aa7982bdc429971e4
4
+ data.tar.gz: e1875f5760573a021fcf129018aaba5f6213ad23
5
5
  SHA512:
6
- metadata.gz: 7018e08e6744a9e74e601bad26a88df3d60140ddaa055fc194c88263cff37137402de967203999d9ba9c2bda199228215f380e207a4b12b6c2e50c5774827e16
7
- data.tar.gz: b03edd059ea5b5cb0f50fd0bc660c02e8727c1bb7f8f735fe86ba6f08b2a8bf743a04570eb171dc99f3ffcacba26c1df8a99cc2a1187e2e2fb481c01adb59e6c
6
+ metadata.gz: 4f17f799c2ad67722860bbad00e9e220db8265221d598aebbbd181fea41546454e954fc67db18abba2b8b68797fe44da3887ec4ea3a5d2486fd1afd61584c152
7
+ data.tar.gz: 65d3bf42fafbcf835740ebe5534a52163f09ef313795152fafadc87aeaa335d540b037180a4f03208471361a9284635ff15a85471f3def22ba43e341ee1eb724
@@ -9,7 +9,7 @@ module Kudzu
9
9
  @queued = {}
10
10
  end
11
11
 
12
- def enqueue(links, depth: 1)
12
+ def enqueue(links)
13
13
  @monitor.synchronize do
14
14
  Array(links).each do |link|
15
15
  next if @queued.key?(link.url)
@@ -1,14 +1,10 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module Memory
4
- class Link
5
- include Kudzu::Adapter::Base::Link
4
+ class Link < Kudzu::Model::Base
5
+ include Kudzu::Model::Link
6
6
 
7
7
  attr_accessor :uuid, :url, :title, :state, :depth
8
-
9
- def initialize(attr = {})
10
- attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
11
- end
12
8
  end
13
9
  end
14
10
  end
@@ -1,16 +1,11 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module Memory
4
- class Page
5
- include Kudzu::Adapter::Base::Page
4
+ class Page < Kudzu::Model::Base
5
+ include Kudzu::Model::Page
6
6
 
7
7
  attr_accessor :url, :title, :status, :mime_type, :size, :charset, :digest,
8
- :response_header, :response_time, :redirect_from, :fetched_at, :revised_at,
9
- :revisit_interval, :revisit_at
10
-
11
- def initialize(attr = {})
12
- attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
13
- end
8
+ :response_header, :response_time, :redirect_from, :fetched_at, :revised_at
14
9
  end
15
10
  end
16
11
  end
@@ -6,7 +6,6 @@ module Kudzu
6
6
 
7
7
  def initialize
8
8
  @page = {}
9
- @digest = {}
10
9
  end
11
10
 
12
11
  def find_by_url(url)
@@ -15,7 +14,6 @@ module Kudzu
15
14
 
16
15
  def register(page)
17
16
  @page[page.url] = page
18
- @digest[page.digest] = true
19
17
  end
20
18
 
21
19
  def delete(page)
@@ -1,4 +1,3 @@
1
- require_relative 'base/all'
2
- require_relative 'memory/all'
3
-
4
- Kudzu.adapter = Kudzu::Adapter::Memory
1
+ Dir[File.join(__dir__, 'memory/**/*.rb')].each do |file|
2
+ require_relative file
3
+ end
@@ -1,3 +1,3 @@
1
- Dir[File.join(__dir__, '*.rb')].each do |file|
1
+ Dir[File.join(__dir__, '**/*.rb')].each do |file|
2
2
  require_relative file
3
3
  end
@@ -1,67 +1,65 @@
1
- require 'net/http'
2
- require 'http-cookie'
3
-
4
1
  module Kudzu
5
2
  class Agent
6
3
  class Fetcher
7
- class Response
8
- attr_accessor :url, :status, :header, :body, :time, :redirected
9
-
10
- def initialize(attr = {})
11
- attr.each { |k, v| public_send("#{k}=", v) }
12
- end
13
-
14
- def redirected?
15
- redirected
16
- end
17
- end
18
-
19
4
  attr_reader :pool
20
5
 
21
6
  def initialize(config, robots = nil)
22
7
  @config = config
23
- @pool = Kudzu::Util::ConnectionPool.new(@config.max_connection || 100)
24
- @sleeper = Kudzu::Agent::Sleeper.new(@config, robots)
8
+ @pool = Http::ConnectionPool.new(@config.max_connection || 100)
9
+ @sleeper = Sleeper.new(@config, robots)
10
+ @filterer = PageFilterer.new(@config)
25
11
  @jar = HTTP::CookieJar.new
26
12
  end
27
13
 
28
- def fetch(url, request_header: {}, redirect: max_redirect, method: :get)
14
+ def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
29
15
  uri = Addressable::URI.parse(url)
30
- http = @pool.checkout(pool_name(uri)) { build_http(uri) }
31
16
  request = build_request(uri, request_header: request_header, method: method)
32
-
33
- append_cookie(url, request) if @config.handle_cookie
34
-
35
- @sleeper.politeness_delay(url)
36
-
37
- response = nil
38
- response_time = Benchmark.realtime { response = http.request(request) }
39
-
40
- parse_cookie(url, response) if @config.handle_cookie
17
+ response, response_time = send_request(uri, request)
41
18
 
42
19
  if redirection?(response.code) && response['location'] && redirect > 0
43
- fetch(uri.join(response['location']).to_s, request_header: request_header, redirect: redirect - 1)
20
+ fetch(uri.join(response['location']).to_s, request_header: request_header,
21
+ redirect: redirect - 1,
22
+ redirect_from: redirect_from || url)
44
23
  else
45
- res = build_response(url, response, response_time)
46
- res.redirected = (redirect != max_redirect)
47
- res
24
+ build_response(url, response, response_time, redirect_from)
48
25
  end
49
26
  end
50
27
 
51
28
  private
52
29
 
53
- def max_redirect
54
- @config.max_redirect || 5
55
- end
56
-
57
30
  def pool_name(uri)
58
31
  "#{uri.scheme}_#{uri.host}_#{uri.port || uri.default_port}"
59
32
  end
60
33
 
34
+ def send_request(uri, request)
35
+ start_http(uri, request) do |http|
36
+ http.request(request) do |response|
37
+ unless @filterer.allowed_response_header?(uri.to_s, response)
38
+ http.finish
39
+ break response
40
+ end
41
+ end
42
+ end
43
+ end
44
+
45
+ def start_http(uri, request)
46
+ http = @pool.checkout(pool_name(uri)) { build_http(uri) }
47
+ append_cookie(uri, request) if @config.handle_cookie
48
+ @sleeper.politeness_delay(uri)
49
+
50
+ start = Time.now.to_f
51
+ response = yield http
52
+ response_time = Time.now.to_f - start
53
+
54
+ parse_cookie(uri, response) if @config.handle_cookie
55
+ return response, response_time
56
+ end
57
+
61
58
  def build_http(uri)
62
59
  http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
63
60
  http.open_timeout = @config.open_timeout if @config.open_timeout
64
61
  http.read_timeout = @config.read_timeout if @config.read_timeout
62
+ http.keep_alive_timeout = @config.keep_alive if @config.keep_alive
65
63
  if uri.scheme == 'https'
66
64
  http.use_ssl = true
67
65
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
@@ -70,7 +68,7 @@ module Kudzu
70
68
  end
71
69
 
72
70
  def build_request(uri, request_header:, method:)
73
- request = request_klass_for(method).new(uri.request_uri)
71
+ request = Object.const_get("Net::HTTP::#{method.capitalize}").new(uri.request_uri)
74
72
  request.basic_auth uri.user, uri.password if uri.user && uri.password
75
73
 
76
74
  request['User-Agent'] = @config.user_agent
@@ -80,16 +78,15 @@ module Kudzu
80
78
  request
81
79
  end
82
80
 
83
- def request_klass_for(method)
84
- Object.const_get("Net::HTTP::#{method.capitalize}")
85
- end
86
-
87
- def build_response(url, response, response_time)
81
+ def build_response(url, response, response_time, redirect_from)
82
+ fetched = response.instance_variable_get("@read")
88
83
  Response.new(url: url,
89
84
  status: response.code.to_i,
90
- header: Hash[response.each.to_a],
91
- body: response.body.to_s,
92
- time: response_time)
85
+ body: fetched ? response.body.to_s : nil,
86
+ response_header: Hash[response.each.to_a],
87
+ response_time: response_time,
88
+ redirect_from: redirect_from,
89
+ fetched: fetched)
93
90
  end
94
91
 
95
92
  def redirection?(code)
@@ -97,12 +94,12 @@ module Kudzu
97
94
  300 <= code && code <= 399
98
95
  end
99
96
 
100
- def parse_cookie(url, response)
101
- @jar.parse(response['set-cookie'], url) if response['set-cookie']
97
+ def parse_cookie(uri, response)
98
+ @jar.parse(response['set-cookie'], uri.to_s) if response['set-cookie']
102
99
  end
103
100
 
104
- def append_cookie(url, request)
105
- cookies = @jar.cookies(url)
101
+ def append_cookie(uri, request)
102
+ cookies = @jar.cookies(uri.to_s)
106
103
  unless cookies.empty?
107
104
  if request['Cookie']
108
105
  request['Cookie'] += '; ' + cookies.join('; ')
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Http
4
+ class Connection < Kudzu::Model::Base
5
+ attr_accessor :name, :http, :last_used
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,50 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Http
4
+ class ConnectionPool
5
+ def initialize(max_size = 10)
6
+ @max_size = max_size
7
+ end
8
+
9
+ def checkout(name)
10
+ pool[name] ||= Connection.new(name: name, http: yield)
11
+
12
+ conn = pool[name]
13
+ conn.last_used = Time.now
14
+
15
+ if pool.size > @max_size
16
+ reduce
17
+ end
18
+
19
+ conn.http
20
+ end
21
+
22
+ def close
23
+ pool.values.each do |conn|
24
+ finish_http(conn.http)
25
+ end
26
+ Thread.current[:kudzu_connection] = nil
27
+ end
28
+
29
+ private
30
+
31
+ def pool
32
+ Thread.current[:kudzu_connection] ||= {}
33
+ Thread.current[:kudzu_connection]
34
+ end
35
+
36
+ def reduce
37
+ conns = pool.values.sort_by { |conn| conn.last_used }
38
+ conns.first(pool.size - @max_size).each do |conn|
39
+ finish_http(conn.http)
40
+ pool.delete(conn.name)
41
+ end
42
+ end
43
+
44
+ def finish_http(http)
45
+ http.finish if http && http.started?
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,58 @@
1
+ module Kudzu
2
+ class Agent
3
+ class PageFilterer
4
+ def initialize(config)
5
+ @config = config
6
+ end
7
+
8
+ def allowed?(response)
9
+ filter = @config.find_filter(response.url)
10
+
11
+ if filter.nil? || (allowed_mime_type?(response.mime_type, filter) &&
12
+ allowed_size?(response.size, filter) &&
13
+ allowed_index?(response))
14
+ Kudzu.log :info, "passed page: #{response.url}"
15
+ true
16
+ else
17
+ Kudzu.log :info, "dropped page: #{response.url}"
18
+ false
19
+ end
20
+ end
21
+
22
+ def allowed_response_header?(url, response_header)
23
+ filter = @config.find_filter(url)
24
+
25
+ if response_header['content-type']
26
+ mime_type = Util::ContentTypeParser.parse(response_header['content-type']).first
27
+ end
28
+ if response_header['content-length']
29
+ size = response_header['content-length'].to_i
30
+ end
31
+
32
+ filter.nil? || (allowed_mime_type?(mime_type, filter) &&
33
+ allowed_size?(size, filter))
34
+ end
35
+
36
+ private
37
+
38
+ def allowed_mime_type?(mime_type, filter)
39
+ return true if mime_type.nil?
40
+ Util::Matcher.match?(mime_type, allows: filter.allow_mime_type, denies: filter.deny_mime_type)
41
+ end
42
+
43
+ def allowed_size?(size, filter)
44
+ return true if filter.max_size.nil? || size.nil?
45
+ size.to_i < filter.max_size.to_i
46
+ end
47
+
48
+ def allowed_index?(response)
49
+ return true if response.body.nil? || !response.html?
50
+ return true unless @config.respect_noindex
51
+
52
+ doc = response.parsed_doc
53
+ doc.xpath('html/head/meta[@name]')
54
+ .all? { |meta| meta[:name] !~ /^robots$/i || meta[:content] !~ /noindex/i }
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Reference < Kudzu::Model::Base
4
+ include Kudzu::Model::Link
5
+
6
+ attr_accessor :url, :title
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,14 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Response < Kudzu::Model::Base
4
+ include Kudzu::Model::Page
5
+
6
+ attr_accessor :url, :status, :body, :response_header, :response_time, :redirect_from, :fetched,
7
+ :size, :digest, :mime_type, :charset, :title
8
+
9
+ def fetched?
10
+ fetched
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,91 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Robots
4
+ class Parser
5
+ UNMATCH_REGEXP = /^$/
6
+
7
+ class << self
8
+ def parse(body)
9
+ txt = Txt.new
10
+ sets = []
11
+ prev_key = nil
12
+
13
+ parse_body(body).each do |key, value|
14
+ case key
15
+ when 'user-agent'
16
+ new_set = RuleSet.new(user_agent: ua_regexp(value))
17
+ txt.sets << new_set
18
+ if prev_key == 'user-agent'
19
+ sets << new_set
20
+ else
21
+ sets = [new_set]
22
+ end
23
+ when 'allow'
24
+ re = path_regexp(value)
25
+ sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
26
+ when 'disallow'
27
+ re = path_regexp(value)
28
+ sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
29
+ when 'crawl-delay'
30
+ sets.each { |set| set.crawl_delay = value.to_i }
31
+ when 'sitemap'
32
+ txt.sitemaps << value
33
+ end
34
+
35
+ prev_key = key
36
+ end
37
+
38
+ sort(txt)
39
+ end
40
+
41
+ private
42
+
43
+ def parse_body(body)
44
+ lines = body.to_s.split(/\r|\n|\r\n/)
45
+ lines.map { |line| parse_line(line) }.compact
46
+ end
47
+
48
+ def parse_line(line)
49
+ line.strip!
50
+ if line.empty? || line.start_with?('#')
51
+ nil
52
+ else
53
+ split_line(line)
54
+ end
55
+ end
56
+
57
+ def split_line(line)
58
+ key, value = line.split(':', 2)
59
+ key = key.to_s.strip.downcase
60
+ value = value.to_s.sub(/#.*$/, '').strip
61
+ if key.empty? || value.empty?
62
+ nil
63
+ else
64
+ [key, value]
65
+ end
66
+ end
67
+
68
+ def ua_regexp(value)
69
+ Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
70
+ rescue RegexpError
71
+ UNMATCH_REGEXP
72
+ end
73
+
74
+ def path_regexp(value)
75
+ Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
76
+ rescue RegexpError
77
+ UNMATCH_REGEXP
78
+ end
79
+
80
+ def sort(txt)
81
+ txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
82
+ txt.sets.each do |set|
83
+ set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
84
+ end
85
+ txt
86
+ end
87
+ end
88
+ end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,34 @@
1
+ module Kudzu
2
+ class Agent
3
+ class Robots
4
+ class Txt < Kudzu::Model::Base
5
+ attr_accessor :sets, :sitemaps
6
+
7
+ def initialize
8
+ self.sets = []
9
+ self.sitemaps = []
10
+ end
11
+ end
12
+
13
+ class RuleSet < Kudzu::Model::Base
14
+ attr_accessor :user_agent, :rules, :crawl_delay
15
+
16
+ def initialize(attr = {})
17
+ self.rules = []
18
+ super
19
+ end
20
+
21
+ def allowed_path?(uri)
22
+ rules.each do |rule|
23
+ return rule.allow if uri.path =~ rule.path
24
+ end
25
+ return true
26
+ end
27
+ end
28
+
29
+ class Rule < Kudzu::Model::Base
30
+ attr_accessor :path, :allow
31
+ end
32
+ end
33
+ end
34
+ end
@@ -3,7 +3,6 @@ module Kudzu
3
3
  class Robots
4
4
  def initialize(config)
5
5
  @user_agent = config.user_agent
6
- @page_fetcher = Kudzu::Agent::Fetcher.new(config)
7
6
  @monitor = Monitor.new
8
7
  @txt = {}
9
8
  end
@@ -49,11 +48,11 @@ module Kudzu
49
48
 
50
49
  def fetch_and_parse(uri)
51
50
  response = fetch(uri)
52
- if response && response.status == 200
51
+ if response && response.code.to_i == 200
53
52
  body = response.body.force_encoding('utf-8').encode('utf-8', undef: :replace, invalid: :replace)
54
- Parser.new.parse(body)
53
+ Parser.parse(body)
55
54
  else
56
- Parser.new.parse('')
55
+ Parser.parse('')
57
56
  end
58
57
  end
59
58
 
@@ -62,127 +61,17 @@ module Kudzu
62
61
  uri.path = 'robots.txt'
63
62
  uri.fragment = uri.query = nil
64
63
 
65
- begin
66
- @page_fetcher.fetch(uri.to_s)
67
- rescue
68
- nil
69
- end
70
- end
71
-
72
- class Txt
73
- attr_accessor :sets, :sitemaps
74
-
75
- def initialize
76
- self.sets = []
77
- self.sitemaps = []
78
- end
79
- end
80
-
81
- class RuleSet
82
- attr_accessor :user_agent, :rules, :crawl_delay
83
-
84
- def initialize(attr = {})
85
- self.rules = []
86
- attr.each { |k, v| public_send("#{k}=", v) }
87
- end
88
-
89
- def allowed_path?(uri)
90
- rules.each do |rule|
91
- return rule.allow if uri.path =~ rule.path
92
- end
93
- return true
94
- end
95
- end
96
-
97
- class Rule
98
- attr_accessor :path, :allow
99
-
100
- def initialize(attr = {})
101
- attr.each { |k, v| public_send("#{k}=", v) }
102
- end
103
- end
104
-
105
- class Parser
106
- UNMATCH_REGEXP = /^$/
107
-
108
- def parse(body)
109
- txt = Txt.new
110
- sets = []
111
- prev_key = nil
112
-
113
- parse_body(body).each do |key, value|
114
- case key
115
- when 'user-agent'
116
- new_set = RuleSet.new(user_agent: ua_regexp(value))
117
- txt.sets << new_set
118
- if prev_key == 'user-agent'
119
- sets << new_set
120
- else
121
- sets = [new_set]
122
- end
123
- when 'allow'
124
- re = path_regexp(value)
125
- sets.each { |set| set.rules << Rule.new(path: re, allow: true) }
126
- when 'disallow'
127
- re = path_regexp(value)
128
- sets.each { |set| set.rules << Rule.new(path: re, allow: false) }
129
- when 'crawl-delay'
130
- sets.each { |set| set.crawl_delay = value.to_i }
131
- when 'sitemap'
132
- txt.sitemaps << value
133
- end
134
-
135
- prev_key = key
136
- end
137
-
138
- sort(txt)
139
- end
140
-
141
- private
142
-
143
- def parse_body(body)
144
- lines = body.to_s.split(/\r|\n|\r\n/)
145
- lines.map { |line| parse_line(line) }.compact
146
- end
147
-
148
- def parse_line(line)
149
- line.strip!
150
- if line.empty? || line.start_with?('#')
151
- nil
152
- else
153
- split_line(line)
154
- end
155
- end
156
-
157
- def split_line(line)
158
- key, value = line.split(':', 2)
159
- key = key.to_s.strip.downcase
160
- value = value.to_s.sub(/#.*$/, '').strip
161
- if key.empty? || value.empty?
162
- nil
163
- else
164
- [key, value]
165
- end
166
- end
167
-
168
- def ua_regexp(value)
169
- Regexp.new(Regexp.escape(value).gsub('\*', '.*'))
170
- rescue RegexpError
171
- UNMATCH_REGEXP
64
+ http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
65
+ if uri.scheme == 'https'
66
+ http.use_ssl = true
67
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
172
68
  end
173
69
 
174
- def path_regexp(value)
175
- Regexp.new('^' + Regexp.escape(value).gsub('\*', '.*').gsub('\$', '$'))
176
- rescue RegexpError
177
- UNMATCH_REGEXP
178
- end
179
-
180
- def sort(txt)
181
- txt.sets.sort_by! { |rule| [-rule.user_agent.to_s.count('*'), rule.user_agent.to_s.length] }.reverse!
182
- txt.sets.each do |set|
183
- set.rules.sort_by! { |rule| rule.path.to_s.length }.reverse!
184
- end
185
- txt
70
+ begin
71
+ http.get(uri.to_s)
72
+ rescue => e
73
+ Kudzu.log :error, "failed to fetch robots.txt: #{uri}", error: e
74
+ nil
186
75
  end
187
76
  end
188
77
  end
@@ -8,8 +8,8 @@ module Kudzu
8
8
  @last_accessed = {}
9
9
  end
10
10
 
11
- def politeness_delay(url)
12
- uri = Addressable::URI.parse(url)
11
+ def politeness_delay(uri)
12
+ uri = Addressable::URI.parse(uri) if uri.is_a?(String)
13
13
  delay_sec = delay_second(uri)
14
14
  return unless delay_sec
15
15