kudzu 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/lib/kudzu/adapter/memory/frontier.rb +1 -1
  3. data/lib/kudzu/adapter/memory/model/link.rb +2 -6
  4. data/lib/kudzu/adapter/memory/model/page.rb +3 -8
  5. data/lib/kudzu/adapter/memory/repository.rb +0 -2
  6. data/lib/kudzu/adapter/memory.rb +3 -4
  7. data/lib/kudzu/agent/all.rb +1 -1
  8. data/lib/kudzu/agent/fetcher.rb +46 -49
  9. data/lib/kudzu/agent/http/connection.rb +9 -0
  10. data/lib/kudzu/agent/http/connection_pool.rb +50 -0
  11. data/lib/kudzu/agent/page_filterer.rb +58 -0
  12. data/lib/kudzu/agent/reference.rb +9 -0
  13. data/lib/kudzu/agent/response.rb +14 -0
  14. data/lib/kudzu/agent/robots/parser.rb +91 -0
  15. data/lib/kudzu/agent/robots/txt.rb +34 -0
  16. data/lib/kudzu/agent/robots.rb +12 -123
  17. data/lib/kudzu/agent/sleeper.rb +2 -2
  18. data/lib/kudzu/agent/url_extractor.rb +60 -46
  19. data/lib/kudzu/agent/{url_filter.rb → url_filterer.rb} +26 -13
  20. data/lib/kudzu/agent/util/charset_detector.rb +84 -0
  21. data/lib/kudzu/agent/util/content_type_parser.rb +28 -0
  22. data/lib/kudzu/agent/util/matcher.rb +25 -0
  23. data/lib/kudzu/agent/util/mime_type_detector.rb +38 -0
  24. data/lib/kudzu/agent/util/title_parser.rb +30 -0
  25. data/lib/kudzu/agent.rb +42 -0
  26. data/lib/kudzu/callback.rb +4 -2
  27. data/lib/kudzu/config/filter.rb +11 -11
  28. data/lib/kudzu/config.rb +20 -25
  29. data/lib/kudzu/crawler.rb +65 -146
  30. data/lib/kudzu/{adapter/base → model}/all.rb +0 -0
  31. data/lib/kudzu/model/base.rb +9 -0
  32. data/lib/kudzu/model/link.rb +9 -0
  33. data/lib/kudzu/model/page.rb +112 -0
  34. data/lib/kudzu/thread_pool.rb +36 -0
  35. data/lib/kudzu/version.rb +1 -1
  36. data/lib/kudzu.rb +21 -3
  37. metadata +21 -19
  38. data/lib/kudzu/adapter/base/link.rb +0 -8
  39. data/lib/kudzu/adapter/base/page.rb +0 -106
  40. data/lib/kudzu/adapter/memory/all.rb +0 -3
  41. data/lib/kudzu/agent/charset_detector.rb +0 -84
  42. data/lib/kudzu/agent/filter.rb +0 -40
  43. data/lib/kudzu/agent/mime_type_detector.rb +0 -34
  44. data/lib/kudzu/agent/title_parser.rb +0 -16
  45. data/lib/kudzu/logger.rb +0 -20
  46. data/lib/kudzu/revisit/all.rb +0 -3
  47. data/lib/kudzu/revisit/scheduler.rb +0 -28
  48. data/lib/kudzu/util/all.rb +0 -3
  49. data/lib/kudzu/util/connection_pool.rb +0 -56
  50. data/lib/kudzu/util/content_type_parser.rb +0 -24
  51. data/lib/kudzu/util/matcher.rb +0 -21
  52. data/lib/kudzu/util/thread_pool.rb +0 -38
data/lib/kudzu/crawler.rb CHANGED
@@ -1,57 +1,39 @@
1
- require 'addressable'
2
- require 'nokogiri'
1
+ require_relative 'model/all'
2
+ require_relative 'adapter/memory'
3
+ require_relative 'agent'
4
+ require_relative 'callback'
3
5
  require_relative 'common'
4
6
  require_relative 'config'
5
- require_relative 'callback'
6
- require_relative 'logger'
7
- require_relative 'adapter/memory'
8
- require_relative 'util/all'
9
- require_relative 'agent/all'
10
- require_relative 'revisit/all'
7
+ require_relative 'thread_pool'
11
8
 
12
9
  module Kudzu
13
10
  class Crawler
14
11
  attr_reader :uuid, :config
15
- attr_reader :frontier, :repository
12
+ attr_reader :frontier, :repository, :agent
16
13
 
17
14
  def initialize(options = {}, &block)
18
15
  @uuid = options[:uuid] || SecureRandom.uuid
19
16
  @config = Kudzu::Config.new(options, &block)
20
- end
21
-
22
- def prepare(&block)
23
- @logger = Kudzu::Logger.new(@config.log_file, @config.log_level)
24
- @callback = Kudzu::Callback.new(&block)
25
17
 
26
18
  @frontier = Kudzu.adapter::Frontier.new(@uuid)
27
19
  @repository = Kudzu.adapter::Repository.new
28
-
29
- @robots = Kudzu::Agent::Robots.new(@config)
30
- @page_fetcher = Kudzu::Agent::Fetcher.new(@config, @robots)
31
- @page_filter = Kudzu::Agent::Filter.new(@config)
32
- @charset_detector = Kudzu::Agent::CharsetDetector.new
33
- @mime_type_detector = Kudzu::Agent::MimeTypeDetector.new
34
- @title_parser = Kudzu::Agent::TitleParser.new
35
-
36
- @url_extractor = Kudzu::Agent::UrlExtractor.new(@config)
37
- @url_filter = Kudzu::Agent::UrlFilter.new(@config)
38
-
39
- @revisit_scheduler = Kudzu::Revisit::Scheduler.new(@config)
20
+ @agent = Kudzu.agent.new(@config)
40
21
  end
41
22
 
42
23
  def run(seed_url, &block)
43
- prepare(&block)
24
+ @callback = Kudzu::Callback.new(&block)
44
25
 
45
- seeds = Array(seed_url).map { |url| { url: url } }
46
- enqueue_hrefs(seeds, 1)
26
+ seed_refs = Array(seed_url).map { |url| Kudzu::Agent::Reference.new(url: url) }
27
+ enqueue_links(refs_to_links(seed_refs, 1))
47
28
 
48
- if @config.thread_num.to_i <= 1
49
- single_thread
50
- else
51
- multi_thread(@config.thread_num)
29
+ @agent.start do
30
+ if @config.thread_num.to_i <= 1
31
+ single_thread
32
+ else
33
+ multi_thread(@config.thread_num)
34
+ end
52
35
  end
53
36
 
54
- @page_fetcher.pool.close
55
37
  @frontier.clear
56
38
  end
57
39
 
@@ -66,7 +48,7 @@ module Kudzu
66
48
  end
67
49
 
68
50
  def multi_thread(thread_num)
69
- @thread_pool = Kudzu::Util::ThreadPool.new(thread_num)
51
+ @thread_pool = Kudzu::ThreadPool.new(thread_num)
70
52
 
71
53
  @thread_pool.start do |queue|
72
54
  limit_num = [thread_num - queue.size, 0].max
@@ -82,22 +64,25 @@ module Kudzu
82
64
  end
83
65
 
84
66
  def visit_link(link)
85
- page = @repository.find_by_url(link.url)
86
- response = fetch_link(link, build_request_header(page))
67
+ response = fetch(link, @config.default_request_header.to_h)
87
68
  return unless response
88
69
 
89
- page = @repository.find_by_url(response.url) if response.redirected?
70
+ page = @repository.find_by_url(response.url)
90
71
  page.url = response.url
91
72
  page.status = response.status
92
- page.response_time = response.time
73
+ page.response_time = response.response_time
93
74
  page.fetched_at = Time.now
94
75
 
95
- if page.status_success?
96
- handle_success(page, link, response)
97
- elsif page.status_not_modified?
98
- @revisit_scheduler.schedule(page, modified: false)
99
- register_page(page)
100
- elsif page.status_not_found? || page.status_gone?
76
+ if response.fetched?
77
+ if page.status_success?
78
+ handle_success(page, link, response)
79
+ elsif page.status_not_modified?
80
+ register_page(page)
81
+ elsif page.status_not_found? || page.status_gone?
82
+ delete_page(page)
83
+ end
84
+ else
85
+ page.filtered = true
101
86
  delete_page(page)
102
87
  end
103
88
 
@@ -120,113 +105,44 @@ module Kudzu
120
105
  end
121
106
  end
122
107
 
123
- def build_request_header(page)
124
- header = @config.default_request_header.to_h
125
- if @config.revisit_mode
126
- header['If-Modified-Since'] = page.last_modified.httpdate if page.last_modified
127
- header['If-None-Match'] = page.etag if page.etag
108
+ def fetch(link, request_header)
109
+ response = nil
110
+ @callback.around(:fetch, link, request_header, response) do
111
+ response = @agent.fetch(link.url, request_header)
112
+ end
113
+ if response.fetched?
114
+ Kudzu.log :info, "fetched page: #{response.status} #{response.url}"
115
+ else
116
+ Kudzu.log :info, "skipped page: #{response.status} #{response.url}"
128
117
  end
129
- header
130
- end
131
-
132
- def fetch_link(link, request_header)
133
- response = @page_fetcher.fetch(link.url, request_header: request_header)
134
- @logger.log :info, "page fetched: #{response.status} #{response.url}"
135
118
  response
136
119
  rescue Exception => e
137
- @logger.log :warn, "couldn't fetch page: #{link.url}", error: e
120
+ Kudzu.log :warn, "failed to fetch page: #{link.url}", error: e
138
121
  @callback.on(:failure, link, e)
139
122
  nil
140
123
  end
141
124
 
142
125
  def handle_success(page, link, response)
143
- digest = Digest::MD5.hexdigest(response.body)
144
- @revisit_scheduler.schedule(page, modified: page.digest != digest)
145
-
146
- page.response_header = response.header
126
+ page.response_header = response.response_header
147
127
  page.body = response.body
148
- page.size = response.body.size
149
- page.mime_type = detect_mime_type(page)
150
- page.charset = detect_charset(page)
151
- page.title = parse_title(page)
152
- page.redirect_from = link.url if response.redirected?
153
- page.revised_at = Time.now if page.digest != digest
154
- page.digest = digest
128
+ page.size = response.size
129
+ page.mime_type = response.mime_type
130
+ page.charset = response.charset
131
+ page.title = response.title
132
+ page.redirect_from = response.redirect_from
133
+ page.revised_at = Time.now if page.digest != response.digest
134
+ page.digest = response.digest
155
135
 
156
- if follow_hrefs_from?(page, link)
157
- hrefs = extract_hrefs(page, page.url)
158
- enqueue_hrefs(hrefs, link.depth + 1) unless hrefs.empty?
136
+ if @config.max_depth.nil? || link.depth < @config.max_depth.to_i
137
+ refs = @agent.extract_refs(response)
138
+ enqueue_links(refs_to_links(refs, link.depth + 1)) unless refs.empty?
159
139
  end
160
140
 
161
- if allowed_page?(page)
162
- register_page(page)
163
- else
141
+ if @agent.filter_response?(response)
164
142
  page.filtered = true
165
143
  delete_page(page)
166
- end
167
- end
168
-
169
- def detect_mime_type(page)
170
- @mime_type_detector.detect(page)
171
- rescue => e
172
- @logger.log :warn, "couldn't detect mime type for #{page.url}", error: e
173
- nil
174
- end
175
-
176
- def detect_charset(page)
177
- if page.text?
178
- @charset_detector.detect(page)
179
- else
180
- nil
181
- end
182
- rescue => e
183
- @logger.log :warn, "couldn't detect charset for #{page.url}", error: e
184
- nil
185
- end
186
-
187
- def parse_title(page)
188
- if page.html?
189
- @title_parser.parse(page)
190
144
  else
191
- Addressable::URI.parse(page.url).basename
192
- end
193
- rescue => e
194
- @logger.log :warn, "couldn't parse title for #{page.url}", error: e
195
- nil
196
- end
197
-
198
- def follow_hrefs_from?(page, link)
199
- (page.html? || page.xml?) && (@config.max_depth.nil? || link.depth < @config.max_depth.to_i)
200
- end
201
-
202
- def extract_hrefs(page, base_url)
203
- hrefs = @url_extractor.extract(page, base_url)
204
- passed, dropped = @url_filter.filter(hrefs, base_url)
205
-
206
- if @config.respect_robots_txt
207
- passed, dropped_by_robots = passed.partition { |href| @robots.allowed?(href[:url]) }
208
- dropped += dropped_by_robots
209
- end
210
-
211
- if @config.log_level == :debug
212
- passed.each { |href| @logger.log :debug, "url passed: #{href[:url]}" }
213
- dropped.each { |href| @logger.log :debug, "url dropped: #{href[:url]}" }
214
- end
215
-
216
- passed
217
- rescue => e
218
- @logger.log :warn, "couldn't extract links from #{page.url}", error: e
219
- []
220
- end
221
-
222
- def allowed_page?(page)
223
- if @page_filter.allowed?(page) &&
224
- (!page.redirect_from || @url_filter.allowed?(page.url, page.redirect_from))
225
- @logger.log :info, "page passed: #{page.url}"
226
- true
227
- else
228
- @logger.log :info, "page dropped: #{page.url}"
229
- false
145
+ register_page(page)
230
146
  end
231
147
  end
232
148
 
@@ -242,16 +158,19 @@ module Kudzu
242
158
  end
243
159
  end
244
160
 
245
- def enqueue_hrefs(hrefs, depth)
246
- links = hrefs.map do |href|
247
- Kudzu.adapter::Link.new(uuid: @uuid,
248
- url: href[:url],
249
- title: href[:title],
250
- state: 0,
251
- depth: depth)
252
- end
161
+ def refs_to_links(refs, depth)
162
+ refs.map do |ref|
163
+ Kudzu.adapter::Link.new(uuid: @uuid,
164
+ url: ref.url,
165
+ title: ref.title,
166
+ state: 0,
167
+ depth: depth)
168
+ end
169
+ end
170
+
171
+ def enqueue_links(links)
253
172
  @callback.around(:enqueue, links) do
254
- @frontier.enqueue(links, depth: depth)
173
+ @frontier.enqueue(links)
255
174
  end
256
175
  end
257
176
  end
File without changes
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ module Model
3
+ class Base
4
+ def initialize(attr = {})
5
+ attr.each { |k, v| public_send("#{k}=", v) if respond_to?("#{k}=") }
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ module Kudzu
2
+ module Model
3
+ module Link
4
+ def uri
5
+ Addressable::URI.parse(url)
6
+ end
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,112 @@
1
+ module Kudzu
2
+ module Model
3
+ module Page
4
+ def last_modified
5
+ last_modified = response_header['last-modified']
6
+ Time.parse(last_modified).localtime if last_modified
7
+ rescue
8
+ nil
9
+ end
10
+
11
+ def etag
12
+ response_header['etag']
13
+ end
14
+
15
+ def html?
16
+ !mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
17
+ end
18
+
19
+ def xml?
20
+ !mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
21
+ end
22
+
23
+ def css?
24
+ !mime_type.to_s.match(%r{text/css}).nil?
25
+ end
26
+
27
+ def js?
28
+ !mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
29
+ end
30
+
31
+ def text?
32
+ html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
33
+ end
34
+
35
+ def status_success?
36
+ 200 <= status && status <= 299
37
+ end
38
+
39
+ def status_redirection?
40
+ 300 <= status && status <= 399
41
+ end
42
+
43
+ def status_client_error?
44
+ 400 <= status && status <= 499
45
+ end
46
+
47
+ def status_server_error?
48
+ 500 <= status && status <= 599
49
+ end
50
+
51
+ def status_not_modified?
52
+ status == 304
53
+ end
54
+
55
+ def status_not_found?
56
+ status == 404
57
+ end
58
+
59
+ def status_gone?
60
+ status == 410
61
+ end
62
+
63
+ def body
64
+ @body
65
+ end
66
+
67
+ def body=(body)
68
+ @body = body
69
+ end
70
+
71
+ def filtered
72
+ @filtered
73
+ end
74
+
75
+ def filtered=(filtered)
76
+ @filtered = filtered
77
+ end
78
+
79
+ def decoded_body
80
+ @decoded_body ||= decode_body(body)
81
+ end
82
+
83
+ def parsed_doc
84
+ @parsed_doc ||= if html?
85
+ Nokogiri::HTML(decoded_body)
86
+ elsif xml?
87
+ Nokogiri::XML(decoded_body)
88
+ end
89
+ end
90
+
91
+ private
92
+
93
+ def decode_body(body)
94
+ if text?
95
+ if find_encoding
96
+ body.force_encoding(charset).encode('utf-8', invalid: :replace, undef: :replace)
97
+ else
98
+ body.encode('utf-8', invalid: :replace, undef: :replace)
99
+ end
100
+ else
101
+ body
102
+ end
103
+ end
104
+
105
+ def find_encoding
106
+ Encoding.find(charset)
107
+ rescue
108
+ nil
109
+ end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,36 @@
1
+ module Kudzu
2
+ class ThreadPool
3
+ def initialize(size)
4
+ @size = size
5
+ @queue = Queue.new
6
+ @threads = []
7
+ end
8
+
9
+ def start(&block)
10
+ @threads = 1.upto(@size).map { create_thread(&block) }
11
+ end
12
+
13
+ def wait
14
+ until @queue.num_waiting == @threads.select { |t| t.alive? }.size
15
+ Thread.pass
16
+ sleep 1
17
+ end
18
+ end
19
+
20
+ def shutdown
21
+ @threads.each { |t| t.kill }
22
+ @threads = []
23
+ end
24
+
25
+ private
26
+
27
+ def create_thread(&block)
28
+ Thread.start do
29
+ loop do
30
+ ret = block.call(@queue)
31
+ break if ret == :end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
data/lib/kudzu/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Kudzu
2
- VERSION = '1.0.0'
2
+ VERSION = '1.1.0'
3
3
  end
data/lib/kudzu.rb CHANGED
@@ -1,8 +1,26 @@
1
+ require 'net/http'
2
+ require 'http-cookie'
3
+ require 'addressable'
4
+ require 'nokogiri'
5
+ require 'shared-mime-info'
6
+ require 'charlock_holmes'
7
+
8
+ require 'kudzu/version'
9
+ require 'kudzu/crawler'
10
+
1
11
  module Kudzu
2
12
  class << self
3
- attr_accessor :adapter
13
+ attr_accessor :adapter, :agent, :logger
14
+
15
+ def log(level, message, error: nil)
16
+ return unless @logger
17
+ if error
18
+ message += " - #{error.class}: #{error.message} at #{error.backtrace.take(5).join("\n")}"
19
+ end
20
+ @logger.send(level, message)
21
+ end
4
22
  end
5
23
  end
6
24
 
7
- require 'kudzu/version'
8
- require 'kudzu/crawler'
25
+ Kudzu.adapter = Kudzu::Adapter::Memory
26
+ Kudzu.agent = Kudzu::Agent
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-20 00:00:00.000000000 Z
11
+ date: 2018-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -174,38 +174,40 @@ files:
174
174
  - README.md
175
175
  - Rakefile
176
176
  - lib/kudzu.rb
177
- - lib/kudzu/adapter/base/all.rb
178
- - lib/kudzu/adapter/base/link.rb
179
- - lib/kudzu/adapter/base/page.rb
180
177
  - lib/kudzu/adapter/memory.rb
181
- - lib/kudzu/adapter/memory/all.rb
182
178
  - lib/kudzu/adapter/memory/frontier.rb
183
179
  - lib/kudzu/adapter/memory/model/link.rb
184
180
  - lib/kudzu/adapter/memory/model/page.rb
185
181
  - lib/kudzu/adapter/memory/repository.rb
182
+ - lib/kudzu/agent.rb
186
183
  - lib/kudzu/agent/all.rb
187
- - lib/kudzu/agent/charset_detector.rb
188
184
  - lib/kudzu/agent/fetcher.rb
189
- - lib/kudzu/agent/filter.rb
190
- - lib/kudzu/agent/mime_type_detector.rb
185
+ - lib/kudzu/agent/http/connection.rb
186
+ - lib/kudzu/agent/http/connection_pool.rb
187
+ - lib/kudzu/agent/page_filterer.rb
188
+ - lib/kudzu/agent/reference.rb
189
+ - lib/kudzu/agent/response.rb
191
190
  - lib/kudzu/agent/robots.rb
191
+ - lib/kudzu/agent/robots/parser.rb
192
+ - lib/kudzu/agent/robots/txt.rb
192
193
  - lib/kudzu/agent/sleeper.rb
193
- - lib/kudzu/agent/title_parser.rb
194
194
  - lib/kudzu/agent/url_extractor.rb
195
- - lib/kudzu/agent/url_filter.rb
195
+ - lib/kudzu/agent/url_filterer.rb
196
+ - lib/kudzu/agent/util/charset_detector.rb
197
+ - lib/kudzu/agent/util/content_type_parser.rb
198
+ - lib/kudzu/agent/util/matcher.rb
199
+ - lib/kudzu/agent/util/mime_type_detector.rb
200
+ - lib/kudzu/agent/util/title_parser.rb
196
201
  - lib/kudzu/callback.rb
197
202
  - lib/kudzu/common.rb
198
203
  - lib/kudzu/config.rb
199
204
  - lib/kudzu/config/filter.rb
200
205
  - lib/kudzu/crawler.rb
201
- - lib/kudzu/logger.rb
202
- - lib/kudzu/revisit/all.rb
203
- - lib/kudzu/revisit/scheduler.rb
204
- - lib/kudzu/util/all.rb
205
- - lib/kudzu/util/connection_pool.rb
206
- - lib/kudzu/util/content_type_parser.rb
207
- - lib/kudzu/util/matcher.rb
208
- - lib/kudzu/util/thread_pool.rb
206
+ - lib/kudzu/model/all.rb
207
+ - lib/kudzu/model/base.rb
208
+ - lib/kudzu/model/link.rb
209
+ - lib/kudzu/model/page.rb
210
+ - lib/kudzu/thread_pool.rb
209
211
  - lib/kudzu/version.rb
210
212
  homepage: https://github.com/kanety/kudzu
211
213
  licenses:
@@ -1,8 +0,0 @@
1
- module Kudzu
2
- module Adapter
3
- module Base
4
- module Link
5
- end
6
- end
7
- end
8
- end
@@ -1,106 +0,0 @@
1
- module Kudzu
2
- module Adapter
3
- module Base
4
- module Page
5
- def last_modified
6
- last_modified = response_header['last-modified']
7
- Time.parse(last_modified).localtime if last_modified
8
- rescue
9
- nil
10
- end
11
-
12
- def etag
13
- response_header['etag']
14
- end
15
-
16
- def html?
17
- !mime_type.to_s.match(%r{text/html|application/xhtml\+xml}).nil?
18
- end
19
-
20
- def xml?
21
- !mime_type.to_s.match(%r{text/xml|application/xml|application/rss\+xml|application/atom\+xml}).nil?
22
- end
23
-
24
- def css?
25
- !mime_type.to_s.match(%r{text/css}).nil?
26
- end
27
-
28
- def js?
29
- !mime_type.to_s.match(%r{text/javascript|application/javascript|application/x-javascript}).nil?
30
- end
31
-
32
- def text?
33
- html? || xml? || !mime_type.to_s.match(%r{text/}).nil?
34
- end
35
-
36
- def status_success?
37
- 200 <= status && status <= 299
38
- end
39
-
40
- def status_redirection?
41
- 300 <= status && status <= 399
42
- end
43
-
44
- def status_client_error?
45
- 400 <= status && status <= 499
46
- end
47
-
48
- def status_server_error?
49
- 500 <= status && status <= 599
50
- end
51
-
52
- def status_not_modified?
53
- status == 304
54
- end
55
-
56
- def status_not_found?
57
- status == 404
58
- end
59
-
60
- def status_gone?
61
- status == 410
62
- end
63
-
64
- def body
65
- @body
66
- end
67
-
68
- def body=(body)
69
- @body = body
70
- end
71
-
72
- def filtered
73
- @filtered
74
- end
75
-
76
- def filtered=(filtered)
77
- @filtered = filtered
78
- end
79
-
80
- def decoded_body
81
- @decoded_body ||= decode_body(body)
82
- end
83
-
84
- private
85
-
86
- def decode_body(body)
87
- if text?
88
- if find_encoding
89
- body.force_encoding(charset).encode('utf-8', invalid: :replace, undef: :replace)
90
- else
91
- body.encode('utf-8', invalid: :replace, undef: :replace)
92
- end
93
- else
94
- body
95
- end
96
- end
97
-
98
- def find_encoding
99
- Encoding.find(charset)
100
- rescue
101
- nil
102
- end
103
- end
104
- end
105
- end
106
- end
@@ -1,3 +0,0 @@
1
- Dir[File.join(__dir__, '**/*.rb')].each do |file|
2
- require_relative file
3
- end