parallel588_polipus 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bb50db8ac7efd7f04d3e044b2e1d708ab2434eb3
4
- data.tar.gz: fa4158852a74a75603d7a6f0e6af1de9b7940dfc
3
+ metadata.gz: bfa6988cb2f60027238d9ed8168b758b3e5d2384
4
+ data.tar.gz: eaf2c566ebea5aaa742f40ab2e2a955f507ccd49
5
5
  SHA512:
6
- metadata.gz: 8e43cec9505d08bceddfe6f64f5a335a8ee12a37fd5f11d1d1c4f20351df651e656b282e5f884259b0cd1df345975be105fa12dd77de52529d7d62c3387bcf38
7
- data.tar.gz: fe3941eaee3fc53104e222cc9761e51bbd2fa8d1eaea02e5c66015fe5a520bc68b43b6537fd0059056988f6674058307bf93e200009ffba045b449a678cff1f7
6
+ metadata.gz: 0dd405e16b033aef6da59d914349e42965a66625e19a9b749ae5c78e874292e1296d6930ed58a135993361902baf75741fdcfad02f7f72364fbf087e839e4875
7
+ data.tar.gz: b55f04c79845c6d74e31e3b26ec683c66825b7d16ef4d9e80333ed39c3de2cfc6509382727dbdb261504535486c865124100e2a61e68e511aa7c00366adc3ec6
@@ -72,7 +72,7 @@ module Polipus
72
72
  u = a['href']
73
73
  next if u.nil? || u.empty?
74
74
  abs = to_absolute(u) rescue next
75
- @links << abs if in_domain?(abs)
75
+ @links << abs if abs && in_domain?(abs)
76
76
  end
77
77
  @links.to_a
78
78
  end
@@ -82,7 +82,10 @@ module Polipus
82
82
  #
83
83
  def doc
84
84
  return @doc if @doc
85
- @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? rescue nil
85
+ @body ||= ''
86
+ @body = @body.encode('utf-8', 'binary', :invalid => :replace,
87
+ :undef => :replace, :replace => '')
88
+ @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
86
89
  end
87
90
 
88
91
  #
@@ -168,10 +171,22 @@ module Polipus
168
171
  def to_absolute(link)
169
172
  return nil if link.nil?
170
173
 
174
+ valid_link = link.to_s.encode('utf-8', 'binary', :invalid => :replace,
175
+ :undef => :replace, :replace => '')
176
+
171
177
  # remove anchor
172
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
178
+ link =
179
+ begin
180
+ URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
181
+ rescue URI::Error
182
+ return nil
183
+ end
173
184
 
174
- relative = URI(link)
185
+ relative = begin
186
+ URI(link)
187
+ rescue URI::Error
188
+ return nil
189
+ end
175
190
  absolute = base ? base.merge(relative) : @url.merge(relative)
176
191
 
177
192
  absolute.path = '/' if absolute.path.empty?
@@ -12,6 +12,9 @@ module Polipus
12
12
  @rethink = options[:conn]
13
13
  @table = options[:table]
14
14
 
15
+ unless @r.db_list.run(@rethink).include?(@rethink.default_db)
16
+ @r.db_create(@rethink.default_db).run(@rethink)
17
+ end
15
18
  unless @r.table_list.run(@rethink).include?(@table)
16
19
  @r.table_create(@table).run(@rethink)
17
20
  @r.table(@table).index_create('created_at')
@@ -1,5 +1,5 @@
1
1
  # encoding: UTF-8
2
2
  module Polipus
3
- VERSION = '0.4.0'
3
+ VERSION = '0.4.1'
4
4
  HOMEPAGE = 'https://github.com/taganaka/polipus'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parallel588_polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-03 00:00:00.000000000 Z
11
+ date: 2015-03-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -345,4 +345,3 @@ signing_key:
345
345
  specification_version: 4
346
346
  summary: Polipus distributed web-crawler framework
347
347
  test_files: []
348
- has_rdoc: