parallel588_polipus 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bb50db8ac7efd7f04d3e044b2e1d708ab2434eb3
4
- data.tar.gz: fa4158852a74a75603d7a6f0e6af1de9b7940dfc
3
+ metadata.gz: bfa6988cb2f60027238d9ed8168b758b3e5d2384
4
+ data.tar.gz: eaf2c566ebea5aaa742f40ab2e2a955f507ccd49
5
5
  SHA512:
6
- metadata.gz: 8e43cec9505d08bceddfe6f64f5a335a8ee12a37fd5f11d1d1c4f20351df651e656b282e5f884259b0cd1df345975be105fa12dd77de52529d7d62c3387bcf38
7
- data.tar.gz: fe3941eaee3fc53104e222cc9761e51bbd2fa8d1eaea02e5c66015fe5a520bc68b43b6537fd0059056988f6674058307bf93e200009ffba045b449a678cff1f7
6
+ metadata.gz: 0dd405e16b033aef6da59d914349e42965a66625e19a9b749ae5c78e874292e1296d6930ed58a135993361902baf75741fdcfad02f7f72364fbf087e839e4875
7
+ data.tar.gz: b55f04c79845c6d74e31e3b26ec683c66825b7d16ef4d9e80333ed39c3de2cfc6509382727dbdb261504535486c865124100e2a61e68e511aa7c00366adc3ec6
@@ -72,7 +72,7 @@ module Polipus
72
72
  u = a['href']
73
73
  next if u.nil? || u.empty?
74
74
  abs = to_absolute(u) rescue next
75
- @links << abs if in_domain?(abs)
75
+ @links << abs if abs && in_domain?(abs)
76
76
  end
77
77
  @links.to_a
78
78
  end
@@ -82,7 +82,10 @@ module Polipus
82
82
  #
83
83
  def doc
84
84
  return @doc if @doc
85
- @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? rescue nil
85
+ @body ||= ''
86
+ @body = @body.encode('utf-8', 'binary', :invalid => :replace,
87
+ :undef => :replace, :replace => '')
88
+ @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
86
89
  end
87
90
 
88
91
  #
@@ -168,10 +171,22 @@ module Polipus
168
171
  def to_absolute(link)
169
172
  return nil if link.nil?
170
173
 
174
+ valid_link = link.to_s.encode('utf-8', 'binary', :invalid => :replace,
175
+ :undef => :replace, :replace => '')
176
+
171
177
  # remove anchor
172
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
178
+ link =
179
+ begin
180
+ URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
181
+ rescue URI::Error
182
+ return nil
183
+ end
173
184
 
174
- relative = URI(link)
185
+ relative = begin
186
+ URI(link)
187
+ rescue URI::Error
188
+ return nil
189
+ end
175
190
  absolute = base ? base.merge(relative) : @url.merge(relative)
176
191
 
177
192
  absolute.path = '/' if absolute.path.empty?
@@ -12,6 +12,9 @@ module Polipus
12
12
  @rethink = options[:conn]
13
13
  @table = options[:table]
14
14
 
15
+ unless @r.db_list.run(@rethink).include?(@rethink.default_db)
16
+ @r.db_create(@rethink.default_db).run(@rethink)
17
+ end
15
18
  unless @r.table_list.run(@rethink).include?(@table)
16
19
  @r.table_create(@table).run(@rethink)
17
20
  @r.table(@table).index_create('created_at')
@@ -1,5 +1,5 @@
1
1
  # encoding: UTF-8
2
2
  module Polipus
3
- VERSION = '0.4.0'
3
+ VERSION = '0.4.1'
4
4
  HOMEPAGE = 'https://github.com/taganaka/polipus'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parallel588_polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-03 00:00:00.000000000 Z
11
+ date: 2015-03-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -345,4 +345,3 @@ signing_key:
345
345
  specification_version: 4
346
346
  summary: Polipus distributed web-crawler framework
347
347
  test_files: []
348
- has_rdoc: