parallel588_polipus 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/polipus/page.rb +19 -4
- data/lib/polipus/storage/rethink_store.rb +3 -0
- data/lib/polipus/version.rb +1 -1
- metadata +2 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfa6988cb2f60027238d9ed8168b758b3e5d2384
|
4
|
+
data.tar.gz: eaf2c566ebea5aaa742f40ab2e2a955f507ccd49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0dd405e16b033aef6da59d914349e42965a66625e19a9b749ae5c78e874292e1296d6930ed58a135993361902baf75741fdcfad02f7f72364fbf087e839e4875
|
7
|
+
data.tar.gz: b55f04c79845c6d74e31e3b26ec683c66825b7d16ef4d9e80333ed39c3de2cfc6509382727dbdb261504535486c865124100e2a61e68e511aa7c00366adc3ec6
|
data/lib/polipus/page.rb
CHANGED
@@ -72,7 +72,7 @@ module Polipus
|
|
72
72
|
u = a['href']
|
73
73
|
next if u.nil? || u.empty?
|
74
74
|
abs = to_absolute(u) rescue next
|
75
|
-
@links << abs if in_domain?(abs)
|
75
|
+
@links << abs if abs && in_domain?(abs)
|
76
76
|
end
|
77
77
|
@links.to_a
|
78
78
|
end
|
@@ -82,7 +82,10 @@ module Polipus
|
|
82
82
|
#
|
83
83
|
def doc
|
84
84
|
return @doc if @doc
|
85
|
-
@
|
85
|
+
@body ||= ''
|
86
|
+
@body = @body.encode('utf-8', 'binary', :invalid => :replace,
|
87
|
+
:undef => :replace, :replace => '')
|
88
|
+
@doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
|
86
89
|
end
|
87
90
|
|
88
91
|
#
|
@@ -168,10 +171,22 @@ module Polipus
|
|
168
171
|
def to_absolute(link)
|
169
172
|
return nil if link.nil?
|
170
173
|
|
174
|
+
valid_link = link.to_s.encode('utf-8', 'binary', :invalid => :replace,
|
175
|
+
:undef => :replace, :replace => '')
|
176
|
+
|
171
177
|
# remove anchor
|
172
|
-
link =
|
178
|
+
link =
|
179
|
+
begin
|
180
|
+
URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
181
|
+
rescue URI::Error
|
182
|
+
return nil
|
183
|
+
end
|
173
184
|
|
174
|
-
relative =
|
185
|
+
relative = begin
|
186
|
+
URI(link)
|
187
|
+
rescue URI::Error
|
188
|
+
return nil
|
189
|
+
end
|
175
190
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
176
191
|
|
177
192
|
absolute.path = '/' if absolute.path.empty?
|
data/lib/polipus/storage/rethink_store.rb
CHANGED
@@ -12,6 +12,9 @@ module Polipus
|
|
12
12
|
@rethink = options[:conn]
|
13
13
|
@table = options[:table]
|
14
14
|
|
15
|
+
unless @r.db_list.run(@rethink).include?(@rethink.default_db)
|
16
|
+
@r.db_create(@rethink.default_db).run(@rethink)
|
17
|
+
end
|
15
18
|
unless @r.table_list.run(@rethink).include?(@table)
|
16
19
|
@r.table_create(@table).run(@rethink)
|
17
20
|
@r.table(@table).index_create('created_at')
|
data/lib/polipus/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parallel588_polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -345,4 +345,3 @@ signing_key:
|
|
345
345
|
specification_version: 4
|
346
346
|
summary: Polipus distributed web-crawler framework
|
347
347
|
test_files: []
|
348
|
-
has_rdoc:
|