parallel588_polipus 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/polipus/page.rb +19 -4
- data/lib/polipus/storage/rethink_store.rb +3 -0
- data/lib/polipus/version.rb +1 -1
- metadata +2 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bfa6988cb2f60027238d9ed8168b758b3e5d2384
|
|
4
|
+
data.tar.gz: eaf2c566ebea5aaa742f40ab2e2a955f507ccd49
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0dd405e16b033aef6da59d914349e42965a66625e19a9b749ae5c78e874292e1296d6930ed58a135993361902baf75741fdcfad02f7f72364fbf087e839e4875
|
|
7
|
+
data.tar.gz: b55f04c79845c6d74e31e3b26ec683c66825b7d16ef4d9e80333ed39c3de2cfc6509382727dbdb261504535486c865124100e2a61e68e511aa7c00366adc3ec6
|
data/lib/polipus/page.rb
CHANGED
|
@@ -72,7 +72,7 @@ module Polipus
|
|
|
72
72
|
u = a['href']
|
|
73
73
|
next if u.nil? || u.empty?
|
|
74
74
|
abs = to_absolute(u) rescue next
|
|
75
|
-
@links << abs if in_domain?(abs)
|
|
75
|
+
@links << abs if abs && in_domain?(abs)
|
|
76
76
|
end
|
|
77
77
|
@links.to_a
|
|
78
78
|
end
|
|
@@ -82,7 +82,10 @@ module Polipus
|
|
|
82
82
|
#
|
|
83
83
|
def doc
|
|
84
84
|
return @doc if @doc
|
|
85
|
-
@
|
|
85
|
+
@body ||= ''
|
|
86
|
+
@body = @body.encode('utf-8', 'binary', :invalid => :replace,
|
|
87
|
+
:undef => :replace, :replace => '')
|
|
88
|
+
@doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html?
|
|
86
89
|
end
|
|
87
90
|
|
|
88
91
|
#
|
|
@@ -168,10 +171,22 @@ module Polipus
|
|
|
168
171
|
def to_absolute(link)
|
|
169
172
|
return nil if link.nil?
|
|
170
173
|
|
|
174
|
+
valid_link = link.to_s.encode('utf-8', 'binary', :invalid => :replace,
|
|
175
|
+
:undef => :replace, :replace => '')
|
|
176
|
+
|
|
171
177
|
# remove anchor
|
|
172
|
-
link =
|
|
178
|
+
link =
|
|
179
|
+
begin
|
|
180
|
+
URI.encode(URI.decode(valid_link.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
|
181
|
+
rescue URI::Error
|
|
182
|
+
return nil
|
|
183
|
+
end
|
|
173
184
|
|
|
174
|
-
relative =
|
|
185
|
+
relative = begin
|
|
186
|
+
URI(link)
|
|
187
|
+
rescue URI::Error
|
|
188
|
+
return nil
|
|
189
|
+
end
|
|
175
190
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
|
176
191
|
|
|
177
192
|
absolute.path = '/' if absolute.path.empty?
|
data/lib/polipus/storage/rethink_store.rb
CHANGED
|
@@ -12,6 +12,9 @@ module Polipus
|
|
|
12
12
|
@rethink = options[:conn]
|
|
13
13
|
@table = options[:table]
|
|
14
14
|
|
|
15
|
+
unless @r.db_list.run(@rethink).include?(@rethink.default_db)
|
|
16
|
+
@r.db_create(@rethink.default_db).run(@rethink)
|
|
17
|
+
end
|
|
15
18
|
unless @r.table_list.run(@rethink).include?(@table)
|
|
16
19
|
@r.table_create(@table).run(@rethink)
|
|
17
20
|
@r.table(@table).index_create('created_at')
|
data/lib/polipus/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parallel588_polipus
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.4.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Francesco Laurita
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-03-
|
|
11
|
+
date: 2015-03-19 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: nokogiri
|
|
@@ -345,4 +345,3 @@ signing_key:
|
|
|
345
345
|
specification_version: 4
|
|
346
346
|
summary: Polipus distributed web-crawler framework
|
|
347
347
|
test_files: []
|
|
348
|
-
has_rdoc:
|