link_scrapper 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +14 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6cb0bc3b1eacf9d8fdff2f6ee057bc411c6da03
|
4
|
+
data.tar.gz: 53c1f9da27e968db305e5fbaae3d3e4480905e90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aa5eac8873d6275b298beaf2663441a530f57389a42794250e137084d1ad2d98705f5ab83db11c0fc71c66e14899e27f066c39985ade3d50486a39a290c5f66
|
7
|
+
data.tar.gz: 821fe2bfba46ed516be52a7803e15a78a2e00e4a656d07e787b1830d4d2b0089b85901a51945505fd719649094a5d06a12aeefd53fb9224b8570fec8931d7055
|
data/lib/link_scrapper.rb
CHANGED
@@ -82,7 +82,7 @@ class LinkScrapper
|
|
82
82
|
if !@external_links[@search_uri.to_sym]
|
83
83
|
begin
|
84
84
|
t1 = Time.now
|
85
|
-
response = Net::HTTP.get_response(URI.parse(
|
85
|
+
response = Net::HTTP.get_response(URI.parse(@search_uri))
|
86
86
|
t2 = Time.now
|
87
87
|
delta = t2 - t1
|
88
88
|
code = response.code
|
@@ -170,7 +170,7 @@ class LinkScrapper
|
|
170
170
|
# gather page request response
|
171
171
|
begin
|
172
172
|
t1 = Time.now
|
173
|
-
response = Net::HTTP.get_response(URI.parse(
|
173
|
+
response = Net::HTTP.get_response(URI.parse(@search_uri.strip))
|
174
174
|
t2 = Time.now
|
175
175
|
delta = t2 - t1
|
176
176
|
|
@@ -184,7 +184,8 @@ class LinkScrapper
|
|
184
184
|
links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
|
185
185
|
|
186
186
|
# update anchors and indirect links to use direct links
|
187
|
-
links_array.
|
187
|
+
links_array.each_with_index { |val, index|
|
188
|
+
skip = 0
|
188
189
|
if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^htt(p|ps):/
|
189
190
|
if val[0][0,3] == "../"
|
190
191
|
val[0][0,3] = ""
|
@@ -197,7 +198,16 @@ class LinkScrapper
|
|
197
198
|
end
|
198
199
|
val[0] = "#{@search_domain}#{val[0]}"
|
199
200
|
end
|
200
|
-
@
|
201
|
+
@links.each { |lnk|
|
202
|
+
if val[0] == lnk[0]
|
203
|
+
skip = 1
|
204
|
+
end
|
205
|
+
}
|
206
|
+
if skip == 0
|
207
|
+
@link_parents[val[0].chomp.to_sym] = @search_uri.strip
|
208
|
+
else
|
209
|
+
val.delete_at(index)
|
210
|
+
end
|
201
211
|
}
|
202
212
|
|
203
213
|
# combine found links with links array
|