link_scrapper 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +14 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b6cb0bc3b1eacf9d8fdff2f6ee057bc411c6da03
|
4
|
+
data.tar.gz: 53c1f9da27e968db305e5fbaae3d3e4480905e90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aa5eac8873d6275b298beaf2663441a530f57389a42794250e137084d1ad2d98705f5ab83db11c0fc71c66e14899e27f066c39985ade3d50486a39a290c5f66
|
7
|
+
data.tar.gz: 821fe2bfba46ed516be52a7803e15a78a2e00e4a656d07e787b1830d4d2b0089b85901a51945505fd719649094a5d06a12aeefd53fb9224b8570fec8931d7055
|
data/lib/link_scrapper.rb
CHANGED
@@ -82,7 +82,7 @@ class LinkScrapper
|
|
82
82
|
if !@external_links[@search_uri.to_sym]
|
83
83
|
begin
|
84
84
|
t1 = Time.now
|
85
|
-
response = Net::HTTP.get_response(URI.parse(
|
85
|
+
response = Net::HTTP.get_response(URI.parse(@search_uri))
|
86
86
|
t2 = Time.now
|
87
87
|
delta = t2 - t1
|
88
88
|
code = response.code
|
@@ -170,7 +170,7 @@ class LinkScrapper
|
|
170
170
|
# gather page request response
|
171
171
|
begin
|
172
172
|
t1 = Time.now
|
173
|
-
response = Net::HTTP.get_response(URI.parse(
|
173
|
+
response = Net::HTTP.get_response(URI.parse(@search_uri.strip))
|
174
174
|
t2 = Time.now
|
175
175
|
delta = t2 - t1
|
176
176
|
|
@@ -184,7 +184,8 @@ class LinkScrapper
|
|
184
184
|
links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
|
185
185
|
|
186
186
|
# update anchors and indirect links to use direct links
|
187
|
-
links_array.
|
187
|
+
links_array.each_with_index { |val, index|
|
188
|
+
skip = 0
|
188
189
|
if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^htt(p|ps):/
|
189
190
|
if val[0][0,3] == "../"
|
190
191
|
val[0][0,3] = ""
|
@@ -197,7 +198,16 @@ class LinkScrapper
|
|
197
198
|
end
|
198
199
|
val[0] = "#{@search_domain}#{val[0]}"
|
199
200
|
end
|
200
|
-
@
|
201
|
+
@links.each { |lnk|
|
202
|
+
if val[0] == lnk[0]
|
203
|
+
skip = 1
|
204
|
+
end
|
205
|
+
}
|
206
|
+
if skip == 0
|
207
|
+
@link_parents[val[0].chomp.to_sym] = @search_uri.strip
|
208
|
+
else
|
209
|
+
val.delete_at(index)
|
210
|
+
end
|
201
211
|
}
|
202
212
|
|
203
213
|
# combine found links with links array
|