link_scrapper 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +16 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 213970214a5f8e494141c4172373f5ae21190714
|
4
|
+
data.tar.gz: 0ba291b9a887f6fc244ebd25939b342f77c61b46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 579ab33563beb450bf54e78c88b2b19b4d034a87acb722224b91ddb15e075a527ef05fd2a6dfd7b5a69dac464f616bba161a4dedaaff14285dadfbb8a04e0717
|
7
|
+
data.tar.gz: a7180512da166efd1f9322199e69d454128b159610306289a6575220852a678991fee7922b29fe9fb7ce26833e1ce7178121475d786cd3fdd0a442dec2a87469
|
data/lib/link_scrapper.rb
CHANGED
@@ -19,6 +19,7 @@ class LinkScrapper
|
|
19
19
|
@search_index = 0
|
20
20
|
@search_iteration = 0
|
21
21
|
@links = Array.new
|
22
|
+
@link_parents = Hash.new
|
22
23
|
@checked_links = Hash.new
|
23
24
|
@error_links = Hash.new
|
24
25
|
@external_links = Hash.new
|
@@ -184,9 +185,19 @@ class LinkScrapper
|
|
184
185
|
|
185
186
|
# update anchors and indirect links to use direct links
|
186
187
|
links_array.each { |val|
|
187
|
-
if val[0]
|
188
|
-
val
|
188
|
+
if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^htt(p|ps):/
|
189
|
+
if val[0][0,3] == "../"
|
190
|
+
val[0][0,3] = ""
|
191
|
+
end
|
192
|
+
if val[0][0,2] == "//"
|
193
|
+
val[0][0,2] = ""
|
194
|
+
end
|
195
|
+
if val[0][0] == "/"
|
196
|
+
val[0][0] = ""
|
197
|
+
end
|
198
|
+
val[0] = "#{@search_domain}#{val[0]}"
|
189
199
|
end
|
200
|
+
@link_parents[val[0].chomp.to_sym] = @search_uri.strip
|
190
201
|
}
|
191
202
|
|
192
203
|
# combine found links with links array
|
@@ -200,7 +211,7 @@ class LinkScrapper
|
|
200
211
|
end
|
201
212
|
|
202
213
|
# store results in checked hash
|
203
|
-
@checked_links[@search_uri.to_sym] = {res: code, time: delta}
|
214
|
+
@checked_links[@search_uri.to_sym] = {res: code, time: delta, parent: @link_parents[@search_uri.to_sym]}
|
204
215
|
|
205
216
|
end
|
206
217
|
|
@@ -215,13 +226,13 @@ class LinkScrapper
|
|
215
226
|
# save search results
|
216
227
|
CSV.open('results.csv', 'wb') {|csv|
|
217
228
|
@checked_links.each {|link|
|
218
|
-
csv << [link[0], link[1][:res], link[1][:time]]
|
229
|
+
csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
|
219
230
|
}
|
220
231
|
}
|
221
232
|
# save list of external links
|
222
233
|
CSV.open('external-links.csv', 'wb') {|csv|
|
223
234
|
@external_links.each do |link|
|
224
|
-
csv << [link[0], link[1][:res], link[1][:time]]
|
235
|
+
csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
|
225
236
|
end
|
226
237
|
}
|
227
238
|
# save list of invalid links
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert McDowell
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A simple gem for scrapping links within an assigned website. Results
|
14
14
|
for domain, external links, and invalid URLs can be saved as CSVs or returned as
|
@@ -19,7 +19,7 @@ extensions: []
|
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
21
|
- lib/link_scrapper.rb
|
22
|
-
homepage:
|
22
|
+
homepage: https://github.com/Studio-Center/Ruby-Link-Extractor
|
23
23
|
licenses:
|
24
24
|
- MIT
|
25
25
|
metadata: {}
|