link_scrapper 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/link_scrapper.rb +16 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 213970214a5f8e494141c4172373f5ae21190714
|
4
|
+
data.tar.gz: 0ba291b9a887f6fc244ebd25939b342f77c61b46
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 579ab33563beb450bf54e78c88b2b19b4d034a87acb722224b91ddb15e075a527ef05fd2a6dfd7b5a69dac464f616bba161a4dedaaff14285dadfbb8a04e0717
|
7
|
+
data.tar.gz: a7180512da166efd1f9322199e69d454128b159610306289a6575220852a678991fee7922b29fe9fb7ce26833e1ce7178121475d786cd3fdd0a442dec2a87469
|
data/lib/link_scrapper.rb
CHANGED
@@ -19,6 +19,7 @@ class LinkScrapper
|
|
19
19
|
@search_index = 0
|
20
20
|
@search_iteration = 0
|
21
21
|
@links = Array.new
|
22
|
+
@link_parents = Hash.new
|
22
23
|
@checked_links = Hash.new
|
23
24
|
@error_links = Hash.new
|
24
25
|
@external_links = Hash.new
|
@@ -184,9 +185,19 @@ class LinkScrapper
|
|
184
185
|
|
185
186
|
# update anchors and indirect links to use direct links
|
186
187
|
links_array.each { |val|
|
187
|
-
if val[0]
|
188
|
-
val
|
188
|
+
if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^htt(p|ps):/
|
189
|
+
if val[0][0,3] == "../"
|
190
|
+
val[0][0,3] = ""
|
191
|
+
end
|
192
|
+
if val[0][0,2] == "//"
|
193
|
+
val[0][0,2] = ""
|
194
|
+
end
|
195
|
+
if val[0][0] == "/"
|
196
|
+
val[0][0] = ""
|
197
|
+
end
|
198
|
+
val[0] = "#{@search_domain}#{val[0]}"
|
189
199
|
end
|
200
|
+
@link_parents[val[0].chomp.to_sym] = @search_uri.strip
|
190
201
|
}
|
191
202
|
|
192
203
|
# combine found links with links array
|
@@ -200,7 +211,7 @@ class LinkScrapper
|
|
200
211
|
end
|
201
212
|
|
202
213
|
# store results in checked hash
|
203
|
-
@checked_links[@search_uri.to_sym] = {res: code, time: delta}
|
214
|
+
@checked_links[@search_uri.to_sym] = {res: code, time: delta, parent: @link_parents[@search_uri.to_sym]}
|
204
215
|
|
205
216
|
end
|
206
217
|
|
@@ -215,13 +226,13 @@ class LinkScrapper
|
|
215
226
|
# save search results
|
216
227
|
CSV.open('results.csv', 'wb') {|csv|
|
217
228
|
@checked_links.each {|link|
|
218
|
-
csv << [link[0], link[1][:res], link[1][:time]]
|
229
|
+
csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
|
219
230
|
}
|
220
231
|
}
|
221
232
|
# save list of external links
|
222
233
|
CSV.open('external-links.csv', 'wb') {|csv|
|
223
234
|
@external_links.each do |link|
|
224
|
-
csv << [link[0], link[1][:res], link[1][:time]]
|
235
|
+
csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
|
225
236
|
end
|
226
237
|
}
|
227
238
|
# save list of invalid links
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: link_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert McDowell
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-24 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A simple gem for scrapping links within an assigned website. Results
|
14
14
|
for domain, external links, and invalid URLs can be saved as CSVs or returned as
|
@@ -19,7 +19,7 @@ extensions: []
|
|
19
19
|
extra_rdoc_files: []
|
20
20
|
files:
|
21
21
|
- lib/link_scrapper.rb
|
22
|
-
homepage:
|
22
|
+
homepage: https://github.com/Studio-Center/Ruby-Link-Extractor
|
23
23
|
licenses:
|
24
24
|
- MIT
|
25
25
|
metadata: {}
|