omni_scrape 0.1.9 → 0.1.9.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +324 -430
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1c091a8e69686b9b06509d534dae916b1de5d6f0
+  data.tar.gz: 69d97456f7d6bcd7233d92d8f1403986531dafe6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9753df86287deb32dc0c5d08fa97608900e57c3c561aaeee05a935c57e4172f1c0f52b2c9f74254c46b66702e3e718a15318c0581b62d8a140db9c5d88593090
+  data.tar.gz: 9ce62aef0e7dd0c861fb7fec00055c76e708602059794a4843cc48f8fe4196a7fd0f1bbfaf337cdc1fa4fea1604494ada28e0d6f5ba764a06c5798fbecd0f3d6
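These published checksums can be re-derived locally if desired: a .gem file is a plain tar archive whose members include metadata.gz, data.tar.gz, and checksums.yaml.gz. A minimal verification sketch in Ruby (the fetch and extract commands in the comment are assumptions based on the version in this diff, not part of the package):

require "digest"
require "yaml"
require "zlib"

# Assumes the gem was fetched and unpacked first, e.g.:
#   gem fetch omni_scrape --version 0.1.9.5
#   tar -xf omni_scrape-0.1.9.5.gem   # yields metadata.gz, data.tar.gz, checksums.yaml.gz
expected = YAML.safe_load(Zlib::GzipReader.open("checksums.yaml.gz", &:read))

# Compare the SHA512 entries above against freshly computed digests.
%w[metadata.gz data.tar.gz].each do |member|
  actual = Digest::SHA512.file(member).hexdigest
  puts "#{member}: #{actual == expected['SHA512'][member] ? 'ok' : 'MISMATCH'}"
end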
data/README.md
CHANGED
@@ -33,7 +33,7 @@ The third is a sub-url for internal links.
 
 Method : Localize
 
-example : OmniScrape.Localize("
+example : OmniScrape.Localize("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")
 
 This method takes three parameters the first should be the url to start at.
 
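Restated as a runnable script, the corrected example looks like this (a sketch based on the README and on the omni_scrape.rb diff below, which with depth 1 writes ./page.html plus a ./pages1/ tree of localized copies):

require "omni_scrape"

# Arguments per the README: start URL, crawl depth, sub-url for internal links.
OmniScrape.Localize(
  "https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games",
  1,
  "https://en.wikipedia.org"
)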
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -1,448 +1,342 @@
 require "omni_scrape/version"
 module OmniScrape

-##########################################################################################
+##########################################################################################

-def CrawlScrape(url, depth, sub_url)
-
-
-
-
-
-page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
-
-links= page.css('a')
+def CrawlScrape(url, depth, sub_url)
+if (depth<0)
+depth=0
+end
+s_depth = depth
+#open the starting page
+page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+#collect all of the links from the page
+links= page.css('a')

-
-refarr=[]
-hrefs = []
-
-links.each do |link|
+#initialize variables
+refarr=[]
+hrefs = []
+#add title and href to arrays for each link
+links.each do |link|
 if(link['href']!=nil && link['href']!="")
-
-end
-
-
-#transfer links to other array
-while(!hrefs.empty?)
-value= hrefs.pop
-
-refarr.push(value)
-
+hrefs.push(link)
+end
+end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-end#begin
-if (fourofour==false)
-#store html from the link with title of the link
-crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
-crfile.puts pagina
-crfile.close
-end#if
-end#if != "-"
-
-end#end for each
-
-end#def crawlscrape
-
-#############################################################################################
-
-def Localize(url, depth, sub_url)
-
-#initialize to extract from user view
-@location = Hash.new
-s_depth = depth
-i_page = 0
-prev_ipage = 0
-link_to_add =""
-if (depth<0)
-depth=0
-end
-#open the starting page
-page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
-#collect all of the links from the page
-links= page.css('a')
-title = page.css('title')
-#initialize variables
-refarr=[]
-hrefs = []
-x=0
-
-#add href to arrays for each link
-links.each do |link|
-if(link['href']!=nil && link['href']!="")
-# puts x
-# puts (link['title'].split.join)
-# x+=1
-hrefs.push(link)
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+end
+#setup for recognition of the end of the array
+refarr.push("-")
+
+#create folder for storing current set of scraped pages
+if (Dir.exist?('./results'+depth.to_s))
+else Dir.mkdir('./results'+depth.to_s)
+end
+#in each link
+check =(refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")#still valid links
+#evaluate whether link is internal or external
+if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+end
+fourofour=false
+
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+end
+#test for a 404
+rescue Exception =>ex

+fourofour=true
+retry
+end
+if (fourofour==false)
+#store html from the link with title of the link
+crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
+crfile.puts pagina
+crfile.close
+end
 end

-
-
-#transfer links to other array
-while(!hrefs.empty?)
-value= hrefs.pop
-refarr.push(value)
-total+=1
+end
+
 end

-
-
-#setup for recognition of the end of the array
-refarr.push("-")
-
-if(depth>0)
-
-#create subdirectory for storing current set of scraped pages
-
-if (Dir.exist?('./pages'+depth.to_s))
-else Dir.mkdir('./pages'+depth.to_s)
-end
-#in each link
-check = (refarr.length-1)
-for i in 0..check
-if(refarr[i]!="-")
-#evaluate whether link is internal or external
-if(refarr[i]['href']!=nil && refarr[i]['href']!="")
-if(refarr[i]['href'].include?('http://'))
-url=refarr[i]['href']
-else
-url=sub_url+refarr[i]['href']
-#puts "external link"
-end#refarr[i]['href'].include?
-end#refarr[i]['href']!=nil
-fourofour=false
+#############################################################################################

-
-if(fourofour==false)
-pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
-end
-#test for a 404
-rescue Exception =>ex
-#puts "got a 404"
-#replace href (no navigation onclick)
-refarr[i]['href'] =""
-fourofour=true
-
-retry
-end #begin
-
-if (fourofour==false)
-#make relevant links reference local files
-if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-
-
-j_depth = s_depth - depth
-appendval = "../"
-clutch = 0
-for r in 1..j_depth
-
-clutch +=1
-end
-if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
-else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
-end
-if (depth == s_depth)
-
-linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
-else
-
-linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
-end
-pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
-if (@location.has_key?(refarr[i]['href']))
-loc = @location[(refarr[i]['href'])]
-sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
-refarr[i]['href'] =sub_loc
-else
-initial_link=refarr[i]['href']
-refarr[i]['href']=linkref
-
-#HERE!!!!!**!*!*@*!!@@***!
-if (depth == s_depth)
-full_link = "../../"+linkref
-else
-full_link = linkref
-end
-@location[initial_link]=full_link
-#puts "working"
-end# @location.haskey
-end #refarr[i]['href']!=""
-
-#trim it down and remove special characters for display
-trimval=refarr[i]['href']
-finval=trimval.gsub!(/[!:\/-]/, '')
-#puts refarr[i]
-if(finval==nil && refarr[i]!=nil)
-finval=refarr[i]
-end #finval == nil
+def Localize(url, depth, sub_url)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-else#<< depth not > 0
-check = (refarr.length-1)
-for i in 0..check
-if (refarr[i]['href']!=nil && refarr[i]['href']!="")
-refarr[i]['href']=""
-end
-end
-end
-
-if (depth == s_depth)
-#store newly generated html/links for current page
-mainpage =File.new('./page.html',"w")
-mainpage.puts page
-mainpage.close
-
-
-else
-#store page from the link in the subdirectory
-puts "page: "
-p_depth = depth +1
-j_depth = s_depth - depth
-appendval = ""
-clutch = 0
-for r in 1..j_depth
-appendval += "../"
-clutch +=1
-end
-clutch -=1
-
-crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
-crfile.puts page
-crfile.close
-
-end
-end #end def Localize
-
-#########################################################################################
-def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
-#open the starting page
-
-if (depth<0)
-depth=0
-end
-page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
-#collect all of the links from the page
-links= page.css('a')
-title = page.css('title')
-#initialize variables
-refarr=[]
-hrefs = []
-x=0
-
-#add href to arrays for each link
-links.each do |link|
+#initialize to extract from user view
+@location = Hash.new
+s_depth = depth
+i_page = 0
+prev_ipage = 0
+link_to_add =""
+if (depth<0)
+depth=0
+end
+#open the starting page
+page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+#collect all of the links from the page
+links= page.css('a')
+title = page.css('title')
+#initialize variables
+refarr=[]
+hrefs = []
+x=0
+
+#add href to arrays for each link
+links.each do |link|
 if(link['href']!=nil && link['href']!="")
-
-# puts (link['title'].split.join)
-# x+=1
-hrefs.push(link)
-
+hrefs.push(link)
 end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+end
+total=0
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+total+=1
+end
+#setup for recognition of the end of the array
+refarr.push("-")
+if(depth>0)
+#create subdirectory for storing current set of scraped pages
+if (Dir.exist?('./pages'+depth.to_s))
+else Dir.mkdir('./pages'+depth.to_s)
+end
+#in each link
+check = (refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")
+#evaluate whether link is internal or external
+if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+if(refarr[i]['href'].include?('http://'))
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+end#refarr[i]['href'].include?
+end#refarr[i]['href']!=nil
+fourofour=false
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+end
+#test for a 404
+rescue Exception =>ex
+#replace href (no navigation onclick)
+refarr[i]['href'] =""
+fourofour=true
+retry
+end
+if (fourofour==false)
+#make relevant links reference local files
+if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+if (depth == s_depth)
+linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+else
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+end
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+if (@location.has_key?(refarr[i]['href']))
+loc = @location[(refarr[i]['href'])]
+sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
+refarr[i]['href'] =sub_loc
+else
+initial_link=refarr[i]['href']
+refarr[i]['href']=linkref
+if (depth == s_depth)
+full_link = "../../"+linkref
+else
+full_link = linkref
+end
+@location[initial_link]=full_link
+end
+end
+#trim it down and remove special characters for display
+trimval=refarr[i]['href']
+finval=trimval.gsub!(/[!:\/-]/, '')
+if(finval==nil && refarr[i]!=nil)
+finval=refarr[i]
+end
+n_depth = depth-1
+if(finval!=nil)
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+end
+end
+end
 end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+else
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+refarr[i]['href']=""
+end
+end
+end
+if (depth == s_depth)
+#store newly generated html/links for current page
+mainpage =File.new('./page.html',"w")
+mainpage.puts page
+mainpage.close
+else
+#store page from the link in the subdirectory
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
+crfile.close
+end
+end
+#########################################################################################
+def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
+#open the starting page
+if (depth<0)
+depth=0
+end
+page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+#collect all of the links from the page
+links= page.css('a')
+title = page.css('title')
+#initialize variables
+refarr=[]
+hrefs = []
+x=0
+#add href to arrays for each link
+links.each do |link|
+if(link['href']!=nil && link['href']!="")
+hrefs.push(link)
+end
+end
+total=0
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+total+=1
+end
+#setup for recognition of the end of the array
+refarr.push("-")
+if(depth>0)
+#create subdirectory for storing current set of scraped pages
+if (Dir.exist?('./pages'+depth.to_s))
+else Dir.mkdir('./pages'+depth.to_s)
+end
+#in each link
+check = (refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")
+#evaluate whether link is internal or external
+if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+if(refarr[i]['href'].include?('http://'))
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+end
+end
+fourofour=false
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+end
+#test for a 404
+rescue Exception =>ex
+#replace href (no navigation onclick)
+refarr[i]['href'] =""
+fourofour=true
+retry
+end
+if (fourofour==false)
+#make relevant links reference local files
+if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+if (@location.has_key?(refarr[i]['href']))
+pass_a_link = "this_is_a_duplicate"
+refarr[i]['href'] = @location[(refarr[i]['href'])]
+else
+initial_link=refarr[i]['href']
+refarr[i]['href']=linkref
+full_link = linkref
+@location[initial_link]=linkref
+end
+end
+#trim it down and remove special characters for display
+trimval=refarr[i]['href']
+finval=trimval.gsub!(/[!:\/-]/, '')
+if(finval==nil && refarr[i]!=nil)
+finval=refarr[i]
+end
+n_depth = depth-1
+if(finval!=nil)
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+end
+end
+end
+end
+else
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+refarr[i]['href']=""
+end
+end
+end
+if (depth == s_depth)
+#store newly generated html/links for current page
+mainpage =File.new('./page.html',"w")
+mainpage.puts page
+mainpage.close
+else
+#store page from the link in the subdirectory
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+if (link_to_add!="this_is_a_duplicate")
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
+crfile.close
+else
+end
+end
+end
+#########################################################################################
 end
-end #end def FLocalize
-
-#########################################################################################
-
-end#module
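The substantive change in this release, beyond folding the guard clauses and comments into CrawlScrape, is duplicate handling in FLocalize (new lines 286-288 and 333 above): the first local file written for an href is recorded in the @location hash, and later occurrences reuse that path while the sentinel string "this_is_a_duplicate" is passed down the recursion so the same page is not written twice. A standalone sketch of that pattern (simplified names, not the gem's API):

# Remember where each href was first saved; repeats reuse that path and are
# flagged with the sentinel so the caller skips writing the file again.
location = {}
hrefs = ["/wiki/A", "/wiki/B", "/wiki/A"]

hrefs.each_with_index do |href, i|
  if location.key?(href)
    link_to_add = "this_is_a_duplicate"
    local_path = location[href]
  else
    local_path = "./pages1/0set/0x#{i}page.html"
    location[href] = local_path
    link_to_add = "0x#{i}page.html"
  end
  puts "#{href} -> #{local_path} (#{link_to_add})"
end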