omni_scrape 0.1.9 → 0.1.9.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: f44e8258cf10e287c2c9f089796197b111d98855
- data.tar.gz: b34419efbbeef99c68e365dcbf7d162d9fe525e1
+ metadata.gz: 1c091a8e69686b9b06509d534dae916b1de5d6f0
+ data.tar.gz: 69d97456f7d6bcd7233d92d8f1403986531dafe6
  SHA512:
- metadata.gz: 9f8fcca1689244e860ad4ffb6639a0daf1b6f9e02ef59362a0a5f03deb987e49029ca354571b5b183842be6bc2389a60c52d61d74c010c521b66566880d2afcc
- data.tar.gz: 2ebec8762d455b02c54fb1ce7770284f9a87d1c9ec5cd4b643420301a4dca88961ab160cb7e26f72ee640e4a446d3f8ed9b7030b7d7f38069dbdb85f9310d879
+ metadata.gz: 9753df86287deb32dc0c5d08fa97608900e57c3c561aaeee05a935c57e4172f1c0f52b2c9f74254c46b66702e3e718a15318c0581b62d8a140db9c5d88593090
+ data.tar.gz: 9ce62aef0e7dd0c861fb7fec00055c76e708602059794a4843cc48f8fe4196a7fd0f1bbfaf337cdc1fa4fea1604494ada28e0d6f5ba764a06c5798fbecd0f3d6
data/README.md CHANGED
@@ -33,7 +33,7 @@ The third is a sub-url for internal links.

  Method : Localize

- example : OmniScrape.Localize("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "http://en.wikipedia.org")
+ example : OmniScrape.Localize("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")

  This method takes three parameters the first should be the url to start at.

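For convenience, the documented call pulled out of the diff above. A minimal usage sketch, assuming the gem is installed (per the source below it relies on Nokogiri and open-uri at runtime); the Wikipedia URL is just the README's illustration:

    require "omni_scrape"

    # Arguments, per the README: the URL to start at, the crawl depth, and
    # the sub-url prepended to internal (relative) links.
    OmniScrape.Localize(
      "https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games",
      1,
      "https://en.wikipedia.org"
    )
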
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
  module OmniScrape
- VERSION = "0.1.9"
+ VERSION = "0.1.9.5"
  end
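The only change in this file is the version constant. After updating, the bump can be confirmed with a one-liner (a sketch, assuming the gem is installed):

    require "omni_scrape/version"

    puts OmniScrape::VERSION   # => "0.1.9.5" for this release
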
data/lib/omni_scrape.rb CHANGED
@@ -1,448 +1,342 @@
  require "omni_scrape/version"
  module OmniScrape

- ##########################################################################################
+ ##########################################################################################

- def CrawlScrape(url, depth, sub_url)
- if (depth<0)
- depth=0
- end#if
- s_depth = depth #true
- #open the starting page
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #good
- #collect all of the links from the page
- links= page.css('a') #good
+ def CrawlScrape(url, depth, sub_url)
+ if (depth<0)
+ depth=0
+ end
+ s_depth = depth
+ #open the starting page
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')

- #initialize variables
- refarr=[]
- hrefs = []
- #add title and href to arrays for each link
- links.each do |link|
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ #add title and href to arrays for each link
+ links.each do |link|
  if(link['href']!=nil && link['href']!="")
- hrefs.push(link)
- end#if
- end#do
-
- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
-
- refarr.push(value)
-
+ hrefs.push(link)
+ end
+ end

- end#while
- #setup for recognition of the end of the array
- refarr.push("-")
-
- #create folder for storing current set of scraped pages
-
- if (Dir.exist?('./results'+depth.to_s))
- else Dir.mkdir('./results'+depth.to_s)
- end#if
- #in each link
- check =(refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")#still valid links
- #evaluate whether link is internal or external
- if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- end#if include?
- fourofour=false
-
- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- end#if
- #test for a 404
- rescue Exception =>ex
-
- fourofour=true
- retry
- end#begin
- if (fourofour==false)
- #store html from the link with title of the link
- crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
- crfile.puts pagina
- crfile.close
- end#if
- end#if != "-"
-
- end#end for each
-
- end#def crawlscrape
-
- #############################################################################################
-
- def Localize(url, depth, sub_url)
-
- #initialize to extract from user view
- @location = Hash.new
- s_depth = depth
- i_page = 0
- prev_ipage = 0
- link_to_add =""
- if (depth<0)
- depth=0
- end
- #open the starting page
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- #collect all of the links from the page
- links= page.css('a')
- title = page.css('title')
- #initialize variables
- refarr=[]
- hrefs = []
- x=0
-
- #add href to arrays for each link
- links.each do |link|
- if(link['href']!=nil && link['href']!="")
- # puts x
- # puts (link['title'].split.join)
- # x+=1
- hrefs.push(link)
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ refarr.push(value)
+ end
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ #create folder for storing current set of scraped pages
+ if (Dir.exist?('./results'+depth.to_s))
+ else Dir.mkdir('./results'+depth.to_s)
+ end
+ #in each link
+ check =(refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")#still valid links
+ #evaluate whether link is internal or external
+ if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ end
+ fourofour=false
+
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ #test for a 404
+ rescue Exception =>ex

+ fourofour=true
+ retry
+ end
+ if (fourofour==false)
+ #store html from the link with title of the link
+ crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
+ crfile.puts pagina
+ crfile.close
+ end
  end

- end
- total=0
- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
- refarr.push(value)
- total+=1
+ end
+
  end

-
-
- #setup for recognition of the end of the array
- refarr.push("-")
-
- if(depth>0)
-
- #create subdirectory for storing current set of scraped pages
-
- if (Dir.exist?('./pages'+depth.to_s))
- else Dir.mkdir('./pages'+depth.to_s)
- end
- #in each link
- check = (refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")
- #evaluate whether link is internal or external
- if(refarr[i]['href']!=nil && refarr[i]['href']!="")
- if(refarr[i]['href'].include?('http://'))
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- #puts "external link"
- end#refarr[i]['href'].include?
- end#refarr[i]['href']!=nil
- fourofour=false
+ #############################################################################################

- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- end
- #test for a 404
- rescue Exception =>ex
- #puts "got a 404"
- #replace href (no navigation onclick)
- refarr[i]['href'] =""
- fourofour=true
-
- retry
- end #begin
-
- if (fourofour==false)
- #make relevant links reference local files
- if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-
-
- j_depth = s_depth - depth
- appendval = "../"
- clutch = 0
- for r in 1..j_depth
-
- clutch +=1
- end
- if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
- else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
- end
- if (depth == s_depth)
-
- linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
- else
-
- linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
- end
- pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
- if (@location.has_key?(refarr[i]['href']))
- loc = @location[(refarr[i]['href'])]
- sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
- refarr[i]['href'] =sub_loc
- else
- initial_link=refarr[i]['href']
- refarr[i]['href']=linkref
-
- #HERE!!!!!**!*!*@*!!@@***!
- if (depth == s_depth)
- full_link = "../../"+linkref
- else
- full_link = linkref
- end
- @location[initial_link]=full_link
- #puts "working"
- end# @location.haskey
- end #refarr[i]['href']!=""
-
- #trim it down and remove special characters for display
- trimval=refarr[i]['href']
- finval=trimval.gsub!(/[!:\/-]/, '')
- #puts refarr[i]
- if(finval==nil && refarr[i]!=nil)
- finval=refarr[i]
- end #finval == nil
+ def Localize(url, depth, sub_url)

- n_depth = depth-1
-
- if(finval!=nil)
- self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
- #create subdirectory for storing current links page
- #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
- #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
- #end
-
-
-
-
- end #finval!=nil
- end #fourofour==false
- end #refarr[i]!="-"
-
- end#end for each
-
-
-
-
- else#<< depth not > 0
- check = (refarr.length-1)
- for i in 0..check
- if (refarr[i]['href']!=nil && refarr[i]['href']!="")
- refarr[i]['href']=""
- end
- end
- end
-
- if (depth == s_depth)
- #store newly generated html/links for current page
- mainpage =File.new('./page.html',"w")
- mainpage.puts page
- mainpage.close
-
-
- else
- #store page from the link in the subdirectory
- puts "page: "
- p_depth = depth +1
- j_depth = s_depth - depth
- appendval = ""
- clutch = 0
- for r in 1..j_depth
- appendval += "../"
- clutch +=1
- end
- clutch -=1
-
- crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
- crfile.puts page
- crfile.close
-
- end
- end #end def Localize
-
- #########################################################################################
- def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
- #open the starting page
-
- if (depth<0)
- depth=0
- end
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- #collect all of the links from the page
- links= page.css('a')
- title = page.css('title')
- #initialize variables
- refarr=[]
- hrefs = []
- x=0
-
- #add href to arrays for each link
- links.each do |link|
+ #initialize to extract from user view
+ @location = Hash.new
+ s_depth = depth
+ i_page = 0
+ prev_ipage = 0
+ link_to_add =""
+ if (depth<0)
+ depth=0
+ end
+ #open the starting page
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ x=0
+
+ #add href to arrays for each link
+ links.each do |link|
  if(link['href']!=nil && link['href']!="")
- # puts x
- # puts (link['title'].split.join)
- # x+=1
- hrefs.push(link)
-
+ hrefs.push(link)
  end
-
- end
- total=0
- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
- refarr.push(value)
- total+=1
- end
-
-
-
- #setup for recognition of the end of the array
- refarr.push("-")
-
- if(depth>0)
-
- #create subdirectory for storing current set of scraped pages
-
- if (Dir.exist?('./pages'+depth.to_s))
- else Dir.mkdir('./pages'+depth.to_s)
- end
- #in each link
- check = (refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")
- #evaluate whether link is internal or external
- if(refarr[i]['href']!=nil && refarr[i]['href']!="")
- if(refarr[i]['href'].include?('http://'))
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- #puts "external link"
- end#refarr[i]['href'].include?
- end#refarr[i]['href']!=nil
- fourofour=false
-
- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ total=0
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ refarr.push(value)
+ total+=1
+ end
+ #setup for recognition of the end of the array
+ refarr.push("-")
+ if(depth>0)
+ #create subdirectory for storing current set of scraped pages
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('http://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ end#refarr[i]['href'].include?
+ end#refarr[i]['href']!=nil
+ fourofour=false
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ #test for a 404
+ rescue Exception =>ex
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+ retry
+ end
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+ if (depth == s_depth)
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ else
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ end
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location.has_key?(refarr[i]['href']))
+ loc = @location[(refarr[i]['href'])]
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
+ refarr[i]['href'] =sub_loc
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+ if (depth == s_depth)
+ full_link = "../../"+linkref
+ else
+ full_link = linkref
+ end
+ @location[initial_link]=full_link
+ end
+ end
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end
+ n_depth = depth-1
+ if(finval!=nil)
+ self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+ end
+ end
+ end
  end
- #test for a 404
- rescue Exception =>ex
- #puts "got a 404"
- #replace href (no navigation onclick)
- refarr[i]['href'] =""
- fourofour=true
-
- retry
- end #begin
-
- if (fourofour==false)
- #make relevant links reference local files
- if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-
-
- j_depth = s_depth - depth
- appendval = "../"
- clutch = 0
- for r in 1..j_depth
-
- clutch +=1
- end
- if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
- else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
- end
-
- linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
-
- pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
- if (@location.has_key?(refarr[i]['href']))
- pass_a_link = "this_is_a_duplicate"
- refarr[i]['href'] = @location[(refarr[i]['href'])]
-
- else
- initial_link=refarr[i]['href']
- refarr[i]['href']=linkref
-
-
-
- full_link = linkref
-
- @location[initial_link]=linkref
- #puts "working"
- end# @location.haskey
- end #refarr[i]['href']!=""
-
-
- #trim it down and remove special characters for display
- trimval=refarr[i]['href']
- finval=trimval.gsub!(/[!:\/-]/, '')
- #puts refarr[i]
- if(finval==nil && refarr[i]!=nil)
- finval=refarr[i]
- end #finval == nil
-
- n_depth = depth-1
-
- if(finval!=nil)
- self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
-
-
-
-
- end #finval!=nil
- end #fourofour==false
- end #refarr[i]!="-"
-
- end#end for each
-
-
-
-
- else#<< depth not > 0
- check = (refarr.length-1)
- for i in 0..check
- if (refarr[i]['href']!=nil && refarr[i]['href']!="")
- refarr[i]['href']=""
-
- end
- end
- end
-
- if (depth == s_depth)
- #store newly generated html/links for current page
- mainpage =File.new('./page.html',"w")
- mainpage.puts page
- mainpage.close
-
-
- else
- #store page from the link in the subdirectory
-
- p_depth = depth +1
- j_depth = s_depth - depth
- appendval = ""
- clutch = 0
- for r in 1..j_depth
- appendval += "../"
- clutch +=1
- end
- clutch -=1
-
- if (link_to_add!="this_is_a_duplicate")
-
- crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
- crfile.puts page
- crfile.close
- else
-
- end
-
+ else
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+ end
+ end
+ end
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+ else
+ #store page from the link in the subdirectory
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+ end
+ end
+ #########################################################################################
+ def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
+ #open the starting page
+ if (depth<0)
+ depth=0
+ end
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ x=0
+ #add href to arrays for each link
+ links.each do |link|
+ if(link['href']!=nil && link['href']!="")
+ hrefs.push(link)
+ end
+ end
+ total=0
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ refarr.push(value)
+ total+=1
+ end
+ #setup for recognition of the end of the array
+ refarr.push("-")
+ if(depth>0)
+ #create subdirectory for storing current set of scraped pages
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('http://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ end
+ end
+ fourofour=false
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ #test for a 404
+ rescue Exception =>ex
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+ retry
+ end
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location.has_key?(refarr[i]['href']))
+ pass_a_link = "this_is_a_duplicate"
+ refarr[i]['href'] = @location[(refarr[i]['href'])]
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+ full_link = linkref
+ @location[initial_link]=linkref
+ end
+ end
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end
+ n_depth = depth-1
+ if(finval!=nil)
+ self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+ end
+ end
+ end
+ end
+ else
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+ end
+ end
+ end
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+ else
+ #store page from the link in the subdirectory
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+ if (link_to_add!="this_is_a_duplicate")
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+ else
+ end
+ end
+ end
+ #########################################################################################
  end
- end #end def FLocalize
-
- #########################################################################################
-
- end#module
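All three methods above share the same fetch-and-skip-on-404 idiom: a fourofour flag guarding the open inside begin/rescue/retry. The sketch below isolates that pattern and spells out the requires it implicitly depends on; the URL is a placeholder, and URI.open / StandardError stand in for the bare open and the broader rescue Exception used in the gem:

    require "open-uri"
    require "openssl"
    require "nokogiri"

    url = "https://example.com/"   # placeholder; the gem derives this from each <a> href

    fourofour = false
    begin
      # Guarded fetch: skipped on the second pass through the begin block.
      if fourofour == false
        pagina = Nokogiri::HTML(URI.open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE))
      end
    rescue StandardError
      # On a failed fetch, set the flag and retry; the retry skips the open,
      # so control falls through with fourofour still true and the page is
      # simply not saved. Exactly one retry can occur per link.
      fourofour = true
      retry
    end

    puts pagina.css("title").text unless fourofour
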
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: omni_scrape
  version: !ruby/object:Gem::Version
- version: 0.1.9
+ version: 0.1.9.5
  platform: ruby
  authors:
  - Bradley Maynard