omni_scrape 0.1.9.9 → 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 16691660248616c512cfd9584c4427d8d1dbeeab
4
- data.tar.gz: 9bed0a9bbeee104d330a59d119b23d4ce0c08966
3
+ metadata.gz: a174480d89141b2c9bf07b87599b0672d2c393ef
4
+ data.tar.gz: d6582a817b2ce58de26030271f9a78df97c0cbc4
5
5
  SHA512:
6
- metadata.gz: a0770d6acc779099924a7a3ad9a2ca3748e727742964859ede575a8d2899ff166a97b5a5e83da10d53b7ad57c9145f203902700608c8fae914cc40fd310deed1
7
- data.tar.gz: f012b22b16276f3c1ee06ed8d2b654f2aa07ccf023035fb7ae51a5f4d249049d22b8ecb0f6bd243c3fc250f60099b4fb51a97e048cc45c0c84a11395b7a826de
6
+ metadata.gz: 09e4b0378c182c7c2cf31772e2f3668f635ac8c6480b8a23f1d9ceb3aa4eb267548711a339f0bfdf2d905be78df5da3baa2fa897f1afbb4feedd9120499a71c1
7
+ data.tar.gz: 945ff7d6d194e940b96972dbe7c6296897cd15f794ca5dcbea5f7d3d453c9f5e8ac84032ca6fbfb60a04a1f391f2a8b1a2767edefd87f4e3f64b4310aacb949b
data/README.md CHANGED
@@ -59,6 +59,30 @@ The fourth is a css selector for what parts of all pages you want to take the li
59
59
 
60
60
  description: Localize_CSS offers the same service that Localize provides while at the same time giving you the option to limit the result set using a css selector.
61
61
 
62
+ Method : Localize_IN
63
+
64
+ example : OmniScrape.Localize_IN("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")
65
+
66
+ This will perform the same actions as Localize, but only for internal links (links whose href does not contain "://").
67
+
68
+ Method : Localize_EX
69
+
70
+ example : OmniScrape.Localize_EX("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")
71
+
72
+ This will perform the same actions as Localize, but only for external links (links whose href contains "://").
73
+
74
+ Method : Localize_IN_CSS
75
+
76
+ example : OmniScrape.Localize_IN_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable")
77
+
78
+ This will perform the same actions as Localize_CSS, but only for internal links (links whose href does not contain "://").
79
+
80
+ Method : Localize_EX_CSS
81
+
82
+ example : OmniScrape.Localize_EX_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable") (NOTE: this wikitable contains no external links.)
83
+
84
+ This will perform the same actions as Localize_CSS, but only for external links.
85
+
62
86
  ## Contributing
63
87
 
64
88
  1. Fork it ( https://github.com/bmaynard1991/omni-scrape )
data/lib/omni_scrape.rb CHANGED
@@ -64,8 +64,17 @@ for i in 0..check
64
64
  if (fourofour==false)
65
65
  #store html from the link with title of the link
66
66
  crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
67
- crfile.puts pagina
68
- crfile.close
67
+ encodingissue=false
68
+ begin
69
+ if(encodingissue==false)
70
+ crfile.puts page
71
+ end
72
+ rescue
73
+ encodingissue=true
74
+ retry
75
+
76
+ end
77
+ crfile.close
69
78
  end#if
70
79
  end#if != "-"
71
80
 
@@ -133,7 +142,7 @@ end
133
142
  if(refarr[i]!="-")
134
143
  #evaluate whether link is internal or external
135
144
  if(refarr[i]['href']!=nil && refarr[i]['href']!="")
136
- if(refarr[i]['href'].include?('http://'))
145
+ if(refarr[i]['href'].include?('://'))
137
146
  url=refarr[i]['href']
138
147
  else
139
148
  url=sub_url+refarr[i]['href']
@@ -257,7 +266,16 @@ else
257
266
  clutch -=1
258
267
 
259
268
  crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
269
+ encodingissue=false
270
+ begin
271
+ if(encodingissue==false)
260
272
  crfile.puts page
273
+ end
274
+ rescue
275
+ encodingissue=true
276
+ retry
277
+
278
+ end
261
279
  crfile.close
262
280
 
263
281
  end
@@ -316,7 +334,7 @@ end
316
334
  if(refarr[i]!="-")
317
335
  #evaluate whether link is internal or external
318
336
  if(refarr[i]['href']!=nil && refarr[i]['href']!="")
319
- if(refarr[i]['href'].include?('http://'))
337
+ if(refarr[i]['href'].include?('://'))
320
338
  url=refarr[i]['href']
321
339
  else
322
340
  url=sub_url+refarr[i]['href']
@@ -434,7 +452,16 @@ else
434
452
  if (link_to_add!="this_is_a_duplicate")
435
453
 
436
454
  crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
455
+ encodingissue=false
456
+ begin
457
+ if(encodingissue==false)
437
458
  crfile.puts page
459
+ end
460
+ rescue
461
+ encodingissue=true
462
+ retry
463
+
464
+ end
438
465
  crfile.close
439
466
  else
440
467
 
@@ -666,7 +693,16 @@ else
666
693
  clutch -=1
667
694
 
668
695
  crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
696
+ encodingissue=false
697
+ begin
698
+ if(encodingissue==false)
669
699
  crfile.puts page
700
+ end
701
+ rescue
702
+ encodingissue=true
703
+ retry
704
+
705
+ end
670
706
  crfile.close
671
707
 
672
708
  end
@@ -870,7 +906,16 @@ else
870
906
  if (link_to_add!="this_is_a_duplicate")
871
907
 
872
908
  crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
909
+ encodingissue=false
910
+ begin
911
+ if(encodingissue==false)
873
912
  crfile.puts page
913
+ end
914
+ rescue
915
+ encodingissue=true
916
+ retry
917
+
918
+ end
874
919
  crfile.close
875
920
  else
876
921
 
@@ -881,4 +926,1688 @@ end #end def FLocalize_CSS
881
926
 
882
927
  #########################################################################################
883
928
 
929
+ #############################################################################################
930
+
931
+ def Localize_IN(url, depth, sub_url)
932
+
933
+ #initialize to extract from user view
934
+ @location_in = Hash.new
935
+ s_depth = depth
936
+ i_page = 0
937
+ prev_ipage = 0
938
+ link_to_add =""
939
+ if (depth<0)
940
+ depth=0
941
+ end
942
+ #open the starting page
943
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
944
+ #collect all of the links from the page
945
+ links= page.css('a')
946
+ title = page.css('title')
947
+ #initialize variables
948
+ refarr=[]
949
+ hrefs = []
950
+ x=0
951
+
952
+ #add href to arrays for each link
953
+ links.each do |link|
954
+ if(link['href']!=nil && link['href']!="" && !link['href'].include?('://'))
955
+ # puts x
956
+ # puts (link['title'].split.join)
957
+ # x+=1
958
+ hrefs.push(link)
959
+
960
+ end
961
+
962
+ end
963
+ total=0
964
+ #transfer links to other array
965
+ while(!hrefs.empty?)
966
+ value= hrefs.pop
967
+ refarr.push(value)
968
+ total+=1
969
+ end
970
+
971
+
972
+
973
+ #setup for recognition of the end of the array
974
+ refarr.push("-")
975
+
976
+ if(depth>0)
977
+
978
+ #create subdirectory for storing current set of scraped pages
979
+
980
+ if (Dir.exist?('./pages'+depth.to_s))
981
+ else Dir.mkdir('./pages'+depth.to_s)
982
+ end
983
+ #in each link
984
+ check = (refarr.length-1)
985
+ for i in 0..check
986
+ if(refarr[i]!="-")
987
+ #evaluate whether link is internal or external
988
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
989
+ if(refarr[i]['href'].include?('://'))
990
+ url=refarr[i]['href']
991
+ else
992
+ url=sub_url+refarr[i]['href']
993
+ #puts "external link"
994
+ end#refarr[i]['href'].include?
995
+ end#refarr[i]['href']!=nil
996
+ fourofour=false
997
+ begin
998
+ if(fourofour==false)
999
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1000
+ end
1001
+ #test for a 404
1002
+ rescue Exception =>ex
1003
+ #puts "got a 404"
1004
+ #replace href (no navigation onclick)
1005
+ refarr[i]['href'] =""
1006
+ fourofour=true
1007
+
1008
+ retry
1009
+ end #begin
1010
+
1011
+ if (fourofour==false)
1012
+ #make relevant links reference local files
1013
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
1014
+
1015
+
1016
+ j_depth = s_depth - depth
1017
+ appendval = "../"
1018
+ clutch = 0
1019
+ for r in 1..j_depth
1020
+
1021
+ clutch +=1
1022
+ end
1023
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
1024
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
1025
+ end
1026
+ if (depth == s_depth)
1027
+
1028
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1029
+ else
1030
+
1031
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1032
+ end
1033
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
1034
+ if (@location_in.has_key?(refarr[i]['href']))
1035
+ loc = @location_in[(refarr[i]['href'])]
1036
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
1037
+ refarr[i]['href'] =sub_loc
1038
+ else
1039
+ initial_link=refarr[i]['href']
1040
+ refarr[i]['href']=linkref
1041
+
1042
+ #HERE!!!!!**!*!*@*!!@@***!
1043
+ if (depth == s_depth)
1044
+ full_link = "../../"+linkref
1045
+ else
1046
+ full_link = linkref
1047
+ end
1048
+ @location_in[initial_link]=full_link
1049
+ #puts "working"
1050
+ end# @location.haskey
1051
+ end #refarr[i]['href']!=""
1052
+
1053
+ #trim it down and remove special characters for display
1054
+ trimval=refarr[i]['href']
1055
+ finval=trimval.gsub!(/[!:\/-]/, '')
1056
+ #puts refarr[i]
1057
+ if(finval==nil && refarr[i]!=nil)
1058
+ finval=refarr[i]
1059
+ end #finval == nil
1060
+
1061
+ n_depth = depth-1
1062
+
1063
+ if(finval!=nil)
1064
+ self. FLocalize_IN(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
1065
+ #create subdirectory for storing current links page
1066
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
1067
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
1068
+ #end
1069
+
1070
+
1071
+
1072
+
1073
+ end #finval!=nil
1074
+ end #fourofour==false
1075
+ end #refarr[i]!="-"
1076
+
1077
+ end#end for each
1078
+
1079
+
1080
+
1081
+
1082
+ else#<< depth not > 0
1083
+ check = (refarr.length-1)
1084
+ for i in 0..check
1085
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
1086
+ refarr[i]['href']=""
1087
+ end
1088
+ end
1089
+ end
1090
+
1091
+ if (depth == s_depth)
1092
+ #store newly generated html/links for current page
1093
+ mainpage =File.new('./page.html',"w")
1094
+ mainpage.puts page
1095
+ mainpage.close
1096
+
1097
+
1098
+ else
1099
+ #store page from the link in the subdirectory
1100
+ puts "page: "
1101
+ p_depth = depth +1
1102
+ j_depth = s_depth - depth
1103
+ appendval = ""
1104
+ clutch = 0
1105
+ for r in 1..j_depth
1106
+ appendval += "../"
1107
+ clutch +=1
1108
+ end
1109
+ clutch -=1
1110
+
1111
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
1112
+ encodingissue=false
1113
+ begin
1114
+ if(encodingissue==false)
1115
+ crfile.puts page
1116
+ end
1117
+ rescue
1118
+ encodingissue=true
1119
+ retry
1120
+ end
1121
+
1122
+ crfile.close
1123
+
1124
+ end
1125
+ end #end def Localize_IN
1126
+
1127
+ #########################################################################################
1128
+ def FLocalize_IN(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
1129
+ #open the starting page
1130
+
1131
+ if (depth<0)
1132
+ depth=0
1133
+ end
1134
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1135
+ #collect all of the links from the page
1136
+ links= page.css('a')
1137
+ title = page.css('title')
1138
+ #initialize variables
1139
+ refarr=[]
1140
+ hrefs = []
1141
+ x=0
1142
+
1143
+ #add href to arrays for each link
1144
+ links.each do |link|
1145
+ if(link['href']!=nil && link['href']!="" && !link['href'].include?('://'))
1146
+ # puts x
1147
+ # puts (link['title'].split.join)
1148
+ # x+=1
1149
+ hrefs.push(link)
1150
+
1151
+ end
1152
+
1153
+ end
1154
+ total=0
1155
+ #transfer links to other array
1156
+ while(!hrefs.empty?)
1157
+ value= hrefs.pop
1158
+ refarr.push(value)
1159
+ total+=1
1160
+ end
1161
+
1162
+
1163
+
1164
+ #setup for recognition of the end of the array
1165
+ refarr.push("-")
1166
+
1167
+ if(depth>0)
1168
+
1169
+ #create subdirectory for storing current set of scraped pages
1170
+
1171
+ if (Dir.exist?('./pages'+depth.to_s))
1172
+ else Dir.mkdir('./pages'+depth.to_s)
1173
+ end
1174
+ #in each link
1175
+ check = (refarr.length-1)
1176
+ for i in 0..check
1177
+ if(refarr[i]!="-")
1178
+ #evaluate whether link is internal or external
1179
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
1180
+ if(refarr[i]['href'].include?('://'))
1181
+ url=refarr[i]['href']
1182
+ else
1183
+ url=sub_url+refarr[i]['href']
1184
+ #puts "external link"
1185
+ end#refarr[i]['href'].include?
1186
+ end#refarr[i]['href']!=nil
1187
+ fourofour=false
1188
+
1189
+ begin
1190
+ if(fourofour==false)
1191
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1192
+ end
1193
+ #test for a 404
1194
+ rescue Exception =>ex
1195
+ #puts "got a 404"
1196
+ #replace href (no navigation onclick)
1197
+ refarr[i]['href'] =""
1198
+ fourofour=true
1199
+
1200
+ retry
1201
+ end #begin
1202
+
1203
+ if (fourofour==false)
1204
+ #make relevant links reference local files
1205
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
1206
+
1207
+
1208
+ j_depth = s_depth - depth
1209
+ appendval = "../"
1210
+ clutch = 0
1211
+ for r in 1..j_depth
1212
+
1213
+ clutch +=1
1214
+ end
1215
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
1216
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
1217
+ end
1218
+
1219
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
1220
+
1221
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
1222
+ if (@location_in.has_key?(refarr[i]['href']))
1223
+ pass_a_link = "this_is_a_duplicate"
1224
+ refarr[i]['href'] = @location_in[(refarr[i]['href'])]
1225
+
1226
+ else
1227
+ initial_link=refarr[i]['href']
1228
+ refarr[i]['href']=linkref
1229
+
1230
+
1231
+
1232
+ full_link = linkref
1233
+
1234
+ @location_in[initial_link]=linkref
1235
+ #puts "working"
1236
+ end# @location.haskey
1237
+ end #refarr[i]['href']!=""
1238
+
1239
+
1240
+ #trim it down and remove special characters for display
1241
+ trimval=refarr[i]['href']
1242
+ finval=trimval.gsub!(/[!:\/-]/, '')
1243
+ #puts refarr[i]
1244
+ if(finval==nil && refarr[i]!=nil)
1245
+ finval=refarr[i]
1246
+ end #finval == nil
1247
+
1248
+ n_depth = depth-1
1249
+
1250
+ if(finval!=nil)
1251
+ self. FLocalize_IN(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
1252
+
1253
+
1254
+
1255
+
1256
+ end #finval!=nil
1257
+ end #fourofour==false
1258
+ end #refarr[i]!="-"
1259
+
1260
+ end#end for each
1261
+
1262
+
1263
+
1264
+
1265
+ else#<< depth not > 0
1266
+ check = (refarr.length-1)
1267
+ for i in 0..check
1268
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
1269
+ refarr[i]['href']=""
1270
+
1271
+ end
1272
+ end
1273
+ end
1274
+
1275
+ if (depth == s_depth)
1276
+ #store newly generated html/links for current page
1277
+ mainpage =File.new('./page.html',"w")
1278
+ mainpage.puts page
1279
+ mainpage.close
1280
+
1281
+
1282
+ else
1283
+ #store page from the link in the subdirectory
1284
+
1285
+ p_depth = depth +1
1286
+ j_depth = s_depth - depth
1287
+ appendval = ""
1288
+ clutch = 0
1289
+ for r in 1..j_depth
1290
+ appendval += "../"
1291
+ clutch +=1
1292
+ end
1293
+ clutch -=1
1294
+
1295
+ if (link_to_add!="this_is_a_duplicate")
1296
+
1297
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
1298
+ encodingissue=false
1299
+ begin
1300
+ if(encodingissue==false)
1301
+ crfile.puts page
1302
+ end
1303
+ rescue
1304
+ encodingissue=true
1305
+ retry
1306
+
1307
+ end
1308
+ crfile.close
1309
+ else
1310
+
1311
+ end
1312
+
1313
+ end
1314
+ end #end def FLocalize_IN
1315
+
1316
+ #########################################################################################
1317
+
1318
+ #############################################################################################
1319
+
1320
+ def Localize_EX(url, depth, sub_url)
1321
+
1322
+ #initialize to extract from user view
1323
+ @location_ex = Hash.new
1324
+ s_depth = depth
1325
+ i_page = 0
1326
+ prev_ipage = 0
1327
+ link_to_add =""
1328
+ if (depth<0)
1329
+ depth=0
1330
+ end
1331
+ #open the starting page
1332
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1333
+ #collect all of the links from the page
1334
+ links= page.css('a')
1335
+ title = page.css('title')
1336
+ #initialize variables
1337
+ refarr=[]
1338
+ hrefs = []
1339
+ x=0
1340
+
1341
+ #add href to arrays for each link
1342
+ links.each do |link|
1343
+ if(link['href']!=nil && link['href']!="" && link['href'].include?('://'))
1344
+ # puts x
1345
+ # puts (link['title'].split.join)
1346
+ # x+=1
1347
+ hrefs.push(link)
1348
+
1349
+ end
1350
+
1351
+ end
1352
+ total=0
1353
+ #transfer links to other array
1354
+ while(!hrefs.empty?)
1355
+ value= hrefs.pop
1356
+ refarr.push(value)
1357
+ total+=1
1358
+ end
1359
+
1360
+
1361
+
1362
+ #setup for recognition of the end of the array
1363
+ refarr.push("-")
1364
+
1365
+ if(depth>0)
1366
+
1367
+ #create subdirectory for storing current set of scraped pages
1368
+
1369
+ if (Dir.exist?('./pages'+depth.to_s))
1370
+ else Dir.mkdir('./pages'+depth.to_s)
1371
+ end
1372
+ #in each link
1373
+ check = (refarr.length-1)
1374
+ for i in 0..check
1375
+ if(refarr[i]!="-")
1376
+ #evaluate whether link is internal or external
1377
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
1378
+ if(refarr[i]['href'].include?('://'))
1379
+ url=refarr[i]['href']
1380
+ else
1381
+ url=sub_url+refarr[i]['href']
1382
+ #puts "external link"
1383
+ end#refarr[i]['href'].include?
1384
+ end#refarr[i]['href']!=nil
1385
+ fourofour=false
1386
+ begin
1387
+ if(fourofour==false)
1388
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1389
+ end
1390
+ #test for a 404
1391
+ rescue Exception =>ex
1392
+ #puts "got a 404"
1393
+ #replace href (no navigation onclick)
1394
+ refarr[i]['href'] =""
1395
+ fourofour=true
1396
+
1397
+ retry
1398
+ end #begin
1399
+
1400
+ if (fourofour==false && refarr[i]['href']!="" && refarr[i]['href']!=nil)
1401
+ #make relevant links reference local files
1402
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
1403
+
1404
+
1405
+ j_depth = s_depth - depth
1406
+ appendval = "../"
1407
+ clutch = 0
1408
+ for r in 1..j_depth
1409
+
1410
+ clutch +=1
1411
+ end
1412
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
1413
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
1414
+ end
1415
+ if (depth == s_depth)
1416
+
1417
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1418
+ else
1419
+
1420
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1421
+ end
1422
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
1423
+ if (@location_ex.has_key?(refarr[i]['href']))
1424
+ loc = @location_ex[(refarr[i]['href'])]
1425
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
1426
+ refarr[i]['href'] =sub_loc
1427
+ else
1428
+ initial_link=refarr[i]['href']
1429
+ refarr[i]['href']=linkref
1430
+
1431
+ #HERE!!!!!**!*!*@*!!@@***!
1432
+ if (depth == s_depth)
1433
+ full_link = "../../"+linkref
1434
+ else
1435
+ full_link = linkref
1436
+ end
1437
+ @location_ex[initial_link]=full_link
1438
+ #puts "working"
1439
+ end# @location.haskey
1440
+ end #refarr[i]['href']!=""
1441
+
1442
+ #trim it down and remove special characters for display
1443
+ trimval=refarr[i]['href']
1444
+ finval=trimval.gsub!(/[!:\/-]/, '')
1445
+ #puts refarr[i]
1446
+ if(finval==nil && refarr[i]!=nil)
1447
+ finval=refarr[i]
1448
+ end #finval == nil
1449
+
1450
+ n_depth = depth-1
1451
+
1452
+ if(finval!=nil)
1453
+ self. FLocalize_EX(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
1454
+ #create subdirectory for storing current links page
1455
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
1456
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
1457
+ #end
1458
+
1459
+
1460
+
1461
+
1462
+ end #finval!=nil
1463
+ end #fourofour==false
1464
+ end #refarr[i]!="-"
1465
+
1466
+ end#end for each
1467
+
1468
+
1469
+
1470
+
1471
+ else#<< depth not > 0
1472
+ check = (refarr.length-1)
1473
+ for i in 0..check
1474
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
1475
+ refarr[i]['href']=""
1476
+ end
1477
+ end
1478
+ end
1479
+
1480
+ if (depth == s_depth)
1481
+ #store newly generated html/links for current page
1482
+ mainpage =File.new('./page.html',"w")
1483
+ mainpage.puts page
1484
+ mainpage.close
1485
+
1486
+
1487
+ else
1488
+ #store page from the link in the subdirectory
1489
+ puts "page: "
1490
+ p_depth = depth +1
1491
+ j_depth = s_depth - depth
1492
+ appendval = ""
1493
+ clutch = 0
1494
+ for r in 1..j_depth
1495
+ appendval += "../"
1496
+ clutch +=1
1497
+ end
1498
+ clutch -=1
1499
+
1500
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
1501
+ encodingissue=false
1502
+ begin
1503
+ if(encodingissue==false)
1504
+ crfile.puts page
1505
+ end
1506
+ rescue
1507
+ encodingissue=true
1508
+ retry
1509
+ end
1510
+
1511
+ crfile.close
1512
+
1513
+ end
1514
+ end #end def Localize_EX
1515
+
1516
+ #########################################################################################
1517
+ def FLocalize_EX(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
1518
+ #open the starting page
1519
+
1520
+ if (depth<0)
1521
+ depth=0
1522
+ end
1523
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1524
+ #collect all of the links from the page
1525
+ links= page.css('a')
1526
+ title = page.css('title')
1527
+ #initialize variables
1528
+ refarr=[]
1529
+ hrefs = []
1530
+ x=0
1531
+
1532
+ #add href to arrays for each link
1533
+ links.each do |link|
1534
+ if(link['href']!=nil && link['href']!="" && link['href'].include?('://'))
1535
+ # puts x
1536
+ # puts (link['title'].split.join)
1537
+ # x+=1
1538
+ hrefs.push(link)
1539
+
1540
+ end
1541
+
1542
+ end
1543
+ total=0
1544
+ #transfer links to other array
1545
+ while(!hrefs.empty?)
1546
+ value= hrefs.pop
1547
+ refarr.push(value)
1548
+ total+=1
1549
+ end
1550
+
1551
+
1552
+
1553
+ #setup for recognition of the end of the array
1554
+ refarr.push("-")
1555
+
1556
+ if(depth>0)
1557
+
1558
+ #create subdirectory for storing current set of scraped pages
1559
+
1560
+ if (Dir.exist?('./pages'+depth.to_s))
1561
+ else Dir.mkdir('./pages'+depth.to_s)
1562
+ end
1563
+ #in each link
1564
+ check = (refarr.length-1)
1565
+ for i in 0..check
1566
+ if(refarr[i]!="-")
1567
+ #evaluate whether link is internal or external
1568
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
1569
+ if(refarr[i]['href'].include?('://'))
1570
+ url=refarr[i]['href']
1571
+ else
1572
+ url=sub_url+refarr[i]['href']
1573
+ #puts "external link"
1574
+ end#refarr[i]['href'].include?
1575
+ end#refarr[i]['href']!=nil
1576
+ fourofour=false
1577
+
1578
+ begin
1579
+ if(fourofour==false)
1580
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1581
+ end
1582
+ #test for a 404
1583
+ rescue Exception =>ex
1584
+ #puts "got a 404"
1585
+ #replace href (no navigation onclick)
1586
+ refarr[i]['href'] =""
1587
+ fourofour=true
1588
+
1589
+ retry
1590
+ end #begin
1591
+
1592
+ if (fourofour==false && refarr[i]['href']!="" && refarr[i]['href']!=nil)
1593
+ #make relevant links reference local files
1594
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
1595
+
1596
+
1597
+ j_depth = s_depth - depth
1598
+ appendval = "../"
1599
+ clutch = 0
1600
+ for r in 1..j_depth
1601
+
1602
+ clutch +=1
1603
+ end
1604
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
1605
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
1606
+ end
1607
+
1608
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
1609
+
1610
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
1611
+ if (@location_ex.has_key?(refarr[i]['href']))
1612
+ pass_a_link = "this_is_a_duplicate"
1613
+ refarr[i]['href'] = @location_ex[(refarr[i]['href'])]
1614
+
1615
+ else
1616
+ initial_link=refarr[i]['href']
1617
+ refarr[i]['href']=linkref
1618
+
1619
+
1620
+
1621
+ full_link = linkref
1622
+
1623
+ @location_ex[initial_link]=linkref
1624
+ #puts "working"
1625
+ end# @location.haskey
1626
+ end #refarr[i]['href']!=""
1627
+
1628
+
1629
+ #trim it down and remove special characters for display
1630
+ trimval=refarr[i]['href']
1631
+ finval=trimval.gsub!(/[!:\/-]/, '')
1632
+ #puts refarr[i]
1633
+ if(finval==nil && refarr[i]!=nil)
1634
+ finval=refarr[i]
1635
+ end #finval == nil
1636
+
1637
+ n_depth = depth-1
1638
+
1639
+ if(finval!=nil)
1640
+ self. FLocalize_EX(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
1641
+
1642
+
1643
+
1644
+
1645
+ end #finval!=nil
1646
+ end #fourofour==false
1647
+ end #refarr[i]!="-"
1648
+
1649
+ end#end for each
1650
+
1651
+
1652
+
1653
+
1654
+ else#<< depth not > 0
1655
+ check = (refarr.length-1)
1656
+ for i in 0..check
1657
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
1658
+ refarr[i]['href']=""
1659
+
1660
+ end
1661
+ end
1662
+ end
1663
+
1664
+ if (depth == s_depth)
1665
+ #store newly generated html/links for current page
1666
+ mainpage =File.new('./page.html',"w")
1667
+ mainpage.puts page
1668
+ mainpage.close
1669
+
1670
+
1671
+ else
1672
+ #store page from the link in the subdirectory
1673
+
1674
+ p_depth = depth +1
1675
+ j_depth = s_depth - depth
1676
+ appendval = ""
1677
+ clutch = 0
1678
+ for r in 1..j_depth
1679
+ appendval += "../"
1680
+ clutch +=1
1681
+ end
1682
+ clutch -=1
1683
+
1684
+ if (link_to_add!="this_is_a_duplicate")
1685
+
1686
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
1687
+ encodingissue=false
1688
+ begin
1689
+ if(encodingissue==false)
1690
+ crfile.puts page
1691
+ end
1692
+ rescue
1693
+ encodingissue=true
1694
+ retry
1695
+
1696
+ end
1697
+ crfile.close
1698
+ else
1699
+
1700
+ end
1701
+
1702
+ end
1703
+ end #end def FLocalize_EX
1704
+
1705
+ #########################################################################################
1706
+
1707
+ #############################################################################################
1708
+
1709
+ def Localize_IN_CSS(url, depth, sub_url,selector)
1710
+
1711
+ #initialize to extract from user view
1712
+ @location_IN_CSS = Hash.new
1713
+ s_depth = depth
1714
+ i_page = 0
1715
+ prev_ipage = 0
1716
+ link_to_add =""
1717
+ if (depth<0)
1718
+ depth=0
1719
+ end
1720
+ #open the starting page
1721
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1722
+ #collect all of the links from the page
1723
+ links= page.css('a')
1724
+ title = page.css('title')
1725
+ #initialize variables
1726
+ refarr=[]
1727
+ hrefs = []
1728
+ linkseti= []
1729
+ linkset= []
1730
+ x=0
1731
+
1732
+ linkseti = page.css(selector+' a')
1733
+ #add each link with valid href to array
1734
+ links.each do |link|
1735
+ if(link['href']!=nil && link['href']!="" && !link['href'].include?('://'))
1736
+ # puts x
1737
+ # puts (link['title'].split.join)
1738
+ # x+=1
1739
+ hrefs.push(link)
1740
+
1741
+ end
1742
+
1743
+ end
1744
+ linkseti.each do |ilink|
1745
+ if(ilink['href']!=nil && ilink['href']!="")
1746
+ # puts x
1747
+ # puts (link['title'].split.join)
1748
+ # x+=1
1749
+ linkset.push(ilink)
1750
+
1751
+ end
1752
+
1753
+ end
1754
+ hrefslength = (hrefs.length-1)
1755
+ for i in 0..hrefslength
1756
+ if(linkset.include?(hrefs[i]))
1757
+ else
1758
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
1759
+ hrefs[i]['href']=""
1760
+ end
1761
+
1762
+ end
1763
+ end
1764
+
1765
+
1766
+ #transfer links to other array
1767
+ while(!hrefs.empty?)
1768
+ value= hrefs.pop
1769
+ if (value['href']!=nil && value['href']!="")
1770
+ refarr.push(value)
1771
+ end
1772
+
1773
+ end
1774
+
1775
+
1776
+
1777
+
1778
+
1779
+
1780
+
1781
+ #setup for recognition of the end of the array
1782
+ refarr.push("-")
1783
+
1784
+ if(depth>0)
1785
+
1786
+ #create subdirectory for storing current set of scraped pages
1787
+
1788
+ if (Dir.exist?('./pages'+depth.to_s))
1789
+ else Dir.mkdir('./pages'+depth.to_s)
1790
+ end
1791
+ #in each link
1792
+ check = (refarr.length-1)
1793
+ for i in 0..check
1794
+ if(refarr[i]!="-")
1795
+ if(linkset.include?(refarr[i]))
1796
+ else
1797
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
1798
+ refarr[i]['href']=""
1799
+ end
1800
+ end
1801
+ #evaluate whether link is internal or external
1802
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
1803
+ if(refarr[i]['href'].include?('://'))
1804
+ url=refarr[i]['href']
1805
+ else
1806
+ url=sub_url+refarr[i]['href']
1807
+ #puts "external link"
1808
+ end#refarr[i]['href'].include?
1809
+ end#refarr[i]['href']!=nil
1810
+ fourofour=false
1811
+
1812
+ begin
1813
+ if(fourofour==false && refarr[i]['href']!=nil)
1814
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
1815
+ end
1816
+ #test for a 404
1817
+ rescue Exception =>ex
1818
+ #puts "got a 404"
1819
+ #replace href (no navigation onclick)
1820
+ refarr[i]['href'] =""
1821
+ fourofour=true
1822
+
1823
+ retry
1824
+ end #begin
1825
+
1826
+ if (fourofour==false)
1827
+ #make relevant links reference local files
1828
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
1829
+
1830
+
1831
+ j_depth = s_depth - depth
1832
+ appendval = "../"
1833
+ clutch = 0
1834
+ for r in 1..j_depth
1835
+
1836
+ clutch +=1
1837
+ end
1838
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
1839
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
1840
+ end
1841
+ if (depth == s_depth)
1842
+
1843
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1844
+ else
1845
+
1846
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
1847
+ end
1848
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
1849
+ if (@location_IN_CSS.has_key?(refarr[i]['href']))
1850
+ loc = @location_IN_CSS[(refarr[i]['href'])]
1851
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
1852
+ refarr[i]['href'] =sub_loc
1853
+ else
1854
+ initial_link=refarr[i]['href']
1855
+ refarr[i]['href']=linkref
1856
+
1857
+ #HERE!!!!!**!*!*@*!!@@***!
1858
+ if (depth == s_depth)
1859
+ full_link = "../../"+linkref
1860
+ else
1861
+ full_link = linkref
1862
+ end
1863
+ @location_IN_CSS[initial_link]=full_link
1864
+ #puts "working"
1865
+ end# @location_CSS.haskey
1866
+ end #refarr[i]['href']!=""
1867
+
1868
+ #trim it down and remove special characters for display
1869
+ trimval=refarr[i]['href']
1870
+ finval=trimval.gsub!(/[!:\/-]/, '')
1871
+ #puts refarr[i]
1872
+ if(finval==nil && refarr[i]!=nil)
1873
+ finval=refarr[i]
1874
+ end #finval == nil
1875
+
1876
+ n_depth = depth-1
1877
+
1878
+ if(finval!=nil)
1879
+ self. FLocalize_IN_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector)
1880
+ #create subdirectory for storing current links page
1881
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
1882
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
1883
+ #end
1884
+
1885
+
1886
+
1887
+
1888
+ end #finval!=nil
1889
+ end #fourofour==false
1890
+ end #refarr[i]!="-"
1891
+
1892
+ end#end for each
1893
+
1894
+
1895
+
1896
+
1897
+ else#<< depth not > 0
1898
+ check = (refarr.length-1)
1899
+ for i in 0..check
1900
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
1901
+ refarr[i]['href']=""
1902
+ end
1903
+ end
1904
+ end
1905
+
1906
+ if (depth == s_depth)
1907
+ #store newly generated html/links for current page
1908
+ mainpage =File.new('./page.html',"w")
1909
+ mainpage.puts page
1910
+ mainpage.close
1911
+
1912
+
1913
+ else
1914
+ #store page from the link in the subdirectory
1915
+ puts "page: "
1916
+ p_depth = depth +1
1917
+ j_depth = s_depth - depth
1918
+ appendval = ""
1919
+ clutch = 0
1920
+ for r in 1..j_depth
1921
+ appendval += "../"
1922
+ clutch +=1
1923
+ end
1924
+ clutch -=1
1925
+
1926
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
1927
+ encodingissue=false
1928
+ begin
1929
+ if(encodingissue==false)
1930
+ crfile.puts page
1931
+ end
1932
+ rescue
1933
+ encodingissue=true
1934
+ retry
1935
+
1936
+ end
1937
+ crfile.close
1938
+
1939
+ end
1940
+ end #end def Localize_IN_CSS
1941
+
1942
+ #########################################################################################
1943
+ def FLocalize_IN_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
1944
+ #Recursive worker called from Localize_IN_CSS and from itself: fetches url, keeps the anchors under selector whose href has no '://', points their hrefs at local copies (or blanks them), recurses with depth-1, then writes the page to disk. prev_ipage is accepted but never read in this method.
1945
+ 
1946
+ if (depth<0) #clamp a negative depth to 0
1947
+ depth=0
1948
+ end
1949
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #open the starting page; SSL certificate verification is switched off
1950
+ #collect all of the links from the page
1951
+ links= page.css('a')
1952
+ title = page.css('title') #title is not read again in this method
1953
+ #initialize variables
1954
+ refarr=[]
1955
+ hrefs = []
1956
+ linkseti= []
1957
+ linkset= []
1958
+ x=0 #x is only mentioned by the commented-out debug lines below
1959
+ 
1960
+ linkseti = page.css(selector+' a') #anchors located under the caller-supplied css selector
1961
+ #add each link with a non-empty href that does not contain '://' to hrefs
1962
+ links.each do |link|
1963
+ if(link['href']!=nil && link['href']!="" && !link['href'].include?('://'))
1964
+ # puts x
1965
+ # puts (link['title'].split.join)
1966
+ # x+=1
1967
+ hrefs.push(link)
1968
+ 
1969
+ end
1970
+ 
1971
+ end
1972
+ linkseti.each do |ilink| #keep the selector-scoped anchors that have a non-empty href
1973
+ if(ilink['href']!=nil && ilink['href']!="")
1974
+ # puts x
1975
+ # puts (link['title'].split.join)
1976
+ # x+=1
1977
+ linkset.push(ilink)
1978
+ 
1979
+ end
1980
+ 
1981
+ end
1982
+ hrefslength = (hrefs.length-1)
1983
+ for i in 0..hrefslength #blank the href of every collected link that is not among the selector-scoped anchors
1984
+ if(linkset.include?(hrefs[i]))
1985
+ else
1986
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
1987
+ hrefs[i]['href']=""
1988
+ end
1989
+ 
1990
+ end
1991
+ end
1992
+ 
1993
+ 
1994
+ 
1995
+ #transfer links to other array: hrefs is drained from its end, and links whose href was blanked above are dropped
1996
+ while(!hrefs.empty?)
1997
+ value= hrefs.pop
1998
+ if (value['href']!=nil && value['href']!="")
1999
+ refarr.push(value)
2000
+ end
2001
+ 
2002
+ end
2003
+ 
2004
+ #setup for recognition of the end of the array: a "-" string is appended after the link nodes as a sentinel
2005
+ refarr.push("-")
2006
+ 
2007
+ if(depth>0)
2008
+ 
2009
+ #create subdirectory for storing current set of scraped pages
2010
+ 
2011
+ if (Dir.exist?('./pages'+depth.to_s))
2012
+ else Dir.mkdir('./pages'+depth.to_s)
2013
+ end
2014
+ #in each link
2015
+ check = (refarr.length-1)
2016
+ for i in 0..check
2017
+ if(refarr[i]!="-")
2018
+ 
2019
+ 
2020
+ #build the url to fetch: an href containing '://' is used as-is, anything else is appended to sub_url
2021
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
2022
+ if(refarr[i]['href'].include?('://'))
2023
+ url=refarr[i]['href']
2024
+ else
2025
+ url=sub_url+refarr[i]['href']
2026
+ #puts "external link"
2027
+ end#refarr[i]['href'].include?
2028
+ end#refarr[i]['href']!=nil
2029
+ fourofour=false
2030
+ #refarr[i]['href'] is nil :S this a result of reference to other array? how to do a true dup without reference?
2031
+ begin
2032
+ if(fourofour==false)
2033
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #probe fetch; pagina itself is not read afterwards
2034
+ end
2035
+ #test for a 404 (any Exception raised by the fetch above is handled the same way)
2036
+ rescue Exception =>ex
2037
+ #puts "got a 404"
2038
+ #replace href (no navigation onclick)
2039
+ refarr[i]['href'] =""
2040
+ fourofour=true
2041
+ 
2042
+ retry #re-runs the begin block; fourofour is now true, so the fetch is skipped
2043
+ end #begin
2044
+ 
2045
+ if (fourofour==false)
2046
+ #make relevant links reference local files
2047
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
2048
+ 
2049
+ 
2050
+ j_depth = s_depth - depth
2051
+ appendval = "../"
2052
+ clutch = 0
2053
+ for r in 1..j_depth #counts clutch up to j_depth (s_depth - depth)
2054
+ 
2055
+ clutch +=1
2056
+ end
2057
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
2058
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
2059
+ end
2060
+ 
2061
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html")) #relative path of the local copy: ../../pages<depth>/<clutch>set/<i_page>x<i>page.html
2062
+ 
2063
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
2064
+ if (@location_IN_CSS.has_key?(refarr[i]['href']))
2065
+ pass_a_link = "this_is_a_duplicate" #sentinel file name: the recursive call checks for it and skips writing the page again
2066
+ refarr[i]['href'] = @location_IN_CSS[(refarr[i]['href'])]
2067
+ 
2068
+ else
2069
+ initial_link=refarr[i]['href']
2070
+ refarr[i]['href']=linkref
2071
+ 
2072
+ 
2073
+ 
2074
+ full_link = linkref #full_link is assigned but not read afterwards
2075
+ 
2076
+ @location_IN_CSS[initial_link]=linkref #remember where this href was localized so later occurrences reuse the same file
2077
+ #puts "working"
2078
+ end# @location_CSS.haskey
2079
+ end #refarr[i]['href']!=""
2080
+ 
2081
+ 
2082
+ #trim it down and remove special characters for display
2083
+ trimval=refarr[i]['href']
2084
+ finval=trimval.gsub!(/[!:\/-]/, '') #gsub! returns nil when nothing was replaced
2085
+ #puts refarr[i]
2086
+ if(finval==nil && refarr[i]!=nil)
2087
+ finval=refarr[i]
2088
+ end #finval == nil
2089
+ 
2090
+ n_depth = depth-1
2091
+ 
2092
+ if(finval!=nil)
2093
+ self. FLocalize_IN_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector) #recurse one level down; i is passed as i_page and i_page as prev_ipage
2094
+ 
2095
+ 
2096
+ 
2097
+ 
2098
+ end #finval!=nil
2099
+ end #fourofour==false
2100
+ end #refarr[i]!="-"
2101
+ 
2102
+ end#end for each
2103
+ 
2104
+ 
2105
+ 
2106
+ 
2107
+ else#<< depth not > 0: no linked page is fetched, every remaining href is blanked
2108
+ check = (refarr.length-1)
2109
+ for i in 0..check
2110
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="") #the "-" sentinel falls through here: String#[] with 'href' yields nil
2111
+ refarr[i]['href']=""
2112
+ 
2113
+ end
2114
+ end
2115
+ end
2116
+ 
2117
+ if (depth == s_depth)
2118
+ #store newly generated html/links for current page
2119
+ mainpage =File.new('./page.html',"w")
2120
+ mainpage.puts page
2121
+ mainpage.close
2122
+ 
2123
+ 
2124
+ else
2125
+ #store page from the link in the subdirectory
2126
+ 
2127
+ p_depth = depth +1
2128
+ j_depth = s_depth - depth
2129
+ appendval = "" #appendval is built in the loop below but not used in this branch
2130
+ clutch = 0
2131
+ for r in 1..j_depth
2132
+ appendval += "../"
2133
+ clutch +=1
2134
+ end
2135
+ clutch -=1
2136
+ 
2137
+ if (link_to_add!="this_is_a_duplicate") #duplicates were already written by an earlier call
2138
+ 
2139
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
2140
+ encodingissue=false
2141
+ begin
2142
+ if(encodingissue==false)
2143
+ crfile.puts page
2144
+ end
2145
+ rescue #a failed write sets the flag; the retry below then skips the write
2146
+ encodingissue=true
2147
+ retry
2148
+ 
2149
+ end
2150
+ crfile.close
2151
+ else
2152
+ 
2153
+ end
2154
+ 
2155
+ end
2156
+ end #end def FLocalize_IN_CSS
2157
+
2158
+ #########################################################################################
2159
+
2160
+ #############################################################################################
2161
+
2162
+ def Localize_EX_CSS(url, depth, sub_url,selector)
2163
+ 
2164
+ #Entry point: fetches url, keeps the anchors under selector whose href contains '://', points their hrefs at local copies (or blanks them), hands each linked page to FLocalize_EX_CSS, then writes the page to disk. The lines below reset @location_EX_CSS and initialize the bookkeeping values passed to that helper.
2165
+ @location_EX_CSS = Hash.new
2166
+ s_depth = depth
2167
+ i_page = 0
2168
+ prev_ipage = 0
2169
+ link_to_add =""
2170
+ if (depth<0) #NOTE(review): s_depth was captured before this clamp, so for a negative depth the depth == s_depth test further down is false - confirm intended
2171
+ depth=0
2172
+ end
2173
+ #open the starting page (SSL certificate verification is switched off)
2174
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
2175
+ #collect all of the links from the page
2176
+ links= page.css('a')
2177
+ title = page.css('title') #title is not read again in this method
2178
+ #initialize variables
2179
+ refarr=[]
2180
+ hrefs = []
2181
+ linkseti= []
2182
+ linkset= []
2183
+ x=0 #x is only mentioned by the commented-out debug lines below
2184
+ 
2185
+ linkseti = page.css(selector+' a') #anchors located under the caller-supplied css selector
2186
+ #add each link with a non-empty href that contains '://' to hrefs
2187
+ links.each do |link|
2188
+ if(link['href']!=nil && link['href']!="" && link['href'].include?('://'))
2189
+ # puts x
2190
+ # puts (link['title'].split.join)
2191
+ # x+=1
2192
+ hrefs.push(link)
2193
+ 
2194
+ end
2195
+ 
2196
+ end
2197
+ linkseti.each do |ilink| #keep the selector-scoped anchors that have a non-empty href
2198
+ if(ilink['href']!=nil && ilink['href']!="")
2199
+ # puts x
2200
+ # puts (link['title'].split.join)
2201
+ # x+=1
2202
+ linkset.push(ilink)
2203
+ 
2204
+ end
2205
+ 
2206
+ end
2207
+ hrefslength = (hrefs.length-1)
2208
+ for i in 0..hrefslength #blank the href of every collected link that is not among the selector-scoped anchors
2209
+ if(linkset.include?(hrefs[i]))
2210
+ else
2211
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
2212
+ hrefs[i]['href']=""
2213
+ end
2214
+ 
2215
+ end
2216
+ end
2217
+ 
2218
+ 
2219
+ #transfer links to other array: hrefs is drained from its end, and links whose href was blanked above are dropped
2220
+ while(!hrefs.empty?)
2221
+ value= hrefs.pop
2222
+ if (value['href']!=nil && value['href']!="")
2223
+ refarr.push(value)
2224
+ end
2225
+ 
2226
+ end
2227
+ 
2228
+ 
2229
+ 
2230
+ 
2231
+ 
2232
+ 
2233
+ 
2234
+ #setup for recognition of the end of the array: a "-" string is appended after the link nodes as a sentinel
2235
+ refarr.push("-")
2236
+ 
2237
+ if(depth>0)
2238
+ 
2239
+ #create subdirectory for storing current set of scraped pages
2240
+ 
2241
+ if (Dir.exist?('./pages'+depth.to_s))
2242
+ else Dir.mkdir('./pages'+depth.to_s)
2243
+ end
2244
+ #in each link
2245
+ check = (refarr.length-1)
2246
+ for i in 0..check
2247
+ if(refarr[i]!="-")
2248
+ if(linkset.include?(refarr[i])) #same selector-membership test as the pass over hrefs above, applied again to refarr
2249
+ else
2250
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
2251
+ refarr[i]['href']=""
2252
+ end
2253
+ end
2254
+ #build the url to fetch: an href containing '://' is used as-is, anything else is appended to sub_url
2255
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
2256
+ if(refarr[i]['href'].include?('://'))
2257
+ url=refarr[i]['href']
2258
+ else
2259
+ url=sub_url+refarr[i]['href']
2260
+ #puts "external link"
2261
+ end#refarr[i]['href'].include?
2262
+ end#refarr[i]['href']!=nil
2263
+ fourofour=false
2264
+ 
2265
+ begin
2266
+ if(fourofour==false && refarr[i]['href']!=nil)
2267
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #probe fetch; pagina itself is not read afterwards
2268
+ end
2269
+ #test for a 404 (any Exception raised by the fetch above is handled the same way)
2270
+ rescue Exception =>ex
2271
+ #puts "got a 404"
2272
+ #replace href (no navigation onclick)
2273
+ refarr[i]['href'] =""
2274
+ fourofour=true
2275
+ 
2276
+ retry #re-runs the begin block; fourofour is now true, so the fetch is skipped
2277
+ end #begin
2278
+ 
2279
+ if (fourofour==false)
2280
+ #make relevant links reference local files
2281
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
2282
+ 
2283
+ 
2284
+ j_depth = s_depth - depth
2285
+ appendval = "../"
2286
+ clutch = 0
2287
+ for r in 1..j_depth #counts clutch up to j_depth (s_depth - depth)
2288
+ 
2289
+ clutch +=1
2290
+ end
2291
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
2292
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
2293
+ end
2294
+ if (depth == s_depth) #at the starting depth the local path is written relative to the current directory
2295
+ 
2296
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
2297
+ else
2298
+ 
2299
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
2300
+ end
2301
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html" #file name the helper will write the linked page to
2302
+ if (@location_EX_CSS.has_key?(refarr[i]['href']))
2303
+ loc = @location_EX_CSS[(refarr[i]['href'])]
2304
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
2305
+ refarr[i]['href'] =sub_loc #NOTE(review): sub_loc is what match returned (a MatchData, or nil when the pattern does not match), not a String - confirm this is intended
2306
+ else
2307
+ initial_link=refarr[i]['href']
2308
+ refarr[i]['href']=linkref
2309
+ 
2310
+ #HERE: the location remembered for a link found at the starting depth gets a "../../" prefix
2311
+ if (depth == s_depth)
2312
+ full_link = "../../"+linkref
2313
+ else
2314
+ full_link = linkref
2315
+ end
2316
+ @location_EX_CSS[initial_link]=full_link #remember where this href was localized so later occurrences reuse the same file
2317
+ #puts "working"
2318
+ end# @location_CSS.haskey
2319
+ end #refarr[i]['href']!=""
2320
+ 
2321
+ #trim it down and remove special characters for display
2322
+ trimval=refarr[i]['href']
2323
+ finval=trimval.gsub!(/[!:\/-]/, '') #gsub! returns nil when nothing was replaced
2324
+ #puts refarr[i]
2325
+ if(finval==nil && refarr[i]!=nil)
2326
+ finval=refarr[i]
2327
+ end #finval == nil
2328
+ 
2329
+ n_depth = depth-1
2330
+ 
2331
+ if(finval!=nil)
2332
+ self. FLocalize_EX_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector) #hand the linked page to the recursive helper with one less level of depth
2333
+ #create subdirectory for storing current links page
2334
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
2335
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
2336
+ #end
2337
+ 
2338
+ 
2339
+ 
2340
+ 
2341
+ end #finval!=nil
2342
+ end #fourofour==false
2343
+ end #refarr[i]!="-"
2344
+ 
2345
+ end#end for each
2346
+ 
2347
+ 
2348
+ 
2349
+ 
2350
+ else#<< depth not > 0: no linked page is fetched, every remaining href is blanked
2351
+ check = (refarr.length-1)
2352
+ for i in 0..check
2353
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="") #the "-" sentinel falls through here: String#[] with 'href' yields nil
2354
+ refarr[i]['href']=""
2355
+ end
2356
+ end
2357
+ end
2358
+ 
2359
+ if (depth == s_depth)
2360
+ #store newly generated html/links for current page
2361
+ mainpage =File.new('./page.html',"w")
2362
+ mainpage.puts page
2363
+ mainpage.close
2364
+ 
2365
+ 
2366
+ else
2367
+ #store page from the link in the subdirectory
2368
+ puts "page: "
2369
+ p_depth = depth +1
2370
+ j_depth = s_depth - depth
2371
+ appendval = "" #appendval is built in the loop below but not used in this branch
2372
+ clutch = 0
2373
+ for r in 1..j_depth
2374
+ appendval += "../"
2375
+ clutch +=1
2376
+ end
2377
+ clutch -=1
2378
+ 
2379
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
2380
+ encodingissue=false
2381
+ begin
2382
+ if(encodingissue==false)
2383
+ crfile.puts page
2384
+ end
2385
+ rescue #a failed write sets the flag; the retry below then skips the write
2386
+ encodingissue=true
2387
+ retry
2388
+ 
2389
+ end
2390
+ crfile.close
2391
+ 
2392
+ end
2393
+ end #end def Localize_EX_CSS
2394
+
2395
+ #########################################################################################
2396
+ def FLocalize_EX_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
2397
+ #Recursive worker called from Localize_EX_CSS and from itself: fetches url, keeps the anchors under selector whose href contains '://', points their hrefs at local copies (or blanks them), recurses with depth-1, then writes the page to disk. prev_ipage is accepted but never read in this method.
2398
+ 
2399
+ if (depth<0) #clamp a negative depth to 0
2400
+ depth=0
2401
+ end
2402
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #open the starting page; SSL certificate verification is switched off
2403
+ #collect all of the links from the page
2404
+ links= page.css('a')
2405
+ title = page.css('title') #title is not read again in this method
2406
+ #initialize variables
2407
+ refarr=[]
2408
+ hrefs = []
2409
+ linkseti= []
2410
+ linkset= []
2411
+ x=0 #x is only mentioned by the commented-out debug lines below
2412
+ 
2413
+ linkseti = page.css(selector+' a') #anchors located under the caller-supplied css selector
2414
+ #add each link with a non-empty href that contains '://' to hrefs
2415
+ links.each do |link|
2416
+ if(link['href']!=nil && link['href']!="" && link['href'].include?('://'))
2417
+ # puts x
2418
+ # puts (link['title'].split.join)
2419
+ # x+=1
2420
+ hrefs.push(link)
2421
+ 
2422
+ end
2423
+ 
2424
+ end
2425
+ linkseti.each do |ilink| #keep the selector-scoped anchors that have a non-empty href
2426
+ if(ilink['href']!=nil && ilink['href']!="")
2427
+ # puts x
2428
+ # puts (link['title'].split.join)
2429
+ # x+=1
2430
+ linkset.push(ilink)
2431
+ 
2432
+ end
2433
+ 
2434
+ end
2435
+ hrefslength = (hrefs.length-1)
2436
+ for i in 0..hrefslength #blank the href of every collected link that is not among the selector-scoped anchors
2437
+ if(linkset.include?(hrefs[i]))
2438
+ else
2439
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
2440
+ hrefs[i]['href']=""
2441
+ end
2442
+ 
2443
+ end
2444
+ end
2445
+ 
2446
+ 
2447
+ 
2448
+ #transfer links to other array: hrefs is drained from its end, and links whose href was blanked above are dropped
2449
+ while(!hrefs.empty?)
2450
+ value= hrefs.pop
2451
+ if (value['href']!=nil && value['href']!="")
2452
+ refarr.push(value)
2453
+ end
2454
+ 
2455
+ end
2456
+ 
2457
+ #setup for recognition of the end of the array: a "-" string is appended after the link nodes as a sentinel
2458
+ refarr.push("-")
2459
+ 
2460
+ if(depth>0)
2461
+ 
2462
+ #create subdirectory for storing current set of scraped pages
2463
+ 
2464
+ if (Dir.exist?('./pages'+depth.to_s))
2465
+ else Dir.mkdir('./pages'+depth.to_s)
2466
+ end
2467
+ #in each link
2468
+ check = (refarr.length-1)
2469
+ for i in 0..check
2470
+ if(refarr[i]!="-")
2471
+ 
2472
+ 
2473
+ #build the url to fetch: an href containing '://' is used as-is, anything else is appended to sub_url
2474
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
2475
+ if(refarr[i]['href'].include?('://'))
2476
+ url=refarr[i]['href']
2477
+ else
2478
+ url=sub_url+refarr[i]['href']
2479
+ #puts "external link"
2480
+ end#refarr[i]['href'].include?
2481
+ end#refarr[i]['href']!=nil
2482
+ fourofour=false
2483
+ #refarr[i]['href'] is nil :S this a result of reference to other array? how to do a true dup without reference?
2484
+ begin
2485
+ if(fourofour==false)
2486
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #probe fetch; pagina itself is not read afterwards
2487
+ end
2488
+ #test for a 404 (any Exception raised by the fetch above is handled the same way)
2489
+ rescue Exception =>ex
2490
+ #puts "got a 404"
2491
+ #replace href (no navigation onclick)
2492
+ refarr[i]['href'] =""
2493
+ fourofour=true
2494
+ 
2495
+ retry #re-runs the begin block; fourofour is now true, so the fetch is skipped
2496
+ end #begin
2497
+ 
2498
+ if (fourofour==false)
2499
+ #make relevant links reference local files
2500
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
2501
+ 
2502
+ 
2503
+ j_depth = s_depth - depth
2504
+ appendval = "../"
2505
+ clutch = 0
2506
+ for r in 1..j_depth #counts clutch up to j_depth (s_depth - depth)
2507
+ 
2508
+ clutch +=1
2509
+ end
2510
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
2511
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
2512
+ end
2513
+ 
2514
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html")) #relative path of the local copy: ../../pages<depth>/<clutch>set/<i_page>x<i>page.html
2515
+ 
2516
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
2517
+ if (@location_EX_CSS.has_key?(refarr[i]['href']))
2518
+ pass_a_link = "this_is_a_duplicate" #sentinel file name: the recursive call checks for it and skips writing the page again
2519
+ refarr[i]['href'] = @location_EX_CSS[(refarr[i]['href'])]
2520
+ 
2521
+ else
2522
+ initial_link=refarr[i]['href']
2523
+ refarr[i]['href']=linkref
2524
+ 
2525
+ 
2526
+ 
2527
+ full_link = linkref #full_link is assigned but not read afterwards
2528
+ 
2529
+ @location_EX_CSS[initial_link]=linkref #remember where this href was localized so later occurrences reuse the same file
2530
+ #puts "working"
2531
+ end# @location_CSS.haskey
2532
+ end #refarr[i]['href']!=""
2533
+ 
2534
+ 
2535
+ #trim it down and remove special characters for display
2536
+ trimval=refarr[i]['href']
2537
+ finval=trimval.gsub!(/[!:\/-]/, '') #gsub! returns nil when nothing was replaced
2538
+ #puts refarr[i]
2539
+ if(finval==nil && refarr[i]!=nil)
2540
+ finval=refarr[i]
2541
+ end #finval == nil
2542
+ 
2543
+ n_depth = depth-1
2544
+ 
2545
+ if(finval!=nil)
2546
+ self. FLocalize_EX_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector) #recurse one level down; i is passed as i_page and i_page as prev_ipage
2547
+ 
2548
+ 
2549
+ 
2550
+ 
2551
+ end #finval!=nil
2552
+ end #fourofour==false
2553
+ end #refarr[i]!="-"
2554
+ 
2555
+ end#end for each
2556
+ 
2557
+ 
2558
+ 
2559
+ 
2560
+ else#<< depth not > 0: no linked page is fetched, every remaining href is blanked
2561
+ check = (refarr.length-1)
2562
+ for i in 0..check
2563
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="") #the "-" sentinel falls through here: String#[] with 'href' yields nil
2564
+ refarr[i]['href']=""
2565
+ 
2566
+ end
2567
+ end
2568
+ end
2569
+ 
2570
+ if (depth == s_depth)
2571
+ #store newly generated html/links for current page
2572
+ mainpage =File.new('./page.html',"w")
2573
+ mainpage.puts page
2574
+ mainpage.close
2575
+ 
2576
+ 
2577
+ else
2578
+ #store page from the link in the subdirectory
2579
+ 
2580
+ p_depth = depth +1
2581
+ j_depth = s_depth - depth
2582
+ appendval = "" #appendval is built in the loop below but not used in this branch
2583
+ clutch = 0
2584
+ for r in 1..j_depth
2585
+ appendval += "../"
2586
+ clutch +=1
2587
+ end
2588
+ clutch -=1
2589
+ 
2590
+ if (link_to_add!="this_is_a_duplicate") #duplicates were already written by an earlier call
2591
+ 
2592
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
2593
+ encodingissue=false
2594
+ begin
2595
+ if(encodingissue==false)
2596
+ crfile.puts page
2597
+ end
2598
+ rescue #a failed write sets the flag; the retry below then skips the write
2599
+ encodingissue=true
2600
+ retry
2601
+ 
2602
+ end
2603
+ crfile.close
2604
+ else
2605
+ 
2606
+ end
2607
+ 
2608
+ end
2609
+ end #end def FLocalize_EX_CSS
2610
+
2611
+ #########################################################################################
2612
+
884
2613
  end#module