omni_scrape 0.1.9.9 → 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +24 -0
- data/lib/omni_scrape.rb +1733 -4
- data/lib/omni_scrape/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a174480d89141b2c9bf07b87599b0672d2c393ef
|
4
|
+
data.tar.gz: d6582a817b2ce58de26030271f9a78df97c0cbc4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 09e4b0378c182c7c2cf31772e2f3668f635ac8c6480b8a23f1d9ceb3aa4eb267548711a339f0bfdf2d905be78df5da3baa2fa897f1afbb4feedd9120499a71c1
|
7
|
+
data.tar.gz: 945ff7d6d194e940b96972dbe7c6296897cd15f794ca5dcbea5f7d3d453c9f5e8ac84032ca6fbfb60a04a1f391f2a8b1a2767edefd87f4e3f64b4310aacb949b
|
data/README.md
CHANGED
@@ -59,6 +59,30 @@ The fourth is a css selector for what parts of all pages you want to take the li
|
|
59
59
|
|
60
60
|
description: Localize_CSS offers the same service that Localize provides while at the same time giving you the option to limit the result set using a css selector.
|
61
61
|
|
62
|
+
Method : Localize_IN
|
63
|
+
|
64
|
+
example : OmniScrape.Localize_IN("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")
|
65
|
+
|
66
|
+
This will perform the same actions as Localize, but only for internal links (links whose href does not contain "://").
|
67
|
+
|
68
|
+
Method : Localize_EX
|
69
|
+
|
70
|
+
example : OmniScrape.Localize_EX("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org")
|
71
|
+
|
72
|
+
This will perform the same actions as Localize, but only for external links (links whose href contains "://").
|
73
|
+
|
74
|
+
Method : Localize_IN_CSS
|
75
|
+
|
76
|
+
example : OmniScrape.Localize_IN_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable")
|
77
|
+
|
78
|
+
This will perform the same actions as Localize_CSS, but only for internal links (links whose href does not contain "://").
|
79
|
+
|
80
|
+
Method : Localize_EX_CSS
|
81
|
+
|
82
|
+
example : OmniScrape.Localize_EX_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable") NOTE: There are no external links in the wikitable!
|
83
|
+
|
84
|
+
This will perform the same actions as Localize_CSS, but only for external links.
|
85
|
+
|
62
86
|
## Contributing
|
63
87
|
|
64
88
|
1. Fork it ( https://github.com/bmaynard1991/omni-scrape )
|
data/lib/omni_scrape.rb
CHANGED
@@ -64,8 +64,17 @@ for i in 0..check
|
|
64
64
|
if (fourofour==false)
|
65
65
|
#store html from the link with title of the link
|
66
66
|
crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
|
67
|
-
|
68
|
-
|
67
|
+
encodingissue=false
|
68
|
+
begin
|
69
|
+
if(encodingissue==false)
|
70
|
+
crfile.puts page
|
71
|
+
end
|
72
|
+
rescue
|
73
|
+
encodingissue=true
|
74
|
+
retry
|
75
|
+
|
76
|
+
end
|
77
|
+
crfile.close
|
69
78
|
end#if
|
70
79
|
end#if != "-"
|
71
80
|
|
@@ -133,7 +142,7 @@ end
|
|
133
142
|
if(refarr[i]!="-")
|
134
143
|
#evaluate whether link is internal or external
|
135
144
|
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
136
|
-
if(refarr[i]['href'].include?('
|
145
|
+
if(refarr[i]['href'].include?('://'))
|
137
146
|
url=refarr[i]['href']
|
138
147
|
else
|
139
148
|
url=sub_url+refarr[i]['href']
|
@@ -257,7 +266,16 @@ else
|
|
257
266
|
clutch -=1
|
258
267
|
|
259
268
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
269
|
+
encodingissue=false
|
270
|
+
begin
|
271
|
+
if(encodingissue==false)
|
260
272
|
crfile.puts page
|
273
|
+
end
|
274
|
+
rescue
|
275
|
+
encodingissue=true
|
276
|
+
retry
|
277
|
+
|
278
|
+
end
|
261
279
|
crfile.close
|
262
280
|
|
263
281
|
end
|
@@ -316,7 +334,7 @@ end
|
|
316
334
|
if(refarr[i]!="-")
|
317
335
|
#evaluate whether link is internal or external
|
318
336
|
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
319
|
-
if(refarr[i]['href'].include?('
|
337
|
+
if(refarr[i]['href'].include?('://'))
|
320
338
|
url=refarr[i]['href']
|
321
339
|
else
|
322
340
|
url=sub_url+refarr[i]['href']
|
@@ -434,7 +452,16 @@ else
|
|
434
452
|
if (link_to_add!="this_is_a_duplicate")
|
435
453
|
|
436
454
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
455
|
+
encodingissue=false
|
456
|
+
begin
|
457
|
+
if(encodingissue==false)
|
437
458
|
crfile.puts page
|
459
|
+
end
|
460
|
+
rescue
|
461
|
+
encodingissue=true
|
462
|
+
retry
|
463
|
+
|
464
|
+
end
|
438
465
|
crfile.close
|
439
466
|
else
|
440
467
|
|
@@ -666,7 +693,16 @@ else
|
|
666
693
|
clutch -=1
|
667
694
|
|
668
695
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
696
|
+
encodingissue=false
|
697
|
+
begin
|
698
|
+
if(encodingissue==false)
|
669
699
|
crfile.puts page
|
700
|
+
end
|
701
|
+
rescue
|
702
|
+
encodingissue=true
|
703
|
+
retry
|
704
|
+
|
705
|
+
end
|
670
706
|
crfile.close
|
671
707
|
|
672
708
|
end
|
@@ -870,7 +906,16 @@ else
|
|
870
906
|
if (link_to_add!="this_is_a_duplicate")
|
871
907
|
|
872
908
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
909
|
+
encodingissue=false
|
910
|
+
begin
|
911
|
+
if(encodingissue==false)
|
873
912
|
crfile.puts page
|
913
|
+
end
|
914
|
+
rescue
|
915
|
+
encodingissue=true
|
916
|
+
retry
|
917
|
+
|
918
|
+
end
|
874
919
|
crfile.close
|
875
920
|
else
|
876
921
|
|
@@ -881,4 +926,1688 @@ end #end def FLocalize_CSS
|
|
881
926
|
|
882
927
|
#########################################################################################
|
883
928
|
|
929
|
+
#############################################################################################
|
930
|
+
|
931
|
+
# Saves a local copy of the page at +url+ as ./page.html, with its internal
# links (hrefs that do not contain "://") rewritten to point at local copies
# of the linked pages, which are fetched and stored by FLocalize_IN.
#
# url     - address of the starting page.
# depth   - number of link levels to follow; negative values are treated as 0.
#           At depth 0 every internal href on the page is blanked instead.
# sub_url - prefix prepended to an href to build the address that is fetched.
#
# Side effects: resets @location_in, creates ./pages<depth>/0set/ as needed,
# writes ./page.html, and performs one probe request per link before
# FLocalize_IN fetches the same address again.
def Localize_IN(url, depth, sub_url)
  # Clamp first, then remember the starting depth. Capturing the starting
  # depth before clamping made a negative depth look like a sub-page and sent
  # the final write to a non-existent "./pages1/-1set/" path.
  depth = 0 if depth < 0
  s_depth = depth
  i_page = 0

  # Original href => path of the local copy already assigned to that href.
  @location_in = {}

  # NOTE(review): this relies on open-uri's Kernel#open handling URLs and
  # disables TLS certificate verification; confirm both are still intended.
  page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))

  # Collect the internal links, in reverse document order (the order the
  # numbered page files have always been assigned in).
  refarr = []
  page.css('a').each do |link|
    href = link['href']
    refarr.push(link) unless href.nil? || href == "" || href.include?('://')
  end
  refarr.reverse!

  if depth > 0
    pages_dir = "./pages#{depth}"
    set_dir = "#{pages_dir}/0set"
    Dir.mkdir(pages_dir) unless Dir.exist?(pages_dir)

    refarr.each_with_index do |link, i|
      href = link['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; an unreachable page gets its href blanked so the
      # saved page does not navigate to a missing file. Only StandardError
      # is rescued so Ctrl-C and exit requests still stop the crawl.
      begin
        Nokogiri::HTML(open(target,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
      rescue StandardError
        link['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "#{set_dir}/#{pass_a_link}"

      if @location_in.key?(href)
        # Already localized: reuse the stored location, trimmed back to the
        # "./pagesN/Mset/AxBpage.html" form that is valid from ./page.html.
        stored = @location_in[href]
        link['href'] = stored.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/).to_s
      else
        link['href'] = linkref
        # Stored relative to the sub-page directories, hence the "../../".
        @location_in[href] = "../../" + linkref
      end

      self.FLocalize_IN(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link)
    end
  else
    # Nothing is followed at depth 0, so make every internal link inert.
    refarr.each { |link| link['href'] = "" }
  end

  # Block form closes the file even if the write raises.
  File.open('./page.html', "w") { |mainpage| mainpage.puts page }
end #end def Localize_IN
|
1126
|
+
|
1127
|
+
#########################################################################################
|
1128
|
+
# Recursive worker for Localize_IN: fetches the page at +url+, rewrites its
# internal links (hrefs that do not contain "://") to local copies, recurses
# into each reachable link, and finally stores the rewritten page.
#
# url         - address of the page to fetch.
# depth       - remaining link levels to follow; negative values become 0.
#               At depth 0 every internal href on the page is blanked.
# sub_url     - prefix prepended to an href to build the address to fetch.
# s_depth     - depth the whole crawl started with.
# i_page      - index of this page within its parent; used in file names.
# prev_ipage  - accepted for call compatibility; not read by this method.
# link_to_add - file name to store this page under, or the marker
#               "this_is_a_duplicate" when the page must not be written.
#
# Side effects: reads and updates @location_in, creates
# ./pages<depth>/<n>set/ directories, and writes one HTML file.
def FLocalize_IN(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
  depth = 0 if depth < 0

  # Normally initialised by Localize_IN; default it so a direct call does
  # not fail on a nil lookup table.
  @location_in ||= {}

  # NOTE(review): this relies on open-uri's Kernel#open handling URLs and
  # disables TLS certificate verification; confirm both are still intended.
  page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))

  # Collect the internal links, in reverse document order (the order the
  # numbered page files have always been assigned in).
  refarr = []
  page.css('a').each do |link|
    href = link['href']
    refarr.push(link) unless href.nil? || href == "" || href.include?('://')
  end
  refarr.reverse!

  # Number of levels below the starting page (never negative).
  levels_down = [s_depth - depth, 0].max

  if depth > 0
    pages_dir = "./pages#{depth}"
    set_dir = "#{pages_dir}/#{levels_down}set"
    Dir.mkdir(pages_dir) unless Dir.exist?(pages_dir)

    refarr.each_with_index do |link, i|
      href = link['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; an unreachable page gets its href blanked. Only
      # StandardError is rescued so Ctrl-C and exit requests still work.
      begin
        Nokogiri::HTML(open(target,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
      rescue StandardError
        link['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "../../pages#{depth}/#{levels_down}set/#{pass_a_link}"

      if @location_in.key?(href)
        # Already stored elsewhere: point at that copy and tell the
        # recursive call not to write a second file.
        pass_a_link = "this_is_a_duplicate"
        link['href'] = @location_in[href]
      else
        link['href'] = linkref
        @location_in[href] = linkref
      end

      self.FLocalize_IN(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link)
    end
  else
    # Nothing is followed at depth 0, so make every internal link inert.
    refarr.each { |link| link['href'] = "" }
  end

  if depth == s_depth
    # Block form closes the file even if the write raises.
    File.open('./page.html', "w") { |mainpage| mainpage.puts page }
  elsif link_to_add != "this_is_a_duplicate"
    # The parent created this directory: it is the parent's depth and the
    # parent's distance from the starting page.
    target_path = "./pages#{depth + 1}/#{levels_down - 1}set/#{link_to_add}"
    File.open(target_path, "w") do |crfile|
      begin
        crfile.puts page
      rescue StandardError
        # Best effort: a page that cannot be written (e.g. an encoding
        # error) is skipped rather than aborting the whole crawl.
      end
    end
  end
  nil
end #end def FLocalize_IN
|
1315
|
+
|
1316
|
+
#########################################################################################
|
1317
|
+
|
1318
|
+
#############################################################################################
|
1319
|
+
|
1320
|
+
# Saves a local copy of the page at +url+ as ./page.html, with its external
# links (hrefs that contain "://") rewritten to point at local copies of the
# linked pages, which are fetched and stored by FLocalize_EX.
#
# url     - address of the starting page.
# depth   - number of link levels to follow; negative values are treated as 0.
#           At depth 0 every external href on the page is blanked instead.
# sub_url - prefix prepended to an href that has no "://"; passed through
#           to FLocalize_EX.
#
# Side effects: resets @location_ex, creates ./pages<depth>/0set/ as needed,
# writes ./page.html, and performs one probe request per link before
# FLocalize_EX fetches the same address again.
def Localize_EX(url, depth, sub_url)
  # Clamp first, then remember the starting depth. Capturing the starting
  # depth before clamping made a negative depth look like a sub-page and sent
  # the final write to a non-existent "./pages1/-1set/" path.
  depth = 0 if depth < 0
  s_depth = depth
  i_page = 0

  # Original href => path of the local copy already assigned to that href.
  @location_ex = {}

  # NOTE(review): this relies on open-uri's Kernel#open handling URLs and
  # disables TLS certificate verification; confirm both are still intended.
  page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))

  # Collect the external links, in reverse document order (the order the
  # numbered page files have always been assigned in).
  refarr = []
  page.css('a').each do |link|
    href = link['href']
    refarr.push(link) if !href.nil? && href != "" && href.include?('://')
  end
  refarr.reverse!

  if depth > 0
    pages_dir = "./pages#{depth}"
    set_dir = "#{pages_dir}/0set"
    Dir.mkdir(pages_dir) unless Dir.exist?(pages_dir)

    refarr.each_with_index do |link, i|
      href = link['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; an unreachable page gets its href blanked so the
      # saved page does not navigate to a missing file. Only StandardError
      # is rescued so Ctrl-C and exit requests still stop the crawl.
      begin
        Nokogiri::HTML(open(target,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
      rescue StandardError
        link['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "#{set_dir}/#{pass_a_link}"

      if @location_ex.key?(href)
        # Already localized: reuse the stored location, trimmed back to the
        # "./pagesN/Mset/AxBpage.html" form that is valid from ./page.html.
        stored = @location_ex[href]
        link['href'] = stored.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/).to_s
      else
        link['href'] = linkref
        # Stored relative to the sub-page directories, hence the "../../".
        @location_ex[href] = "../../" + linkref
      end

      self.FLocalize_EX(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link)
    end
  else
    # Nothing is followed at depth 0, so make every external link inert.
    refarr.each { |link| link['href'] = "" }
  end

  # Block form closes the file even if the write raises.
  File.open('./page.html', "w") { |mainpage| mainpage.puts page }
end #end def Localize_EX
|
1515
|
+
|
1516
|
+
#########################################################################################
|
1517
|
+
# Recursive worker for Localize_EX: fetches the page at +url+, rewrites its
# external links (hrefs that contain "://") to local copies, recurses into
# each reachable link, and finally stores the rewritten page.
#
# url         - address of the page to fetch.
# depth       - remaining link levels to follow; negative values become 0.
#               At depth 0 every external href on the page is blanked.
# sub_url     - prefix prepended to an href that has no "://".
# s_depth     - depth the whole crawl started with.
# i_page      - index of this page within its parent; used in file names.
# prev_ipage  - accepted for call compatibility; not read by this method.
# link_to_add - file name to store this page under, or the marker
#               "this_is_a_duplicate" when the page must not be written.
#
# Side effects: reads and updates @location_ex, creates
# ./pages<depth>/<n>set/ directories, and writes one HTML file.
def FLocalize_EX(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
  depth = 0 if depth < 0

  # Normally initialised by Localize_EX; default it so a direct call does
  # not fail on a nil lookup table.
  @location_ex ||= {}

  # NOTE(review): this relies on open-uri's Kernel#open handling URLs and
  # disables TLS certificate verification; confirm both are still intended.
  page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))

  # Collect the external links, in reverse document order (the order the
  # numbered page files have always been assigned in).
  refarr = []
  page.css('a').each do |link|
    href = link['href']
    refarr.push(link) if !href.nil? && href != "" && href.include?('://')
  end
  refarr.reverse!

  # Number of levels below the starting page (never negative).
  levels_down = [s_depth - depth, 0].max

  if depth > 0
    pages_dir = "./pages#{depth}"
    set_dir = "#{pages_dir}/#{levels_down}set"
    Dir.mkdir(pages_dir) unless Dir.exist?(pages_dir)

    refarr.each_with_index do |link, i|
      href = link['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; an unreachable page gets its href blanked. Only
      # StandardError is rescued so Ctrl-C and exit requests still work.
      begin
        Nokogiri::HTML(open(target,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
      rescue StandardError
        link['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "../../pages#{depth}/#{levels_down}set/#{pass_a_link}"

      if @location_ex.key?(href)
        # Already stored elsewhere: point at that copy and tell the
        # recursive call not to write a second file.
        pass_a_link = "this_is_a_duplicate"
        link['href'] = @location_ex[href]
      else
        link['href'] = linkref
        @location_ex[href] = linkref
      end

      self.FLocalize_EX(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link)
    end
  else
    # Nothing is followed at depth 0, so make every external link inert.
    refarr.each { |link| link['href'] = "" }
  end

  if depth == s_depth
    # Block form closes the file even if the write raises.
    File.open('./page.html', "w") { |mainpage| mainpage.puts page }
  elsif link_to_add != "this_is_a_duplicate"
    # The parent created this directory: it is the parent's depth and the
    # parent's distance from the starting page.
    target_path = "./pages#{depth + 1}/#{levels_down - 1}set/#{link_to_add}"
    File.open(target_path, "w") do |crfile|
      begin
        crfile.puts page
      rescue StandardError
        # Best effort: a page that cannot be written (e.g. an encoding
        # error) is skipped rather than aborting the whole crawl.
      end
    end
  end
  nil
end #end def FLocalize_EX
|
1704
|
+
|
1705
|
+
#########################################################################################
|
1706
|
+
|
1707
|
+
#############################################################################################
|
1708
|
+
|
1709
|
+
# Scrapes +url+ and localizes its *internal* links (hrefs without "://")
# that sit underneath the CSS +selector+.
#
# Internal links outside the selector are blanked (href set to ""), external
# links are left untouched. For depth > 0 every remaining link is fetched
# once as a reachability probe; unreachable links are blanked, reachable ones
# are rewritten to a local path under ./pages<depth>/0set/ and handed to
# FLocalize_IN_CSS with depth - 1. For depth <= 0 every remaining link is
# blanked. The rewritten start page is always written to ./page.html.
#
# url      - address of the start page
# depth    - number of link levels to follow; negative values are treated as 0
# sub_url  - prefix prepended to internal hrefs to build a fetchable address
# selector - CSS selector limiting which anchors are followed
#
# Raises whatever opening/parsing the start page or writing ./page.html
# raises; failures while probing an individual link are swallowed and only
# blank that link.
def Localize_IN_CSS(url, depth, sub_url, selector)
  # Clamp BEFORE remembering the starting depth. Capturing the raw value
  # first made a negative depth look like a nested call, which then tried to
  # write into a non-existent ./pages1/-1set/ directory and crashed.
  depth = 0 if depth < 0
  s_depth = depth
  i_page = 0

  # original href => local path it was rewritten to (shared with FLocalize_IN_CSS)
  @location_IN_CSS = {}

  # NOTE(review): TLS certificate verification is disabled for every fetch.
  page = Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))

  has_href = ->(node) { !node['href'].nil? && node['href'] != "" }
  internal = page.css('a').select { |node| has_href.call(node) && !node['href'].include?('://') }
  selected = page.css(selector + ' a').select(&has_href)

  # Internal anchors outside the selector lose their href; the rest are kept.
  kept = []
  internal.each do |anchor|
    if selected.include?(anchor)
      kept << anchor
    else
      anchor['href'] = ""
    end
  end
  # Links are processed last-to-first; the index in this reversed list is
  # part of the generated file names.
  refarr = kept.reverse

  if depth > 0
    Dir.mkdir("./pages#{depth}") unless Dir.exist?("./pages#{depth}")
    set_dir = "./pages#{depth}/0set"

    refarr.each_with_index do |anchor, i|
      href = anchor['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; anything that cannot be fetched/parsed is neutralised.
      # StandardError only, so Ctrl-C and exit requests still propagate.
      begin
        Nokogiri::HTML(open(target, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))
      rescue StandardError
        anchor['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "#{set_dir}/#{pass_a_link}"

      if @location_IN_CSS.key?(href)
        # Already localized: reuse the stored location, reduced to the
        # "./pagesN/Mset/AxBpage.html" part that is valid from this page.
        loc = @location_IN_CSS[href]
        sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
        anchor['href'] = sub_loc.to_s
      else
        anchor['href'] = linkref
        # Stored with a "../../" prefix, i.e. as seen from a page saved two
        # directories below the start page.
        @location_IN_CSS[href] = "../../" + linkref
      end

      self.FLocalize_IN_CSS(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link, selector)
    end
  else
    # Nothing is followed at depth 0, so no link may point anywhere.
    refarr.each { |anchor| anchor['href'] = "" }
  end

  # Block form closes the handle even if serialising the page fails.
  File.open('./page.html', "w") { |mainpage| mainpage.puts page }
end #end def Localize_IN_CSS
|
1941
|
+
|
1942
|
+
#########################################################################################
|
1943
|
+
# Recursive worker behind Localize_IN_CSS: localizes the *internal* links
# (hrefs without "://") under +selector+ on one already-discovered page and
# saves that page to disk.
#
# url         - address of the page to process
# depth       - remaining link levels to follow; negative values are treated as 0
# sub_url     - prefix prepended to internal hrefs to build a fetchable address
# s_depth     - depth the whole crawl started with
# i_page      - index of this page in its parent's link list (used in file names)
# prev_ipage  - accepted for signature compatibility; not read here
# link_to_add - file name to store this page under, or "this_is_a_duplicate"
#               when the page was already stored and must not be written again
# selector    - CSS selector limiting which anchors are followed
#
# Raises whatever opening/parsing +url+ or opening the output file raises;
# failures while probing an individual link only blank that link.
def FLocalize_IN_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
  depth = 0 if depth < 0
  # Normally created by Localize_IN_CSS; initialised here so a direct call
  # does not fail on a nil map.
  @location_IN_CSS ||= {}

  # NOTE(review): TLS certificate verification is disabled for every fetch.
  page = Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))

  has_href = ->(node) { !node['href'].nil? && node['href'] != "" }
  internal = page.css('a').select { |node| has_href.call(node) && !node['href'].include?('://') }
  selected = page.css(selector + ' a').select(&has_href)

  # Internal anchors outside the selector lose their href; the rest are kept.
  kept = []
  internal.each do |anchor|
    if selected.include?(anchor)
      kept << anchor
    else
      anchor['href'] = ""
    end
  end
  # Processed last-to-first; the index in this list is part of file names.
  refarr = kept.reverse

  # Number of levels already descended (never negative); names the "<n>set" directory.
  levels_down = [s_depth - depth, 0].max

  if depth > 0
    Dir.mkdir("./pages#{depth}") unless Dir.exist?("./pages#{depth}")
    set_dir = "./pages#{depth}/#{levels_down}set"

    refarr.each_with_index do |anchor, i|
      href = anchor['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; anything that cannot be fetched/parsed is neutralised.
      # StandardError only, so Ctrl-C and exit requests still propagate.
      begin
        Nokogiri::HTML(open(target, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))
      rescue StandardError
        anchor['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      linkref = "../../pages#{depth}/#{levels_down}set/#{i_page}x#{i}page.html"

      if @location_IN_CSS.key?(href)
        # Already stored elsewhere: point at that copy and tell the nested
        # call not to write the page again.
        pass_a_link = "this_is_a_duplicate"
        anchor['href'] = @location_IN_CSS[href]
      else
        pass_a_link = "#{i_page}x#{i}page.html"
        anchor['href'] = linkref
        @location_IN_CSS[href] = linkref
      end

      self.FLocalize_IN_CSS(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link, selector)
    end
  else
    # Nothing is followed at depth 0, so no link may point anywhere.
    refarr.each { |anchor| anchor['href'] = "" }
  end

  if depth == s_depth
    File.open('./page.html', "w") { |mainpage| mainpage.puts page }
  elsif link_to_add != "this_is_a_duplicate"
    # The parent created ./pages<depth+1>/<levels_down-1>set/ for this page.
    target_path = "./pages#{depth + 1}/#{levels_down - 1}set/#{link_to_add}"
    File.open(target_path, "w") do |crfile|
      begin
        crfile.puts page
      rescue StandardError
        # Best effort, as before: a page that cannot be serialised (e.g. an
        # encoding problem) is skipped instead of aborting the crawl.
      end
    end
  end
end #end def FLocalize_IN_CSS
|
2157
|
+
|
2158
|
+
#########################################################################################
|
2159
|
+
|
2160
|
+
#############################################################################################
|
2161
|
+
|
2162
|
+
# Scrapes +url+ and localizes its *external* links (hrefs containing "://")
# that sit underneath the CSS +selector+.
#
# External links outside the selector are blanked (href set to ""), internal
# links are left untouched. For depth > 0 every remaining link is fetched
# once as a reachability probe; unreachable links are blanked, reachable ones
# are rewritten to a local path under ./pages<depth>/0set/ and handed to
# FLocalize_EX_CSS with depth - 1. For depth <= 0 every remaining link is
# blanked. The rewritten start page is always written to ./page.html.
#
# url      - address of the start page
# depth    - number of link levels to follow; negative values are treated as 0
# sub_url  - prefix prepended to any href lacking "://" to build a fetchable address
# selector - CSS selector limiting which anchors are followed
#
# Raises whatever opening/parsing the start page or writing ./page.html
# raises; failures while probing an individual link are swallowed and only
# blank that link.
def Localize_EX_CSS(url, depth, sub_url, selector)
  # Clamp BEFORE remembering the starting depth. Capturing the raw value
  # first made a negative depth look like a nested call, which then tried to
  # write into a non-existent ./pages1/-1set/ directory and crashed.
  depth = 0 if depth < 0
  s_depth = depth
  i_page = 0

  # original href => local path it was rewritten to (shared with FLocalize_EX_CSS)
  @location_EX_CSS = {}

  # NOTE(review): TLS certificate verification is disabled for every fetch.
  page = Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))

  has_href = ->(node) { !node['href'].nil? && node['href'] != "" }
  external = page.css('a').select { |node| has_href.call(node) && node['href'].include?('://') }
  selected = page.css(selector + ' a').select(&has_href)

  # External anchors outside the selector lose their href; the rest are kept.
  kept = []
  external.each do |anchor|
    if selected.include?(anchor)
      kept << anchor
    else
      anchor['href'] = ""
    end
  end
  # Links are processed last-to-first; the index in this reversed list is
  # part of the generated file names.
  refarr = kept.reverse

  if depth > 0
    Dir.mkdir("./pages#{depth}") unless Dir.exist?("./pages#{depth}")
    set_dir = "./pages#{depth}/0set"

    refarr.each_with_index do |anchor, i|
      href = anchor['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; anything that cannot be fetched/parsed is neutralised.
      # StandardError only, so Ctrl-C and exit requests still propagate.
      begin
        Nokogiri::HTML(open(target, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))
      rescue StandardError
        anchor['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      pass_a_link = "#{i_page}x#{i}page.html"
      linkref = "#{set_dir}/#{pass_a_link}"

      if @location_EX_CSS.key?(href)
        # Already localized: reuse the stored location, reduced to the
        # "./pagesN/Mset/AxBpage.html" part that is valid from this page.
        loc = @location_EX_CSS[href]
        sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
        anchor['href'] = sub_loc.to_s
      else
        anchor['href'] = linkref
        # Stored with a "../../" prefix, i.e. as seen from a page saved two
        # directories below the start page.
        @location_EX_CSS[href] = "../../" + linkref
      end

      self.FLocalize_EX_CSS(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link, selector)
    end
  else
    # Nothing is followed at depth 0, so no link may point anywhere.
    refarr.each { |anchor| anchor['href'] = "" }
  end

  # Block form closes the handle even if serialising the page fails.
  File.open('./page.html', "w") { |mainpage| mainpage.puts page }
end #end def Localize_EX_CSS
|
2394
|
+
|
2395
|
+
#########################################################################################
|
2396
|
+
# Recursive worker behind Localize_EX_CSS: localizes the *external* links
# (hrefs containing "://") under +selector+ on one already-discovered page
# and saves that page to disk.
#
# url         - address of the page to process
# depth       - remaining link levels to follow; negative values are treated as 0
# sub_url     - prefix prepended to any href lacking "://" to build a fetchable address
# s_depth     - depth the whole crawl started with
# i_page      - index of this page in its parent's link list (used in file names)
# prev_ipage  - accepted for signature compatibility; not read here
# link_to_add - file name to store this page under, or "this_is_a_duplicate"
#               when the page was already stored and must not be written again
# selector    - CSS selector limiting which anchors are followed
#
# Raises whatever opening/parsing +url+ or opening the output file raises;
# failures while probing an individual link only blank that link.
def FLocalize_EX_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
  depth = 0 if depth < 0
  # Normally created by Localize_EX_CSS; initialised here so a direct call
  # does not fail on a nil map.
  @location_EX_CSS ||= {}

  # NOTE(review): TLS certificate verification is disabled for every fetch.
  page = Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))

  has_href = ->(node) { !node['href'].nil? && node['href'] != "" }
  external = page.css('a').select { |node| has_href.call(node) && node['href'].include?('://') }
  selected = page.css(selector + ' a').select(&has_href)

  # External anchors outside the selector lose their href; the rest are kept.
  kept = []
  external.each do |anchor|
    if selected.include?(anchor)
      kept << anchor
    else
      anchor['href'] = ""
    end
  end
  # Processed last-to-first; the index in this list is part of file names.
  refarr = kept.reverse

  # Number of levels already descended (never negative); names the "<n>set" directory.
  levels_down = [s_depth - depth, 0].max

  if depth > 0
    Dir.mkdir("./pages#{depth}") unless Dir.exist?("./pages#{depth}")
    set_dir = "./pages#{depth}/#{levels_down}set"

    refarr.each_with_index do |anchor, i|
      href = anchor['href']
      target = href.include?('://') ? href : sub_url + href

      # Probe the link; anything that cannot be fetched/parsed is neutralised.
      # StandardError only, so Ctrl-C and exit requests still propagate.
      begin
        Nokogiri::HTML(open(target, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))
      rescue StandardError
        anchor['href'] = ""
        next
      end

      Dir.mkdir(set_dir) unless Dir.exist?(set_dir)
      linkref = "../../pages#{depth}/#{levels_down}set/#{i_page}x#{i}page.html"

      if @location_EX_CSS.key?(href)
        # Already stored elsewhere: point at that copy and tell the nested
        # call not to write the page again.
        pass_a_link = "this_is_a_duplicate"
        anchor['href'] = @location_EX_CSS[href]
      else
        pass_a_link = "#{i_page}x#{i}page.html"
        anchor['href'] = linkref
        @location_EX_CSS[href] = linkref
      end

      self.FLocalize_EX_CSS(target, depth - 1, sub_url, s_depth, i, i_page, pass_a_link, selector)
    end
  else
    # Nothing is followed at depth 0, so no link may point anywhere.
    refarr.each { |anchor| anchor['href'] = "" }
  end

  if depth == s_depth
    File.open('./page.html', "w") { |mainpage| mainpage.puts page }
  elsif link_to_add != "this_is_a_duplicate"
    # The parent created ./pages<depth+1>/<levels_down-1>set/ for this page.
    target_path = "./pages#{depth + 1}/#{levels_down - 1}set/#{link_to_add}"
    File.open(target_path, "w") do |crfile|
      begin
        crfile.puts page
      rescue StandardError
        # Best effort, as before: a page that cannot be serialised (e.g. an
        # encoding problem) is skipped instead of aborting the crawl.
      end
    end
  end
end #end def FLocalize_EX_CSS
|
2610
|
+
|
2611
|
+
#########################################################################################
|
2612
|
+
|
884
2613
|
end#module
|