omni_scrape 0.1.9.5 → 0.1.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -6
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +861 -319
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 16691660248616c512cfd9584c4427d8d1dbeeab
|
4
|
+
data.tar.gz: 9bed0a9bbeee104d330a59d119b23d4ce0c08966
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0770d6acc779099924a7a3ad9a2ca3748e727742964859ede575a8d2899ff166a97b5a5e83da10d53b7ad57c9145f203902700608c8fae914cc40fd310deed1
|
7
|
+
data.tar.gz: f012b22b16276f3c1ee06ed8d2b654f2aa07ccf023035fb7ae51a5f4d249049d22b8ecb0f6bd243c3fc250f60099b4fb51a97e048cc45c0c84a11395b7a826de
|
data/README.md
CHANGED
@@ -29,7 +29,7 @@ This method takes three parameters the first should be the url to start at.
|
|
29
29
|
|
30
30
|
The second parameter is currently unimplemented but will be the depth to crawl. (just pass it 1)
|
31
31
|
|
32
|
-
The third is a sub-url for internal links.
|
32
|
+
The third is a sub-url for internal links.q
|
33
33
|
|
34
34
|
Method : Localize
|
35
35
|
|
@@ -45,16 +45,19 @@ description: Localize will follow every link from the page provided and scrape t
|
|
45
45
|
|
46
46
|
The pages are linked to other local pages. NOTE: Removed duplication :)
|
47
47
|
|
48
|
-
Method :
|
48
|
+
Method : Localize_CSS
|
49
|
+
|
50
|
+
example:OmniScrape.Localize_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable")
|
49
51
|
|
50
|
-
This
|
52
|
+
This method takes four parameters the first should be the url to start at.
|
51
53
|
|
54
|
+
The second parameter is the depth to crawl. ***Warning: crawling may grow at an INSANE rate.
|
52
55
|
|
53
|
-
|
56
|
+
The third is a sub-url for internal links.
|
54
57
|
|
55
|
-
|
58
|
+
The fourth is a css selector for what parts of all pages you want to take the links for.
|
56
59
|
|
57
|
-
|
60
|
+
description: Localize_CSS offers the same service that Localize provides while at the same time giving you the option to limit the result set using a css selector.
|
58
61
|
|
59
62
|
## Contributing
|
60
63
|
|
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -1,342 +1,884 @@
|
|
1
1
|
require "omni_scrape/version"
|
2
2
|
module OmniScrape
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
4
|
+
##########################################################################################
|
5
|
+
|
6
|
+
def CrawlScrape(url, depth, sub_url)
|
7
|
+
if (depth<0)
|
8
|
+
depth=0
|
9
|
+
end#if
|
10
|
+
s_depth = depth #true
|
11
|
+
#open the starting page
|
12
|
+
page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #good
|
13
|
+
#collect all of the links from the page
|
14
|
+
links= page.css('a') #good
|
15
|
+
|
16
|
+
#initialize variables
|
17
|
+
refarr=[]
|
18
|
+
hrefs = []
|
19
|
+
#add title and href to arrays for each link
|
20
|
+
links.each do |link|
|
21
21
|
if(link['href']!=nil && link['href']!="")
|
22
|
-
|
23
|
-
end
|
24
|
-
|
22
|
+
hrefs.push(link)
|
23
|
+
end#if
|
24
|
+
end#do
|
25
|
+
|
26
|
+
#transfer links to other array
|
27
|
+
while(!hrefs.empty?)
|
28
|
+
value= hrefs.pop
|
29
|
+
|
30
|
+
refarr.push(value)
|
31
|
+
|
25
32
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
33
|
+
end#while
|
34
|
+
#setup for recognition of the end of the array
|
35
|
+
refarr.push("-")
|
36
|
+
|
37
|
+
#create folder for storing current set of scraped pages
|
38
|
+
|
39
|
+
if (Dir.exist?('./results'+depth.to_s))
|
40
|
+
else Dir.mkdir('./results'+depth.to_s)
|
41
|
+
end#if
|
42
|
+
#in each link
|
43
|
+
check =(refarr.length-1)
|
44
|
+
for i in 0..check
|
45
|
+
if(refarr[i]!="-")#still valid links
|
46
|
+
#evaluate whether link is internal or external
|
47
|
+
if(refarr[i]['href'].include?('://') && refarr[i]!=nil)
|
48
|
+
url=refarr[i]['href']
|
49
|
+
else
|
50
|
+
url=sub_url+refarr[i]['href']
|
51
|
+
end#if include?
|
52
|
+
fourofour=false
|
53
|
+
|
54
|
+
begin
|
55
|
+
if(fourofour==false)
|
56
|
+
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
57
|
+
end#if
|
58
|
+
#test for a 404
|
59
|
+
rescue Exception =>ex
|
60
|
+
|
61
|
+
fourofour=true
|
62
|
+
retry
|
63
|
+
end#begin
|
64
|
+
if (fourofour==false)
|
65
|
+
#store html from the link with title of the link
|
66
|
+
crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
|
67
|
+
crfile.puts pagina
|
68
|
+
crfile.close
|
69
|
+
end#if
|
70
|
+
end#if != "-"
|
71
|
+
|
72
|
+
end#end for each
|
73
|
+
|
74
|
+
end#def crawlscrape
|
75
|
+
|
76
|
+
#############################################################################################
|
77
|
+
|
78
|
+
def Localize(url, depth, sub_url)
|
79
|
+
|
80
|
+
#initialize to extract from user view
|
81
|
+
@location = Hash.new
|
82
|
+
s_depth = depth
|
83
|
+
i_page = 0
|
84
|
+
prev_ipage = 0
|
85
|
+
link_to_add =""
|
86
|
+
if (depth<0)
|
87
|
+
depth=0
|
88
|
+
end
|
89
|
+
#open the starting page
|
90
|
+
page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
91
|
+
#collect all of the links from the page
|
92
|
+
links= page.css('a')
|
93
|
+
title = page.css('title')
|
94
|
+
#initialize variables
|
95
|
+
refarr=[]
|
96
|
+
hrefs = []
|
97
|
+
x=0
|
98
|
+
|
99
|
+
#add href to arrays for each link
|
100
|
+
links.each do |link|
|
101
|
+
if(link['href']!=nil && link['href']!="")
|
102
|
+
# puts x
|
103
|
+
# puts (link['title'].split.join)
|
104
|
+
# x+=1
|
105
|
+
hrefs.push(link)
|
56
106
|
|
57
|
-
fourofour=true
|
58
|
-
retry
|
59
|
-
end
|
60
|
-
if (fourofour==false)
|
61
|
-
#store html from the link with title of the link
|
62
|
-
crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
|
63
|
-
crfile.puts pagina
|
64
|
-
crfile.close
|
65
|
-
end
|
66
107
|
end
|
67
108
|
|
68
|
-
|
69
|
-
|
109
|
+
end
|
110
|
+
total=0
|
111
|
+
#transfer links to other array
|
112
|
+
while(!hrefs.empty?)
|
113
|
+
value= hrefs.pop
|
114
|
+
refarr.push(value)
|
115
|
+
total+=1
|
70
116
|
end
|
71
117
|
|
72
|
-
|
118
|
+
|
119
|
+
|
120
|
+
#setup for recognition of the end of the array
|
121
|
+
refarr.push("-")
|
122
|
+
|
123
|
+
if(depth>0)
|
124
|
+
|
125
|
+
#create subdirectory for storing current set of scraped pages
|
126
|
+
|
127
|
+
if (Dir.exist?('./pages'+depth.to_s))
|
128
|
+
else Dir.mkdir('./pages'+depth.to_s)
|
129
|
+
end
|
130
|
+
#in each link
|
131
|
+
check = (refarr.length-1)
|
132
|
+
for i in 0..check
|
133
|
+
if(refarr[i]!="-")
|
134
|
+
#evaluate whether link is internal or external
|
135
|
+
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
136
|
+
if(refarr[i]['href'].include?('http://'))
|
137
|
+
url=refarr[i]['href']
|
138
|
+
else
|
139
|
+
url=sub_url+refarr[i]['href']
|
140
|
+
#puts "external link"
|
141
|
+
end#refarr[i]['href'].include?
|
142
|
+
end#refarr[i]['href']!=nil
|
143
|
+
fourofour=false
|
73
144
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
145
|
+
begin
|
146
|
+
if(fourofour==false)
|
147
|
+
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
148
|
+
end
|
149
|
+
#test for a 404
|
150
|
+
rescue Exception =>ex
|
151
|
+
#puts "got a 404"
|
152
|
+
#replace href (no navigation onclick)
|
153
|
+
refarr[i]['href'] =""
|
154
|
+
fourofour=true
|
155
|
+
|
156
|
+
retry
|
157
|
+
end #begin
|
158
|
+
|
159
|
+
if (fourofour==false)
|
160
|
+
#make relevant links reference local files
|
161
|
+
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
162
|
+
|
163
|
+
|
164
|
+
j_depth = s_depth - depth
|
165
|
+
appendval = "../"
|
166
|
+
clutch = 0
|
167
|
+
for r in 1..j_depth
|
168
|
+
|
169
|
+
clutch +=1
|
170
|
+
end
|
171
|
+
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
172
|
+
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
173
|
+
end
|
174
|
+
if (depth == s_depth)
|
175
|
+
|
176
|
+
linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
177
|
+
else
|
178
|
+
|
179
|
+
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
180
|
+
end
|
181
|
+
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
182
|
+
if (@location.has_key?(refarr[i]['href']))
|
183
|
+
loc = @location[(refarr[i]['href'])]
|
184
|
+
sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
|
185
|
+
refarr[i]['href'] =sub_loc
|
186
|
+
else
|
187
|
+
initial_link=refarr[i]['href']
|
188
|
+
refarr[i]['href']=linkref
|
189
|
+
|
190
|
+
#HERE!!!!!**!*!*@*!!@@***!
|
191
|
+
if (depth == s_depth)
|
192
|
+
full_link = "../../"+linkref
|
193
|
+
else
|
194
|
+
full_link = linkref
|
195
|
+
end
|
196
|
+
@location[initial_link]=full_link
|
197
|
+
#puts "working"
|
198
|
+
end# @location.haskey
|
199
|
+
end #refarr[i]['href']!=""
|
200
|
+
|
201
|
+
#trim it down and remove special characters for display
|
202
|
+
trimval=refarr[i]['href']
|
203
|
+
finval=trimval.gsub!(/[!:\/-]/, '')
|
204
|
+
#puts refarr[i]
|
205
|
+
if(finval==nil && refarr[i]!=nil)
|
206
|
+
finval=refarr[i]
|
207
|
+
end #finval == nil
|
94
208
|
|
95
|
-
|
96
|
-
|
209
|
+
n_depth = depth-1
|
210
|
+
|
211
|
+
if(finval!=nil)
|
212
|
+
self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
|
213
|
+
#create subdirectory for storing current links page
|
214
|
+
#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
|
215
|
+
#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
|
216
|
+
#end
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
end #finval!=nil
|
222
|
+
end #fourofour==false
|
223
|
+
end #refarr[i]!="-"
|
224
|
+
|
225
|
+
end#end for each
|
226
|
+
|
227
|
+
|
228
|
+
|
229
|
+
|
230
|
+
else#<< depth not > 0
|
231
|
+
check = (refarr.length-1)
|
232
|
+
for i in 0..check
|
233
|
+
if (refarr[i]['href']!=nil && refarr[i]['href']!="")
|
234
|
+
refarr[i]['href']=""
|
235
|
+
end
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
if (depth == s_depth)
|
240
|
+
#store newly generated html/links for current page
|
241
|
+
mainpage =File.new('./page.html',"w")
|
242
|
+
mainpage.puts page
|
243
|
+
mainpage.close
|
244
|
+
|
245
|
+
|
246
|
+
else
|
247
|
+
#store page from the link in the subdirectory
|
248
|
+
puts "page: "
|
249
|
+
p_depth = depth +1
|
250
|
+
j_depth = s_depth - depth
|
251
|
+
appendval = ""
|
252
|
+
clutch = 0
|
253
|
+
for r in 1..j_depth
|
254
|
+
appendval += "../"
|
255
|
+
clutch +=1
|
256
|
+
end
|
257
|
+
clutch -=1
|
258
|
+
|
259
|
+
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
260
|
+
crfile.puts page
|
261
|
+
crfile.close
|
262
|
+
|
263
|
+
end
|
264
|
+
end #end def Localize
|
265
|
+
|
266
|
+
#########################################################################################
|
267
|
+
def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
|
268
|
+
#open the starting page
|
269
|
+
|
270
|
+
if (depth<0)
|
271
|
+
depth=0
|
272
|
+
end
|
273
|
+
page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
274
|
+
#collect all of the links from the page
|
275
|
+
links= page.css('a')
|
276
|
+
title = page.css('title')
|
277
|
+
#initialize variables
|
278
|
+
refarr=[]
|
279
|
+
hrefs = []
|
280
|
+
x=0
|
281
|
+
|
282
|
+
#add href to arrays for each link
|
283
|
+
links.each do |link|
|
97
284
|
if(link['href']!=nil && link['href']!="")
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
while(!hrefs.empty?)
|
104
|
-
value= hrefs.pop
|
105
|
-
refarr.push(value)
|
106
|
-
total+=1
|
107
|
-
end
|
108
|
-
#setup for recognition of the end of the array
|
109
|
-
refarr.push("-")
|
110
|
-
if(depth>0)
|
111
|
-
#create subdirectory for storing current set of scraped pages
|
112
|
-
if (Dir.exist?('./pages'+depth.to_s))
|
113
|
-
else Dir.mkdir('./pages'+depth.to_s)
|
114
|
-
end
|
115
|
-
#in each link
|
116
|
-
check = (refarr.length-1)
|
117
|
-
for i in 0..check
|
118
|
-
if(refarr[i]!="-")
|
119
|
-
#evaluate whether link is internal or external
|
120
|
-
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
121
|
-
if(refarr[i]['href'].include?('http://'))
|
122
|
-
url=refarr[i]['href']
|
123
|
-
else
|
124
|
-
url=sub_url+refarr[i]['href']
|
125
|
-
end#refarr[i]['href'].include?
|
126
|
-
end#refarr[i]['href']!=nil
|
127
|
-
fourofour=false
|
128
|
-
begin
|
129
|
-
if(fourofour==false)
|
130
|
-
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
131
|
-
end
|
132
|
-
#test for a 404
|
133
|
-
rescue Exception =>ex
|
134
|
-
#replace href (no navigation onclick)
|
135
|
-
refarr[i]['href'] =""
|
136
|
-
fourofour=true
|
137
|
-
retry
|
138
|
-
end
|
139
|
-
if (fourofour==false)
|
140
|
-
#make relevant links reference local files
|
141
|
-
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
142
|
-
j_depth = s_depth - depth
|
143
|
-
appendval = "../"
|
144
|
-
clutch = 0
|
145
|
-
for r in 1..j_depth
|
146
|
-
clutch +=1
|
147
|
-
end
|
148
|
-
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
149
|
-
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
150
|
-
end
|
151
|
-
if (depth == s_depth)
|
152
|
-
linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
153
|
-
else
|
154
|
-
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
155
|
-
end
|
156
|
-
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
157
|
-
if (@location.has_key?(refarr[i]['href']))
|
158
|
-
loc = @location[(refarr[i]['href'])]
|
159
|
-
sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
|
160
|
-
refarr[i]['href'] =sub_loc
|
161
|
-
else
|
162
|
-
initial_link=refarr[i]['href']
|
163
|
-
refarr[i]['href']=linkref
|
164
|
-
if (depth == s_depth)
|
165
|
-
full_link = "../../"+linkref
|
166
|
-
else
|
167
|
-
full_link = linkref
|
168
|
-
end
|
169
|
-
@location[initial_link]=full_link
|
170
|
-
end
|
171
|
-
end
|
172
|
-
#trim it down and remove special characters for display
|
173
|
-
trimval=refarr[i]['href']
|
174
|
-
finval=trimval.gsub!(/[!:\/-]/, '')
|
175
|
-
if(finval==nil && refarr[i]!=nil)
|
176
|
-
finval=refarr[i]
|
177
|
-
end
|
178
|
-
n_depth = depth-1
|
179
|
-
if(finval!=nil)
|
180
|
-
self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
|
181
|
-
end
|
182
|
-
end
|
183
|
-
end
|
184
|
-
end
|
185
|
-
else
|
186
|
-
check = (refarr.length-1)
|
187
|
-
for i in 0..check
|
188
|
-
if (refarr[i]['href']!=nil && refarr[i]['href']!="")
|
189
|
-
refarr[i]['href']=""
|
190
|
-
end
|
285
|
+
# puts x
|
286
|
+
# puts (link['title'].split.join)
|
287
|
+
# x+=1
|
288
|
+
hrefs.push(link)
|
289
|
+
|
191
290
|
end
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
291
|
+
|
292
|
+
end
|
293
|
+
total=0
|
294
|
+
#transfer links to other array
|
295
|
+
while(!hrefs.empty?)
|
296
|
+
value= hrefs.pop
|
297
|
+
refarr.push(value)
|
298
|
+
total+=1
|
299
|
+
end
|
300
|
+
|
301
|
+
|
302
|
+
|
303
|
+
#setup for recognition of the end of the array
|
304
|
+
refarr.push("-")
|
305
|
+
|
306
|
+
if(depth>0)
|
307
|
+
|
308
|
+
#create subdirectory for storing current set of scraped pages
|
309
|
+
|
310
|
+
if (Dir.exist?('./pages'+depth.to_s))
|
311
|
+
else Dir.mkdir('./pages'+depth.to_s)
|
312
|
+
end
|
313
|
+
#in each link
|
314
|
+
check = (refarr.length-1)
|
315
|
+
for i in 0..check
|
316
|
+
if(refarr[i]!="-")
|
317
|
+
#evaluate whether link is internal or external
|
318
|
+
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
319
|
+
if(refarr[i]['href'].include?('http://'))
|
320
|
+
url=refarr[i]['href']
|
321
|
+
else
|
322
|
+
url=sub_url+refarr[i]['href']
|
323
|
+
#puts "external link"
|
324
|
+
end#refarr[i]['href'].include?
|
325
|
+
end#refarr[i]['href']!=nil
|
326
|
+
fourofour=false
|
327
|
+
|
328
|
+
begin
|
329
|
+
if(fourofour==false)
|
330
|
+
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
207
331
|
end
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
332
|
+
#test for a 404
|
333
|
+
rescue Exception =>ex
|
334
|
+
#puts "got a 404"
|
335
|
+
#replace href (no navigation onclick)
|
336
|
+
refarr[i]['href'] =""
|
337
|
+
fourofour=true
|
338
|
+
|
339
|
+
retry
|
340
|
+
end #begin
|
341
|
+
|
342
|
+
if (fourofour==false)
|
343
|
+
#make relevant links reference local files
|
344
|
+
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
345
|
+
|
346
|
+
|
347
|
+
j_depth = s_depth - depth
|
348
|
+
appendval = "../"
|
349
|
+
clutch = 0
|
350
|
+
for r in 1..j_depth
|
351
|
+
|
352
|
+
clutch +=1
|
353
|
+
end
|
354
|
+
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
355
|
+
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
356
|
+
end
|
357
|
+
|
358
|
+
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
|
359
|
+
|
360
|
+
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
361
|
+
if (@location.has_key?(refarr[i]['href']))
|
362
|
+
pass_a_link = "this_is_a_duplicate"
|
363
|
+
refarr[i]['href'] = @location[(refarr[i]['href'])]
|
364
|
+
|
365
|
+
else
|
366
|
+
initial_link=refarr[i]['href']
|
367
|
+
refarr[i]['href']=linkref
|
368
|
+
|
369
|
+
|
370
|
+
|
371
|
+
full_link = linkref
|
372
|
+
|
373
|
+
@location[initial_link]=linkref
|
374
|
+
#puts "working"
|
375
|
+
end# @location.haskey
|
376
|
+
end #refarr[i]['href']!=""
|
377
|
+
|
378
|
+
|
379
|
+
#trim it down and remove special characters for display
|
380
|
+
trimval=refarr[i]['href']
|
381
|
+
finval=trimval.gsub!(/[!:\/-]/, '')
|
382
|
+
#puts refarr[i]
|
383
|
+
if(finval==nil && refarr[i]!=nil)
|
384
|
+
finval=refarr[i]
|
385
|
+
end #finval == nil
|
386
|
+
|
387
|
+
n_depth = depth-1
|
388
|
+
|
389
|
+
if(finval!=nil)
|
390
|
+
self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
|
391
|
+
|
392
|
+
|
393
|
+
|
394
|
+
|
395
|
+
end #finval!=nil
|
396
|
+
end #fourofour==false
|
397
|
+
end #refarr[i]!="-"
|
398
|
+
|
399
|
+
end#end for each
|
400
|
+
|
401
|
+
|
402
|
+
|
403
|
+
|
404
|
+
else#<< depth not > 0
|
405
|
+
check = (refarr.length-1)
|
406
|
+
for i in 0..check
|
407
|
+
if (refarr[i]['href']!=nil && refarr[i]['href']!="")
|
408
|
+
refarr[i]['href']=""
|
409
|
+
|
410
|
+
end
|
411
|
+
end
|
412
|
+
end
|
413
|
+
|
414
|
+
if (depth == s_depth)
|
415
|
+
#store newly generated html/links for current page
|
416
|
+
mainpage =File.new('./page.html',"w")
|
417
|
+
mainpage.puts page
|
418
|
+
mainpage.close
|
419
|
+
|
420
|
+
|
421
|
+
else
|
422
|
+
#store page from the link in the subdirectory
|
423
|
+
|
424
|
+
p_depth = depth +1
|
425
|
+
j_depth = s_depth - depth
|
426
|
+
appendval = ""
|
427
|
+
clutch = 0
|
428
|
+
for r in 1..j_depth
|
429
|
+
appendval += "../"
|
430
|
+
clutch +=1
|
431
|
+
end
|
432
|
+
clutch -=1
|
433
|
+
|
434
|
+
if (link_to_add!="this_is_a_duplicate")
|
435
|
+
|
436
|
+
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
437
|
+
crfile.puts page
|
438
|
+
crfile.close
|
439
|
+
else
|
440
|
+
|
441
|
+
end
|
442
|
+
|
443
|
+
end
|
444
|
+
end #end def FLocalize
|
445
|
+
|
446
|
+
#########################################################################################
|
447
|
+
|
448
|
+
|
449
|
+
#############################################################################################
|
450
|
+
|
451
|
+
def Localize_CSS(url, depth, sub_url,selector)
|
452
|
+
|
453
|
+
#initialize to extract from user view
|
454
|
+
@location_CSS = Hash.new
|
455
|
+
s_depth = depth
|
456
|
+
i_page = 0
|
457
|
+
prev_ipage = 0
|
458
|
+
link_to_add =""
|
459
|
+
if (depth<0)
|
460
|
+
depth=0
|
461
|
+
end
|
462
|
+
#open the starting page
|
463
|
+
page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
464
|
+
#collect all of the links from the page
|
465
|
+
links= page.css('a')
|
466
|
+
title = page.css('title')
|
467
|
+
#initialize variables
|
468
|
+
refarr=[]
|
469
|
+
hrefs = []
|
470
|
+
linkseti= []
|
471
|
+
linkset= []
|
472
|
+
x=0
|
473
|
+
|
474
|
+
linkseti = page.css(selector+' a')
|
475
|
+
#add each link with valid href to array
|
476
|
+
links.each do |link|
|
230
477
|
if(link['href']!=nil && link['href']!="")
|
231
|
-
|
478
|
+
# puts x
|
479
|
+
# puts (link['title'].split.join)
|
480
|
+
# x+=1
|
481
|
+
hrefs.push(link)
|
482
|
+
|
232
483
|
end
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
refarr.push("-")
|
243
|
-
if(depth>0)
|
244
|
-
#create subdirectory for storing current set of scraped pages
|
245
|
-
if (Dir.exist?('./pages'+depth.to_s))
|
246
|
-
else Dir.mkdir('./pages'+depth.to_s)
|
247
|
-
end
|
248
|
-
#in each link
|
249
|
-
check = (refarr.length-1)
|
250
|
-
for i in 0..check
|
251
|
-
if(refarr[i]!="-")
|
252
|
-
#evaluate whether link is internal or external
|
253
|
-
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
254
|
-
if(refarr[i]['href'].include?('http://'))
|
255
|
-
url=refarr[i]['href']
|
256
|
-
else
|
257
|
-
url=sub_url+refarr[i]['href']
|
258
|
-
end
|
259
|
-
end
|
260
|
-
fourofour=false
|
261
|
-
begin
|
262
|
-
if(fourofour==false)
|
263
|
-
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
264
|
-
end
|
265
|
-
#test for a 404
|
266
|
-
rescue Exception =>ex
|
267
|
-
#replace href (no navigation onclick)
|
268
|
-
refarr[i]['href'] =""
|
269
|
-
fourofour=true
|
270
|
-
retry
|
271
|
-
end
|
272
|
-
if (fourofour==false)
|
273
|
-
#make relevant links reference local files
|
274
|
-
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
275
|
-
j_depth = s_depth - depth
|
276
|
-
appendval = "../"
|
277
|
-
clutch = 0
|
278
|
-
for r in 1..j_depth
|
279
|
-
clutch +=1
|
280
|
-
end
|
281
|
-
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
282
|
-
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
283
|
-
end
|
284
|
-
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
|
285
|
-
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
286
|
-
if (@location.has_key?(refarr[i]['href']))
|
287
|
-
pass_a_link = "this_is_a_duplicate"
|
288
|
-
refarr[i]['href'] = @location[(refarr[i]['href'])]
|
289
|
-
else
|
290
|
-
initial_link=refarr[i]['href']
|
291
|
-
refarr[i]['href']=linkref
|
292
|
-
full_link = linkref
|
293
|
-
@location[initial_link]=linkref
|
294
|
-
end
|
295
|
-
end
|
296
|
-
#trim it down and remove special characters for display
|
297
|
-
trimval=refarr[i]['href']
|
298
|
-
finval=trimval.gsub!(/[!:\/-]/, '')
|
299
|
-
if(finval==nil && refarr[i]!=nil)
|
300
|
-
finval=refarr[i]
|
301
|
-
end
|
302
|
-
n_depth = depth-1
|
303
|
-
if(finval!=nil)
|
304
|
-
self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
|
305
|
-
end
|
306
|
-
end
|
484
|
+
|
485
|
+
end
|
486
|
+
linkseti.each do |ilink|
|
487
|
+
if(ilink['href']!=nil && ilink['href']!="")
|
488
|
+
# puts x
|
489
|
+
# puts (link['title'].split.join)
|
490
|
+
# x+=1
|
491
|
+
linkset.push(ilink)
|
492
|
+
|
307
493
|
end
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
494
|
+
|
495
|
+
end
|
496
|
+
hrefslength = (hrefs.length-1)
|
497
|
+
for i in 0..hrefslength
|
498
|
+
if(linkset.include?(hrefs[i]))
|
499
|
+
else
|
500
|
+
if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
|
501
|
+
hrefs[i]['href']=""
|
502
|
+
end
|
503
|
+
|
504
|
+
end
|
505
|
+
end
|
506
|
+
|
507
|
+
|
508
|
+
#transfer links to other array
|
509
|
+
while(!hrefs.empty?)
|
510
|
+
value= hrefs.pop
|
511
|
+
if (value['href']!=nil && value['href']!="")
|
512
|
+
refarr.push(value)
|
513
|
+
end
|
514
|
+
|
515
|
+
end
|
516
|
+
|
517
|
+
|
518
|
+
|
519
|
+
|
520
|
+
|
521
|
+
|
522
|
+
|
523
|
+
#setup for recognition of the end of the array
|
524
|
+
refarr.push("-")
|
525
|
+
|
526
|
+
if(depth>0)
|
527
|
+
|
528
|
+
#create subdirectory for storing current set of scraped pages
|
529
|
+
|
530
|
+
if (Dir.exist?('./pages'+depth.to_s))
|
531
|
+
else Dir.mkdir('./pages'+depth.to_s)
|
532
|
+
end
|
533
|
+
#in each link
|
534
|
+
check = (refarr.length-1)
|
535
|
+
for i in 0..check
|
536
|
+
if(refarr[i]!="-")
|
537
|
+
if(linkset.include?(refarr[i]))
|
538
|
+
else
|
539
|
+
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
540
|
+
refarr[i]['href']=""
|
541
|
+
end
|
542
|
+
end
|
543
|
+
#evaluate whether link is internal or external
|
544
|
+
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
545
|
+
if(refarr[i]['href'].include?('://'))
|
546
|
+
url=refarr[i]['href']
|
547
|
+
else
|
548
|
+
url=sub_url+refarr[i]['href']
|
549
|
+
#puts "external link"
|
550
|
+
end#refarr[i]['href'].include?
|
551
|
+
end#refarr[i]['href']!=nil
|
552
|
+
fourofour=false
|
553
|
+
|
554
|
+
begin
|
555
|
+
if(fourofour==false && refarr[i]['href']!=nil)
|
556
|
+
pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
|
315
557
|
end
|
316
|
-
|
317
|
-
|
318
|
-
#
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
558
|
+
#test for a 404
|
559
|
+
rescue Exception =>ex
|
560
|
+
#puts "got a 404"
|
561
|
+
#replace href (no navigation onclick)
|
562
|
+
refarr[i]['href'] =""
|
563
|
+
fourofour=true
|
564
|
+
|
565
|
+
retry
|
566
|
+
end #begin
|
567
|
+
|
568
|
+
if (fourofour==false)
|
569
|
+
#make relevant links reference local files
|
570
|
+
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
571
|
+
|
572
|
+
|
573
|
+
j_depth = s_depth - depth
|
574
|
+
appendval = "../"
|
575
|
+
clutch = 0
|
576
|
+
for r in 1..j_depth
|
577
|
+
|
578
|
+
clutch +=1
|
579
|
+
end
|
580
|
+
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
581
|
+
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
582
|
+
end
|
583
|
+
if (depth == s_depth)
|
584
|
+
|
585
|
+
linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
586
|
+
else
|
587
|
+
|
588
|
+
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
589
|
+
end
|
590
|
+
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
591
|
+
if (@location_CSS.has_key?(refarr[i]['href']))
|
592
|
+
loc = @location_CSS[(refarr[i]['href'])]
|
593
|
+
sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
|
594
|
+
refarr[i]['href'] =sub_loc
|
595
|
+
else
|
596
|
+
initial_link=refarr[i]['href']
|
597
|
+
refarr[i]['href']=linkref
|
598
|
+
|
599
|
+
#HERE!!!!!**!*!*@*!!@@***!
|
600
|
+
if (depth == s_depth)
|
601
|
+
full_link = "../../"+linkref
|
602
|
+
else
|
603
|
+
full_link = linkref
|
604
|
+
end
|
605
|
+
@location_CSS[initial_link]=full_link
|
606
|
+
#puts "working"
|
607
|
+
end# @location_CSS.haskey
|
608
|
+
end #refarr[i]['href']!=""
|
609
|
+
|
610
|
+
#trim it down and remove special characters for display
|
611
|
+
trimval=refarr[i]['href']
|
612
|
+
finval=trimval.gsub!(/[!:\/-]/, '')
|
613
|
+
#puts refarr[i]
|
614
|
+
if(finval==nil && refarr[i]!=nil)
|
615
|
+
finval=refarr[i]
|
616
|
+
end #finval == nil
|
617
|
+
|
618
|
+
n_depth = depth-1
|
619
|
+
|
620
|
+
if(finval!=nil)
|
621
|
+
self. FLocalize_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector)
|
622
|
+
#create subdirectory for storing current links page
|
623
|
+
#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
|
624
|
+
#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
|
625
|
+
#end
|
626
|
+
|
627
|
+
|
628
|
+
|
629
|
+
|
630
|
+
end #finval!=nil
|
631
|
+
end #fourofour==false
|
632
|
+
end #refarr[i]!="-"
|
633
|
+
|
634
|
+
end#end for each
|
635
|
+
|
636
|
+
|
637
|
+
|
638
|
+
|
639
|
+
else#<< depth not > 0
|
640
|
+
check = (refarr.length-1)
|
641
|
+
for i in 0..check
|
642
|
+
if (refarr[i]['href']!=nil && refarr[i]['href']!="")
|
643
|
+
refarr[i]['href']=""
|
644
|
+
end
|
645
|
+
end
|
646
|
+
end
|
647
|
+
|
648
|
+
if (depth == s_depth)
|
649
|
+
#store newly generated html/links for current page
|
650
|
+
mainpage =File.new('./page.html',"w")
|
651
|
+
mainpage.puts page
|
652
|
+
mainpage.close
|
653
|
+
|
654
|
+
|
655
|
+
else
|
656
|
+
#store page from the link in the subdirectory
|
657
|
+
puts "page: "
|
658
|
+
p_depth = depth +1
|
659
|
+
j_depth = s_depth - depth
|
660
|
+
appendval = ""
|
661
|
+
clutch = 0
|
662
|
+
for r in 1..j_depth
|
663
|
+
appendval += "../"
|
664
|
+
clutch +=1
|
665
|
+
end
|
666
|
+
clutch -=1
|
667
|
+
|
668
|
+
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
669
|
+
crfile.puts page
|
670
|
+
crfile.close
|
671
|
+
|
672
|
+
end
|
673
|
+
end #end def Localize_CSS
|
674
|
+
|
675
|
+
#########################################################################################
|
676
|
+
# Recursively scrape +url+ and every link it contains, rewriting links that
# match +selector+ to point at locally saved copies of the target pages.
#
# @param url        [String]  page to fetch on this recursion level
# @param depth      [Integer] remaining crawl depth (clamped to >= 0)
# @param sub_url    [String]  base URL prepended to relative hrefs
# @param s_depth    [Integer] the starting depth (constant across recursion)
# @param i_page     [Integer] index of the link on the parent page
# @param prev_ipage [Integer] parent's own link index (kept for interface
#                             compatibility; not read here)
# @param link_to_add [String] filename to save this page under, or the
#                             sentinel "this_is_a_duplicate" to skip saving
# @param selector   [String]  CSS scope; only links inside it are localized
#
# Side effects: creates ./pagesN/<k>set/ directories and HTML files, writes
# ./page.html at the top level, and memoizes rewritten hrefs in @location_CSS.
def FLocalize_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
  # Never recurse with a negative depth.
  depth = 0 if depth < 0

  # Open the starting page. SSL verification is intentionally disabled, as
  # in the rest of this module.
  page = Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))

  # Collect every link on the page, and separately the links inside the
  # caller-supplied CSS scope.
  links    = page.css('a')
  linkseti = page.css(selector + ' a')

  refarr  = []
  hrefs   = []
  linkset = []

  # Keep only anchors that actually have a non-empty href.
  links.each do |link|
    hrefs.push(link) if link['href'] != nil && link['href'] != ""
  end
  linkseti.each do |ilink|
    linkset.push(ilink) if ilink['href'] != nil && ilink['href'] != ""
  end

  # Blank the href of every link that is NOT inside the selector scope, so
  # only in-scope links are followed/localized below.
  hrefs.each do |link|
    next if linkset.include?(link)
    link['href'] = "" if link['href'] != nil && link['href'] != ""
  end

  # Transfer surviving links into refarr (pop order, as before).
  while !hrefs.empty?
    value = hrefs.pop
    refarr.push(value) if value['href'] != nil && value['href'] != ""
  end

  # Sentinel marking the end of the array.
  refarr.push("-")

  if depth > 0
    # Create the subdirectory for the current set of scraped pages.
    Dir.mkdir('./pages' + depth.to_s) unless Dir.exist?('./pages' + depth.to_s)

    check = refarr.length - 1
    for i in 0..check
      if refarr[i] != "-"
        # Resolve the link: absolute URLs pass through, relative ones get
        # the sub_url prefix.
        if refarr[i]['href'] != nil && refarr[i]['href'] != ""
          if refarr[i]['href'].include?('://')
            url = refarr[i]['href']
          else
            url = sub_url + refarr[i]['href']
          end
        end

        # Probe the target; on any fetch error (404 etc.) blank the href so
        # the saved page has no dead navigation. fourofour guards the retry
        # so the begin block exits cleanly on the second pass.
        # FIX: rescue StandardError, not Exception — rescuing Exception
        # swallowed SignalException/SystemExit and made Ctrl-C loop forever.
        fourofour = false
        begin
          if fourofour == false
            Nokogiri::HTML(open(url, { ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE }))
          end
        rescue StandardError
          refarr[i]['href'] = ""
          fourofour = true
          retry
        end

        if fourofour == false
          pass_a_link = nil
          if refarr[i]['href'] != "" && refarr[i]['href'] != nil
            # clutch counts how many levels deep we are relative to the
            # starting depth (the old for-loop was just this assignment).
            clutch = s_depth - depth

            setdir = './pages' + depth.to_s + "/" + clutch.to_s + "set"
            Dir.mkdir(setdir) unless Dir.exist?(setdir)

            # Relative path from the saved page back to the localized copy.
            linkref = "../" + '../pages' + depth.to_s + "/" + clutch.to_s + "set/" +
                      i_page.to_s + "x" + i.to_s + "page.html"

            pass_a_link = i_page.to_s + "x" + i.to_s + "page.html"
            if @location_CSS.has_key?(refarr[i]['href'])
              # Already scraped: reuse the stored local path, don't re-save.
              pass_a_link = "this_is_a_duplicate"
              refarr[i]['href'] = @location_CSS[refarr[i]['href']]
            else
              # First sighting: memoize original -> local mapping, rewrite.
              initial_link = refarr[i]['href']
              refarr[i]['href'] = linkref
              @location_CSS[initial_link] = linkref
            end
          end

          # gsub! returns nil when nothing changed; finval is only used as a
          # "should we recurse?" flag, mirroring the original logic.
          trimval = refarr[i]['href']
          finval  = trimval.gsub!(/[!:\/-]/, '')
          finval  = refarr[i] if finval == nil && refarr[i] != nil

          if finval != nil
            self.FLocalize_CSS(url, depth - 1, sub_url, s_depth, i, i_page, pass_a_link, selector)
          end
        end # fourofour==false
      end # refarr[i]!="-"
    end # for each link
  else
    # Depth exhausted: blank every remaining href so the leaf pages carry
    # no outbound navigation.
    check = refarr.length - 1
    for i in 0..check
      if refarr[i]['href'] != nil && refarr[i]['href'] != ""
        refarr[i]['href'] = ""
      end
    end
  end

  if depth == s_depth
    # Top level: store the rewritten html for the starting page.
    # FIX: block form of File.open guarantees the handle is closed even if
    # writing raises (File.new + close leaked on error).
    File.open('./page.html', "w") { |f| f.puts page }
  else
    # Store this page in the subdirectory for its level, unless it is a
    # duplicate of an already-saved page.
    p_depth = depth + 1
    clutch  = (s_depth - depth) - 1

    if link_to_add != "this_is_a_duplicate"
      path = './pages' + p_depth.to_s + "/" + clutch.to_s + "set/" + link_to_add
      File.open(path, "w") { |f| f.puts page }
    end
  end
end #end def FLocalize_CSS
|
881
|
+
|
882
|
+
#########################################################################################
|
883
|
+
|
884
|
+
end#module
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9.
|
4
|
+
version: 0.1.9.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bradley Maynard
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|