omni_scrape 0.1.9.5 → 0.1.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1c091a8e69686b9b06509d534dae916b1de5d6f0
- data.tar.gz: 69d97456f7d6bcd7233d92d8f1403986531dafe6
+ metadata.gz: 16691660248616c512cfd9584c4427d8d1dbeeab
+ data.tar.gz: 9bed0a9bbeee104d330a59d119b23d4ce0c08966
  SHA512:
- metadata.gz: 9753df86287deb32dc0c5d08fa97608900e57c3c561aaeee05a935c57e4172f1c0f52b2c9f74254c46b66702e3e718a15318c0581b62d8a140db9c5d88593090
- data.tar.gz: 9ce62aef0e7dd0c861fb7fec00055c76e708602059794a4843cc48f8fe4196a7fd0f1bbfaf337cdc1fa4fea1604494ada28e0d6f5ba764a06c5798fbecd0f3d6
+ metadata.gz: a0770d6acc779099924a7a3ad9a2ca3748e727742964859ede575a8d2899ff166a97b5a5e83da10d53b7ad57c9145f203902700608c8fae914cc40fd310deed1
+ data.tar.gz: f012b22b16276f3c1ee06ed8d2b654f2aa07ccf023035fb7ae51a5f4d249049d22b8ecb0f6bd243c3fc250f60099b4fb51a97e048cc45c0c84a11395b7a826de
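
Both digest families change because the gem was rebuilt for this release. For anyone auditing the upgrade, the published values can be reproduced locally. A minimal sketch in Ruby, assuming the `.gem` file has already been fetched and unpacked (a `.gem` is a plain tar archive whose members include `metadata.gz` and `data.tar.gz`; the paths below are illustrative):

```ruby
require "digest"

# Hash the gem's inner members and compare against checksums.yaml above.
# Assumes `tar -xf omni_scrape-0.1.9.9.gem` was run in the current directory.
%w[metadata.gz data.tar.gz].each do |member|
  bytes = File.binread(member)
  puts "#{member} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{member} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```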
data/README.md CHANGED
@@ -29,7 +29,7 @@ This method takes three parameters the first should be the url to start at.

  The second parameter is currently unimplemented but will be the depth to crawl. (just pass it 1)

- The third is a sub-url for internal links.
+ The third is a sub-url for internal links.q

  Method : Localize

@@ -45,16 +45,19 @@ description: Localize will follow every link from the page provided and scrape t

  The pages are linked to other local pages. NOTE: Removed duplication :)

- Method : FLocalize
+ Method : Localize_CSS
+
+ example:OmniScrape.Localize_CSS("https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "https://en.wikipedia.org", "div table.wikitable")

- This is the recursive method called by Localize and shouldn't be used directly. :)
+ This method takes four parameters the first should be the url to start at.

+ The second parameter is the depth to crawl. ***Warning: crawling may grow at an INSANE rate.

- ## Development
+ The third is a sub-url for internal links.

- After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
+ The fourth is a css selector for what parts of all pages you want to take the links for.

- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+ description: Localize_CSS offers the same service that Localize provides while at the same time giving you the option to limit the result set using a css selector.

  ## Contributing

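Read together, the README changes drop the note about the internal FLocalize helper and document the new Localize_CSS entry point instead. A minimal usage sketch built from the README's own example (the module-level call style mirrors the README; the URL and selector are just the example's values):

```ruby
require "omni_scrape"

# Mirrors the README example: start URL, crawl depth, sub-url for
# internal links, and a CSS selector scoping which links are followed.
OmniScrape.Localize_CSS(
  "https://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games",
  1,                           # depth: the README warns deeper crawls grow at an insane rate
  "https://en.wikipedia.org",  # prefix applied to internal (relative) links
  "div table.wikitable"        # only links inside this selector are kept
)
```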
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
  module OmniScrape
- VERSION = "0.1.9.5"
+ VERSION = "0.1.9.9"
  end
data/lib/omni_scrape.rb CHANGED
@@ -1,342 +1,884 @@
  require "omni_scrape/version"
  module OmniScrape

- ##########################################################################################
-
- def CrawlScrape(url, depth, sub_url)
- if (depth<0)
- depth=0
- end
- s_depth = depth
- #open the starting page
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- #collect all of the links from the page
- links= page.css('a')
-
- #initialize variables
- refarr=[]
- hrefs = []
- #add title and href to arrays for each link
- links.each do |link|
+ ##########################################################################################
+
+ def CrawlScrape(url, depth, sub_url)
+ if (depth<0)
+ depth=0
+ end#if
+ s_depth = depth #true
+ #open the starting page
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE})) #good
+ #collect all of the links from the page
+ links= page.css('a') #good
+
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ #add title and href to arrays for each link
+ links.each do |link|
  if(link['href']!=nil && link['href']!="")
- hrefs.push(link)
- end
- end
+ hrefs.push(link)
+ end#if
+ end#do
+
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+
+ refarr.push(value)
+

- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
- refarr.push(value)
- end
- #setup for recognition of the end of the array
- refarr.push("-")
-
- #create folder for storing current set of scraped pages
- if (Dir.exist?('./results'+depth.to_s))
- else Dir.mkdir('./results'+depth.to_s)
- end
- #in each link
- check =(refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")#still valid links
- #evaluate whether link is internal or external
- if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- end
- fourofour=false
-
- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- end
- #test for a 404
- rescue Exception =>ex
+ end#while
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ #create folder for storing current set of scraped pages
+
+ if (Dir.exist?('./results'+depth.to_s))
+ else Dir.mkdir('./results'+depth.to_s)
+ end#if
+ #in each link
+ check =(refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")#still valid links
+ #evaluate whether link is internal or external
+ if(refarr[i]['href'].include?('://') && refarr[i]!=nil)
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ end#if include?
+ fourofour=false
+
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end#if
+ #test for a 404
+ rescue Exception =>ex
+
+ fourofour=true
+ retry
+ end#begin
+ if (fourofour==false)
+ #store html from the link with title of the link
+ crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
+ crfile.puts pagina
+ crfile.close
+ end#if
+ end#if != "-"
+
+ end#end for each
+
+ end#def crawlscrape
+
+ #############################################################################################
+
+ def Localize(url, depth, sub_url)
+
+ #initialize to extract from user view
+ @location = Hash.new
+ s_depth = depth
+ i_page = 0
+ prev_ipage = 0
+ link_to_add =""
+ if (depth<0)
+ depth=0
+ end
+ #open the starting page
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ x=0
+
+ #add href to arrays for each link
+ links.each do |link|
+ if(link['href']!=nil && link['href']!="")
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ hrefs.push(link)

- fourofour=true
- retry
- end
- if (fourofour==false)
- #store html from the link with title of the link
- crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
- crfile.puts pagina
- crfile.close
- end
  end

- end
-
+ end
+ total=0
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ refarr.push(value)
+ total+=1
  end

- #############################################################################################
+
+
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ if(depth>0)
+
+ #create subdirectory for storing current set of scraped pages
+
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('http://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ #puts "external link"
+ end#refarr[i]['href'].include?
+ end#refarr[i]['href']!=nil
+ fourofour=false

- def Localize(url, depth, sub_url)
-
- #initialize to extract from user view
- @location = Hash.new
- s_depth = depth
- i_page = 0
- prev_ipage = 0
- link_to_add =""
- if (depth<0)
- depth=0
- end
- #open the starting page
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- #collect all of the links from the page
- links= page.css('a')
- title = page.css('title')
- #initialize variables
- refarr=[]
- hrefs = []
- x=0
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ #test for a 404
+ rescue Exception =>ex
+ #puts "got a 404"
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+
+ retry
+ end #begin
+
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+
+
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+ if (depth == s_depth)
+
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ else
+
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ end
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location.has_key?(refarr[i]['href']))
+ loc = @location[(refarr[i]['href'])]
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
+ refarr[i]['href'] =sub_loc
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+
+ #HERE!!!!!**!*!*@*!!@@***!
+ if (depth == s_depth)
+ full_link = "../../"+linkref
+ else
+ full_link = linkref
+ end
+ @location[initial_link]=full_link
+ #puts "working"
+ end# @location.haskey
+ end #refarr[i]['href']!=""
+
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ #puts refarr[i]
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end #finval == nil

- #add href to arrays for each link
- links.each do |link|
+ n_depth = depth-1
+
+ if(finval!=nil)
+ self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+ #create subdirectory for storing current links page
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
+ #end
+
+
+
+
+ end #finval!=nil
+ end #fourofour==false
+ end #refarr[i]!="-"
+
+ end#end for each
+
+
+
+
+ else#<< depth not > 0
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+ end
+ end
+ end
+
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+
+
+ else
+ #store page from the link in the subdirectory
+ puts "page: "
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+
+ end
+ end #end def Localize
+
+ #########################################################################################
+ def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
+ #open the starting page
+
+ if (depth<0)
+ depth=0
+ end
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ x=0
+
+ #add href to arrays for each link
+ links.each do |link|
  if(link['href']!=nil && link['href']!="")
- hrefs.push(link)
- end
- end
- total=0
- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
- refarr.push(value)
- total+=1
- end
- #setup for recognition of the end of the array
- refarr.push("-")
- if(depth>0)
- #create subdirectory for storing current set of scraped pages
- if (Dir.exist?('./pages'+depth.to_s))
- else Dir.mkdir('./pages'+depth.to_s)
- end
- #in each link
- check = (refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")
- #evaluate whether link is internal or external
- if(refarr[i]['href']!=nil && refarr[i]['href']!="")
- if(refarr[i]['href'].include?('http://'))
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- end#refarr[i]['href'].include?
- end#refarr[i]['href']!=nil
- fourofour=false
- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- end
- #test for a 404
- rescue Exception =>ex
- #replace href (no navigation onclick)
- refarr[i]['href'] =""
- fourofour=true
- retry
- end
- if (fourofour==false)
- #make relevant links reference local files
- if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
- j_depth = s_depth - depth
- appendval = "../"
- clutch = 0
- for r in 1..j_depth
- clutch +=1
- end
- if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
- else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
- end
- if (depth == s_depth)
- linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
- else
- linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
- end
- pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
- if (@location.has_key?(refarr[i]['href']))
- loc = @location[(refarr[i]['href'])]
- sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
- refarr[i]['href'] =sub_loc
- else
- initial_link=refarr[i]['href']
- refarr[i]['href']=linkref
- if (depth == s_depth)
- full_link = "../../"+linkref
- else
- full_link = linkref
- end
- @location[initial_link]=full_link
- end
- end
- #trim it down and remove special characters for display
- trimval=refarr[i]['href']
- finval=trimval.gsub!(/[!:\/-]/, '')
- if(finval==nil && refarr[i]!=nil)
- finval=refarr[i]
- end
- n_depth = depth-1
- if(finval!=nil)
- self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
- end
- end
- end
- end
- else
- check = (refarr.length-1)
- for i in 0..check
- if (refarr[i]['href']!=nil && refarr[i]['href']!="")
- refarr[i]['href']=""
- end
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ hrefs.push(link)
+
  end
- end
- if (depth == s_depth)
- #store newly generated html/links for current page
- mainpage =File.new('./page.html',"w")
- mainpage.puts page
- mainpage.close
- else
- #store page from the link in the subdirectory
- p_depth = depth +1
- j_depth = s_depth - depth
- appendval = ""
- clutch = 0
- for r in 1..j_depth
- appendval += "../"
- clutch +=1
+
+ end
+ total=0
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ refarr.push(value)
+ total+=1
+ end
+
+
+
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ if(depth>0)
+
+ #create subdirectory for storing current set of scraped pages
+
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('http://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ #puts "external link"
+ end#refarr[i]['href'].include?
+ end#refarr[i]['href']!=nil
+ fourofour=false
+
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
  end
- clutch -=1
- crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
- crfile.puts page
- crfile.close
- end
- end
- #########################################################################################
- def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
- #open the starting page
- if (depth<0)
- depth=0
- end
- page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- #collect all of the links from the page
- links= page.css('a')
- title = page.css('title')
- #initialize variables
- refarr=[]
- hrefs = []
- x=0
- #add href to arrays for each link
- links.each do |link|
+ #test for a 404
+ rescue Exception =>ex
+ #puts "got a 404"
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+
+ retry
+ end #begin
+
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+
+
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
+
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location.has_key?(refarr[i]['href']))
+ pass_a_link = "this_is_a_duplicate"
+ refarr[i]['href'] = @location[(refarr[i]['href'])]
+
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+
+
+
+ full_link = linkref
+
+ @location[initial_link]=linkref
+ #puts "working"
+ end# @location.haskey
+ end #refarr[i]['href']!=""
+
+
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ #puts refarr[i]
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end #finval == nil
+
+ n_depth = depth-1
+
+ if(finval!=nil)
+ self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+
+
+
+
+ end #finval!=nil
+ end #fourofour==false
+ end #refarr[i]!="-"
+
+ end#end for each
+
+
+
+
+ else#<< depth not > 0
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+
+ end
+ end
+ end
+
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+
+
+ else
+ #store page from the link in the subdirectory
+
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+
+ if (link_to_add!="this_is_a_duplicate")
+
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+ else
+
+ end
+
+ end
+ end #end def FLocalize
+
+ #########################################################################################
+
+
+ #############################################################################################
+
+ def Localize_CSS(url, depth, sub_url,selector)
+
+ #initialize to extract from user view
+ @location_CSS = Hash.new
+ s_depth = depth
+ i_page = 0
+ prev_ipage = 0
+ link_to_add =""
+ if (depth<0)
+ depth=0
+ end
+ #open the starting page
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ linkseti= []
+ linkset= []
+ x=0
+
+ linkseti = page.css(selector+' a')
+ #add each link with valid href to array
+ links.each do |link|
  if(link['href']!=nil && link['href']!="")
- hrefs.push(link)
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ hrefs.push(link)
+
  end
- end
- total=0
- #transfer links to other array
- while(!hrefs.empty?)
- value= hrefs.pop
- refarr.push(value)
- total+=1
- end
- #setup for recognition of the end of the array
- refarr.push("-")
- if(depth>0)
- #create subdirectory for storing current set of scraped pages
- if (Dir.exist?('./pages'+depth.to_s))
- else Dir.mkdir('./pages'+depth.to_s)
- end
- #in each link
- check = (refarr.length-1)
- for i in 0..check
- if(refarr[i]!="-")
- #evaluate whether link is internal or external
- if(refarr[i]['href']!=nil && refarr[i]['href']!="")
- if(refarr[i]['href'].include?('http://'))
- url=refarr[i]['href']
- else
- url=sub_url+refarr[i]['href']
- end
- end
- fourofour=false
- begin
- if(fourofour==false)
- pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
- end
- #test for a 404
- rescue Exception =>ex
- #replace href (no navigation onclick)
- refarr[i]['href'] =""
- fourofour=true
- retry
- end
- if (fourofour==false)
- #make relevant links reference local files
- if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
- j_depth = s_depth - depth
- appendval = "../"
- clutch = 0
- for r in 1..j_depth
- clutch +=1
- end
- if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
- else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
- end
- linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
- pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
- if (@location.has_key?(refarr[i]['href']))
- pass_a_link = "this_is_a_duplicate"
- refarr[i]['href'] = @location[(refarr[i]['href'])]
- else
- initial_link=refarr[i]['href']
- refarr[i]['href']=linkref
- full_link = linkref
- @location[initial_link]=linkref
- end
- end
- #trim it down and remove special characters for display
- trimval=refarr[i]['href']
- finval=trimval.gsub!(/[!:\/-]/, '')
- if(finval==nil && refarr[i]!=nil)
- finval=refarr[i]
- end
- n_depth = depth-1
- if(finval!=nil)
- self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
- end
- end
+
+ end
+ linkseti.each do |ilink|
+ if(ilink['href']!=nil && ilink['href']!="")
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ linkset.push(ilink)
+
  end
- end
- else
- check = (refarr.length-1)
- for i in 0..check
- if (refarr[i]['href']!=nil && refarr[i]['href']!="")
- refarr[i]['href']=""
- end
+
+ end
+ hrefslength = (hrefs.length-1)
+ for i in 0..hrefslength
+ if(linkset.include?(hrefs[i]))
+ else
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
+ hrefs[i]['href']=""
+ end
+
+ end
+ end
+
+
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ if (value['href']!=nil && value['href']!="")
+ refarr.push(value)
+ end
+
+ end
+
+
+
+
+
+
+
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ if(depth>0)
+
+ #create subdirectory for storing current set of scraped pages
+
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+ if(linkset.include?(refarr[i]))
+ else
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+ end
+ end
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ #puts "external link"
+ end#refarr[i]['href'].include?
+ end#refarr[i]['href']!=nil
+ fourofour=false
+
+ begin
+ if(fourofour==false && refarr[i]['href']!=nil)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
  end
- end
- if (depth == s_depth)
- #store newly generated html/links for current page
- mainpage =File.new('./page.html',"w")
- mainpage.puts page
- mainpage.close
- else
- #store page from the link in the subdirectory
- p_depth = depth +1
- j_depth = s_depth - depth
- appendval = ""
- clutch = 0
- for r in 1..j_depth
- appendval += "../"
- clutch +=1
+ #test for a 404
+ rescue Exception =>ex
+ #puts "got a 404"
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+
+ retry
+ end #begin
+
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+
+
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+ if (depth == s_depth)
+
+ linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ else
+
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+ end
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location_CSS.has_key?(refarr[i]['href']))
+ loc = @location_CSS[(refarr[i]['href'])]
+ sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
+ refarr[i]['href'] =sub_loc
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+
+ #HERE!!!!!**!*!*@*!!@@***!
+ if (depth == s_depth)
+ full_link = "../../"+linkref
+ else
+ full_link = linkref
+ end
+ @location_CSS[initial_link]=full_link
+ #puts "working"
+ end# @location_CSS.haskey
+ end #refarr[i]['href']!=""
+
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ #puts refarr[i]
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end #finval == nil
+
+ n_depth = depth-1
+
+ if(finval!=nil)
+ self. FLocalize_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector)
+ #create subdirectory for storing current links page
+ #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
+ #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
+ #end
+
+
+
+
+ end #finval!=nil
+ end #fourofour==false
+ end #refarr[i]!="-"
+
+ end#end for each
+
+
+
+
+ else#<< depth not > 0
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+ end
+ end
+ end
+
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+
+
+ else
+ #store page from the link in the subdirectory
+ puts "page: "
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+
+ end
+ end #end def Localize_CSS
+
+ #########################################################################################
+ def FLocalize_CSS(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add, selector)
+ #open the starting page
+
+ if (depth<0)
+ depth=0
+ end
+ page = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ #collect all of the links from the page
+ links= page.css('a')
+ title = page.css('title')
+ #initialize variables
+ refarr=[]
+ hrefs = []
+ linkseti= []
+ linkset= []
+ x=0
+
+ linkseti = page.css(selector+' a')
+ #add each link with valid href to array
+ links.each do |link|
+ if(link['href']!=nil && link['href']!="")
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ hrefs.push(link)
+
  end
- clutch -=1
- if (link_to_add!="this_is_a_duplicate")
- crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
- crfile.puts page
- crfile.close
- else
+
+ end
+ linkseti.each do |ilink|
+ if(ilink['href']!=nil && ilink['href']!="")
+ # puts x
+ # puts (link['title'].split.join)
+ # x+=1
+ linkset.push(ilink)
+
  end
- end
+
+ end
+ hrefslength = (hrefs.length-1)
+ for i in 0..hrefslength
+ if(linkset.include?(hrefs[i]))
+ else
+ if(hrefs[i]['href']!=nil && hrefs[i]['href']!="")
+ hrefs[i]['href']=""
+ end
+
+ end
+ end
+
+
+
+ #transfer links to other array
+ while(!hrefs.empty?)
+ value= hrefs.pop
+ if (value['href']!=nil && value['href']!="")
+ refarr.push(value)
+ end
+
  end
- #########################################################################################
+
+ #setup for recognition of the end of the array
+ refarr.push("-")
+
+ if(depth>0)
+
+ #create subdirectory for storing current set of scraped pages
+
+ if (Dir.exist?('./pages'+depth.to_s))
+ else Dir.mkdir('./pages'+depth.to_s)
+ end
+ #in each link
+ check = (refarr.length-1)
+ for i in 0..check
+ if(refarr[i]!="-")
+
+
+ #evaluate whether link is internal or external
+ if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+ if(refarr[i]['href'].include?('://'))
+ url=refarr[i]['href']
+ else
+ url=sub_url+refarr[i]['href']
+ #puts "external link"
+ end#refarr[i]['href'].include?
+ end#refarr[i]['href']!=nil
+ fourofour=false
+ #refarr[i]['href'] is nil :S this a result of reference to other array? how to do a true dup without reference?
+ begin
+ if(fourofour==false)
+ pagina = Nokogiri::HTML(open(url,{ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE}))
+ end
+ #test for a 404
+ rescue Exception =>ex
+ #puts "got a 404"
+ #replace href (no navigation onclick)
+ refarr[i]['href'] =""
+ fourofour=true
+
+ retry
+ end #begin
+
+ if (fourofour==false)
+ #make relevant links reference local files
+ if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+
+
+ j_depth = s_depth - depth
+ appendval = "../"
+ clutch = 0
+ for r in 1..j_depth
+
+ clutch +=1
+ end
+ if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+ else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+ end
+
+ linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
+
+ pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+ if (@location_CSS.has_key?(refarr[i]['href']))
+ pass_a_link = "this_is_a_duplicate"
+ refarr[i]['href'] = @location_CSS[(refarr[i]['href'])]
+
+ else
+ initial_link=refarr[i]['href']
+ refarr[i]['href']=linkref
+
+
+
+ full_link = linkref
+
+ @location_CSS[initial_link]=linkref
+ #puts "working"
+ end# @location_CSS.haskey
+ end #refarr[i]['href']!=""
+
+
+ #trim it down and remove special characters for display
+ trimval=refarr[i]['href']
+ finval=trimval.gsub!(/[!:\/-]/, '')
+ #puts refarr[i]
+ if(finval==nil && refarr[i]!=nil)
+ finval=refarr[i]
+ end #finval == nil
+
+ n_depth = depth-1
+
+ if(finval!=nil)
+ self. FLocalize_CSS(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link, selector)
+
+
+
+
+ end #finval!=nil
+ end #fourofour==false
+ end #refarr[i]!="-"
+
+ end#end for each
+
+
+
+
+ else#<< depth not > 0
+ check = (refarr.length-1)
+ for i in 0..check
+ if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+ refarr[i]['href']=""
+
+ end
+ end
  end
+
+ if (depth == s_depth)
+ #store newly generated html/links for current page
+ mainpage =File.new('./page.html',"w")
+ mainpage.puts page
+ mainpage.close
+
+
+ else
+ #store page from the link in the subdirectory
+
+ p_depth = depth +1
+ j_depth = s_depth - depth
+ appendval = ""
+ clutch = 0
+ for r in 1..j_depth
+ appendval += "../"
+ clutch +=1
+ end
+ clutch -=1
+
+ if (link_to_add!="this_is_a_duplicate")
+
+ crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+ crfile.puts page
+ crfile.close
+ else
+
+ end
+
+ end
+ end #end def FLocalize_CSS
+
+ #########################################################################################
+
+ end#module
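
The heart of the new Localize_CSS/FLocalize_CSS pair is the selector filter: every `<a>` on the page goes into `hrefs`, but only links that also appear in `page.css(selector+' a')` keep their hrefs; the rest are blanked so they are never crawled. A minimal sketch of that filtering idea in isolation, using Nokogiri on a hypothetical HTML snippet (the names here are illustrative, not the gem's API):

```ruby
require "nokogiri"

html = <<~HTML
  <div class="wanted"><a href="/keep">followed</a></div>
  <p><a href="/skip">blanked</a></p>
HTML

page     = Nokogiri::HTML(html)
links    = page.css("a")             # every link on the page
selected = page.css("div.wanted a")  # links inside the selector, like linkset above
# Links outside the selected set lose their href, so a crawl would skip them.
(links.to_a - selected.to_a).each { |link| link["href"] = "" }
puts page.to_html
```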
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: omni_scrape
  version: !ruby/object:Gem::Version
- version: 0.1.9.5
+ version: 0.1.9.9
  platform: ruby
  authors:
  - Bradley Maynard
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2015-06-17 00:00:00.000000000 Z
+ date: 2015-06-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
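
Only the version and release date change in the gemspec; the nokogiri dependency (truncated above) carries over. To pick up this release, a Gemfile entry along these lines would work (the exact pin is illustrative):

```ruby
# Gemfile
source "https://rubygems.org"
gem "omni_scrape", "0.1.9.9"
```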