omni_scrape 0.1.5.4 → 0.1.5.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +6 -3
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +250 -24
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    YmI5NWNlZmYxN2Q5YzZmYzE4ZThjM2IxMDZjMWM2MDI3MDBiM2U0MA==
   data.tar.gz: !binary |-
-
+    OGYwZTdlOWFiYjMxMTNmNGU3MmJjZWM5MWU5ZWQ2MGY2MDUzYmQzNg==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    ZWJkZWZhODhmM2Y0YjExZTBjYjc1MmNhNmJiNWI1YTJiYzQ4M2QyMDExYWZm
+    ZWM1NTMzMjYyYjkyNTRmMzc4Mjc1ODIxNzgxZTUyYzhhMzQ1Y2U5M2UyODE5
+    OTJhMGRhNjdhZjQ3N2YyNDM0NmE3YTdhY2ViNWU4NjBlNWQzNDM=
   data.tar.gz: !binary |-
-
-
-
+    OGNkNTA2OWQ3YmI0MGVjMWQyMjQ4ZjM1MmIxNGUzYzVhMzMxODdkYjZmYjkz
+    MTU4MzIyNzIyMzg1ZDQ2YmEyNTA2MGE4MGI2MDMwY2RlNjA2YmFiYmUzZDJi
+    ZmY3YWI3ZDRkZjg0NjdjMTEwZTNmMWVmZjNhNDlkMmE1N2RhMGY=
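For reference, the `!binary` keys and values above are Base64-encoded strings: `U0hBMQ==` decodes to `SHA1`, and each new value decodes to the hex digest of the named archive inside the `.gem` file (the removed values were not captured in this rendering). A minimal verification sketch, not part of the package; the local `metadata.gz` path is an assumption:

```ruby
# Hypothetical check of the new SHA1 entry for metadata.gz from checksums.yaml.
# Assumes metadata.gz has been extracted from the downloaded .gem first
# (a .gem file is a tar archive containing metadata.gz, data.tar.gz and
# checksums.yaml.gz).
require 'base64'
require 'digest'

expected = Base64.decode64("YmI5NWNlZmYxN2Q5YzZmYzE4ZThjM2IxMDZjMWM2MDI3MDBiM2U0MA==")
actual   = Digest::SHA1.hexdigest(File.binread("metadata.gz"))

puts(expected == actual ? "metadata.gz SHA1 matches" : "checksum mismatch")
```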
data/README.md
CHANGED
@@ -21,7 +21,7 @@ Or install it yourself as:
 ## Usage
 Add the lines : require 'omni_scrape' and include OmniScrape to your script file.

-Method : CrawlScrape
+Method : CrawlScrape Note: this method is currently on a back burner.

 example : OmniScrape.CrawlScrape("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 0, "http://en.wikipedia.org")

@@ -37,14 +37,17 @@ example : OmniScrape.Localize("http://en.wikipedia.org/wiki/List_of_massively_mu

 This method takes three parameters the first should be the url to start at.

-The second parameter is the depth to crawl
+The second parameter is the depth to crawl. ***Warning: crawling grows at an INSANE rate.

 The third is a sub-url for internal links.

+Method : FLocalize
+
+This is the recursive method called by Localize and shouldn't be used directly. :)

 description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.

-
+The pages are linked to other local pages. Currently there is a lot of duplication in this regard. Note: Working on eliminating the duplication.

 ## Development

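Taken together, the README changes above describe the intended entry point. A minimal usage sketch based only on that text; the depth value of 1 is illustrative, not from the README:

```ruby
# Usage as documented in the README: require the gem, mix in the module, then
# call Localize(url, depth, sub_url). Keep depth small; the README warns that
# crawling grows at an insane rate. FLocalize is internal and not called here.
require 'omni_scrape'
include OmniScrape

OmniScrape.Localize(
  "http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games",
  1,                          # depth to crawl (illustrative value)
  "http://en.wikipedia.org"   # sub-url prepended to internal links
)
```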
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -53,10 +53,13 @@ links.each do |link|
 refarr.push("-")

 #create folder for storing current set of scraped pages
-
-
+g_depth = s_depth
+while (g_depth>-1)
+if (Dir.exist?('./pages'+g_depth.to_s))
+else Dir.mkdir('./pages'+g_depth.to_s)
 end
-
+g_depth =g_depth-1
+end

 #in each link
 for i in 1..titlearr.length
@@ -101,12 +104,18 @@ end#end for each
 end#def crawlscrape

 #############################################################################################
-
+
 def Localize(url, depth, sub_url)
-
+
+#initialize to extract from user view
+s_depth = depth
+i_page = 0
+prev_ipage = 0
+link_to_add =""
 if (depth<0)
 depth=0
 end
+#open the starting page
 page = Nokogiri::HTML(open(url))
 #collect all of the links from the page
 links= page.css('a')
@@ -141,13 +150,13 @@ puts "links in page"
 #setup for recognition of the end of the array
 refarr.push("-")

-
-
+if(depth>0)
+
+#create subdirectory for storing current set of scraped pages
+
 if (Dir.exist?('./pages'+depth.to_s))
 else Dir.mkdir('./pages'+depth.to_s)
 end
-
-if(depth>0)
 #in each link
 check = (refarr.length-1)
 for i in 0..check
@@ -159,8 +168,8 @@ if(depth>0)
 else
 url=sub_url+refarr[i]['href']
 #puts "external link"
-
-end
+end#refarr[i]['href'].include?
+end#refarr[i]['href']!=nil
 fourofour=false

 begin
@@ -175,16 +184,35 @@ if(depth>0)
 fourofour=true

 retry
-
+end #begin

 if (fourofour==false)
 #make relevant links reference local files
 if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-
+puts "link: "
+puts depth
+#wutwut
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+if (depth == s_depth)
+linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+else
+
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+end
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
 refarr[i]['href']=linkref
 puts refarr[i]['href']
 #puts "working"
-end
+end #refarr[i]['href']!=""


 #trim it down and remove special characters for display
@@ -193,40 +221,238 @@ if(depth>0)
 #puts refarr[i]
 if(finval==nil && refarr[i]!=nil)
 finval=refarr[i]
-end
+end #finval == nil

-
+n_depth = depth-1
+
 if(finval!=nil)
-
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
 #create subdirectory for storing current links page
 #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
 #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
 #end
-
-
-
+
+#this is where we will call the method for each link **********
+
+
+end #finval!=nil
+end #fourofour==false
+end #refarr[i]!="-"
+
+end#end for each
+
+
+puts "here?"
+puts depth
+
+else#<< depth not > 0
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+refarr[i]['href']=""
+end
+end
+end
+
+if (depth == s_depth)
+#store newly generated html/links for current page
+mainpage =File.new('./page.html',"w")
+mainpage.puts page
+mainpage.close
+puts "finished"
+
+else
+#store page from the link in the subdirectory
+puts "page: "
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+puts "link to pass"
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
 crfile.close
+
+end
+end #end def Localize
+
+#########################################################################################
+def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
+#open the starting page
+
+if (depth<0)
+depth=0
+end
+page = Nokogiri::HTML(open(url))
+#collect all of the links from the page
+links= page.css('a')
+title = page.css('title')
+#initialize variables
+refarr=[]
+hrefs = []
+x=0
+
+#add href to arrays for each link
+links.each do |link|
+if(link['href']!=nil && link['href']!="")
+# puts x
+# puts (link['title'].split.join)
+# x+=1
+hrefs.push(link)
+
+end
+
+end
+total=0
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+total+=1
 end
-
-
+puts total
+puts "links in page"
+

-
+#setup for recognition of the end of the array
+refarr.push("-")

+if(depth>0)
+
+#create subdirectory for storing current set of scraped pages
+
+if (Dir.exist?('./pages'+depth.to_s))
+else Dir.mkdir('./pages'+depth.to_s)
+end
+#in each link
+check = (refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")
+#evaluate whether link is internal or external
+if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+if(refarr[i]['href'].include?('http://'))
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+#puts "external link"
+end#refarr[i]['href'].include?
+end#refarr[i]['href']!=nil
+fourofour=false
+
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url))
+end
+#test for a 404
+rescue Exception =>ex
+#puts "got a 404"
+#replace href (no navigation onclick)
+refarr[i]['href'] =""
+fourofour=true
+
+retry
+end #begin
+
+if (fourofour==false)
+#make relevant links reference local files
+if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+puts "link: "
+puts depth
+#wutwut
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+if (depth == s_depth)
+linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+else
+
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+end
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+refarr[i]['href']=linkref
+puts refarr[i]['href']
+#puts "working"
+end #refarr[i]['href']!=""
+
+
+#trim it down and remove special characters for display
+trimval=refarr[i]['href']
+finval=trimval.gsub!(/[!:\/-]/, '')
+#puts refarr[i]
+if(finval==nil && refarr[i]!=nil)
+finval=refarr[i]
+end #finval == nil
+
+n_depth = depth-1
+
+if(finval!=nil)
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+#create subdirectory for storing current links page
+#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
+#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
+#end
+
+#this is where we will call the method for each link **********
+
+
+end #finval!=nil
+end #fourofour==false
+end #refarr[i]!="-"
+
+end#end for each


+puts "here?"
+puts depth

 else#<< depth not > 0
-
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
 refarr[i]['href']=""
+end
 end
 end

+if (depth == s_depth)
 #store newly generated html/links for current page
 mainpage =File.new('./page.html',"w")
 mainpage.puts page
 mainpage.close
 puts "finished"
+
+else
+#store page from the link in the subdirectory
+puts "page: "
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+puts "link to pass"
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
+crfile.close
+
+end
 end #end def Localize

 #########################################################################################
+
 end#module
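The bulk of this diff replaces the single-pass Localize with a recursive pair: Localize now seeds per-depth directories (./pages0, ./pages1, ...) and hands each link to the new FLocalize, which rewrites hrefs to point at local copies and recurses with depth-1. The standalone sketch below is not from the gem; it only isolates the path arithmetic the new link-rewriting blocks perform (j_depth, appendval, clutch, linkref) with made-up values, so the repeated sections in the diff are easier to follow.

```ruby
# Illustrative walk-through of the link-rewriting arithmetic added in 0.1.5.6.4
# (standalone sketch, not part of the gem). Variable names mirror the diff:
# s_depth is the depth originally passed to Localize, depth is the remaining
# depth at the current recursion level, i_page / i identify the parent page and
# the link index. All concrete values below are made up for the example.
s_depth = 2   # starting depth
depth   = 1   # remaining depth inside FLocalize
i_page  = 0   # index of the parent page
i       = 3   # index of the current link

j_depth   = s_depth - depth          # levels already descended from the start page
appendval = "../"                    # prefix used for pages below the start level
clutch    = 0
(1..j_depth).each { clutch += 1 }    # clutch ends up equal to j_depth

# Local file that the rewritten href will point at.
linkref =
  if depth == s_depth
    "./pages#{depth}/#{clutch}set/#{i_page}x#{i}page.html"
  else
    "#{appendval}../pages#{depth}/#{clutch}set/#{i_page}x#{i}page.html"
  end

pass_a_link = "#{i_page}x#{i}page.html"   # filename handed to the next FLocalize call

puts linkref       # => ../../pages1/1set/0x3page.html
puts pass_a_link   # => 0x3page.html
```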
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: omni_scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.5.4
+  version: 0.1.5.6.4
 platform: ruby
 authors:
 - Bradley Maynard
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-
+date: 2015-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri