omni_scrape 0.1.5.6.4 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +5 -4
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +56 -36
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NjNjMTQ5MmUyZWU0MzJmNDZlNTE1NDAwN2EzOTkwOTliZWZiYzdmMg==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
MGUxYmIwMWMxM2FmMjkzY2JlZTg4OWZhNTU2Nzg4NDcyOGM4MTRmOQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZWJmZDEzMzBjYWY5ZTlkZTZlNGY3ZTFhZmI2Nzc0NjJiYzgzYzhmNDk3Nzhj
|
10
|
+
MzNmZGM1MTJkZGY2MjgyZWY4MWJjZTc5Y2Q0MjVkYzlmNThkMzU0OTExNjJi
|
11
|
+
OTA2MWIzOWU4MmIwYmVmZGY5MTNjZGE0NGMzOTlhZjRlOWU0YmY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YWVkOTg2NDUzMTk3YmExNDQ0YTlkYjdlMWZiMmY0ZDA0OWRlNWY3ZDkyOWQ2
|
14
|
+
OTY0NWRmY2ZjNTg3NDliOTUxNzg4MzNiOTdjZDdiYTI4MzIyYWY0NzA3MmY0
|
15
|
+
ZDY2NmI1ZWFmZjkyYmVlNmZjOTVmZjMxMzE5NjFiZDlkOTI0OGY=
|
data/README.md
CHANGED
@@ -41,13 +41,14 @@ The second parameter is the depth to crawl. ***Warning: crawling grows at an IN
|
|
41
41
|
|
42
42
|
The third is a sub-url for internal links.
|
43
43
|
|
44
|
+
description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
|
45
|
+
|
46
|
+
The pages are linked to other local pages. NOTE: Removed duplication :)
|
47
|
+
|
44
48
|
Method : FLocalize
|
45
49
|
|
46
50
|
This is the recursive method called by Localize and shouldn't be used directly. :)
|
47
51
|
|
48
|
-
description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
|
49
|
-
|
50
|
-
The pages are linked to other local pages. Currently there is a lot of duplication in this regard. Note: Working on eliminating the duplication.
|
51
52
|
|
52
53
|
## Development
|
53
54
|
|
@@ -57,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
57
58
|
|
58
59
|
## Contributing
|
59
60
|
|
60
|
-
1. Fork it ( https://github.com/
|
61
|
+
1. Fork it ( https://github.com/bmaynard1991/omni-scrape )
|
61
62
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
62
63
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
63
64
|
4. Push to the branch (`git push origin my-new-feature`)
|
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require "omni_scrape/version"
|
2
|
-
|
3
2
|
module OmniScrape
|
4
3
|
|
5
4
|
##########################################################################################
|
@@ -8,6 +7,7 @@ def CrawlScrape(url, depth, sub_url)
|
|
8
7
|
if (depth<0)
|
9
8
|
depth=0
|
10
9
|
end
|
10
|
+
s_depth = depth
|
11
11
|
#open the starting page
|
12
12
|
page = Nokogiri::HTML(open(url))
|
13
13
|
#collect all of the links from the page
|
@@ -108,6 +108,7 @@ end#end for each
|
|
108
108
|
def Localize(url, depth, sub_url)
|
109
109
|
|
110
110
|
#initialize to extract from user view
|
111
|
+
@location = Hash.new
|
111
112
|
s_depth = depth
|
112
113
|
i_page = 0
|
113
114
|
prev_ipage = 0
|
@@ -143,8 +144,7 @@ links.each do |link|
|
|
143
144
|
refarr.push(value)
|
144
145
|
total+=1
|
145
146
|
end
|
146
|
-
|
147
|
-
puts "links in page"
|
147
|
+
|
148
148
|
|
149
149
|
|
150
150
|
#setup for recognition of the end of the array
|
@@ -189,9 +189,8 @@ end
|
|
189
189
|
if (fourofour==false)
|
190
190
|
#make relevant links reference local files
|
191
191
|
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
192
|
-
|
193
|
-
|
194
|
-
#wutwut
|
192
|
+
|
193
|
+
|
195
194
|
j_depth = s_depth - depth
|
196
195
|
appendval = "../"
|
197
196
|
clutch = 0
|
@@ -203,18 +202,32 @@ end
|
|
203
202
|
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
204
203
|
end
|
205
204
|
if (depth == s_depth)
|
205
|
+
|
206
206
|
linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
207
207
|
else
|
208
|
-
|
208
|
+
|
209
209
|
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
210
210
|
end
|
211
211
|
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
212
|
+
if (@location.has_key?(refarr[i]['href']))
|
213
|
+
loc = @location[(refarr[i]['href'])]
|
214
|
+
sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
|
215
|
+
refarr[i]['href'] =sub_loc
|
216
|
+
else
|
217
|
+
initial_link=refarr[i]['href']
|
212
218
|
refarr[i]['href']=linkref
|
213
|
-
|
219
|
+
|
220
|
+
#HERE!!!!!**!*!*@*!!@@***!
|
221
|
+
if (depth == s_depth)
|
222
|
+
full_link = "../../"+linkref
|
223
|
+
else
|
224
|
+
full_link = linkref
|
225
|
+
end
|
226
|
+
@location[initial_link]=full_link
|
214
227
|
#puts "working"
|
228
|
+
end# @location.haskey
|
215
229
|
end #refarr[i]['href']!=""
|
216
230
|
|
217
|
-
|
218
231
|
#trim it down and remove special characters for display
|
219
232
|
trimval=refarr[i]['href']
|
220
233
|
finval=trimval.gsub!(/[!:\/-]/, '')
|
@@ -232,7 +245,7 @@ end
|
|
232
245
|
#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
|
233
246
|
#end
|
234
247
|
|
235
|
-
|
248
|
+
|
236
249
|
|
237
250
|
|
238
251
|
end #finval!=nil
|
@@ -242,8 +255,7 @@ end
|
|
242
255
|
end#end for each
|
243
256
|
|
244
257
|
|
245
|
-
|
246
|
-
puts depth
|
258
|
+
|
247
259
|
|
248
260
|
else#<< depth not > 0
|
249
261
|
check = (refarr.length-1)
|
@@ -259,7 +271,7 @@ if (depth == s_depth)
|
|
259
271
|
mainpage =File.new('./page.html',"w")
|
260
272
|
mainpage.puts page
|
261
273
|
mainpage.close
|
262
|
-
|
274
|
+
|
263
275
|
|
264
276
|
else
|
265
277
|
#store page from the link in the subdirectory
|
@@ -273,7 +285,7 @@ else
|
|
273
285
|
clutch +=1
|
274
286
|
end
|
275
287
|
clutch -=1
|
276
|
-
|
288
|
+
|
277
289
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
278
290
|
crfile.puts page
|
279
291
|
crfile.close
|
@@ -315,8 +327,7 @@ links.each do |link|
|
|
315
327
|
refarr.push(value)
|
316
328
|
total+=1
|
317
329
|
end
|
318
|
-
|
319
|
-
puts "links in page"
|
330
|
+
|
320
331
|
|
321
332
|
|
322
333
|
#setup for recognition of the end of the array
|
@@ -361,9 +372,8 @@ end
|
|
361
372
|
if (fourofour==false)
|
362
373
|
#make relevant links reference local files
|
363
374
|
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
364
|
-
|
365
|
-
|
366
|
-
#wutwut
|
375
|
+
|
376
|
+
|
367
377
|
j_depth = s_depth - depth
|
368
378
|
appendval = "../"
|
369
379
|
clutch = 0
|
@@ -374,16 +384,25 @@ end
|
|
374
384
|
if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
|
375
385
|
else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
|
376
386
|
end
|
377
|
-
if (depth == s_depth)
|
378
|
-
linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
|
379
|
-
else
|
380
387
|
|
381
|
-
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html")
|
382
|
-
|
388
|
+
linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
|
389
|
+
|
383
390
|
pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
|
391
|
+
if (@location.has_key?(refarr[i]['href']))
|
392
|
+
pass_a_link = "this_is_a_duplicate"
|
393
|
+
refarr[i]['href'] = @location[(refarr[i]['href'])]
|
394
|
+
|
395
|
+
else
|
396
|
+
initial_link=refarr[i]['href']
|
384
397
|
refarr[i]['href']=linkref
|
385
|
-
|
398
|
+
|
399
|
+
|
400
|
+
|
401
|
+
full_link = linkref
|
402
|
+
|
403
|
+
@location[initial_link]=linkref
|
386
404
|
#puts "working"
|
405
|
+
end# @location.haskey
|
387
406
|
end #refarr[i]['href']!=""
|
388
407
|
|
389
408
|
|
@@ -399,12 +418,8 @@ end
|
|
399
418
|
|
400
419
|
if(finval!=nil)
|
401
420
|
self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
|
402
|
-
|
403
|
-
#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
|
404
|
-
#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
|
405
|
-
#end
|
421
|
+
|
406
422
|
|
407
|
-
#this is where we will call the method for each link **********
|
408
423
|
|
409
424
|
|
410
425
|
end #finval!=nil
|
@@ -414,14 +429,14 @@ end
|
|
414
429
|
end#end for each
|
415
430
|
|
416
431
|
|
417
|
-
|
418
|
-
puts depth
|
432
|
+
|
419
433
|
|
420
434
|
else#<< depth not > 0
|
421
435
|
check = (refarr.length-1)
|
422
436
|
for i in 0..check
|
423
437
|
if (refarr[i]['href']!=nil && refarr[i]['href']!="")
|
424
438
|
refarr[i]['href']=""
|
439
|
+
|
425
440
|
end
|
426
441
|
end
|
427
442
|
end
|
@@ -431,11 +446,11 @@ if (depth == s_depth)
|
|
431
446
|
mainpage =File.new('./page.html',"w")
|
432
447
|
mainpage.puts page
|
433
448
|
mainpage.close
|
434
|
-
|
449
|
+
|
435
450
|
|
436
451
|
else
|
437
452
|
#store page from the link in the subdirectory
|
438
|
-
|
453
|
+
|
439
454
|
p_depth = depth +1
|
440
455
|
j_depth = s_depth - depth
|
441
456
|
appendval = ""
|
@@ -445,13 +460,18 @@ else
|
|
445
460
|
clutch +=1
|
446
461
|
end
|
447
462
|
clutch -=1
|
448
|
-
|
463
|
+
|
464
|
+
if (link_to_add!="this_is_a_duplicate")
|
465
|
+
|
449
466
|
crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
|
450
467
|
crfile.puts page
|
451
468
|
crfile.close
|
469
|
+
else
|
470
|
+
|
471
|
+
end
|
452
472
|
|
453
473
|
end
|
454
|
-
end #end def
|
474
|
+
end #end def FLocalize
|
455
475
|
|
456
476
|
#########################################################################################
|
457
477
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5.6.4
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bradley Maynard
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
90
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.4.
|
91
|
+
rubygems_version: 2.4.8
|
92
92
|
signing_key:
|
93
93
|
specification_version: 4
|
94
94
|
summary: This is an all-purpose web scraper
|