omni_scrape 0.1.5.6.4 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    YmI5NWNlZmYxN2Q5YzZmYzE4ZThjM2IxMDZjMWM2MDI3MDBiM2U0MA==
+    NjNjMTQ5MmUyZWU0MzJmNDZlNTE1NDAwN2EzOTkwOTliZWZiYzdmMg==
   data.tar.gz: !binary |-
-    OGYwZTdlOWFiYjMxMTNmNGU3MmJjZWM5MWU5ZWQ2MGY2MDUzYmQzNg==
+    MGUxYmIwMWMxM2FmMjkzY2JlZTg4OWZhNTU2Nzg4NDcyOGM4MTRmOQ==
 SHA512:
   metadata.gz: !binary |-
-    ZWJkZWZhODhmM2Y0YjExZTBjYjc1MmNhNmJiNWI1YTJiYzQ4M2QyMDExYWZm
-    ZWM1NTMzMjYyYjkyNTRmMzc4Mjc1ODIxNzgxZTUyYzhhMzQ1Y2U5M2UyODE5
-    OTJhMGRhNjdhZjQ3N2YyNDM0NmE3YTdhY2ViNWU4NjBlNWQzNDM=
+    ZWJmZDEzMzBjYWY5ZTlkZTZlNGY3ZTFhZmI2Nzc0NjJiYzgzYzhmNDk3Nzhj
+    MzNmZGM1MTJkZGY2MjgyZWY4MWJjZTc5Y2Q0MjVkYzlmNThkMzU0OTExNjJi
+    OTA2MWIzOWU4MmIwYmVmZGY5MTNjZGE0NGMzOTlhZjRlOWU0YmY=
   data.tar.gz: !binary |-
-    OGNkNTA2OWQ3YmI0MGVjMWQyMjQ4ZjM1MmIxNGUzYzVhMzMxODdkYjZmYjkz
-    MTU4MzIyNzIyMzg1ZDQ2YmEyNTA2MGE4MGI2MDMwY2RlNjA2YmFiYmUzZDJi
-    ZmY3YWI3ZDRkZjg0NjdjMTEwZTNmMWVmZjNhNDlkMmE1N2RhMGY=
+    YWVkOTg2NDUzMTk3YmExNDQ0YTlkYjdlMWZiMmY0ZDA0OWRlNWY3ZDkyOWQ2
+    OTY0NWRmY2ZjNTg3NDliOTUxNzg4MzNiOTdjZDdiYTI4MzIyYWY0NzA3MmY0
+    ZDY2NmI1ZWFmZjkyYmVlNmZjOTVmZjMxMzE5NjFiZDlkOTI0OGY=
data/README.md CHANGED
@@ -41,13 +41,14 @@ The second parameter is the depth to crawl. ***Warning: crawling grows at an IN
 
 The third is a sub-url for internal links.
 
+description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
+
+The pages are linked to other local pages. NOTE: Removed duplication :)
+
 Method : FLocalize
 
 This is the recursive method called by Localize and shouldn't be used directly. :)
 
-description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
-
-The pages are linked to other local pages. Currently there is a lot of duplication in this regard. Note: Working on eliminating the duplication.
 
 ## Development
 
@@ -57,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
 ## Contributing
 
-1. Fork it ( https://github.com/[my-github-username]/omni_scrape/fork )
+1. Fork it ( https://github.com/bmaynard1991/omni-scrape )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
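The README documents `Localize(url, depth, sub_url)` but never shows a call site. A minimal sketch of how a crawl might be invoked, assuming the module's methods are mixed into the caller as the source layout suggests (the URL and depth here are placeholders, not values from the gem):

```ruby
require "omni_scrape"

include OmniScrape # the gem defines its methods directly in the module body

# Follow every link on the start page one level deep, saving each page
# under ./pages<depth>/ and rewriting internal links to the local copies.
Localize("http://example.com/", 1, "http://example.com/")
```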
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OmniScrape
-  VERSION = "0.1.5.6.4"
+  VERSION = "0.1.8"
 end
data/lib/omni_scrape.rb CHANGED
@@ -1,5 +1,4 @@
 require "omni_scrape/version"
-
 module OmniScrape
 
 ##########################################################################################
@@ -8,6 +7,7 @@ def CrawlScrape(url, depth, sub_url)
 if (depth<0)
 depth=0
 end
+s_depth = depth
 #open the starting page
 page = Nokogiri::HTML(open(url))
 #collect all of the links from the page
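`CrawlScrape` now records the starting depth the same way `Localize` already does. The point of the copy shows up later in the diff, where `j_depth = s_depth - depth` measures how far the recursion has descended and drives the `../` prefix on rewritten links. An illustrative sketch (variable names from the diff, values invented):

```ruby
s_depth = 3                  # depth captured at the first call
depth   = 1                  # depth remaining at the current recursion level
j_depth = s_depth - depth    # levels descended so far
appendval = "../" * j_depth  # relative prefix back up toward the page root
puts appendval               # => "../../"
```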
@@ -108,6 +108,7 @@ end#end for each
 def Localize(url, depth, sub_url)
 
 #initialize to extract from user view
+@location = Hash.new
 s_depth = depth
 i_page = 0
 prev_ipage = 0
@@ -143,8 +144,7 @@ links.each do |link|
 refarr.push(value)
 total+=1
 end
-puts total
-puts "links in page"
+
 
 
 #setup for recognition of the end of the array
@@ -189,9 +189,8 @@ end
 if (fourofour==false)
 #make relevant links reference local files
 if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-puts "link: "
-puts depth
-#wutwut
+
+
 j_depth = s_depth - depth
 appendval = "../"
 clutch = 0
@@ -203,18 +202,32 @@ end
 else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
 end
 if (depth == s_depth)
+
 linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
 else
-
+
 linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
 end
 pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+if (@location.has_key?(refarr[i]['href']))
+loc = @location[(refarr[i]['href'])]
+sub_loc = loc.match(/(.\/[a-z]{5}\d{1,20}\/\d{1,20}[a-z]{3}\/\d{1,20}[x]\d{1,20}[a-z]{4}.[a-z]{1,20})/)
+refarr[i]['href'] =sub_loc
+else
+initial_link=refarr[i]['href']
 refarr[i]['href']=linkref
-puts refarr[i]['href']
+
+#HERE!!!!!**!*!*@*!!@@***!
+if (depth == s_depth)
+full_link = "../../"+linkref
+else
+full_link = linkref
+end
+@location[initial_link]=full_link
 #puts "working"
+end# @location.haskey
 end #refarr[i]['href']!=""
 
-
 #trim it down and remove special characters for display
 trimval=refarr[i]['href']
 finval=trimval.gsub!(/[!:\/-]/, '')
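This hunk carries the release's headline change: `Localize` now keeps an `@location` hash mapping each href it has already rewritten to the local file that stores the page, so a link seen twice points at the existing copy instead of producing a duplicate scrape. A stripped-down sketch of that pattern (illustrative names and values, not the gem's API):

```ruby
seen  = {}   # href => local file it was saved as
count = 0

["a.html", "b.html", "a.html"].each do |href|
  if seen.key?(href)
    local = seen[href]                           # duplicate: reuse existing copy
  else
    local = "./pages0/0set/0x#{count}page.html"  # first sighting: assign a new file
    seen[href] = local
    count += 1
  end
  puts "#{href} -> #{local}"
end
```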
@@ -232,7 +245,7 @@ end
 #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
 #end
 
-#this is where we will call the method for each link **********
+
 
 
 end #finval!=nil
@@ -242,8 +255,7 @@ end
 end#end for each
 
 
-puts "here?"
-puts depth
+
 
 else#<< depth not > 0
 check = (refarr.length-1)
@@ -259,7 +271,7 @@ if (depth == s_depth)
 mainpage =File.new('./page.html',"w")
 mainpage.puts page
 mainpage.close
-puts "finished"
+
 
 else
 #store page from the link in the subdirectory
@@ -273,7 +285,7 @@ else
 clutch +=1
 end
 clutch -=1
-puts "link to pass"
+
 crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
 crfile.puts page
 crfile.close
@@ -315,8 +327,7 @@ links.each do |link|
 refarr.push(value)
 total+=1
 end
-puts total
-puts "links in page"
+
 
 
 #setup for recognition of the end of the array
@@ -361,9 +372,8 @@ end
 if (fourofour==false)
 #make relevant links reference local files
 if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-puts "link: "
-puts depth
-#wutwut
+
+
 j_depth = s_depth - depth
 appendval = "../"
 clutch = 0
@@ -374,16 +384,25 @@ end
 if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
 else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
 end
-if (depth == s_depth)
-linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
-else
 
-linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
-end
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html"))
+
 pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+if (@location.has_key?(refarr[i]['href']))
+pass_a_link = "this_is_a_duplicate"
+refarr[i]['href'] = @location[(refarr[i]['href'])]
+
+else
+initial_link=refarr[i]['href']
 refarr[i]['href']=linkref
-puts refarr[i]['href']
+
+
+
+full_link = linkref
+
+@location[initial_link]=linkref
 #puts "working"
+end# @location.haskey
 end #refarr[i]['href']!=""
 
 
@@ -399,12 +418,8 @@ end
 
 if(finval!=nil)
 self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
-#create subdirectory for storing current links page
-#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
-#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
-#end
+
 
-#this is where we will call the method for each link **********
 
 
 end #finval!=nil
@@ -414,14 +429,14 @@ end
 end#end for each
 
 
-puts "here?"
-puts depth
+
 
 else#<< depth not > 0
 check = (refarr.length-1)
 for i in 0..check
 if (refarr[i]['href']!=nil && refarr[i]['href']!="")
 refarr[i]['href']=""
+
 end
 end
 end
@@ -431,11 +446,11 @@ if (depth == s_depth)
 mainpage =File.new('./page.html',"w")
 mainpage.puts page
 mainpage.close
-puts "finished"
+
 
 else
 #store page from the link in the subdirectory
-puts "page: "
+
 p_depth = depth +1
 j_depth = s_depth - depth
 appendval = ""
@@ -445,13 +460,18 @@ else
 clutch +=1
 end
 clutch -=1
-puts "link to pass"
+
+if (link_to_add!="this_is_a_duplicate")
+
 crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
 crfile.puts page
 crfile.close
+else
+
+end
 
 end
-end #end def Localize
+end #end def FLocalize
 
 #########################################################################################
 
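`FLocalize` closes the loop on deduplication with a sentinel: when a link is already in `@location`, the filename threaded down the recursion is the marker string `this_is_a_duplicate`, and the file write above is skipped so the same page is never saved twice. The guard reduces to something like this (a sketch of the pattern, not the gem's code):

```ruby
DUPLICATE = "this_is_a_duplicate"

def write_page(link_to_add, page)
  return if link_to_add == DUPLICATE             # already on disk; skip the write
  File.open(link_to_add, "w") { |f| f.puts page }
end
```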
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: omni_scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.5.6.4
+  version: 0.1.8
 platform: ruby
 authors:
 - Bradley Maynard
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-12 00:00:00.000000000 Z
+date: 2015-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -88,7 +88,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.7
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
 summary: This is an all-purpose web scraper