omni_scrape 0.1.4.9 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- OTJkOGZlMjE4MGEyNGZjMzY4MTY4NjRhMGNlYzc2NGMzODUwZGMxNw==
4
+ Zjc4YzI5MjUzNWI1MWUzZGQwMzc1YWI1NmQwN2I1YjIyZmVkY2QwZA==
5
5
  data.tar.gz: !binary |-
6
- YzA4ZTRkODRmNWEyNDcxNDdmM2FhYzM3OGI0NTg2MDZlMmQ4NmM3Ng==
6
+ YjdjZTY3MGVjMjY5ZmUzZjc0MDM2YjRlMzliNzUwODczMzdjMWIzYQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NDhiNWY0MDU4OTUzZGFjZWU5MjQ1NDIzZjZhZTNkMTc2MmJlNmIyMzQ2NjBj
10
- MzhjMjljYWIzNzZlOGQ1OTc1MDZhYmNkYWJjN2NhMWJhMDVlMTg0MTgzNjQ0
11
- M2U4NmNhNjUwYmNkYjg3NTNjZTY3ZGU3MTUxNzVhNDY5NTJkNWQ=
9
+ MDA3ZmY2MTljMDVkNDQ1YTBhOTUxZTc3ZDM1NWI1OTA3OTZjNjUzNDRiNGI0
10
+ OTE5MWExNzgyYzYxOTQ3ZWViYmQzNTdhM2JlNDlkOTgwNTkzYmIwODdmZDMy
11
+ NzdmZGU3MGVjNDlmZDJmODg2MTJkZjRjOGRhZjdiNGMxODNhMGU=
12
12
  data.tar.gz: !binary |-
13
- MTU0OTRjYTIwODY1YTdjOGVhM2RmMTU2ZjJkZGExOGVjZmJlYjk5MmM1ZmRm
14
- YTUzNjMwMDk4YzBlNjVmNjJhZDI0NjA2YjkyZDY2ZDAwNzFjZjI5NTlmMTFj
15
- N2UxZGIyYzQ4NTk2Yjc1YmM1ZjU2OGI4N2Q4ZTFiNzdiYzIzZmY=
13
+ NDgwOWRmM2JkNmZkYmNkZDQxNDhlNzJmYmI4ZGQxM2RiMzhmNGJiZTEzYjZl
14
+ OTQ5YTdkYTEwNzJmMWIyZjQxZDY0ODVkODEwZDk0M2M0Y2NiZjM2MWU0YTVi
15
+ NjhlYTk0ZmZjMmI1NjMxOTM2OTg2M2Y1OTE1YmQxZGYxZjg0Mzg=
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OmniScrape
2
- VERSION = "0.1.4.9"
2
+ VERSION = "0.1.5"
3
3
  end
data/lib/omni_scrape.rb CHANGED
@@ -2,15 +2,18 @@ require "omni_scrape/version"
2
2
 
3
3
  module OmniScrape
4
4
  def CrawlScrape(url, depth, sub_url)
5
+ #open the starting page
5
6
  page = Nokogiri::HTML(open(url))
6
-
7
+ #collect all of the links from the page
7
8
  links= page.css('a')
8
9
 
10
+ #initialize variables
9
11
  refarr=[]
10
12
  titlearr=[]
11
13
  titles =[]
12
14
  hrefs = []
13
15
  x=0
16
+ #add title and href to arrays for each link
14
17
  links.each do |link|
15
18
  if(link['title']!=nil && link['title']!="" &&link['href']!=nil && link['href']!="")
16
19
  # puts x
@@ -23,6 +26,7 @@ links.each do |link|
23
26
 
24
27
  end
25
28
  inc=0
29
+ #transfer links to other array
26
30
  while(!hrefs.empty?)
27
31
  value= hrefs.pop
28
32
  puts value
@@ -31,6 +35,7 @@ links.each do |link|
31
35
  inc+=1
32
36
  end
33
37
  inc=0
38
+ #transfer titles to other array
34
39
  while(!titles.empty?)
35
40
  value = titles.pop
36
41
  puts value
@@ -38,25 +43,32 @@ links.each do |link|
38
43
  puts titlearr[inc]
39
44
  inc+=1
40
45
  end
41
-
46
+ #setup for recognition of the end of the array
42
47
  refarr.push("-")
43
48
 
44
-
49
+ #in each link
45
50
  for i in 0..titlearr.length
46
51
  if(refarr[i]!="-")
52
+ #evaluate whether link is internal or external
53
+ if(refarr[i].include?('http://'))
54
+ url=refarr[i]
55
+ else
47
56
  url=sub_url+refarr[i]
57
+ end
48
58
  fourofour=false
49
-
59
+
50
60
  begin
51
61
  if(fourofour==false)
52
62
  pagina = Nokogiri::HTML(open(url))
53
63
  end
64
+ #test for a 404
54
65
  rescue Exception =>ex
55
66
  puts "got a 404"
56
67
  fourofour=true
57
68
  retry
58
69
  end
59
70
  if (fourofour==false)
71
+ #trim it down and remove special characters
60
72
  trimval=titlearr[i]
61
73
  finval=trimval.gsub!(/[!:\/-]/, '')
62
74
  puts titlearr[i]
@@ -65,6 +77,7 @@ for i in 0..titlearr.length
65
77
  end
66
78
  puts finval
67
79
  if(finval!=nil)
80
+ #store html from the link with title of the link
68
81
  crfile=File.new((finval+".html").chomp,"w")
69
82
  crfile.puts pagina
70
83
  crfile.close
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omni_scrape
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4.9
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bradley Maynard
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-06-04 00:00:00.000000000 Z
11
+ date: 2015-06-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri