omni_scrape 0.1.8 → 0.1.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    NjNjMTQ5MmUyZWU0MzJmNDZlNTE1NDAwN2EzOTkwOTliZWZiYzdmMg==
-  data.tar.gz: !binary |-
-    MGUxYmIwMWMxM2FmMjkzY2JlZTg4OWZhNTU2Nzg4NDcyOGM4MTRmOQ==
+SHA1:
+  metadata.gz: bfdd836a78c51ee9764c7aec015ce8438b5691d0
+  data.tar.gz: 25d62caf3ecdf5fed189c8bc4b6608e818b9b42d
 SHA512:
-  metadata.gz: !binary |-
-    ZWJmZDEzMzBjYWY5ZTlkZTZlNGY3ZTFhZmI2Nzc0NjJiYzgzYzhmNDk3Nzhj
-    MzNmZGM1MTJkZGY2MjgyZWY4MWJjZTc5Y2Q0MjVkYzlmNThkMzU0OTExNjJi
-    OTA2MWIzOWU4MmIwYmVmZGY5MTNjZGE0NGMzOTlhZjRlOWU0YmY=
-  data.tar.gz: !binary |-
-    YWVkOTg2NDUzMTk3YmExNDQ0YTlkYjdlMWZiMmY0ZDA0OWRlNWY3ZDkyOWQ2
-    OTY0NWRmY2ZjNTg3NDliOTUxNzg4MzNiOTdjZDdiYTI4MzIyYWY0NzA3MmY0
-    ZDY2NmI1ZWFmZjkyYmVlNmZjOTVmZjMxMzE5NjFiZDlkOTI0OGY=
+  metadata.gz: 83c83e53cb5871c8409af8a5c74a67e303399099baf321bae646c002f8d42160ddc44616bb58b39d8a941d8cf7269f320e5cd107c7f6671eeb6e6a1f317796f5
+  data.tar.gz: 9791b0cf2a6cbfce254a4d60bf3c2c1295eb301c797272b68e1a40c307d0d3babd8e22c5331d90b747ec10bc315e30134542bcb5a660775aac122aa26fe1ab40
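The checksums now appear as plain hex digests instead of base64-wrapped YAML !binary nodes, so they can be compared directly against freshly computed values. A minimal verification sketch using Ruby's standard digest and YAML libraries (the file paths are assumptions, standing in for the metadata.gz and data.tar.gz members unpacked from the .gem archive):

  require 'digest'
  require 'yaml'

  # Assumed layout: checksums.yaml plus the two archive members
  # sit in the current directory, as unpacked from the .gem tar.
  expected = YAML.safe_load(File.read('checksums.yaml'))

  %w[metadata.gz data.tar.gz].each do |entry|
    actual = Digest::SHA512.file(entry).hexdigest
    puts "#{entry}: #{actual == expected['SHA512'][entry] ? 'OK' : 'MISMATCH'}"
  end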
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OmniScrape
-  VERSION = "0.1.8"
+  VERSION = "0.1.8.1"
 end
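The only code change here is the version bump; once the new release is installed, it can be confirmed from the constant itself:

  require 'omni_scrape'

  puts OmniScrape::VERSION  # => "0.1.8.1"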
data/lib/omni_scrape.rb CHANGED
@@ -6,101 +6,71 @@ module OmniScrape
   def CrawlScrape(url, depth, sub_url)
     if (depth<0)
       depth=0
-    end
-    s_depth = depth
+    end#if
+    s_depth = depth #true
     #open the starting page
-    page = Nokogiri::HTML(open(url))
+    page = Nokogiri::HTML(open(url)) #good
     #collect all of the links from the page
-    links= page.css('a')
+    links= page.css('a') #good
 
     #initialize variables
     refarr=[]
-    titlearr=[]
-    titles =[]
     hrefs = []
-    x=0
     #add title and href to arrays for each link
     links.each do |link|
-      if(link['title']!=nil && link['title']!="" &&link['href']!=nil && link['href']!="")
-        # puts x
-        # puts (link['title'].split.join)
-        # x+=1
-        titles.push((link['title']).split.join)
-        hrefs.push((link['href']).split.join)
-
-      end
-
-    end
-    inc=0
+      if(link['href']!=nil && link['href']!="")
+        hrefs.push(link)
+      end#if
+    end#do
+
     #transfer links to other array
     while(!hrefs.empty?)
       value= hrefs.pop
-      puts value
+
       refarr.push(value)
-      refarr[inc]
-      inc+=1
-    end
-    inc=0
-    #transfer titles to other array
-    while(!titles.empty?)
-      value = titles.pop
-      puts value
-      titlearr.push(value)
-      puts titlearr[inc]
-      inc+=1
-    end
-    #setup for recognition of the end of the array
+
+
+    end#while
+    #setup for recognition of the end of the array
     refarr.push("-")
 
     #create folder for storing current set of scraped pages
-    g_depth = s_depth
-    while (g_depth>-1)
-      if (Dir.exist?('./pages'+g_depth.to_s))
-      else Dir.mkdir('./pages'+g_depth.to_s)
-      end
-      g_depth =g_depth-1
-    end
 
+    if (Dir.exist?('./results'+depth.to_s))
+    else Dir.mkdir('./results'+depth.to_s)
+    end#if
     #in each link
-    for i in 1..titlearr.length
-      if(refarr[i]!="-")
+    check =(refarr.length-1)
+    for i in 0..check
+      if(refarr[i]!="-")#still valid links
         #evaluate whether link is internal or external
-        if(refarr[i].include?('http://'))
-          url=refarr[i]
+        if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
+          url=refarr[i]['href']
         else
-          url=sub_url+refarr[i]
-        end
+          url=sub_url+refarr[i]['href']
+        end#if include?
         fourofour=false
 
         begin
           if(fourofour==false)
             pagina = Nokogiri::HTML(open(url))
-          end
+          end#if
         #test for a 404
         rescue Exception =>ex
-          puts "got a 404"
+
          fourofour=true
          retry
-        end
+        end#begin
         if (fourofour==false)
-          #trim it down and remove special characters
-          trimval=titlearr[i]
-          finval=trimval.gsub!(/[!:\/-]/, '')
-          puts titlearr[i]
-          if(finval==nil && titlearr[i]!=nil)
-            finval=titlearr[i]
-          end
-          puts finval
-          if(finval!=nil)
           #store html from the link with title of the link
-            crfile=File.new(('./results'+depth.to_s+"/"+finval+".html").chomp,"w")
+          crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
           crfile.puts pagina
           crfile.close
-          end
-        end
-      end
+        end#if
+      end#if != "-"
+
     end#end for each
-    puts "finished"
+
   end#def crawlscrape
 
   #############################################################################################
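The net effect of this rewrite: CrawlScrape now keeps whole Nokogiri link nodes rather than parallel title/href arrays, so links without a title attribute are no longer dropped; debug output is removed; a single ./results<depth> directory replaces the per-depth ./pages<n> folders; and saved files are named page<i>.html instead of being derived from sanitized link titles. A minimal usage sketch, assuming the method is callable after mixing the module in at the top level (the URLs are placeholders):

  require 'omni_scrape'

  # CrawlScrape is defined as a plain method on the OmniScrape module,
  # so a top-level include makes it callable directly.
  include OmniScrape

  # Scrape http://example.com at depth 0, resolving relative hrefs
  # against the site root; pages land in ./results0/page<i>.html.
  CrawlScrape("http://example.com", 0, "http://example.com")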
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: omni_scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.8
+  version: 0.1.8.1
 platform: ruby
 authors:
 - Bradley Maynard
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-16 00:00:00.000000000 Z
+date: 2015-06-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -78,17 +78,17 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.0.14
 signing_key:
 specification_version: 4
 summary: This is an all-purpose web scraper
  summary: This is an all-purpose web scraper