omni_scrape 0.1.8 → 0.1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +32 -62
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
MGUxYmIwMWMxM2FmMjkzY2JlZTg4OWZhNTU2Nzg4NDcyOGM4MTRmOQ==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bfdd836a78c51ee9764c7aec015ce8438b5691d0
|
4
|
+
data.tar.gz: 25d62caf3ecdf5fed189c8bc4b6608e818b9b42d
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
MzNmZGM1MTJkZGY2MjgyZWY4MWJjZTc5Y2Q0MjVkYzlmNThkMzU0OTExNjJi
|
11
|
-
OTA2MWIzOWU4MmIwYmVmZGY5MTNjZGE0NGMzOTlhZjRlOWU0YmY=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YWVkOTg2NDUzMTk3YmExNDQ0YTlkYjdlMWZiMmY0ZDA0OWRlNWY3ZDkyOWQ2
|
14
|
-
OTY0NWRmY2ZjNTg3NDliOTUxNzg4MzNiOTdjZDdiYTI4MzIyYWY0NzA3MmY0
|
15
|
-
ZDY2NmI1ZWFmZjkyYmVlNmZjOTVmZjMxMzE5NjFiZDlkOTI0OGY=
|
6
|
+
metadata.gz: 83c83e53cb5871c8409af8a5c74a67e303399099baf321bae646c002f8d42160ddc44616bb58b39d8a941d8cf7269f320e5cd107c7f6671eeb6e6a1f317796f5
|
7
|
+
data.tar.gz: 9791b0cf2a6cbfce254a4d60bf3c2c1295eb301c797272b68e1a40c307d0d3babd8e22c5331d90b747ec10bc315e30134542bcb5a660775aac122aa26fe1ab40
|
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -6,101 +6,71 @@ module OmniScrape
|
|
6
6
|
def CrawlScrape(url, depth, sub_url)
|
7
7
|
if (depth<0)
|
8
8
|
depth=0
|
9
|
-
end
|
10
|
-
s_depth = depth
|
9
|
+
end#if
|
10
|
+
s_depth = depth #true
|
11
11
|
#open the starting page
|
12
|
-
page = Nokogiri::HTML(open(url))
|
12
|
+
page = Nokogiri::HTML(open(url)) #good
|
13
13
|
#collect all of the links from the page
|
14
|
-
links= page.css('a')
|
14
|
+
links= page.css('a') #good
|
15
15
|
|
16
16
|
#initialize variables
|
17
17
|
refarr=[]
|
18
|
-
titlearr=[]
|
19
|
-
titles =[]
|
20
18
|
hrefs = []
|
21
|
-
x=0
|
22
19
|
#add title and href to arrays for each link
|
23
20
|
links.each do |link|
|
24
|
-
if(link['
|
25
|
-
|
26
|
-
#
|
27
|
-
|
28
|
-
|
29
|
-
hrefs.push((link['href']).split.join)
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|
34
|
-
inc=0
|
21
|
+
if(link['href']!=nil && link['href']!="")
|
22
|
+
hrefs.push(link)
|
23
|
+
end#if
|
24
|
+
end#do
|
25
|
+
|
35
26
|
#transfer links to other array
|
36
27
|
while(!hrefs.empty?)
|
37
28
|
value= hrefs.pop
|
38
|
-
|
29
|
+
|
39
30
|
refarr.push(value)
|
40
|
-
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
#transfer titles to other array
|
45
|
-
while(!titles.empty?)
|
46
|
-
value = titles.pop
|
47
|
-
puts value
|
48
|
-
titlearr.push(value)
|
49
|
-
puts titlearr[inc]
|
50
|
-
inc+=1
|
51
|
-
end
|
52
|
-
#setup for recognition of the end of the array
|
31
|
+
|
32
|
+
|
33
|
+
end#while
|
34
|
+
#setup for recognition of the end of the array
|
53
35
|
refarr.push("-")
|
54
36
|
|
55
37
|
#create folder for storing current set of scraped pages
|
56
|
-
g_depth = s_depth
|
57
|
-
while (g_depth>-1)
|
58
|
-
if (Dir.exist?('./pages'+g_depth.to_s))
|
59
|
-
else Dir.mkdir('./pages'+g_depth.to_s)
|
60
|
-
end
|
61
|
-
g_depth =g_depth-1
|
62
|
-
end
|
63
38
|
|
39
|
+
if (Dir.exist?('./results'+depth.to_s))
|
40
|
+
else Dir.mkdir('./results'+depth.to_s)
|
41
|
+
end#if
|
64
42
|
#in each link
|
65
|
-
|
66
|
-
|
43
|
+
check =(refarr.length-1)
|
44
|
+
for i in 0..check
|
45
|
+
if(refarr[i]!="-")#still valid links
|
67
46
|
#evaluate whether link is internal or external
|
68
|
-
if(refarr[i].include?('http://'))
|
69
|
-
url=refarr[i]
|
47
|
+
if(refarr[i]['href'].include?('http://') && refarr[i]!=nil)
|
48
|
+
url=refarr[i]['href']
|
70
49
|
else
|
71
|
-
url=sub_url+refarr[i]
|
72
|
-
end
|
50
|
+
url=sub_url+refarr[i]['href']
|
51
|
+
end#if include?
|
73
52
|
fourofour=false
|
74
53
|
|
75
54
|
begin
|
76
55
|
if(fourofour==false)
|
77
56
|
pagina = Nokogiri::HTML(open(url))
|
78
|
-
end
|
57
|
+
end#if
|
79
58
|
#test for a 404
|
80
59
|
rescue Exception =>ex
|
81
|
-
|
60
|
+
|
82
61
|
fourofour=true
|
83
62
|
retry
|
84
|
-
end
|
63
|
+
end#begin
|
85
64
|
if (fourofour==false)
|
86
|
-
#trim it down and remove special characters
|
87
|
-
trimval=titlearr[i]
|
88
|
-
finval=trimval.gsub!(/[!:\/-]/, '')
|
89
|
-
puts titlearr[i]
|
90
|
-
if(finval==nil && titlearr[i]!=nil)
|
91
|
-
finval=titlearr[i]
|
92
|
-
end
|
93
|
-
puts finval
|
94
|
-
if(finval!=nil)
|
95
65
|
#store html from the link with title of the link
|
96
|
-
crfile=File.new(('./results'+depth.to_s+"/"+
|
66
|
+
crfile=File.new(('./results'+depth.to_s+"/page"+i.to_s+".html").chomp,"w")
|
97
67
|
crfile.puts pagina
|
98
68
|
crfile.close
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
69
|
+
end#if
|
70
|
+
end#if != "-"
|
71
|
+
|
102
72
|
end#end for each
|
103
|
-
|
73
|
+
|
104
74
|
end#def crawlscrape
|
105
75
|
|
106
76
|
#############################################################################################
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.8
|
4
|
+
version: 0.1.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bradley Maynard
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -78,17 +78,17 @@ require_paths:
|
|
78
78
|
- lib
|
79
79
|
required_ruby_version: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
|
-
- -
|
81
|
+
- - '>='
|
82
82
|
- !ruby/object:Gem::Version
|
83
83
|
version: '0'
|
84
84
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|
86
|
-
- -
|
86
|
+
- - '>='
|
87
87
|
- !ruby/object:Gem::Version
|
88
88
|
version: '0'
|
89
89
|
requirements: []
|
90
90
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
91
|
+
rubygems_version: 2.0.14
|
92
92
|
signing_key:
|
93
93
|
specification_version: 4
|
94
94
|
summary: This is an all-purpose web scraper
|