omni_scrape 0.1.4.9 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +17 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Zjc4YzI5MjUzNWI1MWUzZGQwMzc1YWI1NmQwN2I1YjIyZmVkY2QwZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YjdjZTY3MGVjMjY5ZmUzZjc0MDM2YjRlMzliNzUwODczMzdjMWIzYQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MDA3ZmY2MTljMDVkNDQ1YTBhOTUxZTc3ZDM1NWI1OTA3OTZjNjUzNDRiNGI0
|
10
|
+
OTE5MWExNzgyYzYxOTQ3ZWViYmQzNTdhM2JlNDlkOTgwNTkzYmIwODdmZDMy
|
11
|
+
NzdmZGU3MGVjNDlmZDJmODg2MTJkZjRjOGRhZjdiNGMxODNhMGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NDgwOWRmM2JkNmZkYmNkZDQxNDhlNzJmYmI4ZGQxM2RiMzhmNGJiZTEzYjZl
|
14
|
+
OTQ5YTdkYTEwNzJmMWIyZjQxZDY0ODVkODEwZDk0M2M0Y2NiZjM2MWU0YTVi
|
15
|
+
NjhlYTk0ZmZjMmI1NjMxOTM2OTg2M2Y1OTE1YmQxZGYxZjg0Mzg=
|
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -2,15 +2,18 @@ require "omni_scrape/version"
|
|
2
2
|
|
3
3
|
module OmniScrape
|
4
4
|
def CrawlScrape(url, depth, sub_url)
|
5
|
+
#open the starting page
|
5
6
|
page = Nokogiri::HTML(open(url))
|
6
|
-
|
7
|
+
#collect all of the links from the page
|
7
8
|
links= page.css('a')
|
8
9
|
|
10
|
+
#initialize variables
|
9
11
|
refarr=[]
|
10
12
|
titlearr=[]
|
11
13
|
titles =[]
|
12
14
|
hrefs = []
|
13
15
|
x=0
|
16
|
+
#add title and href to arrays for each link
|
14
17
|
links.each do |link|
|
15
18
|
if(link['title']!=nil && link['title']!="" &&link['href']!=nil && link['href']!="")
|
16
19
|
# puts x
|
@@ -23,6 +26,7 @@ links.each do |link|
|
|
23
26
|
|
24
27
|
end
|
25
28
|
inc=0
|
29
|
+
#transfer links to other array
|
26
30
|
while(!hrefs.empty?)
|
27
31
|
value= hrefs.pop
|
28
32
|
puts value
|
@@ -31,6 +35,7 @@ links.each do |link|
|
|
31
35
|
inc+=1
|
32
36
|
end
|
33
37
|
inc=0
|
38
|
+
#transfer titles to other array
|
34
39
|
while(!titles.empty?)
|
35
40
|
value = titles.pop
|
36
41
|
puts value
|
@@ -38,25 +43,32 @@ links.each do |link|
|
|
38
43
|
puts titlearr[inc]
|
39
44
|
inc+=1
|
40
45
|
end
|
41
|
-
|
46
|
+
#setup for recognition of the end of the array
|
42
47
|
refarr.push("-")
|
43
48
|
|
44
|
-
|
49
|
+
#in each link
|
45
50
|
for i in 0..titlearr.length
|
46
51
|
if(refarr[i]!="-")
|
52
|
+
#evaluate whether link is internal or external
|
53
|
+
if(refarr[i].include?('http://'))
|
54
|
+
url=refarr[i]
|
55
|
+
else
|
47
56
|
url=sub_url+refarr[i]
|
57
|
+
end
|
48
58
|
fourofour=false
|
49
|
-
|
59
|
+
|
50
60
|
begin
|
51
61
|
if(fourofour==false)
|
52
62
|
pagina = Nokogiri::HTML(open(url))
|
53
63
|
end
|
64
|
+
#test for a 404
|
54
65
|
rescue Exception =>ex
|
55
66
|
puts "got a 404"
|
56
67
|
fourofour=true
|
57
68
|
retry
|
58
69
|
end
|
59
70
|
if (fourofour==false)
|
71
|
+
#trim it down and remove special characters
|
60
72
|
trimval=titlearr[i]
|
61
73
|
finval=trimval.gsub!(/[!:\/-]/, '')
|
62
74
|
puts titlearr[i]
|
@@ -65,6 +77,7 @@ for i in 0..titlearr.length
|
|
65
77
|
end
|
66
78
|
puts finval
|
67
79
|
if(finval!=nil)
|
80
|
+
#store html from the link with title of the link
|
68
81
|
crfile=File.new((finval+".html").chomp,"w")
|
69
82
|
crfile.puts pagina
|
70
83
|
crfile.close
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4.9
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bradley Maynard
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|