omni_scrape 0.1.5.2 → 0.1.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +16 -2
- data/lib/omni_scrape/version.rb +1 -1
- data/lib/omni_scrape.rb +145 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YzRjNTk1YmVmNTU4NWVlOTg0MmY3MGJmMzIwYTRlZDk0MTQxM2JjMg==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDEzN2FjNTQ2MmJmMDgxNjg5NTJlZTZkMThlNDYxM2YxN2MwOWUwYQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODQzMGU4NTdkNDI3MGNhMTc5Y2QyODY4OGI1ZTIxYzgwNmZiZTM5NjIzZjE0
|
10
|
+
YTYyMDM1NDgxMTczYjdiZDgyZWEyODg3ODkwNThlNWVmNWU4ZDVjNmJlMDNh
|
11
|
+
YjkyYWM5ZTk1YTg4YzhiNWFhNzdmZDhmOWFiM2ZkYjNhMTI0MWE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZDI3MWNiNzVmYzIxNzhhMWRjZWZlM2IxMTBmOWQ3ZjY1Y2VmZDBlNTNkMjY1
|
14
|
+
MmE4MWQzOThlZGIzNzNhNTIxZGI1NTkyZTE5ZmI2YTFkZmFmZDY5NDdiYTEx
|
15
|
+
ZGM4ZDYyOGM5N2I1YTU1ODdjMjVlMWFkOTY2OWVlMDRmYTllZmI=
|
data/README.md
CHANGED
@@ -20,17 +20,31 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
Add the lines : require 'omni_scrape' and include OmniScrape to your script file.
|
23
|
+
|
23
24
|
Method : CrawlScrape
|
25
|
+
|
24
26
|
example : OmniScrape.CrawlScrape("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 0, "http://en.wikipedia.org")
|
25
27
|
|
26
28
|
This method takes three parameters the first should be the url to start at.
|
27
29
|
|
28
|
-
The second parameter is currently unimplemented but will be the depth to crawl. (just pass it
|
30
|
+
The second parameter is currently unimplemented but will be the depth to crawl. (just pass it 1)
|
31
|
+
|
32
|
+
The third is a sub-url for internal links.
|
33
|
+
|
34
|
+
Method : Localize
|
35
|
+
|
36
|
+
example : OmniScrape.Localize("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "http://en.wikipedia.org")
|
37
|
+
|
38
|
+
This method takes three parameters the first should be the url to start at.
|
39
|
+
|
40
|
+
The second parameter is the depth to crawl and currently only supports 1 layer. Note: recursion will be added soon for deeper crawling. *(just pass it 1)*
|
29
41
|
|
30
42
|
The third is a sub-url for internal links.
|
31
43
|
|
32
44
|
|
33
|
-
description:
|
45
|
+
description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
|
46
|
+
|
47
|
+
Currently the first page will link to all other pages that are scraped and stored. Note: further linking will be added soon.
|
34
48
|
|
35
49
|
## Development
|
36
50
|
|
data/lib/omni_scrape/version.rb
CHANGED
data/lib/omni_scrape.rb
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
require "omni_scrape/version"
|
2
2
|
|
3
3
|
module OmniScrape
|
4
|
+
|
5
|
+
##########################################################################################
|
6
|
+
|
4
7
|
def CrawlScrape(url, depth, sub_url)
|
8
|
+
if (depth<0)
|
9
|
+
depth=0
|
10
|
+
end
|
5
11
|
#open the starting page
|
6
12
|
page = Nokogiri::HTML(open(url))
|
7
13
|
#collect all of the links from the page
|
@@ -46,8 +52,14 @@ links.each do |link|
|
|
46
52
|
#setup for recognition of the end of the array
|
47
53
|
refarr.push("-")
|
48
54
|
|
55
|
+
#create folder for storing current set of scraped pages
|
56
|
+
if (Dir.exist?('./results'+depth.to_s))
|
57
|
+
else Dir.mkdir('./results'+depth.to_s)
|
58
|
+
end
|
59
|
+
|
60
|
+
|
49
61
|
#in each link
|
50
|
-
for i in 0..titlearr.length
|
62
|
+
for i in 1..titlearr.length
|
51
63
|
if(refarr[i]!="-")
|
52
64
|
#evaluate whether link is internal or external
|
53
65
|
if(refarr[i].include?('http://'))
|
@@ -78,13 +90,143 @@ for i in 0..titlearr.length
|
|
78
90
|
puts finval
|
79
91
|
if(finval!=nil)
|
80
92
|
#store html from the link with title of the link
|
81
|
-
|
93
|
+
crfile=File.new(('./results'+depth.to_s+"/"+finval+".html").chomp,"w")
|
82
94
|
crfile.puts pagina
|
83
95
|
crfile.close
|
84
96
|
end
|
85
97
|
end
|
86
98
|
end
|
99
|
+
end#end for each
|
100
|
+
puts "finished"
|
101
|
+
end#def crawlscrape
|
102
|
+
|
103
|
+
#############################################################################################
|
104
|
+
|
105
|
+
def Localize(url, depth, sub_url)
|
106
|
+
#open the starting page
|
107
|
+
if (depth<0)
|
108
|
+
depth=0
|
109
|
+
end
|
110
|
+
page = Nokogiri::HTML(open(url))
|
111
|
+
#collect all of the links from the page
|
112
|
+
links= page.css('a')
|
113
|
+
title = page.css('title')
|
114
|
+
#initialize variables
|
115
|
+
refarr=[]
|
116
|
+
hrefs = []
|
117
|
+
x=0
|
118
|
+
|
119
|
+
#add href to arrays for each link
|
120
|
+
links.each do |link|
|
121
|
+
if(link['href']!=nil && link['href']!="")
|
122
|
+
# puts x
|
123
|
+
# puts (link['title'].split.join)
|
124
|
+
# x+=1
|
125
|
+
hrefs.push(link)
|
126
|
+
|
127
|
+
end
|
128
|
+
|
129
|
+
end
|
130
|
+
total=0
|
131
|
+
#transfer links to other array
|
132
|
+
while(!hrefs.empty?)
|
133
|
+
value= hrefs.pop
|
134
|
+
refarr.push(value)
|
135
|
+
total+=1
|
136
|
+
end
|
137
|
+
puts total
|
138
|
+
puts "links in page"
|
139
|
+
|
140
|
+
|
141
|
+
#setup for recognition of the end of the array
|
142
|
+
refarr.push("-")
|
143
|
+
|
144
|
+
|
145
|
+
#create subdirectory for storing current set of scraped pages
|
146
|
+
if (Dir.exist?('./pages'+depth.to_s))
|
147
|
+
else Dir.mkdir('./pages'+depth.to_s)
|
148
|
+
end
|
149
|
+
|
150
|
+
if(depth>0)
|
151
|
+
#in each link
|
152
|
+
check = (refarr.length-1)
|
153
|
+
for i in 0..check
|
154
|
+
if(refarr[i]!="-")
|
155
|
+
#evaluate whether link is internal or external
|
156
|
+
if(refarr[i]['href']!=nil && refarr[i]['href']!="")
|
157
|
+
if(refarr[i]['href'].include?('http://'))
|
158
|
+
url=refarr[i]['href']
|
159
|
+
else
|
160
|
+
url=sub_url+refarr[i]['href']
|
161
|
+
#puts "external link"
|
162
|
+
end
|
163
|
+
end
|
164
|
+
fourofour=false
|
165
|
+
|
166
|
+
begin
|
167
|
+
if(fourofour==false)
|
168
|
+
pagina = Nokogiri::HTML(open(url))
|
169
|
+
end
|
170
|
+
#test for a 404
|
171
|
+
rescue Exception =>ex
|
172
|
+
#puts "got a 404"
|
173
|
+
#replace href (no navigation onclick)
|
174
|
+
refarr[i]['href'] =""
|
175
|
+
fourofour=true
|
176
|
+
|
177
|
+
retry
|
178
|
+
end
|
179
|
+
|
180
|
+
if (fourofour==false)
|
181
|
+
#make relevant links reference local files
|
182
|
+
if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
|
183
|
+
linkref = (('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp)
|
184
|
+
refarr[i]['href']=linkref
|
185
|
+
puts refarr[i]['href']
|
186
|
+
#puts "working"
|
187
|
+
end
|
188
|
+
|
189
|
+
|
190
|
+
#trim it down and remove special characters for display
|
191
|
+
trimval=refarr[i]['href']
|
192
|
+
finval=trimval.gsub!(/[!:\/-]/, '')
|
193
|
+
#puts refarr[i]
|
194
|
+
if(finval==nil && refarr[i]!=nil)
|
195
|
+
finval=refarr[i]
|
196
|
+
end
|
197
|
+
|
198
|
+
|
199
|
+
if(finval!=nil)
|
200
|
+
|
201
|
+
#create subdirectory for storing current links page
|
202
|
+
#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
|
203
|
+
#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
|
204
|
+
#end
|
205
|
+
#store page from the link in the subdirectory
|
206
|
+
crfile=File.new(('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp,"w")
|
207
|
+
crfile.puts pagina
|
208
|
+
crfile.close
|
209
|
+
end
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
end#end for each
|
214
|
+
|
215
|
+
|
216
|
+
|
217
|
+
|
218
|
+
else#<< depth not > 0
|
219
|
+
for i in 1..links.length
|
220
|
+
refarr[i]['href']=""
|
221
|
+
end
|
87
222
|
end
|
88
223
|
|
89
|
-
|
224
|
+
#store newly generated html/links for current page
|
225
|
+
mainpage =File.new('./page.html',"w")
|
226
|
+
mainpage.puts page
|
227
|
+
mainpage.close
|
228
|
+
puts "finished"
|
229
|
+
end #end def Localize
|
230
|
+
|
231
|
+
#########################################################################################
|
90
232
|
end#module
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5.2
|
4
|
+
version: 0.1.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bradley Maynard
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|