omni_scrape 0.1.5.2 → 0.1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    NjQ5ZDBiODVkN2RkZDM0MWE1ZGQ2MGE1MmQ5ZTJlZDJjNjE2NWE3Yw==
+    YzRjNTk1YmVmNTU4NWVlOTg0MmY3MGJmMzIwYTRlZDk0MTQxM2JjMg==
   data.tar.gz: !binary |-
-    MmI4MmQ0M2QwYWY1OTUwOWE0ZGY5MzQzYzRjZTk3MTdlMjVjOGE4MQ==
+    NDEzN2FjNTQ2MmJmMDgxNjg5NTJlZTZkMThlNDYxM2YxN2MwOWUwYQ==
 SHA512:
   metadata.gz: !binary |-
-    YzViNThmMjk3OWQ2NTUzMDNiNWYwNTM5ZTMyZDk5MDM0M2EyZGI1NzY3MjA5
-    Y2QwYTM5MDgyNDVlNGU5MjFlMjBjYzMwYjU3MDM0NjM3NDhjZWViNjRlMDFl
-    MTRhODc1NjA5Yzc1MjQ0YmE2NDIyNDc4MTFiOTI2ODQ5Y2IxNDY=
+    ODQzMGU4NTdkNDI3MGNhMTc5Y2QyODY4OGI1ZTIxYzgwNmZiZTM5NjIzZjE0
+    YTYyMDM1NDgxMTczYjdiZDgyZWEyODg3ODkwNThlNWVmNWU4ZDVjNmJlMDNh
+    YjkyYWM5ZTk1YTg4YzhiNWFhNzdmZDhmOWFiM2ZkYjNhMTI0MWE=
   data.tar.gz: !binary |-
-    NjAzM2Q0NmY5ZTYyZGU2ZDRiNjNlNDM3ZDNjYjM0MGMzMTVmZGFjYTllM2Q4
-    MDZhOGMzMjJmNTQ3YWJjNDg5MjkzOTQ4OTc3MTE0OGRlOWMxNmU2NzllMzAw
-    ZjU2YjMxYWY4NjBlOThiYmRmODQ2ZWM0M2E3YzVhZGQ2NjEzMDY=
+    ZDI3MWNiNzVmYzIxNzhhMWRjZWZlM2IxMTBmOWQ3ZjY1Y2VmZDBlNTNkMjY1
+    MmE4MWQzOThlZGIzNzNhNTIxZGI1NTkyZTE5ZmI2YTFkZmFmZDY5NDdiYTEx
+    ZGM4ZDYyOGM5N2I1YTU1ODdjMjVlMWFkOTY2OWVlMDRmYTllZmI=
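Both the keys and the digest values in checksums.yaml are base64-wrapped by Psych's `!binary` tag, so the file looks more opaque than it is. A minimal decoding sketch using only the Ruby standard library (the digest shown is the new metadata.gz SHA1 from this diff):

```ruby
require "base64"

# The YAML key "U0hBMQ==" is just the algorithm name in base64:
puts Base64.decode64("U0hBMQ==")
# => SHA1

# Digest values decode to plain hex strings; this is the metadata.gz
# SHA1 introduced by 0.1.5.4:
puts Base64.decode64("YzRjNTk1YmVmNTU4NWVlOTg0MmY3MGJmMzIwYTRlZDk0MTQxM2JjMg==")
# => c4c595bef5585ee9842f70bf320a4ed941413bc2
```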
data/README.md CHANGED
@@ -20,17 +20,31 @@ Or install it yourself as:
 
 ## Usage
 Add the lines : require 'omni_scrape' and include OmniScrape to your script file.
+
 Method : CrawlScrape
+
 example : OmniScrape.CrawlScrape("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 0, "http://en.wikipedia.org")
 
 This method takes three parameters the first should be the url to start at.
 
-The second parameter is currently unimplemented but will be the depth to crawl. (just pass it 0)
+The second parameter is currently unimplemented but will be the depth to crawl. (just pass it 1)
+
+The third is a sub-url for internal links.
+
+Method : Localize
+
+example : OmniScrape.Localize("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 1, "http://en.wikipedia.org")
+
+This method takes three parameters the first should be the url to start at.
+
+The second parameter is the depth to crawl and currently only supports 1 layer. Note: recursion will be added soon for deeper crawling. *(just pass it 1)*
 
 The third is a sub-url for internal links.
 
 
-description: CrawlScrape will follow every link from the page provided and scrape the html from those pages, storing it as html files where the script is located.
+description: Localize will follow every link from the page provided and scrape the html from those pages, storing it as html files in subdirectories.
+
+Currently the first page will link to all other pages that are scraped and stored. Note: further linking will be added soon.
 
 ## Development
 
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OmniScrape
-  VERSION = "0.1.5.2"
+  VERSION = "0.1.5.4"
 end
data/lib/omni_scrape.rb CHANGED
@@ -1,7 +1,13 @@
 require "omni_scrape/version"
 
 module OmniScrape
+
+##########################################################################################
+
 def CrawlScrape(url, depth, sub_url)
+if (depth<0)
+depth=0
+end
 #open the starting page
 page = Nokogiri::HTML(open(url))
 #collect all of the links from the page
@@ -46,8 +52,14 @@ links.each do |link|
 #setup for recognition of the end of the array
 refarr.push("-")
 
+#create folder for storing current set of scraped pages
+if (Dir.exist?('./results'+depth.to_s))
+else Dir.mkdir('./results'+depth.to_s)
+end
+
+
 #in each link
-for i in 0..titlearr.length
+for i in 1..titlearr.length
 if(refarr[i]!="-")
 #evaluate whether link is internal or external
 if(refarr[i].include?('http://'))
@@ -78,13 +90,143 @@ for i in 0..titlearr.length
 puts finval
 if(finval!=nil)
 #store html from the link with title of the link
-crfile=File.new((finval+".html").chomp,"w")
+crfile=File.new(('./results'+depth.to_s+"/"+finval+".html").chomp,"w")
 crfile.puts pagina
 crfile.close
 end
 end
 end
+end#end for each
+puts "finished"
+end#def crawlscrape
+
+#############################################################################################
+
+def Localize(url, depth, sub_url)
+#open the starting page
+if (depth<0)
+depth=0
+end
+page = Nokogiri::HTML(open(url))
+#collect all of the links from the page
+links= page.css('a')
+title = page.css('title')
+#initialize variables
+refarr=[]
+hrefs = []
+x=0
+
+#add href to arrays for each link
+links.each do |link|
+if(link['href']!=nil && link['href']!="")
+# puts x
+# puts (link['title'].split.join)
+# x+=1
+hrefs.push(link)
+
+end
+
+end
+total=0
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+total+=1
+end
+puts total
+puts "links in page"
+
+
+#setup for recognition of the end of the array
+refarr.push("-")
+
+
+#create subdirectory for storing current set of scraped pages
+if (Dir.exist?('./pages'+depth.to_s))
+else Dir.mkdir('./pages'+depth.to_s)
+end
+
+if(depth>0)
+#in each link
+check = (refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")
+#evaluate whether link is internal or external
+if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+if(refarr[i]['href'].include?('http://'))
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+#puts "external link"
+end
+end
+fourofour=false
+
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url))
+end
+#test for a 404
+rescue Exception =>ex
+#puts "got a 404"
+#replace href (no navigation onclick)
+refarr[i]['href'] =""
+fourofour=true
+
+retry
+end
+
+if (fourofour==false)
+#make relevant links reference local files
+if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+linkref = (('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp)
+refarr[i]['href']=linkref
+puts refarr[i]['href']
+#puts "working"
+end
+
+
+#trim it down and remove special characters for display
+trimval=refarr[i]['href']
+finval=trimval.gsub!(/[!:\/-]/, '')
+#puts refarr[i]
+if(finval==nil && refarr[i]!=nil)
+finval=refarr[i]
+end
+
+
+if(finval!=nil)
+
+#create subdirectory for storing current links page
+#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
+#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
+#end
+#store page from the link in the subdirectory
+crfile=File.new(('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp,"w")
+crfile.puts pagina
+crfile.close
+end
+end
+end
+
+end#end for each
+
+
+
+
+else#<< depth not > 0
+for i in 1..links.length
+refarr[i]['href']=""
+end
 end
 
-end#def crawlscrape
+#store newly generated html/links for current page
+mainpage =File.new('./page.html',"w")
+mainpage.puts page
+mainpage.close
+puts "finished"
+end #end def Localize
+
+#########################################################################################
 end#module
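One detail worth calling out in the new Localize method is the fetch-failure handling: rather than letting a dead link abort the crawl, the rescue sets a flag and calls `retry`, so the second pass through the `begin` block skips the `open` call and the loop simply blanks that href. A stripped-down sketch of the same flag-plus-retry pattern, with a hypothetical URL and standard library only (mirroring the gem's 2015-era `open(url)` style):

```ruby
require "open-uri"

url = "http://example.invalid/missing"   # hypothetical dead link
fetch_failed = false
html = nil
begin
  # guard: on the retry pass the flag skips the fetch entirely
  html = open(url).read unless fetch_failed
rescue StandardError
  fetch_failed = true   # remember the failure instead of crashing the crawl
  retry                 # re-enters begin; the guard now falls straight through
end
puts(fetch_failed ? "fetch failed, link would be blanked" : html[0, 60])
```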
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: omni_scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.5.2
+  version: 0.1.5.4
 platform: ruby
 authors:
 - Bradley Maynard
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-08 00:00:00.000000000 Z
+date: 2015-06-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri