omni_scrape 0.1.5.4 → 0.1.5.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-    YzRjNTk1YmVmNTU4NWVlOTg0MmY3MGJmMzIwYTRlZDk0MTQxM2JjMg==
+    YmI5NWNlZmYxN2Q5YzZmYzE4ZThjM2IxMDZjMWM2MDI3MDBiM2U0MA==
   data.tar.gz: !binary |-
-    NDEzN2FjNTQ2MmJmMDgxNjg5NTJlZTZkMThlNDYxM2YxN2MwOWUwYQ==
+    OGYwZTdlOWFiYjMxMTNmNGU3MmJjZWM5MWU5ZWQ2MGY2MDUzYmQzNg==
 SHA512:
   metadata.gz: !binary |-
-    ODQzMGU4NTdkNDI3MGNhMTc5Y2QyODY4OGI1ZTIxYzgwNmZiZTM5NjIzZjE0
-    YTYyMDM1NDgxMTczYjdiZDgyZWEyODg3ODkwNThlNWVmNWU4ZDVjNmJlMDNh
-    YjkyYWM5ZTk1YTg4YzhiNWFhNzdmZDhmOWFiM2ZkYjNhMTI0MWE=
+    ZWJkZWZhODhmM2Y0YjExZTBjYjc1MmNhNmJiNWI1YTJiYzQ4M2QyMDExYWZm
+    ZWM1NTMzMjYyYjkyNTRmMzc4Mjc1ODIxNzgxZTUyYzhhMzQ1Y2U5M2UyODE5
+    OTJhMGRhNjdhZjQ3N2YyNDM0NmE3YTdhY2ViNWU4NjBlNWQzNDM=
   data.tar.gz: !binary |-
-    ZDI3MWNiNzVmYzIxNzhhMWRjZWZlM2IxMTBmOWQ3ZjY1Y2VmZDBlNTNkMjY1
-    MmE4MWQzOThlZGIzNzNhNTIxZGI1NTkyZTE5ZmI2YTFkZmFmZDY5NDdiYTEx
-    ZGM4ZDYyOGM5N2I1YTU1ODdjMjVlMWFkOTY2OWVlMDRmYTllZmI=
+    OGNkNTA2OWQ3YmI0MGVjMWQyMjQ4ZjM1MmIxNGUzYzVhMzMxODdkYjZmYjkz
+    MTU4MzIyNzIyMzg1ZDQ2YmEyNTA2MGE4MGI2MDMwY2RlNjA2YmFiYmUzZDJi
+    ZmY3YWI3ZDRkZjg0NjdjMTEwZTNmMWVmZjNhNDlkMmE1N2RhMGY=
data/README.md CHANGED
@@ -21,7 +21,7 @@ Or install it yourself as:
 ## Usage
 Add the lines `require 'omni_scrape'` and `include OmniScrape` to your script file.
 
-Method : CrawlScrape
+Method : CrawlScrape (Note: this method is currently on a back burner.)
 
 example : OmniScrape.CrawlScrape("http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games", 0, "http://en.wikipedia.org")
 
@@ -37,14 +37,17 @@ example : OmniScrape.Localize("http://en.wikipedia.org/wiki/List_of_massively_mu
 
 This method takes three parameters; the first should be the URL to start at.
 
-The second parameter is the depth to crawl and currently only supports 1 layer. Note: recursion will be added soon for deeper crawling. *(just pass it 1)*
+The second parameter is the depth to crawl. **Warning: the number of pages fetched grows at an INSANE rate with depth.**
 
 The third is a sub-url for internal links.
 
+Method : FLocalize
+
+This is the recursive method called by Localize and shouldn't be used directly. :)
 
 description: Localize will follow every link from the page provided and scrape the HTML from those pages, storing it as HTML files in subdirectories.
 
-Currently the first page will link to all other pages that are scraped and stored. Note: further linking will be added soon.
+Scraped pages link to other local pages. Currently there is a lot of duplication in this regard. Note: working on eliminating the duplication.
 
 ## Development
 
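For context, the Localize workflow the README describes can be driven with a short script. A minimal sketch, assuming the gem is installed, reusing the README's own example arguments:

```ruby
# Minimal usage sketch based on the README above; assumes `gem install omni_scrape`.
require 'omni_scrape'
include OmniScrape

# Arguments: start URL, crawl depth, and the sub-url prefixed to internal links.
# Depth 1 already fetches every link on the start page, so keep it small.
OmniScrape.Localize(
  "http://en.wikipedia.org/wiki/List_of_massively_multiplayer_online_role-playing_games",
  1,
  "http://en.wikipedia.org"
)
# Output: ./page.html for the start page, plus ./pages0, ./pages1, ...
# directories holding the scraped copies (see the code changes below).
```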
data/lib/omni_scrape/version.rb CHANGED
@@ -1,3 +1,3 @@
 module OmniScrape
-  VERSION = "0.1.5.4"
+  VERSION = "0.1.5.6.4"
 end
data/lib/omni_scrape.rb CHANGED
@@ -53,10 +53,13 @@ links.each do |link|
 refarr.push("-")
 
 #create folder for storing current set of scraped pages
-if (Dir.exist?('./results'+depth.to_s))
-else Dir.mkdir('./results'+depth.to_s)
+g_depth = s_depth
+while (g_depth>-1)
+if (Dir.exist?('./pages'+g_depth.to_s))
+else Dir.mkdir('./pages'+g_depth.to_s)
 end
-
+g_depth =g_depth-1
+end
 
 #in each link
 for i in 1..titlearr.length
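This hunk replaces the single `./results<depth>` folder of 0.1.5.4 with a countdown loop that pre-creates one `./pages<N>` folder per crawl level. (Note that the hunk sits inside CrawlScrape yet reads `s_depth`, which is only initialized in `Localize` below; this looks consistent with the README's note that CrawlScrape is on a back burner.) A minimal idiomatic sketch of the same setup, not the gem's code:

```ruby
# Equivalent of the g_depth loop above: one output folder per depth level.
require 'fileutils'

def create_depth_folders(s_depth)
  (0..s_depth).each do |level|
    FileUtils.mkdir_p("./pages#{level}") # no-op if the folder already exists
  end
end

create_depth_folders(2) # => creates ./pages0, ./pages1, ./pages2
```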
@@ -101,12 +104,18 @@ end#end for each
 end#def crawlscrape
 
 #############################################################################################
-
+
 def Localize(url, depth, sub_url)
-#open the starting page
+
+#initialize to extract from user view
+s_depth = depth
+i_page = 0
+prev_ipage = 0
+link_to_add =""
 if (depth<0)
 depth=0
 end
+#open the starting page
 page = Nokogiri::HTML(open(url))
 #collect all of the links from the page
 links= page.css('a')
@@ -141,13 +150,13 @@ puts "links in page"
 #setup for recognition of the end of the array
 refarr.push("-")
 
-
-#create subdirectory for storing current set of scraped pages
+if(depth>0)
+
+#create subdirectory for storing current set of scraped pages
+
 if (Dir.exist?('./pages'+depth.to_s))
 else Dir.mkdir('./pages'+depth.to_s)
 end
-
-if(depth>0)
 #in each link
 check = (refarr.length-1)
 for i in 0..check
@@ -159,8 +168,8 @@ if(depth>0)
 else
 url=sub_url+refarr[i]['href']
 #puts "external link"
-end
-end
+end#refarr[i]['href'].include?
+end#refarr[i]['href']!=nil
 fourofour=false
 
 begin
@@ -175,16 +184,35 @@ if(depth>0)
 fourofour=true
 
 retry
-end
+end #begin
 
 if (fourofour==false)
 #make relevant links reference local files
 if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
-linkref = (('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp)
+puts "link: "
+puts depth
+#wutwut
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+if (depth == s_depth)
+linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+else
+
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+end
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
 refarr[i]['href']=linkref
 puts refarr[i]['href']
 #puts "working"
-end
+end #refarr[i]['href']!=""
 
 
 #trim it down and remove special characters for display
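The new linkref logic above rewrites each anchor to point at a local copy. `j_depth = s_depth - depth` measures how far below the start page the current level is, the `for r` loop merely counts that distance into `clutch` (which names the `<clutch>set` output folder), and `appendval` accumulates `"../"` segments so links written into nested folders climb back out. A worked sketch of the same arithmetic, with illustrative values:

```ruby
# Worked example of the path arithmetic above (names mirror the diff).
s_depth = 2                        # depth requested by the caller
depth   = 1                        # level currently being processed
i_page  = 0                        # index of the page these links came from
i       = 4                        # index of this link within the page

j_depth   = s_depth - depth        # levels below the start page (here: 1)
clutch    = j_depth                # the `for r in 1..j_depth` loop just counts to j_depth
appendval = "../" * j_depth        # climb out of the nested output directories

name    = "#{i_page}x#{i}page.html"            # same naming scheme as pass_a_link
linkref = if depth == s_depth
            "./pages#{depth}/#{clutch}set/#{name}"
          else
            "#{appendval}../pages#{depth}/#{clutch}set/#{name}"
          end
puts linkref # => "../../pages1/1set/0x4page.html"
```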
@@ -193,40 +221,238 @@ if(depth>0)
 #puts refarr[i]
 if(finval==nil && refarr[i]!=nil)
 finval=refarr[i]
-end
+end #finval == nil
 
-
+n_depth = depth-1
+
 if(finval!=nil)
-
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
 #create subdirectory for storing current links page
 #if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
 #else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
 #end
-#store page from the link in the subdirectory
-crfile=File.new(('./pages'+depth.to_s+"/link"+i.to_s+".html").chomp,"w")
-crfile.puts pagina
+
+#this is where we will call the method for each link **********
+
+
+end #finval!=nil
+end #fourofour==false
+end #refarr[i]!="-"
+
+end#end for each
+
+
+puts "here?"
+puts depth
+
+else#<< depth not > 0
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
+refarr[i]['href']=""
+end
+end
+end
+
+if (depth == s_depth)
+#store newly generated html/links for current page
+mainpage =File.new('./page.html',"w")
+mainpage.puts page
+mainpage.close
+puts "finished"
+
+else
+#store page from the link in the subdirectory
+puts "page: "
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+puts "link to pass"
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
 crfile.close
+
+end
+end #end def Localize
+
+#########################################################################################
+def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
+#open the starting page
+
+if (depth<0)
+depth=0
+end
+page = Nokogiri::HTML(open(url))
+#collect all of the links from the page
+links= page.css('a')
+title = page.css('title')
+#initialize variables
+refarr=[]
+hrefs = []
+x=0
+
+#add href to arrays for each link
+links.each do |link|
+if(link['href']!=nil && link['href']!="")
+# puts x
+# puts (link['title'].split.join)
+# x+=1
+hrefs.push(link)
+
+end
+
+end
+total=0
+#transfer links to other array
+while(!hrefs.empty?)
+value= hrefs.pop
+refarr.push(value)
+total+=1
 end
-end
-end
+puts total
+puts "links in page"
+
 
-end#end for each
+#setup for recognition of the end of the array
+refarr.push("-")
 
+if(depth>0)
+
+#create subdirectory for storing current set of scraped pages
+
+if (Dir.exist?('./pages'+depth.to_s))
+else Dir.mkdir('./pages'+depth.to_s)
+end
+#in each link
+check = (refarr.length-1)
+for i in 0..check
+if(refarr[i]!="-")
+#evaluate whether link is internal or external
+if(refarr[i]['href']!=nil && refarr[i]['href']!="")
+if(refarr[i]['href'].include?('http://'))
+url=refarr[i]['href']
+else
+url=sub_url+refarr[i]['href']
+#puts "external link"
+end#refarr[i]['href'].include?
+end#refarr[i]['href']!=nil
+fourofour=false
+
+begin
+if(fourofour==false)
+pagina = Nokogiri::HTML(open(url))
+end
+#test for a 404
+rescue Exception =>ex
+#puts "got a 404"
+#replace href (no navigation onclick)
+refarr[i]['href'] =""
+fourofour=true
+
+retry
+end #begin
+
+if (fourofour==false)
+#make relevant links reference local files
+if(refarr[i]['href']!="" && refarr[i]['href']!=nil)
+puts "link: "
+puts depth
+#wutwut
+j_depth = s_depth - depth
+appendval = "../"
+clutch = 0
+for r in 1..j_depth
+
+clutch +=1
+end
+if (Dir.exist?('./pages'+depth.to_s+"/"+clutch.to_s+"set"))
+else Dir.mkdir('./pages'+depth.to_s+"/"+clutch.to_s+"set")
+end
+if (depth == s_depth)
+linkref = (('./pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+else
+
+linkref = ((appendval+'../pages'+depth.to_s+"/"+clutch.to_s+"set/"+i_page.to_s+"x"+i.to_s+"page.html").chomp)
+end
+pass_a_link = i_page.to_s+"x"+i.to_s+"page.html"
+refarr[i]['href']=linkref
+puts refarr[i]['href']
+#puts "working"
+end #refarr[i]['href']!=""
+
+
+#trim it down and remove special characters for display
+trimval=refarr[i]['href']
+finval=trimval.gsub!(/[!:\/-]/, '')
+#puts refarr[i]
+if(finval==nil && refarr[i]!=nil)
+finval=refarr[i]
+end #finval == nil
+
+n_depth = depth-1
+
+if(finval!=nil)
+self. FLocalize(url, n_depth, sub_url, s_depth, i, i_page, pass_a_link)
+#create subdirectory for storing current links page
+#if (Dir.exist?('./pages'+depth.to_s+'/link'+i.to_s))
+#else Dir.mkdir('./pages'+depth.to_s+'/link'+i.to_s)
+#end
+
+#this is where we will call the method for each link **********
+
+
+end #finval!=nil
+end #fourofour==false
+end #refarr[i]!="-"
+
+end#end for each
 
 
+puts "here?"
+puts depth
 
 else#<< depth not > 0
-for i in 1..links.length
+check = (refarr.length-1)
+for i in 0..check
+if (refarr[i]['href']!=nil && refarr[i]['href']!="")
 refarr[i]['href']=""
+end
 end
 end
 
+if (depth == s_depth)
 #store newly generated html/links for current page
 mainpage =File.new('./page.html',"w")
 mainpage.puts page
 mainpage.close
 puts "finished"
+
+else
+#store page from the link in the subdirectory
+puts "page: "
+p_depth = depth +1
+j_depth = s_depth - depth
+appendval = ""
+clutch = 0
+for r in 1..j_depth
+appendval += "../"
+clutch +=1
+end
+clutch -=1
+puts "link to pass"
+crfile=File.new(('./pages'+p_depth.to_s+"/"+clutch.to_s+"set/"+link_to_add),"w")
+crfile.puts page
+crfile.close
+
+end
 end #end def Localize
 
 #########################################################################################
+
 end#module
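Taken as a whole, this release turns the one-level scrape of 0.1.5.4 into recursion: `Localize` captures the requested depth in `s_depth` and calls the new `FLocalize` once per link with `depth - 1`; `FLocalize` repeats the same collect/rewrite/save cycle and recurses until `depth` reaches 0, where leftover hrefs are blanked instead of followed. (The `prev_ipage` parameter is accepted but never read in this version.) A condensed control-flow skeleton, with hypothetical helpers `page_links` and `resolve` standing in for the Nokogiri code above:

```ruby
# Condensed control-flow skeleton of Localize/FLocalize in this release.
# `page_links` and `resolve` are hypothetical stand-ins, not gem methods.
def page_links(url); []; end               # would collect the page's <a> elements
def resolve(link, sub_url); sub_url; end   # would turn an href into an absolute URL

def Localize(url, depth, sub_url)
  s_depth = depth                          # remember the depth the caller asked for
  page_links(url).each_with_index do |link, i|
    FLocalize(resolve(link, sub_url), depth - 1, sub_url,
              s_depth, i, 0, "0x#{i}page.html")
  end
  # depth == s_depth here, so the rewritten start page lands in ./page.html
end

def FLocalize(url, depth, sub_url, s_depth, i_page, prev_ipage, link_to_add)
  depth = 0 if depth < 0
  if depth > 0
    page_links(url).each_with_index do |link, i|
      FLocalize(resolve(link, sub_url), depth - 1, sub_url,
                s_depth, i, i_page, "#{i_page}x#{i}page.html")
    end
  end                                      # at depth 0, links are blanked instead
  # every recursive call saves its page as ./pages<depth+1>/<set>/<link_to_add>
end

Localize("http://example.com", 2, "http://example.com")
```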
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: omni_scrape
 version: !ruby/object:Gem::Version
-  version: 0.1.5.4
+  version: 0.1.5.6.4
 platform: ruby
 authors:
 - Bradley Maynard
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-06-09 00:00:00.000000000 Z
+date: 2015-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri