rubyretriever 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
4
- data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
3
+ metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
4
+ data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
5
5
  SHA512:
6
- metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
7
- data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
6
+ metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
7
+ data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943
@@ -10,7 +10,7 @@ module Retriever
10
10
  class Fetch
11
11
  attr_reader :maxPages, :t
12
12
 
13
- def initialize(url,options)
13
+ def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
14
14
  @connection_tally = {
15
15
  :success => 0,
16
16
  :error => 0,
@@ -58,7 +58,7 @@ module Retriever
58
58
  def lg(msg)
59
59
  puts "### #{msg}" if @v
60
60
  end
61
- def dump
61
+ def dump #prints current data collection to STDOUT, meant for CLI use.
62
62
  puts "###############################"
63
63
  if @v
64
64
  puts "Connection Tally:"
@@ -85,7 +85,7 @@ module Retriever
85
85
  puts "###############################"
86
86
  puts
87
87
  end
88
- def write
88
+ def write #writes current data collection out to CSV in current directory
89
89
  if @output
90
90
  i = 0
91
91
  CSV.open("#{@output}.csv", "w") do |csv|
@@ -104,7 +104,7 @@ module Retriever
104
104
  puts
105
105
  end
106
106
  end
107
- def async_crawl_and_collect()
107
+ def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
108
108
  while (@already_crawled.size < @maxPages)
109
109
  if @linkStack.empty?
110
110
  if @prgrss
@@ -116,11 +116,11 @@ module Retriever
116
116
  end
117
117
  new_links_arr = self.asyncGetWave()
118
118
  next if (new_links_arr.nil? || new_links_arr.empty?)
119
- new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
119
+ new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
120
120
  @linkStack.concat(new_links_arr).uniq!
121
121
  @data.concat(new_links_arr) if @s
122
122
  end
123
- @progressbar.finish if @prgrss
123
+ @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
124
124
  end
125
125
  def good_response?(resp, url) #returns true if resp is ok to continue process, false if we need to 'next' it
126
126
  return false if !resp
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchFiles < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
4
4
  super
5
5
  @data = []
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -21,7 +21,7 @@ module Retriever
21
21
  @data.sort_by! {|x| x.length}
22
22
  @data.uniq!
23
23
  end
24
- def download_file(path)
24
+ def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
25
25
  arr = path.split('/')
26
26
  shortname = arr.pop
27
27
  puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
@@ -32,7 +32,7 @@ module Retriever
32
32
  end
33
33
  puts " SUCCESS: Download Complete"
34
34
  end
35
- def autodownload()
35
+ def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
36
36
  lenny = @data.count
37
37
  puts "###################"
38
38
  puts "### Initiating Autodownload..."
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchSEO < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
4
4
  super
5
5
  @data = []
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchSitemap < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
4
4
  super
5
5
  @data = [@t.target]
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -18,7 +18,7 @@ module Retriever
18
18
  @data.sort_by! {|x| x.length} if @data.size>1
19
19
  @data.uniq!
20
20
  end
21
- def gen_xml
21
+ def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
22
22
  f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
23
23
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
24
24
  @data.each do |url|
@@ -1,3 +1,3 @@
1
1
  module Retriever
2
- VERSION = '1.0.0'
2
+ VERSION = '1.0.1'
3
3
  end
data/readme.md CHANGED
@@ -1,4 +1,4 @@
1
- [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
1
+ [RubyRetriever] (http://softwarebyjoe.com/rubyretriever/)
2
2
  ==============
3
3
  [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
4
4
 
@@ -10,6 +10,8 @@ RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchro
10
10
 
11
11
  RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
12
12
 
13
+ v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better at dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
14
+
13
15
 
14
16
  getting started
15
17
  -----------
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
58
60
 
59
61
  Where MODE FLAG is required, and is either:
60
62
  -s, --sitemap FORMAT (only accepts CSV or XML atm)
61
- -f, --files FILETYPE
63
+ -f, --files FILETYPE
62
64
  -e, --seo
63
65
 
64
66
  and OPTIONS is the applicable:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-07 00:00:00.000000000 Z
11
+ date: 2014-06-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-synchrony