rubyretriever 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
4
- data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
3
+ metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
4
+ data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
5
5
  SHA512:
6
- metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
7
- data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
6
+ metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
7
+ data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943
@@ -10,7 +10,7 @@ module Retriever
10
10
  class Fetch
11
11
  attr_reader :maxPages, :t
12
12
 
13
- def initialize(url,options)
13
+ def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
14
14
  @connection_tally = {
15
15
  :success => 0,
16
16
  :error => 0,
@@ -58,7 +58,7 @@ module Retriever
58
58
  def lg(msg)
59
59
  puts "### #{msg}" if @v
60
60
  end
61
- def dump
61
+ def dump #prints current data collection to STDOUT, meant for CLI use.
62
62
  puts "###############################"
63
63
  if @v
64
64
  puts "Connection Tally:"
@@ -85,7 +85,7 @@ module Retriever
85
85
  puts "###############################"
86
86
  puts
87
87
  end
88
- def write
88
+ def write #writes current data collection out to CSV in current directory
89
89
  if @output
90
90
  i = 0
91
91
  CSV.open("#{@output}.csv", "w") do |csv|
@@ -104,7 +104,7 @@ module Retriever
104
104
  puts
105
105
  end
106
106
  end
107
- def async_crawl_and_collect()
107
+ def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
108
108
  while (@already_crawled.size < @maxPages)
109
109
  if @linkStack.empty?
110
110
  if @prgrss
@@ -116,11 +116,11 @@ module Retriever
116
116
  end
117
117
  new_links_arr = self.asyncGetWave()
118
118
  next if (new_links_arr.nil? || new_links_arr.empty?)
119
- new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
119
+ new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
120
120
  @linkStack.concat(new_links_arr).uniq!
121
121
  @data.concat(new_links_arr) if @s
122
122
  end
123
- @progressbar.finish if @prgrss
123
+ @progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
124
124
  end
125
125
  def good_response?(resp, url) #returns true if resp is ok to continue process, false if we need to 'next' it
126
126
  return false if !resp
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchFiles < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
4
4
  super
5
5
  @data = []
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -21,7 +21,7 @@ module Retriever
21
21
  @data.sort_by! {|x| x.length}
22
22
  @data.uniq!
23
23
  end
24
- def download_file(path)
24
+ def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
25
25
  arr = path.split('/')
26
26
  shortname = arr.pop
27
27
  puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
@@ -32,7 +32,7 @@ module Retriever
32
32
  end
33
33
  puts " SUCCESS: Download Complete"
34
34
  end
35
- def autodownload()
35
+ def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
36
36
  lenny = @data.count
37
37
  puts "###################"
38
38
  puts "### Initiating Autodownload..."
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchSEO < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
4
4
  super
5
5
  @data = []
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -1,6 +1,6 @@
1
1
  module Retriever
2
2
  class FetchSitemap < Fetch
3
- def initialize(url,options)
3
+ def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
4
4
  super
5
5
  @data = [@t.target]
6
6
  page_one = Retriever::Page.new(@t.source,@t)
@@ -18,7 +18,7 @@ module Retriever
18
18
  @data.sort_by! {|x| x.length} if @data.size>1
19
19
  @data.uniq!
20
20
  end
21
- def gen_xml
21
+ def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
22
22
  f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
23
23
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
24
24
  @data.each do |url|
@@ -1,3 +1,3 @@
1
1
  module Retriever
2
- VERSION = '1.0.0'
2
+ VERSION = '1.0.1'
3
3
  end
data/readme.md CHANGED
@@ -1,4 +1,4 @@
1
- [RubyRetriever] (http://www.softwarebyjoe.com/rubyretriever/)
1
+ [RubyRetriever] (http://softwarebyjoe.com/rubyretriever/)
2
2
  ==============
3
3
  [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
4
4
 
@@ -10,6 +10,8 @@ RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchro
10
10
 
11
11
  RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
12
12
 
13
+ v1.0 Update 6/07/2014 - Includes major code changes, a lot of bug fixes. Much better at dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
14
+
13
15
 
14
16
  getting started
15
17
  -----------
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
58
60
 
59
61
  Where MODE FLAG is required, and is either:
60
62
  -s, --sitemap FORMAT (only accepts CSV or XML atm)
61
- -f, --files FILETYPE
63
+ -f, --files FILETYPE
62
64
  -e, --seo
63
65
 
64
66
  and OPTIONS is the applicable:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-06-07 00:00:00.000000000 Z
11
+ date: 2014-06-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-synchrony