rubyretriever 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/retriever/fetch.rb +6 -6
- data/lib/retriever/fetchfiles.rb +3 -3
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +2 -2
- data/lib/retriever/version.rb +1 -1
- data/readme.md +4 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
|
4
|
+
data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
|
7
|
+
data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943
|
data/lib/retriever/fetch.rb
CHANGED
@@ -10,7 +10,7 @@ module Retriever
|
|
10
10
|
class Fetch
|
11
11
|
attr_reader :maxPages, :t
|
12
12
|
|
13
|
-
def initialize(url,options)
|
13
|
+
def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
|
14
14
|
@connection_tally = {
|
15
15
|
:success => 0,
|
16
16
|
:error => 0,
|
@@ -58,7 +58,7 @@ module Retriever
|
|
58
58
|
def lg(msg)
|
59
59
|
puts "### #{msg}" if @v
|
60
60
|
end
|
61
|
-
def dump
|
61
|
+
def dump #prints current data collection to STDOUT, meant for CLI use.
|
62
62
|
puts "###############################"
|
63
63
|
if @v
|
64
64
|
puts "Connection Tally:"
|
@@ -85,7 +85,7 @@ module Retriever
|
|
85
85
|
puts "###############################"
|
86
86
|
puts
|
87
87
|
end
|
88
|
-
def write
|
88
|
+
def write #writes current data collection out to CSV in current directory
|
89
89
|
if @output
|
90
90
|
i = 0
|
91
91
|
CSV.open("#{@output}.csv", "w") do |csv|
|
@@ -104,7 +104,7 @@ module Retriever
|
|
104
104
|
puts
|
105
105
|
end
|
106
106
|
end
|
107
|
-
def async_crawl_and_collect()
|
107
|
+
def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
|
108
108
|
while (@already_crawled.size < @maxPages)
|
109
109
|
if @linkStack.empty?
|
110
110
|
if @prgrss
|
@@ -116,11 +116,11 @@ module Retriever
|
|
116
116
|
end
|
117
117
|
new_links_arr = self.asyncGetWave()
|
118
118
|
next if (new_links_arr.nil? || new_links_arr.empty?)
|
119
|
-
new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
|
119
|
+
new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
|
120
120
|
@linkStack.concat(new_links_arr).uniq!
|
121
121
|
@data.concat(new_links_arr) if @s
|
122
122
|
end
|
123
|
-
@progressbar.finish if @prgrss
|
123
|
+
@progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
|
124
124
|
end
|
125
125
|
def good_response?(resp, url) #returns true if resp is ok to continue processing, false if we need to 'next' it
|
126
126
|
return false if !resp
|
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchFiles < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
|
4
4
|
super
|
5
5
|
@data = []
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -21,7 +21,7 @@ module Retriever
|
|
21
21
|
@data.sort_by! {|x| x.length}
|
22
22
|
@data.uniq!
|
23
23
|
end
|
24
|
-
def download_file(path)
|
24
|
+
def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
|
25
25
|
arr = path.split('/')
|
26
26
|
shortname = arr.pop
|
27
27
|
puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
|
@@ -32,7 +32,7 @@ module Retriever
|
|
32
32
|
end
|
33
33
|
puts " SUCCESS: Download Complete"
|
34
34
|
end
|
35
|
-
def autodownload()
|
35
|
+
def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
|
36
36
|
lenny = @data.count
|
37
37
|
puts "###################"
|
38
38
|
puts "### Initiating Autodownload..."
|
data/lib/retriever/fetchseo.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchSEO < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
|
4
4
|
super
|
5
5
|
@data = []
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchSitemap < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
|
4
4
|
super
|
5
5
|
@data = [@t.target]
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -18,7 +18,7 @@ module Retriever
|
|
18
18
|
@data.sort_by! {|x| x.length} if @data.size>1
|
19
19
|
@data.uniq!
|
20
20
|
end
|
21
|
-
def gen_xml
|
21
|
+
def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
|
22
22
|
f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
|
23
23
|
f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
|
24
24
|
@data.each do |url|
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[RubyRetriever] (http://
|
1
|
+
[RubyRetriever](http://softwarebyjoe.com/rubyretriever/)
|
2
2
|
==============
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
|
4
4
|
|
@@ -10,6 +10,8 @@ RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchro
|
|
10
10
|
|
11
11
|
RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
|
12
12
|
|
13
|
+
v1.0 Update 6/07/2014 - Includes major code changes and a lot of bug fixes. Much better at dealing with redirects and with issues such as the host changing. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
|
14
|
+
|
13
15
|
|
14
16
|
getting started
|
15
17
|
-----------
|
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
|
|
58
60
|
|
59
61
|
Where MODE FLAG is required, and is either:
|
60
62
|
-s, --sitemap FORMAT (only accepts CSV or XML atm)
|
61
|
-
-f, --files FILETYPE
|
63
|
+
-f, --files FILETYPE
|
62
64
|
-e, --seo
|
63
65
|
|
64
66
|
and OPTIONS is the applicable:
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyretriever
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Norton
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-synchrony
|