rubyretriever 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/retriever/fetch.rb +6 -6
- data/lib/retriever/fetchfiles.rb +3 -3
- data/lib/retriever/fetchseo.rb +1 -1
- data/lib/retriever/fetchsitemap.rb +2 -2
- data/lib/retriever/version.rb +1 -1
- data/readme.md +4 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 723b4186b7723a15b697a3e49705fa2972e9e643
|
4
|
+
data.tar.gz: fb0b273fd2da2281e1085aba4f4c22e14c6c89ce
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f21a631ff12dc191c2554bd92de118ebee11d577e9e76f4e4d9f971aec654bece4024c218fb3bff0a8460e7eafa08951e0f0f37bcb86df5342a92e2f465f111e
|
7
|
+
data.tar.gz: 831ae194294e72d1973bd8cb4593c2717b2ca5b6a321ff742afe24d3bbb143bd8585a4a566983830f6548b029703d730ab80196e1083a7fa4692732f69a89943
|
data/lib/retriever/fetch.rb
CHANGED
@@ -10,7 +10,7 @@ module Retriever
|
|
10
10
|
class Fetch
|
11
11
|
attr_reader :maxPages, :t
|
12
12
|
|
13
|
-
def initialize(url,options)
|
13
|
+
def initialize(url,options) #given target URL and RR options, creates a fetch object. There is no direct output, this is a parent class that the other fetch classes build off of.
|
14
14
|
@connection_tally = {
|
15
15
|
:success => 0,
|
16
16
|
:error => 0,
|
@@ -58,7 +58,7 @@ module Retriever
|
|
58
58
|
def lg(msg)
|
59
59
|
puts "### #{msg}" if @v
|
60
60
|
end
|
61
|
-
def dump
|
61
|
+
def dump #prints current data collection to STDOUT, meant for CLI use.
|
62
62
|
puts "###############################"
|
63
63
|
if @v
|
64
64
|
puts "Connection Tally:"
|
@@ -85,7 +85,7 @@ module Retriever
|
|
85
85
|
puts "###############################"
|
86
86
|
puts
|
87
87
|
end
|
88
|
-
def write
|
88
|
+
def write #writes current data collection out to CSV in current directory
|
89
89
|
if @output
|
90
90
|
i = 0
|
91
91
|
CSV.open("#{@output}.csv", "w") do |csv|
|
@@ -104,7 +104,7 @@ module Retriever
|
|
104
104
|
puts
|
105
105
|
end
|
106
106
|
end
|
107
|
-
def async_crawl_and_collect()
|
107
|
+
def async_crawl_and_collect() #iterates over the existing @linkStack, running asyncGetWave on it until we reach the @maxPages value.
|
108
108
|
while (@already_crawled.size < @maxPages)
|
109
109
|
if @linkStack.empty?
|
110
110
|
if @prgrss
|
@@ -116,11 +116,11 @@ module Retriever
|
|
116
116
|
end
|
117
117
|
new_links_arr = self.asyncGetWave()
|
118
118
|
next if (new_links_arr.nil? || new_links_arr.empty?)
|
119
|
-
new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
|
119
|
+
new_link_arr = new_links_arr-@linkStack #set operations to see are these in our previous visited pages arr?
|
120
120
|
@linkStack.concat(new_links_arr).uniq!
|
121
121
|
@data.concat(new_links_arr) if @s
|
122
122
|
end
|
123
|
-
@progressbar.finish if @prgrss
|
123
|
+
@progressbar.finish if @prgrss #if we are done, let's make sure progress bar says we are done
|
124
124
|
end
|
125
125
|
def good_response?(resp, url) #returns true if resp is ok to continue processing, false if we need to 'next' it
|
126
126
|
return false if !resp
|
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchFiles < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target url and RR options, returns an array of all unique files (based on given filetype) found on the site
|
4
4
|
super
|
5
5
|
@data = []
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -21,7 +21,7 @@ module Retriever
|
|
21
21
|
@data.sort_by! {|x| x.length}
|
22
22
|
@data.uniq!
|
23
23
|
end
|
24
|
-
def download_file(path)
|
24
|
+
def download_file(path) #given valid url, downloads file to current directory in /rr-downloads/
|
25
25
|
arr = path.split('/')
|
26
26
|
shortname = arr.pop
|
27
27
|
puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
|
@@ -32,7 +32,7 @@ module Retriever
|
|
32
32
|
end
|
33
33
|
puts " SUCCESS: Download Complete"
|
34
34
|
end
|
35
|
-
def autodownload()
|
35
|
+
def autodownload() #when autodownload option is true, this will automatically go through the fetched file URL collection and download each one.
|
36
36
|
lenny = @data.count
|
37
37
|
puts "###################"
|
38
38
|
puts "### Initiating Autodownload..."
|
data/lib/retriever/fetchseo.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchSEO < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target url and RR options, returns an array of onpage SEO related fields on all unique pages found on the site
|
4
4
|
super
|
5
5
|
@data = []
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Retriever
|
2
2
|
class FetchSitemap < Fetch
|
3
|
-
def initialize(url,options)
|
3
|
+
def initialize(url,options) #receives target URL and RR options, returns an array of all unique pages found on the site
|
4
4
|
super
|
5
5
|
@data = [@t.target]
|
6
6
|
page_one = Retriever::Page.new(@t.source,@t)
|
@@ -18,7 +18,7 @@ module Retriever
|
|
18
18
|
@data.sort_by! {|x| x.length} if @data.size>1
|
19
19
|
@data.uniq!
|
20
20
|
end
|
21
|
-
def gen_xml
|
21
|
+
def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory.
|
22
22
|
f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
|
23
23
|
f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
|
24
24
|
@data.each do |url|
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[RubyRetriever] (http://
|
1
|
+
[RubyRetriever](http://softwarebyjoe.com/rubyretriever/)
|
2
2
|
==============
|
3
3
|
[](http://badge.fury.io/rb/rubyretriever) [](https://travis-ci.org/joenorton/rubyretriever)
|
4
4
|
|
@@ -10,6 +10,8 @@ RubyRetriever uses aynchronous HTTP requests, thanks to eventmachine and Synchro
|
|
10
10
|
|
11
11
|
RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by default - launches up to 10 parallel GET requests at once. This is a feature, do not abuse it. Use at own risk.
|
12
12
|
|
13
|
+
v1.0 Update 6/07/2014 - Includes major code changes and a lot of bug fixes. Much better at dealing with redirects, issues with the host changing, etc. Also added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
|
14
|
+
|
13
15
|
|
14
16
|
getting started
|
15
17
|
-----------
|
@@ -58,7 +60,7 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
|
|
58
60
|
|
59
61
|
Where MODE FLAG is required, and is either:
|
60
62
|
-s, --sitemap FORMAT (only accepts CSV or XML atm)
|
61
|
-
-f, --files FILETYPE
|
63
|
+
-f, --files FILETYPE
|
62
64
|
-e, --seo
|
63
65
|
|
64
66
|
and OPTIONS is the applicable:
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyretriever
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Norton
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-06-
|
11
|
+
date: 2014-06-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-synchrony
|