rubyretriever 1.2.3 → 1.2.4
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/fetch.rb +10 -10
- data/lib/retriever/fetchfiles.rb +6 -6
- data/lib/retriever/fetchseo.rb +2 -2
- data/lib/retriever/fetchsitemap.rb +6 -6
- data/lib/retriever/link.rb +6 -1
- data/lib/retriever/page.rb +1 -1
- data/lib/retriever/target.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +19 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+  data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+  data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
data/bin/rr
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -10,12 +10,12 @@ module Retriever
   #
   class Fetch
     HR = '###############################'
-    attr_reader :max_pages, :t
+    attr_reader :max_pages, :t, :result
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
     # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
-      @
+      @result = []
       @connection_tally = {
         success: 0,
         error: 0,
@@ -52,9 +52,9 @@ module Retriever
       elsif @seo
         puts 'SEO Metrics'
       end
-      puts "Data Dump -- Object Count: #{@
+      puts "Data Dump -- Object Count: #{@result.size}"
       puts HR
-      @
+      @result.each do |line|
         puts line
       end
       puts
@@ -69,13 +69,13 @@ module Retriever
           csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
           i += 1
         end
-        @
+        @result.each do |entry|
           csv << entry
         end
       end
       puts HR
       puts "File Created: #{@output}.csv"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR
       puts
     end
@@ -152,9 +152,9 @@ module Retriever
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
         next unless @sitemap
-        @
+        @result.concat(new_links_arr)
       end
-      @
+      @result.uniq!
     end

     # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
     def push_seo_to_data(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
-      @
+      @result.push(seos)
       lg('--page SEO scraped')
     end

     def push_files_to_data(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
-      @
+      @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
     end

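The `attr_reader :result` added to `Retriever::Fetch` means the collected data is now readable from outside the fetcher rather than only being printed or written to disk. A minimal sketch of how that might be used from library code — the `require` path, the `Retriever::FetchSitemap` class name, and the option keys are assumptions not shown in this diff:

```ruby
require 'retriever'  # assumed entry point of the rubyretriever gem

# Hypothetical options hash; the exact keys accepted by the Fetch
# subclasses are not part of this diff.
opts = { 'maxpages' => 10 }

# The crawl runs inside #initialize (async_crawl_and_collect), so the
# object is fully populated once .new returns.
crawler = Retriever::FetchSitemap.new('http://www.cnet.com', opts)

# New in 1.2.4: the internal collection is exposed via attr_reader :result,
# already sorted and deduplicated by the subclass initializers above.
puts crawler.result.size
crawler.result.each { |url| puts url }
```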
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -6,13 +6,13 @@ module Retriever
     def initialize(url, options)
       super
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
-      @
-      lg("#{@
+      @result.concat(temp_file_collection) if temp_file_collection.size > 0
+      lg("#{@result.size} new files found")

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x.length }
     end

     def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
       puts HR
       puts '### Initiating Autodownload...'
       puts HR
-      puts "#{@
+      puts "#{@result.count} - #{@file_ext}'s Located"
       puts HR
       move_to_download_dir
       iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
     private

     def iterate_thru_collection_and_download
-      lenn = @
-      @
+      lenn = @result.count
+      @result.each_with_index do |entry, i|
         begin
           download_file(entry)
         rescue StandardError
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,12 +6,12 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @
+      @result.push(@page_one.parse_seo)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x[0].length }
     end
   end
 end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,14 +5,14 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @
-      @
+      @result.push(@t.target)
+      @result.concat(@link_stack)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
-      @
+      @result.sort_by! { |x| x.length } if @result.size > 1
+      @result.uniq!
     end

     private
@@ -24,7 +24,7 @@ module Retriever
       f = File.open("sitemap-#{filename}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?>"
       f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      @
+      @result.each do |url|
         f << "<url><loc>#{url}</loc></url>"
       end
       f << '</urlset>'
@@ -35,7 +35,7 @@ module Retriever
     def print_file_info(filename)
       puts HR
       puts "File Created: sitemap-#{filename}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR + "\n"
     end
   end
data/lib/retriever/link.rb
CHANGED
@@ -8,7 +8,12 @@ module Retriever
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(target_scheme, target_host, this_link)
-
+      begin
+        @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+      rescue Addressable::URI::InvalidURIError => e
+        dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy_link.path)
+      end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
data/lib/retriever/page.rb
CHANGED
data/lib/retriever/target.rb
CHANGED
@@ -11,7 +11,7 @@ module Retriever
     def initialize(url, file_re = nil)
       fail 'Bad URL' unless url.include?('.')
       url = "http://#{url}" unless HTTP_RE =~ url
-      target_uri = Addressable::URI.parse(url)
+      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
       @target = target_uri.to_s
       @host = target_uri.host
       @host_re = Regexp.new(@host.sub('www.', ''))
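Both link.rb and target.rb now run URLs through `Addressable::URI.encode` before parsing, and Link#initialize additionally rescues `Addressable::URI::InvalidURIError`. A small illustrative sketch of what the encode-then-parse step does; the example URL is made up:

```ruby
require 'addressable/uri'

raw_link = 'http://example.com/some page?q=ruby retriever'

# 1.2.4 percent-encodes the raw string before parsing, so hrefs containing
# spaces or other unescaped characters no longer raise during a crawl.
uri = Addressable::URI.parse(Addressable::URI.encode(raw_link)).normalize
puts uri.to_s
# => "http://example.com/some%20page?q=ruby%20retriever"

# If parsing still fails, Link#initialize (above) rescues
# Addressable::URI::InvalidURIError and falls back to a link built from the
# crawl target's own host, so one bad href cannot abort the crawl.
```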
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -6,29 +6,33 @@ By Joe Norton
 
 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
-RubyRetriever (RR) uses asynchronous HTTP requests
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
 
-**v1.0 Update (6/07/2014)** - Includes major code changes
-
+**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+Mission
 -------
-RubyRetriever aims to be the best command-line crawling
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
 
-
+Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking pages
-* 3 CLI modes
+* Bloom filter for tracking visited pages
+* 3 CLI modes
+    * Sitemap
+    * File Harvest
+    * SEO
 
-
+Use cases
 ---------
-RubyRetriever can do multiple things for you
+RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
-3. Crawl a target website
+3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
 Help & Forks Welcome!
 
-
+Getting started
 -----------
 Install the gem
 ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
 rr -s csv -p -l 100 http://www.cnet.com
 ```
 
-This would
+This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
 
 **Example: File Harvesting mode**
 ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
 rr -f pdf -p -l 100 http://www.hubspot.com
 ```
 
-This would
+This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
 
 **Example: SEO mode**
 ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
 rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```
 
-This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
 
 
 command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb
 
 License
 -------
-See included 'LICENSE' file. It's the MIT license.
+See included 'LICENSE' file. It's the MIT license.
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.2.4
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony