rubyretriever 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rr +1 -1
- data/lib/retriever/fetch.rb +10 -10
- data/lib/retriever/fetchfiles.rb +6 -6
- data/lib/retriever/fetchseo.rb +2 -2
- data/lib/retriever/fetchsitemap.rb +6 -6
- data/lib/retriever/link.rb +6 -1
- data/lib/retriever/page.rb +1 -1
- data/lib/retriever/target.rb +1 -1
- data/lib/retriever/version.rb +1 -1
- data/readme.md +19 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+  data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+  data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
data/bin/rr
CHANGED
data/lib/retriever/fetch.rb
CHANGED
@@ -10,12 +10,12 @@ module Retriever
   #
   class Fetch
     HR = '###############################'
-    attr_reader :max_pages, :t
+    attr_reader :max_pages, :t, :result
     # given target URL and RR options, creates a fetch object.
     # There is no direct output
    # this is a parent class that the other fetch classes build off of.
     def initialize(url, options)
-      @
+      @result = []
       @connection_tally = {
         success: 0,
         error: 0,
@@ -52,9 +52,9 @@ module Retriever
       elsif @seo
         puts 'SEO Metrics'
       end
-      puts "Data Dump -- Object Count: #{@
+      puts "Data Dump -- Object Count: #{@result.size}"
       puts HR
-      @
+      @result.each do |line|
         puts line
       end
       puts
@@ -69,13 +69,13 @@ module Retriever
           csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
           i += 1
         end
-        @
+        @result.each do |entry|
           csv << entry
         end
       end
       puts HR
       puts "File Created: #{@output}.csv"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR
       puts
     end
@@ -152,9 +152,9 @@ module Retriever
         next if new_links_arr.nil? || new_links_arr.empty?
         @link_stack.concat(new_links_arr)
         next unless @sitemap
-        @
+        @result.concat(new_links_arr)
       end
-      @
+      @result.uniq!
     end

     # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
     def push_seo_to_data(url, new_page)
       seos = [url]
       seos.concat(new_page.parse_seo)
-      @
+      @result.push(seos)
       lg('--page SEO scraped')
     end

     def push_files_to_data(new_page)
       filez = new_page.parse_files(new_page.parse_internal)
-      @
+      @result.concat(filez) unless filez.empty?
       lg("--#{filez.size} files found")
     end
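The core of this file's change is a rename of the crawler's internal collection to `@result`, now exposed read-only through `attr_reader`, so callers can read crawl output off the fetch object instead of reaching into instance state. A toy stand-in for that pattern (a hypothetical class for illustration, not the gem's actual `Fetch`, which also crawls asynchronously):

```ruby
# Minimal sketch of the attr_reader pattern adopted above; TinyFetch is a
# hypothetical stand-in, not part of rubyretriever.
class TinyFetch
  attr_reader :result

  def initialize(urls)
    @result = []          # collected results live in one internal array
    @result.concat(urls)
    @result.uniq!         # de-duplicate, as the crawler does after collecting
  end
end

fetch = TinyFetch.new(['http://example.com/', 'http://example.com/', 'http://example.com/about'])
puts fetch.result         # readable from outside via the exposed reader
```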
data/lib/retriever/fetchfiles.rb
CHANGED
@@ -6,13 +6,13 @@ module Retriever
     def initialize(url, options)
       super
       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
-      @
-      lg("#{@
+      @result.concat(temp_file_collection) if temp_file_collection.size > 0
+      lg("#{@result.size} new files found")

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x.length }
     end

     def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
       puts HR
       puts '### Initiating Autodownload...'
       puts HR
-      puts "#{@
+      puts "#{@result.count} - #{@file_ext}'s Located"
       puts HR
       move_to_download_dir
       iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
     private

     def iterate_thru_collection_and_download
-      lenn = @
-      @
+      lenn = @result.count
+      @result.each_with_index do |entry, i|
         begin
           download_file(entry)
         rescue StandardError
data/lib/retriever/fetchseo.rb
CHANGED
@@ -6,12 +6,12 @@ module Retriever
     # on all unique pages found on the site
     def initialize(url, options)
       super
-      @
+      @result.push(@page_one.parse_seo)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
+      @result.sort_by! { |x| x[0].length }
     end
   end
 end
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -5,14 +5,14 @@ module Retriever
     # returns an array of all unique pages found on the site
     def initialize(url, options)
       super
-      @
-      @
+      @result.push(@t.target)
+      @result.concat(@link_stack)

       async_crawl_and_collect
       # done, make sure progress bar says we are done
       @progressbar.finish if @progress
-      @
-      @
+      @result.sort_by! { |x| x.length } if @result.size > 1
+      @result.uniq!
     end

     private
@@ -24,7 +24,7 @@ module Retriever
       f = File.open("sitemap-#{filename}.xml", 'w+')
       f << "<?xml version='1.0' encoding='UTF-8'?>"
       f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
-      @
+      @result.each do |url|
         f << "<url><loc>#{url}</loc></url>"
       end
       f << '</urlset>'
@@ -35,7 +35,7 @@ module Retriever
     def print_file_info(filename)
       puts HR
       puts "File Created: sitemap-#{filename}.xml"
-      puts "Object Count: #{@
+      puts "Object Count: #{@result.size}"
       puts HR + "\n"
     end
   end
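For reference, the loop above emits one `<url>` element per collected page. A standalone sketch of the same output format, with a fixed list standing in for the crawler's `@result`:

```ruby
# Mirrors the sitemap-writing loop in the diff above, using a hard-coded
# URL list in place of the crawler's @result array.
result = ['http://www.example.com/', 'http://www.example.com/about']

xml = "<?xml version='1.0' encoding='UTF-8'?>"
xml << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
result.each { |url| xml << "<url><loc>#{url}</loc></url>" }
xml << '</urlset>'

puts xml
```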
data/lib/retriever/link.rb
CHANGED
@@ -8,7 +8,12 @@ module Retriever
     WWW_DOT_RE = Regexp.new(/^www\./i).freeze

     def initialize(target_scheme, target_host, this_link)
-
+      begin
+        @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+      rescue Addressable::URI::InvalidURIError => e
+        dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy_link.path)
+      end
       @scheme = target_scheme
       @host = target_host
       @this_link = @link_uri.to_s
data/lib/retriever/page.rb
CHANGED
data/lib/retriever/target.rb
CHANGED
@@ -11,7 +11,7 @@ module Retriever
     def initialize(url, file_re = nil)
       fail 'Bad URL' unless url.include?('.')
       url = "http://#{url}" unless HTTP_RE =~ url
-      target_uri = Addressable::URI.parse(url)
+      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
       @target = target_uri.to_s
       @host = target_uri.host
       @host_re = Regexp.new(@host.sub('www.', ''))
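Both link.rb and target.rb now percent-encode the raw string before handing it to `Addressable::URI.parse`, and link.rb additionally rescues `InvalidURIError` by falling back to the target host. A simplified sketch of the idea, assuming the addressable gem is installed (the fallback here parses the host directly rather than building a dummy `Retriever::Link` as the gem does):

```ruby
# Encode-then-parse, with a condensed fallback for unparseable links.
require 'addressable/uri'

def parse_link(this_link, target_host)
  Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
rescue Addressable::URI::InvalidURIError
  Addressable::URI.parse(target_host)
end

puts parse_link('http://www.example.com/a page with spaces', 'www.example.com')
# => http://www.example.com/a%20page%20with%20spaces
```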
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -6,29 +6,33 @@ By Joe Norton

 RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.

-RubyRetriever (RR) uses asynchronous HTTP requests
+RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.

-**v1.0 Update (6/07/2014)** - Includes major code changes
-
+**v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+Mission
 -------
-RubyRetriever aims to be the best command-line crawling
+RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.

-
+Features
 --------
 * Asynchronous HTTP Requests thru EM & Synchrony
-* Bloom filter for tracking pages
-* 3 CLI modes
+* Bloom filter for tracking visited pages
+* 3 CLI modes
+  * Sitemap
+  * File Harvest
+  * SEO

-
+Use cases
 ---------
-RubyRetriever can do multiple things for you
+RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
 1. Crawl your website and output a *valid XML sitemap* based on what it found.
 2. Crawl a target website and *download all files of a given filetype*.
-3. Crawl a target website
+3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.

 Help & Forks Welcome!

-
+Getting started
 -----------
 Install the gem
 ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
 rr -s csv -p -l 100 http://www.cnet.com
 ```

-This would
+This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

 **Example: File Harvesting mode**
 ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
 rr -f pdf -p -l 100 http://www.hubspot.com
 ```

-This would
+This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

 **Example: SEO mode**
 ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
 rr -e -p -l 10 -o cnet-seo http://www.cnet.com
 ```

-This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would
+This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


 command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb

 License
 -------
-See included 'LICENSE' file. It's the MIT license.
+See included 'LICENSE' file. It's the MIT license.
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rubyretriever
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.2.4
 platform: ruby
 authors:
 - Joe Norton
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-06-
+date: 2014-06-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-synchrony