rubyretriever 1.2.3 → 1.2.4

This diff shows the changes between publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 3bb32aa2e9c8317d2f3cb13572e2cdecb1da24a9
- data.tar.gz: 732e5610104345efed80651929cb9a050e01d9be
+ metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+ data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
  SHA512:
- metadata.gz: 3d4e109785452db3906dc7b66158846cda24e4c3e1b942f600918338e141d6a337f1f9b3087b94b2561c64095fcdc2f2fb439d29b73574a2ddae501a8f0d965b
- data.tar.gz: 2e0befea22dfc2bc689d15ad3c33efaf015f7b1ee5c53a322cccca7f6394a4def445e362585600e1934f770377ce193c5a762caa3639ccda480f7c481ce64d64
+ metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+ data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
data/bin/rr CHANGED
@@ -56,7 +56,7 @@ if ARGV[0].nil?
  end
 
  ARGV.each do|q|
- if options[:verbose]
+ if options['verbose']
  puts '###############################'
  puts '### [RubyRetriever]'
  puts '### Creating Sitemap' if options['sitemap']
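
The `options[:verbose]` → `options['verbose']` fix above matters because Ruby hashes do not treat symbol and string keys as interchangeable; the surrounding script reads string keys (e.g. `options['sitemap']`), so the old symbol lookup always returned `nil` and the verbose branch never ran. A minimal illustration (the hash contents here are made up):

```ruby
# Illustration only: Hash#[] matches keys exactly, so a symbol key never
# matches an entry that was stored under a string key.
options = { 'verbose' => true, 'sitemap' => true }

options[:verbose]   # => nil  -- the old lookup; the branch never executed
options['verbose']  # => true -- the corrected lookup
```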
@@ -10,12 +10,12 @@ module Retriever
  #
  class Fetch
  HR = '###############################'
- attr_reader :max_pages, :t
+ attr_reader :max_pages, :t, :result
  # given target URL and RR options, creates a fetch object.
  # There is no direct output
  # this is a parent class that the other fetch classes build off of.
  def initialize(url, options)
- @data = []
+ @result = []
  @connection_tally = {
  success: 0,
  error: 0,
@@ -52,9 +52,9 @@ module Retriever
  elsif @seo
  puts 'SEO Metrics'
  end
- puts "Data Dump -- Object Count: #{@data.size}"
+ puts "Data Dump -- Object Count: #{@result.size}"
  puts HR
- @data.each do |line|
+ @result.each do |line|
  puts line
  end
  puts
@@ -69,13 +69,13 @@ module Retriever
  csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
  i += 1
  end
- @data.each do |entry|
+ @result.each do |entry|
  csv << entry
  end
  end
  puts HR
  puts "File Created: #{@output}.csv"
- puts "Object Count: #{@data.size}"
+ puts "Object Count: #{@result.size}"
  puts HR
  puts
  end
@@ -152,9 +152,9 @@ module Retriever
  next if new_links_arr.nil? || new_links_arr.empty?
  @link_stack.concat(new_links_arr)
  next unless @sitemap
- @data.concat(new_links_arr)
+ @result.concat(new_links_arr)
  end
- @data.uniq!
+ @result.uniq!
  end
 
  # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
  def push_seo_to_data(url, new_page)
  seos = [url]
  seos.concat(new_page.parse_seo)
- @data.push(seos)
+ @result.push(seos)
  lg('--page SEO scraped')
  end
 
  def push_files_to_data(new_page)
  filez = new_page.parse_files(new_page.parse_internal)
- @data.concat(filez) unless filez.empty?
+ @result.concat(filez) unless filez.empty?
  lg("--#{filez.size} files found")
  end
 
@@ -6,13 +6,13 @@ module Retriever
  def initialize(url, options)
  super
  temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
- @data.concat(temp_file_collection) if temp_file_collection.size > 0
- lg("#{@data.size} new files found")
+ @result.concat(temp_file_collection) if temp_file_collection.size > 0
+ lg("#{@result.size} new files found")
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x.length }
+ @result.sort_by! { |x| x.length }
  end
 
  def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
  puts HR
  puts '### Initiating Autodownload...'
  puts HR
- puts "#{@data.count} - #{@file_ext}'s Located"
+ puts "#{@result.count} - #{@file_ext}'s Located"
  puts HR
  move_to_download_dir
  iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
  private
 
  def iterate_thru_collection_and_download
- lenn = @data.count
- @data.each_with_index do |entry, i|
+ lenn = @result.count
+ @result.each_with_index do |entry, i|
  begin
  download_file(entry)
  rescue StandardError
@@ -6,12 +6,12 @@ module Retriever
  # on all unique pages found on the site
  def initialize(url, options)
  super
- @data.push(@page_one.parse_seo)
+ @result.push(@page_one.parse_seo)
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x[0].length }
+ @result.sort_by! { |x| x[0].length }
  end
  end
  end
@@ -5,14 +5,14 @@ module Retriever
  # returns an array of all unique pages found on the site
  def initialize(url, options)
  super
- @data.push(@t.target)
- @data.concat(@link_stack)
+ @result.push(@t.target)
+ @result.concat(@link_stack)
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x.length } if @data.size > 1
- @data.uniq!
+ @result.sort_by! { |x| x.length } if @result.size > 1
+ @result.uniq!
  end
 
  private
@@ -24,7 +24,7 @@ module Retriever
  f = File.open("sitemap-#{filename}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?>"
  f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- @data.each do |url|
+ @result.each do |url|
  f << "<url><loc>#{url}</loc></url>"
  end
  f << '</urlset>'
@@ -35,7 +35,7 @@ module Retriever
  def print_file_info(filename)
  puts HR
  puts "File Created: sitemap-#{filename}.xml"
- puts "Object Count: #{@data.size}"
+ puts "Object Count: #{@result.size}"
  puts HR + "\n"
  end
  end
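
Taken together, the hunks above rename the internal `@data` array to `@result` and expose it through the new `attr_reader`, so the crawl output can be read programmatically rather than only through the stdout/CSV dumps. A hedged usage sketch (the `Retriever::FetchSitemap` class name and the `'maxpages'` option key are assumptions based on the gem's documented sitemap mode, not shown in this diff):

```ruby
# Hypothetical caller: the fetch classes crawl inside #initialize, so the
# result list is populated as soon as the object is constructed.
require 'retriever'

fetch = Retriever::FetchSitemap.new('http://www.example.com', 'maxpages' => 10)
fetch.result.each { |url| puts url }  # the de-duplicated URL list the crawl built
```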
@@ -8,7 +8,12 @@ module Retriever
  WWW_DOT_RE = Regexp.new(/^www\./i).freeze
 
  def initialize(target_scheme, target_host, this_link)
- @link_uri = Addressable::URI.parse(this_link)
+ begin
+ @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+ rescue Addressable::URI::InvalidURIError => e
+ dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+ @link_uri = Addressable::URI.parse(dummy_link.path)
+ end
  @scheme = target_scheme
  @host = target_host
  @this_link = @link_uri.to_s
@@ -52,7 +52,7 @@ module Retriever
  end
 
  def parse_internal
- links.select { |x| @t.host == Addressable::URI.parse(x).host }
+ links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host }
  end
 
  def parse_internal_visitable
@@ -11,7 +11,7 @@ module Retriever
  def initialize(url, file_re = nil)
  fail 'Bad URL' unless url.include?('.')
  url = "http://#{url}" unless HTTP_RE =~ url
- target_uri = Addressable::URI.parse(url)
+ target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
  @target = target_uri.to_s
  @host = target_uri.host
  @host_re = Regexp.new(@host.sub('www.', ''))
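
The Link, Page#parse_internal, and Target hunks above all adopt the same pattern: percent-encode a raw link with `Addressable::URI.encode` before parsing it, so URLs containing spaces or other unescaped characters survive parsing and host comparison instead of raising `Addressable::URI::InvalidURIError` (hence the new rescue in the Link constructor). A minimal sketch of the pattern using the addressable gem the project already depends on (the example URL is made up):

```ruby
require 'addressable/uri'

raw = 'http://example.com/about us?q=ruby retriever'  # unescaped spaces
uri = Addressable::URI.parse(Addressable::URI.encode(raw)).normalize

uri.to_s  # => "http://example.com/about%20us?q=ruby%20retriever"
uri.host  # => "example.com" -- the value parse_internal compares against @t.host
```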
@@ -1,4 +1,4 @@
  #
  module Retriever
- VERSION = '1.2.3'
+ VERSION = '1.2.4'
  end
data/readme.md CHANGED
@@ -6,29 +6,33 @@ By Joe Norton
 
  RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
- RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+ RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
 
- **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
- mission
+ **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+ Mission
  -------
- RubyRetriever aims to be the best command-line crawling, and scraping package written in Ruby.
+ RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
 
- features
+ Features
  --------
  * Asynchronous HTTP Requests thru EM & Synchrony
- * Bloom filter for tracking pages visited.
- * 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
+ * Bloom filter for tracking visited pages
+ * 3 CLI modes
+ * Sitemap
+ * File Harvest
+ * SEO
 
- use-cases
+ Use cases
  ---------
- RubyRetriever can do multiple things for you, with a single command at the terminal RR can:
+ RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
  1. Crawl your website and output a *valid XML sitemap* based on what it found.
  2. Crawl a target website and *download all files of a given filetype*.
- 3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.
+ 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
  Help & Forks Welcome!
 
- getting started
+ Getting started
  -----------
  Install the gem
  ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
  rr -s csv -p -l 100 http://www.cnet.com
  ```
 
- This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet. Optionally, we can also use the format XML and then rubyretriever would output that same URL list into a valid XML sitemap that can be submitted to Google -- but that is not what this current example would do.
+ This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
 
  **Example: File Harvesting mode**
  ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
  rr -f pdf -p -l 100 http://www.hubspot.com
  ```
 
- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+ This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
 
  **Example: SEO mode**
  ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
  rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```
 
- This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
+ This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
 
 
  command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb
 
  License
  -------
- See included 'LICENSE' file. It's the MIT license.
+ See included 'LICENSE' file. It's the MIT license.
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.2.3
+ version: 1.2.4
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-14 00:00:00.000000000 Z
+ date: 2014-06-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony