rubyretriever 1.2.3 → 1.2.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 3bb32aa2e9c8317d2f3cb13572e2cdecb1da24a9
- data.tar.gz: 732e5610104345efed80651929cb9a050e01d9be
+ metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
+ data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
  SHA512:
- metadata.gz: 3d4e109785452db3906dc7b66158846cda24e4c3e1b942f600918338e141d6a337f1f9b3087b94b2561c64095fcdc2f2fb439d29b73574a2ddae501a8f0d965b
- data.tar.gz: 2e0befea22dfc2bc689d15ad3c33efaf015f7b1ee5c53a322cccca7f6394a4def445e362585600e1934f770377ce193c5a762caa3639ccda480f7c481ce64d64
+ metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
+ data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
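These entries are the hex digests RubyGems records for the gem's inner archives. For reference, a minimal sketch of how such values are computed, assuming the .gem has already been unpacked (a .gem is a plain tar, e.g. `tar -xf rubyretriever-1.2.4.gem`) so that metadata.gz and data.tar.gz sit in the working directory:

```ruby
require 'digest'

# Sketch only: compute the same style of checksums recorded in checksums.yaml.
# File names assume an unpacked .gem archive in the current directory.
%w[metadata.gz data.tar.gz].each do |name|
  bytes = File.binread(name)
  puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```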
data/bin/rr CHANGED
@@ -56,7 +56,7 @@ if ARGV[0].nil?
  end
 
  ARGV.each do|q|
- if options[:verbose]
+ if options['verbose']
  puts '###############################'
  puts '### [RubyRetriever]'
  puts '### Creating Sitemap' if options['sitemap']
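This hunk switches the verbose check to a string key, matching the string keys used elsewhere in the CLI (e.g. options['sitemap'] on the context line above); a symbol lookup against a string-keyed hash quietly returns nil. A minimal sketch of the pattern, assuming an OptionParser-built hash along the lines of bin/rr (the flag wiring here is illustrative, not the gem's actual parser):

```ruby
require 'optparse'

# Sketch only: build a string-keyed options hash so later lookups such as
# options['verbose'] and options['sitemap'] all use the same key type.
options = {}
OptionParser.new do |opts|
  opts.on('-v', '--verbose', 'Print crawl progress') { options['verbose'] = true }
  opts.on('-s', '--sitemap FORMAT', 'Sitemap mode')  { |fmt| options['sitemap'] = fmt }
end.parse!(ARGV)

puts 'verbose output enabled' if options['verbose'] # options[:verbose] would be nil here
```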
@@ -10,12 +10,12 @@ module Retriever
  #
  class Fetch
  HR = '###############################'
- attr_reader :max_pages, :t
+ attr_reader :max_pages, :t, :result
  # given target URL and RR options, creates a fetch object.
  # There is no direct output
  # this is a parent class that the other fetch classes build off of.
  def initialize(url, options)
- @data = []
+ @result = []
  @connection_tally = {
  success: 0,
  error: 0,
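Renaming @data to @result and exposing it through attr_reader gives callers a public reader for the crawl output once a fetch finishes. A small usage sketch; the require path, fetcher class name, and option key below are assumptions not confirmed by this diff (the (url, options) constructor signature is shown in it):

```ruby
require 'retriever' # require path assumed

# Sketch only: read crawl output through the new public `result` reader
# instead of reaching into an internal array.
fetch = Retriever::FetchSitemap.new('http://example.com', 'maxpages' => 10)
fetch.result.each { |url| puts url }
```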
@@ -52,9 +52,9 @@ module Retriever
  elsif @seo
  puts 'SEO Metrics'
  end
- puts "Data Dump -- Object Count: #{@data.size}"
+ puts "Data Dump -- Object Count: #{@result.size}"
  puts HR
- @data.each do |line|
+ @result.each do |line|
  puts line
  end
  puts
@@ -69,13 +69,13 @@ module Retriever
  csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
  i += 1
  end
- @data.each do |entry|
+ @result.each do |entry|
  csv << entry
  end
  end
  puts HR
  puts "File Created: #{@output}.csv"
- puts "Object Count: #{@data.size}"
+ puts "Object Count: #{@result.size}"
  puts HR
  puts
  end
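For context, the writer above emits one header row followed by one row per collected entry. A self-contained sketch with made-up data, using Ruby's stdlib CSV (the gem's actual writer setup may differ):

```ruby
require 'csv'

# Sketch only: header row, then one row per scraped entry.
result = [
  ['http://example.com/', 'Example Title', 'A demo description', 'Main H1', 'First H2']
]
CSV.open('example-seo.csv', 'w') do |csv|
  csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
  result.each { |entry| csv << entry }
end
```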
@@ -152,9 +152,9 @@ module Retriever
  next if new_links_arr.nil? || new_links_arr.empty?
  @link_stack.concat(new_links_arr)
  next unless @sitemap
- @data.concat(new_links_arr)
+ @result.concat(new_links_arr)
  end
- @data.uniq!
+ @result.uniq!
  end
 
  # returns true is resp is ok to continue
@@ -193,13 +193,13 @@ module Retriever
  def push_seo_to_data(url, new_page)
  seos = [url]
  seos.concat(new_page.parse_seo)
- @data.push(seos)
+ @result.push(seos)
  lg('--page SEO scraped')
  end
 
  def push_files_to_data(new_page)
  filez = new_page.parse_files(new_page.parse_internal)
- @data.concat(filez) unless filez.empty?
+ @result.concat(filez) unless filez.empty?
  lg("--#{filez.size} files found")
  end
 
@@ -6,13 +6,13 @@ module Retriever
  def initialize(url, options)
  super
  temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
- @data.concat(temp_file_collection) if temp_file_collection.size > 0
- lg("#{@data.size} new files found")
+ @result.concat(temp_file_collection) if temp_file_collection.size > 0
+ lg("#{@result.size} new files found")
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x.length }
+ @result.sort_by! { |x| x.length }
  end
 
  def download_file(path)
@@ -33,7 +33,7 @@ module Retriever
  puts HR
  puts '### Initiating Autodownload...'
  puts HR
- puts "#{@data.count} - #{@file_ext}'s Located"
+ puts "#{@result.count} - #{@file_ext}'s Located"
  puts HR
  move_to_download_dir
  iterate_thru_collection_and_download
@@ -43,8 +43,8 @@ module Retriever
  private
 
  def iterate_thru_collection_and_download
- lenn = @data.count
- @data.each_with_index do |entry, i|
+ lenn = @result.count
+ @result.each_with_index do |entry, i|
  begin
  download_file(entry)
  rescue StandardError
@@ -6,12 +6,12 @@ module Retriever
  # on all unique pages found on the site
  def initialize(url, options)
  super
- @data.push(@page_one.parse_seo)
+ @result.push(@page_one.parse_seo)
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x[0].length }
+ @result.sort_by! { |x| x[0].length }
  end
  end
  end
@@ -5,14 +5,14 @@ module Retriever
  # returns an array of all unique pages found on the site
  def initialize(url, options)
  super
- @data.push(@t.target)
- @data.concat(@link_stack)
+ @result.push(@t.target)
+ @result.concat(@link_stack)
 
  async_crawl_and_collect
  # done, make sure progress bar says we are done
  @progressbar.finish if @progress
- @data.sort_by! { |x| x.length } if @data.size > 1
- @data.uniq!
+ @result.sort_by! { |x| x.length } if @result.size > 1
+ @result.uniq!
  end
 
  private
@@ -24,7 +24,7 @@ module Retriever
  f = File.open("sitemap-#{filename}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?>"
  f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- @data.each do |url|
+ @result.each do |url|
  f << "<url><loc>#{url}</loc></url>"
  end
  f << '</urlset>'
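The writer above serializes whatever URLs have accumulated in @result into a flat urlset. A standalone sketch of the same pattern with placeholder URLs, showing the shape of the generated file:

```ruby
# Sketch only: produces a single-line file with this content (wrapped here
# for readability):
#   <?xml version='1.0' encoding='UTF-8'?>
#   <urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>
#   <url><loc>http://example.com/</loc></url>
#   <url><loc>http://example.com/about</loc></url>
#   </urlset>
urls = ['http://example.com/', 'http://example.com/about']
File.open('sitemap-example.xml', 'w+') do |f|
  f << "<?xml version='1.0' encoding='UTF-8'?>"
  f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
  urls.each { |url| f << "<url><loc>#{url}</loc></url>" }
  f << '</urlset>'
end
```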
@@ -35,7 +35,7 @@ module Retriever
  def print_file_info(filename)
  puts HR
  puts "File Created: sitemap-#{filename}.xml"
- puts "Object Count: #{@data.size}"
+ puts "Object Count: #{@result.size}"
  puts HR + "\n"
  end
  end
@@ -8,7 +8,12 @@ module Retriever
  WWW_DOT_RE = Regexp.new(/^www\./i).freeze
 
  def initialize(target_scheme, target_host, this_link)
- @link_uri = Addressable::URI.parse(this_link)
+ begin
+ @link_uri = Addressable::URI.parse(Addressable::URI.encode(this_link)).normalize
+ rescue Addressable::URI::InvalidURIError => e
+ dummy_link = Retriever::Link.new(target_scheme, target_host, target_host)
+ @link_uri = Addressable::URI.parse(dummy_link.path)
+ end
  @scheme = target_scheme
  @host = target_host
  @this_link = @link_uri.to_s
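Links are now percent-encoded and normalized before parsing, and anything Addressable still rejects falls back to a link built from the target host. A small standalone illustration of the encode-then-normalize step (the URL is made up):

```ruby
require 'addressable/uri'

# Sketch only: mirror the parse path used in the hunk above. A raw href with
# an unencoded space is percent-encoded before parsing, then normalized.
raw = 'http://example.com/some page'
uri = Addressable::URI.parse(Addressable::URI.encode(raw)).normalize
puts uri.to_s # => "http://example.com/some%20page"
```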
@@ -52,7 +52,7 @@ module Retriever
  end
 
  def parse_internal
- links.select { |x| @t.host == Addressable::URI.parse(x).host }
+ links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host }
  end
 
  def parse_internal_visitable
@@ -11,7 +11,7 @@ module Retriever
  def initialize(url, file_re = nil)
  fail 'Bad URL' unless url.include?('.')
  url = "http://#{url}" unless HTTP_RE =~ url
- target_uri = Addressable::URI.parse(url)
+ target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
  @target = target_uri.to_s
  @host = target_uri.host
  @host_re = Regexp.new(@host.sub('www.', ''))
@@ -1,4 +1,4 @@
  #
  module Retriever
- VERSION = '1.2.3'
+ VERSION = '1.2.4'
  end
data/readme.md CHANGED
@@ -6,29 +6,33 @@ By Joe Norton
 
  RubyRetriever is a Web Crawler, Site Mapper, File Harvester & Autodownloader.
 
- RubyRetriever (RR) uses asynchronous HTTP requests, thanks to [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony), to crawl webpages *very quickly*. Another neat thing about RR, is it uses a ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
+ RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled.
 
- **v1.0 Update (6/07/2014)** - Includes major code changes, a lot of bug fixes. Much better in dealing with redirects, and issues with the host changing, etc. Also, added the SEO mode -- which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility -- and thus, this was update 1.0!
- mission
+ **v1.0 Update (6/07/2014)** - Includes major code changes and a lot of bug fixes. It's now much better in dealing with redirects, issues with the host changing, etc. Also added the SEO mode, which grabs a number of key SEO components from every page on a site. Lastly, this update was so extensive that I could not ensure backward compatibility; thus, this was update 1.0!
+
+ Mission
  -------
- RubyRetriever aims to be the best command-line crawling, and scraping package written in Ruby.
+ RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby.
 
- features
+ Features
  --------
  * Asynchronous HTTP Requests thru EM & Synchrony
- * Bloom filter for tracking pages visited.
- * 3 CLI modes: 1) Sitemap, 2) File Harvest, 3) SEO
+ * Bloom filter for tracking visited pages
+ * 3 CLI modes
+ * Sitemap
+ * File Harvest
+ * SEO
 
- use-cases
+ Use cases
  ---------
- RubyRetriever can do multiple things for you, with a single command at the terminal RR can:
+ RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
  1. Crawl your website and output a *valid XML sitemap* based on what it found.
  2. Crawl a target website and *download all files of a given filetype*.
- 3. Crawl a target website and *collect important SEO information* such as page titles, meta descriptions, h1 tags, etc. and write it to CSV.
+ 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
 
  Help & Forks Welcome!
 
- getting started
+ Getting started
  -----------
  Install the gem
  ```sh
@@ -44,7 +48,7 @@ OR -- SAME COMMAND
  rr -s csv -p -l 100 http://www.cnet.com
  ```
 
- This would go to http://www.cnet.com and map it until it crawled a max of 100 pages, and then it would write it out to a csv named cnet. Optionally, we can also use the format XML and then rubyretriever would output that same URL list into a valid XML sitemap that can be submitted to Google -- but that is not what this current example would do.
+ This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
 
  **Example: File Harvesting mode**
  ```sh
@@ -55,7 +59,7 @@ OR -- SAME COMMAND
  rr -f pdf -p -l 100 http://www.hubspot.com
  ```
 
- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+ This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
 
  **Example: SEO mode**
  ```sh
@@ -66,7 +70,7 @@ OR -- SAME COMMAND
  rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```
 
- This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
+ This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
 
 
  command-line arguments
@@ -93,4 +97,4 @@ bloomfilter-rb
 
  License
  -------
- See included 'LICENSE' file. It's the MIT license.
+ See included 'LICENSE' file. It's the MIT license.
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.2.3
+ version: 1.2.4
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-14 00:00:00.000000000 Z
+ date: 2014-06-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony