rubyretriever 1.4.1 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9b337916fddfc246a2b3cd73bdd40cfe13eccb9a
4
- data.tar.gz: 908d0a752ab89adaa9c8530ae7803329c14591ce
3
+ metadata.gz: dca26f55e68ab2c5095d9e633592735dc67323e9
4
+ data.tar.gz: 4e96d543d7ea4001ef4531db3beb5bf3e8accf4d
5
5
  SHA512:
6
- metadata.gz: 7bf4c80012e0232b6bea179b5e41ab36c8066e7b0bb6da39b0eaedd0a6b67a6b826754f6f4e94d9dd21287ebc05579ebc759b7887ed29218c82eccb68b31fd79
7
- data.tar.gz: 2bc3158bf98beb2c2b701b7811b774c1ff32236899683b6486bf2d1680ddbb87e8ebbb7aaa27eb8aa968a20f63158640c9d45c1d38c992484a9920069b38b976
6
+ metadata.gz: f08c1d9c7c2395cfc297f1c0abb80760d08d35d24394d381b703d982709757ffc9072e55a60e15b123e45ae6731a388cb2afa914f2a136ab3fce454af11b7b8c
7
+ data.tar.gz: 0f75a30cb8a5f969d7bc196c6f83e0f25ddd0f76ca2a931aefb6206180172dbb6c4dbb5d4b926d69e75555dd0eec3a8b1f8a528cf18799cca31af9a753e89fc5
@@ -16,8 +16,6 @@ module Retriever
16
16
  @result.uniq!
17
17
  end
18
18
 
19
- private
20
-
21
19
  # produces valid XML sitemap based on page collection fetched.
22
20
  # Writes to current directory.
23
21
  def gen_xml
@@ -33,6 +31,8 @@ module Retriever
33
31
  print_file_info(filename)
34
32
  end
35
33
 
34
+ private
35
+
36
36
  def print_file_info(filename)
37
37
  puts HR
38
38
  puts "File Created: sitemap-#{filename}.xml"
@@ -5,6 +5,7 @@ using SourceString
5
5
  module Retriever
6
6
  #
7
7
  class Page
8
+ HASH_RE = Regexp.new(/^#/i).freeze
8
9
  HTTP_RE = Regexp.new(/^http/i).freeze
9
10
  H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
10
11
  H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
@@ -50,6 +51,7 @@ module Retriever
50
51
  # filter some malformed URLS that come in
51
52
  # meant to be a loose filter to catch all reasonable HREF attributes.
52
53
  link = match[0]
54
+ next if HASH_RE =~ link
53
55
  Link.new(@t.scheme, @t.host, link, @url).path
54
56
  end.compact.uniq
55
57
  end
@@ -1,4 +1,4 @@
1
1
  #
2
2
  module Retriever
3
- VERSION = '1.4.1'
3
+ VERSION = '1.4.2'
4
4
  end
data/readme.md CHANGED
@@ -8,6 +8,8 @@ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command
8
8
 
9
9
  RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory-efficient manner.
10
10
 
11
+ **v1.4.2 Update (3/24/2016)** - Fixes a problem with named anchors (in-page `#fragment` hrefs such as `#top`, e.g. pointing at divs) being counted as links.
12
+
11
13
  **v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
12
14
 
13
15
  **v1.4.0 Update (3/24/2016)** - Several bug fixes.
data/spec/page_spec.rb CHANGED
@@ -27,9 +27,10 @@ describe 'Page' do
27
27
  end
28
28
 
29
29
  describe '#links' do
30
- let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
31
- it 'collects all unique href links on the page' do
32
- expect(page.links.size).to eq(4)
30
+ let(:source) { "<a href='/profile/'>profile</a><a href='#top'>top</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
31
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
32
+ it 'collects all unique href links on the page, skips div anchors' do
33
+ expect(page.links.size).to eq(2)
33
34
  end
34
35
  end
35
36
 
@@ -42,11 +43,11 @@ describe 'Page' do
42
43
  end
43
44
 
44
45
  describe '#parse_internal_visitable' do
45
- let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
46
+ let(:source) { "<a href='/profile/'>profile</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
46
47
  let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
47
48
  let(:links) { page.parse_internal_visitable }
48
49
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
49
- expect(links.size).to eq(0)
50
+ expect(links.size).to eq(1)
50
51
  end
51
52
  end
52
53
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rubyretriever
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.1
4
+ version: 1.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Norton