rubyretriever 1.4.1 → 1.4.2
- checksums.yaml +4 -4
- data/lib/retriever/fetchsitemap.rb +2 -2
- data/lib/retriever/page.rb +2 -0
- data/lib/retriever/version.rb +1 -1
- data/readme.md +2 -0
- data/spec/page_spec.rb +6 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dca26f55e68ab2c5095d9e633592735dc67323e9
+  data.tar.gz: 4e96d543d7ea4001ef4531db3beb5bf3e8accf4d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f08c1d9c7c2395cfc297f1c0abb80760d08d35d24394d381b703d982709757ffc9072e55a60e15b123e45ae6731a388cb2afa914f2a136ab3fce454af11b7b8c
+  data.tar.gz: 0f75a30cb8a5f969d7bc196c6f83e0f25ddd0f76ca2a931aefb6206180172dbb6c4dbb5d4b926d69e75555dd0eec3a8b1f8a528cf18799cca31af9a753e89fc5
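The new checksums can be reproduced locally. A minimal sketch, assuming the fetched .gem archive has been unpacked first (a .gem is a tar containing metadata.gz, data.tar.gz, and checksums.yaml.gz); the file names mirror the keys above:

require 'digest'

# Compare these digests against the SHA1/SHA512 entries in checksums.yaml
puts Digest::SHA1.file('metadata.gz').hexdigest
puts Digest::SHA1.file('data.tar.gz').hexdigest
puts Digest::SHA512.file('metadata.gz').hexdigest
puts Digest::SHA512.file('data.tar.gz').hexdigest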
data/lib/retriever/fetchsitemap.rb
CHANGED
@@ -16,8 +16,6 @@ module Retriever
       @result.uniq!
     end
 
-    private
-
     # produces valid XML sitemap based on page collection fetched.
     # Writes to current directory.
     def gen_xml
@@ -33,6 +31,8 @@ module Retriever
       print_file_info(filename)
     end
 
+    private
+
     def print_file_info(filename)
       puts HR
       puts "File Created: sitemap-#{filename}.xml"
data/lib/retriever/page.rb
CHANGED
@@ -5,6 +5,7 @@ using SourceString
 module Retriever
   #
   class Page
+    HASH_RE = Regexp.new(/^#/i).freeze
     HTTP_RE = Regexp.new(/^http/i).freeze
     H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
     H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
@@ -50,6 +51,7 @@ module Retriever
       # filter some malformed URLS that come in
       # meant to be a loose filter to catch all reasonable HREF attributes.
       link = match[0]
+      next if HASH_RE =~ link
       Link.new(@t.scheme, @t.host, link, @url).path
     end.compact.uniq
   end
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -8,6 +8,8 @@ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command
 
 RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
 
+**v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
+
 **v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
 
 **v1.4.0 Update (3/24/2016)** - Several bug fixes.
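The readme's point about memory-efficient tracking of visited pages refers to [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb). A minimal sketch of that pattern using the gem's documented `BloomFilter::Native` API (sizes and URLs here are arbitrary):

require 'bloomfilter-rb'

# Probabilistic "seen" set: tiny memory footprint, no false negatives,
# a small chance of false positives (an unseen URL reported as seen)
seen = BloomFilter::Native.new(size: 100_000, hashes: 5)
seen.insert('http://example.com/')
seen.include?('http://example.com/')       # => true
seen.include?('http://example.com/other')  # => false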
data/spec/page_spec.rb
CHANGED
@@ -27,9 +27,10 @@ describe 'Page' do
   end
 
   describe '#links' do
-    let(:
-
-
+    let(:source) { "<a href='/profile/'>profile</a><a href='#top'>top</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
+    it 'collects all unique href links on the page, skips div anchors' do
+      expect(page.links.size).to eq(2)
     end
   end
 
@@ -42,11 +43,11 @@ describe 'Page' do
   end
 
   describe '#parse_internal_visitable' do
-    let(:source) { "<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
+    let(:source) { "<a href='/profile/'>profile</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
     let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
     let(:links) { page.parse_internal_visitable }
     it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
-      expect(links.size).to eq(
+      expect(links.size).to eq(1)
     end
   end
 
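The updated expectations follow directly from the fixtures: the `#links` source now carries three hrefs ('/profile/', '#top', and the stylesheet URL), the new anchor filter drops '#top', leaving 2; `#parse_internal_visitable` additionally rejects non-page assets like the stylesheet, leaving 1. A standalone sketch of that arithmetic (the asset regex below is illustrative, not the gem's actual filter):

hrefs = ['/profile/', '#top',
         'http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12']

links = hrefs.reject { |h| h.start_with?('#') }  # named anchors skipped
links.size                                       # => 2, as #links now expects

visitable = links.reject { |h| h =~ /\.(css|js|png|gif|jpe?g)(\?|$)/i }
visitable.size                                   # => 1, as #parse_internal_visitable expects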