rubyretriever 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/fetchsitemap.rb +25 -0
- data/lib/retriever/version.rb +1 -1
- data/readme.md +4 -4
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 849720cdbcbddf95e458dcf8a9928ad0430e1c6d
|
4
|
+
data.tar.gz: 2bd39dfcbd58a0b1b2fa66496be9a77c97f72623
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a76d5ea2c7087a2c63f84f2d31cde832bcf2d2f72b0e827f8cae581380f1fd3b3ad7e40306753a63ad06ebd5e1154eb2153e9a853ad39aca6ce800428733f4c8
|
7
|
+
data.tar.gz: aba0032ed5d5cb9701e7103f0928a0009443d2323df28f683990b7a75da962f2b0c617a7667fff4a19e9a28983297b60113799a49977bf49998fba5b9af4f98b
|
data/lib/fetchsitemap.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Retriever
  # Builds a list of page URLs (a sitemap) for a target site.
  #
  # All of the work happens in the constructor: it seeds the sitemap with the
  # target URL, collects the internal links found on the target page, hands
  # off to the crawler inherited from Fetch, and finally emits the result.
  class FetchSitemap < Fetch
    # @return [Array] the collected URLs, starting from the target URL
    attr_reader :sitemap

    # @param url     passed through unchanged to Fetch#initialize
    # @param options passed through unchanged to Fetch#initialize
    #
    # NOTE(review): @target, @maxPages and @output are not assigned here;
    # presumably Fetch#initialize sets them from url/options -- confirm.
    def initialize(url, options)
      super
      @sitemap = [@target]
      @linkStack = parseInternalLinks(fetchLinks(fetchPage(@target)))

      # Validate the link stack BEFORE touching it. Previously the size was
      # interpolated into the log message first, so a failed fetch surfaced as
      # a NoMethodError on nil instead of the intended "Bad URL" report.
      # NOTE(review): assumes errlog aborts (raises/exits) -- confirm in Fetch.
      errlog("Bad URL -- #{@target}") unless @linkStack
      lg("#{@linkStack.size - 1} new links found")

      # The target is already the first sitemap entry; don't queue it again.
      @linkStack.delete(@target)
      @linkStack = @linkStack.take(@maxPages) if @linkStack.size + 1 > @maxPages
      @sitemap.concat(@linkStack)

      # Inherited crawler. NOTE(review): presumably appends newly discovered
      # pages to @sitemap -- verify against Fetch#async_crawl_and_collect.
      async_crawl_and_collect

      # Shortest URLs first, duplicates removed, then capped at @maxPages.
      @sitemap.sort_by!(&:length)
      @sitemap.uniq!
      @sitemap = @sitemap.take(@maxPages) if @sitemap.size + 1 > @maxPages

      dump(sitemap)
      write(sitemap) if @output
    end
  end
end
|
data/lib/retriever/version.rb
CHANGED
data/readme.md
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
RubyRetriever [![Gem Version](https://badge.fury.io/rb/
|
1
|
+
RubyRetriever [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever)
|
2
2
|
==============
|
3
3
|
|
4
|
-
Now an official RubyGem!
|
4
|
+
Now an official RubyGem!
|
5
5
|
```sh
|
6
|
-
gem install
|
6
|
+
gem install rubyretriever
|
7
7
|
```
|
8
8
|
|
9
9
|
Update (5/25):
|
@@ -25,7 +25,7 @@ RubyRetriever does NOT respect robots.txt, and RubyRetriever currently - by defa
|
|
25
25
|
HOW IT WORKS
|
26
26
|
-----------
|
27
27
|
```sh
|
28
|
-
gem install
|
28
|
+
gem install rubyretriever
|
29
29
|
rr [MODE] [OPTIONS] Target_URL
|
30
30
|
```
|
31
31
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rubyretriever
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Norton
|
@@ -118,6 +118,7 @@ extra_rdoc_files: []
|
|
118
118
|
files:
|
119
119
|
- LICENSE
|
120
120
|
- bin/rr
|
121
|
+
- lib/fetchsitemap.rb
|
121
122
|
- lib/retriever.rb
|
122
123
|
- lib/retriever/fetch.rb
|
123
124
|
- lib/retriever/fetchfiles.rb
|