crawl 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE.txt +20 -0
- data/README.md +6 -1
- data/bin/crawl +2 -0
- data/lib/crawl.rb +0 -1
- data/lib/crawl/engine.rb +6 -6
- data/lib/crawl/page.rb +8 -0
- data/lib/crawl/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fe74cc42a5729f8b4a6773e6c7e2326abddefc7a
|
4
|
+
data.tar.gz: 22677615aa464e1549a34b99273631337bb11cc1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4fe1ac4cd0bf951d62a2b0c6851818b5317f8dfd469ce4133478a1a179a08e9566d627a5b2ba02b4301fa48bc511686ba02e71edeaf7f41f4e6125643b563105
|
7
|
+
data.tar.gz: 1f69f772212400ea07ad8cb27d2c6daefb5549805d3cb1e2aa632e7ac63b8fa5b801a926d8341721a4e35d94d55e0c54daa71e3393da96b593f17954416298f6
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 AlphaSights
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -10,7 +10,7 @@ Usage:
|
|
10
10
|
-s, --start /home,/about Starting path(s), defaults to /
|
11
11
|
-u, --username username Basic auth username
|
12
12
|
-p, --password password Basic auth password
|
13
|
-
-c, --connections count Max number of parallel connections to use. The default is
|
13
|
+
-c, --connections count Max number of parallel connections to use. The default is 5.
|
14
14
|
-v, --verbose Give details when crawling
|
15
15
|
-h, --help Show this message
|
16
16
|
--version Print version
|
@@ -34,3 +34,8 @@ Example:
|
|
34
34
|
|
35
35
|
5 pages crawled without errors.
|
36
36
|
|
37
|
+
### Copyright and License
|
38
|
+
|
39
|
+
Copyright AlphaSights and Contributors, 2015
|
40
|
+
|
41
|
+
[MIT Licence](LICENSE.txt)
|
data/bin/crawl
CHANGED
data/lib/crawl.rb
CHANGED
data/lib/crawl/engine.rb
CHANGED
@@ -59,21 +59,21 @@ private
|
|
59
59
|
def retrieve(page)
|
60
60
|
puts "Fetching #{page.url} ..." if $verbose
|
61
61
|
|
62
|
-
|
62
|
+
absolute_url = options[:domain] + page.relative_url
|
63
63
|
|
64
|
-
http = EventMachine::HttpRequest.new(
|
64
|
+
http = EventMachine::HttpRequest.new(absolute_url)
|
65
65
|
req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
|
66
66
|
req.timeout(15)
|
67
67
|
|
68
68
|
req.errback do
|
69
69
|
if req.nil?
|
70
|
-
|
70
|
+
page.intermittent("Req is nil. WAT?")
|
71
71
|
elsif msg = req.error
|
72
|
-
|
72
|
+
page.intermittent(msg)
|
73
73
|
elsif req.response.nil? || req.response.empty?
|
74
|
-
|
74
|
+
page.intermittent('Timeout?')
|
75
75
|
else
|
76
|
-
|
76
|
+
page.intermittent('Partial response: Server Broke Connection?')
|
77
77
|
end
|
78
78
|
process_next
|
79
79
|
end
|
data/lib/crawl/page.rb
CHANGED
data/lib/crawl/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tor Erik Linnerud
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -76,6 +76,7 @@ extra_rdoc_files: []
|
|
76
76
|
files:
|
77
77
|
- ".gitignore"
|
78
78
|
- Gemfile
|
79
|
+
- LICENSE.txt
|
79
80
|
- README.md
|
80
81
|
- Rakefile
|
81
82
|
- bin/crawl
|
@@ -107,7 +108,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
107
108
|
version: '0'
|
108
109
|
requirements: []
|
109
110
|
rubyforge_project:
|
110
|
-
rubygems_version: 2.
|
111
|
+
rubygems_version: 2.4.5.1
|
111
112
|
signing_key:
|
112
113
|
specification_version: 4
|
113
114
|
summary: Crawl pages within a domain, reporting any page that returns a bad response
|