crawl 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d31f6ad78385f65e06abb7837358453330394baf
4
- data.tar.gz: 7a8f0781ae652f1500b9a5fde7a08380627ce1de
3
+ metadata.gz: fe74cc42a5729f8b4a6773e6c7e2326abddefc7a
4
+ data.tar.gz: 22677615aa464e1549a34b99273631337bb11cc1
5
5
  SHA512:
6
- metadata.gz: 071573a7d5bd59416b2d6a582a51a0a56a30000184391ae05688ed894b40b50e410661dfef2ac4554fd29f2dae117504bccdda00ef9144888c8ca46210c9f980
7
- data.tar.gz: 7954d33220e05baf7667590b6fcb082800fbbda753016d2014331a4afec02ef8f8951f68256a90ea4baaee4024abadde53541e256d59b2171b9f61f629323d71
6
+ metadata.gz: 4fe1ac4cd0bf951d62a2b0c6851818b5317f8dfd469ce4133478a1a179a08e9566d627a5b2ba02b4301fa48bc511686ba02e71edeaf7f41f4e6125643b563105
7
+ data.tar.gz: 1f69f772212400ea07ad8cb27d2c6daefb5549805d3cb1e2aa632e7ac63b8fa5b801a926d8341721a4e35d94d55e0c54daa71e3393da96b593f17954416298f6
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2015 AlphaSights
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -10,7 +10,7 @@ Usage:
10
10
  -s, --start /home,/about Starting path(s), defaults to /
11
11
  -u, --username username Basic auth username
12
12
  -p, --password password Basic auth password
13
- -c, --connections count Max mumber of parallel connections to use. The default is 20.
13
+ -c, --connections count Max mumber of parallel connections to use. The default is 5.
14
14
  -v, --verbose Give details when crawling
15
15
  -h, --help Show this message
16
16
  --version Print version
@@ -34,3 +34,8 @@ Example:
34
34
 
35
35
  5 pages crawled without errors.
36
36
 
37
+ ### Copyright and License
38
+
39
+ Copyright AlphaSights and Contributors, 2015
40
+
41
+ [MIT Licence](LICENSE.txt)
data/bin/crawl CHANGED
@@ -23,6 +23,8 @@ unless options[:domain]
23
23
  exit -1
24
24
  end
25
25
 
26
+ options[:domain] = "http://#{options[:domain]}" unless options[:domain].include?('://')
27
+
26
28
  crawler = Crawl::Engine.new(options)
27
29
 
28
30
  trap("SIGINT") do
data/lib/crawl.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  # encoding: utf-8
2
2
  require('nokogiri')
3
3
  require('rest_client')
4
- require 'ci/reporter/core'
5
4
  require 'eventmachine'
6
5
  require 'em-http-request'
7
6
  require 'base64'
data/lib/crawl/engine.rb CHANGED
@@ -59,21 +59,21 @@ private
59
59
  def retrieve(page)
60
60
  puts "Fetching #{page.url} ..." if $verbose
61
61
 
62
- full_url = options[:domain] + page.url
62
+ absolute_url = options[:domain] + page.relative_url
63
63
 
64
- http = EventMachine::HttpRequest.new(full_url)
64
+ http = EventMachine::HttpRequest.new(absolute_url)
65
65
  req = http.get :redirects => MAX_REDIRECTS, :head => {'authorization' => [options[:username], options[:password]]}
66
66
  req.timeout(15)
67
67
 
68
68
  req.errback do
69
69
  if req.nil?
70
- page.intermittent("Req is nil. WAT?")
70
+ page.intermittent("Req is nil. WAT?")
71
71
  elsif msg = req.error
72
- page.intermittent(msg)
72
+ page.intermittent(msg)
73
73
  elsif req.response.nil? || req.response.empty?
74
- page.intermittent('Timeout?')
74
+ page.intermittent('Timeout?')
75
75
  else
76
- page.intermittent('Partial response: Server Broke Connection?')
76
+ page.intermittent('Partial response: Server Broke Connection?')
77
77
  end
78
78
  process_next
79
79
  end
data/lib/crawl/page.rb CHANGED
@@ -13,6 +13,14 @@ class Page
13
13
  @errors = nil
14
14
  end
15
15
 
16
+ def relative_url
17
+ if url.start_with?('/')
18
+ url
19
+ else
20
+ "#{source}/#{url}"
21
+ end
22
+ end
23
+
16
24
  def <=>(other)
17
25
  url <=> other.url
18
26
  end
data/lib/crawl/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module Crawl
3
- VERSION = "1.1.1"
3
+ VERSION = "1.1.2"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tor Erik Linnerud
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-12 00:00:00.000000000 Z
11
+ date: 2015-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -76,6 +76,7 @@ extra_rdoc_files: []
76
76
  files:
77
77
  - ".gitignore"
78
78
  - Gemfile
79
+ - LICENSE.txt
79
80
  - README.md
80
81
  - Rakefile
81
82
  - bin/crawl
@@ -107,7 +108,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
107
108
  version: '0'
108
109
  requirements: []
109
110
  rubyforge_project:
110
- rubygems_version: 2.2.2
111
+ rubygems_version: 2.4.5.1
111
112
  signing_key:
112
113
  specification_version: 4
113
114
  summary: Crawl pages witin a domain, reporting any page that returns a bad response