iron-crawler 1.1.3 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b85415ba9b59160fdbfe5ca87c0f14eca5b1bcdc
4
- data.tar.gz: 712549785cf1adc3f0dbe205d69f2abe60905968
3
+ metadata.gz: 871ac52faf7a0c991e10bd990ea30ebe39a762ff
4
+ data.tar.gz: 21ab6bde0d981d688758560bcec94b3972534c0d
5
5
  SHA512:
6
- metadata.gz: a548d701fe83340f17bba0c406efd26fc76d401293f3a7ac36dbd80a822bae55afd2e9f73251250ab17560b5ccafbcfb44ae9818b6bf67876a1d7f88d39a0fda
7
- data.tar.gz: f9a1bf749346692fc5e0df6ce859dc9c7136a06525dd20bbcb165dc074cc00212c7e05b28aab6fd545fc31e7a2cebe43ec7890f04386c685ececda5c0bd5b1cf
6
+ metadata.gz: e730dadb49f824eff18939874b51c81df34a8961bccd56154614f968bc2447552ec2b84ee2e650ed0be4df735c68e29351465639f05b48762392933c9cfd3593
7
+ data.tar.gz: eb12a66e04b3cc12ca55a94041adf1d0e6a4e5768708e8d21d65d435d414e45defe4555bb9d9ad3de2e20106181d2d74736be09b6157418bb7c2fb58bf376c2f
data/README.md CHANGED
@@ -2,13 +2,14 @@
2
2
 
3
3
  A generic web crawler.
4
4
 
5
- ## Requirements
5
+ ## Features
6
6
 
7
- From a starting URL, crawl all links on that URL and print a list of URLs visited.
7
+ From a starting URL, it will crawl all links on that URL and print a list of URLs visited.
8
8
 
9
9
  - Follows href attributes contained in tags from the same domain
10
10
  - Ignores href attributes contained in tags from other domains (even subdomains)
11
11
  - Captures the src attribute of script tags and the href attribute of link tags
12
+ - Outputs a list of visited URLs
12
13
 
13
14
  # Getting Started
14
15
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.1.3
1
+ 1.2.0
data/bin/iron-crawler CHANGED
@@ -13,4 +13,3 @@ url = ARGV.first
13
13
  Announce.info "Crawling #{url}..."
14
14
  agent = Crawler.new
15
15
  puts agent.spiderize(url)
16
-
data/iron-crawler.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
- # stub: iron-crawler 1.1.3 ruby lib
5
+ # stub: iron-crawler 1.2.0 ruby lib
6
6
 
7
7
  Gem::Specification.new do |s|
8
8
  s.name = "iron-crawler"
9
- s.version = "1.1.3"
9
+ s.version = "1.2.0"
10
10
 
11
11
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
12
12
  s.require_paths = ["lib"]
@@ -71,14 +71,8 @@ class Crawler < Mechanize
71
71
  # @return [Boolean] true when already spidered.
72
72
  #
73
73
  def already_spidered?(link)
74
- begin
75
- abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
76
- return true if @mech.visited? link.href
77
- return true if @mech.visited? abs_url
78
- rescue Mechanize::UnsupportedSchemeError
79
- puts "skipping #{link.uri}"
80
- return true
81
- end
74
+ abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
75
+ return true if (@mech.visited? link.href) || (@mech.visited? abs_url)
82
76
  end
83
77
 
84
78
 
@@ -87,7 +81,11 @@ class Crawler < Mechanize
87
81
  # @return [Boolean] true when the link's URL is missing or not valid.
88
82
  #
89
83
  def not_valid_uri?(link)
90
- return true unless link.uri && (/^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s)
84
+ if link.uri
85
+ return true unless /^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s
86
+ else
87
+ return true
88
+ end
91
89
  end
92
90
 
93
91
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iron-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.3
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Visser