iron-crawler 1.1.3 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/VERSION +1 -1
- data/bin/iron-crawler +0 -1
- data/iron-crawler.gemspec +2 -2
- data/lib/iron-crawler/crawler.rb +7 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 871ac52faf7a0c991e10bd990ea30ebe39a762ff
|
4
|
+
data.tar.gz: 21ab6bde0d981d688758560bcec94b3972534c0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e730dadb49f824eff18939874b51c81df34a8961bccd56154614f968bc2447552ec2b84ee2e650ed0be4df735c68e29351465639f05b48762392933c9cfd3593
|
7
|
+
data.tar.gz: eb12a66e04b3cc12ca55a94041adf1d0e6a4e5768708e8d21d65d435d414e45defe4555bb9d9ad3de2e20106181d2d74736be09b6157418bb7c2fb58bf376c2f
|
data/README.md
CHANGED
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
A generic web crawler.
|
4
4
|
|
5
|
-
##
|
5
|
+
## Features
|
6
6
|
|
7
|
-
From a starting URL, crawl all links on that URL and print a list of URLs visited.
|
7
|
+
From a starting URL, it will crawl all links on that URL and print a list of URLs visited.
|
8
8
|
|
9
9
|
- Follows href attributes contained in tags from the same domain
|
10
10
|
- Ignores href attributes contained in tags from other domains (even subdomains)
|
11
11
|
- Captures script src and link href tags for script and link tags respectively
|
12
|
+
- Outputs a list of visited URLs
|
12
13
|
|
13
14
|
# Getting Started
|
14
15
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.1.3
|
1
|
+
1.2.0
|
data/bin/iron-crawler
CHANGED
data/iron-crawler.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: iron-crawler 1.1.3 ruby lib
|
5
|
+
# stub: iron-crawler 1.2.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "iron-crawler"
|
9
|
-
s.version = "1.1.3"
|
9
|
+
s.version = "1.2.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
data/lib/iron-crawler/crawler.rb
CHANGED
@@ -71,14 +71,8 @@ class Crawler < Mechanize
|
|
71
71
|
# @return [Boolean] true when already spidered.
|
72
72
|
#
|
73
73
|
def already_spidered?(link)
|
74
|
-
|
75
|
-
|
76
|
-
return true if @mech.visited? link.href
|
77
|
-
return true if @mech.visited? abs_url
|
78
|
-
rescue Mechanize::UnsupportedSchemeError
|
79
|
-
puts "skipping #{link.uri}"
|
80
|
-
return true
|
81
|
-
end
|
74
|
+
abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
|
75
|
+
return true if (@mech.visited? link.href) || (@mech.visited? abs_url)
|
82
76
|
end
|
83
77
|
|
84
78
|
|
@@ -87,7 +81,11 @@ class Crawler < Mechanize
|
|
87
81
|
# @return [Boolean] true when the URL is not valid.
|
88
82
|
#
|
89
83
|
def not_valid_uri?(link)
|
90
|
-
|
84
|
+
if link.uri
|
85
|
+
return true unless /^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s
|
86
|
+
else
|
87
|
+
return true
|
88
|
+
end
|
91
89
|
end
|
92
90
|
|
93
91
|
|