iron-crawler 1.1.3 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -2
- data/VERSION +1 -1
- data/bin/iron-crawler +0 -1
- data/iron-crawler.gemspec +2 -2
- data/lib/iron-crawler/crawler.rb +7 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 871ac52faf7a0c991e10bd990ea30ebe39a762ff
|
4
|
+
data.tar.gz: 21ab6bde0d981d688758560bcec94b3972534c0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e730dadb49f824eff18939874b51c81df34a8961bccd56154614f968bc2447552ec2b84ee2e650ed0be4df735c68e29351465639f05b48762392933c9cfd3593
|
7
|
+
data.tar.gz: eb12a66e04b3cc12ca55a94041adf1d0e6a4e5768708e8d21d65d435d414e45defe4555bb9d9ad3de2e20106181d2d74736be09b6157418bb7c2fb58bf376c2f
|
data/README.md
CHANGED
@@ -2,13 +2,14 @@
|
|
2
2
|
|
3
3
|
A generic web crawler.
|
4
4
|
|
5
|
-
##
|
5
|
+
## Features
|
6
6
|
|
7
|
-
From a starting URL, crawl all links on that URL and print a list of URLs visited.
|
7
|
+
From a starting URL, it will crawl all links on that URL and print a list of URLs visited.
|
8
8
|
|
9
9
|
- Follow href attributes contained in tags from the same domain
|
10
10
|
- Ignores href attributes contained in tags from other domains (even subdomains)
|
11
11
|
- Captures script src and link href tags for script and link tags respectively
|
12
|
+
- Outputs a list of visited URLs
|
12
13
|
|
13
14
|
# Getting Started
|
14
15
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.1.3
|
1
|
+
1.2.0
|
data/bin/iron-crawler
CHANGED
data/iron-crawler.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: iron-crawler 1.1.3 ruby lib
|
5
|
+
# stub: iron-crawler 1.2.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "iron-crawler"
|
9
|
-
s.version = "1.1.3"
|
9
|
+
s.version = "1.2.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
data/lib/iron-crawler/crawler.rb
CHANGED
@@ -71,14 +71,8 @@ class Crawler < Mechanize
|
|
71
71
|
# @return [Boolean] true when already spidered.
|
72
72
|
#
|
73
73
|
def already_spidered?(link)
|
74
|
-
|
75
|
-
|
76
|
-
return true if @mech.visited? link.href
|
77
|
-
return true if @mech.visited? abs_url
|
78
|
-
rescue Mechanize::UnsupportedSchemeError
|
79
|
-
puts "skipping #{link.uri}"
|
80
|
-
return true
|
81
|
-
end
|
74
|
+
abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
|
75
|
+
return true if (@mech.visited? link.href) || (@mech.visited? abs_url)
|
82
76
|
end
|
83
77
|
|
84
78
|
|
@@ -87,7 +81,11 @@ class Crawler < Mechanize
|
|
87
81
|
# @return [Boolean] true when the URL is not valid.
|
88
82
|
#
|
89
83
|
def not_valid_uri?(link)
|
90
|
-
|
84
|
+
if link.uri
|
85
|
+
return true unless /^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s
|
86
|
+
else
|
87
|
+
return true
|
88
|
+
end
|
91
89
|
end
|
92
90
|
|
93
91
|
|