iron-crawler 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/iron-crawler.gemspec +2 -2
- data/lib/iron-crawler/crawler.rb +4 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5efca3651acb88f5cf1227dd0041967defab2888
|
4
|
+
data.tar.gz: 78013e58585dda3d4571ef4c9ad8c321ecafe7b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d1e75cc0ae365a95b8948d82535c869ee60f5550b41491561039b5838711c82248a11354a5351f7ed2be6bd8a3d7be3fea49299aae08ca5d35bb66ea1b1f4387
|
7
|
+
data.tar.gz: 46389122732c43f842b6785464dd9d78e9f3f9252e6dd607ab263588dc5ab06a27daa75550ed92d48bd39aaed818dbf79948572606e7a1c50b14b8b7c88e6833
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
1.1.0
|
1
|
+
1.1.1
|
data/iron-crawler.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: iron-crawler 1.1.0 ruby lib
|
5
|
+
# stub: iron-crawler 1.1.1 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "iron-crawler"
|
9
|
-
s.version = "1.1.0"
|
9
|
+
s.version = "1.1.1"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib"]
|
data/lib/iron-crawler/crawler.rb
CHANGED
@@ -23,7 +23,7 @@ class Crawler < Mechanize
|
|
23
23
|
puts "crawling #{link.uri}"
|
24
24
|
begin
|
25
25
|
page = link.click
|
26
|
-
next unless Mechanize::Page
|
26
|
+
next unless Mechanize::Page == page
|
27
27
|
stack.push(*src_links(page))
|
28
28
|
stack.push(*page.links)
|
29
29
|
rescue Mechanize::ResponseCodeError
|
@@ -40,7 +40,7 @@ class Crawler < Mechanize
|
|
40
40
|
#
|
41
41
|
def src_links(page)
|
42
42
|
links = []
|
43
|
-
page.search(
|
43
|
+
page.search('script').each do |element|
|
44
44
|
next if element.attributes['src'].nil?
|
45
45
|
doc = Nokogiri::HTML::Document.new
|
46
46
|
node = Nokogiri::XML::Node.new('foo', doc)
|
@@ -87,7 +87,8 @@ class Crawler < Mechanize
|
|
87
87
|
# @return [Booolean] true when valid URL.
|
88
88
|
#
|
89
89
|
def not_valid_uri?(link)
|
90
|
-
|
90
|
+
valid_uri_regex = (/^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s)
|
91
|
+
return true unless link.uri && valid_uri_regex
|
91
92
|
end
|
92
93
|
|
93
94
|
|