RubyGems - iron-crawler - Versions diffs - 1.1.3 → 1.2.0 - Mend

iron-crawler 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b85415ba9b59160fdbfe5ca87c0f14eca5b1bcdc
-  data.tar.gz: 712549785cf1adc3f0dbe205d69f2abe60905968
+  metadata.gz: 871ac52faf7a0c991e10bd990ea30ebe39a762ff
+  data.tar.gz: 21ab6bde0d981d688758560bcec94b3972534c0d
 SHA512:
-  metadata.gz: a548d701fe83340f17bba0c406efd26fc76d401293f3a7ac36dbd80a822bae55afd2e9f73251250ab17560b5ccafbcfb44ae9818b6bf67876a1d7f88d39a0fda
-  data.tar.gz: f9a1bf749346692fc5e0df6ce859dc9c7136a06525dd20bbcb165dc074cc00212c7e05b28aab6fd545fc31e7a2cebe43ec7890f04386c685ececda5c0bd5b1cf
+  metadata.gz: e730dadb49f824eff18939874b51c81df34a8961bccd56154614f968bc2447552ec2b84ee2e650ed0be4df735c68e29351465639f05b48762392933c9cfd3593
+  data.tar.gz: eb12a66e04b3cc12ca55a94041adf1d0e6a4e5768708e8d21d65d435d414e45defe4555bb9d9ad3de2e20106181d2d74736be09b6157418bb7c2fb58bf376c2f

data/README.md CHANGED Viewed

@@ -2,13 +2,14 @@
 A generic web crawler.
-## Requirements
+## Features
-From a starting URL, crawl all links on that URL and print a list of URLs visited.
+From a starting URL, it will crawl all links on that URL and print a list of URLs visited.
 - Follow href attributes contained in tags from the same domain
 - Ignores href attributes contained in tags from other domains (even subdomains)
 - Captures script src and link href tags for script and link tags respectively
+- Outputs a list of visited URLs
 # Getting Started

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 1.1.3
1	+ 1.2.0

data/bin/iron-crawler CHANGED Viewed

@@ -13,4 +13,3 @@ url = ARGV.first
 Announce.info "Crawling #{url}..."
 agent = Crawler.new
 puts agent.spiderize(url)

data/iron-crawler.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 # DO NOT EDIT THIS FILE DIRECTLY
 # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
 # -*- encoding: utf-8 -*-
-# stub: iron-crawler 1.1.3 ruby lib
+# stub: iron-crawler 1.2.0 ruby lib
 Gem::Specification.new do |s|
   s.name = "iron-crawler"
-  s.version = "1.1.3"
+  s.version = "1.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.require_paths = ["lib"]

data/lib/iron-crawler/crawler.rb CHANGED Viewed

@@ -71,14 +71,8 @@ class Crawler < Mechanize
   # @return [Boolean] true when already spidered.
   #
   def already_spidered?(link)
-    begin
-      abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
-      return true if @mech.visited? link.href
-      return true if @mech.visited? abs_url
-    rescue Mechanize::UnsupportedSchemeError
-      puts "skipping #{link.uri}"
-      return true
-    end
+    abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
+    return true if (@mech.visited? link.href) || (@mech.visited? abs_url)
   end
@@ -87,7 +81,11 @@ class Crawler < Mechanize
   # @return [Boolean] true when the link is not a valid URL.
   #
   def not_valid_uri?(link)
-    return true unless link.uri && (/^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s)
+    if link.uri
+      return true unless /^http.+/ =~ link.uri.to_s || /\/.+/ =~ link.uri.to_s
+    else
+      return true
+    end
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: iron-crawler
 version: !ruby/object:Gem::Version
-  version: 1.1.3
+  version: 1.2.0
 platform: ruby
 authors:
 - Ben Visser