slmndr 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - checksums.yaml.gz.sig +2 -0
 - data/lib/slmndr.rb +322 -0
 - data.tar.gz.sig +0 -0
 - metadata +66 -0
 - metadata.gz.sig +1 -0
 
    
        checksums.yaml
    ADDED
    
    | 
         @@ -0,0 +1,7 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ---
         
     | 
| 
      
 2 
     | 
    
         
            +
            SHA1:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 20770bf44914d9179137aee64b3f86626b260d57
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: dd6067dd15692a27fa5216ab67cbe0b7321eaacc
         
     | 
| 
      
 5 
     | 
    
         
            +
            SHA512:
         
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 2c7d1723c78474eec28e2f7f22e5b19b601fc5903fb09bec22aa80c6bd266e8260608bd0b7a4ca11e3c012a5bb84ead4c1475b8426cfed46335657c474bea190
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: f12be9d14cb2f5d4f1813cb09c77fd3e207ce6268f9e8f12e4caac8ebba01ca9e8e57ba77e3614dd46a616a779d76764cc3e6f182ae5652d880e4240657f1730
         
     | 
    
        checksums.yaml.gz.sig
    ADDED
    
    
    
        data/lib/slmndr.rb
    ADDED
    
    | 
         @@ -0,0 +1,322 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            ## Salamander: A minimalistic ruby web crawling framework.
         
     | 
| 
      
 2 
     | 
    
         
            +
            ## Authored by: John Lawrence M. Penafiel
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            require 'time'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'thread'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'set'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'open-uri'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require 'openssl'
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            require 'json'
         
     | 
| 
      
 11 
     | 
    
         
            +
            require 'open_uri_redirections'
         
     | 
| 
      
 12 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 13 
     | 
    
         
            +
            require 'addressable/uri'
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
            ## Module
         
     | 
| 
      
 16 
     | 
    
         
            +
            ##     Salamander
         
     | 
| 
      
 17 
     | 
    
         
            +
            ## Description
         
     | 
| 
      
 18 
     | 
    
         
            +
            ##     The Salamander module provides an easy way for the other components of the Salamander system to perform crawling.
         
     | 
| 
      
 19 
     | 
    
         
            +
            ## Functions
         
     | 
| 
      
 20 
     | 
    
         
            +
            ##     Salamander::crawl
         
     | 
| 
      
 21 
     | 
    
         
            +
            module Salamander
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
            	## Function
            	##     get_links
            	## Description
            	##     Extracts outgoing links from the given HTML data and yields each one as an
            	##     absolute Addressable::URI with its fragment removed. Relative links are
            	##     resolved against the page URL following RFC 3986 reference resolution.
            	##     Anchors without an href, anchors whose href cannot be parsed, and links
            	##     whose scheme is neither http nor https are skipped.
            	## Parameters
            	##     url		-	The URL of the HTML page the function is extracting links from. Used as the base for relative links.
            	##     html		-	The HTML data to extract links from. Passed directly to Nokogiri::HTML.
            	## Yields
            	##     link		-	An Addressable::URI, once per usable anchor, in document order.
            	def self.get_links(url, html)
            		# Base URI that every hyper link is resolved against
            		base = Addressable::URI.parse(url)
            		# Parse as HTML and visit all anchors
            		Nokogiri::HTML(html).xpath('//a').each do |anchor|
            			# Extract hyper link; skip if it does not exist
            			href = anchor['href']
            			next if href.nil?
            			begin
            				# Convert hyper link to absolute form. Joining (instead of copying host,
            				# scheme and port one by one) keeps the base path for relative links and
            				# never leaks the page's port onto links that point at another host.
            				link = base.join(href.strip)
            			rescue Addressable::URI::InvalidURIError
            				# One malformed href must not discard the remaining links of the page
            				next
            			end
            			# Skip if hyper link is not HTTP
            			next if !link.scheme.nil? && link.scheme != 'http' && link.scheme != 'https'
            			# Remove link fragment
            			link.fragment = nil
            			# Yield
            			yield link
            		end
            	end
         
     | 
| 
      
 65 
     | 
    
         
            +
            	
         
     | 
| 
      
 66 
     | 
    
         
            +
            	## Function
            	##     crawl
            	## Description
            	##     Performs a restricted, unauthenticated, breadth-first crawl of the target web asset.
            	##     Function blocks until all threads terminate.
            	##     Links are only followed from pages whose resolved (post-redirect) URL is on
            	##     the same host as the URL that was requested.
            	## Parameters
            	##     urls		-	Required. A list of strings containing the seed URLs.
            	##     args		-	Optional. Default: {}. A hash containing optional arguments for the function.
            	##         visit	-	Optional. Default: nil. A lambda which accepts a link (Addressable::URI), and returns a boolean which tells the crawler if the link should be visited.
            	##         delay	-	Optional. Default: 1. A non-negative float indicating the number of seconds between requests in one thread.
            	##         threads  -	Optional. Default: 1. A positive integer indicating the number of allowed simultaneous requests to the target web asset.
            	##         agent	-	Optional. Default: "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)". The user-agent string to be used.
            	## Yields
            	##     url, response, depth	-	The URL string, the open-uri response and the link depth of every
            	##     fetched page. A page that redirected to a not yet known URL on the same host is
            	##     reported a second time under its resolved URL. Calls are serialized by a lock.
            	## Raises
            	##     RuntimeError if delay is negative or threads is less than one.
            	def crawl(urls, args = {})
            		# Get arguments, falling back to the documented defaults
            		visit = args[:visit]
            		delay = args[:delay].nil? ? 1 : args[:delay]
            		raise "delay must be a positive float" if delay < 0
            		threads = args[:threads].nil? ? 1 : args[:threads]
            		# Zero threads would silently crawl nothing, so it is rejected as well
            		raise "threads must be a positive integer" if threads < 1
            		agent = args[:agent].nil? ? "Mozilla/5.0 (MSIE 9.0; Windows NT 6.1; Trident/5.0)" : args[:agent]
            		# Shared crawl state, guarded by jlock:
            		#   seen    - every URL that was ever queued or reported
            		#   pending - [url, depth] pairs waiting to be fetched, in discovery order
            		#   active  - number of jobs currently being worked on
            		jlock = Mutex.new
            		seen = Set.new
            		pending = []
            		active = 0
            		urls.each do |url|
            			seed = url.to_s
            			pending << [seed, 0] if seen.add?(seed)
            		end
            		# Create and launch crawl threads
            		workers = (1..threads).map do
            			Thread.new do
            				loop do
            					# Find job to do
            					job = nil
            					finished = false
            					jlock.synchronize do
            						if !pending.empty?
            							# Take the oldest waiting job
            							job = pending.shift
            							active += 1
            						elsif active == 0
            							# Nothing is waiting and nobody can add more jobs
            							finished = true
            						end
            					end
            					break if finished
            					if job.nil?
            						# Other threads are still working and may add jobs; wait briefly
            						# instead of spinning on the lock
            						sleep(0.01)
            						next
            					end
            					job_url, job_depth = job
            					# Get all links in page pointed to by job URL
            					begin
            						# URI#open (provided by open-uri) replaces Kernel#open, which no longer
            						# accepts URLs since Ruby 3.0 and could open files or pipes for odd input.
            						# The options hash is built per request because open_uri_redirections
            						# may remove :allow_redirections from the hash it is given.
            						# NOTE(review): TLS certificate verification is disabled on purpose here.
            						URI.parse(job_url).open({ :allow_redirections => :all, :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE, "User-Agent" => agent }) do |response|
            							resolved = response.base_uri.to_s
            							# Callback
            							jlock.synchronize do
            								yield job_url.dup, response, job_depth
            							end
            							# If resolved URL is in scope
            							if Addressable::URI.parse(response.base_uri).host == Addressable::URI.parse(job_url).host
            								# Report the resolved URL and mark it as known if it does not exist yet
            								jlock.synchronize do
            									unless seen.include?(resolved)
            										# The first callback may have consumed the body
            										response.rewind
            										yield resolved.dup, response, job_depth
            										seen << resolved
            									end
            								end
            								# The callbacks may have read the body; start over for link extraction
            								response.rewind
            								# Get links for resolved URL
            								Salamander::get_links(response.base_uri, response) do |link|
            									# Determine if the link should be visited
            									if visit.nil? || visit.call(link)
            										target = link.to_s
            										jlock.synchronize do
            											# Create job for the given link if it is not known yet
            											pending << [target, job_depth + 1] if seen.add?(target)
            										end
            									end
            								end
            							end
            						end
            					rescue StandardError
            						# Best-effort crawl: a page that cannot be fetched or parsed is skipped
            					ensure
            						# Flag job as complete
            						jlock.synchronize do
            							active -= 1
            						end
            					end
            					# Perform delay
            					sleep(delay)
            				end
            			end
            		end
            		# Wait for all threads to die
            		workers.each do |worker|
            			worker.join
            		end
            	end
         
     | 
| 
      
 194 
     | 
    
         
            +
            	
         
     | 
| 
      
 195 
     | 
    
         
            +
            	module_function :crawl
         
     | 
| 
      
 196 
     | 
    
         
            +
             
     | 
| 
      
 197 
     | 
    
         
            +
            end
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
            # For direct invocation of this file as a script (e.g. `ruby slmndr.rb`)
         
     | 
| 
      
 200 
     | 
    
         
            +
            if __FILE__ == $0 then
         
     | 
| 
      
 201 
     | 
    
         
            +
            	# Record start time
         
     | 
| 
      
 202 
     | 
    
         
            +
            	time = Time.new
         
     | 
| 
      
 203 
     | 
    
         
            +
            	# Arbitrary terminal width
         
     | 
| 
      
 204 
     | 
    
         
            +
            	twid = 70
         
     | 
| 
      
 205 
     | 
    
         
            +
            	# Declare crawl count variable
         
     | 
| 
      
 206 
     | 
    
         
            +
            	count = 0
         
     | 
| 
      
 207 
     | 
    
         
            +
            	# Attempt to catch interrupt signals and unknown errors
         
     | 
| 
      
 208 
     | 
    
         
            +
            	begin
         
     | 
| 
      
 209 
     | 
    
         
            +
            		# Read arguments JSON from standard input
         
     | 
| 
      
 210 
     | 
    
         
            +
            		stdin = STDIN.read
         
     | 
| 
      
 211 
     | 
    
         
            +
            		args = nil
         
     | 
| 
      
 212 
     | 
    
         
            +
            		begin
         
     | 
| 
      
 213 
     | 
    
         
            +
            			args = JSON.parse(stdin)
         
     | 
| 
      
 214 
     | 
    
         
            +
            		rescue
         
     | 
| 
      
 215 
     | 
    
         
            +
            			puts JSON.pretty_generate({ result: "exception", message: "unable to parse json from stdin" })
         
     | 
| 
      
 216 
     | 
    
         
            +
            			exit
         
     | 
| 
      
 217 
     | 
    
         
            +
            		end
         
     | 
| 
      
 218 
     | 
    
         
            +
            		# Make sure the urls parameter has been supplied
         
     | 
| 
      
 219 
     | 
    
         
            +
            		if args['urls'] == nil then
         
     | 
| 
      
 220 
     | 
    
         
            +
            			puts JSON.pretty_generate({ result: "misuse", message: "'urls' parameter not specified" })
         
     | 
| 
      
 221 
     | 
    
         
            +
            			exit
         
     | 
| 
      
 222 
     | 
    
         
            +
            		else
         
     | 
| 
      
 223 
     | 
    
         
            +
            			# Retrieve the url parameter
         
     | 
| 
      
 224 
     | 
    
         
            +
            			urls = args['urls']
         
     | 
| 
      
 225 
     | 
    
         
            +
            			if !urls.kind_of?(Array) then
         
     | 
| 
      
 226 
     | 
    
         
            +
            				puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
         
     | 
| 
      
 227 
     | 
    
         
            +
            				exit
         
     | 
| 
      
 228 
     | 
    
         
            +
            			end
         
     | 
| 
      
 229 
     | 
    
         
            +
            			urls.each do |url|
         
     | 
| 
      
 230 
     | 
    
         
            +
            				begin
         
     | 
| 
      
 231 
     | 
    
         
            +
            					Addressable::URI.parse(url)
         
     | 
| 
      
 232 
     | 
    
         
            +
            				rescue
         
     | 
| 
      
 233 
     | 
    
         
            +
            					puts JSON.pretty_generate({ result: "exception", message: "urls must be a list of valid URLs" })
         
     | 
| 
      
 234 
     | 
    
         
            +
            					exit
         
     | 
| 
      
 235 
     | 
    
         
            +
            				end
         
     | 
| 
      
 236 
     | 
    
         
            +
            			end
         
     | 
| 
      
 237 
     | 
    
         
            +
            			_args = {}
         
     | 
| 
      
 238 
     | 
    
         
            +
            			# Attempt to retrieve the delay parameter
         
     | 
| 
      
 239 
     | 
    
         
            +
            			if args['delay'] != nil then
         
     | 
| 
      
 240 
     | 
    
         
            +
            				begin
         
     | 
| 
      
 241 
     | 
    
         
            +
            					_args[:delay] = args['delay'].to_f
         
     | 
| 
      
 242 
     | 
    
         
            +
            				rescue
         
     | 
| 
      
 243 
     | 
    
         
            +
            					puts JSON.pretty_generate({ result: "exception", message: "delay must be a float" })
         
     | 
| 
      
 244 
     | 
    
         
            +
            					exit
         
     | 
| 
      
 245 
     | 
    
         
            +
            				end
         
     | 
| 
      
 246 
     | 
    
         
            +
            			end
         
     | 
| 
      
 247 
     | 
    
         
            +
            			# Attempt to retrieve the threads parameter
         
     | 
| 
      
 248 
     | 
    
         
            +
            			if args['threads'] != nil then
         
     | 
| 
      
 249 
     | 
    
         
            +
            				begin
         
     | 
| 
      
 250 
     | 
    
         
            +
            					_args[:threads] = args['threads'].to_i
         
     | 
| 
      
 251 
     | 
    
         
            +
            				rescue
         
     | 
| 
      
 252 
     | 
    
         
            +
            					puts JSON.pretty_generate({ result: "exception", message: "threads must be an integer" })
         
     | 
| 
      
 253 
     | 
    
         
            +
            					exit
         
     | 
| 
      
 254 
     | 
    
         
            +
            				end
         
     | 
| 
      
 255 
     | 
    
         
            +
            			end
         
     | 
| 
      
 256 
     | 
    
         
            +
            			# Attempt to retrieve the agent parameter
         
     | 
| 
      
 257 
     | 
    
         
            +
            			if args['agent'] != nil then
         
     | 
| 
      
 258 
     | 
    
         
            +
            				_args[:agent] = args['agent']
         
     | 
| 
      
 259 
     | 
    
         
            +
            			end
         
     | 
| 
      
 260 
     | 
    
         
            +
            			# Begin crawl; try to catch exceptions
         
     | 
| 
      
 261 
     | 
    
         
            +
            			begin
         
     | 
| 
      
 262 
     | 
    
         
            +
            				# Print banner
         
     | 
| 
      
 263 
     | 
    
         
            +
            				STDERR.puts
         
     | 
| 
      
 264 
     | 
    
         
            +
            				STDERR.puts " Welcome to the Salamander Web Crawler Demo!"
         
     | 
| 
      
 265 
     | 
    
         
            +
            				STDERR.puts
         
     | 
| 
      
 266 
     | 
    
         
            +
            				STDERR.puts " This is a command-line demonstration of the Salamander Web Crawler in action."
         
     | 
| 
      
 267 
     | 
    
         
            +
            				STDERR.puts " Press Ctrl+C at any time to interrupt the crawl."
         
     | 
| 
      
 268 
     | 
    
         
            +
            				STDERR.puts
         
     | 
| 
      
 269 
     | 
    
         
            +
            				STDERR.puts " Starting crawl at the following URLs:"
         
     | 
| 
      
 270 
     | 
    
         
            +
            				urls.each do |url|
         
     | 
| 
      
 271 
     | 
    
         
            +
            					STDERR.puts "     - #{url}"
         
     | 
| 
      
 272 
     | 
    
         
            +
            				end
         
     | 
| 
      
 273 
     | 
    
         
            +
            				STDERR.puts
         
     | 
| 
      
 274 
     | 
    
         
            +
            				STDERR.print " Depth    URL"
         
     | 
| 
      
 275 
     | 
    
         
            +
            				first = true
         
     | 
| 
      
 276 
     | 
    
         
            +
            				# Do actual crawl
         
     | 
| 
      
 277 
     | 
    
         
            +
            				Salamander::crawl(urls, _args) do |request, response, depth|
         
     | 
| 
      
 278 
     | 
    
         
            +
            					begin
         
     | 
| 
      
 279 
     | 
    
         
            +
            						# Increment crawl count
         
     | 
| 
      
 280 
     | 
    
         
            +
            						count = count + 1
         
     | 
| 
      
 281 
     | 
    
         
            +
            						# Truncate URL string
         
     | 
| 
      
 282 
     | 
    
         
            +
            						if request.length > twid - 2 then
         
     | 
| 
      
 283 
     | 
    
         
            +
            							_url = "#{request[0, twid - 5]}..."
         
     | 
| 
      
 284 
     | 
    
         
            +
            						else
         
     | 
| 
      
 285 
     | 
    
         
            +
            							_url = request
         
     | 
| 
      
 286 
     | 
    
         
            +
            						end
         
     | 
| 
      
 287 
     | 
    
         
            +
            						# Print crawl hit
         
     | 
| 
      
 288 
     | 
    
         
            +
            						if first then
         
     | 
| 
      
 289 
     | 
    
         
            +
            							STDERR.puts
         
     | 
| 
      
 290 
     | 
    
         
            +
            							first = false
         
     | 
| 
      
 291 
     | 
    
         
            +
            						end
         
     | 
| 
      
 292 
     | 
    
         
            +
            						STDERR.puts
         
     | 
| 
      
 293 
     | 
    
         
            +
            						STDERR.print "    #{format('%02d', depth)}    #{_url}"
         
     | 
| 
      
 294 
     | 
    
         
            +
            						STDERR.flush
         
     | 
| 
      
 295 
     | 
    
         
            +
            					rescue Interrupt => e
         
     | 
| 
      
 296 
     | 
    
         
            +
            						# Catch interrupt cleanly
         
     | 
| 
      
 297 
     | 
    
         
            +
            						STDERR.puts
         
     | 
| 
      
 298 
     | 
    
         
            +
            						STDERR.puts
         
     | 
| 
      
 299 
     | 
    
         
            +
            						STDERR.puts " Program terminated successfully"
         
     | 
| 
      
 300 
     | 
    
         
            +
            						STDERR.puts " Number of Pages Crawled: #{count}"
         
     | 
| 
      
 301 
     | 
    
         
            +
            						STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
         
     | 
| 
      
 302 
     | 
    
         
            +
            						break
         
     | 
| 
      
 303 
     | 
    
         
            +
            					end
         
     | 
| 
      
 304 
     | 
    
         
            +
            				end
         
     | 
| 
      
 305 
     | 
    
         
            +
            			rescue => e
         
     | 
| 
      
 306 
     | 
    
         
            +
            				puts JSON.pretty_generate({ result: "exception", message: "error encountered while crawling", inspect: e.inspect })
         
     | 
| 
      
 307 
     | 
    
         
            +
            				exit
         
     | 
| 
      
 308 
     | 
    
         
            +
            			end
         
     | 
| 
      
 309 
     | 
    
         
            +
            		end
         
     | 
| 
      
 310 
     | 
    
         
            +
            	rescue Interrupt => e
         
     | 
| 
      
 311 
     | 
    
         
            +
            		# Catch interrupt cleanly
         
     | 
| 
      
 312 
     | 
    
         
            +
            		STDERR.puts
         
     | 
| 
      
 313 
     | 
    
         
            +
            		STDERR.puts
         
     | 
| 
      
 314 
     | 
    
         
            +
            		STDERR.puts " Program terminated successfully"
         
     | 
| 
      
 315 
     | 
    
         
            +
            		STDERR.puts " Number of Pages Crawled: #{count}"
         
     | 
| 
      
 316 
     | 
    
         
            +
            		STDERR.puts " Running Time: #{Time.at(Time.new - time).gmtime.strftime("%H:%M:%S")}"
         
     | 
| 
      
 317 
     | 
    
         
            +
            	rescue e
         
     | 
| 
      
 318 
     | 
    
         
            +
            		# Print any uncaught exceptions
         
     | 
| 
      
 319 
     | 
    
         
            +
            		puts JSON.pretty_generate({ result: "exception", message: "unknown error", inspect: e.inspect })
         
     | 
| 
      
 320 
     | 
    
         
            +
            		exit
         
     | 
| 
      
 321 
     | 
    
         
            +
            	end
         
     | 
| 
      
 322 
     | 
    
         
            +
            end
         
     | 
    
        data.tar.gz.sig
    ADDED
    
    | 
         Binary file 
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,66 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: slmndr
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version
         
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.0.0
         
     | 
| 
      
 5 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 6 
     | 
    
         
            +
            authors:
         
     | 
| 
      
 7 
     | 
    
         
            +
            - John Lawrence M. Penafiel
         
     | 
| 
      
 8 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 9 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 10 
     | 
    
         
            +
            cert_chain:
         
     | 
| 
      
 11 
     | 
    
         
            +
            - |
         
     | 
| 
      
 12 
     | 
    
         
            +
              -----BEGIN CERTIFICATE-----
         
     | 
| 
      
 13 
     | 
    
         
            +
              MIIDfDCCAmSgAwIBAgIBATANBgkqhkiG9w0BAQUFADBCMRQwEgYDVQQDDAtwZW5h
         
     | 
| 
      
 14 
     | 
    
         
            +
              ZmllbGpsbTEVMBMGCgmSJomT8ixkARkWBWdtYWlsMRMwEQYKCZImiZPyLGQBGRYD
         
     | 
| 
      
 15 
     | 
    
         
            +
              Y29tMB4XDTE1MDYyOTE1MDQ1MVoXDTE2MDYyODE1MDQ1MVowQjEUMBIGA1UEAwwL
         
     | 
| 
      
 16 
     | 
    
         
            +
              cGVuYWZpZWxqbG0xFTATBgoJkiaJk/IsZAEZFgVnbWFpbDETMBEGCgmSJomT8ixk
         
     | 
| 
      
 17 
     | 
    
         
            +
              ARkWA2NvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAPO7rVa+t3P8
         
     | 
| 
      
 18 
     | 
    
         
            +
              pcv9C6k9LjbpVBQhLfHIkIrE98qGrXzzQANhTqRqrLgpCcYw44rMETCg2py7lti2
         
     | 
| 
      
 19 
     | 
    
         
            +
              +g5997njU0IpPVlwLX71S3ddsvezi2KeYm+q2z0DMN5y1MsgcmoRHL8tGlbLw9FC
         
     | 
| 
      
 20 
     | 
    
         
            +
              zq0KtJuIdcyrQQf9c3N/qxNbFyAJ5ZiuXq3bueOndt0zH8waOCdZKz+XZQYDJsTn
         
     | 
| 
      
 21 
     | 
    
         
            +
              Kob8pLZill4ftwP1AOj2wqNG6ElR8NWJW77Il6Lxlh8fVwOxSaYAtRYyQU3C59M+
         
     | 
| 
      
 22 
     | 
    
         
            +
              fEas9aXk12wEumDql2M3W0DNYiM1dXbamPSxFGu3c/BBrdJcEuQnuDAPYteaKj4c
         
     | 
| 
      
 23 
     | 
    
         
            +
              2DXoicDGn+UCAwEAAaN9MHswCQYDVR0TBAIwADALBgNVHQ8EBAMCBLAwHQYDVR0O
         
     | 
| 
      
 24 
     | 
    
         
            +
              BBYEFPMx9v4kXBciiOP03co6iSatkViCMCAGA1UdEQQZMBeBFXBlbmFmaWVsamxt
         
     | 
| 
      
 25 
     | 
    
         
            +
              QGdtYWlsLmNvbTAgBgNVHRIEGTAXgRVwZW5hZmllbGpsbUBnbWFpbC5jb20wDQYJ
         
     | 
| 
      
 26 
     | 
    
         
            +
              KoZIhvcNAQEFBQADggEBADUAbj69LIDpA6rLmCQoAKjZRaBYVweZp4G07Gf838+f
         
     | 
| 
      
 27 
     | 
    
         
            +
              swa/B68PpSjoWmWeErvAahkJhTCNY7SuFinUh6/lDqeqUXIQPQu8PW3Oyfp63u1U
         
     | 
| 
      
 28 
     | 
    
         
            +
              vnM0Opft5VBhaP/dBEYvN2e/F2q62ObsRtzq9hXW9k/EfgVtlmKeeAeH7k2mPeGi
         
     | 
| 
      
 29 
     | 
    
         
            +
              7rxC4nb8yYf55rPGLG52BYSBmDwFIh64JiEmLJi3jEiTEMGB+dVQk++0HSuHDFi1
         
     | 
| 
      
 30 
     | 
    
         
            +
              kX+zehuhNK2jecNBpCmYOdpV/Tf9rA2qQ+TFBx08FfsibhdjbvXI1oN2uv+KBeAi
         
     | 
| 
      
 31 
     | 
    
         
            +
              1ixqHDxvPm+/VQAK6wyHVbo6smzss/cry1yw2JTa6dk=
         
     | 
| 
      
 32 
     | 
    
         
            +
              -----END CERTIFICATE-----
         
     | 
| 
      
 33 
     | 
    
         
            +
            date: 2015-06-29 00:00:00.000000000 Z
         
     | 
| 
      
 34 
     | 
    
         
            +
            dependencies: []
         
     | 
| 
      
 35 
     | 
    
         
            +
            description: A minimalistic ruby web crawling framework
         
     | 
| 
      
 36 
     | 
    
         
            +
            email: penafieljlm@gmail.com
         
     | 
| 
      
 37 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 38 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 39 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 40 
     | 
    
         
            +
            files:
         
     | 
| 
      
 41 
     | 
    
         
            +
            - lib/slmndr.rb
         
     | 
| 
      
 42 
     | 
    
         
            +
            homepage: http://rubygems.org/gems/slmndr
         
     | 
| 
      
 43 
     | 
    
         
            +
            licenses:
         
     | 
| 
      
 44 
     | 
    
         
            +
            - MIT
         
     | 
| 
      
 45 
     | 
    
         
            +
            metadata: {}
         
     | 
| 
      
 46 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 47 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 48 
     | 
    
         
            +
            require_paths:
         
     | 
| 
      
 49 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 50 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 51 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 52 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 53 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 54 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 55 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         
     | 
| 
      
 56 
     | 
    
         
            +
              requirements:
         
     | 
| 
      
 57 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 58 
     | 
    
         
            +
                - !ruby/object:Gem::Version
         
     | 
| 
      
 59 
     | 
    
         
            +
                  version: '0'
         
     | 
| 
      
 60 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 61 
     | 
    
         
            +
            rubyforge_project: 
         
     | 
| 
      
 62 
     | 
    
         
            +
            rubygems_version: 2.4.5
         
     | 
| 
      
 63 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 64 
     | 
    
         
            +
            specification_version: 4
         
     | 
| 
      
 65 
     | 
    
         
            +
            summary: Salamander
         
     | 
| 
      
 66 
     | 
    
         
            +
            test_files: []
         
     | 
    
        metadata.gz.sig
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            �@�v<gu���$�XGR�*�d�Xcca#x��X�S]��9o�-�f))�(`��J����/�~쎆�E�:y����Ǐ|캕�̟8���Hr�	#x/�%���/�IyF�M�8�_m�,�P��p��l��BDM�	�B̠��@���Jfd�'�!A�y��zq�=]�:��x��0,�f ֖��d��n}��Ŭoɯ��G��D�]�b��S�2:�K��L�y6�"�b؎T��9�C��~�pL.
         
     |