resay_crawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/crawler +6 -0
- data/lib/command_line_argument_parser.rb +58 -0
- data/lib/resay_crawler.rb +38 -0
- data/lib/spider.rb +108 -0
- data/lib/url_store.rb +24 -0
- data/lib/url_utils.rb +58 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 419d0b9ff169be3f5addb5e5198ac0c4833d64e4
+  data.tar.gz: 3d4139ed449b9228bb7189f127857dc80eb10b30
+SHA512:
+  metadata.gz: 779d114a34c9617d12fe364ad1f179ff9577586f668d1df091b50114082bcf252d2592f7dc7a7f5d5b6ae8d43fbe8c75a943a65df2c20ae59f095b1837b7b3d8
+  data.tar.gz: 0fabca670ba4707d801146af55c473312d8137a9914ad13387802412fe2ef98d35854d5d54cdedf4f38798f5556edaeae3bde9d6274dd1376dcbd1ae390bd6f2
data/bin/crawler
ADDED
data/lib/command_line_argument_parser.rb
ADDED
@@ -0,0 +1,58 @@
+require 'getoptlong'
+
+class CommandLineArgumentParser
+  WEB_CRAWLER = 'web'
+  DOMAIN_CRAWLER = 'domain'
+  attr_reader :crawl_type, :crawl_depth, :page_limit, :url_file
+
+  def initialize
+    unless ARGV.length >= 1
+      display_usage
+      exit
+    end
+
+    # command-line option definitions
+    @opts = GetoptLong.new(
+      ["--crawl", "-c", GetoptLong::REQUIRED_ARGUMENT],
+      ["--crawl-depth", "-d", GetoptLong::OPTIONAL_ARGUMENT],
+      ["--page-limit", "-p", GetoptLong::OPTIONAL_ARGUMENT],
+      ["--url-file", "-f", GetoptLong::OPTIONAL_ARGUMENT]
+    )
+    @crawl_type = WEB_CRAWLER # default crawl type, as promised by display_usage
+    @crawl_depth = 3
+    @page_limit = 100
+    @url_file = 'urls.txt'
+  end
+
+  def display_usage
+    p "Sample usage:"
+    p "ruby resay_crawler.rb -c web -d 3 -p 100 -f 'urls.txt'"
+    p "-c must be either 'web' or 'domain'; any other value defaults to 'web'"
+  end
+
+  def parse_arguments
+    @opts.each do |opt, arg|
+      case opt
+      when '--crawl'
+        ensure_crawl_type_correct(arg)
+      when '--crawl-depth'
+        @crawl_depth = arg.to_i
+      when '--page-limit'
+        @page_limit = arg.to_i
+      when '--url-file'
+        @url_file = arg
+      end
+    end
+  end
+
+  def ensure_crawl_type_correct(value)
+    if value != WEB_CRAWLER && value != DOMAIN_CRAWLER
+      @crawl_type = WEB_CRAWLER
+    else
+      @crawl_type = value
+    end
+  end
+
+
+
+end
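A minimal usage sketch, not part of the package: CommandLineArgumentParser reads its options straight from ARGV via GetoptLong, so a driver script only has to instantiate it, call parse_arguments, and read the accessors. The script name below is hypothetical.

# e.g.  ruby parse_demo.rb --crawl domain --page-limit 50
require 'command_line_argument_parser'

parser = CommandLineArgumentParser.new   # prints usage and exits if no arguments were given
parser.parse_arguments                   # consumes --crawl / --crawl-depth / --page-limit / --url-file

puts "type:  #{parser.crawl_type}"       # 'web' or 'domain'
puts "depth: #{parser.crawl_depth}"      # defaults to 3
puts "limit: #{parser.page_limit}"       # defaults to 100
puts "file:  #{parser.url_file}"         # defaults to 'urls.txt'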
data/lib/resay_crawler.rb
ADDED
@@ -0,0 +1,38 @@
+require 'spider'
+require 'command_line_argument_parser'
+require 'url_store'
+
+class ResayCrawler
+
+  def initialize
+    ## parse the console arguments (-c, -d, -p, -f)
+    @argument_parser = CommandLineArgumentParser.new
+    @argument_parser.parse_arguments
+
+    ## spider
+    @spider = Spider.new
+
+    ## read the seed URLs
+    @url_store = UrlStore.new(@argument_parser.url_file)
+  end
+
+
+  def crawl
+    ## dispatch on the requested crawl type
+    if @argument_parser.crawl_type == CommandLineArgumentParser::WEB_CRAWLER
+      @spider.crawl_web(
+        @url_store.get_urls,
+        @argument_parser.crawl_depth,
+        @argument_parser.page_limit
+      )
+    else
+      @spider.crawl_domain(
+        @url_store.get_url,
+        @argument_parser.page_limit
+      )
+    end
+  end
+
+end
+
+
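data/bin/crawler (6 lines, contents not shown above) presumably just wires ResayCrawler up. A hedged sketch of such an entry point, an assumption rather than the actual file:

#!/usr/bin/env ruby
# hypothetical entry point -- the real data/bin/crawler is not reproduced in this diff
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

require 'resay_crawler'

# e.g.  crawler --crawl web --crawl-depth 2 --page-limit 20 --url-file urls.txt
ResayCrawler.new.crawl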
data/lib/spider.rb
ADDED
@@ -0,0 +1,108 @@
+require 'net/http'
+require 'uri'
+require 'open-uri'
+require 'rubygems'
+# require 'hpricot'
+require 'nokogiri'
+require 'url_utils'
+
+class Spider
+
+  # mix in the URL helper methods
+  include UrlUtils
+
+  def initialize
+    @already_visited = {}
+  end
+
+  # crawl outward across the web, level by level
+  def crawl_web(urls, depth = 2, page_limit = 100)
+    depth.times do # one pass per depth level
+      next_urls = []
+
+      # visit the current batch of URLs
+      urls.each do |url|
+        url_object = open_url(url)
+        next if url_object.nil?
+
+        # use the URL after any redirect
+        url = update_url_if_redirected(url_object)
+
+        # parse the page
+        parsed_doc = parse_url(url_object)
+        next if parsed_doc.nil?
+
+        # mark this URL as visited
+        @already_visited[url] = true if @already_visited[url].nil?
+        # [loop termination] stop once page_limit pages have been visited
+        return if @already_visited.size == page_limit
+
+        # collect new URLs found on the page, excluding ones already visited
+        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
+        # drop duplicates
+        next_urls.uniq!
+      end
+      # the next level crawls the URLs collected in this pass
+      urls = next_urls
+    end
+
+  end
+
+  # crawl a single domain recursively
+  def crawl_domain(url, page_limit = 100)
+    # [recursion termination]
+    return if @already_visited.size == page_limit
+
+    url_object = open_url(url)
+    return if url_object.nil?
+
+    parsed_doc = parse_url(url_object)
+    return if parsed_doc.nil?
+
+    @already_visited[url] = true if @already_visited[url].nil?
+    page_urls = find_urls_on_page(parsed_doc, url)
+    page_urls.each do |page_url|
+      # same domain and not yet visited
+      if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
+        # recurse, carrying the page limit along
+        crawl_domain(page_url, page_limit)
+      end
+    end
+
+  end
+
+  private
+
+  # wrap open(url) so failures are reported instead of raised
+  def open_url(url)
+    open(url)
+  rescue
+    puts "Unable to open url: " + url
+  end
+
+  def update_url_if_redirected(url_object)
+    url_object.base_uri.to_s
+  end
+
+  # parse the fetched document into a DOM tree
+  def parse_url(url_object)
+    # doc = Hpricot(url_object) # replaced by Nokogiri
+    doc = Nokogiri(url_object)
+    puts 'Crawling url ' + url_object.base_uri.to_s
+    doc
+  rescue
+    puts 'Could not parse url: ' + url_object.base_uri.to_s
+  end
+
+  def find_urls_on_page(parsed_doc, current_url)
+    parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
+      new_url = x['href'].split('#')[0]
+      if new_url
+        # resolve relative links against the current URL
+        new_url = make_absolute(current_url, new_url) if relative?(new_url)
+        urls_list.push(new_url)
+      end
+    end
+  end
+
+end
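A minimal sketch of driving Spider directly, not part of the gem; the seed URL is illustrative:

require 'spider'

spider = Spider.new

# follow links outward from the seed list, 2 levels deep, visiting at most 10 pages
spider.crawl_web(['http://example.com/'], 2, 10)

# or: recursively follow only links that stay on the seed's domain
spider.crawl_domain('http://example.com/', 10)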
data/lib/url_store.rb
ADDED
@@ -0,0 +1,24 @@
+class UrlStore
+  attr_reader :urls
+  alias :get_urls :urls
+  def initialize(url_file)
+    @urls = read_urls_from_file(url_file)
+  end
+
+  def get_url
+    @urls[0]
+  end
+
+  def read_urls_from_file(url_file)
+    urls = []
+    File.open(url_file, 'r') do |file|
+      file.readlines.each do |line|
+        urls.push(line.chomp)
+      end
+    end
+    urls
+  end
+
+  private :read_urls_from_file
+
+end
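UrlStore just reads one URL per line from the given file. A small sketch, with a hypothetical urls.txt:

# urls.txt contains one URL per line, e.g.
#   http://example.com/
#   http://example.org/
require 'url_store'

store = UrlStore.new('urls.txt')
p store.get_urls   # => all lines of the file, chomped, as an array (used by crawl_web)
p store.get_url    # => only the first entry (used by crawl_domain)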
data/lib/url_utils.rb
ADDED
@@ -0,0 +1,58 @@
+module UrlUtils
+  def relative?(url)
+    !url.match(/^http/)
+  end
+
+  def make_absolute(potential_base, relative_url)
+    if relative_url.match(/^\//)
+      create_absolute_url_from_base(potential_base, relative_url)
+    else
+      create_absolute_url_from_context(potential_base, relative_url)
+    end
+  end
+
+  def urls_on_same_domain?(url1, url2)
+    get_domain(url1) == get_domain(url2)
+  end
+
+  def get_domain(url)
+    remove_extra_paths(url)
+  end
+
+  private
+
+  def create_absolute_url_from_base(potential_base, relative_url)
+    remove_extra_paths(potential_base) + relative_url
+  end
+
+  def remove_extra_paths(potential_base)
+    index_to_start_slash_search = potential_base.index('://') + 3
+    index_of_first_relevant_slash = potential_base.index('/', index_to_start_slash_search)
+    if index_of_first_relevant_slash != nil
+      potential_base[0, index_of_first_relevant_slash]
+    else
+      potential_base
+    end
+  end
+
+  def create_absolute_url_from_context(potential_base, relative_url)
+    absolute_url = nil
+    if potential_base.match(/\/$/)
+      absolute_url = potential_base + relative_url
+    else
+      last_index_of_slash = potential_base.rindex('/')
+      if potential_base[last_index_of_slash - 2, 2] == ':/'
+        absolute_url = potential_base + '/' + relative_url
+      else
+        last_index_of_dot = potential_base.rindex('.')
+        if last_index_of_dot < last_index_of_slash
+          absolute_url = potential_base + '/' + relative_url
+        else
+          absolute_url = potential_base[0, last_index_of_slash + 1] + relative_url
+        end
+      end
+    end
+    absolute_url
+  end
+
+end
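UrlUtils resolves links with plain string handling rather than URI.join. A short sketch of how the public helpers behave, using illustrative URLs:

require 'url_utils'

include UrlUtils

p relative?('/about')                                      # => true  (no leading http)
p relative?('http://example.com/a')                        # => false

# a leading slash resolves against the bare domain...
p make_absolute('http://example.com/blog/post', '/about')
# => "http://example.com/about"

# ...anything else resolves against the current "directory"
p make_absolute('http://example.com/blog/post.html', 'next.html')
# => "http://example.com/blog/next.html"

p urls_on_same_domain?('http://example.com/a', 'http://example.com/b')
# => true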
metadata
ADDED
@@ -0,0 +1,64 @@
+--- !ruby/object:Gem::Specification
+name: resay_crawler
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Resay tao
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-05-23 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+description: A simple web crawler gem
+email: sinotao1@gmail.com
+executables:
+- crawler
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/crawler
+- lib/command_line_argument_parser.rb
+- lib/resay_crawler.rb
+- lib/spider.rb
+- lib/url_store.rb
+- lib/url_utils.rb
+homepage: http://rubygems.org/gems/resay_crawler
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.6
+signing_key:
+specification_version: 4
+summary: My first gem
+test_files: []
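The gemspec pins a single runtime dependency (nokogiri ~> 1.6) and installs the crawler executable from bin/. A Gemfile sketch for pulling in this exact release, assuming it is fetched from rubygems.org:

# Gemfile
source 'https://rubygems.org'

gem 'resay_crawler', '0.0.1'   # pulls in nokogiri ~> 1.6 as its only runtime dependency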