RubyGems - crawler-engine - Versions diffs - 0.1.0 - Mend

crawler-engine 0.1.0

Files changed (19) hide show

data/Gemfile +2 -0
data/README.md +25 -0
data/crawler_engine.gemsepc +25 -0
data/generators/crawler_engine_migration/crawler_engine_migration_generator.rb +18 -0
data/generators/crawler_engine_migration/templates/migration.rb +27 -0
data/lib/crawler_engine.rb +10 -0
data/lib/crawler_engine.rb~ +47 -0
data/lib/crawler_parser.rb +72 -0
data/lib/database.yml +8 -0
data/lib/db_adaptor.rb +7 -0
data/lib/generators/crawler_engine_generator.rb +24 -0
data/lib/generators/templates/migration.rb +27 -0
data/lib/generators/templates/post.rb +3 -0
data/lib/generators/templates/source.rb +4 -0
data/lib/post.rb +6 -0
data/lib/source.rb +7 -0
data/lib/source.rb~ +42 -0
data/rakefile +11 -0
metadata +109 -0

data/Gemfile ADDED

	@@ -0,0 +1,2 @@
1	+ source 'http://rubygems.org'
2	+ gemspec

data/README.md ADDED

@@ -0,0 +1,25 @@
+# Crawler Engine
+Crawler engine is the "Swiss Army bulldozer" plugin for
+Ruby on Rails. It allows you to crawl posts from the website.
+## Crawler Engine Features
+Crawler Engine has many advanced features, including custom generators
+and customized crawl links.
+Crawler Engine is compatible with Active Record **3.0** and **3.1**.
+## Rails Quickstart
+gem install crawler-engine
+rails g crawler_engine
+rake db:migrate
+# in your Gemfile
+gem "crawler-engine"
+# TODO:

data/crawler_engine.gemsepc ADDED

@@ -0,0 +1,25 @@
+# encoding: utf-8
+$:.push File.expand_path("../lib", __FILE__)
+require "crawler_engine"
+# Gem specification for crawler-engine 0.1.0.
+Gem::Specification.new do |s|
+  s.name = 'crawler-engine'
+  s.version = '0.1.0'
+  s.has_rdoc = true
+  s.required_ruby_version = ">= 1.9.2"
+  s.platform = "ruby"
+  s.required_rubygems_version = ">= 0"
+  s.author = "tim.tang"
+  s.email = "tang.jilong@139.com"
+  s.summary = "Gem for crawler news post"
+  s.homepage = 'http://www.everyday-cn.com'
+  # Runtime dependencies; rails is declared without a version constraint.
+  s.add_runtime_dependency 'rails'
+  s.add_dependency("nokogiri",">=1.5.0")
+  s.add_dependency("simple-rss",">=1.2.3")
+  s.add_dependency("rest-open-uri",">=1.0.0")
+  # Packages whatever `git ls-files` prints, one path per line.
+  # NOTE(review): that appears to include editor backup files (*.rb~) -- confirm
+  # whether they should ship.
+  s.files =`git ls-files`.split("\n")
+  s.require_paths = ['lib']
+  s.description="Crawler Engine provides function of crawl all news from the customized website"
+end

data/generators/crawler_engine_migration/crawler_engine_migration_generator.rb ADDED

@@ -0,0 +1,18 @@
+# Generator built on Rails::Generator::Base whose manifest records a single
+# migration template action.
+class CrawlerEngineMigrationGenerator < Rails::Generator::Base
+	#source_root File.expand_path('../templates', __FILE__)
+	# Builds the manifest: one migration_template action for 'migration.rb'
+	# targeting 'db/migrate'.
+	def manifest
+		record do |m|
+			# Presumably sets the base name of the generated migration file --
+			# TODO confirm against the Rails::Generator documentation.
+			options = {
+				:migration_file_name => 'crawler_engine_migration'
+			}
+			m.migration_template 'migration.rb', 'db/migrate', options
+		end
+	end
+	# Returns the usage text for this generator.
+	def banner
+		"Usage: Generator crawler engine migration"
+	end
+end

data/generators/crawler_engine_migration/templates/migration.rb ADDED

@@ -0,0 +1,27 @@
+class CreateCrawlerEngine < ActiveRecord::Migration
+	def self.up
+		create_table :sources, :force => true do |t|
+			t.string :site_name
+			t.string :link
+			t.string :filter
+			t.string :category
+			t.datetime :crawled_at # When to run. Could be Time.now for immediately, or sometime in the future.
+		end
+		create_table :posts, :force => true do |t|
+			t.string :site_name
+			t.string :title
+			t.string :source
+			t.content :content
+			t.string :category
+			t.integer :speed
+			t.integer :support_num
+			t.datetime :published_at
+		end
+	end
+	def self.down
+		drop_table :sources
+		drop_table :posts
+	end
+end

data/lib/crawler_engine.rb ADDED

@@ -0,0 +1,10 @@
+# encoding:utf-8
+require File.expand_path('../crawler_parser',__FILE__)
+require File.expand_path('../source',__FILE__)
+# Entry-point class. At present it only instantiates a CrawlerParser when the
+# class body is evaluated; the crawl invocation below is commented out.
+class CrawlerEngine
+	# Assigned in the class body, so this is an instance variable of the class
+	# object itself, set once when the file is loaded.
+	@cp = CrawlerParser.new
+	#@sources = Source.find(:all)
+	#@sources = Source.where(:id=>1)
+	#@cp.parse_rss(@sources)
+end

data/lib/crawler_engine.rb~ ADDED

@@ -0,0 +1,47 @@
+#encoding:utf-8
+require File.expand_path('../crawler_parser',__FILE__)
+# Earlier draft of the entry-point class: builds a parser and a feed-URL =>
+# XPath-filter hash at load time; the parse call itself is commented out.
+class CrawlerEngine
+	@cp = CrawlerParser.new
+	# Maps an RSS feed URL to the XPath used to extract article content.
+	links = {
+		#financial time chinese
+		"http://www.ftchinese.com/rss/feed"=>'//div[@class="content"]',
+		#
+	}
+	#@cp.parse_rss(links)
+end
+# ================= SINA.com =======================
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+# # ================= 163.com =======================
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ # ================= ifeng.com =====================
+ # All ifeng feeds share one article container id ("artical_real"); the
+ # society feed previously used a misspelled id that matched nothing.
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/world.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/society.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/mainland.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+# # ================ financial times china ==========
+ Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
+ Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
+# # ================ dong fang daily ================
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+#
+# # =============== engadget china ==================
+ Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')

data/lib/crawler_parser.rb ADDED

@@ -0,0 +1,72 @@
+#encoding:utf-8
+require File.expand_path('../post',__FILE__)
+require 'nokogiri'
+require 'rest-open-uri'
+require 'simple-rss'
+require 'iconv'
+require 'logger'
+class CrawlerParser
+	#logger = Logger.new('/tmp/crawlerEngine.log', 'daily')
+	#logger.level = Logger::INFO
+	FS_LEN = 80
+	def parse_rss(sources)
+		return if sources.nil? or sources.size.eql?(0)
+		#logger.info("Links to crawl >>"+links.to_s)
+		puts "Links to crawl >>"+sources.to_s
+		sources.uniq!
+		sources.each do |source|
+			begin
+				rss = SimpleRSS.parse open(source.link)
+			rescue Exception=>ex
+				#logger.error(ex)
+				#logger.info("SimpleRSS got unexpected error, rss exit")
+				puts ex
+			end
+			#TODO:
+			#puts rss.feed_tags.title
+			#puts rss.feed_tags.description
+			for item in rss.items
+				print_rss_item(item)
+				#parse post details
+				begin
+					doc = Nokogiri::HTML open(item.link.to_s)
+				rescue Exception=>ex
+					#logger.error(ex)
+					#logger.info("Nokogiri got unexpected error")
+					puts ex
+				end
+				return unless doc
+				doc.xpath(source.filter).each do |content|
+					#puts content.to_s.force_encoding('GB2312')
+					#Iconv.iconv("GB2312//IGNORE","UTF-8//IGNORE", content)
+					#Iconv.iconv("UTF-8//IGNORE","GB2312//IGNORE", content)
+					Post.create(
+						:title=>item.title.to_s.force_encoding('UTF-8'),
+						:source=>item.link.to_s.force_encoding('UTF-8'),
+						:content=>content.to_s.force_encoding('UTF-8'),
+						:published_at=>item.pubDate,
+						:site_name=>source.site_name,
+						:category => source.category)
+				end
+			end
+			#update crawler exec timestamp
+			source.update_attribute("crawled_at", Time.now)
+		end
+	end
+	private
+	def print_rss_item(item)
+		puts "-" * FS_LEN
+		puts "title:" + item.title.to_s
+		puts "author:" + item.author.to_s
+		puts "description:" + item.description.to_s
+		#puts Iconv.iconv("UTF-8//IGNORE","GB2312//IGNORE",item.description.to_s )
+		puts "link:" + item.link.to_s
+		puts "pubDate:" + item.pubDate.to_s
+		puts "guid:" + item.guid.to_s
+		puts "category:" + item.category.to_s
+	end
+end

data/lib/database.yml ADDED

@@ -0,0 +1,8 @@
+adapter: mysql2
+encoding: utf8
+reconnect: false
+database: crawler_engine
+pool: 5
+username: root
+password: root
+socket: /var/run/mysqld/mysqld.sock

data/lib/db_adaptor.rb ADDED

@@ -0,0 +1,7 @@
+#encoding:utf-8
+require 'rubygems'
+require 'active_record'
+require 'logger'
+# Load connection settings from the database.yml that sits next to this file.
+# YAML.load_file opens and closes the file itself, so no handle is leaked.
+dbconfig = YAML.load_file(File.expand_path('../database.yml', __FILE__))
+# ActiveRecord log output goes to a fixed file under /tmp.
+ActiveRecord::Base.logger = Logger.new('/tmp/crawler_engine.log')
+ActiveRecord::Base.establish_connection(dbconfig)

data/lib/generators/crawler_engine_generator.rb ADDED

@@ -0,0 +1,24 @@
+require 'rails/generators'
+require 'rails/generators/migration'
+require 'rails/generators/active_record'
+# Rails generator that copies the crawler engine migration and the Post and
+# Source model files into the host application.
+class CrawlerEngineGenerator < Rails::Generators::Base
+	include Rails::Generators::Migration
+	extend ActiveRecord::Generators::Migration
+	desc "Generates migration for crawler engine model"
+	# Directory holding the templates, resolved relative to this file.
+	def self.source_root
+		File.expand_path('../templates', __FILE__)
+	end
+	# Renders templates/migration.rb to db/migrate/create_crawler_engine.
+	def create_migration_file
+		migration_template 'migration.rb', 'db/migrate/create_crawler_engine'
+	end
+	# Renders the two model templates into app/models.
+	# NOTE(review): the method name looks like a typo for create_model_file;
+	# left as is because renaming a public generator method changes the interface.
+	def crate_model_file
+		template 'post.rb', 'app/models/post.rb'
+		template 'source.rb', 'app/models/source.rb'
+	end
+end

data/lib/generators/templates/migration.rb ADDED

@@ -0,0 +1,27 @@
+# Creates the `sources` and `posts` tables; `down` drops both.
+class CreateCrawlerEngine < ActiveRecord::Migration
+	def self.up
+		# NOTE(review): :force => true replaces an existing table of the same name --
+		# confirm this is acceptable for host apps that already have these tables.
+		create_table :sources, :force => true do |t|
+			t.string :site_name
+			t.string :link
+			t.string :filter
+			t.string :category
+			t.datetime :crawled_at # When to run. Could be Time.now for immediately, or sometime in the future.
+		end
+		create_table :posts, :force => true do |t|
+			t.string :site_name
+			t.string :title
+			t.string :source
+			t.text :content
+			t.string :category
+			t.integer :speed
+			t.integer :support_num
+			t.datetime :published_at
+		end
+	end
+	def self.down
+		drop_table :sources
+		drop_table :posts
+	end
+end

data/lib/generators/templates/post.rb ADDED

@@ -0,0 +1,3 @@
+# Model template copied to app/models/post.rb by the generator.
+class Post < ActiveRecord::Base
+end

data/lib/generators/templates/source.rb ADDED

@@ -0,0 +1,4 @@
+# Model template copied to app/models/source.rb by the generator.
+class Source < ActiveRecord::Base
+end

data/lib/post.rb ADDED

@@ -0,0 +1,6 @@
+#encoding:utf-8
+require File.expand_path('../db_adaptor',__FILE__)
+class Post < ActiveRecord::Base
+	  set_table_name "posts"
+end

data/lib/source.rb ADDED

@@ -0,0 +1,7 @@
+#encoding:utf-8
+require File.expand_path('../db_adaptor',__FILE__)
+class Source < ActiveRecord::Base
+	set_table_name "sources"
+end

data/lib/source.rb~ ADDED

@@ -0,0 +1,42 @@
+#encoding:utf-8
+require File.expand_path('../db_adaptor',__FILE__)
+class Source < ActiveRecord::Base
+	  set_table_name "sources"
+end
+# ================= SINA.com =======================
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+ Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
+# # ================= 163.com =======================
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
+ # ================= ifeng.com =====================
+ # All ifeng feeds share one article container id ("artical_real"); the
+ # society feed previously used a misspelled id that matched nothing.
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/world.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/society.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+ Source.create(:site_name=>'凤凰网', :link=>'http://news.ifeng.com/rss/mainland.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
+# # ================ financial times china ==========
+ Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
+ Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
+# # ================ dong fang daily ================
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+ Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
+#
+# # =============== engadget china ==================
+ Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')

data/rakefile ADDED

@@ -0,0 +1,11 @@
+require File.expand_path('../../source',__FILE__)
+desc "Initial crawler engine database"
+task :crawler_setup do
+	#puts Source.count
+#	if File.exists?(Dir.pwd + "/seeds.rb")
+#		if Rake.application.lookup('db:seed')
+#			Rake::Task['db:seed -l'+Dir.pwd + "/seeds.rb"].invoke
+#		end
+#	end
+	puts 'finished crawler engine db setup.'
+end

metadata ADDED

@@ -0,0 +1,109 @@
+--- !ruby/object:Gem::Specification
+name: crawler-engine
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- tim.tang
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-11-23 00:00:00.000000000 +08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rails
+  requirement: &90417220 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *90417220
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: &90416950 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+  type: :runtime
+  prerelease: false
+  version_requirements: *90416950
+- !ruby/object:Gem::Dependency
+  name: simple-rss
+  requirement: &90416700 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.2.3
+  type: :runtime
+  prerelease: false
+  version_requirements: *90416700
+- !ruby/object:Gem::Dependency
+  name: rest-open-uri
+  requirement: &90416470 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: *90416470
+description: Crawler Engine provides function of crawl all news from the customized
+  website
+email: tang.jilong@139.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- Gemfile
+- README.md
+- crawler_engine.gemsepc
+- generators/crawler_engine_migration/crawler_engine_migration_generator.rb
+- generators/crawler_engine_migration/templates/migration.rb
+- lib/crawler_engine.rb
+- lib/crawler_engine.rb~
+- lib/crawler_parser.rb
+- lib/database.yml
+- lib/db_adaptor.rb
+- lib/generators/crawler_engine_generator.rb
+- lib/generators/templates/migration.rb
+- lib/generators/templates/post.rb
+- lib/generators/templates/source.rb
+- lib/post.rb
+- lib/source.rb
+- lib/source.rb~
+- rakefile
+has_rdoc: true
+homepage: http://www.everyday-cn.com
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: 1.9.2
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: Gem for crawler news post
+test_files: []