crawler-engine 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/README.md +25 -0
- data/crawler_engine.gemsepc +25 -0
- data/generators/crawler_engine_migration/crawler_engine_migration_generator.rb +18 -0
- data/generators/crawler_engine_migration/templates/migration.rb +27 -0
- data/lib/crawler_engine.rb +10 -0
- data/lib/crawler_engine.rb~ +47 -0
- data/lib/crawler_parser.rb +72 -0
- data/lib/database.yml +8 -0
- data/lib/db_adaptor.rb +7 -0
- data/lib/generators/crawler_engine_generator.rb +24 -0
- data/lib/generators/templates/migration.rb +27 -0
- data/lib/generators/templates/post.rb +3 -0
- data/lib/generators/templates/source.rb +4 -0
- data/lib/post.rb +6 -0
- data/lib/source.rb +7 -0
- data/lib/source.rb~ +42 -0
- data/rakefile +11 -0
- metadata +109 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Crawler Engine
|
2
|
+
|
3
|
+
|
4
|
+
Crawler engine is the "Swiss Army bulldozer" plugin for
|
5
|
+
Ruby on Rails. It allows you to crawl posts from the website.
|
6
|
+
|
7
|
+
## Crawler Engine Features
|
8
|
+
|
9
|
+
Crawler Engine has many advanced features, including custom generators and
|
10
|
+
customized crawl links.
|
11
|
+
|
12
|
+
Crawler Engine is compatible with Active Record **3.0** and **3.1**.
|
13
|
+
|
14
|
+
## Rails Quickstart
|
15
|
+
|
16
|
+
gem install crawler-engine
|
17
|
+
|
18
|
+
rails g crawler_engine
|
19
|
+
|
20
|
+
rake db:migrate
|
21
|
+
|
22
|
+
# in your Gemfile
|
23
|
+
gem "crawler-engine"
|
24
|
+
|
25
|
+
# TODO:
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "crawler_engine"
|
4
|
+
|
5
|
+
# Gem specification for crawler-engine: identity, runtime dependencies and
# the list of files packaged into the gem.
Gem::Specification.new do |s|
  s.name        = 'crawler-engine'
  s.version     = '0.1.0'
  s.platform    = "ruby"
  s.author      = "tim.tang"
  s.email       = "tang.jilong@139.com"
  s.homepage    = 'http://www.everyday-cn.com'
  s.summary     = "Gem for crawler news post"
  s.description = "Crawler Engine provides function of crawl all news from the customized website"

  # NOTE(review): has_rdoc= is deprecated in later RubyGems releases; guard the
  # call so the spec still loads if the setter is absent.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)

  s.required_ruby_version     = ">= 1.9.2"
  s.required_rubygems_version = ">= 0"

  s.add_runtime_dependency 'rails'
  s.add_dependency("nokogiri", ">=1.5.0")
  s.add_dependency("simple-rss", ">=1.2.3")
  s.add_dependency("rest-open-uri", ">=1.0.0")

  # Package everything tracked by git except editor backup files ("*~"),
  # which are not required by any library file.
  s.files         = `git ls-files`.split("\n").reject { |path| path.end_with?('~') }
  s.require_paths = ['lib']
end
|
24
|
+
|
25
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Generator built on Rails::Generator::Base that records a single action:
# rendering the migration.rb template into db/migrate.
class CrawlerEngineMigrationGenerator < Rails::Generator::Base
  #source_root File.expand_path('../templates', __FILE__)

  # Records the migration template; the generated file is named
  # "crawler_engine_migration".
  def manifest
    record do |actions|
      actions.migration_template(
        'migration.rb',
        'db/migrate',
        { :migration_file_name => 'crawler_engine_migration' }
      )
    end
  end

  # Usage text for this generator.
  def banner
    "Usage: Generator crawler engine migration"
  end
end
|
17
|
+
|
18
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Migration that creates the two tables used by the crawler:
# `sources` (feeds to crawl) and `posts` (crawled articles).
class CreateCrawlerEngine < ActiveRecord::Migration
  # Creates both tables. :force => true drops an existing table of the same
  # name before creating it.
  def self.up
    create_table :sources, :force => true do |t|
      t.string   :site_name
      t.string   :link
      t.string   :filter
      t.string   :category
      t.datetime :crawled_at # Set by the crawler after it has processed the source.
    end

    create_table :posts, :force => true do |t|
      t.string   :site_name
      t.string   :title
      t.string   :source
      # Was `t.content`, which is not a column type; the article body is
      # stored as text.
      t.text     :content
      t.string   :category
      t.integer  :speed
      t.integer  :support_num
      t.datetime :published_at
    end
  end

  # Drops both tables.
  def self.down
    drop_table :sources
    drop_table :posts
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# encoding:utf-8
|
2
|
+
require File.expand_path('../crawler_parser',__FILE__)
|
3
|
+
require File.expand_path('../source',__FILE__)
|
4
|
+
|
5
|
+
class CrawlerEngine
|
6
|
+
@cp = CrawlerParser.new
|
7
|
+
#@sources = Source.find(:all)
|
8
|
+
#@sources = Source.where(:id=>1)
|
9
|
+
#@cp.parse_rss(@sources)
|
10
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../crawler_parser',__FILE__)
|
3
|
+
class CrawlerEngine
|
4
|
+
@cp = CrawlerParser.new
|
5
|
+
links = {
|
6
|
+
#financial time chinese
|
7
|
+
"http://www.ftchinese.com/rss/feed"=>'//div[@class="content"]',
|
8
|
+
#
|
9
|
+
}
|
10
|
+
#@cp.parse_rss(links)
|
11
|
+
end
|
12
|
+
|
13
|
+
# ================= SINA.com =======================
|
14
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
15
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
16
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
17
|
+
|
18
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
19
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
20
|
+
|
21
|
+
Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
22
|
+
|
23
|
+
# # ================= 163.com =======================
|
24
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
25
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
26
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
27
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
28
|
+
|
29
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
30
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
31
|
+
|
32
|
+
# ================= ifeng.com =====================
|
33
|
+
# The three ifeng.com feeds share one site name, category and content filter.
# NOTE(review): the society feed previously used the filter id "articial_real",
# which disagrees with the other two ifeng feeds ("artical_real") and would
# match nothing; confirm the id against the live page.
%w[world society mainland].each do |feed|
  Source.create(:site_name=>'凤凰网', :link=>"http://news.ifeng.com/rss/#{feed}.xml", :category=>'时政&社会', :crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
end
|
36
|
+
|
37
|
+
# # ================ financial times china ==========
|
38
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
39
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
40
|
+
|
41
|
+
# # ================ dong fang daily ================
|
42
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
43
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
44
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
45
|
+
#
|
46
|
+
# # =============== engadget china ==================
|
47
|
+
Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../post',__FILE__)
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'rest-open-uri'
|
5
|
+
require 'simple-rss'
|
6
|
+
require 'iconv'
|
7
|
+
require 'logger'
|
8
|
+
|
9
|
+
# Crawls RSS feeds and stores the article body of every feed item as a Post.
class CrawlerParser
  #logger = Logger.new('/tmp/crawlerEngine.log', 'daily')
  #logger.level = Logger::INFO

  # Width of the separator line printed before each RSS item.
  FS_LEN = 80

  # Crawls every given source once.
  #
  # For each distinct source the feed at source.link is parsed; for each feed
  # item the linked page is fetched, the nodes matching source.filter (an
  # XPath expression) are extracted, and one Post is created per node.
  # Afterwards the source's crawled_at is set to the current time.
  #
  # A feed or page that cannot be fetched or parsed is reported on stdout and
  # skipped; the remaining items and sources are still processed. A source
  # whose feed failed does not get its crawled_at updated.
  #
  # sources - collection of objects responding to link, filter, site_name,
  #           category and update_attribute. nil or empty is a no-op. The
  #           collection itself is not modified.
  def parse_rss(sources)
    return if sources.nil? || sources.empty?
    #logger.info("Links to crawl >>"+links.to_s)
    puts "Links to crawl >>"+sources.to_s

    sources.to_a.uniq.each do |source|
      rss = fetch_feed(source.link)
      next unless rss
      #TODO:
      #puts rss.feed_tags.title
      #puts rss.feed_tags.description

      rss.items.each do |item|
        print_rss_item(item)

        doc = fetch_document(item.link.to_s)
        # Skip only this item; one unreachable page must not end the crawl.
        next unless doc

        doc.xpath(source.filter).each do |content|
          Post.create(
            :title        => item.title.to_s.force_encoding('UTF-8'),
            :source       => item.link.to_s.force_encoding('UTF-8'),
            :content      => content.to_s.force_encoding('UTF-8'),
            :published_at => item.pubDate,
            :site_name    => source.site_name,
            :category     => source.category)
        end
      end

      # update crawler exec timestamp
      source.update_attribute("crawled_at", Time.now)
    end
  end

  private

  # Downloads and parses the RSS feed at url.
  # Returns the parsed feed, or nil (after printing the error) on failure.
  def fetch_feed(url)
    # Block form so the handle is closed once the feed has been parsed.
    open(url) { |io| SimpleRSS.parse(io) }
  rescue StandardError => ex
    #logger.error(ex)
    puts ex
    nil
  end

  # Downloads the page at url and parses it as HTML.
  # Returns the parsed document, or nil (after printing the error) on failure.
  def fetch_document(url)
    open(url) { |io| Nokogiri::HTML(io) }
  rescue StandardError => ex
    #logger.error(ex)
    puts ex
    nil
  end

  # Prints the fields of one RSS item to stdout, preceded by a separator line.
  def print_rss_item(item)
    puts "-" * FS_LEN
    puts "title:" + item.title.to_s
    puts "author:" + item.author.to_s
    puts "description:" + item.description.to_s
    #puts Iconv.iconv("UTF-8//IGNORE","GB2312//IGNORE",item.description.to_s )
    puts "link:" + item.link.to_s
    puts "pubDate:" + item.pubDate.to_s
    puts "guid:" + item.guid.to_s
    puts "category:" + item.category.to_s
  end
end
|
data/lib/database.yml
ADDED
data/lib/db_adaptor.rb
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'active_record'
|
4
|
+
require 'logger'
|
5
|
+
# Read the connection settings from the database.yml next to this file.
# YAML.load_file opens and closes the file itself, so no File handle is left
# open (the previous File.open was never closed).
dbconfig = YAML.load_file(File.expand_path('../database.yml', __FILE__))
# Send ActiveRecord's log output to a file under /tmp.
ActiveRecord::Base.logger = Logger.new('/tmp/crawler_engine.log')
ActiveRecord::Base.establish_connection(dbconfig)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'rails/generators'
|
2
|
+
require 'rails/generators/migration'
|
3
|
+
require 'rails/generators/active_record'
|
4
|
+
|
5
|
+
# Generator that renders the crawler engine migration and the Post and Source
# model files from the templates directory next to this file.
class CrawlerEngineGenerator < Rails::Generators::Base
  include Rails::Generators::Migration
  extend ActiveRecord::Generators::Migration

  desc "Generates migration for crawler engine model"

  class << self
    # Directory containing the templates used by this generator.
    def source_root
      File.expand_path('../templates', __FILE__)
    end
  end

  # Renders the migration template to db/migrate/create_crawler_engine.
  def create_migration_file
    migration_template 'migration.rb', 'db/migrate/create_crawler_engine'
  end

  # Renders the model templates into app/models (post first, then source).
  def crate_model_file
    %w[post source].each do |model|
      template "#{model}.rb", "app/models/#{model}.rb"
    end
  end
end
|
23
|
+
|
24
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Migration template that creates the `sources` and `posts` tables.
class CreateCrawlerEngine < ActiveRecord::Migration
  class << self
    # Creates both tables; :force => true replaces an existing table of the
    # same name.
    def up
      create_table :sources, :force => true do |table|
        table.string   :site_name, :link, :filter, :category
        table.datetime :crawled_at # Set by the crawler after it has processed the source.
      end

      create_table :posts, :force => true do |table|
        table.string   :site_name, :title, :source
        table.text     :content
        table.string   :category
        table.integer  :speed, :support_num
        table.datetime :published_at
      end
    end

    # Drops both tables.
    def down
      drop_table :sources
      drop_table :posts
    end
  end
end
|
data/lib/post.rb
ADDED
data/lib/source.rb
ADDED
data/lib/source.rb~
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../db_adaptor',__FILE__)
|
3
|
+
|
4
|
+
class Source < ActiveRecord::Base
|
5
|
+
set_table_name "sources"
|
6
|
+
end
|
7
|
+
|
8
|
+
# ================= SINA.com =======================
|
9
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
10
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
11
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
12
|
+
|
13
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
14
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
15
|
+
|
16
|
+
Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
17
|
+
|
18
|
+
# # ================= 163.com =======================
|
19
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
20
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
21
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
22
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
23
|
+
|
24
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
25
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
26
|
+
|
27
|
+
# ================= ifeng.com =====================
|
28
|
+
# The three ifeng.com feeds share one site name, category and content filter.
# NOTE(review): the society feed previously used the filter id "articial_real",
# which disagrees with the other two ifeng feeds ("artical_real") and would
# match nothing; confirm the id against the live page.
%w[world society mainland].each do |feed|
  Source.create(:site_name=>'凤凰网', :link=>"http://news.ifeng.com/rss/#{feed}.xml", :category=>'时政&社会', :crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
end
|
31
|
+
|
32
|
+
# # ================ financial times china ==========
|
33
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
34
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
35
|
+
|
36
|
+
# # ================ dong fang daily ================
|
37
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
38
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
39
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
40
|
+
#
|
41
|
+
# # =============== engadget china ==================
|
42
|
+
Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')
|
data/rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Load the Source model. This rakefile sits at the gem root and the model is
# lib/source.rb; the previous '../../source' resolved outside the gem.
require File.expand_path('../lib/source', __FILE__)

desc "Initial crawler engine database"
# Currently only prints a confirmation; the seeding logic below is disabled.
task :crawler_setup do
  #puts Source.count
  # if File.exists?(Dir.pwd + "/seeds.rb")
  #   if Rake.application.lookup('db:seed')
  #     Rake::Task['db:seed -l'+Dir.pwd + "/seeds.rb"].invoke
  #   end
  # end
  puts 'finished crawler engine db setup.'
end
|
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawler-engine
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- tim.tang
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-11-23 00:00:00.000000000 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rails
|
17
|
+
requirement: &90417220 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *90417220
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: nokogiri
|
28
|
+
requirement: &90416950 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.5.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *90416950
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: simple-rss
|
39
|
+
requirement: &90416700 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 1.2.3
|
45
|
+
type: :runtime
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: *90416700
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rest-open-uri
|
50
|
+
requirement: &90416470 !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.0.0
|
56
|
+
type: :runtime
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *90416470
|
59
|
+
description: Crawler Engine provides function of crawl all news from the customized
|
60
|
+
website
|
61
|
+
email: tang.jilong@139.com
|
62
|
+
executables: []
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- Gemfile
|
67
|
+
- README.md
|
68
|
+
- crawler_engine.gemsepc
|
69
|
+
- generators/crawler_engine_migration/crawler_engine_migration_generator.rb
|
70
|
+
- generators/crawler_engine_migration/templates/migration.rb
|
71
|
+
- lib/crawler_engine.rb
|
72
|
+
- lib/crawler_engine.rb~
|
73
|
+
- lib/crawler_parser.rb
|
74
|
+
- lib/database.yml
|
75
|
+
- lib/db_adaptor.rb
|
76
|
+
- lib/generators/crawler_engine_generator.rb
|
77
|
+
- lib/generators/templates/migration.rb
|
78
|
+
- lib/generators/templates/post.rb
|
79
|
+
- lib/generators/templates/source.rb
|
80
|
+
- lib/post.rb
|
81
|
+
- lib/source.rb
|
82
|
+
- lib/source.rb~
|
83
|
+
- rakefile
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://www.everyday-cn.com
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.9.2
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project:
|
105
|
+
rubygems_version: 1.6.2
|
106
|
+
signing_key:
|
107
|
+
specification_version: 3
|
108
|
+
summary: Gem for crawler news post
|
109
|
+
test_files: []
|