crawler-engine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/README.md +25 -0
- data/crawler_engine.gemsepc +25 -0
- data/generators/crawler_engine_migration/crawler_engine_migration_generator.rb +18 -0
- data/generators/crawler_engine_migration/templates/migration.rb +27 -0
- data/lib/crawler_engine.rb +10 -0
- data/lib/crawler_engine.rb~ +47 -0
- data/lib/crawler_parser.rb +72 -0
- data/lib/database.yml +8 -0
- data/lib/db_adaptor.rb +7 -0
- data/lib/generators/crawler_engine_generator.rb +24 -0
- data/lib/generators/templates/migration.rb +27 -0
- data/lib/generators/templates/post.rb +3 -0
- data/lib/generators/templates/source.rb +4 -0
- data/lib/post.rb +6 -0
- data/lib/source.rb +7 -0
- data/lib/source.rb~ +42 -0
- data/rakefile +11 -0
- metadata +109 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# Crawler Engine
|
2
|
+
|
3
|
+
|
4
|
+
Crawler engine is the "Swiss Army bulldozer" plugin for
|
5
|
+
Ruby on Rails. It allows you to crawl posts from the website.
|
6
|
+
|
7
|
+
## Crawler Engine Features
|
8
|
+
|
9
|
+
Crawler Engine has many advanced features, including custom generators and
|
10
|
+
customized crawl links.
|
11
|
+
|
12
|
+
Crawler Engine is compatible with Active Record **3.0** and **3.1**.
|
13
|
+
|
14
|
+
## Rails Quickstart
|
15
|
+
|
16
|
+
gem install crawler-engine
|
17
|
+
|
18
|
+
rails g crawler_engine
|
19
|
+
|
20
|
+
rake db:migrate
|
21
|
+
|
22
|
+
# in your Gemfile
|
23
|
+
gem "crawler-engine"
|
24
|
+
|
25
|
+
# TODO:
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "crawler_engine"
|
4
|
+
|
5
|
+
# Gem specification for crawler-engine: package metadata, Ruby/RubyGems
# requirements, runtime dependencies and the list of packaged files.
Gem::Specification.new do |s|
  s.name        = 'crawler-engine'
  s.version     = '0.1.0'
  s.platform    = "ruby"
  s.author      = "tim.tang"
  s.email       = "tang.jilong@139.com"
  s.homepage    = 'http://www.everyday-cn.com'
  s.summary     = "Gem for crawler news post"
  s.description = "Crawler Engine provides function of crawl all news from the customized website"

  # NOTE: `s.has_rdoc = true` was dropped; the setter is deprecated in RubyGems.
  s.required_ruby_version     = ">= 1.9.2"
  s.required_rubygems_version = ">= 0"

  s.add_runtime_dependency 'rails'
  s.add_dependency("nokogiri",">=1.5.0")
  s.add_dependency("simple-rss",">=1.2.3")
  s.add_dependency("rest-open-uri",">=1.0.0")

  # Package every git-tracked file except editor backups ("*~"), which would
  # otherwise be shipped inside the gem.
  s.files = `git ls-files`.split("\n").reject { |path| path.end_with?('~') }
  s.require_paths = ['lib']
end
|
24
|
+
|
25
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Generator built on Rails::Generator::Base that copies the bundled
# 'migration.rb' template into the application's db/migrate directory.
class CrawlerEngineMigrationGenerator < Rails::Generator::Base
  #source_root File.expand_path('../templates', __FILE__)

  # Records the single action of this generator: render the migration
  # template under the file name 'crawler_engine_migration'.
  def manifest
    record do |m|
      m.migration_template 'migration.rb', 'db/migrate',
                           :migration_file_name => 'crawler_engine_migration'
    end
  end

  # Usage text for this generator.
  def banner
    "Usage: Generator crawler engine migration"
  end
end
|
17
|
+
|
18
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Migration that creates the two tables used by the crawler:
# +sources+ (feeds to crawl) and +posts+ (articles extracted from them).
class CreateCrawlerEngine < ActiveRecord::Migration
  # Creates both tables. :force => true drops an existing table of the same
  # name first.
  def self.up
    create_table :sources, :force => true do |t|
      t.string   :site_name
      t.string   :link       # feed URL fetched by the parser
      t.string   :filter     # XPath applied to each article page
      t.string   :category
      t.datetime :crawled_at # set by the parser after the source has been crawled
    end

    create_table :posts, :force => true do |t|
      t.string   :site_name
      t.string   :title
      t.string   :source     # link of the RSS item the post came from
      # Was `t.content`, which is not a column type; article bodies need a
      # text column (matches lib/generators/templates/migration.rb).
      t.text     :content
      t.string   :category
      t.integer  :speed
      t.integer  :support_num
      t.datetime :published_at
    end
  end

  # Drops both tables again.
  def self.down
    drop_table :sources
    drop_table :posts
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
# encoding:utf-8
|
2
|
+
require File.expand_path('../crawler_parser',__FILE__)
|
3
|
+
require File.expand_path('../source',__FILE__)
|
4
|
+
|
5
|
+
# Top-level class of the gem. Loading it builds one CrawlerParser; the
# actual crawl calls below are currently disabled.
class CrawlerEngine
  # Parser instance kept in a class-level instance variable.
  @cp = CrawlerParser.new()

  # Disabled crawl bootstrap, kept for reference:
  #@sources = Source.find(:all)
  #@sources = Source.where(:id=>1)
  #@cp.parse_rss(@sources)
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../crawler_parser',__FILE__)
|
3
|
+
# Earlier draft of CrawlerEngine: builds a parser and a feed-URL => XPath
# table; the call that would crawl the table is disabled.
class CrawlerEngine
  # Parser instance kept in a class-level instance variable.
  @cp = CrawlerParser.new()

  # financial time chinese
  links = { "http://www.ftchinese.com/rss/feed" => '//div[@class="content"]' }

  #@cp.parse_rss(links)
end
|
12
|
+
|
13
|
+
# ================= SINA.com =======================
|
14
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
15
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
16
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
17
|
+
|
18
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
19
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
20
|
+
|
21
|
+
Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
22
|
+
|
23
|
+
# # ================= 163.com =======================
|
24
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
25
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
26
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
27
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
28
|
+
|
29
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
30
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
31
|
+
|
32
|
+
# ================= ifeng.com =====================
|
33
|
+
# The three ifeng.com feeds share one content filter. The society feed used the
# id "articial_real" while the other two use "artical_real"; they are unified here.
# NOTE(review): confirm the id against the live ifeng.com article markup.
%w[world society mainland].each do |feed|
  Source.create(:site_name=>'凤凰网', :link=>"http://news.ifeng.com/rss/#{feed}.xml",:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
end
|
36
|
+
|
37
|
+
# # ================ financial times china ==========
|
38
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
39
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
40
|
+
|
41
|
+
# # ================ dong fang daily ================
|
42
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
43
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
44
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
45
|
+
#
|
46
|
+
# # =============== engadget china ==================
|
47
|
+
Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../post',__FILE__)
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'rest-open-uri'
|
5
|
+
require 'simple-rss'
|
6
|
+
require 'iconv'
|
7
|
+
require 'logger'
|
8
|
+
|
9
|
+
# Crawls RSS feeds and stores the article bodies they link to as Post records.
class CrawlerParser
  #logger = Logger.new('/tmp/crawlerEngine.log', 'daily')
  #logger.level = Logger::INFO

  # Width of the separator line printed before each RSS item.
  FS_LEN = 80

  # Crawls every given source: fetches its feed, downloads each item's page,
  # extracts the nodes matching the source's XPath filter and creates one Post
  # per matching node. A source whose feed was processed gets its
  # +crawled_at+ attribute set to the current time.
  #
  # Failures are isolated: a feed that cannot be fetched or parsed skips only
  # that source, and a page that cannot be fetched skips only that item.
  # The passed collection is not modified; duplicates are ignored.
  #
  # @param sources [#uniq, nil] records responding to +link+, +filter+,
  #   +site_name+, +category+ and +update_attribute+
  # @return [nil] when +sources+ is nil or empty
  def parse_rss(sources)
    return if sources.nil? || sources.empty?

    #logger.info("Links to crawl >>"+links.to_s)
    puts "Links to crawl >>" + sources.to_s

    # Iterate over a de-duplicated copy instead of calling uniq! on the
    # caller's collection.
    sources.uniq.each do |source|
      rss = fetch_feed(source)
      # Without a feed there is nothing to iterate; leave crawled_at untouched
      # so the failed source is not recorded as crawled.
      next unless rss

      #TODO:
      #puts rss.feed_tags.title
      #puts rss.feed_tags.description

      rss.items.each do |item|
        print_rss_item(item)
        doc = fetch_document(item)
        # One unreachable article must not abort the rest of the crawl.
        next unless doc

        create_posts(source, item, doc)
      end

      #update crawler exec timestamp
      source.update_attribute("crawled_at", Time.now)
    end
  end

  private

  # Downloads and parses the RSS feed of +source+.
  # Returns the parsed feed, or nil (after printing the error) on failure.
  def fetch_feed(source)
    SimpleRSS.parse(open(source.link))
  rescue StandardError => ex
    #logger.error(ex)
    #logger.info("SimpleRSS got unexpected error, rss exit")
    puts ex
    nil
  end

  # Downloads the page an RSS item links to and parses it as HTML.
  # Returns the parsed document, or nil (after printing the error) on failure.
  def fetch_document(item)
    Nokogiri::HTML(open(item.link.to_s))
  rescue StandardError => ex
    #logger.error(ex)
    #logger.info("Nokogiri got unexpected error")
    puts ex
    nil
  end

  # Creates one Post for every node of +doc+ matched by the source's filter.
  def create_posts(source, item, doc)
    doc.xpath(source.filter).each do |content|
      Post.create(
        :title        => utf8(item.title),
        :source       => utf8(item.link),
        :content      => utf8(content),
        :published_at => item.pubDate,
        :site_name    => source.site_name,
        :category     => source.category)
    end
  end

  # Returns a UTF-8 tagged copy of value.to_s. The copy avoids mutating the
  # original string and avoids errors when to_s returns a frozen string.
  def utf8(value)
    value.to_s.dup.force_encoding('UTF-8')
  end

  # Prints the fields of one RSS item to stdout, preceded by a separator line.
  def print_rss_item(item)
    puts "-" * FS_LEN
    puts "title:" + item.title.to_s
    puts "author:" + item.author.to_s
    puts "description:" + item.description.to_s
    #puts Iconv.iconv("UTF-8//IGNORE","GB2312//IGNORE",item.description.to_s )
    puts "link:" + item.link.to_s
    puts "pubDate:" + item.pubDate.to_s
    puts "guid:" + item.guid.to_s
    puts "category:" + item.category.to_s
  end
end
|
data/lib/database.yml
ADDED
data/lib/db_adaptor.rb
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'active_record'
|
4
|
+
require 'logger'
|
5
|
+
# Read the connection settings from the database.yml next to this file.
# YAML.load_file opens and closes the file itself, so no handle stays open
# (the previous File.open without a block was never closed).
dbconfig = YAML.load_file(File.join(File.dirname(__FILE__), 'database.yml'))
# Write ActiveRecord's log output to a file under /tmp.
ActiveRecord::Base.logger = Logger.new('/tmp/crawler_engine.log')
# Connect ActiveRecord using the loaded settings.
ActiveRecord::Base.establish_connection(dbconfig)
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'rails/generators'
|
2
|
+
require 'rails/generators/migration'
|
3
|
+
require 'rails/generators/active_record'
|
4
|
+
|
5
|
+
# Generator that installs the crawler engine into an application: it copies
# the migration template into db/migrate and the Post and Source model
# templates into app/models.
class CrawlerEngineGenerator < Rails::Generators::Base
  include Rails::Generators::Migration
  extend ActiveRecord::Generators::Migration

  desc "Generates migration for crawler engine model"

  class << self
    # Directory that holds the templates used below.
    def source_root
      File.expand_path('../templates', __FILE__)
    end
  end

  # Renders templates/migration.rb as the create_crawler_engine migration.
  def create_migration_file
    migration_template('migration.rb', 'db/migrate/create_crawler_engine')
  end

  # Copies the two model templates into app/models.
  def crate_model_file
    %w[post source].each do |model|
      template("#{model}.rb", "app/models/#{model}.rb")
    end
  end
end
|
23
|
+
|
24
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Migration template for the crawler engine schema: a +sources+ table for the
# feeds to crawl and a +posts+ table for the crawled articles.
class CreateCrawlerEngine < ActiveRecord::Migration
  # Builds both tables; :force => true replaces a table that already exists.
  def self.up
    create_table :sources, :force => true do |t|
      t.string   :site_name, :link, :filter, :category
      t.datetime :crawled_at # When to run. Could be Time.now for immediately, or sometime in the future.
    end

    create_table :posts, :force => true do |t|
      t.string   :site_name, :title, :source
      t.text     :content
      t.string   :category
      t.integer  :speed, :support_num
      t.datetime :published_at
    end
  end

  # Removes both tables, sources first.
  def self.down
    [:sources, :posts].each { |table| drop_table table }
  end
end
|
data/lib/post.rb
ADDED
data/lib/source.rb
ADDED
data/lib/source.rb~
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
require File.expand_path('../db_adaptor',__FILE__)
|
3
|
+
|
4
|
+
# ActiveRecord model for a crawl source, stored in the "sources" table.
class Source < ActiveRecord::Base
  # table_name= replaces set_table_name, which newer ActiveRecord releases
  # deprecate and later remove.
  self.table_name = "sources"
end
|
7
|
+
|
8
|
+
# ================= SINA.com =======================
|
9
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/world/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
10
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/china/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
11
|
+
Source.create(:site_name=>'新闻中心-新浪', :link=>'http://rss.sina.com.cn/news/society/focus15.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
12
|
+
|
13
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/finance/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
14
|
+
Source.create(:site_name=>'财经频道-新浪', :link=>'http://rss.sina.com.cn/roll/stock/hot_roll.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
15
|
+
|
16
|
+
Source.create(:site_name=>'科技频道-新浪', :link=>'http://rss.sina.com.cn/tech/rollnews.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="blkContainerSblkCon"]')
|
17
|
+
|
18
|
+
# # ================= 163.com =======================
|
19
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gn.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
20
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_gj.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
21
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_war.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
22
|
+
Source.create(:site_name=>'网易新闻', :link=>'http://news.163.com/special/00011K6L/rss_sh.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
23
|
+
|
24
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/headlines.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
25
|
+
Source.create(:site_name=>'网易科技', :link=>'http://tech.163.com/special/000944OI/kejiyejie.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@id="endText"]')
|
26
|
+
|
27
|
+
# ================= ifeng.com =====================
|
28
|
+
# The three ifeng.com feeds share one content filter. The society feed used the
# id "articial_real" while the other two use "artical_real"; they are unified here.
# NOTE(review): confirm the id against the live ifeng.com article markup.
%w[world society mainland].each do |feed|
  Source.create(:site_name=>'凤凰网', :link=>"http://news.ifeng.com/rss/#{feed}.xml",:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="artical_real"]')
end
|
31
|
+
|
32
|
+
# # ================ financial times china ==========
|
33
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/feed',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
34
|
+
Source.create(:site_name=>'FT中文网', :link=>'http://www.ftchinese.com/rss/news',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@class="content"]')
|
35
|
+
|
36
|
+
# # ================ dong fang daily ================
|
37
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/21.xml',:category=>'时政&社会',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
38
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/113.xml',:category=>'财经&股票',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
39
|
+
Source.create(:site_name=>'东方早报', :link=>'http://www.dfdaily.com/rss/232.xml',:category=>'生活&时尚',:crawled_at=>Time.now, :filter=>'//div[@id="newscontent"]')
|
40
|
+
#
|
41
|
+
# # =============== engadget china ==================
|
42
|
+
Source.create(:site_name=>'瘾科技', :link=>'http://cn.engadget.com/rss.xml',:category=>'科技&电子',:crawled_at=>Time.now, :filter=>'//div[@class="postbody"]')
|
data/rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Load the Source model. The rakefile sits in the gem root and the model in
# lib/source.rb, so the path is resolved relative to this file's directory
# ('../../source' pointed outside the gem).
require File.expand_path('../lib/source', __FILE__)

desc "Initial crawler engine database"
# Currently only prints a confirmation; the seeding steps below are disabled.
task :crawler_setup do
  #puts Source.count
  # if File.exists?(Dir.pwd + "/seeds.rb")
  #   if Rake.application.lookup('db:seed')
  #     Rake::Task['db:seed -l'+Dir.pwd + "/seeds.rb"].invoke
  #   end
  # end
  puts 'finished crawler engine db setup.'
end
|
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawler-engine
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- tim.tang
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-11-23 00:00:00.000000000 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rails
|
17
|
+
requirement: &90417220 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *90417220
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: nokogiri
|
28
|
+
requirement: &90416950 !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.5.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: *90416950
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: simple-rss
|
39
|
+
requirement: &90416700 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 1.2.3
|
45
|
+
type: :runtime
|
46
|
+
prerelease: false
|
47
|
+
version_requirements: *90416700
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rest-open-uri
|
50
|
+
requirement: &90416470 !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 1.0.0
|
56
|
+
type: :runtime
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: *90416470
|
59
|
+
description: Crawler Engine provides function of crawl all news from the customized
|
60
|
+
website
|
61
|
+
email: tang.jilong@139.com
|
62
|
+
executables: []
|
63
|
+
extensions: []
|
64
|
+
extra_rdoc_files: []
|
65
|
+
files:
|
66
|
+
- Gemfile
|
67
|
+
- README.md
|
68
|
+
- crawler_engine.gemsepc
|
69
|
+
- generators/crawler_engine_migration/crawler_engine_migration_generator.rb
|
70
|
+
- generators/crawler_engine_migration/templates/migration.rb
|
71
|
+
- lib/crawler_engine.rb
|
72
|
+
- lib/crawler_engine.rb~
|
73
|
+
- lib/crawler_parser.rb
|
74
|
+
- lib/database.yml
|
75
|
+
- lib/db_adaptor.rb
|
76
|
+
- lib/generators/crawler_engine_generator.rb
|
77
|
+
- lib/generators/templates/migration.rb
|
78
|
+
- lib/generators/templates/post.rb
|
79
|
+
- lib/generators/templates/source.rb
|
80
|
+
- lib/post.rb
|
81
|
+
- lib/source.rb
|
82
|
+
- lib/source.rb~
|
83
|
+
- rakefile
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://www.everyday-cn.com
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 1.9.2
|
97
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
98
|
+
none: false
|
99
|
+
requirements:
|
100
|
+
- - ! '>='
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
requirements: []
|
104
|
+
rubyforge_project:
|
105
|
+
rubygems_version: 1.6.2
|
106
|
+
signing_key:
|
107
|
+
specification_version: 3
|
108
|
+
summary: Gem for crawler news post
|
109
|
+
test_files: []
|