loyal_spider 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
+ Copyright 2013 YOURNAME
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,4 @@
1
+ # LoyalSpider
2
+
3
+ 这是一组针对网站内容的蜘蛛爬虫类，用于爬取网页内容等。
4
+
@@ -0,0 +1,21 @@
1
# Rakefile for the loyal_spider gem: sets up RDoc generation and, when
# Bundler could be loaded, the standard gem build/install/release tasks.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'LoyalSpider'
  rdoc.options << '--line-numbers'
  # The gem's file list contains README.md, not README.rdoc.
  rdoc.rdoc_files.include('README.md')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Register the gem tasks only when Bundler actually loaded; otherwise the
# constant lookup would raise NameError right after the hint printed above.
Bundler::GemHelper.install_tasks if defined?(Bundler)
21
+
@@ -0,0 +1,13 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'rest-client'
3
+ require 'sanitize'
4
+ require "rails_config"
5
+ require "nokogiri"
6
+
7
+ require "loyal_spider/config"
8
+ require "loyal_spider/ables"
9
+ require "loyal_spider/clients"
10
+ require "loyal_spider/utils"
11
+
12
+ module LoyalSpider
13
+ end
@@ -0,0 +1,7 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/ables/fetch_able"
3
+ require "#{File.dirname(__FILE__)}/ables/entity_able"
4
+ require "#{File.dirname(__FILE__)}/ables/entity_lister_able"
5
+ require "#{File.dirname(__FILE__)}/ables/fetch_options"
6
+ require "#{File.dirname(__FILE__)}/ables/fetch_result"
7
+
@@ -0,0 +1,86 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Mixin for scraped content entities.
  #
  # Including it gives the host class attribute accessors, a hash-based
  # constructor, helpers derived from +tags+, +authors+ and +content+, and a
  # presence validation for +content+ and +url+.
  module EntityAble
    def self.included base
      base.class_eval do
        attr_accessor :title           # title
        attr_accessor :url             # URL of the entity
        attr_accessor :content         # body text (parsed as HTML by #images)
        attr_accessor :up_rating       # number of up-votes
        attr_accessor :down_rating     # number of down-votes
        attr_accessor :comments_count  # number of comments
        attr_accessor :publish_time    # publish time
        attr_accessor :errors          # validation errors: { field => [messages] }

        # NOTE: tags/authors get no attr_accessor here. Accessors defined
        # directly on the host class would take precedence over the
        # defaulting readers in InstanceMethods and make them return nil.
        include InstanceMethods
      end
    end

    module InstanceMethods
      # Assigns every key of +attrs+ through the matching "key=" writer and
      # starts with an empty error hash.
      def initialize attrs={}
        attrs.each do |key, value|
          self.send(:"#{key}=", value)
        end

        self.errors = {}
      end

      # Tag hashes (each carrying at least :text); defaults to [].
      def tags
        @tags ||= []
      end

      # Replaces the tags and drops the cached #tags_array.
      def tags= value
        @tags_array = nil
        @tags = value
      end

      # The :text of every tag.
      def tags_array
        @tags_array ||= self.tags.map { |_tag| _tag[:text] }
      end

      # Every <img> found in +content+, as hashes with :src, :title and :alt.
      # The result is cached after the first call.
      def images
        @images ||= Nokogiri::HTML.parse(self.content).css('img').map do |img_doc|
          {
            :src   => img_doc.attr('src'),
            :title => img_doc.attr('title'),
            :alt   => img_doc.attr('alt')
          }
        end
      end

      def images?
        self.images.any?
      end

      # Author hashes (each carrying at least :text); defaults to [].
      def authors
        @authors ||= []
      end

      # Replaces the authors and drops the cached #authors_array.
      def authors= value
        @authors_array = nil
        @authors = value
      end

      # The :text of every author.
      def authors_array
        @authors_array ||= self.authors.map { |_author| _author[:text] }
      end

      # Re-runs the validation from a clean slate and reports whether it
      # produced no errors. Clearing first keeps repeated calls from piling
      # up duplicate messages and lets a fixed entity become valid again.
      def valid?
        self.errors = {}
        self.valid!
        self.errors.empty?
      end

      # Records an error for +content+ and for +url+ when blank.
      def valid!
        unless self.content.to_s.strip.size > 0
          add_error :content, '不能为空'
        end

        unless self.url.to_s.strip.size > 0
          add_error :url, '不能为空'
        end
      end

      private

      def add_error field, message
        # errors may still be nil when a host class bypasses #initialize.
        self.errors ||= {}
        (self.errors[field] ||= []) << message
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
+ # -*- encoding : utf-8 -*-
2
+ ## -*- encoding : utf-8 -*-
3
module LoyalSpider
  # Mixin for spiders that fetch a paged listing and collect the entities
  # found on each page. It pulls in FetchAble and builds the page URL from
  # the configured +url_format_first+ / +url_format+ templates.
  module EntityListerAble
    def self.included base
      base.class_eval do
        attr_writer :current_page

        include ::LoyalSpider::FetchAble
        include InstanceMethods
        extend ClassMethods
      end
    end

    module ClassMethods
      # Stores the lister configuration. Recognised keys used in this module:
      # - :url_format_first    sprintf template for page 1
      # - :url_format          sprintf template for later pages
      # - :url_format_options  values interpolated into the templates
      # - :entity_clazz        class instantiated by #new_entity
      # - :fetch_options       removed from the stored options and passed on
      #                        as the default fetch options
      def config_loyal_spider_entity_lister options={}
        @entity_lister_options ||= options

        self.config_loyal_spider_default_fetch_options(
          self.entity_lister_options.delete(:fetch_options) || {}
        )
      end

      def entity_lister_options
        @entity_lister_options ||= {}
      end

      # Fetches +page+ with a fresh lister instance.
      def paged_fetch page, options={}, &block
        self.new.paged_fetch page, options, &block
      end
    end

    module InstanceMethods
      # Fetches +page+; options[:url_format_options] overrides the
      # configured template values for this instance.
      def paged_fetch page, options={}, &block
        self.url_format_options.merge!(
          options[:url_format_options] || {}
        )

        self.current_page = page
        self.fetch options, &block
      end

      def url_format
        @url_format ||= self.class.entity_lister_options[:url_format]
      end

      def url_format_first
        @url_format_first ||= self.class.entity_lister_options[:url_format_first]
      end

      # Per-instance copy of the configured template values. The copy keeps
      # the merge! in #paged_fetch from leaking overrides into the class
      # configuration shared by all instances.
      def url_format_options
        @url_format_options ||= (self.class.entity_lister_options[:url_format_options] || {}).dup
      end

      # Starts every fetch with an empty entity list.
      def _before_fetch options={}
        @entities = []
      end

      # Hands the collected entities over to the result.
      def _after_fetch_success result
        result.entities = self.entities
      end

      def entities
        @entities ||= []
      end

      def entity_clazz
        self.class.entity_lister_options[:entity_clazz]
      end

      # Builds an entity of the configured class; nil when none is configured.
      def new_entity attrs={}
        if self.entity_clazz
          self.entity_clazz.new(attrs)
        end
      end

      def add_entity entity
        self.entities << entity
      end

      def current_page
        @current_page ||= 1
      end

      def first_page?
        self.current_page < 2
      end

      # URL of the current page. It is recomputed on every call so that a
      # reused instance follows changes to the page or the template values.
      def fetch_url
        _url_format = self.first_page? ? self.url_format_first : self.url_format

        sprintf(
          _url_format,
          (self.url_format_options || {}).merge(:page => self.current_page)
        )
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Mixin that lets a class fetch a URL through RestClient and wraps the
  # outcome in a FetchResult.
  #
  # The URL comes from +fetch_url+ when the host responds to it, otherwise
  # from +url+. A host may define any of these optional hooks, public or
  # private: +_before_fetch+, +before_fetch+, +after_fetch_success+,
  # +_after_fetch_success+, +after_fetch_fail+, +_after_fetch_fail+.
  module FetchAble
    def self.included base
      base.class_eval do
        attr_accessor :fetch_options

        include InstanceMethods
        extend ClassMethods
      end
    end

    module ClassMethods
      # Sets the class-wide default fetch options. The first stored value
      # wins; later calls do not replace it.
      def config_loyal_spider_default_fetch_options options={}
        @default_fetch_options ||= options
      end

      def default_fetch_options
        @default_fetch_options ||= {}
      end

      # Fetches with a fresh instance.
      def fetch options={}, &block
        self.new.fetch options, &block
      end
    end

    module InstanceMethods
      # base_url of the options used by the latest fetch.
      def base_url
        self.fetch_options.base_url
      end

      # Runs the before hooks, performs the request and runs the success or
      # failure hooks. Returns the FetchResult; the optional block receives
      # it as well.
      def fetch options={}, &block
        _run_fetch_hook :_before_fetch, options
        _run_fetch_hook :before_fetch, options

        result = _perform_fetch options, &block

        if result.success?
          _run_fetch_hook :after_fetch_success, result
          _run_fetch_hook :_after_fetch_success, result
        else
          _run_fetch_hook :after_fetch_fail, result
          _run_fetch_hook :_after_fetch_fail, result
        end

        result
      end

      private

      # Calls the hook when the host defines it. send is used because the
      # lookup deliberately includes private methods.
      def _run_fetch_hook name, *args
        send(name, *args) if respond_to?(name, true)
      end

      # Maps an HTTP status code onto the coarse status symbols used by
      # FetchResult. Codes outside the listed ranges count as :server_error.
      def _response_status_for code
        case code
        when 200..207 then :success
        when 300..307 then :redirect
        when 400..450 then :request_error
        else :server_error
        end
      end

      # Attributes of the FetchResult built for a raised exception.
      def _error_result_attrs exception, fetch_url
        attrs = {
          :response        => nil,
          :request         => nil,
          :response_result => nil,
          :fetch_options   => self.fetch_options,
          :url             => fetch_url,
          :exception       => exception
        }

        case exception
        when ::SocketError
          attrs.merge!(:response_status => :socket_error, :response_code => 502)
        else
          attrs.merge!(:response_status => :request_error, :response_code => 400)
        end

        attrs
      end

      def _perform_fetch options={}, &block
        _fetch_url = respond_to?(:fetch_url, true) ? fetch_url : url

        @fetch_options = ::LoyalSpider::FetchOptions.new(
          ::LoyalSpider::HashUtil.deep_merge(self.class.default_fetch_options, options).merge(
            :url => _fetch_url
          )
        )

        begin
          ::RestClient::Request.execute @fetch_options.net_options do |response, request, response_result, &_block|
            response_code = response.code

            _result = ::LoyalSpider::FetchResult.new(
              :response_status => _response_status_for(response_code),
              :response_code   => response_code,
              # Re-tag the body with the configured source encoding, then
              # transcode it to UTF-8.
              :response        => response.force_encoding(self.fetch_options.encoding_type).encode!('UTF-8'),
              :request         => request,
              :response_result => response_result,
              :fetch_options   => self.fetch_options,
              :url             => _fetch_url
            )

            block.call _result, &_block if block

            _result
          end
        rescue StandardError => exception
          # StandardError instead of Exception, so interrupts and exits are
          # not turned into a "request_error" result.
          _result = ::LoyalSpider::FetchResult.new(
            _error_result_attrs(exception, _fetch_url)
          )

          # No RestClient block is in scope here, so only the result is passed.
          block.call _result if block

          _result
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Normalised option set for a single fetch: fills in defaults and exposes
  # the hash handed to the HTTP client through #net_options.
  class FetchOptions
    attr_accessor :method         # HTTP method, defaults to :get
    attr_accessor :open_timeout   # connection-open timeout
    attr_accessor :timeout        # request timeout in seconds
    attr_accessor :encoding_type  # encoding name, defaults to 'UTF-8'
    attr_accessor :headers        # request headers
    attr_accessor :url            # URL to fetch
    attr_accessor :base_url       # base URL, defaults to ''

    # attrs may contain :url, :base_url, :method, :timeout, :open_timeout,
    # :encoding_type and :headers. Given headers are merged over the
    # built-in browser-like defaults.
    def initialize attrs={}
      @url           = attrs[:url] || ''
      @base_url      = attrs[:base_url] || ''
      @method        = attrs[:method] || :get
      @timeout       = attrs[:timeout] || 60 # seconds
      # :open_timeout matches the attribute and the key in #net_options.
      # The misspelled :open_time is still honoured for existing callers.
      @open_timeout  = attrs[:open_timeout] || attrs[:open_time] || @timeout
      @encoding_type = attrs[:encoding_type] || 'UTF-8'
      @headers       = {
        :accept_charset  => 'UTF-8,*;q=0.5',
        :accept_encoding => 'gzip,deflate,sdch',
        :user_agent      => 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
      }.merge(attrs[:headers] || {})
    end

    # The options as a plain hash.
    def net_options
      {
        :url           => self.url,
        :method        => self.method,
        :timeout       => self.timeout,
        :open_timeout  => self.open_timeout,
        :encoding_type => self.encoding_type,
        :headers       => self.headers
      }
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Outcome of one fetch: status, raw response, request details, the
  # options used, any raised exception and the entities collected.
  class FetchResult
    attr_accessor :response_status, :response_code, :response,
                  :request, :response_result, :fetch_options,
                  :url, :exception, :entities

    # Assigns each key of +attrs+ through its "key=" writer.
    def initialize(attrs = {})
      attrs.each { |name, value| send("#{name}=", value) }
    end

    # True only when the status is :success.
    def success?
      response_status == :success
    end

    # Opposite of #success?.
    def fail?
      success? ? false : true
    end

    # The response parsed as an HTML document (cached); nil without a response.
    def response_html_doc
      return nil unless response

      @response_html_doc ||= Nokogiri::HTML.parse(response)
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/clients/kuaile_mahua/article_entity"
3
+ require "#{File.dirname(__FILE__)}/clients/kuaile_mahua/article_entity_lister"
4
+
5
+ require "#{File.dirname(__FILE__)}/clients/haha365/article_entity"
6
+ require "#{File.dirname(__FILE__)}/clients/haha365/article_entity_lister"
7
+
8
+ require "#{File.dirname(__FILE__)}/clients/lengxiaohua/article_entity"
9
+ require "#{File.dirname(__FILE__)}/clients/lengxiaohua/article_entity_lister"
10
+
11
+ require "#{File.dirname(__FILE__)}/clients/xiaohuadi/article_entity"
12
+ require "#{File.dirname(__FILE__)}/clients/xiaohuadi/article_entity_lister"
13
+
@@ -0,0 +1,11 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Haha365
      # Article entity for the Haha365 client. It adds nothing of its own;
      # all attributes and behaviour come from EntityAble.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Haha365
      # Lists the articles found on a haha365.com category index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.haha365.com/%{category}/',
          :url_format    => 'http://www.haha365.com/%{category}/index_%{page}.htm',
          :entity_clazz  => ::LoyalSpider::Clients::Haha365::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.haha365.com'
          }

        # TODO:
        def before_fetch options={}
          puts "before_fetch: #{options}"
        end

        # Builds one entity per listed article. Title, content and category
        # nodes are matched by position; an article missing its title or
        # content node is skipped. Only valid entities are collected.
        def after_fetch_success result
          _basic_doc = result.response_html_doc.css('html #main .content .left .r_c')

          # Query each node list once instead of once per article.
          _title_docs    = _basic_doc.css('.cat_llb h3 a')
          _content_docs  = _basic_doc.css('.cat_llb #endtext')
          _category_docs = _basic_doc.css('.cat_llb .fl a')

          _category_docs.each_with_index do |_category_doc, _index|
            _title_doc   = _title_docs[_index]
            _content_doc = _content_docs[_index]

            next if _title_doc.nil? || _content_doc.nil?

            # _content_doc is known to be present here, so no ActiveSupport
            # `try` is needed (the gem does not require it).
            _content = _content_doc.inner_html.to_s.split("<br>\r\n").map do |_cnt|
              "<p>#{(Sanitize.clean _cnt).to_s.strip}</p>"
            end.join('')

            _entity = self.new_entity(
              :content        => _content,
              :url            => "#{self.base_url}#{_title_doc.attr('href')}",
              :title          => "#{_title_doc.text}",
              :tags           => [
                {
                  :text => _category_doc.text,
                  :href => "#{self.base_url}#{_category_doc.attr('href')}"
                }
              ],
              :authors        => [],
              # The values below are stored as -1 for every article.
              :up_rating      => -1,
              :down_rating    => -1,
              :comments_count => -1
            )

            self.add_entity _entity if _entity.valid?
          end
        end

        def after_fetch_fail result
          puts "after_fetch fail: #{result}"
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module KuaileMahua
      # Article entity for the KuaileMahua client (kl688.com). It can fetch
      # itself and then fills its attributes from the fetched page.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
        include ::LoyalSpider::FetchAble

        self.config_loyal_spider_default_fetch_options(
          :encoding_type => 'GBK',
          :base_url      => 'http://www.kl688.com'
        )

        # Fills title, content, tags, authors and the counters from the
        # fetched page.
        def after_fetch_success result
          html_doc   = result.response_html_doc
          entity_doc = html_doc.css('.main .main-left .xiaohua .xiaohua-data')

          self.title   = _first_text(entity_doc.css('h1')).strip
          self.content = entity_doc.css('.content').inner_html
          self.tags    = entity_doc.css('.link .tags h4 a').map do |_tag_doc|
            {
              :text => _tag_doc.text.to_s.strip,
              :href => "#{self.base_url}#{_tag_doc.attr('href').to_s.strip}"
            }
          end

          # EntityAble defines no tags_text writer (tag texts are derived by
          # #tags_array), so assign only when the host actually offers one
          # instead of raising NoMethodError.
          if self.respond_to?(:tags_text=)
            self.tags_text = self.tags.map do |_tag|
              _tag[:text]
            end
          end

          self.authors = entity_doc.css('.link .tags .pusher a').map do |_author_doc|
            {
              :text => _author_doc.text.to_s.strip,
              :href => "#{_author_doc.attr('href').to_s.strip}"
            }
          end

          self.up_rating      = entity_doc.css('.tools li a.good').text.to_i
          self.down_rating    = entity_doc.css('.tools li a.bad').text.to_i
          self.comments_count = _first_text(entity_doc.css('.tools li s')).to_i
        end

        private

        # Text of the first node in +nodes+, or '' when there is none.
        def _first_text nodes
          _node = nodes.first
          _node ? _node.text.to_s : ''
        end
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module KuaileMahua
      # Lists the articles found on a kl688.com index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.kl688.com/',
          :url_format    => 'http://www.kl688.com/newjokes/index_%{page}.htm',
          :entity_clazz  => ::LoyalSpider::Clients::KuaileMahua::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.kl688.com'
          }

        # TODO:
        def before_fetch options={}
          puts "before_fetch: #{options}"
        end

        # Builds one entity per '.xiaohua' node. An item without a title
        # link is skipped instead of aborting the whole page. When the item
        # shows a "more" link, the entity fetches itself before validation.
        def after_fetch_success result
          html_doc = result.response_html_doc

          # Loop-invariant, so looked up once.
          _base_url = result.fetch_options.base_url.to_s.strip

          html_doc.css('.main .main-left .xiaohua').each do |entity_doc|
            _title_link = entity_doc.css('h3 a').first
            next if _title_link.nil?

            _comments_doc = entity_doc.css('.tools li s').first

            _entity_attr = {}

            _entity_attr[:url]     = "#{_base_url}#{_title_link.attr('href').to_s.strip}"
            _entity_attr[:title]   = _title_link.text
            _entity_attr[:content] = entity_doc.css('.content').inner_html

            _entity_attr[:tags] = entity_doc.css('.link .tags h4 a').map do |_tag_doc|
              {
                :text => _tag_doc.text.to_s.strip,
                :href => "#{_base_url}#{_tag_doc.attr('href').to_s.strip}"
              }
            end

            _entity_attr[:authors] = entity_doc.css('.link .tags .pusher a').map do |_author_doc|
              {
                :text => _author_doc.text.to_s.strip,
                :href => "#{_author_doc.attr('href').to_s.strip}"
              }
            end

            _entity_attr[:up_rating]      = entity_doc.css('.tools li a.good').text.to_i
            _entity_attr[:down_rating]    = entity_doc.css('.tools li a.bad').text.to_i
            # A missing counter node counts as 0, like an empty one.
            _entity_attr[:comments_count] = _comments_doc ? _comments_doc.text.to_i : 0

            _entity = self.new_entity(_entity_attr)

            if entity_doc.css('.content .more a').any?
              _entity.fetch
            end

            if _entity.valid?
              self.add_entity _entity
            end
          end
        end

        def after_fetch_fail result
          puts "after_fetch fail: #{result}"
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Lengxiaohua
      # Article entity for the Lengxiaohua client. FetchAble is not mixed
      # in; the fetch configuration below is kept commented out.
      class ArticleEntity
        include ::LoyalSpider::EntityAble

        # include ::LoyalSpider::FetchAble
        #
        # self.config_loyal_spider_default_fetch_options(
        #   :encoding_type => 'UTF-8',
        #   :base_url      => 'http://lengxiaohua.com'
        # )

        # Delegates unchanged to the inherited validation.
        def valid?
          super
        end
      end
    end
  end
end
@@ -0,0 +1,107 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Lengxiaohua
      # Lists the jokes found on a lengxiaohua.com index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://lengxiaohua.com/',
          :url_format    => 'http://lengxiaohua.com/?page_num=%{page}',
          :entity_clazz  => ::LoyalSpider::Clients::Lengxiaohua::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'UTF-8',
            :base_url      => 'http://lengxiaohua.com'
          }

        # TODO:
        def before_fetch options={}
          puts "before_fetch: #{options}"
        end

        # Builds one entity per 'li.joke_li' node. An item without its tool
        # link (which carries the joke id) or its text node is skipped
        # instead of aborting the whole page.
        def after_fetch_success result
          html_doc = result.response_html_doc

          # Loop-invariant, so looked up once.
          _base_url = result.fetch_options.base_url.to_s.strip

          html_doc.css('.joke_wrap li.joke_li').each do |entity_doc|
            _joke_link = entity_doc.css('.para_info .para_tool a').first
            _text_doc  = entity_doc.css('.para_can pre').first
            next if _joke_link.nil? || _text_doc.nil?

            _joke_id = _joke_link.attr('jokeid')

            _entity_attr = {}

            _entity_attr[:url]   = "#{_base_url}/joke/#{_joke_id}"
            _entity_attr[:title] = ''

            _content = Sanitize.clean(_text_doc.inner_html).split(/\n/).map do |_cnt|
              "<p>#{_cnt}</p>"
            end.join('')

            _img_box = entity_doc.css('.default_load_imgbox').first

            if _img_box
              _image_content = _img_box.css('img').map do |_img|
                _src = _img.attr('data-original').to_s.gsub('!water', '')
                # The URL comes from the scraped page; escape the quote that
                # would otherwise end the single-quoted attribute.
                "<img src='#{_src.gsub("'", '&#39;')}'/>"
              end

              _content = _content + "<p>#{_image_content.join('')}</p>" if _image_content.any?
            end

            _entity_attr[:content] = _content

            _entity_attr[:tags] = entity_doc.css('.tag_box a').map do |_tag_doc|
              {
                :text => _tag_doc.text,
                :href => "#{self.base_url}#{_tag_doc.attr('href')}"
              }
            end

            _author_doc = entity_doc.css('.para_info .user_info a').first

            if _author_doc
              _entity_attr[:authors] = [
                {
                  :text => _author_doc.text.to_s.strip,
                  :href => "#{self.base_url}#{_author_doc.attr('href').to_s.strip}"
                }
              ]
            else
              _entity_attr[:authors] = []
            end

            _tool_doc = entity_doc.css('.para_tool')

            _entity_attr[:up_rating]      = _count_from(_tool_doc.css('a[report=like_joke] span').last)
            _entity_attr[:down_rating]    = _count_from(_tool_doc.css('a[report=unlike_joke] span').last)
            _entity_attr[:comments_count] = _tool_doc.css("#show_comment_count_#{_joke_id}").text.gsub(/\W/, '').to_i

            _entity = self.new_entity(_entity_attr)

            if _entity.valid?
              self.add_entity _entity
            end
          end
        end

        def after_fetch_fail result
          puts "after_fetch fail: #{result}"
        end

        private

        # Integer made of the word characters of the node's text; -1 when
        # the node is missing.
        def _count_from node
          node ? node.text.gsub(/\W/, '').to_i : -1
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Xiaohuadi
      # Article entity for the Xiaohuadi client (xiaohuadi.com). It can
      # fetch itself and then fills its attributes from the fetched page.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
        include ::LoyalSpider::FetchAble

        self.config_loyal_spider_default_fetch_options(
          :encoding_type => 'GBK',
          :base_url      => 'http://www.xiaohuadi.com'
        )

        # Fills title, content and tags from the fetched page. Authors are
        # set to [] and the three counters to -1.
        def after_fetch_success result
          entity_doc = result.response_html_doc.css('.listx')

          self.title = "#{entity_doc.css('.sonxltitle h1').text}"

          _article_doc = entity_doc.css('.sonxlarticle').first

          # Without an article node the content already held by the entity
          # is kept rather than raising on nil.
          if _article_doc
            self.content = _article_doc.inner_html.split("<br>\r\n").map do |_cnt|
              "<p>#{Sanitize.clean _cnt}</p>"
            end.join('')
          end

          _category_doc = entity_doc.css('.sonxlPosition a').last

          if _category_doc
            self.tags = [
              {
                :text => _category_doc.text,
                :href => "#{self.base_url}#{_category_doc.attr('href')}"
              }
            ]
          else
            self.tags = []
          end

          self.authors = []

          self.up_rating      = -1
          self.down_rating    = -1
          self.comments_count = -1
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Xiaohuadi
      # Lists the articles found on a xiaohuadi.com category index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.xiaohuadi.com/%{category}/',
          :url_format    => 'http://www.xiaohuadi.com/%{category}/index_%{page}.html',
          :entity_clazz  => ::LoyalSpider::Clients::Xiaohuadi::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.xiaohuadi.com'
          }

        # TODO:
        def before_fetch options={}
          puts "before_fetch: #{options}"
        end

        # Builds one entity per '.ilistxllist>ul' node. An item without a
        # link or a text node is skipped instead of aborting the whole page.
        # When the listed text contains the "not fully shown" marker, the
        # entity fetches itself before validation.
        def after_fetch_success result
          html_doc = result.response_html_doc

          # Loop-invariant, so looked up once.
          _base_url = result.fetch_options.base_url.to_s.strip

          html_doc.css('.ilistxllist>ul').each do |entity_doc|
            _link_doc = entity_doc.css('.ilistxlctlB1 a')
            _text_doc = entity_doc.css('.ilistxlctlB2').first
            next if _link_doc.empty? || _text_doc.nil?

            _entity_attr = {}

            _entity_attr[:url]   = "#{_base_url}#{_link_doc.attr('href')}"
            _entity_attr[:title] = "#{_link_doc.text}"

            _entity_attr[:content] = _text_doc.inner_html.split("<br>\r\n").map do |_cnt|
              "<p>#{Sanitize.clean _cnt}</p>"
            end.join('')

            _category_doc = entity_doc.css('.ilistxlctlC table td a').last

            if _category_doc
              _entity_attr[:tags] = [
                {
                  :text => _category_doc.text,
                  :href => "#{self.base_url}#{_category_doc.attr('href')}"
                }
              ]
            else
              _entity_attr[:tags] = []
            end

            _entity_attr[:authors] = []

            _tool_doc = entity_doc.css('.ilistxlctlA ul li')

            _entity_attr[:up_rating]      = _count_at(_tool_doc, 1)
            _entity_attr[:down_rating]    = _count_at(_tool_doc, 2)
            _entity_attr[:comments_count] = _count_at(_tool_doc, 0)

            _entity = self.new_entity(_entity_attr)

            if _entity.content.include?('未显示完,查看全文')
              _entity.fetch
            end

            if _entity.valid?
              self.add_entity _entity
            end
          end
        end

        def after_fetch_fail result
          puts "after_fetch fail: #{result}"
        end

        private

        # Integer value of the text of nodes[index]; -1 when that node is
        # missing.
        def _count_at nodes, index
          _node = nodes[index]
          _node ? _node.text.to_i : -1
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Settings container; it currently declares no settings of its own.
  class Config
  end

  # The module-wide configuration object, created on first access.
  def self.config
    @config = Config.new unless @config
    @config
  end

  # Replaces the configuration object.
  def self.config=(value)
    @config = value
  end

  # Yields the configuration object to the block and returns the block's value.
  def self.configure
    yield config
  end
end
20
+
@@ -0,0 +1 @@
1
+ # -*- encoding : utf-8 -*-
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/utils/hash_util"
3
+ require "#{File.dirname(__FILE__)}/utils/array_util"
4
+
@@ -0,0 +1,16 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Small array helpers.
  class ArrayUtil
    class << self
      # Removes and returns a trailing Hash from +arr+; returns a new empty
      # Hash (leaving +arr+ untouched) when the last element is not a Hash.
      def extract_options!(arr)
        return {} unless arr.last.is_a?(Hash)

        arr.pop
      end

      # Wraps +args+ in an Array: arrays pass through, nil becomes [] and
      # anything else becomes a one-element array.
      def init(args)
        return args if args.is_a?(Array)
        return [] if args.nil?

        [args]
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Recursive merge and copy helpers for nested hashes.
  class HashUtil
    # Merges +b_hash+ into +a_hash+ in place and returns +a_hash+. When both
    # sides hold a Hash under the same key, the two are merged recursively;
    # otherwise the value from +b_hash+ wins.
    def self.deep_merge!(a_hash, b_hash)
      b_hash.each_pair do |k, v|
        tv = a_hash[k]
        # Recurse into the nested value (tv), not into the whole a_hash.
        a_hash[k] = tv.is_a?(Hash) && v.is_a?(Hash) ? self.deep_merge(tv, v) : v
      end

      a_hash
    end

    # Like deep_merge!, but works on a deep copy and leaves +a_hash+ untouched.
    def self.deep_merge a_hash, b_hash
      self.deep_merge! self.deep_dup(a_hash), b_hash
    end

    # Copies +hash+ and, recursively, every Hash value inside it. Values of
    # other types are shared with the original.
    def self.deep_dup hash
      duplicate = hash.dup

      duplicate.each_pair do |k, v|
        duplicate[k] = v.is_a?(Hash) ? self.deep_dup(v) : v
      end

      duplicate
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Version string of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,4 @@
1
+ # desc "Explaining what the task does"
2
+ # task :loyal_spider do
3
+ # # Task goes here
4
+ # end
metadata ADDED
@@ -0,0 +1,173 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loyal_spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - happy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-09-09 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rest-client
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: sanitize
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rails_config
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Description of LoyalSpider.
111
+ email:
112
+ - andywang7259@gmail.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - lib/loyal_spider/ables/fetch_options.rb
118
+ - lib/loyal_spider/ables/fetch_able.rb
119
+ - lib/loyal_spider/ables/fetch_result.rb
120
+ - lib/loyal_spider/ables/entity_lister_able.rb
121
+ - lib/loyal_spider/ables/entity_able.rb
122
+ - lib/loyal_spider/utils/hash_util.rb
123
+ - lib/loyal_spider/utils/array_util.rb
124
+ - lib/loyal_spider/config.rb
125
+ - lib/loyal_spider/utils.rb
126
+ - lib/loyal_spider/version.rb
127
+ - lib/loyal_spider/image.rb
128
+ - lib/loyal_spider/clients.rb
129
+ - lib/loyal_spider/clients/kuaile_mahua/article_entity.rb
130
+ - lib/loyal_spider/clients/kuaile_mahua/article_entity_lister.rb
131
+ - lib/loyal_spider/clients/xiaohuadi/article_entity.rb
132
+ - lib/loyal_spider/clients/xiaohuadi/article_entity_lister.rb
133
+ - lib/loyal_spider/clients/haha365/article_entity.rb
134
+ - lib/loyal_spider/clients/haha365/article_entity_lister.rb
135
+ - lib/loyal_spider/clients/lengxiaohua/article_entity.rb
136
+ - lib/loyal_spider/clients/lengxiaohua/article_entity_lister.rb
137
+ - lib/loyal_spider/ables.rb
138
+ - lib/loyal_spider.rb
139
+ - lib/tasks/loyal_spider_tasks.rake
140
+ - MIT-LICENSE
141
+ - Rakefile
142
+ - README.md
143
+ homepage: http://github.com/xiuxian123
144
+ licenses: []
145
+ post_install_message:
146
+ rdoc_options: []
147
+ require_paths:
148
+ - lib
149
+ required_ruby_version: !ruby/object:Gem::Requirement
150
+ none: false
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ segments:
156
+ - 0
157
+ hash: -2402893976680931226
158
+ required_rubygems_version: !ruby/object:Gem::Requirement
159
+ none: false
160
+ requirements:
161
+ - - ">="
162
+ - !ruby/object:Gem::Version
163
+ version: '0'
164
+ segments:
165
+ - 0
166
+ hash: -2402893976680931226
167
+ requirements: []
168
+ rubyforge_project:
169
+ rubygems_version: 1.8.25
170
+ signing_key:
171
+ specification_version: 3
172
+ summary: Summary of LoyalSpider.
173
+ test_files: []