loyal_spider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ Copyright 2013 YOURNAME
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,4 @@
1
+ # LoyalSpider
2
+
3
+ 这是一组针对网站内容的蜘蛛爬虫类, 用于爬取网页的内容等.
4
+
@@ -0,0 +1,21 @@
1
# Rakefile for the loyal_spider gem: loads Bundler, defines the RDoc task and
# installs the standard gem build/install/release tasks.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'LoyalSpider'
  rdoc.options << '--line-numbers'
  # The package ships README.md; there is no README.rdoc to include.
  rdoc.rdoc_files.include('README.md')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Bundler may be missing (see the rescue above); in that case skip the gem
# tasks instead of failing with a NameError after the hint was printed.
Bundler::GemHelper.install_tasks if defined?(Bundler)
21
+
@@ -0,0 +1,13 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'rest-client'
3
+ require 'sanitize'
4
+ require "rails_config"
5
+ require "nokogiri"
6
+
7
+ require "loyal_spider/config"
8
+ require "loyal_spider/ables"
9
+ require "loyal_spider/clients"
10
+ require "loyal_spider/utils"
11
+
12
+ module LoyalSpider
13
+ end
@@ -0,0 +1,7 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/ables/fetch_able"
3
+ require "#{File.dirname(__FILE__)}/ables/entity_able"
4
+ require "#{File.dirname(__FILE__)}/ables/entity_lister_able"
5
+ require "#{File.dirname(__FILE__)}/ables/fetch_options"
6
+ require "#{File.dirname(__FILE__)}/ables/fetch_result"
7
+
@@ -0,0 +1,86 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Mixin for scraped entities (one article / joke).
  #
  # Including it gives the host class accessors for the scraped fields, a
  # hash-based constructor, list-valued +tags+ / +authors+ that default to an
  # empty array, and a small validation API (+valid?+, +valid!+, +errors+).
  module EntityAble
    def self.included(base)
      base.class_eval do
        attr_accessor :title          # title
        attr_accessor :url            # source URL
        attr_accessor :content        # body (HTML)
        attr_accessor :up_rating      # number of up votes
        attr_accessor :down_rating    # number of down votes
        attr_accessor :comments_count # number of comments
        attr_accessor :publish_time   # publication time

        # Writers only. The readers are defined in InstanceMethods so their
        # lazy defaults apply: a reader generated here on the host class
        # would take precedence over the module's and return nil when unset.
        attr_writer :tags    # list of {:text, :href} hashes
        attr_writer :authors # list of {:text, :href} hashes
        attr_writer :errors  # field => [messages]

        include InstanceMethods
      end
    end

    module InstanceMethods
      # Assigns every key of +attrs+ through the matching writer. An unknown
      # key raises NoMethodError. Errors always start out empty.
      def initialize(attrs = {})
        attrs.each do |key, value|
          send(:"#{key}=", value)
        end

        self.errors = {}
      end

      # Validation errors, keyed by field name.
      def errors
        @errors ||= {}
      end

      # Scraped tags; never nil.
      def tags
        @tags ||= []
      end

      # The +:text+ of every tag. Computed on each call so it always reflects
      # the current tags.
      def tags_array
        tags.map { |tag| tag[:text] }
      end

      # src/title/alt of every <img> found in +content+. The parse result is
      # cached per content value, so changing +content+ refreshes the list.
      def images
        html = content.to_s
        return @images if @images && @images_source == html

        @images_source = html
        @images = Nokogiri::HTML.parse(html).css('img').map do |img_doc|
          {
            :src   => img_doc.attr('src'),
            :title => img_doc.attr('title'),
            :alt   => img_doc.attr('alt')
          }
        end
      end

      def images?
        images.any?
      end

      # Scraped authors; never nil.
      def authors
        @authors ||= []
      end

      # The +:text+ of every author, computed on each call.
      def authors_array
        authors.map { |author| author[:text] }
      end

      # Re-runs the validations and reports whether the entity is usable.
      def valid?
        valid!
        errors.empty?
      end

      # Rebuilds +errors+ from scratch: content and url must not be blank.
      # Starting from an empty hash keeps repeated calls from duplicating
      # messages and lets a corrected entity become valid again.
      def valid!
        self.errors = {}

        unless content.to_s.strip.size > 0
          add_error :content, '不能为空'
        end

        unless url.to_s.strip.size > 0
          add_error :url, '不能为空'
        end
      end

      private

      def add_error(field, message)
        (errors[field] ||= []) << message
      end
    end
  end
end
@@ -0,0 +1,119 @@
1
+ # -*- encoding : utf-8 -*-
2
+ ## -*- encoding : utf-8 -*-
3
module LoyalSpider
  # Mixin for "lister" spiders: objects that fetch one page of a paginated
  # index and collect the entities found on it.
  #
  # The host class configures URL templates with
  # +config_loyal_spider_entity_lister+ and implements +after_fetch_success+,
  # which should call +new_entity+ / +add_entity+. Fetching itself comes from
  # LoyalSpider::FetchAble, which is included automatically.
  module EntityListerAble
    def self.included(base)
      base.class_eval do
        attr_writer :current_page

        include ::LoyalSpider::FetchAble
        include InstanceMethods
        extend ClassMethods
      end
    end

    module ClassMethods
      # Stores the lister configuration.
      #
      # options:
      # - :url_format         sprintf template for pages >= 2 (uses %{page})
      # - :url_format_first   sprintf template for the first page
      # - :url_format_options default values for the template placeholders
      # - :entity_clazz       class instantiated by +new_entity+
      # - :fetch_options      forwarded to
      #                       +config_loyal_spider_default_fetch_options+
      def config_loyal_spider_entity_lister(options = {})
        # Copy first: :fetch_options is removed below and the caller's hash
        # must not be modified.
        @entity_lister_options = options.dup

        config_loyal_spider_default_fetch_options(
          @entity_lister_options.delete(:fetch_options) || {}
        )
      end

      def entity_lister_options
        @entity_lister_options ||= {}
      end

      # Fetches +page+ with a fresh lister instance and returns the result of
      # the instance-level +paged_fetch+.
      def paged_fetch(page, options = {}, &block)
        new.paged_fetch(page, options, &block)
      end
    end

    module InstanceMethods
      # Fetches +page+. options[:url_format_options] overrides template
      # placeholders for this instance; the whole +options+ hash is also
      # passed on to +fetch+.
      def paged_fetch(page, options = {}, &block)
        url_format_options.merge!(options[:url_format_options] || {})

        self.current_page = page
        fetch(options, &block)
      end

      def url_format
        @url_format ||= self.class.entity_lister_options[:url_format]
      end

      def url_format_first
        @url_format_first ||= self.class.entity_lister_options[:url_format_first]
      end

      # Per-instance copy of the configured placeholder values. It is a copy
      # because +paged_fetch+ merges request-specific values into it.
      def url_format_options
        @url_format_options ||=
          (self.class.entity_lister_options[:url_format_options] || {}).dup
      end

      # Internal hook run before every fetch: start with no entities.
      def _before_fetch(options = {})
        @entities = []
      end

      # Internal hook run after a successful fetch: publish the entities.
      def _after_fetch_success(result)
        result.entities = entities
      end

      def entities
        @entities ||= []
      end

      def entity_clazz
        self.class.entity_lister_options[:entity_clazz]
      end

      # Builds an entity of the configured class; returns nil when no
      # :entity_clazz was configured.
      def new_entity(attrs = {})
        entity_clazz.new(attrs) if entity_clazz
      end

      def add_entity(entity)
        entities << entity
      end

      def current_page
        @current_page ||= 1
      end

      def first_page?
        current_page < 2
      end

      # URL of the current page. Built on every call, so an instance can be
      # reused for several pages; falls back to +url_format+ when no
      # first-page template is configured.
      def fetch_url
        template = (first_page? && url_format_first) || url_format

        sprintf(template, url_format_options.merge(:page => current_page))
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Mixin that gives a class the ability to fetch a URL over HTTP.
  #
  # The URL comes from +fetch_url+ when the host responds to it, otherwise
  # from +url+. Optional hooks, called only when the host defines them:
  # +_before_fetch+ / +before_fetch+ (options),
  # +after_fetch_success+ / +_after_fetch_success+ (result),
  # +after_fetch_fail+ / +_after_fetch_fail+ (result).
  module FetchAble
    def self.included(base)
      base.class_eval do
        attr_accessor :fetch_options

        include InstanceMethods
        extend ClassMethods
      end
    end

    module ClassMethods
      # Sets the class-wide default fetch options (see FetchOptions for the
      # recognised keys). A later call replaces the earlier configuration.
      def config_loyal_spider_default_fetch_options(options = {})
        @default_fetch_options = options
      end

      def default_fetch_options
        @default_fetch_options ||= {}
      end

      # Convenience: fetch with a fresh instance.
      def fetch(options = {}, &block)
        new.fetch(options, &block)
      end
    end

    module InstanceMethods
      # Base URL of the target site. Before the first fetch there are no
      # per-request options yet, so the class defaults are consulted.
      def base_url
        return fetch_options.base_url if fetch_options

        self.class.default_fetch_options[:base_url].to_s
      end

      # Performs the request, runs the hooks and returns a FetchResult.
      # The optional block receives the result as well.
      def fetch(options = {}, &block)
        _before_fetch(options) if respond_to?(:_before_fetch, true)
        before_fetch(options)  if respond_to?(:before_fetch, true)

        result = _perform_fetch(options, &block)

        if result.success?
          after_fetch_success(result)  if respond_to?(:after_fetch_success, true)
          _after_fetch_success(result) if respond_to?(:_after_fetch_success, true)
        else
          after_fetch_fail(result)  if respond_to?(:after_fetch_fail, true)
          _after_fetch_fail(result) if respond_to?(:_after_fetch_fail, true)
        end

        result
      end

      private

      # Executes the HTTP request and wraps the outcome in a FetchResult.
      # Errors are reported through the result, not raised: SocketError maps
      # to :socket_error / 502, any other StandardError to
      # :request_error / 400.
      def _perform_fetch(options = {}, &block)
        _fetch_url = respond_to?(:fetch_url, true) ? fetch_url : url

        @fetch_options = ::LoyalSpider::FetchOptions.new(
          ::LoyalSpider::HashUtil.deep_merge(self.class.default_fetch_options, options).merge(
            :url => _fetch_url
          )
        )

        begin
          ::RestClient::Request.execute(@fetch_options.net_options) do |response, request, response_result, &_block|
            response_code = response.code

            # Re-label the body with the site's declared encoding and
            # transcode it to UTF-8 in place. Bytes that cannot be converted
            # are replaced so one bad character does not fail the page.
            body = response.force_encoding(fetch_options.encoding_type).encode!(
              'UTF-8', :invalid => :replace, :undef => :replace
            )

            _result = ::LoyalSpider::FetchResult.new(
              :response_status => _response_status_for(response_code),
              :response_code   => response_code,
              :response        => body,
              :request         => request,
              :response_result => response_result,
              :fetch_options   => fetch_options,
              :url             => _fetch_url
            )

            block.call(_result, &_block) if block

            _result
          end
        # StandardError, not Exception: signals and exit requests must not be
        # turned into a "request error".
        rescue StandardError => exception
          _error_result = {
            :response        => nil,
            :request         => nil,
            :response_result => nil,
            :fetch_options   => fetch_options,
            :url             => _fetch_url,
            :exception       => exception
          }

          case exception
          when ::SocketError
            _error_result.merge!(:response_status => :socket_error, :response_code => 502)
          else
            _error_result.merge!(:response_status => :request_error, :response_code => 400)
          end

          _result = ::LoyalSpider::FetchResult.new(_error_result)

          # Only the caller's block exists here; the block parameter of the
          # RestClient callback above is not in scope in this branch.
          block.call(_result) if block

          _result
        end
      end

      # Maps an HTTP status code to the coarse status stored in the result.
      def _response_status_for(response_code)
        case response_code
        when 200..299 then :success
        when 300..399 then :redirect
        when 400..499 then :request_error
        else :server_error
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Value object holding the settings of one HTTP fetch.
  class FetchOptions
    # Headers sent with every request unless overridden via :headers.
    DEFAULT_HEADERS = {
      :accept_charset  => 'UTF-8,*;q=0.5',
      :accept_encoding => 'gzip,deflate,sdch',
      :user_agent      => 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'
    }.freeze

    attr_accessor :method        # HTTP method
    attr_accessor :open_timeout  # connection-open timeout, seconds
    attr_accessor :timeout       # request timeout, seconds
    attr_accessor :encoding_type # encoding of the response body
    attr_accessor :headers       # request headers
    attr_accessor :url           # URL to fetch
    attr_accessor :base_url      # site root, used to absolutise links

    # attrs: :url, :base_url, :method (default :get), :timeout (default 60),
    # :open_timeout (defaults to the timeout), :encoding_type (default
    # 'UTF-8') and :headers (merged over DEFAULT_HEADERS).
    def initialize(attrs = {})
      @url           = attrs[:url] || ''
      @base_url      = attrs[:base_url] || ''
      @method        = attrs[:method] || :get
      @timeout       = attrs[:timeout] || 60
      # :open_time is the key this class originally read; it is still
      # honoured so existing configurations keep working.
      @open_timeout  = attrs[:open_timeout] || attrs[:open_time] || @timeout
      @encoding_type = attrs[:encoding_type] || 'UTF-8'
      @headers       = DEFAULT_HEADERS.merge(attrs[:headers] || {})
    end

    # The options hash handed to the HTTP client.
    def net_options
      {
        :url           => url,
        :method        => self.method,
        :timeout       => timeout,
        :open_timeout  => open_timeout,
        :encoding_type => encoding_type,
        :headers       => headers
      }
    end
  end
end
@@ -0,0 +1,34 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Outcome of one fetch: status, raw response, request context and the
  # entities a lister extracted from the page.
  class FetchResult
    attr_accessor :response_status # :success or a failure symbol
    attr_accessor :response_code   # numeric status code
    attr_accessor :response        # response body (nil on failure)
    attr_accessor :request
    attr_accessor :response_result
    attr_accessor :fetch_options
    attr_accessor :url
    attr_accessor :exception       # the error that aborted the fetch, if any
    attr_writer   :entities

    # Assigns every key of +attrs+ through the matching writer; an unknown
    # key raises NoMethodError.
    def initialize(attrs = {})
      attrs.each do |key, value|
        send(:"#{key}=", value)
      end
    end

    # Entities collected for this result. Always an array, so callers can
    # iterate a failed result without a nil check.
    def entities
      @entities ||= []
    end

    def success?
      response_status == :success
    end

    def fail?
      !success?
    end

    # The response parsed as an HTML document (memoised); nil when there is
    # no response.
    def response_html_doc
      @response_html_doc ||= Nokogiri::HTML.parse(response) if response
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/clients/kuaile_mahua/article_entity"
3
+ require "#{File.dirname(__FILE__)}/clients/kuaile_mahua/article_entity_lister"
4
+
5
+ require "#{File.dirname(__FILE__)}/clients/haha365/article_entity"
6
+ require "#{File.dirname(__FILE__)}/clients/haha365/article_entity_lister"
7
+
8
+ require "#{File.dirname(__FILE__)}/clients/lengxiaohua/article_entity"
9
+ require "#{File.dirname(__FILE__)}/clients/lengxiaohua/article_entity_lister"
10
+
11
+ require "#{File.dirname(__FILE__)}/clients/xiaohuadi/article_entity"
12
+ require "#{File.dirname(__FILE__)}/clients/xiaohuadi/article_entity_lister"
13
+
@@ -0,0 +1,11 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Haha365
      # One article scraped from www.haha365.com. It is a plain entity: all
      # fields and validation come from LoyalSpider::EntityAble.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Haha365
      # Lists the articles on a www.haha365.com category index page.
      # The %{category} placeholder must be supplied through
      # :url_format_options.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.haha365.com/%{category}/',
          :url_format    => 'http://www.haha365.com/%{category}/index_%{page}.htm',
          :entity_clazz  => ::LoyalSpider::Clients::Haha365::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.haha365.com'
          }

        # TODO: replace this debug output with a real hook.
        def before_fetch(options = {})
          puts "before_fetch: #{options}"
        end

        # Extracts the articles from the fetched page. Titles, bodies and
        # category links are three parallel lists matched by position; a
        # position missing its title or body is skipped.
        def after_fetch_success(result)
          _basic_doc = result.response_html_doc.css('html #main .content .left .r_c')

          # Run each query once instead of once per article.
          _title_docs    = _basic_doc.css('.cat_llb h3 a')
          _content_docs  = _basic_doc.css('.cat_llb #endtext')
          _category_docs = _basic_doc.css('.cat_llb .fl a')

          _category_docs.each_with_index do |_category_doc, _index|
            _title_doc   = _title_docs[_index]
            _content_doc = _content_docs[_index]

            next if _title_doc.nil? || _content_doc.nil?

            # Each <br>-separated line becomes its own sanitized paragraph.
            _content = _content_doc.inner_html.to_s.split("<br>\r\n").map do |_cnt|
              "<p>#{Sanitize.clean(_cnt).to_s.strip}</p>"
            end.join('')

            _entity = new_entity(
              :content => _content,
              :url     => "#{base_url}#{_title_doc.attr('href')}",
              :title   => _title_doc.text.to_s,
              :tags    => [
                {
                  :text => _category_doc.text,
                  :href => "#{base_url}#{_category_doc.attr('href')}"
                }
              ],
              :authors => [],
              # -1 is the value this lister stores for the three counters,
              # which it does not scrape.
              :up_rating      => -1,
              :down_rating    => -1,
              :comments_count => -1
            )

            add_entity(_entity) if _entity.valid?
          end
        end

        def after_fetch_fail(result)
          puts "after_fetch fail: #{result}"
        end
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module KuaileMahua
      # One article from www.kl688.com. Besides being an entity it can fetch
      # its own detail page (+fetch+ uses +url+) to fill in the full data.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
        include ::LoyalSpider::FetchAble

        self.config_loyal_spider_default_fetch_options(
          :encoding_type => 'GBK',
          :base_url      => 'http://www.kl688.com'
        )

        # Fills the entity from the fetched detail page. The tag texts need
        # no separate assignment; they are derived from +tags+ by
        # +tags_array+.
        def after_fetch_success(result)
          entity_doc = result.response_html_doc.css('.main .main-left .xiaohua .xiaohua-data')

          # Keep the current title when the page has no <h1>.
          _title_doc = entity_doc.css('h1').first
          self.title = _title_doc.text.to_s.strip if _title_doc

          self.content = entity_doc.css('.content').inner_html

          self.tags = entity_doc.css('.link .tags h4 a').map do |_tag_doc|
            {
              :text => _tag_doc.text.to_s.strip,
              :href => "#{base_url}#{_tag_doc.attr('href').to_s.strip}"
            }
          end

          self.authors = entity_doc.css('.link .tags .pusher a').map do |_author_doc|
            {
              :text => _author_doc.text.to_s.strip,
              :href => _author_doc.attr('href').to_s.strip
            }
          end

          self.up_rating   = entity_doc.css('.tools li a.good').text.to_i
          self.down_rating = entity_doc.css('.tools li a.bad').text.to_i

          # A missing comment counter counts as zero.
          _comments_doc = entity_doc.css('.tools li s').first
          self.comments_count = _comments_doc ? _comments_doc.text.to_i : 0
        end
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module KuaileMahua
      # Lists the articles on a www.kl688.com index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.kl688.com/',
          :url_format    => 'http://www.kl688.com/newjokes/index_%{page}.htm',
          :entity_clazz  => ::LoyalSpider::Clients::KuaileMahua::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.kl688.com'
          }

        # TODO: replace this debug output with a real hook.
        def before_fetch(options = {})
          puts "before_fetch: #{options}"
        end

        # Builds one entity per '.xiaohua' node. When the entry carries a
        # "more" link the entity fetches its own detail page.
        def after_fetch_success(result)
          _base_url = result.fetch_options.base_url.to_s.strip

          result.response_html_doc.css('.main .main-left .xiaohua').each do |entity_doc|
            _title_link = entity_doc.css('h3 a').first

            # Without a title link there is no URL to build.
            next if _title_link.nil?

            _comments_doc = entity_doc.css('.tools li s').first

            _entity = new_entity(
              :url     => "#{_base_url}#{_title_link.attr('href').to_s.strip}",
              :title   => _title_link.text,
              :content => entity_doc.css('.content').inner_html,
              :tags    => entity_doc.css('.link .tags h4 a').map { |_tag_doc|
                {
                  :text => _tag_doc.text.to_s.strip,
                  :href => "#{_base_url}#{_tag_doc.attr('href').to_s.strip}"
                }
              },
              :authors => entity_doc.css('.link .tags .pusher a').map { |_author_doc|
                {
                  :text => _author_doc.text.to_s.strip,
                  :href => _author_doc.attr('href').to_s.strip
                }
              },
              :up_rating      => entity_doc.css('.tools li a.good').text.to_i,
              :down_rating    => entity_doc.css('.tools li a.bad').text.to_i,
              # A missing comment counter counts as zero.
              :comments_count => _comments_doc ? _comments_doc.text.to_i : 0
            )

            _entity.fetch if entity_doc.css('.content .more a').any?

            add_entity(_entity) if _entity.valid?
          end
        end

        def after_fetch_fail(result)
          puts "after_fetch fail: #{result}"
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Lengxiaohua
      # One joke from lengxiaohua.com. It is a plain entity: it does not
      # include FetchAble, so it has no detail-page fetch of its own, and
      # validation is the one inherited from LoyalSpider::EntityAble.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
      end
    end
  end
end
@@ -0,0 +1,107 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Lengxiaohua
      # Lists the jokes on a lengxiaohua.com index page.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://lengxiaohua.com/',
          :url_format    => 'http://lengxiaohua.com/?page_num=%{page}',
          :entity_clazz  => ::LoyalSpider::Clients::Lengxiaohua::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'UTF-8',
            :base_url      => 'http://lengxiaohua.com'
          }

        # TODO: replace this debug output with a real hook.
        def before_fetch(options = {})
          puts "before_fetch: #{options}"
        end

        # Builds one entity per 'li.joke_li' node. Entries that lack the
        # joke id link or the text node are skipped.
        def after_fetch_success(result)
          _base_url = result.fetch_options.base_url.to_s.strip

          result.response_html_doc.css('.joke_wrap li.joke_li').each do |entity_doc|
            _id_doc   = entity_doc.css('.para_info .para_tool a').first
            _text_doc = entity_doc.css('.para_can pre').first

            next if _id_doc.nil? || _text_doc.nil?

            _joke_id = _id_doc.attr('jokeid')

            # One paragraph per line of the sanitized joke text.
            _content = Sanitize.clean(_text_doc.inner_html).split(/\n/).map do |_cnt|
              "<p>#{_cnt}</p>"
            end.join('')

            if _img_box = entity_doc.css('.default_load_imgbox').first
              # The image URL is read from 'data-original'; the '!water'
              # marker is removed from it.
              # NOTE(review): the attribute value is interpolated into HTML
              # unescaped - confirm that consumers sanitize +content+.
              _image_content = _img_box.css('img').map do |_img|
                "<img src='#{_img.attr('data-original').to_s.gsub('!water', '')}'/>"
              end

              _content = _content + "<p>#{_image_content.join('')}</p>" if _image_content.any?
            end

            _author_doc = entity_doc.css('.para_info .user_info a').first
            _tool_doc   = entity_doc.css('.para_tool')

            _entity = new_entity(
              :url     => "#{_base_url}/joke/#{_joke_id}",
              :title   => '',
              :content => _content,
              :tags    => entity_doc.css('.tag_box a').map { |_tag_doc|
                {
                  :text => _tag_doc.text,
                  :href => "#{base_url}#{_tag_doc.attr('href')}"
                }
              },
              :authors        => _authors_from(_author_doc),
              :up_rating      => _count_from(_tool_doc.css('a[report=like_joke] span').last),
              :down_rating    => _count_from(_tool_doc.css('a[report=unlike_joke] span').last),
              :comments_count => _count_from(_tool_doc.css("#show_comment_count_#{_joke_id}"))
            )

            add_entity(_entity) if _entity.valid?
          end
        end

        def after_fetch_fail(result)
          puts "after_fetch fail: #{result}"
        end

        private

        # The author list for an optional author link node.
        def _authors_from(_author_doc)
          return [] if _author_doc.nil?

          [
            {
              :text => _author_doc.text.to_s.strip,
              :href => "#{base_url}#{_author_doc.attr('href').to_s.strip}"
            }
          ]
        end

        # Integer value of a counter node with non-word characters removed;
        # 0 when the node is missing.
        def _count_from(_node)
          _node ? _node.text.gsub(/\W/, '').to_i : 0
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Xiaohuadi
      # One article from www.xiaohuadi.com. Besides being an entity it can
      # fetch its own detail page (+fetch+ uses +url+) to get the full text.
      class ArticleEntity
        include ::LoyalSpider::EntityAble
        include ::LoyalSpider::FetchAble

        self.config_loyal_spider_default_fetch_options(
          :encoding_type => 'GBK',
          :base_url      => 'http://www.xiaohuadi.com'
        )

        # Fills the entity from the fetched detail page. When the page has
        # no article node the entity is left untouched.
        def after_fetch_success(result)
          entity_doc = result.response_html_doc.css('.listx')

          _article_doc = entity_doc.css('.sonxlarticle').first
          return if _article_doc.nil?

          self.title = entity_doc.css('.sonxltitle h1').text.to_s

          # Each <br>-separated line becomes its own sanitized paragraph.
          self.content = _article_doc.inner_html.split("<br>\r\n").map do |_cnt|
            "<p>#{Sanitize.clean _cnt}</p>"
          end.join('')

          # The last link of the position bar is used as the only tag.
          _category_doc = entity_doc.css('.sonxlPosition a').last

          self.tags =
            if _category_doc
              [
                {
                  :text => _category_doc.text,
                  :href => "#{base_url}#{_category_doc.attr('href')}"
                }
              ]
            else
              []
            end

          self.authors = []

          # -1 is the value this method stores for the three counters, which
          # it does not scrape from the detail page.
          self.up_rating      = -1
          self.down_rating    = -1
          self.comments_count = -1
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  module Clients
    module Xiaohuadi
      # Lists the articles on a www.xiaohuadi.com category index page.
      # The %{category} placeholder must be supplied through
      # :url_format_options.
      class ArticleEntityLister
        include ::LoyalSpider::EntityListerAble

        self.config_loyal_spider_entity_lister :url_format_first => 'http://www.xiaohuadi.com/%{category}/',
          :url_format    => 'http://www.xiaohuadi.com/%{category}/index_%{page}.html',
          :entity_clazz  => ::LoyalSpider::Clients::Xiaohuadi::ArticleEntity,
          :fetch_options => {
            :encoding_type => 'GBK',
            :base_url      => 'http://www.xiaohuadi.com'
          }

        # TODO: replace this debug output with a real hook.
        def before_fetch(options = {})
          puts "before_fetch: #{options}"
        end

        # Builds one entity per '.ilistxllist>ul' node. Entries without a
        # title link or a text node are skipped. When the list text contains
        # the site's "not fully shown" marker the entity fetches its own
        # detail page.
        def after_fetch_success(result)
          _base_url = result.fetch_options.base_url.to_s.strip

          result.response_html_doc.css('.ilistxllist>ul').each do |entity_doc|
            _link_docs = entity_doc.css('.ilistxlctlB1 a')
            _link_doc  = _link_docs.first
            _text_doc  = entity_doc.css('.ilistxlctlB2').first

            next if _link_doc.nil? || _text_doc.nil?

            # Each <br>-separated line becomes its own sanitized paragraph.
            _content = _text_doc.inner_html.split("<br>\r\n").map do |_cnt|
              "<p>#{Sanitize.clean _cnt}</p>"
            end.join('')

            _category_doc = entity_doc.css('.ilistxlctlC table td a').last
            _tool_doc     = entity_doc.css('.ilistxlctlA ul li')

            _entity = new_entity(
              :url     => "#{_base_url}#{_link_doc.attr('href')}",
              :title   => _link_docs.text.to_s,
              :content => _content,
              :tags    => _tags_from(_category_doc),
              :authors => [],
              # Counter order in the toolbar: comments, up votes, down votes.
              :up_rating      => _int_from(_tool_doc[1]),
              :down_rating    => _int_from(_tool_doc[2]),
              :comments_count => _int_from(_tool_doc[0])
            )

            _entity.fetch if _entity.content.include?('未显示完,查看全文')

            add_entity(_entity) if _entity.valid?
          end
        end

        def after_fetch_fail(result)
          puts "after_fetch fail: #{result}"
        end

        private

        # The tag list for an optional category link node.
        def _tags_from(_category_doc)
          return [] if _category_doc.nil?

          [
            {
              :text => _category_doc.text,
              :href => "#{base_url}#{_category_doc.attr('href')}"
            }
          ]
        end

        # Integer value of a counter node; 0 when the node is missing.
        def _int_from(_node)
          _node ? _node.text.to_i : 0
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Placeholder for gem-wide settings; it has no attributes yet.
  class Config
  end

  # Gem-wide configuration, created lazily on first access.
  def self.config
    @config ||= Config.new
  end

  # Replaces the gem-wide configuration object.
  def self.config=(value)
    @config = value
  end

  # Yields the gem-wide configuration to the given block.
  def self.configure
    yield(config)
  end
end
20
+
@@ -0,0 +1 @@
1
+ # -*- encoding : utf-8 -*-
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require "#{File.dirname(__FILE__)}/utils/hash_util"
3
+ require "#{File.dirname(__FILE__)}/utils/array_util"
4
+
@@ -0,0 +1,16 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Small array helpers.
  class ArrayUtil
    class << self
      # Removes and returns a trailing Hash from +arr+; returns a new empty
      # hash and leaves +arr+ unchanged when the last element is not a Hash.
      def extract_options!(arr)
        return {} unless arr.last.is_a?(Hash)

        arr.pop
      end

      # Wraps +args+ in an array: arrays are returned as they are, nil
      # becomes [], anything else becomes a one-element array.
      def init(args)
        return args if args.is_a?(Array)
        return [] if args.nil?

        [args]
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # -*- encoding : utf-8 -*-
2
module LoyalSpider
  # Recursive hash helpers.
  class HashUtil
    # Merges +b_hash+ into +a_hash+ in place and returns +a_hash+.
    # Where both sides hold a Hash under the same key the two are merged
    # recursively; otherwise the value from +b_hash+ wins.
    def self.deep_merge!(a_hash, b_hash)
      b_hash.each_pair do |key, value|
        current = a_hash[key]

        a_hash[key] =
          if current.is_a?(Hash) && value.is_a?(Hash)
            # Recurse into the nested value, not into the enclosing hash.
            deep_merge(current, value)
          else
            value
          end
      end

      a_hash
    end

    # Like deep_merge! but works on a deep copy, leaving +a_hash+ untouched.
    def self.deep_merge(a_hash, b_hash)
      deep_merge!(deep_dup(a_hash), b_hash)
    end

    # Copies +hash+ and, recursively, every Hash value inside it. Non-hash
    # values are shared with the original.
    def self.deep_dup(hash)
      duplicate = hash.dup

      duplicate.each_pair do |key, value|
        duplicate[key] = deep_dup(value) if value.is_a?(Hash)
      end

      duplicate
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # -*- encoding : utf-8 -*-
2
+ module LoyalSpider
3
+ VERSION = "0.0.1"
4
+ end
@@ -0,0 +1,4 @@
1
+ # desc "Explaining what the task does"
2
+ # task :loyal_spider do
3
+ # # Task goes here
4
+ # end
metadata ADDED
@@ -0,0 +1,173 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loyal_spider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - happy
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-09-09 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rest-client
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: sanitize
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: rails_config
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: nokogiri
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Description of LoyalSpider.
111
+ email:
112
+ - andywang7259@gmail.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - lib/loyal_spider/ables/fetch_options.rb
118
+ - lib/loyal_spider/ables/fetch_able.rb
119
+ - lib/loyal_spider/ables/fetch_result.rb
120
+ - lib/loyal_spider/ables/entity_lister_able.rb
121
+ - lib/loyal_spider/ables/entity_able.rb
122
+ - lib/loyal_spider/utils/hash_util.rb
123
+ - lib/loyal_spider/utils/array_util.rb
124
+ - lib/loyal_spider/config.rb
125
+ - lib/loyal_spider/utils.rb
126
+ - lib/loyal_spider/version.rb
127
+ - lib/loyal_spider/image.rb
128
+ - lib/loyal_spider/clients.rb
129
+ - lib/loyal_spider/clients/kuaile_mahua/article_entity.rb
130
+ - lib/loyal_spider/clients/kuaile_mahua/article_entity_lister.rb
131
+ - lib/loyal_spider/clients/xiaohuadi/article_entity.rb
132
+ - lib/loyal_spider/clients/xiaohuadi/article_entity_lister.rb
133
+ - lib/loyal_spider/clients/haha365/article_entity.rb
134
+ - lib/loyal_spider/clients/haha365/article_entity_lister.rb
135
+ - lib/loyal_spider/clients/lengxiaohua/article_entity.rb
136
+ - lib/loyal_spider/clients/lengxiaohua/article_entity_lister.rb
137
+ - lib/loyal_spider/ables.rb
138
+ - lib/loyal_spider.rb
139
+ - lib/tasks/loyal_spider_tasks.rake
140
+ - MIT-LICENSE
141
+ - Rakefile
142
+ - README.md
143
+ homepage: http://github.com/xiuxian123
144
+ licenses: []
145
+ post_install_message:
146
+ rdoc_options: []
147
+ require_paths:
148
+ - lib
149
+ required_ruby_version: !ruby/object:Gem::Requirement
150
+ none: false
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ segments:
156
+ - 0
157
+ hash: -2402893976680931226
158
+ required_rubygems_version: !ruby/object:Gem::Requirement
159
+ none: false
160
+ requirements:
161
+ - - ">="
162
+ - !ruby/object:Gem::Version
163
+ version: '0'
164
+ segments:
165
+ - 0
166
+ hash: -2402893976680931226
167
+ requirements: []
168
+ rubyforge_project:
169
+ rubygems_version: 1.8.25
170
+ signing_key:
171
+ specification_version: 3
172
+ summary: Summary of LoyalSpider.
173
+ test_files: []