spider_bot 0.0.4

checksums.yaml ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 6c8009ebd495001ed425bd236e2b894d1b3124dd
  data.tar.gz: 97e5ce361bc6165956fac9f879c526ffb7da922d
SHA512:
  metadata.gz: 951b3f8a49ec3034b6b3031a40270eea9fcb64f1685f1982878523005ae292c0f45dbee000f67bc67ecd16a4a569b39c166d5be5559503ecdde2daa2e4ece00e
  data.tar.gz: 26d664fb1edc26e8edcd8c5aab9f38cd4bc48b3618c098384e000c71aaa7f8f2c357e8dd447c87940d55f64c40ff73db84a01b356e09cbc88ec8c36a8a6f8818
data/.gitignore ADDED
@@ -0,0 +1,16 @@
/.bundle/
/.yardoc
/Gemfile.lock
/_yardoc/
/coverage/
/doc/
/pkg/
/spec/reports/
/tmp/
*.bundle
*.so
*.o
*.a
*.swp
*.swo
mkmf.log
data/.yardopts ADDED
@@ -0,0 +1,2 @@
--markup-provider=redcarpet
--markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,10 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in spider_bot.gemspec
gemspec

group :doc do
  gem 'yard'
  gem 'redcarpet'
  gem 'github-markup'
end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2015 yee.li

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,148 @@
# SpiderBot

A simple crawler bot.

## SpiderBot Installation

Add this line to your application's Gemfile:

```
gem 'spider_bot'
```

And then execute:

    $ bundle

Or install it directly with:

    $ gem install spider_bot

## SpiderBot Files

#### File formats

1. Single-site, single-page crawl; returns the HTML text

```
SpiderBot.crawl("http://example.com", origin_options)
```

2. Single-site, multi-page crawl

```
SpiderBot.crawl(url, data: Proc.new{ |data| data }, since: Proc.new{ |data| data }) do

  paginate do
    option :type, :json
    option :path, path

    # pagination page-number settings
    option :start, 0
    option :add, 10
    option :expire, 100 # if set to -1, crawling continues indefinitely
    option :sleep, 6

    # settings for extracting data after each page
    option :data, Proc.new{ |data| data }
    option :since, Proc.new{ |since| since }

    option :query, { page: "%{page}", since_id: "%{since}" }
  end

  crawl_data do |data|
    # parse the crawled data...
  end
end
```

3. Multi-site, multi-page crawl; can be used with Rails or Padrino for crawl tasks

```
class Mybot < SpiderBot::Base

  # run automatically by "spider start" or "spider crawl"

  auto do
    origin url, data: Proc.new{ |data| data }, since: Proc.new{ |since| since }
    execute do

      paginate do
        option :type, :json
        option :path, path

        # pagination page-number settings
        option :start, 0
        option :add, 10
        option :expire, 100
        option :sleep, 6

        # settings for extracting data after each page
        option :data, Proc.new{ |data| data }
        option :since, Proc.new{ |since| since }

        option :query, { page: "%{page}", since_id: "%{since}" }
      end

      crawl_data do |data|
        # parse the crawled data...
      end
    end
  end
end
```

#### Initial page settings (origin_options)

* path
* type
* headers
* query
* data: extracts the data from the initial page
* since: extracts the last item of the initial page's data, used for pagination

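For example, a minimal call passing these options might look like the following sketch (the URL, query, and CSS selectors are placeholders, not part of the gem):

```
SpiderBot.crawl("http://example.com/articles",
  type: :html,                                      # parse the response as HTML
  headers: { "Accept" => "text/html" },             # extra request headers
  query: { category: "news" },                      # query string for the first request
  data: Proc.new { |body| body.css(".article") },   # extract the item list from the page
  since: Proc.new { |data| data.last && data.last["id"] } # remember the last item for paging
)
```
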
104
+ ####翻页参数设置
105
+
106
+ 1.翻页后文本设置
107
+
108
+ * paginate_type 翻页后类型[:html, :json, :xml]
109
+ * paginate_path 翻页后的Path
110
+ * paginate_query 翻页后的参数设置 {page: "%{page}", since: "%{since}"}
111
+
112
+
113
+ 2.翻页设置
114
+
115
+ * paginate_start #翻页起始页, 默认为0
116
+ * paginate_add #翻页增加数, 默认为 1
117
+ * paginate_expire #翻页总结数, 默认为30
118
+ * paginate_sleep #翻页休息数, 默认为 0
119
+
120
+ 3.翻页信息获取
121
+
122
+ * paginate_data 获取翻页后的数据, 不填写,默认为origin data
123
+ * paginate_since 获取翻页后最后数据, 不填写, 默认为 origin_since
124
+
125
+
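As a rough sketch of how these settings fit together inside a `paginate` block (the path and JSON field names below are made up for illustration), the intent is that "%{page}" is filled with the current page number and "%{since}" with the value captured by the since option on the previous page:

```
paginate do
  option :type,  :json
  option :path,  "/api/articles"
  option :start, 0
  option :add,   10
  option :expire, 20
  option :sleep, 2

  # template placeholders filled in before each paginated request
  option :query, { page: "%{page}", since_id: "%{since}" }

  option :data,  Proc.new { |body| body["articles"] }
  option :since, Proc.new { |data| data.last && data.last["id"] }
end
```
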
## SpiderBot Commands

* spider url: crawl a url directly from the command line, returns the HTML text
  - -q query: set the request query
  - -d data: extract data from the page
  - -o out: write the output to a file

* spider crawl: run bot files
  - -b bot: run a single bot file
  - -d dir: run the bot files in the given directory
  - -p expire_page: total number of pages to crawl (used in place of `option :expire`)

* spider start: run the crawl service
  - -d daemon: run in the background
  - -t time: set the crawl interval, defaults to 10
  - -r random: use a random interval up to the given interval, defaults to a random number up to 10
  - -e env: set the Spider environment; when used with Rails or Padrino, the given environment is used
  - -p expire_page: total number of pages to crawl (used in place of `option :expire`)

* spider stop: stop the crawl service

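For example (the paths and URL below are only placeholders):

    $ spider url http://example.com -o page.html
    $ spider crawl -d ./bots -p 5
    $ spider stop
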
data/Rakefile ADDED
@@ -0,0 +1,2 @@
require "bundler/gem_tasks"
data/bin/spider ADDED
@@ -0,0 +1,12 @@
#!/usr/bin/env ruby
require_relative '../lib/spider_bot/cli'

begin
  SpiderBot::CLI.start
rescue SystemExit => e
  Kernel.exit(e.status)
rescue Exception => e
  STDERR.puts e.message
  STDERR.puts e.backtrace.join("\n")
  Kernel.exit 1
end
@@ -0,0 +1,31 @@
module SpiderBot
  class Base
    class << self
      #
      # Entry point run by the "spider start" and "spider crawl" commands
      #

      def auto(&block)
        if defined?(BOTCONSOLE)
          klass = Class.new do
            def origin(url, options = {})
              @origin_url = url
              @origin_options = options
            end

            def execute(name = nil, &block)
              crawl_instance = Crawl.new(@origin_url, @origin_options)
              crawl_instance.instance_eval(&block)
            end
          end
          klass.allocate.instance_eval(&block)
        end
      end

      def crawl(url, options = {})
        crawl_instance = Crawl.new(url, options)
        crawl_instance.crawl_data
      end
    end
  end
end
@@ -0,0 +1,183 @@
# encoding: utf-8

$LOAD_PATH.unshift(File.expand_path('../..', __FILE__))
require "thor"
require 'spider_bot'
require 'daemons'

BOTCONSOLE = true

module SpiderBot
  class CLI < Thor
    desc "url", "Crawl a url"

    method_option :query,
      aliases: "-q",
      desc: "Set url query"

    method_option :data,
      aliases: "-d",
      desc: "Match html data"

    method_option :out,
      aliases: ["-o"],
      desc: "Write to file"

    def url(arg)
      data = Crawl.new(arg, options).crawl_data
      return File.open(options[:out], "w") { |file| file.puts data } if options[:out]
      return puts data
    end

    desc "crawl", "Run spider bot file"

    method_option :bot,
      aliases: ["-b"],
      desc: "Read bot file"

    method_option :dir,
      aliases: ["-d"],
      desc: "Read bot directory"

    method_option :expire,
      aliases: ["-p"],
      desc: "Set the maximum number of pages to crawl"

    def crawl
      $expire_num = options[:expire].to_i if options[:expire]
      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      if options[:bot]
        bot_file = File.expand_path(options[:bot])
        return raise "Bot file not found" if !File.exists?(bot_file)
        load bot_file
      end

      if options[:dir]
        bot_dir = File.expand_path(options[:dir])
        return raise "Dir is not found" if !Dir.exists?(bot_dir)

        threads = []
        Dir.glob("#{bot_dir}/*_bot.rb").each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{ file }"
              SpiderBot.logger.error e.to_s
            end
          end
        end
        threads.each { |t| t.join }
      end
    end


    desc "start", "Run spider bot service"

    method_option :daemon,
      aliases: ["-d"],
      desc: "Run spider bot service in background"

    method_option :time,
      aliases: ["-t"],
      desc: "Set crawl interval"

    method_option :random,
      aliases: ["-r"],
      desc: "Set crawl interval to a random value"

    method_option :env,
      aliases: ["-e"],
      desc: "Set spider service environment"

    method_option :expire,
      aliases: ["-p"],
      desc: "Set the maximum number of pages to crawl"

    def start
      puts "start....."

      $expire_num = options[:expire].to_i if options[:expire]

      if options[:env]
        ENV['RACK_ENV'] = options[:env]
      else
        ENV['RACK_ENV'] = 'development'
      end

      require File.join(File.expand_path('../..', __FILE__), "spider_bot/load")

      FileUtils.mkdir_p("tmp/pids") if !File.exists?("tmp/pids")

      daemon_options = {
        app_name: 'spider',
        ontop: true,
        dir: 'tmp/pids',
      }

      sleep_time = 10

      if options[:daemon]
        daemon_options[:ontop] = false
      else
        puts "press ctrl-c to exit"
      end

      stop if File.exists?("tmp/pids/spider.pid")

      if option_time = options[:time]
        parse_time = option_time.match(/[d|h|m]/)
        sleep_time = if parse_time
          case parse_time[0]
          when "d"
            option_time.to_i * 60 * 60 * 24
          when "h"
            option_time.to_i * 60 * 60
          when "m"
            option_time.to_i * 60
          end
        else
          option_time.to_i
        end
      end

      Daemons.daemonize(daemon_options)

      loop do
        threads = []

        BOTDIR.each do |file|
          threads << Thread.new do
            begin
              SpiderBot.logger.info "loading bot file #{file}."
              load file
            rescue Exception => e
              SpiderBot.logger.error "error while loading bot file #{ file }"
              SpiderBot.logger.error e.to_s
            end
            sleep(10)
          end
        end

        threads.each { |t| t.join }

        if options[:random]
          random_time = Random.new.rand(sleep_time)
          sleep(random_time.to_i)
        else
          sleep(sleep_time.to_i)
        end
      end
    end

    desc 'stop', "Stop spider bot service"

    def stop
      pid = File.read("tmp/pids/spider.pid").to_i
      Process.kill(9, pid)
      File.delete("tmp/pids/spider.pid")
    end
  end
end
@@ -0,0 +1,235 @@
module SpiderBot
  class Crawl

    # Initialize a new Spider Bot
    #
    # @param url [String] the spider target website url
    # @param options [Hash] the spider crawl configuration options
    # @option options :type [Symbol] the request body format, `:html` or `:json`
    # @option options :headers [Hash] the custom request headers
    # @option options :path [String] the custom request path
    # @option options :query [Hash] the request query
    # @option options :user_agent [String] the custom request user agent
    # @option options :source [Boolean] return the raw body instead of the parsed document
    # @option options :data [Proc] get the crawl data list from the body
    # @option options :first [Proc] get the first item of the crawl data list
    # @option options :last [Proc] get the last item of the crawl data list
    # @option options :encode [String] custom request encoding

    def initialize(url, options = {})
      parse_uri = URI.parse url
      @uri = parse_uri.scheme + "://" + parse_uri.host

      # don't append the port to the url for the default ports (80 and 443)
      if !["80", "443"].include?(parse_uri.port.to_s)
        @uri = @uri + ":" + parse_uri.port.to_s
      end

      @origin_path = parse_uri.path || "/"

      @origin_type = options[:type] || :html
      @origin_headers = options[:headers] || {}
      @origin_query = options[:query] || {}

      @origin_user_agent = options[:user_agent] || "Mac Safari"
      @origin_source = options[:source] || false

      @origin_data = options[:data]
      @origin_first = options[:first]
      @origin_last = options[:last]

      @origin_encode = options[:encode]

      @page_path = @origin_path
      @page_type = @origin_type
      @page_headers = @origin_headers || {}
      @page_query = {}

      @page_data = @origin_data
      @page_first = @origin_first
      @page_last = @origin_last

      @page_start = 1
      @page_add = 1
      @page_expire = 10
      @page_sleep = 0

      @paginate_last = nil
      @paginate_error = 0
      @paginate_type = :html
      @paginate_path = ""
      @paginate_query = {}

      @connection = Http::Client.new do |http|
        http.url = @uri
        http.user_agent = @origin_user_agent
        http.headers = @origin_headers
      end
    end

    # Process crawl data
    #
    # @param block [Proc] block called with each crawled page

    def crawl_data(&block)
      @paginate_num = @page_start

      catch :all do
        begin
          crawl_response = crawl_request(@origin_path, @origin_query, @origin_type, @origin_data, @origin_first, @origin_last, &block)
          return crawl_response if !block_given?
          process_response(crawl_response, &block)
        rescue Exception => e
          handle_error(e)
          crawl_data(&block)
        end

        @paginate_error = 0
        return if @page_query.blank? && @page_path == @origin_path

        crawl_paginate(&block)
      end
    end

    private

    def crawl_paginate(&block)
      @page_headers.merge!({"X-Requested-With" => "XMLHttpRequest"}) if @page_type.to_s == 'json'
      @connection.headers = @page_headers
      begin
        loop do
          real_page_num = (@page_start == 0 && @page_add > 1) ? (@paginate_num / @page_add) + 1 : @paginate_num
          if defined?($expire_num)
            if $expire_num > 1
              break if real_page_num > $expire_num.to_i
            else
              break if real_page_num > 1
            end
          end
          # stop paginating once the current page number exceeds @page_expire
          if real_page_num > @page_expire && @page_expire != -1
            SpiderBot.logger.info "Crawl finished..."
            SpiderBot.logger.info "Finish reason: the current page exceeds the configured paginate expire"
            break
          end

          sleep(@page_sleep) if @page_sleep > 0

          path = @page_path.to_s % {page: @paginate_num}
          query_str = @page_query.to_s % { page: @paginate_num, last: @paginate_last, first: @paginate_first }
          query = eval(query_str)

          crawl_response = crawl_request(path, query, @page_type, @page_data, @page_first, @page_last, &block)
          process_response(crawl_response, &block)
        end
      rescue Exception => e
        @paginate_num += @page_add if @paginate_error == 2
        handle_error(e)
        crawl_paginate(&block)
      end
    end

    def crawl_request(path, query, type, data, first, last, &block)
      @paginate_path = path
      @paginate_query = query

      response = @connection.get(path, query)

      return if !response
      return if response.status != 200

      options = { encode: @origin_encode } if @origin_encode

      if @origin_source && !block_given?
        return response.body(options)
      end

      if type.to_s == "html"
        @paginate_type = :html
        body = Nokogiri::HTML response.body(options)
      elsif type.to_s == "json"
        @paginate_type = :json
        body = MultiJson.load response.body(options)
      else
        @paginate_type = response.parser
        body = response.parsed
      end

      return if body.nil?
      return body if data.nil?

      body_data = data.call(body) if data
      @paginate_first = first.call(body_data, body) if first
      @paginate_last = last.call(body_data, body) if last

      return body_data
    end

    def get_page_url
      if !@paginate_query.blank?
        @uri + @paginate_path + "?" + @paginate_query.map{ |k,v| "#{k}=#{v}" }.join("&")
      else
        @uri + @paginate_path
      end
    end

    def set_paginate_headers(arg)
      @page_headers = arg || {}
    end

    # set crawl paginate settings
    #
    # @example
    #   paginate do
    #     option :path, '/path'
    #     option :query, {page: "%{page}"}
    #     option :first, Proc.new{|data| data.css("#item")}
    #     option :last, Proc.new{|data| data.css("#item")}
    #     option :type, :html
    #     option :data, Proc.new{|body| body.css("#item")}
    #     option :start, 1
    #     option :add, 1
    #     option :expire, 100
    #     option :sleep, 100
    #   end

    def paginate(&block)
      block.call
    end

    def option(name, params)
      raise "invalid paginate option: #{name}" unless %i(path query first last type data start add expire sleep).include?(name.to_sym)
      eval("@page_#{name} = params")
    end

    def break_all
      throw :all
    end

    def handle_error(error)
      SpiderBot.logger.error "crawling url #{ get_page_url } has an error..."
      SpiderBot.logger.error error.to_s

      break_all if @paginate_error == 3
      @paginate_error += 1

      sleep( 60 * @paginate_error )
    end

    # Log the finish message when the http client response is blank
    #
    # @param response [Object] the crawled response body

    def process_response(response, &block)
      if response.blank?
        SpiderBot.logger.info "Crawl finished..."
        SpiderBot.logger.info "Finish reason: the crawl response body is blank..."
        break_all
      end
      SpiderBot.logger.info "crawling page for #{get_page_url}"
      yield response, @paginate_num, @paginate_type
      @paginate_num += @page_add
      @paginate_error = 0
    end
  end
end
@@ -0,0 +1,5 @@
module SpiderBot
  class Error < StandardError; end
  class TimeoutError < Faraday::TimeoutError; end
  class ConnectionFaild < Faraday::ConnectionFailed; end
end