spider_bot 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6c8009ebd495001ed425bd236e2b894d1b3124dd
+   data.tar.gz: 97e5ce361bc6165956fac9f879c526ffb7da922d
+ SHA512:
+   metadata.gz: 951b3f8a49ec3034b6b3031a40270eea9fcb64f1685f1982878523005ae292c0f45dbee000f67bc67ecd16a4a569b39c166d5be5559503ecdde2daa2e4ece00e
+   data.tar.gz: 26d664fb1edc26e8edcd8c5aab9f38cd4bc48b3618c098384e000c71aaa7f8f2c357e8dd447c87940d55f64c40ff73db84a01b356e09cbc88ec8c36a8a6f8818
data/.gitignore ADDED
@@ -0,0 +1,16 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ *.bundle
+ *.so
+ *.o
+ *.a
+ *.swp
+ *.swo
+ mkmf.log
data/.yardopts ADDED
@@ -0,0 +1,2 @@
+ --markup-provider=redcarpet
+ --markup=markdown
data/Gemfile ADDED
@@ -0,0 +1,10 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in spider_bot.gemspec
+ gemspec
+
+ group :doc do
+   gem 'yard'
+   gem 'redcarpet'
+   gem 'github-markup'
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 yee.li
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,148 @@
+ # SpiderBot
+
+ A simple web crawler bot.
+
+ ## SpiderBot Installation
+
+ Add this line to your application's Gemfile:
+
+ ```
+ gem 'spider_bot'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself with:
+
+     $ gem install spider_bot
+
+ ## SpiderBot Files
+
+ #### File format
+
+ 1. Crawl a single page of a single site and return the HTML text
+
+ ```
+ SpiderBot.crawl("http://example.com", #{origin_options})
+ ```
+
+ 2. Crawl multiple pages of a single site
+
+ ```
+ SpiderBot.crawl("#{url}", data: Proc.new{ |data| data }, since: Proc.new{ |data| data }) do
+
+   paginate do
+     option :type, :json
+     option :path, '#{path}'
+
+     # pagination counter settings
+     option :start, 0
+     option :add, 10
+     option :expire, 100 # set to -1 to crawl without a page limit
+     option :sleep, 6
+
+     # settings for extracting data after paging
+     option :data, Proc.new{ |data| data }
+     option :since, Proc.new{ |since| since }
+
+     option :query, { page: "%{page}", since_id: "%{since}" }
+   end
+
+   crawl_data do |data|
+     # parse the crawled data...
+   end
+ end
+ ```
+
+ 3. Crawl multiple pages of multiple sites; can be combined with Rails or Padrino to run crawl tasks
+
+ ```
+ class Mybot < SpiderBot::Base
+
+   # method executed automatically by "spider start" or "spider crawl"
+
+   auto do
+     origin "#{url}", data: Proc.new{ |data| data }, since: Proc.new{ |since| since }
+     execute do
+
+       paginate do
+         option :type, :json
+         option :path, '#{path}'
+
+         # pagination counter settings
+         option :start, 0
+         option :add, 10
+         option :expire, 100
+         option :sleep, 6
+
+         # settings for extracting data after paging
+         option :data, Proc.new{ |data| data }
+         option :since, Proc.new{ |since| since }
+
+         option :query, { page: "%{page}", since_id: "%{since}" }
+       end
+
+       crawl_data do |data|
+         # parse the crawled data...
+       end
+     end
+   end
+ end
+ ```
+
+ #### Initial page options (origin_options)
+
+ * path
+ * type
+ * headers
+ * query
+ * data: get the data from the initial page
+ * since: get the last item of the initial page's data, used for pagination
+
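Putting the origin options above together, a single-page crawl might look like the following sketch. It is not taken from the gem itself: the URL, header value, and CSS selector are hypothetical, and `data` is assumed to receive the parsed Nokogiri document when `type` is `:html`.

```ruby
# A minimal sketch using the origin options listed above.
# The URL, header, and selector below are hypothetical examples.
require 'spider_bot'

articles = SpiderBot.crawl(
  "http://example.com/news",
  type:    :html,
  headers: { "Accept-Language" => "zh-CN" },
  query:   { page: 0 },
  data:    Proc.new { |doc| doc.css(".article") }  # doc is the parsed page
)
```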
+ #### Pagination options
+
+ 1. Response settings after paging
+
+ * paginate_type: the response type after paging, one of [:html, :json, :xml]
+ * paginate_path: the request path after paging
+ * paginate_query: the query parameters after paging, e.g. {page: "%{page}", since: "%{since}"}
+
+
+ 2. Paging settings
+
+ * paginate_start: the starting page number, defaults to 0
+ * paginate_add: the page increment, defaults to 1
+ * paginate_expire: the maximum number of pages, defaults to 30
+ * paginate_sleep: the number of seconds to sleep between pages, defaults to 0
+
+ 3. Extracting data after paging
+
+ * paginate_data: get the data after paging; if not set, defaults to the origin data
+ * paginate_since: get the last item after paging; if not set, defaults to origin_since
+
+
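The `%{page}`-style placeholders in `paginate_query` above appear to be filled with Ruby's `String#%` named format substitution before the next page is requested. A rough, self-contained illustration of that mechanism (the page value is made up, and the exact keys the crawler substitutes may differ):

```ruby
# Rough illustration of how a "%{page}" placeholder in the paginate query
# can be filled in: render the hash to a string, substitute the named
# placeholder, then parse it back into a hash. The value 3 is made up.
template  = { page: "%{page}" }

query_str = template.to_s % { page: 3 }
query     = eval(query_str)

puts query.inspect   # e.g. {:page=>"3"}
```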
+ ## SpiderBot Commands
+
+ * spider url # crawl a url directly from the command line and return the HTML text
+   - -q query, set the query string
+   - -d data, extract data
+   - -o out, write the output to a file
+
+ * spider crawl # run bot files
+   - -b bot, run a single bot file
+   - -d dir, run the bot files in the given directory
+   - -p expire_page, total number of pages to crawl (used in place of option :expire)
+
+ * spider start # run the crawl service
+   - -d daemon, run in the background
+   - -t time, set the crawl interval, defaults to 10
+   - -r random, set the crawl interval to a random number below the configured interval, defaults to a random number below 10
+   - -e env, set the Spider runtime environment; when used with Rails or Padrino, loads the specified environment
+   - -p expire_page, total number of pages to crawl (used in place of option :expire)
+
+ * spider stop # stop the crawl service
+
+
+
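The commands above might be invoked roughly as follows; the URL, bot file name, and interval are hypothetical examples, and the exact flags should be checked against `spider help`:

    $ spider url http://example.com -o page.html
    $ spider crawl -b ./news_bot.rb -p 5
    $ spider start -t 30
    $ spider stop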
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/bin/spider ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+ require_relative '../lib/spider_bot/cli'
+
+ begin
+   SpiderBot::CLI.start
+ rescue SystemExit => e
+   Kernel.exit(e.status)
+ rescue Exception => e
+   STDERR.puts e.message
+   STDERR.puts e.backtrace.join("\n")
+   Kernel.exit 1
+ end
@@ -0,0 +1,31 @@
+ module SpiderBot
+   class Base
+     class << self
+       #
+       # executed by the "spider start" and "spider crawl" commands
+       #
+
+       def auto &block
+         if defined?(BOTCONSOLE)
+           klass = Class.new do
+             def origin url, options = {}
+               @origin_url = url
+               @origin_options = options
+             end
+
+             def execute name = nil, &block
+               crawl_instance = Crawl.new(@origin_url, @origin_options)
+               crawl_instance.instance_eval(&block)
+             end
+           end
+           klass.allocate.instance_eval(&block)
+         end
+       end
+
+       def crawl url, options = {}
+         crawl_instance = Crawl.new(url, options)
+         crawl_instance.crawl_data
+       end
+     end
+   end
+ end
@@ -0,0 +1,183 @@
+ # encoding: utf-8
+
+ $LOAD_PATH.unshift(File.expand_path('../..', __FILE__))
+ require "thor"
+ require 'spider_bot'
+ require 'daemons'
+
+ BOTCONSOLE = true
+
+ module SpiderBot
+   class CLI < Thor
+     desc "url", "Crawl url"
+
+     method_option :query,
+       aliases: "-q",
+       desc: "Set url query"
+
+     method_option :data,
+       aliases: "-d",
+       desc: "Match html data"
+
+     method_option :out,
+       aliases: ["-o"],
+       desc: "Write to file"
+
+     def url(arg)
+       data = Crawl.new(arg, options).crawl_data
+       return File.open(options[:out], "w"){ |file| file.puts data } if options[:out]
+       return puts data
+     end
+
+     desc "crawl", "Run spider bot file"
+
+     method_option :bot,
+       aliases: ["-b"],
+       desc: "Read bot file"
+
+     method_option :dir,
+       aliases: ["-d"],
+       desc: "Read bot directory"
+
+     method_option :expire,
+       aliases: ["-p"],
+       desc: "Read data expired number"
+
+     def crawl
+       $expire_num = options[:expire].to_i if options[:expire]
+       require File.join(File.expand_path('../..',__FILE__), "spider_bot/load")
+
+       if options[:bot]
+         bot_file = File.expand_path(options[:bot])
+         return raise "Bot file not found" if !File.exists?(bot_file)
+         load bot_file
+       end
+
+       if options[:dir]
+         bot_dir = File.expand_path(options[:dir])
+         return raise "Dir is not found" if !Dir.exists?(bot_dir)
+
+         threads = []
+         Dir.glob("#{bot_dir}/*_bot.rb").each do |file|
+           threads << Thread.new do
+             begin
+               SpiderBot.logger.info "loading bot file #{file}."
+               load file
+             rescue Exception => e
+               SpiderBot.logger.error "error while loading bot file #{ file }"
+               SpiderBot.logger.error e.to_s
+             end
+           end
+         end
+         threads.each { |t| t.join }
+       end
+     end
+
+
+     desc "start", "Run spider bot service"
+
+     method_option :daemon,
+       aliases: ["-d"],
+       desc: "Run spider bot service in background"
+
+     method_option :time,
+       aliases: ["-t"],
+       desc: "Set crawl interval"
+
+     method_option :random,
+       aliases: ["-r"],
+       desc: "Set crawl interval to a random value"
+
+     method_option :env,
+       aliases: ["-e"],
+       desc: "Set spider service environment"
+
+     method_option :expire,
+       aliases: ["-p"],
+       desc: "Read data expired page number"
+
+     def start
+       puts "start....."
+
+       $expire_num = options[:expire].to_i if options[:expire]
+
+       if options[:env]
+         ENV['RACK_ENV'] = options[:env]
+       else
+         ENV['RACK_ENV'] = 'development'
+       end
+
+       require File.join(File.expand_path('../..',__FILE__), "spider_bot/load")
+
+       FileUtils.mkdir_p("tmp/pids") if !File.exists?("tmp/pids")
+
+       daemon_options = {
+         app_name: 'spider',
+         ontop: true,
+         dir: 'tmp/pids',
+       }
+
+       sleep_time = 10
+
+       if options[:daemon]
+         daemon_options[:ontop] = false
+       else
+         puts "press ctrl-c to exit"
+       end
+
+       stop if File.exists?("tmp/pids/spider.pid")
+
+       if option_time = options[:time]
+         parse_time = option_time.match(/[dhm]/)
+         sleep_time = if parse_time
+           case parse_time[0]
+           when "d"
+             option_time.to_i * 60 * 60 * 24
+           when "h"
+             option_time.to_i * 60 * 60
+           when "m"
+             option_time.to_i * 60
+           end
+         else
+           option_time.to_i
+         end
+       end
+
+       Daemons.daemonize(daemon_options)
+
+       loop do
+         threads = []
+
+         BOTDIR.each do |file|
+           threads << Thread.new do
+             begin
+               SpiderBot.logger.info "loading bot file #{file}."
+               load file
+             rescue Exception => e
+               SpiderBot.logger.error "error while loading bot file #{ file }"
+               SpiderBot.logger.error e.to_s
+             end
+             sleep(10)
+           end
+         end
+
+         threads.each { |t| t.join }
+
+         if options[:random]
+           random_time = Random.new.rand(sleep_time)
+           sleep(random_time.to_i)
+         else
+           sleep(sleep_time.to_i)
+         end
+       end
+     end
+
+     desc 'stop', "Stop spider bot service"
+
+     def stop
+       pid = File.read("tmp/pids/spider.pid").to_i
+       Process.kill(9, pid)
+       File.delete("tmp/pids/spider.pid")
+     end
+   end
+ end
@@ -0,0 +1,235 @@
+ module SpiderBot
+   class Crawl
+
+     # Initialize a new Spider Bot
+     #
+     # @param url [String] the spider target website url
+     # @param options [Hash] the spider crawl configuration options
+     # @option options :type [Symbol] the request body format, `:html` or `:json`
+     # @option options :headers [Hash] the custom request headers
+     # @option options :path [String] the custom request path
+     # @option options :query [Hash] the request query
+     # @option options :user_agent [String] the custom request user agent
+     # @option options :source [Boolean]
+     # @option options :data [Proc] get the crawl data list in the body
+     # @option options :first [Proc] get the first item of the crawl data list
+     # @option options :last [Proc] get the last item of the crawl data list
+     # @option options :encode [String] custom request encoding
+
+     def initialize(url, options = {})
+       parse_uri = URI.parse url
+       @uri = parse_uri.scheme + "://" + parse_uri.host
+
+       # don't append the port to the url for the default http/https ports
+       if !["80", "443"].include?(parse_uri.port.to_s)
+         @uri = @uri + ":" + parse_uri.port.to_s
+       end
+
+       @origin_path = parse_uri.path || "/"
+
+       @origin_type = options[:type] || :html
+       @origin_headers = options[:headers] || {}
+       @origin_query = options[:query] || {}
+
+       @origin_user_agent = options[:user_agent] || "Mac Safari"
+       @origin_source = options[:source] || false
+
+       @origin_data = options[:data]
+       @origin_first = options[:first]
+       @origin_last = options[:last]
+
+       @origin_encode = options[:encode]
+
+       @page_path = @origin_path
+       @page_type = @origin_type
+       @page_headers = @origin_headers || {}
+       @page_query = {}
+
+       @page_data = @origin_data
+       @page_first = @origin_first
+       @page_last = @origin_last
+
+       @page_start = 1
+       @page_add = 1
+       @page_expire = 10
+       @page_sleep = 0
+
+       @paginate_last = nil
+       @paginate_error = 0
+       @paginate_type = :html
+       @paginate_path = ""
+       @paginate_query = {}
+
+       @connection = Http::Client.new do |http|
+         http.url = @uri
+         http.user_agent = @origin_user_agent
+         http.headers = @origin_headers
+       end
+     end
+
+     # Process crawl data
+     #
+     # @param block [Proc] handler called with each crawled page
+
+     def crawl_data(&block)
+       @paginate_num = @page_start
+
+       catch :all do
+         begin
+           crawl_response = crawl_request(@origin_path, @origin_query, @origin_type, @origin_data, @origin_first, @origin_last, &block)
+           return crawl_response if !block_given?
+           process_response(crawl_response, &block)
+         rescue Exception => e
+           handle_error(e)
+           crawl_data(&block)
+         end
+
+         @paginate_error = 0
+         return if @page_query.blank? && @page_path == @origin_path
+
+         crawl_paginate(&block)
+       end
+     end
+
+     private
+
+     def crawl_paginate(&block)
+       @page_headers.merge!({"X-Requested-With" => "XMLHttpRequest"}) if @page_type.to_s == 'json'
+       @connection.headers = @page_headers
+       begin
+         loop do
+           real_page_num = (@page_start == 0 && @page_add > 1) ? (@paginate_num / @page_add) + 1 : @paginate_num
+           if defined?($expire_num)
+             if $expire_num > 1
+               break if real_page_num > $expire_num.to_i
+             else
+               break if real_page_num > 1
+             end
+           end
+           # stop paginating when the current page number exceeds @page_expire (unless it is -1)
+           if real_page_num > @page_expire && @page_expire != -1
+             SpiderBot.logger.info "Crawl finished..."
+             SpiderBot.logger.info "Finish reason: The current page exceeds the configured paginate expire"
+             break
+           end
+
+           sleep(@page_sleep) if @page_sleep > 0
+
+           path = @page_path.to_s % {page: @paginate_num}
+           query_str = @page_query.to_s % { page: @paginate_num, last: @paginate_last, first: @paginate_first }
+           query = eval(query_str)
+
+           crawl_response = crawl_request(path, query, @page_type, @page_data, @page_first, @page_last, &block)
+           process_response(crawl_response, &block)
+         end
+       rescue Exception => e
+         @paginate_num += @page_add if @paginate_error == 2
+         handle_error(e)
+         crawl_paginate(&block)
+       end
+     end
+
+     def crawl_request(path, query, type, data, first, last, &block)
+       @paginate_path = path
+       @paginate_query = query
+
+       response = @connection.get(path, query)
+
+       return if !response
+       return if response.status != 200
+
+       options = { encode: @origin_encode } if @origin_encode
+
+       if @origin_source && !block_given?
+         return response.body(options)
+       end
+
+       if type.to_s == "html"
+         @paginate_type = :html
+         body = Nokogiri::HTML response.body(options)
+       elsif type.to_s == "json"
+         @paginate_type = :json
+         body = MultiJson.load response.body(options)
+       else
+         @paginate_type = response.parser
+         body = response.parsed
+       end
+
+       return if body.nil?
+       return body if data.nil?
+
+       body_data = data.call(body) if data
+       @paginate_first = first.call(body_data, body) if first
+       @paginate_last = last.call(body_data, body) if last
+
+       return body_data
+     end
+
+     def get_page_url
+       if !@paginate_query.blank?
+         @uri + @paginate_path + "?" + @paginate_query.map{ |k,v| "#{k}=#{v}" }.join("&")
+       else
+         @uri + @paginate_path
+       end
+     end
+
+     def set_paginate_headers(arg)
+       @page_headers = arg || {}
+     end
+
+     # set crawl paginate settings
+     #
+     # @example
+     #   paginate do
+     #     option :path, '/path'
+     #     option :query, {page: "%{page}"}
+     #     option :first, Proc.new{|data| data.css("#item")}
+     #     option :last, Proc.new{|data| data.css("#item")}
+     #     option :type, :html
+     #     option :data, Proc.new{|body| body.css("#item")}
+     #     option :start, 1
+     #     option :add, 1
+     #     option :expire, 100
+     #     option :sleep, 100
+     #   end
+
+     def paginate(&block)
+       block.call
+     end
+
+     def option(name, params)
+       raise "invalid paginate option: #{name}" if !%i(path query first last type data start add expire sleep).include?(name.to_sym)
+       instance_variable_set("@page_#{name}", params)
+     end
+
+     def break_all
+       throw :all
+     end
+
+     def handle_error(error)
+       SpiderBot.logger.error "crawling url #{ get_page_url } has an error..."
+       SpiderBot.logger.error error.to_s
+
+       break_all if @paginate_error == 3
+       @paginate_error += 1
+
+       sleep( 60 * @paginate_error )
+     end
+
+     # Finish crawling when the http client response is blank
+     #
+     # @param response [Object] the crawled response data
+
+     def process_response(response, &block)
+       if response.blank?
+         SpiderBot.logger.info "Crawl finished..."
+         SpiderBot.logger.info "Finish reason: Crawl response body is blank..."
+         break_all
+       end
+       SpiderBot.logger.info "crawling page for #{get_page_url}"
+       yield response, @paginate_num, @paginate_type
+       @paginate_num += @page_add
+       @paginate_error = 0
+     end
+   end
+ end
@@ -0,0 +1,5 @@
+ module SpiderBot
+   class Error < StandardError; end
+   class TimeoutError < Faraday::TimeoutError; end
+   class ConnectionFaild < Faraday::ConnectionFailed; end
+ end