spider_bot 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,166 @@
1
+ #coding: utf-8
2
+
3
+ module SpiderBot
4
+ module Http
5
+ class Client
6
+
7
+ # return url for HttpClient
8
+ attr_reader :url
9
+
10
+ # return http user_agent for HttpClient
11
+ attr_reader :user_agent
12
+
13
+ attr_reader :headers
14
+
15
+ #
16
+ attr_accessor :options
17
+
18
+ # return connection for HttpClient
19
+ attr_accessor :connection
20
+
21
+ attr_accessor :conn_build
22
+
23
# Supported User-Agent names (the keys of USER_AGENT)
#
# * bot (default)
# * Linux Firefox (3.6.1)
# * Linux Mozilla
# * Linux Chrome
# * Mac Firefox
# * Mac Safari
# * Mac Chrome
# * Windows IE 6
# * Windows IE 7
# * Windows IE 8
# * Windows IE 9
# * Windows Mozilla
# * iPhone (3.0)
# * iPad
# * Android
42
+
43
+ USER_AGENT = {
44
+ 'bot' => "bot/#{SpiderBot::VERSION}",
45
+ 'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
46
+ 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
47
+ 'Linux Chrome' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624 Chrome/26.0.1410.43',
48
+ 'Mac Firefox' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:35.0) Gecko/20100101 Firefox/35.0',
49
+ 'Mac Safari' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18',
50
+ 'Mac Chrome' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36',
51
+ 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
52
+ 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
53
+ 'Windows IE 8' => 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
54
+ 'Windows IE 9' => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
55
+ 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
56
+ 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
57
+ 'iPad' => 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10',
58
+ 'Android' => 'Mozilla/5.0 (Linux; U; Android 3.0; en-us) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13'
59
+ }
60
+
61
# Initialize a new HttpClient
#
# @param uri [String, nil] the url handed to Faraday when the connection is built
# @param options [Hash, nil] stored as-is and exposed through #options
# @yield [client] the new client, so it can be configured inside a block
#
# @example
#   http = HttpClient.new
#
#   http = HttpClient.new do |http|
#     http.user_agent = "Mac Safari"
#     http.url = "http://example.com"
#   end
def initialize(uri = nil, options = nil, &block)
  @url, @options = uri, options
  # Start from the 'bot' agent; the block below may replace it.
  @user_agent ||= USER_AGENT['bot']
  block.call(self) if block
end
82
+
83
# Register a block used to configure the Faraday connection.
#
# The block is only stored here; #connection calls it with the Faraday
# builder when it creates the connection.
#
# @yield [builder] the object passed in by #connection
# @return [Proc, nil] the stored block
def builder(&configurator)
  @conn_build = configurator
end
86
+
87
# Set the url for HttpClient
#
# The memoized Faraday connection is discarded so that the next call to
# #connection is built against the new url.
#
# @param uri [String] the HttpClient url
def url=(uri)
  # #connection memoizes into @connection. Clearing any other ivar would
  # leave the old connection (and therefore the old url) in use.
  @connection = nil
  @url = uri
end
95
+
96
# Set the request headers for HttpClient
#
# The current user agent is always written to the "User-Agent" key,
# replacing any "User-Agent" value supplied by the caller. The hash passed
# in is not modified.
#
# @param headers [Hash, nil] request headers; nil is treated as an empty hash
def headers=(headers)
  @headers = (headers || {}).merge("User-Agent" => user_agent)
end
104
+
105
# Set the user agent for HttpClient
#
# @param name [String, Symbol] a key of USER_AGENT, e.g. "Mac Safari" or
#   :iPad. Unknown names (and nil) select the 'bot' agent.
def user_agent=(name)
  # USER_AGENT is keyed by String, so normalise Symbol names before lookup.
  @user_agent = USER_AGENT[name.to_s] || USER_AGENT['bot']
end
112
+
113
# The Faraday connection object, created on first use and then reused.
#
# @return [connection] the connection returned by Faraday.new for the
#   current url, with the block registered through #builder applied when
#   one was given
def connection
  return @connection if @connection

  faraday = Faraday.new(url: url)
  faraday.build { |stack| conn_build.call(stack) } if conn_build
  @connection = faraday
end
125
+
126
+
127
# Make request with HttpClient
#
# For :get the query is encoded into the URL; for every other verb it is
# sent as the request body. Responses with status 301, 302, 303 or 307 are
# followed using the same verb, query and block.
#
# @param verb [Symbol] verb one of :get, :post, :put, :delete
# @param uri [String] URL path for request
# @param query [Hash] additional query parameters for the URL of the request
# @param redirect_limit [Integer] how many redirects may still be followed
# @yield [request] the request object yielded by the connection, for every
#   request issued (including followed redirects)
# @return [Response, nil] the wrapped response for a 2xx/3xx status, nil for
#   any other status
def request(verb, uri, query = {}, redirect_limit = 10, &block)
  url_params, payload = verb == :get ? [query, nil] : [nil, query]
  target = connection.build_url(uri, url_params)

  # Always send the current user agent, even when #headers= was never called.
  request_headers = (headers || {}).merge("User-Agent" => user_agent)

  raw = connection.run_request(verb, target, payload, request_headers) do |req|
    block.call(req) if block
  end
  response = Response.new(raw)

  case response.status
  when 301, 302, 303, 307
    location = response.headers['location']
    # Hand back the redirect itself instead of recursing forever when the
    # server loops or sends no Location header.
    return response if location.nil? || redirect_limit <= 0

    request(verb, location, query, redirect_limit - 1, &block)
  when 200..399
    response
  end
end
149
+
150
# Handle get request with HttpClient
#
# @param uri [String] URL path for request
# @param query [Hash] additional query parameters for the URL of the request
# @yield [request] forwarded unchanged to #request
# @return whatever #request returns
def get(uri, query = {}, &customize)
  request(:get, uri, query, &customize)
end
158
+
159
# Handle post request with HttpClient
#
# @param uri [String] URL path for request
# @param query [Hash] parameters handed to #request together with :post
# @yield [request] forwarded unchanged to #request
# @return whatever #request returns
def post(uri, query = {}, &customize)
  request(:post, uri, query, &customize)
end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,83 @@
1
module SpiderBot
  module Http
    # Wraps a raw HTTP response object (anything responding to #status,
    # #headers and #body) and adds charset conversion and body parsing.
    class Response
      attr_reader :response

      # Maps a content type (without parameters) to a parser name.
      CONTENT_TYPE = {
        'application/json' => :json,
        'application/x-www-form-urlencoded' => :html,
        'text/html' => :html,
        'text/javascript' => :json,
        'text/xml' => :xml
      }

      # Parser name => callable turning a body string into a parsed object.
      # The JSON parser returns the body unchanged when parsing fails.
      PARSERS = {
        :json => lambda { |body|
          begin
            MultiJson.respond_to?(:adapter) ? MultiJson.load(body) : MultiJson.decode(body)
          rescue StandardError
            body
          end
        },
        :html => lambda { |body| Nokogiri::HTML(body) },
        :xml => lambda { |body| MultiXml.parse(body) }
      }

      # @param response [#status, #headers, #body] the raw response to wrap
      def initialize(response)
        @response = response
      end

      # @return the headers of the wrapped response
      def headers
        response.headers
      end

      # The response body converted to UTF-8 (see #decode).
      #
      # @param options [Hash, nil] see #decode
      # @return [String]
      def body(options = {})
        decode(response.body, options || {})
      end

      # Convert +body+ to UTF-8 using the charset it declares.
      #
      # The body is returned unchanged when the response is treated as JSON
      # (see #json?), when it declares utf-8, when no charset can be
      # determined, or when the conversion fails. The string passed in is
      # never modified, so calling this repeatedly gives the same result.
      #
      # @param body [String, nil] raw body
      # @param options [Hash, nil]
      # @option options [String] :encode source encoding to use instead of
      #   the declared charset (only consulted when the body does not
      #   declare utf-8)
      # @return [String] '' when body is nil
      def decode(body, options = {})
        return '' unless body
        return body if json?

        options ||= {}
        # Search a binary copy so bytes that are invalid in the string's
        # tagged encoding cannot make the regexp match raise.
        declared = binary_copy(body)[/charset\s*=[\s|\W]*([\w-]+)/, 1]
        return body if declared && declared.downcase == "utf-8"

        source = options[:encode] || (declared && charset_covert(declared))
        return body unless source

        begin
          # Options must be passed as keywords; a positional Hash is rejected
          # by String#encode on Ruby 3.
          body.encode("utf-8", source, :invalid => :replace)
        rescue StandardError
          body
        end
      end

      # @return the status of the wrapped response
      def status
        response.status
      end

      # Attempts to determine the content type of the response.
      #
      # @return [String] the content type without parameters, '' when absent
      def content_type
        raw = response.headers.values_at('content-type', 'Content-Type').compact.first
        raw.to_s.split(';').first.to_s.strip
      end

      # @return [Boolean] true when the content type maps to :json or the
      #   body contains no "<html" tag (matched case-insensitively)
      def json?
        return true if CONTENT_TYPE[content_type] == :json

        binary_copy(response.body) !~ /<html/i
      end

      # Pick the parser name for this response.
      #
      # A body served with an HTML content type that starts with "{" or "["
      # is parsed as JSON; unknown content types fall back to :html.
      #
      # @return [Symbol] a key of PARSERS
      def parser
        type = CONTENT_TYPE[content_type]
        type = :json if type == :html && binary_copy(response.body) =~ /\A\s*[\[{]/
        type || :html
      end

      # @return the body run through the parser chosen by #parser (memoized)
      def parsed
        @parsed ||= PARSERS[parser].call(body)
      end

      # Map a declared charset to the encoding name used for conversion.
      #
      # @param charset [String] charset as found in the document
      # @return [String] "gbk" for gb2312/gbk in any letter case, otherwise
      #   the charset unchanged
      def charset_covert(charset)
        %w[gb2312 gbk].include?(charset.to_s.downcase) ? "gbk" : charset
      end

      private

      # Unfrozen copy of +text+ tagged as binary, safe for regexp matching
      # whatever bytes it contains. nil becomes "".
      def binary_copy(text)
        text.to_s.dup.force_encoding(Encoding::BINARY)
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
# Load the host application: ./config/application first; when that cannot be
# loaded, ./config/boot.rb is required instead if the file exists.
begin
  require File.expand_path("./config/application")
rescue LoadError
  boot_file = File.expand_path("./config/boot.rb")
  require boot_file if File.exist?(boot_file)
end

# Padrino host: collect the bot files and, outside development, log to
# log/spider.log under the Padrino root.
if defined?(Padrino)
  puts "read padrino environment #{Padrino.env}"
  BOTDIR = Dir.glob("#{Padrino.root}/app/bots/**/*_bot.rb")
  SpiderBot::Logging.initialize_logger("#{Padrino.root}/log/spider.log") if Padrino.env != :development
end

# Rails host: switch eager loading off through an initializer, boot the
# application, collect the bot files and, outside development, log to
# log/spider.log under the Rails root.
if defined?(Rails)
  class Railtie < Rails::Railtie
    initializer("disable eager load") { |app| app.config.eager_load = false }
  end

  Rails.application.initialize!
  puts "read rails environment #{Rails.env}"
  BOTDIR = Dir.glob("#{Rails.root}/app/bots/**/*_bot.rb")
  Rails.logger.level = Logger::WARN
  SpiderBot::Logging.initialize_logger("#{Rails.root}/log/spider.log") unless Rails.env.development?
end
30
+
@@ -0,0 +1,21 @@
1
+ require 'logger'
2
+
3
module SpiderBot
  # Holds the logger shared by SpiderBot.
  module Logging
    # Create a new INFO-level logger writing to +log_target+ and make it the
    # current logger.
    #
    # The previous logger is closed only when this module created it for a
    # target other than the standard streams. Loggers assigned through
    # .logger= belong to the caller, and closing a STDOUT/STDERR logger
    # would close the stream for the whole process. Nothing is closed while
    # $TESTING is set.
    #
    # @param log_target [IO, String] stream or file path to log to
    # @return [Logger] the new logger
    def self.initialize_logger(log_target = STDOUT)
      previous = defined?(@logger) ? @logger : nil
      previous_closable = defined?(@closable) && @closable

      @logger = Logger.new(log_target)
      @logger.level = Logger::INFO
      @closable = !standard_stream?(log_target)

      previous.close if previous && previous_closable && !$TESTING
      @logger
    end

    # @return [Logger] the current logger; a STDOUT logger is created on
    #   first use when none has been set
    def self.logger
      defined?(@logger) ? @logger : initialize_logger
    end

    # Replace the current logger.
    #
    # @param log [Logger, nil] nil (or false) installs a logger that
    #   discards everything
    def self.logger=(log)
      # Only the null logger created here is ours to close later.
      @closable = !log
      @logger = log || Logger.new(File::NULL)
    end

    # True when +target+ is one of the process-wide standard streams.
    def self.standard_stream?(target)
      [STDOUT, STDERR, $stdout, $stderr].any? { |stream| stream.equal?(target) }
    end
    private_class_method :standard_stream?
  end
end
@@ -0,0 +1,6 @@
1
module SpiderBot
  # Rails integration point for SpiderBot.
  class Railtie < Rails::Railtie
    # Registers an initializer named "load bots"; its body is empty.
    initializer("load bots") { |_app| }
  end
end
@@ -0,0 +1,29 @@
1
+
2
+ date:
3
+ time:
4
+ year: "年|year|years"
5
+ month: "月|month|months|mon"
6
+ week: "周|星期|week|weeks"
7
+ day: "日|天|day|days"
8
+ hour: "时|小时|時|小時|hour|hours|hr|hrs|h"
9
+ min: "分|分钟|minute|minutes|min|mins|m"
10
+ second: "秒|second|seconds|sec|secs|s"
11
+ month:
12
+ jan: "january|jan|一月"
13
+ feb: "february|feb|二月"
14
+ mar: "march|mar|三月"
15
+ apr: "april|apr|四月"
16
+ may: "may|五月"
17
+ jun: "june|jun|六月"
18
+ jul: "july|jul|七月"
19
+ aug: "august|aug|八月"
20
+ sep: "september|sep|九月"
21
+ oct: "october|oct|十月"
22
+ nov: "november|nov|十一月"
23
+ dec: "december|dec|十二月"
24
+ other:
25
+ ago: "ago|前|以前"
26
+ today: "today|今天"
27
+ am: "AM|am|上午"
28
+ pm: "PM|pm|下午"
29
+
@@ -0,0 +1,119 @@
1
+ #coding: utf-8
2
+ require "yaml"
3
+ require "active_support/time"
4
+ require 'tzinfo'
5
+
6
+ DATE_CONFIG = YAML.load_file(File.expand_path("../date.yml", __FILE__))
7
+
8
class String
  # Parse content to local time
  #
  # Three shapes are recognised, in this order: relative phrases such as
  # "3 hours ago", phrases containing the "today" keyword, and absolute
  # dates. Keywords come from DATE_CONFIG.
  #
  # NOTE: this assigns Time.zone, so the zone stays set after the call.
  #
  # @param [String] zone time zone with site, "UTC" when nil
  # @return whatever the Time.zone helpers return for the parsed value
  def parse_time(zone = nil)
    Time.zone = zone.nil? ? "UTC" : zone

    case self
    when spider_bot_ago_regex
      parse_date_ago
    when spider_bot_today_regex
      parse_today
    else
      parse_date
    end
  end

  private

  # One section ("time", "month" or "other") of the date keyword config.
  # Read on demand rather than cached in instance variables, so frozen
  # strings can be parsed too.
  def spider_bot_date_config(section)
    DATE_CONFIG["date"][section]
  end

  # Matches "<number> ... <time unit> <ago keyword>".
  def spider_bot_ago_regex
    units = spider_bot_date_config("time").values.join("|")
    ago = spider_bot_date_config("other")["ago"]
    %r"\d+[\s|\S]*(?:#{units})\s*(?:#{ago})"
  end

  # Matches the "today" keywords as configured.
  def spider_bot_today_regex
    today = spider_bot_date_config("other")["today"]
    %r"#{today}\s*\d{1,2}:\d{1,2}"
  end

  # Month number (1-12) for the first configured month pattern found in
  # +text+, or nil. Months are tried from December down to January because
  # "十一月"/"十二月" contain the January/February keywords "一月"/"二月".
  def spider_bot_month_number(text)
    patterns = spider_bot_date_config("month")
    keys = %w[jan feb mar apr may jun jul aug sep oct nov dec]
    needle = text.downcase
    12.downto(1).find { |number| needle =~ Regexp.new(patterns[keys[number - 1]]) }
  end

  # Parse content if has keyword mean 'ago'
  def parse_date_ago
    now = Time.zone.now
    phrase = match(spider_bot_ago_regex)[0]
    amount = phrase[/\d+/].to_i
    units = spider_bot_date_config("time")

    case phrase
    when Regexp.new(units["year"])
      now.years_ago(amount)
    when Regexp.new(units["month"])
      now.months_ago(amount)
    when Regexp.new(units["week"])
      now.ago(amount * 60 * 60 * 24 * 7)
    when Regexp.new(units["day"])
      now.ago(amount * 60 * 60 * 24)
    when Regexp.new(units["hour"])
      now.ago(amount * 60 * 60)
    when Regexp.new(units["min"])
      now.ago(amount * 60)
    when Regexp.new(units["second"])
      now.ago(amount)
    else
      raise "get date errors"
    end
  end

  # Parse content if has keyword mean 'today'
  #
  # Uses today's date in the current zone plus the first clock time found
  # in the text; when the text has no clock time only the date is parsed.
  def parse_today
    now = Time.zone.now
    clock = self[/\d{1,2}\s*:\s*\d{1,2}:*\d{0,2}/]
    Time.zone.parse([now.to_date.to_s, clock].compact.join(" "))
  end

  # Parse an absolute date: yyyy-mm-dd, mm-dd-yyyy or "<month name> dd yyyy"
  # (any short separator), with an optional clock time and am/pm marker.
  # Anything else is handed to Time.zone.parse unchanged.
  def parse_date
    other = spider_bot_date_config("other")
    date_regex1 = %r"(\d{4})[^\d|:]{1,2}(\d{1,2})[^\d|:]{1,2}(\d{1,2})"
    date_regex2 = %r"(\d{1,2})[^\d|:]{1,2}(\d{1,2})[^\d|:]{1,2}(\d{4})"
    date_regex3 = %r"([\w|\W]+)[^\d|\w]{1,2}(\d{1,2})[^\d|:]*(\d{4})"

    clock = match(%r"\d{1,2}\s*:\d{1,2}\s*:*\d{0,2}(?:#{other["am"]}|#{other["pm"]})*")
    if clock
      # Normalise localized am/pm markers to the ASCII forms.
      clock = clock[0].gsub(Regexp.new(other["am"]), "am").gsub(Regexp.new(other["pm"]), "pm")
    end

    case self
    when date_regex1
      parts = match(date_regex1)
      Time.zone.parse "#{parts[1]}-#{parts[2]}-#{parts[3]} #{clock}"
    when date_regex2
      parts = match(date_regex2)
      Time.zone.parse("#{parts[3]}-#{parts[1]}-#{parts[2]} #{clock}")
    when date_regex3
      parts = match(date_regex3)
      month = spider_bot_month_number(parts[1])
      # Without a recognised month name there is no date to assemble; let
      # the zone parser look at the raw text instead.
      return Time.zone.parse(self) unless month

      Time.zone.parse "#{parts[3]}-#{month}-#{parts[2]} #{clock}"
    else
      Time.zone.parse(self)
    end
  end
end
119
+
@@ -0,0 +1,3 @@
1
module SpiderBot
  # Version string of the spider_bot gem.
  VERSION = '0.0.4'
end
data/lib/spider_bot.rb ADDED
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+ require "faraday"
3
+ require 'uri'
4
+ require "nokogiri"
5
+ require "multi_json"
6
+ require "multi_xml"
7
+ require 'active_support/core_ext/string/conversions'
8
+ require 'spider_bot/logging'
9
+ require "spider_bot/version"
10
+
11
# Entry point of the gem: crawl helper, logger access and lazy loading of
# the gem's components.
module SpiderBot
  # Build a Crawl for +url+.
  #
  # @param url passed to Crawl.new
  # @param options [Hash] passed to Crawl.new
  # @yield evaluated in the context of the Crawl instance
  # @return the block's result when a block is given, otherwise the result
  #   of Crawl#crawl_data
  def self.crawl(url, options = {}, &block)
    crawler = Crawl.new(url, options)
    if block_given?
      crawler.instance_eval(&block)
    else
      crawler.crawl_data
    end
  end

  # @return the logger held by SpiderBot::Logging
  def self.logger
    SpiderBot::Logging.logger
  end

  # Replace the logger held by SpiderBot::Logging.
  def self.logger=(log)
    SpiderBot::Logging.logger = log
  end

  # Components are loaded on first reference.
  autoload :Crawl, 'spider_bot/crawl'
  autoload :Base, 'spider_bot/base'
  autoload :Engine, 'spider_bot/engine'

  module Http
    autoload :Client, 'spider_bot/http/client'
    autoload :Response, 'spider_bot/http/response'
  end
end
36
+
37
+ require 'spider_bot/railte' if defined?(Rails)
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'spider_bot/version'
5
+
6
# Gem specification for spider_bot. The packaged files are whatever git
# tracks; executables are taken from bin/ and test files from test/, spec/
# and features/.
Gem::Specification.new do |spec|
  spec.name          = "spider_bot"
  spec.version       = SpiderBot::VERSION
  spec.authors       = ["yee.li"]
  spec.email         = ["yeeli@outlook.com"]
  # "splider" was a misspelling of the gem's own name.
  spec.summary       = %q{spider bot}
  spec.description   = %q{spider bot}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_dependency "activesupport"
  spec.add_dependency "faraday"
  spec.add_dependency "nokogiri"
  spec.add_dependency "multi_json"
  spec.add_dependency "multi_xml"
  spec.add_dependency "tzinfo"
  spec.add_dependency "thor"
  spec.add_dependency 'daemons'
end
32
+ end