spider_bot 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ #coding: utf-8
2
+
3
+ module SpiderBot
4
+ module Http
5
+ class Client
6
+
7
+ # return url for HttpClient
8
+ attr_reader :url
9
+
10
+ # return http user_agent for HttpClient
11
+ attr_reader :user_agent
12
+
13
+ attr_reader :headers
14
+
15
+ #
16
+ attr_accessor :options
17
+
18
+ # return connection for HttpClient
19
+ attr_accessor :connection
20
+
21
+ attr_accessor :conn_build
22
+
23
+ # Supported User-Agent
24
+ #
25
+ # * Linux Firefox (3.6.1)
26
+ # * Linux Konqueror (3)
27
+ # * Linux Mozilla
28
+ # * Linux Chrome
29
+ # * Mac Firefox
30
+ # * Mac Mozilla
31
+ # * Mac Chrome
32
+ # * Mac Safari
33
+ # * bot (default)
34
+ # * Windows IE 6
35
+ # * Windows IE 7
36
+ # * Windows IE 8
37
+ # * Windows IE 9
38
+ # * Windows Mozilla
39
+ # * iPhone (3.0)
40
+ # * iPad
41
+ # * Android
42
+
43
+ USER_AGENT = {
44
+ 'bot' => "bot/#{SpiderBot::VERSION}",
45
+ 'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
46
+ 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
47
+ 'Linux Chrome' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624 Chrome/26.0.1410.43',
48
+ 'Mac Firefox' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:35.0) Gecko/20100101 Firefox/35.0',
49
+ 'Mac Safari' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.3.18 (KHTML, like Gecko) Version/8.0.3 Safari/600.3.18',
50
+ 'Mac Chrome' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36',
51
+ 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
52
+ 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
53
+ 'Windows IE 8' => 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
54
+ 'Windows IE 9' => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
55
+ 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
56
+ 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
57
+ 'iPad' => 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10',
58
+ 'Android' => 'Mozilla/5.0 (Linux; U; Android 3.0; en-us) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13'
59
+ }
60
+
61
# Build a new HTTP client.
#
# @param uri [String, nil] base URL, stored in @url
# @param options [Hash, nil] stored as-is in @options
# @yield [client] the client being built, so attributes can be set in a block
#
# @example
#   http = SpiderBot::Http::Client.new("http://example.com")
#
#   http = SpiderBot::Http::Client.new do |client|
#     client.user_agent = "Mac Safari"
#     client.url = "http://example.com"
#   end
def initialize(uri = nil, options = nil, &block)
  @url, @options = uri, options
  # Fall back to the gem's own agent string unless one is already present.
  @user_agent ||= USER_AGENT['bot']
  yield(self) if block_given?
end
82
+
83
# Remember a block that is stored in @conn_build for later use.
#
# @yield [builder] receives whatever the stored block is later called with
# @return [Proc, nil] the stored block (nil when none was given)
def builder(&configurator)
  @conn_build = configurator
end
86
+
87
# Set the base URL and drop the memoized connection.
#
# The connection is cached in @connection, so it has to be cleared here;
# otherwise requests keep going to the previously configured URL.
#
# @param uri [String] the new base URL
def url=(uri)
  @connection = nil
  @url = uri
end
95
+
96
# Set the request headers, always adding the current User-Agent.
#
# The client's user agent overrides any "User-Agent" entry in +headers+.
# Passing nil is treated as an empty header set.
#
# @param headers [Hash, nil] header names mapped to values
# @return [Hash] the stored headers including "User-Agent"
def headers=(headers)
  @headers = (headers || {}).merge("User-Agent" => user_agent)
end
104
+
105
# Select the User-Agent string by its name in USER_AGENT.
#
# The table is keyed by Strings; Symbols are accepted too and converted
# with #to_s. Unknown (or nil) names fall back to the 'bot' entry.
#
# @param name [String, Symbol] a key of USER_AGENT, e.g. "Mac Safari"
def user_agent=(name)
  @user_agent = USER_AGENT[name.to_s] || USER_AGENT['bot']
end
112
+
113
# Lazily build and memoize the Faraday connection for the current url.
#
# When a builder block was stored in conn_build, it is invoked from inside
# the connection's #build block with the object Faraday yields there.
#
# @return [Object] the value of Faraday.new(url: url), cached in @connection
def connection
  return @connection if @connection

  faraday = Faraday.new(url: url)
  faraday.build { |b| conn_build.call(b) } if conn_build
  @connection = faraday
end
125
+
126
+
127
# Perform an HTTP request and wrap the result in a Response.
#
# For :get the query is appended to the URL; for every other verb it is
# sent as the request body. Responses with status 301, 302, 303 or 307 are
# followed (same verb, same query) using the 'location' header, at most
# +redirect_limit+ times. When the limit is reached or no location is
# given, the redirect response itself is returned instead of recursing.
#
# @param verb [Symbol] one of :get, :post, :put, :delete
# @param uri [String] URL or path for the request
# @param query [Hash] query parameters (GET) or body (other verbs)
# @param redirect_limit [Integer] how many redirects may still be followed
# @yield [request] the request object yielded by run_request, on every hop
# @return [Response, nil] the response for 2xx/3xx statuses, nil otherwise
def request(verb, uri, query = {}, redirect_limit = 5, &block)
  params, body = verb == :get ? [query, nil] : [nil, query]
  target = connection.build_url(uri, params)

  raw = connection.run_request(verb, target, body, headers) do |req|
    block.call(req) if block
  end
  response = Response.new(raw)

  case response.status
  when 301, 302, 303, 307
    location = response.headers['location']
    # Stop instead of looping forever or re-requesting with a nil URL.
    return response if location.nil? || redirect_limit <= 0

    request(verb, location, query, redirect_limit - 1, &block)
  when 200..399
    response
  end
end
149
+
150
# Issue a GET request; a thin wrapper around #request.
#
# @param uri [String] URL or path for the request
# @param query [Hash] parameters passed on to #request
# @yield forwarded unchanged to #request
# @return whatever #request returns
def get(uri, query = {}, &block)
  verb = :get
  request(verb, uri, query, &block)
end
158
+
159
# Issue a POST request; a thin wrapper around #request.
#
# @param uri [String] URL or path for the request
# @param query [Hash] parameters passed on to #request
# @yield forwarded unchanged to #request
# @return whatever #request returns
def post(uri, query = {}, &block)
  verb = :post
  request(verb, uri, query, &block)
end
164
+ end
165
+ end
166
+ end
@@ -0,0 +1,83 @@
1
+ module SpiderBot
2
+ module Http
3
# Wraps a raw HTTP response (any object responding to #status, #headers and
# #body) and adds charset normalisation plus content-type based parsing.
class Response
  # @return [Object] the wrapped response object
  attr_reader :response

  # Content types mapped to the key of the parser that handles them.
  CONTENT_TYPE = {
    'application/json' => :json,
    'application/x-www-form-urlencoded' => :html,
    'text/html' => :html,
    'text/javascript' => :json,
    'text/xml' => :xml
  }

  # Parser callables keyed by type; each receives the decoded body.
  # The JSON parser returns the raw body when parsing raises.
  PARSERS = {
    :json => lambda{ |body| MultiJson.respond_to?(:adapter) ? MultiJson.load(body) : MultiJson.decode(body) rescue body},
    :html => lambda{ |body| Nokogiri::HTML(body)},
    :xml => lambda{ |body| MultiXml.parse(body) }
  }

  # @param response [Object] the raw response to wrap
  def initialize(response)
    @response = response
  end

  # @return [Object] the headers of the wrapped response
  def headers
    response.headers
  end

  # The response body, transcoded to UTF-8 where #decode decides to.
  #
  # @param options [Hash, nil] see #decode
  # @return [String]
  def body(options = {})
    decode(response.body, options || {})
  end

  # Transcode an HTML body to UTF-8 based on the charset it declares.
  #
  # Bodies that #json? considers non-HTML are returned untouched, as are
  # bodies declaring utf-8. A body that declares no charset is only
  # transcoded when options[:encode] names the source encoding. The input
  # string is never modified, so repeated calls give the same result.
  #
  # @param body [String, nil] the raw body
  # @param options [Hash, nil]
  # @option options [String] :encode source encoding to use instead of the
  #   declared charset
  # @return [String] the UTF-8 body, the original body when it cannot or
  #   need not be converted, or '' when body is nil/false
  def decode(body, options = {})
    return '' unless body
    return body if json?

    options ||= {}
    charset = body.match(/charset\s*=[\s|\W]*([\w-]+)/)
    declared = charset && charset[1]
    return body if declared && declared.downcase == "utf-8"

    source = options[:encode] || (declared && charset_covert(declared))
    return body unless source

    begin
      # Options must be keywords: a positional Hash raises on Ruby 3.
      body.encode("utf-8", source, invalid: :replace)
    rescue StandardError
      # Unknown or unconvertible encoding: hand back the raw body.
      body
    end
  end

  # @return [Object] the status of the wrapped response
  def status
    response.status
  end

  # The media type from the content-type header, without parameters.
  #
  # @return [String] e.g. "text/html"; '' when the header is absent
  def content_type
    raw = response.headers.values_at('content-type', 'Content-Type').compact.first || ''
    (raw.split(';').first || '').strip
  end

  # @return [Boolean] true when the content type maps to :json or the body
  #   contains no "<html"
  def json?
    CONTENT_TYPE[content_type] == :json || !response.body.match(/\<html/)
  end

  # The key into PARSERS for this response; :html for unknown content types.
  #
  # @return [Symbol]
  def parser
    type = CONTENT_TYPE[content_type]
    # NOTE(review): the trailing `|` makes this pattern match every string,
    # so this :json downgrade never fires. Left as-is because changing it
    # would alter what #parsed returns for HTML fragments; confirm intent.
    type = :json if type == :html && !response.body.match(/\<.*html|/)
    type = :html if type.nil?
    type
  end

  # The body run through the parser chosen by #parser (memoized).
  def parsed
    @parsed ||= PARSERS[parser].call(body)
  end

  # Map charset aliases to the encoding name used for conversion.
  #
  # @param charset [String] charset name as declared in the document
  # @return [String] "gbk" for gb2312/GB2312/GBK, otherwise the input
  def charset_covert(charset)
    case charset
    when "gb2312", "GB2312", "GBK"
      "gbk"
    else
      charset
    end
  end
end
82
+ end
83
+ end
@@ -0,0 +1,30 @@
1
# Load the host application from the current working directory: first
# ./config/application, and when that cannot be loaded, ./config/boot.rb
# if such a file exists.
begin
  require File.expand_path("./config/application")
rescue LoadError
  boot_file = File.expand_path("./config/boot.rb")
  require boot_file if File.exist?(boot_file)
end

# Padrino host: collect the bot files and log to a file outside development.
if defined?(Padrino)
  puts "read padrino environment #{Padrino.env}"
  BOTDIR = Dir.glob("#{Padrino.root}/app/bots/**/*_bot.rb")
  unless Padrino.env == :development
    SpiderBot::Logging.initialize_logger("#{Padrino.root}/log/spider.log")
  end
end

# Rails host: turn eager loading off before initializing the application,
# then collect the bot files and log to a file outside development.
if defined?(Rails)
  class Railtie < Rails::Railtie
    initializer "disable eager load" do |app|
      app.config.eager_load = false
    end
  end
  Rails.application.initialize!
  puts "read rails environment #{Rails.env}"
  BOTDIR = Dir.glob("#{Rails.root}/app/bots/**/*_bot.rb")
  Rails.logger.level = Logger::WARN
  unless Rails.env.development?
    SpiderBot::Logging.initialize_logger("#{Rails.root}/log/spider.log")
  end
end
30
+
@@ -0,0 +1,21 @@
1
+ require 'logger'
2
+
3
module SpiderBot
  # Holds the single Logger instance shared by the gem.
  module Logging
    # Replace the shared logger with a new INFO-level Logger on +log_target+.
    #
    # The previously created logger is closed unless $TESTING is set or it
    # was created on one of the standard streams: closing those would close
    # STDOUT/STDERR for the whole process. A logger installed through
    # .logger= has an unknown target and is still closed.
    #
    # @param log_target [IO, String] anything Logger.new accepts
    # @return [Logger] the new logger
    def self.initialize_logger(log_target = STDOUT)
      previous_logger = defined?(@logger) ? @logger : nil
      previous_target = defined?(@log_target) ? @log_target : nil

      @logger = Logger.new(log_target)
      @logger.level = Logger::INFO
      @log_target = log_target

      if previous_logger && !$TESTING && !standard_stream?(previous_target)
        previous_logger.close
      end
      @logger
    end

    # @return [Logger] the shared logger, created on STDOUT on first use
    def self.logger
      defined?(@logger) ? @logger : initialize_logger
    end

    # Install a custom logger; nil/false installs one that discards output.
    #
    # @param log [Logger, nil]
    def self.logger=(log)
      @log_target = nil
      # File::NULL is the platform's null device ('/dev/null' on Unix).
      @logger = (log ? log : Logger.new(File::NULL))
    end

    # True when +target+ is one of the process-wide standard streams.
    def self.standard_stream?(target)
      [STDOUT, STDERR, $stdout, $stderr].any? { |io| io.equal?(target) }
    end
    private_class_method :standard_stream?
  end
end
@@ -0,0 +1,6 @@
1
module SpiderBot
  # Rails integration point. It registers a "load bots" initializer whose
  # body is currently empty.
  class Railtie < Rails::Railtie
    initializer("load bots") { |app| }
  end
end
@@ -0,0 +1,29 @@
1
+
2
+ date:
3
+ time:
4
+ year: "年|year|years"
5
+ month: "月|month|months|mon"
6
+ week: "周|星期|week|weeks"
7
+ day: "日|天|day|days"
8
+ hour: "时|小时|時|小時|hour|hours|hr|hrs|h"
9
+ min: "分|分钟|minute|minutes|min|mins|m"
10
+ second: "秒|second|seconds|sec|secs|s"
11
+ month:
12
+ jan: "january|jan|一月"
13
+ feb: "february|feb|二月"
14
+ mar: "march|mar|三月"
15
+ apr: "april|apr|四月"
16
+ may: "may|五月"
17
+ jun: "june|jun|六月"
18
+ jul: "july|jul|七月"
19
+ aug: "august|aug|八月"
20
+ sep: "september|sep|九月"
21
+ oct: "october|oct|十月"
22
+ nov: "november|nov|十一月"
23
+ dec: "december|dec|十二月"
24
+ other:
25
+ ago: "ago|前|以前"
26
+ today: "today|今天"
27
+ am: "AM|am|上午"
28
+ pm: "PM|pm|下午"
29
+
@@ -0,0 +1,119 @@
1
+ #coding: utf-8
2
+ require "yaml"
3
+ require "active_support/time"
4
+ require 'tzinfo'
5
+
6
+ DATE_CONFIG = YAML.load_file(File.expand_path("../date.yml", __FILE__))
7
+
8
# Time-parsing extension for String, driven by the keyword lists in
# DATE_CONFIG ("date" => "time" / "month" / "other").
#
# No state is stored on the receiver, so frozen strings can be parsed too.
class String

  # Parse the string as a time in the given zone.
  #
  # Three forms are recognised, in this order: a relative time such as
  # "3 days ago", a "today"-style time, and an absolute date.
  #
  # NOTE: this assigns Time.zone, which stays in effect after the call.
  #
  # @param zone [String, nil] time zone of the site; "UTC" when nil
  # @return [ActiveSupport::TimeWithZone, nil] whatever the matching parser yields
  def parse_time(zone = nil)
    Time.zone = zone.nil? ? "UTC" : zone

    ago_regex = spider_bot_ago_regex
    # The alternation is deliberately left ungrouped (as configured), so a
    # bare "today" keyword also selects this branch.
    today_regex = /#{spider_bot_date_config("other")["today"]}\s*\d{1,2}:\d{1,2}/

    case self
    when ago_regex
      parse_date_ago
    when today_regex
      parse_today
    else
      parse_date
    end
  end

  private

  # One section ("time", "month" or "other") of the date configuration.
  def spider_bot_date_config(section)
    DATE_CONFIG["date"][section]
  end

  # Matches "<number> ... <time unit> <ago keyword>".
  def spider_bot_ago_regex
    units = spider_bot_date_config("time").values.join("|")
    ago = spider_bot_date_config("other")["ago"]
    /\d+[\s|\S]*(?:#{units})\s*(?:#{ago})/
  end

  # Parse a relative time ("<n> <unit> ago") against the current zone time.
  #
  # @raise [RuntimeError] when no known time unit is found
  def parse_date_ago
    now = Time.zone.now
    phrase = match(spider_bot_ago_regex)[0]
    amount = phrase.match(/\d+/)[0].to_i
    units = spider_bot_date_config("time")

    case phrase
    when /#{units["year"]}/
      now.years_ago(amount)
    when /#{units["month"]}/
      now.months_ago(amount)
    when /#{units["week"]}/
      now.ago(amount * 60 * 60 * 24 * 7)
    when /#{units["day"]}/
      now.ago(amount * 60 * 60 * 24)
    when /#{units["hour"]}/
      now.ago(amount * 60 * 60)
    when /#{units["min"]}/
      now.ago(amount * 60)
    when /#{units["second"]}/
      now.ago(amount)
    else
      raise "get date errors"
    end
  end

  # Parse "today HH:MM[:SS]" as that clock time on the current zone date.
  def parse_today
    now = Time.zone.now
    clock = match(/\d{1,2}\s*:\s*\d{1,2}:*\d{0,2}/)[0]
    Time.zone.parse("#{now.to_date} #{clock}")
  end

  # Parse an absolute date: year-month-day, two numbers followed by a year
  # (read as month, day, year), or "<month name> <day> <year>". Anything
  # else is handed to Time.zone.parse unchanged.
  def parse_date
    other = spider_bot_date_config("other")
    months = spider_bot_date_config("month")

    date_regex1 = /(\d{4})[^\d|:]{1,2}(\d{1,2})[^\d|:]{1,2}(\d{1,2})/
    date_regex2 = /(\d{1,2})[^\d|:]{1,2}(\d{1,2})[^\d|:]{1,2}(\d{4})/
    date_regex3 = /([\w|\W]+)[^\d|\w]{1,2}(\d{1,2})[^\d|:]*(\d{4})/

    # Optional clock time, with localized AM/PM keywords normalised.
    time = match(/\d{1,2}\s*:\d{1,2}\s*:*\d{0,2}(?:#{other["am"]}|#{other["pm"]})*/)
    time = time[0].gsub(/#{other["am"]}/, "am").gsub(/#{other["pm"]}/, "pm") if time

    case self
    when date_regex1
      date_text = match(date_regex1)
      Time.zone.parse("#{date_text[1]}-#{date_text[2]}-#{date_text[3]} #{time}")
    when date_regex2
      date_text = match(date_regex2)
      Time.zone.parse("#{date_text[3]}-#{date_text[1]}-#{date_text[2]} #{time}")
    when date_regex3
      date_text = match(date_regex3)
      month_text = date_text[1].downcase
      month_keys = %w[jan feb mar apr may jun jul aug sep oct nov dec]
      # Take the latest matching month: the jan/feb keyword lists also match
      # inside "十一月"/"十二月", which must resolve to 11/12, not 1/2.
      position = month_keys.rindex { |key| month_text =~ /#{months[key]}/ }
      month = position && position + 1
      Time.zone.parse("#{date_text[3]}-#{month}-#{date_text[2]} #{time}")
    else
      Time.zone.parse(self)
    end
  end
end
119
+
@@ -0,0 +1,3 @@
1
module SpiderBot
  # Version string of this gem release.
  VERSION = '0.0.4'
end
data/lib/spider_bot.rb ADDED
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+ require "faraday"
3
+ require 'uri'
4
+ require "nokogiri"
5
+ require "multi_json"
6
+ require "multi_xml"
7
+ require 'active_support/core_ext/string/conversions'
8
+ require 'spider_bot/logging'
9
+ require "spider_bot/version"
10
+
11
# Top-level namespace: crawl entry point, logger access and lazy loading
# of the gem's components.
module SpiderBot
  autoload :Crawl, 'spider_bot/crawl'
  autoload :Base, 'spider_bot/base'
  autoload :Engine, 'spider_bot/engine'

  module Http
    autoload :Client, 'spider_bot/http/client'
    autoload :Response, 'spider_bot/http/response'
  end

  # Crawl +url+.
  #
  # Without a block the result of Crawl#crawl_data is returned; with a
  # block, the block is evaluated in the context of the Crawl instance.
  #
  # @param url [String] the address to crawl
  # @param options [Hash] passed on to Crawl.new
  def self.crawl(url, options = {}, &block)
    crawler = Crawl.new(url, options)
    return crawler.crawl_data unless block_given?

    crawler.instance_eval(&block)
  end

  # @return the logger held by SpiderBot::Logging
  def self.logger
    SpiderBot::Logging.logger
  end

  # @param log the logger to hand to SpiderBot::Logging
  def self.logger=(log)
    SpiderBot::Logging.logger = log
  end
end
36
+
37
+ require 'spider_bot/railte' if defined?(Rails)
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'spider_bot/version'
5
+
6
# Gem packaging metadata for spider_bot.
Gem::Specification.new do |spec|
  spec.name          = "spider_bot"
  spec.version       = SpiderBot::VERSION
  spec.authors       = ["yee.li"]
  spec.email         = ["yeeli@outlook.com"]
  spec.summary       = %q{spider bot}
  spec.description   = %q{spider bot}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git; executables live under bin/.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_dependency "activesupport"
  spec.add_dependency "faraday"
  spec.add_dependency "nokogiri"
  spec.add_dependency "multi_json"
  spec.add_dependency "multi_xml"
  spec.add_dependency "tzinfo"
  spec.add_dependency "thor"
  spec.add_dependency 'daemons'
end