http_crawler 0.3.0.5 → 0.3.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/http_crawler.rb +2 -5
- data/lib/http_crawler/client.rb +27 -3
- data/lib/http_crawler/http/response.rb +45 -8
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f9c1f28cef3cb0daf678534a97d54a4ee4ec583ebd91db8ae57be114a4579197
|
4
|
+
data.tar.gz: cdcdbb7d34d944409d56df437a38c8daae4a32dc27c02406df45f94b95c67b75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 370d52a4216581172c92206465ef02f7ec99d3c507065cd87da4baec70700e6b84f89f8593c93e3f9282ca7213f78344791f3f649e533b7c13f65becc8710085
|
7
|
+
data.tar.gz: '06957bba7ccbe041ac18c25de19e36d69ac67dbc793ac4f6890ac8243cde38e53270e26bb149df98f3462ca7080e874bb388b7977f57cfa10ccd341c76b68d0d'
|
data/lib/http_crawler.rb
CHANGED
@@ -4,12 +4,9 @@ require 'nokogiri'
|
|
4
4
|
|
5
5
|
# 此段代码用于解决 require_dependency 是 rails 的内置方法 必须要先引用 Rails的包才能用的bug
|
6
6
|
class << self.class
|
7
|
-
|
8
|
-
|
9
|
-
alias_method :require_dependency, :require
|
10
|
-
end
|
7
|
+
# require 取别名 require_dependency
|
8
|
+
alias_method :require_dependency, :require
|
11
9
|
end
|
12
|
-
self.class.require_rename
|
13
10
|
|
14
11
|
# 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
|
15
12
|
require_dependency 'http_crawler/errors.rb'
|
data/lib/http_crawler/client.rb
CHANGED
@@ -66,6 +66,7 @@ module HttpCrawler
|
|
66
66
|
@connect_time = 5
|
67
67
|
@write_time = 5
|
68
68
|
@read_time = 5
|
69
|
+
@all_timeout = nil
|
69
70
|
end
|
70
71
|
|
71
72
|
# 初始化 ssl 协议
|
@@ -77,19 +78,28 @@ module HttpCrawler
|
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
81
|
+
attr_accessor :header
|
80
82
|
# 头文件相关方法
|
81
83
|
def header(parameter = {})
|
82
84
|
@header ||= init_header
|
83
85
|
end
|
84
86
|
|
85
87
|
def init_header(parameter = {})
|
86
|
-
@header = {
|
88
|
+
@header = {
|
89
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
|
90
|
+
"Accept-Encoding": "gzip, br",
|
91
|
+
"Accept-Language": "zh-CN,zh;q=0.9",
|
92
|
+
"Connection": "keep-alive",
|
93
|
+
"Upgrade-Insecure-Requests": "1",
|
94
|
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
|
95
|
+
}
|
87
96
|
end
|
88
97
|
|
89
98
|
def update_header(parameter = {})
|
90
|
-
|
99
|
+
@header = init_header
|
91
100
|
end
|
92
101
|
|
102
|
+
attr_accessor :cookies
|
93
103
|
# cookies相关方法
|
94
104
|
def cookies(parameter = {})
|
95
105
|
@cookies ||= init_cookies
|
@@ -103,6 +113,14 @@ module HttpCrawler
|
|
103
113
|
nil
|
104
114
|
end
|
105
115
|
|
116
|
+
# 字符串转换成cookies
|
117
|
+
# "abc=123; cd=412" => { "abc": "123", "cd": "412"}
|
118
|
+
def str_to_cookies(str)
|
119
|
+
str.scan(/([^=]*)=([^;]*);? ?/) do |m|
|
120
|
+
self.cookies[:"#{m[0]}"] = m[1]
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
106
124
|
# 代理设置
|
107
125
|
def auto_proxy=(value)
|
108
126
|
Rails.logger.debug "自动更新代理"
|
@@ -206,7 +224,13 @@ module HttpCrawler
|
|
206
224
|
h = h.cookies(cookies) if cookies
|
207
225
|
|
208
226
|
# 添加超时时间
|
209
|
-
|
227
|
+
if(@all_timeout)
|
228
|
+
# 整体总计超时时间
|
229
|
+
h = h.timeout(@all_timeout)
|
230
|
+
else
|
231
|
+
# 指定每个处理超时时间
|
232
|
+
h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
|
233
|
+
end
|
210
234
|
|
211
235
|
h
|
212
236
|
end
|
data/lib/http_crawler/http/response.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
module HTTP
|
2
2
|
class Response
|
3
3
|
|
4
|
-
|
5
4
|
# 解压并转码 body 数据
|
6
5
|
def decoding_body
|
7
|
-
|
8
6
|
return @decoding_body if @decoding_body
|
9
7
|
return nil unless self.body
|
10
8
|
|
@@ -58,28 +56,67 @@ module HTTP
|
|
58
56
|
end
|
59
57
|
|
60
58
|
alias_method :dec, :decoding_body
|
61
|
-
# def decoding_body
|
62
59
|
|
60
|
+
|
61
|
+
# 转换html格式
|
62
|
+
# @return [Nokogiri::HTML::Document]
|
63
63
|
def html
|
64
|
-
@html
|
64
|
+
return @html if @html
|
65
|
+
self.html = self.dec
|
65
66
|
end
|
66
67
|
|
68
|
+
# @return [Nokogiri::HTML::Document]
|
69
|
+
def html=(data)
|
70
|
+
if (Nokogiri::HTML::Document === data)
|
71
|
+
@html = data
|
72
|
+
else
|
73
|
+
@html = Nokogiri::HTML(data)
|
74
|
+
end
|
75
|
+
@html
|
76
|
+
end
|
77
|
+
|
78
|
+
# 转换json格式
|
79
|
+
# @return [Hash]
|
67
80
|
def json
|
68
|
-
@json
|
69
|
-
|
81
|
+
return @json if @json
|
82
|
+
self.json = self.dec
|
83
|
+
end
|
84
|
+
|
85
|
+
# @return [Hash]
|
86
|
+
def json=(data)
|
87
|
+
if (Hash === data)
|
88
|
+
@json = data
|
89
|
+
else
|
90
|
+
@json = JSON.parse(data)
|
91
|
+
@json = JSON.parse(@json) if String === @json
|
92
|
+
end
|
70
93
|
@json
|
71
94
|
end
|
72
95
|
|
73
96
|
# 通过readability 解析数据
|
74
|
-
# [Readability::Document]
|
97
|
+
# @return [Readability::Document]
|
75
98
|
def readability
|
76
|
-
@readability
|
99
|
+
return @readability if @readability
|
100
|
+
self.readability = self.dec
|
101
|
+
end
|
102
|
+
|
103
|
+
# @return [Readability::Document]
|
104
|
+
def readability=(data)
|
105
|
+
if (Readability::Document === data)
|
106
|
+
@readability = data
|
107
|
+
else
|
108
|
+
@readability = Readability::Document.new(data, {do_not_guess_encoding: true})
|
109
|
+
end
|
110
|
+
@readability
|
77
111
|
end
|
78
112
|
|
113
|
+
|
79
114
|
def content
|
80
115
|
Nokogiri::HTML(readability.content).text
|
81
116
|
end
|
117
|
+
|
82
118
|
# 解析
|
119
|
+
# 默认使用 json 的值
|
83
120
|
def parsing
|
84
121
|
self.json
|
85
122
|
end
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.5
|
4
|
+
version: 0.3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|