http_crawler 0.3.0.5 → 0.3.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/http_crawler.rb +2 -5
- data/lib/http_crawler/client.rb +27 -3
- data/lib/http_crawler/http/response.rb +45 -8
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f9c1f28cef3cb0daf678534a97d54a4ee4ec583ebd91db8ae57be114a4579197
|
4
|
+
data.tar.gz: cdcdbb7d34d944409d56df437a38c8daae4a32dc27c02406df45f94b95c67b75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 370d52a4216581172c92206465ef02f7ec99d3c507065cd87da4baec70700e6b84f89f8593c93e3f9282ca7213f78344791f3f649e533b7c13f65becc8710085
|
7
|
+
data.tar.gz: '06957bba7ccbe041ac18c25de19e36d69ac67dbc793ac4f6890ac8243cde38e53270e26bb149df98f3462ca7080e874bb388b7977f57cfa10ccd341c76b68d0d'
|
data/lib/http_crawler.rb
CHANGED
@@ -4,12 +4,9 @@ require 'nokogiri'
|
|
4
4
|
|
5
5
|
# 此段代码用于解决 require_dependency 是 rails 的内置方法 必须要先引用 Rails的包才能用的bug
|
6
6
|
class << self.class
|
7
|
-
|
8
|
-
|
9
|
-
alias_method :require_dependency, :require
|
10
|
-
end
|
7
|
+
# require 取别名 require_dependency
|
8
|
+
alias_method :require_dependency, :require
|
11
9
|
end
|
12
|
-
self.class.require_rename
|
13
10
|
|
14
11
|
# 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
|
15
12
|
require_dependency 'http_crawler/errors.rb'
|
data/lib/http_crawler/client.rb
CHANGED
@@ -66,6 +66,7 @@ module HttpCrawler
|
|
66
66
|
@connect_time = 5
|
67
67
|
@write_time = 5
|
68
68
|
@read_time = 5
|
69
|
+
@all_timeout = nil
|
69
70
|
end
|
70
71
|
|
71
72
|
# 初始化 ssl 协议
|
@@ -77,19 +78,28 @@ module HttpCrawler
|
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
81
|
+
attr_accessor :header
|
80
82
|
# 头文件相关方法
|
81
83
|
def header(parameter = {})
|
82
84
|
@header ||= init_header
|
83
85
|
end
|
84
86
|
|
85
87
|
def init_header(parameter = {})
|
86
|
-
@header = {
|
88
|
+
@header = {
|
89
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
|
90
|
+
"Accept-Encoding": "gzip, br",
|
91
|
+
"Accept-Language": "zh-CN,zh;q=0.9",
|
92
|
+
"Connection": "keep-alive",
|
93
|
+
"Upgrade-Insecure-Requests": "1",
|
94
|
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
|
95
|
+
}
|
87
96
|
end
|
88
97
|
|
89
98
|
def update_header(parameter = {})
|
90
|
-
|
99
|
+
@header = init_header
|
91
100
|
end
|
92
101
|
|
102
|
+
attr_accessor :cookies
|
93
103
|
# cookies相关方法
|
94
104
|
def cookies(parameter = {})
|
95
105
|
@cookies ||= init_cookies
|
@@ -103,6 +113,14 @@ module HttpCrawler
|
|
103
113
|
nil
|
104
114
|
end
|
105
115
|
|
116
|
+
# 字符串转换成cookies
|
117
|
+
# "abc=123; cd=412" => { "abc": "123", "cd": "412"}
|
118
|
+
def str_to_cookies(str)
|
119
|
+
str.scan(/([^=]*)=([^;]*);? ?/) do |m|
|
120
|
+
self.cookies[:"#{m[0]}"] = m[1]
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
106
124
|
# 代理设置
|
107
125
|
def auto_proxy=(value)
|
108
126
|
Rails.logger.debug "自动更新代理"
|
@@ -206,7 +224,13 @@ module HttpCrawler
|
|
206
224
|
h = h.cookies(cookies) if cookies
|
207
225
|
|
208
226
|
# 添加超时时间
|
209
|
-
|
227
|
+
if(@all_timeout)
|
228
|
+
# 整体总计超时时间
|
229
|
+
h = h.timeout(@all_timeout)
|
230
|
+
else
|
231
|
+
# 指定每个处理超时时间
|
232
|
+
h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
|
233
|
+
end
|
210
234
|
|
211
235
|
h
|
212
236
|
end
|
data/lib/http_crawler/http/response.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
module HTTP
|
2
2
|
class Response
|
3
3
|
|
4
|
-
|
5
4
|
# 解压并转码 body 数据
|
6
5
|
def decoding_body
|
7
|
-
|
8
6
|
return @decoding_body if @decoding_body
|
9
7
|
return nil unless self.body
|
10
8
|
|
@@ -58,28 +56,67 @@ module HTTP
|
|
58
56
|
end
|
59
57
|
|
60
58
|
alias_method :dec, :decoding_body
|
61
|
-
# def decoding_body
|
62
59
|
|
60
|
+
|
61
|
+
# 转换html格式
|
62
|
+
# @return [Nokogiri::HTML::Document]
|
63
63
|
def html
|
64
|
-
@html
|
64
|
+
return @html if @html
|
65
|
+
self.html = self.dec
|
65
66
|
end
|
66
67
|
|
68
|
+
# @return [Nokogiri::HTML::Document]
|
69
|
+
def html=(data)
|
70
|
+
if (Nokogiri::HTML::Document === data)
|
71
|
+
@html = data
|
72
|
+
else
|
73
|
+
@html = Nokogiri::HTML(data)
|
74
|
+
end
|
75
|
+
@html
|
76
|
+
end
|
77
|
+
|
78
|
+
# 转换json格式
|
79
|
+
# @return [Hash]
|
67
80
|
def json
|
68
|
-
@json
|
69
|
-
|
81
|
+
return @json if @json
|
82
|
+
self.json = self.dec
|
83
|
+
end
|
84
|
+
|
85
|
+
# @return [Hash]
|
86
|
+
def json=(data)
|
87
|
+
if (Hash === data)
|
88
|
+
@json = data
|
89
|
+
else
|
90
|
+
@json = JSON.parse(data)
|
91
|
+
@json = JSON.parse(@json) if String === @json
|
92
|
+
end
|
70
93
|
@json
|
71
94
|
end
|
72
95
|
|
73
96
|
# 通过readability 解析数据
|
74
|
-
# [Readability::Document]
|
97
|
+
# @return [Readability::Document]
|
75
98
|
def readability
|
76
|
-
@readability
|
99
|
+
return @readability if @readability
|
100
|
+
self.readability = self.dec
|
101
|
+
end
|
102
|
+
|
103
|
+
# @return [Readability::Document]
|
104
|
+
def readability=(data)
|
105
|
+
if (Readability::Document === data)
|
106
|
+
@readability = data
|
107
|
+
else
|
108
|
+
@readability = Readability::Document.new(data, {do_not_guess_encoding: true})
|
109
|
+
end
|
110
|
+
@readability
|
77
111
|
end
|
78
112
|
|
113
|
+
|
79
114
|
def content
|
80
115
|
Nokogiri::HTML(readability.content).text
|
81
116
|
end
|
117
|
+
|
82
118
|
# 解析
|
119
|
+
# 默认使用 json 的值
|
83
120
|
def parsing
|
84
121
|
self.json
|
85
122
|
end
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.5
|
4
|
+
version: 0.3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|