http_crawler 0.3.0.5 → 0.3.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4deb4d5965b16f6f8edfbeea11855ec7c2af4d3a1d2337b8afba6fd7dfc34b28
4
- data.tar.gz: 6ad1f67ff98bd61d7a7f105ea51ac2d692b9f81fbab3674404aa41f32978a1a3
3
+ metadata.gz: f9c1f28cef3cb0daf678534a97d54a4ee4ec583ebd91db8ae57be114a4579197
4
+ data.tar.gz: cdcdbb7d34d944409d56df437a38c8daae4a32dc27c02406df45f94b95c67b75
5
5
  SHA512:
6
- metadata.gz: 144d4ed97e3d60c541d5e878b8a7252378effd929e778dc743af7aaf83dfae3e9c228759297d1c7fb4274c6680e920ac4a1fcb3711228d2ca75d99473fde771b
7
- data.tar.gz: 44f5c97fc062d0a0424154f07bddb09078316df9a0c1aba27723083baeb4aebabed266f652b19d89f0448f08fe7c8db26871121ee6e6bdca9d4c29de7097f292
6
+ metadata.gz: 370d52a4216581172c92206465ef02f7ec99d3c507065cd87da4baec70700e6b84f89f8593c93e3f9282ca7213f78344791f3f649e533b7c13f65becc8710085
7
+ data.tar.gz: '06957bba7ccbe041ac18c25de19e36d69ac67dbc793ac4f6890ac8243cde38e53270e26bb149df98f3462ca7080e874bb388b7977f57cfa10ccd341c76b68d0d'
data/lib/http_crawler.rb CHANGED
@@ -4,12 +4,9 @@ require 'nokogiri'
4
4
 
5
5
  # 此段代码用于解决 require_dependency 是 rails 的内置方法 必须要先引用 Rails的包才能用的bug
6
6
  class << self.class
7
- def require_rename
8
- # require 取别名 require_dependency
9
- alias_method :require_dependency, :require
10
- end
7
+ # require 取别名 require_dependency
8
+ alias_method :require_dependency, :require
11
9
  end
12
- self.class.require_rename
13
10
 
14
11
  # 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
15
12
  require_dependency 'http_crawler/errors.rb'
@@ -66,6 +66,7 @@ module HttpCrawler
66
66
  @connect_time = 5
67
67
  @write_time = 5
68
68
  @read_time = 5
69
+ @all_timeout = nil
69
70
  end
70
71
 
71
72
  # 初始化 ssl 协议
@@ -77,19 +78,28 @@ module HttpCrawler
77
78
  end
78
79
  end
79
80
 
81
# Writer only: the reader is the hand-written, lazily initialising #header
# below, so a generated reader would just be redefined (and Ruby warns about
# the redefinition when warnings are enabled).
attr_writer :header

# Header-related reader.
# @param parameter [Hash] accepted but not used by this method
# @return [Object] the memoised @header, built by #init_header on first access
def header(parameter = {})
  @header ||= init_header
end
84
86
 
85
87
  def init_header(parameter = {})
86
- @header = {}
88
+ @header = {
89
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
90
+ "Accept-Encoding": "gzip, br",
91
+ "Accept-Language": "zh-CN,zh;q=0.9",
92
+ "Connection": "keep-alive",
93
+ "Upgrade-Insecure-Requests": "1",
94
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
95
+ }
87
96
  end
88
97
 
89
98
  def update_header(parameter = {})
90
- nil
99
+ @header = init_header
91
100
  end
92
101
 
102
+ attr_accessor :cookies
93
103
  # cookies相关方法
94
104
  def cookies(parameter = {})
95
105
  @cookies ||= init_cookies
@@ -103,6 +113,14 @@ module HttpCrawler
103
113
  nil
104
114
  end
105
115
 
116
+ # 字符串转换成cookies
117
+ # "abc=123; cd=412" => { "abc": "123", "cd": "412"}
118
# Parses a Cookie-header style string and stores each name/value pair in
# #cookies under a symbol key.
#   "abc=123; cd=412" stores { abc: "123", cd: "412" }
# Segments are separated by ";". Only the first "=" in a segment splits name
# from value, so values may themselves contain "=". Segments with no "="
# (for example a bare "Secure" flag) or with an empty name are ignored, and
# surrounding whitespace is stripped from names and values.
# @param str [String, nil] cookie string; nil is treated as empty
# @return [Object] the #cookies collection after the pairs were stored
def str_to_cookies(str)
  jar = cookies
  str.to_s.split(';').each do |segment|
    name, value = segment.split('=', 2)
    next if value.nil? # no "=" in this segment

    name = name.strip
    next if name.empty?

    jar[name.to_sym] = value.strip
  end
  jar
end
123
+
106
124
  # 代理设置
107
125
  def auto_proxy=(value)
108
126
  Rails.logger.debug "自动更新代理"
@@ -206,7 +224,13 @@ module HttpCrawler
206
224
  h = h.cookies(cookies) if cookies
207
225
 
208
226
  # 添加超时时间
209
- h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
227
+ if(@all_timeout)
228
+ # 整体总计超时时间
229
+ h = h.timeout(@all_timeout)
230
+ else
231
+ # 指定每个处理超时时间
232
+ h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
233
+ end
210
234
 
211
235
  h
212
236
  end
@@ -1,10 +1,8 @@
1
1
  module HTTP
2
2
  class Response
3
3
 
4
-
5
4
  # 解压并转码 body 数据
6
5
  def decoding_body
7
-
8
6
  return @decoding_body if @decoding_body
9
7
  return nil unless self.body
10
8
 
@@ -58,28 +56,67 @@ module HTTP
58
56
  end
59
57
 
60
58
  alias_method :dec, :decoding_body
61
- # def decoding_body
62
59
 
60
+
61
+ # 转换html格式
62
+ # @return [Nokogiri::HTML::Document]
63
63
  def html
64
- @html ||= Nokogiri::HTML(decoding_body)
64
+ return @html if @html
65
+ self.html = self.dec
65
66
  end
66
67
 
68
+ # @return [Nokogiri::HTML::Document]
69
+ def html=(data)
70
+ if (Nokogiri::HTML::Document === data)
71
+ @html = data
72
+ else
73
+ @html = Nokogiri::HTML(data)
74
+ end
75
+ @html
76
+ end
77
+
78
+ # 转换json格式
79
+ # @return [Hash]
67
80
  def json
68
- @json ||= JSON.parse(decoding_body)
69
- @json = JSON.parse(@json) if String === @json
81
+ return @json if @json
82
+ self.json = self.dec
83
+ end
84
+
85
+ # @return [Hash]
86
+ def json=(data)
87
+ if (Hash === data)
88
+ @json = data
89
+ else
90
+ @json = JSON.parse(data)
91
+ @json = JSON.parse(@json) if String === @json
92
+ end
70
93
  @json
71
94
  end
72
95
 
73
96
  # 通过readability 解析数据
74
- # [Readability::Document]
97
+ # @return [Readability::Document]
75
98
  def readability
76
- @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
99
+ return @readability if @readability
100
+ self.readability = self.dec
101
+ end
102
+
103
+ # @return [Readability::Document]
104
+ def readability=(data)
105
+ if (Readability::Document === data)
106
+ @readability = data
107
+ else
108
+ @readability = Readability::Document.new(data, {do_not_guess_encoding: true})
109
+ end
110
+ @readability
77
111
  end
78
112
 
113
+
79
114
  def content
80
115
  Nokogiri::HTML(readability.content).text
81
116
  end
117
+
82
118
  # 解析
119
+ # 默认使用 json 的值
83
120
  def parsing
84
121
  self.json
85
122
  end
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.3.0.5"
2
+ VERSION = "0.3.0.6"
3
3
  end
@@ -4,6 +4,11 @@ module HttpCrawler
4
4
  module Baidu
5
5
  class Client < HttpCrawler::Web::Client
6
6
 
7
# Configures this client to use a single overall timeout of 3 seconds
# (stored in @all_timeout).
# NOTE(review): this does not call super; confirm the parent class has no
# init_client work that still needs to run.
# @return [Integer] the timeout that was set
def init_client
  @all_timeout = 3
end
11
+
7
12
  def init_uri
8
13
  @uri = URI("https://www.baidu.com")
9
14
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.5
4
+ version: 0.3.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-22 00:00:00.000000000 Z
11
+ date: 2019-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec