http_crawler 0.3.0.5 → 0.3.0.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4deb4d5965b16f6f8edfbeea11855ec7c2af4d3a1d2337b8afba6fd7dfc34b28
-  data.tar.gz: 6ad1f67ff98bd61d7a7f105ea51ac2d692b9f81fbab3674404aa41f32978a1a3
+  metadata.gz: f9c1f28cef3cb0daf678534a97d54a4ee4ec583ebd91db8ae57be114a4579197
+  data.tar.gz: cdcdbb7d34d944409d56df437a38c8daae4a32dc27c02406df45f94b95c67b75
 SHA512:
-  metadata.gz: 144d4ed97e3d60c541d5e878b8a7252378effd929e778dc743af7aaf83dfae3e9c228759297d1c7fb4274c6680e920ac4a1fcb3711228d2ca75d99473fde771b
-  data.tar.gz: 44f5c97fc062d0a0424154f07bddb09078316df9a0c1aba27723083baeb4aebabed266f652b19d89f0448f08fe7c8db26871121ee6e6bdca9d4c29de7097f292
+  metadata.gz: 370d52a4216581172c92206465ef02f7ec99d3c507065cd87da4baec70700e6b84f89f8593c93e3f9282ca7213f78344791f3f649e533b7c13f65becc8710085
+  data.tar.gz: '06957bba7ccbe041ac18c25de19e36d69ac67dbc793ac4f6890ac8243cde38e53270e26bb149df98f3462ca7080e874bb388b7977f57cfa10ccd341c76b68d0d'
data/lib/http_crawler.rb CHANGED
@@ -4,12 +4,9 @@ require 'nokogiri'
 
 # Works around require_dependency being a Rails built-in method that is only usable once the Rails packages have been required
 class << self.class
-  def require_rename
-    # alias require as require_dependency
-    alias_method :require_dependency, :require
-  end
+  # alias require as require_dependency
+  alias_method :require_dependency, :require
 end
-self.class.require_rename
 
 # Never use require or load here; otherwise hot reloading stops working when debugging under Rails
 require_dependency 'http_crawler/errors.rb'
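For context: outside Rails this leaves require_dependency behaving exactly like require, while under Rails the real require_dependency (with reload support) is used instead. A minimal sketch of the same idea, not the gem's exact code:

    # Hedged illustration: expose require_dependency outside Rails
    # by aliasing it to Kernel#require.
    module Kernel
      alias_method :require_dependency, :require
    end

    require_dependency 'set'  # now behaves exactly like require 'set'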
@@ -66,6 +66,7 @@ module HttpCrawler
       @connect_time = 5
       @write_time = 5
       @read_time = 5
+      @all_timeout = nil
     end
 
     # initialize the ssl protocol
@@ -77,19 +78,28 @@ module HttpCrawler
       end
     end
 
+    attr_accessor :header
     # header-related methods
     def header(parameter = {})
      @header ||= init_header
     end
 
    def init_header(parameter = {})
-      @header = {}
+      @header = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
+        "Accept-Encoding": "gzip, br",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
+      }
    end
 
    def update_header(parameter = {})
-      nil
+      @header = init_header
    end
 
+    attr_accessor :cookies
    # cookies-related methods
    def cookies(parameter = {})
      @cookies ||= init_cookies
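With these defaults a freshly built client already sends browser-like headers. A hedged usage sketch, assuming the Baidu client from this gem can be constructed without arguments:

    client = HttpCrawler::Web::Baidu::Client.new
    client.header[:"Referer"] = "https://www.baidu.com"  # tweak one field of the memoized hash
    client.header = { "User-Agent": "my-bot/1.0" }       # the new attr_accessor writer replaces it wholesale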
@@ -103,6 +113,14 @@ module HttpCrawler
      nil
    end
 
+    # convert a string into cookies
+    # "abc=123; cd=412" => { "abc": "123", "cd": "412" }
+    def str_to_cookies(str)
+      str.scan(/([^=]*)=([^;]*);? ?/) do |m|
+        self.cookies[:"#{m[0]}"] = m[1]
+      end
+    end
+
    # proxy settings
    def auto_proxy=(value)
      Rails.logger.debug "auto-refreshing proxy"
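A quick sketch of the new helper in use, matching its own comment's example (client as in the earlier sketch):

    client.str_to_cookies("abc=123; cd=412")
    client.cookies  #=> { abc: "123", cd: "412" }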
@@ -206,7 +224,13 @@ module HttpCrawler
      h = h.cookies(cookies) if cookies
 
      # add timeouts
-      h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
+      if @all_timeout
+        # one overall timeout for the whole request
+        h = h.timeout(@all_timeout)
+      else
+        # separate timeouts per phase
+        h = h.timeout(connect: @connect_time, write: @write_time, read: @read_time)
+      end
 
      h
    end
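This mirrors the two timeout forms of the underlying http gem: one overall deadline, or separate connect/write/read limits. A standalone sketch against the raw gem:

    require 'http'

    HTTP.timeout(3).get("https://www.baidu.com")                               # single 3-second deadline
    HTTP.timeout(connect: 5, write: 5, read: 5).get("https://www.baidu.com")   # per-phase limits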
@@ -1,10 +1,8 @@
 module HTTP
   class Response
 
-
    # decompress and transcode the body data
    def decoding_body
-
      return @decoding_body if @decoding_body
      return nil unless self.body
 
@@ -58,28 +56,67 @@ module HTTP
    end
 
    alias_method :dec, :decoding_body
-    # def decoding_body
 
+
+    # convert to HTML format
+    # @return [Nokogiri::HTML::Document]
    def html
-      @html ||= Nokogiri::HTML(decoding_body)
+      return @html if @html
+      self.html = self.dec
    end
 
+    # @return [Nokogiri::HTML::Document]
+    def html=(data)
+      if Nokogiri::HTML::Document === data
+        @html = data
+      else
+        @html = Nokogiri::HTML(data)
+      end
+      @html
+    end
+
+    # convert to JSON format
+    # @return [Hash]
    def json
-      @json ||= JSON.parse(decoding_body)
-      @json = JSON.parse(@json) if String === @json
+      return @json if @json
+      self.json = self.dec
+    end
+
+    # @return [Hash]
+    def json=(data)
+      if Hash === data
+        @json = data
+      else
+        @json = JSON.parse(data)
+        @json = JSON.parse(@json) if String === @json
+      end
      @json
    end
 
    # parse data with readability
-    # [Readability::Document]
+    # @return [Readability::Document]
    def readability
-      @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
+      return @readability if @readability
+      self.readability = self.dec
+    end
+
+    # @return [Readability::Document]
+    def readability=(data)
+      if Readability::Document === data
+        @readability = data
+      else
+        @readability = Readability::Document.new(data, {do_not_guess_encoding: true})
+      end
+      @readability
    end
 
+
    def content
      Nokogiri::HTML(readability.content).text
    end
+
    # parse
+    # defaults to the json value
    def parsing
      self.json
    end
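Since the memoized getters now delegate to setters, a parsed form can also be injected directly. A hedged sketch, assuming http_crawler's HTTP::Response patches are loaded:

    res = HTTP.get("https://www.baidu.com")
    res.html                  # Nokogiri::HTML::Document built from the decoded body
    res.json = '{"code": 0}'  # inject raw JSON; the setter stores a parsed Hash
    res.json                  #=> {"code"=>0}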
@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.3.0.5"
+  VERSION = "0.3.0.6"
 end
@@ -4,6 +4,11 @@ module HttpCrawler
   module Baidu
     class Client < HttpCrawler::Web::Client
 
+      def init_client
+        # set the overall timeout to 3 seconds
+        @all_timeout = 3
+      end
+
      def init_uri
        @uri = URI("https://www.baidu.com")
      end
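Any other client subclass can opt into the overall deadline the same way; a sketch with a hypothetical Example site:

    module HttpCrawler
      module Web
        module Example  # hypothetical, for illustration only
          class Client < HttpCrawler::Web::Client
            def init_client
              @all_timeout = 10  # one 10-second deadline per request
            end

            def init_uri
              @uri = URI("https://www.example.com")
            end
          end
        end
      end
    end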
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.5
+  version: 0.3.0.6
 platform: ruby
 authors:
 - jagger
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-02-22 00:00:00.000000000 Z
+date: 2019-02-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec