http_crawler 0.3.1.12 → 0.3.1.13
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b691ef837786382eeaae3b6368bbd6709ddc2d4fbee03fac819fa6e691f89e03
|
4
|
+
data.tar.gz: c5e94dd28381d78dd2798dc74f9d83498fa33e2c173b7edc62f7bd73a8692748
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e05f70823be3fb9e88768f508f726c1d2769fd255df1eca0007748854deb4b123623cc660b034cee72273014effbde05a860b398b87e3fad7a2d794c63adb2e1
|
7
|
+
data.tar.gz: ce6b3a0f35a2e70adf9a6747a2808ebe066d35c988a186e910979353553dd882ae3599285cf73e6bffcf4f7688b37f43df27410e693e68cf7700f7eccfcb21e6
|
data/lib/http_crawler/client.rb
CHANGED
@@ -309,7 +309,12 @@ module HttpCrawler
|
|
309
309
|
raise "必须定义块" unless block_given?
|
310
310
|
n = max_error_num
|
311
311
|
begin
|
312
|
-
block.call
|
312
|
+
r = block.call
|
313
|
+
if r.status.success?
|
314
|
+
return r
|
315
|
+
else
|
316
|
+
raise "请求失败(#{r.code}):#{r.uri.to_s}"
|
317
|
+
end
|
313
318
|
rescue => error
|
314
319
|
Rails.logger.debug error.class
|
315
320
|
case error
|
data/lib/http_crawler/common.rb
CHANGED
@@ -3,7 +3,7 @@ class String
|
|
3
3
|
# 清除包含: 空格,回车
|
4
4
|
#
|
5
5
|
def del_inter
|
6
|
-
self.gsub(/(?:\n|\t|\r| )/, "")
|
6
|
+
self.gsub(/(?:\n|\t|\r| | )/, "")
|
7
7
|
end
|
8
8
|
|
9
9
|
# 转换成时间格式
|
@@ -20,6 +20,14 @@ class String
|
|
20
20
|
"%Y-%m-%d %H:%M:%S",
|
21
21
|
"%Y-%m-%d%H:%M",
|
22
22
|
"%Y-%m-%d %H:%M",
|
23
|
+
|
24
|
+
"%Y-%m-%d",
|
25
|
+
"%Y年%m月%d日",
|
26
|
+
"%Y%m%d",
|
27
|
+
|
28
|
+
"%Y%m%d%H%M%S",
|
29
|
+
"%Y%m%d%H%M",
|
30
|
+
|
23
31
|
].each do |v|
|
24
32
|
begin
|
25
33
|
return Time.strptime(self, v)
|
@@ -28,6 +36,9 @@ class String
|
|
28
36
|
end
|
29
37
|
end
|
30
38
|
|
39
|
+
Time.at(self.to_i / 1000.0) if self.length == 13
|
40
|
+
Time.at(self.to_i) if self.length == 10
|
41
|
+
|
31
42
|
# 最后用 Time通用类型尝试
|
32
43
|
return Time.parse(self)
|
33
44
|
end
|
@@ -60,7 +60,7 @@ module HTTP
|
|
60
60
|
def html(data = nil)
|
61
61
|
|
62
62
|
if (data.blank? && defined? @html)
|
63
|
-
# 如果 data 为空 并且 @
|
63
|
+
# 如果 data 为空 并且 @html 有值,直接返回 @html
|
64
64
|
return @html
|
65
65
|
end
|
66
66
|
|
@@ -117,7 +117,12 @@ module HTTP
|
|
117
117
|
end
|
118
118
|
|
119
119
|
# 获取正文内容
|
120
|
-
def content(data =
|
120
|
+
def content(data = nil)
|
121
|
+
if(data.blank?)
|
122
|
+
data = readability.content
|
123
|
+
else
|
124
|
+
data = readability(data).content
|
125
|
+
end
|
121
126
|
Nokogiri::HTML(data).text.del_inter
|
122
127
|
end
|
123
128
|
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1.12
|
4
|
+
version: 0.3.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/http_crawler.rb
|
135
135
|
- lib/http_crawler/client.rb
|
136
136
|
- lib/http_crawler/common.rb
|
137
|
+
- lib/http_crawler/common/integer.rb
|
137
138
|
- lib/http_crawler/common/object.rb
|
138
139
|
- lib/http_crawler/common/string.rb
|
139
140
|
- lib/http_crawler/errors.rb
|