http_crawler 0.3.1.12 → 0.3.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b691ef837786382eeaae3b6368bbd6709ddc2d4fbee03fac819fa6e691f89e03
|
4
|
+
data.tar.gz: c5e94dd28381d78dd2798dc74f9d83498fa33e2c173b7edc62f7bd73a8692748
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e05f70823be3fb9e88768f508f726c1d2769fd255df1eca0007748854deb4b123623cc660b034cee72273014effbde05a860b398b87e3fad7a2d794c63adb2e1
|
7
|
+
data.tar.gz: ce6b3a0f35a2e70adf9a6747a2808ebe066d35c988a186e910979353553dd882ae3599285cf73e6bffcf4f7688b37f43df27410e693e68cf7700f7eccfcb21e6
|
data/lib/http_crawler/client.rb
CHANGED
@@ -309,7 +309,12 @@ module HttpCrawler
|
|
309
309
|
raise "必须定义块" unless block_given?
|
310
310
|
n = max_error_num
|
311
311
|
begin
|
312
|
-
block.call
|
312
|
+
r = block.call
|
313
|
+
if r.status.success?
|
314
|
+
return r
|
315
|
+
else
|
316
|
+
raise "请求失败(#{r.code}):#{r.uri.to_s}"
|
317
|
+
end
|
313
318
|
rescue => error
|
314
319
|
Rails.logger.debug error.class
|
315
320
|
case error
|
data/lib/http_crawler/common.rb
CHANGED
@@ -3,7 +3,7 @@ class String
|
|
3
3
|
# 清除包含: 空格,回车
|
4
4
|
#
|
5
5
|
def del_inter
|
6
|
-
self.gsub(/(?:\n|\t|\r| )/, "")
|
6
|
+
self.gsub(/(?:\n|\t|\r| | )/, "")
|
7
7
|
end
|
8
8
|
|
9
9
|
# 转换成时间格式
|
@@ -20,6 +20,14 @@ class String
|
|
20
20
|
"%Y-%m-%d %H:%M:%S",
|
21
21
|
"%Y-%m-%d%H:%M",
|
22
22
|
"%Y-%m-%d %H:%M",
|
23
|
+
|
24
|
+
"%Y-%m-%d",
|
25
|
+
"%Y年%m月%d日",
|
26
|
+
"%Y%m%d",
|
27
|
+
|
28
|
+
"%Y%m%d%H%M%S",
|
29
|
+
"%Y%m%d%H%M",
|
30
|
+
|
23
31
|
].each do |v|
|
24
32
|
begin
|
25
33
|
return Time.strptime(self, v)
|
@@ -28,6 +36,9 @@ class String
|
|
28
36
|
end
|
29
37
|
end
|
30
38
|
|
39
|
+
Time.at(self.to_i / 1000.0) if self.length == 13
|
40
|
+
Time.at(self.to_i) if self.length == 10
|
41
|
+
|
31
42
|
# 最后用 Time通用类型尝试
|
32
43
|
return Time.parse(self)
|
33
44
|
end
|
@@ -60,7 +60,7 @@ module HTTP
|
|
60
60
|
def html(data = nil)
|
61
61
|
|
62
62
|
if (data.blank? && defined? @html)
|
63
|
-
# 如果 data 为空 并且 @
|
63
|
+
# 如果 data 为空 并且 @html 有值,直接返回 @html
|
64
64
|
return @html
|
65
65
|
end
|
66
66
|
|
@@ -117,7 +117,12 @@ module HTTP
|
|
117
117
|
end
|
118
118
|
|
119
119
|
# 获取正文内容
|
120
|
-
def content(data =
|
120
|
+
def content(data = nil)
|
121
|
+
if(data.blank?)
|
122
|
+
data = readability.content
|
123
|
+
else
|
124
|
+
data = readability(data).content
|
125
|
+
end
|
121
126
|
Nokogiri::HTML(data).text.del_inter
|
122
127
|
end
|
123
128
|
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1.12
|
4
|
+
version: 0.3.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-04-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- lib/http_crawler.rb
|
135
135
|
- lib/http_crawler/client.rb
|
136
136
|
- lib/http_crawler/common.rb
|
137
|
+
- lib/http_crawler/common/integer.rb
|
137
138
|
- lib/http_crawler/common/object.rb
|
138
139
|
- lib/http_crawler/common/string.rb
|
139
140
|
- lib/http_crawler/errors.rb
|