http_crawler 0.3.1.12 → 0.3.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a3ad2fc0f149033558a66e87a8aa5f15b39aab1ce651b09ea8e8c86f99393d2
4
- data.tar.gz: fbcde9c6aed889e7dd1386499dc70efbc82e1415630a0ffb7eaf6b343061ae54
3
+ metadata.gz: b691ef837786382eeaae3b6368bbd6709ddc2d4fbee03fac819fa6e691f89e03
4
+ data.tar.gz: c5e94dd28381d78dd2798dc74f9d83498fa33e2c173b7edc62f7bd73a8692748
5
5
  SHA512:
6
- metadata.gz: 1d77e4285837d83229131b57b78fa13dd42aefb6465a8346ed06eca430bacfe31593e4ff05a25bb4637ac52ba9ab4eb4dd05da514c413cd5b0232b45a5be2a14
7
- data.tar.gz: cd974847c9f3f5cdc0a4576bd1932267c6c7516fff1aaddc6e1571735ae5a4e9b5f19b9acfad816ce99ab1bac9ce1cab0314412b3fbc34cfc3907cfb05a67c8c
6
+ metadata.gz: e05f70823be3fb9e88768f508f726c1d2769fd255df1eca0007748854deb4b123623cc660b034cee72273014effbde05a860b398b87e3fad7a2d794c63adb2e1
7
+ data.tar.gz: ce6b3a0f35a2e70adf9a6747a2808ebe066d35c988a186e910979353553dd882ae3599285cf73e6bffcf4f7688b37f43df27410e693e68cf7700f7eccfcb21e6
@@ -309,7 +309,12 @@ module HttpCrawler
309
309
  raise "必须定义块" unless block_given?
310
310
  n = max_error_num
311
311
  begin
312
- block.call
312
+ r = block.call
313
+ if r.status.success?
314
+ return r
315
+ else
316
+ raise "请求失败(#{r.code}):#{r.uri.to_s}"
317
+ end
313
318
  rescue => error
314
319
  Rails.logger.debug error.class
315
320
  case error
@@ -1,2 +1,3 @@
1
1
  require_dependency File.dirname(__FILE__) + '/common/object.rb'
2
2
  require_dependency File.dirname(__FILE__) + '/common/string.rb'
3
+ require_dependency File.dirname(__FILE__) + '/common/integer.rb'
@@ -0,0 +1,9 @@
1
+ class Integer
2
+ def to_time
3
+ if self >= 1000000000000 && self < 10000000000000
4
+ return Time.at(self / 1000.0)
5
+ else
6
+ return Time.at(self)
7
+ end
8
+ end
9
+ end
@@ -3,7 +3,7 @@ class String
3
3
  # 清除包含: 空格,回车
4
4
  #
5
5
  def del_inter
6
- self.gsub(/(?:\n|\t|\r| )/, "")
6
+ self.gsub(/(?:\n|\t|\r| | )/, "")
7
7
  end
8
8
 
9
9
  # 转换成时间格式
@@ -20,6 +20,14 @@ class String
20
20
  "%Y-%m-%d %H:%M:%S",
21
21
  "%Y-%m-%d%H:%M",
22
22
  "%Y-%m-%d %H:%M",
23
+
24
+ "%Y-%m-%d",
25
+ "%Y年%m月%d日",
26
+ "%Y%m%d",
27
+
28
+ "%Y%m%d%H%M%S",
29
+ "%Y%m%d%H%M",
30
+
23
31
  ].each do |v|
24
32
  begin
25
33
  return Time.strptime(self, v)
@@ -28,6 +36,9 @@ class String
28
36
  end
29
37
  end
30
38
 
39
+ Time.at(self.to_i / 1000.0) if self.length == 13
40
+ Time.at(self.to_i) if self.length == 10
41
+
31
42
  # 最后用 Time通用类型尝试
32
43
  return Time.parse(self)
33
44
  end
@@ -60,7 +60,7 @@ module HTTP
60
60
  def html(data = nil)
61
61
 
62
62
  if (data.blank? && defined? @html)
63
- # 如果 data 为空 并且 @json 有值,直接返回 @json
63
+ # 如果 data 为空 并且 @html 有值,直接返回 @html
64
64
  return @html
65
65
  end
66
66
 
@@ -117,7 +117,12 @@ module HTTP
117
117
  end
118
118
 
119
119
  # 获取正文内容
120
- def content(data = readability.content)
120
+ def content(data = nil)
121
+ if(data.blank?)
122
+ data = readability.content
123
+ else
124
+ data = readability(data).content
125
+ end
121
126
  Nokogiri::HTML(data).text.del_inter
122
127
  end
123
128
 
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.3.1.12"
2
+ VERSION = "0.3.1.13"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1.12
4
+ version: 0.3.1.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-03-23 00:00:00.000000000 Z
11
+ date: 2019-04-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -134,6 +134,7 @@ files:
134
134
  - lib/http_crawler.rb
135
135
  - lib/http_crawler/client.rb
136
136
  - lib/http_crawler/common.rb
137
+ - lib/http_crawler/common/integer.rb
137
138
  - lib/http_crawler/common/object.rb
138
139
  - lib/http_crawler/common/string.rb
139
140
  - lib/http_crawler/errors.rb