http_crawler 0.3.1.12 → 0.3.1.13

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 2a3ad2fc0f149033558a66e87a8aa5f15b39aab1ce651b09ea8e8c86f99393d2
-   data.tar.gz: fbcde9c6aed889e7dd1386499dc70efbc82e1415630a0ffb7eaf6b343061ae54
+   metadata.gz: b691ef837786382eeaae3b6368bbd6709ddc2d4fbee03fac819fa6e691f89e03
+   data.tar.gz: c5e94dd28381d78dd2798dc74f9d83498fa33e2c173b7edc62f7bd73a8692748
  SHA512:
-   metadata.gz: 1d77e4285837d83229131b57b78fa13dd42aefb6465a8346ed06eca430bacfe31593e4ff05a25bb4637ac52ba9ab4eb4dd05da514c413cd5b0232b45a5be2a14
-   data.tar.gz: cd974847c9f3f5cdc0a4576bd1932267c6c7516fff1aaddc6e1571735ae5a4e9b5f19b9acfad816ce99ab1bac9ce1cab0314412b3fbc34cfc3907cfb05a67c8c
+   metadata.gz: e05f70823be3fb9e88768f508f726c1d2769fd255df1eca0007748854deb4b123623cc660b034cee72273014effbde05a860b398b87e3fad7a2d794c63adb2e1
+   data.tar.gz: ce6b3a0f35a2e70adf9a6747a2808ebe066d35c988a186e910979353553dd882ae3599285cf73e6bffcf4f7688b37f43df27410e693e68cf7700f7eccfcb21e6
@@ -309,7 +309,12 @@ module HttpCrawler
        raise "a block must be given" unless block_given?
        n = max_error_num
        begin
-         block.call
+         r = block.call
+         if r.status.success?
+           return r
+         else
+           raise "request failed (#{r.code}): #{r.uri.to_s}"
+         end
        rescue => error
          Rails.logger.debug error.class
          case error
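
The change above makes the retry wrapper treat only a successful HTTP status as a valid result: the block's return value is now inspected, and a non-2xx response raises and re-enters the existing rescue/retry path instead of being returned silently. A minimal standalone sketch of the same pattern, assuming the http gem (whose HTTP::Response exposes status.success?, code and uri); the method name fetch_with_retry and the retry limit of 3 are illustrative, not the gem's internals:

# Sketch of the check-then-retry pattern shown in the hunk above.
require 'http'

def fetch_with_retry(url, max_error_num = 3)
  n = max_error_num
  begin
    r = HTTP.get(url)
    if r.status.success?
      return r                                      # only successful responses are returned
    else
      raise "request failed (#{r.code}): #{r.uri}"  # non-2xx re-enters the rescue below
    end
  rescue => error
    n -= 1
    retry if n > 0   # retry up to max_error_num times, then give up
    raise error
  end
end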
@@ -1,2 +1,3 @@
  require_dependency File.dirname(__FILE__) + '/common/object.rb'
  require_dependency File.dirname(__FILE__) + '/common/string.rb'
+ require_dependency File.dirname(__FILE__) + '/common/integer.rb'
@@ -0,0 +1,9 @@
+ class Integer
+   def to_time
+     if self >= 1000000000000 && self < 10000000000000
+       return Time.at(self / 1000.0)
+     else
+       return Time.at(self)
+     end
+   end
+ end
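
The new Integer#to_time helper treats 13-digit values as millisecond timestamps and everything else as seconds. A quick usage sketch, assuming the monkey patch above has been loaded:

# 13-digit integer: interpreted as milliseconds since the epoch.
1555660800000.to_time   # => Time.at(1555660800.0), i.e. 2019-04-19 08:00:00 UTC
# 10-digit integer: interpreted as whole seconds.
1555660800.to_time      # => the same instant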
@@ -3,7 +3,7 @@ class String
    # Strip embedded whitespace: spaces and line breaks
    #
    def del_inter
-     self.gsub(/(?:\n|\t|\r| )/, "")
+     self.gsub(/(?:\n|\t|\r| | )/, "")
    end
 
    # Convert the string to a Time value
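
The extra alternative added to the del_inter character class appears to be a non-breaking (or full-width) space, so text scraped from HTML containing &nbsp; is now flattened along with ordinary whitespace. A small illustration, assuming the patch above is loaded and that the added character is U+00A0:

# Illustrative only: "\u00A0" stands in for the character added to the regex.
"hello \u00A0 world\n".del_inter   # => "helloworld"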
@@ -20,6 +20,14 @@ class String
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d%H:%M",
        "%Y-%m-%d %H:%M",
+
+       "%Y-%m-%d",
+       "%Y年%m月%d日",
+       "%Y%m%d",
+
+       "%Y%m%d%H%M%S",
+       "%Y%m%d%H%M",
+
      ].each do |v|
        begin
          return Time.strptime(self, v)
@@ -28,6 +36,9 @@ class String
        end
      end
 
+     Time.at(self.to_i / 1000.0) if self.length == 13
+     Time.at(self.to_i) if self.length == 10
+
      # Finally, fall back to Ruby's generic Time parser
      return Time.parse(self)
    end
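
With the added patterns, String#to_time now also accepts date-only, Chinese-style and compact numeric dates before falling back to Time.parse (note that the two bare Time.at lines above have no return, so purely numeric strings still fall through to Time.parse). A usage sketch, assuming the String patch above is loaded; each string should be picked up by one of the newly added strptime patterns:

"2019-04-19".to_time       # "%Y-%m-%d"
"2019年04月19日".to_time    # "%Y年%m月%d日"
"20190419".to_time         # "%Y%m%d"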
@@ -60,7 +60,7 @@ module HTTP
      def html(data = nil)
 
        if (data.blank? && defined? @html)
-         # If data is blank and @json already has a value, return @json directly
+         # If data is blank and @html already has a value, return @html directly
          return @html
        end
 
@@ -117,7 +117,12 @@ module HTTP
      end
 
      # Extract the main body content
-     def content(data = readability.content)
+     def content(data = nil)
+       if (data.blank?)
+         data = readability.content
+       else
+         data = readability(data).content
+       end
        Nokogiri::HTML(data).text.del_inter
      end
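
The rewritten content method now runs readability over data when it is supplied explicitly, instead of always using the current response body. A standalone sketch of the same extract-then-flatten pipeline, using the ruby-readability and nokogiri gems directly; the gem's own readability wrapper is not shown in this diff, so this is an approximation rather than its exact code:

# Approximation: pull the main article with Readability, then strip tags
# and whitespace with Nokogiri, mirroring String#del_inter.
require 'readability'
require 'nokogiri'

def main_text(html)
  article_html = Readability::Document.new(html).content   # boilerplate removal
  Nokogiri::HTML(article_html).text.gsub(/(?:\n|\t|\r| )/, "")
end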
 
@@ -1,3 +1,3 @@
  module HttpCrawler
-   VERSION = "0.3.1.12"
+   VERSION = "0.3.1.13"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: http_crawler
  version: !ruby/object:Gem::Version
-   version: 0.3.1.12
+   version: 0.3.1.13
  platform: ruby
  authors:
  - jagger
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-03-23 00:00:00.000000000 Z
+ date: 2019-04-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rspec
@@ -134,6 +134,7 @@ files:
  - lib/http_crawler.rb
  - lib/http_crawler/client.rb
  - lib/http_crawler/common.rb
+ - lib/http_crawler/common/integer.rb
  - lib/http_crawler/common/object.rb
  - lib/http_crawler/common/string.rb
  - lib/http_crawler/errors.rb