grab_epg 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +1 -1
- data/lib/grab_tvmao.rb +2 -5
- data/lib/grabepg/grab_base.rb +32 -23
- data/lib/grabepg/grab_tvsou.rb +36 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MTI5MTMwMTY1NTRmZjk5NGIwZGM4MTk3NTljNGFjMzAwOWY4NTdiNA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NzNjZTc3YzY1OWZkYTZjOGUzNTVjNzVmZjgzYjg3NjQyZWUzNGFmMg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZGMwNThmMzZlY2FmZmI1ZmQzNjY5ODdkYTI4MTk5MWI2NWZiODBlZjQ1YzNk
|
10
|
+
MzllOGE1YmNkZjRiNjc3MDlhY2FjZjMyNjFiNTcxYjFlZTlmYzgwNmVlMmQx
|
11
|
+
NzUxODIwMGE1MjgxZWM0NWY3ZDlmNWE0YmYyN2U0NTY1YjU3NmQ=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTA3MmE5MGZkMzU5YzliYjljNTQ1NzljNWViYTQ5YWU5ZmNlZGQ5OWJmZTUz
|
14
|
+
YmZiN2QyNjUzOTk0OGQwMzM0NmZjOTEwOTI2MzJkZjAxMDg5YzdlNzUxNjM3
|
15
|
+
Y2VkNmQzMGUyMjQ0Nzc5MTZkMGE5NjY4Y2IwZTY2ZGI5Y2MyOTA=
|
data/.grabepg.gemspec
CHANGED
data/lib/grab_tvmao.rb
CHANGED
@@ -264,7 +264,7 @@ module GrabTvmao
|
|
264
264
|
proxy = proxylist[@proxyindex+1]
|
265
265
|
end
|
266
266
|
begin
|
267
|
-
doc = Nokogiri::HTML(open(url,:proxy=>"
|
267
|
+
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
|
268
268
|
doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
|
269
269
|
@no_firest = 0
|
270
270
|
rescue => err
|
@@ -274,15 +274,12 @@ module GrabTvmao
|
|
274
274
|
end
|
275
275
|
|
276
276
|
@no_firest += 1
|
277
|
-
p "*************************Proxy:#{proxy}, url:#{url}"
|
277
|
+
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err.to_s}"
|
278
278
|
#proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
|
279
279
|
get_doc_with_proxy(proxylist,url) if @no_firest<4
|
280
280
|
raise RuntimeError,"Error: #{err.to_s}" unless @no_firest<4
|
281
281
|
end
|
282
282
|
@proxyindex += 1
|
283
|
-
unless doc
|
284
|
-
p "*************************Proxy:#{proxy}, url:#{url}"
|
285
|
-
end
|
286
283
|
doc
|
287
284
|
end
|
288
285
|
|
data/lib/grabepg/grab_base.rb
CHANGED
@@ -119,8 +119,28 @@ module Grabepg
|
|
119
119
|
end
|
120
120
|
|
121
121
|
|
122
|
+
def err_doc_proxy(proxy,proxylist,url="",err="")
|
123
|
+
if proxy.empty?||proxy.nil?
|
124
|
+
proxylist.delete_at[@proxyindex]
|
125
|
+
end
|
126
|
+
|
122
127
|
|
128
|
+
unless @no_firest
|
129
|
+
@no_firest = 0
|
130
|
+
end
|
123
131
|
|
132
|
+
@no_firest += 1
|
133
|
+
p "*************************Proxy:#{proxy}, url:#{url} Error:#{err}"
|
134
|
+
#proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
|
135
|
+
@proxyindex += 1
|
136
|
+
@proxyindex=@proxyindex%@size
|
137
|
+
doc=get_doc_with_proxy(proxylist,url) if @no_firest<4
|
138
|
+
unless @no_firest<4
|
139
|
+
@no_firest=0
|
140
|
+
raise RuntimeError,"Error: #{err}"
|
141
|
+
end
|
142
|
+
doc
|
143
|
+
end
|
124
144
|
|
125
145
|
|
126
146
|
#使用代理获取url的html的doc值
|
@@ -129,6 +149,7 @@ module Grabepg
|
|
129
149
|
unless @proxyindex
|
130
150
|
@proxyindex = 0
|
131
151
|
end
|
152
|
+
@size = proxylist.size
|
132
153
|
@proxyindex=@proxyindex%proxylist.size
|
133
154
|
if(proxylist[@proxyindex])
|
134
155
|
proxy = proxylist[@proxyindex]
|
@@ -136,35 +157,23 @@ module Grabepg
|
|
136
157
|
proxy = proxylist[@proxyindex+1]
|
137
158
|
end
|
138
159
|
begin
|
139
|
-
doc = Nokogiri::HTML(open(url,:proxy=>"
|
140
|
-
|
141
|
-
|
160
|
+
doc = Nokogiri::HTML(open(url,:proxy=>"#{proxy}")) unless proxy.nil?||proxy.empty?
|
161
|
+
if doc.nil?
|
162
|
+
doc=err_doc_proxy(proxy,proxylist,url,"doc nil")
|
163
|
+
@no_firest=0
|
142
164
|
end
|
143
165
|
@no_firest = 0
|
144
166
|
rescue => err
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
unless @no_firest
|
151
|
-
@no_firest = 0
|
152
|
-
end
|
153
|
-
|
154
|
-
@no_firest += 1
|
155
|
-
p "*************************Proxy:#{proxy}, url:#{url}"
|
156
|
-
#proxylist.delete(proxy) #删除出错的代理 但如果是此网页错误则会引起BUG待修复
|
167
|
+
p "IN Rescue"
|
168
|
+
doc=err_doc_proxy(proxy,proxylist,url,err.to_s)
|
169
|
+
@no_firest=0
|
170
|
+
p "Get DOC"
|
157
171
|
@proxyindex += 1
|
158
|
-
|
159
|
-
|
160
|
-
@no_firest=0
|
161
|
-
raise RuntimeError,"Error: #{err.to_s}"
|
162
|
-
end
|
172
|
+
@proxyindex=@proxyindex%@size
|
173
|
+
return doc
|
163
174
|
end
|
164
175
|
@proxyindex += 1
|
165
|
-
|
166
|
-
p "*************************Proxy:#{proxy}, url:#{url}"
|
167
|
-
end
|
176
|
+
@proxyindex=@proxyindex%@size
|
168
177
|
else
|
169
178
|
begin
|
170
179
|
doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
|
data/lib/grabepg/grab_tvsou.rb
CHANGED
@@ -78,6 +78,7 @@ module Grabepg
|
|
78
78
|
|
79
79
|
|
80
80
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,@home_page)
|
81
|
+
begin
|
81
82
|
doc.css("li").each do |li|
|
82
83
|
case ChannelTypeMap[li.get_attribute("class")]
|
83
84
|
when "央视"
|
@@ -90,6 +91,15 @@ module Grabepg
|
|
90
91
|
|
91
92
|
end
|
92
93
|
end
|
94
|
+
@error_num=0
|
95
|
+
rescue
|
96
|
+
unless @error_num
|
97
|
+
@error_num = 0
|
98
|
+
end
|
99
|
+
@error_num+=1
|
100
|
+
raise err.to_s if @error_num==5
|
101
|
+
dispose_home_page
|
102
|
+
end
|
93
103
|
return @channels
|
94
104
|
end
|
95
105
|
|
@@ -123,7 +133,17 @@ module Grabepg
|
|
123
133
|
def dispose_schedule_page(url,start_time,use_time)
|
124
134
|
url = @site +"/"+url
|
125
135
|
urls = url.split("?")
|
136
|
+
begin
|
126
137
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
138
|
+
@error_num = 0
|
139
|
+
rescue => err
|
140
|
+
unless @error_num
|
141
|
+
@error_num = 0
|
142
|
+
end
|
143
|
+
@error_num+=1
|
144
|
+
raise err.to_s if @error_num==5
|
145
|
+
dispose_schedule_page(url,start_time,use_time)
|
146
|
+
end
|
127
147
|
_url = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
|
128
148
|
_url = urls[0]+_url
|
129
149
|
urls = dispose_href_schedule_data(_url,start_time,use_time)
|
@@ -135,7 +155,9 @@ module Grabepg
|
|
135
155
|
if url
|
136
156
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url[:url])
|
137
157
|
schedules = []
|
138
|
-
doc.css('div[class="time"]')[0]
|
158
|
+
div = doc.css('div[class="time"]')[0]
|
159
|
+
if div
|
160
|
+
div.css("li[class='gray']").each do |schedule|
|
139
161
|
begin
|
140
162
|
_dispose = schedule.content
|
141
163
|
_dispose_show =schedule.css("span")[0].text
|
@@ -153,6 +175,9 @@ module Grabepg
|
|
153
175
|
end
|
154
176
|
end
|
155
177
|
ret.merge!({url[:date]=>schedules})
|
178
|
+
else
|
179
|
+
p "Error In this url: #{url} couldn't get doc.css('div[class=time]')[0]"
|
180
|
+
end
|
156
181
|
end
|
157
182
|
end
|
158
183
|
return ret
|
@@ -165,11 +190,21 @@ module Grabepg
|
|
165
190
|
#解析节目详情页面
|
166
191
|
def dispose_show_info(url)
|
167
192
|
doc = @grabbase.get_doc_with_proxy(@proxy_list,url)
|
193
|
+
begin
|
168
194
|
show_name = doc.css('div[class="tv_info_top"]')[0].content
|
169
195
|
_doc=doc.css("div[class='tv_info']")
|
170
196
|
img_url = _doc.css("img")[0].get_attribute("src").gsub(" ","")
|
171
197
|
show_info = _doc.css("p")[0].content.gsub("[全文]","")
|
198
|
+
@error_num = 0
|
172
199
|
{show_name:show_name,img_url:img_url,show_info:show_info}
|
200
|
+
rescue => err
|
201
|
+
unless @error_num
|
202
|
+
@error_num = 0
|
203
|
+
end
|
204
|
+
@error_num+=1
|
205
|
+
raise err.to_s if @error_num==5
|
206
|
+
dispose_show_info(url)
|
207
|
+
end
|
173
208
|
end
|
174
209
|
|
175
210
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: grab_epg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hahazql
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-06-
|
11
|
+
date: 2013-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: ! '"用于抓取EPG信息"'
|
14
14
|
email:
|