grab_epg 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +1 -1
- data/lib/debug.rb +12 -2
- data/lib/grabepg.rb +122 -11
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
NmI0ZDZlY2FkNzVhNTNiNTQwYjU0NDNhOTNhYmUyZWUzY2ViN2U5OA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NzBkMjAxZmUwNDE1YzAzNWNmMTc1NjEwNjJlM2NhYjlmZmU2N2MzZg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MmZiNGVkM2JjMGUwZDNmMDE5OTc2NzE2NjFmMDJmMzgyODU2Nzg1ZjgxZjBl
|
10
|
+
NmVkMmU0MThhZmY1YjZhOWYxNDBlOWZjZDEwODNlYWI3MWFhNmY2ZDQ1OThl
|
11
|
+
NTFjM2VmMjgxOTc1MmNkY2EzMmU2YzNjN2JkNjA0M2FlNjllZmE=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZTBmMzY2MDkyZDEzZDVhMzQ0MDA4MmVjYTBkMGM2YmJhYTlhYzkzOWYyNjJj
|
14
|
+
ZjAxOTNmZjgwZTljNWI3MWQxMjY3Y2FmZjczODNhYWU4MGQ3YzZhNDRjOWM1
|
15
|
+
NTJlZWYxNDViNDgwZjMxNDMxZWNhYzY1NzA1OTg5ZGIwY2YyYWI=
|
data/.grabepg.gemspec
CHANGED
data/lib/debug.rb
CHANGED
@@ -5,7 +5,7 @@ require 'open-uri'
|
|
5
5
|
require File.expand_path("../grabepg.rb", __FILE__)
|
6
6
|
class Debug
|
7
7
|
# To change this template use File | Settings | File Templates.
|
8
|
-
proxylist = [""]
|
8
|
+
proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
|
9
9
|
|
10
10
|
def self.test_get_doc_with_proxy(proxylist)
|
11
11
|
herf = "http://www.tvmao.com/drama/HS5oLCs="
|
@@ -33,9 +33,19 @@ class Debug
|
|
33
33
|
Grabepg.get_show_schedule(proxylist,herf)
|
34
34
|
end
|
35
35
|
|
36
|
+
def self.test_get_schedulelist_atday(proxylist)
|
37
|
+
Grabepg.get_schedulelist_atday("CCTV1"," http://www.tvmao.com/program/CCTV-CCTV1-w1.html",proxylist)
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.debug_all
|
41
|
+
Grabepg.start
|
42
|
+
end
|
43
|
+
|
36
44
|
#Grabepg.start
|
37
45
|
#p test_get_show_schedule(proxylist)
|
38
46
|
#p test_getschedule(proxylist)
|
39
47
|
# p test_get_show_infomation(proxylist)
|
40
|
-
p test_get_assign_date_url
|
48
|
+
#p test_get_assign_date_url
|
49
|
+
# p test_get_schedulelist_atday(proxylist)
|
50
|
+
p debug_all
|
41
51
|
end
|
data/lib/grabepg.rb
CHANGED
@@ -95,17 +95,111 @@ module Grabepg
|
|
95
95
|
|
96
96
|
#调用此方法的例子
|
97
97
|
def self.start
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
98
|
+
#作用是获取俩个字符串的相似度
|
99
|
+
#get str1 and str2 similarity
|
100
|
+
get_similarity_string = lambda { |str1,str2|
|
101
|
+
_length = 0
|
102
|
+
type = 0
|
103
|
+
if str1.length>str2.length
|
104
|
+
_length=str2.length
|
105
|
+
type = 2
|
106
|
+
else
|
107
|
+
_length=str1.length
|
108
|
+
type =1
|
109
|
+
end
|
110
|
+
_str_list = []
|
111
|
+
_str = ""
|
112
|
+
for i in 0.._length
|
113
|
+
case type
|
114
|
+
when 2
|
115
|
+
n=i
|
116
|
+
0.upto(str1.length-1).each do |j|
|
117
|
+
p "N: #{n}"
|
118
|
+
if(str2[n]==str1[j])
|
119
|
+
_str =_str+str2[n]
|
120
|
+
n = n+1
|
121
|
+
p "Str = #{_str}"
|
122
|
+
else
|
123
|
+
_str_list << _str
|
124
|
+
_str = ""
|
125
|
+
end
|
126
|
+
end
|
127
|
+
when 1
|
128
|
+
n=i
|
129
|
+
0.upto(str2.length-1).each do |j|
|
130
|
+
p "N: #{n}"
|
131
|
+
if(str1[n]==str2[j])
|
132
|
+
_str =_str+str1[n]
|
133
|
+
n=n+1
|
134
|
+
p "Str = #{_str}"
|
135
|
+
else
|
136
|
+
_str_list << _str
|
137
|
+
_str = ""
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
p _str_list
|
143
|
+
_str = ""
|
144
|
+
_str_list.each do |str|
|
145
|
+
if _str.length<str.length
|
146
|
+
_str=str
|
147
|
+
end
|
148
|
+
end
|
149
|
+
_str
|
150
|
+
}
|
151
|
+
|
152
|
+
|
153
|
+
path = "/home/zql/workspace/New/smart_remote/img_path"
|
154
|
+
channel_list = Grabepg.getchannels(path)
|
155
|
+
channel_urls = channel_list['channel_urls']
|
156
|
+
channel_infos = channel_list['channel_info']
|
157
|
+
p "Channel img save file,path='#{Grabepg.img_down_path}'"
|
158
|
+
proxy_list=Grabepg.get_topfast_list(5) #get_topfast_list 参数是代表最慢用时 单位秒
|
159
|
+
|
160
|
+
|
161
|
+
#Use for Test
|
162
|
+
|
163
|
+
p "************************************"
|
164
|
+
p "proxy_list:#{proxy_list}"
|
165
|
+
p "************************************"
|
166
|
+
|
167
|
+
bool_start = false
|
168
|
+
|
169
|
+
|
105
170
|
channel_urls.each do |channel,url|
|
106
|
-
|
171
|
+
|
172
|
+
if(channel=="CCTV16")
|
173
|
+
bool_start = true
|
174
|
+
end
|
175
|
+
|
176
|
+
if bool_start
|
177
|
+
previous_show_name = ""
|
178
|
+
channel_info = channel_infos[channel]
|
179
|
+
channel_name = channel_info["channel_name"]
|
180
|
+
channel_type = channel_info["channel_type"]
|
181
|
+
channel_id = channel_info["channel_id"]
|
182
|
+
channel_img_path = channel_info["img_path"]
|
183
|
+
|
184
|
+
#channel,herf,proxylist,day_num=7
|
185
|
+
|
186
|
+
|
187
|
+
start_time=0
|
188
|
+
use_num =1
|
189
|
+
|
190
|
+
#getScheduleAssignDate参数:
|
191
|
+
# channel 频道
|
192
|
+
# herf 频道地址
|
193
|
+
# proxylist 代理列表
|
194
|
+
# start_num 开始时间 int 为开始时间与今天的差值 正数代表今天之后的第几天 负数代表今天之前的第几天
|
195
|
+
# day_num 抓取的时间段天数
|
196
|
+
# img_dir_down_path 图片网络地址保存路径 有默认值 可不设置
|
197
|
+
schedule_list=Grabepg.getScheduleAssignDate(channel,url,proxy_list,start_time,use_num) #抓取的七天后的1天的数据
|
198
|
+
|
199
|
+
|
200
|
+
end
|
201
|
+
end
|
107
202
|
end
|
108
|
-
end
|
109
203
|
|
110
204
|
def self.img_down_path
|
111
205
|
@img_down_path
|
@@ -174,6 +268,11 @@ module Grabepg
|
|
174
268
|
doc = Nokogiri::HTML(open(url)) if proxy.nil?||proxy.empty?
|
175
269
|
@no_firest = 0
|
176
270
|
rescue => err
|
271
|
+
|
272
|
+
unless @no_firest
|
273
|
+
@no_firest = 0
|
274
|
+
end
|
275
|
+
|
177
276
|
@no_firest += 1
|
178
277
|
p "*************************Proxy:#{proxy}, url:#{url}"
|
179
278
|
proxylist.delete(proxy)
|
@@ -192,13 +291,25 @@ module Grabepg
|
|
192
291
|
p "Grab: #{url}"
|
193
292
|
doc = get_doc_with_proxy(proxylist,url)
|
194
293
|
show_type = []
|
294
|
+
|
295
|
+
|
296
|
+
_img_url = "http://static.haotv.me/channel/logo/"
|
195
297
|
img_url = _img_url + channel+".jpg"
|
298
|
+
|
299
|
+
|
196
300
|
data=doc.css('div[class="mt10 clear"]')[0].content.split(" ")
|
197
301
|
date = data[0]
|
198
302
|
week = data[1]
|
199
303
|
p "Channel: #{channel} Date: #{date} Week: #{week}"
|
200
304
|
@date = "#{week}(#{date})"
|
201
305
|
schedule_list = []
|
306
|
+
|
307
|
+
_herf = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
|
308
|
+
img_url = _herf.get_attribute("src") if _herf
|
309
|
+
|
310
|
+
p "**************IMG: #{img_url}"
|
311
|
+
|
312
|
+
|
202
313
|
doc.css('ul[id="pgrow"]')[0].css("li").each do |schedule|
|
203
314
|
_herf= schedule.xpath('a[@href]')[0]
|
204
315
|
schedule_herf=_herf.get_attribute("href") if _herf
|
@@ -246,7 +357,7 @@ module Grabepg
|
|
246
357
|
wday = 1
|
247
358
|
end
|
248
359
|
|
249
|
-
end_day = wday + day_num
|
360
|
+
end_day = wday + day_num - 1
|
250
361
|
|
251
362
|
if end_day>(_wday+7)
|
252
363
|
end_day = _wday + 7
|
@@ -265,7 +376,7 @@ module Grabepg
|
|
265
376
|
|
266
377
|
|
267
378
|
#获取指定时间段的节目表
|
268
|
-
def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num,img_dir_down_path=@img_down_dir_path)
|
379
|
+
def self.getScheduleAssignDate(channel,herf,proxylist,start_num,day_num=0,img_dir_down_path=@img_down_dir_path)
|
269
380
|
begin
|
270
381
|
day_num = 1 if day_num<1
|
271
382
|
rescue
|