grab_epg 0.1.6 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.grabepg.gemspec +2 -2
- data/lib/debug.rb +4 -43
- data/lib/grab_tvmao.rb +595 -0
- data/lib/grabepg/grab_base.rb +173 -0
- data/lib/grabepg/grab_tvsou.rb +170 -0
- data/lib/grabepg.rb +4 -595
- data/lib/test/test_grab_tvsou.rb +52 -0
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MGEyOWI5Y2YwMTY2NzY1OWVmYzYwNTdmNDc4NGY4M2RlMzg2NWIwYg==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
M2Y5NzBiMzA2MTg5MGM4ZjkxMWU0N2I2MWM0OWJjZThjNmE3NDkyNQ==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
MmI4NjQwYTY3MWJhYWRkMjA1NDk2MWQ4MDdhOTg1N2ZkYmQwOTVkZDdiNjBh
|
10
|
+
NmI3M2Y0OTI2ZDcxMzE2YzEyYTMwZGQ3OGMzYzY2YTg1NjViZmMwOTkxOTgx
|
11
|
+
NGVmMDUzZmE1OTVlNTc4M2MxMWI3NzlhZDRmMGZjOWJlOWZiYTI=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YWJmNDIyNzYyNDExNzMzMjA5NjU3ZDhmYzAzMDQ2ZjYwY2U4MTU0Y2ZjMmQ0
|
14
|
+
YzRmMzljMDRmNDJlM2I5ZWI5NTJjMjRkZmZkZmY2YzVmYTMyODJkNTQyZDNj
|
15
|
+
ZWM3ZTNhNDFlN2FjYTA1MDE0N2Y3OWU5MGU4NWZlNTQ3MGQzMjU=
|
data/.grabepg.gemspec
CHANGED
@@ -3,13 +3,13 @@
|
|
3
3
|
Gem::Specification.new do |gem|
|
4
4
|
gem.authors = ["hahazql"]
|
5
5
|
gem.email = ["hahazhouqunli@gmail.com"]
|
6
|
-
gem.description = %q{"
|
6
|
+
gem.description = %q{"用于抓取EPG信息"}
|
7
7
|
gem.summary = %q{"Grab EPG"}
|
8
8
|
gem.homepage = ""
|
9
9
|
|
10
10
|
gem.files = `git ls-files`.split($\)
|
11
11
|
gem.name = "grab_epg"
|
12
12
|
gem.require_paths = ["lib"]
|
13
|
-
gem.version = "0.1.
|
13
|
+
gem.version = "0.1.8"
|
14
14
|
gem.homepage = "https://github.com/hahazql/grab_epg"
|
15
15
|
end
|
data/lib/debug.rb
CHANGED
@@ -2,50 +2,11 @@
|
|
2
2
|
|
3
3
|
require 'nokogiri'
|
4
4
|
require 'open-uri'
|
5
|
-
require File.expand_path("../
|
5
|
+
require File.expand_path("../test/test_grab_tvsou.rb", __FILE__)
|
6
|
+
#require 'test/test_grab_tvsou'
|
6
7
|
class Debug
|
7
8
|
# To change this template use File | Settings | File Templates.
|
8
|
-
proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
|
9
|
+
#proxylist = ["219.234.82.84:24809", "219.234.82.84:17130", "219.234.82.84:23684", "219.234.82.84:18253", "219.234.82.84:33987", "219.234.82.84:17183", "219.234.82.84:13243", "219.234.82.84:16158", "219.234.82.84:14826", "219.234.82.84:8489", "219.234.82.84:22222", "219.234.82.84:6370", "219.234.82.84:7571", "219.234.82.84:33944", "219.234.82.84:9743", "219.234.82.84:8089", "219.234.82.84:20991", "219.234.82.84:34032", "219.234.82.84:9415", "219.234.82.84:26149", "219.234.82.84:11095", "219.234.82.84:21724", "219.234.82.84:9177", "219.234.82.84:34034", "219.234.82.84:17945", "219.234.82.85:32229", "219.234.82.85:28341", "219.234.82.85:36314", "219.234.82.85:30605", "219.234.82.85:23684", "219.234.82.85:34015", "219.234.82.85:33919", "219.234.82.85:30639", "219.234.82.85:33965", "219.234.82.85:37299", "219.234.82.85:20747", "219.234.82.86:6666", "219.234.82.86:34106", "219.234.82.86:25301", "219.234.82.86:32896", "219.234.82.86:23034", "219.234.82.86:22685", "219.234.82.86:13078", "219.234.82.86:38770", "219.234.82.86:28402", "219.234.82.86:18887", "219.234.82.86:6588", "219.234.82.86:7292", "219.234.82.86:24268", "219.234.82.86:16472", "219.234.82.86:32597", "219.234.82.86:31122", "219.234.82.88:8817", "219.234.82.88:8160", "219.234.82.88:9239", "219.234.82.88:6133", "114.141.162.53:8080", "123.125.116.243:17656", "123.125.116.241:29156", "123.125.116.243:6938", "219.234.82.88:29484", "219.234.82.88:8084", "219.234.82.88:32229", "219.234.82.88:22758", "219.234.82.88:5616", "124.225.52.14:8080", "219.234.82.88:30028", "219.234.82.88:23685", "219.234.82.88:29037", "219.234.82.88:8755"]
|
9
10
|
|
10
|
-
|
11
|
-
herf = "http://www.tvmao.com/drama/HS5oLCs="
|
12
|
-
Grabepg.get_doc_with_proxy(proxylist,herf)
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.test_get_show_infomation(proxylist)
|
16
|
-
herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
|
17
|
-
Grabepg.get_show_infomation(proxylist,herf)
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.test_getschedule(proxylist)
|
21
|
-
channel = "HUNANTV"
|
22
|
-
herf = "/program/HUNANTV-HUNANTV-w1.html"
|
23
|
-
Grabepg.getschedule(channel,herf,proxylist,1)
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.test_get_assign_date_url
|
27
|
-
herf = "/program/HUNANTV-HUNANTV-w1.html"
|
28
|
-
Grabepg.get_assign_date_url(herf,7,1)
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.test_get_show_schedule(proxylist)
|
32
|
-
herf = "http://www.tvmao.com/tvcolumn/cVhPLQ=="
|
33
|
-
Grabepg.get_show_schedule(proxylist,herf)
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.test_get_schedulelist_atday(proxylist)
|
37
|
-
Grabepg.get_schedulelist_atday("CCTV1"," http://www.tvmao.com/program/CCTV-CCTV1-w1.html",proxylist)
|
38
|
-
end
|
39
|
-
|
40
|
-
def self.debug_all
|
41
|
-
Grabepg.start
|
42
|
-
end
|
43
|
-
|
44
|
-
#Grabepg.start
|
45
|
-
#p test_get_show_schedule(proxylist)
|
46
|
-
#p test_getschedule(proxylist)
|
47
|
-
# p test_get_show_infomation(proxylist)
|
48
|
-
#p test_get_assign_date_url
|
49
|
-
# p test_get_schedulelist_atday(proxylist)
|
50
|
-
p debug_all
|
11
|
+
p TestGrabTvsou.start
|
51
12
|
end
|
data/lib/grab_tvmao.rb
ADDED
@@ -0,0 +1,595 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
require 'nokogiri'
require 'open-uri'
require 'timeout'
|
5
|
+
|
6
|
+
module GrabTvmao
|
7
|
+
# To change this template use File | Settings | File Templates.
|
8
|
+
|
9
|
+
|
10
|
+
#图片的获取: Net::HTTP.get(url)
|
11
|
+
#图片的文件类型获取:
|
12
|
+
|
13
|
+
attr_reader :channel #channel list
attr_reader :site #website address
attr_reader :proxyindex #index into the proxy list
attr_reader :show_schedule #timetable keyed by show
attr_reader :img_down_path #where image download paths are stored

DEFAULT_GrabtvType=["cctv","satellite","digital",]
DEFAULT_SITE = "http://www.tvmao.com"
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
# Converts a weekday number into its Chinese label ("星期" + numeral).
#
# Accepts both the 1..7 numbering the original code handled and the value
# returned by Time#wday, where Sunday is 0. Sunday (0) is mapped onto day 7
# so that it no longer yields a bare "星期" without a numeral.
# NOTE(review): day 7 keeps the existing "七" suffix; confirm whether the
# target site labels Sunday differently.
#
# @param whatday [#to_i] weekday number (0..7)
# @return [String] the label, or just "星期" when the number is out of range
def self.conversion_what_day(whatday)
  numerals = {
    1 => "一", 2 => "二", 3 => "三", 4 => "四",
    5 => "五", 6 => "六", 7 => "七"
  }
  day = whatday.to_i
  day = 7 if day.zero? # Time#wday reports Sunday as 0
  "星期" + numerals.fetch(day, "")
end
|
48
|
+
|
49
|
+
# Left-pads a value with a single "0" when its string form is shorter than
# two characters (e.g. 5 -> "05"); longer values are returned unchanged.
#
# @param num [#to_s] value to format
# @return [String]
def self.dispose_time(num)
  digits = num.to_s
  digits.length < 2 ? "0" + digits : digits
end
|
57
|
+
|
58
|
+
# Formats a time as "<weekday label>(<MM>-<DD>)", using conversion_what_day
# for the weekday and dispose_time to zero-pad month and day.
#
# @param time [Time] object responding to month, day and wday
# @return [String]
def self.get_week_date_time(time)
  month = time.month
  day = time.day
  weekday = time.wday
  "#{conversion_what_day(weekday)}(#{dispose_time(month)}-#{dispose_time(day)})"
end
|
66
|
+
|
67
|
+
# Number of seconds spanned by the given number of days; used as the offset
# to subtract from a Time.
#
# @param day_num [Numeric] number of days
# @return [Numeric] day_num expressed in seconds
def self.del_day_num(day_num)
  day_num * 60 * 60 * 24
end
|
72
|
+
|
73
|
+
# Returns the formatted date label (see get_week_date_time) for the day that
# lies the given number of days before the current time.
#
# @param num [Numeric] how many days to go back from now
# @return [String]
def self.get_time_day_prior(num)
  get_week_date_time(Time.now - del_day_num(num))
end
|
79
|
+
|
80
|
+
# Builds the list of date labels to delete for the preceding week.
#
# Only produces entries when today is Monday (Time#wday == 1); in that case
# it returns the labels for day offsets 0 through 7 counted back from now.
# On any other day the list is empty.
#
# @return [Array<String>]
def self.del_time_list
  return [] unless Time.now.wday == 1

  (0..7).map { |offset| get_time_day_prior(offset) }
end
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
# Example driver showing how the module is meant to be called.
#
# Fetches the channel list, picks the proxies that answer within 5 seconds,
# then grabs today's schedule (start offset 0, one day) for every channel
# from "CCTV16" onwards in the iteration order of the channel list.
#
# The previous version also defined a string-similarity lambda that was never
# called and assigned several locals that were never read; both were removed.
#
# getScheduleAssignDate arguments, for reference:
#   channel            channel id
#   herf               channel page path
#   proxylist          list of "ip:port" proxies
#   start_num          day offset from today (positive = later, negative = earlier)
#   day_num            number of days to grab
#   img_dir_down_path  directory for the image-path file (optional)
#
# @return [Hash] the channel id => url mapping that was iterated
def self.start
  path = "/home/zql/workspace/New/smart_remote/img_path"
  channel_list = GrabTvmao.getchannels(path)
  channel_urls = channel_list['channel_urls']
  p "Channel img save file,path='#{GrabTvmao.img_down_path}'"
  proxy_list = GrabTvmao.get_topfast_list(5) # argument is the slowest acceptable response time, in seconds

  # Used for testing
  p "************************************"
  p "proxy_list:#{proxy_list}"
  p "************************************"

  bool_start = false
  start_time = 0
  use_num = 1

  channel_urls.each do |channel, url|
    # Channels listed before CCTV16 are skipped; grabbing starts at CCTV16.
    bool_start = true if channel == "CCTV16"
    next unless bool_start

    GrabTvmao.getScheduleAssignDate(channel, url, proxy_list, start_time, use_num)
  end
end
|
203
|
+
|
204
|
+
# Module-level reader for @img_down_path.
# NOTE(review): none of the methods visible in this file assign
# @img_down_path (getchannels sets @img_down_dir_path instead), so this
# appears to return nil — confirm the intended variable.
def self.img_down_path
  return @img_down_path
end
|
207
|
+
|
208
|
+
|
209
|
+
# Scrapes the site's channel tables for every type in DEFAULT_GrabtvType.
#
# For each channel cell found, the channel id is taken from the link's href,
# a logo URL is derived from that id, and one "<id>:<logo url>" line is
# written to "<img_dir_path>/channel_img_down_path".
#
# Also resets module state: @channel, @site, @proxyindex, @img_down_dir_path.
#
# The output file is now opened with a block so the handle is closed even if
# a page fetch or parse raises part-way through (it previously leaked).
# NOTE(review): pages are fetched with Kernel#open via open-uri, as before;
# newer Ruby versions require URI.open for URLs.
#
# @param img_dir_path [String] directory in which the image-path file is written
# @return [Hash] {"channel_info" => {id => details}, "channel_urls" => {id => href}}
def self.getchannels(img_dir_path)
  @channel = []
  @site = DEFAULT_SITE
  @proxyindex = 0
  @img_down_dir_path = img_dir_path

  channel_urls = {}
  channel_info = {}
  get_url = lambda { |type|
    @site + "/program/duration/#{type}/w1.html" unless (type.nil? || type.empty?)
  }
  # href looks like "/program/<GROUP>-<ID>-w1.html"; the id is the second dash-separated part
  get_channel_id = lambda { |url|
    url.split("/")[2].split("-")[1] unless (url.nil? || url.empty?)
  }

  File.open(File.join(img_dir_path, "channel_img_down_path"), 'w+') do |file|
    @img_down_file = file
    DEFAULT_GrabtvType.each do |type|
      url = get_url.call(type)
      p url
      doc = Nokogiri::HTML(open(url))
      p doc.content
      p "*************************************************************"
      doc.css('td[class="tdchn"]').each do |td|
        channel_name = td.content
        herf = ""
        # when a cell holds several links the last one wins
        td.css('a').each { |a| herf = a['href'] }
        channel_id = get_channel_id.call(herf)

        # logo URL for the channel
        img_path = "http://static.haotv.me/channel/logo/#{channel_id}.jpg"
        file.puts("#{channel_id}:#{img_path}")
        @channel << ({channel_id => {name: channel_name, herf: herf, type: type}})
        channel_info[channel_id] = {"channel_name" => channel_name, "channel_type" => type, "channel_id" => channel_id, "img_path" => img_path}
        channel_urls[channel_id] = herf
      end
    end
  end
  p "Channel: #{@channel}"
  {"channel_info" => channel_info, "channel_urls" => channel_urls}
end
|
254
|
+
|
255
|
+
# Fetches a URL and returns it parsed by Nokogiri::HTML, going through the
# proxy at the current rotation index (@proxyindex).
#
# Up to four attempts are made. Fixes over the previous version:
# * the document obtained by a retry is now returned (the recursive retry's
#   result used to be discarded, so nil came back after any failure);
# * each retry moves on to the next proxy instead of reusing the failing one;
# * the attempt counter is local, so one exhausted call no longer limits
#   later calls to a single attempt;
# * an empty or nil proxy list means "fetch directly" instead of raising
#   ZeroDivisionError.
#
# @param proxylist [Array<String>, nil] proxies as "ip:port"
# @param url [String] address to fetch
# @return [Object] whatever Nokogiri::HTML returns for the page
# @raise [RuntimeError] "Error: <cause>" once four attempts have failed
def self.get_doc_with_proxy(proxylist, url)
  proxies = Array(proxylist)
  @proxyindex ||= 0
  attempts = 0
  proxy = nil
  doc = nil

  begin
    proxy = nil
    unless proxies.empty?
      @proxyindex %= proxies.size
      # fall back to the following entry when the current slot holds nil
      proxy = proxies[@proxyindex] || proxies[@proxyindex + 1]
    end

    page = if proxy.nil? || proxy.empty?
             open(url)
           else
             open(url, :proxy => "http://#{proxy}")
           end
    doc = Nokogiri::HTML(page)
  rescue => err
    attempts += 1
    p "*************************Proxy:#{proxy}, url:#{url}"
    @proxyindex += 1 # rotate so the retry does not hit the same proxy
    raise RuntimeError, "Error: #{err}" unless attempts < 4
    retry
  end

  @proxyindex += 1
  p "*************************Proxy:#{proxy}, url:#{url}" unless doc
  doc
end
|
288
|
+
|
289
|
+
# Grabs the programme list shown on one day page of a channel.
#
# Sets @date to "<week>(<date>)" from the page header, then walks every <li>
# under ul#pgrow. Items whose text has at least two whitespace-separated
# fields become schedule entries (first field = start time, second = title).
# When an item links to a show page (directly, or through an a.drama link cut
# at "/episode/section"), get_show_infomation supplies type, name and image.
#
# Fixes over the previous version:
# * type is reset for every item, so an item without a show link no longer
#   inherits the type of the previous programme;
# * the show details are only read when get_show_infomation returned a Hash
#   (on failure it used to hand back a String, which was then indexed).
#
# @param channel [String] channel id
# @param url [String] absolute URL of the day page
# @param proxylist [Array<String>] proxies as "ip:port"
# @return [Array<Hash>] one hash per programme with the keys "schedule_name",
#   "schedule_logo", "schedule_start", "show_infomation_herf", "type", "name"
def self.get_schedulelist_atday(channel, url, proxylist)
  p "Grab: #{url}"
  doc = get_doc_with_proxy(proxylist, url)

  # default logo derived from the channel id; replaced below when the page header has its own image
  img_url = "http://static.haotv.me/channel/logo/" + channel + ".jpg"

  data = doc.css('div[class="mt10 clear"]')[0].content.split(" ")
  date = data[0]
  week = data[1]
  p "Channel: #{channel} Date: #{date} Week: #{week}"
  @date = "#{week}(#{date})"
  schedule_list = []

  header_img = doc.css("h1[style='float:left']").xpath('img[@src]')[0]
  img_url = header_img.get_attribute("src") if header_img

  p "**************IMG: #{img_url}"

  doc.css('ul[id="pgrow"]')[0].css("li").each do |item|
    schedule_herf = nil
    link = item.xpath('a[@href]')[0]
    if link
      schedule_herf = link.get_attribute("href")
    else
      drama = item.css('a[class="drama"]')[0]
      if drama
        # keep only the part of the drama link before "/episode/section"
        schedule_herf = drama.get_attribute("href").gsub("/episode/section", "#%#").split("#%#")[0]
      end
    end

    fields = item.content.split(" ")
    next unless fields.size > 1

    time = fields[0]
    schedule = fields[1]
    show_type = []
    show_name = ""
    show_img = nil
    unless schedule_herf.nil? || schedule_herf.empty?
      show_infomation = get_show_infomation(proxylist, schedule_herf)
      if show_infomation.is_a?(Hash)
        show_type = show_infomation["type"]
        show_name = show_infomation["name"]
        show_img = show_infomation["img"]
      end
    end
    p "Time: #{time} schedule: #{schedule} show_infomation_herf: #{schedule_herf} type: #{show_type} name: #{show_name} img:#{show_img}"
    schedule_list << {"schedule_name" => schedule, "schedule_logo" => show_img, "schedule_start" => time, "show_infomation_herf" => schedule_herf, "type" => show_type, "name" => show_name}
  end
  schedule_list
end
|
339
|
+
|
340
|
+
# Builds the day-page URLs ("...-w<N>.html") for a span of days.
#
# The first two dash-separated parts of +url+ are kept as the prefix and the
# trailing "w<N>.html" part is regenerated for each day number.
#
# Day numbers start at 1. Time#wday reports Sunday as 0, which previously
# produced "w0.html"; Sunday is now treated as day 7, and a computed start
# below 1 is clamped to 1 (the old check only caught negative values).
# The last day is capped at today's number plus 7, as before.
#
# @param url [String] channel path such as "/program/HUNANTV-HUNANTV-w1.html"
# @param start_time [Integer] offset from today in days (positive = later,
#   negative = earlier)
# @param day_num [Integer] how many days to cover, counted from the start day
# @return [Array<String>] absolute URLs, one per day
def self.get_assign_date_url(url, start_time, day_num)
  site = @site || "http://www.tvmao.com"

  today = Time.now.wday
  today = 7 if today.zero? # Sunday

  first_day = [today + start_time, 1].max
  last_day = [first_day + day_num - 1, today + 7].min

  parts = url.split("-")
  prefix = "#{site}#{parts[0]}-#{parts[1]}-"

  (first_day..last_day).map { |day| "#{prefix}w#{day}.html" }
end
|
375
|
+
|
376
|
+
|
377
|
+
|
378
|
+
# Grabs a channel's schedule for a span of days chosen relative to today.
#
# Opens "<img_dir_down_path>/schedule_img_down_path" for writing (it is
# stored in @img_down_file, which get_show_infomation writes to), resets
# @show_schedule, and collects one schedule list per day page returned by
# get_assign_date_url, keyed by the @date that get_schedulelist_atday sets.
#
# Fixes over the previous version:
# * the fallback directory is the directory containing this source file; the
#   old fallback was the file itself, so joining a file name onto it gave an
#   unusable path;
# * the file is closed in an ensure block, so it no longer leaks when a page
#   fetch raises.
#
# @param channel [String] channel id
# @param herf [String] channel page path
# @param proxylist [Array<String>] proxies as "ip:port"
# @param start_num [Integer] day offset from today
# @param day_num [Integer] number of days; values below 1, or values that
#   cannot be compared with 1, are treated as 1
# @param img_dir_down_path [String, nil] directory for the image-path file
# @return [Hash] {"channel_schedule" => {date => entries}, "show_schedule" => {...}}
def self.getScheduleAssignDate(channel, herf, proxylist, start_num, day_num = 0, img_dir_down_path = @img_down_dir_path)
  begin
    day_num = 1 if day_num < 1
  rescue
    day_num = 1
  end

  img_dir_down_path ||= File.dirname(__FILE__)
  @img_down_file = File.new(File.join(img_dir_down_path, "schedule_img_down_path"), "w+")
  @show_schedule = {}

  channel_schedule = {}
  begin
    get_assign_date_url(herf, start_num, day_num).each do |url|
      @date = ""
      schedule_list = get_schedulelist_atday(channel, url, proxylist)
      channel_schedule[@date] = schedule_list unless @date.empty?
    end
  ensure
    @img_down_file.close
  end
  {"channel_schedule" => channel_schedule, "show_schedule" => @show_schedule}
end
|
406
|
+
|
407
|
+
|
408
|
+
|
409
|
+
|
410
|
+
|
411
|
+
# Kept because existing callers still use it.
# Grabs a channel's schedule for day pages w1 through w<day_num>.
#
# Opens "<img_dir_down_path>/schedule_img_down_path" for writing (stored in
# @img_down_file, which get_show_infomation writes to), resets
# @show_schedule, and collects one schedule list per day page, keyed by the
# @date that get_schedulelist_atday sets.
#
# Fixes over the previous version:
# * the fallback directory is the directory containing this source file; the
#   old fallback was the file itself, so joining a file name onto it gave an
#   unusable path;
# * the file is closed in an ensure block, so it no longer leaks when a page
#   fetch raises.
#
# @param channel [String] channel id
# @param herf [String] channel page path such as "/program/X-Y-w1.html"
# @param proxylist [Array<String>] proxies as "ip:port"
# @param day_num [Integer] number of day pages; values below 1, or values
#   that cannot be compared with 1, are treated as 1
# @param img_dir_down_path [String, nil] directory for the image-path file
# @return [Hash] {"channel_schedule" => {date => entries}, "show_schedule" => {...}}
def self.getschedule(channel, herf, proxylist, day_num = 7, img_dir_down_path = @img_down_dir_path)
  p "Day Num is #{day_num}"
  begin
    day_num = 1 if day_num < 1
  rescue
    day_num = 1
  end

  site = @site || "http://www.tvmao.com"
  img_dir_down_path ||= File.dirname(__FILE__)
  @img_down_file = File.new(File.join(img_dir_down_path, "schedule_img_down_path"), "w+")
  @show_schedule = {}

  # keep the first two dash-separated parts of the path; the day suffix is regenerated
  parts = herf.split("-")
  prefix = "#{site}#{parts[0]}-#{parts[1]}-"
  week_urls = 1.upto(day_num).map { |day| "#{prefix}w#{day}.html" }

  channel_schedule = {}
  begin
    week_urls.each do |url|
      @date = ""
      schedule_list = get_schedulelist_atday(channel, url, proxylist)
      channel_schedule[@date] = schedule_list unless @date.empty?
    end
  ensure
    @img_down_file.close
  end
  {"channel_schedule" => channel_schedule, "show_schedule" => @show_schedule}
end
|
454
|
+
|
455
|
+
|
456
|
+
# Fetches the detail information of one show.
#
# Loads the show page and its "/detail" page, collecting the name, the
# poster image src (img.tvc, when present) and every genre listed in
# span/a elements with itemprop="genre" on both pages (duplicates removed).
# Also appends "<name>:<image>" to @img_down_file and records the show's
# timetable in @show_schedule the first time a name is seen.
#
# Resets @proxyindex to 0 on every call, as before.
#
# Any StandardError is logged and swallowed (best effort). The failure result
# is now a Hash with empty values; it used to be the String returned by `p`,
# which callers then indexed with ["type"], ["name"] and ["img"].
#
# @param proxy_list [Array<String>] proxies as "ip:port"
# @param schedule_herf [String] show path relative to the site root
# @return [Hash] {"type" => Array<String>, "name" => String, "img" => String or nil}
def self.get_show_infomation(proxy_list, schedule_herf)
  @proxyindex = 0
  @site ||= "http://www.tvmao.com"
  schedule_herf = @site + schedule_herf
  doc = get_doc_with_proxy(proxy_list, schedule_herf)
  name = doc.css('span[itemprop="name"]')[0].content

  # poster image of the show, if the page has one
  poster = doc.css('img[class="tvc"]')[0]
  schedule_img_down_path = poster.get_attribute('src') if poster

  genre_selectors = ['span[itemprop="genre"]', 'a[itemprop="genre"]']
  genres_of = lambda { |page|
    genre_selectors.flat_map { |selector| page.css(selector).map { |node| node.content } }
  }

  type = genres_of.call(doc)
  detail_doc = get_doc_with_proxy(proxy_list, "#{schedule_herf}/detail")
  type.concat(genres_of.call(detail_doc))
  type.uniq!

  @img_down_file.puts("#{name}:#{schedule_img_down_path}")
  @show_schedule.merge!(name => get_show_schedule(proxy_list, schedule_herf)) unless @show_schedule.has_key?(name)
  {"type" => type, "name" => name, "img" => schedule_img_down_path}
rescue => e
  p "Error In get_show_infomation msg : #{e.to_s}"
  {"type" => [], "name" => "", "img" => nil}
end
|
499
|
+
|
500
|
+
# Reads the broadcast timetable of a show from its "/playingtime" page.
#
# Every "div.c1.col" row inside div#epg except the first one is turned into
# a hash; the row's first field is split on whitespace into week, date and
# time.
#
# @param proxylist [Array<String>] proxies as "ip:port"
# @param herf [String] absolute URL of the show page
# @return [Array<Hash>] hashes with the keys "week", "date", "time",
#   "channel_name" and "show_name"
def self.get_show_schedule(proxylist, herf)
  page = get_doc_with_proxy(proxylist, herf + "/playingtime")
  rows = page.css('div[id="epg"]')[0].css("div[class='c1 col']")

  rows.drop(1).map do |row|
    when_text = row.css('div[class="f1 fld"]')[0].content
    channel_name = row.css('div[class="f2 fld"]')[0].content
    show_name = row.css('div[class="f3 fld"]')[0].content
    week, date, clock = when_text.split(" ").values_at(0, 1, 2)
    {"week" => week, "date" => date, "time" => clock, "channel_name" => channel_name, "show_name" => show_name}
  end
end
|
521
|
+
|
522
|
+
|
523
|
+
|
524
|
+
|
525
|
+
# Returns the proxies that manage to load the site's programme page within
# the given time limit.
#
# Fixes over the previous version:
# * elapsed time was measured in whole seconds and proxies with a result of
#   0 were rejected, so anything answering in under a second was dropped;
# * the acceptance threshold was hard-coded to 8 seconds regardless of
#   +use_time+; now a proxy is accepted when the fetch completes inside the
#   +use_time+ timeout;
# * `rescue Exception` was narrowed so interrupts and exit requests are no
#   longer swallowed.
#
# @param use_time [Numeric] slowest acceptable response time, in seconds
# @return [Array<String>] accepted proxies as "ip:port"
def self.get_topfast_list(use_time)
  get_proxy_list.select do |ip_port|
    started = Time.now
    begin
      Timeout.timeout(use_time) do
        Nokogiri::HTML(open("http://www.tvmao.com/program", :proxy => "http://#{ip_port}"))
      end
      p "http://#{ip_port} use_time:#{(Time.now - started).round(2)}"
      true
    rescue Errno::ETIMEDOUT, Timeout::Error
      p "Use http://#{ip_port} timeout"
      false
    rescue Errno::ECONNREFUSED
      p "Use http://#{ip_port} Error connection"
      false
    rescue StandardError => e
      p "Use http://#{ip_port} Error:#{e.to_s}"
      false
    end
  end
end
|
559
|
+
|
560
|
+
# Collects candidate proxies from the proxycn.cn listing pages.
#
# Uses the "30 fastest" page and falls back to the general HTTP page when
# the first yields nothing. Each scraped row is searched for an IP (taken
# from the whois link) and a port (a purely numeric table cell). Rows using
# port 3128 are skipped, as before.
#
# Fix: rows where no IP or no port could be extracted are now skipped. The
# old check compared the IP with "" only, which let nil through and produced
# entries such as ":9999".
#
# @return [Array<String>] proxies as "ip:port"
def self.get_proxy_list()
  list = gg('http://www.proxycn.cn/html_proxy/30fastproxy-1.html')
  list = gg('http://www.proxycn.cn/html_proxy/http-1.html') if list.count == 0

  regex_port = /(?<=<TD class="list">)[0-9]*?(?=<\/TD>)/
  regex_ip = /(?<=a href\=whois.php\?whois\=)[0-9,.]*/

  ips_ports = []
  list.each do |proxy_txt|
    port = proxy_txt[regex_port].to_s
    ip = proxy_txt[regex_ip].to_s
    next if ip.empty? || port.empty? || port.eql?('3128')

    ips_ports << "#{ip}:#{port}"
  end
  p "Count: #{ips_ports.count}"
  ips_ports
end
|
580
|
+
|
581
|
+
# Downloads the page at +url+ and returns every substring that matches a
# '<TD class="list">...</TD>' span on a single line.
#
# @param url [String] page address
# @return [Array<String>] matched fragments, in page order
def self.gg(url)
  cell_pattern = /<TD class="list">.*<\/TD>/
  page_text = ""
  URI.parse(url).open do |io|
    # each line is followed by an extra newline, as in the original accumulation
    io.each_line { |line| page_text = page_text + line + "\n" }
  end
  page_text.scan(cell_pattern)
end
|
590
|
+
|
591
|
+
# Placeholder with an empty body; it currently does nothing and returns nil.
def save_img; end
|
594
|
+
|
595
|
+
end
|