youtubesearchresultscraper 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (C) 2006 by in3c.org
2
+ # Copyright (C) 2006 by in3c.org, ARK-Web co., ltd
3
3
  #
4
4
  # Permission is hereby granted, free of charge, to any person obtaining
5
5
  # a copy of this software and associated documentation files (the
@@ -53,14 +53,18 @@ module Youtube #:nodoc:
53
53
  # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
54
54
  #
55
55
  # Author:: Yuki SHIDA <shida@in3c.org>
56
- # Version:: 0.0.2
56
+ # Author:: Konuma Akio <konuma@ark-web.jp>
57
+ # Version:: 0.0.3
57
58
  # License:: MIT license
58
59
 
59
60
  class SearchResultScraper
60
61
 
61
62
  attr_accessor :keyword
62
63
  attr_accessor :page
63
-
64
+ attr_reader :video_count
65
+ attr_reader :video_from
66
+ attr_reader :video_to
67
+
64
68
  @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
65
69
 
66
70
  # Create a Youtube::SearchResultScraper object, specifying the keyword and the page number.
@@ -103,9 +107,18 @@ module Youtube #:nodoc:
103
107
  video.thumbnail_url = scrape_thumbnail_url(video_html)
104
108
  video.tags = scrape_tags(video_html)
105
109
  video.url = scrape_url(video_html)
110
+
111
+ check_video video
112
+
106
113
  @videos << video
107
114
  end
108
115
 
116
+ @video_count = scrape_video_count
117
+ @video_from = scrape_video_from
118
+ @video_to = scrape_video_to
119
+
120
+ raise "scraping error" if (is_no_result != @videos.empty?)
121
+
109
122
  @videos
110
123
  end
111
124
 
@@ -118,7 +131,9 @@ module Youtube #:nodoc:
118
131
 
119
132
  # Return video information in XML format.
120
133
  def get_xml
121
- xml = "<ut_response status=\"ok\"><video_list>\n"
134
+ xml = "<ut_response status=\"ok\">" +
135
+ "<video_count>" + @video_count.to_s + "</video_count>" +
136
+ "<video_list>\n"
122
137
  each do |video|
123
138
  xml += video.to_xml
124
139
  end
@@ -139,7 +154,6 @@ module Youtube #:nodoc:
139
154
  video_html.search("div[@class='vfacets']").inner_html.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m, '\1')
140
155
  end
141
156
 
142
-
143
157
# Extract the title of a single search result.
#
# video_html:: element for one video entry; it must respond to +search+.
#
# Returns the inner HTML of the anchor found under the entry's
# div with class 'vtitle'.
def scrape_title(video_html)
  title_link = video_html.search("div[@class='vtitle']/a")
  title_link.inner_html
end
@@ -183,6 +197,51 @@ module Youtube #:nodoc:
183
197
  "http://www.youtube.com" +
184
198
  video_html.search("div[@class='vtitle']/a").to_html.sub(/.*href="(.*?)".*/m, '\1')
185
199
  end
200
+
201
# Fetch the raw text of the search-result header.
#
# Looks up the div with class 'my' inside the div whose id is
# 'sectionHeader' in @search_result and returns its inner HTML.
def scrape_result_header
  header_node = @search_result.search("div[@id='sectionHeader']/div[@class='my']")
  header_node.inner_html
end
204
+
205
# Total number of videos reported by the search-result header.
#
# Captures the digits that follow the last "of" in the text returned by
# scrape_result_header. The number is taken from the regexp capture
# rather than from a substituted copy of the header, so a header that
# does not match yields 0 instead of whatever digits it happens to
# start with.
#
# Returns an Integer (0 when the header has no "of <digits>" part).
def scrape_video_count
  scrape_result_header[/.+of *(\d+)/m, 1].to_i
end
208
+
209
# Index of the first video shown on the current result page.
#
# Captures the digits between "Results" and the following "-" in the
# text returned by scrape_result_header. Using the regexp capture means
# text that precedes "Results" no longer affects the value, and a
# header that does not match yields 0.
#
# Returns an Integer (0 when the header has no "Results <from>-" part).
def scrape_video_from
  scrape_result_header[/Results *(\d+)-.+/m, 1].to_i
end
212
+
213
# Index of the last video shown on the current result page.
#
# Captures the digits that sit between a "-" and "of" in the text
# returned by scrape_result_header. Using the regexp capture means text
# that precedes "Results" no longer affects the value, and a header
# that does not match yields 0.
#
# Returns an Integer (0 when the header has no "-<to> of" part).
def scrape_video_to
  scrape_result_header[/Results.+-(\d+) *of.+/m, 1].to_i
end
216
+
217
# Tell whether the fetched page reports that nothing matched.
#
# Returns true when the inner HTML of the div with class 'body' in
# @search_result contains the text 'No Videos found', false otherwise.
def is_no_result
  body_html = @search_result.search("div[@class='body']").inner_html
  body_html.include?('No Videos found')
end
220
+
221
# Verify that every scraped attribute of +video+ has a value.
#
# video:: object exposing a reader for each name listed in +fields+.
#
# An attribute counts as missing when it is nil, when it responds to
# +empty?+ and is empty, or when its string form is empty. Treating nil
# as missing makes an attribute that was never assigned produce the
# scraping error below instead of a NoMethodError.
#
# Returns nil when nothing is missing. Raises RuntimeError whose message
# lists every missing attribute, in the order of +fields+.
def check_video(video)
  fields = %w[author id title length_seconds rating_avg rating_count
              description view_count tags url thumbnail_url]

  missing = fields.select do |field|
    value = video.send(field)
    value.nil? || (value.respond_to?(:empty?) ? value.empty? : value.to_s.empty?)
  end
  return if missing.empty?

  # Message text is kept exactly as before so existing callers that
  # inspect it keep working.
  raise "scraping error occurred.\n" + missing.map { |field| field + " is not setted.\n" }.join
end
244
+
186
245
  end
187
246
 
188
247
  end
@@ -0,0 +1,387 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
2
+
3
+
4
+
5
+
6
+ <html>
7
+
8
+ <!-- machid: 99 -->
9
+ <head>
10
+
11
+ <title>YouTube - Broadcast Yourself.</title>
12
+
13
+ <link rel="stylesheet" href="/css/styles_yts1164775696.css" type="text/css">
14
+ <link rel="stylesheet" href="/css/base_yts1165878295.css" type="text/css">
15
+ <link rel="icon" href="/favicon.ico" type="image/x-icon">
16
+ <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
17
+
18
+
19
+ <meta name="keywords" content="video,sharing,camera phone,video phone">
20
+
21
+ <link rel="alternate" title="YouTube - [RSS]" href="/rssls">
22
+
23
+ <script type="text/javascript" src="/js/ui_yts1164777409.js"></script>
24
+ <script type="text/javascript" src="/js/AJAX_yts1161839869.js"></script>
25
+ <script type="text/javascript" src="/js/watch_queue_yts1161839869.js"></script>
26
+ <script language="javascript" type="text/javascript">
27
+ onLoadFunctionList = new Array();
28
+ function performOnLoadFunctions()
29
+ {
30
+ for (var i in onLoadFunctionList)
31
+ {
32
+ onLoadFunctionList[i]();
33
+ }
34
+ }
35
+ </script>
36
+
37
+ <script language="javascript" type="text/javascript"> function _hbLink (a,b) { return false; } </script>
38
+
39
+
40
+ </head>
41
+
42
+
43
+ <body onLoad="performOnLoadFunctions();">
44
+
45
+ <div id="baseDiv">
46
+ <div id="logoTagDiv">
47
+ <a href="/" name="&lid=Logo&lpos=GlobalNav" title="Home"><img src="/img/pic_youtubelogo_123x63.gif" alt="Home" width="123" height="63" border="0" onmouseover="showDiv('logoHomeTip');" onmouseout="hideDiv('logoHomeTip');" /></a>
48
+ </div>
49
+ <div id="logoHomeTip" style="display: none;">
50
+ Home
51
+ </div>
52
+
53
+ <div id="utilDiv">
54
+
55
+ <div style="float:right; margin-top:5px;">
56
+ <span class="utilDelim">|</span>
57
+ <a href="/recently_watched" onclick="_hbLink('ViewingHistory','UtilityLinks');">History</a>
58
+ <span class="utilDelim">|</span>
59
+ <a href="/watch_queue?all" onclick="_hbLink('QuickList','UtilityLinks');">QuickList</a>
60
+ (<span id="quicklist_numb"><a href="/watch_queue?all"><script type="text/javascript">var quicklist_count=0;document.write(quicklist_count);</script></a></span>)
61
+ <span class="utilDelim">|</span>
62
+ <a href="/t/help_center">Help</a>
63
+ <span class="utilDelim">|</span>
64
+
65
+ <a href="#" onClick="document.logoutForm.submit()">Log Out</a>
66
+ </div>
67
+
68
+ <div class="myAccountContainer" style="margin: 5px 5px 0px 3px;">
69
+ <a href="/my_account" onclick="_hbLink('MyAccount','UtilityLinks');">My Account</a>
70
+
71
+ </div>
72
+
73
+ <div id="utilNavLeftContainer">
74
+ <b>Hello, <a href="/profile?user=rubypythonjava" onclick="_hbLink('ChannelProfile','UtilityLinks');">rubypythonjava</a></b> &nbsp;
75
+ <a href="/my_messages"><img src="/img/icn_nomail_21x17.gif" valign="bottom" border="0" id="iconMail"></a> (<a class="headerLink" href="/my_messages">0</a>)
76
+ <span class="utilDelim">|</span>
77
+ </div>
78
+
79
+ <form name="logoutForm" method="post" action="/index">
80
+ <input type="hidden" name="action_logout" value="1">
81
+ </form>
82
+ </div>
83
+
84
+
85
+ <div id="searchDiv">
86
+ <form name="searchForm" id="searchForm" method="get" action="/results">
87
+ <input tabindex="10000" type="text" name="search_query" maxlength="128" class="searchField" value="doraemon vs fakfj da">
88
+ &nbsp;
89
+ <input type="submit" name="search" value="Search">
90
+ </form>
91
+
92
+ </div>
93
+
94
+ <div id="gNavDiv">
95
+
96
+
97
+ <div id="upload"><a href="/my_videos_upload"><img src="/img/pic_upload_130x28.gif" width="130" height="28" alt="upload" border="0" /></a></div>
98
+
99
+ <div class="tab">
100
+ <a href="/community"><img src="/img/tab_community_118x28.gif" width="118" height="28" border="0" alt="community" /></a></div>
101
+ <div class="tab">
102
+ <a href="/members"><img src="/img/tab_channels_118x28.gif" width="118" height="28" border="0" alt="channels" /></a></div>
103
+ <div class="tab">
104
+ <a href="/categories"><img src="/img/tab_categories_118x28.gif" width="118" height="28" border="0" alt="categories" /></a></div>
105
+ <div class="tab">
106
+ <a href="/browse?s=mp"><img src="/img/tab_videos_118x28.gif" width="118" height="28" border="0" alt="videos" /></a></div>
107
+ </div>
108
+ <!-- end gNavDiv -->
109
+ <div id="gNavBottom">&nbsp;</div>
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+ <div id="leaderboardAd">
123
+ <!-- google_ad_section_start -->
124
+
125
+
126
+
127
+
128
+
129
+
130
+ <!-- begin ad tag -->
131
+ <script type="text/javascript">
132
+ ord=Math.random()*10000000000000000 + 1;
133
+ document.write('<script language="JavaScript" src="http://ad.doubleclick.net/adj/you.results/_default;sz=728x90;kch=1600166264;kbg=FFFFFF;ksearch=doraemon%20vs%20fakfj%20da;kgender=m;kage=26;ord=' + ord + '?" type="text/javascript"><\/script>');
134
+ </script>
135
+ <noscript><a
136
+ href="http://ad.doubleclick.net/jump/you.results/_default;sz=728x90;ord=123456789?" target="_blank"><img
137
+ src="http://ad.doubleclick.net/ad/you.results/_default;sz=728x90;ord=123456789?" width="728" height="90" border="0" alt=""></a>
138
+ </noscript>
139
+ <!-- End ad tag -->
140
+
141
+
142
+
143
+
144
+ </div>
145
+
146
+ <div id="sideContent">
147
+ <div>
148
+ <!-- google_ad_section_start -->
149
+
150
+
151
+
152
+
153
+
154
+
155
+ <!-- begin ad tag -->
156
+ <script type="text/javascript">
157
+ ord=Math.random()*10000000000000000 + 2;
158
+ document.write('<script language="JavaScript" src="http://ad.doubleclick.net/adj/you.results/_default;sz=160x600;kch=1187796739;kbg=FFFFFF;ksearch=doraemon%20vs%20fakfj%20da;kgender=m;kage=26;ord=' + ord + '?" type="text/javascript"><\/script>');
159
+ </script>
160
+ <noscript><a
161
+ href="http://ad.doubleclick.net/jump/you.results/_default;sz=160x600;ord=123456789?" target="_blank"><img
162
+ src="http://ad.doubleclick.net/ad/you.results/_default;sz=160x600;ord=123456789?" width="160" height="600" border="0" alt=""></a>
163
+ </noscript>
164
+ <!-- End ad tag -->
165
+
166
+
167
+
168
+
169
+ </div>
170
+
171
+ <div class="spOffersDiv">
172
+ <h4 class="label">New on YouTube</h4>
173
+ <div class="spOffersEntry">
174
+ Do you know how not to?
175
+ <a href="/contest/hownotto">Enter for a chance to win</a>!
176
+ </div>
177
+
178
+ <div class="spOffersEntry">
179
+ There&#146;s a new way play.
180
+ <a href="/profile?user=wii">Wii from Nintendo</a>.
181
+ </div>
182
+
183
+ <div class="spOffersEntry">
184
+ Real Drama all the time.
185
+ <a href="/profile?user=TheBadGirlsClub">Check out the Bad Girls Club</a>.
186
+ </div>
187
+
188
+ <div class="spOffersEntry">
189
+ The Dark Side of Fame.
190
+ <a href="/profile?user=FXDirt">Dirt on FX</a>.
191
+ </div>
192
+
193
+ <div class="spOffersEntry">
194
+ Show Us Your Undeniable Power.
195
+ <a href="/undeniabletv">Enter for a chance to win</a> a Panasonic Plasma TV.
196
+ </div>
197
+
198
+ </div>
199
+ </div> <!-- end sideContent -->
200
+
201
+
202
+
203
+ <div id="mainContent">
204
+
205
+ <div id="sectionHeader" class="searchColor">
206
+ <div class="name">Search</div>
207
+ <span class="title"> Video <span class="normalText">results for</span>
208
+ 'doraemon vs fakfj da'
209
+ </span>
210
+ </div>
211
+
212
+
213
+ <div id="sideNav">
214
+ <div class="navHead searchColor">Search In</div>
215
+ <div class="navBody12">
216
+ <div class="label"><img src="/img/pic_selected_dot_9x9.gif" alt="selected" /> Videos</div>
217
+ <a href="/results?search_type=search_users&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Channels</a><br/>
218
+ <a href="/results?search_type=search_groups&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Groups</a><br/>
219
+ <a href="/results?search_type=search_playlists&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Playlists</a><br/>
220
+ </div>
221
+
222
+
223
+ <div class="navHead searchColor">Sort By</div>
224
+ <div class="navBody11">
225
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=relevance&search_category=0">Relevance</a><br/>
226
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_date_uploaded&search_category=0">Date Added</a><br/>
227
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_view_count&search_category=0">View Count</a><br/>
228
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_avg_rating&search_category=0">Rating</a><br/>
229
+ </div>
230
+
231
+
232
+ <div class="navHead searchColor">Refine by Category</div>
233
+ <div class="navBody11">
234
+ <div class="label"><img src="/img/pic_selected_dot_9x9.gif" alt="selected" /> All</div>
235
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=1">Arts &amp; Animation</a><br/>
236
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=2">Autos &amp; Vehicles</a><br/>
237
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=23">Comedy</a><br/>
238
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=24">Entertainment</a><br/>
239
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=10">Music</a><br/>
240
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=25">News &amp; Blogs</a><br/>
241
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=22">People</a><br/>
242
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=15">Pets &amp; Animals</a><br/>
243
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=26">Science &amp; Technology</a><br/>
244
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=17">Sports</a><br/>
245
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=19">Travel &amp; Places</a><br/>
246
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=20">Video Games</a><br/>
247
+ </div>
248
+
249
+
250
+ <div id="bottomAdDiv" style="text-align: left;">
251
+ <a href="/wishcast"><img src="/img/ad_cokewishcast_120x90.jpg" width="120" height="90" border="0" alt="Send a Holiday Wishcast"></a>
252
+ </div>
253
+
254
+ </div> <!-- end sideNav -->
255
+
256
+
257
+
258
+
259
+ <div id="mainContentWithNav">
260
+
261
+
262
+ <div class="footerBox">
263
+
264
+
265
+
266
+
267
+
268
+ </div>
269
+
270
+ </div> <!-- end mainContentWithNav -->
271
+ </div> <!-- end mainContent -->
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+ <div class="spacer">&nbsp;</div>
292
+ <div id="footerDiv">
293
+ <div id="footerContent">
294
+ <div id="footerSearch">
295
+ <form name="searchFormFooter" id="searchFormFooter" method="get" action="/results">
296
+ <input type="text" name="search_query" maxlength="128" class="searchField" value="">
297
+ &nbsp;
298
+ <input type="submit" name="search" value="Search">
299
+ </form>
300
+ </div> <!-- end footerSearch -->
301
+
302
+
303
+ <div id="footerLinks">
304
+
305
+ <table border="0" cellpadding="0" cellspacing="0" width="100%" align="center"><tr valign="top">
306
+
307
+ <td>
308
+ <div class="footColumnLeft">
309
+ <div class="footLabel">Your&nbsp;&nbsp;Account</div>
310
+ <div class="footValues">
311
+ <div class="column">
312
+ <a href="/my_videos">Videos</a><br/>
313
+ <a href="/my_favorites">Favorites</a><br/>
314
+ </div>
315
+ <div class="column">
316
+ <a href="/my_playlists">Playlists</a><br/>
317
+ <a href="/my_messages">Inbox</a><br/>
318
+ </div>
319
+ <div class="column">
320
+ <a href="/subscription_center">Subscriptions</a><br/>
321
+ <a href="/my_account">more...</a><br/>
322
+ </div>
323
+ </div>
324
+ </div>
325
+ </td>
326
+
327
+ <td>
328
+ <div class="footColumnMid">
329
+ <div class="footLabel">Help &amp; Info</div>
330
+ <div class="footValues">
331
+ <div class="column">
332
+ <a href="/t/help_center">Help Center</a><br/>
333
+ <a href="/t/video_toolbox">Video Toolbox</a><br/>
334
+ </div>
335
+ <div class="column">
336
+ <a href="/dev">Developer APIs</a><br/>
337
+ <a href="/t/safety">Safety Tips</a><br/>
338
+ </div>
339
+ <div class="column">
340
+ <a href="/t/dmca_policy">Copyright FAQ</a><br/>
341
+ <a href="/t/community_guidelines">Code of Conduct</a><br/>
342
+ </div>
343
+ </div>
344
+ </div>
345
+ </td>
346
+
347
+
348
+ <td>
349
+ <div class="footColumnRight">
350
+ <div class="footLabel">YouTube</div>
351
+ <div class="footValues">
352
+ <div class="column">
353
+ <a href="/t/about">Company Info</a><br/>
354
+ <a href="/testtube">Test Tube</a><br/>
355
+ </div>
356
+ <div class="column">
357
+ <a href="/t/terms">Terms of Use</a><br/>
358
+ <a href="/t/privacy">Privacy Policy</a><br/>
359
+ </div>
360
+ <div class="column">
361
+ <a href="/advertise">Advertising</a><br/>
362
+ <a href="/contact">Contact</a><br/>
363
+ </div>
364
+ <div class="column">
365
+ <a href="/press_room">Press</a><br/>
366
+ <a href="http://www.pcrecruiter.net/pcrbin/regmenu.exe?uid=youtube.youtube">Jobs</a><br/>
367
+ </div>
368
+ </div>
369
+ </div>
370
+ </td>
371
+
372
+ </tr></table>
373
+
374
+ </div> <!-- end footerLinks -->
375
+ </div> <!-- end footerContent -->
376
+
377
+
378
+ <div id="footerCopyright">
379
+ Copyright &copy; 2006 YouTube, Inc.
380
+ </div> <!-- end footerCopyright -->
381
+
382
+ </div> <!-- end footerDiv -->
383
+
384
+ </div> <!-- end baseDiv -->
385
+ </body>
386
+
387
+ </html>