youtubesearchresultscraper 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (C) 2006 by in3c.org
2
+ # Copyright (C) 2006 by in3c.org, ARK-Web co., ltd
3
3
  #
4
4
  # Permission is hereby granted, free of charge, to any person obtaining
5
5
  # a copy of this software and associated documentation files (the
@@ -53,14 +53,18 @@ module Youtube #:nodoc:
53
53
  # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
54
54
  #
55
55
  # Author:: Yuki SHIDA <shida@in3c.org>
56
- # Version:: 0.0.2
56
+ # Author:: Konuma Akio <konuma@ark-web.jp>
57
+ # Version:: 0.0.3
57
58
  # License:: MIT license
58
59
 
59
60
  class SearchResultScraper
60
61
 
61
62
  attr_accessor :keyword
62
63
  attr_accessor :page
63
-
64
+ attr_reader :video_count
65
+ attr_reader :video_from
66
+ attr_reader :video_to
67
+
64
68
  @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
65
69
 
66
70
  # Create Youtube::SearchResultScraper object specifying keyword and number of page.
@@ -103,9 +107,18 @@ module Youtube #:nodoc:
103
107
  video.thumbnail_url = scrape_thumbnail_url(video_html)
104
108
  video.tags = scrape_tags(video_html)
105
109
  video.url = scrape_url(video_html)
110
+
111
+ check_video video
112
+
106
113
  @videos << video
107
114
  end
108
115
 
116
+ @video_count = scrape_video_count
117
+ @video_from = scrape_video_from
118
+ @video_to = scrape_video_to
119
+
120
+ raise "scraping error" if (is_no_result != @videos.empty?)
121
+
109
122
  @videos
110
123
  end
111
124
 
@@ -118,7 +131,9 @@ module Youtube #:nodoc:
118
131
 
119
132
  # Return videos information as XML Format.
120
133
  def get_xml
121
- xml = "<ut_response status=\"ok\"><video_list>\n"
134
+ xml = "<ut_response status=\"ok\">" +
135
+ "<video_count>" + @video_count.to_s + "</video_count>" +
136
+ "<video_list>\n"
122
137
  each do |video|
123
138
  xml += video.to_xml
124
139
  end
@@ -139,7 +154,6 @@ module Youtube #:nodoc:
139
154
  video_html.search("div[@class='vfacets']").inner_html.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m, '\1')
140
155
  end
141
156
 
142
-
143
157
  def scrape_title video_html
144
158
  video_html.search("div[@class='vtitle']/a").inner_html
145
159
  end
@@ -183,6 +197,51 @@ module Youtube #:nodoc:
183
197
  "http://www.youtube.com" +
184
198
  video_html.search("div[@class='vtitle']/a").to_html.sub(/.*href="(.*?)".*/m, '\1')
185
199
  end
200
+
201
+ def scrape_result_header
202
+ @search_result.search("div[@id='sectionHeader']/div[@class='my']").inner_html
203
+ end
204
+
205
+ def scrape_video_count
206
+ scrape_result_header.sub(/.+of *(\d+)/m , '\1').to_i
207
+ end
208
+
209
+ def scrape_video_from
210
+ scrape_result_header.sub(/Results *(\d+)-.+/m, '\1').to_i
211
+ end
212
+
213
+ def scrape_video_to
214
+ scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1').to_i
215
+ end
216
+
217
+ def is_no_result
218
+ @search_result.search("div[@class='body']").inner_html.include?('No Videos found')
219
+ end
220
+
221
+ def check_video video
222
+ errors = []
223
+
224
+ errors << "author" if video.author.empty?
225
+ errors << "id" if video.id.empty?
226
+ errors << "title" if video.title.empty?
227
+ errors << "length_seconds" if video.length_seconds.to_s.empty?
228
+ errors << "rating_avg" if video.rating_avg.to_s.empty?
229
+ errors << "rating_count" if video.rating_count.to_s.empty?
230
+ errors << "description" if video.description.empty?
231
+ errors << "view_count" if video.view_count.to_s.empty?
232
+ errors << "tags" if video.tags.empty?
233
+ errors << "url" if video.url.empty?
234
+ errors << "thumbnail_url" if video.thumbnail_url.empty?
235
+
236
+ unless errors.empty? then
237
+ error_msg = "scraping error occurred.\n"
238
+ errors.each do |error|
239
+ error_msg << error + " is not setted.\n"
240
+ end
241
+ raise error_msg
242
+ end
243
+ end
244
+
186
245
  end
187
246
 
188
247
  end
@@ -0,0 +1,387 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
2
+
3
+
4
+
5
+
6
+ <html>
7
+
8
+ <!-- machid: 99 -->
9
+ <head>
10
+
11
+ <title>YouTube - Broadcast Yourself.</title>
12
+
13
+ <link rel="stylesheet" href="/css/styles_yts1164775696.css" type="text/css">
14
+ <link rel="stylesheet" href="/css/base_yts1165878295.css" type="text/css">
15
+ <link rel="icon" href="/favicon.ico" type="image/x-icon">
16
+ <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
17
+
18
+
19
+ <meta name="keywords" content="video,sharing,camera phone,video phone">
20
+
21
+ <link rel="alternate" title="YouTube - [RSS]" href="/rssls">
22
+
23
+ <script type="text/javascript" src="/js/ui_yts1164777409.js"></script>
24
+ <script type="text/javascript" src="/js/AJAX_yts1161839869.js"></script>
25
+ <script type="text/javascript" src="/js/watch_queue_yts1161839869.js"></script>
26
+ <script language="javascript" type="text/javascript">
27
+ onLoadFunctionList = new Array();
28
+ function performOnLoadFunctions()
29
+ {
30
+ for (var i in onLoadFunctionList)
31
+ {
32
+ onLoadFunctionList[i]();
33
+ }
34
+ }
35
+ </script>
36
+
37
+ <script language="javascript" type="text/javascript"> function _hbLink (a,b) { return false; } </script>
38
+
39
+
40
+ </head>
41
+
42
+
43
+ <body onLoad="performOnLoadFunctions();">
44
+
45
+ <div id="baseDiv">
46
+ <div id="logoTagDiv">
47
+ <a href="/" name="&lid=Logo&lpos=GlobalNav" title="Home"><img src="/img/pic_youtubelogo_123x63.gif" alt="Home" width="123" height="63" border="0" onmouseover="showDiv('logoHomeTip');" onmouseout="hideDiv('logoHomeTip');" /></a>
48
+ </div>
49
+ <div id="logoHomeTip" style="display: none;">
50
+ Home
51
+ </div>
52
+
53
+ <div id="utilDiv">
54
+
55
+ <div style="float:right; margin-top:5px;">
56
+ <span class="utilDelim">|</span>
57
+ <a href="/recently_watched" onclick="_hbLink('ViewingHistory','UtilityLinks');">History</a>
58
+ <span class="utilDelim">|</span>
59
+ <a href="/watch_queue?all" onclick="_hbLink('QuickList','UtilityLinks');">QuickList</a>
60
+ (<span id="quicklist_numb"><a href="/watch_queue?all"><script type="text/javascript">var quicklist_count=0;document.write(quicklist_count);</script></a></span>)
61
+ <span class="utilDelim">|</span>
62
+ <a href="/t/help_center">Help</a>
63
+ <span class="utilDelim">|</span>
64
+
65
+ <a href="#" onClick="document.logoutForm.submit()">Log Out</a>
66
+ </div>
67
+
68
+ <div class="myAccountContainer" style="margin: 5px 5px 0px 3px;">
69
+ <a href="/my_account" onclick="_hbLink('MyAccount','UtilityLinks');">My Account</a>
70
+
71
+ </div>
72
+
73
+ <div id="utilNavLeftContainer">
74
+ <b>Hello, <a href="/profile?user=rubypythonjava" onclick="_hbLink('ChannelProfile','UtilityLinks');">rubypythonjava</a></b> &nbsp;
75
+ <a href="/my_messages"><img src="/img/icn_nomail_21x17.gif" valign="bottom" border="0" id="iconMail"></a> (<a class="headerLink" href="/my_messages">0</a>)
76
+ <span class="utilDelim">|</span>
77
+ </div>
78
+
79
+ <form name="logoutForm" method="post" action="/index">
80
+ <input type="hidden" name="action_logout" value="1">
81
+ </form>
82
+ </div>
83
+
84
+
85
+ <div id="searchDiv">
86
+ <form name="searchForm" id="searchForm" method="get" action="/results">
87
+ <input tabindex="10000" type="text" name="search_query" maxlength="128" class="searchField" value="doraemon vs fakfj da">
88
+ &nbsp;
89
+ <input type="submit" name="search" value="Search">
90
+ </form>
91
+
92
+ </div>
93
+
94
+ <div id="gNavDiv">
95
+
96
+
97
+ <div id="upload"><a href="/my_videos_upload"><img src="/img/pic_upload_130x28.gif" width="130" height="28" alt="upload" border="0" /></a></div>
98
+
99
+ <div class="tab">
100
+ <a href="/community"><img src="/img/tab_community_118x28.gif" width="118" height="28" border="0" alt="community" /></a></div>
101
+ <div class="tab">
102
+ <a href="/members"><img src="/img/tab_channels_118x28.gif" width="118" height="28" border="0" alt="channels" /></a></div>
103
+ <div class="tab">
104
+ <a href="/categories"><img src="/img/tab_categories_118x28.gif" width="118" height="28" border="0" alt="categories" /></a></div>
105
+ <div class="tab">
106
+ <a href="/browse?s=mp"><img src="/img/tab_videos_118x28.gif" width="118" height="28" border="0" alt="videos" /></a></div>
107
+ </div>
108
+ <!-- end gNavDiv -->
109
+ <div id="gNavBottom">&nbsp;</div>
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+
120
+
121
+
122
+ <div id="leaderboardAd">
123
+ <!-- google_ad_section_start -->
124
+
125
+
126
+
127
+
128
+
129
+
130
+ <!-- begin ad tag -->
131
+ <script type="text/javascript">
132
+ ord=Math.random()*10000000000000000 + 1;
133
+ document.write('<script language="JavaScript" src="http://ad.doubleclick.net/adj/you.results/_default;sz=728x90;kch=1600166264;kbg=FFFFFF;ksearch=doraemon%20vs%20fakfj%20da;kgender=m;kage=26;ord=' + ord + '?" type="text/javascript"><\/script>');
134
+ </script>
135
+ <noscript><a
136
+ href="http://ad.doubleclick.net/jump/you.results/_default;sz=728x90;ord=123456789?" target="_blank"><img
137
+ src="http://ad.doubleclick.net/ad/you.results/_default;sz=728x90;ord=123456789?" width="728" height="90" border="0" alt=""></a>
138
+ </noscript>
139
+ <!-- End ad tag -->
140
+
141
+
142
+
143
+
144
+ </div>
145
+
146
+ <div id="sideContent">
147
+ <div>
148
+ <!-- google_ad_section_start -->
149
+
150
+
151
+
152
+
153
+
154
+
155
+ <!-- begin ad tag -->
156
+ <script type="text/javascript">
157
+ ord=Math.random()*10000000000000000 + 2;
158
+ document.write('<script language="JavaScript" src="http://ad.doubleclick.net/adj/you.results/_default;sz=160x600;kch=1187796739;kbg=FFFFFF;ksearch=doraemon%20vs%20fakfj%20da;kgender=m;kage=26;ord=' + ord + '?" type="text/javascript"><\/script>');
159
+ </script>
160
+ <noscript><a
161
+ href="http://ad.doubleclick.net/jump/you.results/_default;sz=160x600;ord=123456789?" target="_blank"><img
162
+ src="http://ad.doubleclick.net/ad/you.results/_default;sz=160x600;ord=123456789?" width="160" height="600" border="0" alt=""></a>
163
+ </noscript>
164
+ <!-- End ad tag -->
165
+
166
+
167
+
168
+
169
+ </div>
170
+
171
+ <div class="spOffersDiv">
172
+ <h4 class="label">New on YouTube</h4>
173
+ <div class="spOffersEntry">
174
+ Do you know how not to?
175
+ <a href="/contest/hownotto">Enter for a chance to win</a>!
176
+ </div>
177
+
178
+ <div class="spOffersEntry">
179
+ There&#146;s a new way play.
180
+ <a href="/profile?user=wii">Wii from Nintendo</a>.
181
+ </div>
182
+
183
+ <div class="spOffersEntry">
184
+ Real Drama all the time.
185
+ <a href="/profile?user=TheBadGirlsClub">Check out the Bad Girls Club</a>.
186
+ </div>
187
+
188
+ <div class="spOffersEntry">
189
+ The Dark Side of Fame.
190
+ <a href="/profile?user=FXDirt">Dirt on FX</a>.
191
+ </div>
192
+
193
+ <div class="spOffersEntry">
194
+ Show Us Your Undeniable Power.
195
+ <a href="/undeniabletv">Enter for a chance to win</a> a Panasonic Plasma TV.
196
+ </div>
197
+
198
+ </div>
199
+ </div> <!-- end sideContent -->
200
+
201
+
202
+
203
+ <div id="mainContent">
204
+
205
+ <div id="sectionHeader" class="searchColor">
206
+ <div class="name">Search</div>
207
+ <span class="title"> Video <span class="normalText">results for</span>
208
+ 'doraemon vs fakfj da'
209
+ </span>
210
+ </div>
211
+
212
+
213
+ <div id="sideNav">
214
+ <div class="navHead searchColor">Search In</div>
215
+ <div class="navBody12">
216
+ <div class="label"><img src="/img/pic_selected_dot_9x9.gif" alt="selected" /> Videos</div>
217
+ <a href="/results?search_type=search_users&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Channels</a><br/>
218
+ <a href="/results?search_type=search_groups&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Groups</a><br/>
219
+ <a href="/results?search_type=search_playlists&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=0">Playlists</a><br/>
220
+ </div>
221
+
222
+
223
+ <div class="navHead searchColor">Sort By</div>
224
+ <div class="navBody11">
225
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=relevance&search_category=0">Relevance</a><br/>
226
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_date_uploaded&search_category=0">Date Added</a><br/>
227
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_view_count&search_category=0">View Count</a><br/>
228
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=video_avg_rating&search_category=0">Rating</a><br/>
229
+ </div>
230
+
231
+
232
+ <div class="navHead searchColor">Refine by Category</div>
233
+ <div class="navBody11">
234
+ <div class="label"><img src="/img/pic_selected_dot_9x9.gif" alt="selected" /> All</div>
235
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=1">Arts &amp; Animation</a><br/>
236
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=2">Autos &amp; Vehicles</a><br/>
237
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=23">Comedy</a><br/>
238
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=24">Entertainment</a><br/>
239
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=10">Music</a><br/>
240
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=25">News &amp; Blogs</a><br/>
241
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=22">People</a><br/>
242
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=15">Pets &amp; Animals</a><br/>
243
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=26">Science &amp; Technology</a><br/>
244
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=17">Sports</a><br/>
245
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=19">Travel &amp; Places</a><br/>
246
+ <a href="/results?search_type=search_videos&search_query=doraemon%20vs%20fakfj%20da&search_sort=&search_category=20">Video Games</a><br/>
247
+ </div>
248
+
249
+
250
+ <div id="bottomAdDiv" style="text-align: left;">
251
+ <a href="/wishcast"><img src="/img/ad_cokewishcast_120x90.jpg" width="120" height="90" border="0" alt="Send a Holiday Wishcast"></a>
252
+ </div>
253
+
254
+ </div> <!-- end sideNav -->
255
+
256
+
257
+
258
+
259
+ <div id="mainContentWithNav">
260
+
261
+
262
+ <div class="footerBox">
263
+
264
+
265
+
266
+
267
+
268
+ </div>
269
+
270
+ </div> <!-- end mainContentWithNav -->
271
+ </div> <!-- end mainContent -->
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+ <div class="spacer">&nbsp;</div>
292
+ <div id="footerDiv">
293
+ <div id="footerContent">
294
+ <div id="footerSearch">
295
+ <form name="searchFormFooter" id="searchFormFooter" method="get" action="/results">
296
+ <input type="text" name="search_query" maxlength="128" class="searchField" value="">
297
+ &nbsp;
298
+ <input type="submit" name="search" value="Search">
299
+ </form>
300
+ </div> <!-- end footerSearch -->
301
+
302
+
303
+ <div id="footerLinks">
304
+
305
+ <table border="0" cellpadding="0" cellspacing="0" width="100%" align="center"><tr valign="top">
306
+
307
+ <td>
308
+ <div class="footColumnLeft">
309
+ <div class="footLabel">Your&nbsp;&nbsp;Account</div>
310
+ <div class="footValues">
311
+ <div class="column">
312
+ <a href="/my_videos">Videos</a><br/>
313
+ <a href="/my_favorites">Favorites</a><br/>
314
+ </div>
315
+ <div class="column">
316
+ <a href="/my_playlists">Playlists</a><br/>
317
+ <a href="/my_messages">Inbox</a><br/>
318
+ </div>
319
+ <div class="column">
320
+ <a href="/subscription_center">Subscriptions</a><br/>
321
+ <a href="/my_account">more...</a><br/>
322
+ </div>
323
+ </div>
324
+ </div>
325
+ </td>
326
+
327
+ <td>
328
+ <div class="footColumnMid">
329
+ <div class="footLabel">Help &amp; Info</div>
330
+ <div class="footValues">
331
+ <div class="column">
332
+ <a href="/t/help_center">Help Center</a><br/>
333
+ <a href="/t/video_toolbox">Video Toolbox</a><br/>
334
+ </div>
335
+ <div class="column">
336
+ <a href="/dev">Developer APIs</a><br/>
337
+ <a href="/t/safety">Safety Tips</a><br/>
338
+ </div>
339
+ <div class="column">
340
+ <a href="/t/dmca_policy">Copyright FAQ</a><br/>
341
+ <a href="/t/community_guidelines">Code of Conduct</a><br/>
342
+ </div>
343
+ </div>
344
+ </div>
345
+ </td>
346
+
347
+
348
+ <td>
349
+ <div class="footColumnRight">
350
+ <div class="footLabel">YouTube</div>
351
+ <div class="footValues">
352
+ <div class="column">
353
+ <a href="/t/about">Company Info</a><br/>
354
+ <a href="/testtube">Test Tube</a><br/>
355
+ </div>
356
+ <div class="column">
357
+ <a href="/t/terms">Terms of Use</a><br/>
358
+ <a href="/t/privacy">Privacy Policy</a><br/>
359
+ </div>
360
+ <div class="column">
361
+ <a href="/advertise">Advertising</a><br/>
362
+ <a href="/contact">Contact</a><br/>
363
+ </div>
364
+ <div class="column">
365
+ <a href="/press_room">Press</a><br/>
366
+ <a href="http://www.pcrecruiter.net/pcrbin/regmenu.exe?uid=youtube.youtube">Jobs</a><br/>
367
+ </div>
368
+ </div>
369
+ </div>
370
+ </td>
371
+
372
+ </tr></table>
373
+
374
+ </div> <!-- end footerLinks -->
375
+ </div> <!-- end footerContent -->
376
+
377
+
378
+ <div id="footerCopyright">
379
+ Copyright &copy; 2006 YouTube, Inc.
380
+ </div> <!-- end footerCopyright -->
381
+
382
+ </div> <!-- end footerDiv -->
383
+
384
+ </div> <!-- end baseDiv -->
385
+ </body>
386
+
387
+ </html>