youtubesearchresultscraper 0.0.3 → 0.0.4

data/CHANGELOG CHANGED
@@ -5,3 +5,10 @@
  Add rdoc.
  Add each() method into YouTube::SearchResultScraper
 
+ 0.0.3 2006-12-22
+ Add error check.
+ Add attribute for video_count, video_from, video_to
+
+ 0.0.4 2007-02-01
+ Add error check for scraping of pagination.
+ Fix scraping rule for html markup change of youtube.
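For orientation, a minimal usage sketch of the pagination attributes introduced in 0.0.3 (video_count, video_from, video_to); the keyword "ruby" and page 1 are placeholder values, and live access to youtube.com is assumed:

    require "rubygems"
    require "youtube/searchresultscraper"

    # Placeholder search: any UTF-8 keyword and page number will do.
    scraper = Youtube::SearchResultScraper.new("ruby", 1)
    scraper.open
    scraper.scrape

    puts scraper.video_count   # total number of videos matching the keyword
    puts scraper.video_from    # index of the first video on this result page
    puts scraper.video_to      # index of the last video on this result page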
@@ -48,7 +48,7 @@ module Youtube #:nodoc:
  # scraper.open
  # scraper.scrape
  # puts scraper.get_xml
- #
+ #
  # = More Information
  # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
  #
@@ -64,7 +64,7 @@ module Youtube #:nodoc:
  attr_reader :video_count
  attr_reader :video_from
  attr_reader :video_to
-
+
  @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
 
  # Create Youtube::SearchResultScraper object specifying keyword and number of page.
@@ -123,7 +123,7 @@ module Youtube #:nodoc:
  end
 
  # Iterator for scraped videos.
- def each
+ def each
  @videos.each do |video|
    yield video
  end
@@ -131,7 +131,7 @@ module Youtube #:nodoc:
 
  # Return videos information as XML Format.
  def get_xml
- xml = "<ut_response status=\"ok\">" +
+ xml = "<ut_response status=\"ok\">" +
  "<video_count>" + @video_count.to_s + "</video_count>" +
  "<video_list>\n"
  each do |video|
@@ -199,23 +199,29 @@ module Youtube #:nodoc:
  end
 
  def scrape_result_header
- @search_result.search("div[@id='sectionHeader']/div[@class='my']").inner_html
+ @search_result.search("div[@id='sectionHeader']/div").inner_html
  end
 
  def scrape_video_count
- scrape_result_header.sub(/.+of *(\d+)/m , '\1').to_i
+ video_count = scrape_result_header.sub(/.+of *(\d+)/m , '\1')
+ raise "no video count" if video_count.empty?
+ video_count.to_i
  end
 
  def scrape_video_from
- scrape_result_header.sub(/Results *(\d+)-.+/m, '\1').to_i
+ video_from = scrape_result_header.sub(/Results *(\d+)-.+/m, '\1')
+ raise "no video from" if video_from.empty?
+ video_from.to_i
  end
 
  def scrape_video_to
- scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1').to_i
+ video_to = scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1')
+ raise "no video to" if video_to.empty?
+ video_to.to_i
  end
 
  def is_no_result
- @search_result.search("div[@class='body']").inner_html.include?('No Videos found')
+ @html.include?('No Videos found')
  end
 
  def check_video video
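Where the 0.0.3 code coerced an unmatched result header straight to an integer, 0.0.4 raises a RuntimeError ("no video count", "no video from", "no video to", or "scraping error") when YouTube's markup cannot be scraped. A sketch of how a caller might handle that, with a placeholder keyword:

    scraper = Youtube::SearchResultScraper.new("ruby")  # placeholder keyword
    begin
      scraper.open
      scraper.scrape
      scraper.each { |video| puts video.title }
    rescue RuntimeError => e
      # e.message carries one of the messages raised above, e.g. "no video count".
      warn "scraping failed (YouTube markup may have changed): #{e.message}"
    end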
@@ -0,0 +1,250 @@
+ #--
+ # Copyright (C) 2006 by in3c.org, ARK-Web co., ltd
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining
+ # a copy of this software and associated documentation files (the
+ # "Software"), to deal in the Software without restriction, including
+ # without limitation the rights to use, copy, modify, merge, publish,
+ # distribute, sublicense, and/or sell copies of the Software, and to
+ # permit persons to whom the Software is furnished to do so, subject to
+ # the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be
+ # included in all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #++
+ # :main:Youtube::SearchResultScraper
+ # :title:Youtube::SearchResultScraper RDoc Documentation
+
+ require 'open-uri'
+ require 'cgi'
+ require 'rubygems'
+ require 'hpricot'
+ require 'youtube/video'
+
+ module Youtube #:nodoc:
+
+   # = Introduction
+   # Youtube::SearchResultScraper scrapes video information from search result page
+   # on http://www.youtube.com.
+   #
+   # You can get result as array or xml.
+   #
+   # XML format is same as YouTube Developer API
+   # (http://www.youtube.com/dev_api_ref?m=youtube.videos.list_by_tag).
+   #
+   # = Example
+   # require "rubygems"
+   # require "youtube/searchresultscraper"
+   #
+   # scraper = Youtube::SearchResultScraper.new(keyword, page)
+   # scraper.open
+   # scraper.scrape
+   # puts scraper.get_xml
+   #
+   # = More Information
+   # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
+   #
+   # Author:: Yuki SHIDA <shida@in3c.org>
+   # Author:: Konuma Akio <konuma@ark-web.jp>
+   # Version:: 0.0.3
+   # License:: MIT license
+
+   class SearchResultScraper
+
+     attr_accessor :keyword
+     attr_accessor :page
+     attr_reader :video_count
+     attr_reader :video_from
+     attr_reader :video_to
+
+     @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
+
+     # Create Youtube::SearchResultScraper object specifying keyword and number of page.
+     #
+     # You cannot specify number of videos per page.
+     # Always, the number of videos is 20 per page.
+     #
+     # * keyword - specify keyword that you want to search on YouTube.
+     #   You must specify keyword encoded by UTF-8.
+     # * page - specify number of page
+
+     def initialize keyword, page=nil
+       @keyword = keyword
+       @page = page if not page == nil
+     end
+
+     # Get search result from youtube by specified keyword.
+     def open
+       url = @@youtube_search_base_url + CGI.escape(@keyword)
+       url += "&page=#{@page}" if not @page == nil
+       @html = Kernel.open(url).read
+       replace_document_write_javascript
+       @search_result = Hpricot.parse(@html)
+     end
+
+     # Scrape video information from search result html.
+     def scrape
+       @videos = []
+
+       @search_result.search("//div[@class='vEntry']").each do |video_html|
+         video = Youtube::Video.new
+         video.id = scrape_id(video_html)
+         video.author = scrape_author(video_html)
+         video.title = scrape_title(video_html)
+         video.length_seconds = scrape_length_seconds(video_html)
+         video.rating_avg = scrape_rating_avg(video_html)
+         video.rating_count = scrape_rating_count(video_html)
+         video.description = scrape_description(video_html)
+         video.view_count = scrape_view_count(video_html)
+         video.thumbnail_url = scrape_thumbnail_url(video_html)
+         video.tags = scrape_tags(video_html)
+         video.url = scrape_url(video_html)
+
+         check_video video
+
+         @videos << video
+       end
+
+       @video_count = scrape_video_count
+       @video_from = scrape_video_from
+       @video_to = scrape_video_to
+       raise "no video count" if @video_count == nil
+       raise "no video from" if @video_from == nil
+       raise "no video to" if @video_to == nil
+
+       raise "scraping error" if (is_no_result != @videos.empty?)
+
+       @videos
+     end
+
+     # Iterator for scraped videos.
+     def each
+       @videos.each do |video|
+         yield video
+       end
+     end
+
+     # Return videos information as XML Format.
+     def get_xml
+       xml = "<ut_response status=\"ok\">" +
+         "<video_count>" + @video_count.to_s + "</video_count>" +
+         "<video_list>\n"
+       each do |video|
+         xml += video.to_xml
+       end
+       xml += "</video_list></ut_response>"
+     end
+
+     private
+
+     def replace_document_write_javascript
+       @html.gsub!(%r{<script language="javascript" type="text/javascript">.*?document.write\('(.*?)'\).*?</script>}m, '\1')
+     end
+
+     def scrape_id video_html
+       scrape_thumbnail_url(video_html).sub(%r{.*/([^/]+)/[^/]+.jpg}, '\1')
+     end
+
+     def scrape_author video_html
+       video_html.search("div[@class='vfacets']").inner_html.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m, '\1')
+     end
+
+     def scrape_title video_html
+       video_html.search("div[@class='vtitle']/a").inner_html
+     end
+
+     def scrape_length_seconds video_html
+       length_seconds = video_html.search("span[@class='runtime']").inner_html
+       length_seconds =~ /(\d\d):(\d\d)/
+       $1.to_i * 60 + $2.to_i
+     end
+
+     def scrape_rating_avg video_html
+       video_html.search("img[@src='/img/star_sm.gif']").size +
+         video_html.search("img[@src='/img/star_sm_half.gif']").size * 0.5
+     end
+
+     def scrape_rating_count video_html
+       video_html.search("div[@class='rating']").inner_html.sub(/(\d+) rating/, '\1').to_i
+     end
+
+     def scrape_description video_html
+       description = video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\n\t(.*?)\n\t$/m, '\1')
+     end
+
+     def scrape_view_count video_html
+       video_html.search("div[@class='vfacets']").inner_html.sub(/.*Views:<\/span> (\d+).*/m, '\1').to_i
+     end
+
+     def scrape_tags video_html
+       tags = []
+       video_html.search("div[@class='vtagValue']/a").each do |tag|
+         tags << tag.inner_html
+       end
+       tags.join(" ")
+     end
+
+     def scrape_thumbnail_url video_html
+       video_html.search("img[@class='vimg120']").to_html.sub(/.*src="(.*?)".*/, '\1')
+     end
+
+     def scrape_url video_html
+       "http://www.youtube.com" +
+         video_html.search("div[@class='vtitle']/a").to_html.sub(/.*href="(.*?)".*/m, '\1')
+     end
+
+     def scrape_result_header
+       @search_result.search("div[@id='sectionHeader']/div[@class='my']").inner_html
+     end
+
+     def scrape_video_count
+       scrape_result_header.sub(/.+of *(\d+)/m , '\1').to_i
+     end
+
+     def scrape_video_from
+       scrape_result_header.sub(/Results *(\d+)-.+/m, '\1').to_i
+     end
+
+     def scrape_video_to
+       scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1').to_i
+     end
+
+     def is_no_result
+       @search_result.search("div[@class='body']").inner_html.include?('No Videos found')
+     end
+
+     def check_video video
+       errors = []
+
+       errors << "author" if video.author.empty?
+       errors << "id" if video.id.empty?
+       errors << "title" if video.title.empty?
+       errors << "length_seconds" if video.length_seconds.to_s.empty?
+       errors << "rating_avg" if video.rating_avg.to_s.empty?
+       errors << "rating_count" if video.rating_count.to_s.empty?
+       errors << "description" if video.description.empty?
+       errors << "view_count" if video.view_count.to_s.empty?
+       errors << "tags" if video.tags.empty?
+       errors << "url" if video.url.empty?
+       errors << "thumbnail_url" if video.thumbnail_url.empty?
+
+       unless errors.empty? then
+         error_msg = "scraping error occurred.\n"
+         errors.each do |error|
+           error_msg << error + " is not setted.\n"
+         end
+         raise error_msg
+       end
+     end
+
+   end
+
+ end
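For reference, a short sketch of consuming scraped results through each; it assumes Youtube::Video (defined in youtube/video, not shown in this diff) exposes readers for the fields that scrape assigns:

    scraper = Youtube::SearchResultScraper.new("ruby", 1)  # placeholder keyword and page
    scraper.open
    videos = scraper.scrape            # scrape also returns the array of videos

    scraper.each do |video|
      # Fields populated by scrape above.
      puts "#{video.title} (#{video.view_count} views, #{video.length_seconds}s)"
      puts "  #{video.url}"
    end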
@@ -1,13 +1,13 @@
  #!/usr/bin/ruby
 
- require "runit/testcase"
- require "runit/cui/testrunner"
+ require 'test/unit'
+
 
  require "rubygems"
  require 'hpricot'
  require "youtube/searchresultscraper"
 
- class SearchResultScraperTest < RUNIT::TestCase
+ class SearchResultScraperTest < Test::Unit::TestCase
 
  def test_scrape
 
@@ -0,0 +1,92 @@
+ #!/usr/bin/ruby
+
+ #require "runit/testcase"
+ #require "runit/cui/testrunner"
+ require 'test/unit'
+
+
+ require "rubygems"
+ require 'hpricot'
+ require "youtube/searchresultscraper"
+
+ class SearchResultScraperTest < Test::Unit::TestCase
+ #class SearchResultScraperTest < RUNIT::TestCase
+
+   def test_scrape
+
+     #
+     # Normal cases
+     #
+     # Search returns results
+     open_and_scrape("http://www.youtube.com/results?search_query=", "doraemon", 2)
+     # Search returns no results
+     open_and_scrape("http://www.youtube.com/results?search_query=", "aeudyr jahafudfhadf ahf", 2)
+
+     #
+     # Error cases
+     #
+     # Results are present but the "Not Found" message is also shown
+     begin
+       open_local_file_and_scrape("html/dataY_noMsgY.htm")
+       assert_fail("the expected error was not raised")
+     rescue RuntimeError =>e
+       #puts e
+     end
+     # No results and no "Not Found" message
+     begin
+       open_local_file_and_scrape("html/dataN_noMsgN.htm")
+       assert_fail("the expected error was not raised")
+     rescue RuntimeError
+       #puts e
+     end
+
+     # Scraped fields could not be extracted correctly
+     begin
+       open_local_file_and_scrape("html/scraping_error.html")
+       assert_fail("the expected error was not raised")
+     rescue RuntimeError => e
+       # puts e
+     end
+   end
+
+   def test_scrape_video_count
+
+     scraper = open_and_scrape("http://www.youtube.com/results?search_query=", "doraemon", 2)
+     puts scraper.video_count
+     assert( scraper.video_count > 0 )
+
+     scraper = open_and_scrape("http://www.youtube.com/results?search_query=", "doraemonifdadfa", 2)
+     puts scraper.video_count
+     assert( scraper.video_count == 0 )
+   end
+
+   def open_and_scrape url, keyword=nil, page=nil
+     scraper = MySearchResultScraper.new(url, keyword, page)
+     scraper.open
+     scraper.scrape
+     scraper
+   end
+
+   def open_local_file_and_scrape url
+     scraper = MySearchResultScraper.new(url)
+     scraper.open_local_file
+     scraper.scrape
+   end
+
+ end
+
+ class MySearchResultScraper < Youtube::SearchResultScraper
+   @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
+
+   def initialize url, keyword=nil, page=nil
+     @@youtube_search_base_url = url
+     @keyword = keyword
+     @page = page if not page == nil
+   end
+
+   def open_local_file
+     @html = Kernel.open(@@youtube_search_base_url).read
+     replace_document_write_javascript
+     @search_result = Hpricot.parse(@html)
+   end
+ end
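A note on the test harness change above: RUNIT needed an explicit RUNIT::CUI::TestRunner invocation, whereas Test::Unit (bundled with Ruby 1.8) registers an at_exit hook that collects and runs every Test::Unit::TestCase subclass, so the migrated test file can simply be executed directly. The load path below is an assumed invocation, not taken from the gem:

    ruby -Ilib test/youtube_scraper_test.rb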
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
  specification_version: 1
  name: youtubesearchresultscraper
  version: !ruby/object:Gem::Version
- version: 0.0.3
- date: 2006-12-22 00:00:00 +09:00
+ version: 0.0.4
+ date: 2007-02-01 00:00:00 +09:00
  summary: This gem provide function to scrape html of search result on youtube
  require_paths:
  - lib
@@ -30,10 +30,12 @@ authors:
  - Yuki SHIDA
  files:
  - lib/youtube
+ - lib/youtube/searchresultscraper.rb~
  - lib/youtube/video.rb
  - lib/youtube/searchresultscraper.rb
  - test/youtube_scraper_test.rb
  - test/html
+ - test/youtube_scraper_test.rb~
  - test/html/scraping_error.html
  - test/html/dataY_noMsgY.htm
  - test/html/dataN_noMsgN.htm