youtubesearchresultscraper 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -5,3 +5,10 @@
5
5
  Add rdoc.
6
6
  Add each() method into YouTube::SearchResultScraper
7
7
 
8
+ 0.0.3 2006-12-22
9
+ Add error check.
10
+ Add attribute for video_count, video_from, video_to
11
+
12
+ 0.0.4 2007-02-01
13
+ Add error check for scraping of pagination.
14
+ Fix scraping rule for html markup change of youtube.
@@ -48,7 +48,7 @@ module Youtube #:nodoc:
48
48
  # scraper.open
49
49
  # scraper.scrape
50
50
  # puts scraper.get_xml
51
- #
51
+ #
52
52
  # = More Information
53
53
  # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
54
54
  #
@@ -64,7 +64,7 @@ module Youtube #:nodoc:
64
64
  attr_reader :video_count
65
65
  attr_reader :video_from
66
66
  attr_reader :video_to
67
-
67
+
68
68
  @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
69
69
 
70
70
  # Create Youtube::SearchResultScraper object specifying keyword and number of page.
@@ -123,7 +123,7 @@ module Youtube #:nodoc:
123
123
  end
124
124
 
125
125
  # Iterator for scraped videos.
126
- def each
126
+ def each
127
127
  @videos.each do |video|
128
128
  yield video
129
129
  end
@@ -131,7 +131,7 @@ module Youtube #:nodoc:
131
131
 
132
132
  # Return videos information as XML Format.
133
133
  def get_xml
134
- xml = "<ut_response status=\"ok\">" +
134
+ xml = "<ut_response status=\"ok\">" +
135
135
  "<video_count>" + @video_count.to_s + "</video_count>" +
136
136
  "<video_list>\n"
137
137
  each do |video|
@@ -199,23 +199,29 @@ module Youtube #:nodoc:
199
199
  end
200
200
 
201
201
  def scrape_result_header
202
- @search_result.search("div[@id='sectionHeader']/div[@class='my']").inner_html
202
+ @search_result.search("div[@id='sectionHeader']/div").inner_html
203
203
  end
204
204
 
205
205
  def scrape_video_count
206
- scrape_result_header.sub(/.+of *(\d+)/m , '\1').to_i
206
+ video_count = scrape_result_header.sub(/.+of *(\d+)/m , '\1')
207
+ raise "no video count" if video_count.empty?
208
+ video_count.to_i
207
209
  end
208
210
 
209
211
  def scrape_video_from
210
- scrape_result_header.sub(/Results *(\d+)-.+/m, '\1').to_i
212
+ video_from = scrape_result_header.sub(/Results *(\d+)-.+/m, '\1')
213
+ raise "no video from" if video_from.empty?
214
+ video_from.to_i
211
215
  end
212
216
 
213
217
  def scrape_video_to
214
- scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1').to_i
218
+ video_to = scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1')
219
+ raise "no video to" if video_to.empty?
220
+ video_to.to_i
215
221
  end
216
222
 
217
223
  def is_no_result
218
- @search_result.search("div[@class='body']").inner_html.include?('No Videos found')
224
+ @html.include?('No Videos found')
219
225
  end
220
226
 
221
227
  def check_video video
@@ -0,0 +1,250 @@
1
+ #--
2
+ # Copyright (C) 2006 by in3c.org, ARK-Web co., ltd
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+ # :main:Youtube::SearchResultScraper
24
+ # :title:Youtube::SearchResultScraper RDoc Documentation
25
+
26
+ require 'open-uri'
27
+ require 'cgi'
28
+ require 'rubygems'
29
+ require 'hpricot'
30
+ require 'youtube/video'
31
+
32
+ module Youtube #:nodoc:
33
+
34
+ # = Introduction
35
+ # Youtube::SearchResultScraper scrapes video information from search result page
36
+ # on http://www.youtube.com.
37
+ #
38
+ # You can get result as array or xml.
39
+ #
40
+ # XML format is same as YouTube Developer API
41
+ # (http://www.youtube.com/dev_api_ref?m=youtube.videos.list_by_tag).
42
+ #
43
+ # = Example
44
+ # require "rubygems"
45
+ # require "youtube/searchresultscraper"
46
+ #
47
+ # scraper = Youtube::SearchResultScraper.new(keyword, page)
48
+ # scraper.open
49
+ # scraper.scrape
50
+ # puts scraper.get_xml
51
+ #
52
+ # = More Information
53
+ # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
54
+ #
55
+ # Author:: Yuki SHIDA <shida@in3c.org>
56
+ # Author:: Konuma Akio <konuma@ark-web.jp>
57
+ # Version:: 0.0.3
58
+ # License:: MIT license
59
+
60
+ class SearchResultScraper
61
+
62
+ attr_accessor :keyword
63
+ attr_accessor :page
64
+ attr_reader :video_count
65
+ attr_reader :video_from
66
+ attr_reader :video_to
67
+
68
+ @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
69
+
70
+ # Create Youtube::SearchResultScraper object specifying keyword and number of page.
71
+ #
72
+ # You cannot specify number of videos per page.
73
+ # Always, the number of videos is 20 per page.
74
+ #
75
+ # * keyword - specify keyword that you want to search on YouTube.
76
+ # You must specify keyword encoded by UTF-8.
77
+ # * page - specify number of page
78
+
79
+ def initialize keyword, page=nil
80
+ @keyword = keyword
81
+ @page = page if not page == nil
82
+ end
83
+
84
+ # Get search result from youtube by specified keyword.
85
+ def open
86
+ url = @@youtube_search_base_url + CGI.escape(@keyword)
87
+ url += "&page=#{@page}" if not @page == nil
88
+ @html = Kernel.open(url).read
89
+ replace_document_write_javascript
90
+ @search_result = Hpricot.parse(@html)
91
+ end
92
+
93
+ # Scrape video information from search result html.
94
+ def scrape
95
+ @videos = []
96
+
97
+ @search_result.search("//div[@class='vEntry']").each do |video_html|
98
+ video = Youtube::Video.new
99
+ video.id = scrape_id(video_html)
100
+ video.author = scrape_author(video_html)
101
+ video.title = scrape_title(video_html)
102
+ video.length_seconds = scrape_length_seconds(video_html)
103
+ video.rating_avg = scrape_rating_avg(video_html)
104
+ video.rating_count = scrape_rating_count(video_html)
105
+ video.description = scrape_description(video_html)
106
+ video.view_count = scrape_view_count(video_html)
107
+ video.thumbnail_url = scrape_thumbnail_url(video_html)
108
+ video.tags = scrape_tags(video_html)
109
+ video.url = scrape_url(video_html)
110
+
111
+ check_video video
112
+
113
+ @videos << video
114
+ end
115
+
116
+ @video_count = scrape_video_count
117
+ @video_from = scrape_video_from
118
+ @video_to = scrape_video_to
119
+ raise "no video count" if @video_count == nil
120
+ raise "no video from" if @video_from == nil
121
+ raise "no video to" if @video_to == nil
122
+
123
+ raise "scraping error" if (is_no_result != @videos.empty?)
124
+
125
+ @videos
126
+ end
127
+
128
+ # Iterator for scraped videos.
129
+ def each
130
+ @videos.each do |video|
131
+ yield video
132
+ end
133
+ end
134
+
135
+ # Return videos information as XML Format.
136
+ def get_xml
137
+ xml = "<ut_response status=\"ok\">" +
138
+ "<video_count>" + @video_count.to_s + "</video_count>" +
139
+ "<video_list>\n"
140
+ each do |video|
141
+ xml += video.to_xml
142
+ end
143
+ xml += "</video_list></ut_response>"
144
+ end
145
+
146
+ private
147
+
148
+ def replace_document_write_javascript
149
+ @html.gsub!(%r{<script language="javascript" type="text/javascript">.*?document.write\('(.*?)'\).*?</script>}m, '\1')
150
+ end
151
+
152
+ def scrape_id video_html
153
+ scrape_thumbnail_url(video_html).sub(%r{.*/([^/]+)/[^/]+.jpg}, '\1')
154
+ end
155
+
156
+ def scrape_author video_html
157
+ video_html.search("div[@class='vfacets']").inner_html.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m, '\1')
158
+ end
159
+
160
+ def scrape_title video_html
161
+ video_html.search("div[@class='vtitle']/a").inner_html
162
+ end
163
+
164
+ def scrape_length_seconds video_html
165
+ length_seconds = video_html.search("span[@class='runtime']").inner_html
166
+ length_seconds =~ /(\d\d):(\d\d)/
167
+ $1.to_i * 60 + $2.to_i
168
+ end
169
+
170
+ def scrape_rating_avg video_html
171
+ video_html.search("img[@src='/img/star_sm.gif']").size +
172
+ video_html.search("img[@src='/img/star_sm_half.gif']").size * 0.5
173
+ end
174
+
175
+ def scrape_rating_count video_html
176
+ video_html.search("div[@class='rating']").inner_html.sub(/(\d+) rating/, '\1').to_i
177
+ end
178
+
179
+ def scrape_description video_html
180
+ description = video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\n\t(.*?)\n\t$/m, '\1')
181
+ end
182
+
183
+ def scrape_view_count video_html
184
+ video_html.search("div[@class='vfacets']").inner_html.sub(/.*Views:<\/span> (\d+).*/m, '\1').to_i
185
+ end
186
+
187
+ def scrape_tags video_html
188
+ tags = []
189
+ video_html.search("div[@class='vtagValue']/a").each do |tag|
190
+ tags << tag.inner_html
191
+ end
192
+ tags.join(" ")
193
+ end
194
+
195
+ def scrape_thumbnail_url video_html
196
+ video_html.search("img[@class='vimg120']").to_html.sub(/.*src="(.*?)".*/, '\1')
197
+ end
198
+
199
+ def scrape_url video_html
200
+ "http://www.youtube.com" +
201
+ video_html.search("div[@class='vtitle']/a").to_html.sub(/.*href="(.*?)".*/m, '\1')
202
+ end
203
+
204
+ def scrape_result_header
205
+ @search_result.search("div[@id='sectionHeader']/div[@class='my']").inner_html
206
+ end
207
+
208
+ def scrape_video_count
209
+ scrape_result_header.sub(/.+of *(\d+)/m , '\1').to_i
210
+ end
211
+
212
+ def scrape_video_from
213
+ scrape_result_header.sub(/Results *(\d+)-.+/m, '\1').to_i
214
+ end
215
+
216
+ def scrape_video_to
217
+ scrape_result_header.sub(/Results.+-(\d+) *of.+/m, '\1').to_i
218
+ end
219
+
220
+ def is_no_result
221
+ @search_result.search("div[@class='body']").inner_html.include?('No Videos found')
222
+ end
223
+
224
+ def check_video video
225
+ errors = []
226
+
227
+ errors << "author" if video.author.empty?
228
+ errors << "id" if video.id.empty?
229
+ errors << "title" if video.title.empty?
230
+ errors << "length_seconds" if video.length_seconds.to_s.empty?
231
+ errors << "rating_avg" if video.rating_avg.to_s.empty?
232
+ errors << "rating_count" if video.rating_count.to_s.empty?
233
+ errors << "description" if video.description.empty?
234
+ errors << "view_count" if video.view_count.to_s.empty?
235
+ errors << "tags" if video.tags.empty?
236
+ errors << "url" if video.url.empty?
237
+ errors << "thumbnail_url" if video.thumbnail_url.empty?
238
+
239
+ unless errors.empty? then
240
+ error_msg = "scraping error occurred.\n"
241
+ errors.each do |error|
242
+ error_msg << error + " is not setted.\n"
243
+ end
244
+ raise error_msg
245
+ end
246
+ end
247
+
248
+ end
249
+
250
+ end
@@ -1,13 +1,13 @@
1
1
  #!/usr/bin/ruby
2
2
 
3
- require "runit/testcase"
4
- require "runit/cui/testrunner"
3
+ require 'test/unit'
4
+
5
5
 
6
6
  require "rubygems"
7
7
  require 'hpricot'
8
8
  require "youtube/searchresultscraper"
9
9
 
10
- class SearchResultScraperTest < RUNIT::TestCase
10
+ class SearchResultScraperTest < Test::Unit::TestCase
11
11
 
12
12
  def test_scrape
13
13
 
@@ -0,0 +1,92 @@
1
+ #!/usr/bin/ruby
2
+
3
+ #require "runit/testcase"
4
+ #require "runit/cui/testrunner"
5
+ require 'test/unit'
6
+
7
+
8
+ require "rubygems"
9
+ require 'hpricot'
10
+ require "youtube/searchresultscraper"
11
+
12
+ class SearchResultScraperTest < Test::Unit::TestCase
13
+ #class SearchResultScraperTest < RUNIT::TestCase
14
+
15
+ def test_scrape
16
+
17
+ #
18
+ #正常系 (normal cases)
19
+ #
20
+ #検索結果がある (search returns results)
21
+ open_and_scrape("http://www.youtube.com/results?search_query=", "doraemon", 2)
22
+ #検索結果がない (search returns no results)
23
+ open_and_scrape("http://www.youtube.com/results?search_query=", "aeudyr jahafudfhadf ahf", 2)
24
+
25
+ #
26
+ #異常系 (abnormal cases)
27
+ #
28
+ #検索結果があるが、Not Foundがある場合 (results are present, yet the "Not Found" message is also present)
29
+ begin
30
+ open_local_file_and_scrape("html/dataY_noMsgY.htm")
31
+ assert_fail("������٤����顼��ȯ�����Ƥ��ʤ�")
32
+ rescue RuntimeError =>e
33
+ #puts e
34
+ end
35
+ #検索結果がなく、Not Foundもない場合 (no results, and no "Not Found" message either)
36
+ begin
37
+ open_local_file_and_scrape("html/dataN_noMsgN.htm")
38
+ assert_fail("������٤����顼��ȯ�����Ƥ��ʤ�")
39
+ rescue RuntimeError
40
+ #puts e
41
+ end
42
+
43
+ #�����ι��ܤ��������Ǥ��Ƥ��ʤ����
44
+ begin
45
+ open_local_file_and_scrape("html/scraping_error.html")
46
+ assert_fail("������٤����顼��ȯ�����Ƥ��ʤ�")
47
+ rescue RuntimeError => e
48
+ # puts e
49
+ end
50
+ end
51
+
52
+ def test_scrape_video_count
53
+
54
+ scraper = open_and_scrape("http://www.youtube.com/results?search_query=", "doraemon", 2)
55
+ puts scraper.video_count
56
+ assert( scraper.video_count > 0 )
57
+
58
+ scraper = open_and_scrape("http://www.youtube.com/results?search_query=", "doraemonifdadfa", 2)
59
+ puts scraper.video_count
60
+ assert( scraper.video_count == 0 )
61
+ end
62
+
63
+ def open_and_scrape url, keyword=nil, page=nil
64
+ scraper = MySearchResultScraper.new(url, keyword, page)
65
+ scraper.open
66
+ scraper.scrape
67
+ scraper
68
+ end
69
+
70
+ def open_local_file_and_scrape url
71
+ scraper = MySearchResultScraper.new(url)
72
+ scraper.open_local_file
73
+ scraper.scrape
74
+ end
75
+
76
+ end
77
+
78
+ class MySearchResultScraper < Youtube::SearchResultScraper
79
+ @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
80
+
81
+ def initialize url, keyword=nil, page=nil
82
+ @@youtube_search_base_url = url
83
+ @keyword = keyword
84
+ @page = page if not page == nil
85
+ end
86
+
87
+ def open_local_file
88
+ @html = Kernel.open(@@youtube_search_base_url).read
89
+ replace_document_write_javascript
90
+ @search_result = Hpricot.parse(@html)
91
+ end
92
+ end
metadata CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: youtubesearchresultscraper
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.0.3
7
- date: 2006-12-22 00:00:00 +09:00
6
+ version: 0.0.4
7
+ date: 2007-02-01 00:00:00 +09:00
8
8
  summary: This gem provide function to scrape html of search result on youtube
9
9
  require_paths:
10
10
  - lib
@@ -30,10 +30,12 @@ authors:
30
30
  - Yuki SHIDA
31
31
  files:
32
32
  - lib/youtube
33
+ - lib/youtube/searchresultscraper.rb~
33
34
  - lib/youtube/video.rb
34
35
  - lib/youtube/searchresultscraper.rb
35
36
  - test/youtube_scraper_test.rb
36
37
  - test/html
38
+ - test/youtube_scraper_test.rb~
37
39
  - test/html/scraping_error.html
38
40
  - test/html/dataY_noMsgY.htm
39
41
  - test/html/dataN_noMsgN.htm