youtubescraper 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -18,3 +18,10 @@
18
18
 
19
19
  0.0.6 2007-02-16
20
20
  Fix error handling for video_count, video_from, video_to
21
+
22
+ 0.0.7 2007-03-29
23
+ Fix scraping rule for video_count
24
+ Add Youtube::BrowseScraper
25
+
26
+ 0.0.8 2007-03-29
27
+ Description is not required.
data/lib/youtube/searchresultscraper.rb CHANGED
@@ -174,7 +174,7 @@ module Youtube #:nodoc:
174
174
  end
175
175
 
176
176
  def scrape_description video_html
177
- description = video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\n\t(.*?)\n\t$/m, '\1')
177
+ description = video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\s*(.*?)\s*$/m, '\1')
178
178
  end
179
179
 
180
180
  def scrape_view_count video_html
@@ -243,7 +243,6 @@ module Youtube #:nodoc:
243
243
  errors << "length_seconds" if video.length_seconds.to_s.empty?
244
244
  errors << "rating_avg" if video.rating_avg.to_s.empty?
245
245
  errors << "rating_count" if video.rating_count.to_s.empty?
246
- errors << "description" if video.description.empty?
247
246
  errors << "view_count" if video.view_count.to_s.empty?
248
247
  errors << "tags" if video.tags.empty?
249
248
  errors << "url" if video.url.empty?
metadata CHANGED
@@ -3,7 +3,7 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: youtubescraper
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.0.7
6
+ version: 0.0.8
7
7
  date: 2007-03-29 00:00:00 +09:00
8
8
  summary: This gem provide function to scrape html of search result on youtube
9
9
  require_paths:
@@ -31,7 +31,6 @@ authors:
31
31
  files:
32
32
  - lib/youtube
33
33
  - lib/youtube/browsescraper.rb
34
- - lib/youtube/searchresultscraper.rb~
35
34
  - lib/youtube/video.rb
36
35
  - lib/youtube/searchresultscraper.rb
37
36
  - test/youtube_scraper_test.rb
data/lib/youtube/searchresultscraper.rb~ DELETED
@@ -1,263 +0,0 @@
1
- #--
2
- # Copyright (C) 2006 by in3c.org, ARK-Web co., ltd
3
- #
4
- # Permission is hereby granted, free of charge, to any person obtaining
5
- # a copy of this software and associated documentation files (the
6
- # "Software"), to deal in the Software without restriction, including
7
- # without limitation the rights to use, copy, modify, merge, publish,
8
- # distribute, sublicense, and/or sell copies of the Software, and to
9
- # permit persons to whom the Software is furnished to do so, subject to
10
- # the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be
13
- # included in all copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
- # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
- # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
- # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
- #++
23
- # :main:Youtube::SearchResultScraper
24
- # :title:Youtube::SearchResultScraper RDoc Documentation
25
-
26
- require 'open-uri'
27
- require 'cgi'
28
- require 'rubygems'
29
- require 'hpricot'
30
- require 'youtube/video'
31
-
32
- module Youtube #:nodoc:
33
-
34
- # = Introduction
35
- # Youtube::SearchResultScraper scrapes video information from search result page
36
- # on http://www.youtube.com.
37
- #
38
- # You can get result as array or xml.
39
- #
40
- # XML format is same as YouTube Developer API
41
- # (http://www.youtube.com/dev_api_ref?m=youtube.videos.list_by_tag).
42
- #
43
- # = Example
44
- # require "rubygems"
45
- # require "youtube/searchresultscraper"
46
- #
47
- # scraper = Youtube::SearchResultScraper.new(keyword, page)
48
- # scraper.open
49
- # scraper.scrape
50
- # puts scraper.get_xml
51
- #
52
- # = More Information
53
- # http://www.ark-web.jp/sandbox/wiki/184.html (japanese only)
54
- #
55
- # Author:: Yuki SHIDA <shida@in3c.org>
56
- # Author:: Konuma Akio <konuma@ark-web.jp>
57
- # Version:: 0.0.3
58
- # License:: MIT license
59
-
60
- class SearchResultScraper
61
-
62
- attr_accessor :keyword
63
- attr_accessor :page
64
- attr_reader :video_count
65
- attr_reader :video_from
66
- attr_reader :video_to
67
-
68
- @@youtube_search_base_url = "http://www.youtube.com/results?search_query="
69
-
70
- # Create Youtube::SearchResultScraper object specifying keyword and number of page.
71
- #
72
- # You cannot specify number of videos per page.
73
- # Always, the number of videos is 20 per page.
74
- #
75
- # * keyword - specify keyword that you want to search on YouTube.
76
- # You must specify keyword encoded by UTF-8.
77
- # * page - specify number of page
78
-
79
- def initialize keyword, page=nil
80
- @keyword = keyword
81
- @page = page if not page == nil
82
- end
83
-
84
- # Get search result from youtube by specified keyword.
85
- def open
86
- @url = @@youtube_search_base_url + CGI.escape(@keyword)
87
- @url += "&page=#{@page}" if not @page == nil
88
- @html = Kernel.open(@url).read
89
- replace_document_write_javascript
90
- @search_result = Hpricot.parse(@html)
91
- end
92
-
93
- # Scrape video information from search result html.
94
- def scrape
95
- @videos = []
96
-
97
- @search_result.search("//div[@class='vEntry']").each do |video_html|
98
- video = Youtube::Video.new
99
- video.id = scrape_id(video_html)
100
- video.author = scrape_author(video_html)
101
- video.title = scrape_title(video_html)
102
- video.length_seconds = scrape_length_seconds(video_html)
103
- video.rating_avg = scrape_rating_avg(video_html)
104
- video.rating_count = scrape_rating_count(video_html)
105
- video.description = scrape_description(video_html)
106
- video.view_count = scrape_view_count(video_html)
107
- video.thumbnail_url = scrape_thumbnail_url(video_html)
108
- video.tags = scrape_tags(video_html)
109
- video.url = scrape_url(video_html)
110
-
111
- check_video video
112
-
113
- @videos << video
114
- end
115
-
116
- @video_count = scrape_video_count
117
- @video_from = scrape_video_from
118
- @video_to = scrape_video_to
119
-
120
- raise "scraping error" if (is_no_result != @videos.empty?)
121
-
122
- @videos
123
- end
124
-
125
- # Iterator for scraped videos.
126
- def each
127
- @videos.each do |video|
128
- yield video
129
- end
130
- end
131
-
132
- # Return videos information as XML Format.
133
- def get_xml
134
- xml = "<ut_response status=\"ok\">" +
135
- "<video_count>" + @video_count.to_s + "</video_count>" +
136
- "<video_list>\n"
137
- each do |video|
138
- xml += video.to_xml
139
- end
140
- xml += "</video_list></ut_response>"
141
- end
142
-
143
- private
144
-
145
- def replace_document_write_javascript
146
- @html.gsub!(%r{<script language="javascript" type="text/javascript">.*?document.write\('(.*?)'\).*?</script>}m, '\1')
147
- end
148
-
149
- def scrape_id video_html
150
- scrape_thumbnail_url(video_html).sub(%r{.*/([^/]+)/[^/]+.jpg}, '\1')
151
- end
152
-
153
- def scrape_author video_html
154
- video_html.search("div[@class='vfacets']").inner_html.sub(/.*From:<\/span> <a.*?>(.*?)<\/a>.*/m, '\1')
155
- end
156
-
157
- def scrape_title video_html
158
- video_html.search("div[@class='vtitle']/a").inner_html
159
- end
160
-
161
- def scrape_length_seconds video_html
162
- length_seconds = video_html.search("span[@class='runtime']").inner_html
163
- length_seconds =~ /(\d\d):(\d\d)/
164
- $1.to_i * 60 + $2.to_i
165
- end
166
-
167
- def scrape_rating_avg video_html
168
- video_html.search("img[@src='/img/star_sm.gif']").size +
169
- video_html.search("img[@src='/img/star_sm_half.gif']").size * 0.5
170
- end
171
-
172
- def scrape_rating_count video_html
173
- video_html.search("div[@class='rating']").inner_html.sub(/(\d+) rating/, '\1').to_i
174
- end
175
-
176
- def scrape_description video_html
177
- description = video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\n\t(.*?)\n\t$/m, '\1')
178
- end
179
-
180
- def scrape_view_count video_html
181
- video_html.search("div[@class='vfacets']").inner_html.sub(/.*Views:<\/span> (\d+).*/m, '\1').to_i
182
- end
183
-
184
- def scrape_tags video_html
185
- tags = []
186
- video_html.search("div[@class='vtagValue']/a").each do |tag|
187
- tags << tag.inner_html
188
- end
189
- tags.join(" ")
190
- end
191
-
192
- def scrape_thumbnail_url video_html
193
- video_html.search("img[@class='vimg120']").to_html.sub(/.*src="(.*?)".*/, '\1')
194
- end
195
-
196
- def scrape_url video_html
197
- "http://www.youtube.com" +
198
- video_html.search("div[@class='vtitle']/a").to_html.sub(/.*href="(.*?)".*/m, '\1')
199
- end
200
-
201
- def scrape_result_header
202
- @search_result.search("div[@id='sectionHeader']").inner_html
203
- end
204
-
205
- def scrape_video_count
206
- video_count = scrape_result_header
207
- unless video_count.sub!(/.+Results \d+-\d+ of( | about )([0-9,]+)/m , '\2')
208
- raise "no video count: " + @url unless is_no_result
209
- end
210
- video_count.gsub!(/,/, '')
211
- video_count.to_i
212
- end
213
-
214
- def scrape_video_from
215
- video_from = scrape_result_header
216
- unless video_from.sub!(/.+Results (\d+)/m, '\1')
217
- raise "no video from: " + @url unless is_no_result
218
- end
219
- video_from.to_i
220
- end
221
-
222
- def scrape_video_to
223
- video_to = scrape_result_header
224
- unless video_to.sub!(/.+Results \d+-(\d+)/m, '\1')
225
- raise "no video to: " + @url unless is_no_result
226
- end
227
- video_to.to_i
228
- end
229
-
230
- def is_no_result
231
- if @is_no_result == nil
232
- @is_no_result = @html.include?('No Videos found')
233
- end
234
- @is_no_result
235
- end
236
-
237
- def check_video video
238
- errors = []
239
-
240
- errors << "author" if video.author.empty?
241
- errors << "id" if video.id.empty?
242
- errors << "title" if video.title.empty?
243
- errors << "length_seconds" if video.length_seconds.to_s.empty?
244
- errors << "rating_avg" if video.rating_avg.to_s.empty?
245
- errors << "rating_count" if video.rating_count.to_s.empty?
246
- errors << "description" if video.description.empty?
247
- errors << "view_count" if video.view_count.to_s.empty?
248
- errors << "tags" if video.tags.empty?
249
- errors << "url" if video.url.empty?
250
- errors << "thumbnail_url" if video.thumbnail_url.empty?
251
-
252
- unless errors.empty? then
253
- error_msg = "scraping error occurred.\n"
254
- errors.each do |error|
255
- error_msg << error + " is not setted.\n"
256
- end
257
- raise error_msg
258
- end
259
- end
260
-
261
- end
262
-
263
- end