youtubesearchresultscraper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,3 @@
1
+ *0.0.1*
2
+
3
+ * First public release
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (C) 2006 by in3c.org
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,13 @@
1
+ YouTubeの検索結果のHTMLをスクレイピングして、YouTube Developer APIのyoutube.videos.list_by_tagと同様な形式のXMLを返すライブラリです。
2
+
3
+ 使用例:
4
+ 8< - - - - 8< - - - - 8< - - - - 8< - - - -
5
+ require "youtube/searchresultscraper"
6
+
7
+ scraper = Youtube::SearchResultScraper.new(keyword, page)
8
+ scraper.open
9
+ scraper.scrape
10
+ puts scraper.get_xml
11
+ 8< - - - - 8< - - - - 8< - - - - 8< - - - -
12
+
13
+ 参考) http://www.ark-web.jp/sandbox/wiki/183.html
@@ -0,0 +1,137 @@
1
+ # Copyright (C) 2006 by in3c.org
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining
4
+ # a copy of this software and associated documentation files (the
5
+ # "Software"), to deal in the Software without restriction, including
6
+ # without limitation the rights to use, copy, modify, merge, publish,
7
+ # distribute, sublicense, and/or sell copies of the Software, and to
8
+ # permit persons to whom the Software is furnished to do so, subject to
9
+ # the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be
12
+ # included in all copies or substantial portions of the Software.
13
+ #
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21
+
22
+ require 'open-uri'
23
+ require 'cgi'
24
+ require 'rubygems'
25
+ require 'hpricot'
26
+ require 'youtube/video'
27
+
28
module Youtube
  # Fetches a YouTube search-result page, scrapes every <div class="vEntry">
  # element into a Youtube::Video and renders the collected videos as an
  # <ut_response> XML document.
  #
  # Typical usage:
  #
  #   scraper = Youtube::SearchResultScraper.new(keyword, page)
  #   scraper.open
  #   scraper.scrape
  #   puts scraper.get_xml
  class SearchResultScraper
    attr_accessor :keyword
    attr_accessor :page

    @@youtube_search_base_url = "http://www.youtube.com/results?search_query="

    # keyword:: search phrase; it is CGI-escaped when the URL is built.
    # page::    optional result page; omitted from the URL when nil.
    def initialize(keyword, page = nil)
      @keyword = keyword
      @page = page
    end

    # Downloads the search-result page, inlines document.write() snippets and
    # parses the markup with Hpricot. Must be called before #scrape.
    def open
      # URI#read is provided by open-uri and works on every Ruby version;
      # Kernel#open stopped accepting URLs in Ruby 3.0 and the original call
      # never closed the handle it opened.
      @html = URI.parse(search_url).read
      replace_document_write_javascript
      @search_result = Hpricot.parse(@html)
    end

    # Builds one Youtube::Video per "vEntry" element of the parsed page and
    # stores them in @videos. Requires #open to have been called.
    def scrape
      @videos = []

      @search_result.search("//div[@class='vEntry']").each do |video_html|
        @videos << build_video(video_html)
      end
    end

    # Returns the scraped videos wrapped in <ut_response>/<video_list>.
    # Requires #scrape to have been called.
    def get_xml
      body = @videos.map { |video| video.to_xml }.join
      "<ut_response status=\"ok\"><video_list>\n#{body}</video_list></ut_response>"
    end

    private

    # Search URL for the current keyword and (optional) page.
    def search_url
      url = @@youtube_search_base_url + CGI.escape(@keyword)
      # Escape the page too so a non-numeric value cannot break the query string.
      url += "&page=#{CGI.escape(@page.to_s)}" unless @page.nil?
      url
    end

    # Fills a Youtube::Video from a single search-result entry.
    def build_video(video_html)
      video = Youtube::Video.new
      video.id             = scrape_id(video_html)
      video.author         = scrape_author(video_html)
      video.title          = scrape_title(video_html)
      video.length_seconds = scrape_length_seconds(video_html)
      video.rating_avg     = scrape_rating_avg(video_html)
      video.rating_count   = scrape_rating_count(video_html)
      video.description    = scrape_description(video_html)
      video.view_count     = scrape_view_count(video_html)
      video.thumbnail_url  = scrape_thumbnail_url(video_html)
      video.tags           = scrape_tags(video_html)
      video.url            = scrape_url(video_html)
      video
    end

    # Returns the first capture group of +pattern+ in +text+, or "" when the
    # pattern does not match. The previous sub(..., '\1') idiom returned the
    # whole unmatched markup (or kept leading/trailing markup) instead.
    def first_capture(text, pattern)
      text.to_s[pattern, 1].to_s
    end

    # Replaces <script> blocks that only document.write() a snippet with the
    # snippet itself so the parser sees the resulting markup.
    def replace_document_write_javascript
      @html = @html.gsub(%r{<script language="javascript" type="text/javascript">.*?document\.write\('(.*?)'\).*?</script>}m, '\1')
    end

    # The video id is the directory name that precedes the thumbnail file name.
    def scrape_id(video_html)
      first_capture(scrape_thumbnail_url(video_html), %r{.*/([^/]+)/[^/]+\.jpg})
    end

    def scrape_author(video_html)
      facets = video_html.search("div[@class='vfacets']").inner_html
      first_capture(facets, /.*From:<\/span> <a.*?>(.*?)<\/a>/m)
    end

    def scrape_title(video_html)
      video_html.search("div[@class='vtitle']/a").inner_html
    end

    # Converts a runtime such as "02:34", "9:05" or "1:02:03" to seconds.
    # Returns 0 when no runtime is present.
    def scrape_length_seconds(video_html)
      runtime = first_capture(video_html.search("span[@class='runtime']").inner_html, /(\d+(?::\d\d)+)/)
      runtime.split(':').inject(0) { |seconds, part| seconds * 60 + part.to_i }
    end

    # Every full-star image counts 1, every half-star image 0.5.
    def scrape_rating_avg(video_html)
      full_stars = video_html.search("img[@src='/img/star_sm.gif']").size
      half_stars = video_html.search("img[@src='/img/star_sm_half.gif']").size
      full_stars + half_stars * 0.5
    end

    def scrape_rating_count(video_html)
      first_capture(video_html.search("div[@class='rating']").inner_html, /(\d+) rating/).to_i
    end

    # Description text without the "\n\t" wrapper surrounding it in the markup.
    def scrape_description(video_html)
      video_html.search("div[@class='vdesc']/span").inner_html.sub(/^\n\t(.*?)\n\t$/m, '\1')
    end

    def scrape_view_count(video_html)
      facets = video_html.search("div[@class='vfacets']").inner_html
      first_capture(facets, /.*Views:<\/span> (\d+)/m).to_i
    end

    # Tag names joined by single spaces.
    def scrape_tags(video_html)
      video_html.search("div[@class='vtagValue']/a").map { |tag| tag.inner_html }.join(" ")
    end

    def scrape_thumbnail_url(video_html)
      first_capture(video_html.search("img[@class='vimg120']").to_html, /.*src="(.*?)"/)
    end

    def scrape_url(video_html)
      link = video_html.search("div[@class='vtitle']/a").to_html
      "http://www.youtube.com" + first_capture(link, /.*href="(.*?)"/m)
    end
  end
end
@@ -0,0 +1,59 @@
1
+ # Copyright (C) 2006 by in3c.org
2
+ # http://in3c.org/
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+
23
module Youtube
  # Value object describing a single YouTube video. Only the attributes that
  # have been assigned are emitted by #to_xml, in assignment order.
  class Video
    attr_accessor :author
    attr_accessor :id
    attr_accessor :title
    attr_accessor :length_seconds
    attr_accessor :rating_avg
    attr_accessor :rating_count
    attr_accessor :description
    attr_accessor :view_count
    attr_accessor :upload_time
    attr_accessor :comment_count
    attr_accessor :tags
    attr_accessor :url
    attr_accessor :thumbnail_url

    # Serialises every assigned attribute as <name>value</name> inside a
    # <video> element. Values are converted with #to_s, stripped of markup and
    # XML-escaped. The stored attribute values are never modified, so the
    # method can be called repeatedly and with frozen strings.
    def to_xml
      elements = instance_variables.map do |ivar|
        # instance_variables yields Symbols on Ruby >= 1.9 and Strings on 1.8;
        # to_s covers both without mutating the returned name.
        name = ivar.to_s.sub(/\A@/, '')
        "<#{name}>#{xml_text(instance_variable_get(ivar))}</#{name}>\n"
      end
      "<video>\n#{elements.join}</video>\n"
    end

    private

    # Turns <br /> into newlines, drops any remaining tags and escapes the XML
    # special characters. Uses non-destructive gsub so the caller's string is
    # left untouched.
    def xml_text(raw)
      text = raw.to_s.gsub(/<br \/>/, "\n")
      text = text.gsub(/<.*?>/m, '')
      # '&' must be escaped first so the entities added below are not re-escaped.
      text = text.gsub(/&/, '&amp;')
      text = text.gsub(/'/, '&apos;')
      text = text.gsub(/"/, '&quot;')
      text = text.gsub(/</, '&lt;')
      text.gsub(/>/, '&gt;')
    end
  end
end
59
+
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: youtubesearchresultscraper
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.1
7
+ date: 2006-11-26 00:00:00 +09:00
8
+ summary: This gem provides functions to scrape the HTML of YouTube search results
9
+ require_paths:
10
+ - lib
11
+ email:
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: youtube/searchresultscraper
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.8.1
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Yuki SHIDA
31
+ files:
32
+ - lib/youtube
33
+ - lib/youtube/video.rb
34
+ - lib/youtube/searchresultscraper.rb
35
+ - CHANGELOG
36
+ - MIT-LICENSE
37
+ - README
38
+ test_files: []
39
+
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - CHANGELOG
44
+ - MIT-LICENSE
45
+ - README
46
+ executables: []
47
+
48
+ extensions: []
49
+
50
+ requirements:
51
+ - hpricot rubygem
52
+ dependencies:
53
+ - !ruby/object:Gem::Dependency
54
+ name: hpricot
55
+ version_requirement:
56
+ version_requirements: !ruby/object:Gem::Version::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0.4"
61
+ version: