movie_spider 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5f678d1d4e3479f2fa48b889671b7bc2d915208c
4
- data.tar.gz: bdf47ebf540ed0cb9374e46038d5f2913b236dd0
3
+ metadata.gz: 73521a5ab3ecc72f9db79b0c0a640190386b5d57
4
+ data.tar.gz: 7ea781c07b339f867494eeced1d9dbdde15df20d
5
5
  SHA512:
6
- metadata.gz: fd34d70d9da3c88b3d38f2692949d2e07162d2a7fe8c860c98a580362ddf9c3670782a185f4dbbaf5f9ccabae310cc1f8af369a7bec01549e30225379a62e508
7
- data.tar.gz: c75f433426d2f5425259a17d06238e8e3155ad2933d1a1ac37123c6ffd6e36a5a6a0b7cb82d9560112c40b87479d9f8e80a2c04d545ab5856e903d8c94ed1a4f
6
+ metadata.gz: a060107747831a6434ec4d418db9a774bf6138a40045b194c83a0ac2ea62c16be935152c0fb523e436467326274e7b318e26bb571e00fce4f86c3e2c15a2fd21
7
+ data.tar.gz: 702e792ad6d6fb8890a86d29727aa3e45ff0a36546692f3fe4acf0fd137772b498a87c9a00a5211c4e74549afc0b15b12e3413e3fde5feb9e936e9f5fadefa8b
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # MovieSpider
2
2
 
3
- TODO: Write a gem description
3
+ 单车网电影信息爬虫
4
4
 
5
5
  ## Installation
6
6
 
@@ -19,8 +19,19 @@ Or install it yourself as:
19
19
  $ gem install movie_spider
20
20
 
21
21
  ## Usage
22
+ ### 从时光网获取影片信息、剧照、海报
23
+ `MovieSpider.fetch_info_from_mtime("超能陆战队", "2014")`
24
+ `MovieSpider.fetch_stills_from_mtime("超能陆战队", "2014")`
25
+ `MovieSpider.fetch_posters_from_mtime("超能陆战队", "2014")`
26
+ ### 从豆瓣网获取影片信息、剧照、海报
27
+ `MovieSpider.fetch_info_from_douban("超能陆战队", "2014")`
28
+ `MovieSpider.fetch_stills_from_douban("超能陆战队", "2014")`
29
+ `MovieSpider.fetch_posters_from_douban("超能陆战队", "2014")`
30
+ ### 通过豆瓣ID或时光网ID获取以上信息,只使用ID参数调用,不需要传影片年份,注意ID要对应好,使用豆瓣的ID获取豆瓣电影信息,使用时光网ID获取时光网电影信息。
31
+ `MovieSpider.fetch_(info|posters|stills)_from_douban("11026735")`
32
+ `MovieSpider.fetch_(info|posters|stills)_from_mtime("160162")`
33
+ 以上都是获取电影木星上行的信息。
22
34
 
23
- TODO: Write usage instructions here
24
35
 
25
36
  ## Contributing
26
37
 
data/lib/movie_spider.rb CHANGED
@@ -5,52 +5,101 @@ require 'json'
5
5
  require 'open-uri'
6
6
 
7
7
  module MovieSpider
8
- def self.get_title_from_douban_id(douban_id)
9
- DoubanMovie.new.get_subject(douban_id)["title"]
10
- end
11
8
 
12
- def self.fetch_stills(douban_id,source)
13
- title = get_title_from_douban_id(douban_id)
9
+ def self.fetch_stills(id, source, img_size="img_1000")
14
10
  case source
15
11
  when /douban/
16
- stills = DoubanMovie.new.douban_movie_stills(douban_id)
12
+ DoubanMovie.douban_movie_stills(id)
17
13
  when /mtime/
18
- stills = MtimeMovie.new.get_mtime_stills(title)
14
+ MtimeMovie.mtime_movie_stills(id, img_size)
19
15
  when /all/
20
- stills = DoubanMovie.new.douban_movie_stills(douban_id) + MtimeMovie.new.get_mtime_stills(title)
16
+ DoubanMovie.douban_movie_stills(id) + MtimeMovie.mtime_movie.stills(id, img_size)
21
17
  end
22
- stills
23
18
  end
24
19
 
25
- def self.fetch_posters(douban_id,source)
26
- title = get_title_from_douban_id(douban_id)
20
+ def self.fetch_posters(id, source, img_size="img_1000")
27
21
  case source
28
22
  when /douban/
29
- posters = DoubanMovie.new.douban_movie_posters(douban_id)
23
+ DoubanMovie.douban_movie_posters(id)
30
24
  when /mtime/
31
- posters = MtimeMovie.new.get_mtime_posters(title)
25
+ MtimeMovie.mtime_movie_posters(id, img_size)
32
26
  when /all/
33
- posters = DoubanMovie.new.douban_movie_posters(douban_id) + MtimeMovie.new.get_mtime_posters(title)
27
+ DoubanMovie.douban_movie_posters(id) + MtimeMovie.mtime_movie_posters(id, img_size)
28
+ end
29
+ end
30
+
31
+ def self.fetch_info(id, source)
32
+ case source
33
+ when /douban/
34
+ DoubanMovie.douban_movie_info(id)
35
+ when /mtime/
36
+ MtimeMovie.mtime_movie_info(id)
37
+ end
38
+ end
39
+
40
+ def self.search_movies(title, source)
41
+ case source
42
+ when /douban/
43
+ DoubanMovie.douban_search_movies(title)
44
+ when /mtime/
45
+ MtimeMovie.mtime_search_movies(title)
34
46
  end
35
- posters
36
47
  end
37
48
 
38
- def self.fetch_rating(douban_id, source)
39
- title = get_title_from_douban_id(douban_id)
40
- if source =~ /douban/
41
- DoubanMovie.new.get_subject(douban_id)["rating"]["average"]
42
- elsif source =~ /mtime/
43
- MtimeMovie.new.get_mtime_rating(title)
44
- else
45
- "评分网站参数错误[douban|mtime]"
49
+ def self.get_id_from_title(title, year, source="douban")
50
+ year = year.to_i
51
+ subjects = search_movies(title, source)
52
+ match = []
53
+ unless subjects.empty?
54
+ subjects.each do |subject|
55
+ film_name = title.gsub(/\[.+\]/, "").gsub("(", "(").gsub(")",")")
56
+ if subject["year"].to_i == year and
57
+ string_similarity(film_name, subject["title"], 0.8)
58
+ match << subject['id']
59
+ end
60
+ end
46
61
  end
62
+ match[0]
47
63
  end
48
64
 
49
- def self.get_subject(douban_id)
50
- DoubanMovie.new.get_subject(douban_id)
65
+ def self.define_component(type, source)
66
+ define_singleton_method("fetch_#{type}_from_#{source}") do |id_or_title, year=Time.now.year|
67
+ if id_or_title.to_i.to_s == id_or_title
68
+ MovieSpider.send("fetch_#{type}", id_or_title, source)
69
+ else
70
+ id = get_id_from_title(id_or_title, year, source)
71
+ MovieSpider.send("fetch_#{type}", id, source)
72
+ end
73
+ end
51
74
  end
52
75
 
53
- def self.douban_search_movie(movie_name)
54
- DoubanMovie.new.douban_search_movie(movie_name)
76
+ define_component "stills", "douban"
77
+ define_component "stills", "mtime"
78
+ define_component "posters", "douban"
79
+ define_component "posters", "mtime"
80
+ define_component "info", "douban"
81
+ define_component "info", "mtime"
82
+
83
+ private
84
+
85
+ def self.string_similarity(origin, compare, score)
86
+ origin.downcase!
87
+ origin_pair = (0..origin.length-2).collect{|i| origin[i, 2]}.reject{|pair| pair.include? " "}
88
+ compare.downcase!
89
+ compare_pair = (0..compare.length-2).collect{|i| compare[i, 2]}.reject{|pair| pair.include? " "}
90
+
91
+ union = origin_pair.size + compare_pair.size
92
+ intersection = 0
93
+ origin_pair.each do |op|
94
+ 0.upto(compare_pair.size - 1) do |i|
95
+ if op == compare_pair[i]
96
+ intersection += 1
97
+ compare_pair.slice!(i)
98
+ break
99
+ end
100
+ end
101
+ end
102
+ (2.0 * intersection) / union > score
55
103
  end
104
+
56
105
  end
@@ -1,21 +1,49 @@
1
1
  # coding: utf-8
2
2
 
3
3
  module MovieSpider
4
- class DoubanMovie
4
+ module DoubanMovie
5
+
5
6
  UrlPrefix = "http://api.douban.com/v2/"
6
7
  Key = "0c89712b2297db4e259c538167c791ea"
7
- def get_subject(movie_id)
8
+
9
+ def self.douban_movie_info(movie_id)
8
10
  path = "movie/subject/#{movie_id}?apikey=#{Key}"
9
11
  data = api_get(path)
12
+ subject ={}
13
+ subject['id'] = data['id']
14
+ subject['title'] = data['title']
15
+ subject['alt'] = data['alt']
16
+ subject['original_title'] = data['original_title']
17
+ subject['year'] = data['year']
18
+ subject['genres'] = data['genres']
19
+ subject['length'] = nil
20
+ subject['area'] = data['countries'].join("|")
21
+ subject['directors'] = data['directors'].map { |e| e['name'] }
22
+ subject['actors'] = data['casts'].map { |e| e['name'] }
23
+ subject['summary'] = data['summary'].gsub("©豆瓣", "")
24
+ subject['rating'] = data['rating']['average']
25
+ subject
10
26
  end
11
27
 
12
- def douban_search_movie(movie_name)
28
+ def self.douban_search_movies(movie_name)
13
29
  movie_name = movie_name.gsub(/\[.+\]/, "")
14
30
  path = "movie/search?q=#{movie_name}&apikey=#{Key}"
15
- data = api_get(path)["subjects"]
31
+ api_get(path)["subjects"].map do |subject|
32
+ subject.delete_if { |key,value| not %w|id year title|.include? key }
33
+ end
34
+ end
35
+
36
+ def self.douban_movie_stills(douban_id)
37
+ fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
38
+ end
39
+
40
+ def self.douban_movie_posters(douban_id)
41
+ fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
16
42
  end
17
43
 
18
- def api_get(path)
44
+ private
45
+
46
+ def self.api_get(path)
19
47
  begin
20
48
  data = RestClient.get URI.encode(UrlPrefix + path)
21
49
  JSON.parse data
@@ -28,15 +56,7 @@ module MovieSpider
28
56
  end
29
57
  end
30
58
 
31
- def douban_movie_stills(douban_id)
32
- fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
33
- end
34
-
35
- def douban_movie_posters(douban_id)
36
- fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
37
- end
38
-
39
- def fetch_img(url)
59
+ def self.fetch_img(url)
40
60
  begin
41
61
  doc = Nokogiri::HTML(open(url))
42
62
  rescue
@@ -1,74 +1,90 @@
1
1
  # coding: utf-8
2
2
  module MovieSpider
3
- class MtimeMovie
4
- #获取官方剧照,三种参数选择img_200 img_235 img_1000,注意235的尺寸可能会fetch不到,期望使用220或1000
5
- def get_mtime_stills(movie_title, img_size="img_1000")
6
- parse_mtime(movie_title, "officialstageimage", img_size)
3
+ module MtimeMovie
4
+ # 时光剧照
5
+ def self.mtime_movie_stills(movie_id, img_size)
6
+ parse_mtime(movie_id, "officialstageimage", img_size)
7
7
  end
8
- #获取正式海报
9
- def get_general_posters(movie_title, img_size="img_1000")
10
- parse_mtime(movie_title, "generalposter", img_size)
11
- end
12
- #获取预告海报
13
- def get_forecast_posters(movie_title, img_size="img_1000")
14
- parse_mtime(movie_title, "forecastposter", img_size)
15
- end
16
- #获取角色海报
17
- def get_role_posters(movie_title, img_size="img_1000")
18
- parse_mtime(movie_title, "roleposter", img_size)
19
- end
20
- #全部海报
21
- def get_mtime_posters(movie_title, img_size="img_1000")
22
- posters = []
23
- posters += get_general_posters(movie_title, img_size) + get_forecast_posters(movie_title,img_size) + get_role_posters(movie_title, img_size)
24
- posters.delete(nil);posters.delete("")
8
+ # 时光海报
9
+ def self.mtime_movie_posters(movie_id, img_size)
10
+ posters = parse_mtime(movie_id, "generalposter", img_size) \
11
+ + parse_mtime(movie_id, "forecastposter", img_size) \
12
+ + parse_mtime(movie_id, "roleposter", img_size)
13
+ posters.delete(nil)
14
+ posters.delete("")
25
15
  posters
26
16
  end
27
- #获取时光网电影评分
28
- def get_mtime_rating(movie_title)
29
- fetch_result = parse_search(movie_title)
30
- if fetch_result.is_a?(Hash)
31
- movie_result = fetch_result['value']['movieResult']
32
- if movie_result['directMovie'].empty?
33
- #无法命中电影标题时
34
- return "影片暂无评分" if movie_result['moreMovies'][0]['movieRating'].nil?
35
- movie_result['moreMovies'][0]['movieRating']
36
- else
37
- #直接查找出电影
38
- return "影片暂无评分" if movie_result['directMovie']['movieRating'].nil?
39
- movie_result['directMovie']['movieRating']
40
- end
17
+ # 时光信息
18
+ def self.mtime_movie_info(movie_id)
19
+ subject = {}
20
+ begin
21
+ html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/"))
22
+ plots = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/plots.html"))
23
+ rescue
24
+ return subject
25
+ end
26
+ subject['id'] = movie_id.to_s
27
+ subject['title'] = html_doc.css(".db_head h1").text
28
+ subject['alt'] = "http://movie.mtime.com/#{movie_id}/"
29
+ subject['original_title'] = html_doc.css(".db_head .db_enname").text
30
+ subject['year'] = html_doc.css(".db_head .db_year").text.gsub(/[\(\)]/, "")
31
+ subject['genres'] = html_doc.css(".db_head a[property='v:genre']").map { |e| e.text }
32
+ subject['length'] = html_doc.css(".db_head span[property='v:runtime']").text
33
+ subject['area'] = html_doc.css("dl.info_l dd")[2].css("a").text
34
+ subject['directors'] = html_doc.css(".db_contout a[rel='v:directedBy']").map { |e| e.text }
35
+ subject['actors'] = html_doc.css("dl.main_actor a[pan][title]").map { |e| e['title']}
36
+ subject['summary'] = plots.css("div.plots_box").text.strip
37
+ subject['rating'] = mtime_rating(movie_id)
38
+ subject
39
+ end
40
+ # 使用电影标题搜索时光网电影信息,返回数组
41
+ def self.mtime_search_movies(movie_title)
42
+ search_result = mtime_search(movie_title)['value']['movieResult']
43
+ direct_subject = search_result['directMovie']
44
+ subjects = search_result['moreMovies']
45
+ # 合并结果,去除空值
46
+ unless subjects.nil?
47
+ subjects.insert(0,direct_subject).delete({})
41
48
  else
42
- "解析出错,暂无评分"
49
+ subjects = direct_subject.empty? ? [] : [direct_subject]
43
50
  end
51
+ subjects.map do |subject|
52
+ item = {}
53
+ subject['movieTitle'] =~ /\((.*)\)/
54
+ item['id'] = subject['movieId'].to_s
55
+ item['title'] = subject['movieTitle'].split(" ")[0]
56
+ item['year'] = $1
57
+ subject.clear
58
+ subject.update(item)
59
+ end
60
+ subjects
44
61
  end
45
-
46
- private
47
- #解析时光网search API返回
48
- def parse_search(movie_title)
62
+ # 时光网评分
63
+ def self.mtime_rating(movie_id)
49
64
  begin
50
- result_str = open(URI::encode("http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_CallBackArgument0=#{movie_title}")).read
65
+ result_str = open("http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F189691%2F&t=20153418472369218&Ajax_CallBackArgument0=#{movie_id}").read
51
66
  rescue
67
+ puts "parse error"
52
68
  return ""
53
69
  end
54
- result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
70
+ result_str.scan(/{.*}/) {|match| return JSON.parse(match)['value']['movieRating']['RatingFinal'].to_i}
55
71
  end
56
- def get_mtime_movie_id_by_title(movie_title)
57
- movie_result = {}
72
+
73
+ private
74
+
75
+ # 时光网搜索
76
+ def self.mtime_search(movie_title)
58
77
  begin
59
- movie_result = parse_search(movie_title)['value']['movieResult']
78
+ result_str = open(URI::encode("http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F%3Fq%3D%25E8%25B6%2585%25E8%2583%25BD%25E9%2599%2586%25E6%2588%2598%25E9%2598%259F&t=20153414382422867&Ajax_CallBackArgument0=#{movie_title}&Ajax_CallBackArgument1=0&Ajax_CallBackArgument2=365&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1")).read
60
79
  rescue
80
+ puts "parse error"
61
81
  return ""
62
82
  end
63
- if movie_result['directMovie'].empty?
64
- movie_result['moreMovies'][0]['movieId']
65
- else
66
- movie_result['directMovie']['movieId']
67
- end
83
+ result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
68
84
  end
69
- #解析时光网JS返回变量
70
- def parse_mtime(movie_title, type, img_size)
71
- movie_id = get_mtime_movie_id_by_title(movie_title)
85
+
86
+ # 时光网图片解析
87
+ def self.parse_mtime(movie_id, type, img_size)
72
88
  begin
73
89
  html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/posters_and_images/posters/hot.html"))
74
90
  parser = ""
@@ -1,3 +1,3 @@
1
1
  module MovieSpider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: movie_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - hzlu
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-04 00:00:00.000000000 Z
11
+ date: 2015-03-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler