movie_spider 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5f678d1d4e3479f2fa48b889671b7bc2d915208c
4
- data.tar.gz: bdf47ebf540ed0cb9374e46038d5f2913b236dd0
3
+ metadata.gz: 73521a5ab3ecc72f9db79b0c0a640190386b5d57
4
+ data.tar.gz: 7ea781c07b339f867494eeced1d9dbdde15df20d
5
5
  SHA512:
6
- metadata.gz: fd34d70d9da3c88b3d38f2692949d2e07162d2a7fe8c860c98a580362ddf9c3670782a185f4dbbaf5f9ccabae310cc1f8af369a7bec01549e30225379a62e508
7
- data.tar.gz: c75f433426d2f5425259a17d06238e8e3155ad2933d1a1ac37123c6ffd6e36a5a6a0b7cb82d9560112c40b87479d9f8e80a2c04d545ab5856e903d8c94ed1a4f
6
+ metadata.gz: a060107747831a6434ec4d418db9a774bf6138a40045b194c83a0ac2ea62c16be935152c0fb523e436467326274e7b318e26bb571e00fce4f86c3e2c15a2fd21
7
+ data.tar.gz: 702e792ad6d6fb8890a86d29727aa3e45ff0a36546692f3fe4acf0fd137772b498a87c9a00a5211c4e74549afc0b15b12e3413e3fde5feb9e936e9f5fadefa8b
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # MovieSpider
2
2
 
3
- TODO: Write a gem description
3
+ 单车网电影信息爬虫
4
4
 
5
5
  ## Installation
6
6
 
@@ -19,8 +19,19 @@ Or install it yourself as:
19
19
  $ gem install movie_spider
20
20
 
21
21
  ## Usage
22
+ ### 从时光网获取影片信息、剧照、海报
23
+ `MovieSpider.fetch_info_from_mtime("超能陆战队", "2014")`
24
+ `MovieSpider.fetch_stills_from_mtime("超能陆战队", "2014")`
25
+ `MovieSpider.fetch_posters_from_mtime("超能陆战队", "2014")`
26
+ ### 从豆瓣网获取影片信息、剧照、海报
27
+ `MovieSpider.fetch_info_from_douban("超能陆战队", "2014")`
28
+ `MovieSpider.fetch_stills_from_douban("超能陆战队", "2014")`
29
+ `MovieSpider.fetch_posters_from_douban("超能陆战队", "2014")`
30
+ ### 通过豆瓣 ID 或时光网 ID 获取以上信息,只使用 ID 参数调用,不需要传影片年份,注意 ID 要对应好,使用豆瓣的 ID 获取豆瓣电影信息,使用时光网 ID 获取时光网电影信息。
31
+ `MovieSpider.fetch_(info|posters|stills)_from_douban("11026735")`
32
+ `MovieSpider.fetch_(info|posters|stills)_from_mtime("160162")`
33
+ 以上都是获取电影木星上行的信息。
22
34
 
23
- TODO: Write usage instructions here
24
35
 
25
36
  ## Contributing
26
37
 
data/lib/movie_spider.rb CHANGED
@@ -5,52 +5,101 @@ require 'json'
5
5
  require 'open-uri'
6
6
 
7
7
  module MovieSpider
8
- def self.get_title_from_douban_id(douban_id)
9
- DoubanMovie.new.get_subject(douban_id)["title"]
10
- end
11
8
 
12
- def self.fetch_stills(douban_id,source)
13
- title = get_title_from_douban_id(douban_id)
9
+ def self.fetch_stills(id, source, img_size="img_1000")
14
10
  case source
15
11
  when /douban/
16
- stills = DoubanMovie.new.douban_movie_stills(douban_id)
12
+ DoubanMovie.douban_movie_stills(id)
17
13
  when /mtime/
18
- stills = MtimeMovie.new.get_mtime_stills(title)
14
+ MtimeMovie.mtime_movie_stills(id, img_size)
19
15
  when /all/
20
- stills = DoubanMovie.new.douban_movie_stills(douban_id) + MtimeMovie.new.get_mtime_stills(title)
16
+ DoubanMovie.douban_movie_stills(id) + MtimeMovie.mtime_movie.stills(id, img_size)
21
17
  end
22
- stills
23
18
  end
24
19
 
25
- def self.fetch_posters(douban_id,source)
26
- title = get_title_from_douban_id(douban_id)
20
+ def self.fetch_posters(id, source, img_size="img_1000")
27
21
  case source
28
22
  when /douban/
29
- posters = DoubanMovie.new.douban_movie_posters(douban_id)
23
+ DoubanMovie.douban_movie_posters(id)
30
24
  when /mtime/
31
- posters = MtimeMovie.new.get_mtime_posters(title)
25
+ MtimeMovie.mtime_movie_posters(id, img_size)
32
26
  when /all/
33
- posters = DoubanMovie.new.douban_movie_posters(douban_id) + MtimeMovie.new.get_mtime_posters(title)
27
+ DoubanMovie.douban_movie_posters(id) + MtimeMovie.mtime_movie_posters(id, img_size)
28
+ end
29
+ end
30
+
31
+ def self.fetch_info(id, source)
32
+ case source
33
+ when /douban/
34
+ DoubanMovie.douban_movie_info(id)
35
+ when /mtime/
36
+ MtimeMovie.mtime_movie_info(id)
37
+ end
38
+ end
39
+
40
+ def self.search_movies(title, source)
41
+ case source
42
+ when /douban/
43
+ DoubanMovie.douban_search_movies(title)
44
+ when /mtime/
45
+ MtimeMovie.mtime_search_movies(title)
34
46
  end
35
- posters
36
47
  end
37
48
 
38
- def self.fetch_rating(douban_id, source)
39
- title = get_title_from_douban_id(douban_id)
40
- if source =~ /douban/
41
- DoubanMovie.new.get_subject(douban_id)["rating"]["average"]
42
- elsif source =~ /mtime/
43
- MtimeMovie.new.get_mtime_rating(title)
44
- else
45
- "评分网站参数错误[douban|mtime]"
49
+ def self.get_id_from_title(title, year, source="douban")
50
+ year = year.to_i
51
+ subjects = search_movies(title, source)
52
+ match = []
53
+ unless subjects.empty?
54
+ subjects.each do |subject|
55
+ film_name = title.gsub(/\[.+\]/, "").gsub("(", "(").gsub(")",")")
56
+ if subject["year"].to_i == year and
57
+ string_similarity(film_name, subject["title"], 0.8)
58
+ match << subject['id']
59
+ end
60
+ end
46
61
  end
62
+ match[0]
47
63
  end
48
64
 
49
- def self.get_subject(douban_id)
50
- DoubanMovie.new.get_subject(douban_id)
65
+ def self.define_component(type, source)
66
+ define_singleton_method("fetch_#{type}_from_#{source}") do |id_or_title, year=Time.now.year|
67
+ if id_or_title.to_i.to_s == id_or_title
68
+ MovieSpider.send("fetch_#{type}", id_or_title, source)
69
+ else
70
+ id = get_id_from_title(id_or_title, year, source)
71
+ MovieSpider.send("fetch_#{type}", id, source)
72
+ end
73
+ end
51
74
  end
52
75
 
53
- def self.douban_search_movie(movie_name)
54
- DoubanMovie.new.douban_search_movie(movie_name)
76
+ define_component "stills", "douban"
77
+ define_component "stills", "mtime"
78
+ define_component "posters", "douban"
79
+ define_component "posters", "mtime"
80
+ define_component "info", "douban"
81
+ define_component "info", "mtime"
82
+
83
+ private
84
+
85
+ def self.string_similarity(origin, compare, score)
86
+ origin.downcase!
87
+ origin_pair = (0..origin.length-2).collect{|i| origin[i, 2]}.reject{|pair| pair.include? " "}
88
+ compare.downcase!
89
+ compare_pair = (0..compare.length-2).collect{|i| compare[i, 2]}.reject{|pair| pair.include? " "}
90
+
91
+ union = origin_pair.size + compare_pair.size
92
+ intersection = 0
93
+ origin_pair.each do |op|
94
+ 0.upto(compare_pair.size - 1) do |i|
95
+ if op == compare_pair[i]
96
+ intersection += 1
97
+ compare_pair.slice!(i)
98
+ break
99
+ end
100
+ end
101
+ end
102
+ (2.0 * intersection) / union > score
55
103
  end
104
+
56
105
  end
@@ -1,21 +1,49 @@
1
1
  # coding: utf-8
2
2
 
3
3
  module MovieSpider
4
- class DoubanMovie
4
+ module DoubanMovie
5
+
5
6
  UrlPrefix = "http://api.douban.com/v2/"
6
7
  Key = "0c89712b2297db4e259c538167c791ea"
7
- def get_subject(movie_id)
8
+
9
+ def self.douban_movie_info(movie_id)
8
10
  path = "movie/subject/#{movie_id}?apikey=#{Key}"
9
11
  data = api_get(path)
12
+ subject ={}
13
+ subject['id'] = data['id']
14
+ subject['title'] = data['title']
15
+ subject['alt'] = data['alt']
16
+ subject['original_title'] = data['original_title']
17
+ subject['year'] = data['year']
18
+ subject['genres'] = data['genres']
19
+ subject['length'] = nil
20
+ subject['area'] = data['countries'].join("|")
21
+ subject['directors'] = data['directors'].map { |e| e['name'] }
22
+ subject['actors'] = data['casts'].map { |e| e['name'] }
23
+ subject['summary'] = data['summary'].gsub("©豆瓣", "")
24
+ subject['rating'] = data['rating']['average']
25
+ subject
10
26
  end
11
27
 
12
- def douban_search_movie(movie_name)
28
+ def self.douban_search_movies(movie_name)
13
29
  movie_name = movie_name.gsub(/\[.+\]/, "")
14
30
  path = "movie/search?q=#{movie_name}&apikey=#{Key}"
15
- data = api_get(path)["subjects"]
31
+ api_get(path)["subjects"].map do |subject|
32
+ subject.delete_if { |key,value| not %w|id year title|.include? key }
33
+ end
34
+ end
35
+
36
+ def self.douban_movie_stills(douban_id)
37
+ fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
38
+ end
39
+
40
+ def self.douban_movie_posters(douban_id)
41
+ fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
16
42
  end
17
43
 
18
- def api_get(path)
44
+ private
45
+
46
+ def self.api_get(path)
19
47
  begin
20
48
  data = RestClient.get URI.encode(UrlPrefix + path)
21
49
  JSON.parse data
@@ -28,15 +56,7 @@ module MovieSpider
28
56
  end
29
57
  end
30
58
 
31
- def douban_movie_stills(douban_id)
32
- fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
33
- end
34
-
35
- def douban_movie_posters(douban_id)
36
- fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
37
- end
38
-
39
- def fetch_img(url)
59
+ def self.fetch_img(url)
40
60
  begin
41
61
  doc = Nokogiri::HTML(open(url))
42
62
  rescue
@@ -1,74 +1,90 @@
1
1
  # coding: utf-8
2
2
  module MovieSpider
3
- class MtimeMovie
4
- #获取官方剧照,三种参数选择img_200 img_235 img_1000,注意235的尺寸可能会fetch不到,期望使用220或1000
5
- def get_mtime_stills(movie_title, img_size="img_1000")
6
- parse_mtime(movie_title, "officialstageimage", img_size)
3
+ module MtimeMovie
4
+ # 时光剧照
5
+ def self.mtime_movie_stills(movie_id, img_size)
6
+ parse_mtime(movie_id, "officialstageimage", img_size)
7
7
  end
8
- #获取正式海报
9
- def get_general_posters(movie_title, img_size="img_1000")
10
- parse_mtime(movie_title, "generalposter", img_size)
11
- end
12
- #获取预告海报
13
- def get_forecast_posters(movie_title, img_size="img_1000")
14
- parse_mtime(movie_title, "forecastposter", img_size)
15
- end
16
- #获取角色海报
17
- def get_role_posters(movie_title, img_size="img_1000")
18
- parse_mtime(movie_title, "roleposter", img_size)
19
- end
20
- #全部海报
21
- def get_mtime_posters(movie_title, img_size="img_1000")
22
- posters = []
23
- posters += get_general_posters(movie_title, img_size) + get_forecast_posters(movie_title,img_size) + get_role_posters(movie_title, img_size)
24
- posters.delete(nil);posters.delete("")
8
+ # 时光海报
9
+ def self.mtime_movie_posters(movie_id, img_size)
10
+ posters = parse_mtime(movie_id, "generalposter", img_size) \
11
+ + parse_mtime(movie_id, "forecastposter", img_size) \
12
+ + parse_mtime(movie_id, "roleposter", img_size)
13
+ posters.delete(nil)
14
+ posters.delete("")
25
15
  posters
26
16
  end
27
- #获取时光网电影评分
28
- def get_mtime_rating(movie_title)
29
- fetch_result = parse_search(movie_title)
30
- if fetch_result.is_a?(Hash)
31
- movie_result = fetch_result['value']['movieResult']
32
- if movie_result['directMovie'].empty?
33
- #无法命中电影标题时
34
- return "影片暂无评分" if movie_result['moreMovies'][0]['movieRating'].nil?
35
- movie_result['moreMovies'][0]['movieRating']
36
- else
37
- #直接查找出电影
38
- return "影片暂无评分" if movie_result['directMovie']['movieRating'].nil?
39
- movie_result['directMovie']['movieRating']
40
- end
17
+ # 时光信息
18
+ def self.mtime_movie_info(movie_id)
19
+ subject = {}
20
+ begin
21
+ html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/"))
22
+ plots = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/plots.html"))
23
+ rescue
24
+ return subject
25
+ end
26
+ subject['id'] = movie_id.to_s
27
+ subject['title'] = html_doc.css(".db_head h1").text
28
+ subject['alt'] = "http://movie.mtime.com/#{movie_id}/"
29
+ subject['original_title'] = html_doc.css(".db_head .db_enname").text
30
+ subject['year'] = html_doc.css(".db_head .db_year").text.gsub(/[\(\)]/, "")
31
+ subject['genres'] = html_doc.css(".db_head a[property='v:genre']").map { |e| e.text }
32
+ subject['length'] = html_doc.css(".db_head span[property='v:runtime']").text
33
+ subject['area'] = html_doc.css("dl.info_l dd")[2].css("a").text
34
+ subject['directors'] = html_doc.css(".db_contout a[rel='v:directedBy']").map { |e| e.text }
35
+ subject['actors'] = html_doc.css("dl.main_actor a[pan][title]").map { |e| e['title']}
36
+ subject['summary'] = plots.css("div.plots_box").text.strip
37
+ subject['rating'] = mtime_rating(movie_id)
38
+ subject
39
+ end
40
+ # 使用电影标题搜索时光网电影信息,返回数组
41
+ def self.mtime_search_movies(movie_title)
42
+ search_result = mtime_search(movie_title)['value']['movieResult']
43
+ direct_subject = search_result['directMovie']
44
+ subjects = search_result['moreMovies']
45
+ # 合并结果,去除空值
46
+ unless subjects.nil?
47
+ subjects.insert(0,direct_subject).delete({})
41
48
  else
42
- "解析出错,暂无评分"
49
+ subjects = direct_subject.empty? ? [] : [direct_subject]
43
50
  end
51
+ subjects.map do |subject|
52
+ item = {}
53
+ subject['movieTitle'] =~ /\((.*)\)/
54
+ item['id'] = subject['movieId'].to_s
55
+ item['title'] = subject['movieTitle'].split(" ")[0]
56
+ item['year'] = $1
57
+ subject.clear
58
+ subject.update(item)
59
+ end
60
+ subjects
44
61
  end
45
-
46
- private
47
- #解析时光网search API返回
48
- def parse_search(movie_title)
62
+ # 时光网评分
63
+ def self.mtime_rating(movie_id)
49
64
  begin
50
- result_str = open(URI::encode("http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_CallBackArgument0=#{movie_title}")).read
65
+ result_str = open("http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F189691%2F&t=20153418472369218&Ajax_CallBackArgument0=#{movie_id}").read
51
66
  rescue
67
+ puts "parse error"
52
68
  return ""
53
69
  end
54
- result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
70
+ result_str.scan(/{.*}/) {|match| return JSON.parse(match)['value']['movieRating']['RatingFinal'].to_i}
55
71
  end
56
- def get_mtime_movie_id_by_title(movie_title)
57
- movie_result = {}
72
+
73
+ private
74
+
75
+ # 时光网搜索
76
+ def self.mtime_search(movie_title)
58
77
  begin
59
- movie_result = parse_search(movie_title)['value']['movieResult']
78
+ result_str = open(URI::encode("http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F%3Fq%3D%25E8%25B6%2585%25E8%2583%25BD%25E9%2599%2586%25E6%2588%2598%25E9%2598%259F&t=20153414382422867&Ajax_CallBackArgument0=#{movie_title}&Ajax_CallBackArgument1=0&Ajax_CallBackArgument2=365&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1")).read
60
79
  rescue
80
+ puts "parse error"
61
81
  return ""
62
82
  end
63
- if movie_result['directMovie'].empty?
64
- movie_result['moreMovies'][0]['movieId']
65
- else
66
- movie_result['directMovie']['movieId']
67
- end
83
+ result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
68
84
  end
69
- #解析时光网JS返回变量
70
- def parse_mtime(movie_title, type, img_size)
71
- movie_id = get_mtime_movie_id_by_title(movie_title)
85
+
86
+ # 时光网图片解析
87
+ def self.parse_mtime(movie_id, type, img_size)
72
88
  begin
73
89
  html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/posters_and_images/posters/hot.html"))
74
90
  parser = ""
@@ -1,3 +1,3 @@
1
1
  module MovieSpider
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: movie_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - hzlu
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-04 00:00:00.000000000 Z
11
+ date: 2015-03-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler