movie_spider 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -2
- data/lib/movie_spider.rb +76 -27
- data/lib/movie_spider/douban_fetcher.rb +34 -14
- data/lib/movie_spider/mtime_fetcher.rb +69 -53
- data/lib/movie_spider/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 73521a5ab3ecc72f9db79b0c0a640190386b5d57
|
4
|
+
data.tar.gz: 7ea781c07b339f867494eeced1d9dbdde15df20d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a060107747831a6434ec4d418db9a774bf6138a40045b194c83a0ac2ea62c16be935152c0fb523e436467326274e7b318e26bb571e00fce4f86c3e2c15a2fd21
|
7
|
+
data.tar.gz: 702e792ad6d6fb8890a86d29727aa3e45ff0a36546692f3fe4acf0fd137772b498a87c9a00a5211c4e74549afc0b15b12e3413e3fde5feb9e936e9f5fadefa8b
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# MovieSpider
|
2
2
|
|
3
|
-
|
3
|
+
单车网电影信息爬虫
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -19,8 +19,19 @@ Or install it yourself as:
|
|
19
19
|
$ gem install movie_spider
|
20
20
|
|
21
21
|
## Usage
|
22
|
+
### 从时光网获取影片信息、剧照、海报
|
23
|
+
`MovieSpider.fetch_info_from_mtime("超能陆战队", "2014")`
|
24
|
+
`MovieSpider.fetch_stills_from_mtime("超能陆战队", "2014")`
|
25
|
+
`MovieSpider.fetch_posters_from_mtime("超能陆战队", "2014")`
|
26
|
+
### 从豆瓣网获取影片信息、剧照、海报
|
27
|
+
`MovieSpider.fetch_info_from_douban("超能陆战队", "2014")`
|
28
|
+
`MovieSpider.fetch_stills_from_douban("超能陆战队", "2014")`
|
29
|
+
`MovieSpider.fetch_posters_from_douban("超能陆战队", "2014")`
|
30
|
+
### 通过豆瓣 ID 或时光网 ID 获取以上信息:只使用 ID 参数调用,不需要传影片年份。注意 ID 要对应好:使用豆瓣的 ID 获取豆瓣电影信息,使用时光网的 ID 获取时光网电影信息。
|
31
|
+
`MovieSpider.fetch_(info|posters|stills)_from_douban("11026735")`
|
32
|
+
`MovieSpider.fetch_(info|posters|stills)_from_mtime("160162")`
|
33
|
+
以上两个 ID 示例获取的都是电影《木星上行》的信息。
|
22
34
|
|
23
|
-
TODO: Write usage instructions here
|
24
35
|
|
25
36
|
## Contributing
|
26
37
|
|
data/lib/movie_spider.rb
CHANGED
@@ -5,52 +5,101 @@ require 'json'
|
|
5
5
|
require 'open-uri'
|
6
6
|
|
7
7
|
module MovieSpider
|
8
|
-
def self.get_title_from_douban_id(douban_id)
|
9
|
-
DoubanMovie.new.get_subject(douban_id)["title"]
|
10
|
-
end
|
11
8
|
|
12
|
-
def self.fetch_stills(
|
13
|
-
title = get_title_from_douban_id(douban_id)
|
9
|
+
def self.fetch_stills(id, source, img_size="img_1000")
|
14
10
|
case source
|
15
11
|
when /douban/
|
16
|
-
|
12
|
+
DoubanMovie.douban_movie_stills(id)
|
17
13
|
when /mtime/
|
18
|
-
|
14
|
+
MtimeMovie.mtime_movie_stills(id, img_size)
|
19
15
|
when /all/
|
20
|
-
|
16
|
+
DoubanMovie.douban_movie_stills(id) + MtimeMovie.mtime_movie.stills(id, img_size)
|
21
17
|
end
|
22
|
-
stills
|
23
18
|
end
|
24
19
|
|
25
|
-
def self.fetch_posters(
|
26
|
-
title = get_title_from_douban_id(douban_id)
|
20
|
+
def self.fetch_posters(id, source, img_size="img_1000")
|
27
21
|
case source
|
28
22
|
when /douban/
|
29
|
-
|
23
|
+
DoubanMovie.douban_movie_posters(id)
|
30
24
|
when /mtime/
|
31
|
-
|
25
|
+
MtimeMovie.mtime_movie_posters(id, img_size)
|
32
26
|
when /all/
|
33
|
-
|
27
|
+
DoubanMovie.douban_movie_posters(id) + MtimeMovie.mtime_movie_posters(id, img_size)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.fetch_info(id, source)
|
32
|
+
case source
|
33
|
+
when /douban/
|
34
|
+
DoubanMovie.douban_movie_info(id)
|
35
|
+
when /mtime/
|
36
|
+
MtimeMovie.mtime_movie_info(id)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.search_movies(title, source)
|
41
|
+
case source
|
42
|
+
when /douban/
|
43
|
+
DoubanMovie.douban_search_movies(title)
|
44
|
+
when /mtime/
|
45
|
+
MtimeMovie.mtime_search_movies(title)
|
34
46
|
end
|
35
|
-
posters
|
36
47
|
end
|
37
48
|
|
38
|
-
def self.
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
49
|
+
def self.get_id_from_title(title, year, source="douban")
|
50
|
+
year = year.to_i
|
51
|
+
subjects = search_movies(title, source)
|
52
|
+
match = []
|
53
|
+
unless subjects.empty?
|
54
|
+
subjects.each do |subject|
|
55
|
+
film_name = title.gsub(/\[.+\]/, "").gsub("(", "(").gsub(")",")")
|
56
|
+
if subject["year"].to_i == year and
|
57
|
+
string_similarity(film_name, subject["title"], 0.8)
|
58
|
+
match << subject['id']
|
59
|
+
end
|
60
|
+
end
|
46
61
|
end
|
62
|
+
match[0]
|
47
63
|
end
|
48
64
|
|
49
|
-
def self.
|
50
|
-
|
65
|
+
def self.define_component(type, source)
|
66
|
+
define_singleton_method("fetch_#{type}_from_#{source}") do |id_or_title, year=Time.now.year|
|
67
|
+
if id_or_title.to_i.to_s == id_or_title
|
68
|
+
MovieSpider.send("fetch_#{type}", id_or_title, source)
|
69
|
+
else
|
70
|
+
id = get_id_from_title(id_or_title, year, source)
|
71
|
+
MovieSpider.send("fetch_#{type}", id, source)
|
72
|
+
end
|
73
|
+
end
|
51
74
|
end
|
52
75
|
|
53
|
-
|
54
|
-
|
76
|
+
define_component "stills", "douban"
|
77
|
+
define_component "stills", "mtime"
|
78
|
+
define_component "posters", "douban"
|
79
|
+
define_component "posters", "mtime"
|
80
|
+
define_component "info", "douban"
|
81
|
+
define_component "info", "mtime"
|
82
|
+
|
83
|
+
private
|
84
|
+
|
85
|
+
def self.string_similarity(origin, compare, score)
|
86
|
+
origin.downcase!
|
87
|
+
origin_pair = (0..origin.length-2).collect{|i| origin[i, 2]}.reject{|pair| pair.include? " "}
|
88
|
+
compare.downcase!
|
89
|
+
compare_pair = (0..compare.length-2).collect{|i| compare[i, 2]}.reject{|pair| pair.include? " "}
|
90
|
+
|
91
|
+
union = origin_pair.size + compare_pair.size
|
92
|
+
intersection = 0
|
93
|
+
origin_pair.each do |op|
|
94
|
+
0.upto(compare_pair.size - 1) do |i|
|
95
|
+
if op == compare_pair[i]
|
96
|
+
intersection += 1
|
97
|
+
compare_pair.slice!(i)
|
98
|
+
break
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
(2.0 * intersection) / union > score
|
55
103
|
end
|
104
|
+
|
56
105
|
end
|
@@ -1,21 +1,49 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
3
|
module MovieSpider
|
4
|
-
|
4
|
+
module DoubanMovie
|
5
|
+
|
5
6
|
UrlPrefix = "http://api.douban.com/v2/"
|
6
7
|
Key = "0c89712b2297db4e259c538167c791ea"
|
7
|
-
|
8
|
+
|
9
|
+
def self.douban_movie_info(movie_id)
|
8
10
|
path = "movie/subject/#{movie_id}?apikey=#{Key}"
|
9
11
|
data = api_get(path)
|
12
|
+
subject ={}
|
13
|
+
subject['id'] = data['id']
|
14
|
+
subject['title'] = data['title']
|
15
|
+
subject['alt'] = data['alt']
|
16
|
+
subject['original_title'] = data['original_title']
|
17
|
+
subject['year'] = data['year']
|
18
|
+
subject['genres'] = data['genres']
|
19
|
+
subject['length'] = nil
|
20
|
+
subject['area'] = data['countries'].join("|")
|
21
|
+
subject['directors'] = data['directors'].map { |e| e['name'] }
|
22
|
+
subject['actors'] = data['casts'].map { |e| e['name'] }
|
23
|
+
subject['summary'] = data['summary'].gsub("©豆瓣", "")
|
24
|
+
subject['rating'] = data['rating']['average']
|
25
|
+
subject
|
10
26
|
end
|
11
27
|
|
12
|
-
def
|
28
|
+
def self.douban_search_movies(movie_name)
|
13
29
|
movie_name = movie_name.gsub(/\[.+\]/, "")
|
14
30
|
path = "movie/search?q=#{movie_name}&apikey=#{Key}"
|
15
|
-
|
31
|
+
api_get(path)["subjects"].map do |subject|
|
32
|
+
subject.delete_if { |key,value| not %w|id year title|.include? key }
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.douban_movie_stills(douban_id)
|
37
|
+
fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.douban_movie_posters(douban_id)
|
41
|
+
fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
|
16
42
|
end
|
17
43
|
|
18
|
-
|
44
|
+
private
|
45
|
+
|
46
|
+
def self.api_get(path)
|
19
47
|
begin
|
20
48
|
data = RestClient.get URI.encode(UrlPrefix + path)
|
21
49
|
JSON.parse data
|
@@ -28,15 +56,7 @@ module MovieSpider
|
|
28
56
|
end
|
29
57
|
end
|
30
58
|
|
31
|
-
def
|
32
|
-
fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=S&start=0&sortby=vote&size=a&subtype=o")
|
33
|
-
end
|
34
|
-
|
35
|
-
def douban_movie_posters(douban_id)
|
36
|
-
fetch_img("http://movie.douban.com/subject/#{douban_id}/photos?type=R&start=0&sortby=vote&size=a&subtype=a")
|
37
|
-
end
|
38
|
-
|
39
|
-
def fetch_img(url)
|
59
|
+
def self.fetch_img(url)
|
40
60
|
begin
|
41
61
|
doc = Nokogiri::HTML(open(url))
|
42
62
|
rescue
|
@@ -1,74 +1,90 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
module MovieSpider
|
3
|
-
|
4
|
-
|
5
|
-
def
|
6
|
-
parse_mtime(
|
3
|
+
module MtimeMovie
|
4
|
+
# 时光剧照
|
5
|
+
def self.mtime_movie_stills(movie_id, img_size)
|
6
|
+
parse_mtime(movie_id, "officialstageimage", img_size)
|
7
7
|
end
|
8
|
-
|
9
|
-
def
|
10
|
-
parse_mtime(
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
end
|
16
|
-
#获取角色海报
|
17
|
-
def get_role_posters(movie_title, img_size="img_1000")
|
18
|
-
parse_mtime(movie_title, "roleposter", img_size)
|
19
|
-
end
|
20
|
-
#全部海报
|
21
|
-
def get_mtime_posters(movie_title, img_size="img_1000")
|
22
|
-
posters = []
|
23
|
-
posters += get_general_posters(movie_title, img_size) + get_forecast_posters(movie_title,img_size) + get_role_posters(movie_title, img_size)
|
24
|
-
posters.delete(nil);posters.delete("")
|
8
|
+
# 时光海报
|
9
|
+
def self.mtime_movie_posters(movie_id, img_size)
|
10
|
+
posters = parse_mtime(movie_id, "generalposter", img_size) \
|
11
|
+
+ parse_mtime(movie_id, "forecastposter", img_size) \
|
12
|
+
+ parse_mtime(movie_id, "roleposter", img_size)
|
13
|
+
posters.delete(nil)
|
14
|
+
posters.delete("")
|
25
15
|
posters
|
26
16
|
end
|
27
|
-
|
28
|
-
def
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
17
|
+
# 时光信息
|
18
|
+
def self.mtime_movie_info(movie_id)
|
19
|
+
subject = {}
|
20
|
+
begin
|
21
|
+
html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/"))
|
22
|
+
plots = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/plots.html"))
|
23
|
+
rescue
|
24
|
+
return subject
|
25
|
+
end
|
26
|
+
subject['id'] = movie_id.to_s
|
27
|
+
subject['title'] = html_doc.css(".db_head h1").text
|
28
|
+
subject['alt'] = "http://movie.mtime.com/#{movie_id}/"
|
29
|
+
subject['original_title'] = html_doc.css(".db_head .db_enname").text
|
30
|
+
subject['year'] = html_doc.css(".db_head .db_year").text.gsub(/[\(\)]/, "")
|
31
|
+
subject['genres'] = html_doc.css(".db_head a[property='v:genre']").map { |e| e.text }
|
32
|
+
subject['length'] = html_doc.css(".db_head span[property='v:runtime']").text
|
33
|
+
subject['area'] = html_doc.css("dl.info_l dd")[2].css("a").text
|
34
|
+
subject['directors'] = html_doc.css(".db_contout a[rel='v:directedBy']").map { |e| e.text }
|
35
|
+
subject['actors'] = html_doc.css("dl.main_actor a[pan][title]").map { |e| e['title']}
|
36
|
+
subject['summary'] = plots.css("div.plots_box").text.strip
|
37
|
+
subject['rating'] = mtime_rating(movie_id)
|
38
|
+
subject
|
39
|
+
end
|
40
|
+
# 使用电影标题搜索时光网电影信息,返回数组
|
41
|
+
def self.mtime_search_movies(movie_title)
|
42
|
+
search_result = mtime_search(movie_title)['value']['movieResult']
|
43
|
+
direct_subject = search_result['directMovie']
|
44
|
+
subjects = search_result['moreMovies']
|
45
|
+
# 合并结果,去除空值
|
46
|
+
unless subjects.nil?
|
47
|
+
subjects.insert(0,direct_subject).delete({})
|
41
48
|
else
|
42
|
-
|
49
|
+
subjects = direct_subject.empty? ? [] : [direct_subject]
|
43
50
|
end
|
51
|
+
subjects.map do |subject|
|
52
|
+
item = {}
|
53
|
+
subject['movieTitle'] =~ /\((.*)\)/
|
54
|
+
item['id'] = subject['movieId'].to_s
|
55
|
+
item['title'] = subject['movieTitle'].split(" ")[0]
|
56
|
+
item['year'] = $1
|
57
|
+
subject.clear
|
58
|
+
subject.update(item)
|
59
|
+
end
|
60
|
+
subjects
|
44
61
|
end
|
45
|
-
|
46
|
-
|
47
|
-
#解析时光网search API返回
|
48
|
-
def parse_search(movie_title)
|
62
|
+
# 时光网评分
|
63
|
+
def self.mtime_rating(movie_id)
|
49
64
|
begin
|
50
|
-
result_str = open(
|
65
|
+
result_str = open("http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F189691%2F&t=20153418472369218&Ajax_CallBackArgument0=#{movie_id}").read
|
51
66
|
rescue
|
67
|
+
puts "parse error"
|
52
68
|
return ""
|
53
69
|
end
|
54
|
-
result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
|
70
|
+
result_str.scan(/{.*}/) {|match| return JSON.parse(match)['value']['movieRating']['RatingFinal'].to_i}
|
55
71
|
end
|
56
|
-
|
57
|
-
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
# 时光网搜索
|
76
|
+
def self.mtime_search(movie_title)
|
58
77
|
begin
|
59
|
-
|
78
|
+
result_str = open(URI::encode("http://service.channel.mtime.com/Search.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Channel.Services&Ajax_CallBackMethod=GetSearchResult&Ajax_CrossDomain=1&Ajax_RequestUrl=http%3A%2F%2Fsearch.mtime.com%2Fsearch%2F%3Fq%3D%25E8%25B6%2585%25E8%2583%25BD%25E9%2599%2586%25E6%2588%2598%25E9%2598%259F&t=20153414382422867&Ajax_CallBackArgument0=#{movie_title}&Ajax_CallBackArgument1=0&Ajax_CallBackArgument2=365&Ajax_CallBackArgument3=0&Ajax_CallBackArgument4=1")).read
|
60
79
|
rescue
|
80
|
+
puts "parse error"
|
61
81
|
return ""
|
62
82
|
end
|
63
|
-
|
64
|
-
movie_result['moreMovies'][0]['movieId']
|
65
|
-
else
|
66
|
-
movie_result['directMovie']['movieId']
|
67
|
-
end
|
83
|
+
result_str.scan(/{.*}/) {|match| return JSON.parse(match)}
|
68
84
|
end
|
69
|
-
|
70
|
-
|
71
|
-
|
85
|
+
|
86
|
+
# 时光网图片解析
|
87
|
+
def self.parse_mtime(movie_id, type, img_size)
|
72
88
|
begin
|
73
89
|
html_doc = Nokogiri::HTML(open("http://movie.mtime.com/#{movie_id}/posters_and_images/posters/hot.html"))
|
74
90
|
parser = ""
|
data/lib/movie_spider/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: movie_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hzlu
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|